The same as reiser4-for-2.6.22.patch plus a fix for a file-conversion-related
bug which caused metadata corruption when REISER4_DEBUG is on.

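The first hunk exports the i386 non-caching user-copy helpers so that reiser4 can be built as a module: __copy_from_user_nocache() is an inline wrapper that ends up in __copy_from_user_ll_nocache(), so modular code that copies user data past the CPU cache needs the symbol exported. A minimal sketch of the kind of call site that depends on these exports (function and variable names here are illustrative, not from the patch):

	/* copy user data, bypassing the CPU cache where the architecture
	 * supports it; on i386 this ends up in __copy_from_user_ll_nocache() */
	static int copy_user_chunk(void *kbuf, const void __user *ubuf,
				   unsigned long n)
	{
		if (__copy_from_user_nocache(kbuf, ubuf, n))
			return -EFAULT;	/* partial copy: treat as fault */
		return 0;
	}
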
diff -urN linux-2.6.22.orig/arch/i386/lib/usercopy.c linux-2.6.22/arch/i386/lib/usercopy.c
--- linux-2.6.22.orig/arch/i386/lib/usercopy.c	2007-07-21 00:32:46.973831675 +0400
+++ linux-2.6.22/arch/i386/lib/usercopy.c	2007-07-29 00:25:34.800676805 +0400
@@ -817,6 +817,7 @@
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
@@ -831,6 +832,7 @@
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 
 /**
  * copy_to_user: - Copy a block of data into user space.
diff -urN linux-2.6.22.orig/Documentation/Changes linux-2.6.22/Documentation/Changes
--- linux-2.6.22.orig/Documentation/Changes	2007-07-21 00:31:57.012856483 +0400
+++ linux-2.6.22/Documentation/Changes	2007-07-29 00:25:34.800676805 +0400
@@ -36,6 +36,7 @@
 o  e2fsprogs              1.29                    # tune2fs
 o  jfsutils               1.1.3                   # fsck.jfs -V
 o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
+o  reiser4progs           1.0.0                   # fsck.reiser4 -V
 o  xfsprogs               2.6.0                   # xfs_db -V
 o  pcmciautils            004                     # pccardctl -V
 o  quota-tools            3.09                    # quota -V
@@ -144,6 +145,13 @@
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck.  These utils work on both i386 and alpha platforms.
 
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
+
 Xfsprogs
 --------
 
@@ -322,6 +330,10 @@
 -------------
 o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 
+Reiser4progs
+------------
+o  <ftp://ftp.namesys.com/pub/reiser4progs/>
+
 Xfsprogs
 --------
 o  <ftp://oss.sgi.com/projects/xfs/download/>
diff -urN linux-2.6.22.orig/Documentation/filesystems/reiser4.txt linux-2.6.22/Documentation/filesystems/reiser4.txt
--- linux-2.6.22.orig/Documentation/filesystems/reiser4.txt	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/Documentation/filesystems/reiser4.txt	2007-07-29 00:25:34.800676805 +0400
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page		http://namesys.com/v4/v4.html
+source code		ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools		ftp://ftp.namesys.com/pub/reiser4progs/
+install page		http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+	This checks everything imaginable while reiser4
+	runs
+
+Mount options
+=============
+tmgr.atom_max_size=N
+	Atoms containing more than N blocks will be forced to commit.
+	N is decimal.
+	Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+	Atoms older than N seconds will be forced to commit. N is decimal.
+	Default is 600.
+
+tmgr.atom_max_flushers=N
+	Limit of concurrent flushers for one atom. 0 means no limit.
+	Default is 0.
+
+tree.cbk_cache.nr_slots=N
+	Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+	If flush finds more than N adjacent dirty leaf-level blocks it
+	will force them to be relocated.
+	Default is 64.
+
+flush.relocate_distance=N
+	If flush can find a block allocation within at most N blocks of
+	the preceder, it will relocate to that position.
+	Default is 64.
+
+flush.scan_maxnodes=N
+	The maximum number of nodes to scan left on a level during
+	flush.
+	Default is 10000.
+
+optimal_io_size=N
+	Preferred IO size. This value is used to set st_blksize of
+	struct stat.
+	Default is 65536.
+
+bsdgroups
+	Turn on BSD-style gid assignment.
+
+32bittimes
+	By default files in reiser4 have 64 bit timestamps. Files
+	created when the filesystem is mounted with the 32bittimes mount
+	option will get 32 bit timestamps.
+
+mtflush
+	Turn off concurrent flushing.
+
+nopseudo
+	Disable pseudo files support. See
+	http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+	Don't load all bitmap blocks at mount time; this is useful for
+	machines with tiny RAM and large disks.
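The mount options above combine in the usual comma-separated -o list; a hypothetical invocation (device and mount point assumed):

	mount -t reiser4 -o tmgr.atom_max_age=300,flush.relocate_threshold=128,dont_load_bitmap /dev/hda2 /mnt
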
diff -urN linux-2.6.22.orig/fs/fs-writeback.c linux-2.6.22/fs/fs-writeback.c
--- linux-2.6.22.orig/fs/fs-writeback.c	2007-07-21 00:32:04.502801671 +0400
+++ linux-2.6.22/fs/fs-writeback.c	2007-07-29 00:25:34.808678876 +0400
@@ -296,8 +296,6 @@
  * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
  * that it can be located for waiting on in __writeback_single_inode().
  *
- * Called under inode_lock.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -313,11 +311,13 @@
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
+	spin_lock(&inode_lock);
+
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
 		list_splice_init(&sb->s_dirty, &sb->s_io);
 
@@ -397,8 +397,19 @@
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
+	spin_unlock(&inode_lock);
 	return;		/* Leave any unwritten inodes on s_io */
 }
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+	if (sb->s_op->sync_inodes)
+		sb->s_op->sync_inodes(sb, wbc);
+	else
+		generic_sync_sb_inodes(sb, wbc);
+}
 
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -439,11 +450,8 @@
 	 * be unmounted by the time it is released.
 	 */
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_lock(&inode_lock);
+		if (sb->s_root)
 			sync_sb_inodes(sb, wbc);
-			spin_unlock(&inode_lock);
-		}
 		up_read(&sb->s_umount);
 	}
 	spin_lock(&sb_lock);
@@ -481,9 +489,7 @@
 		(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 			nr_dirty + nr_unstable;
 	wbc.nr_to_write += wbc.nr_to_write / 2;		/* Bit more for luck */
-	spin_lock(&inode_lock);
 	sync_sb_inodes(sb, &wbc);
-	spin_unlock(&inode_lock);
 }
 
 /*
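The writeback hunks above move the inode_lock acquisition into the renamed and exported generic_sync_sb_inodes(), and let a filesystem override per-super-block inode writeback through a new ->sync_inodes() super operation (the struct super_operations field itself is added by another hunk of this patch). A sketch of how a filesystem hooks in, with illustrative names; reiser4's real implementation lives in its own files below:

	/* do filesystem-specific work, then fall back to the generic loop */
	static void example_sync_inodes(struct super_block *sb,
					struct writeback_control *wbc)
	{
		/* e.g. capture dirty pages into atoms first ... */
		generic_sync_sb_inodes(sb, wbc);
	}

	static struct super_operations example_sops = {
		/* ... */
		.sync_inodes = example_sync_inodes,
	};
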
diff -urN linux-2.6.22.orig/fs/Kconfig linux-2.6.22/fs/Kconfig
--- linux-2.6.22.orig/fs/Kconfig	2007-07-21 00:32:57.540575927 +0400
+++ linux-2.6.22/fs/Kconfig	2007-07-29 00:25:34.812679911 +0400
@@ -272,6 +272,8 @@
 	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
+source "fs/reiser4/Kconfig"
+
 config REISERFS_FS
 	tristate "Reiserfs support"
 	help
diff -urN linux-2.6.22.orig/fs/Makefile linux-2.6.22/fs/Makefile
--- linux-2.6.22.orig/fs/Makefile	2007-07-21 00:32:57.544576967 +0400
+++ linux-2.6.22/fs/Makefile	2007-07-29 00:25:34.812679911 +0400
@@ -66,6 +66,7 @@
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
+obj-$(CONFIG_REISER4_FS)	+= reiser4/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
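fs/reiser4/Kconfig itself is added by a later part of the patch; judging from the Makefile hunk it must define CONFIG_REISER4_FS. A plausible minimal shape (illustrative sketch, not the actual file):

	config REISER4_FS
		tristate "Reiser4 (EXPERIMENTAL)"
		depends on EXPERIMENTAL
		help
		  Reiser4 is a filesystem based on dancing tree algorithms.
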
diff -urN linux-2.6.22.orig/fs/reiser4/as_ops.c linux-2.6.22/fs/reiser4/as_ops.c
--- linux-2.6.22.orig/fs/reiser4/as_ops.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/as_ops.c	2007-07-29 00:25:34.816680947 +0400
@@ -0,0 +1,337 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used by
+ * unix and cryptcompress file plugins.
+ *
+ * This is called when a reiser4 page gets dirtied outside of reiser4, for
+ * example, when the dirty bit is moved from pte to physical page.
+ *
+ * Tags the page in the mapping's page tree with a special tag so that it is
+ * possible to do all the reiser4 specific work wrt dirty pages (jnode creation,
+ * capturing by an atom) later, because it cannot be done in the contexts where
+ * set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+	/* this page can be unformatted only */
+	assert("vs-1734", (page->mapping &&
+			   page->mapping->host &&
+			   reiser4_get_super_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host));
+
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+
+			/* check for race with truncate */
+			if (page->mapping) {
+				assert("vs-1652", page->mapping == mapping);
+				if (mapping_cap_account_dirty(mapping))
+					inc_zone_page_state(page,
+							    NR_FILE_DIRTY);
+				radix_tree_tag_set(&mapping->page_tree,
+						   page->index,
+						   PAGECACHE_TAG_REISER4_MOVED);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		}
+	}
+	return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ *
+ */
+void reiser4_invalidatepage(struct page *page, unsigned long offset)
+{
+	int ret = 0;
+	reiser4_context *ctx;
+	struct inode *inode;
+	jnode *node;
+
+	/*
+	 * This is called to truncate a file's page.
+	 *
+	 * Originally, reiser4 implemented truncate in a standard way
+	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
+	 * first, then file system ->truncate() call-back is invoked).
+	 *
+	 * This led to the problem when ->invalidatepage() was called on a
+	 * page with a jnode that was captured into an atom in ASTAGE_PRE_COMMIT
+	 * process. That is, truncate was bypassing transactions. To avoid
+	 * this, the try_capture_page_to_invalidate() call was added here.
+	 *
+	 * After many troubles with vmtruncate() based truncate (including
+	 * races with flush, tail conversion, etc.) it was re-written in the
+	 * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+	 * and pages belonging to extent are invalidated in kill_hook_extent().
+	 * So probably now the additional call to capture is not needed here.
+	 */
+
+	assert("nikita-3137", PageLocked(page));
+	assert("nikita-3138", !PageWriteback(page));
+	inode = page->mapping->host;
+
+	/*
+	 * ->invalidatepage() should only be called for the unformatted
+	 * jnodes. Destruction of all other types of jnodes is performed
+	 * separately. But, during some corner cases (like handling errors
+	 * during mount) it is simpler to let ->invalidatepage be called on
+	 * them. Check for this, and do nothing.
+	 */
+	if (reiser4_get_super_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_cc_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+		return;
+	assert("vs-1426", PagePrivate(page));
+	assert("vs-1427",
+	       page->mapping == jnode_get_mapping(jnode_by_page(page)));
+	assert("", jprivate(page) != NULL);
+	assert("", ergo(inode_file_plugin(inode) !=
+			file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+			offset == 0));
+
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx))
+		return;
+
+	node = jprivate(page);
+	spin_lock_jnode(node);
+	if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
+			     (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+		/* there is no need to capture */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		page_clear_jnode(page, node);
+		reiser4_uncapture_jnode(node);
+		unhash_unformatted_jnode(node);
+		jput(node);
+		reiser4_exit_context(ctx);
+		return;
+	}
+	spin_unlock_jnode(node);
+
+	/* capture page being truncated. */
+	ret = try_capture_page_to_invalidate(page);
+	if (ret != 0)
+		warning("nikita-3141", "Cannot capture: %i", ret);
+
+	if (offset == 0) {
+		/* remove jnode from transaction and detach it from page. */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		/* page cannot be detached from jnode concurrently, because it
+		 * is locked */
+		reiser4_uncapture_page(page);
+
+		/* this detaches page from jnode, so that jdelete will not try
+		 * to lock page which is already locked */
+		spin_lock_jnode(node);
+		page_clear_jnode(page, node);
+		spin_unlock_jnode(node);
+		unhash_unformatted_jnode(node);
+
+		jput(node);
+	}
+
+	reiser4_exit_context(ctx);
+}
+
+/* helper function called from reiser4_releasepage(). It returns true if jnode
+ * can be detached from its page and the page released. */
+int jnode_is_releasable(jnode * node /* node to check */ )
+{
+	assert("nikita-2781", node != NULL);
+	assert_spin_locked(&(node->guard));
+	assert_spin_locked(&(node->load));
+
+	/* if some thread is currently using the jnode page, the latter cannot
+	 * be detached */
+	if (atomic_read(&node->d_count) != 0) {
+		return 0;
+	}
+
+	assert("vs-1214", !jnode_is_loaded(node));
+
+	/*
+	 * can only release page if real block number is assigned to it. Simple
+	 * check for ->atom wouldn't do, because it is possible for node to be
+	 * clean, not in atom yet, and still having fake block number. For
+	 * example, node just created in jinit_new().
+	 */
+	if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+		return 0;
+
+	/*
+	 * pages prepared for write can not be released anyway, so avoid
+	 * detaching jnode from the page
+	 */
+	if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+		return 0;
+
+	/*
+	 * dirty jnode cannot be released. It can however be submitted to disk
+	 * as part of early flushing, but only after getting flush-prepped.
+	 */
+	if (JF_ISSET(node, JNODE_DIRTY))
+		return 0;
+
+	/* overwrite set is only written by log writer. */
+	if (JF_ISSET(node, JNODE_OVRWR))
+		return 0;
+
+	/* jnode is already under writeback */
+	if (JF_ISSET(node, JNODE_WRITEBACK))
+		return 0;
+
+	/* don't flush bitmaps or journal records */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by the VM scanner when it comes across a clean page. What we
+ * have to do here is check whether the page can really be released (freed that
+ * is) and if so, detach the jnode from it and remove the page from the cache.
+ *
+ * The check for releasability is done by the jnode_is_releasable() function.
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+	jnode *node;
+
+	assert("nikita-2257", PagePrivate(page));
+	assert("nikita-2259", PageLocked(page));
+	assert("nikita-2892", !PageWriteback(page));
+	assert("nikita-3019", reiser4_schedulable());
+
+	/* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+	   is not clear what to do in this case. A lot of deadlocks seem to be
+	   possible. */
+
+	node = jnode_by_page(page);
+	assert("nikita-2258", node != NULL);
+	assert("reiser4-4", page->mapping != NULL);
+	assert("reiser4-5", page->mapping->host != NULL);
+
+	if (PageDirty(page))
+		return 0;
+
+	/* extra page reference is used by reiser4 to protect
+	 * jnode<->page link from this ->releasepage(). */
+	if (page_count(page) > 3)
+		return 0;
+
+	/* releasable() needs jnode lock, because it looks at the jnode fields
+	 * and we need jload_lock here to avoid races with jload(). */
+	spin_lock_jnode(node);
+	spin_lock(&(node->load));
+	if (jnode_is_releasable(node)) {
+		struct address_space *mapping;
+
+		mapping = page->mapping;
+		jref(node);
+		/* there is no need to synchronize against
+		 * jnode_extent_write() here, because pages seen by
+		 * jnode_extent_write() are !releasable(). */
+		page_clear_jnode(page, node);
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+
+		/* we are under memory pressure so release jnode also. */
+		jput(node);
+
+		return 1;
+	} else {
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+		assert("nikita-3020", reiser4_schedulable());
+		return 0;
+	}
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
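reiser4_set_page_dirty(), reiser4_invalidatepage() and reiser4_releasepage() above are reiser4's entry points for struct address_space_operations; the operations table that ties them together is defined elsewhere in the patch. A sketch of the wiring (field subset only, illustrative):

	/* how the methods above plug into the VFS; the real table also sets
	 * readpage, writepages and friends in another file of this patch */
	struct address_space_operations sketch_reiser4_aops = {
		.set_page_dirty = reiser4_set_page_dirty,
		.invalidatepage = reiser4_invalidatepage,
		.releasepage    = reiser4_releasepage,
	};
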
diff -urN linux-2.6.22.orig/fs/reiser4/block_alloc.c linux-2.6.22/fs/reiser4/block_alloc.c
--- linux-2.6.22.orig/fs/reiser4/block_alloc.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/block_alloc.c	2007-07-29 00:25:34.816680947 +0400
@@ -0,0 +1,1137 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+   operation will have enough disk space to flush (see flush.c and
+   http://namesys.com/v4/v4.html) and commit it once it is started.
+
+   In our design a call for reserving disk space may fail but not an actual
+   block allocation.
+
+   All free blocks, already allocated blocks, and all kinds of reserved blocks
+   are counted in different per-fs block counters.
+
+   A reiser4 super block's set of block counters currently is:
+
+   free -- free blocks,
+   used -- already allocated blocks,
+
+   grabbed -- initially reserved for performing an fs operation, those blocks
+	 are taken from free blocks, then grabbed disk space leaks from the
+	 grabbed blocks counter to other counters like "fake allocated", "flush
+	 reserved", "used"; the rest of the unused grabbed space is returned
+	 to free space at the end of the fs operation;
+
+   fake allocated -- counts all nodes without real disk block numbers assigned,
+		    we have separate accounting for formatted and unformatted
+		    nodes (for easier debugging);
+
+   flush reserved -- disk space needed for flushing and committing an atom.
+		    Each dirty already allocated block could be written as a
+		    part of the atom's overwrite set or as a part of the atom's
+		    relocate set. In both cases one additional block is needed,
+		    it is used as a wandered block if we do overwrite or as a
+		    new location for a relocated block.
+
+   In addition, blocks in some states are counted on per-thread and per-atom
+   basis. A reiser4 context has a counter of blocks grabbed by this transaction
+   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+   of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
+   blocks, which are reserved for flush processing and atom commit. */
+
+/* AN EXAMPLE: suppose we insert a new item into the reiser4 tree. We estimate
+   the number of blocks to grab for the most expensive case of balancing, when
+   the leaf node we insert the new item into gets split and a new leaf node is
+   allocated.
+
+   So, we need to grab blocks for
+
+   1) one block for possible dirtying of the node we insert an item into. That
+   block would be used for node relocation at flush time or for allocating a
+   wandered one, it depends what will be a result (what set, relocate or
+   overwrite the node gets assigned to) of the node processing by the flush
+   algorithm.
+
+   2) one block for either allocating a new node, or dirtying of right or left
+   clean neighbor, only one case may happen.
+
+   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
+   node, and creation of new node. have I forgotten something? email me.
+
+   These grabbed blocks are counted in both the reiser4 context "grabbed blocks"
+   counter and in the fs-wide one (both ctx->grabbed_blocks and
+   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
+   decremented by 2.
+
+   Suppose both blocks were spent for dirtying of an already allocated clean
+   node (one block went from "grabbed" to "flush reserved") and for new block
+   allocating (one block went from "grabbed" to "fake allocated formatted").
+
+   Inserting of a child pointer to the parent node causes the parent node to be
+   split; the balancing code takes care of this, grabbing the necessary space
+   immediately by calling reiser4_grab with the BA_RESERVED flag set, which
+   means "can use the 5% reserved disk space".
+
+   At this moment insertion completes and grabbed blocks (if they were not used)
+   should be returned to the free space counter.
+
+   However the atom life-cycle is not completed. The atom had one "flush
+   reserved" block added by our insertion and the new fake allocated node is
+   counted as a "fake allocated formatted" one. The atom has to be fully
+   processed by flush before commit. Suppose that the flush moved the first,
+   already allocated node to the atom's overwrite list, the new fake allocated
+   node, obviously, went into the atom relocate set. The reiser4 flush
+   allocates the new node using one unit from the "fake allocated formatted"
+   counter, the log writer uses one from "flush reserved" for wandered block
+   allocation.
+
+   And, it is not the end. When the wandered block is deallocated after the
+   atom gets fully played (see wander.c for term description), the disk space
+   occupied for it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it. We use these numbers for
+   indexing in hash tables, so if a block has not yet been assigned a location
+   on disk we need to give it a temporary fake block number.
+
+   Current implementation of reiser4 uses 64-bit integers for block numbers. We
+   use the highest bit in the 64-bit block number to distinguish fake and real
+   block numbers. So, only 63 bits may be used for addressing of real device
+   blocks. That "fake" block numbers space is divided into subspaces of fake
+   block numbers for data blocks and for shadow (working) bitmap blocks.
+
+   Fake block numbers for data blocks are generated by a cyclic counter, which
+   gets incremented after each real block allocation. We assume that it is
+   impossible to overload this counter during one transaction life. */
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+	memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+	/* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number. */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+	/* The reason for not simply returning the result of the '&' operation
+	   is that while the return value is a (possibly 32bit) int, the
+	   reiser4_block_nr is at least 64 bits long, and the high bit (the
+	   only possible non zero bit after the masking) would be stripped off */
+	return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
+   arithmetic. Mostly, they are isolated so as not to code the same assertions
+   in several places. */
+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	BUG_ON(ctx->grabbed_blocks < count);
+	assert("zam-527", ctx->grabbed_blocks >= count);
+	ctx->grabbed_blocks -= count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	ctx->grabbed_blocks += count;
+}
+
+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-525", sbinfo->blocks_grabbed >= count);
+	sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the counter of blocks reserved for flush in the super block. */
+static void
+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
+	sbinfo->blocks_flush_reserved -= count;
+}
+
+static void
+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+			   reiser4_ba_flags_t flags)
+{
+	if (flags & BA_FORMATTED) {
+		assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+		sbinfo->blocks_fake_allocated -= count;
+	} else {
+		assert("zam-528",
+		       sbinfo->blocks_fake_allocated_unformatted >= count);
+		sbinfo->blocks_fake_allocated_unformatted -= count;
+	}
+}
+
+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-530",
+	       sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+	sbinfo->blocks_used -= count;
+}
+
+static void
+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("edward-501", sbinfo->blocks_clustered >= count);
+	sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of blocks reserved for flush in the atom. */
+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-772", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	atom->flush_reserved += count;
+}
+
+/* Decrease the counter of blocks reserved for flush in the atom. */
+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-774", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	assert("nikita-2790", atom->flush_reserved >= count);
+	atom->flush_reserved -= count;
+}
+
+/* The super block has 6 counters: free, used, grabbed, fake allocated
+   (formatted and unformatted) and flush reserved. Their sum must be the
+   number of blocks on the device. This function checks this. */
+int reiser4_check_block_counters(const struct super_block *super)
+{
+	__u64 sum;
+
+	sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+	    reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+	    reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
+	    reiser4_clustered_blocks(super);
+	if (reiser4_block_count(super) != sum) {
+		printk("super block counters: "
+		       "used %llu, free %llu, "
+		       "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
+		       "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
+		       (unsigned long long)reiser4_data_blocks(super),
+		       (unsigned long long)reiser4_free_blocks(super),
+		       (unsigned long long)reiser4_grabbed_blocks(super),
+		       (unsigned long long)reiser4_fake_allocated(super),
+		       (unsigned long long)
+		       reiser4_fake_allocated_unformatted(super),
+		       (unsigned long long)reiser4_flush_reserved(super),
+		       (unsigned long long)reiser4_clustered_blocks(super),
+		       (unsigned long long)sum,
+		       (unsigned long long)reiser4_block_count(super));
+		return 0;
+	}
+	return 1;
+}
+
+/* Adjust the "working" free blocks counter for the number of blocks we are
+   going to allocate. Record the number of grabbed blocks in fs-wide and
+   per-thread counters. This function should be called before bitmap scanning
+   or allocating fake block numbers.
+
+   @super -- pointer to reiser4 super block;
+   @count -- number of blocks we reserve;
+
+   @return -- 0 on success; -ENOSPC if all free
+   blocks are preserved or already allocated.
+*/
+
+static int
+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
+{
+	__u64 free_blocks;
+	int ret = 0, use_reserved = flags & BA_RESERVED;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1276", ctx == get_current_context());
+
+	/* Do not grab anything on ro-mounted fs. */
+	if (rofs_super(ctx->super)) {
+		ctx->grab_enabled = 0;
+		return 0;
+	}
+
+	sbinfo = get_super_private(ctx->super);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	free_blocks = sbinfo->blocks_free;
+
+	if ((use_reserved && free_blocks < count) ||
+	    (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
+		ret = RETERR(-ENOSPC);
+		goto unlock_and_ret;
+	}
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sbinfo->blocks_grabbed += count;
+	sbinfo->blocks_free -= count;
+
+#if REISER4_DEBUG
+	if (ctx->grabbed_initially == 0)
+		ctx->grabbed_initially = count;
+#endif
+
+	assert("nikita-2986", reiser4_check_block_counters(ctx->super));
+
+	/* disable grab space in current context */
+	ctx->grab_enabled = 0;
+
+      unlock_and_ret:
+	spin_unlock_reiser4_super(sbinfo);
+
+	return ret;
+}
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
+{
+	int ret;
+	reiser4_context *ctx;
+
+	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
+				   lock_stack_isclean(get_current_lock_stack
+						      ())));
+	ctx = get_current_context();
+	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
+		return 0;
+	}
+
+	ret = reiser4_grab(ctx, count, flags);
+	if (ret == -ENOSPC) {
+
+		/* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
+		if (flags & BA_CAN_COMMIT) {
+			txnmgr_force_commit_all(ctx->super, 0);
+			ctx->grab_enabled = 1;
+			ret = reiser4_grab(ctx, count, flags);
+		}
+	}
+	/*
+	 * allocation from the reserved pool cannot fail. This is a severe error.
+	 */
+	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+	return ret;
+}
+
+/*
+ * SPACE RESERVED FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in transaction (to update stat data, at
+ * least). But we don't want rm(1) to fail with "No space on device" error.
+ *
+ * Solution is to reserve 5% of disk space for truncates and
+ * unlinks. Specifically, normal space grabbing requests don't grab space from
+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
+ * drain it. Per super block delete mutex is used to allow only one
+ * thread at a time to grab from reserved area.
+ *
+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
+ * flag.
+ *
+ */
+
+int reiser4_grab_reserved(struct super_block *super,
+			  __u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	assert("nikita-3175", flags & BA_CAN_COMMIT);
+
+	/* Check whether the delete mutex is already taken by us; we assume
+	 * that reading a machine word is atomic. */
+	if (sbinfo->delete_mutex_owner == current) {
+		if (reiser4_grab_space
+		    (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+			warning("zam-1003",
+				"nested call of grab_reserved fails count=(%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+		return 0;
+	}
+
+	if (reiser4_grab_space(count, flags)) {
+		mutex_lock(&sbinfo->delete_mutex);
+		assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+		sbinfo->delete_mutex_owner = current;
+
+		if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+			warning("zam-833",
+				"reserved space is not enough (%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+	}
+	return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+	reiser4_super_info_data *info;
+
+	info = get_super_private(super);
+	if (info->delete_mutex_owner == current) {
+		info->delete_mutex_owner = NULL;
+		mutex_unlock(&info->delete_mutex);
+	}
+}
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	/* return sbinfo locked */
+	return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointers to
+   those blocks are inserted into the tree. */
+static void grabbed2fake_allocated_formatted(void)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(1);
+	sbinfo->blocks_fake_allocated++;
+
+	assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count:
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(count);
+	sbinfo->blocks_fake_allocated_unformatted += count;
+
+	assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_clustered += count;
+
+	assert("edward-504", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_grabbed += count;
+
+	assert("edward-505", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+	add_to_ctx_grabbed(ctx, count);
+}
+
+void cluster_reserved2free(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	cluster_reserved2grabbed(count);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
+/**
+ * assign_fake_blocknr
+ * @blocknr:
+ * @count:
+ *
+ * Obtain a fake block number for a new node which will be used to refer to
+ * this newly allocated node until real allocation is done.
+ */
+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
+{
+	spin_lock(&fake_lock);
+	*blocknr = fake_gen;
+	fake_gen += count;
+	spin_unlock(&fake_lock);
+
+	BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
+	/**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
+	*blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
+	assert("zam-394", zlook(current_tree, blocknr) == NULL);
+}
+
+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
+{
+	assign_fake_blocknr(blocknr, 1);
+	grabbed2fake_allocated_formatted();
+	return 0;
+}
+
+/**
+ * fake_blocknr_unformatted
+ * @count: number of fake numbers to get
+ *
+ * Allocates @count fake block numbers which will be assigned to jnodes
+ */
+reiser4_block_nr fake_blocknr_unformatted(int count)
+{
+	reiser4_block_nr blocknr;
+
+	assign_fake_blocknr(&blocknr, count);
+	grabbed2fake_allocated_unformatted(count);
+
+	return blocknr;
+}
+
+/* adjust sb block counters, if real (on-disk) block allocation immediately
+   follows grabbing of free disk space. */
+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+			 __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2679", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
+				reiser4_ba_flags_t flags)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_fake_allocated(sbinfo, count, flags);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2680",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+static void flush_reserved2used(txn_atom * atom, __u64 count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	assert("zam-787", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	sbinfo = get_current_super_private();
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_flush_reserved(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("zam-789",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* update the per-fs blocknr hint default value. */
+void
+update_blocknr_hint_default(const struct super_block *s,
+			    const reiser4_block_nr * block)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(s);
+
+	assert("nikita-3342", !reiser4_blocknr_is_fake(block));
+
+	spin_lock_reiser4_super(sbinfo);
+	if (*block < sbinfo->block_count) {
+		sbinfo->blocknr_hint_default = *block;
+	} else {
+		warning("zam-676",
+			"block number %llu is too large to be used in a blocknr hint\n",
+			(unsigned long long)*block);
+		dump_stack();
+		DEBUGON(1);
+	}
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* get current value of the default blocknr hint. */
+void get_blocknr_hint_default(reiser4_block_nr * result)
+{
+	reiser4_super_info_data *sbinfo = get_current_super_private();
+
+	spin_lock_reiser4_super(sbinfo);
+	*result = sbinfo->blocknr_hint_default;
+	assert("zam-677", *result < sbinfo->block_count);
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* Allocate "real" disk blocks by calling a proper space allocation plugin
+ * method. Blocks are allocated in one contiguous disk region. The plugin
+ * independent part accounts blocks by subtracting the allocated amount from
+ * the grabbed or fake block counter and adding the same amount to the counter
+ * of allocated blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ *          allocation hints and parameters (search start, a stage of block
+ *          which will be mapped to disk, etc.),
+ * @blk -- an out parameter for the beginning of the allocated region,
+ * @len -- in/out parameter, it should contain the maximum number of allocated
+ *         blocks, after block allocation completes, it contains the length of
+ *         the allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+int
+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
+		     reiser4_block_nr * len, reiser4_ba_flags_t flags)
+{
+	__u64 needed = *len;
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+	int ret;
+
+	assert("zam-986", hint != NULL);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	/* For write-optimized data we use default search start value, which is
+	 * close to last write location. */
+	if (flags & BA_USE_DEFAULT_SEARCH_START) {
+		get_blocknr_hint_default(&hint->blk);
+	}
+
+	/* VITALY: allocator should grab this for internal/tx-lists/similar only. */
+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
+	if (hint->block_stage == BLOCK_NOT_COUNTED) {
+		ret = reiser4_grab_space_force(*len, flags);
+		if (ret != 0)
+			return ret;
+	}
+
+	ret =
+	    sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
+			    hint, (int)needed, blk, len);
+
+	if (!ret) {
+		assert("zam-680", *blk < reiser4_block_count(ctx->super));
+		assert("zam-681",
+		       *blk + *len <= reiser4_block_count(ctx->super));
+
+		if (flags & BA_PERMANENT) {
+			/* we assume that current atom exists at this moment */
+			txn_atom *atom = get_current_atom_locked();
+			atom->nr_blocks_allocated += *len;
+			spin_unlock_atom(atom);
+		}
+
+		switch (hint->block_stage) {
+		case BLOCK_NOT_COUNTED:
+		case BLOCK_GRABBED:
+			grabbed2used(ctx, sbinfo, *len);
+			break;
+		case BLOCK_UNALLOCATED:
+			fake_allocated2used(sbinfo, *len, flags);
+			break;
+		case BLOCK_FLUSH_RESERVED:
+			{
+				txn_atom *atom = get_current_atom_locked();
+				flush_reserved2used(atom, *len);
+				spin_unlock_atom(atom);
+			}
+			break;
+		default:
+			impossible("zam-531", "wrong block stage");
+		}
+	} else {
+		assert("zam-821",
+		       ergo(hint->max_dist == 0
+			    && !hint->backward, ret != -ENOSPC));
+		if (hint->block_stage == BLOCK_NOT_COUNTED)
+			grabbed2free(ctx, sbinfo, needed);
+	}
+
+	return ret;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/* adjust sb block counters when @count unallocated blocks get unmapped from
+   disk */
+static void
+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+		    int formatted)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	if (formatted)
+		sbinfo->blocks_fake_allocated += count;
+	else
+		sbinfo->blocks_fake_allocated_unformatted += count;
+
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+static void
+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
+		    __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
+{
+	assert("nikita-2791", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	add_to_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	/*add_to_sb_flush_reserved(sbinfo, count); */
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* disk space virtually used by fake block numbers is counted as "grabbed" again. */
+static void
+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+		       __u64 count, reiser4_ba_flags_t flags)
+{
+	add_to_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	assert("nikita-2682", reiser4_check_block_counters(ctx->super));
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
+
+	assert("nikita-2683", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	fake_allocated2grabbed(ctx, sbinfo, count, flags);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+void grabbed2free_mark(__u64 mark)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	assert("nikita-3007", (__s64) mark >= 0);
+	assert("nikita-3006", ctx->grabbed_blocks >= mark);
+	grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
+}
+
+/**
+ * grabbed2free - adjust grabbed and free block counters
+ * @ctx: context to update grabbed block counter of
+ * @sbinfo: super block to update grabbed and free block counters of
+ * @count: number of blocks to adjust counters by
+ *
+ * Decreases context's and per filesystem's counters of grabbed
+ * blocks. Increases per filesystem's counter of free blocks.
+ */
+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+		  __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_free += count;
+	assert("nikita-2684", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1095", atom);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	sub_from_ctx_grabbed(ctx, count);
+
+	add_to_atom_flush_reserved_nolock(atom, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	sub_from_sb_grabbed(sbinfo, count);
+
+	assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved(__u64 count)
+{
+	txn_atom *atom = get_current_atom_locked();
+
+	grabbed2flush_reserved_nolock(atom, count);
+
+	spin_unlock_atom(atom);
+}
+
+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	assert("nikita-2788", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_flush_reserved(sbinfo, count);
+
+	assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * all_grabbed2free - releases all blocks grabbed in context
+ *
+ * Decreases context's and super block's grabbed block counters by number of
+ * blocks grabbed by current context and increases super block's free block
+ * counter correspondingly.
+ */
+void all_grabbed2free(void)
+{
+	reiser4_context *ctx = get_current_context();
+
+	grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
+}
+
+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
+   after freeing; @count blocks become "grabbed". */
+static void
+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+	     __u64 count)
+{
+	add_to_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* this used to be done through used2grabbed and grabbed2free */
+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_free += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+#if REISER4_DEBUG
+
+/* check "allocated" state of given block range */
+static void
+reiser4_check_blocks(const reiser4_block_nr * start,
+		     const reiser4_block_nr * len, int desired)
+{
+	sa_check_blocks(start, len, desired);
+}
+
+/* check "allocated" state of given block */
+void reiser4_check_block(const reiser4_block_nr * block, int desired)
+{
+	const reiser4_block_nr one = 1;
+
+	reiser4_check_blocks(block, &one, desired);
+}
+
+#endif
+
+/* The block deallocation function may do an actual deallocation through the
+   space plugin or store deleted block numbers in the atom's delete_set data
+   structure, depending on the @defer parameter. */
+
+/* if the BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
+   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
+   freed but disk space is still grabbed by current thread, or these blocks must
+   not be counted in any reiser4 sb block counters, see block_stage_t comment */
+
+/* the BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
+   distinguish blocks allocated for unformatted and formatted nodes */
+
1547 +int
1548 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1549 + const reiser4_block_nr * len,
1550 + block_stage_t target_stage, reiser4_ba_flags_t flags)
1551 +{
1552 + txn_atom *atom = NULL;
1553 + int ret;
1554 + reiser4_context *ctx;
1555 + reiser4_super_info_data *sbinfo;
1556 +
1557 + ctx = get_current_context();
1558 + sbinfo = get_super_private(ctx->super);
1559 +
1560 + if (REISER4_DEBUG) {
1561 + assert("zam-431", *len != 0);
1562 + assert("zam-432", *start != 0);
1563 + assert("zam-558", !reiser4_blocknr_is_fake(start));
1564 +
1565 + spin_lock_reiser4_super(sbinfo);
1566 + assert("zam-562", *start < sbinfo->block_count);
1567 + spin_unlock_reiser4_super(sbinfo);
1568 + }
1569 +
1570 + if (flags & BA_DEFER) {
1571 + blocknr_set_entry *bsep = NULL;
1572 +
1573 + /* storing deleted block numbers in a blocknr set
1574 + datastructure for further actual deletion */
1575 + do {
1576 + atom = get_current_atom_locked();
1577 + assert("zam-430", atom != NULL);
1578 +
1579 + ret =
1580 + blocknr_set_add_extent(atom, &atom->delete_set,
1581 + &bsep, start, len);
1582 +
1583 + if (ret == -ENOMEM)
1584 + return ret;
1585 +
1586 + /* This loop might spin at most two times */
1587 + } while (ret == -E_REPEAT);
1588 +
1589 + assert("zam-477", ret == 0);
1590 + assert("zam-433", atom != NULL);
1591 +
1592 + spin_unlock_atom(atom);
1593 +
1594 + } else {
1595 + assert("zam-425", get_current_super_private() != NULL);
1596 + sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1597 + *start, *len);
1598 +
1599 + if (flags & BA_PERMANENT) {
1600 + /* These blocks were counted as allocated, we have to revert it
1601 + * back if allocation is discarded. */
1602 + txn_atom *atom = get_current_atom_locked();
1603 + atom->nr_blocks_allocated -= *len;
1604 + spin_unlock_atom(atom);
1605 + }
1606 +
1607 + switch (target_stage) {
1608 + case BLOCK_NOT_COUNTED:
1609 + assert("vs-960", flags & BA_FORMATTED);
1610 + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1611 + used2free(sbinfo, *len);
1612 + break;
1613 +
1614 + case BLOCK_GRABBED:
1615 + used2grabbed(ctx, sbinfo, *len);
1616 + break;
1617 +
1618 + case BLOCK_UNALLOCATED:
1619 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1620 + break;
1621 +
1622 + case BLOCK_FLUSH_RESERVED:{
1623 + txn_atom *atom;
1624 +
1625 + atom = get_current_atom_locked();
1626 + used2flush_reserved(sbinfo, atom, *len,
1627 + flags & BA_FORMATTED);
1628 + spin_unlock_atom(atom);
1629 + break;
1630 + }
1631 + default:
1632 + impossible("zam-532", "wrong block stage");
1633 + }
1634 + }
1635 +
1636 + return 0;
1637 +}
1638 +
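+/* A minimal illustrative sketch of the two deallocation modes described
+ above; it is not part of the reiser4 sources and is compiled out. The
+ wrapper name and the choice of @target_stage are assumptions. */
+#if 0
+static void dealloc_sketch(const reiser4_block_nr *start,
+ const reiser4_block_nr *len, int defer)
+{
+ if (defer)
+ /* deferred: block numbers are queued in the atom's delete_set
+ and actually freed by apply_dset() after the atom commits;
+ @target_stage is ignored in this mode */
+ reiser4_dealloc_blocks(start, len, BLOCK_NOT_COUNTED,
+ BA_DEFER | BA_FORMATTED);
+ else
+ /* immediate: space is returned to the space allocator at once;
+ BLOCK_GRABBED moves the blocks back to the "grabbed" counter */
+ reiser4_dealloc_blocks(start, len, BLOCK_GRABBED, BA_FORMATTED);
+}
+#endif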
1639 +/* wrappers for block allocator plugin methods */
1640 +int reiser4_pre_commit_hook(void)
1641 +{
1642 + assert("zam-502", get_current_super_private() != NULL);
1643 + sa_pre_commit_hook();
1644 + return 0;
1645 +}
1646 +
1647 +/* an actor which applies delete set to block allocator data */
1648 +static int
1649 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1650 + const reiser4_block_nr * b, void *data UNUSED_ARG)
1651 +{
1652 + reiser4_context *ctx;
1653 + reiser4_super_info_data *sbinfo;
1654 +
1655 + __u64 len = 1;
1656 +
1657 + ctx = get_current_context();
1658 + sbinfo = get_super_private(ctx->super);
1659 +
1660 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1661 + assert("zam-552", sbinfo != NULL);
1662 +
1663 + if (b != NULL)
1664 + len = *b;
1665 +
1666 + if (REISER4_DEBUG) {
1667 + spin_lock_reiser4_super(sbinfo);
1668 +
1669 + assert("zam-554", *a < reiser4_block_count(ctx->super));
1670 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1671 +
1672 + spin_unlock_reiser4_super(sbinfo);
1673 + }
1674 +
1675 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1676 + /* adjust sb block counters */
1677 + used2free(sbinfo, len);
1678 + return 0;
1679 +}
1680 +
1681 +void reiser4_post_commit_hook(void)
1682 +{
1683 + txn_atom *atom;
1684 +
1685 + atom = get_current_atom_locked();
1686 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1687 + spin_unlock_atom(atom);
1688 +
1689 + /* do the block deallocation which was deferred
1690 + until commit is done */
1691 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1692 +
1693 + assert("zam-504", get_current_super_private() != NULL);
1694 + sa_post_commit_hook();
1695 +}
1696 +
1697 +void reiser4_post_write_back_hook(void)
1698 +{
1699 + assert("zam-504", get_current_super_private() != NULL);
1700 +
1701 + sa_post_commit_hook();
1702 +}
1703 +
1704 +/*
1705 + Local variables:
1706 + c-indentation-style: "K&R"
1707 + mode-name: "LC"
1708 + c-basic-offset: 8
1709 + tab-width: 8
1710 + fill-column: 120
1711 + scroll-step: 1
1712 + End:
1713 +*/
1714 diff -urN linux-2.6.22.orig/fs/reiser4/block_alloc.h linux-2.6.22/fs/reiser4/block_alloc.h
1715 --- linux-2.6.22.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1716 +++ linux-2.6.22/fs/reiser4/block_alloc.h 2007-07-29 00:25:34.820681982 +0400
1717 @@ -0,0 +1,175 @@
1718 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1719 +
1720 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1721 +#define __FS_REISER4_BLOCK_ALLOC_H__
1722 +
1723 +#include "dformat.h"
1724 +#include "forward.h"
1725 +
1726 +#include <linux/types.h> /* for __u?? */
1727 +#include <linux/fs.h>
1728 +
1729 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1730 +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1731 +/* Mask which isolates the type of object this fake block number was assigned to */
1732 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1733 +
1734 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1735 +   against these two values to determine whether the object is unallocated or
1736 +   a bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
1737 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1738 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
1739 +
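+/* A minimal illustrative sketch, not part of the reiser4 sources, of how the
+ masks above classify a block number; the function name is hypothetical. */
+#if 0
+static int blocknr_kind(reiser4_block_nr blk)
+{
+ if (!(blk & REISER4_FAKE_BLOCKNR_BIT_MASK))
+ return 0; /* real, on-disk block number */
+ if ((blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+ REISER4_UNALLOCATED_STATUS_VALUE)
+ return 1; /* fake blocknr of an unallocated node */
+ return 2; /* WORKING bitmap shadow block */
+}
+#endif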
1740 +/* specification of how a block allocation was counted in sb block counters */
1741 +typedef enum {
1742 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1743 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1744 + of this block */
1745 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1746 + BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1747 + ( unallocated formatted or unformatted
1748 + node) */
1749 + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1750 + number assigned */
1751 +} block_stage_t;
1752 +
1753 +/* a hint for block allocator */
1754 +struct reiser4_blocknr_hint {
1755 + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1756 + is to prevent jnode_flush() calls from interleaving allocations on the same
1757 + bitmap, once a hint is established. */
1758 +
1759 + /* search start hint */
1760 + reiser4_block_nr blk;
1761 + /* if not zero, it is a region size we search for free blocks in */
1762 + reiser4_block_nr max_dist;
1763 + /* level for allocation; it may be useful to have branch level and
1764 + higher write-optimized. */
1765 + tree_level level;
1766 + /* block allocator assumes that blocks, which will be mapped to disk,
1767 + are in this specified block_stage */
1768 + block_stage_t block_stage;
1769 + /* If backward = 1, allocate blocks in the backward direction, from
1770 + * the end of the disk toward the beginning. */
1771 + unsigned int backward:1;
1772 +
1773 +};
1774 +
1775 +/* These flags control block allocation/deallocation behavior */
1776 +enum reiser4_ba_flags {
1777 + /* do allocations from the reserved (5%) area */
1778 + BA_RESERVED = (1 << 0),
1779 +
1780 + /* block allocator can do commit trying to recover free space */
1781 + BA_CAN_COMMIT = (1 << 1),
1782 +
1783 + /* set if the operation will be applied to a formatted block */
1784 + BA_FORMATTED = (1 << 2),
1785 +
1786 + /* defer actual block freeing until transaction commit */
1787 + BA_DEFER = (1 << 3),
1788 +
1789 + /* allocate blocks for permanent fs objects (formatted or unformatted),
1790 + not wandered or log blocks */
1791 + BA_PERMANENT = (1 << 4),
1792 +
1793 + /* grab space even if grabbing was disabled */
1794 + BA_FORCE = (1 << 5),
1795 +
1796 + /* use default start value for free blocks search. */
1797 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1798 +};
1799 +
1800 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1801 +
1802 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1803 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1804 +extern void update_blocknr_hint_default(const struct super_block *,
1805 + const reiser4_block_nr *);
1806 +extern void get_blocknr_hint_default(reiser4_block_nr *);
1807 +
1808 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1809 +
1810 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
1811 +reiser4_block_nr fake_blocknr_unformatted(int);
1812 +
1813 +/* free -> grabbed -> fake_allocated -> used */
1814 +
1815 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1816 +void all_grabbed2free(void);
1817 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1818 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1819 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1820 +void grabbed2flush_reserved(__u64 count);
1821 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1822 + reiser4_block_nr * start,
1823 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
1824 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
1825 + const reiser4_block_nr *,
1826 + block_stage_t, reiser4_ba_flags_t flags);
1827 +
1828 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1829 + reiser4_block_nr * start,
1830 + reiser4_ba_flags_t flags)
1831 +{
1832 + reiser4_block_nr one = 1;
1833 + return reiser4_alloc_blocks(hint, start, &one, flags);
1834 +}
1835 +
1836 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1837 + block_stage_t stage,
1838 + reiser4_ba_flags_t flags)
1839 +{
1840 + const reiser4_block_nr one = 1;
1841 + return reiser4_dealloc_blocks(block, &one, stage, flags);
1842 +}
1843 +
1844 +#define reiser4_grab_space_force(count, flags) \
1845 + reiser4_grab_space(count, flags | BA_FORCE)
1846 +
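+/* A minimal illustrative sketch, not part of the reiser4 sources, of the
+ free -> grabbed -> ... -> used transitions using the API above; the function
+ name is hypothetical and error handling is simplified. */
+#if 0
+static int alloc_one_block(reiser4_block_nr *blk)
+{
+ reiser4_blocknr_hint hint;
+ int ret;
+
+ /* free -> grabbed: reserve one block of free space */
+ ret = reiser4_grab_space(1, BA_CAN_COMMIT);
+ if (ret != 0)
+ return ret;
+ reiser4_blocknr_hint_init(&hint);
+ hint.block_stage = BLOCK_GRABBED;
+ /* map a real disk block; sb counters are adjusted according to
+ hint.block_stage on the way to the "used" state */
+ ret = reiser4_alloc_block(&hint, blk, BA_FORMATTED);
+ reiser4_blocknr_hint_done(&hint);
+ return ret;
+}
+#endif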
1847 +extern void grabbed2free_mark(__u64 mark);
1848 +extern int reiser4_grab_reserved(struct super_block *,
1849 + __u64, reiser4_ba_flags_t);
1850 +extern void reiser4_release_reserved(struct super_block *super);
1851 +
1852 +/* grabbed -> fake_allocated */
1853 +
1854 +/* fake_allocated -> used */
1855 +
1856 +/* used -> fake_allocated -> grabbed -> free */
1857 +
1858 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1859 +
1860 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1861 +
1862 +extern void grabbed2cluster_reserved(int count);
1863 +extern void cluster_reserved2grabbed(int count);
1864 +extern void cluster_reserved2free(int count);
1865 +
1866 +extern int reiser4_check_block_counters(const struct super_block *);
1867 +
1868 +#if REISER4_DEBUG
1869 +
1870 +extern void reiser4_check_block(const reiser4_block_nr *, int);
1871 +
1872 +#else
1873 +
1874 +# define reiser4_check_block(beg, val) noop
1875 +
1876 +#endif
1877 +
1878 +extern int reiser4_pre_commit_hook(void);
1879 +extern void reiser4_post_commit_hook(void);
1880 +extern void reiser4_post_write_back_hook(void);
1881 +
1882 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1883 +
1884 +/* Make Linus happy.
1885 + Local variables:
1886 + c-indentation-style: "K&R"
1887 + mode-name: "LC"
1888 + c-basic-offset: 8
1889 + tab-width: 8
1890 + fill-column: 120
1891 + End:
1892 +*/
1893 diff -urN linux-2.6.22.orig/fs/reiser4/blocknrset.c linux-2.6.22/fs/reiser4/blocknrset.c
1894 --- linux-2.6.22.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1895 +++ linux-2.6.22/fs/reiser4/blocknrset.c 2007-07-29 00:25:34.820681982 +0400
1896 @@ -0,0 +1,368 @@
1897 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1898 +
1899 +/* This file contains code for various block number sets used by the atom to
1900 + track the deleted set and wandered block mappings. */
1901 +
1902 +#include "debug.h"
1903 +#include "dformat.h"
1904 +#include "txnmgr.h"
1905 +#include "context.h"
1906 +
1907 +#include <linux/slab.h>
1908 +
1909 +/* The data structure for storing unordered block number sets is a list of
1910 + elements, each of which contains an array of single block numbers and/or
1911 + an array of block number pairs. Each element, called a blocknr_set_entry,
1912 + stores single block numbers from the beginning of its entries[] array and
1913 + pairs from the end. The ->nr_singles and ->nr_pairs fields count the
1914 + numbers of single blocks and pairs stored.
1915 +
1916 + +----------------- blocknr_set_entry->entries ----------------+
1917 + |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1918 + +--------------------------------------------------------------+
1919 +
1920 + When the current blocknr_set_entry is full, a new one is allocated. */
1921 +
1922 +/* Usage examples: blocknr sets are used in reiser4 for storing the atom's
1923 + * delete set (single blocks and block extents); in that case a blocknr pair
1924 + * represents an extent. The atom's wandered map is also stored as a blocknr
1925 + * set; its pairs represent a (real block) -> (wandered block) mapping. */
1926 +
1927 +/* Protection: blocknr sets belong to reiser4 atom, and
1928 + * their modifications are performed with the atom lock held */
1929 +
1930 +/* The total size of a blocknr_set_entry. */
1931 +#define BLOCKNR_SET_ENTRY_SIZE 128
1932 +
1933 +/* The number of block numbers that fit in the blocknr data area. */
1934 +#define BLOCKNR_SET_ENTRIES_NUMBER \
1935 + ((BLOCKNR_SET_ENTRY_SIZE - \
1936 + 2 * sizeof (unsigned) - \
1937 + sizeof(struct list_head)) / \
1938 + sizeof(reiser4_block_nr))
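+/* For illustration: on a 64-bit kernel, assuming a 4-byte unsigned, a
+ 16-byte struct list_head and an 8-byte reiser4_block_nr, this evaluates to
+ (128 - 8 - 16) / 8 = 13 slots per entry; each pair consumes two slots. */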
1939 +
1940 +/* An entry of the blocknr_set */
1941 +struct blocknr_set_entry {
1942 + unsigned nr_singles;
1943 + unsigned nr_pairs;
1944 + struct list_head link;
1945 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1946 +};
1947 +
1948 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
1949 +struct blocknr_pair {
1950 + reiser4_block_nr a;
1951 + reiser4_block_nr b;
1952 +};
1953 +
1954 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
1955 +/* Audited by: green(2002.06.11) */
1956 +static unsigned bse_avail(blocknr_set_entry * bse)
1957 +{
1958 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1959 +
1960 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1961 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1962 +
1963 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
1964 +}
1965 +
1966 +/* Initialize a blocknr_set_entry. */
1967 +static void bse_init(blocknr_set_entry *bse)
1968 +{
1969 + bse->nr_singles = 0;
1970 + bse->nr_pairs = 0;
1971 + INIT_LIST_HEAD(&bse->link);
1972 +}
1973 +
1974 +/* Allocate and initialize a blocknr_set_entry. */
1975 +/* Audited by: green(2002.06.11) */
1976 +static blocknr_set_entry *bse_alloc(void)
1977 +{
1978 + blocknr_set_entry *e;
1979 +
1980 + if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1981 + reiser4_ctx_gfp_mask_get())) == NULL)
1982 + return NULL;
1983 +
1984 + bse_init(e);
1985 +
1986 + return e;
1987 +}
1988 +
1989 +/* Free a blocknr_set_entry. */
1990 +/* Audited by: green(2002.06.11) */
1991 +static void bse_free(blocknr_set_entry * bse)
1992 +{
1993 + kfree(bse);
1994 +}
1995 +
1996 +/* Add a block number to a blocknr_set_entry */
1997 +/* Audited by: green(2002.06.11) */
1998 +static void
1999 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2000 +{
2001 + assert("jmacd-5099", bse_avail(bse) >= 1);
2002 +
2003 + bse->entries[bse->nr_singles++] = *block;
2004 +}
2005 +
2006 +/* Get a pair of block numbers */
2007 +/* Audited by: green(2002.06.11) */
2008 +static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2009 + unsigned pno)
2010 +{
2011 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2012 +
2013 + return (struct blocknr_pair *) (bse->entries +
2014 + BLOCKNR_SET_ENTRIES_NUMBER -
2015 + 2 * (pno + 1));
2016 +}
2017 +
2018 +/* Add a pair of block numbers to a blocknr_set_entry */
2019 +/* Audited by: green(2002.06.11) */
2020 +static void
2021 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2022 + const reiser4_block_nr * b)
2023 +{
2024 + struct blocknr_pair *pair;
2025 +
2026 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2027 +
2028 + pair = bse_get_pair(bse, bse->nr_pairs++);
2029 +
2030 + pair->a = *a;
2031 + pair->b = *b;
2032 +}
2033 +
2034 +/* Add either a block or pair of blocks to the block number set. The first
2035 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2036 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2037 + the call is made with the atom lock held. There may not be enough space in
2038 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2039 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2040 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2041 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2042 + returned with the atom unlocked for the operation to be tried again. If
2043 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2044 + used during the call, it will be freed automatically. */
2045 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2046 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2047 + const reiser4_block_nr *b)
2048 +{
2049 + blocknr_set_entry *bse;
2050 + unsigned entries_needed;
2051 +
2052 + assert("jmacd-5101", a != NULL);
2053 +
2054 + entries_needed = (b == NULL) ? 1 : 2;
2055 + if (list_empty(bset) ||
2056 + bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2057 + /* See if a bse was previously allocated. */
2058 + if (*new_bsep == NULL) {
2059 + spin_unlock_atom(atom);
2060 + *new_bsep = bse_alloc();
2061 + return (*new_bsep != NULL) ? -E_REPEAT :
2062 + RETERR(-ENOMEM);
2063 + }
2064 +
2065 + /* Put it on the head of the list. */
2066 + list_add(&((*new_bsep)->link), bset);
2067 +
2068 + *new_bsep = NULL;
2069 + }
2070 +
2071 + /* Add the single or pair. */
2072 + bse = list_entry(bset->next, blocknr_set_entry, link);
2073 + if (b == NULL) {
2074 + bse_put_single(bse, a);
2075 + } else {
2076 + bse_put_pair(bse, a, b);
2077 + }
2078 +
2079 + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2080 + if (*new_bsep != NULL) {
2081 + bse_free(*new_bsep);
2082 + *new_bsep = NULL;
2083 + }
2084 +
2085 + return 0;
2086 +}
2087 +
2088 +/* Add an extent to the block set. If the length is 1, it is treated as a
2089 + single block (e.g., reiser4_set_add_block). */
2090 +/* Audited by: green(2002.06.11) */
2091 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2092 + kmalloc might schedule. The only exception is atom spinlock, which is
2093 + properly freed. */
2094 +int
2095 +blocknr_set_add_extent(txn_atom * atom,
2096 + struct list_head * bset,
2097 + blocknr_set_entry ** new_bsep,
2098 + const reiser4_block_nr * start,
2099 + const reiser4_block_nr * len)
2100 +{
2101 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2102 + return blocknr_set_add(atom, bset, new_bsep, start,
2103 + *len == 1 ? NULL : len);
2104 +}
2105 +
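+/* A minimal illustrative caller, not part of the reiser4 sources, of the
+ allocate-and-retry protocol documented above blocknr_set_add(); the function
+ name is hypothetical. */
+#if 0
+static int add_extent_retrying(struct list_head *bset,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr *len)
+{
+ blocknr_set_entry *bsep = NULL;
+ txn_atom *atom;
+ int ret;
+
+ do {
+ /* blocknr_set_add() drops the atom lock while allocating a new
+ entry, so the atom has to be re-looked-up and re-locked */
+ atom = get_current_atom_locked();
+ ret = blocknr_set_add_extent(atom, bset, &bsep, start, len);
+ } while (ret == -E_REPEAT);
+ if (ret == 0)
+ spin_unlock_atom(atom);
+ return ret; /* 0, or -ENOMEM with the atom already unlocked */
+}
+#endif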
2106 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2107 + * by an assertion that both arguments are not null. */
2108 +/* Audited by: green(2002.06.11) */
2109 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2110 + kmalloc might schedule. The only exception is atom spinlock, which is
2111 + properly freed. */
2112 +int
2113 +blocknr_set_add_pair(txn_atom * atom,
2114 + struct list_head * bset,
2115 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2116 + const reiser4_block_nr * b)
2117 +{
2118 + assert("jmacd-5103", a != NULL && b != NULL);
2119 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2120 +}
2121 +
2122 +/* Initialize a blocknr_set. */
2123 +void blocknr_set_init(struct list_head *bset)
2124 +{
2125 + INIT_LIST_HEAD(bset);
2126 +}
2127 +
2128 +/* Release the entries of a blocknr_set. */
2129 +void blocknr_set_destroy(struct list_head *bset)
2130 +{
2131 + blocknr_set_entry *bse;
2132 +
2133 + while (!list_empty(bset)) {
2134 + bse = list_entry(bset->next, blocknr_set_entry, link);
2135 + list_del_init(&bse->link);
2136 + bse_free(bse);
2137 + }
2138 +}
2139 +
2140 +/* Merge blocknr_set entries out of @from into @into. */
2141 +/* Audited by: green(2002.06.11) */
2142 +/* Auditor comments: This merge does not know whether the merged sets contain
2143 + block pairs (as for wandered sets) or extents, so it cannot really merge
2144 + overlapping ranges if there are any. So I believe it may lead to
2145 + some blocks being present several times in one blocknr_set. To help
2146 + debug such problems it might help to check for duplicate entries on
2147 + actual processing of this set. Testing this kind of thing right here is
2148 + also complicated by the fact that these sets are not sorted, and going
2149 + through the whole set on each element addition would be a CPU-heavy task */
2150 +void blocknr_set_merge(struct list_head * from, struct list_head * into)
2151 +{
2152 + blocknr_set_entry *bse_into = NULL;
2153 +
2154 + /* If @from is empty, no work to perform. */
2155 + if (list_empty(from))
2156 + return;
2157 + /* If @into is not empty, try merging partial-entries. */
2158 + if (!list_empty(into)) {
2159 +
2160 + /* Neither set is empty; pop the front two members and try to combine them. */
2161 + blocknr_set_entry *bse_from;
2162 + unsigned into_avail;
2163 +
2164 + bse_into = list_entry(into->next, blocknr_set_entry, link);
2165 + list_del_init(&bse_into->link);
2166 + bse_from = list_entry(from->next, blocknr_set_entry, link);
2167 + list_del_init(&bse_from->link);
2168 +
2169 + /* Combine singles. */
2170 + for (into_avail = bse_avail(bse_into);
2171 + into_avail != 0 && bse_from->nr_singles != 0;
2172 + into_avail -= 1) {
2173 + bse_put_single(bse_into,
2174 + &bse_from->entries[--bse_from->
2175 + nr_singles]);
2176 + }
2177 +
2178 + /* Combine pairs. */
2179 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2180 + into_avail -= 2) {
2181 + struct blocknr_pair *pair =
2182 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2183 + bse_put_pair(bse_into, &pair->a, &pair->b);
2184 + }
2185 +
2186 + /* If bse_from is empty, delete it now. */
2187 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2188 + bse_free(bse_from);
2189 + } else {
2190 + /* Otherwise, bse_into is full or nearly full (e.g.,
2191 + it could have one slot avail and bse_from has one
2192 + pair left). Push it back onto the list. bse_from
2193 + becomes bse_into, which will be the new partial. */
2194 + list_add(&bse_into->link, into);
2195 + bse_into = bse_from;
2196 + }
2197 + }
2198 +
2199 + /* Splice lists together. */
2200 + list_splice_init(from, into->prev);
2201 +
2202 + /* Add the partial entry back to the head of the list. */
2203 + if (bse_into != NULL)
2204 + list_add(&bse_into->link, into);
2205 +}
2206 +
2207 +/* Iterate over all blocknr set elements. */
2208 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2209 + blocknr_set_actor_f actor, void *data, int delete)
2210 +{
2211 +
2212 + blocknr_set_entry *entry;
2213 +
2214 + assert("zam-429", atom != NULL);
2215 + assert("zam-430", atom_is_protected(atom));
2216 + assert("zam-431", bset != 0);
2217 + assert("zam-432", actor != NULL);
2218 +
2219 + entry = list_entry(bset->next, blocknr_set_entry, link);
2220 + while (bset != &entry->link) {
2221 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2222 + unsigned int i;
2223 + int ret;
2224 +
2225 + for (i = 0; i < entry->nr_singles; i++) {
2226 + ret = actor(atom, &entry->entries[i], NULL, data);
2227 +
2228 + /* We can't break out of the loop if the delete flag is set. */
2229 + if (ret != 0 && !delete)
2230 + return ret;
2231 + }
2232 +
2233 + for (i = 0; i < entry->nr_pairs; i++) {
2234 + struct blocknr_pair *ab;
2235 +
2236 + ab = bse_get_pair(entry, i);
2237 +
2238 + ret = actor(atom, &ab->a, &ab->b, data);
2239 +
2240 + if (ret != 0 && !delete)
2241 + return ret;
2242 + }
2243 +
2244 + if (delete) {
2245 + list_del(&entry->link);
2246 + bse_free(entry);
2247 + }
2248 +
2249 + entry = tmp;
2250 + }
2251 +
2252 + return 0;
2253 +}
2254 +
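+/* A minimal illustrative actor, not part of the reiser4 sources: counting
+ the blocks recorded in a delete set, where the second element of a pair is
+ an extent length as in apply_dset() in block_alloc.c. */
+#if 0
+static int count_blocks_actor(txn_atom *atom UNUSED_ARG,
+ const reiser4_block_nr *a UNUSED_ARG,
+ const reiser4_block_nr *b, void *data)
+{
+ __u64 *total = data;
+
+ /* a single contributes 1; a delete-set pair is (start, length) */
+ *total += (b != NULL) ? *b : 1;
+ return 0;
+}
+/* usage, with the atom locked and delete == 0 so the set is preserved:
+ blocknr_set_iterator(atom, &atom->delete_set, count_blocks_actor,
+ &total, 0); */
+#endif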
2255 +/*
2256 + * Local variables:
2257 + * c-indentation-style: "K&R"
2258 + * mode-name: "LC"
2259 + * c-basic-offset: 8
2260 + * tab-width: 8
2261 + * fill-column: 79
2262 + * scroll-step: 1
2263 + * End:
2264 + */
2265 diff -urN linux-2.6.22.orig/fs/reiser4/carry.c linux-2.6.22/fs/reiser4/carry.c
2266 --- linux-2.6.22.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2267 +++ linux-2.6.22/fs/reiser4/carry.c 2007-07-29 00:25:34.820681982 +0400
2268 @@ -0,0 +1,1391 @@
2269 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2270 +/* Functions to "carry" tree modification(s) upward. */
2271 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2272 + set of changes that need to be propagated to the next level. We manage
2273 + node locking such that any searches that collide with carrying are
2274 + restarted, from the root if necessary.
2275 +
2276 + Insertion of a new item may result in items being moved among nodes and
2277 + this requires the delimiting key to be updated at the least common parent
2278 + of the nodes modified to preserve search tree invariants. Also, insertion
2279 + may require allocation of a new node. A pointer to the new node has to be
2280 + inserted into some node on the parent level, etc.
2281 +
2282 + Tree carrying is meant to be analogous to arithmetic carrying.
2283 +
2284 + A carry operation is always associated with some node (&carry_node).
2285 +
2286 + Carry process starts with some initial set of operations to be performed
2287 + and an initial set of already locked nodes. Operations are performed one
2288 + by one. Performing each single operation has the following possible effects:
2289 +
2290 + - content of carry node associated with operation is modified
2291 + - new carry nodes are locked and involved into carry process on this level
2292 + - new carry operations are posted to the next level
2293 +
2294 + After all carry operations on this level are done, the process is repeated
2295 + for the accumulated sequence of carry operations for the next level. This
2296 + starts by trying to lock (in left to right order) all carry nodes
2297 + associated with carry operations on the parent level. After this, we decide
2298 + whether more nodes are required on the left of already locked set. If so,
2299 + all locks taken on the parent level are released, new carry nodes are
2300 + added, and locking process repeats.
2301 +
2302 + It may happen that the balancing process fails owing to an unrecoverable
2303 + error on some of the upper levels of the tree (possible causes are an io
2304 + error, failure to allocate a new node, etc.). In this case we should unmount
2305 + the filesystem, rebooting if it is the root, and possibly advise the use of fsck.
2306 +
2307 + USAGE:
2308 +
2309 + int some_tree_operation( znode *node, ... )
2310 + {
2311 + // Allocate a pool of carry objects: operations and nodes.
2312 + // Most carry processes will only take objects from here, without
2313 + // further dynamic allocation.
2314 +
2315 +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2316 +
2317 + carry_pool *pool;
2318 + carry_level *lowest_level;
2319 + carry_op *op;
2320 + pool = init_carry_pool( sizeof( *pool ) + 3 * sizeof( *lowest_level ) );
2321 + lowest_level = ( carry_level * )( pool + 1 );
2322 + init_carry_level( lowest_level, pool );
2323 +
2324 + // operation may be one of:
2325 + // COP_INSERT --- insert new item into node
2326 + // COP_CUT --- remove part of or whole node
2327 + // COP_PASTE --- increase size of item
2328 + // COP_DELETE --- delete pointer from parent node
2329 + // COP_UPDATE --- update delimiting key in least
2330 + // common ancestor of two
2331 +
2332 + op = reiser4_post_carry( lowest_level, operation, node, 0 );
2333 + if( IS_ERR( op ) || ( op == NULL ) ) {
2334 + handle error
2335 + } else {
2336 + // fill in remaining fields in @op, according to carry.h:carry_op
2337 + result = reiser4_carry( lowest_level, NULL );
2338 + }
2339 + done_carry_pool( pool );
2340 + }
2341 +
2342 + When you are implementing node plugin method that participates in carry
2343 + (shifting, insertion, deletion, etc.), do the following:
2344 +
2345 + int foo_node_method( znode *node, ..., carry_plugin_info *info )
2346 + {
2347 + carry_op *op;
2348 +
2349 + ....
2350 +
2351 + // note that the last argument to node_post_carry() is non-zero
2352 + // here, because @op is to be applied to the parent of @node, rather
2353 + // than to @node itself as in the previous case.
2354 +
2355 + op = node_post_carry( info, operation, node, 1 );
2356 + // fill in remaining fields in @op, according to carry.h:carry_op
2357 +
2358 + ....
2359 +
2360 + }
2361 +
2362 + BATCHING:
2363 +
2364 + One of the main advantages of the level-by-level balancing implemented here
2365 + is the ability to batch updates on a parent level and to perform them more
2366 + efficiently as a result.
2367 +
2368 + Description To Be Done (TBD).
2369 +
2370 + DIFFICULTIES AND SUBTLE POINTS:
2371 +
2372 + 1. complex plumbing is required, because:
2373 +
2374 + a. effective allocation through pools is needed
2375 +
2376 + b. target of operation is not exactly known when operation is
2377 + posted. This is worked around through bitfields in &carry_node and
2378 + logic in lock_carry_node()
2379 +
2380 + c. of interaction with the locking code: a node should be added into the
2381 + sibling list when the pointer to it is inserted into its parent, which is
2382 + some time after the node was created. Between these moments, the node is
2383 + in a somewhat suspended state and is only registered in the carry lists
2384 +
2385 + 2. whole balancing logic is implemented here, in particular, insertion
2386 + logic is coded in make_space().
2387 +
2388 + 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2389 + (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2390 + (insert_paste()) have to be handled.
2391 +
2392 + 4. there is non-trivial interdependency between allocation of new nodes
2393 + and almost everything else. This is mainly due to the (1.c) above. I shall
2394 + write about this later.
2395 +
2396 +*/
2397 +
2398 +#include "forward.h"
2399 +#include "debug.h"
2400 +#include "key.h"
2401 +#include "coord.h"
2402 +#include "plugin/item/item.h"
2403 +#include "plugin/item/extent.h"
2404 +#include "plugin/node/node.h"
2405 +#include "jnode.h"
2406 +#include "znode.h"
2407 +#include "tree_mod.h"
2408 +#include "tree_walk.h"
2409 +#include "block_alloc.h"
2410 +#include "pool.h"
2411 +#include "tree.h"
2412 +#include "carry.h"
2413 +#include "carry_ops.h"
2414 +#include "super.h"
2415 +#include "reiser4.h"
2416 +
2417 +#include <linux/types.h>
2418 +
2419 +/* level locking/unlocking */
2420 +static int lock_carry_level(carry_level * level);
2421 +static void unlock_carry_level(carry_level * level, int failure);
2422 +static void done_carry_level(carry_level * level);
2423 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2424 +
2425 +int lock_carry_node(carry_level * level, carry_node * node);
2426 +int lock_carry_node_tail(carry_node * node);
2427 +
2428 +/* carry processing proper */
2429 +static int carry_on_level(carry_level * doing, carry_level * todo);
2430 +
2431 +static carry_op *add_op(carry_level * level, pool_ordering order,
2432 + carry_op * reference);
2433 +
2434 +/* handlers for carry operations. */
2435 +
2436 +static void fatal_carry_error(carry_level * doing, int ecode);
2437 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2438 +
2439 +static void print_level(const char *prefix, carry_level * level);
2440 +
2441 +#if REISER4_DEBUG
2442 +typedef enum {
2443 + CARRY_TODO,
2444 + CARRY_DOING
2445 +} carry_queue_state;
2446 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2447 +#endif
2448 +
2449 +/* main entry point for tree balancing.
2450 +
2451 + Tree carry performs operations from @doing and while doing so accumulates
2452 + information about operations to be performed on the next level ("carried"
2453 + to the parent level). Carried operations are performed, causing possibly
2454 + more operations to be carried upward, etc. reiser4_carry() takes care of
2455 + locking and pinning znodes while operating on them.
2456 +
2457 + For usage, see comment at the top of fs/reiser4/carry.c
2458 +
2459 +*/
2460 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2461 + * performed */ ,
2462 + carry_level * done /* set of nodes, already performed
2463 + * at the previous level.
2464 + * NULL in most cases */)
2465 +{
2466 + int result = 0;
2467 + /* queue of new requests */
2468 + carry_level *todo;
2469 + ON_DEBUG(STORE_COUNTERS);
2470 +
2471 + assert("nikita-888", doing != NULL);
2472 + BUG_ON(done != NULL);
2473 +
2474 + todo = doing + 1;
2475 + init_carry_level(todo, doing->pool);
2476 +
2477 + /* queue of requests performed on the previous level */
2478 + done = todo + 1;
2479 + init_carry_level(done, doing->pool);
2480 +
2481 + /* iterate until there is nothing more to do */
2482 + while (result == 0 && doing->ops_num > 0) {
2483 + carry_level *tmp;
2484 +
2485 + /* at this point @done is locked. */
2486 + /* repeat lock/do/unlock while
2487 +
2488 + (1) lock_carry_level() fails due to deadlock avoidance, or
2489 +
2490 + (2) carry_on_level() decides that more nodes have to
2491 + be involved.
2492 +
2493 + (3) some unexpected error occurred while balancing on the
2494 + upper levels. In this case all changes are rolled back.
2495 +
2496 + */
2497 + while (1) {
2498 + result = lock_carry_level(doing);
2499 + if (result == 0) {
2500 + /* perform operations from @doing and
2501 + accumulate new requests in @todo */
2502 + result = carry_on_level(doing, todo);
2503 + if (result == 0)
2504 + break;
2505 + else if (result != -E_REPEAT ||
2506 + !doing->restartable) {
2507 + warning("nikita-1043",
2508 + "Fatal error during carry: %i",
2509 + result);
2510 + print_level("done", done);
2511 + print_level("doing", doing);
2512 + print_level("todo", todo);
2513 + /* do some rough stuff like aborting
2514 + all pending transcrashes and thus
2515 + pushing the tree back to a consistent
2516 + state. Alternatively, just panic.
2517 + */
2518 + fatal_carry_error(doing, result);
2519 + return result;
2520 + }
2521 + } else if (result != -E_REPEAT) {
2522 + fatal_carry_error(doing, result);
2523 + return result;
2524 + }
2525 + unlock_carry_level(doing, 1);
2526 + }
2527 + /* at this point @done can be safely unlocked */
2528 + done_carry_level(done);
2529 +
2530 + /* cyclically shift queues */
2531 + tmp = done;
2532 + done = doing;
2533 + doing = todo;
2534 + todo = tmp;
2535 + init_carry_level(todo, doing->pool);
2536 +
2537 + /* give other threads chance to run */
2538 + reiser4_preempt_point();
2539 + }
2540 + done_carry_level(done);
2541 +
2542 + /* all counters, but x_refs should remain the same. x_refs can change
2543 + owing to transaction manager */
2544 + ON_DEBUG(CHECK_COUNTERS);
2545 + return result;
2546 +}
2547 +
2548 +/* perform carry operations on given level.
2549 +
2550 + Optimizations proposed by pooh:
2551 +
2552 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2553 + required;
2554 +
2555 + (2) unlock a node if there are no more operations to be performed upon it
2556 + and the node didn't add any operation to @todo. This can be implemented by
2557 + attaching two counters to each node: a counter of operations working on
2558 + this node and a counter of operations carried upward from this node.
2559 +
2560 +*/
2561 +static int carry_on_level(carry_level * doing /* queue of carry operations to
2562 + * do on this level */ ,
2563 + carry_level * todo /* queue where new carry
2564 + * operations to be performed on
2565 + * the parent level are
2566 + * accumulated during @doing
2567 + * processing. */ )
2568 +{
2569 + int result;
2570 + int (*f) (carry_op *, carry_level *, carry_level *);
2571 + carry_op *op;
2572 + carry_op *tmp_op;
2573 +
2574 + assert("nikita-1034", doing != NULL);
2575 + assert("nikita-1035", todo != NULL);
2576 +
2577 + /* @doing->nodes are locked. */
2578 +
2579 + /* This function can be split into two phases: analysis and modification.
2580 +
2581 + Analysis calculates precisely what items should be moved between
2582 + nodes. This information is gathered in some structures attached to
2583 + each carry_node in a @doing queue. Analysis also determines whether
2584 + new nodes are to be allocated etc.
2585 +
2586 + After analysis is completed, actual modification is performed. Here
2587 + we can take advantage of "batch modification": if there are several
2588 + operations acting on the same node, modifications can be performed
2589 + more efficiently when batched together.
2590 +
2591 + Above is an optimization left for the future.
2592 + */
2593 + /* Important, but delayed optimization: it's possible to batch
2594 + operations together and perform them more efficiently as a
2595 + result. For example, deletion of several neighboring items from a
2596 + node can be converted to a single ->cut() operation.
2597 +
2598 + Before processing queue, it should be scanned and "mergeable"
2599 + operations merged.
2600 + */
2601 + result = 0;
2602 + for_all_ops(doing, op, tmp_op) {
2603 + carry_opcode opcode;
2604 +
2605 + assert("nikita-1041", op != NULL);
2606 + opcode = op->op;
2607 + assert("nikita-1042", op->op < COP_LAST_OP);
2608 + f = op_dispatch_table[op->op].handler;
2609 + result = f(op, doing, todo);
2610 + /* locking can fail with -E_REPEAT. Any different error is fatal
2611 + and will be handled by fatal_carry_error() sledgehammer.
2612 + */
2613 + if (result != 0)
2614 + break;
2615 + }
2616 + if (result == 0) {
2617 + carry_plugin_info info;
2618 + carry_node *scan;
2619 + carry_node *tmp_scan;
2620 +
2621 + info.doing = doing;
2622 + info.todo = todo;
2623 +
2624 + assert("nikita-3002",
2625 + carry_level_invariant(doing, CARRY_DOING));
2626 + for_all_nodes(doing, scan, tmp_scan) {
2627 + znode *node;
2628 +
2629 + node = reiser4_carry_real(scan);
2630 + assert("nikita-2547", node != NULL);
2631 + if (node_is_empty(node)) {
2632 + result =
2633 + node_plugin_by_node(node)->
2634 + prepare_removal(node, &info);
2635 + if (result != 0)
2636 + break;
2637 + }
2638 + }
2639 + }
2640 + return result;
2641 +}
2642 +
2643 +/* post carry operation
2644 +
2645 + This is the main function used by external carry clients: node layout
2646 + plugins and tree operations, to create a new carry operation to be
2647 + performed on some level.
2648 +
2649 + The new operation will be included in the @level queue. To actually perform
2650 + it, call reiser4_carry( level, ... ). @node must be write-locked by the
2651 + caller; carry manages all its locks by itself, don't worry about this.
2652 +
2653 + This function adds the operation and node at the end of the queue. It is up
2654 + to the caller to guarantee proper ordering of the node queue.
2655 +
2656 +*/
2657 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2658 + * is to be posted at */ ,
2659 + carry_opcode op /* opcode of operation */ ,
2660 + znode * node /* node on which this operation
2661 + * will operate */ ,
2662 + int apply_to_parent_p /* whether operation will
2663 + * operate directly on @node
2664 + * or on its parent. */)
2665 +{
2666 + carry_op *result;
2667 + carry_node *child;
2668 +
2669 + assert("nikita-1046", level != NULL);
2670 + assert("nikita-1788", znode_is_write_locked(node));
2671 +
2672 + result = add_op(level, POOLO_LAST, NULL);
2673 + if (IS_ERR(result))
2674 + return result;
2675 + child = reiser4_add_carry(level, POOLO_LAST, NULL);
2676 + if (IS_ERR(child)) {
2677 + reiser4_pool_free(&level->pool->op_pool, &result->header);
2678 + return (carry_op *) child;
2679 + }
2680 + result->node = child;
2681 + result->op = op;
2682 + child->parent = apply_to_parent_p;
2683 + if (ZF_ISSET(node, JNODE_ORPHAN))
2684 + child->left_before = 1;
2685 + child->node = node;
2686 + return result;
2687 +}
2688 +
2689 +/* initialize carry queue */
2690 +void init_carry_level(carry_level * level /* level to initialize */ ,
2691 + carry_pool * pool /* pool @level will allocate objects
2692 + * from */ )
2693 +{
2694 + assert("nikita-1045", level != NULL);
2695 + assert("nikita-967", pool != NULL);
2696 +
2697 + memset(level, 0, sizeof *level);
2698 + level->pool = pool;
2699 +
2700 + INIT_LIST_HEAD(&level->nodes);
2701 + INIT_LIST_HEAD(&level->ops);
2702 +}
2703 +
2704 +/* allocate carry pool and initialize pools within queue */
2705 +carry_pool *init_carry_pool(int size)
2706 +{
2707 + carry_pool *pool;
2708 +
2709 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2710 + pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2711 + if (pool == NULL)
2712 + return ERR_PTR(RETERR(-ENOMEM));
2713 +
2714 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2715 + (char *)pool->op);
2716 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2717 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2718 + return pool;
2719 +}
2720 +
2721 +/* finish with queue pools */
2722 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2723 +{
2724 + reiser4_done_pool(&pool->op_pool);
2725 + reiser4_done_pool(&pool->node_pool);
2726 + kfree(pool);
2727 +}
2728 +
2729 +/* add new carry node to the @level.
2730 +
2731 + Returns pointer to the new carry node allocated from pool. It's up to
2732 + callers to maintain proper order in the @level. The assumption is that if
2733 + carry nodes on one level are already sorted and modifications are performed
2734 + from left to right, carry nodes added on the parent level will be ordered
2735 + automatically. To control ordering, use the @order and @reference parameters.
2736 +
2737 +*/
2738 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2739 + * node to */ ,
2740 + pool_ordering order /* where to insert:
2741 + * at the beginning of
2742 + * @level,
2743 + * before @reference,
2744 + * after @reference,
2745 + * at the end of @level
2746 + */ ,
2747 + carry_node * reference/* reference node for
2748 + * insertion */)
2749 +{
2750 + ON_DEBUG(carry_node * orig_ref = reference);
2751 +
2752 + if (order == POOLO_BEFORE) {
2753 + reference = find_left_carry(reference, level);
2754 + if (reference == NULL)
2755 + reference = list_entry(level->nodes.next, carry_node,
2756 + header.level_linkage);
2757 + else
2758 + reference = list_entry(reference->header.level_linkage.next,
2759 + carry_node, header.level_linkage);
2760 + } else if (order == POOLO_AFTER) {
2761 + reference = find_right_carry(reference, level);
2762 + if (reference == NULL)
2763 + reference = list_entry(level->nodes.prev, carry_node,
2764 + header.level_linkage);
2765 + else
2766 + reference = list_entry(reference->header.level_linkage.prev,
2767 + carry_node, header.level_linkage);
2768 + }
2769 + assert("nikita-2209",
2770 + ergo(orig_ref != NULL,
2771 + reiser4_carry_real(reference) ==
2772 + reiser4_carry_real(orig_ref)));
2773 + return reiser4_add_carry(level, order, reference);
2774 +}
2775 +
2776 +carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2777 + * to */ ,
2778 + pool_ordering order /* where to insert: at the
2779 + * beginning of @level, before
2780 + * @reference, after @reference,
2781 + * at the end of @level */ ,
2782 + carry_node * reference /* reference node for
2783 + * insertion */ )
2784 +{
2785 + carry_node *result;
2786 +
2787 + result =
2788 + (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2789 + &level->nodes,
2790 + order, &reference->header);
2791 + if (!IS_ERR(result) && (result != NULL))
2792 + ++level->nodes_num;
2793 + return result;
2794 +}
2795 +
2796 +/* add new carry operation to the @level.
2797 +
2798 + Returns pointer to the new carry operations allocated from pool. It's up to
2799 + callers to maintain proper order in the @level. To control ordering use
2800 + @order and @reference parameters.
2801 +
2802 +*/
2803 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2804 + pool_ordering order /* where to insert: at the beginning of
2805 + * @level, before @reference, after
2806 + * @reference, at the end of @level */ ,
2807 + carry_op *
2808 + reference /* reference node for insertion */ )
2809 +{
2810 + carry_op *result;
2811 +
2812 + result =
2813 + (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2814 + order, &reference->header);
2815 + if (!IS_ERR(result) && (result != NULL))
2816 + ++level->ops_num;
2817 + return result;
2818 +}
2819 +
2820 +/* Return node on the right of which @node was created.
2821 +
2822 + Each node is created on the right of some existing node (or it is a new
2823 + root, which is a special case not handled here).
2824 +
2825 + @node is a new node created on some level, but not yet inserted into its
2826 + parent; it has the corresponding bit (JNODE_ORPHAN) set in its zstate.
2827 +
2828 +*/
2829 +static carry_node *find_begetting_brother(carry_node * node /* node to start search
2830 + * from */ ,
2831 + carry_level * kin UNUSED_ARG /* level to
2832 + * scan */ )
2833 +{
2834 + carry_node *scan;
2835 +
2836 + assert("nikita-1614", node != NULL);
2837 + assert("nikita-1615", kin != NULL);
2838 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2839 + assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2840 + ZF_ISSET(reiser4_carry_real(node),
2841 + JNODE_ORPHAN)));
2842 + for (scan = node;;
2843 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
2844 + header.level_linkage)) {
2845 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2846 + if ((scan->node != node->node) &&
2847 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2848 + assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2849 + break;
2850 + }
2851 + }
2852 + return scan;
2853 +}
2854 +
2855 +static cmp_t
2856 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2857 +{
2858 + assert("nikita-2199", n1 != NULL);
2859 + assert("nikita-2200", n2 != NULL);
2860 +
2861 + if (n1 == n2)
2862 + return EQUAL_TO;
2863 + while (1) {
2864 + n1 = carry_node_next(n1);
2865 + if (carry_node_end(level, n1))
2866 + return GREATER_THAN;
2867 + if (n1 == n2)
2868 + return LESS_THAN;
2869 + }
2870 + impossible("nikita-2201", "End of level reached");
2871 +}
2872 +
2873 +carry_node *find_carry_node(carry_level * level, const znode * node)
2874 +{
2875 + carry_node *scan;
2876 + carry_node *tmp_scan;
2877 +
2878 + assert("nikita-2202", level != NULL);
2879 + assert("nikita-2203", node != NULL);
2880 +
2881 + for_all_nodes(level, scan, tmp_scan) {
2882 + if (reiser4_carry_real(scan) == node)
2883 + return scan;
2884 + }
2885 + return NULL;
2886 +}
2887 +
2888 +znode *reiser4_carry_real(const carry_node * node)
2889 +{
2890 + assert("nikita-3061", node != NULL);
2891 +
2892 + return node->lock_handle.node;
2893 +}
2894 +
2895 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2896 + const znode * node)
2897 +{
2898 + carry_node *base;
2899 + carry_node *scan;
2900 + carry_node *tmp_scan;
2901 + carry_node *proj;
2902 +
2903 + base = find_carry_node(doing, node);
2904 + assert("nikita-2204", base != NULL);
2905 +
2906 + for_all_nodes(todo, scan, tmp_scan) {
2907 + proj = find_carry_node(doing, scan->node);
2908 + assert("nikita-2205", proj != NULL);
2909 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2910 + break;
2911 + }
2912 + return scan;
2913 +}
2914 +
2915 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2916 + znode * node)
2917 +{
2918 + carry_node *reference;
2919 +
2920 + assert("nikita-2994", doing != NULL);
2921 + assert("nikita-2995", todo != NULL);
2922 + assert("nikita-2996", node != NULL);
2923 +
2924 + reference = insert_carry_node(doing, todo, node);
2925 + assert("nikita-2997", reference != NULL);
2926 +
2927 + return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2928 +}
2929 +
2930 +/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2931 + This function is different from reiser4_post_carry() in that it finds the
2932 + proper place to insert the node in the queue. */
2933 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2934 + * passed down to node
2935 + * plugin */ ,
2936 + carry_opcode op /* opcode of operation */ ,
2937 + znode * node /* node on which this
2938 + * operation will operate */ ,
2939 + int apply_to_parent_p /* whether operation will
2940 + * operate directly on @node
2941 + * or on its parent. */ )
2942 +{
2943 + carry_op *result;
2944 + carry_node *child;
2945 +
2946 + assert("nikita-2207", info != NULL);
2947 + assert("nikita-2208", info->todo != NULL);
2948 +
2949 + if (info->doing == NULL)
2950 + return reiser4_post_carry(info->todo, op, node,
2951 + apply_to_parent_p);
2952 +
2953 + result = add_op(info->todo, POOLO_LAST, NULL);
2954 + if (IS_ERR(result))
2955 + return result;
2956 + child = add_carry_atplace(info->doing, info->todo, node);
2957 + if (IS_ERR(child)) {
2958 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2959 + return (carry_op *) child;
2960 + }
2961 + result->node = child;
2962 + result->op = op;
2963 + child->parent = apply_to_parent_p;
2964 + if (ZF_ISSET(node, JNODE_ORPHAN))
2965 + child->left_before = 1;
2966 + child->node = node;
2967 + return result;
2968 +}
2969 +
2970 +/* lock all carry nodes in @level */
2971 +static int lock_carry_level(carry_level * level /* level to lock */ )
2972 +{
2973 + int result;
2974 + carry_node *node;
2975 + carry_node *tmp_node;
2976 +
2977 + assert("nikita-881", level != NULL);
2978 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
2979 +
2980 + /* lock nodes from left to right */
2981 + result = 0;
2982 + for_all_nodes(level, node, tmp_node) {
2983 + result = lock_carry_node(level, node);
2984 + if (result != 0)
2985 + break;
2986 + }
2987 + return result;
2988 +}
2989 +
2990 +/* Synchronize delimiting keys between @node and its left neighbor.
2991 +
2992 + To reduce contention on the delimiting-key lock and simplify carry code, we
2993 + synchronize delimiting keys only when carry ultimately leaves a tree level
2994 + (carrying changes upward) and unlocks nodes at this level.
2995 +
2996 + This function first finds the left neighbor of @node and then updates the
2997 + left neighbor's right delimiting key to coincide with the least key in @node.
2998 +
2999 +*/
3000 +
3001 +ON_DEBUG(extern atomic_t delim_key_version;
3002 + )
3003 +
3004 +static void sync_dkeys(znode * spot /* node to update */ )
3005 +{
3006 + reiser4_key pivot;
3007 + reiser4_tree *tree;
3008 +
3009 + assert("nikita-1610", spot != NULL);
3010 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3011 +
3012 + tree = znode_get_tree(spot);
3013 + read_lock_tree(tree);
3014 + write_lock_dk(tree);
3015 +
3016 + assert("nikita-2192", znode_is_loaded(spot));
3017 +
3018 + /* sync left delimiting key of @spot with key in its leftmost item */
3019 + if (node_is_empty(spot))
3020 + pivot = *znode_get_rd_key(spot);
3021 + else
3022 + leftmost_key_in_node(spot, &pivot);
3023 +
3024 + znode_set_ld_key(spot, &pivot);
3025 +
3026 + /* there can be a sequence of empty nodes pending removal on the left
3027 + of @spot. Scan them and update their left and right delimiting keys
3028 + to match the left delimiting key of @spot. Also, update the right
3029 + delimiting key of the first non-empty left neighbor.
3030 + */
3031 + while (1) {
3032 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3033 + break;
3034 +
3035 + spot = spot->left;
3036 + if (spot == NULL)
3037 + break;
3038 +
3039 + znode_set_rd_key(spot, &pivot);
3040 + /* don't sink into the domain of another balancing */
3041 + if (!znode_is_write_locked(spot))
3042 + break;
3043 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3044 + znode_set_ld_key(spot, &pivot);
3045 + else
3046 + break;
3047 + }
3048 +
3049 + write_unlock_dk(tree);
3050 + read_unlock_tree(tree);
3051 +}
3052 +
3053 +/* unlock all carry nodes in @level */
3054 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3055 + int failure /* true if unlocking owing to
3056 + * failure */ )
3057 +{
3058 + carry_node *node;
3059 + carry_node *tmp_node;
3060 +
3061 + assert("nikita-889", level != NULL);
3062 +
3063 + if (!failure) {
3064 + znode *spot;
3065 +
3066 + spot = NULL;
3067 + /* update delimiting keys */
3068 + for_all_nodes(level, node, tmp_node) {
3069 + if (reiser4_carry_real(node) != spot) {
3070 + spot = reiser4_carry_real(node);
3071 + sync_dkeys(spot);
3072 + }
3073 + }
3074 + }
3075 +
3076 + /* nodes can be unlocked in arbitrary order. In a preemptible
3077 + environment it's better to unlock in the reverse order of locking,
3078 + though.
3079 + */
3080 + for_all_nodes_back(level, node, tmp_node) {
3081 + /* all allocated nodes should be already linked to their
3082 + parents at this moment. */
3083 + assert("nikita-1631",
3084 + ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3085 + JNODE_ORPHAN)));
3086 + ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3087 + unlock_carry_node(level, node, failure);
3088 + }
3089 + level->new_root = NULL;
3090 +}
3091 +
3092 +/* finish with @level
3093 +
3094 + Unlock nodes and release all allocated resources */
3095 +static void done_carry_level(carry_level * level /* level to finish */ )
3096 +{
3097 + carry_node *node;
3098 + carry_node *tmp_node;
3099 + carry_op *op;
3100 + carry_op *tmp_op;
3101 +
3102 + assert("nikita-1076", level != NULL);
3103 +
3104 + unlock_carry_level(level, 0);
3105 + for_all_nodes(level, node, tmp_node) {
3106 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3107 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3108 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3109 + }
3110 + for_all_ops(level, op, tmp_op)
3111 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3112 +}
3113 +
3114 +/* helper function to complete locking of carry node
3115 +
3116 + Finish locking of a carry node. There are several ways in which a new carry
3117 + node can be added into a carry level and locked. The normal one is through
3118 + lock_carry_node(), but it also happens from find_{left|right}_neighbor(). This
3119 + function factors out the common final part of all locking scenarios. It
3120 + assumes that @node->lock_handle is the lock handle for the lock just taken
3121 + and fills ->real_node from this lock handle.
3122 +
3123 +*/
3124 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3125 +{
3126 + assert("nikita-1052", node != NULL);
3127 + assert("nikita-1187", reiser4_carry_real(node) != NULL);
3128 + assert("nikita-1188", !node->unlock);
3129 +
3130 + node->unlock = 1;
3131 + /* Load node content into memory and install node plugin by
3132 + looking at the node header.
3133 +
3134 + Most of the time this call is cheap because the node is
3135 + already in memory.
3136 +
3137 + Corresponding zrelse() is in unlock_carry_node()
3138 + */
3139 + return zload(reiser4_carry_real(node));
3140 +}
3141 +
3142 +/* lock carry node
3143 +
3144 + "Resolve" node to real znode, lock it and mark as locked.
3145 + This requires recursive locking of znodes.
3146 +
3147 + When an operation is posted to the parent level, the node it will be applied
3148 + to is not yet known. For example, when shifting data between two nodes, the
3149 + delimiting key has to be updated in the parent or parents of the nodes
3150 + involved. But their parents are not yet locked and, moreover, said nodes can
3151 + be reparented by concurrent balancing.
3152 +
3153 + To work around this, a carry operation is applied to a special "carry node"
3154 + rather than to the znode itself. A carry node consists of some "base" or
3155 + "reference" znode and flags indicating how to get from the base to the
3156 + target of the carry operation (the ->real_node field of carry_node).
3157 +
3158 +*/
3159 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3160 + carry_node * node /* node to lock */ )
3161 +{
3162 + int result;
3163 + znode *reference_point;
3164 + lock_handle lh;
3165 + lock_handle tmp_lh;
3166 + reiser4_tree *tree;
3167 +
3168 + assert("nikita-887", level != NULL);
3169 + assert("nikita-882", node != NULL);
3170 +
3171 + result = 0;
3172 + reference_point = node->node;
3173 + init_lh(&lh);
3174 + init_lh(&tmp_lh);
3175 + if (node->left_before) {
3176 + /* handling of new nodes, allocated on the previous level:
3177 +
3178 + some carry ops were probably posted from the new node, but
3179 + this node neither has its parent pointer set, nor is it
3180 + connected. This will be done in ->create_hook() for the
3181 + internal item.
3182 +
3183 + Nonetheless, the parent of the new node has to be locked. To
3184 + do this, first go to the "left" in the carry order. This
3185 + depends on the decision to always allocate a new node on the
3186 + right of an existing one.
3187 +
3188 + Loop handles case when multiple nodes, all orphans, were
3189 + inserted.
3190 +
3191 + Strictly speaking, taking tree lock is not necessary here,
3192 + because all nodes scanned by loop in
3193 + find_begetting_brother() are write-locked by this thread,
3194 + and thus, their sibling linkage cannot change.
3195 +
3196 + */
3197 + tree = znode_get_tree(reference_point);
3198 + read_lock_tree(tree);
3199 + reference_point = find_begetting_brother(node, level)->node;
3200 + read_unlock_tree(tree);
3201 + assert("nikita-1186", reference_point != NULL);
3202 + }
3203 + if (node->parent && (result == 0)) {
3204 + result =
3205 + reiser4_get_parent(&tmp_lh, reference_point,
3206 + ZNODE_WRITE_LOCK);
3207 + if (result != 0) {
3208 + ; /* nothing */
3209 + } else if (znode_get_level(tmp_lh.node) == 0) {
3210 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3211 + result = add_new_root(level, node, tmp_lh.node);
3212 + if (result == 0) {
3213 + reference_point = level->new_root;
3214 + move_lh(&lh, &node->lock_handle);
3215 + }
3216 + } else if ((level->new_root != NULL)
3217 + && (level->new_root !=
3218 + znode_parent_nolock(reference_point))) {
3219 + /* parent of node exists, but this level already
3220 + created a different new root, so */
3221 + warning("nikita-1109",
3222 + /* it should be "radicis", but tradition is
3223 + tradition. do banshees read latin? */
3224 + "hodie natus est radici frater");
3225 + result = -EIO;
3226 + } else {
3227 + move_lh(&lh, &tmp_lh);
3228 + reference_point = lh.node;
3229 + }
3230 + }
3231 + if (node->left && (result == 0)) {
3232 + assert("nikita-1183", node->parent);
3233 + assert("nikita-883", reference_point != NULL);
3234 + result =
3235 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3236 + ZNODE_WRITE_LOCK,
3237 + GN_CAN_USE_UPPER_LEVELS);
3238 + if (result == 0) {
3239 + done_lh(&lh);
3240 + move_lh(&lh, &tmp_lh);
3241 + reference_point = lh.node;
3242 + }
3243 + }
3244 + if (!node->parent && !node->left && !node->left_before) {
3245 + result =
3246 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3247 + ZNODE_LOCK_HIPRI);
3248 + }
3249 + if (result == 0) {
3250 + move_lh(&node->lock_handle, &lh);
3251 + result = lock_carry_node_tail(node);
3252 + }
3253 + done_lh(&tmp_lh);
3254 + done_lh(&lh);
3255 + return result;
3256 +}
3257 +
3258 +/* release a lock on &carry_node.
3259 +
3260 +   Release the lock on @node if necessary. This operation is the pair of
3261 +   lock_carry_node() and is idempotent: you can call it more than once on the
3262 +   same node.
3263 +
3264 +*/
3265 +static void
3266 +unlock_carry_node(carry_level * level,
3267 + carry_node * node /* node to be released */ ,
3268 +		  int failure /* non-0 if node is unlocked due
3269 +			       * to some error */ )
3270 +{
3271 + znode *real_node;
3272 +
3273 + assert("nikita-884", node != NULL);
3274 +
3275 + real_node = reiser4_carry_real(node);
3276 + /* pair to zload() in lock_carry_node_tail() */
3277 + zrelse(real_node);
3278 + if (node->unlock && (real_node != NULL)) {
3279 + assert("nikita-899", real_node == node->lock_handle.node);
3280 + longterm_unlock_znode(&node->lock_handle);
3281 + }
3282 + if (failure) {
3283 + if (node->deallocate && (real_node != NULL)) {
3284 + /* free node in bitmap
3285 +
3286 + Prepare node for removal. Last zput() will finish
3287 + with it.
3288 + */
3289 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3290 + }
3291 + if (node->free) {
3292 + assert("nikita-2177",
3293 + list_empty_careful(&node->lock_handle.locks_link));
3294 + assert("nikita-2112",
3295 + list_empty_careful(&node->lock_handle.owners_link));
3296 + reiser4_pool_free(&level->pool->node_pool,
3297 + &node->header);
3298 + }
3299 + }
3300 +}
3301 +
3302 +/* fatal_carry_error() - all-catching error handling function
3303 +
3304 +   It is possible that carry faces an unrecoverable error, like the inability
3305 +   to insert a pointer at the internal level. Our simple solution is to just
3306 +   panic in this situation. More sophisticated things, like an attempt to
3307 +   remount the file-system read-only, can be implemented without much difficulty.
3308 +
3309 + It is believed, that:
3310 +
3311 +   1. instead of panicking, all current transactions can be aborted, rolling
3312 +   the system back to a consistent state.
3313 +
3314 +Umm, if you simply panic without doing anything more at all, then all current
3315 +transactions are aborted and the system is rolled back to a consistent state,
3316 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3317 +precise. If an internal node is corrupted on disk due to hardware failure,
3318 +then there may be no consistent state that can be rolled back to, so instead
3319 +we should say that it will roll back the transactions, which, barring other
3320 +factors, means rolling back to a consistent state.
3321 +
3322 +# Nikita: there is a subtle difference between panic and aborting
3323 +# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3324 +# not using reiser4 (not that we care about such processes), or using other
3325 +# reiser4 mounts (about them we do care), will simply continue to run. With
3326 +# some luck, even an application using the aborted file system can survive: it
3327 +# will get some error, like EBADF, from each file descriptor on the failed file
3328 +# system, but applications that do care about fault tolerance will cope with
3329 +# this (squid will).
3330 +
3331 +It would be a nice feature though to support rollback without rebooting
3332 +followed by remount, but this can wait for later versions.
3333 +
3334 +   2. once isolated transactions are implemented, it will be possible to
3335 +   roll back the offending transaction.
3336 +
3337 +2. adds additional code complexity of uncertain value (it implies that a broken tree should be kept in operation), so we must think about
3338 +it more before deciding if it should be done. -Hans
3339 +
3340 +*/
3341 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3342 + * where
3343 + * unrecoverable
3344 + * error
3345 + * occurred */ ,
3346 + int ecode /* error code */ )
3347 +{
3348 + assert("nikita-1230", doing != NULL);
3349 + assert("nikita-1231", ecode < 0);
3350 +
3351 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3352 +}
3353 +
3354 +/* add new root to the tree
3355 +
3356 +   This function itself only manages changes in carry structures and delegates
3357 +   all hard work (allocation of a znode for the new root, changes of parent and
3358 +   sibling pointers) to reiser4_add_tree_root().
3359 +
3360 + Locking: old tree root is locked by carry at this point. Fake znode is also
3361 + locked.
3362 +
3363 +*/
3364 +static int add_new_root(carry_level * level /* carry level in context of which
3365 + * operation is performed */ ,
3366 + carry_node * node /* carry node for existing root */ ,
3367 + znode * fake /* "fake" znode already locked by
3368 + * us */ )
3369 +{
3370 + int result;
3371 +
3372 + assert("nikita-1104", level != NULL);
3373 + assert("nikita-1105", node != NULL);
3374 +
3375 + assert("nikita-1403", znode_is_write_locked(node->node));
3376 + assert("nikita-1404", znode_is_write_locked(fake));
3377 +
3378 + /* trying to create new root. */
3379 + /* @node is root and it's already locked by us. This
3380 + means that nobody else can be trying to add/remove
3381 + tree root right now.
3382 + */
3383 + if (level->new_root == NULL)
3384 + level->new_root = reiser4_add_tree_root(node->node, fake);
3385 + if (!IS_ERR(level->new_root)) {
3386 + assert("nikita-1210", znode_is_root(level->new_root));
3387 + node->deallocate = 1;
3388 + result =
3389 + longterm_lock_znode(&node->lock_handle, level->new_root,
3390 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3391 + if (result == 0)
3392 + zput(level->new_root);
3393 + } else {
3394 + result = PTR_ERR(level->new_root);
3395 + level->new_root = NULL;
3396 + }
3397 + return result;
3398 +}
3399 +
3400 +/* allocate a new znode and add, to the todo level, the operation
3401 +   that inserts the pointer to it into the parent node
3402 +
3403 + Allocate new znode, add it into carry queue and post into @todo queue
3404 + request to add pointer to new node into its parent.
3405 +
3406 +   This is a carry-related routine that calls reiser4_new_node() to allocate a
3407 +   new node.
3408 +*/
3409 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3410 + * node */ ,
3411 + carry_node * ref /* carry node after which new
3412 + * carry node is to be inserted
3413 + * into queue. This affects
3414 + * locking. */ ,
3415 + carry_level * doing /* carry queue where new node is
3416 + * to be added */ ,
3417 + carry_level * todo /* carry queue where COP_INSERT
3418 + * operation to add pointer to
3419 +					 * new node will be added */ )
3420 +{
3421 + carry_node *fresh;
3422 + znode *new_znode;
3423 + carry_op *add_pointer;
3424 + carry_plugin_info info;
3425 +
3426 + assert("nikita-1048", brother != NULL);
3427 + assert("nikita-1049", todo != NULL);
3428 +
3429 +	/* There are a lot of possible variations here: to what parent the
3430 +	   new node will be attached and where. For simplicity, always
3431 + do the following:
3432 +
3433 + (1) new node and @brother will have the same parent.
3434 +
3435 + (2) new node is added on the right of @brother
3436 +
3437 + */
3438 +
3439 + fresh = reiser4_add_carry_skip(doing,
3440 + ref ? POOLO_AFTER : POOLO_LAST, ref);
3441 + if (IS_ERR(fresh))
3442 + return fresh;
3443 +
3444 + fresh->deallocate = 1;
3445 + fresh->free = 1;
3446 +
3447 + new_znode = reiser4_new_node(brother, znode_get_level(brother));
3448 + if (IS_ERR(new_znode))
3449 + /* @fresh will be deallocated automatically by error
3450 + handling code in the caller. */
3451 + return (carry_node *) new_znode;
3452 +
3453 +	/* reiser4_new_node() returned a znode with an x_count of 1. The
3454 +	   caller has to decrease it; make_space() does. */
3455 +
3456 + ZF_SET(new_znode, JNODE_ORPHAN);
3457 + fresh->node = new_znode;
3458 +
3459 + while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3460 + ref = carry_node_prev(ref);
3461 + assert("nikita-1606", !carry_node_end(doing, ref));
3462 + }
3463 +
3464 + info.todo = todo;
3465 + info.doing = doing;
3466 + add_pointer = node_post_carry(&info, COP_INSERT,
3467 + reiser4_carry_real(ref), 1);
3468 + if (IS_ERR(add_pointer)) {
3469 + /* no need to deallocate @new_znode here: it will be
3470 + deallocated during carry error handling. */
3471 + return (carry_node *) add_pointer;
3472 + }
3473 +
3474 + add_pointer->u.insert.type = COPT_CHILD;
3475 + add_pointer->u.insert.child = fresh;
3476 + add_pointer->u.insert.brother = brother;
3477 +	/* initially the new node spans an empty key range; see the worked example after this function */
3478 + write_lock_dk(znode_get_tree(brother));
3479 + znode_set_ld_key(new_znode,
3480 + znode_set_rd_key(new_znode,
3481 + znode_get_rd_key(brother)));
3482 + write_unlock_dk(znode_get_tree(brother));
3483 + return fresh;
3484 +}
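+
+/* Worked example for the delimiting keys set above (illustrative only, not
+   part of the original patch): if @brother delimits the key range [K1, K2),
+   the freshly allocated right sibling initially gets the empty range
+   [K2, K2): both its left and right delimiting keys are set to @brother's
+   right delimiting key. The range becomes non-empty only after items are
+   shifted into the new node and the posted COP_INSERT updates the parent. */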
3485 +
3486 +/* DEBUGGING FUNCTIONS.
3487 +
3488 +   Probably we should also leave them enabled even when
3489 +   debugging is turned off, to print dumps on errors.
3490 +*/
3491 +#if REISER4_DEBUG
3492 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3493 +{
3494 + carry_node *node;
3495 + carry_node *tmp_node;
3496 +
3497 + if (level == NULL)
3498 + return 0;
3499 +
3500 + if (level->track_type != 0 &&
3501 + level->track_type != CARRY_TRACK_NODE &&
3502 + level->track_type != CARRY_TRACK_CHANGE)
3503 + return 0;
3504 +
3505 + /* check that nodes are in ascending order */
3506 + for_all_nodes(level, node, tmp_node) {
3507 + znode *left;
3508 + znode *right;
3509 +
3510 + reiser4_key lkey;
3511 + reiser4_key rkey;
3512 +
3513 + if (node != carry_node_front(level)) {
3514 + if (state == CARRY_TODO) {
3515 + right = node->node;
3516 + left = carry_node_prev(node)->node;
3517 + } else {
3518 + right = reiser4_carry_real(node);
3519 + left = reiser4_carry_real(carry_node_prev(node));
3520 + }
3521 + if (right == NULL || left == NULL)
3522 + continue;
3523 + if (node_is_empty(right) || node_is_empty(left))
3524 + continue;
3525 + if (!keyle(leftmost_key_in_node(left, &lkey),
3526 + leftmost_key_in_node(right, &rkey))) {
3527 + warning("", "wrong key order");
3528 + return 0;
3529 + }
3530 + }
3531 + }
3532 + return 1;
3533 +}
3534 +#endif
3535 +
3536 +/* get symbolic name for boolean */
3537 +static const char *tf(int boolean /* truth value */ )
3538 +{
3539 + return boolean ? "t" : "f";
3540 +}
3541 +
3542 +/* symbolic name for carry operation */
3543 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3544 +{
3545 + switch (op) {
3546 + case COP_INSERT:
3547 + return "COP_INSERT";
3548 + case COP_DELETE:
3549 + return "COP_DELETE";
3550 + case COP_CUT:
3551 + return "COP_CUT";
3552 + case COP_PASTE:
3553 + return "COP_PASTE";
3554 + case COP_UPDATE:
3555 + return "COP_UPDATE";
3556 + case COP_EXTENT:
3557 + return "COP_EXTENT";
3558 + case COP_INSERT_FLOW:
3559 + return "COP_INSERT_FLOW";
3560 + default:{
3561 + /* not mt safe, but who cares? */
3562 +			static char buf[24];
3563 +
3564 + sprintf(buf, "unknown op: %x", op);
3565 + return buf;
3566 + }
3567 + }
3568 +}
3569 +
3570 +/* dump information about carry node */
3571 +static void print_carry(const char *prefix /* prefix to print */ ,
3572 + carry_node * node /* node to print */ )
3573 +{
3574 + if (node == NULL) {
3575 + printk("%s: null\n", prefix);
3576 + return;
3577 + }
3578 + printk
3579 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3580 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3581 + tf(node->free), tf(node->deallocate));
3582 +}
3583 +
3584 +/* dump information about carry operation */
3585 +static void print_op(const char *prefix /* prefix to print */ ,
3586 + carry_op * op /* operation to print */ )
3587 +{
3588 + if (op == NULL) {
3589 + printk("%s: null\n", prefix);
3590 + return;
3591 + }
3592 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3593 + print_carry("\tnode", op->node);
3594 + switch (op->op) {
3595 + case COP_INSERT:
3596 + case COP_PASTE:
3597 + print_coord("\tcoord",
3598 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3599 + reiser4_print_key("\tkey",
3600 + op->u.insert.d ? op->u.insert.d->key : NULL);
3601 + print_carry("\tchild", op->u.insert.child);
3602 + break;
3603 + case COP_DELETE:
3604 + print_carry("\tchild", op->u.delete.child);
3605 + break;
3606 + case COP_CUT:
3607 + if (op->u.cut_or_kill.is_cut) {
3608 + print_coord("\tfrom",
3609 + op->u.cut_or_kill.u.kill->params.from, 0);
3610 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3611 + 0);
3612 + } else {
3613 + print_coord("\tfrom",
3614 + op->u.cut_or_kill.u.cut->params.from, 0);
3615 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3616 + 0);
3617 + }
3618 + break;
3619 + case COP_UPDATE:
3620 + print_carry("\tleft", op->u.update.left);
3621 + break;
3622 + default:
3623 + /* do nothing */
3624 + break;
3625 + }
3626 +}
3627 +
3628 +/* dump information about all nodes and operations in a @level */
3629 +static void print_level(const char *prefix /* prefix to print */ ,
3630 + carry_level * level /* level to print */ )
3631 +{
3632 + carry_node *node;
3633 + carry_node *tmp_node;
3634 + carry_op *op;
3635 + carry_op *tmp_op;
3636 +
3637 + if (level == NULL) {
3638 + printk("%s: null\n", prefix);
3639 + return;
3640 + }
3641 + printk("%s: %p, restartable: %s\n",
3642 + prefix, level, tf(level->restartable));
3643 +
3644 + for_all_nodes(level, node, tmp_node)
3645 + print_carry("\tcarry node", node);
3646 + for_all_ops(level, op, tmp_op)
3647 + print_op("\tcarry op", op);
3648 +}
3649 +
3650 +/* Make Linus happy.
3651 + Local variables:
3652 + c-indentation-style: "K&R"
3653 + mode-name: "LC"
3654 + c-basic-offset: 8
3655 + tab-width: 8
3656 + fill-column: 120
3657 + scroll-step: 1
3658 + End:
3659 +*/
3660 diff -urN linux-2.6.22.orig/fs/reiser4/carry.h linux-2.6.22/fs/reiser4/carry.h
3661 --- linux-2.6.22.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3662 +++ linux-2.6.22/fs/reiser4/carry.h 2007-07-29 00:25:34.824683017 +0400
3663 @@ -0,0 +1,442 @@
3664 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3665 +
3666 +/* Functions and data types to "carry" tree modification(s) upward.
3667 + See fs/reiser4/carry.c for details. */
3668 +
3669 +#if !defined( __FS_REISER4_CARRY_H__ )
3670 +#define __FS_REISER4_CARRY_H__
3671 +
3672 +#include "forward.h"
3673 +#include "debug.h"
3674 +#include "pool.h"
3675 +#include "znode.h"
3676 +
3677 +#include <linux/types.h>
3678 +
3679 +/* &carry_node - "location" of carry node.
3680 +
3681 + "location" of node that is involved or going to be involved into
3682 + carry process. Node where operation will be carried to on the
3683 + parent level cannot be recorded explicitly. Operation will be carried
3684 + usually to the parent of some node (where changes are performed at
3685 + the current level) or, to the left neighbor of its parent. But while
3686 + modifications are performed at the current level, parent may
3687 + change. So, we have to allow some indirection (or, positevly,
3688 + flexibility) in locating carry nodes.
3689 +
3690 +*/
3691 +typedef struct carry_node {
3692 + /* pool linkage */
3693 + struct reiser4_pool_header header;
3694 +
3695 + /* base node from which real_node is calculated. See
3696 + fs/reiser4/carry.c:lock_carry_node(). */
3697 + znode *node;
3698 +
3699 + /* how to get ->real_node */
3700 + /* to get ->real_node obtain parent of ->node */
3701 + __u32 parent:1;
3702 + /* to get ->real_node obtain left neighbor of parent of
3703 + ->node */
3704 + __u32 left:1;
3705 + __u32 left_before:1;
3706 +
3707 + /* locking */
3708 +
3709 + /* this node was locked by carry process and should be
3710 + unlocked when carry leaves a level */
3711 + __u32 unlock:1;
3712 +
3713 + /* disk block for this node was allocated by carry process and
3714 + should be deallocated when carry leaves a level */
3715 + __u32 deallocate:1;
3716 + /* this carry node was allocated by carry process and should be
3717 + freed when carry leaves a level */
3718 + __u32 free:1;
3719 +
3720 + /* type of lock we want to take on this node */
3721 + lock_handle lock_handle;
3722 +} carry_node;
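+
+/* Illustrative sketch (not part of the patch) of how the location bits
+   above are interpreted. The real resolution is performed by
+   lock_carry_node() in fs/reiser4/carry.c; the helpers used here are
+   hypothetical:
+
+	znode *resolve_carry_node(const carry_node *n)
+	{
+		znode *ref = n->node;
+
+		if (n->left_before)
+			/* new orphan node: walk left in carry order first */
+			ref = begetting_brother_of(ref);
+		if (n->parent)
+			/* target is the parent of ->node */
+			ref = parent_of(ref);
+		if (n->left)
+			/* ... or the left neighbor of that parent */
+			ref = left_neighbor_of(ref);
+		return ref;
+	}
+*/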
3723 +
3724 +/* &carry_opcode - elementary operations that can be carried upward
3725 +
3726 + Operations that carry() can handle. This list is supposed to be
3727 + expanded.
3728 +
3729 + Each carry operation (cop) is handled by appropriate function defined
3730 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
3731 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3732 + call plugins of nodes affected by operation to modify nodes' content
3733 + and to gather operations to be performed on the next level.
3734 +
3735 +*/
3736 +typedef enum {
3737 + /* insert new item into node. */
3738 + COP_INSERT,
3739 + /* delete pointer from parent node */
3740 + COP_DELETE,
3741 + /* remove part of or whole node. */
3742 + COP_CUT,
3743 + /* increase size of item. */
3744 + COP_PASTE,
3745 + /* insert extent (that is sequence of unformatted nodes). */
3746 + COP_EXTENT,
3747 + /* update delimiting key in least common ancestor of two
3748 + nodes. This is performed when items are moved between two
3749 + nodes.
3750 + */
3751 + COP_UPDATE,
3752 + /* insert flow */
3753 + COP_INSERT_FLOW,
3754 + COP_LAST_OP,
3755 +} carry_opcode;
3756 +
3757 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
3758 +
3759 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3760 + item is determined. */
3761 +typedef enum {
3762 + /* target item is one containing pointer to the ->child node */
3763 + COPT_CHILD,
3764 + /* target item is given explicitly by @coord */
3765 + COPT_ITEM_DATA,
3766 + /* target item is given by key */
3767 + COPT_KEY,
3768 + /* see insert_paste_common() for more comments on this. */
3769 + COPT_PASTE_RESTARTED,
3770 +} cop_insert_pos_type;
3771 +
3772 +/* flags to cut and delete */
3773 +typedef enum {
3774 + /* don't kill node even if it became completely empty as results of
3775 + * cut. This is needed for eottl handling. See carry_extent() for
3776 + * details. */
3777 + DELETE_RETAIN_EMPTY = (1 << 0)
3778 +} cop_delete_flag;
3779 +
3780 +/*
3781 + * carry() implements "lock handle tracking" feature.
3782 + *
3783 + * Callers supply carry with a node where the initial operation is to be
3784 + * performed and a lock handle on this node. Trying to optimize node
3785 + * utilization, carry may actually move the insertion point to a different
3786 + * node. Callers expect that the lock handle will be transferred to the new node as well (see the usage sketch after the enum below).
3787 + *
3788 + */
3789 +typedef enum {
3790 + /* transfer lock handle along with insertion point */
3791 + CARRY_TRACK_CHANGE = 1,
3792 + /* acquire new lock handle to the node where insertion point is. This
3793 + * is used when carry() client doesn't initially possess lock handle
3794 + * on the insertion point node, for example, by extent insertion
3795 + * code. See carry_extent(). */
3796 + CARRY_TRACK_NODE = 2
3797 +} carry_track_type;
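+
+/* Minimal usage sketch for the tracking modes above (illustrative, not part
+   of the patch; error handling omitted). A caller that wants its lock handle
+   to follow the insertion point would do something like:
+
+	carry_level *doing = ...;	// current carry queue, already set up
+	lock_handle *lh = ...;		// lock handle the caller owns
+
+	doing->track_type = CARRY_TRACK_CHANGE;
+	doing->tracked = lh;
+	result = reiser4_carry(doing, NULL);
+	// on success, @lh now locks whatever node the insertion point ended
+	// up in, even if carry moved it to a neighbor or a new node
+*/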
3798 +
3799 +/* data supplied to COP_{INSERT|PASTE} by callers */
3800 +typedef struct carry_insert_data {
3801 + /* position where new item is to be inserted */
3802 + coord_t *coord;
3803 + /* new item description */
3804 + reiser4_item_data *data;
3805 + /* key of new item */
3806 + const reiser4_key *key;
3807 +} carry_insert_data;
3808 +
3809 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3810 +struct cut_kill_params {
3811 + /* coord where cut starts (inclusive) */
3812 + coord_t *from;
3813 + /* coord where cut stops (inclusive, this item/unit will also be
3814 + * cut) */
3815 + coord_t *to;
3816 + /* starting key. This is necessary when item and unit pos don't
3817 + * uniquely identify what portion or tree to remove. For example, this
3818 + * indicates what portion of extent unit will be affected. */
3819 + const reiser4_key *from_key;
3820 + /* exclusive stop key */
3821 + const reiser4_key *to_key;
3822 + /* if this is not NULL, smallest actually removed key is stored
3823 + * here. */
3824 + reiser4_key *smallest_removed;
3825 + /* kill_node_content() is called for file truncate */
3826 + int truncate;
3827 +};
3828 +
3829 +struct carry_cut_data {
3830 + struct cut_kill_params params;
3831 +};
3832 +
3833 +struct carry_kill_data {
3834 + struct cut_kill_params params;
3835 + /* parameter to be passed to the ->kill_hook() method of item
3836 + * plugin */
3837 + /*void *iplug_params; *//* FIXME: unused currently */
3838 + /* if not NULL---inode whose items are being removed. This is needed
3839 + * for ->kill_hook() of extent item to update VM structures when
3840 + * removing pages. */
3841 + struct inode *inode;
3842 + /* sibling list maintenance is complicated by existence of eottl. When
3843 + * eottl whose left and right neighbors are formatted leaves is
3844 + * removed, one has to connect said leaves in the sibling list. This
3845 + * cannot be done when extent removal is just started as locking rules
3846 + * require sibling list update to happen atomically with removal of
3847 + * extent item. Therefore: 1. pointers to left and right neighbors
3848 + * have to be passed down to the ->kill_hook() of extent item, and
3849 + * 2. said neighbors have to be locked. */
3850 + lock_handle *left;
3851 + lock_handle *right;
3852 + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3853 + unsigned flags;
3854 + char *buf;
3855 +};
3856 +
3857 +/* &carry_tree_op - operation to "carry" upward.
3858 +
3859 + Description of an operation we want to "carry" to the upper level of
3860 + a tree: e.g, when we insert something and there is not enough space
3861 + we allocate a new node and "carry" the operation of inserting a
3862 + pointer to the new node to the upper level, on removal of empty node,
3863 + we carry up operation of removing appropriate entry from parent.
3864 +
3865 +   There are two types of carry ops: when adding or deleting a node, the
3866 +   node at the parent level where the appropriate modification has to be
3867 +   performed is known in advance. When shifting items between nodes
3868 +   (split, merge), the delimiting key should be changed in the least common
3869 +   parent of the nodes involved, which is not known in advance.
3870 +
3871 +   For operations of the first type we store in &carry_op a pointer to
3872 +   the &carry_node at the parent level. For operations of the second
3873 +   type we store a &carry_node for the parents of the left and right nodes
3874 +   modified, and keep track of them upward until they coincide.
3875 +
3876 +*/
3877 +typedef struct carry_op {
3878 + /* pool linkage */
3879 + struct reiser4_pool_header header;
3880 + carry_opcode op;
3881 + /* node on which operation is to be performed:
3882 +
3883 + for insert, paste: node where new item is to be inserted
3884 +
3885 + for delete: node where pointer is to be deleted
3886 +
3887 + for cut: node to cut from
3888 +
3889 + for update: node where delimiting key is to be modified
3890 +
3891 + for modify: parent of modified node
3892 +
3893 + */
3894 + carry_node *node;
3895 + union {
3896 + struct {
3897 + /* (sub-)type of insertion/paste. Taken from
3898 + cop_insert_pos_type. */
3899 + __u8 type;
3900 + /* various operation flags. Taken from
3901 + cop_insert_flag. */
3902 + __u8 flags;
3903 + carry_insert_data *d;
3904 + carry_node *child;
3905 + znode *brother;
3906 + } insert, paste, extent;
3907 +
3908 + struct {
3909 + int is_cut;
3910 + union {
3911 + carry_kill_data *kill;
3912 + carry_cut_data *cut;
3913 + } u;
3914 + } cut_or_kill;
3915 +
3916 + struct {
3917 + carry_node *left;
3918 + } update;
3919 + struct {
3920 + /* changed child */
3921 + carry_node *child;
3922 + /* bitmask of changes. See &cop_modify_flag */
3923 + __u32 flag;
3924 + } modify;
3925 + struct {
3926 + /* flags to deletion operation. Are taken from
3927 + cop_delete_flag */
3928 + __u32 flags;
3929 + /* child to delete from parent. If this is
3930 + NULL, delete op->node. */
3931 + carry_node *child;
3932 + } delete;
3933 + struct {
3934 + /* various operation flags. Taken from
3935 + cop_insert_flag. */
3936 + __u32 flags;
3937 + flow_t *flow;
3938 + coord_t *insert_point;
3939 + reiser4_item_data *data;
3940 +			/* flow insertion is limited by the number of new blocks
3941 +			   added in that operation which receive no data other
3942 +			   than part of the flow. This limit is set by the macro
3943 +			   CARRY_FLOW_NEW_NODES_LIMIT. This field stores the
3944 +			   number of nodes already added during one carry_flow */
3945 + int new_nodes;
3946 + } insert_flow;
3947 + } u;
3948 +} carry_op;
3949 +
3950 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3951 +typedef struct carry_pool {
3952 + carry_op op[CARRIES_POOL_SIZE];
3953 + struct reiser4_pool op_pool;
3954 + carry_node node[NODES_LOCKED_POOL_SIZE];
3955 + struct reiser4_pool node_pool;
3956 +} carry_pool;
3957 +
3958 +/* &carry_tree_level - carry process on given level
3959 +
3960 + Description of balancing process on the given level.
3961 +
3962 +   No need for locking here, as carry_tree_level is essentially a
3963 +   per-thread thing (for now).
3964 +
3965 +*/
3966 +struct carry_level {
3967 + /* this level may be restarted */
3968 + __u32 restartable:1;
3969 + /* list of carry nodes on this level, ordered by key order */
3970 + struct list_head nodes;
3971 + struct list_head ops;
3972 + /* pool where new objects are allocated from */
3973 + carry_pool *pool;
3974 + int ops_num;
3975 + int nodes_num;
3976 + /* new root created on this level, if any */
3977 + znode *new_root;
3978 +	/* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
3979 + when they want ->tracked to automagically wander to the node where
3980 + insertion point moved after insert or paste.
3981 + */
3982 + carry_track_type track_type;
3983 + /* lock handle supplied by user that we are tracking. See
3984 + above. */
3985 + lock_handle *tracked;
3986 +};
3987 +
3988 +/* information carry passes to plugin methods that may add new operations to
3989 + the @todo queue */
3990 +struct carry_plugin_info {
3991 + carry_level *doing;
3992 + carry_level *todo;
3993 +};
3994 +
3995 +int reiser4_carry(carry_level * doing, carry_level * done);
3996 +
3997 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
3998 + carry_node * reference);
3999 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4000 + carry_node * reference);
4001 +
4002 +extern carry_node *insert_carry_node(carry_level * doing,
4003 + carry_level * todo, const znode * node);
4004 +
4005 +extern carry_pool *init_carry_pool(int);
4006 +extern void done_carry_pool(carry_pool * pool);
4007 +
4008 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4009 +
4010 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4011 + znode * node, int apply_to_parent);
4012 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4013 + znode * node, int apply_to_parent_p);
4014 +
4015 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4016 + carry_level * doing, carry_level * todo);
4017 +
4018 +carry_node *find_carry_node(carry_level * level, const znode * node);
4019 +
4020 +extern znode *reiser4_carry_real(const carry_node * node);
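+
+/* Typical lifecycle of the interface declared above (a sketch following the
+   calling conventions of the tree code, not a verbatim copy of any caller):
+
+	carry_pool *pool;
+	carry_level *lowest_level;
+	carry_op *op;
+
+	// room for the "doing", "todo" and "done" levels after the pool
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	lowest_level = (carry_level *) (pool + 1);
+	init_carry_level(lowest_level, pool);
+	op = reiser4_post_carry(lowest_level, COP_INSERT, node, 0);
+	if (!IS_ERR(op) && op != NULL) {
+		... fill in op->u.insert ...
+		result = reiser4_carry(lowest_level, NULL);
+	}
+	done_carry_pool(pool);
+*/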
4021 +
4022 +/* helper macros to iterate over carry queues */
4023 +
4024 +#define carry_node_next( node ) \
4025 + list_entry((node)->header.level_linkage.next, carry_node, \
4026 + header.level_linkage)
4027 +
4028 +#define carry_node_prev( node ) \
4029 + list_entry((node)->header.level_linkage.prev, carry_node, \
4030 + header.level_linkage)
4031 +
4032 +#define carry_node_front( level ) \
4033 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4034 +
4035 +#define carry_node_back( level ) \
4036 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4037 +
4038 +#define carry_node_end( level, node ) \
4039 + (&(level)->nodes == &(node)->header.level_linkage)
4040 +
4041 +/* macro to iterate over all operations in a @level */
4042 +#define for_all_ops( level /* carry level (of type carry_level *) */, \
4043 + op /* pointer to carry operation, modified by loop (of \
4044 + * type carry_op *) */, \
4045 + tmp /* pointer to carry operation (of type carry_op *), \
4046 + * used to make iterator stable in the face of \
4047 + * deletions from the level */ ) \
4048 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4049 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4050 + &op->header.level_linkage != &level->ops; \
4051 + op = tmp, \
4052 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4053 +
4054 +#if 0
4055 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4056 + tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4057 + ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4058 + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4059 +#endif
4060 +
4061 +/* macro to iterate over all nodes in a @level */
4062 +#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4063 +		       node /* pointer to carry node, modified by loop (of \
4064 +			     * type carry_node *) */, \
4065 +		       tmp /* pointer to carry node (of type carry_node *), \
4066 +			    * used to make iterator stable in the face of \
4067 +			    * deletions from the level */ ) \
4068 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4069 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4070 + &node->header.level_linkage != &level->nodes; \
4071 + node = tmp, \
4072 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4073 +
4074 +#if 0
4075 +for( node = carry_node_front( level ), \
4076 + tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4077 + node = tmp, tmp = carry_node_next( node ) )
4078 +#endif
4079 +
4080 +/* macro to iterate over all nodes in a @level in reverse order
4081 +
4082 +   This is used because nodes are unlocked in the reverse of the locking order (see the sketch after this macro) */
4083 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4084 + node /* pointer to carry node, modified by loop \
4085 + * (of type carry_node *) */, \
4086 + tmp /* pointer to carry node (of type carry_node \
4087 + * *), used to make iterator stable in the \
4088 + * face of deletions from the level */ ) \
4089 +for( node = carry_node_back( level ), \
4090 + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4091 + node = tmp, tmp = carry_node_prev( node ) )
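+
+/* Sketch of the intended consumer of for_all_nodes_back() (illustrative;
+   the real user is the level-unlocking code in fs/reiser4/carry.c). Nodes
+   are released in the reverse of the locking order:
+
+	static void unlock_whole_level(carry_level *level, int failure)
+	{
+		carry_node *node;
+		carry_node *tmp;
+
+		for_all_nodes_back(level, node, tmp)
+			unlock_carry_node(level, node, failure);
+	}
+*/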
4092 +
4093 +/* __FS_REISER4_CARRY_H__ */
4094 +#endif
4095 +
4096 +/* Make Linus happy.
4097 + Local variables:
4098 + c-indentation-style: "K&R"
4099 + mode-name: "LC"
4100 + c-basic-offset: 8
4101 + tab-width: 8
4102 + fill-column: 120
4103 + scroll-step: 1
4104 + End:
4105 +*/
4106 diff -urN linux-2.6.22.orig/fs/reiser4/carry_ops.c linux-2.6.22/fs/reiser4/carry_ops.c
4107 --- linux-2.6.22.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4108 +++ linux-2.6.22/fs/reiser4/carry_ops.c 2007-07-29 00:25:34.828684053 +0400
4109 @@ -0,0 +1,2131 @@
4110 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4111 +
4112 +/* implementation of carry operations */
4113 +
4114 +#include "forward.h"
4115 +#include "debug.h"
4116 +#include "key.h"
4117 +#include "coord.h"
4118 +#include "plugin/item/item.h"
4119 +#include "plugin/node/node.h"
4120 +#include "jnode.h"
4121 +#include "znode.h"
4122 +#include "block_alloc.h"
4123 +#include "tree_walk.h"
4124 +#include "pool.h"
4125 +#include "tree_mod.h"
4126 +#include "carry.h"
4127 +#include "carry_ops.h"
4128 +#include "tree.h"
4129 +#include "super.h"
4130 +#include "reiser4.h"
4131 +
4132 +#include <linux/types.h>
4133 +#include <linux/err.h>
4134 +
4135 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4136 + carry_level * doing, carry_level * todo,
4137 + unsigned int including_insert_coord_p);
4138 +
4139 +extern int lock_carry_node(carry_level * level, carry_node * node);
4140 +extern int lock_carry_node_tail(carry_node * node);
4141 +
4142 +/* find left neighbor of a carry node
4143 +
4144 + Look for left neighbor of @node and add it to the @doing queue. See
4145 + comments in the body.
4146 +
4147 +*/
4148 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4149 + * neighbor of */ ,
4150 + carry_level * doing /* level to scan */ )
4151 +{
4152 + int result;
4153 + carry_node *node;
4154 + carry_node *left;
4155 + int flags;
4156 + reiser4_tree *tree;
4157 +
4158 + node = op->node;
4159 +
4160 + tree = current_tree;
4161 + read_lock_tree(tree);
4162 + /* first, check whether left neighbor is already in a @doing queue */
4163 + if (reiser4_carry_real(node)->left != NULL) {
4164 + /* NOTE: there is locking subtlety here. Look into
4165 + * find_right_neighbor() for more info */
4166 + if (find_carry_node(doing,
4167 + reiser4_carry_real(node)->left) != NULL) {
4168 + read_unlock_tree(tree);
4169 + left = node;
4170 + do {
4171 + left = list_entry(left->header.level_linkage.prev,
4172 + carry_node, header.level_linkage);
4173 + assert("nikita-3408", !carry_node_end(doing,
4174 + left));
4175 + } while (reiser4_carry_real(left) ==
4176 + reiser4_carry_real(node));
4177 + return left;
4178 + }
4179 + }
4180 + read_unlock_tree(tree);
4181 +
4182 + left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4183 + if (IS_ERR(left))
4184 + return left;
4185 +
4186 + left->node = node->node;
4187 + left->free = 1;
4188 +
4189 + flags = GN_TRY_LOCK;
4190 +	if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4191 + flags |= GN_NO_ALLOC;
4192 +
4193 + /* then, feeling lucky, peek left neighbor in the cache. */
4194 + result = reiser4_get_left_neighbor(&left->lock_handle,
4195 + reiser4_carry_real(node),
4196 + ZNODE_WRITE_LOCK, flags);
4197 + if (result == 0) {
4198 + /* ok, node found and locked. */
4199 + result = lock_carry_node_tail(left);
4200 + if (result != 0)
4201 + left = ERR_PTR(result);
4202 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4203 + /* node is leftmost node in a tree, or neighbor wasn't in
4204 + cache, or there is an extent on the left. */
4205 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4206 + left = NULL;
4207 + } else if (doing->restartable) {
4208 + /* if left neighbor is locked, and level is restartable, add
4209 + new node to @doing and restart. */
4210 + assert("nikita-913", node->parent != 0);
4211 + assert("nikita-914", node->node != NULL);
4212 + left->left = 1;
4213 + left->free = 0;
4214 + left = ERR_PTR(-E_REPEAT);
4215 + } else {
4216 + /* left neighbor is locked, level cannot be restarted. Just
4217 + ignore left neighbor. */
4218 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4219 + left = NULL;
4220 + }
4221 + return left;
4222 +}
4223 +
4224 +/* find right neighbor of a carry node
4225 +
4226 + Look for right neighbor of @node and add it to the @doing queue. See
4227 + comments in the body.
4228 +
4229 +*/
4230 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4231 + * neighbor of */ ,
4232 + carry_level * doing /* level to scan */ )
4233 +{
4234 + int result;
4235 + carry_node *node;
4236 + carry_node *right;
4237 + lock_handle lh;
4238 + int flags;
4239 + reiser4_tree *tree;
4240 +
4241 + init_lh(&lh);
4242 +
4243 + node = op->node;
4244 +
4245 + tree = current_tree;
4246 + read_lock_tree(tree);
4247 + /* first, check whether right neighbor is already in a @doing queue */
4248 + if (reiser4_carry_real(node)->right != NULL) {
4249 + /*
4250 +		 * Tree lock is taken here anyway, because, even if the
4251 +		 * _outcome_ of (find_carry_node() != NULL) doesn't depend on
4252 +		 * concurrent updates to ->right, find_carry_node() cannot
4253 +		 * work with a second argument of NULL. Hence, the following
4254 +		 * comment is of historic importance only.
4255 + *
4256 + * Subtle:
4257 + *
4258 + * Q: why don't we need tree lock here, looking for the right
4259 + * neighbor?
4260 + *
4261 + * A: even if value of node->real_node->right were changed
4262 + * during find_carry_node() execution, outcome of execution
4263 + * wouldn't change, because (in short) other thread cannot add
4264 + * elements to the @doing, and if node->real_node->right
4265 + * already was in @doing, value of node->real_node->right
4266 + * couldn't change, because node cannot be inserted between
4267 + * locked neighbors.
4268 + */
4269 + if (find_carry_node(doing,
4270 + reiser4_carry_real(node)->right) != NULL) {
4271 + read_unlock_tree(tree);
4272 + /*
4273 +			 * What we are doing here also applies to
4274 +			 * find_left_neighbor().
4275 + *
4276 + * tree_walk.c code requires that insertion of a
4277 + * pointer to a child, modification of parent pointer
4278 + * in the child, and insertion of the child into
4279 + * sibling list are atomic (see
4280 + * plugin/item/internal.c:create_hook_internal()).
4281 + *
4282 + * carry allocates new node long before pointer to it
4283 + * is inserted into parent and, actually, long before
4284 + * parent is even known. Such allocated-but-orphaned
4285 + * nodes are only trackable through carry level lists.
4286 + *
4287 + * Situation that is handled here is following: @node
4288 + * has valid ->right pointer, but there is
4289 + * allocated-but-orphaned node in the carry queue that
4290 + * is logically between @node and @node->right. Here
4291 + * we are searching for it. Critical point is that
4292 + * this is only possible if @node->right is also in
4293 + * the carry queue (this is checked above), because
4294 + * this is the only way new orphaned node could be
4295 + * inserted between them (before inserting new node,
4296 + * make_space() first tries to shift to the right, so,
4297 + * right neighbor will be locked and queued).
4298 + *
4299 + */
4300 + right = node;
4301 + do {
4302 + right = list_entry(right->header.level_linkage.next,
4303 + carry_node, header.level_linkage);
4304 + assert("nikita-3408", !carry_node_end(doing,
4305 + right));
4306 + } while (reiser4_carry_real(right) ==
4307 + reiser4_carry_real(node));
4308 + return right;
4309 + }
4310 + }
4311 + read_unlock_tree(tree);
4312 +
4313 + flags = GN_CAN_USE_UPPER_LEVELS;
4314 +	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4315 +		flags |= GN_NO_ALLOC;
4316 +
4317 + /* then, try to lock right neighbor */
4318 + init_lh(&lh);
4319 + result = reiser4_get_right_neighbor(&lh,
4320 + reiser4_carry_real(node),
4321 + ZNODE_WRITE_LOCK, flags);
4322 + if (result == 0) {
4323 + /* ok, node found and locked. */
4324 + right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4325 + if (!IS_ERR(right)) {
4326 + right->node = lh.node;
4327 + move_lh(&right->lock_handle, &lh);
4328 + right->free = 1;
4329 + result = lock_carry_node_tail(right);
4330 + if (result != 0)
4331 + right = ERR_PTR(result);
4332 + }
4333 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4334 + /* node is rightmost node in a tree, or neighbor wasn't in
4335 + cache, or there is an extent on the right. */
4336 + right = NULL;
4337 + } else
4338 + right = ERR_PTR(result);
4339 + done_lh(&lh);
4340 + return right;
4341 +}
4342 +
4343 +/* how much free space in a @node is needed for @op
4344 +
4345 + How much space in @node is required for completion of @op, where @op is
4346 + insert or paste operation.
4347 +*/
4348 +static unsigned int space_needed_for_op(znode * node /* znode data are
4349 + * inserted or
4350 + * pasted in */ ,
4351 + carry_op * op /* carry
4352 + operation */ )
4353 +{
4354 + assert("nikita-919", op != NULL);
4355 +
4356 + switch (op->op) {
4357 + default:
4358 + impossible("nikita-1701", "Wrong opcode");
4359 + case COP_INSERT:
4360 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4361 + case COP_PASTE:
4362 + return space_needed(node, op->u.insert.d->coord,
4363 + op->u.insert.d->data, 0);
4364 + }
4365 +}
4366 +
4367 +/* how much space in @node is required to insert or paste @data at
4368 + @coord. */
4369 +unsigned int space_needed(const znode * node /* node data are inserted or
4370 + * pasted in */ ,
4371 + const coord_t * coord /* coord where data are
4372 + * inserted or pasted
4373 + * at */ ,
4374 + const reiser4_item_data * data /* data to insert or
4375 + * paste */ ,
4376 + int insertion /* non-0 is inserting, 0---paste */ )
4377 +{
4378 + int result;
4379 + item_plugin *iplug;
4380 +
4381 + assert("nikita-917", node != NULL);
4382 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4383 + assert("vs-230", !insertion || (coord == NULL));
4384 +
4385 + result = 0;
4386 + iplug = data->iplug;
4387 + if (iplug->b.estimate != NULL) {
4388 + /* ask item plugin how much space is needed to insert this
4389 + item */
4390 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4391 + } else {
4392 + /* reasonable default */
4393 + result += data->length;
4394 + }
4395 + if (insertion) {
4396 + node_plugin *nplug;
4397 +
4398 + nplug = node->nplug;
4399 + /* and add node overhead */
4400 + if (nplug->item_overhead != NULL) {
4401 + result += nplug->item_overhead(node, NULL);
4402 + }
4403 + }
4404 + return result;
4405 +}
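+
+/* Worked example (numbers are made up for illustration): suppose an item
+   body of data->length == 40 bytes and a node format whose item_overhead()
+   is 12 bytes per new item. Insertion then needs 40 + 12 = 52 bytes of free
+   space, while pasting the same 40 bytes into an existing item needs only
+   40, as no new item header is created. This difference is why free space
+   analysis must be redone when a paste morphs into an insert (see
+   insert_paste_common() below). */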
4406 +
4407 +/* find &coord in parent where pointer to new child is to be stored. */
4408 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4409 + * insert pointer to new
4410 + * child */ )
4411 +{
4412 + int result;
4413 + znode *node;
4414 + znode *child;
4415 +
4416 + assert("nikita-941", op != NULL);
4417 + assert("nikita-942", op->op == COP_INSERT);
4418 +
4419 + node = reiser4_carry_real(op->node);
4420 + assert("nikita-943", node != NULL);
4421 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4422 +
4423 + child = reiser4_carry_real(op->u.insert.child);
4424 + result =
4425 + find_new_child_ptr(node, child, op->u.insert.brother,
4426 + op->u.insert.d->coord);
4427 +
4428 + build_child_ptr_data(child, op->u.insert.d->data);
4429 + return result;
4430 +}
4431 +
4432 +/* additional amount of free space in @node required to complete @op */
4433 +static int free_space_shortage(znode * node /* node to check */ ,
4434 + carry_op * op /* operation being performed */ )
4435 +{
4436 + assert("nikita-1061", node != NULL);
4437 + assert("nikita-1062", op != NULL);
4438 +
4439 + switch (op->op) {
4440 + default:
4441 + impossible("nikita-1702", "Wrong opcode");
4442 + case COP_INSERT:
4443 + case COP_PASTE:
4444 + return space_needed_for_op(node, op) - znode_free_space(node);
4445 + case COP_EXTENT:
4446 +		/* when inserting an extent, shift data around until the
4447 +		   insertion point is at the very edge of the node. */
4448 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4449 + return +1;
4450 + else
4451 + return -1;
4452 + }
4453 +}
4454 +
4455 +/* helper function: update node pointer in operation after insertion
4456 + point was probably shifted into @target. */
4457 +static znode *sync_op(carry_op * op, carry_node * target)
4458 +{
4459 + znode *insertion_node;
4460 +
4461 + /* reget node from coord: shift might move insertion coord to
4462 + the neighbor */
4463 + insertion_node = op->u.insert.d->coord->node;
4464 + /* if insertion point was actually moved into new node,
4465 + update carry node pointer in operation. */
4466 + if (insertion_node != reiser4_carry_real(op->node)) {
4467 + op->node = target;
4468 + assert("nikita-2540",
4469 + reiser4_carry_real(target) == insertion_node);
4470 + }
4471 + assert("nikita-2541",
4472 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4473 + return insertion_node;
4474 +}
4475 +
4476 +/*
4477 + * complete make_space() call: update tracked lock handle if necessary. See
4478 + * comments for fs/reiser4/carry.h:carry_track_type
4479 + */
4480 +static int
4481 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4482 +{
4483 + int result;
4484 + carry_track_type tracking;
4485 + znode *node;
4486 +
4487 + tracking = doing->track_type;
4488 + node = op->u.insert.d->coord->node;
4489 +
4490 + if (tracking == CARRY_TRACK_NODE ||
4491 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4492 + /* inserting or pasting into node different from
4493 + original. Update lock handle supplied by caller. */
4494 + assert("nikita-1417", doing->tracked != NULL);
4495 + done_lh(doing->tracked);
4496 + init_lh(doing->tracked);
4497 + result = longterm_lock_znode(doing->tracked, node,
4498 + ZNODE_WRITE_LOCK,
4499 + ZNODE_LOCK_HIPRI);
4500 + } else
4501 + result = 0;
4502 + return result;
4503 +}
4504 +
4505 +/* This is insertion policy function. It shifts data to the left and right
4506 + neighbors of insertion coord and allocates new nodes until there is enough
4507 + free space to complete @op.
4508 +
4509 + See comments in the body.
4510 +
4511 + Assumes that the node format favors insertions at the right end of the node
4512 + as node40 does.
4513 +
4514 + See carry_flow() on detail about flow insertion
4515 +*/
4516 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4517 + carry_level * doing /* current carry queue */ ,
4518 + carry_level * todo /* carry queue on the parent level */ )
4519 +{
4520 + znode *node;
4521 + int result;
4522 + int not_enough_space;
4523 + int blk_alloc;
4524 + znode *orig_node;
4525 + __u32 flags;
4526 +
4527 + coord_t *coord;
4528 +
4529 + assert("nikita-890", op != NULL);
4530 + assert("nikita-891", todo != NULL);
4531 + assert("nikita-892",
4532 + op->op == COP_INSERT ||
4533 + op->op == COP_PASTE || op->op == COP_EXTENT);
4534 + assert("nikita-1607",
4535 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4536 +
4537 + flags = op->u.insert.flags;
4538 +
4539 + /* NOTE check that new node can only be allocated after checking left
4540 + * and right neighbors. This is necessary for proper work of
4541 + * find_{left,right}_neighbor(). */
4542 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4543 + flags & COPI_DONT_SHIFT_LEFT));
4544 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4545 + flags & COPI_DONT_SHIFT_RIGHT));
4546 +
4547 + coord = op->u.insert.d->coord;
4548 + orig_node = node = coord->node;
4549 +
4550 + assert("nikita-908", node != NULL);
4551 + assert("nikita-909", node_plugin_by_node(node) != NULL);
4552 +
4553 + result = 0;
4554 + /* If there is not enough space in a node, try to shift something to
4555 + the left neighbor. This is a bit tricky, as locking to the left is
4556 + low priority. This is handled by restart logic in carry().
4557 + */
4558 + not_enough_space = free_space_shortage(node, op);
4559 + if (not_enough_space <= 0)
4560 + /* it is possible that carry was called when there actually
4561 + was enough space in the node. For example, when inserting
4562 + leftmost item so that delimiting keys have to be updated.
4563 + */
4564 + return make_space_tail(op, doing, orig_node);
4565 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4566 + carry_node *left;
4567 + /* make note in statistics of an attempt to move
4568 + something into the left neighbor */
4569 + left = find_left_neighbor(op, doing);
4570 + if (unlikely(IS_ERR(left))) {
4571 + if (PTR_ERR(left) == -E_REPEAT)
4572 + return -E_REPEAT;
4573 + else {
4574 + /* some error other than restart request
4575 + occurred. This shouldn't happen. Issue a
4576 + warning and continue as if left neighbor
4577 + weren't existing.
4578 + */
4579 + warning("nikita-924",
4580 + "Error accessing left neighbor: %li",
4581 + PTR_ERR(left));
4582 + }
4583 + } else if (left != NULL) {
4584 +
4585 + /* shift everything possible on the left of and
4586 + including insertion coord into the left neighbor */
4587 + result = carry_shift_data(LEFT_SIDE, coord,
4588 + reiser4_carry_real(left),
4589 + doing, todo,
4590 + flags & COPI_GO_LEFT);
4591 +
4592 + /* reget node from coord: shift_left() might move
4593 + insertion coord to the left neighbor */
4594 + node = sync_op(op, left);
4595 +
4596 + not_enough_space = free_space_shortage(node, op);
4597 + /* There is not enough free space in @node, but
4598 + may be, there is enough free space in
4599 + @left. Various balancing decisions are valid here.
4600 +			   The same holds for shifting to the right.
4601 + */
4602 + }
4603 + }
4604 + /* If there still is not enough space, shift to the right */
4605 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4606 + carry_node *right;
4607 +
4608 + right = find_right_neighbor(op, doing);
4609 + if (IS_ERR(right)) {
4610 + warning("nikita-1065",
4611 + "Error accessing right neighbor: %li",
4612 + PTR_ERR(right));
4613 + } else if (right != NULL) {
4614 + /* node containing insertion point, and its right
4615 + neighbor node are write locked by now.
4616 +
4617 + shift everything possible on the right of but
4618 + excluding insertion coord into the right neighbor
4619 + */
4620 + result = carry_shift_data(RIGHT_SIDE, coord,
4621 + reiser4_carry_real(right),
4622 + doing, todo,
4623 + flags & COPI_GO_RIGHT);
4624 + /* reget node from coord: shift_right() might move
4625 + insertion coord to the right neighbor */
4626 + node = sync_op(op, right);
4627 + not_enough_space = free_space_shortage(node, op);
4628 + }
4629 + }
4630 + /* If there is still not enough space, allocate new node(s).
4631 +
4632 + We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4633 + the carry operation flags (currently this is needed during flush
4634 + only).
4635 + */
4636 + for (blk_alloc = 0;
4637 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4638 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4639 + carry_node *fresh; /* new node we are allocating */
4640 + coord_t coord_shadow; /* remembered insertion point before
4641 + * shifting data into new node */
4642 + carry_node *node_shadow; /* remembered insertion node before
4643 + * shifting */
4644 + unsigned int gointo; /* whether insertion point should move
4645 + * into newly allocated node */
4646 +
4647 + /* allocate new node on the right of @node. Znode and disk
4648 + fake block number for new node are allocated.
4649 +
4650 + add_new_znode() posts carry operation COP_INSERT with
4651 + COPT_CHILD option to the parent level to add
4652 + pointer to newly created node to its parent.
4653 +
4654 + Subtle point: if several new nodes are required to complete
4655 + insertion operation at this level, they will be inserted
4656 + into their parents in the order of creation, which means
4657 + that @node will be valid "cookie" at the time of insertion.
4658 +
4659 + */
4660 + fresh = add_new_znode(node, op->node, doing, todo);
4661 + if (IS_ERR(fresh))
4662 + return PTR_ERR(fresh);
4663 +
4664 + /* Try to shift into new node. */
4665 + result = lock_carry_node(doing, fresh);
4666 + zput(reiser4_carry_real(fresh));
4667 + if (result != 0) {
4668 + warning("nikita-947",
4669 + "Cannot lock new node: %i", result);
4670 + return result;
4671 + }
4672 +
4673 + /* both nodes are write locked by now.
4674 +
4675 + shift everything possible on the right of and
4676 + including insertion coord into the right neighbor.
4677 + */
4678 + coord_dup(&coord_shadow, op->u.insert.d->coord);
4679 + node_shadow = op->node;
4680 + /* move insertion point into newly created node if:
4681 +
4682 + . insertion point is rightmost in the source node, or
4683 + . this is not the first node we are allocating in a row.
4684 + */
4685 + gointo =
4686 + (blk_alloc > 0) ||
4687 + coord_is_after_rightmost(op->u.insert.d->coord);
4688 +
4689 + if (gointo &&
4690 + op->op == COP_PASTE &&
4691 + coord_is_existing_item(op->u.insert.d->coord) &&
4692 + is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4693 + /* paste into solid (atomic) item, which can contain
4694 +			   only one unit, so we need to shift it right, to where
4695 +			   the insertion point is supposed to be */
4696 +
4697 + assert("edward-1444", op->u.insert.d->data->iplug ==
4698 + item_plugin_by_id(STATIC_STAT_DATA_ID));
4699 + assert("edward-1445",
4700 + op->u.insert.d->data->length >
4701 + node_plugin_by_node(coord->node)->free_space
4702 + (coord->node));
4703 +
4704 + op->u.insert.d->coord->between = BEFORE_UNIT;
4705 + }
4706 +
4707 + result = carry_shift_data(RIGHT_SIDE, coord,
4708 + reiser4_carry_real(fresh),
4709 + doing, todo, gointo);
4710 + /* if insertion point was actually moved into new node,
4711 + update carry node pointer in operation. */
4712 + node = sync_op(op, fresh);
4713 + not_enough_space = free_space_shortage(node, op);
4714 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4715 +			/* there is not enough free space in the new node. Shift
4716 +			   the insertion point back to the @shadow_node so that the
4717 + next new node would be inserted between
4718 + @shadow_node and @fresh.
4719 + */
4720 + coord_normalize(&coord_shadow);
4721 + coord_dup(coord, &coord_shadow);
4722 + node = coord->node;
4723 + op->node = node_shadow;
4724 + if (1 || (flags & COPI_STEP_BACK)) {
4725 + /* still not enough space?! Maybe there is
4726 + enough space in the source node (i.e., node
4727 + data are moved from) now.
4728 + */
4729 + not_enough_space =
4730 + free_space_shortage(node, op);
4731 + }
4732 + }
4733 + }
4734 + if (not_enough_space > 0) {
4735 + if (!(flags & COPI_DONT_ALLOCATE))
4736 + warning("nikita-948", "Cannot insert new item");
4737 + result = -E_NODE_FULL;
4738 + }
4739 + assert("nikita-1622", ergo(result == 0,
4740 + reiser4_carry_real(op->node) == coord->node));
4741 + assert("nikita-2616", coord == op->u.insert.d->coord);
4742 + if (result == 0)
4743 + result = make_space_tail(op, doing, orig_node);
4744 + return result;
4745 +}
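+
+/* The policy implemented by make_space() above, condensed (illustrative
+   pseudo-code only):
+
+	shortage = space_needed_for_op(node, op) - znode_free_space(node);
+	if (shortage > 0 && shifting left is allowed)
+		shift into the left neighbor, recompute shortage;
+	if (shortage > 0 && shifting right is allowed)
+		shift into the right neighbor, recompute shortage;
+	while (shortage > 0 && allocation is allowed && nodes allocated < 2)
+		allocate a fresh node on the right, shift into it,
+		recompute shortage;
+	if (shortage > 0)
+		fail with -E_NODE_FULL;
+*/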
4746 +
4747 +/* insert_paste_common() - common part of insert and paste operations
4748 +
4749 + This function performs common part of COP_INSERT and COP_PASTE.
4750 +
4751 +   There are three ways in which insertion/paste can be requested:
4752 +
4753 + . by directly supplying reiser4_item_data. In this case, op ->
4754 + u.insert.type is set to COPT_ITEM_DATA.
4755 +
4756 +   . by supplying a pointer to the child which is to be inserted into the
4757 +   parent. In this case op -> u.insert.type == COPT_CHILD.
4758 +
4759 +   . by supplying the key of the new item/unit. This is currently only used
4760 +   during extent insertion.
4761 +
4762 +   This is required because, when a new node is allocated, we don't know at
4763 +   what position the pointer to it is to be stored in the parent. Actually, we
4764 +   don't even know what its parent will be, because the parent can be
4765 +   re-balanced concurrently and the new node re-parented, and because the
4766 +   parent can be full and the pointer to the new node will go into some other node.
4767 +
4768 + insert_paste_common() resolves pointer to child node into position in the
4769 + parent by calling find_new_child_coord(), that fills
4770 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
4771 +
4772 + Another complication is with finding free space during pasting. It may
4773 + happen that while shifting items to the neighbors and newly allocated
4774 + nodes, insertion coord can no longer be in the item we wanted to paste
4775 +   into. At this point, paste becomes (morphs into) insert. Moreover, free
4776 + space analysis has to be repeated, because amount of space required for
4777 + insertion is different from that of paste (item header overhead, etc).
4778 +
4779 + This function "unifies" different insertion modes (by resolving child
4780 + pointer or key into insertion coord), and then calls make_space() to free
4781 + enough space in the node by shifting data to the left and right and by
4782 + allocating new nodes if necessary. Carry operation knows amount of space
4783 + required for its completion. After enough free space is obtained, caller of
4784 + this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4785 + by calling item plugin method.
4786 +
4787 +*/
4788 +static int insert_paste_common(carry_op * op /* carry operation being
4789 + * performed */ ,
4790 + carry_level * doing /* current carry level */ ,
4791 + carry_level * todo /* next carry level */ ,
4792 + carry_insert_data * cdata /* pointer to
4793 + * cdata */ ,
4794 + coord_t * coord /* insertion/paste coord */ ,
4795 + reiser4_item_data * data /* data to be
4796 + * inserted/pasted */ )
4797 +{
4798 + assert("nikita-981", op != NULL);
4799 + assert("nikita-980", todo != NULL);
4800 + assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4801 + || (op->op == COP_EXTENT));
4802 +
4803 + if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4804 + /* nothing to do. Fall through to make_space(). */
4805 + ;
4806 + } else if (op->u.insert.type == COPT_KEY) {
4807 + node_search_result intra_node;
4808 + znode *node;
4809 + /* The problem with doing batching at the lowest level is that
4810 + operations here are given by coords where modification is
4811 + to be performed, and one modification can invalidate coords
4812 + of all following operations.
4813 +
4814 + So, we are implementing yet another type of operation that
4815 + will use (the only) "locator" stable across shifting of
4816 + data between nodes, etc.: key (COPT_KEY).
4817 +
4818 + This clause resolves key to the coord in the node.
4819 +
4820 + But node can change also. Probably some pieces have to be
4821 + added to the lock_carry_node(), to lock node by its key.
4822 +
4823 + */
4824 + /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4825 + if you need something else. */
4826 + op->u.insert.d->coord = coord;
4827 + node = reiser4_carry_real(op->node);
4828 + intra_node = node_plugin_by_node(node)->lookup
4829 + (node, op->u.insert.d->key, FIND_EXACT,
4830 + op->u.insert.d->coord);
4831 + if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4832 + warning("nikita-1715", "Intra node lookup failure: %i",
4833 + intra_node);
4834 + return intra_node;
4835 + }
4836 + } else if (op->u.insert.type == COPT_CHILD) {
4837 + /* if we are asked to insert pointer to the child into
4838 + internal node, first convert pointer to the child into
4839 + coord within parent node.
4840 + */
4841 + znode *child;
4842 + int result;
4843 +
4844 + op->u.insert.d = cdata;
4845 + op->u.insert.d->coord = coord;
4846 + op->u.insert.d->data = data;
4847 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4848 + result = find_new_child_coord(op);
4849 + child = reiser4_carry_real(op->u.insert.child);
4850 + if (result != NS_NOT_FOUND) {
4851 + warning("nikita-993",
4852 + "Cannot find a place for child pointer: %i",
4853 + result);
4854 + return result;
4855 + }
4856 + /* This only happens when we did multiple insertions at
4857 + the previous level, trying to insert a single item, and
4858 + it so happened that insertion of pointers to all new
4859 + nodes before this one had already caused the parent node
4860 + to split (maybe several times).
4861 +
4862 + I am going to come up with a better solution.
4863 +
4864 + You are not expected to understand this.
4865 + -- v6root/usr/sys/ken/slp.c
4866 +
4867 + Basically, what happens here is the following: carry came
4868 + to the parent level and is about to insert internal item
4869 + pointing to the child node that it just inserted in the
4870 + level below. Position where internal item is to be inserted
4871 + was found by find_new_child_coord() above, but node of the
4872 + current carry operation (that is, parent node of child
4873 + inserted on the previous level), was determined earlier in
4874 + the lock_carry_level/lock_carry_node. It could so happen
4875 + that other carry operations already performed on the parent
4876 + level split the parent node, so that the insertion point
4877 + moved into another node. Handle this by creating a new carry
4878 + node for the insertion point if necessary.
4879 + */
4880 + if (reiser4_carry_real(op->node) !=
4881 + op->u.insert.d->coord->node) {
4882 + pool_ordering direction;
4883 + znode *z1;
4884 + znode *z2;
4885 + reiser4_key k1;
4886 + reiser4_key k2;
4887 +
4888 + /*
4889 + * determine in what direction insertion point
4890 + * moved. Do this by comparing delimiting keys.
4891 + */
4892 + z1 = op->u.insert.d->coord->node;
4893 + z2 = reiser4_carry_real(op->node);
4894 + if (keyle(leftmost_key_in_node(z1, &k1),
4895 + leftmost_key_in_node(z2, &k2)))
4896 + /* insertion point moved to the left */
4897 + direction = POOLO_BEFORE;
4898 + else
4899 + /* insertion point moved to the right */
4900 + direction = POOLO_AFTER;
4901 +
4902 + op->node = reiser4_add_carry_skip(doing,
4903 + direction, op->node);
4904 + if (IS_ERR(op->node))
4905 + return PTR_ERR(op->node);
4906 + op->node->node = op->u.insert.d->coord->node;
4907 + op->node->free = 1;
4908 + result = lock_carry_node(doing, op->node);
4909 + if (result != 0)
4910 + return result;
4911 + }
4912 +
4913 + /*
4914 + * set up key of an item being inserted: we are inserting
4915 +  * internal item and its key is (by the very definition of a
4916 +  * search tree) the leftmost key in the child node.
4917 + */
4918 + write_lock_dk(znode_get_tree(child));
4919 + op->u.insert.d->key = leftmost_key_in_node(child,
4920 + znode_get_ld_key(child));
4921 + write_unlock_dk(znode_get_tree(child));
4922 + op->u.insert.d->data->arg = op->u.insert.brother;
4923 + } else {
4924 + assert("vs-243", op->u.insert.d->coord != NULL);
4925 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4926 + }
4927 +
4928 + /* find free space. */
4929 + return make_space(op, doing, todo);
4930 +}
4931 +
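[Editorial note, not part of the patch] The request modes handled above all converge on op->u.insert.d->coord pointing into reiser4_carry_real(op->node) before make_space() runs. A minimal standalone sketch of that dispatch shape; the enum mirrors the patch's COPT_* names, everything else is a hypothetical stand-in:

	#include <stdio.h>

	/* names mirror the patch; values and types here are stand-ins */
	enum copt_type {
		COPT_ITEM_DATA, COPT_CHILD, COPT_KEY, COPT_PASTE_RESTARTED
	};

	static const char *resolve_insert_coord(enum copt_type type)
	{
		switch (type) {
		case COPT_PASTE_RESTARTED:
			/* carry_paste() set the coord up; go to make_space() */
			return "nothing to do";
		case COPT_KEY:
			/* resolve the key with the node plugin's ->lookup() */
			return "lookup by key";
		case COPT_CHILD:
			/* resolve via find_new_child_coord(), re-locking if
			   the insertion point drifted into a sibling */
			return "lookup by child pointer";
		default: /* COPT_ITEM_DATA */
			return "coord and item data supplied directly";
		}
	}

	int main(void)
	{
		printf("%s\n", resolve_insert_coord(COPT_CHILD));
		return 0;
	}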
4932 +/* handle carry COP_INSERT operation.
4933 +
4934 + Insert new item into node. New item can be given in one of two ways:
4935 +
4936 + - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4937 + only applicable at the leaf/twig level.
4938 +
4939 +   - by passing a pointer to the child node that is to be inserted by this
4940 +   operation.
4941 +
4942 +*/
4943 +static int carry_insert(carry_op * op /* operation to perform */ ,
4944 + carry_level * doing /* queue of operations @op
4945 + * is part of */ ,
4946 + carry_level * todo /* queue where new operations
4947 + * are accumulated */ )
4948 +{
4949 + znode *node;
4950 + carry_insert_data cdata;
4951 + coord_t coord;
4952 + reiser4_item_data data;
4953 + carry_plugin_info info;
4954 + int result;
4955 +
4956 + assert("nikita-1036", op != NULL);
4957 + assert("nikita-1037", todo != NULL);
4958 + assert("nikita-1038", op->op == COP_INSERT);
4959 +
4960 + coord_init_zero(&coord);
4961 +
4962 + /* perform common functionality of insert and paste. */
4963 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
4964 + if (result != 0)
4965 + return result;
4966 +
4967 + node = op->u.insert.d->coord->node;
4968 + assert("nikita-1039", node != NULL);
4969 + assert("nikita-1040", node_plugin_by_node(node) != NULL);
4970 +
4971 + assert("nikita-949",
4972 + space_needed_for_op(node, op) <= znode_free_space(node));
4973 +
4974 + /* ask node layout to create new item. */
4975 + info.doing = doing;
4976 + info.todo = todo;
4977 + result = node_plugin_by_node(node)->create_item
4978 + (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
4979 + &info);
4980 + doing->restartable = 0;
4981 + znode_make_dirty(node);
4982 +
4983 + return result;
4984 +}
4985 +
4986 +/*
4987 + * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
4988 + * supplied with a "flow" (that is, a stream of data) and inserts it into tree
4989 + * by slicing into multiple items.
4990 + */
4991 +
4992 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
4993 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
4994 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
4995 +
4996 +static size_t item_data_overhead(carry_op * op)
4997 +{
4998 + if (flow_insert_data(op)->iplug->b.estimate == NULL)
4999 + return 0;
5000 + return (flow_insert_data(op)->iplug->b.
5001 + estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5002 + flow_insert_data(op)->length);
5003 +}
5004 +
5005 +/* FIXME-VS: this is called several times during one make_flow_for_insertion
5006 + and it will always return the same result. Some optimization could be made
5007 + by calculating this value once at the beginning and passing it around. That
5008 + would reduce some flexibility in future changes
5009 +*/
5010 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5011 +static size_t flow_insertion_overhead(carry_op * op)
5012 +{
5013 + znode *node;
5014 + size_t insertion_overhead;
5015 +
5016 + node = flow_insert_point(op)->node;
5017 + insertion_overhead = 0;
5018 + if (node->nplug->item_overhead &&
5019 + !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5020 + flow_insert_data(op)))
5021 + insertion_overhead =
5022 + node->nplug->item_overhead(node, NULL) +
5023 + item_data_overhead(op);
5024 + return insertion_overhead;
5025 +}
5026 +
5027 +/* how many bytes of the flow fit into the node */
5028 +static int what_can_fit_into_node(carry_op * op)
5029 +{
5030 + size_t free, overhead;
5031 +
5032 + overhead = flow_insertion_overhead(op);
5033 + free = znode_free_space(flow_insert_point(op)->node);
5034 + if (free <= overhead)
5035 + return 0;
5036 + free -= overhead;
5037 + /* FIXME: flow->length is loff_t only to avoid overflow in case of expanding truncate */
5038 + if (free < op->u.insert_flow.flow->length)
5039 + return free;
5040 + return (int)op->u.insert_flow.flow->length;
5041 +}
5042 +
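[Editorial note, not part of the patch] A quick worked example of the computation above, with made-up numbers: 1000 bytes free and 24 bytes of insertion overhead leave 976 usable bytes, so a 5000-byte flow yields 976 while a 500-byte flow fits whole:

	#include <assert.h>
	#include <stddef.h>

	/* standalone restatement of the what_can_fit_into_node() arithmetic;
	   free space, overhead and flow length are hypothetical */
	static int demo_what_fits(size_t free_space, size_t overhead,
				  long long flow_len)
	{
		if (free_space <= overhead)
			return 0;
		free_space -= overhead;
		if ((long long)free_space < flow_len)
			return (int)free_space;
		return (int)flow_len;
	}

	int main(void)
	{
		assert(demo_what_fits(1000, 24, 5000) == 976);
		assert(demo_what_fits(1000, 24, 500) == 500);
		assert(demo_what_fits(20, 24, 500) == 0);
		return 0;
	}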
5043 +/* in make_space_for_flow_insertion we need to check either whether the whole
5044 +   flow fits into a node or whether a minimal fraction of it does */
5045 +static int enough_space_for_whole_flow(carry_op * op)
5046 +{
5047 + return (unsigned)what_can_fit_into_node(op) ==
5048 + op->u.insert_flow.flow->length;
5049 +}
5050 +
5051 +#define MIN_FLOW_FRACTION 1
5052 +static int enough_space_for_min_flow_fraction(carry_op * op)
5053 +{
5054 + assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5055 +
5056 + return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5057 +}
5058 +
5059 +/* this returns 0 if the left neighbor was obtained successfully, everything
5060 +   up to and including the insertion point was shifted into it, and the left
5061 +   neighbor still has enough free space for a minimal fraction of the flow */
5062 +static int
5063 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5064 +{
5065 + carry_node *left;
5066 + znode *orig;
5067 +
5068 + left = find_left_neighbor(op, doing);
5069 + if (unlikely(IS_ERR(left))) {
5070 + warning("vs-899",
5071 + "make_space_by_shift_left: "
5072 + "error accessing left neighbor: %li", PTR_ERR(left));
5073 + return 1;
5074 + }
5075 + if (left == NULL)
5076 + /* left neighbor either does not exist or is unformatted
5077 + node */
5078 + return 1;
5079 +
5080 + orig = flow_insert_point(op)->node;
5081 + /* try to shift the content of node @orig from its head up to and
5082 + including the insert point into the left neighbor */
5083 + carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5084 + reiser4_carry_real(left), doing, todo,
5085 + 1 /* including insert point */);
5086 + if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5087 + /* insertion point did not move */
5088 + return 1;
5089 + }
5090 +
5091 + /* insertion point is set after last item in the node */
5092 + assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5093 +
5094 + if (!enough_space_for_min_flow_fraction(op)) {
5095 + /* insertion point node does not have enough free space to put
5096 + even minimal portion of flow into it, therefore, move
5097 + insertion point back to orig node (before first item) */
5098 + coord_init_before_first_item(flow_insert_point(op), orig);
5099 + return 1;
5100 + }
5101 +
5102 + /* part of flow is to be written to the end of node */
5103 + op->node = left;
5104 + return 0;
5105 +}
5106 +
5107 +/* this returns 0 if the right neighbor was obtained successfully, everything
5108 +   to the right of the insertion point was shifted to it, and the node got
5109 +   enough free space for a minimal fraction of the flow */
5110 +static int
5111 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5112 + carry_level * todo)
5113 +{
5114 + carry_node *right;
5115 +
5116 + right = find_right_neighbor(op, doing);
5117 + if (unlikely(IS_ERR(right))) {
5118 + warning("nikita-1065", "shift_right_excluding_insert_point: "
5119 + "error accessing right neighbor: %li", PTR_ERR(right));
5120 + return 1;
5121 + }
5122 + if (right) {
5123 + /* shift everything possible on the right of but excluding
5124 + insertion coord into the right neighbor */
5125 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5126 + reiser4_carry_real(right), doing, todo,
5127 + 0 /* not including insert point */);
5128 + } else {
5129 + /* right neighbor either does not exist or is unformatted
5130 + node */
5131 + ;
5132 + }
5133 + if (coord_is_after_rightmost(flow_insert_point(op))) {
5134 + if (enough_space_for_min_flow_fraction(op)) {
5135 + /* part of flow is to be written to the end of node */
5136 + return 0;
5137 + }
5138 + }
5139 +
5140 + /* new node is to be added if insert point node did not get enough
5141 + space for whole flow */
5142 + return 1;
5143 +}
5144 +
5145 +/* this returns 0 when the insert coord is set at the node end and a fraction
5146 +   of the flow fits into that node */
5147 +static int
5148 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5149 +{
5150 + int result;
5151 + znode *node;
5152 + carry_node *new;
5153 +
5154 + node = flow_insert_point(op)->node;
5155 +
5156 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5157 + return RETERR(-E_NODE_FULL);
5158 + /* add new node after insert point node */
5159 + new = add_new_znode(node, op->node, doing, todo);
5160 + if (unlikely(IS_ERR(new))) {
5161 + return PTR_ERR(new);
5162 + }
5163 + result = lock_carry_node(doing, new);
5164 + zput(reiser4_carry_real(new));
5165 + if (unlikely(result)) {
5166 + return result;
5167 + }
5168 + op->u.insert_flow.new_nodes++;
5169 + if (!coord_is_after_rightmost(flow_insert_point(op))) {
5170 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5171 + reiser4_carry_real(new), doing, todo,
5172 + 0 /* not including insert point */);
5173 + assert("vs-901",
5174 + coord_is_after_rightmost(flow_insert_point(op)));
5175 +
5176 + if (enough_space_for_min_flow_fraction(op)) {
5177 + return 0;
5178 + }
5179 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5180 + return RETERR(-E_NODE_FULL);
5181 +
5182 + /* add one more new node */
5183 + new = add_new_znode(node, op->node, doing, todo);
5184 + if (unlikely(IS_ERR(new))) {
5185 + return PTR_ERR(new);
5186 + }
5187 + result = lock_carry_node(doing, new);
5188 + zput(reiser4_carry_real(new));
5189 + if (unlikely(result)) {
5190 + return result;
5191 + }
5192 + op->u.insert_flow.new_nodes++;
5193 + }
5194 +
5195 + /* move insertion point to new node */
5196 + coord_init_before_first_item(flow_insert_point(op),
5197 + reiser4_carry_real(new));
5198 + op->node = new;
5199 + return 0;
5200 +}
5201 +
5202 +static int
5203 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5204 + carry_level * todo)
5205 +{
5206 + __u32 flags = op->u.insert_flow.flags;
5207 +
5208 + if (enough_space_for_whole_flow(op)) {
5209 + /* whole flow fits into insert point node */
5210 + return 0;
5211 + }
5212 +
5213 + if (!(flags & COPI_DONT_SHIFT_LEFT)
5214 + && (make_space_by_shift_left(op, doing, todo) == 0)) {
5215 + /* insert point is shifted to left neighbor of original insert
5216 + point node and is set after last unit in that node. It has
5217 + enough space to fit at least minimal fraction of flow. */
5218 + return 0;
5219 + }
5220 +
5221 + if (enough_space_for_whole_flow(op)) {
5222 + /* whole flow fits into insert point node */
5223 + return 0;
5224 + }
5225 +
5226 + if (!(flags & COPI_DONT_SHIFT_RIGHT)
5227 + && (make_space_by_shift_right(op, doing, todo) == 0)) {
5228 + /* insert point is still set to the same node, but there is
5229 + nothing to the right of insert point. */
5230 + return 0;
5231 + }
5232 +
5233 + if (enough_space_for_whole_flow(op)) {
5234 + /* whole flow fits into insert point node */
5235 + return 0;
5236 + }
5237 +
5238 + return make_space_by_new_nodes(op, doing, todo);
5239 +}
5240 +
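[Editorial note, not part of the patch] The function above is a fallback ladder: recheck the cheap goal ("whole flow fits") before each progressively more expensive remedy. The generic shape of the pattern, with hypothetical predicate and remedy callbacks:

	/* sketch of the retry pattern in make_space_for_flow_insertion();
	   goal_met and remedies are hypothetical stand-ins */
	typedef int (*remedy_fn)(void *op);

	static int make_space_ladder(void *op, int (*goal_met)(void *op),
				     remedy_fn remedies[], int nr)
	{
		int i;

		for (i = 0; i <= nr; i++) {
			if (goal_met(op))
				return 0;	/* whole flow fits */
			if (i < nr && remedies[i](op) == 0)
				return 0;	/* partial fit is acceptable */
		}
		return -1;	/* caller maps this to -E_NODE_FULL */
	}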
5241 +/* implements COP_INSERT_FLOW operation */
5242 +static int
5243 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5244 +{
5245 + int result;
5246 + flow_t *f;
5247 + coord_t *insert_point;
5248 + node_plugin *nplug;
5249 + carry_plugin_info info;
5250 + znode *orig_node;
5251 + lock_handle *orig_lh;
5252 +
5253 + f = op->u.insert_flow.flow;
5254 + result = 0;
5255 +
5256 + /* carry system needs this to work */
5257 + info.doing = doing;
5258 + info.todo = todo;
5259 +
5260 + orig_node = flow_insert_point(op)->node;
5261 + orig_lh = doing->tracked;
5262 +
5263 + while (f->length) {
5264 + result = make_space_for_flow_insertion(op, doing, todo);
5265 + if (result)
5266 + break;
5267 +
5268 + insert_point = flow_insert_point(op);
5269 + nplug = node_plugin_by_node(insert_point->node);
5270 +
5271 + /* compose item data for insertion/pasting */
5272 + flow_insert_data(op)->data = f->data;
5273 + flow_insert_data(op)->length = what_can_fit_into_node(op);
5274 +
5275 + if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5276 + /* insert point is set to item of file we are writing to and we have to append to it */
5277 + assert("vs-903", insert_point->between == AFTER_UNIT);
5278 + nplug->change_item_size(insert_point,
5279 + flow_insert_data(op)->length);
5280 + flow_insert_data(op)->iplug->b.paste(insert_point,
5281 + flow_insert_data
5282 + (op), &info);
5283 + } else {
5284 + /* new item must be inserted */
5285 + pos_in_node_t new_pos;
5286 + flow_insert_data(op)->length += item_data_overhead(op);
5287 +
5288 + /* FIXME-VS: this is because node40_create_item changes
5289 + insert_point for obscure reasons */
5290 + switch (insert_point->between) {
5291 + case AFTER_ITEM:
5292 + new_pos = insert_point->item_pos + 1;
5293 + break;
5294 + case EMPTY_NODE:
5295 + new_pos = 0;
5296 + break;
5297 + case BEFORE_ITEM:
5298 + assert("vs-905", insert_point->item_pos == 0);
5299 + new_pos = 0;
5300 + break;
5301 + default:
5302 + impossible("vs-906",
5303 + "carry_insert_flow: invalid coord");
5304 + new_pos = 0;
5305 + break;
5306 + }
5307 +
5308 + nplug->create_item(insert_point, &f->key,
5309 + flow_insert_data(op), &info);
5310 + coord_set_item_pos(insert_point, new_pos);
5311 + }
5312 + coord_init_after_item_end(insert_point);
5313 + doing->restartable = 0;
5314 + znode_make_dirty(insert_point->node);
5315 +
5316 + move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5317 + }
5318 +
5319 + if (orig_node != flow_insert_point(op)->node) {
5320 + /* move lock to new insert point */
5321 + done_lh(orig_lh);
5322 + init_lh(orig_lh);
5323 + result =
5324 + longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5325 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5326 + }
5327 +
5328 + return result;
5329 +}
5330 +
5331 +/* implements COP_DELETE operation
5332 +
5333 +   Remove pointer to @op -> u.delete.child from its parent.
5334 +
5335 +   This function also handles killing of a tree root if the last pointer was
5336 +   removed from it. This is complicated by our handling of "twig" level: the
5337 +   root on the twig level is never killed.
5338 +
5339 +*/
5340 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5341 + carry_level * doing UNUSED_ARG /* current carry
5342 + * level */ ,
5343 + carry_level * todo /* next carry level */ )
5344 +{
5345 + int result;
5346 + coord_t coord;
5347 + coord_t coord2;
5348 + znode *parent;
5349 + znode *child;
5350 + carry_plugin_info info;
5351 + reiser4_tree *tree;
5352 +
5353 + /*
5354 + * This operation is called to delete internal item pointing to the
5355 + * child node that was removed by carry from the tree on the previous
5356 + * tree level.
5357 + */
5358 +
5359 + assert("nikita-893", op != NULL);
5360 + assert("nikita-894", todo != NULL);
5361 + assert("nikita-895", op->op == COP_DELETE);
5362 +
5363 + coord_init_zero(&coord);
5364 + coord_init_zero(&coord2);
5365 +
5366 + parent = reiser4_carry_real(op->node);
5367 + child = op->u.delete.child ?
5368 + reiser4_carry_real(op->u.delete.child) : op->node->node;
5369 + tree = znode_get_tree(child);
5370 + read_lock_tree(tree);
5371 +
5372 + /*
5373 + * @parent was determined when carry entered parent level
5374 + * (lock_carry_level/lock_carry_node). Since then, actual parent of
5375 + * @child node could change due to other carry operations performed on
5376 + * the parent level. Check for this.
5377 + */
5378 +
5379 + if (znode_parent(child) != parent) {
5380 + /* NOTE-NIKITA add stat counter for this. */
5381 + parent = znode_parent(child);
5382 + assert("nikita-2581", find_carry_node(doing, parent));
5383 + }
5384 + read_unlock_tree(tree);
5385 +
5386 + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5387 +
5388 + /* Twig level horrors: the tree should be of height at least 2. So, the
5389 + last pointer from the root at the twig level is preserved even if the
5390 + child is empty. This is ugly, but that is how it was architected.
5391 + */
5392 +
5393 + if (znode_is_root(parent) &&
5394 + znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5395 + node_num_items(parent) == 1) {
5396 + /* Delimiting key manipulations. */
5397 + write_lock_dk(tree);
5398 + znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5399 + znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5400 + ZF_SET(child, JNODE_DKSET);
5401 + write_unlock_dk(tree);
5402 +
5403 + /* @child escaped imminent death! */
5404 + ZF_CLR(child, JNODE_HEARD_BANSHEE);
5405 + return 0;
5406 + }
5407 +
5408 + /* convert child pointer to the coord_t */
5409 + result = find_child_ptr(parent, child, &coord);
5410 + if (result != NS_FOUND) {
5411 + warning("nikita-994", "Cannot find child pointer: %i", result);
5412 + print_coord_content("coord", &coord);
5413 + return result;
5414 + }
5415 +
5416 + coord_dup(&coord2, &coord);
5417 + info.doing = doing;
5418 + info.todo = todo;
5419 + {
5420 + /*
5421 + * Actually kill internal item: prepare structure with
5422 + * arguments for ->cut_and_kill() method...
5423 + */
5424 +
5425 + struct carry_kill_data kdata;
5426 + kdata.params.from = &coord;
5427 + kdata.params.to = &coord2;
5428 + kdata.params.from_key = NULL;
5429 + kdata.params.to_key = NULL;
5430 + kdata.params.smallest_removed = NULL;
5431 + kdata.params.truncate = 1;
5432 + kdata.flags = op->u.delete.flags;
5433 + kdata.inode = NULL;
5434 + kdata.left = NULL;
5435 + kdata.right = NULL;
5436 + kdata.buf = NULL;
5437 + /* ... and call it. */
5438 + result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5439 + &info);
5440 + }
5441 + doing->restartable = 0;
5442 +
5443 + /* check whether root should be killed violently */
5444 + if (znode_is_root(parent) &&
5445 + /* don't kill roots at or below the twig level */
5446 + znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5447 + node_num_items(parent) == 1) {
5448 + result = reiser4_kill_tree_root(coord.node);
5449 + }
5450 +
5451 + return result < 0 ? : 0;
5452 +}
5453 +
5454 +/* implements COP_CUT operation
5455 +
5456 + Cuts part or whole content of node.
5457 +
5458 +*/
5459 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5460 + carry_level * doing /* current carry level */ ,
5461 + carry_level * todo /* next carry level */ )
5462 +{
5463 + int result;
5464 + carry_plugin_info info;
5465 + node_plugin *nplug;
5466 +
5467 + assert("nikita-896", op != NULL);
5468 + assert("nikita-897", todo != NULL);
5469 + assert("nikita-898", op->op == COP_CUT);
5470 +
5471 + info.doing = doing;
5472 + info.todo = todo;
5473 +
5474 + nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5475 + if (op->u.cut_or_kill.is_cut)
5476 + result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5477 + else
5478 + result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5479 +
5480 + doing->restartable = 0;
5481 + return result < 0 ? : 0;
5482 +}
5483 +
5484 +/* helper function for carry_paste(): returns true if @op can be continued as
5485 + paste */
5486 +static int
5487 +can_paste(coord_t * icoord, const reiser4_key * key,
5488 + const reiser4_item_data * data)
5489 +{
5490 + coord_t circa;
5491 + item_plugin *new_iplug;
5492 + item_plugin *old_iplug;
5493 + int result = 0; /* to keep gcc shut */
5494 +
5495 + assert("", icoord->between != AT_UNIT);
5496 +
5497 + /* obviously, one cannot paste when node is empty---there is nothing
5498 + to paste into. */
5499 + if (node_is_empty(icoord->node))
5500 + return 0;
5501 + /* if insertion point is at the middle of the item, then paste */
5502 + if (!coord_is_between_items(icoord))
5503 + return 1;
5504 + coord_dup(&circa, icoord);
5505 + circa.between = AT_UNIT;
5506 +
5507 + old_iplug = item_plugin_by_coord(&circa);
5508 + new_iplug = data->iplug;
5509 +
5510 + /* check whether we can paste to the item @icoord is "at" when we
5511 + ignore ->between field */
5512 + if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5513 + result = 1;
5514 + } else if (icoord->between == BEFORE_UNIT
5515 + || icoord->between == BEFORE_ITEM) {
5516 + /* otherwise, try to glue to the item at the left, if any */
5517 + coord_dup(&circa, icoord);
5518 + if (coord_set_to_left(&circa)) {
5519 + result = 0;
5520 + coord_init_before_item(icoord);
5521 + } else {
5522 + old_iplug = item_plugin_by_coord(&circa);
5523 + result = (old_iplug == new_iplug)
5524 + && item_can_contain_key(icoord, key, data);
5525 + if (result) {
5526 + coord_dup(icoord, &circa);
5527 + icoord->between = AFTER_UNIT;
5528 + }
5529 + }
5530 + } else if (icoord->between == AFTER_UNIT
5531 + || icoord->between == AFTER_ITEM) {
5532 + coord_dup(&circa, icoord);
5533 + /* otherwise, try to glue to the item at the right, if any */
5534 + if (coord_set_to_right(&circa)) {
5535 + result = 0;
5536 + coord_init_after_item(icoord);
5537 + } else {
5538 + int (*cck) (const coord_t *, const reiser4_key *,
5539 + const reiser4_item_data *);
5540 +
5541 + old_iplug = item_plugin_by_coord(&circa);
5542 +
5543 + cck = old_iplug->b.can_contain_key;
5544 + if (cck == NULL)
5545 + /* item doesn't define ->can_contain_key
5546 + method? So it is not expandable. */
5547 + result = 0;
5548 + else {
5549 + result = (old_iplug == new_iplug)
5550 + && cck(&circa /*icoord */ , key, data);
5551 + if (result) {
5552 + coord_dup(icoord, &circa);
5553 + icoord->between = BEFORE_UNIT;
5554 + }
5555 + }
5556 + }
5557 + } else
5558 + impossible("nikita-2513", "Nothing works");
5559 + if (result) {
5560 + if (icoord->between == BEFORE_ITEM) {
5561 + assert("vs-912", icoord->unit_pos == 0);
5562 + icoord->between = BEFORE_UNIT;
5563 + } else if (icoord->between == AFTER_ITEM) {
5564 + coord_init_after_item_end(icoord);
5565 + }
5566 + }
5567 + return result;
5568 +}
5569 +
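[Editorial note, not part of the patch] In outline, can_paste() asks: is the coord inside an item (then paste), or, when it sits between items, is there an adjacent item with the same item plugin whose ->can_contain_key() accepts the new key (then glue to it)? A condensed restatement with hypothetical stub types:

	#include <stdbool.h>

	/* stand-ins for the patch's item plugin and coord checks */
	struct demo_item {
		const void *plugin;
		bool accepts_key;	/* result of ->can_contain_key() */
	};

	static bool demo_can_paste(bool node_empty, bool between_items,
				   const struct demo_item *neighbor,
				   const void *new_plugin)
	{
		if (node_empty)
			return false;	/* nothing to paste into */
		if (!between_items)
			return true;	/* coord is inside an item: paste */
		/* between items: glue only to a same-plugin neighbor
		   whose key range can absorb the new key */
		return neighbor != NULL &&
		       neighbor->plugin == new_plugin &&
		       neighbor->accepts_key;
	}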
5570 +/* implements COP_PASTE operation
5571 +
5572 + Paste data into existing item. This is complicated by the fact that after
5573 + we shifted something to the left or right neighbors trying to free some
5574 + space, item we were supposed to paste into can be in different node than
5575 + insertion coord. If so, we are no longer doing paste, but insert. See
5576 + comments in insert_paste_common().
5577 +
5578 +*/
5579 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5580 + carry_level * doing UNUSED_ARG /* current carry
5581 + * level */ ,
5582 + carry_level * todo /* next carry level */ )
5583 +{
5584 + znode *node;
5585 + carry_insert_data cdata;
5586 + coord_t dcoord;
5587 + reiser4_item_data data;
5588 + int result;
5589 + int real_size;
5590 + item_plugin *iplug;
5591 + carry_plugin_info info;
5592 + coord_t *coord;
5593 +
5594 + assert("nikita-982", op != NULL);
5595 + assert("nikita-983", todo != NULL);
5596 + assert("nikita-984", op->op == COP_PASTE);
5597 +
5598 + coord_init_zero(&dcoord);
5599 +
5600 + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5601 + if (result != 0)
5602 + return result;
5603 +
5604 + coord = op->u.insert.d->coord;
5605 +
5606 + /* handle the case when op -> u.insert.coord doesn't point to an item
5607 + of the required type. Restart as insert. */
5608 + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5609 + op->op = COP_INSERT;
5610 + op->u.insert.type = COPT_PASTE_RESTARTED;
5611 + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5612 +
5613 + return result;
5614 + }
5615 +
5616 + node = coord->node;
5617 + iplug = item_plugin_by_coord(coord);
5618 + assert("nikita-992", iplug != NULL);
5619 +
5620 + assert("nikita-985", node != NULL);
5621 + assert("nikita-986", node_plugin_by_node(node) != NULL);
5622 +
5623 + assert("nikita-987",
5624 + space_needed_for_op(node, op) <= znode_free_space(node));
5625 +
5626 + assert("nikita-1286", coord_is_existing_item(coord));
5627 +
5628 + /*
5629 +  * if the item is expanded as a result of this operation, we should first
5630 +  * change the item size, then call the ->b.paste item method. If the item
5631 +  * is shrunk, it should be done the other way around: first call the
5632 +  * ->b.paste method, then reduce the item size.
5633 + */
5634 +
5635 + real_size = space_needed_for_op(node, op);
5636 + if (real_size > 0)
5637 + node->nplug->change_item_size(coord, real_size);
5638 +
5639 + doing->restartable = 0;
5640 + info.doing = doing;
5641 + info.todo = todo;
5642 +
5643 + result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5644 +
5645 + if (real_size < 0)
5646 + node->nplug->change_item_size(coord, real_size);
5647 +
5648 + /* if we pasted at the beginning of the item, update item's key. */
5649 + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5650 + node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5651 +
5652 + znode_make_dirty(node);
5653 + return result;
5654 +}
5655 +
5656 +/* handle carry COP_EXTENT operation. */
5657 +static int carry_extent(carry_op * op /* operation to perform */ ,
5658 + carry_level * doing /* queue of operations @op
5659 + * is part of */ ,
5660 + carry_level * todo /* queue where new operations
5661 + * are accumulated */ )
5662 +{
5663 + znode *node;
5664 + carry_insert_data cdata;
5665 + coord_t coord;
5666 + reiser4_item_data data;
5667 + carry_op *delete_dummy;
5668 + carry_op *insert_extent;
5669 + int result;
5670 + carry_plugin_info info;
5671 +
5672 + assert("nikita-1751", op != NULL);
5673 + assert("nikita-1752", todo != NULL);
5674 + assert("nikita-1753", op->op == COP_EXTENT);
5675 +
5676 + /* extent insertion overview:
5677 +
5678 + extents live on the TWIG LEVEL, which is level one above the leaf
5679 + one. This complicates extent insertion logic somewhat: it may
5680 + happen (and is going to happen all the time) that in logical key
5681 + ordering an extent has to be placed between items I1 and I2, located
5682 + at the leaf level, but I1 and I2 are in the same formatted leaf
5683 + node N1. To insert an extent one has to
5684 +
5685 + (1) reach node N1 and shift data between N1, its neighbors and
5686 + possibly newly allocated nodes until I1 and I2 fall into different
5687 + nodes. Since I1 and I2 are still neighboring items in logical key
5688 + order, they will necessarily be the utmost items in their respective
5689 + nodes.
5690 +
5691 + (2) After this, the new extent item is inserted into a node on the
5692 + twig level.
5693 +
5694 + Fortunately this process can reuse almost all code from standard
5695 + insertion procedure (viz. make_space() and insert_paste_common()),
5696 + due to the following observation: make_space() only shifts data up
5697 + to (excluding or including) the insertion point. It never
5698 + "over-moves" through the insertion point. Thus, one can use
5699 + make_space() to perform step (1). All that is required is to
5700 + instruct free_space_shortage() to keep make_space() shifting data
5701 + until the insertion point is at the node border.
5702 +
5703 + */
5704 +
5705 + /* perform common functionality of insert and paste. */
5706 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5707 + if (result != 0)
5708 + return result;
5709 +
5710 + node = op->u.extent.d->coord->node;
5711 + assert("nikita-1754", node != NULL);
5712 + assert("nikita-1755", node_plugin_by_node(node) != NULL);
5713 + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5714 +
5715 + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5716 + extent fits between items. */
5717 +
5718 + info.doing = doing;
5719 + info.todo = todo;
5720 +
5721 + /* there is another complication due to placement of extents on the
5722 + twig level: extents are "rigid" in the sense that key-range
5723 + occupied by an extent cannot grow indefinitely to the right as it can
5724 + for the formatted leaf nodes. Because of this, when search finds two
5725 + adjacent extents on the twig level, it has to "drill" to the leaf
5726 + level, creating a new node. Here we are removing this node.
5727 + */
5728 + if (node_is_empty(node)) {
5729 + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5730 + if (IS_ERR(delete_dummy))
5731 + return PTR_ERR(delete_dummy);
5732 + delete_dummy->u.delete.child = NULL;
5733 + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5734 + ZF_SET(node, JNODE_HEARD_BANSHEE);
5735 + }
5736 +
5737 + /* proceed with inserting extent item into parent. We are definitely
5738 + inserting rather than pasting if we get that far. */
5739 + insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5740 + if (IS_ERR(insert_extent))
5741 + /* @delete_dummy will be automatically destroyed when the level
5742 + is exited */
5743 + return PTR_ERR(insert_extent);
5744 + /* NOTE-NIKITA insertion by key is simplest option here. Another
5745 + possibility is to insert on the left or right of already existing
5746 + item.
5747 + */
5748 + insert_extent->u.insert.type = COPT_KEY;
5749 + insert_extent->u.insert.d = op->u.extent.d;
5750 + assert("nikita-1719", op->u.extent.d->key != NULL);
5751 + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5752 + insert_extent->u.insert.flags =
5753 + znode_get_tree(node)->carry.new_extent_flags;
5754 +
5755 + /*
5756 + * if carry was asked to track lock handle we should actually track
5757 + * lock handle on the twig node rather than on the leaf where
5758 + * operation was started from. Transfer tracked lock handle.
5759 + */
5760 + if (doing->track_type) {
5761 + assert("nikita-3242", doing->tracked != NULL);
5762 + assert("nikita-3244", todo->tracked == NULL);
5763 + todo->tracked = doing->tracked;
5764 + todo->track_type = CARRY_TRACK_NODE;
5765 + doing->tracked = NULL;
5766 + doing->track_type = 0;
5767 + }
5768 +
5769 + return 0;
5770 +}
5771 +
5772 +/* update key in @parent between pointers to @left and @right.
5773 +
5774 + Find coords of @left and @right and update delimiting key between them.
5775 +   This is a helper function called by carry_update(). Finds position of
5776 + internal item involved. Updates item key. Updates delimiting keys of child
5777 + nodes involved.
5778 +*/
5779 +static int update_delimiting_key(znode * parent /* node key is updated
5780 + * in */ ,
5781 + znode * left /* child of @parent */ ,
5782 + znode * right /* child of @parent */ ,
5783 + carry_level * doing /* current carry
5784 + * level */ ,
5785 + carry_level * todo /* parent carry
5786 + * level */ ,
5787 + const char **error_msg /* place to
5788 + * store error
5789 + * message */ )
5790 +{
5791 + coord_t left_pos;
5792 + coord_t right_pos;
5793 + int result;
5794 + reiser4_key ldkey;
5795 + carry_plugin_info info;
5796 +
5797 + assert("nikita-1177", right != NULL);
5798 + /* find position of the right child in the parent */
5799 + result = find_child_ptr(parent, right, &right_pos);
5800 + if (result != NS_FOUND) {
5801 + *error_msg = "Cannot find position of right child";
5802 + return result;
5803 + }
5804 +
5805 + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5806 + /* find position of the left child in a parent */
5807 + result = find_child_ptr(parent, left, &left_pos);
5808 + if (result != NS_FOUND) {
5809 + *error_msg = "Cannot find position of left child";
5810 + return result;
5811 + }
5812 + assert("nikita-1355", left_pos.node != NULL);
5813 + } else
5814 + left_pos.node = NULL;
5815 +
5816 + /* check that they are separated by exactly one key and are basically
5817 + sane */
5818 + if (REISER4_DEBUG) {
5819 + if ((left_pos.node != NULL)
5820 + && !coord_is_existing_unit(&left_pos)) {
5821 + *error_msg = "Left child is bastard";
5822 + return RETERR(-EIO);
5823 + }
5824 + if (!coord_is_existing_unit(&right_pos)) {
5825 + *error_msg = "Right child is bastard";
5826 + return RETERR(-EIO);
5827 + }
5828 + if (left_pos.node != NULL &&
5829 + !coord_are_neighbors(&left_pos, &right_pos)) {
5830 + *error_msg = "Children are not direct siblings";
5831 + return RETERR(-EIO);
5832 + }
5833 + }
5834 + *error_msg = NULL;
5835 +
5836 + info.doing = doing;
5837 + info.todo = todo;
5838 +
5839 + /*
5840 + * If child node is not empty, new key of internal item is a key of
5841 + * leftmost item in the child node. If the child is empty, take its
5842 + * right delimiting key as a new key of the internal item. Precise key
5843 + * in the latter case is not important per se, because the child (and
5844 + * the internal item) are going to be killed shortly anyway, but we
5845 + * have to preserve correct order of keys in the parent node.
5846 + */
5847 +
5848 + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5849 + leftmost_key_in_node(right, &ldkey);
5850 + else {
5851 + read_lock_dk(znode_get_tree(parent));
5852 + ldkey = *znode_get_rd_key(right);
5853 + read_unlock_dk(znode_get_tree(parent));
5854 + }
5855 + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5856 + doing->restartable = 0;
5857 + znode_make_dirty(parent);
5858 + return 0;
5859 +}
5860 +
5861 +/* implements COP_UPDATE operation
5862 +
5863 + Update delimiting keys.
5864 +
5865 +*/
5866 +static int carry_update(carry_op * op /* operation to be performed */ ,
5867 + carry_level * doing /* current carry level */ ,
5868 + carry_level * todo /* next carry level */ )
5869 +{
5870 + int result;
5871 + carry_node *missing UNUSED_ARG;
5872 + znode *left;
5873 + znode *right;
5874 + carry_node *lchild;
5875 + carry_node *rchild;
5876 + const char *error_msg;
5877 + reiser4_tree *tree;
5878 +
5879 + /*
5880 +  * This operation is called to update the key of an internal item. This is
5881 +  * necessary when carry shifted or cut data on the child
5882 +  * level. Arguments of this operation are:
5883 + *
5884 + * @right --- child node. Operation should update key of internal
5885 + * item pointing to @right.
5886 + *
5887 + * @left --- left neighbor of @right. This parameter is optional.
5888 + */
5889 +
5890 + assert("nikita-902", op != NULL);
5891 + assert("nikita-903", todo != NULL);
5892 + assert("nikita-904", op->op == COP_UPDATE);
5893 +
5894 + lchild = op->u.update.left;
5895 + rchild = op->node;
5896 +
5897 + if (lchild != NULL) {
5898 + assert("nikita-1001", lchild->parent);
5899 + assert("nikita-1003", !lchild->left);
5900 + left = reiser4_carry_real(lchild);
5901 + } else
5902 + left = NULL;
5903 +
5904 + tree = znode_get_tree(rchild->node);
5905 + read_lock_tree(tree);
5906 + right = znode_parent(rchild->node);
5907 + read_unlock_tree(tree);
5908 +
5909 + if (right != NULL) {
5910 + result = update_delimiting_key(right,
5911 + lchild ? lchild->node : NULL,
5912 + rchild->node,
5913 + doing, todo, &error_msg);
5914 + } else {
5915 + error_msg = "Cannot find node to update key in";
5916 + result = RETERR(-EIO);
5917 + }
5918 + /* operation will be reposted to the next level by the
5919 + ->update_item_key() method of node plugin, if necessary. */
5920 +
5921 + if (result != 0) {
5922 + warning("nikita-999", "Error updating delimiting key: %s (%i)",
5923 + error_msg ? : "", result);
5924 + }
5925 + return result;
5926 +}
5927 +
5928 +/* move items into @node during carry */
5929 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
5930 + coord_t * insert_coord /* coord where new item
5931 + * is to be inserted */ ,
5932 + znode * node /* node which data are moved from */ ,
5933 + carry_level * doing /* active carry queue */ ,
5934 + carry_level * todo /* carry queue where new
5935 + * operations are to be put
5936 + * in */ ,
5937 + unsigned int including_insert_coord_p /* true if
5938 + * @insertion_coord
5939 + * can be moved */ )
5940 +{
5941 + int result;
5942 + znode *source;
5943 + carry_plugin_info info;
5944 + node_plugin *nplug;
5945 +
5946 + source = insert_coord->node;
5947 +
5948 + info.doing = doing;
5949 + info.todo = todo;
5950 +
5951 + nplug = node_plugin_by_node(node);
5952 + result = nplug->shift(insert_coord, node,
5953 + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5954 + (int)including_insert_coord_p, &info);
5955 + /* the only error ->shift() method of node plugin can return is
5956 + -ENOMEM due to carry node/operation allocation. */
5957 + assert("nikita-915", result >= 0 || result == -ENOMEM);
5958 + if (result > 0) {
5959 + /*
5960 + * if some number of bytes was actually shifted, mark nodes
5961 + * dirty, and carry level as non-restartable.
5962 + */
5963 + doing->restartable = 0;
5964 + znode_make_dirty(source);
5965 + znode_make_dirty(node);
5966 + }
5967 +
5968 + assert("nikita-2077", coord_check(insert_coord));
5969 + return 0;
5970 +}
5971 +
5972 +typedef carry_node *(*carry_iterator) (carry_node * node);
5973 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
5974 + carry_iterator iterator);
5975 +
5976 +static carry_node *pool_level_list_prev(carry_node *node)
5977 +{
5978 + return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
5979 +}
5980 +
5981 +/* look for the left neighbor of given carry node in a carry queue.
5982 +
5983 + This is used by find_left_neighbor(), but I am not sure that this
5984 + really gives any advantage. More statistics required.
5985 +
5986 +*/
5987 +carry_node *find_left_carry(carry_node * node /* node to find left neighbor
5988 + * of */ ,
5989 + carry_level * level /* level to scan */ )
5990 +{
5991 + return find_dir_carry(node, level,
5992 + (carry_iterator) pool_level_list_prev);
5993 +}
5994 +
5995 +static carry_node *pool_level_list_next(carry_node *node)
5996 +{
5997 + return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
5998 +}
5999 +
6000 +/* look for the right neighbor of given carry node in a
6001 + carry queue.
6002 +
6003 + This is used by find_right_neighbor(), but I am not sure that this
6004 + really gives any advantage. More statistics required.
6005 +
6006 +*/
6007 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6008 + * of */ ,
6009 + carry_level * level /* level to scan */ )
6010 +{
6011 + return find_dir_carry(node, level,
6012 + (carry_iterator) pool_level_list_next);
6013 +}
6014 +
6015 +/* look for the left or right neighbor of given carry node in a carry
6016 + queue.
6017 +
6018 + Helper function used by find_{left|right}_carry().
6019 +*/
6020 +static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6021 + * from */ ,
6022 + carry_level * level /* level to scan */ ,
6023 + carry_iterator iterator /* operation to
6024 + * move to the next
6025 + * node */ )
6026 +{
6027 + carry_node *neighbor;
6028 +
6029 + assert("nikita-1059", node != NULL);
6030 + assert("nikita-1060", level != NULL);
6031 +
6032 + /* scan the list of carry nodes on this level dir-ward, skipping all
6033 + carry nodes referencing the same znode. */
6034 + neighbor = node;
6035 + while (1) {
6036 + neighbor = iterator(neighbor);
6037 + if (carry_node_end(level, neighbor))
6038 + /* list head is reached */
6039 + return NULL;
6040 + if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6041 + return neighbor;
6042 + }
6043 +}
6044 +
6045 +/*
6046 + * Memory reservation estimation.
6047 + *
6048 + * The carry process proceeds through tree levels upwards. Carry assumes that
6049 + * it takes the tree in a consistent state (e.g., that search tree invariants
6050 + * hold), and leaves the tree consistent after it finishes. This means that
6051 + * when some error occurs carry cannot simply return if there are pending
6052 + * carry operations. A generic solution for this problem is carry-undo, either
6053 + * as a transaction manager feature (requiring checkpoints and isolation), or
6054 + * through some carry specific mechanism.
6055 + *
6056 + * Our current approach is to panic if carry hits an error while tree is
6057 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6058 + * this "memory reservation" mechanism was added.
6059 + *
6060 + * Memory reservation is implemented by the perthread-pages.diff patch from
6061 + * core-patches. Its API is defined in <linux/gfp.h>
6062 + *
6063 + * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6064 + * void perthread_pages_release(int nrpages);
6065 + * int perthread_pages_count(void);
6066 + *
6067 + * carry estimates its worst case memory requirements at the entry, reserves
6068 + * enough memory, and releases unused pages before returning.
6069 + *
6070 + * Code below estimates worst case memory requirements for a given carry
6071 + * queue. This is done by summing worst case memory requirements for each
6072 + * operation in the queue.
6073 + *
6074 + */
6075 +
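[Editorial note, not part of the patch] A sketch of how a caller would presumably pair these calls; the three perthread_pages_* functions are the ones declared above (they come from the out-of-tree perthread-pages.diff patch, not mainline), and the wrapper itself is hypothetical:

	static int with_reservation(int estimated_pages,
				    int (*do_carry)(void *arg), void *arg)
	{
		int result;

		/* fail before touching the tree if memory cannot be reserved */
		if (perthread_pages_reserve(estimated_pages, GFP_KERNEL) != 0)
			return -ENOMEM;
		result = do_carry(arg);
		/* release whatever the carry did not consume */
		perthread_pages_release(perthread_pages_count());
		return result;
	}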
6076 +/*
6077 + * Memory requirements of many operations depend on the tree
6078 + * height. For example, item insertion requires a new node to be inserted at
6079 + * each tree level in the worst case. What tree height should be used for
6080 + * estimation? Current tree height is wrong, because tree height can change
6081 + * between the time when estimation was done and the time when operation is
6082 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6083 + * is also not desirable, because it would lead to huge over-estimation
6084 + * all the time. A plausible solution is "capped tree height": if the current
6085 + * tree height is less than some TREE_HEIGHT_CAP constant, the capped tree
6086 + * height is TREE_HEIGHT_CAP; otherwise it's the current tree height. The idea
6087 + * behind this is that if tree height is TREE_HEIGHT_CAP or larger, it's
6088 + * extremely unlikely to increase even more during a short interval of time.
6089 + */
6090 +#define TREE_HEIGHT_CAP (5)
6091 +
6092 +/* return capped tree height for the @tree. See comment above. */
6093 +static int cap_tree_height(reiser4_tree * tree)
6094 +{
6095 + return max_t(int, tree->height, TREE_HEIGHT_CAP);
6096 +}
6097 +
6098 +/* return capped tree height for the current tree. */
6099 +static int capped_height(void)
6100 +{
6101 + return cap_tree_height(current_tree);
6102 +}
6103 +
6104 +/* return number of pages required to store given number of bytes */
6105 +static int bytes_to_pages(int bytes)
6106 +{
6107 + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6108 +}
6109 +
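[Editorial note, not part of the patch] bytes_to_pages() is the standard round-up division idiom: adding PAGE_CACHE_SIZE - 1 before shifting rounds any partial page up. A standalone demonstration, assuming a 4096-byte page:

	#include <assert.h>

	#define DEMO_PAGE_SHIFT 12	/* assumed 4096-byte page */
	#define DEMO_PAGE_SIZE (1 << DEMO_PAGE_SHIFT)

	static int demo_bytes_to_pages(int bytes)
	{
		/* adding PAGE_SIZE - 1 before the shift rounds up */
		return (bytes + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT;
	}

	int main(void)
	{
		assert(demo_bytes_to_pages(0) == 0);
		assert(demo_bytes_to_pages(1) == 1);
		assert(demo_bytes_to_pages(4096) == 1);
		assert(demo_bytes_to_pages(4097) == 2);
		return 0;
	}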
6110 +/* how many pages are required to allocate znodes during item insertion. */
6111 +static int carry_estimate_znodes(void)
6112 +{
6113 + /*
6114 +  * Note that we have a problem here: there is no way to
6115 + * reserve pages specifically for the given slab. This means that
6116 + * these pages can be hijacked for some other end.
6117 + */
6118 +
6119 + /* in the worst case we need 3 new znodes on each tree level */
6120 + return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6121 +}
6122 +
6123 +/*
6124 + * how many pages are required to load bitmaps. One bitmap per level.
6125 + */
6126 +static int carry_estimate_bitmaps(void)
6127 +{
6128 + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6129 + int bytes;
6130 +
6131 + bytes = capped_height() * (0 + /* bnode should be added, but it is private to
6132 + * bitmap.c, skip for now. */
6133 + 2 * sizeof(jnode)); /* working and commit jnodes */
6134 + return bytes_to_pages(bytes) + 2; /* and their contents */
6135 + } else
6136 + /* bitmaps were pre-loaded during mount */
6137 + return 0;
6138 +}
6139 +
6140 +/* worst case item insertion memory requirements */
6141 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6142 +{
6143 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6144 + capped_height() + /* new block on each level */
6145 + 1 + /* and possibly extra new block at the leaf level */
6146 + 3; /* loading of leaves into memory */
6147 +}
6148 +
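[Editorial note, not part of the patch] Plugging hypothetical numbers into the sum above: with bitmaps pre-loaded (0 pages), carry_estimate_znodes() evaluating to, say, 2 pages, and a capped height of 5, the insert estimate is 0 + 2 + 1 + 5 + 1 + 3 = 12 pages:

	#include <assert.h>

	int main(void)
	{
		/* hypothetical inputs: bitmaps pre-loaded, znodes need 2 pages */
		int bitmaps = 0, znodes = 2, capped_height = 5;
		int estimate = bitmaps + znodes
			     + 1			/* new atom */
			     + capped_height		/* new block per level */
			     + 1			/* extra leaf-level block */
			     + 3;			/* loading leaves */

		assert(estimate == 12);
		return 0;
	}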
6149 +/* worst case item deletion memory requirements */
6150 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6151 +{
6152 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6153 + 3; /* loading of leaves into memory */
6154 +}
6155 +
6156 +/* worst case tree cut memory requirements */
6157 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6158 +{
6159 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6160 + 3; /* loading of leaves into memory */
6161 +}
6162 +
6163 +/* worst case memory requirements of pasting into item */
6164 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6165 +{
6166 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6167 + capped_height() + /* new block on each level */
6168 + 1 + /* and possibly extra new block at the leaf level */
6169 + 3; /* loading of leaves into memory */
6170 +}
6171 +
6172 +/* worst case memory requirements of extent insertion */
6173 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6174 +{
6175 + return carry_estimate_insert(op, level) + /* insert extent */
6176 + carry_estimate_delete(op, level); /* kill leaf */
6177 +}
6178 +
6179 +/* worst case memory requirements of key update */
6180 +static int carry_estimate_update(carry_op * op, carry_level * level)
6181 +{
6182 + return 0;
6183 +}
6184 +
6185 +/* worst case memory requirements of flow insertion */
6186 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6187 +{
6188 + int newnodes;
6189 +
6190 + newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6191 + CARRY_FLOW_NEW_NODES_LIMIT);
6192 + /*
6193 + * roughly estimate insert_flow as a sequence of insertions.
6194 + */
6195 + return newnodes * carry_estimate_insert(op, level);
6196 +}
6197 +
6198 +/* This is dispatch table for carry operations. It can be trivially
6199 + abstracted into useful plugin: tunable balancing policy is a good
6200 + thing. */
6201 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6202 + [COP_INSERT] = {
6203 + .handler = carry_insert,
6204 + .estimate = carry_estimate_insert}
6205 + ,
6206 + [COP_DELETE] = {
6207 + .handler = carry_delete,
6208 + .estimate = carry_estimate_delete}
6209 + ,
6210 + [COP_CUT] = {
6211 + .handler = carry_cut,
6212 + .estimate = carry_estimate_cut}
6213 + ,
6214 + [COP_PASTE] = {
6215 + .handler = carry_paste,
6216 + .estimate = carry_estimate_paste}
6217 + ,
6218 + [COP_EXTENT] = {
6219 + .handler = carry_extent,
6220 + .estimate = carry_estimate_extent}
6221 + ,
6222 + [COP_UPDATE] = {
6223 + .handler = carry_update,
6224 + .estimate = carry_estimate_update}
6225 + ,
6226 + [COP_INSERT_FLOW] = {
6227 + .handler = carry_insert_flow,
6228 + .estimate = carry_estimate_insert_flow}
6229 +};
6230 +
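[Editorial note, not part of the patch] carry_paste() above already shows the dispatch pattern when a paste restarts as an insert. Summing per-operation estimates over a queue would presumably go through the same table; a sketch using the patch's types (the queue-iteration helpers here are assumptions):

	static int demo_dispatch(carry_op *op, carry_level *doing,
				 carry_level *todo)
	{
		/* same pattern carry_paste() uses for the restarted insert */
		return op_dispatch_table[op->op].handler(op, doing, todo);
	}

	static int demo_estimate_queue(carry_op **ops, int nr,
				       carry_level *level)
	{
		int i, pages = 0;

		for (i = 0; i < nr; i++)
			pages += op_dispatch_table[ops[i]->op].estimate(ops[i],
									level);
		return pages;
	}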
6231 +/* Make Linus happy.
6232 + Local variables:
6233 + c-indentation-style: "K&R"
6234 + mode-name: "LC"
6235 + c-basic-offset: 8
6236 + tab-width: 8
6237 + fill-column: 120
6238 + scroll-step: 1
6239 + End:
6240 +*/
6241 diff -urN linux-2.6.22.orig/fs/reiser4/carry_ops.h linux-2.6.22/fs/reiser4/carry_ops.h
6242 --- linux-2.6.22.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6243 +++ linux-2.6.22/fs/reiser4/carry_ops.h 2007-07-29 00:25:34.828684053 +0400
6244 @@ -0,0 +1,42 @@
6245 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6246 +
6247 +/* implementation of carry operations. See carry_ops.c for details. */
6248 +
6249 +#if !defined( __CARRY_OPS_H__ )
6250 +#define __CARRY_OPS_H__
6251 +
6252 +#include "forward.h"
6253 +#include "znode.h"
6254 +#include "carry.h"
6255 +
6256 +/* carry operation handlers */
6257 +typedef struct carry_op_handler {
6258 + /* perform operation */
6259 + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6260 + /* estimate memory requirements for @op */
6261 + int (*estimate) (carry_op * op, carry_level * level);
6262 +} carry_op_handler;
6263 +
6264 +/* This is dispatch table for carry operations. It can be trivially
6265 + abstracted into useful plugin: tunable balancing policy is a good
6266 + thing. */
6267 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6268 +
6269 +unsigned int space_needed(const znode * node, const coord_t * coord,
6270 + const reiser4_item_data * data, int inserting);
6271 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6272 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6273 +
6274 +/* __CARRY_OPS_H__ */
6275 +#endif
6276 +
6277 +/* Make Linus happy.
6278 + Local variables:
6279 + c-indentation-style: "K&R"
6280 + mode-name: "LC"
6281 + c-basic-offset: 8
6282 + tab-width: 8
6283 + fill-column: 120
6284 + scroll-step: 1
6285 + End:
6286 +*/
6287 diff -urN linux-2.6.22.orig/fs/reiser4/context.c linux-2.6.22/fs/reiser4/context.c
6288 --- linux-2.6.22.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6289 +++ linux-2.6.22/fs/reiser4/context.c 2007-07-29 00:25:34.832685088 +0400
6290 @@ -0,0 +1,288 @@
6291 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6292 +
6293 +/* Manipulation of reiser4_context */
6294 +
6295 +/*
6296 + * global context used during system call. Variable of this type is allocated
6297 + * on the stack at the beginning of the reiser4 part of the system call and
6298 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6299 + * passing pointer to current transaction and current lockstack (both in
6300 + * one-to-one mapping with threads) all over the call chain.
6301 + *
6302 + * It's kind of like those global variables the prof used to tell you not to
6303 + * use in CS1, except thread specific. ;-) Nikita, this was a good idea.
6304 + *
6305 + * In some situations it is desirable to have ability to enter reiser4_context
6306 + * more than once for the same thread (nested contexts). For example, there
6307 + * are some functions that can be called either directly from VFS/VM or from
6308 + * already active reiser4 context (->writepage, for example).
6309 + *
6310 + * In such situations "child" context acts like dummy: all activity is
6311 + * actually performed in the top level context, and get_current_context()
6312 + * always returns top level context.
6313 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6314 + * nested anyway.
6315 + *
6316 + * Note that there is an important difference between the way reiser4 uses
6317 + * ->fs_context and the way other file systems use it. Other file systems
6318 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6319 + * (this is why ->fs_context was initially called ->journal_info). This means
6320 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6321 + * to the file system, they assume that some transaction is already underway,
6322 + * and usually bail out, because starting nested transaction would most likely
6323 + * lead to the deadlock. This gives false positives with reiser4, because we
6324 + * set ->fs_context before starting transaction.
6325 + */
6326 +
6327 +#include "debug.h"
6328 +#include "super.h"
6329 +#include "context.h"
6330 +
6331 +#include <linux/writeback.h> /* balance_dirty_pages() */
6332 +#include <linux/hardirq.h>
6333 +
6334 +static void _reiser4_init_context(reiser4_context * context,
6335 + struct super_block *super)
6336 +{
6337 + memset(context, 0, sizeof(*context));
6338 +
6339 + context->super = super;
6340 + context->magic = context_magic;
6341 + context->outer = current->journal_info;
6342 + current->journal_info = (void *)context;
6343 + context->nr_children = 0;
6344 + context->gfp_mask = GFP_KERNEL;
6345 +
6346 + init_lock_stack(&context->stack);
6347 +
6348 + reiser4_txn_begin(context);
6349 +
6350 + /* initialize head of tap list */
6351 + INIT_LIST_HEAD(&context->taps);
6352 +#if REISER4_DEBUG
6353 + context->task = current;
6354 +#endif
6355 + grab_space_enable();
6356 +}
6357 +
6358 +/* initialize context and bind it to the current thread
6359 +
6360 + This function should be called at the beginning of reiser4 part of
6361 + syscall.
6362 +*/
6363 +reiser4_context * reiser4_init_context(struct super_block * super)
6364 +{
6365 + reiser4_context *context;
6366 +
6367 + assert("nikita-2662", !in_interrupt() && !in_irq());
6368 + assert("nikita-3357", super != NULL);
6369 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6370 +
6371 + context = get_current_context_check();
6372 + if (context && context->super == super) {
6373 + context = (reiser4_context *) current->journal_info;
6374 + context->nr_children++;
6375 + return context;
6376 + }
6377 +
6378 + context = kmalloc(sizeof(*context), GFP_KERNEL);
6379 + if (context == NULL)
6380 + return ERR_PTR(RETERR(-ENOMEM));
6381 +
6382 + _reiser4_init_context(context, super);
6383 + return context;
6384 +}
6385 +
6386 +/* this is used in scan_mgr which is called with spinlock held and in
6387 + reiser4_fill_super magic */
6388 +void init_stack_context(reiser4_context *context, struct super_block *super)
6389 +{
6390 + assert("nikita-2662", !in_interrupt() && !in_irq());
6391 + assert("nikita-3357", super != NULL);
6392 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6393 + assert("vs-12", !is_in_reiser4_context());
6394 +
6395 + _reiser4_init_context(context, super);
6396 + context->on_stack = 1;
6397 + return;
6398 +}
6399 +
6400 +/* cast lock stack embedded into reiser4 context up to its container */
6401 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6402 +{
6403 + return container_of(owner, reiser4_context, stack);
6404 +}
6405 +
6406 +/* true if there is already _any_ reiser4 context for the current thread */
6407 +int is_in_reiser4_context(void)
6408 +{
6409 + reiser4_context *ctx;
6410 +
6411 + ctx = current->journal_info;
6412 + return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6413 +}
6414 +
6415 +/*
6416 + * call balance dirty pages for the current context.
6417 + *
6418 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6419 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6420 + * write; this covers the vast majority of all dirty traffic), but we cannot do
6421 + * this immediately when a formatted node is dirtied, because a long-term lock
6422 + * is usually held at that time. To work around this, dirtying a formatted node
6423 + * simply increases the ->nr_marked_dirty counter in the current reiser4
6424 + * context. When we are about to leave this context,
6425 + * balance_dirty_pages_ratelimited() is called, if necessary.
6426 + *
6427 + * This introduces another problem: sometimes we do not want to run
6428 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6429 + * because some important lock (like ->i_mutex on the parent directory) is
6430 + * held. To achieve this, the ->nobalance flag can be set in the current context.
6431 + */
6432 +static void balance_dirty_pages_at(reiser4_context *context)
6433 +{
6434 + reiser4_super_info_data *sbinfo = get_super_private(context->super);
6435 +
6436 + /*
6437 + * call balance_dirty_pages_ratelimited() to process formatted nodes
6438 + * dirtied during this system call. Do that only if we are not in mount
6439 + * and there were nodes dirtied in this context and we are not in
6440 + * writepage (to avoid deadlock) and not in pdflush
6441 + */
6442 + if (sbinfo != NULL && sbinfo->fake != NULL &&
6443 + context->nr_marked_dirty != 0 &&
6444 + !(current->flags & PF_MEMALLOC) &&
6445 + !current_is_pdflush())
6446 + balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6447 +}
6448 +
6449 +/* release resources associated with context.
6450 +
6451 + This function should be called at the end of a "session" with reiser4,
6452 + typically just before leaving the reiser4 driver back to VFS.
6453 +
6454 + This is a good place to put some debugging consistency checks, e.g. that
6455 + the thread has released all locks and closed its transcrash.
6456 +
6457 +*/
6458 +static void reiser4_done_context(reiser4_context * context /* context being released */ )
6459 +{
6460 + assert("nikita-860", context != NULL);
6461 + assert("nikita-859", context->magic == context_magic);
6462 + assert("vs-646", (reiser4_context *) current->journal_info == context);
6463 + assert("zam-686", !in_interrupt() && !in_irq());
6464 +
6465 + /* only do anything when leaving top-level reiser4 context. All nested
6466 + * contexts are just dummies. */
6467 + if (context->nr_children == 0) {
6468 + assert("jmacd-673", context->trans == NULL);
6469 + assert("jmacd-1002", lock_stack_isclean(&context->stack));
6470 + assert("nikita-1936", reiser4_no_counters_are_held());
6471 + assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6472 + assert("zam-1004", ergo(get_super_private(context->super),
6473 + get_super_private(context->super)->delete_mutex_owner !=
6474 + current));
6475 +
6476 + /* release all grabbed but as yet unused blocks */
6477 + if (context->grabbed_blocks != 0)
6478 + all_grabbed2free();
6479 +
6480 + /*
6481 + * synchronize against longterm_unlock_znode():
6482 + * wake_up_requestor() wakes up requestors without holding
6483 + * zlock (otherwise they will immediately bump into that lock
6484 + * after wake-up on another CPU). To work around the (rare)
6485 + * situation where a requestor has been woken up asynchronously
6486 + * and managed to run until completion (and destroy its
6487 + * context and lock stack) before wake_up_requestor() called
6488 + * wake_up() on it, wake_up_requestor() synchronizes on the lock
6489 + * stack spin lock. It has actually been observed that the spin
6490 + * lock _was_ locked at this point, because
6491 + * wake_up_requestor() took an interrupt.
6492 + */
6493 + spin_lock_stack(&context->stack);
6494 + spin_unlock_stack(&context->stack);
6495 +
6496 + assert("zam-684", context->nr_children == 0);
6497 + /* restore original ->fs_context value */
6498 + current->journal_info = context->outer;
6499 + if (context->on_stack == 0)
6500 + kfree(context);
6501 + } else {
6502 + context->nr_children--;
6503 +#if REISER4_DEBUG
6504 + assert("zam-685", context->nr_children >= 0);
6505 +#endif
6506 + }
6507 +}
6508 +
6509 +/*
6510 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6511 + * transaction. Call done_context() to do context related book-keeping.
6512 + */
6513 +void reiser4_exit_context(reiser4_context * context)
6514 +{
6515 + assert("nikita-3021", reiser4_schedulable());
6516 +
6517 + if (context->nr_children == 0) {
6518 + if (!context->nobalance) {
6519 + reiser4_txn_restart(context);
6520 + balance_dirty_pages_at(context);
6521 + }
6522 +
6523 + /* if the filesystem is mounted with -o sync or -o dirsync, commit
6524 + the transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6525 + committing on exit_context when an inode semaphore is held, and
6526 + to have ktxnmgrd do the commit instead for better
6527 + concurrent filesystem access. But when one mounts with -o
6528 + sync, one cares more about reliability than about
6529 + performance. So, for now, we have this simple mount -o sync
6530 + support. */
6531 + if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6532 + txn_atom *atom;
6533 +
6534 + atom = get_current_atom_locked_nocheck();
6535 + if (atom) {
6536 + atom->flags |= ATOM_FORCE_COMMIT;
6537 + context->trans->flags &= ~TXNH_DONT_COMMIT;
6538 + spin_unlock_atom(atom);
6539 + }
6540 + }
6541 + reiser4_txn_end(context);
6542 + }
6543 + reiser4_done_context(context);
6544 +}
6545 +
6546 +void reiser4_ctx_gfp_mask_set(void)
6547 +{
6548 + reiser4_context *ctx;
6549 +
6550 + ctx = get_current_context();
6551 + if (ctx->entd == 0 &&
6552 + list_empty(&ctx->stack.locks) &&
6553 + ctx->trans->atom == NULL)
6554 + ctx->gfp_mask = GFP_KERNEL;
6555 + else
6556 + ctx->gfp_mask = GFP_NOFS;
6557 +}
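As an illustration of the effect of the mask selection above (an editorial sketch, not part of the patch; the helper name is hypothetical), allocation sites simply ask the context for the right mask:

static void *reiser4_alloc_scratch(size_t size)
{
	/*
	 * reiser4_ctx_gfp_mask_get() returns GFP_KERNEL only while the
	 * thread holds no long-term locks and has no atom open; otherwise
	 * it returns GFP_NOFS, so memory reclaim cannot re-enter reiser4
	 * and deadlock.
	 */
	return kmalloc(size, reiser4_ctx_gfp_mask_get());
}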
6558 +
6559 +void reiser4_ctx_gfp_mask_force (gfp_t mask)
6560 +{
6561 + reiser4_context *ctx;
6562 + ctx = get_current_context();
6563 +
6564 + assert("edward-1454", ctx != NULL);
6565 +
6566 + ctx->gfp_mask = mask;
6567 +}
6568 +
6569 +/*
6570 + * Local variables:
6571 + * c-indentation-style: "K&R"
6572 + * mode-name: "LC"
6573 + * c-basic-offset: 8
6574 + * tab-width: 8
6575 + * fill-column: 120
6576 + * scroll-step: 1
6577 + * End:
6578 + */
6579 diff -urN linux-2.6.22.orig/fs/reiser4/context.h linux-2.6.22/fs/reiser4/context.h
6580 --- linux-2.6.22.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6581 +++ linux-2.6.22/fs/reiser4/context.h 2007-07-29 00:25:34.832685088 +0400
6582 @@ -0,0 +1,228 @@
6583 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6584 + * reiser4/README */
6585 +
6586 +/* Reiser4 context. See context.c for details. */
6587 +
6588 +#if !defined( __REISER4_CONTEXT_H__ )
6589 +#define __REISER4_CONTEXT_H__
6590 +
6591 +#include "forward.h"
6592 +#include "debug.h"
6593 +#include "dformat.h"
6594 +#include "tap.h"
6595 +#include "lock.h"
6596 +
6597 +#include <linux/types.h> /* for __u?? */
6598 +#include <linux/fs.h> /* for struct super_block */
6599 +#include <linux/spinlock.h>
6600 +#include <linux/sched.h> /* for struct task_struct */
6601 +
6602 +/* reiser4 per-thread context */
6603 +struct reiser4_context {
6604 + /* magic constant. For identification of reiser4 contexts. */
6605 + __u32 magic;
6606 +
6607 + /* current lock stack. See lock.[ch]. This is where the list of all
6608 + locks taken by the current thread is kept. It is also used in
6609 + deadlock detection. */
6610 + lock_stack stack;
6611 +
6612 + /* current transcrash. */
6613 + txn_handle *trans;
6614 + /* transaction handle embedded into reiser4_context. ->trans points
6615 + * here by default. */
6616 + txn_handle trans_in_ctx;
6617 +
6618 + /* super block we are working with. To get the current tree
6619 + use &get_super_private (reiser4_get_current_sb ())->tree. */
6620 + struct super_block *super;
6621 +
6622 + /* parent fs activation */
6623 + struct fs_activation *outer;
6624 +
6625 + /* per-thread grabbed (for further allocation) blocks counter */
6626 + reiser4_block_nr grabbed_blocks;
6627 +
6628 + /* list of taps currently monitored. See tap.c */
6629 + struct list_head taps;
6630 +
6631 + /* grabbing space is enabled */
6632 + unsigned int grab_enabled:1;
6633 + /* should be set when we write dirty nodes to disk in jnode_flush or
6634 + * reiser4_write_logs() */
6635 + unsigned int writeout_mode:1;
6636 + /* true, if current thread is an ent thread */
6637 + unsigned int entd:1;
6638 + /* true, if balance_dirty_pages() should not be run when leaving this
6639 + * context. This is used to avoid a lengthy balance_dirty_pages()
6640 + * operation when holding some important resource, like directory
6641 + * ->i_mutex */
6642 + unsigned int nobalance:1;
6643 +
6644 + /* this bit is used in reiser4_done_context to decide whether the context
6645 + is kmalloc-ed and has to be kfree-ed */
6646 + unsigned int on_stack:1;
6647 +
6648 + /* count non-trivial jnode_set_dirty() calls */
6649 + unsigned long nr_marked_dirty;
6650 +
6651 + /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6652 + * reiser4_writepages for each dirty inode. Reiser4_writepages
6653 + * captures pages. When the number of pages captured in one
6654 + * reiser4_sync_inodes call reaches some threshold, some atoms get
6655 + * flushed */
6656 + int nr_captured;
6657 + int nr_children; /* number of child contexts */
6658 +#if REISER4_DEBUG
6659 + /* debugging information about reiser4 locks held by the current
6660 + * thread */
6661 + reiser4_lock_cnt_info locks;
6662 + struct task_struct *task; /* so we can easily find owner of the stack */
6663 +
6664 + /*
6665 + * disk space grabbing debugging support
6666 + */
6667 + /* how many disk blocks were grabbed by the first call to
6668 + * reiser4_grab_space() in this context */
6669 + reiser4_block_nr grabbed_initially;
6670 +
6671 + /* list of all threads doing flush currently */
6672 + struct list_head flushers_link;
6673 + /* information about last error encountered by reiser4 */
6674 + err_site err;
6675 +#endif
6676 + void *vp;
6677 + gfp_t gfp_mask;
6678 +};
6679 +
6680 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6681 +
6682 +/* Debugging helpers. */
6683 +#if REISER4_DEBUG
6684 +extern void print_contexts(void);
6685 +#endif
6686 +
6687 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6688 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
6689 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6690 +
6691 +extern reiser4_context *reiser4_init_context(struct super_block *);
6692 +extern void init_stack_context(reiser4_context *, struct super_block *);
6693 +extern void reiser4_exit_context(reiser4_context *);
6694 +
6695 +/* magic constant we store in a reiser4_context allocated on the stack. Used to
6696 + catch accesses to stale or uninitialized contexts. */
6697 +#define context_magic ((__u32) 0x4b1b5d0b)
6698 +
6699 +extern int is_in_reiser4_context(void);
6700 +
6701 +/*
6702 + * return reiser4_context for the thread @tsk
6703 + */
6704 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6705 +{
6706 + assert("vs-1682",
6707 + ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6708 + return (reiser4_context *) tsk->journal_info;
6709 +}
6710 +
6711 +/*
6712 + * return reiser4 context of the current thread, or NULL if there is none.
6713 + */
6714 +static inline reiser4_context *get_current_context_check(void)
6715 +{
6716 + if (is_in_reiser4_context())
6717 + return get_context(current);
6718 + else
6719 + return NULL;
6720 +}
6721 +
6722 +static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6723 +
6724 +/* return context associated with current thread */
6725 +static inline reiser4_context *get_current_context(void)
6726 +{
6727 + return get_context(current);
6728 +}
6729 +
6730 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6731 +{
6732 + reiser4_context *ctx;
6733 +
6734 + ctx = get_current_context_check();
6735 + return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6736 +}
6737 +
6738 +void reiser4_ctx_gfp_mask_set(void);
6739 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
6740 +
6741 +/*
6742 + * true if current thread is in the write-out mode. Thread enters write-out
6743 + * mode during jnode_flush and reiser4_write_logs().
6744 + */
6745 +static inline int is_writeout_mode(void)
6746 +{
6747 + return get_current_context()->writeout_mode;
6748 +}
6749 +
6750 +/*
6751 + * enter write-out mode
6752 + */
6753 +static inline void writeout_mode_enable(void)
6754 +{
6755 + assert("zam-941", !get_current_context()->writeout_mode);
6756 + get_current_context()->writeout_mode = 1;
6757 +}
6758 +
6759 +/*
6760 + * leave write-out mode
6761 + */
6762 +static inline void writeout_mode_disable(void)
6763 +{
6764 + assert("zam-942", get_current_context()->writeout_mode);
6765 + get_current_context()->writeout_mode = 0;
6766 +}
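A short usage sketch (editorial, not part of the patch): flush and log-writing paths are expected to bracket their I/O submission with the two helpers above, so that other code can test is_writeout_mode() and avoid re-entering flush:

	writeout_mode_enable();
	/* ... write dirty jnodes to disk, as in jnode_flush() or
	 * reiser4_write_logs() ... */
	writeout_mode_disable();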
6767 +
6768 +static inline void grab_space_enable(void)
6769 +{
6770 + get_current_context()->grab_enabled = 1;
6771 +}
6772 +
6773 +static inline void grab_space_disable(void)
6774 +{
6775 + get_current_context()->grab_enabled = 0;
6776 +}
6777 +
6778 +static inline void grab_space_set_enabled(int enabled)
6779 +{
6780 + get_current_context()->grab_enabled = enabled;
6781 +}
6782 +
6783 +static inline int is_grab_enabled(reiser4_context * ctx)
6784 +{
6785 + return ctx->grab_enabled;
6786 +}
6787 +
6788 +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6789 + * flush would be performed when it is closed. This is necessary when handle
6790 + * has to be closed under some coarse semaphore, like i_mutex of
6791 + * directory. Commit will be performed by ktxnmgrd. */
6792 +static inline void context_set_commit_async(reiser4_context * context)
6793 +{
6794 + context->nobalance = 1;
6795 + context->trans->flags |= TXNH_DONT_COMMIT;
6796 +}
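A usage sketch for the helper above (editorial, code fragment only): a directory operation that must close its transaction while the parent's ->i_mutex is held marks the handle first, so that reiser4_exit_context() neither commits synchronously nor runs balance_dirty_pages():

	/* called with the parent directory's ->i_mutex held */
	context_set_commit_async(ctx);
	reiser4_exit_context(ctx);	/* the commit is left to ktxnmgrd */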
6797 +
6798 +/* __REISER4_CONTEXT_H__ */
6799 +#endif
6800 +
6801 +/* Make Linus happy.
6802 + Local variables:
6803 + c-indentation-style: "K&R"
6804 + mode-name: "LC"
6805 + c-basic-offset: 8
6806 + tab-width: 8
6807 + fill-column: 120
6808 + scroll-step: 1
6809 + End:
6810 +*/
6811 diff -urN linux-2.6.22.orig/fs/reiser4/coord.c linux-2.6.22/fs/reiser4/coord.c
6812 --- linux-2.6.22.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6813 +++ linux-2.6.22/fs/reiser4/coord.c 2007-07-29 00:25:34.832685088 +0400
6814 @@ -0,0 +1,935 @@
6815 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6816 +
6817 +#include "forward.h"
6818 +#include "debug.h"
6819 +#include "dformat.h"
6820 +#include "tree.h"
6821 +#include "plugin/item/item.h"
6822 +#include "znode.h"
6823 +#include "coord.h"
6824 +
6825 +/* Internal constructor. */
6826 +static inline void
6827 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6828 + pos_in_node_t unit_pos, between_enum between)
6829 +{
6830 + coord->node = (znode *) node;
6831 + coord_set_item_pos(coord, item_pos);
6832 + coord->unit_pos = unit_pos;
6833 + coord->between = between;
6834 + ON_DEBUG(coord->plug_v = 0);
6835 + ON_DEBUG(coord->body_v = 0);
6836 +
6837 + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6838 +}
6839 +
6840 +/* after shifting of node content, a coord previously set properly may become
6841 + invalid; try to "normalize" it. */
6842 +void coord_normalize(coord_t * coord)
6843 +{
6844 + znode *node;
6845 +
6846 + node = coord->node;
6847 + assert("vs-683", node);
6848 +
6849 + coord_clear_iplug(coord);
6850 +
6851 + if (node_is_empty(node)) {
6852 + coord_init_first_unit(coord, node);
6853 + } else if ((coord->between == AFTER_ITEM)
6854 + || (coord->between == AFTER_UNIT)) {
6855 + return;
6856 + } else if (coord->item_pos == coord_num_items(coord)
6857 + && coord->between == BEFORE_ITEM) {
6858 + coord_dec_item_pos(coord);
6859 + coord->between = AFTER_ITEM;
6860 + } else if (coord->unit_pos == coord_num_units(coord)
6861 + && coord->between == BEFORE_UNIT) {
6862 + coord->unit_pos--;
6863 + coord->between = AFTER_UNIT;
6864 + } else if (coord->item_pos == coord_num_items(coord)
6865 + && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6866 + coord_dec_item_pos(coord);
6867 + coord->unit_pos = 0;
6868 + coord->between = AFTER_ITEM;
6869 + }
6870 +}
6871 +
6872 +/* Copy a coordinate. */
6873 +void coord_dup(coord_t * coord, const coord_t * old_coord)
6874 +{
6875 + assert("jmacd-9800", coord_check(old_coord));
6876 + coord_dup_nocheck(coord, old_coord);
6877 +}
6878 +
6879 +/* Copy a coordinate without check. Useful when old_coord->node is not
6880 + loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6881 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6882 +{
6883 + coord->node = old_coord->node;
6884 + coord_set_item_pos(coord, old_coord->item_pos);
6885 + coord->unit_pos = old_coord->unit_pos;
6886 + coord->between = old_coord->between;
6887 + coord->iplugid = old_coord->iplugid;
6888 + ON_DEBUG(coord->plug_v = old_coord->plug_v);
6889 + ON_DEBUG(coord->body_v = old_coord->body_v);
6890 +}
6891 +
6892 +/* Initialize an invalid coordinate. */
6893 +void coord_init_invalid(coord_t * coord, const znode * node)
6894 +{
6895 + coord_init_values(coord, node, 0, 0, INVALID_COORD);
6896 +}
6897 +
6898 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6899 +{
6900 + coord_init_values(coord, node, 0, 0, AT_UNIT);
6901 +}
6902 +
6903 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
6904 + empty, it is positioned at the EMPTY_NODE. */
6905 +void coord_init_first_unit(coord_t * coord, const znode * node)
6906 +{
6907 + int is_empty = node_is_empty(node);
6908 +
6909 + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6910 +
6911 + assert("jmacd-9801", coord_check(coord));
6912 +}
6913 +
6914 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
6915 + empty, it is positioned at the EMPTY_NODE. */
6916 +void coord_init_last_unit(coord_t * coord, const znode * node)
6917 +{
6918 + int is_empty = node_is_empty(node);
6919 +
6920 + coord_init_values(coord, node,
6921 + (is_empty ? 0 : node_num_items(node) - 1), 0,
6922 + (is_empty ? EMPTY_NODE : AT_UNIT));
6923 + if (!is_empty)
6924 + coord->unit_pos = coord_last_unit_pos(coord);
6925 + assert("jmacd-9802", coord_check(coord));
6926 +}
6927 +
6928 +/* Initialize a coordinate to before the first item. If the node is empty, it is
6929 + positioned at the EMPTY_NODE. */
6930 +void coord_init_before_first_item(coord_t * coord, const znode * node)
6931 +{
6932 + int is_empty = node_is_empty(node);
6933 +
6934 + coord_init_values(coord, node, 0, 0,
6935 + (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6936 +
6937 + assert("jmacd-9803", coord_check(coord));
6938 +}
6939 +
6940 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6941 + at the EMPTY_NODE. */
6942 +void coord_init_after_last_item(coord_t * coord, const znode * node)
6943 +{
6944 + int is_empty = node_is_empty(node);
6945 +
6946 + coord_init_values(coord, node,
6947 + (is_empty ? 0 : node_num_items(node) - 1), 0,
6948 + (is_empty ? EMPTY_NODE : AFTER_ITEM));
6949 +
6950 + assert("jmacd-9804", coord_check(coord));
6951 +}
6952 +
6953 +/* Initialize a coordinate to after last unit in the item. Coord must be set
6954 + already to existing item */
6955 +void coord_init_after_item_end(coord_t * coord)
6956 +{
6957 + coord->between = AFTER_UNIT;
6958 + coord->unit_pos = coord_last_unit_pos(coord);
6959 +}
6960 +
6961 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
6962 +void coord_init_before_item(coord_t * coord)
6963 +{
6964 + coord->unit_pos = 0;
6965 + coord->between = BEFORE_ITEM;
6966 +}
6967 +
6968 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
6969 +void coord_init_after_item(coord_t * coord)
6970 +{
6971 + coord->unit_pos = 0;
6972 + coord->between = AFTER_ITEM;
6973 +}
6974 +
6975 +/* Initialize a coordinate with 0s. Used in places where init_coord was used
6976 + and it was not clear how exactly it should be initialized. */
6977 +void coord_init_zero(coord_t * coord)
6978 +{
6979 + memset(coord, 0, sizeof(*coord));
6980 +}
6981 +
6982 +/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
6983 +unsigned coord_num_units(const coord_t * coord)
6984 +{
6985 + assert("jmacd-9806", coord_is_existing_item(coord));
6986 +
6987 + return item_plugin_by_coord(coord)->b.nr_units(coord);
6988 +}
6989 +
6990 +/* Returns true if the coord was initialized by coord_init_invalid(). */
6991 +/* Audited by: green(2002.06.15) */
6992 +int coord_is_invalid(const coord_t * coord)
6993 +{
6994 + return coord->between == INVALID_COORD;
6995 +}
6996 +
6997 +/* Returns true if the coordinate is positioned at an existing item, not before or after
6998 + an item. It may be placed at, before, or after any unit within the item, whether
6999 + existing or not. */
7000 +int coord_is_existing_item(const coord_t * coord)
7001 +{
7002 + switch (coord->between) {
7003 + case EMPTY_NODE:
7004 + case BEFORE_ITEM:
7005 + case AFTER_ITEM:
7006 + case INVALID_COORD:
7007 + return 0;
7008 +
7009 + case BEFORE_UNIT:
7010 + case AT_UNIT:
7011 + case AFTER_UNIT:
7012 + return coord->item_pos < coord_num_items(coord);
7013 + }
7014 +
7015 + impossible("jmacd-9900", "unreachable coord: %p", coord);
7016 + return 0;
7017 +}
7018 +
7019 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7020 + unit. */
7021 +/* Audited by: green(2002.06.15) */
7022 +int coord_is_existing_unit(const coord_t * coord)
7023 +{
7024 + switch (coord->between) {
7025 + case EMPTY_NODE:
7026 + case BEFORE_UNIT:
7027 + case AFTER_UNIT:
7028 + case BEFORE_ITEM:
7029 + case AFTER_ITEM:
7030 + case INVALID_COORD:
7031 + return 0;
7032 +
7033 + case AT_UNIT:
7034 + return (coord->item_pos < coord_num_items(coord)
7035 + && coord->unit_pos < coord_num_units(coord));
7036 + }
7037 +
7038 + impossible("jmacd-9902", "unreachable");
7039 + return 0;
7040 +}
7041 +
7042 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7043 + true for empty nodes nor coordinates positioned before the first item. */
7044 +/* Audited by: green(2002.06.15) */
7045 +int coord_is_leftmost_unit(const coord_t * coord)
7046 +{
7047 + return (coord->between == AT_UNIT && coord->item_pos == 0
7048 + && coord->unit_pos == 0);
7049 +}
7050 +
7051 +#if REISER4_DEBUG
7052 +/* For assertions only, checks for a valid coordinate. */
7053 +int coord_check(const coord_t * coord)
7054 +{
7055 + if (coord->node == NULL) {
7056 + return 0;
7057 + }
7058 + if (znode_above_root(coord->node))
7059 + return 1;
7060 +
7061 + switch (coord->between) {
7062 + default:
7063 + case INVALID_COORD:
7064 + return 0;
7065 + case EMPTY_NODE:
7066 + if (!node_is_empty(coord->node)) {
7067 + return 0;
7068 + }
7069 + return coord->item_pos == 0 && coord->unit_pos == 0;
7070 +
7071 + case BEFORE_UNIT:
7072 + case AFTER_UNIT:
7073 + if (node_is_empty(coord->node) && (coord->item_pos == 0)
7074 + && (coord->unit_pos == 0))
7075 + return 1;
7076 + case AT_UNIT:
7077 + break;
7078 + case AFTER_ITEM:
7079 + case BEFORE_ITEM:
7080 + /* before/after item should not set unit_pos. */
7081 + if (coord->unit_pos != 0) {
7082 + return 0;
7083 + }
7084 + break;
7085 + }
7086 +
7087 + if (coord->item_pos >= node_num_items(coord->node)) {
7088 + return 0;
7089 + }
7090 +
7091 + /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7092 + between is set to either AFTER_ITEM or BEFORE_ITEM */
7093 + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7094 + return 1;
7095 +
7096 + if (coord_is_iplug_set(coord) &&
7097 + coord->unit_pos >
7098 + item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7099 + return 0;
7100 + }
7101 + return 1;
7102 +}
7103 +#endif
7104 +
7105 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7106 + Returns 1 if the new position does not exist. */
7107 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7108 +{
7109 + /* If the node is invalid, leave it. */
7110 + if (coord->between == INVALID_COORD) {
7111 + return 1;
7112 + }
7113 +
7114 + /* If the node is empty, set it appropriately. */
7115 + if (items == 0) {
7116 + coord->between = EMPTY_NODE;
7117 + coord_set_item_pos(coord, 0);
7118 + coord->unit_pos = 0;
7119 + return 1;
7120 + }
7121 +
7122 + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7123 + if (coord->between == EMPTY_NODE) {
7124 + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7125 + coord_set_item_pos(coord, 0);
7126 + coord->unit_pos = 0;
7127 + return 0;
7128 + }
7129 +
7130 + /* If the item_pos is out-of-range, set it appropriately. */
7131 + if (coord->item_pos >= items) {
7132 + coord->between = AFTER_ITEM;
7133 + coord_set_item_pos(coord, items - 1);
7134 + coord->unit_pos = 0;
7135 + /* If is_next, return 1 (can't go any further). */
7136 + return is_next;
7137 + }
7138 +
7139 + return 0;
7140 +}
7141 +
7142 +/* Advances the coordinate by one unit to the right. If empty, no change. If
7143 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7144 + existing unit. */
7145 +int coord_next_unit(coord_t * coord)
7146 +{
7147 + unsigned items = coord_num_items(coord);
7148 +
7149 + if (coord_adjust_items(coord, items, 1) == 1) {
7150 + return 1;
7151 + }
7152 +
7153 + switch (coord->between) {
7154 + case BEFORE_UNIT:
7155 + /* Now it is positioned at the same unit. */
7156 + coord->between = AT_UNIT;
7157 + return 0;
7158 +
7159 + case AFTER_UNIT:
7160 + case AT_UNIT:
7161 + /* If it was at or after a unit and there are more units in this item,
7162 + advance to the next one. */
7163 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7164 + coord->unit_pos += 1;
7165 + coord->between = AT_UNIT;
7166 + return 0;
7167 + }
7168 +
7169 + /* Otherwise, it is crossing an item boundary and treated as if it was
7170 + after the current item. */
7171 + coord->between = AFTER_ITEM;
7172 + coord->unit_pos = 0;
7173 + /* FALLTHROUGH */
7174 +
7175 + case AFTER_ITEM:
7176 + /* Check for end-of-node. */
7177 + if (coord->item_pos == items - 1) {
7178 + return 1;
7179 + }
7180 +
7181 + coord_inc_item_pos(coord);
7182 + coord->unit_pos = 0;
7183 + coord->between = AT_UNIT;
7184 + return 0;
7185 +
7186 + case BEFORE_ITEM:
7187 + /* The adjust_items checks ensure that we are valid here. */
7188 + coord->unit_pos = 0;
7189 + coord->between = AT_UNIT;
7190 + return 0;
7191 +
7192 + case INVALID_COORD:
7193 + case EMPTY_NODE:
7194 + /* Handled in coord_adjust_items(). */
7195 + break;
7196 + }
7197 +
7198 + impossible("jmacd-9902", "unreachable");
7199 + return 0;
7200 +}
7201 +
7202 +/* Advances the coordinate by one item to the right. If empty, no change. If
7203 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7204 + an existing item. */
7205 +int coord_next_item(coord_t * coord)
7206 +{
7207 + unsigned items = coord_num_items(coord);
7208 +
7209 + if (coord_adjust_items(coord, items, 1) == 1) {
7210 + return 1;
7211 + }
7212 +
7213 + switch (coord->between) {
7214 + case AFTER_UNIT:
7215 + case AT_UNIT:
7216 + case BEFORE_UNIT:
7217 + case AFTER_ITEM:
7218 + /* Check for end-of-node. */
7219 + if (coord->item_pos == items - 1) {
7220 + coord->between = AFTER_ITEM;
7221 + coord->unit_pos = 0;
7222 + coord_clear_iplug(coord);
7223 + return 1;
7224 + }
7225 +
7226 + /* Anywhere in an item, go to the next one. */
7227 + coord->between = AT_UNIT;
7228 + coord_inc_item_pos(coord);
7229 + coord->unit_pos = 0;
7230 + return 0;
7231 +
7232 + case BEFORE_ITEM:
7233 + /* The out-of-range check ensures that we are valid here. */
7234 + coord->unit_pos = 0;
7235 + coord->between = AT_UNIT;
7236 + return 0;
7237 + case INVALID_COORD:
7238 + case EMPTY_NODE:
7239 + /* Handled in coord_adjust_items(). */
7240 + break;
7241 + }
7242 +
7243 + impossible("jmacd-9903", "unreachable");
7244 + return 0;
7245 +}
7246 +
7247 +/* Advances the coordinate by one unit to the left. If empty, no change. If
7248 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7249 + is an existing unit. */
7250 +int coord_prev_unit(coord_t * coord)
7251 +{
7252 + unsigned items = coord_num_items(coord);
7253 +
7254 + if (coord_adjust_items(coord, items, 0) == 1) {
7255 + return 1;
7256 + }
7257 +
7258 + switch (coord->between) {
7259 + case AT_UNIT:
7260 + case BEFORE_UNIT:
7261 + if (coord->unit_pos > 0) {
7262 + coord->unit_pos -= 1;
7263 + coord->between = AT_UNIT;
7264 + return 0;
7265 + }
7266 +
7267 + if (coord->item_pos == 0) {
7268 + coord->between = BEFORE_ITEM;
7269 + return 1;
7270 + }
7271 +
7272 + coord_dec_item_pos(coord);
7273 + coord->unit_pos = coord_last_unit_pos(coord);
7274 + coord->between = AT_UNIT;
7275 + return 0;
7276 +
7277 + case AFTER_UNIT:
7278 + /* What if unit_pos is out-of-range? */
7279 + assert("jmacd-5442",
7280 + coord->unit_pos <= coord_last_unit_pos(coord));
7281 + coord->between = AT_UNIT;
7282 + return 0;
7283 +
7284 + case BEFORE_ITEM:
7285 + if (coord->item_pos == 0) {
7286 + return 1;
7287 + }
7288 +
7289 + coord_dec_item_pos(coord);
7290 + /* FALLTHROUGH */
7291 +
7292 + case AFTER_ITEM:
7293 + coord->between = AT_UNIT;
7294 + coord->unit_pos = coord_last_unit_pos(coord);
7295 + return 0;
7296 +
7297 + case INVALID_COORD:
7298 + case EMPTY_NODE:
7299 + break;
7300 + }
7301 +
7302 + impossible("jmacd-9904", "unreachable");
7303 + return 0;
7304 +}
7305 +
7306 +/* Advances the coordinate by one item to the left. If empty, no change. If
7307 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7308 + is an existing item. */
7309 +int coord_prev_item(coord_t * coord)
7310 +{
7311 + unsigned items = coord_num_items(coord);
7312 +
7313 + if (coord_adjust_items(coord, items, 0) == 1) {
7314 + return 1;
7315 + }
7316 +
7317 + switch (coord->between) {
7318 + case AT_UNIT:
7319 + case AFTER_UNIT:
7320 + case BEFORE_UNIT:
7321 + case BEFORE_ITEM:
7322 +
7323 + if (coord->item_pos == 0) {
7324 + coord->between = BEFORE_ITEM;
7325 + coord->unit_pos = 0;
7326 + return 1;
7327 + }
7328 +
7329 + coord_dec_item_pos(coord);
7330 + coord->unit_pos = 0;
7331 + coord->between = AT_UNIT;
7332 + return 0;
7333 +
7334 + case AFTER_ITEM:
7335 + coord->between = AT_UNIT;
7336 + coord->unit_pos = 0;
7337 + return 0;
7338 +
7339 + case INVALID_COORD:
7340 + case EMPTY_NODE:
7341 + break;
7342 + }
7343 +
7344 + impossible("jmacd-9905", "unreachable");
7345 + return 0;
7346 +}
7347 +
7348 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7349 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7350 +{
7351 + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7352 + if (dir == LEFT_SIDE) {
7353 + coord_init_first_unit(coord, node);
7354 + } else {
7355 + coord_init_last_unit(coord, node);
7356 + }
7357 +}
7358 +
7359 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7360 + argument. */
7361 +/* Audited by: green(2002.06.15) */
7362 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7363 +{
7364 + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7365 + if (dir == LEFT_SIDE) {
7366 + return coord_is_before_leftmost(coord);
7367 + } else {
7368 + return coord_is_after_rightmost(coord);
7369 + }
7370 +}
7371 +
7372 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7373 +/* Audited by: green(2002.06.15) */
7374 +int coord_sideof_unit(coord_t * coord, sideof dir)
7375 +{
7376 + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7377 + if (dir == LEFT_SIDE) {
7378 + return coord_prev_unit(coord);
7379 + } else {
7380 + return coord_next_unit(coord);
7381 + }
7382 +}
7383 +
7384 +#if REISER4_DEBUG
7385 +int coords_equal(const coord_t * c1, const coord_t * c2)
7386 +{
7387 + assert("nikita-2840", c1 != NULL);
7388 + assert("nikita-2841", c2 != NULL);
7389 +
7390 + return
7391 + c1->node == c2->node &&
7392 + c1->item_pos == c2->item_pos &&
7393 + c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7394 +}
7395 +#endif /* REISER4_DEBUG */
7396 +
7397 +/* If coord_is_after_rightmost, return COORD_ON_THE_RIGHT; if coord_is_before_leftmost,
7398 + return COORD_ON_THE_LEFT; otherwise return COORD_INSIDE. */
7399 +/* Audited by: green(2002.06.15) */
7400 +coord_wrt_node coord_wrt(const coord_t * coord)
7401 +{
7402 + if (coord_is_before_leftmost(coord)) {
7403 + return COORD_ON_THE_LEFT;
7404 + }
7405 +
7406 + if (coord_is_after_rightmost(coord)) {
7407 + return COORD_ON_THE_RIGHT;
7408 + }
7409 +
7410 + return COORD_INSIDE;
7411 +}
7412 +
7413 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7414 + of the last item or it is an empty node. */
7415 +/* Audited by: green(2002.06.15) */
7416 +int coord_is_after_rightmost(const coord_t * coord)
7417 +{
7418 + assert("jmacd-7313", coord_check(coord));
7419 +
7420 + switch (coord->between) {
7421 + case INVALID_COORD:
7422 + case AT_UNIT:
7423 + case BEFORE_UNIT:
7424 + case BEFORE_ITEM:
7425 + return 0;
7426 +
7427 + case EMPTY_NODE:
7428 + return 1;
7429 +
7430 + case AFTER_ITEM:
7431 + return (coord->item_pos == node_num_items(coord->node) - 1);
7432 +
7433 + case AFTER_UNIT:
7434 + return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7435 + coord->unit_pos == coord_last_unit_pos(coord));
7436 + }
7437 +
7438 + impossible("jmacd-9908", "unreachable");
7439 + return 0;
7440 +}
7441 +
7442 +/* Returns true if the coordinate is positioned before the first item or it is an empty
7443 + node. */
7444 +int coord_is_before_leftmost(const coord_t * coord)
7445 +{
7446 + /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7447 + necessary to check if coord is set before leftmost
7448 + assert ("jmacd-7313", coord_check (coord)); */
7449 + switch (coord->between) {
7450 + case INVALID_COORD:
7451 + case AT_UNIT:
7452 + case AFTER_ITEM:
7453 + case AFTER_UNIT:
7454 + return 0;
7455 +
7456 + case EMPTY_NODE:
7457 + return 1;
7458 +
7459 + case BEFORE_ITEM:
7460 + case BEFORE_UNIT:
7461 + return (coord->item_pos == 0) && (coord->unit_pos == 0);
7462 + }
7463 +
7464 + impossible("jmacd-9908", "unreachable");
7465 + return 0;
7466 +}
7467 +
7468 +/* Returns true if the coordinate is positioned after an item, before an item, after the
7469 + last unit of an item, before the first unit of an item, or at an empty node. */
7470 +/* Audited by: green(2002.06.15) */
7471 +int coord_is_between_items(const coord_t * coord)
7472 +{
7473 + assert("jmacd-7313", coord_check(coord));
7474 +
7475 + switch (coord->between) {
7476 + case INVALID_COORD:
7477 + case AT_UNIT:
7478 + return 0;
7479 +
7480 + case AFTER_ITEM:
7481 + case BEFORE_ITEM:
7482 + case EMPTY_NODE:
7483 + return 1;
7484 +
7485 + case BEFORE_UNIT:
7486 + return coord->unit_pos == 0;
7487 +
7488 + case AFTER_UNIT:
7489 + return coord->unit_pos == coord_last_unit_pos(coord);
7490 + }
7491 +
7492 + impossible("jmacd-9908", "unreachable");
7493 + return 0;
7494 +}
7495 +
7496 +#if REISER4_DEBUG
7497 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7498 + before-after or item boundaries. */
7499 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
7500 +{
7501 + coord_t *left;
7502 + coord_t *right;
7503 +
7504 + assert("nikita-1241", c1 != NULL);
7505 + assert("nikita-1242", c2 != NULL);
7506 + assert("nikita-1243", c1->node == c2->node);
7507 + assert("nikita-1244", coord_is_existing_unit(c1));
7508 + assert("nikita-1245", coord_is_existing_unit(c2));
7509 +
7510 + left = right = NULL;
7511 + switch (coord_compare(c1, c2)) {
7512 + case COORD_CMP_ON_LEFT:
7513 + left = c1;
7514 + right = c2;
7515 + break;
7516 + case COORD_CMP_ON_RIGHT:
7517 + left = c2;
7518 + right = c1;
7519 + break;
7520 + case COORD_CMP_SAME:
7521 + return 0;
7522 + default:
7523 + wrong_return_value("nikita-1246", "compare_coords()");
7524 + }
7525 + assert("vs-731", left && right);
7526 + if (left->item_pos == right->item_pos) {
7527 + return left->unit_pos + 1 == right->unit_pos;
7528 + } else if (left->item_pos + 1 == right->item_pos) {
7529 + return (left->unit_pos == coord_last_unit_pos(left))
7530 + && (right->unit_pos == 0);
7531 + } else {
7532 + return 0;
7533 + }
7534 +}
7535 +#endif /* REISER4_DEBUG */
7536 +
7537 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7538 + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7539 +/* Audited by: green(2002.06.15) */
7540 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7541 +{
7542 + assert("vs-209", c1->node == c2->node);
7543 + assert("vs-194", coord_is_existing_unit(c1)
7544 + && coord_is_existing_unit(c2));
7545 +
7546 + if (c1->item_pos > c2->item_pos)
7547 + return COORD_CMP_ON_RIGHT;
7548 + if (c1->item_pos < c2->item_pos)
7549 + return COORD_CMP_ON_LEFT;
7550 + if (c1->unit_pos > c2->unit_pos)
7551 + return COORD_CMP_ON_RIGHT;
7552 + if (c1->unit_pos < c2->unit_pos)
7553 + return COORD_CMP_ON_LEFT;
7554 + return COORD_CMP_SAME;
7555 +}
7556 +
7557 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7558 + non-zero if there is no position to the right. */
7559 +int coord_set_to_right(coord_t * coord)
7560 +{
7561 + unsigned items = coord_num_items(coord);
7562 +
7563 + if (coord_adjust_items(coord, items, 1) == 1) {
7564 + return 1;
7565 + }
7566 +
7567 + switch (coord->between) {
7568 + case AT_UNIT:
7569 + return 0;
7570 +
7571 + case BEFORE_ITEM:
7572 + case BEFORE_UNIT:
7573 + coord->between = AT_UNIT;
7574 + return 0;
7575 +
7576 + case AFTER_UNIT:
7577 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7578 + coord->unit_pos += 1;
7579 + coord->between = AT_UNIT;
7580 + return 0;
7581 + } else {
7582 +
7583 + coord->unit_pos = 0;
7584 +
7585 + if (coord->item_pos == items - 1) {
7586 + coord->between = AFTER_ITEM;
7587 + return 1;
7588 + }
7589 +
7590 + coord_inc_item_pos(coord);
7591 + coord->between = AT_UNIT;
7592 + return 0;
7593 + }
7594 +
7595 + case AFTER_ITEM:
7596 + if (coord->item_pos == items - 1) {
7597 + return 1;
7598 + }
7599 +
7600 + coord_inc_item_pos(coord);
7601 + coord->unit_pos = 0;
7602 + coord->between = AT_UNIT;
7603 + return 0;
7604 +
7605 + case EMPTY_NODE:
7606 + return 1;
7607 +
7608 + case INVALID_COORD:
7609 + break;
7610 + }
7611 +
7612 + impossible("jmacd-9920", "unreachable");
7613 + return 0;
7614 +}
7615 +
7616 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7617 + non-zero if there is no position to the left. */
7618 +int coord_set_to_left(coord_t * coord)
7619 +{
7620 + unsigned items = coord_num_items(coord);
7621 +
7622 + if (coord_adjust_items(coord, items, 0) == 1) {
7623 + return 1;
7624 + }
7625 +
7626 + switch (coord->between) {
7627 + case AT_UNIT:
7628 + return 0;
7629 +
7630 + case AFTER_UNIT:
7631 + coord->between = AT_UNIT;
7632 + return 0;
7633 +
7634 + case AFTER_ITEM:
7635 + coord->between = AT_UNIT;
7636 + coord->unit_pos = coord_last_unit_pos(coord);
7637 + return 0;
7638 +
7639 + case BEFORE_UNIT:
7640 + if (coord->unit_pos > 0) {
7641 + coord->unit_pos -= 1;
7642 + coord->between = AT_UNIT;
7643 + return 0;
7644 + } else {
7645 +
7646 + if (coord->item_pos == 0) {
7647 + coord->between = BEFORE_ITEM;
7648 + return 1;
7649 + }
7650 +
7651 + coord->unit_pos = coord_last_unit_pos(coord);
7652 + coord_dec_item_pos(coord);
7653 + coord->between = AT_UNIT;
7654 + return 0;
7655 + }
7656 +
7657 + case BEFORE_ITEM:
7658 + if (coord->item_pos == 0) {
7659 + return 1;
7660 + }
7661 +
7662 + coord_dec_item_pos(coord);
7663 + coord->unit_pos = coord_last_unit_pos(coord);
7664 + coord->between = AT_UNIT;
7665 + return 0;
7666 +
7667 + case EMPTY_NODE:
7668 + return 1;
7669 +
7670 + case INVALID_COORD:
7671 + break;
7672 + }
7673 +
7674 + impossible("jmacd-9920", "unreachable");
7675 + return 0;
7676 +}
7677 +
7678 +static const char *coord_tween_tostring(between_enum n)
7679 +{
7680 + switch (n) {
7681 + case BEFORE_UNIT:
7682 + return "before unit";
7683 + case BEFORE_ITEM:
7684 + return "before item";
7685 + case AT_UNIT:
7686 + return "at unit";
7687 + case AFTER_UNIT:
7688 + return "after unit";
7689 + case AFTER_ITEM:
7690 + return "after item";
7691 + case EMPTY_NODE:
7692 + return "empty node";
7693 + case INVALID_COORD:
7694 + return "invalid";
7695 + default:
7696 + {
7697 + static char buf[30];
7698 +
7699 + sprintf(buf, "unknown: %i", n);
7700 + return buf;
7701 + }
7702 + }
7703 +}
7704 +
7705 +void print_coord(const char *mes, const coord_t * coord, int node)
7706 +{
7707 + if (coord == NULL) {
7708 + printk("%s: null\n", mes);
7709 + return;
7710 + }
7711 + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7712 + mes, coord->item_pos, coord->unit_pos,
7713 + coord_tween_tostring(coord->between), coord->iplugid);
7714 +}
7715 +
7716 +int
7717 +item_utmost_child_real_block(const coord_t * coord, sideof side,
7718 + reiser4_block_nr * blk)
7719 +{
7720 + return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7721 + side,
7722 + blk);
7723 +}
7724 +
7725 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7726 +{
7727 + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7728 +}
7729 +
7730 +/* @count bytes of flow @f got written; update f->length, f->data and f->key
7731 + correspondingly */
7732 +void move_flow_forward(flow_t * f, unsigned count)
7733 +{
7734 + if (f->data)
7735 + f->data += count;
7736 + f->length -= count;
7737 + set_key_offset(&f->key, get_key_offset(&f->key) + count);
7738 +}
7739 +
7740 +/*
7741 + Local variables:
7742 + c-indentation-style: "K&R"
7743 + mode-name: "LC"
7744 + c-basic-offset: 8
7745 + tab-width: 8
7746 + fill-column: 120
7747 + scroll-step: 1
7748 + End:
7749 +*/
7750 diff -urN linux-2.6.22.orig/fs/reiser4/coord.h linux-2.6.22/fs/reiser4/coord.h
7751 --- linux-2.6.22.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7752 +++ linux-2.6.22/fs/reiser4/coord.h 2007-07-29 00:25:34.832685088 +0400
7753 @@ -0,0 +1,389 @@
7754 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7755 +
7756 +/* Coords */
7757 +
7758 +#if !defined( __REISER4_COORD_H__ )
7759 +#define __REISER4_COORD_H__
7760 +
7761 +#include "forward.h"
7762 +#include "debug.h"
7763 +#include "dformat.h"
7764 +#include "key.h"
7765 +
7766 +/* insertions happen between coords in the tree, so we need some means
7767 + of specifying the sense of betweenness. */
7768 +typedef enum {
7769 + BEFORE_UNIT, /* Note: init_coord depends on this value being zero. */
7770 + AT_UNIT,
7771 + AFTER_UNIT,
7772 + BEFORE_ITEM,
7773 + AFTER_ITEM,
7774 + INVALID_COORD,
7775 + EMPTY_NODE,
7776 +} between_enum;
7777 +
7778 +/* location of coord w.r.t. its node */
7779 +typedef enum {
7780 + COORD_ON_THE_LEFT = -1,
7781 + COORD_ON_THE_RIGHT = +1,
7782 + COORD_INSIDE = 0
7783 +} coord_wrt_node;
7784 +
7785 +typedef enum {
7786 + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7787 +} coord_cmp;
7788 +
7789 +struct coord {
7790 + /* node in a tree */
7791 + /* 0 */ znode *node;
7792 +
7793 + /* position of item within node */
7794 + /* 4 */ pos_in_node_t item_pos;
7795 + /* position of unit within item */
7796 + /* 6 */ pos_in_node_t unit_pos;
7797 + /* optimization: the plugin of the item is cached in coord_t. Until this
7798 + was implemented, item_plugin_by_coord() was a major CPU consumer.
7799 + ->iplugid is invalidated (set to 0xff) on each modification of
7800 + ->item_pos, and all such modifications are funneled through the
7801 + coord_*_item_pos() functions below (see the sketch after this struct).
7802 + */
7803 + /* 8 */ char iplugid;
7804 + /* position of coord w.r.t. to neighboring items and/or units.
7805 + Values are taken from &between_enum above.
7806 + */
7807 + /* 9 */ char between;
7808 + /* padding. It will be added by the compiler anyway to conform to the
7809 + * C language alignment requirements. We keep it here to be on the
7810 + * safe side and to have a clear picture of the memory layout of this
7811 + * structure. */
7812 + /* 10 */ __u16 pad;
7813 + /* 12 */ int offset;
7814 +#if REISER4_DEBUG
7815 + unsigned long plug_v;
7816 + unsigned long body_v;
7817 +#endif
7818 +};
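The sketch promised above (an editorial illustration, code fragment only): every change of ->item_pos goes through the helpers defined below, so the cached plugin id can never go stale:

	coord_inc_item_pos(&coord);	/* ->iplugid becomes INVALID_PLUGID */
	/* the next lookup re-resolves the item plugin and caches it again */
	iplug = item_plugin_by_coord(&coord);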
7819 +
7820 +#define INVALID_PLUGID ((char)((1 << 8) - 1))
7821 +#define INVALID_OFFSET -1
7822 +
7823 +static inline void coord_clear_iplug(coord_t * coord)
7824 +{
7825 + assert("nikita-2835", coord != NULL);
7826 + coord->iplugid = INVALID_PLUGID;
7827 + coord->offset = INVALID_OFFSET;
7828 +}
7829 +
7830 +static inline int coord_is_iplug_set(const coord_t * coord)
7831 +{
7832 + assert("nikita-2836", coord != NULL);
7833 + return coord->iplugid != INVALID_PLUGID;
7834 +}
7835 +
7836 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7837 +{
7838 + assert("nikita-2478", coord != NULL);
7839 + coord->item_pos = pos;
7840 + coord_clear_iplug(coord);
7841 +}
7842 +
7843 +static inline void coord_dec_item_pos(coord_t * coord)
7844 +{
7845 + assert("nikita-2480", coord != NULL);
7846 + --coord->item_pos;
7847 + coord_clear_iplug(coord);
7848 +}
7849 +
7850 +static inline void coord_inc_item_pos(coord_t * coord)
7851 +{
7852 + assert("nikita-2481", coord != NULL);
7853 + ++coord->item_pos;
7854 + coord_clear_iplug(coord);
7855 +}
7856 +
7857 +static inline void coord_add_item_pos(coord_t * coord, int delta)
7858 +{
7859 + assert("nikita-2482", coord != NULL);
7860 + coord->item_pos += delta;
7861 + coord_clear_iplug(coord);
7862 +}
7863 +
7864 +static inline void coord_invalid_item_pos(coord_t * coord)
7865 +{
7866 + assert("nikita-2832", coord != NULL);
7867 + coord->item_pos = (unsigned short)~0;
7868 + coord_clear_iplug(coord);
7869 +}
7870 +
7871 +/* Reverse a direction. */
7872 +static inline sideof sideof_reverse(sideof side)
7873 +{
7874 + return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7875 +}
7876 +
7877 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7878 +
7879 + "first" and "last"
7880 + "next" and "prev"
7881 + "before" and "after"
7882 + "leftmost" and "rightmost"
7883 +
7884 + But I think the chosen names are decent the way they are.
7885 +*/
7886 +
7887 +/* COORD INITIALIZERS */
7888 +
7889 +/* Initialize an invalid coordinate. */
7890 +extern void coord_init_invalid(coord_t * coord, const znode * node);
7891 +
7892 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7893 +
7894 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
7895 + empty, it is positioned at the EMPTY_NODE. */
7896 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
7897 +
7898 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
7899 + empty, it is positioned at the EMPTY_NODE. */
7900 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
7901 +
7902 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7903 + positioned at the EMPTY_NODE. */
7904 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7905 +
7906 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7907 + at the EMPTY_NODE. */
7908 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7909 +
7910 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7911 + already to existing item */
7912 +void coord_init_after_item_end(coord_t * coord);
7913 +
7914 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7915 +void coord_init_before_item(coord_t *);
7916 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7917 +void coord_init_after_item(coord_t *);
7918 +
7919 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7920 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7921 + sideof dir);
7922 +
7923 +/* Initialize a coordinate with 0s. Used in places where init_coord was used
7924 + and it was not clear how exactly it should be initialized.
7925 + FIXME-VS: added by vs (2002, june, 8) */
7926 +extern void coord_init_zero(coord_t * coord);
7927 +
7928 +/* COORD METHODS */
7929 +
7930 +/* after shifting of node content, a coord previously set properly may become
7931 + invalid; try to "normalize" it. */
7932 +void coord_normalize(coord_t * coord);
7933 +
7934 +/* Copy a coordinate. */
7935 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7936 +
7937 +/* Copy a coordinate without check. */
7938 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7939 +
7940 +unsigned coord_num_units(const coord_t * coord);
7941 +
7942 +/* Return the last valid unit number at the present item (i.e.,
7943 + coord_num_units() - 1). */
7944 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
7945 +{
7946 + return coord_num_units(coord) - 1;
7947 +}
7948 +
7949 +#if REISER4_DEBUG
7950 +/* For assertions only, checks for a valid coordinate. */
7951 +extern int coord_check(const coord_t * coord);
7952 +
7953 +extern unsigned long znode_times_locked(const znode * z);
7954 +
7955 +static inline void coord_update_v(coord_t * coord)
7956 +{
7957 + coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7958 +}
7959 +#endif
7960 +
7961 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
7962 +
7963 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
7964 +
7965 +/* If coord_is_after_rightmost, return COORD_ON_THE_RIGHT; if coord_is_before_leftmost,
7966 + return COORD_ON_THE_LEFT; otherwise return COORD_INSIDE. */
7967 +extern coord_wrt_node coord_wrt(const coord_t * coord);
7968 +
7969 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7970 + before-after or item boundaries. */
7971 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
7972 +
7973 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7974 + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7975 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
7976 +
7977 +/* COORD PREDICATES */
7978 +
7979 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7980 +extern int coord_is_invalid(const coord_t * coord);
7981 +
7982 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7983 + an item. It may be placed at, before, or after any unit within the item, whether
7984 + existing or not. If this is true you can call methods of the item plugin. */
7985 +extern int coord_is_existing_item(const coord_t * coord);
7986 +
7987 +/* Returns true if the coordinate is positioned after an item, before an item, after the
7988 + last unit of an item, before the first unit of an item, or at an empty node. */
7989 +extern int coord_is_between_items(const coord_t * coord);
7990 +
7991 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7992 + unit. */
7993 +extern int coord_is_existing_unit(const coord_t * coord);
7994 +
7995 +/* Returns true if the coordinate is positioned at an empty node. */
7996 +extern int coord_is_empty(const coord_t * coord);
7997 +
7998 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7999 + true for empty nodes nor coordinates positioned before the first item. */
8000 +extern int coord_is_leftmost_unit(const coord_t * coord);
8001 +
8002 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8003 + of the last item or it is an empty node. */
8004 +extern int coord_is_after_rightmost(const coord_t * coord);
8005 +
8006 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8007 + node. */
8008 +extern int coord_is_before_leftmost(const coord_t * coord);
8009 +
8010 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8011 + argument. */
8012 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8013 +
8014 +/* COORD MODIFIERS */
8015 +
8016 +/* Advances the coordinate by one unit to the right. If empty, no change. If
8017 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8018 + an existing unit. */
8019 +extern int coord_next_unit(coord_t * coord);
8020 +
8021 +/* Advances the coordinate by one item to the right. If empty, no change. If
8022 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8023 + an existing item. */
8024 +extern int coord_next_item(coord_t * coord);
8025 +
8026 +/* Advances the coordinate by one unit to the left. If empty, no change. If
8027 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8028 + is an existing unit. */
8029 +extern int coord_prev_unit(coord_t * coord);
8030 +
8031 +/* Advances the coordinate by one item to the left. If empty, no change. If
8032 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8033 + is an existing item. */
8034 +extern int coord_prev_item(coord_t * coord);
8035 +
8036 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8037 + non-zero if there is no position to the right. */
8038 +extern int coord_set_to_right(coord_t * coord);
8039 +
8040 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8041 + non-zero if there is no position to the left. */
8042 +extern int coord_set_to_left(coord_t * coord);
8043 +
8044 +/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8045 + and non-zero if the unit did not exist. */
8046 +extern int coord_set_after_unit(coord_t * coord);
8047 +
8048 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8049 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
8050 +
8051 +/* iterate over all units in @node */
8052 +#define for_all_units( coord, node ) \
8053 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8054 + coord_next_unit( coord ) == 0 ; )
8055 +
8056 +/* iterate over all items in @node */
8057 +#define for_all_items( coord, node ) \
8058 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8059 + coord_next_item( coord ) == 0 ; )
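/* A minimal usage sketch (illustrative only: "node" is assumed to be a
   loaded znode supplied by the caller):

	coord_t coord;
	int nr_items = 0;

	for_all_items(&coord, node)
		++nr_items;

   for_all_units() iterates the same way, one unit at a time. */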
8060 +
8061 +/* COORD/ITEM METHODS */
8062 +
8063 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8064 + reiser4_block_nr * blk);
8065 +extern int item_utmost_child(const coord_t * coord, sideof side,
8066 + jnode ** child);
8067 +
8068 +/* a flow is a sequence of bytes being written to or read from the tree. The
8069 + tree will slice the flow into items while storing it into nodes, but all of
8070 + that is hidden from anything outside the tree. */
8071 +
8072 +struct flow {
8073 + reiser4_key key; /* key of start of flow's sequence of bytes */
8074 + loff_t length; /* length of flow's sequence of bytes */
8075 + char *data; /* start of flow's sequence of bytes */
8076 + int user; /* if 1 data is user space, 0 - kernel space */
8077 + rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8078 +};
8079 +
8080 +void move_flow_forward(flow_t * f, unsigned count);
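/* move_flow_forward() is defined elsewhere in this patch; conceptually it
   advances the flow by @count bytes, roughly (a sketch, not the literal
   body):

	if (f->data != NULL)
		f->data += count;
	f->length -= count;
	set_key_offset(&f->key, get_key_offset(&f->key) + count);
*/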
8081 +
8082 +/* &reiser4_item_data - description of data to be inserted or pasted
8083 +
8084 + Q: articulate the reasons for the difference between this and flow.
8085 +
8086 + A: Besides flows, we insert other things into the tree: stat data, directory
8087 + entries, etc. To insert them into the tree one has to provide this structure.
8088 + If one is going to insert a flow, insert_flow can be used, and this structure
8089 + does not have to be created.
8090 +*/
8091 +struct reiser4_item_data {
8092 + /* actual data to be inserted. If NULL, ->create_item() will not
8093 + do xmemcpy itself, leaving this up to the caller. This can
8094 + save some amount of unnecessary memory copying, for example,
8095 + during insertion of stat data.
8096 +
8097 + */
8098 + char *data;
8099 + /* 1 if 'char * data' contains pointer to user space and 0 if it is
8100 + kernel space */
8101 + int user;
8102 + /* amount of data we are going to insert or paste */
8103 + int length;
8104 + /* "Arg" is opaque data that is passed down to the
8105 + ->create_item() method of node layout, which in turn
8106 + hands it to the ->create_hook() of item being created. This
8107 + arg is currently used by:
8108 +
8109 + . ->create_hook() of internal item
8110 + (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8111 + . ->paste() method of directory item.
8112 + . ->create_hook() of extent item
8113 +
8114 + For an internal item, this is the left "brother" of the new node
8115 + being inserted, and it is used to add the new node into the sibling
8116 + list after the pointer to it has just been inserted into the parent.
8117 +
8118 + While ->arg does look like an unnecessary complication,
8119 + it actually saves a lot of headache in many places, because
8120 + all data necessary to insert or paste new data into the tree are
8121 + collected in one place, and this eliminates a lot of extra
8122 + argument passing and storing everywhere.
8123 +
8124 + */
8125 + void *arg;
8126 + /* plugin of item we are inserting */
8127 + item_plugin *iplug;
8128 +};
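/* For example, inserting a stat-data item could fill this structure
   roughly as follows (a sketch; "sd_length" and "sd_plugin" are
   hypothetical values computed by the caller):

	reiser4_item_data data;

	data.data = NULL;	- ->create_item() will skip the copy
	data.user = 0;		- kernel space
	data.length = sd_length;
	data.arg = NULL;
	data.iplug = sd_plugin;
*/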
8129 +
8130 +/* __REISER4_COORD_H__ */
8131 +#endif
8132 +
8133 +/* Make Linus happy.
8134 + Local variables:
8135 + c-indentation-style: "K&R"
8136 + mode-name: "LC"
8137 + c-basic-offset: 8
8138 + tab-width: 8
8139 + fill-column: 120
8140 + scroll-step: 1
8141 + End:
8142 +*/
8143 diff -urN linux-2.6.22.orig/fs/reiser4/debug.c linux-2.6.22/fs/reiser4/debug.c
8144 --- linux-2.6.22.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8145 +++ linux-2.6.22/fs/reiser4/debug.c 2007-07-29 00:25:34.836686123 +0400
8146 @@ -0,0 +1,308 @@
8147 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8148 + * reiser4/README */
8149 +
8150 +/* Debugging facilities. */
8151 +
8152 +/*
8153 + * This file contains generic debugging functions used by reiser4. Roughly
8154 + * the following:
8155 + *
8156 + * panicking: reiser4_do_panic(), reiser4_print_prefix().
8157 + *
8158 + * locking:
8159 + * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8160 + * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8161 + *
8162 + * error code monitoring (see comment before RETERR macro):
8163 + * reiser4_return_err(), reiser4_report_err().
8164 + *
8165 + * stack back-tracing: fill_backtrace()
8166 + *
8167 + * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8168 + * reiser4_debugtrap().
8169 + *
8170 + */
8171 +
8172 +#include "reiser4.h"
8173 +#include "context.h"
8174 +#include "super.h"
8175 +#include "txnmgr.h"
8176 +#include "znode.h"
8177 +
8178 +#include <linux/sysfs.h>
8179 +#include <linux/slab.h>
8180 +#include <linux/types.h>
8181 +#include <linux/fs.h>
8182 +#include <linux/spinlock.h>
8183 +#include <linux/kallsyms.h>
8184 +#include <linux/vmalloc.h>
8185 +#include <linux/ctype.h>
8186 +#include <linux/sysctl.h>
8187 +#include <linux/hardirq.h>
8188 +
8189 +#if 0
8190 +#if REISER4_DEBUG
8191 +static void reiser4_report_err(void);
8192 +#else
8193 +#define reiser4_report_err() noop
8194 +#endif
8195 +#endif /* 0 */
8196 +
8197 +/*
8198 + * global buffer where message given to reiser4_panic is formatted.
8199 + */
8200 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8201 +
8202 +/*
8203 + * lock protecting consistency of panic_buf under concurrent panics
8204 + */
8205 +static DEFINE_SPINLOCK(panic_guard);
8206 +
8207 +/* Your best friend. Call it on each occasion. This is called by
8208 + fs/reiser4/debug.h:reiser4_panic(). */
8209 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8210 +{
8211 + static int in_panic = 0;
8212 + va_list args;
8213 +
8214 + /*
8215 + * check for recursive panic.
8216 + */
8217 + if (in_panic == 0) {
8218 + in_panic = 1;
8219 +
8220 + spin_lock(&panic_guard);
8221 + va_start(args, format);
8222 + vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8223 + va_end(args);
8224 + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8225 + spin_unlock(&panic_guard);
8226 +
8227 + /*
8228 + * if kernel debugger is configured---drop in. Early dropping
8229 + * into kgdb is not always convenient, because panic message
8230 + * is not yet printed most of the time. But:
8231 + *
8232 + * (1) message can be extracted from printk_buf[]
8233 + * (declared static inside of printk()), and
8234 + *
8235 + * (2) sometimes serial/kgdb combo dies while printing
8236 + * long panic message, so it's more prudent to break into
8237 + * debugger earlier.
8238 + *
8239 + */
8240 + DEBUGON(1);
8241 + }
8242 + /* to make gcc happy about noreturn attribute */
8243 + panic("%s", panic_buf);
8244 +}
8245 +
8246 +#if 0
8247 +void
8248 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8249 + const char *function, const char *file, int lineno)
8250 +{
8251 + const char *comm;
8252 + int pid;
8253 +
8254 + if (unlikely(in_interrupt() || in_irq())) {
8255 + comm = "interrupt";
8256 + pid = 0;
8257 + } else {
8258 + comm = current->comm;
8259 + pid = current->pid;
8260 + }
8261 + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8262 + level, comm, pid, function, file, lineno, mid);
8263 + if (reperr)
8264 + reiser4_report_err();
8265 +}
8266 +#endif /* 0 */
8267 +
8268 +/* Preemption point: this should be called periodically during long-running
8269 + operations (carry, allocate, and squeeze are the best examples) */
8270 +int reiser4_preempt_point(void)
8271 +{
8272 + assert("nikita-3008", reiser4_schedulable());
8273 + cond_resched();
8274 + return signal_pending(current);
8275 +}
8276 +
8277 +#if REISER4_DEBUG
8278 +/* Debugging aid: return struct where information about locks taken by current
8279 + thread is accumulated. This can be used to formulate lock ordering
8280 + constraints and various assertions.
8281 +
8282 +*/
8283 +reiser4_lock_cnt_info *reiser4_lock_counters(void)
8284 +{
8285 + reiser4_context *ctx = get_current_context();
8286 + assert("jmacd-1123", ctx != NULL);
8287 + return &ctx->locks;
8288 +}
8289 +
8290 +/*
8291 + * print human readable information about locks held by the reiser4 context.
8292 + */
8293 +static void print_lock_counters(const char *prefix,
8294 + const reiser4_lock_cnt_info * info)
8295 +{
8296 + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8297 + "jload: %i, "
8298 + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8299 + "ktxnmgrd: %i, fq: %i\n"
8300 + "inode: %i, "
8301 + "cbk_cache: %i (r:%i,w%i), "
8302 + "eflush: %i, "
8303 + "zlock: %i,\n"
8304 + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8305 + "d: %i, x: %i, t: %i\n", prefix,
8306 + info->spin_locked_jnode,
8307 + info->rw_locked_tree, info->read_locked_tree,
8308 + info->write_locked_tree,
8309 + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8310 + info->spin_locked_jload,
8311 + info->spin_locked_txnh,
8312 + info->spin_locked_atom, info->spin_locked_stack,
8313 + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8314 + info->spin_locked_fq,
8315 + info->spin_locked_inode,
8316 + info->rw_locked_cbk_cache,
8317 + info->read_locked_cbk_cache,
8318 + info->write_locked_cbk_cache,
8319 + info->spin_locked_super_eflush,
8320 + info->spin_locked_zlock,
8321 + info->spin_locked,
8322 + info->long_term_locked_znode,
8323 + info->inode_sem_r, info->inode_sem_w,
8324 + info->d_refs, info->x_refs, info->t_refs);
8325 +}
8326 +
8327 +/* check that no spinlocks are held */
8328 +int reiser4_schedulable(void)
8329 +{
8330 + if (get_current_context_check() != NULL) {
8331 + if (!LOCK_CNT_NIL(spin_locked)) {
8332 + print_lock_counters("in atomic", reiser4_lock_counters());
8333 + return 0;
8334 + }
8335 + }
8336 + might_sleep();
8337 + return 1;
8338 +}
8339 +/*
8340 + * return true, iff no locks are held.
8341 + */
8342 +int reiser4_no_counters_are_held(void)
8343 +{
8344 + reiser4_lock_cnt_info *counters;
8345 +
8346 + counters = reiser4_lock_counters();
8347 + return
8348 + (counters->spin_locked_zlock == 0) &&
8349 + (counters->spin_locked_jnode == 0) &&
8350 + (counters->rw_locked_tree == 0) &&
8351 + (counters->read_locked_tree == 0) &&
8352 + (counters->write_locked_tree == 0) &&
8353 + (counters->rw_locked_dk == 0) &&
8354 + (counters->read_locked_dk == 0) &&
8355 + (counters->write_locked_dk == 0) &&
8356 + (counters->spin_locked_txnh == 0) &&
8357 + (counters->spin_locked_atom == 0) &&
8358 + (counters->spin_locked_stack == 0) &&
8359 + (counters->spin_locked_txnmgr == 0) &&
8360 + (counters->spin_locked_inode == 0) &&
8361 + (counters->spin_locked == 0) &&
8362 + (counters->long_term_locked_znode == 0) &&
8363 + (counters->inode_sem_r == 0) &&
8364 + (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8365 +}
8366 +
8367 +/*
8368 + * return true, iff transaction commit can be done under locks held by the
8369 + * current thread.
8370 + */
8371 +int reiser4_commit_check_locks(void)
8372 +{
8373 + reiser4_lock_cnt_info *counters;
8374 + int inode_sem_r;
8375 + int inode_sem_w;
8376 + int result;
8377 +
8378 + /*
8379 + * inode's read/write semaphore is the only reiser4 lock that can be
8380 + * held during commit.
8381 + */
8382 +
8383 + counters = reiser4_lock_counters();
8384 + inode_sem_r = counters->inode_sem_r;
8385 + inode_sem_w = counters->inode_sem_w;
8386 +
8387 + counters->inode_sem_r = counters->inode_sem_w = 0;
8388 + result = reiser4_no_counters_are_held();
8389 + counters->inode_sem_r = inode_sem_r;
8390 + counters->inode_sem_w = inode_sem_w;
8391 + return result;
8392 +}
8393 +
8394 +/*
8395 + * fill "error site" in the current reiser4 context. See comment before RETERR
8396 + * macro for more details.
8397 + */
8398 +void reiser4_return_err(int code, const char *file, int line)
8399 +{
8400 + if (code < 0 && is_in_reiser4_context()) {
8401 + reiser4_context *ctx = get_current_context();
8402 +
8403 + if (ctx != NULL) {
8404 + ctx->err.code = code;
8405 + ctx->err.file = file;
8406 + ctx->err.line = line;
8407 + }
8408 + }
8409 +}
8410 +
8411 +#if 0
8412 +/*
8413 + * report error information recorded by reiser4_return_err().
8414 + */
8415 +static void reiser4_report_err(void)
8416 +{
8417 + reiser4_context *ctx = get_current_context_check();
8418 +
8419 + if (ctx != NULL) {
8420 + if (ctx->err.code != 0) {
8421 + printk("code: %i at %s:%i\n",
8422 + ctx->err.code, ctx->err.file, ctx->err.line);
8423 + }
8424 + }
8425 +}
8426 +#endif /* 0 */
8427 +
8428 +#endif /* REISER4_DEBUG */
8429 +
8430 +#if KERNEL_DEBUGGER
8431 +
8432 +/*
8433 + * this function just drops into the kernel debugger. It is a convenient
8434 + * place to put a breakpoint in.
8435 + */
8436 +void reiser4_debugtrap(void)
8437 +{
8438 + /* do nothing. Put break point here. */
8439 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8440 + extern void breakpoint(void);
8441 + breakpoint();
8442 +#endif
8443 +}
8444 +#endif
8445 +
8446 +/* Make Linus happy.
8447 + Local variables:
8448 + c-indentation-style: "K&R"
8449 + mode-name: "LC"
8450 + c-basic-offset: 8
8451 + tab-width: 8
8452 + fill-column: 120
8453 + End:
8454 +*/
8455 diff -urN linux-2.6.22.orig/fs/reiser4/debug.h linux-2.6.22/fs/reiser4/debug.h
8456 --- linux-2.6.22.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8457 +++ linux-2.6.22/fs/reiser4/debug.h 2007-07-29 00:25:34.836686123 +0400
8458 @@ -0,0 +1,350 @@
8459 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8460 +
8461 +/* Declarations of debug macros. */
8462 +
8463 +#if !defined( __FS_REISER4_DEBUG_H__ )
8464 +#define __FS_REISER4_DEBUG_H__
8465 +
8466 +#include "forward.h"
8467 +#include "reiser4.h"
8468 +
8469 +/* generic function to produce formatted output, decorating it with
8470 + whatever standard prefixes/postfixes we want. "Fun" is a function
8471 + that will be actually called, can be printk, panic etc.
8472 + This is for use by other debugging macros, not by users. */
8473 +#define DCALL(lev, fun, reperr, label, format, ...) \
8474 +({ \
8475 + fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8476 + current->comm, current->pid, __FUNCTION__, \
8477 + __FILE__, __LINE__, label, ## __VA_ARGS__); \
8478 +})
8479 +
8480 +/*
8481 + * cause kernel to crash
8482 + */
8483 +#define reiser4_panic(mid, format, ...) \
8484 + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8485 +
8486 +/* print message with indication of current process, file, line and
8487 + function */
8488 +#define reiser4_log(label, format, ...) \
8489 + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8490 +
8491 +/* Assertion checked during compilation.
8492 + If "cond" is false (0) we get duplicate case label in switch.
8493 + Use this to check something like famous
8494 + cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8495 + in 3.x journal.c. If cassertion fails you get compiler error,
8496 + so no "maintainer-id".
8497 +*/
8498 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
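/* For example, to pin an on-disk layout assumption down at compile time:

	cassert(sizeof(d64) == 8);

   If d64 ever stops being 8 bytes wide, the switch gains a duplicate
   "case 0:" label and compilation fails right here. */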
8499 +
8500 +#define noop do {;} while(0)
8501 +
8502 +#if REISER4_DEBUG
8503 +/* version of info that only actually prints anything when _d_ebugging
8504 + is on */
8505 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8506 +/* macro to catch logical errors. Put it into `default' clause of
8507 + switch() statement. */
8508 +#define impossible(label, format, ...) \
8509 + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8510 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8511 + called. Use this for checking logical consistency and _never_ call
8512 + this to check correctness of external data: disk blocks and user input. */
8513 +#define assert(label, cond) \
8514 +({ \
8515 + /* call_on_each_assert(); */ \
8516 + if (cond) { \
8517 + /* put negated check to avoid using !(cond) that would lose \
8518 + * warnings for things like assert(a = b); */ \
8519 + ; \
8520 + } else { \
8521 + DEBUGON(1); \
8522 + reiser4_panic(label, "assertion failed: %s", #cond); \
8523 + } \
8524 +})
8525 +
8526 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8527 +#define check_me( label, expr ) assert( label, ( expr ) )
8528 +
8529 +#define ON_DEBUG( exp ) exp
8530 +
8531 +extern int reiser4_schedulable(void);
8532 +extern void call_on_each_assert(void);
8533 +
8534 +#else
8535 +
8536 +#define dinfo( format, args... ) noop
8537 +#define impossible( label, format, args... ) noop
8538 +#define assert( label, cond ) noop
8539 +#define check_me( label, expr ) ( ( void ) ( expr ) )
8540 +#define ON_DEBUG( exp )
8541 +#define reiser4_schedulable() might_sleep()
8542 +
8543 +/* REISER4_DEBUG */
8544 +#endif
8545 +
8546 +#if REISER4_DEBUG
8547 +/* per-thread information about lock acquired by this thread. Used by lock
8548 + * ordering checking in spin_macros.h */
8549 +typedef struct reiser4_lock_cnt_info {
8550 + int rw_locked_tree;
8551 + int read_locked_tree;
8552 + int write_locked_tree;
8553 +
8554 + int rw_locked_dk;
8555 + int read_locked_dk;
8556 + int write_locked_dk;
8557 +
8558 + int rw_locked_cbk_cache;
8559 + int read_locked_cbk_cache;
8560 + int write_locked_cbk_cache;
8561 +
8562 + int spin_locked_zlock;
8563 + int spin_locked_jnode;
8564 + int spin_locked_jload;
8565 + int spin_locked_txnh;
8566 + int spin_locked_atom;
8567 + int spin_locked_stack;
8568 + int spin_locked_txnmgr;
8569 + int spin_locked_ktxnmgrd;
8570 + int spin_locked_fq;
8571 + int spin_locked_inode;
8572 + int spin_locked_super_eflush;
8573 + int spin_locked;
8574 + int long_term_locked_znode;
8575 +
8576 + int inode_sem_r;
8577 + int inode_sem_w;
8578 +
8579 + int d_refs;
8580 + int x_refs;
8581 + int t_refs;
8582 +} reiser4_lock_cnt_info;
8583 +
8584 +extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8585 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8586 +
8587 +/* increment lock-counter @counter, if present */
8588 +#define LOCK_CNT_INC(counter) \
8589 + IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8590 +
8591 +/* decrement lock-counter @counter, if present */
8592 +#define LOCK_CNT_DEC(counter) \
8593 + IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8594 +
8595 +/* check that lock-counter is zero. This is for use in assertions */
8596 +#define LOCK_CNT_NIL(counter) \
8597 + IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8598 +
8599 +/* check that lock-counter is greater than zero. This is for use in
8600 + * assertions */
8601 +#define LOCK_CNT_GTZ(counter) \
8602 + IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8603 +#define LOCK_CNT_LT(counter,n) \
8604 + IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
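/* These counters are maintained by the locking wrappers (see
   spin_macros.h). The intended pairing is, schematically (the lock field
   and assertion label below are illustrative):

	spin_lock(&atom->alock);
	LOCK_CNT_INC(spin_locked_atom);
	LOCK_CNT_INC(spin_locked);
	...
	assert("example-1", LOCK_CNT_GTZ(spin_locked_atom));
	LOCK_CNT_DEC(spin_locked_atom);
	LOCK_CNT_DEC(spin_locked);
	spin_unlock(&atom->alock);
*/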
8605 +
8606 +#else /* REISER4_DEBUG */
8607 +
8608 +/* no-op versions of the above */
8609 +
8610 +typedef struct reiser4_lock_cnt_info {
8611 +} reiser4_lock_cnt_info;
8612 +
8613 +#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8614 +#define LOCK_CNT_INC(counter) noop
8615 +#define LOCK_CNT_DEC(counter) noop
8616 +#define LOCK_CNT_NIL(counter) (1)
8617 +#define LOCK_CNT_GTZ(counter) (1)
8618 +#define LOCK_CNT_LT(counter,n) (1)
8619 +
8620 +#endif /* REISER4_DEBUG */
8621 +
8622 +#define assert_spin_not_locked(lock) BUG_ON(0)
8623 +#define assert_rw_write_locked(lock) BUG_ON(0)
8624 +#define assert_rw_read_locked(lock) BUG_ON(0)
8625 +#define assert_rw_locked(lock) BUG_ON(0)
8626 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8627 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8628 +#define assert_rw_not_locked(lock) BUG_ON(0)
8629 +
8630 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
8631 + option. */
8632 +typedef enum {
8633 + /* print a lot of information during panic. When this is on all jnodes
8634 + * are listed. This can be *very* large output. Usually you don't want
8635 + * this. Especially over serial line. */
8636 + REISER4_VERBOSE_PANIC = 0x00000001,
8637 + /* print a lot of information during umount */
8638 + REISER4_VERBOSE_UMOUNT = 0x00000002,
8639 + /* print gathered statistics on umount */
8640 + REISER4_STATS_ON_UMOUNT = 0x00000004,
8641 + /* check node consistency */
8642 + REISER4_CHECK_NODE = 0x00000008
8643 +} reiser4_debug_flags;
8644 +
8645 +extern int is_in_reiser4_context(void);
8646 +
8647 +/*
8648 + * evaluate expression @e only if within reiser4 context
8649 + */
8650 +#define ON_CONTEXT(e) do { \
8651 + if(is_in_reiser4_context()) { \
8652 + e; \
8653 + } } while(0)
8654 +
8655 +/*
8656 + * evaluate expression @e only when within reiser4_context and debugging is
8657 + * on.
8658 + */
8659 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8660 +
8661 +/*
8662 + * complain about unexpected function result and crash. Used in "default"
8663 + * branches of switch statements and alike to assert that invalid results are
8664 + * not silently ignored.
8665 + */
8666 +#define wrong_return_value( label, function ) \
8667 + impossible( label, "wrong return value from " function )
8668 +
8669 +/* Issue different types of reiser4 messages to the console */
8670 +#define warning( label, format, ... ) \
8671 + DCALL( KERN_WARNING, \
8672 + printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8673 +#define notice( label, format, ... ) \
8674 + DCALL( KERN_NOTICE, \
8675 + printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8676 +
8677 +/* mark not yet implemented functionality */
8678 +#define not_yet( label, format, ... ) \
8679 + reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8680 +
8681 +extern void reiser4_do_panic(const char *format, ...)
8682 + __attribute__ ((noreturn, format(printf, 1, 2)));
8683 +
8684 +extern int reiser4_preempt_point(void);
8685 +extern void reiser4_print_stats(void);
8686 +
8687 +#if REISER4_DEBUG
8688 +extern int reiser4_no_counters_are_held(void);
8689 +extern int reiser4_commit_check_locks(void);
8690 +#else
8691 +#define reiser4_no_counters_are_held() (1)
8692 +#define reiser4_commit_check_locks() (1)
8693 +#endif
8694 +
8695 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8696 +#define IS_POW(i) \
8697 +({ \
8698 + typeof(i) __i; \
8699 + \
8700 + __i = (i); \
8701 + !(__i & (__i - 1)); \
8702 +})
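/* Note that by this definition IS_POW(0) also evaluates to true, since
   0 & (0 - 1) == 0; callers that care must check for zero separately. */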
8703 +
8704 +#define KERNEL_DEBUGGER (1)
8705 +
8706 +#if KERNEL_DEBUGGER
8707 +
8708 +extern void reiser4_debugtrap(void);
8709 +
8710 +/*
8711 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8712 + * kgdb is not compiled in, do nothing.
8713 + */
8714 +#define DEBUGON(cond) \
8715 +({ \
8716 + if (unlikely(cond)) \
8717 + reiser4_debugtrap(); \
8718 +})
8719 +#else
8720 +#define DEBUGON(cond) noop
8721 +#endif
8722 +
8723 +/*
8724 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8725 + *
8726 + * Suppose some strange and/or unexpected code is returned from some function
8727 + * (for example, write(2) returns -EEXIST). It is possible to place a
8728 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
8729 + * in what particular place -EEXIST was generated first?
8730 + *
8731 + * In reiser4 all places where actual error codes are produced (that is,
8732 + * statements of the form
8733 + *
8734 + * return -EFOO; // (1), or
8735 + *
8736 + * result = -EFOO; // (2)
8737 + *
8738 + * are replaced with
8739 + *
8740 + * return RETERR(-EFOO); // (1a), and
8741 + *
8742 + * result = RETERR(-EFOO); // (2a) respectively
8743 + *
8744 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
8745 + * printed in error and warning messages. Moreover, it's possible to put a
8746 + * conditional breakpoint in reiser4_return_err (low-level function called
8747 + * by RETERR() to do the actual work) to break into debugger immediately
8748 + * when particular error happens.
8749 + *
8750 + */
8751 +
8752 +#if REISER4_DEBUG
8753 +
8754 +/*
8755 + * data-type to store information about where error happened ("error site").
8756 + */
8757 +typedef struct err_site {
8758 + int code; /* error code */
8759 + const char *file; /* source file, filled by __FILE__ */
8760 + int line; /* source file line, filled by __LINE__ */
8761 +} err_site;
8762 +
8763 +extern void reiser4_return_err(int code, const char *file, int line);
8764 +
8765 +/*
8766 + * fill &get_current_context()->err_site with error information.
8767 + */
8768 +#define RETERR(code) \
8769 +({ \
8770 + typeof(code) __code; \
8771 + \
8772 + __code = (code); \
8773 + reiser4_return_err(__code, __FILE__, __LINE__); \
8774 + __code; \
8775 +})
8776 +
8777 +#else
8778 +
8779 +/*
8780 + * no-op versions of the above
8781 + */
8782 +
8783 +typedef struct err_site {
8784 +} err_site;
8785 +#define RETERR(code) code
8786 +#endif
8787 +
8788 +#if REISER4_LARGE_KEY
8789 +/*
8790 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8791 + */
8792 +#define ON_LARGE_KEY(...) __VA_ARGS__
8793 +#else
8794 +#define ON_LARGE_KEY(...)
8795 +#endif
8796 +
8797 +/* __FS_REISER4_DEBUG_H__ */
8798 +#endif
8799 +
8800 +/* Make Linus happy.
8801 + Local variables:
8802 + c-indentation-style: "K&R"
8803 + mode-name: "LC"
8804 + c-basic-offset: 8
8805 + tab-width: 8
8806 + fill-column: 120
8807 + End:
8808 +*/
8809 diff -urN linux-2.6.22.orig/fs/reiser4/dformat.h linux-2.6.22/fs/reiser4/dformat.h
8810 --- linux-2.6.22.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8811 +++ linux-2.6.22/fs/reiser4/dformat.h 2007-07-29 00:25:34.836686123 +0400
8812 @@ -0,0 +1,70 @@
8813 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8814 +
8815 +/* Formats of on-disk data and conversion functions. */
8816 +
8817 +/* Put all item formats in the files describing the particular items.
8818 + Our model is: everything you need to do to add an item to reiser4
8819 + (excepting the changes to the plugin that uses the item, which go
8820 + into the file defining that plugin) goes into one file. */
8821 +/* Data on disk are stored in little-endian format.
8822 + To declare fields of on-disk structures, use d8, d16, d32 and d64.
8823 + Use d??tocpu() and cputod??() to convert. */
8824 +
8825 +#if !defined( __FS_REISER4_DFORMAT_H__ )
8826 +#define __FS_REISER4_DFORMAT_H__
8827 +
8828 +#include <asm/byteorder.h>
8829 +#include <asm/unaligned.h>
8830 +#include <linux/types.h>
8831 +
8832 +typedef __u8 d8;
8833 +typedef __le16 d16;
8834 +typedef __le32 d32;
8835 +typedef __le64 d64;
8836 +
8837 +#define PACKED __attribute__((packed))
8838 +
8839 +/* data-type for block number */
8840 +typedef __u64 reiser4_block_nr;
8841 +
8842 +/* data-type for block number on disk, disk format */
8843 +typedef __le64 reiser4_dblock_nr;
8844 +
8845 +/**
8846 + * disk_addr_eq - compare disk addresses
8847 + * @b1: pointer to block number to compare
8848 + * @b2: pointer to block number to compare
8849 + *
8850 + * Returns true if the disk addresses are the same
8851 + */
8852 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
8853 + const reiser4_block_nr * b2)
8854 +{
8855 + assert("nikita-1033", b1 != NULL);
8856 + assert("nikita-1266", b2 != NULL);
8857 +
8858 + return !memcmp(b1, b2, sizeof *b1);
8859 +}
8860 +
8861 +/* structure of master reiser4 super block */
8862 +typedef struct reiser4_master_sb {
8863 + char magic[16]; /* "ReIsEr4" */
8864 + __le16 disk_plugin_id; /* id of disk layout plugin */
8865 + __le16 blocksize;
8866 + char uuid[16]; /* unique id */
8867 + char label[16]; /* filesystem label */
8868 + __le64 diskmap; /* location of the diskmap. 0 if not present */
8869 +} reiser4_master_sb;
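/* A sketch of how a mount-time probe might validate this structure
   (illustrative; "master" is assumed to point at a buffer read from the
   fixed master superblock location):

	if (strncmp(master->magic, "ReIsEr4", 7) != 0)
		return 0;	- not a reiser4 filesystem
	blocksize = le16_to_cpu(master->blocksize);
*/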
8870 +
8871 +/* __FS_REISER4_DFORMAT_H__ */
8872 +#endif
8873 +
8874 +/*
8875 + * Local variables:
8876 + * c-indentation-style: "K&R"
8877 + * mode-name: "LC"
8878 + * c-basic-offset: 8
8879 + * tab-width: 8
8880 + * fill-column: 79
8881 + * End:
8882 + */
8883 diff -urN linux-2.6.22.orig/fs/reiser4/dscale.c linux-2.6.22/fs/reiser4/dscale.c
8884 --- linux-2.6.22.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8885 +++ linux-2.6.22/fs/reiser4/dscale.c 2007-07-29 00:25:34.836686123 +0400
8886 @@ -0,0 +1,174 @@
8887 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8888 + * reiser4/README */
8889 +
8890 +/* Scalable on-disk integers */
8891 +
8892 +/*
8893 + * Various on-disk structures contain integer-like structures. Stat-data
8894 + * contain [yes, "data" is plural, check the dictionary] file size, link
8895 + * count; extent unit contains extent width etc. To accommodate the general
8896 + * case, enough space is reserved to keep the largest possible value: 64 bits
8897 + * in all cases above. But in the overwhelming majority of cases the numbers
8898 + * actually stored in these fields will be comparatively small, and reserving 8 bytes is
8899 + * a waste of precious disk bandwidth.
8900 + *
8901 + * Scalable integers are one way to solve this problem. dscale_write()
8902 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8903 + * depending on the magnitude of the value supplied. dscale_read() reads value
8904 + * previously stored by dscale_write().
8905 + *
8906 + * dscale_write() produces a format not completely unlike UTF-8: the two highest
8907 + * bits of the first byte are used to store "tag". One of 4 possible tag
8908 + * values is chosen depending on the number being encoded:
8909 + *
8910 + * 0 ... 0x3f => 0 [table 1]
8911 + * 0x40 ... 0x3fff => 1
8912 + * 0x4000 ... 0x3fffffff => 2
8913 + * 0x40000000 ... 0xffffffffffffffff => 3
8914 + *
8915 + * (see dscale_range() function)
8916 + *
8917 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8918 + * to be stored, so in this case there is no place in the first byte to store
8919 + * tag. For such values tag is stored in an extra 9th byte.
8920 + *
8921 + * As _highest_ bits are used for the test (which is natural) scaled integers
8922 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8923 + * uses LITTLE-ENDIAN.
8924 + *
8925 + */
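/*
 * A worked example: the value 0x1234 falls in the 0x40 ... 0x3fff range,
 * so its tag is 1 and it occupies 1 << 1 == 2 bytes. dscale_write()
 * stores it big-endian as 0x12 0x34 and ORs the tag into the two highest
 * bits of the first byte: 0x12 | (1 << 6) == 0x52, giving 0x52 0x34 on
 * disk. dscale_read() recovers tag 0x52 >> 6 == 1, loads the big-endian
 * 16-bit value 0x5234, and cleartag() clears the two highest bits of the
 * second lowest byte (offset 14), restoring 0x1234.
 */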
8926 +
8927 +#include "debug.h"
8928 +#include "dscale.h"
8929 +
8930 +/* return tag of scaled integer stored at @address */
8931 +static int gettag(const unsigned char *address)
8932 +{
8933 + /* tag is stored in two highest bits */
8934 + return (*address) >> 6;
8935 +}
8936 +
8937 +/* clear the tag embedded into @value */
8938 +static void cleartag(__u64 * value, int tag)
8939 +{
8940 + /*
8941 + * W-w-what ?!
8942 + *
8943 + * Actually, this is rather simple: @value passed here was read by
8944 + * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8945 + * zeroes. Tag is still stored in the highest (arithmetically)
8946 + * non-zero bits of @value, but relative position of tag within __u64
8947 + * depends on @tag.
8948 + *
8949 + * For example, if @tag is 0, it's stored in the 2 highest bits of the lowest
8950 + * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8951 + *
8952 + * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
8953 + * and its offset is (2 * 8) - 2 == 14 bits.
8954 + *
8955 + * See table 1 above for details.
8956 + *
8957 + * All these cases are captured by the formula:
8958 + */
8959 + *value &= ~(3 << (((1 << tag) << 3) - 2));
8960 + /*
8961 + * That is, clear two (3 == 0t11) bits at the offset
8962 + *
8963 + * 8 * (2 ^ tag) - 2,
8964 + *
8965 + * that is, two highest bits of (2 ^ tag)-th byte of @value.
8966 + */
8967 +}
8968 +
8969 +/* return tag for @value. See table 1 above for details. */
8970 +static int dscale_range(__u64 value)
8971 +{
8972 + if (value > 0x3fffffff)
8973 + return 3;
8974 + if (value > 0x3fff)
8975 + return 2;
8976 + if (value > 0x3f)
8977 + return 1;
8978 + return 0;
8979 +}
8980 +
8981 +/* restore value stored at @address by dscale_write() and return number of
8982 + * bytes consumed */
8983 +int dscale_read(unsigned char *address, __u64 * value)
8984 +{
8985 + int tag;
8986 +
8987 + /* read tag */
8988 + tag = gettag(address);
8989 + switch (tag) {
8990 + case 3:
8991 + /* In this case tag is stored in an extra byte, skip this byte
8992 + * and decode value stored in the next 8 bytes.*/
8993 + *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
8994 + /* worst case: 8 bytes for value itself plus one byte for
8995 + * tag. */
8996 + return 9;
8997 + case 0:
8998 + *value = get_unaligned(address);
8999 + break;
9000 + case 1:
9001 + *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9002 + break;
9003 + case 2:
9004 + *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9005 + break;
9006 + default:
9007 + return RETERR(-EIO);
9008 + }
9009 + /* clear tag embedded into @value */
9010 + cleartag(value, tag);
9011 + /* number of bytes consumed is (2 ^ tag)---see table 1. */
9012 + return 1 << tag;
9013 +}
9014 +
9015 +/* store @value at @address and return number of bytes consumed */
9016 +int dscale_write(unsigned char *address, __u64 value)
9017 +{
9018 + int tag;
9019 + int shift;
9020 + __be64 v;
9021 + unsigned char *valarr;
9022 +
9023 + tag = dscale_range(value);
9024 + v = __cpu_to_be64(value);
9025 + valarr = (unsigned char *)&v;
9026 + shift = (tag == 3) ? 1 : 0;
9027 + memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9028 + *address |= (tag << 6);
9029 + return shift + (1 << tag);
9030 +}
9031 +
9032 +/* number of bytes required to store @value */
9033 +int dscale_bytes(__u64 value)
9034 +{
9035 + int bytes;
9036 +
9037 + bytes = 1 << dscale_range(value);
9038 + if (bytes == 8)
9039 + ++bytes;
9040 + return bytes;
9041 +}
9042 +
9043 +/* returns true if @value and @other require the same number of bytes to be
9044 + * stored. Used to detect when a data structure (like stat-data) has to be
9045 + * expanded or contracted. */
9046 +int dscale_fit(__u64 value, __u64 other)
9047 +{
9048 + return dscale_range(value) == dscale_range(other);
9049 +}
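/* A round-trip usage sketch (illustrative; "buf" is a hypothetical
   scratch buffer of at least 9 bytes, zeroed beforehand so the tag bits
   start clean):

	unsigned char buf[9] = { 0 };
	__u64 in = 0x1234, out;
	int wrote, nread;

	wrote = dscale_write(buf, in);	- returns 2
	nread = dscale_read(buf, &out);	- returns 2, out == 0x1234
	assert("example-2", wrote == nread && in == out);
*/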
9050 +
9051 +/* Make Linus happy.
9052 + Local variables:
9053 + c-indentation-style: "K&R"
9054 + mode-name: "LC"
9055 + c-basic-offset: 8
9056 + tab-width: 8
9057 + fill-column: 120
9058 + scroll-step: 1
9059 + End:
9060 +*/
9061 diff -urN linux-2.6.22.orig/fs/reiser4/dscale.h linux-2.6.22/fs/reiser4/dscale.h
9062 --- linux-2.6.22.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9063 +++ linux-2.6.22/fs/reiser4/dscale.h 2007-07-29 00:25:34.836686123 +0400
9064 @@ -0,0 +1,27 @@
9065 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9066 + * reiser4/README */
9067 +
9068 +/* Scalable on-disk integers. See dscale.c for details. */
9069 +
9070 +#if !defined( __FS_REISER4_DSCALE_H__ )
9071 +#define __FS_REISER4_DSCALE_H__
9072 +
9073 +#include "dformat.h"
9074 +
9075 +extern int dscale_read(unsigned char *address, __u64 * value);
9076 +extern int dscale_write(unsigned char *address, __u64 value);
9077 +extern int dscale_bytes(__u64 value);
9078 +extern int dscale_fit(__u64 value, __u64 other);
9079 +
9080 +/* __FS_REISER4_DSCALE_H__ */
9081 +#endif
9082 +
9083 +/* Make Linus happy.
9084 + Local variables:
9085 + c-indentation-style: "K&R"
9086 + mode-name: "LC"
9087 + c-basic-offset: 8
9088 + tab-width: 8
9089 + fill-column: 120
9090 + End:
9091 +*/
9092 diff -urN linux-2.6.22.orig/fs/reiser4/entd.c linux-2.6.22/fs/reiser4/entd.c
9093 --- linux-2.6.22.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9094 +++ linux-2.6.22/fs/reiser4/entd.c 2007-07-29 00:25:34.840687159 +0400
9095 @@ -0,0 +1,335 @@
9096 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9097 + * reiser4/README */
9098 +
9099 +/* Ent daemon. */
9100 +
9101 +#include "debug.h"
9102 +#include "txnmgr.h"
9103 +#include "tree.h"
9104 +#include "entd.h"
9105 +#include "super.h"
9106 +#include "context.h"
9107 +#include "reiser4.h"
9108 +#include "vfs_ops.h"
9109 +#include "page_cache.h"
9110 +#include "inode.h"
9111 +
9112 +#include <linux/sched.h> /* struct task_struct */
9113 +#include <linux/suspend.h>
9114 +#include <linux/kernel.h>
9115 +#include <linux/writeback.h>
9116 +#include <linux/time.h> /* INITIAL_JIFFIES */
9117 +#include <linux/backing-dev.h> /* bdi_write_congested */
9118 +#include <linux/wait.h>
9119 +#include <linux/kthread.h>
9120 +#include <linux/freezer.h>
9121 +
9122 +#define DEF_PRIORITY 12
9123 +#define MAX_ENTD_ITERS 10
9124 +
9125 +static void entd_flush(struct super_block *, struct wbq *);
9126 +static int entd(void *arg);
9127 +
9128 +/*
9129 + * set ->comm field of ent thread to make its state visible to the user level
9130 + */
9131 +#define entd_set_comm(state) \
9132 + snprintf(current->comm, sizeof(current->comm), \
9133 + "ent:%s%s", super->s_id, (state))
9134 +
9135 +/**
9136 + * reiser4_init_entd - initialize entd context and start kernel daemon
9137 + * @super: super block to start ent thread for
9138 + *
9139 + * Creates the entd context, starts the kernel thread and waits until it
9140 + * initializes.
9141 + */
9142 +int reiser4_init_entd(struct super_block *super)
9143 +{
9144 + entd_context *ctx;
9145 +
9146 + assert("nikita-3104", super != NULL);
9147 +
9148 + ctx = get_entd_context(super);
9149 +
9150 + memset(ctx, 0, sizeof *ctx);
9151 + spin_lock_init(&ctx->guard);
9152 + init_waitqueue_head(&ctx->wait);
9153 +#if REISER4_DEBUG
9154 + INIT_LIST_HEAD(&ctx->flushers_list);
9155 +#endif
9156 + /* lists of writepage requests */
9157 + INIT_LIST_HEAD(&ctx->todo_list);
9158 + INIT_LIST_HEAD(&ctx->done_list);
9159 + /* start entd */
9160 + ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9161 + if (IS_ERR(ctx->tsk))
9162 + return PTR_ERR(ctx->tsk);
9163 + return 0;
9164 +}
9165 +
9166 +static void put_wbq(struct wbq *rq)
9167 +{
9168 + iput(rq->mapping->host);
9169 + complete(&rq->completion);
9170 +}
9171 +
9172 +/* ent should be locked */
9173 +static struct wbq *__get_wbq(entd_context * ent)
9174 +{
9175 + struct wbq *wbq;
9176 +
9177 + if (list_empty(&ent->todo_list))
9178 + return NULL;
9179 +
9180 + ent->nr_todo_reqs --;
9181 + wbq = list_entry(ent->todo_list.next, struct wbq, link);
9182 + list_del_init(&wbq->link);
9183 + return wbq;
9184 +}
9185 +
9186 +/* ent thread function */
9187 +static int entd(void *arg)
9188 +{
9189 + struct super_block *super;
9190 + entd_context *ent;
9191 + int done = 0;
9192 +
9193 + super = arg;
9194 + /* do_fork() just copies task_struct into the new
9195 + thread. ->fs_context shouldn't be copied of course. This shouldn't
9196 + be a problem for the rest of the code though.
9197 + */
9198 + current->journal_info = NULL;
9199 +
9200 + ent = get_entd_context(super);
9201 +
9202 + while (!done) {
9203 + try_to_freeze();
9204 +
9205 + spin_lock(&ent->guard);
9206 + while (ent->nr_todo_reqs != 0) {
9207 + struct wbq *rq;
9208 +
9209 + assert("", list_empty(&ent->done_list));
9210 +
9211 + /* take request from the queue head */
9212 + rq = __get_wbq(ent);
9213 + assert("", rq != NULL);
9214 + ent->cur_request = rq;
9215 + spin_unlock(&ent->guard);
9216 +
9217 + entd_set_comm("!");
9218 + entd_flush(super, rq);
9219 +
9220 + put_wbq(rq);
9221 +
9222 + /*
9223 + * wakeup all requestors and iput their inodes
9224 + */
9225 + spin_lock(&ent->guard);
9226 + while (!list_empty(&ent->done_list)) {
9227 + rq = list_entry(ent->done_list.next, struct wbq, link);
9228 + list_del_init(&rq->link);
9229 + ent->nr_done_reqs --;
9230 + spin_unlock(&ent->guard);
9231 + assert("", rq->written == 1);
9232 + put_wbq(rq);
9233 + spin_lock(&ent->guard);
9234 + }
9235 + }
9236 + spin_unlock(&ent->guard);
9237 +
9238 + entd_set_comm(".");
9239 +
9240 + {
9241 + DEFINE_WAIT(__wait);
9242 +
9243 + do {
9244 + prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9245 + if (kthread_should_stop()) {
9246 + done = 1;
9247 + break;
9248 + }
9249 + if (ent->nr_todo_reqs != 0)
9250 + break;
9251 + schedule();
9252 + } while (0);
9253 + finish_wait(&ent->wait, &__wait);
9254 + }
9255 + }
9256 + BUG_ON(ent->nr_todo_reqs != 0);
9257 + return 0;
9258 +}
9259 +
9260 +/**
9261 + * reiser4_done_entd - stop entd kernel thread
9262 + * @super: super block to stop ent thread for
9263 + *
9264 + * It is called on umount. Sends stop signal to entd and wait until it handles
9265 + * it.
9266 + */
9267 +void reiser4_done_entd(struct super_block *super)
9268 +{
9269 + entd_context *ent;
9270 +
9271 + assert("nikita-3103", super != NULL);
9272 +
9273 + ent = get_entd_context(super);
9274 + assert("zam-1055", ent->tsk != NULL);
9275 + kthread_stop(ent->tsk);
9276 +}
9277 +
9278 +/* called at the beginning of jnode_flush to register flusher thread with ent
9279 + * daemon */
9280 +void reiser4_enter_flush(struct super_block *super)
9281 +{
9282 + entd_context *ent;
9283 +
9284 + assert("zam-1029", super != NULL);
9285 + ent = get_entd_context(super);
9286 +
9287 + assert("zam-1030", ent != NULL);
9288 +
9289 + spin_lock(&ent->guard);
9290 + ent->flushers++;
9291 +#if REISER4_DEBUG
9292 + list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9293 +#endif
9294 + spin_unlock(&ent->guard);
9295 +}
9296 +
9297 +/* called at the end of jnode_flush */
9298 +void reiser4_leave_flush(struct super_block *super)
9299 +{
9300 + entd_context *ent;
9301 + int wake_up_ent;
9302 +
9303 + assert("zam-1027", super != NULL);
9304 + ent = get_entd_context(super);
9305 +
9306 + assert("zam-1028", ent != NULL);
9307 +
9308 + spin_lock(&ent->guard);
9309 + ent->flushers--;
9310 + wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9311 +#if REISER4_DEBUG
9312 + list_del_init(&get_current_context()->flushers_link);
9313 +#endif
9314 + spin_unlock(&ent->guard);
9315 + if (wake_up_ent)
9316 + wake_up(&ent->wait);
9317 +}
9318 +
9319 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9320 +
9321 +static void entd_flush(struct super_block *super, struct wbq *rq)
9322 +{
9323 + reiser4_context ctx;
9324 + int tmp;
9325 +
9326 + init_stack_context(&ctx, super);
9327 + ctx.entd = 1;
9328 + ctx.gfp_mask = GFP_NOFS;
9329 +
9330 + rq->wbc->range_start = page_offset(rq->page);
9331 + rq->wbc->range_end = rq->wbc->range_start +
9332 + (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9333 + tmp = rq->wbc->nr_to_write;
9334 + rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9335 +
9336 + if (rq->wbc->nr_to_write > 0) {
9337 + rq->wbc->range_start = 0;
9338 + rq->wbc->range_end = LLONG_MAX;
9339 + generic_sync_sb_inodes(super, rq->wbc);
9340 + }
9341 + rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9342 + reiser4_writeout(super, rq->wbc);
9343 +
9344 + context_set_commit_async(&ctx);
9345 + reiser4_exit_context(&ctx);
9346 +}
9347 +
9348 +/**
9349 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9350 + * @page: page to be written
9351 + * @wbc: writeback control passed to reiser4_writepage
9352 + *
9353 + * Creates a request, puts it on entd list of requests, wakeups entd if
9354 + * necessary, waits until entd completes with the request.
9355 + */
9356 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9357 +{
9358 + struct super_block *sb;
9359 + struct inode *inode;
9360 + entd_context *ent;
9361 + struct wbq rq;
9362 +
9363 + assert("", PageLocked(page));
9364 + assert("", page->mapping != NULL);
9365 +
9366 + sb = page->mapping->host->i_sb;
9367 + ent = get_entd_context(sb);
9368 + assert("", ent && ent->done == 0);
9369 +
9370 + /*
9371 + * we are going to unlock page and ask ent thread to write the
9372 + * page. Re-dirty page before unlocking so that if ent thread fails to
9373 + * write it - it will remain dirty
9374 + */
9375 + reiser4_set_page_dirty_internal(page);
9376 +
9377 + /*
9378 + * pin inode in memory, unlock page, entd_flush will iput. We cannot
9379 + * iput here because we cannot allow delete_inode to be called here
9380 + */
9381 + inode = igrab(page->mapping->host);
9382 + unlock_page(page);
9383 + if (inode == NULL)
9384 + /* inode is getting freed */
9385 + return 0;
9386 +
9387 + /* init wbq */
9388 + INIT_LIST_HEAD(&rq.link);
9389 + rq.magic = WBQ_MAGIC;
9390 + rq.wbc = wbc;
9391 + rq.page = page;
9392 + rq.mapping = inode->i_mapping;
9393 + rq.node = NULL;
9394 + rq.written = 0;
9395 + init_completion(&rq.completion);
9396 +
9397 + /* add request to entd's list of writepage requests */
9398 + spin_lock(&ent->guard);
9399 + ent->nr_todo_reqs++;
9400 + list_add_tail(&rq.link, &ent->todo_list);
9401 + if (ent->nr_todo_reqs == 1)
9402 + wake_up(&ent->wait);
9403 +
9404 + spin_unlock(&ent->guard);
9405 +
9406 + /* wait until entd finishes */
9407 + wait_for_completion(&rq.completion);
9408 +
9409 + if (rq.written)
9410 + /* Eventually ENTD has written the page to disk. */
9411 + return 0;
9412 + return 0;
9413 +}
9414 +
9415 +int wbq_available(void)
9416 +{
9417 + struct super_block *sb = reiser4_get_current_sb();
9418 + entd_context *ent = get_entd_context(sb);
9419 + return ent->nr_todo_reqs;
9420 +}
9421 +
9422 +/*
9423 + * Local variables:
9424 + * c-indentation-style: "K&R"
9425 + * mode-name: "LC"
9426 + * c-basic-offset: 8
9427 + * tab-width: 8
9428 + * fill-column: 79
9429 + * End:
9430 + */
9431 diff -urN linux-2.6.22.orig/fs/reiser4/entd.h linux-2.6.22/fs/reiser4/entd.h
9432 --- linux-2.6.22.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9433 +++ linux-2.6.22/fs/reiser4/entd.h 2007-07-29 00:25:34.840687159 +0400
9434 @@ -0,0 +1,90 @@
9435 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9436 +
9437 +/* Ent daemon. */
9438 +
9439 +#ifndef __ENTD_H__
9440 +#define __ENTD_H__
9441 +
9442 +#include "context.h"
9443 +
9444 +#include <linux/fs.h>
9445 +#include <linux/completion.h>
9446 +#include <linux/wait.h>
9447 +#include <linux/spinlock.h>
9448 +#include <linux/sched.h> /* for struct task_struct */
9449 +
9450 +#define WBQ_MAGIC 0x7876dc76
9451 +
9452 +/* write-back request. */
9453 +struct wbq {
9454 + int magic;
9455 + struct list_head link; /* list head of this list is in entd context */
9456 + struct writeback_control *wbc;
9457 + struct page *page;
9458 + struct address_space *mapping;
9459 + struct completion completion;
9460 + jnode *node; /* set if ent thread captured requested page */
9461 + int written; /* set if ent thread wrote requested page */
9462 +};
9463 +
9464 +/* ent-thread context. This is used to synchronize starting/stopping ent
9465 + * threads. */
9466 +typedef struct entd_context {
9467 + /* wait queue that ent thread waits on for more work. It's
9468 + * signaled by write_page_by_ent(). */
9469 + wait_queue_head_t wait;
9470 + /* spinlock protecting other fields */
9471 + spinlock_t guard;
9472 + /* ent thread */
9473 + struct task_struct *tsk;
9474 + /* set to indicate that ent thread should leave. */
9475 + int done;
9476 + /* counter of active flushers */
9477 + int flushers;
9478 + /*
9479 + * when reiser4_writepage asks entd to write a page - it adds struct
9480 + * wbq to this list
9481 + */
9482 + struct list_head todo_list;
9483 + /* number of elements on the above list */
9484 + int nr_todo_reqs;
9485 +
9486 + struct wbq *cur_request;
9487 + /*
9488 + * when entd writes a page it moves write-back request from todo_list
9489 + * to done_list. This list is used at the end of entd iteration to
9490 + * wakeup requestors and iput inodes.
9491 + */
9492 + struct list_head done_list;
9493 + /* number of elements on the above list */
9494 + int nr_done_reqs;
9495 +
9496 +#if REISER4_DEBUG
9497 + /* list of all active flushers */
9498 + struct list_head flushers_list;
9499 +#endif
9500 +} entd_context;
9501 +
9502 +extern int reiser4_init_entd(struct super_block *);
9503 +extern void reiser4_done_entd(struct super_block *);
9504 +
9505 +extern void reiser4_enter_flush(struct super_block *);
9506 +extern void reiser4_leave_flush(struct super_block *);
9507 +
9508 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9509 +extern int wbq_available(void);
9510 +extern void ent_writes_page(struct super_block *, struct page *);
9511 +
9512 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9513 +/* __ENTD_H__ */
9514 +#endif
9515 +
9516 +/* Make Linus happy.
9517 + Local variables:
9518 + c-indentation-style: "K&R"
9519 + mode-name: "LC"
9520 + c-basic-offset: 8
9521 + tab-width: 8
9522 + fill-column: 120
9523 + End:
9524 +*/
9525 diff -urN linux-2.6.22.orig/fs/reiser4/eottl.c linux-2.6.22/fs/reiser4/eottl.c
9526 --- linux-2.6.22.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9527 +++ linux-2.6.22/fs/reiser4/eottl.c 2007-07-29 00:25:34.840687159 +0400
9528 @@ -0,0 +1,509 @@
9529 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9530 +
9531 +#include "forward.h"
9532 +#include "debug.h"
9533 +#include "key.h"
9534 +#include "coord.h"
9535 +#include "plugin/item/item.h"
9536 +#include "plugin/node/node.h"
9537 +#include "znode.h"
9538 +#include "block_alloc.h"
9539 +#include "tree_walk.h"
9540 +#include "tree_mod.h"
9541 +#include "carry.h"
9542 +#include "tree.h"
9543 +#include "super.h"
9544 +
9545 +#include <linux/types.h> /* for __u?? */
9546 +
9547 +/*
9548 + * Extents on the twig level (EOTTL) handling.
9549 + *
9550 + * EOTTL poses some problems to the tree traversal, that are better explained
9551 + * by example.
9552 + *
9553 + * Suppose we have block B1 on the twig level with the following items:
9554 + *
9555 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9556 + * offset)
9557 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9558 + * 2. internal item I2 with key (10:0:0:0)
9559 + *
9560 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9561 + * then intra-node lookup is done. This lookup finished on the E1, because the
9562 + * key we are looking for is larger than the key of E1 and is smaller than key
9563 + * the of I2.
9564 + *
9565 + * Here search is stuck.
9566 + *
9567 + * After some thought it is clear what is wrong here: extents on the twig level
9568 + * break a basic property of the *search* tree (on the pretext that they
9569 + * restore the property of a balanced tree).
9570 + *
9571 + * Said property is the following: if in the internal node of the search tree
9572 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9573 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9574 + * through the Pointer.
9575 + *
9576 + * This is not true, when Pointer is Extent-Pointer, simply because extent
9577 + * cannot expand indefinitely to the right to include any item with
9578 + *
9579 + * Key1 <= Key <= Key2.
9580 + *
9581 + * For example, our E1 extent is only responsible for the data with keys
9582 + *
9583 + * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9584 + *
9585 + * so, key range
9586 + *
9587 + * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9588 + *
9589 + * is orphaned: there is no way to get there from the tree root.
9590 + *
9591 + * In other words, extent pointers are different than normal child pointers as
9592 + * far as search tree is concerned, and this creates such problems.
9593 + *
9594 + * A possible solution for this problem is to insert our item into the node
9595 + * pointed to by I2. There are some problems though:
9596 + *
9597 + * (1) I2 can be in a different node.
9598 + * (2) E1 can be immediately followed by another extent E2.
9599 + *
9600 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9601 + * for locks/coords as necessary.
9602 + *
9603 + * (2) is more complex. Solution here is to insert new empty leaf node and
9604 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9605 + * further complicated by possibility that E2 is in a different node, etc.
9606 + *
9607 + * Problems:
9608 + *
9609 + * (1) if there was an internal item I2 immediately on the right of an extent
9610 + * E1 and we decided to insert new item S1 into node N2 pointed to by I2, then
9611 + * the key of S1 will be less than the smallest key in N2. Normally, the search
9612 + * checks that the key we are looking for is in the range of keys covered by
9613 + * the node it is being looked up in. To work around this situation, while
9614 + * preserving this useful consistency check, a new flag CBK_TRUST_DK was added
9615 + * to the cbk flags bitmask. This flag is automatically set on entrance to
9616 + * coord_by_key() and is only cleared when we are about to enter the situation
9617 + * described above.
9618 + *
9619 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9620 + * searching for a key that is between E1 and E2, we only have to insert a new
9621 + * empty leaf node when coord_by_key was called for insertion, rather than just
9622 + * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added
9623 + * to the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9624 + * performed by insert_by_key() and friends.
9625 + *
9626 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9627 + * case it requires modification of node content which is only possible under
9628 + * write lock. It may well happen that we only have read lock on the node where
9629 + * new internal pointer is to be inserted (common case: lookup of non-existent
9630 + * stat-data that falls between two extents). If only a read lock is held, tree
9631 + * traversal is restarted with lock_level modified so that next time we hit
9632 + * this problem, write lock will be held. Once we have write lock, balancing
9633 + * will be performed.
9634 + */
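/* Condensed to pseudo-code, the twig-level handling described above
   amounts to (see the functions below for the real control flow):

	switch (is_next_item_internal(coord, key, lh)) {
	case 1:	descend through the internal item
	case 0:	between two extents; if CBK_FOR_INSERT, call
		add_empty_leaf() and descend into the new leaf
	case 2:	a concurrent insert got there first: restart the search
	}
*/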
9635 +
9636 +/**
9637 + * is_next_item_internal - check whether next item is internal
9638 + * @coord: coordinate of extent item in twig node
9639 + * @key: search key
9640 + * @lh: twig node lock handle
9641 + *
9642 + * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9643 + * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9644 + * to that node, @coord is set to its first unit. If next item is not internal
9645 + * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9646 + * is returned if search restart has to be done.
9647 + */
9648 +static int
9649 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
9650 + lock_handle *lh)
9651 +{
9652 + coord_t next;
9653 + lock_handle rn;
9654 + int result;
9655 +
9656 + coord_dup(&next, coord);
9657 + if (coord_next_unit(&next) == 0) {
9658 + /* next unit is in this node */
9659 + if (item_is_internal(&next)) {
9660 + coord_dup(coord, &next);
9661 + return 1;
9662 + }
9663 + assert("vs-3", item_is_extent(&next));
9664 + return 0;
9665 + }
9666 +
9667 + /*
9668 + * next unit either does not exist or is in right neighbor. If it is in
9669 + * right neighbor we have to check right delimiting key because
9670 + * concurrent thread could get there first and insert an item with a key
9671 + * smaller than @key
9672 + */
9673 + read_lock_dk(current_tree);
9674 + result = keycmp(key, znode_get_rd_key(coord->node));
9675 + read_unlock_dk(current_tree);
9676 + assert("vs-6", result != EQUAL_TO);
9677 + if (result == GREATER_THAN)
9678 + return 2;
9679 +
9680 + /* lock right neighbor */
9681 + init_lh(&rn);
9682 + result = reiser4_get_right_neighbor(&rn, coord->node,
9683 + znode_is_wlocked(coord->node) ?
9684 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9685 + GN_CAN_USE_UPPER_LEVELS);
9686 + if (result == -E_NO_NEIGHBOR) {
9687 + /* we are on the rightmost edge of the tree */
9688 + done_lh(&rn);
9689 + return 0;
9690 + }
9691 +
9692 + if (result) {
9693 + assert("vs-4", result < 0);
9694 + done_lh(&rn);
9695 + return result;
9696 + }
9697 +
9698 + /*
9699 + * check whether concurrent thread managed to insert item with a key
9700 + * smaller than @key
9701 + */
9702 + read_lock_dk(current_tree);
9703 + result = keycmp(key, znode_get_ld_key(rn.node));
9704 + read_unlock_dk(current_tree);
9705 + assert("vs-6", result != EQUAL_TO);
9706 + if (result == GREATER_THAN) {
9707 + done_lh(&rn);
9708 + return 2;
9709 + }
9710 +
9711 + result = zload(rn.node);
9712 + if (result) {
9713 + assert("vs-5", result < 0);
9714 + done_lh(&rn);
9715 + return result;
9716 + }
9717 +
9718 + coord_init_first_unit(&next, rn.node);
9719 + if (item_is_internal(&next)) {
9720 + /*
9721 + * next unit is in right neighbor and it is a unit of an internal
9722 + * item. Unlock coord->node. Move @lh to right neighbor. @coord
9723 + * is set to the first unit of right neighbor.
9724 + */
9725 + coord_dup(coord, &next);
9726 + zrelse(rn.node);
9727 + done_lh(lh);
9728 + move_lh(lh, &rn);
9729 + return 1;
9730 + }
9731 +
9732 + /*
9733 + * next unit is a unit of an extent item. Return without changing @lh and
9734 + * @coord.
9735 + */
9736 + assert("vs-6", item_is_extent(&next));
9737 + zrelse(rn.node);
9738 + done_lh(&rn);
9739 + return 0;
9740 +}
9741 +
9742 +/**
9743 + * rd_key - calculate key of an item next to the given one
9744 + * @coord: position in a node
9745 + * @key: storage for result key
9746 + *
9747 + * @coord is set between items or after the last item in a node. Calculate
9748 + * the key of the item to the right of @coord.
9749 + */
9750 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9751 +{
9752 + coord_t dup;
9753 +
9754 + assert("nikita-2281", coord_is_between_items(coord));
9755 + coord_dup(&dup, coord);
9756 +
9757 + if (coord_set_to_right(&dup) == 0)
9758 + /* next item is in this node. Return its key. */
9759 + unit_key_by_coord(&dup, key);
9760 + else {
9761 + /*
9762 + * next item either does not exist or is in right
9763 + * neighbor. Return znode's right delimiting key.
9764 + */
9765 + read_lock_dk(current_tree);
9766 + *key = *znode_get_rd_key(coord->node);
9767 + read_unlock_dk(current_tree);
9768 + }
9769 + return key;
9770 +}
9771 +
9772 +/**
9773 + * add_empty_leaf - insert empty leaf between two extents
9774 + * @insert_coord: position in twig node between two extents
9775 + * @lh: twig node lock handle
9776 + * @key: left delimiting key of new node
9777 + * @rdkey: right delimiting key of new node
9778 + *
9779 + * Inserts an empty leaf node between two extent items. This is necessary
9780 + * when we have to insert an item on the leaf level between two extents
9781 + * (items on the twig level).
9782 + */
9783 +static int
9784 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9785 + const reiser4_key *key, const reiser4_key *rdkey)
9786 +{
9787 + int result;
9788 + carry_pool *pool;
9789 + carry_level *todo;
9790 + reiser4_item_data *item;
9791 + carry_insert_data *cdata;
9792 + carry_op *op;
9793 + znode *node;
9794 + reiser4_tree *tree;
9795 +
9796 + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9797 + tree = znode_get_tree(insert_coord->node);
9798 + node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9799 + if (IS_ERR(node))
9800 + return PTR_ERR(node);
9801 +
9802 + /* setup delimiting keys for node being inserted */
9803 + write_lock_dk(tree);
9804 + znode_set_ld_key(node, key);
9805 + znode_set_rd_key(node, rdkey);
9806 + ON_DEBUG(node->creator = current);
9807 + ON_DEBUG(node->first_key = *key);
9808 + write_unlock_dk(tree);
9809 +
9810 + ZF_SET(node, JNODE_ORPHAN);
9811 +
9812 + /*
9813 + * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9814 + * carry_insert_data
9815 + */
9816 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9817 + sizeof(*item) + sizeof(*cdata));
9818 + if (IS_ERR(pool))
9819 + return PTR_ERR(pool);
9820 + todo = (carry_level *) (pool + 1);
9821 + init_carry_level(todo, pool);
9822 +
9823 + item = (reiser4_item_data *) (todo + 3);
9824 + cdata = (carry_insert_data *) (item + 1);
9825 +
9826 + op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9827 + if (!IS_ERR(op)) {
9828 + cdata->coord = insert_coord;
9829 + cdata->key = key;
9830 + cdata->data = item;
9831 + op->u.insert.d = cdata;
9832 + op->u.insert.type = COPT_ITEM_DATA;
9833 + build_child_ptr_data(node, item);
9834 + item->arg = NULL;
9835 + /* have @insert_coord be set at the inserted item after
9836 + insertion is done */
9837 + todo->track_type = CARRY_TRACK_CHANGE;
9838 + todo->tracked = lh;
9839 +
9840 + result = reiser4_carry(todo, NULL);
9841 + if (result == 0) {
9842 + /*
9843 + * pin node in memory. This is necessary for
9844 + * znode_make_dirty() below.
9845 + */
9846 + result = zload(node);
9847 + if (result == 0) {
9848 + lock_handle local_lh;
9849 +
9850 + /*
9851 + * if we inserted a new child into the tree we
9852 + * have to mark it dirty so that flush will be
9853 + * able to process it.
9854 + */
9855 + init_lh(&local_lh);
9856 + result = longterm_lock_znode(&local_lh, node,
9857 + ZNODE_WRITE_LOCK,
9858 + ZNODE_LOCK_LOPRI);
9859 + if (result == 0) {
9860 + znode_make_dirty(node);
9861 +
9862 + /*
9863 + * when the internal item pointing to
9864 + * @node was inserted into the twig
9865 + * node, create_hook_internal did not
9866 + * connect it properly because its
9867 + * right neighbor was not known. Do
9868 + * it here.
9869 + */
9870 + write_lock_tree(tree);
9871 + assert("nikita-3312",
9872 + znode_is_right_connected(node));
9873 + assert("nikita-2984",
9874 + node->right == NULL);
9875 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9876 + write_unlock_tree(tree);
9877 + result =
9878 + connect_znode(insert_coord, node);
9879 + ON_DEBUG(if (result == 0) check_dkeys(node););
9880 +
9881 + done_lh(lh);
9882 + move_lh(lh, &local_lh);
9883 + assert("vs-1676", node_is_empty(node));
9884 + coord_init_first_unit(insert_coord,
9885 + node);
9886 + } else {
9887 + warning("nikita-3136",
9888 + "Cannot lock child");
9889 + }
9890 + done_lh(&local_lh);
9891 + zrelse(node);
9892 + }
9893 + }
9894 + } else
9895 + result = PTR_ERR(op);
9896 + zput(node);
9897 + done_carry_pool(pool);
9898 + return result;
9899 +}
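For orientation, an editorial sketch (not part of the patch) of the single
init_carry_pool() allocation that add_empty_leaf() carves into four objects;
the pointer arithmetic above assumes exactly this layout:

	/*
	 * [ carry_pool | todo[0] todo[1] todo[2] | item | cdata ]
	 *   ^pool       ^(carry_level *)(pool + 1)
	 *                                         ^(reiser4_item_data *)(todo + 3)
	 *                                                ^(carry_insert_data *)(item + 1)
	 */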
9900 +
9901 +/**
9902 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9903 + * @h: search handle
9904 + * @outcome: flag saying whether search has to restart or is done
9905 + *
9906 + * Handles search on the twig level. If this function completes the search
9907 + * itself then it returns 1. If the search has to go one level down then 0 is
9908 + * returned. If an error happens then LOOKUP_DONE is returned via @outcome and
9909 + * the error code is saved in @h->result.
9910 + */
9911 +int handle_eottl(cbk_handle *h, int *outcome)
9912 +{
9913 + int result;
9914 + reiser4_key key;
9915 + coord_t *coord;
9916 +
9917 + coord = h->coord;
9918 +
9919 + if (h->level != TWIG_LEVEL ||
9920 + (coord_is_existing_item(coord) && item_is_internal(coord))) {
9921 + /* Continue to traverse tree downward. */
9922 + return 0;
9923 + }
9924 +
9925 + /*
9926 + * make sure that @h->coord is set to a twig node and that it is either
9927 + * set to an extent item or positioned after an extent item
9928 + */
9929 + assert("vs-356", h->level == TWIG_LEVEL);
9930 + assert("vs-357", ( {
9931 + coord_t lcoord;
9932 + coord_dup(&lcoord, coord);
9933 + check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9934 + item_is_extent(&lcoord);
9935 + }
9936 + ));
9937 +
9938 + if (*outcome == NS_FOUND) {
9939 + /* we have found desired key on twig level in extent item */
9940 + h->result = CBK_COORD_FOUND;
9941 + *outcome = LOOKUP_DONE;
9942 + return 1;
9943 + }
9944 +
9945 + if (!(h->flags & CBK_FOR_INSERT)) {
9946 + /* tree traversal is not for insertion. Just return
9947 + CBK_COORD_NOTFOUND. */
9948 + h->result = CBK_COORD_NOTFOUND;
9949 + *outcome = LOOKUP_DONE;
9950 + return 1;
9951 + }
9952 +
9953 + /* take a look at the item to the right of h->coord */
9954 + result = is_next_item_internal(coord, h->key, h->active_lh);
9955 + if (unlikely(result < 0)) {
9956 + h->error = "get_right_neighbor failed";
9957 + h->result = result;
9958 + *outcome = LOOKUP_DONE;
9959 + return 1;
9960 + }
9961 + if (result == 0) {
9962 + /*
9963 + * the item to the right is also an extent. Allocate a new node
9964 + * and insert a pointer to it after the item at h->coord.
9965 + *
9966 + * This is a result of extents being located at the twig
9967 + * level. For explanation, see comment just above
9968 + * is_next_item_internal().
9969 + */
9970 + znode *loaded;
9971 +
9972 + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
9973 + /*
9974 + * the node is only read locked; restart coord_by_key to
9975 + * take a write lock on the twig level
9976 + */
9977 + h->lock_level = TWIG_LEVEL;
9978 + h->lock_mode = ZNODE_WRITE_LOCK;
9979 + *outcome = LOOKUP_REST;
9980 + return 1;
9981 + }
9982 +
9983 + loaded = coord->node;
9984 + result =
9985 + add_empty_leaf(coord, h->active_lh, h->key,
9986 + rd_key(coord, &key));
9987 + if (result) {
9988 + h->error = "could not add empty leaf";
9989 + h->result = result;
9990 + *outcome = LOOKUP_DONE;
9991 + return 1;
9992 + }
9993 + /* the added empty leaf is locked (h->active_lh), its parent node
9994 + is unlocked, and h->coord is set to EMPTY_NODE */
9995 + assert("vs-13", coord->between == EMPTY_NODE);
9996 + assert("vs-14", znode_is_write_locked(coord->node));
9997 + assert("vs-15",
9998 + WITH_DATA(coord->node, node_is_empty(coord->node)));
9999 + assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10000 + assert("vs-17", coord->node == h->active_lh->node);
10001 + *outcome = LOOKUP_DONE;
10002 + h->result = CBK_COORD_NOTFOUND;
10003 + return 1;
10004 + } else if (result == 1) {
10005 + /*
10006 + * this is the special case mentioned in the comment on
10007 + * tree.h:cbk_flags. We have found an internal item immediately
10008 + * to the right of the extent, and we are going to insert a new
10009 + * item there. The key of the item we are going to insert is
10010 + * smaller than the leftmost key in the node pointed to by said
10011 + * internal item (otherwise the search wouldn't have come to the
10012 + * extent in the first place).
10013 + *
10014 + * This is a result of extents being located at the twig
10015 + * level. For explanation, see comment just above
10016 + * is_next_item_internal().
10017 + */
10018 + h->flags &= ~CBK_TRUST_DK;
10019 + } else {
10020 + assert("vs-8", result == 2);
10021 + *outcome = LOOKUP_REST;
10022 + return 1;
10023 + }
10024 + assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10025 + return 0;
10026 +}
10027 +
10028 +/*
10029 + * Local variables:
10030 + * c-indentation-style: "K&R"
10031 + * mode-name: "LC"
10032 + * c-basic-offset: 8
10033 + * tab-width: 8
10034 + * fill-column: 120
10035 + * scroll-step: 1
10036 + * End:
10037 + */
10038 diff -urN linux-2.6.22.orig/fs/reiser4/estimate.c linux-2.6.22/fs/reiser4/estimate.c
10039 --- linux-2.6.22.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10040 +++ linux-2.6.22/fs/reiser4/estimate.c 2007-07-29 00:25:34.840687159 +0400
10041 @@ -0,0 +1,120 @@
10042 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10043 +
10044 +#include "debug.h"
10045 +#include "dformat.h"
10046 +#include "tree.h"
10047 +#include "carry.h"
10048 +#include "inode.h"
10049 +#include "plugin/cluster.h"
10050 +#include "plugin/item/ctail.h"
10051 +
10052 +/* this returns how many nodes might get dirty and added if @children nodes are dirtied
10053 +
10054 + The number of internal nodes which will get dirty or get allocated we estimate as about 10% of the children
10055 + (103/1024 in the code) + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on the
10056 + leaf level, 2 neighbour nodes + the current (or 1 neighbour and 1 new and the current) on the twig level,
10057 + 2 neighbour nodes on upper levels and 1 for a new root. So 5 for the leaf level, 3 for the twig level, 2 on
10058 + upper levels + 1 for the root.
10059 +
10060 + Do not count the current node of the lowest level here - it is overhead only.
10061 + @children is almost always 1 here. The exception is flow insertion.
10062 +*/
10063 +static reiser4_block_nr
10064 +max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10065 +{
10066 + reiser4_block_nr ten_percent;
10067 +
10068 +	ten_percent = ((103 * children) >> 10);
10069 +
10070 +	/* If too many balancings happen at the same time, the tree height can rise by more
10071 +	   than 1. Assume that once tree_height reaches 5 it can rise by 1 only. */
10072 + return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10073 +}
10074 +
10075 +/* this returns the maximal possible number of nodes which can be modified plus the number of new
10076 + nodes which may be required to perform insertion of one item into the tree */
10077 +/* it is only called when the tree height changes or gets initialized */
10078 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10079 +{
10080 + return 1 + max_balance_overhead(1, height);
10081 +}
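A quick sanity check of the arithmetic (an editorial worked example, not part
of the patch): for a single dirtied child in a tree of height 4, the 103/1024
term truncates to zero and the height is clamped to 5:

	/*
	 * max_balance_overhead(1, 4)
	 *   = 5 * 2 + (4 + ((103 * 1) >> 10))
	 *   = 10 + 4 + 0 = 14
	 *
	 * calc_estimate_one_insert(4) = 1 + 14 = 15
	 *
	 * i.e. one item insertion into a short tree is budgeted at 15 blocks
	 * that may be dirtied or newly allocated.
	 */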
10082 +
10083 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10084 +{
10085 + return tree->estimate_one_insert;
10086 +}
10087 +
10088 +/* this returns the maximal possible number of nodes which can be modified plus the number of new
10089 + nodes which may be required to perform insertion of one unit into an item in the tree */
10090 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10091 +{
10092 + /* estimate insert into item just like item insertion */
10093 + return tree->estimate_one_insert;
10094 +}
10095 +
10096 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10097 +{
10098 +	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be
10099 +	   dirtied on the leaf level */
10100 + return tree->estimate_one_insert;
10101 +}
10102 +
10103 +/* on the leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes
10104 + (the insert point and both of its neighbors). max_balance_overhead() estimates the number of blocks
10105 + which may change or get added on the internal levels */
10106 +reiser4_block_nr estimate_insert_flow(tree_level height)
10107 +{
10108 + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10109 + CARRY_FLOW_NEW_NODES_LIMIT,
10110 + height);
10111 +}
10112 +
10113 +/* returns the maximal number of nodes which can be occupied by a disk cluster */
10114 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10115 +{
10116 + int per_cluster;
10117 + per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10118 + return 3 + per_cluster +
10119 + max_balance_overhead(3 + per_cluster,
10120 + REISER4_MAX_ZTREE_HEIGHT);
10121 +}
10122 +
10123 +/* how many nodes might get dirty and added
10124 + during insertion of a disk cluster */
10125 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10126 +{
10127 + return estimate_cluster(inode, 1); /* 24 */
10128 +}
10129 +
10130 +/* how many nodes might get dirty and added
10131 + during update of a (prepped or unprepped) disk cluster */
10132 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10133 +{
10134 + return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10135 +}
10136 +
10137 +/* How many nodes occupied by a disk cluster might get dirty.
10138 + Note that this estimate is not precise (i.e. a disk cluster
10139 + can occupy more nodes).
10140 + Q: Why don't we use a precise estimate?
10141 + A: 1. Because a precise estimate is fairly bad: 65536 nodes
10142 + for a 64K logical cluster means 256M of dead space on
10143 + a partition.
10144 + 2. It is a very rare case when a disk cluster occupies
10145 + more nodes than this estimate returns.
10146 +*/
10147 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10148 +{
10149 + return cluster_nrpages(inode) + 4;
10150 +}
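An editorial worked example (assuming 4K pages and a 64K logical cluster, so
that cluster_nrpages() returns 16; neither value is fixed by this file):

	/*
	 * estimate_dirty_cluster(inode) = 16 + 4 = 20
	 *
	 * i.e. a dirtied 64K cluster is budgeted at 20 nodes - far below the
	 * worst case of 65536 nodes mentioned in the comment above.
	 */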
10151 +
10152 +/* Make Linus happy.
10153 + Local variables:
10154 + c-indentation-style: "K&R"
10155 + mode-name: "LC"
10156 + c-basic-offset: 8
10157 + tab-width: 8
10158 + fill-column: 120
10159 + scroll-step: 1
10160 + End:
10161 +*/
10162 diff -urN linux-2.6.22.orig/fs/reiser4/export_ops.c linux-2.6.22/fs/reiser4/export_ops.c
10163 --- linux-2.6.22.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10164 +++ linux-2.6.22/fs/reiser4/export_ops.c 2007-07-29 00:25:34.840687159 +0400
10165 @@ -0,0 +1,295 @@
10166 +/* Copyright 2005 by Hans Reiser, licensing governed by
10167 + * reiser4/README */
10168 +
10169 +#include "inode.h"
10170 +#include "plugin/plugin.h"
10171 +
10172 +/*
10173 + * Supported file-handle types
10174 + */
10175 +typedef enum {
10176 + FH_WITH_PARENT = 0x10, /* file handle with parent */
10177 + FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10178 +} reiser4_fhtype;
10179 +
10180 +#define NFSERROR (255)
10181 +
10182 +/* initialize place-holder for object */
10183 +static void object_on_wire_init(reiser4_object_on_wire *o)
10184 +{
10185 + o->plugin = NULL;
10186 +}
10187 +
10188 +/* finish with @o */
10189 +static void object_on_wire_done(reiser4_object_on_wire *o)
10190 +{
10191 + if (o->plugin != NULL)
10192 + o->plugin->wire.done(o);
10193 +}
10194 +
10195 +/*
10196 + * read serialized object identity from @addr and store information about
10197 + * object in @obj. This is dual to encode_inode().
10198 + */
10199 +static char *decode_inode(struct super_block *s, char *addr,
10200 + reiser4_object_on_wire * obj)
10201 +{
10202 + file_plugin *fplug;
10203 +
10204 + /* identifier of object plugin is stored in the first two bytes,
10205 + * followed by... */
10206 + fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10207 + if (fplug != NULL) {
10208 + addr += sizeof(d16);
10209 + obj->plugin = fplug;
10210 + assert("nikita-3520", fplug->wire.read != NULL);
10211 + /* plugin specific encoding of object identity. */
10212 + addr = fplug->wire.read(addr, obj);
10213 + } else
10214 + addr = ERR_PTR(RETERR(-EINVAL));
10215 + return addr;
10216 +}
10217 +
10218 +/**
10219 + * reiser4_decode_fh - decode_fh of export operations
10220 + * @super: super block
10221 + * @fh: nfsd file handle
10222 + * @len: length of file handle
10223 + * @fhtype: type of file handle
10224 + * @acceptable: acceptability testing function
10225 + * @context: argument for @acceptable
10226 + *
10227 + * Returns dentry referring to the same file as @fh.
10228 + */
10229 +static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10230 + int len, int fhtype,
10231 + int (*acceptable) (void *context,
10232 + struct dentry *de),
10233 + void *context)
10234 +{
10235 + reiser4_context *ctx;
10236 + reiser4_object_on_wire object;
10237 + reiser4_object_on_wire parent;
10238 + char *addr;
10239 + int with_parent;
10240 +
10241 + ctx = reiser4_init_context(super);
10242 + if (IS_ERR(ctx))
10243 + return (struct dentry *)ctx;
10244 +
10245 + assert("vs-1482",
10246 + fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10247 +
10248 + with_parent = (fhtype == FH_WITH_PARENT);
10249 +
10250 + addr = (char *)fh;
10251 +
10252 + object_on_wire_init(&object);
10253 + object_on_wire_init(&parent);
10254 +
10255 + addr = decode_inode(super, addr, &object);
10256 + if (!IS_ERR(addr)) {
10257 + if (with_parent)
10258 + addr = decode_inode(super, addr, &parent);
10259 + if (!IS_ERR(addr)) {
10260 + struct dentry *d;
10261 + typeof(super->s_export_op->find_exported_dentry) fn;
10262 +
10263 + fn = super->s_export_op->find_exported_dentry;
10264 + assert("nikita-3521", fn != NULL);
10265 + d = fn(super, &object, with_parent ? &parent : NULL,
10266 + acceptable, context);
10267 + if (d != NULL && !IS_ERR(d))
10268 + /* FIXME check for -ENOMEM */
10269 + reiser4_get_dentry_fsdata(d)->stateless = 1;
10270 + addr = (char *)d;
10271 + }
10272 + }
10273 +
10274 + object_on_wire_done(&object);
10275 + object_on_wire_done(&parent);
10276 +
10277 + reiser4_exit_context(ctx);
10278 + return (void *)addr;
10279 +}
10280 +
10281 +/*
10282 + * Object serialization support.
10283 + *
10284 + * To support knfsd file system provides export_operations that are used to
10285 + * construct and interpret NFS file handles. As a generalization of this,
10286 + * reiser4 object plugins have serialization support: it provides methods to
10287 + * create on-wire representation of identity of reiser4 object, and
10288 + * re-create/locate object given its on-wire identity.
10289 + *
10290 + */
10291 +
10292 +/*
10293 + * return number of bytes that on-wire representation of @inode's identity
10294 + * consumes.
10295 + */
10296 +static int encode_inode_size(struct inode *inode)
10297 +{
10298 + assert("nikita-3514", inode != NULL);
10299 + assert("nikita-3515", inode_file_plugin(inode) != NULL);
10300 + assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10301 +
10302 + return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10303 +}
10304 +
10305 +/*
10306 + * store on-wire representation of @inode's identity at the area beginning at
10307 + * @start.
10308 + */
10309 +static char *encode_inode(struct inode *inode, char *start)
10310 +{
10311 + assert("nikita-3517", inode != NULL);
10312 + assert("nikita-3518", inode_file_plugin(inode) != NULL);
10313 + assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10314 +
10315 + /*
10316 + * first, store two-byte identifier of object plugin, then
10317 + */
10318 + save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10319 + (d16 *) start);
10320 + start += sizeof(d16);
10321 + /*
10322 + * call plugin to serialize object's identity
10323 + */
10324 + return inode_file_plugin(inode)->wire.write(inode, start);
10325 +}
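For orientation, an editorial sketch of the on-wire identity produced by
encode_inode() and consumed by decode_inode() above (not a format
specification):

	/*
	 * +----------------+---------------------------------------------+
	 * | d16 plugin id  | plugin-specific identity, wire.size() bytes |
	 * | save_plugin_id | written by wire.write, read by wire.read    |
	 * +----------------+---------------------------------------------+
	 *
	 * A FH_WITH_PARENT file handle carries two such records back to
	 * back: the object's, then the parent's.
	 */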
10326 +
10327 +/* the number of 32-bit words used by the file handle is stored in @lenp. 255
10328 + * (NFSERROR) is returned if the file handle cannot be stored */
10329 +/**
10330 + * reiser4_encode_fh - encode_fh of export operations
10331 + * @dentry:
10332 + * @fh:
10333 + * @lenp:
10334 + * @need_parent:
10335 + *
10336 + */
10337 +static int
10338 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10339 + int need_parent)
10340 +{
10341 + struct inode *inode;
10342 + struct inode *parent;
10343 + char *addr;
10344 + int need;
10345 + int delta;
10346 + int result;
10347 + reiser4_context *ctx;
10348 +
10349 + /*
10350 + * knfsd asks us to serialize the object in @dentry and, optionally, its
10351 + * parent (if need_parent != 0).
10352 + *
10353 + * encode_inode() and encode_inode_size() are used to build the
10354 + * representation of the object and its parent. All hard work is done by
10355 + * the object plugins.
10356 + */
10357 + inode = dentry->d_inode;
10358 + parent = dentry->d_parent->d_inode;
10359 +
10360 + addr = (char *)fh;
10361 +
10362 + need = encode_inode_size(inode);
10363 + if (need < 0)
10364 + return NFSERROR;
10365 + if (need_parent) {
10366 + delta = encode_inode_size(parent);
10367 + if (delta < 0)
10368 + return NFSERROR;
10369 + need += delta;
10370 + }
10371 +
10372 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
10373 + if (IS_ERR(ctx))
10374 + return PTR_ERR(ctx);
10375 +
10376 + if (need <= sizeof(__u32) * (*lenp)) {
10377 + addr = encode_inode(inode, addr);
10378 + if (need_parent)
10379 + addr = encode_inode(parent, addr);
10380 +
10381 + /* store in *lenp the number of 32-bit words required for the
10382 + * file handle. */
10383 + *lenp = (need + sizeof(__u32) - 1) >> 2;
10384 + result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10385 + } else
10386 + /* not enough space in the file handle */
10387 + result = NFSERROR;
10388 + reiser4_exit_context(ctx);
10389 + return result;
10390 +}
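The conversion above rounds the byte count up to whole 32-bit words. An
editorial worked example (the byte count 10 is hypothetical):

	/*
	 * need = 10 bytes
	 * *lenp = (10 + sizeof(__u32) - 1) >> 2 = 13 >> 2 = 3 words
	 *
	 * while the space check "need <= sizeof(__u32) * (*lenp)" compares
	 * the bytes needed against the words offered by knfsd.
	 */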
10391 +
10392 +/**
10393 + * reiser4_get_dentry_parent - get_parent of export operations
10394 + * @child:
10395 + *
10396 + */
10397 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10398 +{
10399 + struct inode *dir;
10400 + dir_plugin *dplug;
10401 +
10402 + assert("nikita-3527", child != NULL);
10403 + /* see comment in reiser4_get_dentry() about following assertion */
10404 + assert("nikita-3528", is_in_reiser4_context());
10405 +
10406 + dir = child->d_inode;
10407 + assert("nikita-3529", dir != NULL);
10408 + dplug = inode_dir_plugin(dir);
10409 + assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10410 + if (dplug != NULL)
10411 + return dplug->get_parent(dir);
10412 + else
10413 + return ERR_PTR(RETERR(-ENOTDIR));
10414 +}
10415 +
10416 +/**
10417 + * reiser4_get_dentry - get_dentry of export operations
10418 + * @super:
10419 + * @data:
10420 + *
10421 + *
10422 + */
10423 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10424 +{
10425 + reiser4_object_on_wire *o;
10426 +
10427 + assert("nikita-3522", super != NULL);
10428 + assert("nikita-3523", data != NULL);
10429 + /*
10430 + * this is only supposed to be called by
10431 + *
10432 + * reiser4_decode_fh->find_exported_dentry
10433 + *
10434 + * so, reiser4_context should be here already.
10435 + */
10436 + assert("nikita-3526", is_in_reiser4_context());
10437 +
10438 + o = (reiser4_object_on_wire *)data;
10439 + assert("nikita-3524", o->plugin != NULL);
10440 + assert("nikita-3525", o->plugin->wire.get != NULL);
10441 +
10442 + return o->plugin->wire.get(super, o);
10443 +}
10444 +
10445 +struct export_operations reiser4_export_operations = {
10446 + .encode_fh = reiser4_encode_fh,
10447 + .decode_fh = reiser4_decode_fh,
10448 + .get_parent = reiser4_get_dentry_parent,
10449 + .get_dentry = reiser4_get_dentry
10450 +};
10451 +
10452 +/*
10453 + * Local variables:
10454 + * c-indentation-style: "K&R"
10455 + * mode-name: "LC"
10456 + * c-basic-offset: 8
10457 + * tab-width: 8
10458 + * fill-column: 79
10459 + * End:
10460 + */
10461 diff -urN linux-2.6.22.orig/fs/reiser4/flush.c linux-2.6.22/fs/reiser4/flush.c
10462 --- linux-2.6.22.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10463 +++ linux-2.6.22/fs/reiser4/flush.c 2007-07-29 00:25:34.000000000 +0400
10464 @@ -0,0 +1,3625 @@
10465 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10466 +
10467 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10468 +
10469 +#include "forward.h"
10470 +#include "debug.h"
10471 +#include "dformat.h"
10472 +#include "key.h"
10473 +#include "coord.h"
10474 +#include "plugin/item/item.h"
10475 +#include "plugin/plugin.h"
10476 +#include "plugin/object.h"
10477 +#include "txnmgr.h"
10478 +#include "jnode.h"
10479 +#include "znode.h"
10480 +#include "block_alloc.h"
10481 +#include "tree_walk.h"
10482 +#include "carry.h"
10483 +#include "tree.h"
10484 +#include "vfs_ops.h"
10485 +#include "inode.h"
10486 +#include "page_cache.h"
10487 +#include "wander.h"
10488 +#include "super.h"
10489 +#include "entd.h"
10490 +#include "reiser4.h"
10491 +#include "flush.h"
10492 +#include "writeout.h"
10493 +
10494 +#include <asm/atomic.h>
10495 +#include <linux/fs.h> /* for struct super_block */
10496 +#include <linux/mm.h> /* for struct page */
10497 +#include <linux/bio.h> /* for struct bio */
10498 +#include <linux/pagemap.h>
10499 +#include <linux/blkdev.h>
10500 +
10501 +/* IMPLEMENTATION NOTES */
10502 +
10503 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10504 + order to the nodes of the tree in which the parent is placed before its children, which
10505 + are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10506 + describes the node that "came before in forward parent-first order". When we speak of a
10507 + "parent-first follower", it describes the node that "comes next in parent-first
10508 + order" (alternatively the node that "came before in reverse parent-first order").
10509 +
10510 + The following pseudo-code prints the nodes of a tree in forward parent-first order:
10511 +
10512 + void parent_first (node)
10513 + {
10514 + print_node (node);
10515 + if (node->level > leaf) {
10516 + for (i = 0; i < num_children; i += 1) {
10517 + parent_first (node->child[i]);
10518 + }
10519 + }
10520 + }
10521 +*/
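An editorial illustration of the order the pseudo-code above produces, for a
two-level tree with root R and leaves A, B and C:

	/*
	 *          R
	 *        / | \
	 *       A  B  C
	 *
	 * forward parent-first order: R, A, B, C
	 * (R is the parent-first preceder of A, A of B, and B of C)
	 */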
10522 +
10523 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10524 + that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10525 + can be accomplished with sequential reads, which results in reading nodes in their
10526 + parent-first order. This is a read-optimization aspect of the flush algorithm, and
10527 + there is also a write-optimization aspect, which is that we wish to make large
10528 + sequential writes to the disk by allocating or reallocating blocks so that they can be
10529 + written in sequence. Sometimes the read-optimization and write-optimization goals
10530 + conflict with each other, as we discuss in more detail below.
10531 +*/
10532 +
10533 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10534 + the relevant jnode->state bits and their relevance to flush:
10535 +
10536 + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10537 + must be allocated first. In order to be considered allocated, the jnode must have
10538 + exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10539 + all dirtied jnodes eventually have one of these bits set during each transaction.
10540 +
10541 + JNODE_CREATED: The node was freshly created in its transaction and has no previous
10542 + block address, so it is unconditionally assigned to be relocated, although this is
10543 + mainly for code-convenience. It is not being 'relocated' from anything, but in
10544 + almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10545 + remains set even after JNODE_RELOC is set, so the actual relocate set can be
10546 + distinguished from the created-and-allocated set easily: relocate-set members
10547 + (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10548 + have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10549 +
10550 + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10551 + decision to maintain the pre-existing location for this node and it will be written
10552 + to the wandered-log.
10553 +
10554 + JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10555 + not created, see note above). A block with JNODE_RELOC set is eligible for
10556 + early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10557 + bit is set on a znode, the parent node's internal item is modified and the znode is
10558 + rehashed.
10559 +
10560 + JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10561 + and calls the plugin->f.squeeze() method for its items. This is how disk clusters of
10562 + cryptcompress objects are updated. Also, if the leftmost point found by the flush scan
10563 + has this flag set (it races with write(), a rare case) the flush algorithm decides to
10564 + pass it to squalloc() in spite of its flushprepped status - for squeezing, not for
10565 + repeated allocation.
10566 +
10567 + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10568 + flush queue. This means the jnode is not on any clean or dirty list; instead it is
10569 + moved to one of the flush queue object's private lists (see flush_queue.h). This
10570 + prevents multiple concurrent flushes from attempting to start flushing from the
10571 + same node.
10572 +
10573 + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10574 + squeeze-and-allocate on a node while its children are actively being squeezed and
10575 + allocated. This flag was created to avoid submitting a write request for a node
10576 + while its children are still being allocated and squeezed. Then flush queue was
10577 + re-implemented to allow unlimited number of nodes be queued. This flag support was
10578 + commented out in source code because we decided that there was no reason to submit
10579 + queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10580 + during a slum traversal and may submit "busy nodes" to disk. Probably we can
10581 + re-enable the JNODE_FLUSH_BUSY bit support in the future.
10582 +
10583 + With these state bits, we describe a test used frequently in the code below,
10584 + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10585 + test for "flushprepped" returns true if any of the following are true:
10586 +
10587 + - The node is not dirty
10588 + - The node has JNODE_RELOC set
10589 + - The node has JNODE_OVRWR set
10590 +
10591 + If either the node is not dirty or it has already been processed by flush (and assigned
10592 + JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10593 + false then flush still has work to do on that node.
10594 +*/
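A minimal sketch of the "flushprepped" test described above, assuming the
JF_ISSET() jnode flag helper (an editorial illustration; the real predicate
lives in jnode.h):

	static inline int flushprepped_sketch(jnode * node)
	{
		/* prepped: clean, or already assigned to the relocate or
		 * overwrite set */
		return !JF_ISSET(node, JNODE_DIRTY) ||
		       JF_ISSET(node, JNODE_RELOC) ||
		       JF_ISSET(node, JNODE_OVRWR);
	}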
10595 +
10596 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10597 + flushprepped twice (unless an explicit call to flush_unprep is made as described in
10598 + detail below). For example a node is dirtied, allocated, and then early-flushed to
10599 + disk and set clean. Before the transaction commits, the page is dirtied again and, due
10600 + to memory pressure, the node is flushed again. The flush algorithm will not relocate
10601 + the node to a new disk location, it will simply write it to the same, previously
10602 + relocated position again.
10603 +*/
10604 +
10605 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10606 + start at a leaf node and allocate in parent-first order by iterating to the right. At
10607 + each step of the iteration, we check for the right neighbor. Before advancing to the
10608 + right neighbor, we check if the current position and the right neighbor share the same
10609 + parent. If they do not share the same parent, the parent is allocated before the right
10610 + neighbor.
10611 +
10612 + This process goes recursively up the tree, squeezing nodes level by level as long as
10613 + the right neighbor and the current position have different parents, then it allocates
10614 + the right-neighbors-with-different-parents on the way back down. This process is
10615 + described in more detail in flush_squalloc_changed_ancestor and the recursive function
10616 + squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
10617 + specifics of the bottom-up approach as to contrast the bottom-up and top-down
10618 + approaches.
10619 +
10620 + The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10621 + approach, we find a starting point by scanning left along each level past dirty nodes,
10622 + then going up and repeating the process until the left node and the parent node are
10623 + clean. We then perform a parent-first traversal from the starting point, which makes
10624 + allocating in parent-first order trivial. After one subtree has been allocated in this
10625 + manner, we move to the right, try moving upward, then repeat the parent-first
10626 + traversal.
10627 +
10628 + Both approaches have problems that need to be addressed. Both are approximately the
10629 + same amount of code, but the bottom-up approach has advantages in the order it acquires
10630 + locks which, at the very least, make it the better approach. At first glance each one
10631 + makes the other one look simpler, so it is important to remember a few of the problems
10632 + with each one.
10633 +
10634 + Main problem with the top-down approach: When you encounter a clean child during the
10635 + parent-first traversal, what do you do? You would like to avoid searching through a
10636 + large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10637 + obvious solution. One of the advantages of the top-down approach is that during the
10638 + parent-first traversal you check every child of a parent to see if it is dirty. In
10639 + this way, the top-down approach easily handles the main problem of the bottom-up
10640 + approach: unallocated children.
10641 +
10642 + The unallocated children problem is that before writing a node to disk we must make
10643 + sure that all of its children are allocated. Otherwise, writing the node means
10644 + extra I/O because the node will have to be written again when the child is finally
10645 + allocated.
10646 +
10647 + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10648 + should not cause any file system corruption; it only degrades I/O performance because a
10649 + node may be written when it is sure to be written at least one more time in the same
10650 + transaction when the remaining children are allocated. What follows is a description
10651 + of how we will solve the problem.
10652 +*/
10653 +
10654 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10655 + proceeding in parent first order, allocate some of its left-children, then encounter a
10656 + clean child in the middle of the parent. We do not allocate the clean child, but there
10657 + may remain unallocated (dirty) children to the right of the clean child. If we were to
10658 + stop flushing at this moment and write everything to disk, the parent might still
10659 + contain unallocated children.
10660 +
10661 + We could try to allocate all the descendants of every node that we allocate, but this
10662 + is not necessary. Doing so could result in allocating the entire tree: if the root
10663 + node is allocated then every unallocated node would have to be allocated before
10664 + flushing. Actually, we do not have to write a node just because we allocate it. It is
10665 + possible to allocate but not write a node during flush, when it still has unallocated
10666 + children. However, this approach is probably not optimal for the following reason.
10667 +
10668 + The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10669 + to optimize reads that occur in the same order. Thus we are read-optimizing for a
10670 + left-to-right scan through all the leaves in the system, and we are hoping to
10671 + write-optimize at the same time because those nodes will be written together in batch.
10672 + What happens, however, if we assign a block number to a node in its read-optimized
10673 + order but then avoid writing it because it has unallocated children? In that
10674 + situation, we lose out on the write-optimization aspect because a node will have to be
10675 + written again to its location on the device, later, which likely means seeking back
10676 + to that location.
10677 +
10678 + So there are tradeoffs. We can choose either:
10679 +
10680 + A. Allocate all unallocated children to preserve both write-optimization and
10681 + read-optimization, but this is not always desirable because it may mean having to
10682 + allocate and flush very many nodes at once.
10683 +
10684 + B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10685 + but sacrifice write-optimization because those nodes will be written again.
10686 +
10687 + C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10688 + locations. Instead, choose to write-optimize them later, when they are written. To
10689 + facilitate this, we "undo" the read-optimized allocation that was given to the node so
10690 + that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10691 + case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10692 + call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10693 + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10694 + location, and set the JNODE_CREATED bit, effectively setting the node back to an
10695 + unallocated state.
10696 +
10697 + We will take the following approach in v4.0: for twig nodes we will always finish
10698 + allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10699 + writing and choose write-optimization (C).
10700 +
10701 + To summarize, there are several parts to a solution that avoids the problem with
10702 + unallocated children:
10703 +
10704 + FIXME-ZAM: None of these approaches has been implemented yet to eliminate the "UNALLOCATED
10705 + CHILDREN" problem, because an experiment showed that we get only 1-2 nodes with unallocated
10706 + children per thousands of written nodes. The experiment was simple, like copying / deletion
10707 + of linux kernel sources. However the problem can arise in more complex tests. I think we
10708 + could use jnode_io_hook to insert a check for unallocated children and see what kind of
10709 + problem we have.
10710 +
10711 + 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10712 + squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10713 + implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10714 + comments in that function.
10715 +
10716 + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10717 + have unallocated children. If the twig level has unallocated children it is an
10718 + assertion failure. If a higher-level node has unallocated children, then it should be
10719 + explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10720 + should be simple.
10721 +
10722 + 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10723 + CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10724 + this somewhat in the case where large sub-trees are flushed. The following observation
10725 + helps: if both the left- and right-neighbor of a node are processed by the flush
10726 + algorithm then the node itself is guaranteed to have all of its children allocated.
10727 + However, the cost of this check may not be so expensive after all: it is not needed for
10728 + leaves and flush can guarantee this property for twigs. That leaves only (level >
10729 + TWIG) nodes that have to be checked, so this optimization only helps if at least three
10730 + (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10731 + there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10732 + then the number of blocks being written will be very large, so the savings may be
10733 + insignificant. That said, the idea is to maintain both the left and right edges of
10734 + nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10735 + simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10736 + edge, the slow check is necessary, but if it is in the interior then it can be assumed
10737 + to have all of its children allocated. FIXME: medium complexity to implement, but
10738 + simple to verify given that we must have a slow check anyway.
10739 +
10740 + 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10741 + whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10742 + left-scan operation to take unallocated children into account. Normally, the left-scan
10743 + operation goes left as long as adjacent nodes are dirty up until some large maximum
10744 + value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10745 + may stop at a position where there are unallocated children to the left with the same
10746 + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10747 + FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10748 + with a rapid scan. The rapid scan skips all the interior children of a node--if the
10749 + leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10750 + twig to the left). If the left neighbor of the leftmost child is also dirty, then
10751 + continue the scan at the left twig and repeat. This option will cause flush to
10752 + allocate more twigs in a single pass, but it also has the potential to write many more
10753 + nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10754 + was partially implemented, code removed August 12, 2002 by JMACD.
10755 +*/
10756 +
10757 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10758 + starting point for flush is a leaf node, but actually the flush code cares very little
10759 + about whether or not this is true. It is possible that all the leaf nodes are flushed
10760 + and dirty parent nodes still remain, in which case jnode_flush() is called on a
10761 + non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10762 + leaf, even when it is not. This is a simple approach, and there may be a more optimal
10763 + policy but until a problem with this approach is discovered, simplest is probably best.
10764 +
10765 + NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10766 + the leaves. This is done as a matter of simplicity and there is only one (shaky)
10767 + justification. When an atom commits, it flushes all leaf level nodes first, followed
10768 + by twigs, and so on. With flushing done in this order, if flush is eventually called
10769 + on a non-leaf node it means that (somehow) we reached a point where all leaves are
10770 + clean and only internal nodes need to be flushed. If that it the case, then it means
10771 + there were no leaves that were the parent-first preceder/follower of the parent. This
10772 + is expected to be a rare case, which is why we do nothing special about it. However,
10773 + memory pressure may pass an internal node to flush when there are still dirty leaf
10774 + nodes that need to be flushed, which could prove our original assumptions
10775 + "inoperative". If this needs to be fixed, then scan_left/right should have
10776 + special checks for the non-leaf levels. For example, instead of passing from a node to
10777 + the left neighbor, it should pass from the node to the left neighbor's rightmost
10778 + descendant (if dirty).
10779 +
10780 +*/
10781 +
10782 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10783 + it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10784 + logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10785 + device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10786 + device becomes sorted such that tree order and block number order fully correlate.
10787 +
10788 + Resizing is done by shifting everything either all the way to the left or all the way
10789 + to the right, and then reporting the last block.
10790 +*/
10791 +
10792 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10793 + describes the policy from the highest level:
10794 +
10795 + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10796 + leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10797 + leaf nodes.
10798 +
10799 + Otherwise, there are two contexts in which we make a decision to relocate:
10800 +
10801 + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10802 + During the initial stages of flush, after scan-right completes, we want to ask the
10803 + question: should we relocate this leaf node and thus dirty the parent node. Then if
10804 + the node is a leftmost child, its parent is its parent-first preceder, so we repeat
10805 + the question at the next level up, and so on. In these cases we are moving in the
10806 + reverse-parent first direction.
10807 +
10808 + There is another case which is considered the reverse direction, which comes at the end
10809 + of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10810 + reach a point where there is a clean twig to the right with a dirty leftmost child. In
10811 + this case, we may wish to relocate the child by testing if it should be relocated
10812 + relative to its parent.
10813 +
10814 + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10815 + allocate_znode. What distinguishes the forward parent-first case from the
10816 + reverse-parent first case is that the preceder has already been allocated in the
10817 + forward case, whereas in the reverse case we don't know what the preceder is until we
10818 + finish "going in reverse". That simplifies the forward case considerably, and there we
10819 + actually use the block allocator to determine whether, e.g., a block closer to the
10820 + preceder is available.
10821 +*/
10822 +
10823 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10824 + finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10825 + squeeze the parent's left neighbor and the parent. This may change the
10826 + flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10827 + is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10828 + Note that we cannot allocate extents during this or they will be out of parent-first
10829 + order. There are also some difficult coordinate maintenance issues. We can't do a tree
10830 + search to find coordinates again (because we hold locks), we have to determine them
10831 + from the two nodes being squeezed. Looks difficult, but has potential to increase
10832 + space utilization. */
10833 +
10834 +/* Flush-scan helper functions. */
10835 +static void scan_init(flush_scan * scan);
10836 +static void scan_done(flush_scan * scan);
10837 +
10838 +/* Flush-scan algorithm. */
10839 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10840 + unsigned limit);
10841 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10842 +static int scan_common(flush_scan * scan, flush_scan * other);
10843 +static int scan_formatted(flush_scan * scan);
10844 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
10845 +static int scan_by_coord(flush_scan * scan);
10846 +
10847 +/* Initial flush-point ancestor allocation. */
10848 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
10849 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10850 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10851 +
10852 +/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10853 +static int squalloc(flush_pos_t * pos);
10854 +
10855 +/* Flush squeeze implementation. */
10856 +static int squeeze_right_non_twig(znode * left, znode * right);
10857 +static int shift_one_internal_unit(znode * left, znode * right);
10858 +
10859 +/* Flush reverse parent-first relocation routines. */
10860 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10861 + const reiser4_block_nr * nblk);
10862 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10863 + flush_pos_t * pos);
10864 +static int reverse_relocate_check_dirty_parent(jnode * node,
10865 + const coord_t * parent_coord,
10866 + flush_pos_t * pos);
10867 +
10868 +/* Flush allocate write-queueing functions: */
10869 +static int allocate_znode(znode * node, const coord_t * parent_coord,
10870 + flush_pos_t * pos);
10871 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10872 + flush_pos_t * pos);
10873 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10874 +
10875 +/* Flush helper functions: */
10876 +static int jnode_lock_parent_coord(jnode * node,
10877 + coord_t * coord,
10878 + lock_handle * parent_lh,
10879 + load_count * parent_zh,
10880 + znode_lock_mode mode, int try);
10881 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10882 + znode_lock_mode mode, int check_dirty, int expected);
10883 +static int znode_same_parents(znode * a, znode * b);
10884 +
10885 +static int znode_check_flushprepped(znode * node)
10886 +{
10887 + return jnode_check_flushprepped(ZJNODE(node));
10888 +}
10889 +
10890 +/* Flush position functions */
10891 +static void pos_init(flush_pos_t * pos);
10892 +static int pos_valid(flush_pos_t * pos);
10893 +static void pos_done(flush_pos_t * pos);
10894 +static int pos_stop(flush_pos_t * pos);
10895 +
10896 +/* check that @org is the first jnode of its extent unit, if the extent is unallocated,
10897 + * because all jnodes of an unallocated extent are dirty and of the same atom. */
10898 +#define checkchild(scan) \
10899 +assert("nikita-3435", \
10900 + ergo(scan->direction == LEFT_SIDE && \
10901 + (scan->parent_coord.node->level == TWIG_LEVEL) && \
10902 + jnode_is_unformatted(scan->node) && \
10903 + extent_is_unallocated(&scan->parent_coord), \
10904 + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10905 +
10906 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
10907 + useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10908 + no static initializer function...) */
10909 +ON_DEBUG(atomic_t flush_cnt;
10910 + )
10911 +
10912 +/* check fs backing device for write congestion */
10913 +static int check_write_congestion(void)
10914 +{
10915 + struct super_block *sb;
10916 + struct backing_dev_info *bdi;
10917 +
10918 + sb = reiser4_get_current_sb();
10919 + bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
10920 + return bdi_write_congested(bdi);
10921 +}
10922 +
10923 +/* conditionally write flush queue */
10924 +static int write_prepped_nodes(flush_pos_t * pos)
10925 +{
10926 + int ret;
10927 +
10928 + assert("zam-831", pos);
10929 + assert("zam-832", pos->fq);
10930 +
10931 + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10932 + return 0;
10933 +
10934 + if (check_write_congestion())
10935 + return 0;
10936 +
10937 + ret = reiser4_write_fq(pos->fq, pos->nr_written,
10938 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10939 + return ret;
10940 +}
10941 +
10942 +/* Properly release all flush position resources, then move the flush position
10943 + to the newly locked node */
10944 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10945 + load_count * new_load, const coord_t * new_coord)
10946 +{
10947 + assert("zam-857", new_lock->node == new_load->node);
10948 +
10949 + if (new_coord) {
10950 + assert("zam-858", new_coord->node == new_lock->node);
10951 + coord_dup(&pos->coord, new_coord);
10952 + } else {
10953 + coord_init_first_unit(&pos->coord, new_lock->node);
10954 + }
10955 +
10956 + if (pos->child) {
10957 + jput(pos->child);
10958 + pos->child = NULL;
10959 + }
10960 +
10961 + move_load_count(&pos->load, new_load);
10962 + done_lh(&pos->lock);
10963 + move_lh(&pos->lock, new_lock);
10964 +}
10965 +
10966 +/* delete an empty node whose link from the parent still exists. */
10967 +static int delete_empty_node(znode * node)
10968 +{
10969 + reiser4_key smallest_removed;
10970 +
10971 + assert("zam-1019", node != NULL);
10972 + assert("zam-1020", node_is_empty(node));
10973 + assert("zam-1023", znode_is_wlocked(node));
10974 +
10975 + return reiser4_delete_node(node, &smallest_removed, NULL, 1);
10976 +}
10977 +
10978 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
10979 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
10980 +{
10981 + int ret;
10982 + load_count load;
10983 + lock_handle lock;
10984 +
10985 + init_lh(&lock);
10986 + init_load_count(&load);
10987 +
10988 + if (jnode_is_znode(org)) {
10989 + ret = longterm_lock_znode(&lock, JZNODE(org),
10990 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
10991 + if (ret)
10992 + return ret;
10993 +
10994 + ret = incr_load_count_znode(&load, JZNODE(org));
10995 + if (ret)
10996 + return ret;
10997 +
10998 + pos->state =
10999 + (jnode_get_level(org) ==
11000 + LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11001 + move_flush_pos(pos, &lock, &load, NULL);
11002 + } else {
11003 + coord_t parent_coord;
11004 + ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11005 + &load, ZNODE_WRITE_LOCK, 0);
11006 + if (ret)
11007 + goto done;
11008 + if (!item_is_extent(&parent_coord)) {
11009 + /* file was converted to tail, org became HB, we found internal
11010 + item */
11011 + ret = -EAGAIN;
11012 + goto done;
11013 + }
11014 +
11015 + pos->state = POS_ON_EPOINT;
11016 + move_flush_pos(pos, &lock, &load, &parent_coord);
11017 + pos->child = jref(org);
11018 + if (extent_is_unallocated(&parent_coord)
11019 + && extent_unit_index(&parent_coord) != index_jnode(org)) {
11020 + /* @org is not the first child of its parent unit. This may happen
11021 + because the long-term lock of its parent node was released between
11022 + scan_left and scan_right. For now work around this by having flush repeat */
11023 + ret = -EAGAIN;
11024 + }
11025 + }
11026 +
11027 + done:
11028 + done_load_count(&load);
11029 + done_lh(&lock);
11030 + return ret;
11031 +}
11032 +
11033 +/* TODO LIST (no particular order): */
11034 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11035 + indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11036 + specific names mentioned instead that need to be inspected/resolved. */
11037 +/* B. There is an issue described in reverse_relocate_test having to do with an
11038 + imprecise is_preceder? check having to do with partially-dirty extents. The code that
11039 + sets preceder hints and computes the preceder is basically untested. Careful testing
11040 + needs to be done that preceder calculations are done correctly, since if it doesn't
11041 + affect correctness we will not catch this stuff during regular testing. */
11042 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11043 + considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11044 + but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11045 + Many of the calls that may produce one of these return values (i.e.,
11046 + longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11047 + values themselves and, for instance, stop flushing instead of resulting in a restart.
11048 + If any of these results are true error conditions then flush will go into a busy-loop,
11049 + as we noticed during testing when a corrupt tree caused find_child_ptr to return
11050 + ENOENT. It needs careful thought and testing of corner conditions.
11051 +*/
11052 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11053 + block is assigned a block number then early-flushed to disk. It is dirtied again and
11054 + flush is called again. Concurrently, that block is deleted, and the de-allocation of
11055 + its block number does not need to be deferred, since it is not part of the preserve set
11056 + (i.e., it didn't exist before the transaction). I think there may be a race condition
11057 + where flush writes the dirty, created block after the non-deferred deallocated block
11058 + number is re-allocated, making it possible to write deleted data on top of non-deleted
11059 + data. It's just a theory, but it needs to be thought out. */
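+/* An illustrative interleaving for issue D (hypothetical, to make the
+   suspected race concrete):
+
+	flush:    created node N gets block B, is early-flushed, then redirtied
+	delete:   N is deleted; B is deallocated immediately (not deferred,
+	          since N was never part of the preserve set)
+	alloc:    B is handed out to some other, live node
+	flush:    the stale dirty N is written to B, on top of live data
+*/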
11060 +/* F. bio_alloc() failure is not handled gracefully. */
11061 +/* G. Unallocated children. */
11062 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11063 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11064 +
11065 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11066 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (the
11067 + dirty neighborhood is called a "slum"). jnode_flush() is called when reiser4 has to
11068 + write dirty blocks to disk, which happens when the Linux VM decides to reduce the
11069 + number of dirty pages or as part of a transaction commit.
11070 +
11071 + Our objective here is to prep and flush the slum the jnode belongs to. We want to
11072 + squish the slum together, and allocate the nodes in it as we squish because allocation
11073 + of children affects squishing of parents.
11074 +
11075 + The "argument" @node tells flush where to start. From there, flush finds the left edge
11076 + of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11077 + "better place" to start squalloc first we perform a flush_scan.
11078 +
11079 + Flush-scanning may be performed in both left and right directions, but for different
11080 + purposes. When scanning to the left, we are searching for a node that precedes a
11081 + sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11082 + During flush-scanning, we also take the opportunity to count the number of consecutive
11083 + leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11084 + make a decision to reallocate leaf nodes (thus favoring write-optimization).
11085 +
11086 + Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11087 + also be dirty nodes to the right of the argument. If the scan-left operation does not
11088 + count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11089 + operation to see whether there are, in fact, enough nodes to meet the relocate
11090 + threshold. Each right- and left-scan operation uses a single flush_scan object.
11091 +
11092 + After left-scan and possibly right-scan, we prepare a flush_position object with the
11093 + starting flush point or parent coordinate, which was determined using scan-left.
11094 +
11095 + Next we call the main flush routine, squalloc, which iterates along the
11096 + leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11097 +
11098 + After squalloc returns we take extra steps to ensure that all the children
11099 + of the final twig node are allocated--this involves repeating squalloc
11100 + until we finish at a twig with no unallocated children.
11101 +
11102 + Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11103 + any above-twig nodes during flush_empty_queue that still have unallocated children, we
11104 + flush_unprep them.
11105 +
11106 + Flush treats several "failure" cases as non-failures, essentially causing it to start
11107 + over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11108 + probably be handled properly rather than restarting, but there are a bunch of cases to
11109 + audit.
11110 +*/
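+/* A condensed sketch of the sequence described above (illustrative only;
+   the authoritative ordering, locking, and error handling are in
+   jnode_flush() below):
+
+	scan_left(...);                  - find the left edge of the slum
+	if (not enough nodes counted)
+		scan_right(...);         - decide the leaf-relocate policy
+	prepare_flush_pos(...);          - position at the leftmost point
+	alloc_pos_and_ancestors(...);    - reverse parent-first allocation
+	squalloc(...);                   - forward squeeze-and-allocate
+	flush_empty_queue(...);          - submit write-requests to disk
+*/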
11111 +
11112 +static int
11113 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11114 + flush_queue_t * fq, int flags)
11115 +{
11116 + long ret = 0;
11117 + flush_scan *right_scan;
11118 + flush_scan *left_scan;
11119 + flush_pos_t *flush_pos;
11120 + int todo;
11121 + struct super_block *sb;
11122 + reiser4_super_info_data *sbinfo;
11123 + jnode *leftmost_in_slum = NULL;
11124 +
11125 + assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11126 + assert("nikita-3022", reiser4_schedulable());
11127 +
11128 + assert("nikita-3185",
11129 + get_current_super_private()->delete_mutex_owner != current);
11130 +
11131 + /* allocate right_scan, left_scan and flush_pos */
11132 + right_scan =
11133 + kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11134 + reiser4_ctx_gfp_mask_get());
11135 + if (right_scan == NULL)
11136 + return RETERR(-ENOMEM);
11137 + left_scan = right_scan + 1;
11138 + flush_pos = (flush_pos_t *) (left_scan + 1);
11139 +
11140 + sb = reiser4_get_current_sb();
11141 + sbinfo = get_super_private(sb);
11142 +
11143 + /* Flush-concurrency debug code */
11144 +#if REISER4_DEBUG
11145 + atomic_inc(&flush_cnt);
11146 +#endif
11147 +
11148 + reiser4_enter_flush(sb);
11149 +
11150 + /* Initialize a flush position. */
11151 + pos_init(flush_pos);
11152 +
11153 + flush_pos->nr_written = nr_written;
11154 + flush_pos->fq = fq;
11155 + flush_pos->flags = flags;
11156 + flush_pos->nr_to_write = nr_to_write;
11157 +
11158 + scan_init(right_scan);
11159 + scan_init(left_scan);
11160 +
11161 + /* First scan left and remember the leftmost scan position. If the leftmost
11162 + position is unformatted we remember its parent_coord. We scan until
11163 + FLUSH_SCAN_MAXNODES nodes have been counted.
11164 +
11165 + If the starting @node is unformatted, then at the beginning of the left
11166 + scan its parent (a twig-level node containing the extent item) will be
11167 + long-term locked and the lock handle will be stored in
11168 + @right_scan->parent_lock. This lock is used to start the rightward
11169 + scan without redoing the tree traversal (necessary to find the parent)
11170 + and, hence, is kept during the leftward scan. As a result, we have to
11171 + use try-lock when taking long-term locks during the leftward scan.
11172 + */
11173 + ret = scan_left(left_scan, right_scan,
11174 + node, sbinfo->flush.scan_maxnodes);
11175 + if (ret != 0)
11176 + goto failed;
11177 +
11178 + leftmost_in_slum = jref(left_scan->node);
11179 + scan_done(left_scan);
11180 +
11181 + /* Then possibly go right to decide if we will use a policy of relocating leaves.
11182 + This is only done if we did not scan past (and count) enough nodes during the
11183 + leftward scan. If we do scan right, we only care to go far enough to establish
11184 + that at least FLUSH_RELOCATE_THRESHOLD nodes are being flushed. The
11185 + scan limit is the difference between left_scan.count and the threshold. */
11186 +
11187 + todo = sbinfo->flush.relocate_threshold - left_scan->count;
11188 + /* scan right is inherently deadlock prone, because we are
11189 + * (potentially) holding a lock on the twig node at this moment.
11190 + * FIXME: this comment is incorrect: the lock is not held. */
11191 + if (todo > 0) {
11192 + ret = scan_right(right_scan, node, (unsigned)todo);
11193 + if (ret != 0)
11194 + goto failed;
11195 + }
11196 +
11197 + /* Only the right-scan count is needed, so release any rightward locks right away. */
11198 + scan_done(right_scan);
11199 +
11200 + /* ... and the answer is: we should relocate leaf nodes if at least
11201 + FLUSH_RELOCATE_THRESHOLD nodes were found. */
11202 + flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11203 + (left_scan->count + right_scan->count >=
11204 + sbinfo->flush.relocate_threshold);
11205 +
11206 + /* Funny business here. We set the 'point' in the flush_position prior to
11207 + starting squalloc regardless of whether the first point is
11208 + formatted or unformatted. Without this there would be an invariant, in the
11209 + rest of the code, that if the flush_position is unformatted then
11210 + flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11211 + and if the flush_position is formatted then flush_position->point is non-NULL
11212 + and no parent info is set.
11213 +
11214 + This seems lazy, but it makes the initial calls to reverse_relocate_test
11215 + (which ask "is pos->point the leftmost child of its parent?") much easier
11216 + because we know the first child already. Nothing is broken by this, but the
11217 + reasoning is subtle. Holding an extra reference on a jnode during flush can
11218 + cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11219 + removed from sibling lists until they have zero reference count. Flush would
11220 + never observe a HEARD_BANSHEE node on the left edge of flush; nodes are only
11221 + deleted to the right. So if nothing is broken, why fix it?
11222 +
11223 + NOTE-NIKITA actually, flush can meet a HEARD_BANSHEE node at any
11224 + point and at any moment, because of concurrent file system
11225 + activity (for example, truncate). */
11226 +
11227 + /* Check the jnode state after flush_scan completed. Having a lock on this
11228 + node or on its parent (in the unformatted case) helps us in case of
11229 + concurrent flushing. */
11230 + if (jnode_check_flushprepped(leftmost_in_slum)
11231 + && !jnode_convertible(leftmost_in_slum)) {
11232 + ret = 0;
11233 + goto failed;
11234 + }
11235 +
11236 + /* Now set up flush_pos using scan_left's endpoint. */
11237 + ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11238 + if (ret)
11239 + goto failed;
11240 +
11241 + if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11242 + && node_is_empty(flush_pos->coord.node)) {
11243 + znode *empty = flush_pos->coord.node;
11244 +
11245 + assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11246 + ret = delete_empty_node(empty);
11247 + goto failed;
11248 + }
11249 +
11250 + if (jnode_check_flushprepped(leftmost_in_slum)
11251 + && !jnode_convertible(leftmost_in_slum)) {
11252 + ret = 0;
11253 + goto failed;
11254 + }
11255 +
11256 + /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11257 + ret = alloc_pos_and_ancestors(flush_pos);
11258 + if (ret)
11259 + goto failed;
11260 +
11261 + /* Do the main rightward-bottom-up squeeze and allocate loop. */
11262 + ret = squalloc(flush_pos);
11263 + pos_stop(flush_pos);
11264 + if (ret)
11265 + goto failed;
11266 +
11267 + /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11268 + First, the pos_stop() and pos_valid() routines should be modified
11269 + so that pos_stop() sets a flush_position->stop flag to 1 without
11270 + releasing the current position immediately--instead release it in
11271 + pos_done(). This is a better implementation than the current one anyway.
11272 +
11273 + It is not clear that all fields of the flush_position should not be released,
11274 + but at the very least the parent_lock, parent_coord, and parent_load should
11275 + remain held because they hold the last twig when pos_stop() is
11276 + called.
11277 +
11278 + When we reach this point in the code, if the parent_coord is set to after the
11279 + last item then we know that flush reached the end of a twig (and according to
11280 + the new flush queueing design, we will return now). If parent_coord is not
11281 + past the last item, we should check if the current twig has any unallocated
11282 + children to the right (we are not concerned with unallocated children to the
11283 + left--in that case the twig itself should not have been allocated). If the
11284 + twig has unallocated children to the right, set the parent_coord to that
11285 + position and then repeat the call to squalloc.
11286 +
11287 + Testing for unallocated children may be defined in two ways: if any internal
11288 + item has a fake block number, it is unallocated; if any extent item is
11289 + unallocated then all of its children are unallocated. But there is a more
11290 + aggressive approach: if there are any dirty children of the twig to the right
11291 + of the current position, we may wish to relocate those nodes now. Checking for
11292 + potential relocation is more expensive as it requires knowing whether there are
11293 + any dirty children that are not unallocated. The extent_needs_allocation
11294 + should be used after setting the correct preceder.
11295 +
11296 + When we reach the end of a twig at this point in the code, if the flush can
11297 + continue (when the queue is ready) it will need some information on the future
11298 + starting point. That should be stored away in the flush_handle using a seal, I
11299 + believe. Holding a jref() on the future starting point may break other code
11300 + that deletes that node.
11301 + */
11302 +
11303 + /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11304 + above the twig level. If the VM calls flush above the twig level, do nothing
11305 + and return (but figure out why this happens). The txnmgr should be modified to
11306 + only flush its leaf-level dirty list. This will do all the necessary squeeze
11307 + and allocate steps but leave unallocated branches and possibly unallocated
11308 + twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11309 + level, the remaining unallocated nodes should be given write-optimized
11310 + locations. (Possibly, the remaining unallocated twigs should be allocated just
11311 + before their leftmost child.)
11312 + */
11313 +
11314 + /* Any failure, as well as success, reaches this point. */
11315 + failed:
11316 +
11317 + switch (ret) {
11318 + case -E_REPEAT:
11319 + case -EINVAL:
11320 + case -E_DEADLOCK:
11321 + case -E_NO_NEIGHBOR:
11322 + case -ENOENT:
11323 + /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11324 + in each case. They already are handled in many cases. */
11325 + /* Something bad happened, but difficult to avoid... Try again! */
11326 + ret = 0;
11327 + }
11328 +
11329 + if (leftmost_in_slum)
11330 + jput(leftmost_in_slum);
11331 +
11332 + pos_done(flush_pos);
11333 + scan_done(left_scan);
11334 + scan_done(right_scan);
11335 + kfree(right_scan);
11336 +
11337 + ON_DEBUG(atomic_dec(&flush_cnt));
11338 +
11339 + reiser4_leave_flush(sb);
11340 +
11341 + return ret;
11342 +}
11343 +
11344 +/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
11345 + * means that the flusher should submit all prepped nodes immediately, without
11346 + * keeping them in flush queues for a long time. The reason for rapid flush
11347 + * mode is to free memory as fast as possible. */
11348 +
11349 +#if REISER4_USE_RAPID_FLUSH
11350 +
11351 +/**
11352 + * submit all prepped nodes if rapid flush mode is set,
11353 + * turn rapid flush mode off.
11354 + */
11355 +
11356 +static int rapid_flush(flush_pos_t * pos)
11357 +{
11358 + if (!wbq_available())
11359 + return 0;
11360 +
11361 + return write_prepped_nodes(pos);
11362 +}
11363 +
11364 +#else
11365 +
11366 +#define rapid_flush(pos) (0)
11367 +
11368 +#endif /* REISER4_USE_RAPID_FLUSH */
11369 +
11370 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11371 + flush_queue_t *fq, int *nr_queued,
11372 + int flags)
11373 +{
11374 + jnode * node;
11375 +
11376 + if (start != NULL) {
11377 + spin_lock_jnode(start);
11378 + if (!jnode_is_flushprepped(start)) {
11379 + assert("zam-1056", start->atom == atom);
11380 + node = start;
11381 + goto enter;
11382 + }
11383 + spin_unlock_jnode(start);
11384 + }
11385 + /*
11386 + * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11387 + * and then dirtied again. The atom spin lock is not released until all dirty
11388 + * nodes are processed or a not-yet-prepped node is found in the atom's dirty lists.
11389 + */
11390 + while ((node = find_first_dirty_jnode(atom, flags))) {
11391 + spin_lock_jnode(node);
11392 + enter:
11393 + assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11394 + assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11395 +
11396 + if (JF_ISSET(node, JNODE_WRITEBACK)) {
11397 + /* move node to the end of atom's writeback list */
11398 + list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11399 +
11400 + /*
11401 + * The jnode is not necessarily on the dirty list: if it was dirtied
11402 + * while it was on a flush queue, it does not get moved to the dirty list.
11403 + */
11404 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11405 + WB_LIST, 1));
11406 +
11407 + } else if (jnode_is_znode(node)
11408 + && znode_above_root(JZNODE(node))) {
11409 + /*
11410 + * A special case for znode-above-root. The above-root (fake)
11411 + * znode is captured and dirtied when the tree height changes or
11412 + * when the root node is relocated. This causes atoms to fuse so
11413 + * that changes at the root are serialized. However, this node is
11414 + * never flushed. This special case used to be in lock.c to
11415 + * prevent the above-root node from ever being captured, but now
11416 + * that it is captured we simply prevent it from flushing. The
11417 + * log-writer code relies on this to properly log superblock
11418 + * modifications of the tree height.
11419 + */
11420 + jnode_make_wander_nolock(node);
11421 + } else if (JF_ISSET(node, JNODE_RELOC)) {
11422 + queue_jnode(fq, node);
11423 + ++(*nr_queued);
11424 + } else
11425 + break;
11426 +
11427 + spin_unlock_jnode(node);
11428 + }
11429 + return node;
11430 +}
11431 +
11432 +/* Flush some nodes of the current atom, usually the slum. Return -E_REPEAT if there
11433 + * are more nodes to flush; return 0 (keeping the current atom locked) if the atom's
11434 + * dirty lists are empty; return other errors as they are. */
11435 +int
11436 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11437 + txn_atom ** atom, jnode *start)
11438 +{
11439 + reiser4_super_info_data *sinfo = get_current_super_private();
11440 + flush_queue_t *fq = NULL;
11441 + jnode *node;
11442 + int nr_queued;
11443 + int ret;
11444 +
11445 + assert("zam-889", atom != NULL && *atom != NULL);
11446 + assert_spin_locked(&((*atom)->alock));
11447 + assert("zam-892", get_current_context()->trans->atom == *atom);
11448 +
11449 + nr_to_write = LONG_MAX;
11450 + while (1) {
11451 + ret = reiser4_fq_by_atom(*atom, &fq);
11452 + if (ret != -E_REPEAT)
11453 + break;
11454 + *atom = get_current_atom_locked();
11455 + }
11456 + if (ret)
11457 + return ret;
11458 +
11459 + assert_spin_locked(&((*atom)->alock));
11460 +
11461 + /* parallel flushers limit */
11462 + if (sinfo->tmgr.atom_max_flushers != 0) {
11463 + while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11464 + /* A reiser4_atom_send_event() call is inside
11465 + reiser4_fq_put_nolock(), which is called when flush is
11466 + finished and nr_flushers is decremented. */
11467 + reiser4_atom_wait_event(*atom);
11468 + *atom = get_current_atom_locked();
11469 + }
11470 + }
11471 +
11472 + /* count ourselves as a flusher */
11473 + (*atom)->nr_flushers++;
11474 +
11475 + writeout_mode_enable();
11476 +
11477 + nr_queued = 0;
11478 + node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11479 +
11480 + if (node == NULL) {
11481 + if (nr_queued == 0) {
11482 + (*atom)->nr_flushers--;
11483 + reiser4_fq_put_nolock(fq);
11484 + reiser4_atom_send_event(*atom);
11485 + /* current atom remains locked */
11486 + writeout_mode_disable();
11487 + return 0;
11488 + }
11489 + spin_unlock_atom(*atom);
11490 + } else {
11491 + jref(node);
11492 + BUG_ON((*atom)->super != node->tree->super);
11493 + spin_unlock_atom(*atom);
11494 + spin_unlock_jnode(node);
11495 + BUG_ON(nr_to_write == 0);
11496 + ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11497 + jput(node);
11498 + }
11499 +
11500 + ret =
11501 + reiser4_write_fq(fq, nr_submitted,
11502 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11503 +
11504 + *atom = get_current_atom_locked();
11505 + (*atom)->nr_flushers--;
11506 + reiser4_fq_put_nolock(fq);
11507 + reiser4_atom_send_event(*atom);
11508 + spin_unlock_atom(*atom);
11509 +
11510 + writeout_mode_disable();
11511 +
11512 + if (ret == 0)
11513 + ret = -E_REPEAT;
11514 +
11515 + return ret;
11516 +}
11517 +
11518 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11519 +
11520 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11521 + reverse parent-first relocate context. Here all we know is the preceder and the block
11522 + number. Since we are going in reverse, the preceder may still be relocated as well, so
11523 + we can't ask the block allocator "is there a closer block available to relocate?" here.
11524 + In the _forward_ parent-first relocate context (not here) we actually call the block
11525 + allocator to try and find a closer location. */
11526 +static int
11527 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11528 + const reiser4_block_nr * nblk)
11529 +{
11530 + reiser4_block_nr dist;
11531 +
11532 + assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11533 + assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11534 + assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11535 +
11536 + /* Distance is the absolute value. */
11537 + dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11538 +
11539 + /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11540 + block, do not relocate. */
11541 + if (dist <= get_current_super_private()->flush.relocate_distance) {
11542 + return 0;
11543 + }
11544 +
11545 + return 1;
11546 +}
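+/* Worked example (the relocate_distance value is illustrative): if
+   flush.relocate_distance == 64, then preceder block 1000 and node block
+   1040 give dist == 40 <= 64, so we return 0 (leave the node in place);
+   node block 1100 gives dist == 100 > 64, so we return 1 (relocate). */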
11547 +
11548 +/* This function is a predicate that tests for relocation. Always called in the
11549 + reverse-parent-first context, when we are asking whether the current node should be
11550 + relocated in order to expand the flush by dirtying the parent level (and thus
11551 + proceeding to flush that level). When traversing in the forward parent-first direction
11552 + (not here), relocation decisions are handled in two places: allocate_znode() and
11553 + extent_needs_allocation(). */
11554 +static int
11555 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11556 + flush_pos_t * pos)
11557 +{
11558 + reiser4_block_nr pblk = 0;
11559 + reiser4_block_nr nblk = 0;
11560 +
11561 + assert("jmacd-8989", !jnode_is_root(node));
11562 +
11563 + /*
11564 + * This function is called only from the
11565 + * reverse_relocate_check_dirty_parent() and only if the parent
11566 + * node is clean. This implies that the parent has the real (i.e., not
11567 + fake) block number, and so does the child, because otherwise the
11568 + * parent would be dirty.
11569 + */
11570 +
11571 + /* New nodes are treated as if they are being relocated. */
11572 + if (JF_ISSET (node, JNODE_CREATED) ||
11573 + (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11574 + return 1;
11575 + }
11576 +
11577 + /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11578 + existing node, the coord may be leftmost even though the child is not the
11579 + parent-first preceder of the parent. If the first dirty node appears somewhere
11580 + in the middle of the first extent unit, this preceder calculation is wrong.
11581 + Needs more logic in here. */
11582 + if (coord_is_leftmost_unit(parent_coord)) {
11583 + pblk = *znode_get_block(parent_coord->node);
11584 + } else {
11585 + pblk = pos->preceder.blk;
11586 + }
11587 + check_preceder(pblk);
11588 +
11589 + /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11590 + if (pblk == 0) {
11591 + return 1;
11592 + }
11593 +
11594 + nblk = *jnode_get_block(node);
11595 +
11596 + if (reiser4_blocknr_is_fake(&nblk))
11597 + /* child is unallocated, mark parent dirty */
11598 + return 1;
11599 +
11600 + return reverse_relocate_if_close_enough(&pblk, &nblk);
11601 +}
11602 +
11603 +/* This function calls reverse_relocate_test to make a reverse-parent-first
11604 + relocation decision and then, if yes, it marks the parent dirty. */
11605 +static int
11606 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11607 + flush_pos_t * pos)
11608 +{
11609 + int ret;
11610 +
11611 + if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11612 +
11613 + ret = reverse_relocate_test(node, parent_coord, pos);
11614 + if (ret < 0) {
11615 + return ret;
11616 + }
11617 +
11618 + /* FIXME-ZAM
11619 + if parent is already relocated - we do not want to grab space, right? */
11620 + if (ret == 1) {
11621 + int grabbed;
11622 +
11623 + grabbed = get_current_context()->grabbed_blocks;
11624 + if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11625 + 0)
11626 + reiser4_panic("umka-1250",
11627 + "No space left during flush.");
11628 +
11629 + assert("jmacd-18923",
11630 + znode_is_write_locked(parent_coord->node));
11631 + znode_make_dirty(parent_coord->node);
11632 + grabbed2free_mark(grabbed);
11633 + }
11634 + }
11635 +
11636 + return 0;
11637 +}
11638 +
11639 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11640 + PARENT-FIRST LOOP BEGINS) */
11641 +
11642 +/* Get the leftmost child for the given coord. */
11643 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11644 +{
11645 + int ret;
11646 +
11647 + ret = item_utmost_child(coord, LEFT_SIDE, child);
11648 +
11649 + if (ret)
11650 + return ret;
11651 +
11652 + if (IS_ERR(*child))
11653 + return PTR_ERR(*child);
11654 +
11655 + return 0;
11656 +}
11657 +
11658 +/* This step occurs after the left- and right-scans are completed, before starting the
11659 + forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11660 + flush point, which means continuing in the reverse parent-first direction to the
11661 + parent, grandparent, and so on (as long as the child is a leftmost child). This
11662 + routine calls a recursive process, alloc_one_ancestor, which does the real work,
11663 + except there is special-case handling here for the first ancestor, which may be a twig.
11664 + At each level (here and alloc_one_ancestor), we check for relocation and then, if
11665 + the child is a leftmost child, repeat at the next level. On the way back down (the
11666 + recursion), we allocate the ancestors in parent-first order. */
11667 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
11668 +{
11669 + int ret = 0;
11670 + lock_handle plock;
11671 + load_count pload;
11672 + coord_t pcoord;
11673 +
11674 + if (znode_check_flushprepped(pos->lock.node))
11675 + return 0;
11676 +
11677 + coord_init_invalid(&pcoord, NULL);
11678 + init_lh(&plock);
11679 + init_load_count(&pload);
11680 +
11681 + if (pos->state == POS_ON_EPOINT) {
11682 + /* a special case for pos on twig level, where we already have
11683 + a lock on parent node. */
11684 + /* The parent may not be dirty, in which case we should decide
11685 + whether to relocate the child now. If the decision is made to
11686 + relocate the child, the parent is marked dirty. */
11687 + ret =
11688 + reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11689 + pos);
11690 + if (ret)
11691 + goto exit;
11692 +
11693 + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11694 + is leftmost) and the leaf/child, so recursion is not needed.
11695 + Levels above the twig will be allocated for
11696 + write-optimization before the transaction commits. */
11697 +
11698 + /* Do the recursive step, allocating zero or more of our
11699 + * ancestors. */
11700 + ret = alloc_one_ancestor(&pos->coord, pos);
11701 +
11702 + } else {
11703 + if (!znode_is_root(pos->lock.node)) {
11704 + /* all formatted nodes except tree root */
11705 + ret =
11706 + reiser4_get_parent(&plock, pos->lock.node,
11707 + ZNODE_WRITE_LOCK);
11708 + if (ret)
11709 + goto exit;
11710 +
11711 + ret = incr_load_count_znode(&pload, plock.node);
11712 + if (ret)
11713 + goto exit;
11714 +
11715 + ret =
11716 + find_child_ptr(plock.node, pos->lock.node, &pcoord);
11717 + if (ret)
11718 + goto exit;
11719 +
11720 + ret =
11721 + reverse_relocate_check_dirty_parent(ZJNODE
11722 + (pos->lock.
11723 + node), &pcoord,
11724 + pos);
11725 + if (ret)
11726 + goto exit;
11727 +
11728 + ret = alloc_one_ancestor(&pcoord, pos);
11729 + if (ret)
11730 + goto exit;
11731 + }
11732 +
11733 + ret = allocate_znode(pos->lock.node, &pcoord, pos);
11734 + }
11735 + exit:
11736 + done_load_count(&pload);
11737 + done_lh(&plock);
11738 + return ret;
11739 +}
11740 +
11741 +/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11742 + call to set_preceder, which is the next function described, this checks if the
11743 + child is a leftmost child and returns if it is not. If the child is a leftmost child
11744 + it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11745 + step. */
11746 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11747 +{
11748 + int ret = 0;
11749 + lock_handle alock;
11750 + load_count aload;
11751 + coord_t acoord;
11752 +
11753 + /* As we ascend at the left-edge of the region to flush, take this opportunity at
11754 + the twig level to find our parent-first preceder unless we have already set
11755 + it. */
11756 + if (pos->preceder.blk == 0) {
11757 + ret = set_preceder(coord, pos);
11758 + if (ret != 0)
11759 + return ret;
11760 + }
11761 +
11762 + /* If the ancestor is clean or already allocated, or if the child is not a
11763 + leftmost child, stop going up, even leaving coord->node not flushprepped. */
11764 + if (znode_check_flushprepped(coord->node)
11765 + || !coord_is_leftmost_unit(coord))
11766 + return 0;
11767 +
11768 + init_lh(&alock);
11769 + init_load_count(&aload);
11770 + coord_init_invalid(&acoord, NULL);
11771 +
11772 + /* Only ascend to the next level if it is a leftmost child, but write-lock the
11773 + parent in case we will relocate the child. */
11774 + if (!znode_is_root(coord->node)) {
11775 +
11776 + ret =
11777 + jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11778 + &alock, &aload, ZNODE_WRITE_LOCK,
11779 + 0);
11780 + if (ret != 0) {
11781 + /* FIXME(C): check EINVAL, E_DEADLOCK */
11782 + goto exit;
11783 + }
11784 +
11785 + ret =
11786 + reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11787 + &acoord, pos);
11788 + if (ret != 0) {
11789 + goto exit;
11790 + }
11791 +
11792 + /* Recursive call. */
11793 + if (!znode_check_flushprepped(acoord.node)) {
11794 + ret = alloc_one_ancestor(&acoord, pos);
11795 + if (ret)
11796 + goto exit;
11797 + }
11798 + }
11799 +
11800 + /* Note: we call allocate with the parent write-locked (except at the root) in
11801 + case we relocate the child, in which case it will modify the parent during this
11802 + call. */
11803 + ret = allocate_znode(coord->node, &acoord, pos);
11804 +
11805 + exit:
11806 + done_load_count(&aload);
11807 + done_lh(&alock);
11808 + return ret;
11809 +}
11810 +
11811 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11812 + a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11813 + should this node be relocated (in the reverse parent-first context)? We repeat this
11814 + process as long as the child is the leftmost child, eventually reaching an ancestor of
11815 + the flush point that is not a leftmost child. The preceder of that ancestor, which is
11816 + not a leftmost child, is actually on the leaf level: it is the left neighbor of the
11817 + flush point, which in turn is the rightmost child of the twig to the left. So, when
11818 + alloc_pos_and_ancestors passes upward through the twig level, it stops momentarily to
11819 + remember the block of the rightmost child of the twig on the left and sets it as the
11820 + flush_position's preceder hint.
11821 +
11822 + There is one other place where we may set the flush_position's preceder hint, which is
11823 + during scan-left.
11824 +*/
11825 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11826 +{
11827 + int ret;
11828 + coord_t coord;
11829 + lock_handle left_lock;
11830 + load_count left_load;
11831 +
11832 + coord_dup(&coord, coord_in);
11833 +
11834 + init_lh(&left_lock);
11835 + init_load_count(&left_load);
11836 +
11837 + /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11838 + coord_is_leftmost_unit is not the right test if the unformatted child is in the
11839 + middle of the first extent unit. */
11840 + if (!coord_is_leftmost_unit(&coord)) {
11841 + coord_prev_unit(&coord);
11842 + } else {
11843 + ret =
11844 + reiser4_get_left_neighbor(&left_lock, coord.node,
11845 + ZNODE_READ_LOCK, GN_SAME_ATOM);
11846 + if (ret) {
11847 + /* If we fail for any reason it doesn't matter because the
11848 + preceder is only a hint. We are low-priority at this point, so
11849 + this must be the case. */
11850 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11851 + ret == -ENOENT || ret == -EINVAL
11852 + || ret == -E_DEADLOCK) {
11853 + ret = 0;
11854 + }
11855 + goto exit;
11856 + }
11857 +
11858 + ret = incr_load_count_znode(&left_load, left_lock.node);
11859 + if (ret)
11860 + goto exit;
11861 +
11862 + coord_init_last_unit(&coord, left_lock.node);
11863 + }
11864 +
11865 + ret =
11866 + item_utmost_child_real_block(&coord, RIGHT_SIDE,
11867 + &pos->preceder.blk);
11868 + exit:
11869 + check_preceder(pos->preceder.blk);
11870 + done_load_count(&left_load);
11871 + done_lh(&left_lock);
11872 + return ret;
11873 +}
11874 +
11875 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11876 +
11877 +/* This procedure implements the outer loop of the flush algorithm. To put this in
11878 + context, here is the general list of steps taken by the flush routine as a whole:
11879 +
11880 + 1. Scan-left
11881 + 2. Scan-right (maybe)
11882 + 3. Allocate initial flush position and its ancestors
11883 + 4. <handle extents>
11884 + 5. <squeeze and next position and its ancestors to-the-right,
11885 + then update position to-the-right>
11886 + 6. <repeat from #4 until flush is stopped>
11887 +
11888 + This procedure implements the loop in steps 4 through 6 in the above listing.
11889 +
11890 + Step 4: if the current flush position is an extent item (position on the twig level),
11891 + it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11892 + coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11893 + If the next coordinate is an internal item, we descend back to the leaf level,
11894 + otherwise we repeat step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11895 + brings us past the end of the twig level, then we call
11896 + reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11897 + step #5 which moves to the right.
11898 +
11899 + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11900 + tree to allocate any ancestors of the next-right flush position that are not also
11901 + ancestors of the current position. Those ancestors (in top-down order) are the next in
11902 + parent-first order. We squeeze adjacent nodes on the way up until the right node and
11903 + current node share the same parent, then allocate on the way back down. Finally, this
11904 + step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11905 +*/
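+/* A sketch of how the loop described above plausibly dispatches on the
+   position state (an assumed shape, not the literal squalloc(), which is
+   defined later in this file; the handler names are the ones below):
+
+	while (pos_valid(pos)) {
+		switch (pos->state) {
+		case POS_ON_LEAF:     ret = handle_pos_on_leaf(pos); break;
+		case POS_ON_EPOINT:   ret = handle_pos_on_twig(pos); break;
+		case POS_END_OF_TWIG: ret = handle_pos_end_of_twig(pos); break;
+		case POS_ON_INTERNAL: ret = handle_pos_on_internal(pos); break;
+		... and similarly for POS_TO_LEAF and POS_TO_TWIG ...
+		}
+	}
+*/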
11906 +
11907 +/* SQUEEZE CODE */
11908 +
11909 +/* squalloc_right_twig helper function: cut a range of extent items from
11910 + node @to->node, from the beginning up to coord @to. */
11911 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11912 + znode * left)
11913 +{
11914 + coord_t from;
11915 + reiser4_key from_key;
11916 +
11917 + coord_init_first_unit(&from, to->node);
11918 + item_key_by_coord(&from, &from_key);
11919 +
11920 + return cut_node_content(&from, to, &from_key, to_key, NULL);
11921 +}
11922 +
11923 +/* Copy as many of the leading extents from @right to @left as possible, allocating
11924 + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
11925 + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
11926 + internal item it calls shift_one_internal_unit and may then return
11927 + SUBTREE_MOVED. */
11928 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11929 +{
11930 + int ret = SUBTREE_MOVED;
11931 + coord_t coord; /* used to iterate over items */
11932 + reiser4_key stop_key;
11933 +
11934 + assert("jmacd-2008", !node_is_empty(right));
11935 + coord_init_first_unit(&coord, right);
11936 +
11937 + /* FIXME: can be optimized to cut once */
11938 + while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11939 + ON_DEBUG(void *vp);
11940 +
11941 + assert("vs-1468", coord_is_leftmost_unit(&coord));
11942 + ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11943 +
11944 + /* stop_key is used to find what was copied and what to cut */
11945 + stop_key = *reiser4_min_key();
11946 + ret = squalloc_extent(left, &coord, pos, &stop_key);
11947 + if (ret != SQUEEZE_CONTINUE) {
11948 + ON_DEBUG(kfree(vp));
11949 + break;
11950 + }
11951 + assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11952 +
11953 + /* Helper function to do the cutting. */
11954 + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11955 + check_me("vs-1466",
11956 + squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11957 +
11958 + ON_DEBUG(shift_check(vp, left, coord.node));
11959 + }
11960 +
11961 + if (node_is_empty(coord.node))
11962 + ret = SQUEEZE_SOURCE_EMPTY;
11963 +
11964 + if (ret == SQUEEZE_TARGET_FULL) {
11965 + goto out;
11966 + }
11967 +
11968 + if (node_is_empty(right)) {
11969 + /* The whole right node was copied into @left. */
11970 + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
11971 + goto out;
11972 + }
11973 +
11974 + coord_init_first_unit(&coord, right);
11975 +
11976 + if (!item_is_internal(&coord)) {
11977 + /* we do not want to squeeze anything else to the left neighbor because
11978 + the "slum" is over */
11979 + ret = SQUEEZE_TARGET_FULL;
11980 + goto out;
11981 + }
11982 + assert("jmacd-433", item_is_internal(&coord));
11983 +
11984 + /* Shift an internal unit. The child must be allocated before shifting any more
11985 + extents, so we stop here. */
11986 + ret = shift_one_internal_unit(left, right);
11987 +
11988 + out:
11989 + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
11990 + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
11991 +
11992 + if (ret == SQUEEZE_TARGET_FULL) {
11993 + /* We submit prepped nodes here and expect that this @left twig
11994 + * will not be modified again during this jnode_flush() call. */
11995 + int ret1;
11996 +
11997 + /* NOTE: it seems that I/O is done under long-term locks. */
11998 + ret1 = write_prepped_nodes(pos);
11999 + if (ret1 < 0)
12000 + return ret1;
12001 + }
12002 +
12003 + return ret;
12004 +}
12005 +
12006 +#if REISER4_DEBUG
12007 +static void item_convert_invariant(flush_pos_t * pos)
12008 +{
12009 + assert("edward-1225", coord_is_existing_item(&pos->coord));
12010 + if (chaining_data_present(pos)) {
12011 + item_plugin *iplug = item_convert_plug(pos);
12012 +
12013 + assert("edward-1000",
12014 + iplug == item_plugin_by_coord(&pos->coord));
12015 + assert("edward-1001", iplug->f.convert != NULL);
12016 + } else
12017 + assert("edward-1226", pos->child == NULL);
12018 +}
12019 +#else
12020 +
12021 +#define item_convert_invariant(pos) noop
12022 +
12023 +#endif
12024 +
12025 +/* Scan node items starting from the first one and apply to each
12026 + item its flush ->convert() method (if any). This method may
12027 + resize/kill the item, so the tree will be changed.
12028 +*/
12029 +static int convert_node(flush_pos_t * pos, znode * node)
12030 +{
12031 + int ret = 0;
12032 + item_plugin *iplug;
12033 +
12034 + assert("edward-304", pos != NULL);
12035 + assert("edward-305", pos->child == NULL);
12036 + assert("edward-475", znode_convertible(node));
12037 + assert("edward-669", znode_is_wlocked(node));
12038 + assert("edward-1210", !node_is_empty(node));
12039 +
12040 + if (znode_get_level(node) != LEAF_LEVEL)
12041 + /* unsupported */
12042 + goto exit;
12043 +
12044 + coord_init_first_unit(&pos->coord, node);
12045 +
12046 + while (1) {
12047 + ret = 0;
12048 + coord_set_to_left(&pos->coord);
12049 + item_convert_invariant(pos);
12050 +
12051 + iplug = item_plugin_by_coord(&pos->coord);
12052 + assert("edward-844", iplug != NULL);
12053 +
12054 + if (iplug->f.convert) {
12055 + ret = iplug->f.convert(pos);
12056 + if (ret)
12057 + goto exit;
12058 + }
12059 + assert("edward-307", pos->child == NULL);
12060 +
12061 + if (coord_next_item(&pos->coord)) {
12062 + /* node is over */
12063 +
12064 + if (!chaining_data_present(pos))
12065 + /* finished this node */
12066 + break;
12067 + if (should_chain_next_node(pos)) {
12068 + /* go to next node */
12069 + move_chaining_data(pos, 0 /* to next node */ );
12070 + break;
12071 + }
12072 + /* repeat this node */
12073 + move_chaining_data(pos, 1 /* this node */ );
12074 + continue;
12075 + }
12076 + /* The node is not over.
12077 + Check if there is attached convert data.
12078 + If so, roll back one item position and repeat
12079 + on this node.
12080 + */
12081 + if (chaining_data_present(pos)) {
12082 +
12083 + if (iplug != item_plugin_by_coord(&pos->coord))
12084 + set_item_convert_count(pos, 0);
12085 +
12086 + ret = coord_prev_item(&pos->coord);
12087 + assert("edward-1003", !ret);
12088 +
12089 + move_chaining_data(pos, 1 /* this node */ );
12090 + }
12091 + }
12092 + JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12093 + znode_make_dirty(node);
12094 + exit:
12095 + assert("edward-1004", !ret);
12096 + return ret;
12097 +}
12098 +
12099 +/* Squeeze and allocate the right neighbor. This is called after @left and
12100 + its current children have been squeezed and allocated already. This
12101 + procedure's job is to squeeze and allocate items from @right to @left.
12102 +
12103 + If at the leaf level, use the shift_everything_left memcpy-optimized
12104 + version of shifting (squeeze_right_leaf).
12105 +
12106 + If at the twig level, extents are allocated as they are shifted from @right
12107 + to @left (squalloc_right_twig).
12108 +
12109 + At any other level, shift one internal item and return to the caller
12110 + (squalloc_parent_first) so that the shifted-subtree can be processed in
12111 + parent-first order.
12112 +
12113 + When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED is
12114 + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12115 + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12116 + is returned.
12117 +*/
12118 +
12119 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12120 + znode * right)
12121 +{
12122 + int ret;
12123 +
12124 + /* FIXME: it is possible to see an empty node that hasn't heard banshee
12125 + * in the tree, owing to an error (for example, ENOSPC) in write */
12126 + /* assert("jmacd-9321", !node_is_empty(left)); */
12127 + assert("jmacd-9322", !node_is_empty(right));
12128 + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12129 +
12130 + switch (znode_get_level(left)) {
12131 + case TWIG_LEVEL:
12132 + /* Shift with extent allocating until either an internal item
12133 + is encountered or everything is shifted or no free space
12134 + left in @left */
12135 + ret = squeeze_right_twig(left, right, pos);
12136 + break;
12137 +
12138 + default:
12139 + /* All other levels can use shift_everything until we implement per-item
12140 + flush plugins. */
12141 + ret = squeeze_right_non_twig(left, right);
12142 + break;
12143 + }
12144 +
12145 + assert("jmacd-2011", (ret < 0 ||
12146 + ret == SQUEEZE_SOURCE_EMPTY
12147 + || ret == SQUEEZE_TARGET_FULL
12148 + || ret == SUBTREE_MOVED));
12149 + return ret;
12150 +}
12151 +
12152 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12153 + znode * right)
12154 +{
12155 + int ret;
12156 +
12157 + ret = squeeze_right_twig(pos->lock.node, right, pos);
12158 + if (ret < 0)
12159 + return ret;
12160 + if (ret > 0) {
12161 + coord_init_after_last_item(&pos->coord, pos->lock.node);
12162 + return ret;
12163 + }
12164 +
12165 + coord_init_last_unit(&pos->coord, pos->lock.node);
12166 + return 0;
12167 +}
12168 +
12169 +/* forward declaration */
12170 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12171 +
12172 +/* do a fast check for "same parents" condition before calling
12173 + * squalloc_upper_levels() */
12174 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12175 + znode * left,
12176 + znode * right)
12177 +{
12178 + if (znode_same_parents(left, right))
12179 + return 0;
12180 +
12181 + return squalloc_upper_levels(pos, left, right);
12182 +}
12183 +
12184 +/* Check whether the parent of the given @right node needs to be processed
12185 + ((re)allocated) prior to processing of the child. If @left and @right do not
12186 + share at least a common parent, the parent of @right stands after @left but
12187 + before @right in parent-first order, so we have to (re)allocate it before
12188 + @right gets (re)allocated. */
12189 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12190 +{
12191 + int ret;
12192 +
12193 + lock_handle left_parent_lock;
12194 + lock_handle right_parent_lock;
12195 +
12196 + load_count left_parent_load;
12197 + load_count right_parent_load;
12198 +
12199 + init_lh(&left_parent_lock);
12200 + init_lh(&right_parent_lock);
12201 +
12202 + init_load_count(&left_parent_load);
12203 + init_load_count(&right_parent_load);
12204 +
12205 + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12206 + if (ret)
12207 + goto out;
12208 +
12209 + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12210 + if (ret)
12211 + goto out;
12212 +
12213 + /* Check for same parents */
12214 + if (left_parent_lock.node == right_parent_lock.node)
12215 + goto out;
12216 +
12217 + if (znode_check_flushprepped(right_parent_lock.node)) {
12218 + /* Keep parent-first order. In that order, the right parent node stands
12219 + before the @right node. If it is already allocated, we set the
12220 + preceder (the start point for the next block search) to its block
12221 + number; the @right node should be allocated after it.
12222 +
12223 + However, the preceder is set only if the right parent is on the twig
12224 + level. The explanation is as follows: new branch nodes are allocated
12225 + over already allocated children while the tree grows, so it is
12226 + difficult to keep the tree ordered, and we assume that only leaves and
12227 + twigs are correctly allocated. So, only twigs are used as a preceder
12228 + for allocating the rest of the slum. */
12229 + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12230 + pos->preceder.blk =
12231 + *znode_get_block(right_parent_lock.node);
12232 + check_preceder(pos->preceder.blk);
12233 + }
12234 + goto out;
12235 + }
12236 +
12237 + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12238 + if (ret)
12239 + goto out;
12240 +
12241 + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12242 + if (ret)
12243 + goto out;
12244 +
12245 + ret =
12246 + squeeze_right_neighbor(pos, left_parent_lock.node,
12247 + right_parent_lock.node);
12248 + /* We stop on error. We also stop if some items/units were shifted (ret == 0)
12249 + * and thus @right changed its parent; this means we have not processed the
12250 + * right_parent node prior to processing @right. Positive return
12251 + * values mean that no items were shifted, because of the "empty
12252 + * source" or "target full" conditions. */
12253 + if (ret <= 0)
12254 + goto out;
12255 +
12256 + /* parent(@left) and parent(@right) may have different parents also. We
12257 + * do a recursive call for checking that. */
12258 + ret =
12259 + check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12260 + right_parent_lock.node);
12261 + if (ret)
12262 + goto out;
12263 +
12264 + /* allocate znode when going down */
12265 + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12266 +
12267 + out:
12268 + done_load_count(&left_parent_load);
12269 + done_load_count(&right_parent_load);
12270 +
12271 + done_lh(&left_parent_lock);
12272 + done_lh(&right_parent_lock);
12273 +
12274 + return ret;
12275 +}
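+/* Illustration (a hypothetical fragment of the tree): if the current node L
+   and its right neighbor R live under different parents,
+
+	      PL         PR
+	     /  \       /  \
+	   ...   L     R   ...
+
+   then PR stands between L and R in parent-first order, so
+   squalloc_upper_levels() first squeezes PR into PL (recursing upward if
+   PL and PR in turn have different parents) and (re)allocates PR on the
+   way back down, before the caller processes R itself. */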
12276 +
12277 +/* Check the leftmost child's "flushprepped" status; also return true if the
12278 + * child node was not found in cache. */
12279 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12280 +{
12281 + int ret;
12282 + int prepped;
12283 +
12284 + jnode *child;
12285 +
12286 + ret = get_leftmost_child_of_unit(coord, &child);
12287 +
12288 + if (ret)
12289 + return ret;
12290 +
12291 + if (child) {
12292 + prepped = jnode_check_flushprepped(child);
12293 + jput(child);
12294 + } else {
12295 + /* We treat a non-existent child as a node to which slum
12296 + processing should not continue. A node that is not cached is
12297 + clean, so it is flushprepped. */
12298 + prepped = 1;
12299 + }
12300 +
12301 + return prepped;
12302 +}
12303 +
12304 +/* (re)allocate znode, getting the parent node automatically */
12305 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12306 +{
12307 + int ret;
12308 + lock_handle parent_lock;
12309 + load_count parent_load;
12310 + coord_t pcoord;
12311 +
12312 + assert("zam-851", znode_is_write_locked(node));
12313 +
12314 + init_lh(&parent_lock);
12315 + init_load_count(&parent_load);
12316 +
12317 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12318 + if (ret)
12319 + goto out;
12320 +
12321 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12322 + if (ret)
12323 + goto out;
12324 +
12325 + ret = find_child_ptr(parent_lock.node, node, &pcoord);
12326 + if (ret)
12327 + goto out;
12328 +
12329 + ret = allocate_znode(node, &pcoord, pos);
12330 +
12331 + out:
12332 + done_load_count(&parent_load);
12333 + done_lh(&parent_lock);
12334 + return ret;
12335 +}
12336 +
12337 +/* Process formatted nodes until an unformatted node or the rightmost node in
12338 + * the slum is reached. */
12339 +static int handle_pos_on_formatted(flush_pos_t * pos)
12340 +{
12341 + int ret;
12342 + lock_handle right_lock;
12343 + load_count right_load;
12344 +
12345 + init_lh(&right_lock);
12346 + init_load_count(&right_load);
12347 +
12348 + if (should_convert_node(pos, pos->lock.node)) {
12349 + ret = convert_node(pos, pos->lock.node);
12350 + if (ret)
12351 + return ret;
12352 + }
12353 +
12354 + while (1) {
12355 + int expected;
12356 + expected = should_convert_next_node(pos);
12357 + ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12358 + ZNODE_WRITE_LOCK, !expected, expected);
12359 + if (ret) {
12360 + if (expected)
12361 + warning("edward-1495",
12362 + "Expected neighbor not found (ret = %d). Fsck?",
12363 + ret);
12364 + break;
12365 + }
12366 +
12367 + /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
12368 + * can be optimal. For now we choose to live with the risk that it will
12369 + * be suboptimal because it would be quite complex to code it to be
12370 + * smarter. */
12371 + if (znode_check_flushprepped(right_lock.node)
12372 + && !znode_convertible(right_lock.node)) {
12373 + assert("edward-1005", !should_convert_next_node(pos));
12374 + pos_stop(pos);
12375 + break;
12376 + }
12377 +
12378 + ret = incr_load_count_znode(&right_load, right_lock.node);
12379 + if (ret)
12380 + break;
12381 + if (should_convert_node(pos, right_lock.node)) {
12382 + ret = convert_node(pos, right_lock.node);
12383 + if (ret)
12384 + break;
12385 + if (node_is_empty(right_lock.node)) {
12386 + /* node became empty after converting, repeat */
12387 + done_load_count(&right_load);
12388 + done_lh(&right_lock);
12389 + continue;
12390 + }
12391 + }
12392 +
12393 + /* squeeze _before_ going upward. */
12394 + ret =
12395 + squeeze_right_neighbor(pos, pos->lock.node,
12396 + right_lock.node);
12397 + if (ret < 0)
12398 + break;
12399 +
12400 + if (znode_check_flushprepped(right_lock.node)) {
12401 + if (should_convert_next_node(pos)) {
12402 + /* in spite of the flushprepped status of the node,
12403 + its right slum neighbor should be converted */
12404 + assert("edward-953", convert_data(pos));
12405 + assert("edward-954", item_convert_data(pos));
12406 +
12407 + if (node_is_empty(right_lock.node)) {
12408 + done_load_count(&right_load);
12409 + done_lh(&right_lock);
12410 + } else
12411 + move_flush_pos(pos, &right_lock,
12412 + &right_load, NULL);
12413 + continue;
12414 + }
12415 + pos_stop(pos);
12416 + break;
12417 + }
12418 +
12419 + if (node_is_empty(right_lock.node)) {
12420 + /* repeat if right node was squeezed completely */
12421 + done_load_count(&right_load);
12422 + done_lh(&right_lock);
12423 + continue;
12424 + }
12425 +
12426 + /* parent(right_lock.node) has to be processed before
12427 + * (right_lock.node) due to "parent-first" allocation order. */
12428 + ret =
12429 + check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12430 + right_lock.node);
12431 + if (ret)
12432 + break;
12433 + /* (re)allocate _after_ going upward */
12434 + ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12435 + if (ret)
12436 + break;
12437 + if (should_terminate_squalloc(pos)) {
12438 + set_item_convert_count(pos, 0);
12439 + break;
12440 + }
12441 +
12442 + /* advance the flush position to the right neighbor */
12443 + move_flush_pos(pos, &right_lock, &right_load, NULL);
12444 +
12445 + ret = rapid_flush(pos);
12446 + if (ret)
12447 + break;
12448 + }
12449 + check_convert_info(pos);
12450 + done_load_count(&right_load);
12451 + done_lh(&right_lock);
12452 +
12453 + /* This function indicates via pos whether to stop, go to the twig level, or
12454 + * continue on the current level. */
12455 + return ret;
12456 +
12457 +}
12458 +
12459 +/* Process nodes on the leaf level until an unformatted node or the rightmost
12460 + * node in the slum is reached. */
12461 +static int handle_pos_on_leaf(flush_pos_t * pos)
12462 +{
12463 + int ret;
12464 +
12465 + assert("zam-845", pos->state == POS_ON_LEAF);
12466 +
12467 + ret = handle_pos_on_formatted(pos);
12468 +
12469 + if (ret == -E_NO_NEIGHBOR) {
12470 + /* cannot get right neighbor, go process extents. */
12471 + pos->state = POS_TO_TWIG;
12472 + return 0;
12473 + }
12474 +
12475 + return ret;
12476 +}
12477 +
12478 +/* Process slum on level > 1 */
12479 +static int handle_pos_on_internal(flush_pos_t * pos)
12480 +{
12481 + assert("zam-850", pos->state == POS_ON_INTERNAL);
12482 + return handle_pos_on_formatted(pos);
12483 +}
12484 +
12485 +/* check whether squalloc should stop before processing given extent */
12486 +static int squalloc_extent_should_stop(flush_pos_t * pos)
12487 +{
12488 + assert("zam-869", item_is_extent(&pos->coord));
12489 +
12490 + /* pos->child is a jnode handle_pos_on_extent() should start with in
12491 + * stead of the first child of the first extent unit. */
12492 + if (pos->child) {
12493 + int prepped;
12494 +
12495 + assert("vs-1383", jnode_is_unformatted(pos->child));
12496 + prepped = jnode_check_flushprepped(pos->child);
12497 + pos->pos_in_unit =
12498 + jnode_get_index(pos->child) -
12499 + extent_unit_index(&pos->coord);
12500 + assert("vs-1470",
12501 + pos->pos_in_unit < extent_unit_width(&pos->coord));
12502 + assert("nikita-3434",
12503 + ergo(extent_is_unallocated(&pos->coord),
12504 + pos->pos_in_unit == 0));
12505 + jput(pos->child);
12506 + pos->child = NULL;
12507 +
12508 + return prepped;
12509 + }
12510 +
12511 + pos->pos_in_unit = 0;
12512 + if (extent_is_unallocated(&pos->coord))
12513 + return 0;
12514 +
12515 + return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12516 +}
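+/* Worked example (the indices are illustrative): if the current extent
+   unit covers page indices [100 .. 115] and pos->child has index 107,
+   then pos_in_unit = 107 - 100 = 7, and extent processing resumes at the
+   eighth block of the unit rather than at its first child. */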
12517 +
12518 +/* Handle the case when the regular reiser4 tree (znodes connected to their
12519 + * neighbors by sibling pointers) is interrupted on the leaf level by one or
12520 + * more unformatted nodes. By holding a lock on the twig level and using extent
12521 + * code routines to process the unformatted nodes, we swim around the irregular
12522 + * part of the reiser4 tree. */
12523 +static int handle_pos_on_twig(flush_pos_t * pos)
12524 +{
12525 + int ret;
12526 +
12527 + assert("zam-844", pos->state == POS_ON_EPOINT);
12528 + assert("zam-843", item_is_extent(&pos->coord));
12529 +
12530 +	/* We decide whether to continue slum processing with the current extent
12531 +	   unit: if the leftmost child of the current extent unit is flushprepped
12532 +	   (i.e. clean or already processed by flush) we stop squalloc(). There
12533 +	   is a fast check for unallocated extents, which we assume contain no
12534 +	   flushprepped nodes. */
12535 +	/* FIXME: Here we implement a simple check; we only look at the
12536 +	   leftmost child. */
12537 + ret = squalloc_extent_should_stop(pos);
12538 + if (ret != 0) {
12539 + pos_stop(pos);
12540 + return ret;
12541 + }
12542 +
12543 + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12544 + && item_is_extent(&pos->coord)) {
12545 + ret = reiser4_alloc_extent(pos);
12546 + if (ret) {
12547 + break;
12548 + }
12549 + coord_next_unit(&pos->coord);
12550 + }
12551 +
12552 + if (coord_is_after_rightmost(&pos->coord)) {
12553 + pos->state = POS_END_OF_TWIG;
12554 + return 0;
12555 + }
12556 + if (item_is_internal(&pos->coord)) {
12557 + pos->state = POS_TO_LEAF;
12558 + return 0;
12559 + }
12560 +
12561 + assert("zam-860", item_is_extent(&pos->coord));
12562 +
12563 + /* "slum" is over */
12564 + pos->state = POS_INVALID;
12565 + return 0;
12566 +}
12567 +
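
The walk above can be modelled as: consume extent units left to right until something stops you, then classify whatever the coord landed on. Below is a stripped-down model of that walk-then-classify step, loosely modelling the stop conditions; the unit types and the "prepped" flag are invented for illustration and are not the real item API.

#include <stdio.h>

enum toy_kind { TOY_EXTENT, TOY_INTERNAL };

struct toy_unit {
	enum toy_kind kind;
	int prepped;		/* already clean / processed by flush */
};

/* Walk extent units left to right, then classify where the coord landed. */
static const char *toy_walk_twig(const struct toy_unit *u, int nr)
{
	int i = 0;

	/* reiser4_alloc_extent() analogue: consume dirty extent units */
	while (i < nr && u[i].kind == TOY_EXTENT && !u[i].prepped)
		i++;

	if (i == nr)
		return "POS_END_OF_TWIG";	/* ran off the node */
	if (u[i].kind == TOY_INTERNAL)
		return "POS_TO_LEAF";		/* descend to the leaf */
	return "POS_INVALID";			/* prepped extent: slum is over */
}

int main(void)
{
	struct toy_unit a[] = { { TOY_EXTENT, 0 }, { TOY_INTERNAL, 0 } };
	struct toy_unit b[] = { { TOY_EXTENT, 0 }, { TOY_EXTENT, 0 } };

	printf("%s\n", toy_walk_twig(a, 2));	/* POS_TO_LEAF */
	printf("%s\n", toy_walk_twig(b, 2));	/* POS_END_OF_TWIG */
	return 0;
}
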
12568 +/* When we are about to return the flush position from twig to leaf level we can
12569 + * process the right twig node or move the position to the leaf. This processes
12570 + * the right twig if possible and jumps to the leaf level if not. */
12571 +static int handle_pos_end_of_twig(flush_pos_t * pos)
12572 +{
12573 + int ret;
12574 + lock_handle right_lock;
12575 + load_count right_load;
12576 + coord_t at_right;
12577 + jnode *child = NULL;
12578 +
12579 + assert("zam-848", pos->state == POS_END_OF_TWIG);
12580 + assert("zam-849", coord_is_after_rightmost(&pos->coord));
12581 +
12582 + init_lh(&right_lock);
12583 + init_load_count(&right_load);
12584 +
12585 +	/* We get a lock on the right twig node even if it is not dirty because
12586 +	 * the slum continues or discontinues on the leaf level, not on the next
12587 +	 * twig. This lock on the right twig is needed to get its leftmost child. */
12588 + ret =
12589 + reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12590 + ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12591 + if (ret)
12592 + goto out;
12593 +
12594 + ret = incr_load_count_znode(&right_load, right_lock.node);
12595 + if (ret)
12596 + goto out;
12597 +
12598 +	/* the right twig may not be dirty */
12599 +	if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12600 +		/* If the right twig node is dirty we always attempt to squeeze its
12601 +		 * content to the left... */
12602 + became_dirty:
12603 + ret =
12604 + squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12605 + if (ret <= 0) {
12606 + /* pos->coord is on internal item, go to leaf level, or
12607 + * we have an error which will be caught in squalloc() */
12608 + pos->state = POS_TO_LEAF;
12609 + goto out;
12610 + }
12611 +
12612 +		/* If the right twig was squeezed completely we have to re-lock
12613 +		 * the right twig. Now it is done through the top-level squalloc
12614 +		 * routine. */
12615 + if (node_is_empty(right_lock.node))
12616 + goto out;
12617 +
12618 + /* ... and prep it if it is not yet prepped */
12619 + if (!znode_check_flushprepped(right_lock.node)) {
12620 + /* As usual, process parent before ... */
12621 + ret =
12622 + check_parents_and_squalloc_upper_levels(pos,
12623 + pos->lock.
12624 + node,
12625 + right_lock.
12626 + node);
12627 + if (ret)
12628 + goto out;
12629 +
12630 + /* ... processing the child */
12631 + ret =
12632 + lock_parent_and_allocate_znode(right_lock.node,
12633 + pos);
12634 + if (ret)
12635 + goto out;
12636 + }
12637 + } else {
12638 + coord_init_first_unit(&at_right, right_lock.node);
12639 +
12640 +		/* check the first child of the next twig: should we continue there? */
12641 + ret = get_leftmost_child_of_unit(&at_right, &child);
12642 + if (ret || child == NULL || jnode_check_flushprepped(child)) {
12643 + pos_stop(pos);
12644 + goto out;
12645 + }
12646 +
12647 + /* check clean twig for possible relocation */
12648 + if (!znode_check_flushprepped(right_lock.node)) {
12649 + ret =
12650 + reverse_relocate_check_dirty_parent(child,
12651 + &at_right, pos);
12652 + if (ret)
12653 + goto out;
12654 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12655 + goto became_dirty;
12656 + }
12657 + }
12658 +
12659 + assert("zam-875", znode_check_flushprepped(right_lock.node));
12660 +
12661 +	/* Update the preceder with the block number of the just processed right
12662 +	 * twig node. The code above could miss the preceder update because
12663 +	 * allocate_znode() might not have been called for this node. */
12664 + pos->preceder.blk = *znode_get_block(right_lock.node);
12665 + check_preceder(pos->preceder.blk);
12666 +
12667 + coord_init_first_unit(&at_right, right_lock.node);
12668 + assert("zam-868", coord_is_existing_unit(&at_right));
12669 +
12670 + pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12671 + move_flush_pos(pos, &right_lock, &right_load, &at_right);
12672 +
12673 + out:
12674 + done_load_count(&right_load);
12675 + done_lh(&right_lock);
12676 +
12677 + if (child)
12678 + jput(child);
12679 +
12680 + return ret;
12681 +}
12682 +
12683 +/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
12684 + * we should continue there. */
12685 +static int handle_pos_to_leaf(flush_pos_t * pos)
12686 +{
12687 + int ret;
12688 + lock_handle child_lock;
12689 + load_count child_load;
12690 + jnode *child;
12691 +
12692 + assert("zam-846", pos->state == POS_TO_LEAF);
12693 + assert("zam-847", item_is_internal(&pos->coord));
12694 +
12695 + init_lh(&child_lock);
12696 + init_load_count(&child_load);
12697 +
12698 + ret = get_leftmost_child_of_unit(&pos->coord, &child);
12699 + if (ret)
12700 + return ret;
12701 + if (child == NULL) {
12702 + pos_stop(pos);
12703 + return 0;
12704 + }
12705 +
12706 + if (jnode_check_flushprepped(child)) {
12707 + pos->state = POS_INVALID;
12708 + goto out;
12709 + }
12710 +
12711 + ret =
12712 + longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12713 + ZNODE_LOCK_LOPRI);
12714 + if (ret)
12715 + goto out;
12716 +
12717 + ret = incr_load_count_znode(&child_load, JZNODE(child));
12718 + if (ret)
12719 + goto out;
12720 +
12721 + ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12722 + if (ret)
12723 + goto out;
12724 +
12725 + /* move flush position to leaf level */
12726 + pos->state = POS_ON_LEAF;
12727 + move_flush_pos(pos, &child_lock, &child_load, NULL);
12728 +
12729 + if (node_is_empty(JZNODE(child))) {
12730 + ret = delete_empty_node(JZNODE(child));
12731 + pos->state = POS_INVALID;
12732 + }
12733 + out:
12734 + done_load_count(&child_load);
12735 + done_lh(&child_lock);
12736 + jput(child);
12737 +
12738 + return ret;
12739 +}
12740 +
12741 +/* move pos from leaf to twig, and move lock from leaf to twig. */
12742 +/* Move pos->lock to upper (twig) level */
12743 +static int handle_pos_to_twig(flush_pos_t * pos)
12744 +{
12745 + int ret;
12746 +
12747 + lock_handle parent_lock;
12748 + load_count parent_load;
12749 + coord_t pcoord;
12750 +
12751 + assert("zam-852", pos->state == POS_TO_TWIG);
12752 +
12753 + init_lh(&parent_lock);
12754 + init_load_count(&parent_load);
12755 +
12756 + ret =
12757 + reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12758 + if (ret)
12759 + goto out;
12760 +
12761 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12762 + if (ret)
12763 + goto out;
12764 +
12765 + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12766 + if (ret)
12767 + goto out;
12768 +
12769 + assert("zam-870", item_is_internal(&pcoord));
12770 + coord_next_item(&pcoord);
12771 +
12772 + if (coord_is_after_rightmost(&pcoord))
12773 + pos->state = POS_END_OF_TWIG;
12774 + else if (item_is_extent(&pcoord))
12775 + pos->state = POS_ON_EPOINT;
12776 + else {
12777 +		/* Here we understand that getting -E_NO_NEIGHBOR in
12778 +		 * handle_pos_on_leaf() was simply due to reaching the edge
12779 +		 * of the slum */
12780 + pos_stop(pos);
12781 + goto out;
12782 + }
12783 +
12784 + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12785 +
12786 + out:
12787 + done_load_count(&parent_load);
12788 + done_lh(&parent_lock);
12789 +
12790 + return ret;
12791 +}
12792 +
12793 +typedef int (*pos_state_handle_t) (flush_pos_t *);
12794 +static pos_state_handle_t flush_pos_handlers[] = {
12795 + /* process formatted nodes on leaf level, keep lock on a leaf node */
12796 + [POS_ON_LEAF] = handle_pos_on_leaf,
12797 + /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12798 + * being processed */
12799 + [POS_ON_EPOINT] = handle_pos_on_twig,
12800 + /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12801 + [POS_TO_TWIG] = handle_pos_to_twig,
12802 + /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
12803 + * pos->coord points to the leaf node we jump to */
12804 + [POS_TO_LEAF] = handle_pos_to_leaf,
12805 +	/* after processing the last extent in the twig node, attempt to shift items from the twig's
12806 +	 * right neighbor and process them while shifting */
12807 + [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12808 + /* process formatted nodes on internal level, keep lock on an internal node */
12809 + [POS_ON_INTERNAL] = handle_pos_on_internal
12810 +};
12811 +
12812 +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12813 + * encrypt) nodes and their ancestors in "parent-first" order */
12814 +static int squalloc(flush_pos_t * pos)
12815 +{
12816 + int ret = 0;
12817 +
12818 + /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12819 + * greater CPU efficiency? Measure and see.... -Hans */
12820 + while (pos_valid(pos)) {
12821 + ret = flush_pos_handlers[pos->state] (pos);
12822 + if (ret < 0)
12823 + break;
12824 +
12825 + ret = rapid_flush(pos);
12826 + if (ret)
12827 + break;
12828 + }
12829 +
12830 +	/* any positive value or -E_NO_NEIGHBOR is a legal return code for the handle_pos*
12831 +	   routines; -E_NO_NEIGHBOR means that the slum edge was reached */
12832 + if (ret > 0 || ret == -E_NO_NEIGHBOR)
12833 + ret = 0;
12834 +
12835 + return ret;
12836 +}
12837 +
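
The table-driven loop above is a plain finite-state machine: each handler advances the flush position and records the next state in pos->state, and the driver keeps dispatching until the position becomes invalid or a handler reports an error. A minimal standalone sketch of that dispatch pattern follows; every name in it is invented for illustration and none of it is code from the patch.

#include <stdio.h>

/* Toy model of the squalloc() dispatch loop: each handler does some work
 * and sets the next state; the driver loops while the position is valid. */
enum toy_state { TOY_INVALID, TOY_ON_LEAF, TOY_ON_TWIG, TOY_NSTATES };

struct toy_pos { enum toy_state state; int leaf_visits; };

static int toy_on_leaf(struct toy_pos *pos)
{
	/* pretend we processed a leaf and must go look at extents next */
	pos->state = (pos->leaf_visits++ < 3) ? TOY_ON_TWIG : TOY_INVALID;
	return 0;
}

static int toy_on_twig(struct toy_pos *pos)
{
	pos->state = TOY_ON_LEAF;	/* done with extents, back to leaves */
	return 0;
}

static int (*toy_handlers[TOY_NSTATES])(struct toy_pos *) = {
	[TOY_ON_LEAF] = toy_on_leaf,
	[TOY_ON_TWIG] = toy_on_twig,
};

int main(void)
{
	struct toy_pos pos = { .state = TOY_ON_LEAF, .leaf_visits = 0 };
	int ret = 0;

	while (pos.state != TOY_INVALID) {	/* pos_valid() analogue */
		ret = toy_handlers[pos.state](&pos);
		if (ret < 0)
			break;			/* real errors stop the loop */
	}
	printf("stopped after %d leaf visits, ret=%d\n", pos.leaf_visits, ret);
	return 0;
}
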
12838 +static void update_ldkey(znode * node)
12839 +{
12840 + reiser4_key ldkey;
12841 +
12842 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12843 + if (node_is_empty(node))
12844 + return;
12845 +
12846 + znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12847 +}
12848 +
12849 +/* this is to be called after the node's shift method has shifted data from @right to
12850 +   @left. It sets the left delimiting keys of @left and @right to the keys of the first
12851 +   items of @left and @right respectively, and sets the right delimiting key of @left to the first key of @right */
12852 +static void update_znode_dkeys(znode * left, znode * right)
12853 +{
12854 + assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12855 + assert("vs-1629", (znode_is_write_locked(left) &&
12856 + znode_is_write_locked(right)));
12857 +
12858 +	/* we need to update the left delimiting key of @left if it was empty before the shift */
12859 + update_ldkey(left);
12860 + update_ldkey(right);
12861 + if (node_is_empty(right))
12862 + znode_set_rd_key(left, znode_get_rd_key(right));
12863 + else
12864 + znode_set_rd_key(left, znode_get_ld_key(right));
12865 +}
12866 +
12867 +/* try to shift everything from @right to @left. If everything was shifted -
12868 + @right is removed from the tree. Result is the number of bytes shifted. */
12869 +static int
12870 +shift_everything_left(znode * right, znode * left, carry_level * todo)
12871 +{
12872 + coord_t from;
12873 + node_plugin *nplug;
12874 + carry_plugin_info info;
12875 +
12876 + coord_init_after_last_item(&from, right);
12877 +
12878 + nplug = node_plugin_by_node(right);
12879 + info.doing = NULL;
12880 + info.todo = todo;
12881 + return nplug->shift(&from, left, SHIFT_LEFT,
12882 + 1 /* delete @right if it becomes empty */ ,
12883 + 1
12884 + /* move coord @from to node @left if everything will be shifted */
12885 + ,
12886 + &info);
12887 +}
12888 +
12889 +/* Shift as much as possible from @right to @left using the memcpy-optimized
12890 + shift_everything_left. @left and @right are formatted neighboring nodes on
12891 + leaf level. */
12892 +static int squeeze_right_non_twig(znode * left, znode * right)
12893 +{
12894 + int ret;
12895 + carry_pool *pool;
12896 + carry_level *todo;
12897 +
12898 + assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12899 +
12900 + if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12901 + !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12902 + return SQUEEZE_TARGET_FULL;
12903 +
12904 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12905 + if (IS_ERR(pool))
12906 + return PTR_ERR(pool);
12907 + todo = (carry_level *) (pool + 1);
12908 + init_carry_level(todo, pool);
12909 +
12910 + ret = shift_everything_left(right, left, todo);
12911 + if (ret > 0) {
12912 + /* something was shifted */
12913 + reiser4_tree *tree;
12914 + __u64 grabbed;
12915 +
12916 + znode_make_dirty(left);
12917 + znode_make_dirty(right);
12918 +
12919 +		/* update the delimiting keys of the nodes which participated in the
12920 +		   shift. FIXME: it would be better to have this in the node's shift
12921 +		   operation. But it cannot be done there. Nobody
12922 +		   remembers why, though */
12923 + tree = znode_get_tree(left);
12924 + write_lock_dk(tree);
12925 + update_znode_dkeys(left, right);
12926 + write_unlock_dk(tree);
12927 +
12928 + /* Carry is called to update delimiting key and, maybe, to remove empty
12929 + node. */
12930 + grabbed = get_current_context()->grabbed_blocks;
12931 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12932 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
12933 + ret = reiser4_carry(todo, NULL /* previous level */ );
12934 + grabbed2free_mark(grabbed);
12935 + } else {
12936 + /* Shifting impossible, we return appropriate result code */
12937 + ret =
12938 + node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12939 + SQUEEZE_TARGET_FULL;
12940 + }
12941 +
12942 + done_carry_pool(pool);
12943 +
12944 + return ret;
12945 +}
12946 +
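
Note the reservation bracket around reiser4_carry() above, which shift_one_internal_unit() repeats below: remember the context's grabbed-block count, force-grab one block per tree level for the worst-case delimiting-key update, run carry, then free everything above the remembered mark. Here is a simplified sketch of that bracket under toy names; nothing in it is the real reiser4 API.

/* Toy sketch of the grab/carry/release bracket; all names are invented. */
struct toy_ctx { unsigned long grabbed; };

static int toy_grab(struct toy_ctx *ctx, unsigned long n)
{
	ctx->grabbed += n;	/* pretend the reservation always succeeds */
	return 0;
}

static int toy_carry(void)
{
	return 0;		/* stand-in for reiser4_carry() */
}

static int toy_update_tree(struct toy_ctx *ctx, unsigned int tree_height)
{
	unsigned long mark = ctx->grabbed;	/* remember the current count */
	int ret;

	/* one block per level may be needed to update delimiting keys */
	ret = toy_grab(ctx, tree_height);
	if (ret == 0)
		ret = toy_carry();
	ctx->grabbed = mark;	/* grabbed2free_mark() analogue: drop the rest */
	return ret;
}

int main(void)
{
	struct toy_ctx ctx = { .grabbed = 5 };

	return toy_update_tree(&ctx, 4) == 0 && ctx.grabbed == 5 ? 0 : 1;
}
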
12947 +#if REISER4_DEBUG
12948 +static int sibling_link_is_ok(const znode *left, const znode *right)
12949 +{
12950 + int result;
12951 +
12952 + read_lock_tree(znode_get_tree(left));
12953 + result = (left->right == right && left == right->left);
12954 + read_unlock_tree(znode_get_tree(left));
12955 + return result;
12956 +}
12957 +#endif
12958 +
12959 +/* Shift first unit of first item if it is an internal one. Return
12960 + SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
12961 + SUBTREE_MOVED. */
12962 +static int shift_one_internal_unit(znode * left, znode * right)
12963 +{
12964 + int ret;
12965 + carry_pool *pool;
12966 + carry_level *todo;
12967 + coord_t *coord;
12968 + carry_plugin_info *info;
12969 + int size, moved;
12970 +
12971 + assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
12972 + assert("nikita-2435", znode_is_write_locked(left));
12973 + assert("nikita-2436", znode_is_write_locked(right));
12974 + assert("nikita-2434", sibling_link_is_ok(left, right));
12975 +
12976 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
12977 + sizeof(*coord) + sizeof(*info)
12978 +#if REISER4_DEBUG
12979 + + sizeof(*coord) + 2 * sizeof(reiser4_key)
12980 +#endif
12981 + );
12982 + if (IS_ERR(pool))
12983 + return PTR_ERR(pool);
12984 + todo = (carry_level *) (pool + 1);
12985 + init_carry_level(todo, pool);
12986 +
12987 + coord = (coord_t *) (todo + 3);
12988 + coord_init_first_unit(coord, right);
12989 + info = (carry_plugin_info *) (coord + 1);
12990 +
12991 +#if REISER4_DEBUG
12992 + if (!node_is_empty(left)) {
12993 + coord_t *last;
12994 + reiser4_key *right_key;
12995 + reiser4_key *left_key;
12996 +
12997 + last = (coord_t *) (info + 1);
12998 + right_key = (reiser4_key *) (last + 1);
12999 + left_key = right_key + 1;
13000 + coord_init_last_unit(last, left);
13001 +
13002 + assert("nikita-2463",
13003 + keyle(item_key_by_coord(last, left_key),
13004 + item_key_by_coord(coord, right_key)));
13005 + }
13006 +#endif
13007 +
13008 + assert("jmacd-2007", item_is_internal(coord));
13009 +
13010 + size = item_length_by_coord(coord);
13011 + info->todo = todo;
13012 + info->doing = NULL;
13013 +
13014 + ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13015 + 1
13016 + /* delete @right if it becomes empty */
13017 + ,
13018 + 0
13019 + /* do not move coord @coord to node @left */
13020 + ,
13021 + info);
13022 +
13023 + /* If shift returns positive, then we shifted the item. */
13024 + assert("vs-423", ret <= 0 || size == ret);
13025 + moved = (ret > 0);
13026 +
13027 + if (moved) {
13028 + /* something was moved */
13029 + reiser4_tree *tree;
13030 + int grabbed;
13031 +
13032 + znode_make_dirty(left);
13033 + znode_make_dirty(right);
13034 + tree = znode_get_tree(left);
13035 + write_lock_dk(tree);
13036 + update_znode_dkeys(left, right);
13037 + write_unlock_dk(tree);
13038 +
13039 + /* reserve space for delimiting keys after shifting */
13040 + grabbed = get_current_context()->grabbed_blocks;
13041 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13042 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13043 +
13044 + ret = reiser4_carry(todo, NULL /* previous level */ );
13045 + grabbed2free_mark(grabbed);
13046 + }
13047 +
13048 + done_carry_pool(pool);
13049 +
13050 + if (ret != 0) {
13051 + /* Shift or carry operation failed. */
13052 + assert("jmacd-7325", ret < 0);
13053 + return ret;
13054 + }
13055 +
13056 + return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13057 +}
13058 +
13059 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
13060 + znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13061 +static int
13062 +allocate_znode_loaded(znode * node,
13063 + const coord_t * parent_coord, flush_pos_t * pos)
13064 +{
13065 + int ret;
13066 + reiser4_super_info_data *sbinfo = get_current_super_private();
13067 + /* FIXME(D): We have the node write-locked and should have checked for !
13068 + allocated() somewhere before reaching this point, but there can be a race, so
13069 + this assertion is bogus. */
13070 + assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13071 + assert("jmacd-7988", znode_is_write_locked(node));
13072 + assert("jmacd-7989", coord_is_invalid(parent_coord)
13073 + || znode_is_write_locked(parent_coord->node));
13074 +
13075 + if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13076 + znode_is_root(node) ||
13077 + /* We have enough nodes to relocate no matter what. */
13078 + (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13079 + /* No need to decide with new nodes, they are treated the same as
13080 + relocate. If the root node is dirty, relocate. */
13081 + if (pos->preceder.blk == 0) {
13082 +			/* the preceder is unknown and we have decided to relocate the node --
13083 +			   using the default value for the search start is better than searching
13084 +			   from block #0. */
13085 + get_blocknr_hint_default(&pos->preceder.blk);
13086 + check_preceder(pos->preceder.blk);
13087 + }
13088 +
13089 + goto best_reloc;
13090 +
13091 + } else if (pos->preceder.blk == 0) {
13092 + /* If we don't know the preceder, leave it where it is. */
13093 + jnode_make_wander(ZJNODE(node));
13094 + } else {
13095 + /* Make a decision based on block distance. */
13096 + reiser4_block_nr dist;
13097 + reiser4_block_nr nblk = *znode_get_block(node);
13098 +
13099 + assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13100 + assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13101 + assert("jmacd-6174", pos->preceder.blk != 0);
13102 +
13103 + if (pos->preceder.blk == nblk - 1) {
13104 + /* Ideal. */
13105 + jnode_make_wander(ZJNODE(node));
13106 + } else {
13107 +
13108 + dist =
13109 + (nblk <
13110 + pos->preceder.blk) ? (pos->preceder.blk -
13111 + nblk) : (nblk -
13112 + pos->preceder.blk);
13113 +
13114 + /* See if we can find a closer block (forward direction only). */
13115 + pos->preceder.max_dist =
13116 + min((reiser4_block_nr) sbinfo->flush.
13117 + relocate_distance, dist);
13118 + pos->preceder.level = znode_get_level(node);
13119 +
13120 + ret = allocate_znode_update(node, parent_coord, pos);
13121 +
13122 + pos->preceder.max_dist = 0;
13123 +
13124 + if (ret && (ret != -ENOSPC))
13125 + return ret;
13126 +
13127 + if (ret == 0) {
13128 + /* Got a better allocation. */
13129 + znode_make_reloc(node, pos->fq);
13130 + } else if (dist < sbinfo->flush.relocate_distance) {
13131 + /* The present allocation is good enough. */
13132 + jnode_make_wander(ZJNODE(node));
13133 + } else {
13134 + /* Otherwise, try to relocate to the best position. */
13135 + best_reloc:
13136 + ret =
13137 + allocate_znode_update(node, parent_coord,
13138 + pos);
13139 + if (ret != 0)
13140 + return ret;
13141 +
13142 + /* set JNODE_RELOC bit _after_ node gets allocated */
13143 + znode_make_reloc(node, pos->fq);
13144 + }
13145 + }
13146 + }
13147 +
13148 + /* This is the new preceder. */
13149 + pos->preceder.blk = *znode_get_block(node);
13150 + check_preceder(pos->preceder.blk);
13151 + pos->alloc_cnt += 1;
13152 +
13153 + assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13154 +
13155 + return 0;
13156 +}
13157 +
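
The relocate/wander choice above boils down to block arithmetic: a node that already sits immediately after the preceder wanders; otherwise it relocates only if the allocator can find a position no farther from the preceder than min(relocate_distance, current distance). Below is a self-contained sketch of just that decision; try_alloc_near() is an invented stand-in for allocate_znode_update() that pretends the nearest free block is eight blocks past the preceder.

#include <stdio.h>

typedef unsigned long long blocknr_t;

/* Invented allocator: succeeds iff a free block lies within max_dist. */
static int try_alloc_near(blocknr_t preceder, blocknr_t max_dist,
			  blocknr_t *newblk)
{
	/* pretend the nearest free block is 8 blocks past the preceder */
	if (max_dist < 8)
		return -1;	/* -ENOSPC analogue */
	*newblk = preceder + 8;
	return 0;
}

static const char *decide(blocknr_t preceder, blocknr_t nblk,
			  blocknr_t relocate_distance)
{
	blocknr_t dist, newblk;

	if (preceder == nblk - 1)
		return "wander (already ideally placed)";

	dist = (nblk < preceder) ? preceder - nblk : nblk - preceder;

	/* look for a closer position, but never farther than we already are */
	if (try_alloc_near(preceder, dist < relocate_distance ?
			   dist : relocate_distance, &newblk) == 0)
		return "relocate (found a closer block)";

	return dist < relocate_distance ?
		"wander (current position is good enough)" :
		"relocate to the best position we can find";
}

int main(void)
{
	printf("%s\n", decide(1000, 1001, 64));	/* already ideal -> wander  */
	printf("%s\n", decide(1000, 1004, 64));	/* close enough  -> wander  */
	printf("%s\n", decide(1000, 5000, 64));	/* found closer  -> relocate */
	printf("%s\n", decide(1000, 5000, 4));	/* far, none close -> relocate */
	return 0;
}
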
13158 +static int
13159 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13160 +{
13161 + /*
13162 + * perform znode allocation with znode pinned in memory to avoid races
13163 + * with asynchronous emergency flush (which plays with
13164 + * JNODE_FLUSH_RESERVED bit).
13165 + */
13166 + return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13167 +}
13168 +
13169 +/* A subroutine of allocate_znode, this is called first to see if there is a close
13170 +   position to relocate to. It may return -ENOSPC if there is no close position, in
13171 +   which case the node is not relocated. This takes care of updating the parent node
13172 +   with the relocated block address. */
13173 +static int
13174 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13175 + flush_pos_t * pos)
13176 +{
13177 + int ret;
13178 + reiser4_block_nr blk;
13179 + lock_handle uber_lock;
13180 + int flush_reserved_used = 0;
13181 + int grabbed;
13182 + reiser4_context *ctx;
13183 + reiser4_super_info_data *sbinfo;
13184 +
13185 + init_lh(&uber_lock);
13186 +
13187 + ctx = get_current_context();
13188 + sbinfo = get_super_private(ctx->super);
13189 +
13190 + grabbed = ctx->grabbed_blocks;
13191 +
13192 + /* discard e-flush allocation */
13193 + ret = zload(node);
13194 + if (ret)
13195 + return ret;
13196 +
13197 + if (ZF_ISSET(node, JNODE_CREATED)) {
13198 + assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13199 + pos->preceder.block_stage = BLOCK_UNALLOCATED;
13200 + } else {
13201 + pos->preceder.block_stage = BLOCK_GRABBED;
13202 +
13203 +		/* The disk space for relocating @node is already reserved in the "flush reserved"
13204 +		 * counter if @node is a leaf; otherwise we grab space using BA_RESERVED (which means
13205 +		 * grabbing space from the whole disk, not from only 95% of it). */
13206 + if (znode_get_level(node) == LEAF_LEVEL) {
13207 + /*
13208 + * earlier (during do_jnode_make_dirty()) we decided
13209 + * that @node can possibly go into overwrite set and
13210 + * reserved block for its wandering location.
13211 + */
13212 + txn_atom *atom = get_current_atom_locked();
13213 + assert("nikita-3449",
13214 + ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13215 + flush_reserved2grabbed(atom, (__u64) 1);
13216 + spin_unlock_atom(atom);
13217 + /*
13218 + * we are trying to move node into relocate
13219 + * set. Allocation of relocated position "uses"
13220 + * reserved block.
13221 + */
13222 + ZF_CLR(node, JNODE_FLUSH_RESERVED);
13223 + flush_reserved_used = 1;
13224 + } else {
13225 + ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13226 + if (ret != 0)
13227 + goto exit;
13228 + }
13229 + }
13230 +
13231 +	/* We may not use the 5% of reserved disk space here, and then flush will not pack tightly. */
13232 + ret = reiser4_alloc_block(&pos->preceder, &blk,
13233 + BA_FORMATTED | BA_PERMANENT);
13234 + if (ret)
13235 + goto exit;
13236 +
13237 + if (!ZF_ISSET(node, JNODE_CREATED) &&
13238 + (ret =
13239 + reiser4_dealloc_block(znode_get_block(node), 0,
13240 + BA_DEFER | BA_FORMATTED)))
13241 + goto exit;
13242 +
13243 + if (likely(!znode_is_root(node))) {
13244 + item_plugin *iplug;
13245 +
13246 + iplug = item_plugin_by_coord(parent_coord);
13247 + assert("nikita-2954", iplug->f.update != NULL);
13248 + iplug->f.update(parent_coord, &blk);
13249 +
13250 + znode_make_dirty(parent_coord->node);
13251 +
13252 + } else {
13253 + reiser4_tree *tree = znode_get_tree(node);
13254 + znode *uber;
13255 +
13256 + /* We take a longterm lock on the fake node in order to change
13257 + the root block number. This may cause atom fusion. */
13258 + ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13259 + &uber_lock);
13260 + /* The fake node cannot be deleted, and we must have priority
13261 + here, and may not be confused with ENOSPC. */
13262 + assert("jmacd-74412",
13263 + ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13264 +
13265 + if (ret)
13266 + goto exit;
13267 +
13268 + uber = uber_lock.node;
13269 +
13270 + write_lock_tree(tree);
13271 + tree->root_block = blk;
13272 + write_unlock_tree(tree);
13273 +
13274 + znode_make_dirty(uber);
13275 + }
13276 +
13277 + ret = znode_rehash(node, &blk);
13278 + exit:
13279 + if (ret) {
13280 +		/* Get the flush reserved block back if something fails, because
13281 +		 * callers assume that on error the block wasn't relocated and its
13282 +		 * flush reserved block wasn't used. */
13283 + if (flush_reserved_used) {
13284 + /*
13285 + * ok, we failed to move node into relocate
13286 + * set. Restore status quo.
13287 + */
13288 + grabbed2flush_reserved((__u64) 1);
13289 + ZF_SET(node, JNODE_FLUSH_RESERVED);
13290 + }
13291 + }
13292 + zrelse(node);
13293 + done_lh(&uber_lock);
13294 + grabbed2free_mark(grabbed);
13295 + return ret;
13296 +}
13297 +
13298 +/* JNODE INTERFACE */
13299 +
13300 +/* Lock a node (if formatted) and then get its parent locked, set the child's
13301 + coordinate in the parent. If the child is the root node, the above_root
13302 + znode is returned but the coord is not set. This function may cause atom
13303 + fusion, but it is only used for read locks (at this point) and therefore
13304 + fusion only occurs when the parent is already dirty. */
13305 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13306 + pointer in jnodes. */
13307 +static int
13308 +jnode_lock_parent_coord(jnode * node,
13309 + coord_t * coord,
13310 + lock_handle * parent_lh,
13311 + load_count * parent_zh,
13312 + znode_lock_mode parent_mode, int try)
13313 +{
13314 + int ret;
13315 +
13316 + assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13317 + assert("edward-54", jnode_is_unformatted(node)
13318 + || znode_is_any_locked(JZNODE(node)));
13319 +
13320 + if (!jnode_is_znode(node)) {
13321 + reiser4_key key;
13322 + tree_level stop_level = TWIG_LEVEL;
13323 + lookup_bias bias = FIND_EXACT;
13324 +
13325 + assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13326 +
13327 + /* The case when node is not znode, but can have parent coord
13328 + (unformatted node, node which represents cluster page,
13329 + etc..). Generate a key for the appropriate entry, search
13330 + in the tree using coord_by_key, which handles locking for
13331 + us. */
13332 +
13333 + /*
13334 + * nothing is locked at this moment, so, nothing prevents
13335 + * concurrent truncate from removing jnode from inode. To
13336 + * prevent this spin-lock jnode. jnode can be truncated just
13337 + * after call to the jnode_build_key(), but this is ok,
13338 + * because coord_by_key() will just fail to find appropriate
13339 + * extent.
13340 + */
13341 + spin_lock_jnode(node);
13342 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13343 + jnode_build_key(node, &key);
13344 + ret = 0;
13345 + } else
13346 + ret = RETERR(-ENOENT);
13347 + spin_unlock_jnode(node);
13348 +
13349 + if (ret != 0)
13350 + return ret;
13351 +
13352 + if (jnode_is_cluster_page(node))
13353 + stop_level = LEAF_LEVEL;
13354 +
13355 + assert("jmacd-1812", coord != NULL);
13356 +
13357 + ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13358 + parent_mode, bias, stop_level, stop_level,
13359 + CBK_UNIQUE, NULL /*ra_info */ );
13360 + switch (ret) {
13361 + case CBK_COORD_NOTFOUND:
13362 + assert("edward-1038",
13363 + ergo(jnode_is_cluster_page(node),
13364 + JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13365 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13366 + warning("nikita-3177", "Parent not found");
13367 + return ret;
13368 + case CBK_COORD_FOUND:
13369 + if (coord->between != AT_UNIT) {
13370 + /* FIXME: comment needed */
13371 + done_lh(parent_lh);
13372 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13373 + warning("nikita-3178",
13374 + "Found but not happy: %i",
13375 + coord->between);
13376 + }
13377 + return RETERR(-ENOENT);
13378 + }
13379 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13380 + if (ret != 0)
13381 + return ret;
13382 + /* if (jnode_is_cluster_page(node)) {
13383 + races with write() are possible
13384 + check_child_cluster (parent_lh->node);
13385 + }
13386 + */
13387 + break;
13388 + default:
13389 + return ret;
13390 + }
13391 +
13392 + } else {
13393 + int flags;
13394 + znode *z;
13395 +
13396 + z = JZNODE(node);
13397 + /* Formatted node case: */
13398 + assert("jmacd-2061", !znode_is_root(z));
13399 +
13400 + flags = GN_ALLOW_NOT_CONNECTED;
13401 + if (try)
13402 + flags |= GN_TRY_LOCK;
13403 +
13404 + ret =
13405 + reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13406 + if (ret != 0)
13407 + /* -E_REPEAT is ok here, it is handled by the caller. */
13408 + return ret;
13409 +
13410 + /* Make the child's position "hint" up-to-date. (Unless above
13411 + root, which caller must check.) */
13412 + if (coord != NULL) {
13413 +
13414 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13415 + if (ret != 0) {
13416 + warning("jmacd-976812386",
13417 + "incr_load_count_znode failed: %d",
13418 + ret);
13419 + return ret;
13420 + }
13421 +
13422 + ret = find_child_ptr(parent_lh->node, z, coord);
13423 + if (ret != 0) {
13424 + warning("jmacd-976812",
13425 + "find_child_ptr failed: %d", ret);
13426 + return ret;
13427 + }
13428 + }
13429 + }
13430 +
13431 + return 0;
13432 +}
13433 +
13434 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13435 +   If there is no next neighbor, or the neighbor is not in memory, or there is a
13436 +   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13437 +   In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0 */
13438 +static int neighbor_in_slum(znode * node, /* starting point */
13439 + lock_handle * lock, /* lock on starting point */
13440 + sideof side, /* left or right direction we seek the next node in */
13441 + znode_lock_mode mode, /* kind of lock we want */
13442 + int check_dirty, /* true if the neighbor should be dirty */
13443 +			    int use_upper_levels /* get neighbor by going through
13444 +						    upper levels */)
13445 +{
13446 + int ret;
13447 + int flags;
13448 +
13449 + assert("jmacd-6334", znode_is_connected(node));
13450 +
13451 + flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13452 + if (use_upper_levels)
13453 + flags |= GN_CAN_USE_UPPER_LEVELS;
13454 +
13455 + ret = reiser4_get_neighbor(lock, node, mode, flags);
13456 + if (ret) {
13457 + /* May return -ENOENT or -E_NO_NEIGHBOR. */
13458 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13459 + if (ret == -ENOENT) {
13460 + ret = RETERR(-E_NO_NEIGHBOR);
13461 + }
13462 + return ret;
13463 + }
13464 + if (!check_dirty)
13465 + return 0;
13466 + /* Check dirty bit of locked znode, no races here */
13467 + if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13468 + return 0;
13469 +
13470 + done_lh(lock);
13471 + return RETERR(-E_NO_NEIGHBOR);
13472 +}
13473 +
13474 +/* Return true if two znodes have the same parent. This is called with both nodes
13475 + write-locked (for squeezing) so no tree lock is needed. */
13476 +static int znode_same_parents(znode * a, znode * b)
13477 +{
13478 + int result;
13479 +
13480 + assert("jmacd-7011", znode_is_write_locked(a));
13481 + assert("jmacd-7012", znode_is_write_locked(b));
13482 +
13483 + /* We lock the whole tree for this check.... I really don't like whole tree
13484 + * locks... -Hans */
13485 + read_lock_tree(znode_get_tree(a));
13486 + result = (znode_parent(a) == znode_parent(b));
13487 + read_unlock_tree(znode_get_tree(a));
13488 + return result;
13489 +}
13490 +
13491 +/* FLUSH SCAN */
13492 +
13493 +/* Initialize the flush_scan data structure. */
13494 +static void scan_init(flush_scan * scan)
13495 +{
13496 + memset(scan, 0, sizeof(*scan));
13497 + init_lh(&scan->node_lock);
13498 + init_lh(&scan->parent_lock);
13499 + init_load_count(&scan->parent_load);
13500 + init_load_count(&scan->node_load);
13501 + coord_init_invalid(&scan->parent_coord, NULL);
13502 +}
13503 +
13504 +/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13505 +static void scan_done(flush_scan * scan)
13506 +{
13507 + done_load_count(&scan->node_load);
13508 + if (scan->node != NULL) {
13509 + jput(scan->node);
13510 + scan->node = NULL;
13511 + }
13512 + done_load_count(&scan->parent_load);
13513 + done_lh(&scan->parent_lock);
13514 + done_lh(&scan->node_lock);
13515 +}
13516 +
13517 +/* Returns true if flush scanning is finished. */
13518 +int reiser4_scan_finished(flush_scan * scan)
13519 +{
13520 + return scan->stop || (scan->direction == RIGHT_SIDE &&
13521 + scan->count >= scan->max_count);
13522 +}
13523 +
13524 +/* Return true if the scan should continue to the @tonode. True if the node meets the
13525 + same_slum_check condition. If not, deref the "left" node and stop the scan. */
13526 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13527 +{
13528 + int go = same_slum_check(scan->node, tonode, 1, 0);
13529 +
13530 + if (!go) {
13531 + scan->stop = 1;
13532 + jput(tonode);
13533 + }
13534 +
13535 + return go;
13536 +}
13537 +
13538 +/* Set the current scan->node, refcount it, increment count by the @add_count (number to
13539 + count, e.g., skipped unallocated nodes), deref previous current, and copy the current
13540 + parent coordinate. */
13541 +int
13542 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13543 + const coord_t * parent)
13544 +{
13545 + /* Release the old references, take the new reference. */
13546 + done_load_count(&scan->node_load);
13547 +
13548 + if (scan->node != NULL) {
13549 + jput(scan->node);
13550 + }
13551 + scan->node = node;
13552 + scan->count += add_count;
13553 +
13554 + /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
13555 + delay this update step until it finishes and update the parent_coord only once.
13556 + It did that before, but there was a bug and this was the easiest way to make it
13557 + correct. */
13558 + if (parent != NULL) {
13559 + coord_dup(&scan->parent_coord, parent);
13560 + }
13561 +
13562 + /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13563 + is safely taken. */
13564 + return incr_load_count_jnode(&scan->node_load, node);
13565 +}
13566 +
13567 +/* Return true if scanning in the leftward direction. */
13568 +int reiser4_scanning_left(flush_scan * scan)
13569 +{
13570 + return scan->direction == LEFT_SIDE;
13571 +}
13572 +
13573 +/* Performs leftward scanning starting from either kind of node. Counts the starting
13574 + node. The right-scan object is passed in for the left-scan in order to copy the parent
13575 + of an unformatted starting position. This way we avoid searching for the unformatted
13576 + node's parent when scanning in each direction. If we search for the parent once it is
13577 + set in both scan objects. The limit parameter tells flush-scan when to stop.
13578 +
13579 + Rapid scanning is used only during scan_left, where we are interested in finding the
13580 + 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13581 + of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13582 + problem is finding a way to flush only those nodes without unallocated children, and it
13583 + is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13584 + problem can be solved by scanning left at every level as we go upward, but this would
13585 + basically bring us back to using a top-down allocation strategy, which we already tried
13586 + (see BK history from May 2002), and has a different set of problems. The top-down
13587 + strategy makes avoiding unallocated children easier, but makes it difficult to
13588 +   properly flush dirty children with clean parents that would otherwise stop the
13589 + top-down flush, only later to dirty the parent once the children are flushed. So we
13590 + solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13591 + only.
13592 +
13593 + The first step in solving the problem is this rapid leftward scan. After we determine
13594 +   that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD, we
13595 +   are no longer interested in the exact count; we are only interested in finding the
13596 +   best place to start the flush. We could choose one of two possibilities:
13597 +
13598 + 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13599 +   This requires checking one leaf per rapid-scan twig.
13600 +
13601 + 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13602 + to the left. This requires checking possibly all of the in-memory children of each
13603 + twig during the rapid scan.
13604 +
13605 + For now we implement the first policy.
13606 +*/
13607 +static int
13608 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13609 +{
13610 + int ret = 0;
13611 +
13612 + scan->max_count = limit;
13613 + scan->direction = LEFT_SIDE;
13614 +
13615 + ret = scan_set_current(scan, jref(node), 1, NULL);
13616 + if (ret != 0) {
13617 + return ret;
13618 + }
13619 +
13620 + ret = scan_common(scan, right);
13621 + if (ret != 0) {
13622 + return ret;
13623 + }
13624 +
13625 + /* Before rapid scanning, we need a lock on scan->node so that we can get its
13626 + parent, only if formatted. */
13627 + if (jnode_is_znode(scan->node)) {
13628 + ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13629 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13630 + }
13631 +
13632 + /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13633 + return ret;
13634 +}
13635 +
13636 +/* Performs rightward scanning... Does not count the starting node. The limit parameter
13637 + is described in scan_left. If the starting node is unformatted then the
13638 + parent_coord was already set during scan_left. The rapid_after parameter is not used
13639 + during right-scanning.
13640 +
13641 + scan_right is only called if the scan_left operation does not count at least
13642 + FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13643 + the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13644 + scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13645 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13646 +{
13647 + int ret;
13648 +
13649 + scan->max_count = limit;
13650 + scan->direction = RIGHT_SIDE;
13651 +
13652 + ret = scan_set_current(scan, jref(node), 0, NULL);
13653 + if (ret != 0) {
13654 + return ret;
13655 + }
13656 +
13657 + return scan_common(scan, NULL);
13658 +}
13659 +
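
Together the two scans implement a fixed budget: scan_left counts up to the threshold, and scan_right is only asked to make up the shortfall, so the combined count never needs to exceed FLUSH_RELOCATE_THRESHOLD. A toy sketch of that arithmetic follows; the threshold value and both counting functions are invented for illustration.

#include <stdio.h>

#define TOY_RELOCATE_THRESHOLD 64	/* stand-in for FLUSH_RELOCATE_THRESHOLD */

/* Pretend scans: report how many dirty neighbors were found, capped at limit. */
static unsigned int toy_scan_left(unsigned int limit)
{
	return limit < 10 ? limit : 10;	/* found 10 dirty nodes to the left */
}

static unsigned int toy_scan_right(unsigned int limit)
{
	return limit;			/* right side is all dirty in this toy */
}

int main(void)
{
	unsigned int total = toy_scan_left(TOY_RELOCATE_THRESHOLD);

	/* scan right only to make up the shortfall, never past the threshold */
	if (total < TOY_RELOCATE_THRESHOLD)
		total += toy_scan_right(TOY_RELOCATE_THRESHOLD - total);

	printf("slum size >= %u, leaf_relocate = %d\n", total,
	       total >= TOY_RELOCATE_THRESHOLD);
	return 0;
}
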
13660 +/* Common code to perform left or right scanning. */
13661 +static int scan_common(flush_scan * scan, flush_scan * other)
13662 +{
13663 + int ret;
13664 +
13665 + assert("nikita-2376", scan->node != NULL);
13666 + assert("edward-54", jnode_is_unformatted(scan->node)
13667 + || jnode_is_znode(scan->node));
13668 +
13669 + /* Special case for starting at an unformatted node. Optimization: we only want
13670 + to search for the parent (which requires a tree traversal) once. Obviously, we
13671 + shouldn't have to call it once for the left scan and once for the right scan.
13672 + For this reason, if we search for the parent during scan-left we then duplicate
13673 + the coord/lock/load into the scan-right object. */
13674 + if (jnode_is_unformatted(scan->node)) {
13675 + ret = scan_unformatted(scan, other);
13676 + if (ret != 0)
13677 + return ret;
13678 + }
13679 + /* This loop expects to start at a formatted position and performs chaining of
13680 + formatted regions */
13681 + while (!reiser4_scan_finished(scan)) {
13682 +
13683 + ret = scan_formatted(scan);
13684 + if (ret != 0) {
13685 + return ret;
13686 + }
13687 + }
13688 +
13689 + return 0;
13690 +}
13691 +
13692 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
13693 +{
13694 + int ret = 0;
13695 + int try = 0;
13696 +
13697 + if (!coord_is_invalid(&scan->parent_coord))
13698 + goto scan;
13699 +
13700 +	/* set the parent coord from the current scan position */
13701 + if (!jnode_is_unformatted(scan->node)) {
13702 + /* formatted position */
13703 +
13704 + lock_handle lock;
13705 + assert("edward-301", jnode_is_znode(scan->node));
13706 + init_lh(&lock);
13707 +
13708 + /*
13709 +		 * when flush starts from an unformatted node, the first thing it
13710 +		 * does is a tree traversal to find the formatted parent of the
13711 +		 * starting node. This parent is then kept locked across the scans
13712 +		 * to the left and to the right. This means that during the scan to
13713 +		 * the left we cannot take a leftward lock, because this is
13714 +		 * deadlock prone. So, if we are scanning to the left and
13715 +		 * there is already a lock held by this thread,
13716 +		 * jnode_lock_parent_coord() should use a try-lock.
13717 + */
13718 + try = reiser4_scanning_left(scan)
13719 + && !lock_stack_isclean(get_current_lock_stack());
13720 +		/* Need the node locked to get the parent lock. We have to
13721 +		   take a write lock since there is at least one call path
13722 +		   where this znode is already write-locked by us. */
13723 + ret =
13724 + longterm_lock_znode(&lock, JZNODE(scan->node),
13725 + ZNODE_WRITE_LOCK,
13726 + reiser4_scanning_left(scan) ?
13727 + ZNODE_LOCK_LOPRI :
13728 + ZNODE_LOCK_HIPRI);
13729 + if (ret != 0)
13730 + /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
13731 + scanned too far and can't back out, just start over. */
13732 + return ret;
13733 +
13734 + ret = jnode_lock_parent_coord(scan->node,
13735 + &scan->parent_coord,
13736 + &scan->parent_lock,
13737 + &scan->parent_load,
13738 + ZNODE_WRITE_LOCK, try);
13739 +
13740 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13741 + done_lh(&lock);
13742 + if (ret == -E_REPEAT) {
13743 + scan->stop = 1;
13744 + return 0;
13745 + }
13746 + if (ret)
13747 + return ret;
13748 +
13749 + } else {
13750 + /* unformatted position */
13751 +
13752 + ret =
13753 + jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13754 + &scan->parent_lock,
13755 + &scan->parent_load,
13756 + ZNODE_WRITE_LOCK, try);
13757 +
13758 + if (IS_CBKERR(ret))
13759 + return ret;
13760 +
13761 + if (ret == CBK_COORD_NOTFOUND)
13762 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13763 + return ret;
13764 +
13765 + /* parent was found */
13766 + assert("jmacd-8661", other != NULL);
13767 + /* Duplicate the reference into the other flush_scan. */
13768 + coord_dup(&other->parent_coord, &scan->parent_coord);
13769 + copy_lh(&other->parent_lock, &scan->parent_lock);
13770 + copy_load_count(&other->parent_load, &scan->parent_load);
13771 + }
13772 + scan:
13773 + return scan_by_coord(scan);
13774 +}
13775 +
13776 +/* Performs left- or rightward scanning starting from a formatted node. Follow left
13777 + pointers under tree lock as long as:
13778 +
13779 + - node->left/right is non-NULL
13780 + - node->left/right is connected, dirty
13781 + - node->left/right belongs to the same atom
13782 + - scan has not reached maximum count
13783 +*/
13784 +static int scan_formatted(flush_scan * scan)
13785 +{
13786 + int ret;
13787 + znode *neighbor = NULL;
13788 +
13789 + assert("jmacd-1401", !reiser4_scan_finished(scan));
13790 +
13791 + do {
13792 + znode *node = JZNODE(scan->node);
13793 +
13794 + /* Node should be connected, but if not stop the scan. */
13795 + if (!znode_is_connected(node)) {
13796 + scan->stop = 1;
13797 + break;
13798 + }
13799 +
13800 + /* Lock the tree, check-for and reference the next sibling. */
13801 + read_lock_tree(znode_get_tree(node));
13802 +
13803 + /* It may be that a node is inserted or removed between a node and its
13804 + left sibling while the tree lock is released, but the flush-scan count
13805 + does not need to be precise. Thus, we release the tree lock as soon as
13806 + we get the neighboring node. */
13807 + neighbor =
13808 + reiser4_scanning_left(scan) ? node->left : node->right;
13809 + if (neighbor != NULL) {
13810 + zref(neighbor);
13811 + }
13812 +
13813 + read_unlock_tree(znode_get_tree(node));
13814 +
13815 +		/* If the neighbor is NULL at the leaf level, we need to check for an
13816 +		   unformatted sibling using the parent -- break in any case. */
13817 + if (neighbor == NULL) {
13818 + break;
13819 + }
13820 +
13821 + /* Check the condition for going left, break if it is not met. This also
13822 + releases (jputs) the neighbor if false. */
13823 + if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13824 + break;
13825 + }
13826 +
13827 + /* Advance the flush_scan state to the left, repeat. */
13828 + ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13829 + if (ret != 0) {
13830 + return ret;
13831 + }
13832 +
13833 + } while (!reiser4_scan_finished(scan));
13834 +
13835 +	/* If the neighbor is NULL then we reached the end of a formatted region, or else
13836 +	   the sibling is out of memory; now check for an extent to the left (as long as we
13837 +	   are at LEAF_LEVEL). */
13838 + if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13839 + || reiser4_scan_finished(scan)) {
13840 + scan->stop = 1;
13841 + return 0;
13842 + }
13843 +	/* Otherwise, call scan_by_coord for the right(left)most item of the
13844 +	   left(right) neighbor on the parent level, then possibly continue. */
13845 +
13846 + coord_init_invalid(&scan->parent_coord, NULL);
13847 + return scan_unformatted(scan, NULL);
13848 +}
13849 +
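
The sibling step above holds the tree lock only long enough to read the neighbor pointer and take a reference; whether to continue is decided after the lock is dropped, which is why an imprecise count is acceptable. Here is a compact sketch of that reference-then-decide pattern with toy types; a pthread mutex stands in for the tree lock and none of these names belong to the patch.

#include <pthread.h>
#include <stddef.h>

/* Toy sketch of the neighbor step in scan_formatted(): reference the sibling
 * under the tree lock, drop the lock, then decide whether to continue. */
struct toy_node {
	struct toy_node *left;
	int refs;
	int dirty;
};

static pthread_mutex_t toy_tree_lock = PTHREAD_MUTEX_INITIALIZER;

static struct toy_node *grab_left_neighbor(struct toy_node *node)
{
	struct toy_node *n;

	pthread_mutex_lock(&toy_tree_lock);
	n = node->left;
	if (n != NULL)
		n->refs++;		/* zref() analogue: pin before unlock */
	pthread_mutex_unlock(&toy_tree_lock);
	return n;
}

static int scan_step(struct toy_node **pos)
{
	struct toy_node *n = grab_left_neighbor(*pos);

	if (n == NULL || !n->dirty) {	/* reiser4_scan_goto() analogue */
		if (n != NULL)
			n->refs--;	/* jput() analogue */
		return 0;		/* stop the scan */
	}
	(*pos)->refs--;			/* drop reference to the old position */
	*pos = n;
	return 1;			/* keep scanning left */
}

int main(void)
{
	struct toy_node a = { NULL, 1, 1 }, b = { &a, 1, 1 };
	struct toy_node *pos = &b;

	while (scan_step(&pos))
		;			/* walks from b to a, then stops */
	return 0;
}
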
13850 +/* NOTE-EDWARD:
13851 +   This scans adjacent items of the same type and calls the scan flush plugin for each one.
13852 +   Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start
13853 +   from an unformatted node, then we continue only if the next neighbor is also unformatted.
13854 +   When called from scan_formatted, we skip the first iteration (to make sure that the
13855 +   right(left)most item of the left(right) neighbor on the parent level is of the same
13856 +   type, and to set the appropriate coord). */
13857 +static int scan_by_coord(flush_scan * scan)
13858 +{
13859 + int ret = 0;
13860 + int scan_this_coord;
13861 + lock_handle next_lock;
13862 + load_count next_load;
13863 + coord_t next_coord;
13864 + jnode *child;
13865 + item_plugin *iplug;
13866 +
13867 + init_lh(&next_lock);
13868 + init_load_count(&next_load);
13869 + scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13870 +
13871 + /* set initial item id */
13872 + iplug = item_plugin_by_coord(&scan->parent_coord);
13873 +
13874 + for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13875 + if (scan_this_coord) {
13876 +			/* Here we expect the unit to be scannable. It might not be, due
13877 +			 * to a race with extent->tail conversion. */
13878 + if (iplug->f.scan == NULL) {
13879 + scan->stop = 1;
13880 + ret = -E_REPEAT;
13881 + /* skip the check at the end. */
13882 + goto race;
13883 + }
13884 +
13885 + ret = iplug->f.scan(scan);
13886 + if (ret != 0)
13887 + goto exit;
13888 +
13889 + if (reiser4_scan_finished(scan)) {
13890 + checkchild(scan);
13891 + break;
13892 + }
13893 + } else {
13894 + /* the same race against truncate as above is possible
13895 + * here, it seems */
13896 +
13897 + /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13898 + the first coordinate. */
13899 + assert("jmacd-1231",
13900 + item_is_internal(&scan->parent_coord));
13901 + }
13902 +
13903 + if (iplug->f.utmost_child == NULL
13904 + || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13905 +			/* stop at this coord and continue on the parent level */
13906 + ret =
13907 + scan_set_current(scan,
13908 + ZJNODE(zref
13909 + (scan->parent_coord.node)),
13910 + 1, NULL);
13911 + if (ret != 0)
13912 + goto exit;
13913 + break;
13914 + }
13915 +
13916 + /* Either way, the invariant is that scan->parent_coord is set to the
13917 + parent of scan->node. Now get the next unit. */
13918 + coord_dup(&next_coord, &scan->parent_coord);
13919 + coord_sideof_unit(&next_coord, scan->direction);
13920 +
13921 + /* If off-the-end of the twig, try the next twig. */
13922 + if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13923 + /* We take the write lock because we may start flushing from this
13924 + * coordinate. */
13925 + ret = neighbor_in_slum(next_coord.node,
13926 + &next_lock,
13927 + scan->direction,
13928 + ZNODE_WRITE_LOCK,
13929 + 1 /* check dirty */,
13930 +					       0 /* don't go through upper
13931 +						    levels */);
13932 + if (ret == -E_NO_NEIGHBOR) {
13933 + scan->stop = 1;
13934 + ret = 0;
13935 + break;
13936 + }
13937 +
13938 + if (ret != 0) {
13939 + goto exit;
13940 + }
13941 +
13942 + ret = incr_load_count_znode(&next_load, next_lock.node);
13943 + if (ret != 0) {
13944 + goto exit;
13945 + }
13946 +
13947 + coord_init_sideof_unit(&next_coord, next_lock.node,
13948 + sideof_reverse(scan->direction));
13949 + }
13950 +
13951 + iplug = item_plugin_by_coord(&next_coord);
13952 +
13953 + /* Get the next child. */
13954 + ret =
13955 + iplug->f.utmost_child(&next_coord,
13956 + sideof_reverse(scan->direction),
13957 + &child);
13958 + if (ret != 0)
13959 + goto exit;
13960 +		/* If the next child is not in memory, or item_utmost_child
13961 +		   failed (most probably due to a race with unlink), stop
13962 +		   here. */
13963 + if (child == NULL || IS_ERR(child)) {
13964 + scan->stop = 1;
13965 + checkchild(scan);
13966 + break;
13967 + }
13968 +
13969 + assert("nikita-2374", jnode_is_unformatted(child)
13970 + || jnode_is_znode(child));
13971 +
13972 + /* See if it is dirty, part of the same atom. */
13973 + if (!reiser4_scan_goto(scan, child)) {
13974 + checkchild(scan);
13975 + break;
13976 + }
13977 +
13978 + /* If so, make this child current. */
13979 + ret = scan_set_current(scan, child, 1, &next_coord);
13980 + if (ret != 0)
13981 + goto exit;
13982 +
13983 + /* Now continue. If formatted we release the parent lock and return, then
13984 + proceed. */
13985 + if (jnode_is_znode(child))
13986 + break;
13987 +
13988 + /* Otherwise, repeat the above loop with next_coord. */
13989 + if (next_load.node != NULL) {
13990 + done_lh(&scan->parent_lock);
13991 + move_lh(&scan->parent_lock, &next_lock);
13992 + move_load_count(&scan->parent_load, &next_load);
13993 + }
13994 + }
13995 +
13996 + assert("jmacd-6233",
13997 + reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
13998 + exit:
13999 + checkchild(scan);
14000 + race: /* skip the above check */
14001 + if (jnode_is_znode(scan->node)) {
14002 + done_lh(&scan->parent_lock);
14003 + done_load_count(&scan->parent_load);
14004 + }
14005 +
14006 + done_load_count(&next_load);
14007 + done_lh(&next_lock);
14008 + return ret;
14009 +}
14010 +
14011 +/* FLUSH POS HELPERS */
14012 +
14013 +/* Initialize the fields of a flush_position. */
14014 +static void pos_init(flush_pos_t * pos)
14015 +{
14016 + memset(pos, 0, sizeof *pos);
14017 +
14018 + pos->state = POS_INVALID;
14019 + coord_init_invalid(&pos->coord, NULL);
14020 + init_lh(&pos->lock);
14021 + init_load_count(&pos->load);
14022 +
14023 + reiser4_blocknr_hint_init(&pos->preceder);
14024 +}
14025 +
14026 +/* The flush loop inside squalloc periodically checks pos_valid to
14027 + determine when "enough flushing" has been performed. This will return true until one
14028 + of the following conditions is met:
14029 +
14030 + 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14031 + parameter, meaning we have flushed as many blocks as the kernel requested. When
14032 + flushing to commit, this parameter is NULL.
14033 +
14034 + 2. pos_stop() is called because squalloc discovers that the "next" node in the
14035 +   flush order is either non-existent, not dirty, or not in the same atom.
14036 +*/
14037 +
14038 +static int pos_valid(flush_pos_t * pos)
14039 +{
14040 + return pos->state != POS_INVALID;
14041 +}
14042 +
14043 +/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14044 +static void pos_done(flush_pos_t * pos)
14045 +{
14046 + pos_stop(pos);
14047 + reiser4_blocknr_hint_done(&pos->preceder);
14048 + if (convert_data(pos))
14049 + free_convert_data(pos);
14050 +}
14051 +
14052 +/* Reset the point and parent. Called during flush subroutines to terminate the
14053 + squalloc loop. */
14054 +static int pos_stop(flush_pos_t * pos)
14055 +{
14056 + pos->state = POS_INVALID;
14057 + done_lh(&pos->lock);
14058 + done_load_count(&pos->load);
14059 + coord_init_invalid(&pos->coord, NULL);
14060 +
14061 + if (pos->child) {
14062 + jput(pos->child);
14063 + pos->child = NULL;
14064 + }
14065 +
14066 + return 0;
14067 +}
14068 +
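
These helpers give the flush position a simple lifecycle: pos_init() before the squeeze-and-allocate pass, pos_valid() as the loop condition, pos_stop() from inside the subroutines, and pos_done() at the end. Below is a hedged sketch of how a caller might thread them together, using toy types rather than the real flush_pos_t; the flush body and the stop condition are invented for illustration.

/* Illustrative lifecycle of a flush position, with toy types so the sketch
 * compiles standalone; the real helpers also release locks and references. */
enum toy_pos_state { TOY_POS_INVALID, TOY_POS_ON_LEAF };

struct toy_flush_pos { enum toy_pos_state state; };

static void toy_pos_init(struct toy_flush_pos *pos)
{
	pos->state = TOY_POS_INVALID;	/* the scan phase sets a real state */
}

static int toy_pos_valid(const struct toy_flush_pos *pos)
{
	return pos->state != TOY_POS_INVALID;
}

static void toy_pos_stop(struct toy_flush_pos *pos)
{
	pos->state = TOY_POS_INVALID;	/* real code drops locks/refs here */
}

int main(void)
{
	struct toy_flush_pos pos;
	int flushed = 0;

	toy_pos_init(&pos);
	pos.state = TOY_POS_ON_LEAF;	/* pretend left-scan found a start */

	while (toy_pos_valid(&pos)) {	/* the squalloc() loop condition */
		flushed++;
		if (flushed >= 8)
			toy_pos_stop(&pos);	/* "enough flushing" reached */
	}
	/* a pos_done() analogue would release remaining resources here */
	return flushed == 8 ? 0 : 1;
}
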
14069 +/* Return the flush_position's block allocator hint. */
14070 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14071 +{
14072 + return &pos->preceder;
14073 +}
14074 +
14075 +flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14076 +{
14077 + return pos->fq;
14078 +}
14079 +
14080 +/* Make Linus happy.
14081 + Local variables:
14082 + c-indentation-style: "K&R"
14083 + mode-name: "LC"
14084 + c-basic-offset: 8
14085 + tab-width: 8
14086 + fill-column: 90
14087 + LocalWords: preceder
14088 + End:
14089 +*/
14090 diff -urN linux-2.6.22.orig/fs/reiser4/flush.h linux-2.6.22/fs/reiser4/flush.h
14091 --- linux-2.6.22.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14092 +++ linux-2.6.22/fs/reiser4/flush.h 2007-07-29 00:25:34.864693371 +0400
14093 @@ -0,0 +1,295 @@
14094 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14095 +
14096 +/* DECLARATIONS: */
14097 +
14098 +#if !defined(__REISER4_FLUSH_H__)
14099 +#define __REISER4_FLUSH_H__
14100 +
14101 +#include "plugin/cluster.h"
14102 +
14103 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14104 + single level of the tree. A flush-scan is used for counting the number of adjacent
14105 + nodes to flush, which is used to determine whether we should relocate, and it is also
14106 + used to find a starting point for flush. A flush-scan object can scan in both right
14107 + and left directions via the scan_left() and scan_right() interfaces. The
14108 + right- and left-variations are similar but perform different functions. When scanning
14109 + left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14110 + When scanning right we are simply counting the number of adjacent, dirty nodes. */
14111 +struct flush_scan {
14112 +
14113 + /* The current number of nodes scanned on this level. */
14114 + unsigned count;
14115 +
14116 + /* There may be a maximum number of nodes for a scan on any single level. When
14117 + going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14118 + unsigned max_count;
14119 +
14120 + /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14121 + sideof direction;
14122 +
14123 + /* Initially @stop is set to false; it is set to true once some condition stops the
14124 + search (e.g., we found a clean node before reaching max_count or we found a
14125 + node belonging to another atom). */
14126 + int stop;
14127 +
14128 + /* The current scan position. If @node is non-NULL then its reference count has
14129 + been incremented to reflect this reference. */
14130 + jnode *node;
14131 +
14132 + /* A handle for zload/zrelse of current scan position node. */
14133 + load_count node_load;
14134 +
14135 + /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14136 + node is locked using this lock handle. The endpoint needs to be locked for
14137 + transfer to the flush_position object after scanning finishes. */
14138 + lock_handle node_lock;
14139 +
14140 + /* When the position is unformatted, its parent, coordinate, and parent
14141 + zload/zrelse handle. */
14142 + lock_handle parent_lock;
14143 + coord_t parent_coord;
14144 + load_count parent_load;
14145 +
14146 + /* The block allocator preceder hint. Sometimes flush_scan determines what the
14147 + preceder is and if so it sets it here, after which it is copied into the
14148 + flush_position. Otherwise, the preceder is computed later. */
14149 + reiser4_block_nr preceder_blk;
14150 +};
14151 +
14152 +struct convert_item_info {
14153 + dc_item_stat d_cur; /* disk cluster state of the current item */
14154 + dc_item_stat d_next; /* disk cluster state of the next slum item */
14155 + struct inode *inode;
14156 + flow_t flow;
14157 +};
14158 +
14159 +struct convert_info {
14160 + int count; /* for squalloc terminating */
14161 + item_plugin *iplug; /* current item plugin */
14162 + struct convert_item_info *itm; /* current item info */
14163 + struct cluster_handle clust; /* transform cluster */
14164 +};
14165 +
14166 +typedef enum flush_position_state {
14167 + POS_INVALID, /* Invalid or stopped pos, do not continue slum
14168 + * processing */
14169 + POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14170 + * leaf level */
14171 + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14172 + * to traverse unformatted nodes */
14173 + POS_TO_LEAF, /* pos is being moved to leaf level */
14174 + POS_TO_TWIG, /* pos is being moved to twig level */
14175 + POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14176 + * rightmost unit of the current twig */
14177 + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14178 +} flushpos_state_t;
14179 +
14180 +/* An encapsulation of the current flush point and all the parameters that are passed
14181 + through the entire squeeze-and-allocate stage of the flush routine. A single
14182 + flush_position object is constructed after left- and right-scanning finishes. */
14183 +struct flush_position {
14184 + flushpos_state_t state;
14185 +
14186 + coord_t coord; /* coord to traverse unformatted nodes */
14187 + lock_handle lock; /* current lock we hold */
14188 + load_count load; /* load status for current locked formatted node */
14189 +
14190 + jnode *child; /* for passing a reference to unformatted child
14191 + * across pos state changes */
14192 +
14193 + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14194 + int leaf_relocate; /* True if enough leaf-level nodes were
14195 + * found to suggest a relocate policy. */
14196 + int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14197 + int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14198 + flush_queue_t *fq;
14199 + long *nr_written; /* number of nodes submitted to disk */
14200 + int flags; /* a copy of jnode_flush flags argument */
14201 +
14202 + znode *prev_twig; /* previous parent pointer value, used to catch
14203 + * processing of new twig node */
14204 + struct convert_info *sq; /* convert info */
14205 +
14206 + unsigned long pos_in_unit; /* for extents only. Position
14207 + within an extent unit of first
14208 + jnode of slum */
14209 + long nr_to_write; /* number of unformatted nodes to handle on flush */
14210 +};
14211 +
14212 +static inline int item_convert_count(flush_pos_t * pos)
14213 +{
14214 + return pos->sq->count;
14215 +}
14216 +static inline void inc_item_convert_count(flush_pos_t * pos)
14217 +{
14218 + pos->sq->count++;
14219 +}
14220 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14221 +{
14222 + pos->sq->count = count;
14223 +}
14224 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14225 +{
14226 + return pos->sq->iplug;
14227 +}
14228 +
14229 +static inline struct convert_info *convert_data(flush_pos_t * pos)
14230 +{
14231 + return pos->sq;
14232 +}
14233 +
14234 +static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
14235 +{
14236 + assert("edward-955", convert_data(pos));
14237 + return pos->sq->itm;
14238 +}
14239 +
14240 +static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
14241 +{
14242 + return &pos->sq->clust.tc;
14243 +}
14244 +
14245 +static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
14246 + tfm_stream_id id)
14247 +{
14248 + assert("edward-854", pos->sq != NULL);
14249 + return get_tfm_stream(tfm_cluster_sq(pos), id);
14250 +}
14251 +
14252 +static inline int chaining_data_present(flush_pos_t * pos)
14253 +{
14254 + return convert_data(pos) && item_convert_data(pos);
14255 +}
14256 +
14257 +/* Returns true if next node contains next item of the disk cluster
14258 + so item convert data should be moved to the right slum neighbor.
14259 +*/
14260 +static inline int should_chain_next_node(flush_pos_t * pos)
14261 +{
14262 + int result = 0;
14263 +
14264 + assert("edward-1007", chaining_data_present(pos));
14265 +
14266 + switch (item_convert_data(pos)->d_next) {
14267 + case DC_CHAINED_ITEM:
14268 + result = 1;
14269 + break;
14270 + case DC_AFTER_CLUSTER:
14271 + break;
14272 + default:
14273 + impossible("edward-1009", "bad state of next slum item");
14274 + }
14275 + return result;
14276 +}
14277 +
14278 +/* update item state in a disk cluster to assign conversion mode */
14279 +static inline void
14280 +move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14281 +{
14282 +
14283 + assert("edward-1010", chaining_data_present(pos));
14284 +
14285 + if (this_node == 0) {
14286 + /* next item is on the right neighbor */
14287 + assert("edward-1011",
14288 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14289 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14290 + assert("edward-1012",
14291 + item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14292 +
14293 + item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14294 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14295 + } else {
14296 + /* next item is on the same node */
14297 + assert("edward-1013",
14298 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14299 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14300 + assert("edward-1227",
14301 + item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14302 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
14303 +
14304 + item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14305 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14306 + }
14307 +}
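+
+/* Transition summary for move_chaining_data(), derived from the
+ * assertions above:
+ *
+ * next item on right neighbor: d_cur (DC_FIRST_ITEM or DC_CHAINED_ITEM)
+ * -> DC_CHAINED_ITEM, d_next -> DC_INVALID_STATE;
+ * next item on this node: d_cur -> DC_AFTER_CLUSTER,
+ * d_next -> DC_INVALID_STATE.
+ */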
14308 +
14309 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14310 +{
14311 + return znode_convertible(node);
14312 +}
14313 +
14314 +/* true if there is attached convert item info */
14315 +static inline int should_convert_next_node(flush_pos_t * pos)
14316 +{
14317 + return convert_data(pos) && item_convert_data(pos);
14318 +}
14319 +
14320 +#define SQUALLOC_THRESHOLD 256
14321 +
14322 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14323 +{
14324 + return convert_data(pos) &&
14325 + !item_convert_data(pos) &&
14326 + item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14327 +}
14328 +
14329 +#if 1
14330 +#define check_convert_info(pos) \
14331 +do { \
14332 + if (unlikely(should_convert_next_node(pos))){ \
14333 + warning("edward-1006", "unprocessed chained data"); \
14334 + printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
14335 + item_convert_data(pos)->d_cur, \
14336 + item_convert_data(pos)->d_next, \
14337 + item_convert_data(pos)->flow.length); \
14338 + printk("inode %llu, size = %llu, cluster %lu\n", \
14339 + (unsigned long long)get_inode_oid \
14340 + (item_convert_data(pos)->inode), \
14341 + i_size_read(item_convert_data(pos)->inode), \
14342 + convert_data(pos)->clust.index); \
14343 + } \
14344 +} while (0)
14345 +#else
14346 +#define check_convert_info(pos)
14347 +#endif /* REISER4_DEBUG */
14348 +
14349 +void free_convert_data(flush_pos_t * pos);
14350 +/* used in extent.c */
14351 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14352 + const coord_t * parent);
14353 +int reiser4_scan_finished(flush_scan * scan);
14354 +int reiser4_scanning_left(flush_scan * scan);
14355 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14356 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14357 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14358 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14359 + reiser4_key *stop_key);
14360 +extern int reiser4_init_fqs(void);
14361 +extern void reiser4_done_fqs(void);
14362 +
14363 +#if REISER4_DEBUG
14364 +
14365 +extern void reiser4_check_fq(const txn_atom *atom);
14366 +extern atomic_t flush_cnt;
14367 +
14368 +#define check_preceder(blk) \
14369 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14370 +extern void check_pos(flush_pos_t * pos);
14371 +#else
14372 +#define check_preceder(b) noop
14373 +#define check_pos(pos) noop
14374 +#endif
14375 +
14376 +/* __REISER4_FLUSH_H__ */
14377 +#endif
14378 +
14379 +/* Make Linus happy.
14380 + Local variables:
14381 + c-indentation-style: "K&R"
14382 + mode-name: "LC"
14383 + c-basic-offset: 8
14384 + tab-width: 8
14385 + fill-column: 90
14386 + LocalWords: preceder
14387 + End:
14388 +*/
14389 diff -urN linux-2.6.22.orig/fs/reiser4/flush_queue.c linux-2.6.22/fs/reiser4/flush_queue.c
14390 --- linux-2.6.22.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14391 +++ linux-2.6.22/fs/reiser4/flush_queue.c 2007-07-29 00:25:34.864693371 +0400
14392 @@ -0,0 +1,680 @@
14393 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14394 +
14395 +#include "debug.h"
14396 +#include "super.h"
14397 +#include "txnmgr.h"
14398 +#include "jnode.h"
14399 +#include "znode.h"
14400 +#include "page_cache.h"
14401 +#include "wander.h"
14402 +#include "vfs_ops.h"
14403 +#include "writeout.h"
14404 +#include "flush.h"
14405 +
14406 +#include <linux/bio.h>
14407 +#include <linux/mm.h>
14408 +#include <linux/pagemap.h>
14409 +#include <linux/blkdev.h>
14410 +#include <linux/writeback.h>
14411 +
14412 +/* A flush queue object is an accumulator for keeping jnodes prepared
14413 + by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14414 + kept on the flush queue until memory pressure or atom commit asks
14415 + flush queues to write some or all of their jnodes. */
14416 +
14417 +/*
14418 + LOCKING:
14419 +
14420 + fq->guard spin lock protects fq->atom pointer and nothing else. The
14421 + fq->prepped list is protected by the atom spin lock and uses the
14422 + following locking:
14423 +
14424 + two ways to protect fq->prepped list for read-only list traversal:
14425 +
14426 + 1. the atom is spin-locked.
14427 + 2. fq is IN_USE, atom->nr_running_queues increased.
14428 +
14429 + and one for list modification:
14430 +
14431 + 1. atom is spin-locked and one condition is true: fq is IN_USE or
14432 + atom->nr_running_queues == 0.
14433 +
14434 + The deadlock-safe order for flush queues and atoms is: first lock atom, then
14435 + lock flush queue, then lock jnode.
14436 +*/
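+
+/* An illustrative sketch (editorial, not part of the original code) of
+ * the deadlock-safe ordering described above: atom first, then flush
+ * queue, then jnode. The function and its arguments are hypothetical;
+ * the lock helpers are the ones used throughout this file.
+ */
+#if 0
+static void lock_order_example(txn_atom *atom, flush_queue_t *fq,
+ jnode *node)
+{
+ spin_lock_atom(atom); /* 1: atom spin lock */
+ spin_lock(&(fq->guard)); /* 2: flush queue guard */
+ spin_lock_jnode(node); /* 3: jnode spin lock */
+
+ /* ... inspect or modify the prepped list here ... */
+
+ spin_unlock_jnode(node);
+ spin_unlock(&(fq->guard));
+ spin_unlock_atom(atom);
+}
+#endif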
14437 +
14438 +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14439 +#define fq_ready(fq) (!fq_in_use(fq))
14440 +
14441 +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14442 +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14443 +
14444 +/* get lock on atom from locked flush queue object */
14445 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14446 +{
14447 + /* This code is similar to jnode_get_atom(), look at it for the
14448 + * explanation. */
14449 + txn_atom *atom;
14450 +
14451 + assert_spin_locked(&(fq->guard));
14452 +
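+ /*
+ * fq->guard must not be held while waiting on the atom lock, since
+ * the documented order is atom before flush queue. If the trylock
+ * fails: pin the atom with a reference so it cannot be freed, drop
+ * fq->guard, take both locks in the proper order, then recheck that
+ * the queue still belongs to the same atom.
+ */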
14453 + while (1) {
14454 + atom = fq->atom;
14455 + if (atom == NULL)
14456 + break;
14457 +
14458 + if (spin_trylock_atom(atom))
14459 + break;
14460 +
14461 + atomic_inc(&atom->refcount);
14462 + spin_unlock(&(fq->guard));
14463 + spin_lock_atom(atom);
14464 + spin_lock(&(fq->guard));
14465 +
14466 + if (fq->atom == atom) {
14467 + atomic_dec(&atom->refcount);
14468 + break;
14469 + }
14470 +
14471 + spin_unlock(&(fq->guard));
14472 + atom_dec_and_unlock(atom);
14473 + spin_lock(&(fq->guard));
14474 + }
14475 +
14476 + return atom;
14477 +}
14478 +
14479 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14480 +{
14481 + txn_atom *atom;
14482 +
14483 + spin_lock(&(fq->guard));
14484 + atom = atom_locked_by_fq_nolock(fq);
14485 + spin_unlock(&(fq->guard));
14486 + return atom;
14487 +}
14488 +
14489 +static void init_fq(flush_queue_t * fq)
14490 +{
14491 + memset(fq, 0, sizeof *fq);
14492 +
14493 + atomic_set(&fq->nr_submitted, 0);
14494 +
14495 + INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14496 +
14497 + init_waitqueue_head(&fq->wait);
14498 + spin_lock_init(&fq->guard);
14499 +}
14500 +
14501 +/* slab for flush queues */
14502 +static struct kmem_cache *fq_slab;
14503 +
14504 +/**
14505 + * reiser4_init_fqs - create flush queue cache
14506 + *
14507 + * Initializes slab cache of flush queues. It is part of reiser4 module
14508 + * initialization.
14509 + */
14510 +int reiser4_init_fqs(void)
14511 +{
14512 + fq_slab = kmem_cache_create("fq",
14513 + sizeof(flush_queue_t),
14514 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14515 + if (fq_slab == NULL)
14516 + return RETERR(-ENOMEM);
14517 + return 0;
14518 +}
14519 +
14520 +/**
14521 + * reiser4_done_fqs - delete flush queue cache
14522 + *
14523 + * This is called on reiser4 module unloading or system shutdown.
14524 + */
14525 +void reiser4_done_fqs(void)
14526 +{
14527 + destroy_reiser4_cache(&fq_slab);
14528 +}
14529 +
14530 +/* create new flush queue object */
14531 +static flush_queue_t *create_fq(gfp_t gfp)
14532 +{
14533 + flush_queue_t *fq;
14534 +
14535 + fq = kmem_cache_alloc(fq_slab, gfp);
14536 + if (fq)
14537 + init_fq(fq);
14538 +
14539 + return fq;
14540 +}
14541 +
14542 +/* adjust atom's and flush queue's counters of queued nodes */
14543 +static void count_enqueued_node(flush_queue_t * fq)
14544 +{
14545 + ON_DEBUG(fq->atom->num_queued++);
14546 +}
14547 +
14548 +static void count_dequeued_node(flush_queue_t * fq)
14549 +{
14550 + assert("zam-993", fq->atom->num_queued > 0);
14551 + ON_DEBUG(fq->atom->num_queued--);
14552 +}
14553 +
14554 +/* attach flush queue object to the atom */
14555 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14556 +{
14557 + assert_spin_locked(&(atom->alock));
14558 + list_add(&fq->alink, &atom->flush_queues);
14559 + fq->atom = atom;
14560 + ON_DEBUG(atom->nr_flush_queues++);
14561 +}
14562 +
14563 +static void detach_fq(flush_queue_t * fq)
14564 +{
14565 + assert_spin_locked(&(fq->atom->alock));
14566 +
14567 + spin_lock(&(fq->guard));
14568 + list_del_init(&fq->alink);
14569 + assert("vs-1456", fq->atom->nr_flush_queues > 0);
14570 + ON_DEBUG(fq->atom->nr_flush_queues--);
14571 + fq->atom = NULL;
14572 + spin_unlock(&(fq->guard));
14573 +}
14574 +
14575 +/* destroy flush queue object */
14576 +static void done_fq(flush_queue_t * fq)
14577 +{
14578 + assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14579 + assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14580 +
14581 + kmem_cache_free(fq_slab, fq);
14582 +}
14583 +
14584 +/* mark jnode as flush-queued and adjust the atom's counter of queued nodes */
14585 +static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14586 +{
14587 + JF_SET(node, JNODE_FLUSH_QUEUED);
14588 + count_enqueued_node(fq);
14589 +}
14590 +
14591 +/* Putting jnode into the flush queue. Both atom and jnode should be
14592 + spin-locked. */
14593 +void queue_jnode(flush_queue_t * fq, jnode * node)
14594 +{
14595 + assert_spin_locked(&(node->guard));
14596 + assert("zam-713", node->atom != NULL);
14597 + assert_spin_locked(&(node->atom->alock));
14598 + assert("zam-716", fq->atom != NULL);
14599 + assert("zam-717", fq->atom == node->atom);
14600 + assert("zam-907", fq_in_use(fq));
14601 +
14602 + assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14603 + assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14604 + assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14605 + assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14606 +
14607 + mark_jnode_queued(fq, node);
14608 + list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14609 +
14610 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14611 + FQ_LIST, 1));
14612 +}
14613 +
14614 +/* repeatable process for waiting on i/o completion on a flush queue object */
14615 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14616 +{
14617 + assert("zam-738", fq->atom != NULL);
14618 + assert_spin_locked(&(fq->atom->alock));
14619 + assert("zam-736", fq_in_use(fq));
14620 + assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14621 +
14622 + if (atomic_read(&fq->nr_submitted) != 0) {
14623 + struct super_block *super;
14624 +
14625 + spin_unlock_atom(fq->atom);
14626 +
14627 + assert("nikita-3013", reiser4_schedulable());
14628 +
14629 + super = reiser4_get_current_sb();
14630 +
14631 + /* FIXME: this is instead of blk_run_queues() */
14632 + blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14633 +
14634 + if (!(super->s_flags & MS_RDONLY))
14635 + wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14636 +
14637 + /* Ask the caller to re-acquire the locks and call this
14638 + function again. Note: this technique is commonly used in
14639 + the txnmgr code. */
14640 + return -E_REPEAT;
14641 + }
14642 +
14643 + *nr_io_errors += atomic_read(&fq->nr_errors);
14644 + return 0;
14645 +}
14646 +
14647 +/* wait on I/O completion, re-submit dirty nodes to write */
14648 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14649 +{
14650 + int ret;
14651 + txn_atom *atom = fq->atom;
14652 +
14653 + assert("zam-801", atom != NULL);
14654 + assert_spin_locked(&(atom->alock));
14655 + assert("zam-762", fq_in_use(fq));
14656 +
14657 + ret = wait_io(fq, nr_io_errors);
14658 + if (ret)
14659 + return ret;
14660 +
14661 + detach_fq(fq);
14662 + done_fq(fq);
14663 +
14664 + reiser4_atom_send_event(atom);
14665 +
14666 + return 0;
14667 +}
14668 +
14669 +/* wait for all i/o for given atom to be completed; actually do one iteration
14670 + of that and return -E_REPEAT if more iterations are needed */
14671 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14672 +{
14673 + flush_queue_t *fq;
14674 +
14675 + assert_spin_locked(&(atom->alock));
14676 +
14677 + if (list_empty_careful(&atom->flush_queues))
14678 + return 0;
14679 +
14680 + list_for_each_entry(fq, &atom->flush_queues, alink) {
14681 + if (fq_ready(fq)) {
14682 + int ret;
14683 +
14684 + mark_fq_in_use(fq);
14685 + assert("vs-1247", fq->owner == NULL);
14686 + ON_DEBUG(fq->owner = current);
14687 + ret = finish_fq(fq, nr_io_errors);
14688 +
14689 + if (*nr_io_errors)
14690 + reiser4_handle_error();
14691 +
14692 + if (ret) {
14693 + reiser4_fq_put(fq);
14694 + return ret;
14695 + }
14696 +
14697 + spin_unlock_atom(atom);
14698 +
14699 + return -E_REPEAT;
14700 + }
14701 + }
14702 +
14703 + /* All flush queues are in use; atom remains locked */
14704 + return -EBUSY;
14705 +}
14706 +
14707 +/* wait for all i/o for the current atom */
14708 +int current_atom_finish_all_fq(void)
14709 +{
14710 + txn_atom *atom;
14711 + int nr_io_errors = 0;
14712 + int ret = 0;
14713 +
14714 + do {
14715 + while (1) {
14716 + atom = get_current_atom_locked();
14717 + ret = finish_all_fq(atom, &nr_io_errors);
14718 + if (ret != -EBUSY)
14719 + break;
14720 + reiser4_atom_wait_event(atom);
14721 + }
14722 + } while (ret == -E_REPEAT);
14723 +
14724 + /* we do not need the atom locked after this function finishes; SUCCESS
14725 + and -EBUSY are the two return codes for which the atom remains locked
14726 + after finish_all_fq */
14727 + if (!ret)
14728 + spin_unlock_atom(atom);
14729 +
14730 + assert_spin_not_locked(&(atom->alock));
14731 +
14732 + if (ret)
14733 + return ret;
14734 +
14735 + if (nr_io_errors)
14736 + return RETERR(-EIO);
14737 +
14738 + return 0;
14739 +}
14740 +
14741 +/* change node->atom field for all jnodes on the given list */
14742 +static void
14743 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14744 +{
14745 + jnode *cur;
14746 +
14747 + list_for_each_entry(cur, list, capture_link) {
14748 + spin_lock_jnode(cur);
14749 + cur->atom = atom;
14750 + spin_unlock_jnode(cur);
14751 + }
14752 +}
14753 +
14754 +/* support for atom fusion operation */
14755 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14756 +{
14757 + flush_queue_t *fq;
14758 +
14759 + assert_spin_locked(&(to->alock));
14760 + assert_spin_locked(&(from->alock));
14761 +
14762 + list_for_each_entry(fq, &from->flush_queues, alink) {
14763 + scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14764 + spin_lock(&(fq->guard));
14765 + fq->atom = to;
14766 + spin_unlock(&(fq->guard));
14767 + }
14768 +
14769 + list_splice_init(&from->flush_queues, to->flush_queues.prev);
14770 +
14771 +#if REISER4_DEBUG
14772 + to->num_queued += from->num_queued;
14773 + to->nr_flush_queues += from->nr_flush_queues;
14774 + from->nr_flush_queues = 0;
14775 +#endif
14776 +}
14777 +
14778 +#if REISER4_DEBUG
14779 +int atom_fq_parts_are_clean(txn_atom * atom)
14780 +{
14781 + assert("zam-915", atom != NULL);
14782 + return list_empty_careful(&atom->flush_queues);
14783 +}
14784 +#endif
14785 +/* Bio i/o completion routine for reiser4 write operations. */
14786 +static int
14787 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
14788 + int err)
14789 +{
14790 + int i;
14791 + int nr_errors = 0;
14792 + flush_queue_t *fq;
14793 +
14794 + assert("zam-958", bio->bi_rw & WRITE);
14795 +
14796 + /* i/o op. is not fully completed */
14797 + if (bio->bi_size != 0)
14798 + return 1;
14799 +
14800 + if (err == -EOPNOTSUPP)
14801 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14802 +
14803 + /* we expect that bio->bi_private is set to NULL or to an fq object which is used
14804 + * for synchronization and error counting. */
14805 + fq = bio->bi_private;
14806 + /* Check all elements of io_vec for correct write completion. */
14807 + for (i = 0; i < bio->bi_vcnt; i += 1) {
14808 + struct page *pg = bio->bi_io_vec[i].bv_page;
14809 +
14810 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14811 + SetPageError(pg);
14812 + nr_errors++;
14813 + }
14814 +
14815 + {
14816 + /* jnode WRITEBACK ("write is in progress bit") is
14817 + * atomically cleared here. */
14818 + jnode *node;
14819 +
14820 + assert("zam-736", pg != NULL);
14821 + assert("zam-736", PagePrivate(pg));
14822 + node = jprivate(pg);
14823 +
14824 + JF_CLR(node, JNODE_WRITEBACK);
14825 + }
14826 +
14827 + end_page_writeback(pg);
14828 + page_cache_release(pg);
14829 + }
14830 +
14831 + if (fq) {
14832 + /* count i/o error in fq object */
14833 + atomic_add(nr_errors, &fq->nr_errors);
14834 +
14835 + /* If all write requests registered in this "fq" are done, we wake
14836 + * up the waiters. */
14837 + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14838 + wake_up(&fq->wait);
14839 + }
14840 +
14841 + bio_put(bio);
14842 + return 0;
14843 +}
14844 +
14845 +/* Count I/O requests which will be submitted by @bio in the given flush
14846 + queue @fq */
14847 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14848 +{
14849 + bio->bi_private = fq;
14850 + bio->bi_end_io = end_io_handler;
14851 +
14852 + if (fq)
14853 + atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14854 +}
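+
+/* An illustrative sketch (editorial; names and arguments are
+ * hypothetical) of how a submission path pairs add_fq_to_bio() with
+ * submit_bio() so that end_io_handler() can wake reiser4_write_fq()
+ * waiters once fq->nr_submitted drops back to zero. The real
+ * submission path is write_jnode_list(), defined elsewhere.
+ */
+#if 0
+static void submit_one_page_example(flush_queue_t *fq, struct page *pg,
+ sector_t blocknr)
+{
+ struct bio *bio = bio_alloc(GFP_NOIO, 1);
+
+ bio->bi_bdev = pg->mapping->host->i_sb->s_bdev;
+ bio->bi_sector = blocknr * (PAGE_CACHE_SIZE >> 9);
+ bio_add_page(bio, pg, PAGE_CACHE_SIZE, 0);
+
+ /* must run after bio_add_page() so that bi_vcnt is final */
+ add_fq_to_bio(fq, bio);
+ submit_bio(WRITE, bio); /* completion invokes end_io_handler() */
+}
+#endif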
14855 +
14856 +/* Move all queued nodes out from @fq->prepped list. */
14857 +static void release_prepped_list(flush_queue_t * fq)
14858 +{
14859 + txn_atom *atom;
14860 +
14861 + assert("zam-904", fq_in_use(fq));
14862 + atom = atom_locked_by_fq(fq);
14863 +
14864 + while (!list_empty(ATOM_FQ_LIST(fq))) {
14865 + jnode *cur;
14866 +
14867 + cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14868 + list_del_init(&cur->capture_link);
14869 +
14870 + count_dequeued_node(fq);
14871 + spin_lock_jnode(cur);
14872 + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14873 + assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14874 + assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14875 + JF_CLR(cur, JNODE_FLUSH_QUEUED);
14876 +
14877 + if (JF_ISSET(cur, JNODE_DIRTY)) {
14878 + list_add_tail(&cur->capture_link,
14879 + ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14880 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14881 + DIRTY_LIST, 1));
14882 + } else {
14883 + list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14884 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14885 + CLEAN_LIST, 1));
14886 + }
14887 +
14888 + spin_unlock_jnode(cur);
14889 + }
14890 +
14891 + if (--atom->nr_running_queues == 0)
14892 + reiser4_atom_send_event(atom);
14893 +
14894 + spin_unlock_atom(atom);
14895 +}
14896 +
14897 +/* Submit write requests for nodes on the already filled flush queue @fq.
14898 +
14899 + @fq: flush queue object which contains jnodes we can (and will) write.
14900 + @return: number of submitted blocks (>=0) on success, otherwise an error
14901 + code (<0). */
14902 +int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14903 +{
14904 + int ret;
14905 + txn_atom *atom;
14906 +
14907 + while (1) {
14908 + atom = atom_locked_by_fq(fq);
14909 + assert("zam-924", atom);
14910 + /* do not write fq in parallel. */
14911 + if (atom->nr_running_queues == 0
14912 + || !(flags & WRITEOUT_SINGLE_STREAM))
14913 + break;
14914 + reiser4_atom_wait_event(atom);
14915 + }
14916 +
14917 + atom->nr_running_queues++;
14918 + spin_unlock_atom(atom);
14919 +
14920 + ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14921 + release_prepped_list(fq);
14922 +
14923 + return ret;
14924 +}
14925 +
14926 +/* Get a flush queue object for exclusive use by one thread. May require
14927 + several iterations, which is indicated by the -E_REPEAT return code.
14928 +
14929 + This function does not contain code for obtaining an atom lock because an
14930 + atom lock is obtained in different ways in different parts of reiser4;
14931 + usually it is the current atom, but we also need the ability to get an fq
14932 + for the atom of a given jnode. */
14933 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14934 +{
14935 + flush_queue_t *fq;
14936 +
14937 + assert_spin_locked(&(atom->alock));
14938 +
14939 + fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14940 + while (&atom->flush_queues != &fq->alink) {
14941 + spin_lock(&(fq->guard));
14942 +
14943 + if (fq_ready(fq)) {
14944 + mark_fq_in_use(fq);
14945 + assert("vs-1246", fq->owner == NULL);
14946 + ON_DEBUG(fq->owner = current);
14947 + spin_unlock(&(fq->guard));
14948 +
14949 + if (*new_fq)
14950 + done_fq(*new_fq);
14951 +
14952 + *new_fq = fq;
14953 +
14954 + return 0;
14955 + }
14956 +
14957 + spin_unlock(&(fq->guard));
14958 +
14959 + fq = list_entry(fq->alink.next, flush_queue_t, alink);
14960 + }
14961 +
14962 + /* Use previously allocated fq object */
14963 + if (*new_fq) {
14964 + mark_fq_in_use(*new_fq);
14965 + assert("vs-1248", (*new_fq)->owner == 0);
14966 + ON_DEBUG((*new_fq)->owner = current);
14967 + attach_fq(atom, *new_fq);
14968 +
14969 + return 0;
14970 + }
14971 +
14972 + spin_unlock_atom(atom);
14973 +
14974 + *new_fq = create_fq(gfp);
14975 +
14976 + if (*new_fq == NULL)
14977 + return RETERR(-ENOMEM);
14978 +
14979 + return RETERR(-E_REPEAT);
14980 +}
14981 +
14982 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
14983 +{
14984 + return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
14985 +}
14986 +
14987 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue
14988 + object for the current atom; on success fq->atom remains locked. */
14989 +flush_queue_t *get_fq_for_current_atom(void)
14990 +{
14991 + flush_queue_t *fq = NULL;
14992 + txn_atom *atom;
14993 + int ret;
14994 +
14995 + do {
14996 + atom = get_current_atom_locked();
14997 + ret = reiser4_fq_by_atom(atom, &fq);
14998 + } while (ret == -E_REPEAT);
14999 +
15000 + if (ret)
15001 + return ERR_PTR(ret);
15002 + return fq;
15003 +}
15004 +
15005 +/* Releasing flush queue object after exclusive use */
15006 +void reiser4_fq_put_nolock(flush_queue_t *fq)
15007 +{
15008 + assert("zam-747", fq->atom != NULL);
15009 + assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15010 + mark_fq_ready(fq);
15011 + assert("vs-1245", fq->owner == current);
15012 + ON_DEBUG(fq->owner = NULL);
15013 +}
15014 +
15015 +void reiser4_fq_put(flush_queue_t * fq)
15016 +{
15017 + txn_atom *atom;
15018 +
15019 + spin_lock(&(fq->guard));
15020 + atom = atom_locked_by_fq_nolock(fq);
15021 +
15022 + assert("zam-746", atom != NULL);
15023 +
15024 + reiser4_fq_put_nolock(fq);
15025 + reiser4_atom_send_event(atom);
15026 +
15027 + spin_unlock(&(fq->guard));
15028 + spin_unlock_atom(atom);
15029 +}
15030 +
15031 +/* A part of atom object initialization related to the embedded flush queue
15032 + list head */
15033 +
15034 +void init_atom_fq_parts(txn_atom *atom)
15035 +{
15036 + INIT_LIST_HEAD(&atom->flush_queues);
15037 +}
15038 +
15039 +#if REISER4_DEBUG
15040 +
15041 +void reiser4_check_fq(const txn_atom *atom)
15042 +{
15043 + /* check number of nodes on all atom's flush queues */
15044 + flush_queue_t *fq;
15045 + int count;
15046 + struct list_head *pos;
15047 +
15048 + count = 0;
15049 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15050 + spin_lock(&(fq->guard));
15051 + /* calculate number of jnodes on fq's list of prepped jnodes */
15052 + list_for_each(pos, ATOM_FQ_LIST(fq))
15053 + count++;
15054 + spin_unlock(&(fq->guard));
15055 + }
15056 + if (count != atom->num_queued)
15057 + warning("", "fq counter %d, real %d\n", atom->num_queued, count);
15058 +
15059 +}
15060 +
15061 +#endif
15062 +
15063 +/*
15064 + * Local variables:
15065 + * c-indentation-style: "K&R"
15066 + * mode-name: "LC"
15067 + * c-basic-offset: 8
15068 + * tab-width: 8
15069 + * fill-column: 79
15070 + * scroll-step: 1
15071 + * End:
15072 + */
15073 diff -urN linux-2.6.22.orig/fs/reiser4/forward.h linux-2.6.22/fs/reiser4/forward.h
15074 --- linux-2.6.22.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15075 +++ linux-2.6.22/fs/reiser4/forward.h 2007-07-29 00:25:34.864693371 +0400
15076 @@ -0,0 +1,252 @@
15077 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15078 +
15079 +/* Forward declarations. Thank you Kernighan. */
15080 +
15081 +#if !defined( __REISER4_FORWARD_H__ )
15082 +#define __REISER4_FORWARD_H__
15083 +
15084 +#include <asm/errno.h>
15085 +#include <linux/types.h>
15086 +
15087 +typedef struct zlock zlock;
15088 +typedef struct lock_stack lock_stack;
15089 +typedef struct lock_handle lock_handle;
15090 +typedef struct znode znode;
15091 +typedef struct flow flow_t;
15092 +typedef struct coord coord_t;
15093 +typedef struct tree_access_pointer tap_t;
15094 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15095 +typedef union reiser4_plugin reiser4_plugin;
15096 +typedef __u16 reiser4_plugin_id;
15097 +typedef __u64 reiser4_plugin_groups;
15098 +typedef struct item_plugin item_plugin;
15099 +typedef struct jnode_plugin jnode_plugin;
15100 +typedef struct reiser4_item_data reiser4_item_data;
15101 +typedef union reiser4_key reiser4_key;
15102 +typedef struct reiser4_tree reiser4_tree;
15103 +typedef struct carry_cut_data carry_cut_data;
15104 +typedef struct carry_kill_data carry_kill_data;
15105 +typedef struct carry_tree_op carry_tree_op;
15106 +typedef struct carry_tree_node carry_tree_node;
15107 +typedef struct carry_plugin_info carry_plugin_info;
15108 +typedef struct reiser4_journal reiser4_journal;
15109 +typedef struct txn_atom txn_atom;
15110 +typedef struct txn_handle txn_handle;
15111 +typedef struct txn_mgr txn_mgr;
15112 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15113 +typedef struct reiser4_context reiser4_context;
15114 +typedef struct carry_level carry_level;
15115 +typedef struct blocknr_set_entry blocknr_set_entry;
15116 +/* super_block->s_fs_info points to this */
15117 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15118 +/* next two objects are fields of reiser4_super_info_data */
15119 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15120 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15121 +
15122 +typedef struct flush_scan flush_scan;
15123 +typedef struct flush_position flush_pos_t;
15124 +
15125 +typedef unsigned short pos_in_node_t;
15126 +#define MAX_POS_IN_NODE 65535
15127 +
15128 +typedef struct jnode jnode;
15129 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15130 +
15131 +typedef struct uf_coord uf_coord_t;
15132 +typedef struct hint hint_t;
15133 +
15134 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15135 +
15136 +struct inode;
15137 +struct page;
15138 +struct file;
15139 +struct dentry;
15140 +struct super_block;
15141 +
15142 +/* return values of coord_by_key(). cbk == coord_by_key */
15143 +typedef enum {
15144 + CBK_COORD_FOUND = 0,
15145 + CBK_COORD_NOTFOUND = -ENOENT,
15146 +} lookup_result;
15147 +
15148 +/* results of lookup with directory file */
15149 +typedef enum {
15150 + FILE_NAME_FOUND = 0,
15151 + FILE_NAME_NOTFOUND = -ENOENT,
15152 + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15153 + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15154 +} file_lookup_result;
15155 +
15156 +/* behaviors of lookup. If the coord we are looking for is actually in the
15157 + tree, both biases coincide. */
15158 +typedef enum {
15159 + /* search exactly for the coord with key given */
15160 + FIND_EXACT,
15161 + /* search for coord with the maximal key not greater than one
15162 + given */
15163 + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15164 +} lookup_bias;
15165 +
15166 +typedef enum {
15167 + /* number of leaf level of the tree
15168 + The fake root has (tree_level=0). */
15169 + LEAF_LEVEL = 1,
15170 +
15171 + /* number of level one above leaf level of the tree.
15172 +
15173 + It is supposed that internal tree used by reiser4 to store file
15174 + system data and meta data will have height 2 initially (when
15175 + created by mkfs).
15176 + */
15177 + TWIG_LEVEL = 2,
15178 +} tree_level;
15179 +
15180 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15181 + array, since the zeroth level is not used. */
15182 +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15183 +
15184 +/* enumeration of possible mutual positions of item and coord. This enum is
15185 + the return type of the ->is_in_item() item plugin method, which see. */
15186 +typedef enum {
15187 + /* coord is on the left of an item */
15188 + IP_ON_THE_LEFT,
15189 + /* coord is inside item */
15190 + IP_INSIDE,
15191 + /* coord is inside item, but to the right of the rightmost unit of
15192 + this item */
15193 + IP_RIGHT_EDGE,
15194 + /* coord is on the right of an item */
15195 + IP_ON_THE_RIGHT
15196 +} interposition;
15197 +
15198 +/* type of lock to acquire on znode before returning it to caller */
15199 +typedef enum {
15200 + ZNODE_NO_LOCK = 0,
15201 + ZNODE_READ_LOCK = 1,
15202 + ZNODE_WRITE_LOCK = 2,
15203 +} znode_lock_mode;
15204 +
15205 +/* type of lock request */
15206 +typedef enum {
15207 + ZNODE_LOCK_LOPRI = 0,
15208 + ZNODE_LOCK_HIPRI = (1 << 0),
15209 +
15210 + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15211 + waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15212 + return the value -E_REPEAT. */
15213 + ZNODE_LOCK_NONBLOCK = (1 << 1),
15214 + /* An option for longterm_lock_znode which prevents atom fusion */
15215 + ZNODE_LOCK_DONT_FUSE = (1 << 2)
15216 +} znode_lock_request;
15217 +
15218 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15219 +
15220 +/* used to specify direction of shift. These must be -1 and 1 */
15221 +typedef enum {
15222 + SHIFT_LEFT = 1,
15223 + SHIFT_RIGHT = -1
15224 +} shift_direction;
15225 +
15226 +typedef enum {
15227 + LEFT_SIDE,
15228 + RIGHT_SIDE
15229 +} sideof;
15230 +
15231 +#define round_up( value, order ) \
15232 + ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15233 + ~( ( order ) - 1 ) ) )
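+/* e.g. round_up(13, 8) == 16; @order must be a power of two */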
15234 +
15235 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15236 +typedef enum {
15237 + /* unit of internal item is moved */
15238 + SUBTREE_MOVED = 0,
15239 + /* nothing else can be squeezed into left neighbor */
15240 + SQUEEZE_TARGET_FULL = 1,
15241 + /* all content of node is squeezed into its left neighbor */
15242 + SQUEEZE_SOURCE_EMPTY = 2,
15243 + /* one more item is copied (this is only returned by
15244 + allocate_and_copy_extent to squalloc_twig) */
15245 + SQUEEZE_CONTINUE = 3
15246 +} squeeze_result;
15247 +
15248 +/* Do not change item ids. If you do, there will be a format change */
15249 +typedef enum {
15250 + STATIC_STAT_DATA_ID = 0x0,
15251 + SIMPLE_DIR_ENTRY_ID = 0x1,
15252 + COMPOUND_DIR_ID = 0x2,
15253 + NODE_POINTER_ID = 0x3,
15254 + EXTENT_POINTER_ID = 0x5,
15255 + FORMATTING_ID = 0x6,
15256 + CTAIL_ID = 0x7,
15257 + BLACK_BOX_ID = 0x8,
15258 + LAST_ITEM_ID = 0x9
15259 +} item_id;
15260 +
15261 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15262 + whether commit() was called or VM memory pressure was applied. */
15263 +typedef enum {
15264 + /* submit flush queue to disk at jnode_flush completion */
15265 + JNODE_FLUSH_WRITE_BLOCKS = 1,
15266 +
15267 + /* flush is called for commit */
15268 + JNODE_FLUSH_COMMIT = 2,
15269 + /* not implemented */
15270 + JNODE_FLUSH_MEMORY_FORMATTED = 4,
15271 +
15272 + /* not implemented */
15273 + JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15274 +} jnode_flush_flags;
15275 +
15276 +/* Flags to insert/paste carry operations. Currently they are only used in
15277 + flushing code, but in the future they can be used to optimize for repetitive
15278 + accesses. */
15279 +typedef enum {
15280 + /* carry is not allowed to shift data to the left when trying to find
15281 + free space */
15282 + COPI_DONT_SHIFT_LEFT = (1 << 0),
15283 + /* carry is not allowed to shift data to the right when trying to find
15284 + free space */
15285 + COPI_DONT_SHIFT_RIGHT = (1 << 1),
15286 + /* carry is not allowed to allocate new node(s) when trying to find
15287 + free space */
15288 + COPI_DONT_ALLOCATE = (1 << 2),
15289 + /* try to load left neighbor if it is not in the cache */
15290 + COPI_LOAD_LEFT = (1 << 3),
15291 + /* try to load right neighbor if it is not in the cache */
15292 + COPI_LOAD_RIGHT = (1 << 4),
15293 + /* shift insertion point to the left neighbor */
15294 + COPI_GO_LEFT = (1 << 5),
15295 + /* shift insertion point to the right neighbor */
15296 + COPI_GO_RIGHT = (1 << 6),
15297 + /* try to step back into original node if insertion into new node
15298 + fails after shifting data there. */
15299 + COPI_STEP_BACK = (1 << 7)
15300 +} cop_insert_flag;
15301 +
15302 +typedef enum {
15303 + SAFE_UNLINK, /* safe-link for unlink */
15304 + SAFE_TRUNCATE /* safe-link for truncate */
15305 +} reiser4_safe_link_t;
15306 +
15307 +/* this shows which of the atom's lists a jnode is on */
15308 +typedef enum {
15309 + NOT_CAPTURED,
15310 + DIRTY_LIST,
15311 + CLEAN_LIST,
15312 + FQ_LIST,
15313 + WB_LIST,
15314 + OVRWR_LIST
15315 +} atom_list;
15316 +
15317 +/* __REISER4_FORWARD_H__ */
15318 +#endif
15319 +
15320 +/* Make Linus happy.
15321 + Local variables:
15322 + c-indentation-style: "K&R"
15323 + mode-name: "LC"
15324 + c-basic-offset: 8
15325 + tab-width: 8
15326 + fill-column: 120
15327 + End:
15328 +*/
15329 diff -urN linux-2.6.22.orig/fs/reiser4/fsdata.c linux-2.6.22/fs/reiser4/fsdata.c
15330 --- linux-2.6.22.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15331 +++ linux-2.6.22/fs/reiser4/fsdata.c 2007-07-29 00:25:34.868694406 +0400
15332 @@ -0,0 +1,808 @@
15333 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15334 + * reiser4/README */
15335 +
15336 +#include "fsdata.h"
15337 +#include "inode.h"
15338 +
15339 +
15340 +/* cache of dir_cursors */
15341 +static struct kmem_cache *d_cursor_cache;
15342 +static struct shrinker *d_cursor_shrinker;
15343 +
15344 +/* list of unused cursors */
15345 +static LIST_HEAD(cursor_cache);
15346 +
15347 +/* number of cursors in list of unused cursors */
15348 +static unsigned long d_cursor_unused = 0;
15349 +
15350 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15351 +DEFINE_SPINLOCK(d_lock);
15352 +
15353 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15354 +static int file_is_stateless(struct file *file);
15355 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15356 +static void kill_cursor(dir_cursor *);
15357 +
15358 +/**
15359 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15360 + * @nr: number of objects to free
15361 + * @mask: GFP mask
15362 + *
15363 + * Shrinks d_cursor_cache. Scans the LRU list of unused cursors, freeing the
15364 + * requested number. Returns the number of still freeable cursors.
15365 + */
15366 +static int d_cursor_shrink(int nr, gfp_t mask)
15367 +{
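+ /*
+ * When @nr is zero the VM is only asking how many objects are
+ * freeable; otherwise free up to @nr unused cursors. Either way,
+ * report the number of cursors still left on the unused list.
+ */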
15368 + if (nr != 0) {
15369 + dir_cursor *scan;
15370 + int killed;
15371 +
15372 + killed = 0;
15373 + spin_lock(&d_lock);
15374 + while (!list_empty(&cursor_cache)) {
15375 + scan = list_entry(cursor_cache.next, dir_cursor, alist);
15376 + assert("nikita-3567", scan->ref == 0);
15377 + kill_cursor(scan);
15378 + ++killed;
15379 + --nr;
15380 + if (nr == 0)
15381 + break;
15382 + }
15383 + spin_unlock(&d_lock);
15384 + }
15385 + return d_cursor_unused;
15386 +}
15387 +
15388 +/**
15389 + * reiser4_init_d_cursor - create d_cursor cache
15390 + *
15391 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15392 + * initialization.
15393 + */
15394 +int reiser4_init_d_cursor(void)
15395 +{
15396 + d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15397 + SLAB_HWCACHE_ALIGN, NULL, NULL);
15398 + if (d_cursor_cache == NULL)
15399 + return RETERR(-ENOMEM);
15400 +
15401 + /*
15402 + * actually, d_cursors are "priceless", because there is no way to
15403 + * recover information stored in them. On the other hand, we don't
15404 + * want to consume all kernel memory by them. As a compromise, just
15405 + * assign higher "seeks" value to d_cursor cache, so that it will be
15406 + * shrunk only if system is really tight on memory.
15407 + */
15408 + d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15409 + d_cursor_shrink);
15410 + if (d_cursor_shrinker == NULL) {
15411 + destroy_reiser4_cache(&d_cursor_cache);
15412 + d_cursor_cache = NULL;
15413 + return RETERR(-ENOMEM);
15414 + }
15415 + return 0;
15416 +}
15417 +
15418 +/**
15419 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15420 + *
15421 + * This is called on reiser4 module unloading or system shutdown.
15422 + */
15423 +void reiser4_done_d_cursor(void)
15424 +{
15425 + BUG_ON(d_cursor_shrinker == NULL);
15426 + remove_shrinker(d_cursor_shrinker);
15427 + d_cursor_shrinker = NULL;
15428 +
15429 + destroy_reiser4_cache(&d_cursor_cache);
15430 +}
15431 +
15432 +#define D_CURSOR_TABLE_SIZE (256)
15433 +
15434 +static inline unsigned long
15435 +d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
15436 +{
15437 + assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15438 + return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15439 +}
15440 +
15441 +static inline int d_cursor_eq(const struct d_cursor_key *k1,
15442 + const struct d_cursor_key *k2)
15443 +{
15444 + return k1->cid == k2->cid && k1->oid == k2->oid;
15445 +}
15446 +
15447 +/*
15448 + * define functions to manipulate reiser4 super block's hash table of
15449 + * dir_cursors
15450 + */
15451 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15452 +#define KFREE(ptr, size) kfree(ptr)
15453 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15454 + dir_cursor,
15455 + struct d_cursor_key,
15456 + key, hash, d_cursor_hash, d_cursor_eq);
15457 +#undef KFREE
15458 +#undef KMALLOC
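+/* The KMALLOC/KFREE pair above is consumed by TYPE_SAFE_HASH_DEFINE,
+ * so the d_cursor hash-table buckets are allocated with the
+ * per-context gfp mask and freed with plain kfree(); that is why the
+ * macros are #define'd and #undef'd around the hash definition. */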
15459 +
15460 +/**
15461 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15462 + * @super: super block to initialize
15463 + *
15464 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15465 + * of mount.
15466 + */
15467 +int reiser4_init_super_d_info(struct super_block *super)
15468 +{
15469 + struct d_cursor_info *p;
15470 +
15471 + p = &get_super_private(super)->d_info;
15472 +
15473 + INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15474 + return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15475 +}
15476 +
15477 +/**
15478 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15479 + * @super: super block being umounted
15480 + *
15481 + * It is called on umount. Kills all directory cursors attached to super block.
15482 + */
15483 +void reiser4_done_super_d_info(struct super_block *super)
15484 +{
15485 + struct d_cursor_info *d_info;
15486 + dir_cursor *cursor, *next;
15487 +
15488 + d_info = &get_super_private(super)->d_info;
15489 + for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15490 + kill_cursor(cursor);
15491 +
15492 + BUG_ON(d_info->tree.rnode != NULL);
15493 + d_cursor_hash_done(&d_info->table);
15494 +}
15495 +
15496 +/**
15497 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15498 + * @cursor: cursor to free
15499 + *
15500 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15501 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
15502 + * indices, hash table, list of unused cursors and frees it.
15503 + */
15504 +static void kill_cursor(dir_cursor *cursor)
15505 +{
15506 + unsigned long index;
15507 +
15508 + assert("nikita-3566", cursor->ref == 0);
15509 + assert("nikita-3572", cursor->fsdata != NULL);
15510 +
15511 + index = (unsigned long)cursor->key.oid;
15512 + list_del_init(&cursor->fsdata->dir.linkage);
15513 + free_fsdata(cursor->fsdata);
15514 + cursor->fsdata = NULL;
15515 +
15516 + if (list_empty_careful(&cursor->list))
15517 + /* this is last cursor for a file. Kill radix-tree entry */
15518 + radix_tree_delete(&cursor->info->tree, index);
15519 + else {
15520 + void **slot;
15521 +
15522 + /*
15523 + * there are other cursors for the same oid.
15524 + */
15525 +
15526 + /*
15527 + * if the radix tree points to the cursor being removed, re-target
15528 + * the radix-tree slot to the next cursor in the (non-empty, as was
15529 + * checked above) circular list of all cursors
15530 + * for this oid.
15531 + */
15532 + slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15533 + assert("nikita-3571", *slot != NULL);
15534 + if (*slot == cursor)
15535 + *slot = list_entry(cursor->list.next, dir_cursor, list);
15536 + /* remove cursor from circular list */
15537 + list_del_init(&cursor->list);
15538 + }
15539 + /* remove cursor from the list of unused cursors */
15540 + list_del_init(&cursor->alist);
15541 + /* remove cursor from the hash table */
15542 + d_cursor_hash_remove(&cursor->info->table, cursor);
15543 + /* and free it */
15544 + kmem_cache_free(d_cursor_cache, cursor);
15545 + --d_cursor_unused;
15546 +}
15547 +
15548 +/* possible actions that can be performed on all cursors for the given file */
15549 +enum cursor_action {
15550 + /*
15551 + * load all detached state: this is called when stat-data is loaded
15552 + * from the disk to recover information about all pending readdirs
15553 + */
15554 + CURSOR_LOAD,
15555 + /*
15556 + * detach all state from inode, leaving it in the cache. This is called
15557 + * when inode is removed from memory by memory pressure
15558 + */
15559 + CURSOR_DISPOSE,
15560 + /*
15561 + * detach cursors from the inode, and free them. This is called when
15562 + * inode is destroyed
15563 + */
15564 + CURSOR_KILL
15565 +};
15566 +
15567 +/*
15568 + * return d_cursor data for the file system @inode is in.
15569 + */
15570 +static inline struct d_cursor_info *d_info(struct inode *inode)
15571 +{
15572 + return &get_super_private(inode->i_sb)->d_info;
15573 +}
15574 +
15575 +/*
15576 + * lookup d_cursor in the per-super-block radix tree.
15577 + */
15578 +static inline dir_cursor *lookup(struct d_cursor_info * info,
15579 + unsigned long index)
15580 +{
15581 + return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15582 +}
15583 +
15584 +/*
15585 + * attach @cursor to the radix tree. There may be multiple cursors for the
15586 + * same oid; they are chained into a circular list.
15587 + */
15588 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15589 +{
15590 + dir_cursor *head;
15591 +
15592 + head = lookup(cursor->info, index);
15593 + if (head == NULL) {
15594 + /* this is the first cursor for this index */
15595 + INIT_LIST_HEAD(&cursor->list);
15596 + radix_tree_insert(&cursor->info->tree, index, cursor);
15597 + } else {
15598 + /* some cursor already exists. Chain ours */
15599 + list_add(&cursor->list, &head->list);
15600 + }
15601 +}
15602 +
15603 +/*
15604 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
15605 + * "unused" list. Called when file descriptor is not longer in active use.
15606 + */
15607 +static void clean_fsdata(struct file *file)
15608 +{
15609 + dir_cursor *cursor;
15610 + reiser4_file_fsdata *fsdata;
15611 +
15612 + assert("nikita-3570", file_is_stateless(file));
15613 +
15614 + fsdata = (reiser4_file_fsdata *) file->private_data;
15615 + if (fsdata != NULL) {
15616 + cursor = fsdata->cursor;
15617 + if (cursor != NULL) {
15618 + spin_lock(&d_lock);
15619 + --cursor->ref;
15620 + if (cursor->ref == 0) {
15621 + list_add_tail(&cursor->alist, &cursor_cache);
15622 + ++d_cursor_unused;
15623 + }
15624 + spin_unlock(&d_lock);
15625 + file->private_data = NULL;
15626 + }
15627 + }
15628 +}
15629 +
15630 +/*
15631 + * global counter used to generate "client ids". These ids are encoded into
15632 + * high bits of fpos.
15633 + */
15634 +static __u32 cid_counter = 0;
15635 +#define CID_SHIFT (20)
15636 +#define CID_MASK (0xfffffull)
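+/* Worked example (hypothetical values): with cid == 3 and a directory
+ * offset of 42, insert_cursor() below stores
+ * f_pos = (3 << CID_SHIFT) | 42 == 0x30002a, and
+ * reiser4_get_dir_fpos() recovers the offset as
+ * 0x30002a & CID_MASK == 42. */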
15637 +
15638 +static void free_file_fsdata_nolock(struct file *);
15639 +
15640 +/**
15641 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15642 + * @cursor:
15643 + * @file:
15644 + * @inode:
15645 + *
15646 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15647 + * reiser4 super block's hash table and radix tree, and adds detachable
15648 + * readdir state to @file.
15650 + */
15651 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15652 + struct inode *inode)
15653 +{
15654 + int result;
15655 + reiser4_file_fsdata *fsdata;
15656 +
15657 + memset(cursor, 0, sizeof *cursor);
15658 +
15659 + /* this is either the first call to readdir, or a rewind. Either way,
15660 + * create a new cursor. */
15661 + fsdata = create_fsdata(NULL);
15662 + if (fsdata != NULL) {
15663 + result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15664 + if (result == 0) {
15665 + struct d_cursor_info *info;
15666 + oid_t oid;
15667 +
15668 + info = d_info(inode);
15669 + oid = get_inode_oid(inode);
15670 + /* cid occupies higher 12 bits of f->f_pos. Don't
15671 + * allow it to become negative: this confuses
15672 + * nfsd_readdir() */
15673 + cursor->key.cid = (++cid_counter) & 0x7ff;
15674 + cursor->key.oid = oid;
15675 + cursor->fsdata = fsdata;
15676 + cursor->info = info;
15677 + cursor->ref = 1;
15678 +
15679 + spin_lock_inode(inode);
15680 + /* install cursor as @f's private_data, discarding old
15681 + * one if necessary */
15682 +#if REISER4_DEBUG
15683 + if (file->private_data)
15684 + warning("", "file has fsdata already");
15685 +#endif
15686 + clean_fsdata(file);
15687 + free_file_fsdata_nolock(file);
15688 + file->private_data = fsdata;
15689 + fsdata->cursor = cursor;
15690 + spin_unlock_inode(inode);
15691 + spin_lock(&d_lock);
15692 + /* insert cursor into hash table */
15693 + d_cursor_hash_insert(&info->table, cursor);
15694 + /* and chain it into radix-tree */
15695 + bind_cursor(cursor, (unsigned long)oid);
15696 + spin_unlock(&d_lock);
15697 + radix_tree_preload_end();
15698 + file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15699 + }
15700 + } else
15701 + result = RETERR(-ENOMEM);
15702 + return result;
15703 +}
15704 +
15705 +/**
15706 + * process_cursors - do action on each cursor attached to inode
15707 + * @inode:
15708 + * @act: action to do
15709 + *
15710 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15711 + * and performs action specified by @act on each of cursors.
15712 + */
15713 +static void process_cursors(struct inode *inode, enum cursor_action act)
15714 +{
15715 + oid_t oid;
15716 + dir_cursor *start;
15717 + struct list_head *head;
15718 + reiser4_context *ctx;
15719 + struct d_cursor_info *info;
15720 +
15721 + /* this can be called by
15722 + *
15723 + * kswapd->...->prune_icache->..reiser4_destroy_inode
15724 + *
15725 + * without reiser4_context
15726 + */
15727 + ctx = reiser4_init_context(inode->i_sb);
15728 + if (IS_ERR(ctx)) {
15729 + warning("vs-23", "failed to init context");
15730 + return;
15731 + }
15732 +
15733 + assert("nikita-3558", inode != NULL);
15734 +
15735 + info = d_info(inode);
15736 + oid = get_inode_oid(inode);
15737 + spin_lock_inode(inode);
15738 + head = get_readdir_list(inode);
15739 + spin_lock(&d_lock);
15740 + /* find any cursor for this oid: a reference to it is hanging off the
15741 + * radix tree */
15742 + start = lookup(info, (unsigned long)oid);
15743 + if (start != NULL) {
15744 + dir_cursor *scan;
15745 + reiser4_file_fsdata *fsdata;
15746 +
15747 + /* process circular list of cursors for this oid */
15748 + scan = start;
15749 + do {
15750 + dir_cursor *next;
15751 +
15752 + next = list_entry(scan->list.next, dir_cursor, list);
15753 + fsdata = scan->fsdata;
15754 + assert("nikita-3557", fsdata != NULL);
15755 + if (scan->key.oid == oid) {
15756 + switch (act) {
15757 + case CURSOR_DISPOSE:
15758 + list_del_init(&fsdata->dir.linkage);
15759 + break;
15760 + case CURSOR_LOAD:
15761 + list_add(&fsdata->dir.linkage, head);
15762 + break;
15763 + case CURSOR_KILL:
15764 + kill_cursor(scan);
15765 + break;
15766 + }
15767 + }
15768 + if (scan == next)
15769 + /* last cursor was just killed */
15770 + break;
15771 + scan = next;
15772 + } while (scan != start);
15773 + }
15774 + spin_unlock(&d_lock);
15775 + /* check that we killed 'em all */
15776 + assert("nikita-3568",
15777 + ergo(act == CURSOR_KILL,
15778 + list_empty_careful(get_readdir_list(inode))));
15779 + assert("nikita-3569",
15780 + ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15781 + spin_unlock_inode(inode);
15782 + reiser4_exit_context(ctx);
15783 +}
15784 +
15785 +/**
15786 + * reiser4_dispose_cursors - removes cursors from inode's list
15787 + * @inode: inode to dispose cursors of
15788 + *
15789 + * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
15790 + * attached to cursor from inode's readdir list. This is called when inode is
15791 + * removed from the memory by memory pressure.
15792 + */
15793 +void reiser4_dispose_cursors(struct inode *inode)
15794 +{
15795 + process_cursors(inode, CURSOR_DISPOSE);
15796 +}
15797 +
15798 +/**
15799 + * reiser4_load_cursors - attach cursors to inode
15800 + * @inode: inode to load cursors to
15801 + *
15802 + * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
15803 + * attached to cursor to inode's readdir list. This is done when inode is
15804 + * loaded into memory.
15805 + */
15806 +void reiser4_load_cursors(struct inode *inode)
15807 +{
15808 + process_cursors(inode, CURSOR_LOAD);
15809 +}
15810 +
15811 +/**
15812 + * reiser4_kill_cursors - kill all inode cursors
15813 + * @inode: inode to kill cursors of
15814 + *
15815 + * Frees all cursors for this inode. This is called when inode is destroyed.
15816 + */
15817 +void reiser4_kill_cursors(struct inode *inode)
15818 +{
15819 + process_cursors(inode, CURSOR_KILL);
15820 +}
15821 +
15822 +/**
15823 + * file_is_stateless - check whether file descriptor was created on demand
15824 + * @file: file descriptor to check
15825 + *
15826 + * Returns true if file descriptor @file was created by the NFS server on
15827 + * demand to serve one file system operation. This means that there may be
15828 + * "detached state" for the underlying inode.
15829 + */
15830 +static int file_is_stateless(struct file *file)
15831 +{
15832 + return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15833 +}
15834 +
15835 +/**
15836 + * reiser4_get_dir_fpos - calculate position within directory
15837 + * @dir: directory file to get position within
15838 + *
15839 + * Calculates ->fpos from the user-supplied cookie. Normally it is dir->f_pos,
15840 + * but for a stateless directory operation (readdir-over-nfs) the client id
15841 + * was encoded in the high bits of the cookie and should be masked off.
15842 + */
15843 +loff_t reiser4_get_dir_fpos(struct file *dir)
15844 +{
15845 + if (file_is_stateless(dir))
15846 + return dir->f_pos & CID_MASK;
15847 + else
15848 + return dir->f_pos;
15849 +}
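/*
 * A userspace sketch of the readdir cookie layout relied upon above and in
 * reiser4_attach_fsdata() below: the NFS client id lives in the high bits,
 * the directory position in the low bits. The CID_SHIFT/CID_MASK values here
 * are assumed for illustration only; the real definitions live elsewhere in
 * this patch.
 */
#include <stdio.h>
#include <stdint.h>

#define CID_SHIFT 20				/* assumed field split */
#define CID_MASK  ((1ULL << CID_SHIFT) - 1)	/* low bits: position */

int main(void)
{
	uint64_t cid = 3;	/* NFS client id */
	uint64_t pos = 42;	/* position within directory */
	uint64_t cookie = (cid << CID_SHIFT) | pos;

	/* what reiser4_get_dir_fpos() computes for stateless files */
	printf("fpos = %llu\n", (unsigned long long)(cookie & CID_MASK));
	/* what reiser4_attach_fsdata() recovers as key.cid */
	printf("cid  = %llu\n", (unsigned long long)(cookie >> CID_SHIFT));
	return 0;
}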
15850 +
15851 +/**
15852 + * reiser4_attach_fsdata - try to attach fsdata
15853 + * @file: file to attach fsdata to
15854 + * @inode: inode of directory being read
15855 + *
15856 + * Finds or creates cursor for readdir-over-nfs.
15857 + */
15858 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15859 +{
15860 + loff_t pos;
15861 + int result;
15862 + dir_cursor *cursor;
15863 +
15864 + /*
15865 + * we are serialized by inode->i_mutex
15866 + */
15867 + if (!file_is_stateless(file))
15868 + return 0;
15869 +
15870 + pos = file->f_pos;
15871 + result = 0;
15872 + if (pos == 0) {
15873 + /*
15874 + * first call to readdir (or rewind to the beginning of
15875 + * directory)
15876 + */
15877 + cursor = kmem_cache_alloc(d_cursor_cache,
15878 + reiser4_ctx_gfp_mask_get());
15879 + if (cursor != NULL)
15880 + result = insert_cursor(cursor, file, inode);
15881 + else
15882 + result = RETERR(-ENOMEM);
15883 + } else {
15884 + /* try to find existing cursor */
15885 + struct d_cursor_key key;
15886 +
15887 + key.cid = pos >> CID_SHIFT;
15888 + key.oid = get_inode_oid(inode);
15889 + spin_lock(&d_lock);
15890 + cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15891 + if (cursor != NULL) {
15892 + /* cursor was found */
15893 + if (cursor->ref == 0) {
15894 + /* move it from unused list */
15895 + list_del_init(&cursor->alist);
15896 + --d_cursor_unused;
15897 + }
15898 + ++cursor->ref;
15899 + }
15900 + spin_unlock(&d_lock);
15901 + if (cursor != NULL) {
15902 + spin_lock_inode(inode);
15903 + assert("nikita-3556", cursor->fsdata->back == NULL);
15904 + clean_fsdata(file);
15905 + free_file_fsdata_nolock(file);
15906 + file->private_data = cursor->fsdata;
15907 + spin_unlock_inode(inode);
15908 + }
15909 + }
15910 + return result;
15911 +}
15912 +
15913 +/**
15914 + * reiser4_detach_fsdata - detach fsdata from struct file
15915 + * @file: file to detach fsdata from
15916 + *
15917 + * Detaches fsdata, if necessary.
15918 + */
15919 +void reiser4_detach_fsdata(struct file *file)
15920 +{
15921 + struct inode *inode;
15922 +
15923 + if (!file_is_stateless(file))
15924 + return;
15925 +
15926 + inode = file->f_dentry->d_inode;
15927 + spin_lock_inode(inode);
15928 + clean_fsdata(file);
15929 + spin_unlock_inode(inode);
15930 +}
15931 +
15932 +/* slab for reiser4_dentry_fsdata */
15933 +static struct kmem_cache *dentry_fsdata_cache;
15934 +
15935 +/**
15936 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15937 + *
15938 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
15939 + * part of reiser4 module initialization.
15940 + */
15941 +int reiser4_init_dentry_fsdata(void)
15942 +{
15943 + dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15944 + sizeof(struct reiser4_dentry_fsdata),
15945 + 0,
15946 + SLAB_HWCACHE_ALIGN |
15947 + SLAB_RECLAIM_ACCOUNT, NULL,
15948 + NULL);
15949 + if (dentry_fsdata_cache == NULL)
15950 + return RETERR(-ENOMEM);
15951 + return 0;
15952 +}
15953 +
15954 +/**
15955 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15956 + *
15957 + * This is called on reiser4 module unloading or system shutdown.
15958 + */
15959 +void reiser4_done_dentry_fsdata(void)
15960 +{
15961 + destroy_reiser4_cache(&dentry_fsdata_cache);
15962 +}
15963 +
15964 +/**
15965 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
15966 + * @dentry: queried dentry
15967 + *
15968 + * Allocates if necessary and returns per-dentry data that we attach to each
15969 + * dentry.
15970 + */
15971 +struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
15972 +{
15973 + assert("nikita-1365", dentry != NULL);
15974 +
15975 + if (dentry->d_fsdata == NULL) {
15976 + dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
15977 + reiser4_ctx_gfp_mask_get());
15978 + if (dentry->d_fsdata == NULL)
15979 + return ERR_PTR(RETERR(-ENOMEM));
15980 + memset(dentry->d_fsdata, 0,
15981 + sizeof(struct reiser4_dentry_fsdata));
15982 + }
15983 + return dentry->d_fsdata;
15984 +}
15985 +
15986 +/**
15987 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
15988 + * @dentry: dentry to free fsdata of
15989 + *
15990 + * Detaches and frees fs-specific dentry data
15991 + */
15992 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
15993 +{
15994 + if (dentry->d_fsdata != NULL) {
15995 + kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
15996 + dentry->d_fsdata = NULL;
15997 + }
15998 +}
15999 +
16000 +/* slab for reiser4_file_fsdata */
16001 +static struct kmem_cache *file_fsdata_cache;
16002 +
16003 +/**
16004 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16005 + *
16006 + * Initializes slab cache of structures attached to file->private_data. It is
16007 + * part of reiser4 module initialization.
16008 + */
16009 +int reiser4_init_file_fsdata(void)
16010 +{
16011 + file_fsdata_cache = kmem_cache_create("file_fsdata",
16012 + sizeof(reiser4_file_fsdata),
16013 + 0,
16014 + SLAB_HWCACHE_ALIGN |
16015 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
16016 + if (file_fsdata_cache == NULL)
16017 + return RETERR(-ENOMEM);
16018 + return 0;
16019 +}
16020 +
16021 +/**
16022 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16023 + *
16024 + * This is called on reiser4 module unloading or system shutdown.
16025 + */
16026 +void reiser4_done_file_fsdata(void)
16027 +{
16028 + destroy_reiser4_cache(&file_fsdata_cache);
16029 +}
16030 +
16031 +/**
16032 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16033 + * @file: what to create file_fsdata for, may be NULL
16034 + *
16035 + * Allocates and initializes reiser4_file_fsdata structure.
16036 + */
16037 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16038 +{
16039 + reiser4_file_fsdata *fsdata;
16040 +
16041 + fsdata = kmem_cache_alloc(file_fsdata_cache,
16042 + reiser4_ctx_gfp_mask_get());
16043 + if (fsdata != NULL) {
16044 + memset(fsdata, 0, sizeof *fsdata);
16045 + fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16046 + fsdata->back = file;
16047 + INIT_LIST_HEAD(&fsdata->dir.linkage);
16048 + }
16049 + return fsdata;
16050 +}
16051 +
16052 +/**
16053 + * free_fsdata - free reiser4_file_fsdata
16054 + * @fsdata: object to free
16055 + *
16056 + * Dual to create_fsdata(). Frees reiser4_file_fsdata.
16057 + */
16058 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16059 +{
16060 + BUG_ON(fsdata == NULL);
16061 + kmem_cache_free(file_fsdata_cache, fsdata);
16062 +}
16063 +
16064 +/**
16065 + * reiser4_get_file_fsdata - get fs-specific file data
16066 + * @file: queried file
16067 + *
16068 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16069 + * to @file.
16070 + */
16071 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16072 +{
16073 + assert("nikita-1603", file != NULL);
16074 +
16075 + if (file->private_data == NULL) {
16076 + reiser4_file_fsdata *fsdata;
16077 + struct inode *inode;
16078 +
16079 + fsdata = create_fsdata(file);
16080 + if (fsdata == NULL)
16081 + return ERR_PTR(RETERR(-ENOMEM));
16082 +
16083 + inode = file->f_dentry->d_inode;
16084 + spin_lock_inode(inode);
16085 + if (file->private_data == NULL) {
16086 + file->private_data = fsdata;
16087 + fsdata = NULL;
16088 + }
16089 + spin_unlock_inode(inode);
16090 + if (fsdata != NULL)
16091 + /* other thread initialized ->fsdata */
16092 + kmem_cache_free(file_fsdata_cache, fsdata);
16093 + }
16094 + assert("nikita-2665", file->private_data != NULL);
16095 + return file->private_data;
16096 +}
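/*
 * A minimal userspace sketch (pthread mutex standing in for the inode spin
 * lock, hypothetical names) of the allocate-then-recheck pattern used by
 * reiser4_get_file_fsdata() above: allocate without the lock held, publish
 * under the lock only if the field is still unset, and free the loser's copy.
 */
#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t lock;	/* assumed already initialized */
	void *priv;
};

static void *get_priv(struct obj *o)
{
	if (o->priv == NULL) {
		void *fresh = malloc(64);	/* may race with others */

		if (fresh == NULL)
			return NULL;
		pthread_mutex_lock(&o->lock);
		if (o->priv == NULL) {
			o->priv = fresh;	/* we won: publish */
			fresh = NULL;
		}
		pthread_mutex_unlock(&o->lock);
		free(fresh);	/* no-op if we won the race */
	}
	return o->priv;
}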
16097 +
16098 +/**
16099 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16100 + * @file: file to detach and free fsdata of
16101 + *
16102 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16103 + * readdir list, frees if it is not linked to d_cursor object.
16104 + */
16105 +static void free_file_fsdata_nolock(struct file *file)
16106 +{
16107 + reiser4_file_fsdata *fsdata;
16108 +
16109 + assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16110 + fsdata = file->private_data;
16111 + if (fsdata != NULL) {
16112 + list_del_init(&fsdata->dir.linkage);
16113 + if (fsdata->cursor == NULL)
16114 + free_fsdata(fsdata);
16115 + }
16116 + file->private_data = NULL;
16117 +}
16118 +
16119 +/**
16120 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16121 + * @file: file to free fsdata of
16122 + *
16123 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16124 + */
16125 +void reiser4_free_file_fsdata(struct file *file)
16126 +{
16127 + spin_lock_inode(file->f_dentry->d_inode);
16128 + free_file_fsdata_nolock(file);
16129 + spin_unlock_inode(file->f_dentry->d_inode);
16130 +}
16131 +
16132 +/*
16133 + * Local variables:
16134 + * c-indentation-style: "K&R"
16135 + * mode-name: "LC"
16136 + * c-basic-offset: 8
16137 + * tab-width: 8
16138 + * fill-column: 79
16139 + * End:
16140 + */
16141 diff -urN linux-2.6.22.orig/fs/reiser4/fsdata.h linux-2.6.22/fs/reiser4/fsdata.h
16142 --- linux-2.6.22.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16143 +++ linux-2.6.22/fs/reiser4/fsdata.h 2007-07-29 00:25:34.868694406 +0400
16144 @@ -0,0 +1,205 @@
16145 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16146 + * reiser4/README */
16147 +
16148 +#if !defined( __REISER4_FSDATA_H__ )
16149 +#define __REISER4_FSDATA_H__
16150 +
16151 +#include "debug.h"
16152 +#include "kassign.h"
16153 +#include "seal.h"
16154 +#include "type_safe_hash.h"
16155 +#include "plugin/file/file.h"
16156 +#include "readahead.h"
16157 +
16158 +/*
16159 + * reiser4_dentry_fsdata is the reiser4-specific data attached to each
16160 + * dentry; see the definition of struct reiser4_dentry_fsdata below for
16161 + * details.
16162 + */
16163 +
16164 +/*
16165 + * locking: the per-file-descriptor fields readdir_pos and ->f_pos are
16166 + * protected by ->i_mutex on the inode. Under this lock the following
16167 + * invariant holds:
16168 + *
16169 + * the file descriptor is "looking" at the entry_no-th directory entry from
16170 + * the beginning of the directory. This entry has key dir_entry_key and is
16171 + * the pos-th entry within its duplicate-key sequence.
16172 + *
16173 + */
16174 +
16175 +/* logical position within directory */
16176 +struct dir_pos {
16177 + /* key of directory entry (actually, part of a key sufficient to
16178 + identify directory entry) */
16179 + de_id dir_entry_key;
16180 + /* ordinal number of directory entry among all entries with the same
16181 + key. (Starting from 0.) */
16182 + unsigned pos;
16183 +};
16184 +
16185 +struct readdir_pos {
16186 + /* f_pos corresponding to this readdir position */
16187 + __u64 fpos;
16188 + /* logical position within directory */
16189 + struct dir_pos position;
16190 + /* logical number of directory entry within
16191 + directory */
16192 + __u64 entry_no;
16193 +};
16194 +
16195 +/*
16196 + * this is used to speed up lookups for directory entry: on initial call to
16197 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16198 + * in struct dentry and reused later to avoid tree traversals.
16199 + */
16200 +struct de_location {
16201 + /* seal covering directory entry */
16202 + seal_t entry_seal;
16203 + /* coord of directory entry */
16204 + coord_t entry_coord;
16205 + /* ordinal number of directory entry among all entries with the same
16206 + key. (Starting from 0.) */
16207 + int pos;
16208 +};
16209 +
16210 +/**
16211 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16212 + *
16213 + * This is allocated dynamically and released in d_op->d_release()
16214 + *
16215 + * Currently it only contains cached location (hint) of directory entry, but
16216 + * it is expected that other information will be accumulated here.
16217 + */
16218 +struct reiser4_dentry_fsdata {
16219 + /*
16220 + * here will go fields filled by ->lookup() to speedup next
16221 + * create/unlink, like blocknr of znode with stat-data, or key of
16222 + * stat-data.
16223 + */
16224 + struct de_location dec;
16225 + int stateless; /* created through reiser4_decode_fh, needs special
16226 + * treatment in readdir. */
16227 +};
16228 +
16229 +extern int reiser4_init_dentry_fsdata(void);
16230 +extern void reiser4_done_dentry_fsdata(void);
16231 +extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16232 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16233 +
16234 +/**
16235 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16236 + *
16237 + * This is allocated dynamically and released in inode->i_fop->release
16238 + */
16239 +typedef struct reiser4_file_fsdata {
16240 + /*
16241 + * pointer back to the struct file which this reiser4_file_fsdata is
16242 + * part of
16243 + */
16244 + struct file *back;
16245 + /* detached cursor for stateless readdir. */
16246 + struct dir_cursor *cursor;
16247 + /*
16248 + * We need both directory and regular file parts here, because there
16249 + * are file system objects that are files and directories.
16250 + */
16251 + struct {
16252 + /*
16253 + * position in directory. It is updated each time directory is
16254 + * modified
16255 + */
16256 + struct readdir_pos readdir;
16257 + /* head of this list is reiser4_inode->lists.readdir_list */
16258 + struct list_head linkage;
16259 + } dir;
16260 + /* hints to speed up operations with regular files: read and write. */
16261 + struct {
16262 + hint_t hint;
16263 + } reg;
16264 + struct reiser4_file_ra_state ra1;
16265 +
16266 +} reiser4_file_fsdata;
16267 +
16268 +extern int reiser4_init_file_fsdata(void);
16269 +extern void reiser4_done_file_fsdata(void);
16270 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16271 +extern void reiser4_free_file_fsdata(struct file *);
16272 +
16273 +/*
16274 + * d_cursor is a reiser4_file_fsdata not attached to struct file. d_cursors
16275 + * are used to address a problem reiser4 has with readdir accesses via NFS. See
16276 + * plugin/file_ops_readdir.c for more details.
16277 + */
16278 +struct d_cursor_key{
16279 + __u16 cid;
16280 + __u64 oid;
16281 +};
16282 +
16283 +/*
16284 + * define the structures d_cursor_hash_table and d_cursor_hash_link, which are
16285 + * used to maintain the hash table of dir_cursor-s in reiser4's super block
16286 + */
16287 +typedef struct dir_cursor dir_cursor;
16288 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16289 +
16290 +struct dir_cursor {
16291 + int ref;
16292 + reiser4_file_fsdata *fsdata;
16293 +
16294 + /* link to reiser4 super block hash table of cursors */
16295 + d_cursor_hash_link hash;
16296 +
16297 + /*
16298 + * this is to link cursors to reiser4 super block's radix tree of
16299 + * cursors if there is more than one cursor with the same objectid
16300 + */
16301 + struct list_head list;
16302 + struct d_cursor_key key;
16303 + struct d_cursor_info *info;
16304 + /* list of unused cursors */
16305 + struct list_head alist;
16306 +};
16307 +
16308 +extern int reiser4_init_d_cursor(void);
16309 +extern void reiser4_done_d_cursor(void);
16310 +
16311 +extern int reiser4_init_super_d_info(struct super_block *);
16312 +extern void reiser4_done_super_d_info(struct super_block *);
16313 +
16314 +extern loff_t reiser4_get_dir_fpos(struct file *);
16315 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16316 +extern void reiser4_detach_fsdata(struct file *);
16317 +
16318 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16319 + more details */
16320 +void reiser4_dispose_cursors(struct inode *inode);
16321 +void reiser4_load_cursors(struct inode *inode);
16322 +void reiser4_kill_cursors(struct inode *inode);
16323 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16324 + int offset, int adj);
16325 +
16326 +/*
16327 + * this structure is embedded in reiser4_super_info_data. It maintains d_cursors
16328 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16329 + */
16330 +struct d_cursor_info {
16331 + d_cursor_hash_table table;
16332 + struct radix_tree_root tree;
16333 +};
16334 +
16335 +/* spinlock protecting readdir cursors */
16336 +extern spinlock_t d_lock;
16337 +
16338 +/* __REISER4_FSDATA_H__ */
16339 +#endif
16340 +
16341 +/*
16342 + * Local variables:
16343 + * c-indentation-style: "K&R"
16344 + * mode-name: "LC"
16345 + * c-basic-offset: 8
16346 + * tab-width: 8
16347 + * fill-column: 120
16348 + * End:
16349 + */
16350 diff -urN linux-2.6.22.orig/fs/reiser4/init_super.c linux-2.6.22/fs/reiser4/init_super.c
16351 --- linux-2.6.22.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16352 +++ linux-2.6.22/fs/reiser4/init_super.c 2007-07-29 00:25:34.868694406 +0400
16353 @@ -0,0 +1,752 @@
16354 +/* Copyright by Hans Reiser, 2003 */
16355 +
16356 +#include "super.h"
16357 +#include "inode.h"
16358 +#include "plugin/plugin_set.h"
16359 +
16360 +#include <linux/swap.h>
16361 +
16362 +/**
16363 + * reiser4_init_fs_info - allocate reiser4 specific super block
16364 + * @super: super block of filesystem
16365 + *
16366 + * Allocates and initializes reiser4_super_info_data, attaches it to
16367 + * super->s_fs_info, initializes structures maintaining d_cursor-s.
16368 + */
16369 +int reiser4_init_fs_info(struct super_block *super)
16370 +{
16371 + reiser4_super_info_data *sbinfo;
16372 +
16373 + sbinfo = kmalloc(sizeof(reiser4_super_info_data),
16374 + reiser4_ctx_gfp_mask_get());
16375 + if (!sbinfo)
16376 + return RETERR(-ENOMEM);
16377 +
16378 + super->s_fs_info = sbinfo;
16379 + super->s_op = NULL;
16380 + memset(sbinfo, 0, sizeof(*sbinfo));
16381 +
16382 + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16383 + ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16384 +
16385 + mutex_init(&sbinfo->delete_mutex);
16386 + spin_lock_init(&(sbinfo->guard));
16387 +
16388 + /* initialize per-super-block d_cursor resources */
16389 + reiser4_init_super_d_info(super);
16390 +
16391 + return 0;
16392 +}
16393 +
16394 +/**
16395 + * reiser4_done_fs_info - free reiser4 specific super block
16396 + * @super: super block of filesystem
16397 + *
16398 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
16399 + * frees reiser4_super_info_data.
16400 + */
16401 +void reiser4_done_fs_info(struct super_block *super)
16402 +{
16403 + assert("zam-990", super->s_fs_info != NULL);
16404 +
16405 + /* release per-super-block d_cursor resources */
16406 + reiser4_done_super_d_info(super);
16407 +
16408 +	/* make sure that there are no jnodes left */
16409 + assert("", list_empty(&get_super_private(super)->all_jnodes));
16410 + assert("", get_current_context()->trans->atom == NULL);
16411 + reiser4_check_block_counters(super);
16412 + kfree(super->s_fs_info);
16413 + super->s_fs_info = NULL;
16414 +}
16415 +
16416 +/* type of option parseable by parse_option() */
16417 +typedef enum {
16418 + /* value of option is arbitrary string */
16419 + OPT_STRING,
16420 +
16421 + /*
16422 + * option specifies bit in a bitmask. When option is set - bit in
16423 + * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16424 + * dont_load_bitmap, atomic_write.
16425 + */
16426 + OPT_BIT,
16427 +
16428 + /*
16429 +	 * value of option should conform to an sscanf() format. Examples are
16430 + * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16431 + */
16432 + OPT_FORMAT,
16433 +
16434 + /*
16435 + * option can take one of predefined values. Example is onerror=panic or
16436 + * onerror=remount-ro
16437 + */
16438 + OPT_ONEOF,
16439 +} opt_type_t;
16440 +
16441 +#if 0
16442 +struct opt_bitmask_bit {
16443 + const char *bit_name;
16444 + int bit_nr;
16445 +};
16446 +#endif
16447 +
16448 +/* description of option parseable by parse_option() */
16449 +struct opt_desc {
16450 + /* option name.
16451 +
16452 +	   parsed portion of string has the form "name=value".
16453 + */
16454 + const char *name;
16455 + /* type of option */
16456 + opt_type_t type;
16457 + union {
16458 + /* where to store value of string option (type == OPT_STRING) */
16459 + char **string;
16460 + /* description of bits for bit option (type == OPT_BIT) */
16461 + struct {
16462 + int nr;
16463 + void *addr;
16464 + } bit;
16465 + /* description of format and targets for format option (type
16466 + == OPT_FORMAT) */
16467 + struct {
16468 + const char *format;
16469 + int nr_args;
16470 + void *arg1;
16471 + void *arg2;
16472 + void *arg3;
16473 + void *arg4;
16474 + } f;
16475 + struct {
16476 + int *result;
16477 + const char *list[10];
16478 + } oneof;
16479 + struct {
16480 + void *addr;
16481 + int nr_bits;
16482 + //struct opt_bitmask_bit *bits;
16483 + } bitmask;
16484 + } u;
16485 +};
16486 +
16487 +/**
16488 + * parse_option - parse one option
16489 + * @opt_string: starting point of parsing
16490 + * @opt: option description
16491 + *
16492 + * foo=bar,
16493 + * ^ ^ ^
16494 + * | | +-- replaced to '\0'
16495 + * | +-- val_start
16496 + * +-- opt_string
16497 + * Figures out the option type and handles the option accordingly.
16498 + */
16499 +static int parse_option(char *opt_string, struct opt_desc *opt)
16500 +{
16501 + char *val_start;
16502 + int result;
16503 + const char *err_msg;
16504 +
16505 + /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16506 +
16507 + val_start = strchr(opt_string, '=');
16508 + if (val_start != NULL) {
16509 + *val_start = '\0';
16510 + ++val_start;
16511 + }
16512 +
16513 + err_msg = NULL;
16514 + result = 0;
16515 + switch (opt->type) {
16516 + case OPT_STRING:
16517 + if (val_start == NULL) {
16518 + err_msg = "String arg missing";
16519 + result = RETERR(-EINVAL);
16520 + } else
16521 + *opt->u.string = val_start;
16522 + break;
16523 + case OPT_BIT:
16524 + if (val_start != NULL)
16525 + err_msg = "Value ignored";
16526 + else
16527 + set_bit(opt->u.bit.nr, opt->u.bit.addr);
16528 + break;
16529 + case OPT_FORMAT:
16530 + if (val_start == NULL) {
16531 + err_msg = "Formatted arg missing";
16532 + result = RETERR(-EINVAL);
16533 + break;
16534 + }
16535 + if (sscanf(val_start, opt->u.f.format,
16536 + opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16537 + opt->u.f.arg4) != opt->u.f.nr_args) {
16538 + err_msg = "Wrong conversion";
16539 + result = RETERR(-EINVAL);
16540 + }
16541 + break;
16542 + case OPT_ONEOF:
16543 + {
16544 + int i = 0;
16545 +
16546 + if (val_start == NULL) {
16547 + err_msg = "Value is missing";
16548 + result = RETERR(-EINVAL);
16549 + break;
16550 + }
16551 + err_msg = "Wrong option value";
16552 + result = RETERR(-EINVAL);
16553 + while (opt->u.oneof.list[i]) {
16554 + if (!strcmp(opt->u.oneof.list[i], val_start)) {
16555 + result = 0;
16556 + err_msg = NULL;
16557 + *opt->u.oneof.result = i;
16558 + break;
16559 + }
16560 + i++;
16561 + }
16562 + break;
16563 + }
16564 + default:
16565 + wrong_return_value("nikita-2100", "opt -> type");
16566 + break;
16567 + }
16568 + if (err_msg != NULL) {
16569 + warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16570 + err_msg, opt->name, val_start ? "=" : "",
16571 + val_start ? : "");
16572 + }
16573 + return result;
16574 +}
16575 +
16576 +/**
16577 + * parse_options - parse reiser4 mount options
16578 + * @opt_string: starting point
16579 + * @opts: array of option description
16580 + * @nr_opts: number of elements in @opts
16581 + *
16582 + * Parses comma separated list of reiser4 mount options.
16583 + */
16584 +static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16585 +{
16586 + int result;
16587 +
16588 + result = 0;
16589 + while ((result == 0) && opt_string && *opt_string) {
16590 + int j;
16591 + char *next;
16592 +
16593 + next = strchr(opt_string, ',');
16594 + if (next != NULL) {
16595 + *next = '\0';
16596 + ++next;
16597 + }
16598 + for (j = 0; j < nr_opts; ++j) {
16599 + if (!strncmp(opt_string, opts[j].name,
16600 + strlen(opts[j].name))) {
16601 + result = parse_option(opt_string, &opts[j]);
16602 + break;
16603 + }
16604 + }
16605 + if (j == nr_opts) {
16606 + warning("nikita-2307", "Unrecognized option: \"%s\"",
16607 + opt_string);
16608 + /* traditionally, -EINVAL is returned on wrong mount
16609 + option */
16610 + result = RETERR(-EINVAL);
16611 + }
16612 + opt_string = next;
16613 + }
16614 + return result;
16615 +}
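/*
 * A self-contained userspace sketch of the destructive tokenizing performed
 * by parse_options()/parse_option() above: ',' and '=' separators are
 * overwritten with '\0' in place, so the mount option string must be
 * writeable.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char opts[] = "onerror=panic,dont_load_bitmap,tmgr.atom_max_age=600";
	char *cur = opts;

	while (cur != NULL && *cur != '\0') {
		char *next = strchr(cur, ',');
		char *val;

		if (next != NULL)
			*next++ = '\0';	/* terminate current option */
		val = strchr(cur, '=');
		if (val != NULL)
			*val++ = '\0';	/* split into name and value */
		printf("name=%s value=%s\n", cur, val ? val : "(none)");
		cur = next;
	}
	return 0;
}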
16616 +
16617 +#define NUM_OPT( label, fmt, addr ) \
16618 + { \
16619 + .name = ( label ), \
16620 + .type = OPT_FORMAT, \
16621 + .u = { \
16622 + .f = { \
16623 + .format = ( fmt ), \
16624 + .nr_args = 1, \
16625 + .arg1 = ( addr ), \
16626 + .arg2 = NULL, \
16627 + .arg3 = NULL, \
16628 + .arg4 = NULL \
16629 + } \
16630 + } \
16631 + }
16632 +
16633 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16634 +
16635 +#define BIT_OPT(label, bitnr) \
16636 + { \
16637 + .name = label, \
16638 + .type = OPT_BIT, \
16639 + .u = { \
16640 + .bit = { \
16641 + .nr = bitnr, \
16642 + .addr = &sbinfo->fs_flags \
16643 + } \
16644 + } \
16645 + }
16646 +
16647 +#define MAX_NR_OPTIONS (30)
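/*
 * For reference, SB_FIELD_OPT(tmgr.atom_max_size, "%u") expands (via
 * NUM_OPT, with #field stringizing the member path) to the designated
 * initializer below:
 *
 *	{
 *		.name = "tmgr.atom_max_size",
 *		.type = OPT_FORMAT,
 *		.u = {
 *			.f = {
 *				.format = "%u",
 *				.nr_args = 1,
 *				.arg1 = &sbinfo->tmgr.atom_max_size,
 *				.arg2 = NULL,
 *				.arg3 = NULL,
 *				.arg4 = NULL
 *			}
 *		}
 *	}
 */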
16648 +
16649 +/**
16650 + * reiser4_init_super_data - initialize reiser4 private super block
16651 + * @super: super block to initialize
16652 + * @opt_string: list of reiser4 mount options
16653 + *
16654 + * Sets various reiser4 parameters to default values. Parses mount options and
16655 + * overwrites default settings.
16656 + */
16657 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
16658 +{
16659 + int result;
16660 + struct opt_desc *opts, *p;
16661 + reiser4_super_info_data *sbinfo = get_super_private(super);
16662 +
16663 + /* initialize super, export, dentry operations */
16664 + sbinfo->ops.super = reiser4_super_operations;
16665 + sbinfo->ops.export = reiser4_export_operations;
16666 + sbinfo->ops.dentry = reiser4_dentry_operations;
16667 + super->s_op = &sbinfo->ops.super;
16668 + super->s_export_op = &sbinfo->ops.export;
16669 +
16670 + /* initialize transaction manager parameters to default values */
16671 + sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16672 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16673 + sbinfo->tmgr.atom_min_size = 256;
16674 + sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16675 +
16676 + /* initialize cbk cache parameter */
16677 + sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16678 +
16679 + /* initialize flush parameters */
16680 + sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16681 + sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16682 + sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16683 + sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16684 +
16685 + sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16686 +
16687 + /* preliminary tree initializations */
16688 + sbinfo->tree.super = super;
16689 + sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16690 + sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16691 + sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16692 + sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16693 + rwlock_init(&(sbinfo->tree.tree_lock));
16694 + spin_lock_init(&(sbinfo->tree.epoch_lock));
16695 +
16696 + /* initialize default readahead params */
16697 + sbinfo->ra_params.max = num_physpages / 4;
16698 + sbinfo->ra_params.flags = 0;
16699 +
16700 + /* allocate memory for structure describing reiser4 mount options */
16701 + opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16702 + reiser4_ctx_gfp_mask_get());
16703 + if (opts == NULL)
16704 + return RETERR(-ENOMEM);
16705 +
16706 + /* initialize structure describing reiser4 mount options */
16707 + p = opts;
16708 +
16709 +#if REISER4_DEBUG
16710 +# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
16711 + warning ("zam-1046", "opt array is overloaded"); break; \
16712 + }
16713 +#else
16714 +# define OPT_ARRAY_CHECK noop
16715 +#endif
16716 +
16717 +#define PUSH_OPT(...) \
16718 +do { \
16719 + struct opt_desc o = __VA_ARGS__; \
16720 + OPT_ARRAY_CHECK; \
16721 + *p ++ = o; \
16722 +} while (0)
16723 +
16724 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16725 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16726 +
16727 + /*
16728 + * tmgr.atom_max_size=N
16729 + * Atoms containing more than N blocks will be forced to commit. N is
16730 + * decimal.
16731 + */
16732 + PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16733 + /*
16734 + * tmgr.atom_max_age=N
16735 + * Atoms older than N seconds will be forced to commit. N is decimal.
16736 + */
16737 + PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16738 + /*
16739 + * tmgr.atom_min_size=N
16740 +	 * When committing an atom to free dirty pages, force an atom smaller
16741 +	 * than N in size to fuse with another one.
16742 + */
16743 + PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16744 + /*
16745 + * tmgr.atom_max_flushers=N
16746 + * limit of concurrent flushers for one atom. 0 means no limit.
16747 + */
16748 + PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16749 + /*
16750 + * tree.cbk_cache_slots=N
16751 + * Number of slots in the cbk cache.
16752 + */
16753 + PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16754 + /*
16755 + * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16756 + * leaf-level blocks it will force them to be relocated.
16757 + */
16758 + PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16759 + /*
16760 +	 * If flush can find a block allocation closer than
16761 +	 * FLUSH_RELOCATE_DISTANCE to the preceder, it will relocate to that
16762 +	 * position.
16763 + */
16764 + PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16765 + /*
16766 +	 * If we have written this many blocks or more before encountering a
16767 +	 * busy jnode in the flush list, abort flushing in the hope that the
16768 +	 * jnode will already be clean next time we are called, saving some
16769 +	 * seeks.
16770 + */
16771 + PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16772 + /* The maximum number of nodes to scan left on a level during flush. */
16773 + PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16774 + /* preferred IO size */
16775 + PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16776 + /* carry flags used for insertion of new nodes */
16777 + PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16778 + /* carry flags used for insertion of new extents */
16779 + PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16780 + /* carry flags used for paste operations */
16781 + PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16782 + /* carry flags used for insert operations */
16783 + PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16784 +
16785 +#ifdef CONFIG_REISER4_BADBLOCKS
16786 + /*
16787 +	 * Alternative master superblock location, in case its original
16788 +	 * location is not writeable/accessible. This is an offset in BYTES.
16789 + */
16790 + PUSH_SB_FIELD_OPT(altsuper, "%lu");
16791 +#endif
16792 +
16793 + /* turn on BSD-style gid assignment */
16794 + PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16795 + /* turn on 32 bit times */
16796 + PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16797 + /*
16798 + * Don't load all bitmap blocks at mount time, it is useful for
16799 + * machines with tiny RAM and large disks.
16800 + */
16801 + PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16802 + /* disable transaction commits during write() */
16803 + PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16804 + /* disable use of write barriers in the reiser4 log writer. */
16805 + PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16806 +
16807 + PUSH_OPT(
16808 + {
16809 + /*
16810 + * tree traversal readahead parameters:
16811 + * -o readahead:MAXNUM:FLAGS
16812 +	 * MAXNUM - max number of nodes to request readahead for: -1UL
16813 + * will set it to max_sane_readahead()
16814 + * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16815 + * CONTINUE_ON_PRESENT
16816 + */
16817 + .name = "readahead",
16818 + .type = OPT_FORMAT,
16819 + .u = {
16820 + .f = {
16821 + .format = "%u:%u",
16822 + .nr_args = 2,
16823 + .arg1 = &sbinfo->ra_params.max,
16824 + .arg2 = &sbinfo->ra_params.flags,
16825 + .arg3 = NULL,
16826 + .arg4 = NULL
16827 + }
16828 + }
16829 + }
16830 + );
16831 +
16832 + /* What to do in case of fs error */
16833 + PUSH_OPT(
16834 + {
16835 + .name = "onerror",
16836 + .type = OPT_ONEOF,
16837 + .u = {
16838 + .oneof = {
16839 + .result = &sbinfo->onerror,
16840 + .list = {
16841 + "panic", "remount-ro", NULL
16842 + },
16843 + }
16844 + }
16845 + }
16846 + );
16847 +
16848 + /* modify default settings to values set by mount options */
16849 + result = parse_options(opt_string, opts, p - opts);
16850 + kfree(opts);
16851 + if (result != 0)
16852 + return result;
16853 +
16854 + /* correct settings to sanity values */
16855 + sbinfo->tmgr.atom_max_age *= HZ;
16856 + if (sbinfo->tmgr.atom_max_age <= 0)
16857 + /* overflow */
16858 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16859 +
16860 +	/* round optimal io size down to a multiple of 512 bytes */
16861 + sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16862 + sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16863 + if (sbinfo->optimal_io_size == 0) {
16864 + warning("nikita-2497", "optimal_io_size is too small");
16865 + return RETERR(-EINVAL);
16866 + }
16867 + return result;
16868 +}
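/*
 * A worked userspace example of the shift pair above, which rounds
 * optimal_io_size *down* to a multiple of the VFS block size (assuming
 * VFS_BLKSIZE_BITS == 9, i.e. a 512-byte block):
 */
#include <stdio.h>

int main(void)
{
	unsigned int size = 1000;

	size >>= 9;	/* 1000 / 512 == 1, low bits discarded */
	size <<= 9;	/* 1 * 512 == 512 */
	printf("%u\n", size);	/* prints 512; anything below 512 becomes 0,
				 * which trips the -EINVAL check above */
	return 0;
}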
16869 +
16870 +/**
16871 + * reiser4_init_read_super - read reiser4 master super block
16872 + * @super: super block to fill
16873 + * @silent: if 0 - print warnings
16874 + *
16875 + * Reads reiser4 master super block either from predefined location or from
16876 + * location specified by altsuper mount option, initializes disk format plugin.
16877 + */
16878 +int reiser4_init_read_super(struct super_block *super, int silent)
16879 +{
16880 + struct buffer_head *super_bh;
16881 + struct reiser4_master_sb *master_sb;
16882 + reiser4_super_info_data *sbinfo = get_super_private(super);
16883 + unsigned long blocksize;
16884 +
16885 + read_super_block:
16886 +#ifdef CONFIG_REISER4_BADBLOCKS
16887 + if (sbinfo->altsuper)
16888 + /*
16889 + * read reiser4 master super block at position specified by
16890 + * mount option
16891 + */
16892 + super_bh = sb_bread(super,
16893 + (sector_t)(sbinfo->altsuper / super->s_blocksize));
16894 + else
16895 +#endif
16896 + /* read reiser4 master super block at 16-th 4096 block */
16897 + super_bh = sb_bread(super,
16898 + (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16899 + if (!super_bh)
16900 + return RETERR(-EIO);
16901 +
16902 + master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16903 + /* check reiser4 magic string */
16904 + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16905 + sizeof(REISER4_SUPER_MAGIC_STRING))) {
16906 + /* reiser4 master super block contains filesystem blocksize */
16907 + blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16908 +
16909 + if (blocksize != PAGE_CACHE_SIZE) {
16910 + /*
16911 + * currenly reiser4's blocksize must be equal to
16912 + * pagesize
16913 + */
16914 + if (!silent)
16915 + warning("nikita-2609",
16916 + "%s: wrong block size %ld\n", super->s_id,
16917 + blocksize);
16918 + brelse(super_bh);
16919 + return RETERR(-EINVAL);
16920 + }
16921 + if (blocksize != super->s_blocksize) {
16922 + /*
16923 + * filesystem uses different blocksize. Reread master
16924 + * super block with correct blocksize
16925 + */
16926 + brelse(super_bh);
16927 + if (!sb_set_blocksize(super, (int)blocksize))
16928 + return RETERR(-EINVAL);
16929 + goto read_super_block;
16930 + }
16931 +
16932 + sbinfo->df_plug =
16933 + disk_format_plugin_by_id(
16934 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16935 + if (sbinfo->df_plug == NULL) {
16936 + if (!silent)
16937 + warning("nikita-26091",
16938 + "%s: unknown disk format plugin %d\n",
16939 + super->s_id,
16940 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16941 + brelse(super_bh);
16942 + return RETERR(-EINVAL);
16943 + }
16944 + sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16945 + brelse(super_bh);
16946 + return 0;
16947 + }
16948 +
16949 + /* there is no reiser4 on the device */
16950 + if (!silent)
16951 + warning("nikita-2608",
16952 + "%s: wrong master super block magic", super->s_id);
16953 + brelse(super_bh);
16954 + return RETERR(-EINVAL);
16955 +}
16956 +
16957 +static struct {
16958 + reiser4_plugin_type type;
16959 + reiser4_plugin_id id;
16960 +} default_plugins[PSET_LAST] = {
16961 + [PSET_FILE] = {
16962 + .type = REISER4_FILE_PLUGIN_TYPE,
16963 + .id = UNIX_FILE_PLUGIN_ID
16964 + },
16965 + [PSET_DIR] = {
16966 + .type = REISER4_DIR_PLUGIN_TYPE,
16967 + .id = HASHED_DIR_PLUGIN_ID
16968 + },
16969 + [PSET_HASH] = {
16970 + .type = REISER4_HASH_PLUGIN_TYPE,
16971 + .id = R5_HASH_ID
16972 + },
16973 + [PSET_FIBRATION] = {
16974 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
16975 + .id = FIBRATION_DOT_O
16976 + },
16977 + [PSET_PERM] = {
16978 + .type = REISER4_PERM_PLUGIN_TYPE,
16979 + .id = NULL_PERM_ID
16980 + },
16981 + [PSET_FORMATTING] = {
16982 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
16983 + .id = SMALL_FILE_FORMATTING_ID
16984 + },
16985 + [PSET_SD] = {
16986 + .type = REISER4_ITEM_PLUGIN_TYPE,
16987 + .id = STATIC_STAT_DATA_ID
16988 + },
16989 + [PSET_DIR_ITEM] = {
16990 + .type = REISER4_ITEM_PLUGIN_TYPE,
16991 + .id = COMPOUND_DIR_ID
16992 + },
16993 + [PSET_CIPHER] = {
16994 + .type = REISER4_CIPHER_PLUGIN_TYPE,
16995 + .id = NONE_CIPHER_ID
16996 + },
16997 + [PSET_DIGEST] = {
16998 + .type = REISER4_DIGEST_PLUGIN_TYPE,
16999 + .id = SHA256_32_DIGEST_ID
17000 + },
17001 + [PSET_COMPRESSION] = {
17002 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17003 + .id = LZO1_COMPRESSION_ID
17004 + },
17005 + [PSET_COMPRESSION_MODE] = {
17006 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17007 + .id = CONVX_COMPRESSION_MODE_ID
17008 + },
17009 + [PSET_CLUSTER] = {
17010 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
17011 + .id = CLUSTER_64K_ID
17012 + },
17013 + [PSET_CREATE] = {
17014 + .type = REISER4_FILE_PLUGIN_TYPE,
17015 + .id = UNIX_FILE_PLUGIN_ID
17016 + }
17017 +};
17018 +
17019 +/* access to default plugin table */
17020 +reiser4_plugin *get_default_plugin(pset_member memb)
17021 +{
17022 + return plugin_by_id(default_plugins[memb].type,
17023 + default_plugins[memb].id);
17024 +}
17025 +
17026 +/**
17027 + * reiser4_init_root_inode - obtain inode of root directory
17028 + * @super: super block of filesystem
17029 + *
17030 + * Obtains inode of root directory (reading it from disk), initializes plugin
17031 + * set if it was not initialized.
17032 + */
17033 +int reiser4_init_root_inode(struct super_block *super)
17034 +{
17035 + reiser4_super_info_data *sbinfo = get_super_private(super);
17036 + struct inode *inode;
17037 + int result = 0;
17038 +
17039 + inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17040 + if (IS_ERR(inode))
17041 + return RETERR(PTR_ERR(inode));
17042 +
17043 + super->s_root = d_alloc_root(inode);
17044 + if (!super->s_root) {
17045 + iput(inode);
17046 + return RETERR(-ENOMEM);
17047 + }
17048 +
17049 + super->s_root->d_op = &sbinfo->ops.dentry;
17050 +
17051 + if (!is_inode_loaded(inode)) {
17052 + pset_member memb;
17053 + plugin_set *pset;
17054 +
17055 + pset = reiser4_inode_data(inode)->pset;
17056 + for (memb = 0; memb < PSET_LAST; ++memb) {
17057 +
17058 + if (aset_get(pset, memb) != NULL)
17059 + continue;
17060 +
17061 + result = grab_plugin_pset(inode, NULL, memb);
17062 + if (result != 0)
17063 + break;
17064 +
17065 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17066 + }
17067 +
17068 + if (result == 0) {
17069 + if (REISER4_DEBUG) {
17070 + for (memb = 0; memb < PSET_LAST; ++memb)
17071 + assert("nikita-3500",
17072 + aset_get(pset, memb) != NULL);
17073 + }
17074 + } else
17075 + warning("nikita-3448", "Cannot set plugins of root: %i",
17076 + result);
17077 + reiser4_iget_complete(inode);
17078 +
17079 +		/* As the default pset kept in the root dir may have been changed
17080 + (length is unknown), call update_sd. */
17081 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17082 + result = reiser4_grab_space(
17083 + inode_file_plugin(inode)->estimate.update(inode),
17084 + BA_CAN_COMMIT);
17085 +
17086 + if (result == 0)
17087 + result = reiser4_update_sd(inode);
17088 +
17089 + all_grabbed2free();
17090 + }
17091 + }
17092 +
17093 + super->s_maxbytes = MAX_LFS_FILESIZE;
17094 + return result;
17095 +}
17096 +
17097 +/*
17098 + * Local variables:
17099 + * c-indentation-style: "K&R"
17100 + * mode-name: "LC"
17101 + * c-basic-offset: 8
17102 + * tab-width: 8
17103 + * fill-column: 79
17104 + * End:
17105 + */
17106 diff -urN linux-2.6.22.orig/fs/reiser4/inode.c linux-2.6.22/fs/reiser4/inode.c
17107 --- linux-2.6.22.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17108 +++ linux-2.6.22/fs/reiser4/inode.c 2007-07-29 00:25:34.872695441 +0400
17109 @@ -0,0 +1,709 @@
17110 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17111 +
17112 +/* Inode specific operations. */
17113 +
17114 +#include "forward.h"
17115 +#include "debug.h"
17116 +#include "key.h"
17117 +#include "kassign.h"
17118 +#include "coord.h"
17119 +#include "seal.h"
17120 +#include "dscale.h"
17121 +#include "plugin/item/item.h"
17122 +#include "plugin/security/perm.h"
17123 +#include "plugin/plugin.h"
17124 +#include "plugin/object.h"
17125 +#include "znode.h"
17126 +#include "vfs_ops.h"
17127 +#include "inode.h"
17128 +#include "super.h"
17129 +#include "reiser4.h"
17130 +
17131 +#include <linux/fs.h> /* for struct super_block, address_space */
17132 +
17133 +/* return reiser4 internal tree which inode belongs to */
17134 +/* Audited by: green(2002.06.17) */
17135 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17136 +{
17137 + assert("nikita-256", inode != NULL);
17138 + assert("nikita-257", inode->i_sb != NULL);
17139 + return reiser4_get_tree(inode->i_sb);
17140 +}
17141 +
17142 +/* return reiser4-specific inode flags */
17143 +static inline unsigned long *inode_flags(const struct inode *const inode)
17144 +{
17145 + assert("nikita-2842", inode != NULL);
17146 + return &reiser4_inode_data(inode)->flags;
17147 +}
17148 +
17149 +/* set reiser4-specific flag @f in @inode */
17150 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17151 +{
17152 + assert("nikita-2248", inode != NULL);
17153 + set_bit((int)f, inode_flags(inode));
17154 +}
17155 +
17156 +/* clear reiser4-specific flag @f in @inode */
17157 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17158 +{
17159 + assert("nikita-2250", inode != NULL);
17160 + clear_bit((int)f, inode_flags(inode));
17161 +}
17162 +
17163 +/* true if reiser4-specific flag @f is set in @inode */
17164 +int reiser4_inode_get_flag(const struct inode *inode,
17165 + reiser4_file_plugin_flags f)
17166 +{
17167 + assert("nikita-2251", inode != NULL);
17168 + return test_bit((int)f, inode_flags(inode));
17169 +}
17170 +
17171 +/* convert oid to inode number */
17172 +ino_t oid_to_ino(oid_t oid)
17173 +{
17174 + return (ino_t) oid;
17175 +}
17176 +
17177 +/* convert oid to user visible inode number */
17178 +ino_t oid_to_uino(oid_t oid)
17179 +{
17180 + /* reiser4 object is uniquely identified by oid which is 64 bit
17181 + quantity. Kernel in-memory inode is indexed (in the hash table) by
17182 + 32 bit i_ino field, but this is not a problem, because there is a
17183 + way to further distinguish inodes with identical inode numbers
17184 + (find_actor supplied to iget()).
17185 +
17186 + But user space expects unique 32 bit inode number. Obviously this
17187 + is impossible. Work-around is to somehow hash oid into user visible
17188 + inode number.
17189 + */
17190 + oid_t max_ino = (ino_t) ~ 0;
17191 +
17192 + if (REISER4_INO_IS_OID || (oid <= max_ino))
17193 + return oid;
17194 + else
17195 + /* this is remotely similar to algorithm used to find next pid
17196 + to use for process: after wrap-around start from some
17197 + offset rather than from 0. Idea is that there are some long
17198 + living objects with which we don't want to collide.
17199 + */
17200 + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17201 +}
17202 +
17203 +/* check that "inode" is on reiser4 file-system */
17204 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17205 +{
17206 + return inode != NULL && is_reiser4_super(inode->i_sb);
17207 +}
17208 +
17209 +/* Maximal length of a name that can be stored in directory @inode.
17210 +
17211 + This is used in check during file creation and lookup. */
17212 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17213 +{
17214 + assert("nikita-287", is_reiser4_inode(inode));
17215 + assert("nikita-1710", inode_dir_item_plugin(inode));
17216 + if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17217 + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17218 + else
17219 + return 255;
17220 +}
17221 +
17222 +#if REISER4_USE_COLLISION_LIMIT
17223 +/* Maximal number of hash collisions for this directory. */
17224 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17225 +{
17226 + assert("nikita-1711", dir != NULL);
17227 + return reiser4_inode_data(dir)->plugin.max_collisions;
17228 +}
17229 +#endif /* REISER4_USE_COLLISION_LIMIT */
17230 +
17231 +/* Install file, inode, and address_space operation on @inode, depending on
17232 + its mode. */
17233 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17234 + reiser4_object_create_data * data /* parameters to create
17235 + * object */ )
17236 +{
17237 + reiser4_super_info_data *sinfo;
17238 + file_plugin *fplug;
17239 + dir_plugin *dplug;
17240 +
17241 + fplug = inode_file_plugin(inode);
17242 + dplug = inode_dir_plugin(inode);
17243 +
17244 + sinfo = get_super_private(inode->i_sb);
17245 +
17246 + switch (inode->i_mode & S_IFMT) {
17247 + case S_IFSOCK:
17248 + case S_IFBLK:
17249 + case S_IFCHR:
17250 + case S_IFIFO:
17251 + {
17252 + dev_t rdev; /* to keep gcc happy */
17253 +
17254 + assert("vs-46", fplug != NULL);
17255 + /* ugly hack with rdev */
17256 + if (data == NULL) {
17257 + rdev = inode->i_rdev;
17258 + inode->i_rdev = 0;
17259 + } else
17260 + rdev = data->rdev;
17261 + inode->i_blocks = 0;
17262 + assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17263 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17264 + /* initialize inode->i_fop and inode->i_rdev for block and char
17265 + devices */
17266 + init_special_inode(inode, inode->i_mode, rdev);
17267 + /* all address space operations are null */
17268 + inode->i_mapping->a_ops =
17269 + &file_plugins[fplug->h.id].as_ops;
17270 + break;
17271 + }
17272 + case S_IFLNK:
17273 + assert("vs-46", fplug != NULL);
17274 + assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17275 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17276 + inode->i_fop = NULL;
17277 + /* all address space operations are null */
17278 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17279 + break;
17280 + case S_IFDIR:
17281 + assert("vs-46", dplug != NULL);
17282 + assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17283 + dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17284 + inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17285 + inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17286 + inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17287 + break;
17288 + case S_IFREG:
17289 + assert("vs-46", fplug != NULL);
17290 + assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17291 + fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17292 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17293 + inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17294 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17295 + break;
17296 + default:
17297 + warning("nikita-291", "wrong file mode: %o for %llu",
17298 + inode->i_mode,
17299 + (unsigned long long)get_inode_oid(inode));
17300 + reiser4_make_bad_inode(inode);
17301 + return RETERR(-EINVAL);
17302 + }
17303 + return 0;
17304 +}
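/*
 * A self-contained userspace sketch of the S_IFMT classification performed
 * by the switch in setup_inode_ops() above.
 */
#include <stdio.h>
#include <sys/stat.h>

static const char *classify(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		return "special file";
	case S_IFLNK:
		return "symlink";
	case S_IFDIR:
		return "directory";
	case S_IFREG:
		return "regular file";
	default:
		return "invalid mode";
	}
}

int main(void)
{
	printf("%s\n", classify(S_IFDIR | 0755));	/* directory */
	printf("%s\n", classify(S_IFREG | 0644));	/* regular file */
	return 0;
}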
17305 +
17306 +/* Initialize inode from disk data. Called with inode locked.
17307 + Return inode locked. */
17308 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17309 + coord_t * coord /* coord of stat data */ )
17310 +{
17311 + int result;
17312 + item_plugin *iplug;
17313 + void *body;
17314 + int length;
17315 + reiser4_inode *state;
17316 +
17317 + assert("nikita-292", coord != NULL);
17318 + assert("nikita-293", inode != NULL);
17319 +
17320 + coord_clear_iplug(coord);
17321 + result = zload(coord->node);
17322 + if (result)
17323 + return result;
17324 + iplug = item_plugin_by_coord(coord);
17325 + body = item_body_by_coord(coord);
17326 + length = item_length_by_coord(coord);
17327 +
17328 + assert("nikita-295", iplug != NULL);
17329 + assert("nikita-296", body != NULL);
17330 + assert("nikita-297", length > 0);
17331 +
17332 + /* inode is under I_LOCK now */
17333 +
17334 + state = reiser4_inode_data(inode);
17335 + /* call stat-data plugin method to load sd content into inode */
17336 + result = iplug->s.sd.init_inode(inode, body, length);
17337 + set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17338 + if (result == 0) {
17339 + result = setup_inode_ops(inode, NULL);
17340 + if (result == 0 && inode->i_sb->s_root &&
17341 + inode->i_sb->s_root->d_inode)
17342 + result = finish_pset(inode);
17343 + }
17344 + zrelse(coord->node);
17345 + return result;
17346 +}
17347 +
17348 +/* read `inode' from the disk. This is what was previously in
17349 + reiserfs_read_inode2().
17350 +
17351 + Must be called with inode locked. Return inode still locked.
17352 +*/
17353 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17354 + const reiser4_key * key /* key of stat data */ ,
17355 + int silent)
17356 +{
17357 + int result;
17358 + lock_handle lh;
17359 + reiser4_inode *info;
17360 + coord_t coord;
17361 +
17362 + assert("nikita-298", inode != NULL);
17363 + assert("nikita-1945", !is_inode_loaded(inode));
17364 +
17365 + info = reiser4_inode_data(inode);
17366 + assert("nikita-300", info->locality_id != 0);
17367 +
17368 + coord_init_zero(&coord);
17369 + init_lh(&lh);
17370 + /* locate stat-data in a tree and return znode locked */
17371 + result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17372 + assert("nikita-301", !is_inode_loaded(inode));
17373 + if (result == 0) {
17374 + /* use stat-data plugin to load sd into inode. */
17375 + result = init_inode(inode, &coord);
17376 + if (result == 0) {
17377 + /* initialize stat-data seal */
17378 + spin_lock_inode(inode);
17379 + reiser4_seal_init(&info->sd_seal, &coord, key);
17380 + info->sd_coord = coord;
17381 + spin_unlock_inode(inode);
17382 +
17383 + /* call file plugin's method to initialize plugin
17384 + * specific part of inode */
17385 + if (inode_file_plugin(inode)->init_inode_data)
17386 + inode_file_plugin(inode)->init_inode_data(inode,
17387 + NULL,
17388 + 0);
17389 + /* load detached directory cursors for stateless
17390 + * directory readers (NFS). */
17391 + reiser4_load_cursors(inode);
17392 +
17393 + /* Check the opened inode for consistency. */
17394 + result =
17395 + get_super_private(inode->i_sb)->df_plug->
17396 + check_open(inode);
17397 + }
17398 + }
17399 + /* lookup_sd() doesn't release coord because we want znode
17400 + stay read-locked while stat-data fields are accessed in
17401 + init_inode() */
17402 + done_lh(&lh);
17403 +
17404 + if (result != 0)
17405 + reiser4_make_bad_inode(inode);
17406 + return result;
17407 +}
17408 +
17409 +/* initialise new reiser4 inode being inserted into hash table. */
17410 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17411 + void *opaque /* key of stat data passed to the
17412 + * iget5_locked as cookie */ )
17413 +{
17414 + reiser4_key *key;
17415 +
17416 + assert("nikita-1995", inode != NULL);
17417 + assert("nikita-1996", opaque != NULL);
17418 + key = opaque;
17419 + set_inode_oid(inode, get_key_objectid(key));
17420 + reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17421 + return 0;
17422 +}
17423 +
17424 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17425 +
17426 + This function is called by iget5_locked() to distinguish reiser4 inodes
17427 + having the same inode numbers. Such inodes can only exist due to some error
17428 + condition. One of them should be bad. Inodes with identical inode numbers
17429 + (objectids) are distinguished by their packing locality.
17430 +
17431 +*/
17432 +static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17433 + * check */ ,
17434 + void *opaque /* "cookie" passed to
17435 + * iget5_locked(). This is stat data
17436 + * key */ )
17437 +{
17438 + reiser4_key *key;
17439 +
17440 + key = opaque;
17441 + return
17442 + /* oid is unique, so first term is enough, actually. */
17443 + get_inode_oid(inode) == get_key_objectid(key) &&
17444 + /*
17445 + * also, locality should be checked, but locality is stored in
17446 + * the reiser4-specific part of the inode, and actor can be
17447 + * called against arbitrary inode that happened to be in this
17448 + * hash chain. Hence we first have to check that this is
17449 + * reiser4 inode at least. is_reiser4_inode() is probably too
17450 + * early to call, as inode may have ->i_op not yet
17451 + * initialised.
17452 + */
17453 + is_reiser4_super(inode->i_sb) &&
17454 + /*
17455 + * usually objectid is unique, but pseudo files use counter to
17456 + * generate objectid. All pseudo files are placed into special
17457 + * (otherwise unused) locality.
17458 + */
17459 + reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17460 +}
17461 +
17462 +/* hook for kmem_cache_create */
17463 +void loading_init_once(reiser4_inode * info)
17464 +{
17465 + mutex_init(&info->loading);
17466 +}
17467 +
17468 +/* for reiser4_alloc_inode */
17469 +void loading_alloc(reiser4_inode * info)
17470 +{
17471 + assert("vs-1717", !mutex_is_locked(&info->loading));
17472 +}
17473 +
17474 +/* for reiser4_destroy */
17475 +void loading_destroy(reiser4_inode * info)
17476 +{
17477 + assert("vs-1717a", !mutex_is_locked(&info->loading));
17478 +}
17479 +
17480 +static void loading_begin(reiser4_inode * info)
17481 +{
17482 + mutex_lock(&info->loading);
17483 +}
17484 +
17485 +static void loading_end(reiser4_inode * info)
17486 +{
17487 + mutex_unlock(&info->loading);
17488 +}
17489 +
17490 +/**
17491 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17492 + * @super: super block of filesystem
17493 + * @key: key of inode's stat-data
17494 + * @silent: if non-zero - do not print warnings
17495 + *
17496 + * This is our helper function a la iget(). It is called by
17497 + * lookup_common() and reiser4_read_super(). Return inode locked or error
17498 + * encountered.
17499 + */
17500 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17501 + int silent)
17502 +{
17503 + struct inode *inode;
17504 + int result;
17505 + reiser4_inode *info;
17506 +
17507 + assert("nikita-302", super != NULL);
17508 + assert("nikita-303", key != NULL);
17509 +
17510 + result = 0;
17511 +
17512 + /* call iget(). Our ->read_inode() is dummy, so this will either
17513 + find inode in cache or return uninitialised inode */
17514 + inode = iget5_locked(super,
17515 + (unsigned long)get_key_objectid(key),
17516 + reiser4_inode_find_actor,
17517 + init_locked_inode, (reiser4_key *) key);
17518 + if (inode == NULL)
17519 + return ERR_PTR(RETERR(-ENOMEM));
17520 + if (is_bad_inode(inode)) {
17521 + warning("nikita-304", "Bad inode found");
17522 + reiser4_print_key("key", key);
17523 + iput(inode);
17524 + return ERR_PTR(RETERR(-EIO));
17525 + }
17526 +
17527 + info = reiser4_inode_data(inode);
17528 +
17529 +	/* The reiser4 inode state bit REISER4_LOADED is used to distinguish a
17530 +	   fully loaded and initialized inode from a freshly allocated one. If
17531 +	   the REISER4_LOADED bit is not set, reiser4_iget() completes loading
17532 +	   under info->loading. The one place in reiser4 which uses a not yet
17533 +	   initialized inode is the repacker; see the repacker-related functions
17534 +	   in plugin/item/extent.c */
17535 + if (!is_inode_loaded(inode)) {
17536 + loading_begin(info);
17537 + if (!is_inode_loaded(inode)) {
17538 + /* locking: iget5_locked returns locked inode */
17539 + assert("nikita-1941", !is_inode_loaded(inode));
17540 + assert("nikita-1949",
17541 + reiser4_inode_find_actor(inode,
17542 + (reiser4_key *) key));
17543 + /* now, inode has objectid as ->i_ino and locality in
17544 + reiser4-specific part. This is enough for
17545 + read_inode() to read stat data from the disk */
17546 + result = read_inode(inode, key, silent);
17547 + } else
17548 + loading_end(info);
17549 + }
17550 +
17551 + if (inode->i_state & I_NEW)
17552 + unlock_new_inode(inode);
17553 +
17554 + if (is_bad_inode(inode)) {
17555 + assert("vs-1717", result != 0);
17556 + loading_end(info);
17557 + iput(inode);
17558 + inode = ERR_PTR(result);
17559 + } else if (REISER4_DEBUG) {
17560 + reiser4_key found_key;
17561 +
17562 + assert("vs-1717", result == 0);
17563 + build_sd_key(inode, &found_key);
17564 + if (!keyeq(&found_key, key)) {
17565 + warning("nikita-305", "Wrong key in sd");
17566 + reiser4_print_key("sought for", key);
17567 + reiser4_print_key("found", &found_key);
17568 + }
17569 + if (inode->i_nlink == 0) {
17570 + warning("nikita-3559", "Unlinked inode found: %llu\n",
17571 + (unsigned long long)get_inode_oid(inode));
17572 + }
17573 + }
17574 + return inode;
17575 +}
17576 +
17577 +/* reiser4_iget() may return a not fully initialized inode; this function
17578 + * should be called once reiser4-specific inode initialization is complete. */
17579 +void reiser4_iget_complete(struct inode *inode)
17580 +{
17581 + assert("zam-988", is_reiser4_inode(inode));
17582 +
17583 + if (!is_inode_loaded(inode)) {
17584 + reiser4_inode_set_flag(inode, REISER4_LOADED);
17585 + loading_end(reiser4_inode_data(inode));
17586 + }
17587 +}
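
The reiser4_iget()/reiser4_iget_complete() pair above implements a double-checked "load once" protocol: REISER4_LOADED is tested without the mutex, re-tested under info->loading, and the mutex is released only once the flag is set. A minimal userspace sketch of the same protocol, assuming a hypothetical object type and pthreads (this is not code from the patch):

    #include <pthread.h>
    #include <stdio.h>

    struct object {
            pthread_mutex_t loading;        /* plays the role of info->loading */
            int loaded;                     /* plays the role of REISER4_LOADED */
            int data;
    };

    /* analogue of the reiser4_iget() fast/slow paths */
    static void object_get(struct object *obj)
    {
            if (!obj->loaded) {                     /* cheap unlocked check */
                    pthread_mutex_lock(&obj->loading);
                    if (!obj->loaded) {             /* re-check under the mutex */
                            obj->data = 42;         /* expensive "read from disk" */
                            obj->loaded = 1;        /* analogue of reiser4_iget_complete() */
                    }
                    pthread_mutex_unlock(&obj->loading);
            }
    }

    int main(void)
    {
            struct object obj = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

            object_get(&obj);
            printf("loaded=%d data=%d\n", obj.loaded, obj.data);
            return 0;
    }

One difference worth noting: in reiser4_iget() the mutex stays held across the successful return and is only dropped later by reiser4_iget_complete(), whereas the sketch collapses both steps into one function.
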
17588 +
17589 +void reiser4_make_bad_inode(struct inode *inode)
17590 +{
17591 + assert("nikita-1934", inode != NULL);
17592 +
17593 + /* clear LOADED bit */
17594 + reiser4_inode_clr_flag(inode, REISER4_LOADED);
17595 + make_bad_inode(inode);
17596 + return;
17597 +}
17598 +
17599 +file_plugin *inode_file_plugin(const struct inode * inode)
17600 +{
17601 + assert("nikita-1997", inode != NULL);
17602 + return reiser4_inode_data(inode)->pset->file;
17603 +}
17604 +
17605 +dir_plugin *inode_dir_plugin(const struct inode * inode)
17606 +{
17607 + assert("nikita-1998", inode != NULL);
17608 + return reiser4_inode_data(inode)->pset->dir;
17609 +}
17610 +
17611 +formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17612 +{
17613 + assert("nikita-2000", inode != NULL);
17614 + return reiser4_inode_data(inode)->pset->formatting;
17615 +}
17616 +
17617 +hash_plugin *inode_hash_plugin(const struct inode * inode)
17618 +{
17619 + assert("nikita-2001", inode != NULL);
17620 + return reiser4_inode_data(inode)->pset->hash;
17621 +}
17622 +
17623 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17624 +{
17625 + assert("nikita-2001", inode != NULL);
17626 + return reiser4_inode_data(inode)->pset->fibration;
17627 +}
17628 +
17629 +cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17630 +{
17631 + assert("edward-36", inode != NULL);
17632 + return reiser4_inode_data(inode)->pset->cipher;
17633 +}
17634 +
17635 +compression_plugin *inode_compression_plugin(const struct inode * inode)
17636 +{
17637 + assert("edward-37", inode != NULL);
17638 + return reiser4_inode_data(inode)->pset->compression;
17639 +}
17640 +
17641 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17642 + inode)
17643 +{
17644 + assert("edward-1330", inode != NULL);
17645 + return reiser4_inode_data(inode)->pset->compression_mode;
17646 +}
17647 +
17648 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17649 +{
17650 + assert("edward-1328", inode != NULL);
17651 + return reiser4_inode_data(inode)->pset->cluster;
17652 +}
17653 +
17654 +file_plugin *inode_create_plugin(const struct inode * inode)
17655 +{
17656 + assert("edward-1329", inode != NULL);
17657 + return reiser4_inode_data(inode)->pset->create;
17658 +}
17659 +
17660 +digest_plugin *inode_digest_plugin(const struct inode * inode)
17661 +{
17662 + assert("edward-86", inode != NULL);
17663 + return reiser4_inode_data(inode)->pset->digest;
17664 +}
17665 +
17666 +item_plugin *inode_sd_plugin(const struct inode * inode)
17667 +{
17668 + assert("vs-534", inode != NULL);
17669 + return reiser4_inode_data(inode)->pset->sd;
17670 +}
17671 +
17672 +item_plugin *inode_dir_item_plugin(const struct inode * inode)
17673 +{
17674 + assert("vs-534", inode != NULL);
17675 + return reiser4_inode_data(inode)->pset->dir_item;
17676 +}
17677 +
17678 +file_plugin *child_create_plugin(const struct inode * inode)
17679 +{
17680 + assert("edward-1329", inode != NULL);
17681 + return reiser4_inode_data(inode)->hset->create;
17682 +}
17683 +
17684 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17685 +{
17686 + reiser4_inode *state;
17687 +
17688 + assert("nikita-2716", inode != NULL);
17689 + assert("nikita-2717", ext < LAST_SD_EXTENSION);
17690 + assert("nikita-3491", spin_inode_is_locked(inode));
17691 +
17692 + state = reiser4_inode_data(inode);
17693 + state->extmask |= 1 << ext;
17694 + /* force re-calculation of stat-data length on next call to
17695 + update_sd(). */
17696 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17697 +}
17698 +
17699 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17700 +{
17701 + reiser4_inode *state;
17702 +
17703 + assert("vpf-1926", inode != NULL);
17704 + assert("vpf-1927", ext < LAST_SD_EXTENSION);
17705 + assert("vpf-1928", spin_inode_is_locked(inode));
17706 +
17707 + state = reiser4_inode_data(inode);
17708 + state->extmask &= ~(1 << ext);
17709 + /* force re-calculation of stat-data length on next call to
17710 + update_sd(). */
17711 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17712 +}
17713 +
17714 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17715 +{
17716 + assert("edward-1287", inode != NULL);
17717 + if (!dscale_fit(old, new))
17718 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17719 + return;
17720 +}
17721 +
17722 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17723 +{
17724 + assert("nikita-2875", inode != NULL);
17725 + spin_lock_inode(inode);
17726 + inode_check_scale_nolock(inode, old, new);
17727 + spin_unlock_inode(inode);
17728 +}
17729 +
17730 +/*
17731 + * initialize ->ordering field of inode. This field defines how file stat-data
17732 + * and body are ordered within a tree with respect to other objects within the
17733 + * same parent directory.
17734 + */
17735 +void
17736 +init_inode_ordering(struct inode *inode,
17737 + reiser4_object_create_data * crd, int create)
17738 +{
17739 + reiser4_key key;
17740 +
17741 + if (create) {
17742 + struct inode *parent;
17743 +
17744 + parent = crd->parent;
17745 + assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17746 + inode_dir_plugin(parent)->build_entry_key(parent,
17747 + &crd->dentry->d_name,
17748 + &key);
17749 + } else {
17750 + coord_t *coord;
17751 +
17752 + coord = &reiser4_inode_data(inode)->sd_coord;
17753 + coord_clear_iplug(coord);
17754 + /* safe to use ->sd_coord, because node is under long term
17755 + * lock */
17756 + WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17757 + }
17758 +
17759 + set_inode_ordering(inode, get_key_ordering(&key));
17760 +}
17761 +
17762 +znode *inode_get_vroot(struct inode *inode)
17763 +{
17764 + reiser4_block_nr blk;
17765 + znode *result;
17766 +
17767 + spin_lock_inode(inode);
17768 + blk = reiser4_inode_data(inode)->vroot;
17769 + spin_unlock_inode(inode);
17770 + if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17771 + result = zlook(reiser4_tree_by_inode(inode), &blk);
17772 + else
17773 + result = NULL;
17774 + return result;
17775 +}
17776 +
17777 +void inode_set_vroot(struct inode *inode, znode *vroot)
17778 +{
17779 + spin_lock_inode(inode);
17780 + reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17781 + spin_unlock_inode(inode);
17782 +}
17783 +
17784 +#if REISER4_DEBUG
17785 +
17786 +void reiser4_inode_invariant(const struct inode *inode)
17787 +{
17788 + assert("nikita-3077", spin_inode_is_locked(inode));
17789 +}
17790 +
17791 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
17792 +{
17793 + return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17794 + r4_inode->nr_jnodes == 0;
17795 +}
17796 +
17797 +#endif
17798 +
17799 +/* returns 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise */
17800 +/* FIXME: shouldn't it be dir plugin method? */
17801 +int is_dir_empty(const struct inode *dir)
17802 +{
17803 + assert("nikita-1976", dir != NULL);
17804 +
17805 + /* rely on our method to maintain directory i_size being equal to the
17806 + number of entries. */
17807 + return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17808 +}
17809 +
17810 +/* Make Linus happy.
17811 + Local variables:
17812 + c-indentation-style: "K&R"
17813 + mode-name: "LC"
17814 + c-basic-offset: 8
17815 + tab-width: 8
17816 + fill-column: 120
17817 + End:
17818 +*/
17819 diff -urN linux-2.6.22.orig/fs/reiser4/inode.h linux-2.6.22/fs/reiser4/inode.h
17820 --- linux-2.6.22.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17821 +++ linux-2.6.22/fs/reiser4/inode.h 2007-07-29 00:25:34.872695441 +0400
17822 @@ -0,0 +1,449 @@
17823 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17824 +
17825 +/* Inode functions. */
17826 +
17827 +#if !defined( __REISER4_INODE_H__ )
17828 +#define __REISER4_INODE_H__
17829 +
17830 +#include "forward.h"
17831 +#include "debug.h"
17832 +#include "key.h"
17833 +#include "seal.h"
17834 +#include "plugin/plugin.h"
17835 +#include "plugin/file/cryptcompress.h"
17836 +#include "plugin/file/file.h"
17837 +#include "plugin/dir/dir.h"
17838 +#include "plugin/plugin_set.h"
17839 +#include "plugin/security/perm.h"
17840 +#include "vfs_ops.h"
17841 +#include "jnode.h"
17842 +#include "fsdata.h"
17843 +
17844 +#include <linux/types.h> /* for __u?? , ino_t */
17845 +#include <linux/fs.h> /* for struct super_block, struct
17846 + * rw_semaphore, etc */
17847 +#include <linux/spinlock.h>
17848 +#include <asm/types.h>
17849 +
17850 +/* reiser4-specific inode flags. They are "transient" and are not
17851 + supposed to be stored on disk. Used to trace "state" of
17852 + inode
17853 +*/
17854 +typedef enum {
17855 + /* this is light-weight inode, inheriting some state from its
17856 + parent */
17857 + REISER4_LIGHT_WEIGHT = 0,
17858 + /* stat data wasn't yet created */
17859 + REISER4_NO_SD = 1,
17860 + /* internal immutable flag. Currently is only used
17861 + to avoid race condition during file creation.
17862 + See comment in create_object(). */
17863 + REISER4_IMMUTABLE = 2,
17864 + /* inode was read from storage */
17865 + REISER4_LOADED = 3,
17866 + /* this bit is set for symlinks. inode->i_private points to target
17867 + name of symlink. */
17868 + REISER4_GENERIC_PTR_USED = 4,
17869 + /* set if size of stat-data item for this inode is known. If this is
17870 + * set we can avoid recalculating size of stat-data on each update. */
17871 + REISER4_SDLEN_KNOWN = 5,
17872 + /* reiser4_inode->crypt points to the crypto stat */
17873 + REISER4_CRYPTO_STAT_LOADED = 6,
17874 + /* cryptcompress_inode_data points to the secret key */
17875 + REISER4_SECRET_KEY_INSTALLED = 7,
17876 + /* File (possibly) has pages corresponding to the tail items, that
17877 + * were created by ->readpage. It is set by mmap_unix_file() and
17878 + * sendfile_unix_file(). This bit is inspected by write_unix_file and
17879 + * kill-hook of tail items. It is never cleared once set. This bit is
17880 + * modified and inspected under i_mutex. */
17881 + REISER4_HAS_MMAP = 8,
17882 + REISER4_PART_MIXED = 9,
17883 + REISER4_PART_IN_CONV = 10,
17884 + /* This flag indicates that file plugin conversion is in progress */
17885 + REISER4_FILE_CONV_IN_PROGRESS = 11
17886 +} reiser4_file_plugin_flags;
17887 +
17888 +/* state associated with each inode.
17889 + reiser4 inode.
17890 +
17891 + NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17892 + be of the same size. File-system allocates inodes by itself through
17893 + s_op->allocate_inode() method. So, it is possible to adjust size of inode
17894 + at the time of its creation.
17895 +
17896 + Invariants involving parts of this data-type:
17897 +
17898 + [inode->eflushed]
17899 +
17900 +*/
17901 +
17902 +typedef struct reiser4_inode reiser4_inode;
17903 +/* return pointer to reiser4-specific part of inode */
17904 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17905 + /* inode queried */ );
17906 +
17907 +#if BITS_PER_LONG == 64
17908 +
17909 +#define REISER4_INO_IS_OID (1)
17910 +typedef struct {;
17911 +} oid_hi_t;
17912 +
17913 +/* BITS_PER_LONG == 64 */
17914 +#else
17915 +
17916 +#define REISER4_INO_IS_OID (0)
17917 +typedef __u32 oid_hi_t;
17918 +
17919 +/* BITS_PER_LONG == 64 */
17920 +#endif
17921 +
17922 +struct reiser4_inode {
17923 + /* spin lock protecting fields of this structure. */
17924 + spinlock_t guard;
17925 + /* main plugin set that control the file
17926 + (see comments in plugin/plugin_set.c) */
17927 + plugin_set *pset;
17928 + /* plugin set for inheritance
17929 + (see comments in plugin/plugin_set.c) */
17930 + plugin_set *hset;
17931 + /* high 32 bits of object id */
17932 + oid_hi_t oid_hi;
17933 + /* seal for stat-data */
17934 + seal_t sd_seal;
17935 + /* locality id for this file */
17936 + oid_t locality_id;
17937 +#if REISER4_LARGE_KEY
17938 + __u64 ordering;
17939 +#endif
17940 + /* coord of stat-data in sealed node */
17941 + coord_t sd_coord;
17942 +	/* bit-mask of stat-data extensions used by this file */
17943 + __u64 extmask;
17944 + /* bitmask of non-default plugins for this inode */
17945 + __u16 plugin_mask;
17946 + /* bitmask of set heir plugins for this inode. */
17947 + __u16 heir_mask;
17948 + union {
17949 + struct list_head readdir_list;
17950 + struct list_head not_used;
17951 + } lists;
17952 + /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17953 + unsigned long flags;
17954 + union {
17955 + /* fields specific to unix_file plugin */
17956 + struct unix_file_info unix_file_info;
17957 + /* fields specific to cryptcompress file plugin */
17958 + struct cryptcompress_info cryptcompress_info;
17959 + } file_plugin_data;
17960 +
17961 + /* this semaphore is to serialize readers and writers of @pset->file
17962 + * when file plugin conversion is enabled
17963 + */
17964 + struct rw_semaphore conv_sem;
17965 +
17966 +	/* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
17967 + tagged in that tree by EFLUSH_TAG_ANONYMOUS */
17968 + struct radix_tree_root jnodes_tree;
17969 +#if REISER4_DEBUG
17970 + /* number of unformatted node jnodes of this file in jnode hash table */
17971 + unsigned long nr_jnodes;
17972 +#endif
17973 +
17974 + /* block number of virtual root for this object. See comment above
17975 + * fs/reiser4/search.c:handle_vroot() */
17976 + reiser4_block_nr vroot;
17977 + struct mutex loading;
17978 +};
17979 +
17980 +void loading_init_once(reiser4_inode *);
17981 +void loading_alloc(reiser4_inode *);
17982 +void loading_destroy(reiser4_inode *);
17983 +
17984 +struct reiser4_inode_object {
17985 + /* private part */
17986 + reiser4_inode p;
17987 + /* generic fields not specific to reiser4, but used by VFS */
17988 + struct inode vfs_inode;
17989 +};
17990 +
17991 +/* return pointer to the reiser4 specific portion of @inode */
17992 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17993 + /* inode queried */ )
17994 +{
17995 + assert("nikita-254", inode != NULL);
17996 + return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
17997 +}
17998 +
17999 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18000 + r4_inode /* inode queried */
18001 + )
18002 +{
18003 + return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode;
18004 +}
18005 +
18006 +/*
18007 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18008 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18009 + * bits.
18010 + *
18011 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18012 + * of inode, otherwise whole oid is stored in i_ino.
18013 + *
18014 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18015 + */
18016 +
18017 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18018 +
18019 +#if REISER4_INO_IS_OID
18020 +
18021 +static inline oid_t get_inode_oid(const struct inode *inode)
18022 +{
18023 + return inode->i_ino;
18024 +}
18025 +
18026 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18027 +{
18028 + inode->i_ino = oid;
18029 +}
18030 +
18031 +/* REISER4_INO_IS_OID */
18032 +#else
18033 +
18034 +static inline oid_t get_inode_oid(const struct inode *inode)
18035 +{
18036 + return
18037 + ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18038 + inode->i_ino;
18039 +}
18040 +
18041 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18042 +{
18043 + assert("nikita-2519", inode != NULL);
18044 + inode->i_ino = (ino_t) (oid);
18045 + reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18046 + assert("nikita-2521", get_inode_oid(inode) == (oid));
18047 +}
18048 +
18049 +/* REISER4_INO_IS_OID */
18050 +#endif
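
To make the wrappers above concrete, here is a standalone sketch of the 32-bit ino_t case, where the low half of the 64-bit oid lands in i_ino and the high half in oid_hi. The types and names below are hypothetical userspace stand-ins, not part of the patch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define OID_HI_SHIFT_32 (sizeof(uint32_t) * 8)

    struct split_oid {
            uint32_t i_ino;         /* stands in for inode->i_ino */
            uint32_t oid_hi;        /* stands in for reiser4_inode_data(inode)->oid_hi */
    };

    static void set_oid(struct split_oid *s, uint64_t oid)
    {
            s->i_ino = (uint32_t)oid;
            s->oid_hi = (uint32_t)(oid >> OID_HI_SHIFT_32);
    }

    static uint64_t get_oid(const struct split_oid *s)
    {
            return ((uint64_t)s->oid_hi << OID_HI_SHIFT_32) | s->i_ino;
    }

    int main(void)
    {
            struct split_oid s;
            uint64_t oid = 0x123456789abcdef0ULL;

            set_oid(&s, oid);
            assert(get_oid(&s) == oid);     /* mirrors assertion "nikita-2521" */
            printf("ino=%#x hi=%#x\n", s.i_ino, s.oid_hi);
            return 0;
    }
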
18051 +
18052 +static inline oid_t get_inode_locality(const struct inode *inode)
18053 +{
18054 + return reiser4_inode_data(inode)->locality_id;
18055 +}
18056 +
18057 +#if REISER4_LARGE_KEY
18058 +static inline __u64 get_inode_ordering(const struct inode *inode)
18059 +{
18060 + return reiser4_inode_data(inode)->ordering;
18061 +}
18062 +
18063 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18064 +{
18065 + reiser4_inode_data(inode)->ordering = ordering;
18066 +}
18067 +
18068 +#else
18069 +
18070 +#define get_inode_ordering(inode) (0)
18071 +#define set_inode_ordering(inode, val) noop
18072 +
18073 +#endif
18074 +
18075 +/* return inode in which @uf_info is embedded */
18076 +static inline struct inode *
18077 +unix_file_info_to_inode(const struct unix_file_info * uf_info)
18078 +{
18079 + return &container_of(uf_info, struct reiser4_inode_object,
18080 + p.file_plugin_data.unix_file_info)->vfs_inode;
18081 +}
18082 +
18083 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18084 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18085 +
18086 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18087 +
18088 +#if REISER4_DEBUG
18089 +extern void reiser4_inode_invariant(const struct inode *inode);
18090 +extern int inode_has_no_jnodes(reiser4_inode *);
18091 +#else
18092 +#define reiser4_inode_invariant(inode) noop
18093 +#endif
18094 +
18095 +static inline int spin_inode_is_locked(const struct inode *inode)
18096 +{
18097 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18098 + return 1;
18099 +}
18100 +
18101 +/**
18102 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18103 + * @inode: inode to lock
18104 + *
18105 + * In debug mode it checks that lower priority locks are not held and
18106 + * increments reiser4_context's lock counters on which lock ordering checking
18107 + * is based.
18108 + */
18109 +static inline void spin_lock_inode(struct inode *inode)
18110 +{
18111 + assert("", LOCK_CNT_NIL(spin_locked));
18112 + /* check lock ordering */
18113 + assert_spin_not_locked(&d_lock);
18114 +
18115 + spin_lock(&reiser4_inode_data(inode)->guard);
18116 +
18117 + LOCK_CNT_INC(spin_locked_inode);
18118 + LOCK_CNT_INC(spin_locked);
18119 +
18120 + reiser4_inode_invariant(inode);
18121 +}
18122 +
18123 +/**
18124 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18125 + * @inode: inode to unlock
18126 + *
18127 + * In debug mode it checks that spinlock is held and decrements
18128 + * reiser4_context's lock counters on which lock ordering checking is based.
18129 + */
18130 +static inline void spin_unlock_inode(struct inode *inode)
18131 +{
18132 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18133 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18134 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18135 +
18136 + reiser4_inode_invariant(inode);
18137 +
18138 + LOCK_CNT_DEC(spin_locked_inode);
18139 + LOCK_CNT_DEC(spin_locked);
18140 +
18141 + spin_unlock(&reiser4_inode_data(inode)->guard);
18142 +}
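
The LOCK_CNT_* bookkeeping used by these helpers can be modeled in userspace with a per-thread counter: the unlock path can then assert that a lock of the given class is really held, exactly like assertions "nikita-1375" and "nikita-1376" above. A toy sketch using pthreads and hypothetical names (GCC's __thread keyword assumed):

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    static __thread int spin_locked_inode_cnt;  /* per-thread, like the context counters */
    static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;

    static void lock_inode_model(void)
    {
            assert(spin_locked_inode_cnt == 0); /* nothing of this class held yet */
            pthread_mutex_lock(&guard);
            spin_locked_inode_cnt++;
    }

    static void unlock_inode_model(void)
    {
            assert(spin_locked_inode_cnt > 0);  /* unlocking without the lock is a bug */
            spin_locked_inode_cnt--;
            pthread_mutex_unlock(&guard);
    }

    int main(void)
    {
            lock_inode_model();
            unlock_inode_model();
            puts("balanced");
            return 0;
    }
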
18143 +
18144 +extern znode *inode_get_vroot(struct inode *inode);
18145 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18146 +
18147 +extern int reiser4_max_filename_len(const struct inode *inode);
18148 +extern int max_hash_collisions(const struct inode *dir);
18149 +extern void reiser4_unlock_inode(struct inode *inode);
18150 +extern int is_reiser4_inode(const struct inode *inode);
18151 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18152 +extern struct inode *reiser4_iget(struct super_block *super,
18153 + const reiser4_key * key, int silent);
18154 +extern void reiser4_iget_complete(struct inode *inode);
18155 +extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18156 +extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18157 +extern int reiser4_inode_get_flag(const struct inode *inode,
18158 + reiser4_file_plugin_flags f);
18159 +
18160 +/* has inode been initialized? */
18161 +static inline int
18162 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18163 +{
18164 + assert("nikita-1120", inode != NULL);
18165 + return reiser4_inode_get_flag(inode, REISER4_LOADED);
18166 +}
18167 +
18168 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18169 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18170 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18171 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18172 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18173 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18174 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18175 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18176 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18177 + *inode);
18178 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18179 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18180 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18181 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18182 +extern file_plugin *child_create_plugin(const struct inode *inode);
18183 +
18184 +extern void reiser4_make_bad_inode(struct inode *inode);
18185 +
18186 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18187 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18188 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18189 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18190 +
18191 +#define INODE_SET_SIZE(i, value) \
18192 +({ \
18193 + struct inode *__i; \
18194 + typeof(value) __v; \
18195 + \
18196 + __i = (i); \
18197 + __v = (value); \
18198 + inode_check_scale(__i, __i->i_size, __v); \
18199 + i_size_write(__i, __v); \
18200 +})
18201 +
18202 +/*
18203 + * update field @field in inode @i to contain value @value.
18204 + */
18205 +#define INODE_SET_FIELD(i, field, value) \
18206 +({ \
18207 + struct inode *__i; \
18208 + typeof(value) __v; \
18209 + \
18210 + __i = (i); \
18211 + __v = (value); \
18212 + inode_check_scale(__i, __i->field, __v); \
18213 + __i->field = __v; \
18214 +})
18215 +
18216 +#define INODE_INC_FIELD(i, field) \
18217 +({ \
18218 + struct inode *__i; \
18219 + \
18220 + __i = (i); \
18221 + inode_check_scale(__i, __i->field, __i->field + 1); \
18222 + ++ __i->field; \
18223 +})
18224 +
18225 +#define INODE_DEC_FIELD(i, field) \
18226 +({ \
18227 + struct inode *__i; \
18228 + \
18229 + __i = (i); \
18230 + inode_check_scale(__i, __i->field, __i->field - 1); \
18231 + -- __i->field; \
18232 +})
18233 +
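
The INODE_* macros above rely on GCC statement expressions to evaluate @i and @value exactly once, so expressions with side effects are safe to pass in. A cut-down userspace model of the idiom, with hypothetical names and a plain assert standing in for inode_check_scale():

    #include <assert.h>
    #include <stdio.h>

    /* evaluate obj and value exactly once, then use them several times */
    #define SET_SIZE(obj, value)                                    \
    ({                                                              \
            typeof(obj) __o = (obj);                                \
            typeof(value) __v = (value);                            \
            assert(__v >= 0);  /* stands in for inode_check_scale() */ \
            __o->size = __v;                                        \
    })

    struct thing {
            long size;
    };

    int main(void)
    {
            struct thing t = { .size = 0 };
            struct thing *p = &t;
            int calls = 0;

            /* ++calls runs once even though __v is used twice in the macro */
            SET_SIZE(p, ++calls);
            printf("size=%ld calls=%d\n", t.size, calls);
            return 0;
    }
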
18234 +/* See comment before reiser4_readdir_common() for description. */
18235 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18236 +{
18237 + return &reiser4_inode_data(inode)->lists.readdir_list;
18238 +}
18239 +
18240 +extern void init_inode_ordering(struct inode *inode,
18241 + reiser4_object_create_data * crd, int create);
18242 +
18243 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18244 +{
18245 + return &reiser4_inode_data(inode)->jnodes_tree;
18246 +}
18247 +
18248 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18249 + * r4_inode)
18250 +{
18251 + return &r4_inode->jnodes_tree;
18252 +}
18253 +
18254 +#if REISER4_DEBUG
18255 +extern void print_inode(const char *prefix, const struct inode *i);
18256 +#endif
18257 +
18258 +int is_dir_empty(const struct inode *);
18259 +
18260 +/* __REISER4_INODE_H__ */
18261 +#endif
18262 +
18263 +/* Make Linus happy.
18264 + Local variables:
18265 + c-indentation-style: "K&R"
18266 + mode-name: "LC"
18267 + c-basic-offset: 8
18268 + tab-width: 8
18269 + fill-column: 120
18270 + End:
18271 +*/
18272 diff -urN linux-2.6.22.orig/fs/reiser4/ioctl.h linux-2.6.22/fs/reiser4/ioctl.h
18273 --- linux-2.6.22.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18274 +++ linux-2.6.22/fs/reiser4/ioctl.h 2007-07-29 00:25:34.872695441 +0400
18275 @@ -0,0 +1,41 @@
18276 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18277 + * reiser4/README */
18278 +
18279 +#if !defined( __REISER4_IOCTL_H__ )
18280 +#define __REISER4_IOCTL_H__
18281 +
18282 +#include <linux/fs.h>
18283 +
18284 +/*
18285 + * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18286 + * extents and fix it in this state. This is used by applications that rely on
18287 + *
18288 + * . files being block aligned, and
18289 + *
18290 + * . files never migrating on disk
18291 + *
18292 + * for example, boot loaders (LILO) need this.
18293 + *
18294 + * This ioctl should be used as
18295 + *
18296 + * result = ioctl(fd, REISER4_IOC_UNPACK);
18297 + *
18298 + * The file behind the fd descriptor will be converted to extents (if necessary),
18299 + * and its stat-data will be updated so that it will never be converted back
18300 + * into tails again.
18301 + */
18302 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18303 +
18304 +/* __REISER4_IOCTL_H__ */
18305 +#endif
18306 +
18307 +/* Make Linus happy.
18308 + Local variables:
18309 + c-indentation-style: "K&R"
18310 + mode-name: "LC"
18311 + c-basic-offset: 8
18312 + tab-width: 8
18313 + fill-column: 120
18314 + scroll-step: 1
18315 + End:
18316 +*/
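
A userspace caller of this ioctl might look like the sketch below; it is untested here and, of course, only does something useful on a file that lives on a mounted reiser4 filesystem:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define REISER4_IOC_UNPACK _IOW(0xCD, 1, long)  /* same value as in ioctl.h above */

    int main(int argc, char **argv)
    {
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <file-on-reiser4>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* convert the file to extents and pin it in that state */
            if (ioctl(fd, REISER4_IOC_UNPACK) != 0) {
                    perror("ioctl(REISER4_IOC_UNPACK)");
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }
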
18317 diff -urN linux-2.6.22.orig/fs/reiser4/jnode.c linux-2.6.22/fs/reiser4/jnode.c
18318 --- linux-2.6.22.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18319 +++ linux-2.6.22/fs/reiser4/jnode.c 2007-07-29 00:25:34.876696477 +0400
18320 @@ -0,0 +1,1924 @@
18321 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18322 + * reiser4/README */
18323 +/* Jnode manipulation functions. */
18324 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18325 +
18326 + In particular, jnodes are used to track transactional information
18327 + associated with each block. Each znode contains jnode as ->zjnode field.
18328 +
18329 + Jnode stands for either Josh or Journal node.
18330 +*/
18331 +
18332 +/*
18333 + * Taxonomy.
18334 + *
18335 + * Jnode represents block containing data or meta-data. There are jnodes
18336 + * for:
18337 + *
18338 + *     unformatted blocks (jnodes proper). There are plans, however, to
18339 + *     have a handle per extent unit rather than one for each unformatted
18340 + * block, because there are so many of them.
18341 + *
18342 + * For bitmaps. Each bitmap is actually represented by two jnodes--one
18343 + * for working and another for "commit" data, together forming bnode.
18344 + *
18345 + * For io-heads. These are used by log writer.
18346 + *
18347 + * For formatted nodes (znode). See comment at the top of znode.c for
18348 + * details specific to the formatted nodes (znodes).
18349 + *
18350 + * Node data.
18351 + *
18352 + * Jnode provides access to the data of node it represents. Data are
18353 + * stored in a page. Page is kept in a page cache. This means, that jnodes
18354 + * are highly interconnected with page cache and VM internals.
18355 + *
18356 + * jnode has a pointer to page (->pg) containing its data. Pointer to data
18357 + * themselves is cached in ->data field to avoid frequent calls to
18358 + * page_address().
18359 + *
18360 + * jnode and page are attached to each other by jnode_attach_page(). This
18361 + * function places pointer to jnode in set_page_private(), sets PG_private
18362 + * flag and increments page counter.
18363 + *
18364 + * Opposite operation is performed by page_clear_jnode().
18365 + *
18366 + * jnode->pg is protected by jnode spin lock, and page->private is
18367 + * protected by page lock. See comment at the top of page_cache.c for
18368 + * more.
18369 + *
18370 + * page can be detached from jnode for two reasons:
18371 + *
18372 + *    . jnode is removed from a tree (file is truncated, or formatted
18373 + * node is removed by balancing).
18374 + *
18375 + * . during memory pressure, VM calls ->releasepage() method
18376 + * (reiser4_releasepage()) to evict page from memory.
18377 + *
18378 + * (there, of course, is also umount, but this is special case we are not
18379 + * concerned with here).
18380 + *
18381 + * To protect jnode page from eviction, one calls jload() function that
18382 + * "pins" page in memory (loading it if necessary), increments
18383 + * jnode->d_count, and kmap()s page. Page is unpinned through call to
18384 + * jrelse().
18385 + *
18386 + * Jnode life cycle.
18387 + *
18388 + * jnode is created, placed in hash table, and, optionally, in per-inode
18389 + * radix tree. Page can be attached to jnode, pinned, released, etc.
18390 + *
18391 + * When jnode is captured into atom its reference counter is
18392 + * increased. While being part of an atom, jnode can be "early
18393 + * flushed". This means that as part of flush procedure, jnode is placed
18394 + * into "relocate set", and its page is submitted to the disk. After io
18395 + * completes, page can be detached, then loaded again, re-dirtied, etc.
18396 + *
18397 + *    A thread acquires a reference to a jnode by calling jref() and releases it by
18398 + * jput(). When last reference is removed, jnode is still retained in
18399 + * memory (cached) if it has page attached, _unless_ it is scheduled for
18400 + * destruction (has JNODE_HEARD_BANSHEE bit set).
18401 + *
18402 + *    The tree read-write lock was used as an "existential" lock for jnodes. That
18403 + *    is, jnode->x_count could be changed from 0 to 1 only under the tree write
18404 + *    lock; in other words, the tree lock protected unreferenced jnodes stored
18405 + *    in the hash table from recycling.
18406 + *
18407 + * This resulted in high contention on tree lock, because jref()/jput() is
18408 + * frequent operation. To ameliorate this problem, RCU is used: when jput()
18409 + * is just about to release last reference on jnode it sets JNODE_RIP bit
18410 + * on it, and then proceed with jnode destruction (removing jnode from hash
18411 + * table, cbk_cache, detaching page, etc.). All places that change jnode
18412 + * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18413 + * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18414 + * jnode_rip_check() function), and pretend that nothing was found in hash
18415 + * table if bit is set.
18416 + *
18417 + *    jput defers the actual return of a jnode into the slab cache to some later
18418 + *    time (by call_rcu()); this guarantees that other threads can safely
18419 + *    continue working with a JNODE_RIP-ped jnode.
18420 + *
18421 + */
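
The jref()-then-rip-check sequence described above can be modeled in a few lines of userspace C11: a lookup optimistically takes a reference and only then checks whether destruction has already begun, backing off if it has. The names below are hypothetical; only the protocol mirrors the code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct node {
            atomic_int x_count;     /* reference counter */
            atomic_bool rip;        /* analogue of JNODE_RIP */
    };

    /* analogue of jnode_rip_check(): pretend nothing was found if the node
       is already being destroyed, dropping the speculative reference */
    static struct node *rip_check(struct node *n)
    {
            if (atomic_load(&n->rip)) {
                    atomic_fetch_sub(&n->x_count, 1);       /* "jput" */
                    return NULL;
            }
            return n;
    }

    static struct node *lookup(struct node *hashed)
    {
            /* the real code does this under rcu_read_lock() */
            atomic_fetch_add(&hashed->x_count, 1);          /* "jref" */
            return rip_check(hashed);
    }

    int main(void)
    {
            struct node n = { 0, false };

            printf("live lookup: %p\n", (void *)lookup(&n));
            atomic_store(&n.rip, true);                     /* destruction has begun */
            printf("rip lookup:  %p\n", (void *)lookup(&n));
            return 0;
    }
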
18422 +
18423 +#include "reiser4.h"
18424 +#include "debug.h"
18425 +#include "dformat.h"
18426 +#include "jnode.h"
18427 +#include "plugin/plugin_header.h"
18428 +#include "plugin/plugin.h"
18429 +#include "txnmgr.h"
18430 +/*#include "jnode.h"*/
18431 +#include "znode.h"
18432 +#include "tree.h"
18433 +#include "tree_walk.h"
18434 +#include "super.h"
18435 +#include "inode.h"
18436 +#include "page_cache.h"
18437 +
18438 +#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18439 +#include <linux/types.h>
18440 +#include <linux/slab.h>
18441 +#include <linux/pagemap.h>
18442 +#include <linux/swap.h>
18443 +#include <linux/fs.h> /* for struct address_space */
18444 +#include <linux/writeback.h> /* for inode_lock */
18445 +
18446 +static struct kmem_cache *_jnode_slab = NULL;
18447 +
18448 +static void jnode_set_type(jnode * node, jnode_type type);
18449 +static int jdelete(jnode * node);
18450 +static int jnode_try_drop(jnode * node);
18451 +
18452 +#if REISER4_DEBUG
18453 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18454 +#endif
18455 +
18456 +/* true if valid page is attached to jnode */
18457 +static inline int jnode_is_parsed(jnode * node)
18458 +{
18459 + return JF_ISSET(node, JNODE_PARSED);
18460 +}
18461 +
18462 +/* hash table support */
18463 +
18464 +/* compare two jnode keys for equality. Used by hash-table macros */
18465 +static inline int jnode_key_eq(const struct jnode_key * k1,
18466 + const struct jnode_key * k2)
18467 +{
18468 + assert("nikita-2350", k1 != NULL);
18469 + assert("nikita-2351", k2 != NULL);
18470 +
18471 + return (k1->index == k2->index && k1->objectid == k2->objectid);
18472 +}
18473 +
18474 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18475 +static inline __u32 jnode_key_hashfn(j_hash_table * table,
18476 + const struct jnode_key * key)
18477 +{
18478 + assert("nikita-2352", key != NULL);
18479 + assert("nikita-3346", IS_POW(table->_buckets));
18480 +
18481 +	/* yes, this is a remarkably simple (where not stupid) hash function. */
18482 + return (key->objectid + key->index) & (table->_buckets - 1);
18483 +}
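
The masking in jnode_key_hashfn() behaves like a modulo only when _buckets is a power of two, which is what the IS_POW assertion enforces: only then is (buckets - 1) an all-ones bit mask. Restated as a standalone sketch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IS_POW(n) ((n) != 0 && ((n) & ((n) - 1)) == 0)

    static uint32_t hashfn(uint64_t objectid, uint64_t index, uint32_t buckets)
    {
            /* with a power of two, (buckets - 1) keeps exactly the low bits */
            assert(IS_POW(buckets));
            return (uint32_t)((objectid + index) & (buckets - 1));
    }

    int main(void)
    {
            printf("%u\n", hashfn(0x29aULL, 7, 16384));
            return 0;
    }
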
18484 +
18485 +/* The hash table definition */
18486 +#define KMALLOC(size) reiser4_vmalloc(size)
18487 +#define KFREE(ptr, size) vfree(ptr)
18488 +TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18489 + jnode_key_hashfn, jnode_key_eq);
18490 +#undef KFREE
18491 +#undef KMALLOC
18492 +
18493 +/* call this to initialise jnode hash table */
18494 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18495 +{
18496 + assert("nikita-2359", tree != NULL);
18497 + return j_hash_init(&tree->jhash_table, 16384);
18498 +}
18499 +
18500 +/* call this to destroy jnode hash table. This is called during umount. */
18501 +int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18502 +{
18503 + j_hash_table *jtable;
18504 + jnode *node;
18505 + jnode *next;
18506 +
18507 + assert("nikita-2360", tree != NULL);
18508 +
18509 + /*
18510 + * Scan hash table and free all jnodes.
18511 + */
18512 + jtable = &tree->jhash_table;
18513 + if (jtable->_table) {
18514 + for_all_in_htable(jtable, j, node, next) {
18515 + assert("nikita-2361", !atomic_read(&node->x_count));
18516 + jdrop(node);
18517 + }
18518 +
18519 + j_hash_done(&tree->jhash_table);
18520 + }
18521 + return 0;
18522 +}
18523 +
18524 +/**
18525 + * init_jnodes - create jnode cache
18526 + *
18527 + * Initializes the jnode slab cache. It is part of reiser4 module initialization.
18528 + */
18529 +int init_jnodes(void)
18530 +{
18531 + assert("umka-168", _jnode_slab == NULL);
18532 +
18533 + _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18534 + SLAB_HWCACHE_ALIGN |
18535 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
18536 + if (_jnode_slab == NULL)
18537 + return RETERR(-ENOMEM);
18538 +
18539 + return 0;
18540 +}
18541 +
18542 +/**
18543 + * done_jnodes - delete jnode cache
18544 + *
18545 + * This is called on reiser4 module unloading or system shutdown.
18546 + */
18547 +void done_jnodes(void)
18548 +{
18549 + destroy_reiser4_cache(&_jnode_slab);
18550 +}
18551 +
18552 +/* Initialize a jnode. */
18553 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18554 +{
18555 + assert("umka-175", node != NULL);
18556 +
18557 + memset(node, 0, sizeof(jnode));
18558 + ON_DEBUG(node->magic = JMAGIC);
18559 + jnode_set_type(node, type);
18560 + atomic_set(&node->d_count, 0);
18561 + atomic_set(&node->x_count, 0);
18562 + spin_lock_init(&node->guard);
18563 + spin_lock_init(&node->load);
18564 + node->atom = NULL;
18565 + node->tree = tree;
18566 + INIT_LIST_HEAD(&node->capture_link);
18567 +
18568 + ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18569 +
18570 + INIT_RCU_HEAD(&node->rcu);
18571 +
18572 +#if REISER4_DEBUG
18573 + {
18574 + reiser4_super_info_data *sbinfo;
18575 +
18576 + sbinfo = get_super_private(tree->super);
18577 + spin_lock_irq(&sbinfo->all_guard);
18578 + list_add(&node->jnodes, &sbinfo->all_jnodes);
18579 + spin_unlock_irq(&sbinfo->all_guard);
18580 + }
18581 +#endif
18582 +}
18583 +
18584 +#if REISER4_DEBUG
18585 +/*
18586 + * Remove jnode from ->all_jnodes list.
18587 + */
18588 +static void jnode_done(jnode * node, reiser4_tree * tree)
18589 +{
18590 + reiser4_super_info_data *sbinfo;
18591 +
18592 + sbinfo = get_super_private(tree->super);
18593 +
18594 + spin_lock_irq(&sbinfo->all_guard);
18595 + assert("nikita-2422", !list_empty(&node->jnodes));
18596 + list_del_init(&node->jnodes);
18597 + spin_unlock_irq(&sbinfo->all_guard);
18598 +}
18599 +#endif
18600 +
18601 +/* return already existing jnode of page */
18602 +jnode *jnode_by_page(struct page *pg)
18603 +{
18604 + assert("nikita-2066", pg != NULL);
18605 + assert("nikita-2400", PageLocked(pg));
18606 + assert("nikita-2068", PagePrivate(pg));
18607 + assert("nikita-2067", jprivate(pg) != NULL);
18608 + return jprivate(pg);
18609 +}
18610 +
18611 +/* exported functions to allocate/free jnode objects outside this file */
18612 +jnode *jalloc(void)
18613 +{
18614 + jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18615 + return jal;
18616 +}
18617 +
18618 +/* return jnode back to the slab allocator */
18619 +inline void jfree(jnode * node)
18620 +{
18621 + assert("zam-449", node != NULL);
18622 +
18623 + assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18624 + NODE_LIST(node) == NOT_CAPTURED));
18625 + assert("nikita-3222", list_empty(&node->jnodes));
18626 + assert("nikita-3221", jnode_page(node) == NULL);
18627 +
18628 + /* not yet phash_jnode_destroy(node); */
18629 +
18630 + kmem_cache_free(_jnode_slab, node);
18631 +}
18632 +
18633 +/*
18634 + * This function is supplied as RCU callback. It actually frees jnode when
18635 + * last reference to it is gone.
18636 + */
18637 +static void jnode_free_actor(struct rcu_head *head)
18638 +{
18639 + jnode *node;
18640 + jnode_type jtype;
18641 +
18642 + node = container_of(head, jnode, rcu);
18643 + jtype = jnode_get_type(node);
18644 +
18645 + ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18646 +
18647 + switch (jtype) {
18648 + case JNODE_IO_HEAD:
18649 + case JNODE_BITMAP:
18650 + case JNODE_UNFORMATTED_BLOCK:
18651 + jfree(node);
18652 + break;
18653 + case JNODE_FORMATTED_BLOCK:
18654 + zfree(JZNODE(node));
18655 + break;
18656 + case JNODE_INODE:
18657 + default:
18658 + wrong_return_value("nikita-3197", "Wrong jnode type");
18659 + }
18660 +}
18661 +
18662 +/*
18663 + * Free a jnode. Post a callback to be executed later through RCU when all
18664 + * references to @node are released.
18665 + */
18666 +static inline void jnode_free(jnode * node, jnode_type jtype)
18667 +{
18668 + if (jtype != JNODE_INODE) {
18669 + /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18670 + call_rcu(&node->rcu, jnode_free_actor);
18671 + } else
18672 + jnode_list_remove(node);
18673 +}
18674 +
18675 +/* allocate new unformatted jnode */
18676 +static jnode *jnew_unformatted(void)
18677 +{
18678 + jnode *jal;
18679 +
18680 + jal = jalloc();
18681 + if (jal == NULL)
18682 + return NULL;
18683 +
18684 + jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18685 + jal->key.j.mapping = NULL;
18686 + jal->key.j.index = (unsigned long)-1;
18687 + jal->key.j.objectid = 0;
18688 + return jal;
18689 +}
18690 +
18691 +/* look for jnode with given objectid and index within hash table */
18692 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18693 +{
18694 + struct jnode_key jkey;
18695 + jnode *node;
18696 +
18697 + assert("nikita-2353", tree != NULL);
18698 +
18699 + jkey.objectid = objectid;
18700 + jkey.index = index;
18701 +
18702 + /*
18703 + * hash table is _not_ protected by any lock during lookups. All we
18704 + * have to do is to disable preemption to keep RCU happy.
18705 + */
18706 +
18707 + rcu_read_lock();
18708 + node = j_hash_find(&tree->jhash_table, &jkey);
18709 + if (node != NULL) {
18710 + /* protect @node from recycling */
18711 + jref(node);
18712 + assert("nikita-2955", jnode_invariant(node, 0, 0));
18713 + node = jnode_rip_check(tree, node);
18714 + }
18715 + rcu_read_unlock();
18716 + return node;
18717 +}
18718 +
18719 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18720 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18721 +{
18722 + assert("vs-1694", mapping->host != NULL);
18723 +
18724 + return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18725 +}
18726 +
18727 +jnode *jfind(struct address_space * mapping, unsigned long index)
18728 +{
18729 + reiser4_tree *tree;
18730 + jnode *node;
18731 +
18732 + assert("vs-1694", mapping->host != NULL);
18733 + tree = reiser4_tree_by_inode(mapping->host);
18734 +
18735 + read_lock_tree(tree);
18736 + node = jfind_nolock(mapping, index);
18737 + if (node != NULL)
18738 + jref(node);
18739 + read_unlock_tree(tree);
18740 + return node;
18741 +}
18742 +
18743 +static void inode_attach_jnode(jnode * node)
18744 +{
18745 + struct inode *inode;
18746 + reiser4_inode *info;
18747 + struct radix_tree_root *rtree;
18748 +
18749 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18750 + assert("zam-1043", node->key.j.mapping != NULL);
18751 + inode = node->key.j.mapping->host;
18752 + info = reiser4_inode_data(inode);
18753 + rtree = jnode_tree_by_reiser4_inode(info);
18754 + if (rtree->rnode == NULL) {
18755 + /* prevent inode from being pruned when it has jnodes attached
18756 + to it */
18757 + write_lock_irq(&inode->i_data.tree_lock);
18758 + inode->i_data.nrpages++;
18759 + write_unlock_irq(&inode->i_data.tree_lock);
18760 + }
18761 + assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18762 + check_me("zam-1045",
18763 + !radix_tree_insert(rtree, node->key.j.index, node));
18764 + ON_DEBUG(info->nr_jnodes++);
18765 +}
18766 +
18767 +static void inode_detach_jnode(jnode * node)
18768 +{
18769 + struct inode *inode;
18770 + reiser4_inode *info;
18771 + struct radix_tree_root *rtree;
18772 +
18773 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18774 + assert("zam-1044", node->key.j.mapping != NULL);
18775 + inode = node->key.j.mapping->host;
18776 + info = reiser4_inode_data(inode);
18777 + rtree = jnode_tree_by_reiser4_inode(info);
18778 +
18779 + assert("zam-1051", info->nr_jnodes != 0);
18780 + assert("zam-1052", rtree->rnode != NULL);
18781 + ON_DEBUG(info->nr_jnodes--);
18782 +
18783 + /* delete jnode from inode's radix tree of jnodes */
18784 + check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18785 + if (rtree->rnode == NULL) {
18786 + /* inode can be pruned now */
18787 + write_lock_irq(&inode->i_data.tree_lock);
18788 + inode->i_data.nrpages--;
18789 + write_unlock_irq(&inode->i_data.tree_lock);
18790 + }
18791 +}
18792 +
18793 +/* put jnode into the hash table (where it can be found by flush, which does
18794 +   not know the mapping) and into the inode's tree of jnodes (where it can be found (hopefully
18795 + faster) in places where mapping is known). Currently it is used by
18796 + fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
18797 + created */
18798 +static void
18799 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18800 + unsigned long index)
18801 +{
18802 + j_hash_table *jtable;
18803 +
18804 + assert("vs-1446", jnode_is_unformatted(node));
18805 + assert("vs-1442", node->key.j.mapping == 0);
18806 + assert("vs-1443", node->key.j.objectid == 0);
18807 + assert("vs-1444", node->key.j.index == (unsigned long)-1);
18808 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18809 +
18810 + node->key.j.mapping = mapping;
18811 + node->key.j.objectid = get_inode_oid(mapping->host);
18812 + node->key.j.index = index;
18813 +
18814 + jtable = &jnode_get_tree(node)->jhash_table;
18815 +
18816 + /* race with some other thread inserting jnode into the hash table is
18817 + * impossible, because we keep the page lock. */
18818 + /*
18819 + * following assertion no longer holds because of RCU: it is possible
18820 + * jnode is in the hash table, but with JNODE_RIP bit set.
18821 + */
18822 + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18823 + j_hash_insert_rcu(jtable, node);
18824 + inode_attach_jnode(node);
18825 +}
18826 +
18827 +static void unhash_unformatted_node_nolock(jnode * node)
18828 +{
18829 + assert("vs-1683", node->key.j.mapping != NULL);
18830 + assert("vs-1684",
18831 + node->key.j.objectid ==
18832 + get_inode_oid(node->key.j.mapping->host));
18833 +
18834 + /* remove jnode from hash-table */
18835 + j_hash_remove_rcu(&node->tree->jhash_table, node);
18836 + inode_detach_jnode(node);
18837 + node->key.j.mapping = NULL;
18838 + node->key.j.index = (unsigned long)-1;
18839 + node->key.j.objectid = 0;
18840 +
18841 +}
18842 +
18843 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18844 + reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18845 + reiser4_uncapture_jnode */
18846 +void unhash_unformatted_jnode(jnode * node)
18847 +{
18848 + assert("vs-1445", jnode_is_unformatted(node));
18849 +
18850 + write_lock_tree(node->tree);
18851 + unhash_unformatted_node_nolock(node);
18852 + write_unlock_tree(node->tree);
18853 +}
18854 +
18855 +/*
18856 + * search hash table for a jnode with given oid and index. If not found,
18857 + * allocate new jnode, insert it, and also insert into radix tree for the
18858 + * given inode/mapping.
18859 + */
18860 +static jnode *find_get_jnode(reiser4_tree * tree,
18861 + struct address_space *mapping,
18862 + oid_t oid, unsigned long index)
18863 +{
18864 + jnode *result;
18865 + jnode *shadow;
18866 + int preload;
18867 +
18868 + result = jnew_unformatted();
18869 +
18870 + if (unlikely(result == NULL))
18871 + return ERR_PTR(RETERR(-ENOMEM));
18872 +
18873 + preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18874 +	if (preload != 0) {
18875 +		/* do not leak the jnode allocated above */
18876 +		jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18877 +		return ERR_PTR(preload);
18878 +	}
18876 +
18877 + write_lock_tree(tree);
18878 + shadow = jfind_nolock(mapping, index);
18879 + if (likely(shadow == NULL)) {
18880 + /* add new jnode to hash table and inode's radix tree of jnodes */
18881 + jref(result);
18882 + hash_unformatted_jnode(result, mapping, index);
18883 + } else {
18884 + /* jnode is found in inode's radix tree of jnodes */
18885 + jref(shadow);
18886 + jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18887 + assert("vs-1498", shadow->key.j.mapping == mapping);
18888 + result = shadow;
18889 + }
18890 + write_unlock_tree(tree);
18891 +
18892 + assert("nikita-2955",
18893 + ergo(result != NULL, jnode_invariant(result, 0, 0)));
18894 + radix_tree_preload_end();
18895 + return result;
18896 +}
18897 +
18898 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18899 + creates) jnode corresponding to page @pg. jnode is attached to page and
18900 + inserted into jnode hash-table. */
18901 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18902 +{
18903 + /*
18904 + * There are two ways to create jnode: starting with pre-existing page
18905 + * and without page.
18906 + *
18907 + * When page already exists, jnode is created
18908 + * (jnode_of_page()->do_jget()) under page lock. This is done in
18909 + * ->writepage(), or when capturing anonymous page dirtied through
18910 + * mmap.
18911 + *
18912 + * Jnode without page is created by index_extent_jnode().
18913 + *
18914 + */
18915 +
18916 + jnode *result;
18917 + oid_t oid = get_inode_oid(pg->mapping->host);
18918 +
18919 + assert("umka-176", pg != NULL);
18920 + assert("nikita-2394", PageLocked(pg));
18921 +
18922 + result = jprivate(pg);
18923 + if (likely(result != NULL))
18924 + return jref(result);
18925 +
18926 + tree = reiser4_tree_by_page(pg);
18927 +
18928 + /* check hash-table first */
18929 + result = jfind(pg->mapping, pg->index);
18930 + if (unlikely(result != NULL)) {
18931 + spin_lock_jnode(result);
18932 + jnode_attach_page(result, pg);
18933 + spin_unlock_jnode(result);
18934 + result->key.j.mapping = pg->mapping;
18935 + return result;
18936 + }
18937 +
18938 + /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18939 + reiser4_ctx_gfp_mask_force(GFP_NOFS);
18940 + result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18941 + if (unlikely(IS_ERR(result)))
18942 + return result;
18943 + /* attach jnode to page */
18944 + spin_lock_jnode(result);
18945 + jnode_attach_page(result, pg);
18946 + spin_unlock_jnode(result);
18947 + return result;
18948 +}
18949 +
18950 +/*
18951 + * return jnode for @pg, creating it if necessary.
18952 + */
18953 +jnode *jnode_of_page(struct page * pg)
18954 +{
18955 + jnode *result;
18956 +
18957 + assert("umka-176", pg != NULL);
18958 + assert("nikita-2394", PageLocked(pg));
18959 +
18960 + result = do_jget(reiser4_tree_by_page(pg), pg);
18961 +
18962 + if (REISER4_DEBUG && !IS_ERR(result)) {
18963 + assert("nikita-3210", result == jprivate(pg));
18964 + assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18965 + if (jnode_is_unformatted(jprivate(pg))) {
18966 + assert("nikita-2364",
18967 + jprivate(pg)->key.j.index == pg->index);
18968 + assert("nikita-2367",
18969 + jprivate(pg)->key.j.mapping == pg->mapping);
18970 + assert("nikita-2365",
18971 + jprivate(pg)->key.j.objectid ==
18972 + get_inode_oid(pg->mapping->host));
18973 + assert("vs-1200",
18974 + jprivate(pg)->key.j.objectid ==
18975 + pg->mapping->host->i_ino);
18976 + assert("nikita-2356",
18977 + jnode_is_unformatted(jnode_by_page(pg)));
18978 + }
18979 + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
18980 + }
18981 + return result;
18982 +}
18983 +
18984 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
18985 + * page.*/
18986 +void jnode_attach_page(jnode * node, struct page *pg)
18987 +{
18988 + assert("nikita-2060", node != NULL);
18989 + assert("nikita-2061", pg != NULL);
18990 +
18991 + assert("nikita-2050", jprivate(pg) == 0ul);
18992 + assert("nikita-2393", !PagePrivate(pg));
18993 + assert("vs-1741", node->pg == NULL);
18994 +
18995 + assert("nikita-2396", PageLocked(pg));
18996 + assert_spin_locked(&(node->guard));
18997 +
18998 + page_cache_get(pg);
18999 + set_page_private(pg, (unsigned long)node);
19000 + node->pg = pg;
19001 + SetPagePrivate(pg);
19002 +}
19003 +
19004 +/* Dual to jnode_attach_page: break a binding between page and jnode */
19005 +void page_clear_jnode(struct page *page, jnode * node)
19006 +{
19007 + assert("nikita-2424", page != NULL);
19008 + assert("nikita-2425", PageLocked(page));
19009 + assert("nikita-2426", node != NULL);
19010 + assert_spin_locked(&(node->guard));
19011 + assert("nikita-2428", PagePrivate(page));
19012 +
19013 + assert("nikita-3551", !PageWriteback(page));
19014 +
19015 + JF_CLR(node, JNODE_PARSED);
19016 + set_page_private(page, 0ul);
19017 + ClearPagePrivate(page);
19018 + node->pg = NULL;
19019 + page_cache_release(page);
19020 +}
19021 +
19022 +#if 0
19023 +/* it is only used in one place to handle error */
19024 +void
19025 +page_detach_jnode(struct page *page, struct address_space *mapping,
19026 + unsigned long index)
19027 +{
19028 + assert("nikita-2395", page != NULL);
19029 +
19030 + lock_page(page);
19031 + if ((page->mapping == mapping) && (page->index == index)
19032 + && PagePrivate(page)) {
19033 + jnode *node;
19034 +
19035 + node = jprivate(page);
19036 + spin_lock_jnode(node);
19037 + page_clear_jnode(page, node);
19038 + spin_unlock_jnode(node);
19039 + }
19040 + unlock_page(page);
19041 +}
19042 +#endif /* 0 */
19043 +
19044 +/* return @node page locked.
19045 +
19046 +   Lock ordering requires that one first takes the page lock and afterwards the
19047 +   spin lock on the node attached to this page. Sometimes it is necessary to go in
19048 + the opposite direction. This is done through standard trylock-and-release
19049 + loop.
19050 +*/
19051 +static struct page *jnode_lock_page(jnode * node)
19052 +{
19053 + struct page *page;
19054 +
19055 + assert("nikita-2052", node != NULL);
19056 + assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19057 +
19058 + while (1) {
19059 +
19060 + spin_lock_jnode(node);
19061 + page = jnode_page(node);
19062 + if (page == NULL) {
19063 + break;
19064 + }
19065 +
19066 + /* no need to page_cache_get( page ) here, because page cannot
19067 + be evicted from memory without detaching it from jnode and
19068 + this requires spin lock on jnode that we already hold.
19069 + */
19070 + if (!TestSetPageLocked(page)) {
19071 + /* We won a lock on jnode page, proceed. */
19072 + break;
19073 + }
19074 +
19075 + /* Page is locked by someone else. */
19076 + page_cache_get(page);
19077 + spin_unlock_jnode(node);
19078 + wait_on_page_locked(page);
19079 + /* it is possible that page was detached from jnode and
19080 + returned to the free pool, or re-assigned while we were
19081 + waiting on locked bit. This will be rechecked on the next
19082 + loop iteration.
19083 + */
19084 + page_cache_release(page);
19085 +
19086 + /* try again */
19087 + }
19088 + return page;
19089 +}
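
The trylock-and-release loop in jnode_lock_page() is the standard answer to taking two locks against their canonical order. A userspace model with two mutexes (hypothetical names; the real code sleeps in wait_on_page_locked() rather than yielding):

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;   /* normally taken first */
    static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;   /* normally taken second */

    /* acquire both locks starting from node_lock without risking deadlock */
    static void lock_in_reverse_order(void)
    {
            while (1) {
                    pthread_mutex_lock(&node_lock);
                    if (pthread_mutex_trylock(&page_lock) == 0)
                            break;          /* won both locks */
                    /* page_lock is busy: drop node_lock, let its owner run, retry */
                    pthread_mutex_unlock(&node_lock);
                    sched_yield();
            }
    }

    int main(void)
    {
            lock_in_reverse_order();
            puts("acquired both locks against the canonical order");
            pthread_mutex_unlock(&page_lock);
            pthread_mutex_unlock(&node_lock);
            return 0;
    }
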
19090 +
19091 +/*
19092 + * if JNODE_PARSED bit is not set, call ->parse() method of jnode to verify
19093 + * validity of jnode content.
19094 + */
19095 +static inline int jparse(jnode * node)
19096 +{
19097 + int result;
19098 +
19099 + assert("nikita-2466", node != NULL);
19100 +
19101 + spin_lock_jnode(node);
19102 + if (likely(!jnode_is_parsed(node))) {
19103 + result = jnode_ops(node)->parse(node);
19104 + if (likely(result == 0))
19105 + JF_SET(node, JNODE_PARSED);
19106 + } else
19107 + result = 0;
19108 + spin_unlock_jnode(node);
19109 + return result;
19110 +}
19111 +
19112 +/* Lock the page attached to jnode; create and attach a page to the jnode if
19113 + * it had none. */
19114 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19115 +{
19116 + struct page *page;
19117 +
19118 + spin_lock_jnode(node);
19119 + page = jnode_page(node);
19120 +
19121 + if (page == NULL) {
19122 + spin_unlock_jnode(node);
19123 + page = find_or_create_page(jnode_get_mapping(node),
19124 + jnode_get_index(node), gfp_flags);
19125 + if (page == NULL)
19126 + return ERR_PTR(RETERR(-ENOMEM));
19127 + } else {
19128 + if (!TestSetPageLocked(page)) {
19129 + spin_unlock_jnode(node);
19130 + return page;
19131 + }
19132 + page_cache_get(page);
19133 + spin_unlock_jnode(node);
19134 + lock_page(page);
19135 + assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19136 + }
19137 +
19138 + spin_lock_jnode(node);
19139 + if (!jnode_page(node))
19140 + jnode_attach_page(node, page);
19141 + spin_unlock_jnode(node);
19142 +
19143 + page_cache_release(page);
19144 + assert("zam-894", jnode_page(node) == page);
19145 + return page;
19146 +}
19147 +
19148 +/* Start read operation for jnode's page if page is not up-to-date. */
19149 +static int jnode_start_read(jnode * node, struct page *page)
19150 +{
19151 + assert("zam-893", PageLocked(page));
19152 +
19153 + if (PageUptodate(page)) {
19154 + unlock_page(page);
19155 + return 0;
19156 + }
19157 + return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19158 +}
19159 +
19160 +#if REISER4_DEBUG
19161 +static void check_jload(jnode * node, struct page *page)
19162 +{
19163 + if (jnode_is_znode(node)) {
19164 + node40_header *nh;
19165 + znode *z;
19166 +
19167 + z = JZNODE(node);
19168 + if (znode_is_any_locked(z)) {
19169 + nh = (node40_header *) kmap(page);
19170 + /* this only works for node40-only file systems. For
19171 + * debugging. */
19172 + assert("nikita-3253",
19173 + z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19174 + kunmap(page);
19175 + }
19176 + assert("nikita-3565", znode_invariant(z));
19177 + }
19178 +}
19179 +#else
19180 +#define check_jload(node, page) noop
19181 +#endif
19182 +
19183 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19184 + * to call jload() shortly. This will bring appropriate portion of jnode into
19185 + * CPU cache. */
19186 +void jload_prefetch(jnode * node)
19187 +{
19188 + prefetchw(&node->x_count);
19189 +}
19190 +
19191 +/* load jnode's data into memory */
19192 +int jload_gfp(jnode * node /* node to load */ ,
19193 + gfp_t gfp_flags /* allocation flags */ ,
19194 + int do_kmap /* true if page should be kmapped */ )
19195 +{
19196 + struct page *page;
19197 + int result = 0;
19198 + int parsed;
19199 +
19200 + assert("nikita-3010", reiser4_schedulable());
19201 +
19202 + prefetchw(&node->pg);
19203 +
19204 + /* taking d-reference implies taking x-reference. */
19205 + jref(node);
19206 +
19207 + /*
19208 + * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19209 + * should be atomic, otherwise there is a race against
19210 + * reiser4_releasepage().
19211 + */
19212 + spin_lock(&(node->load));
19213 + add_d_ref(node);
19214 + parsed = jnode_is_parsed(node);
19215 + spin_unlock(&(node->load));
19216 +
19217 + if (unlikely(!parsed)) {
19218 + page = jnode_get_page_locked(node, gfp_flags);
19219 + if (unlikely(IS_ERR(page))) {
19220 + result = PTR_ERR(page);
19221 + goto failed;
19222 + }
19223 +
19224 + result = jnode_start_read(node, page);
19225 + if (unlikely(result != 0))
19226 + goto failed;
19227 +
19228 + wait_on_page_locked(page);
19229 + if (unlikely(!PageUptodate(page))) {
19230 + result = RETERR(-EIO);
19231 + goto failed;
19232 + }
19233 +
19234 + if (do_kmap)
19235 + node->data = kmap(page);
19236 +
19237 + result = jparse(node);
19238 + if (unlikely(result != 0)) {
19239 + if (do_kmap)
19240 + kunmap(page);
19241 + goto failed;
19242 + }
19243 + check_jload(node, page);
19244 + } else {
19245 + page = jnode_page(node);
19246 + check_jload(node, page);
19247 + if (do_kmap)
19248 + node->data = kmap(page);
19249 + }
19250 +
19251 + if (!is_writeout_mode())
19252 + /* We do not mark pages active if jload is called as a part of
19253 + * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19254 + * and write_logs() add no value to cached data, there is no
19255 + * sense to mark pages as active when they go to disk, it just
19256 + * confuses vm scanning routines because clean page could be
19257 + * moved out from inactive list as a result of this
19258 + * mark_page_accessed() call. */
19259 + mark_page_accessed(page);
19260 +
19261 + return 0;
19262 +
19263 + failed:
19264 + jrelse_tail(node);
19265 + return result;
19266 +
19267 +}
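
/*
 * Editorial sketch (not part of the original patch): the d-reference
 * protocol that jload_gfp() implements, as seen by a caller. read_byte_at()
 * is a hypothetical helper; @node is assumed to be held with an x-reference
 * and @offset to be less than PAGE_SIZE.
 */
static int read_byte_at(jnode * node, unsigned offset, char *out)
{
	int result;

	result = jload(node);	/* pins and kmaps the page, sets ->data */
	if (result != 0)
		return result;
	*out = jdata(node)[offset];	/* stable while d_count > 0 */
	jrelse(node);	/* kunmaps the page and drops the d-reference */
	return 0;
}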
19268 +
19269 +/* start asynchronous reading for given jnode's page. */
19270 +int jstartio(jnode * node)
19271 +{
19272 + struct page *page;
19273 +
19274 + page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19275 + if (IS_ERR(page))
19276 + return PTR_ERR(page);
19277 +
19278 + return jnode_start_read(node, page);
19279 +}
19280 +
19281 +/* Initialize a node by calling appropriate plugin instead of reading
19282 + * node from disk as in jload(). */
19283 +int jinit_new(jnode * node, gfp_t gfp_flags)
19284 +{
19285 + struct page *page;
19286 + int result;
19287 +
19288 + jref(node);
19289 + add_d_ref(node);
19290 +
19291 + page = jnode_get_page_locked(node, gfp_flags);
19292 + if (IS_ERR(page)) {
19293 + result = PTR_ERR(page);
19294 + goto failed;
19295 + }
19296 +
19297 + SetPageUptodate(page);
19298 + unlock_page(page);
19299 +
19300 + node->data = kmap(page);
19301 +
19302 + if (!jnode_is_parsed(node)) {
19303 + jnode_plugin *jplug = jnode_ops(node);
19304 + spin_lock_jnode(node);
19305 + result = jplug->init(node);
19306 + spin_unlock_jnode(node);
19307 + if (result) {
19308 + kunmap(page);
19309 + goto failed;
19310 + }
19311 + JF_SET(node, JNODE_PARSED);
19312 + }
19313 +
19314 + return 0;
19315 +
19316 + failed:
19317 + jrelse(node);
19318 + return result;
19319 +}
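
/*
 * Editorial sketch: jinit_new() is the "create" counterpart of jload() --
 * the page is initialized by the plugin's ->init() method instead of being
 * read from disk. make_fresh_node() is a hypothetical caller; on failure
 * jinit_new() has already dropped its references, so only the success path
 * calls jrelse().
 */
static int make_fresh_node(jnode * node)
{
	int result;

	result = jinit_new(node, reiser4_ctx_gfp_mask_get());
	if (result != 0)
		return result;
	/* node->data is kmapped here; fill in the new node's content */
	jrelse(node);	/* drop the d-reference taken by jinit_new() */
	return 0;
}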
19320 +
19321 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19322 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19323 +{
19324 + assert("nikita-489", atomic_read(&node->d_count) > 0);
19325 + atomic_dec(&node->d_count);
19326 + /* release reference acquired in jload_gfp() or jinit_new() */
19327 + jput(node);
19328 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
19329 + LOCK_CNT_DEC(d_refs);
19330 +}
19331 +
19332 +/* drop reference to node data. When last reference is dropped, data are
19333 + unloaded. */
19334 +void jrelse(jnode * node /* jnode to release references to */ )
19335 +{
19336 + struct page *page;
19337 +
19338 + assert("nikita-487", node != NULL);
19339 + assert_spin_not_locked(&(node->guard));
19340 +
19341 + page = jnode_page(node);
19342 + if (likely(page != NULL)) {
19343 + /*
19344 + * it is safe not to lock jnode here, because at this point
19345 + * @node->d_count is greater than zero (if jrelse() is used
19346 + * correctly, that is). JNODE_PARSED may not be set yet, if,
19347 + * for example, we got here as a result of error handling path
19348 + * in jload(). Anyway, page cannot be detached by
19349 + * reiser4_releasepage(). truncate will invalidate page
19350 + * regardless, but this should not be a problem.
19351 + */
19352 + kunmap(page);
19353 + }
19354 + jrelse_tail(node);
19355 +}
19356 +
19357 +/* called from jput() to wait for io completion */
19358 +static void jnode_finish_io(jnode * node)
19359 +{
19360 + struct page *page;
19361 +
19362 + assert("nikita-2922", node != NULL);
19363 +
19364 + spin_lock_jnode(node);
19365 + page = jnode_page(node);
19366 + if (page != NULL) {
19367 + page_cache_get(page);
19368 + spin_unlock_jnode(node);
19369 + wait_on_page_writeback(page);
19370 + page_cache_release(page);
19371 + } else
19372 + spin_unlock_jnode(node);
19373 +}
19374 +
19375 +/*
19376 + * This is called by jput() when last reference to jnode is released. This is
19377 + * separate function, because we want fast path of jput() to be inline and,
19378 + * therefore, small.
19379 + */
19380 +void jput_final(jnode * node)
19381 +{
19382 + int r_i_p;
19383 +
19384 + /* A fast check for keeping node in cache. We always keep node in cache
19385 + * if its page is present and node was not marked for deletion */
19386 + if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19387 + rcu_read_unlock();
19388 + return;
19389 + }
19390 + r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19391 + /*
19392 + * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19393 + * this case it is safe to access node after unlock.
19394 + */
19395 + rcu_read_unlock();
19396 + if (r_i_p) {
19397 + jnode_finish_io(node);
19398 + if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19399 + /* node is removed from the tree. */
19400 + jdelete(node);
19401 + else
19402 + jnode_try_drop(node);
19403 + }
19404 + /* if !r_i_p some other thread is already killing it */
19405 +}
19406 +
19407 +int jwait_io(jnode * node, int rw)
19408 +{
19409 + struct page *page;
19410 + int result;
19411 +
19412 + assert("zam-447", node != NULL);
19413 + assert("zam-448", jnode_page(node) != NULL);
19414 +
19415 + page = jnode_page(node);
19416 +
19417 + result = 0;
19418 + if (rw == READ) {
19419 + wait_on_page_locked(page);
19420 + } else {
19421 + assert("nikita-2227", rw == WRITE);
19422 + wait_on_page_writeback(page);
19423 + }
19424 + if (PageError(page))
19425 + result = RETERR(-EIO);
19426 +
19427 + return result;
19428 +}
19429 +
19430 +/*
19431 + * jnode types and plugins.
19432 + *
19433 + * jnode by itself is a "base type". There are several different jnode
19434 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19435 + * has to do different things based on jnode type. In the standard reiser4 way
19436 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19437 + *
19438 + * Functions below deal with jnode types and define methods of jnode plugin.
19439 + *
19440 + */
19441 +
19442 +/* set jnode type. This is done during jnode initialization. */
19443 +static void jnode_set_type(jnode * node, jnode_type type)
19444 +{
19445 + static unsigned long type_to_mask[] = {
19446 + [JNODE_UNFORMATTED_BLOCK] = 1,
19447 + [JNODE_FORMATTED_BLOCK] = 0,
19448 + [JNODE_BITMAP] = 2,
19449 + [JNODE_IO_HEAD] = 6,
19450 + [JNODE_INODE] = 4
19451 + };
19452 +
19453 + assert("zam-647", type < LAST_JNODE_TYPE);
19454 + assert("nikita-2815", !jnode_is_loaded(node));
19455 + assert("nikita-3386", node->state == 0);
19456 +
19457 + node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19458 +}
19459 +
19460 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19461 + * specific initialization. */
19462 +static int init_noinit(jnode * node UNUSED_ARG)
19463 +{
19464 + return 0;
19465 +}
19466 +
19467 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19468 + * specific parsing. */
19469 +static int parse_noparse(jnode * node UNUSED_ARG)
19470 +{
19471 + return 0;
19472 +}
19473 +
19474 +/* ->mapping() method for unformatted jnode */
19475 +struct address_space *mapping_jnode(const jnode * node)
19476 +{
19477 + struct address_space *map;
19478 +
19479 + assert("nikita-2713", node != NULL);
19480 +
19481 + /* mapping is stored in jnode */
19482 +
19483 + map = node->key.j.mapping;
19484 + assert("nikita-2714", map != NULL);
19485 + assert("nikita-2897", is_reiser4_inode(map->host));
19486 + assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19487 + return map;
19488 +}
19489 +
19490 +/* ->index() method for unformatted jnodes */
19491 +unsigned long index_jnode(const jnode * node)
19492 +{
19493 + /* index is stored in jnode */
19494 + return node->key.j.index;
19495 +}
19496 +
19497 +/* ->remove() method for unformatted jnodes */
19498 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19499 +{
19500 + /* remove jnode from hash table and radix tree */
19501 + if (node->key.j.mapping)
19502 + unhash_unformatted_node_nolock(node);
19503 +}
19504 +
19505 +/* ->mapping() method for znodes */
19506 +static struct address_space *mapping_znode(const jnode * node)
19507 +{
19508 + /* all znodes belong to fake inode */
19509 + return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19510 +}
19511 +
19512 +/* ->index() method for znodes */
19513 +static unsigned long index_znode(const jnode * node)
19514 +{
19515 + unsigned long addr;
19516 + assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19517 +
19518 + /* index of znode is just its address (shifted) */
19519 + addr = (unsigned long)node;
19520 + return (addr - PAGE_OFFSET) >> znode_shift_order;
19521 +}
19522 +
19523 +/* ->mapping() method for bitmap jnode */
19524 +static struct address_space *mapping_bitmap(const jnode * node)
19525 +{
19526 + /* all bitmap blocks belong to special bitmap inode */
19527 + return get_super_private(jnode_get_tree(node)->super)->bitmap->
19528 + i_mapping;
19529 +}
19530 +
19531 +/* ->index() method for jnodes that are indexed by address */
19532 +static unsigned long index_is_address(const jnode * node)
19533 +{
19534 + unsigned long ind;
19535 +
19536 + ind = (unsigned long)node;
19537 + return ind - PAGE_OFFSET;
19538 +}
19539 +
19540 +/* resolve race with jput */
19541 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19542 +{
19543 + /*
19544 + * This is used as part of RCU-based jnode handling.
19545 + *
19546 + * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19547 + * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19548 + * not protected during this, so concurrent thread may execute
19549 + * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19550 + * freed in jput_final(). To avoid such races, jput_final() sets
19551 + * JNODE_RIP on jnode (under tree lock). All places that work with
19552 + * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19553 + * (first without taking tree lock), and if this bit is set, releases the
19554 + * reference acquired by the current thread and returns NULL.
19555 + *
19556 + * As a result, if jnode is being concurrently freed, NULL is returned
19557 + * and caller should pretend that jnode wasn't found in the first
19558 + * place.
19559 + *
19560 + * Otherwise it's safe to release "rcu-read-lock" and continue with
19561 + * jnode.
19562 + */
19563 + if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19564 + read_lock_tree(tree);
19565 + if (JF_ISSET(node, JNODE_RIP)) {
19566 + dec_x_ref(node);
19567 + node = NULL;
19568 + }
19569 + read_unlock_tree(tree);
19570 + }
19571 + return node;
19572 +}
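
/*
 * Editorial sketch of the caller pattern described above. hash_find_ref()
 * is a hypothetical stand-in for the real hash-table lookup that bumps
 * ->x_count of the jnode it finds.
 */
static jnode *lookup_sketch(reiser4_tree * tree, const struct jnode_key * key)
{
	jnode *node;

	rcu_read_lock();
	node = hash_find_ref(key);	/* may race with jput_final() */
	if (node != NULL)
		/* drop our reference and return NULL if the jnode is
		 * concurrently being freed */
		node = jnode_rip_check(tree, node);
	rcu_read_unlock();
	return node;
}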
19573 +
19574 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19575 +{
19576 + struct inode *inode;
19577 + item_plugin *iplug;
19578 + loff_t off;
19579 +
19580 + assert("nikita-3092", node != NULL);
19581 + assert("nikita-3093", key != NULL);
19582 + assert("nikita-3094", jnode_is_unformatted(node));
19583 +
19584 + off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19585 + inode = mapping_jnode(node)->host;
19586 +
19587 + if (node->parent_item_id != 0)
19588 + iplug = item_plugin_by_id(node->parent_item_id);
19589 + else
19590 + iplug = NULL;
19591 +
19592 + if (iplug != NULL && iplug->f.key_by_offset)
19593 + iplug->f.key_by_offset(inode, off, key);
19594 + else {
19595 + file_plugin *fplug;
19596 +
19597 + fplug = inode_file_plugin(inode);
19598 + assert("zam-1007", fplug != NULL);
19599 + assert("zam-1008", fplug->key_by_inode != NULL);
19600 +
19601 + fplug->key_by_inode(inode, off, key);
19602 + }
19603 +
19604 + return key;
19605 +}
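
/*
 * Worked example (editorial): with 4 KiB pages (PAGE_CACHE_SHIFT == 12), an
 * unformatted jnode at file index 3 starts at byte offset 3 << 12 == 12288;
 * that offset, together with the owning inode, is what key_by_offset() or
 * key_by_inode() above turns into a reiser4 key.
 */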
19606 +
19607 +/* ->parse() method for formatted nodes */
19608 +static int parse_znode(jnode * node)
19609 +{
19610 + return zparse(JZNODE(node));
19611 +}
19612 +
19613 +/* ->delete() method for formatted nodes */
19614 +static void delete_znode(jnode * node, reiser4_tree * tree)
19615 +{
19616 + znode *z;
19617 +
19618 + assert_rw_write_locked(&(tree->tree_lock));
19619 + assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19620 +
19621 + z = JZNODE(node);
19622 + assert("vs-899", z->c_count == 0);
19623 +
19624 + /* delete znode from sibling list. */
19625 + sibling_list_remove(z);
19626 +
19627 + znode_remove(z, tree);
19628 +}
19629 +
19630 +/* ->remove() method for formatted nodes */
19631 +static int remove_znode(jnode * node, reiser4_tree * tree)
19632 +{
19633 + znode *z;
19634 +
19635 + assert_rw_write_locked(&(tree->tree_lock));
19636 + z = JZNODE(node);
19637 +
19638 + if (z->c_count == 0) {
19639 + /* detach znode from sibling list. */
19640 + sibling_list_drop(z);
19641 + /* this is called with tree spin-lock held, so call
19642 + znode_remove() directly (rather than znode_lock_remove()). */
19643 + znode_remove(z, tree);
19644 + return 0;
19645 + }
19646 + return RETERR(-EBUSY);
19647 +}
19648 +
19649 +/* ->init() method for formatted nodes */
19650 +static int init_znode(jnode * node)
19651 +{
19652 + znode *z;
19653 +
19654 + z = JZNODE(node);
19655 + /* call node plugin to do actual initialization */
19656 + return z->nplug->init(z);
19657 +}
19658 +
19659 +/* ->clone() method for formatted nodes */
19660 +static jnode *clone_formatted(jnode * node)
19661 +{
19662 + znode *clone;
19663 +
19664 + assert("vs-1430", jnode_is_znode(node));
19665 + clone = zalloc(reiser4_ctx_gfp_mask_get());
19666 + if (clone == NULL)
19667 + return ERR_PTR(RETERR(-ENOMEM));
19668 + zinit(clone, NULL, current_tree);
19669 + jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19670 + /* ZJNODE(clone)->key.z is not initialized */
19671 + clone->level = JZNODE(node)->level;
19672 +
19673 + return ZJNODE(clone);
19674 +}
19675 +
19676 +/* jplug->clone for unformatted nodes */
19677 +static jnode *clone_unformatted(jnode * node)
19678 +{
19679 + jnode *clone;
19680 +
19681 + assert("vs-1431", jnode_is_unformatted(node));
19682 + clone = jalloc();
19683 + if (clone == NULL)
19684 + return ERR_PTR(RETERR(-ENOMEM));
19685 +
19686 + jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19687 + jnode_set_block(clone, jnode_get_block(node));
19688 +
19689 + return clone;
19690 +
19691 +}
19692 +
19693 +/*
19694 + * Setup jnode plugin methods for various jnode types.
19695 + */
19696 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19697 + [JNODE_UNFORMATTED_BLOCK] = {
19698 + .h = {
19699 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19700 + .id = JNODE_UNFORMATTED_BLOCK,
19701 + .pops = NULL,
19702 + .label = "unformatted",
19703 + .desc = "unformatted node",
19704 + .linkage = {NULL, NULL}
19705 + },
19706 + .init = init_noinit,
19707 + .parse = parse_noparse,
19708 + .mapping = mapping_jnode,
19709 + .index = index_jnode,
19710 + .clone = clone_unformatted
19711 + },
19712 + [JNODE_FORMATTED_BLOCK] = {
19713 + .h = {
19714 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19715 + .id = JNODE_FORMATTED_BLOCK,
19716 + .pops = NULL,
19717 + .label = "formatted",
19718 + .desc = "formatted tree node",
19719 + .linkage = {NULL, NULL}
19720 + },
19721 + .init = init_znode,
19722 + .parse = parse_znode,
19723 + .mapping = mapping_znode,
19724 + .index = index_znode,
19725 + .clone = clone_formatted
19726 + },
19727 + [JNODE_BITMAP] = {
19728 + .h = {
19729 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19730 + .id = JNODE_BITMAP,
19731 + .pops = NULL,
19732 + .label = "bitmap",
19733 + .desc = "bitmap node",
19734 + .linkage = {NULL, NULL}
19735 + },
19736 + .init = init_noinit,
19737 + .parse = parse_noparse,
19738 + .mapping = mapping_bitmap,
19739 + .index = index_is_address,
19740 + .clone = NULL
19741 + },
19742 + [JNODE_IO_HEAD] = {
19743 + .h = {
19744 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19745 + .id = JNODE_IO_HEAD,
19746 + .pops = NULL,
19747 + .label = "io head",
19748 + .desc = "io head",
19749 + .linkage = {NULL, NULL}
19750 + },
19751 + .init = init_noinit,
19752 + .parse = parse_noparse,
19753 + .mapping = mapping_bitmap,
19754 + .index = index_is_address,
19755 + .clone = NULL
19756 + },
19757 + [JNODE_INODE] = {
19758 + .h = {
19759 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19760 + .id = JNODE_INODE,
19761 + .pops = NULL,
19762 + .label = "inode",
19763 + .desc = "inode's builtin jnode",
19764 + .linkage = {NULL, NULL}
19765 + },
19766 + .init = NULL,
19767 + .parse = NULL,
19768 + .mapping = NULL,
19769 + .index = NULL,
19770 + .clone = NULL
19771 + }
19772 +};
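
/*
 * Editorial note: jnode_ops() (jnode.h) indexes this array by
 * jnode_get_type(), giving C-style virtual dispatch, e.g.:
 *
 *	struct address_space *map = jnode_ops(node)->mapping(node);
 *	unsigned long index = jnode_ops(node)->index(node);
 *
 * JNODE_INODE leaves every method NULL; those methods are evidently never
 * meant to be called for inode jnodes.
 */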
19773 +
19774 +/*
19775 + * jnode destruction.
19776 + *
19777 + * Thread may use a jnode after it acquired a reference to it. References are
19778 + * counted in ->x_count field. Reference protects jnode from being
19779 + * recycled. This is different from protecting jnode data (that are stored in
19780 + * jnode page) from being evicted from memory. Data are protected by jload()
19781 + * and released by jrelse().
19782 + *
19783 + * If thread already possesses a reference to the jnode it can acquire another
19784 + * one through jref(). Initial reference is obtained (usually) by locating
19785 + * jnode in some indexing structure that depends on jnode type: formatted
19786 + * nodes are kept in global hash table, where they are indexed by block
19787 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19788 + * table, which is indexed by oid and offset within file, and in per-inode
19789 + * radix tree.
19790 + *
19791 + * Reference to jnode is released by jput(). If last reference is released,
19792 + * jput_final() is called. This function determines whether jnode has to be
19793 + * deleted (this happens when corresponding node is removed from the file
19794 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19795 + * should be just "removed" (deleted from memory).
19796 + *
19797 + * Jnode destruction is a signally delicate dance because of locking and RCU.
19798 + */
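
/*
 * Editorial sketch of the reference discipline described above. The caller
 * is assumed to hold an x-reference on @node (e.g. from jlookup()), which
 * this sketch consumes.
 */
static void x_ref_sketch(jnode * node)
{
	jnode *extra;

	extra = jref(node);	/* nested x-reference to the same jnode */
	/* ... node cannot be recycled while either reference is held ... */
	jput(extra);
	jput(node);	/* dropping the last reference enters jput_final() */
}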
19799 +
19800 +/*
19801 + * Returns true if jnode cannot be removed right now. This check is called
19802 + * under tree lock. If it returns true, jnode is irrevocably committed to be
19803 + * deleted/removed.
19804 + */
19805 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19806 +{
19807 + /* if other thread managed to acquire a reference to this jnode, don't
19808 + * free it. */
19809 + if (atomic_read(&node->x_count) > 0)
19810 + return 1;
19811 + /* also, don't free znode that has children in memory */
19812 + if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19813 + return 1;
19814 + return 0;
19815 +}
19816 +
19817 +/*
19818 + * this is called as part of removing jnode. Based on jnode type, call
19819 + * corresponding function that removes jnode from indices and returns it back
19820 + * to the appropriate slab (through RCU).
19821 + */
19822 +static inline void
19823 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19824 +{
19825 + switch (jtype) {
19826 + case JNODE_UNFORMATTED_BLOCK:
19827 + remove_jnode(node, tree);
19828 + break;
19829 + case JNODE_IO_HEAD:
19830 + case JNODE_BITMAP:
19831 + break;
19832 + case JNODE_INODE:
19833 + break;
19834 + case JNODE_FORMATTED_BLOCK:
19835 + remove_znode(node, tree);
19836 + break;
19837 + default:
19838 + wrong_return_value("nikita-3196", "Wrong jnode type");
19839 + }
19840 +}
19841 +
19842 +/*
19843 + * this is called as part of deleting jnode. Based on jnode type, call
19844 + * corresponding function that removes jnode from indices and returns it back
19845 + * to the appropriate slab (through RCU).
19846 + *
19847 + * This differs from jnode_remove() only for formatted nodes---for them
19848 + * sibling list handling is different for removal and deletion.
19849 + */
19850 +static inline void
19851 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
19852 +{
19853 + switch (jtype) {
19854 + case JNODE_UNFORMATTED_BLOCK:
19855 + remove_jnode(node, tree);
19856 + break;
19857 + case JNODE_IO_HEAD:
19858 + case JNODE_BITMAP:
19859 + break;
19860 + case JNODE_FORMATTED_BLOCK:
19861 + delete_znode(node, tree);
19862 + break;
19863 + case JNODE_INODE:
19864 + default:
19865 + wrong_return_value("nikita-3195", "Wrong jnode type");
19866 + }
19867 +}
19868 +
19869 +#if REISER4_DEBUG
19870 +/*
19871 + * remove jnode from the debugging list of all jnodes hanging off super-block.
19872 + */
19873 +void jnode_list_remove(jnode * node)
19874 +{
19875 + reiser4_super_info_data *sbinfo;
19876 +
19877 + sbinfo = get_super_private(jnode_get_tree(node)->super);
19878 +
19879 + spin_lock_irq(&sbinfo->all_guard);
19880 + assert("nikita-2422", !list_empty(&node->jnodes));
19881 + list_del_init(&node->jnodes);
19882 + spin_unlock_irq(&sbinfo->all_guard);
19883 +}
19884 +#endif
19885 +
19886 +/*
19887 + * this is called by jput_final() to remove jnode when last reference to it is
19888 + * released.
19889 + */
19890 +static int jnode_try_drop(jnode * node)
19891 +{
19892 + int result;
19893 + reiser4_tree *tree;
19894 + jnode_type jtype;
19895 +
19896 + assert("nikita-2491", node != NULL);
19897 + assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19898 +
19899 + tree = jnode_get_tree(node);
19900 + jtype = jnode_get_type(node);
19901 +
19902 + spin_lock_jnode(node);
19903 + write_lock_tree(tree);
19904 + /*
19905 + * if jnode has a page---leave it alone. Memory pressure will
19906 + * eventually kill page and jnode.
19907 + */
19908 + if (jnode_page(node) != NULL) {
19909 + write_unlock_tree(tree);
19910 + spin_unlock_jnode(node);
19911 + JF_CLR(node, JNODE_RIP);
19912 + return RETERR(-EBUSY);
19913 + }
19914 +
19915 + /* re-check ->x_count under tree lock. */
19916 + result = jnode_is_busy(node, jtype);
19917 + if (result == 0) {
19918 + assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19919 + assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19920 +
19921 + spin_unlock_jnode(node);
19922 + /* no page and no references---despatch him. */
19923 + jnode_remove(node, jtype, tree);
19924 + write_unlock_tree(tree);
19925 + jnode_free(node, jtype);
19926 + } else {
19927 + /* busy check failed: reference was acquired by concurrent
19928 + * thread. */
19929 + write_unlock_tree(tree);
19930 + spin_unlock_jnode(node);
19931 + JF_CLR(node, JNODE_RIP);
19932 + }
19933 + return result;
19934 +}
19935 +
19936 +/* jdelete() -- Delete jnode from the tree and file system */
19937 +static int jdelete(jnode * node /* jnode to finish with */ )
19938 +{
19939 + struct page *page;
19940 + int result;
19941 + reiser4_tree *tree;
19942 + jnode_type jtype;
19943 +
19944 + assert("nikita-467", node != NULL);
19945 + assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19946 +
19947 + jtype = jnode_get_type(node);
19948 +
19949 + page = jnode_lock_page(node);
19950 + assert_spin_locked(&(node->guard));
19951 +
19952 + tree = jnode_get_tree(node);
19953 +
19954 + write_lock_tree(tree);
19955 + /* re-check ->x_count under tree lock. */
19956 + result = jnode_is_busy(node, jtype);
19957 + if (likely(!result)) {
19958 + assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19959 + assert("jmacd-511", atomic_read(&node->d_count) == 0);
19960 +
19961 + /* detach page */
19962 + if (page != NULL) {
19963 + /*
19964 + * FIXME this is racy against jnode_extent_write().
19965 + */
19966 + page_clear_jnode(page, node);
19967 + }
19968 + spin_unlock_jnode(node);
19969 + /* goodbye */
19970 + jnode_delete(node, jtype, tree);
19971 + write_unlock_tree(tree);
19972 + jnode_free(node, jtype);
19973 + /* @node is no longer valid pointer */
19974 + if (page != NULL)
19975 + reiser4_drop_page(page);
19976 + } else {
19977 + /* busy check failed: reference was acquired by concurrent
19978 + * thread. */
19979 + JF_CLR(node, JNODE_RIP);
19980 + write_unlock_tree(tree);
19981 + spin_unlock_jnode(node);
19982 + if (page != NULL)
19983 + unlock_page(page);
19984 + }
19985 + return result;
19986 +}
19987 +
19988 +/* drop jnode on the floor.
19989 +
19990 + Return value:
19991 +
19992 + -EBUSY: failed to drop jnode, because there are still references to it
19993 +
19994 + 0: successfully dropped jnode
19995 +
19996 +*/
19997 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
19998 +{
19999 + struct page *page;
20000 + jnode_type jtype;
20001 + int result;
20002 +
20003 + assert("zam-602", node != NULL);
20004 + assert_rw_not_read_locked(&(tree->tree_lock));
20005 + assert_rw_not_write_locked(&(tree->tree_lock));
20006 + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20007 +
20008 + jtype = jnode_get_type(node);
20009 +
20010 + page = jnode_lock_page(node);
20011 + assert_spin_locked(&(node->guard));
20012 +
20013 + write_lock_tree(tree);
20014 +
20015 + /* re-check ->x_count under tree lock. */
20016 + result = jnode_is_busy(node, jtype);
20017 + if (!result) {
20018 + assert("nikita-2488", page == jnode_page(node));
20019 + assert("nikita-2533", atomic_read(&node->d_count) == 0);
20020 + if (page != NULL) {
20021 + assert("nikita-2126", !PageDirty(page));
20022 + assert("nikita-2127", PageUptodate(page));
20023 + assert("nikita-2181", PageLocked(page));
20024 + page_clear_jnode(page, node);
20025 + }
20026 + spin_unlock_jnode(node);
20027 + jnode_remove(node, jtype, tree);
20028 + write_unlock_tree(tree);
20029 + jnode_free(node, jtype);
20030 + if (page != NULL) {
20031 + reiser4_drop_page(page);
20032 + }
20033 + } else {
20034 + /* busy check failed: reference was acquired by concurrent
20035 + * thread. */
20036 + JF_CLR(node, JNODE_RIP);
20037 + write_unlock_tree(tree);
20038 + spin_unlock_jnode(node);
20039 + if (page != NULL)
20040 + unlock_page(page);
20041 + }
20042 + return result;
20043 +}
20044 +
20045 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20046 + be 0 (where applicable). */
20047 +void jdrop(jnode * node)
20048 +{
20049 + jdrop_in_tree(node, jnode_get_tree(node));
20050 +}
20051 +
20052 +/* IO head jnode implementation. The io heads are simple j-nodes with limited
20053 + functionality (these j-nodes are not in any hash table) just for reading
20054 + from and writing to disk. */
20055 +
20056 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20057 +{
20058 + jnode *jal = jalloc();
20059 +
20060 + if (jal != NULL) {
20061 + jnode_init(jal, current_tree, JNODE_IO_HEAD);
20062 + jnode_set_block(jal, block);
20063 + /* take the initial x-reference only on success;
20064 + * jref(NULL) would oops */
20065 + jref(jal);
20066 + }
20066 +
20067 + return jal;
20068 +}
20069 +
20070 +void reiser4_drop_io_head(jnode * node)
20071 +{
20072 + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20073 +
20074 + jput(node);
20075 + jdrop(node);
20076 +}
20077 +
20078 +/* protect jnode data from reiser4_releasepage() */
20079 +void pin_jnode_data(jnode * node)
20080 +{
20081 + assert("zam-671", jnode_page(node) != NULL);
20082 + page_cache_get(jnode_page(node));
20083 +}
20084 +
20085 +/* make jnode data free-able again */
20086 +void unpin_jnode_data(jnode * node)
20087 +{
20088 + assert("zam-672", jnode_page(node) != NULL);
20089 + page_cache_release(jnode_page(node));
20090 +}
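
/*
 * Editorial sketch (caller holds the jnode spin lock on entry and exit):
 * pinning raises the page refcount, so reiser4_releasepage() cannot free
 * the data while the lock is dropped.
 */
static void pinned_region_sketch(jnode * node)
{
	pin_jnode_data(node);
	spin_unlock_jnode(node);
	/* ... jnode's page is guaranteed to stay in memory here ... */
	spin_lock_jnode(node);
	unpin_jnode_data(node);
}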
20091 +
20092 +struct address_space *jnode_get_mapping(const jnode * node)
20093 +{
20094 + assert("nikita-3162", node != NULL);
20095 + return jnode_ops(node)->mapping(node);
20096 +}
20097 +
20098 +#if REISER4_DEBUG
20099 +/* debugging aid: jnode invariant */
20100 +int jnode_invariant_f(const jnode * node, char const **msg)
20101 +{
20102 +#define _ergo(ant, con) \
20103 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20104 +#define _check(exp) ((*msg) = #exp, (exp))
20105 +
20106 + return _check(node != NULL) &&
20107 + /* [jnode-queued] */
20108 + /* only relocated node can be queued, except that when znode
20109 + * is being deleted, its JNODE_RELOC bit is cleared */
20110 + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20111 + JF_ISSET(node, JNODE_RELOC) ||
20112 + JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20113 + _check(node->jnodes.prev != NULL) &&
20114 + _check(node->jnodes.next != NULL) &&
20115 + /* [jnode-dirty] invariant */
20116 + /* dirty jnode is part of an atom */
20117 + _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20118 + /* [jnode-oid] invariant */
20119 + /* for unformatted node ->objectid and ->mapping fields are
20120 + * consistent */
20121 + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20122 + node->key.j.objectid ==
20123 + get_inode_oid(node->key.j.mapping->host)) &&
20124 + /* [jnode-atom-valid] invariant */
20125 + /* node atom has valid state */
20126 + _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20127 + /* [jnode-page-binding] invariant */
20128 + /* if node points to page, it points back to node */
20129 + _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20130 + /* [jnode-refs] invariant */
20131 + /* only referenced jnode can be loaded */
20132 + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20133 +
20134 +}
20135 +
20136 +static const char *jnode_type_name(jnode_type type)
20137 +{
20138 + switch (type) {
20139 + case JNODE_UNFORMATTED_BLOCK:
20140 + return "unformatted";
20141 + case JNODE_FORMATTED_BLOCK:
20142 + return "formatted";
20143 + case JNODE_BITMAP:
20144 + return "bitmap";
20145 + case JNODE_IO_HEAD:
20146 + return "io head";
20147 + case JNODE_INODE:
20148 + return "inode";
20149 + case LAST_JNODE_TYPE:
20150 + return "last";
20151 + default:{
20152 + static char unknown[30];
20153 +
20154 + sprintf(unknown, "unknown %i", type);
20155 + return unknown;
20156 + }
20157 + }
20158 +}
20159 +
20160 +#define jnode_state_name( node, flag ) \
20161 + ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20162 +
20163 +/* debugging aid: output human readable information about @node */
20164 +static void info_jnode(const char *prefix /* prefix to print */ ,
20165 + const jnode * node /* node to print */ )
20166 +{
20167 + assert("umka-068", prefix != NULL);
20168 +
20169 + if (node == NULL) {
20170 + printk("%s: null\n", prefix);
20171 + return;
20172 + }
20173 +
20174 + printk
20175 + ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20176 + " block: %s, d_count: %d, x_count: %d, "
20177 + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20178 + node->state,
20179 + jnode_state_name(node, JNODE_PARSED),
20180 + jnode_state_name(node, JNODE_HEARD_BANSHEE),
20181 + jnode_state_name(node, JNODE_LEFT_CONNECTED),
20182 + jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20183 + jnode_state_name(node, JNODE_ORPHAN),
20184 + jnode_state_name(node, JNODE_CREATED),
20185 + jnode_state_name(node, JNODE_RELOC),
20186 + jnode_state_name(node, JNODE_OVRWR),
20187 + jnode_state_name(node, JNODE_DIRTY),
20188 + jnode_state_name(node, JNODE_IS_DYING),
20189 + jnode_state_name(node, JNODE_RIP),
20190 + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20191 + jnode_state_name(node, JNODE_WRITEBACK),
20192 + jnode_state_name(node, JNODE_NEW),
20193 + jnode_state_name(node, JNODE_DKSET),
20194 + jnode_state_name(node, JNODE_REPACK),
20195 + jnode_state_name(node, JNODE_CLUSTER_PAGE),
20196 + jnode_get_level(node), sprint_address(jnode_get_block(node)),
20197 + atomic_read(&node->d_count), atomic_read(&node->x_count),
20198 + jnode_page(node), node->atom, 0, 0,
20199 + jnode_type_name(jnode_get_type(node)));
20200 + if (jnode_is_unformatted(node)) {
20201 + printk("inode: %llu, index: %lu, ",
20202 + node->key.j.objectid, node->key.j.index);
20203 + }
20204 +}
20205 +
20206 +/* debugging aid: check jnode invariant and warn if it does not hold */
20207 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20208 +{
20209 + char const *failed_msg;
20210 + int result;
20211 + reiser4_tree *tree;
20212 +
20213 + assert("umka-063312", node != NULL);
20214 +
20215 + tree = jnode_get_tree(node);
20216 + assert("umka-064321", tree != NULL);
20217 +
20218 + if (!jlocked && !tlocked)
20219 + spin_lock_jnode((jnode *) node);
20220 + if (!tlocked)
20221 + read_lock_tree(jnode_get_tree(node));
20222 + result = jnode_invariant_f(node, &failed_msg);
20223 + if (!result) {
20224 + info_jnode("corrupted node", node);
20225 + warning("jmacd-555", "Condition %s failed", failed_msg);
20226 + }
20227 + if (!tlocked)
20228 + read_unlock_tree(jnode_get_tree(node));
20229 + if (!jlocked && !tlocked)
20230 + spin_unlock_jnode((jnode *) node);
20231 + return result;
20232 +}
20233 +
20234 +#endif /* REISER4_DEBUG */
20235 +
20236 +/* Make Linus happy.
20237 + Local variables:
20238 + c-indentation-style: "K&R"
20239 + mode-name: "LC"
20240 + c-basic-offset: 8
20241 + tab-width: 8
20242 + fill-column: 80
20243 + End:
20244 +*/
20245 diff -urN linux-2.6.22.orig/fs/reiser4/jnode.h linux-2.6.22/fs/reiser4/jnode.h
20246 --- linux-2.6.22.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20247 +++ linux-2.6.22/fs/reiser4/jnode.h 2007-07-29 00:25:34.876696477 +0400
20248 @@ -0,0 +1,702 @@
20249 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20250 + * reiser4/README */
20251 +
20252 +/* Declaration of jnode. See jnode.c for details. */
20253 +
20254 +#ifndef __JNODE_H__
20255 +#define __JNODE_H__
20256 +
20257 +#include "forward.h"
20258 +#include "type_safe_hash.h"
20259 +#include "txnmgr.h"
20260 +#include "key.h"
20261 +#include "debug.h"
20262 +#include "dformat.h"
20263 +#include "page_cache.h"
20264 +#include "context.h"
20265 +
20266 +#include "plugin/plugin.h"
20267 +
20268 +#include <linux/fs.h>
20269 +#include <linux/mm.h>
20270 +#include <linux/spinlock.h>
20271 +#include <asm/atomic.h>
20272 +#include <asm/bitops.h>
20273 +#include <linux/list.h>
20274 +#include <linux/rcupdate.h>
20275 +
20276 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20277 + nodes) */
20278 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20279 +
20280 +/* declare hash table of znodes */
20281 +TYPE_SAFE_HASH_DECLARE(z, znode);
20282 +
20283 +struct jnode_key {
20284 + __u64 objectid;
20285 + unsigned long index;
20286 + struct address_space *mapping;
20287 +};
20288 +
20289 +/*
20290 + Jnode is the "base class" of other nodes in reiser4. It also happens to
20291 + be exactly the node we use for unformatted tree nodes.
20292 +
20293 + Jnode provides following basic functionality:
20294 +
20295 + . reference counting and indexing.
20296 +
20297 + . integration with page cache. Jnode has ->pg reference to which page can
20298 + be attached.
20299 +
20300 + . interface to transaction manager. It is jnode that is kept in transaction
20301 + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20302 + means, there should be special type of jnode for inode.)
20303 +
20304 + Locking:
20305 +
20306 + Spin lock: the following fields are protected by the per-jnode spin lock:
20307 +
20308 + ->state
20309 + ->atom
20310 + ->capture_link
20311 +
20312 + Following fields are protected by the global tree lock:
20313 +
20314 + ->link
20315 + ->key.z (content of ->key.z is only changed in znode_rehash())
20316 + ->key.j
20317 +
20318 + Atomic counters
20319 +
20320 + ->x_count
20321 + ->d_count
20322 +
20323 + ->pg, and ->data are protected by spin lock for unused jnode and are
20324 + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20325 + is false).
20326 +
20327 + ->tree is immutable after creation
20328 +
20329 + Unclear
20330 +
20331 + ->blocknr: should be under jnode spin-lock, but current interface is based
20332 + on passing of block address.
20333 +
20334 + If you ever need to spin lock two nodes at once, do this in "natural"
20335 + memory order: lock znode with lower address first. (See lock_two_nodes().)
20336 +
20337 + Invariants involving this data-type:
20338 +
20339 + [jnode-dirty]
20340 + [jnode-refs]
20341 + [jnode-oid]
20342 + [jnode-queued]
20343 + [jnode-atom-valid]
20344 + [jnode-page-binding]
20345 +*/
20346 +
20347 +struct jnode {
20348 +#if REISER4_DEBUG
20349 +#define JMAGIC 0x52654973 /* "ReIs" */
20350 + int magic;
20351 +#endif
20352 + /* FIRST CACHE LINE (16 bytes): data used by jload */
20353 +
20354 + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20355 + /* 0 */ unsigned long state;
20356 +
20357 + /* lock, protecting jnode's fields. */
20358 + /* 4 */ spinlock_t load;
20359 +
20360 + /* counter of references to jnode itself. Increased on jref().
20361 + Decreased on jput().
20362 + */
20363 + /* 8 */ atomic_t x_count;
20364 +
20365 + /* counter of references to jnode's data. Pin data page(s) in
20366 + memory while this is greater than 0. Increased on jload().
20367 + Decreased on jrelse().
20368 + */
20369 + /* 12 */ atomic_t d_count;
20370 +
20371 + /* SECOND CACHE LINE: data used by hash table lookups */
20372 +
20373 + /* 16 */ union {
20374 + /* znodes are hashed by block number */
20375 + reiser4_block_nr z;
20376 + /* unformatted nodes are hashed by mapping plus offset */
20377 + struct jnode_key j;
20378 + } key;
20379 +
20380 + /* THIRD CACHE LINE */
20381 +
20382 + /* 32 */ union {
20383 + /* pointers to maintain hash-table */
20384 + z_hash_link z;
20385 + j_hash_link j;
20386 + } link;
20387 +
20388 + /* pointer to jnode page. */
20389 + /* 36 */ struct page *pg;
20390 + /* pointer to node itself. This is page_address(node->pg) when page is
20391 + attached to the jnode
20392 + */
20393 + /* 40 */ void *data;
20394 +
20395 + /* 44 */ reiser4_tree *tree;
20396 +
20397 + /* FOURTH CACHE LINE: atom related fields */
20398 +
20399 + /* 48 */ spinlock_t guard;
20400 +
20401 + /* atom the block is in, if any */
20402 + /* 52 */ txn_atom *atom;
20403 +
20404 + /* capture list */
20405 + /* 56 */ struct list_head capture_link;
20406 +
20407 + /* FIFTH CACHE LINE */
20408 +
20409 + /* 64 */ struct rcu_head rcu;
20410 + /* crosses cache line */
20411 +
20412 + /* SIXTH CACHE LINE */
20413 +
20414 + /* the real blocknr (where io is going to/from) */
20415 + /* 80 */ reiser4_block_nr blocknr;
20416 + /* Parent item type; unformatted and CRC nodes need it for offset => key conversion. */
20417 + /* NOTE: this parent_item_id looks like jnode type. */
20418 + /* 88 */ reiser4_plugin_id parent_item_id;
20419 + /* 92 */
20420 +#if REISER4_DEBUG
20421 + /* list of all jnodes for debugging purposes. */
20422 + struct list_head jnodes;
20423 + /* how many times this jnode was written in one transaction */
20424 + int written;
20425 + /* this indicates which atom's list the jnode is on */
20426 + atom_list list;
20427 +#endif
20428 +} __attribute__ ((aligned(16)));
20429 +
20430 +/*
20431 + * jnode types. Enumeration of existing jnode types.
20432 + */
20433 +typedef enum {
20434 + JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20435 + JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20436 + JNODE_BITMAP, /* bitmap */
20437 + JNODE_IO_HEAD, /* jnode representing a block in the
20438 + * wandering log */
20439 + JNODE_INODE, /* jnode embedded into inode */
20440 + LAST_JNODE_TYPE
20441 +} jnode_type;
20442 +
20443 +/* jnode states */
20444 +typedef enum {
20445 + /* jnode's page is loaded and data checked */
20446 + JNODE_PARSED = 0,
20447 + /* node was deleted, not all locks on it were released. This
20448 + node is empty and is going to be removed from the tree
20449 + shortly. */
20450 + JNODE_HEARD_BANSHEE = 1,
20451 + /* left sibling pointer is valid */
20452 + JNODE_LEFT_CONNECTED = 2,
20453 + /* right sibling pointer is valid */
20454 + JNODE_RIGHT_CONNECTED = 3,
20455 +
20456 + /* znode was just created and doesn't yet have a pointer from
20457 + its parent */
20458 + JNODE_ORPHAN = 4,
20459 +
20460 + /* this node was created by its transaction and has not been assigned
20461 + a block address. */
20462 + JNODE_CREATED = 5,
20463 +
20464 + /* this node is currently relocated */
20465 + JNODE_RELOC = 6,
20466 + /* this node is currently wandered */
20467 + JNODE_OVRWR = 7,
20468 +
20469 + /* this znode has been modified */
20470 + JNODE_DIRTY = 8,
20471 +
20472 + /* znode lock is being invalidated */
20473 + JNODE_IS_DYING = 9,
20474 +
20475 + /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20476 +
20477 + /* jnode is queued for flushing. */
20478 + JNODE_FLUSH_QUEUED = 12,
20479 +
20480 + /* In the following bits jnode type is encoded. */
20481 + JNODE_TYPE_1 = 13,
20482 + JNODE_TYPE_2 = 14,
20483 + JNODE_TYPE_3 = 15,
20484 +
20485 + /* jnode is being destroyed */
20486 + JNODE_RIP = 16,
20487 +
20488 + /* znode was not captured during locking (this might be because
20489 + ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20490 + JNODE_MISSED_IN_CAPTURE = 17,
20491 +
20492 + /* write is in progress */
20493 + JNODE_WRITEBACK = 18,
20494 +
20495 + /* FIXME: now it is used by crypto-compress plugin only */
20496 + JNODE_NEW = 19,
20497 +
20498 + /* delimiting keys are already set for this znode. */
20499 + JNODE_DKSET = 20,
20500 +
20501 + /* when this bit is set, page and jnode cannot be disconnected */
20502 + JNODE_WRITE_PREPARED = 21,
20503 +
20504 + JNODE_CLUSTER_PAGE = 22,
20505 + /* Jnode is marked for repacking; that means the reiser4 flush and the
20506 + * block allocator should process this node in a special way */
20507 + JNODE_REPACK = 23,
20508 + /* node should be converted by flush in squalloc phase */
20509 + JNODE_CONVERTIBLE = 24,
20510 + /*
20511 + * When jnode is dirtied for the first time in given transaction,
20512 + * do_jnode_make_dirty() checks whether this jnode can possibly become a
20513 + * member of the overwrite set. If so, this bit is set, and one block is
20514 + * reserved in the ->flush_reserved space of atom.
20515 + *
20516 + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20517 + *
20518 + * (1) flush decides that we want this block to go into relocate
20519 + * set after all.
20520 + *
20521 + * (2) wandering log is allocated (by log writer)
20522 + *
20523 + * (3) extent is allocated
20524 + *
20525 + */
20526 + JNODE_FLUSH_RESERVED = 29
20527 +} reiser4_jnode_state;
20528 +
20529 +/* Macros for accessing the jnode state. */
20530 +
20531 +static inline void JF_CLR(jnode * j, int f)
20532 +{
20533 + assert("unknown-1", j->magic == JMAGIC);
20534 + clear_bit(f, &j->state);
20535 +}
20536 +static inline int JF_ISSET(const jnode * j, int f)
20537 +{
20538 + assert("unknown-2", j->magic == JMAGIC);
20539 + return test_bit(f, &((jnode *) j)->state);
20540 +}
20541 +static inline void JF_SET(jnode * j, int f)
20542 +{
20543 + assert("unknown-3", j->magic == JMAGIC);
20544 + set_bit(f, &j->state);
20545 +}
20546 +
20547 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20548 +{
20549 + assert("unknown-4", j->magic == JMAGIC);
20550 + return test_and_set_bit(f, &j->state);
20551 +}
20552 +
20553 +static inline void spin_lock_jnode(jnode *node)
20554 +{
20555 + /* check that spinlocks of lower priorities are not held */
20556 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20557 + LOCK_CNT_NIL(spin_locked_txnh) &&
20558 + LOCK_CNT_NIL(spin_locked_zlock) &&
20559 + LOCK_CNT_NIL(rw_locked_dk) &&
20560 + LOCK_CNT_LT(spin_locked_jnode, 2)));
20561 +
20562 + spin_lock(&(node->guard));
20563 +
20564 + LOCK_CNT_INC(spin_locked_jnode);
20565 + LOCK_CNT_INC(spin_locked);
20566 +}
20567 +
20568 +static inline void spin_unlock_jnode(jnode *node)
20569 +{
20570 + assert_spin_locked(&(node->guard));
20571 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20572 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20573 +
20574 + LOCK_CNT_DEC(spin_locked_jnode);
20575 + LOCK_CNT_DEC(spin_locked);
20576 +
20577 + spin_unlock(&(node->guard));
20578 +}
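
/*
 * Editorial sketch of the "natural memory order" rule from the comment on
 * struct jnode above: when two jnode locks are needed at once, lock the
 * lower-addressed node first (cf. lock_two_nodes(), not shown in this
 * hunk). The LOCK_CNT_LT(spin_locked_jnode, 2) check in spin_lock_jnode()
 * allows exactly this nesting depth.
 */
static inline void lock_pair_sketch(jnode * a, jnode * b)
{
	assert("editorial-1", a != b);
	if (a < b) {
		spin_lock_jnode(a);
		spin_lock_jnode(b);
	} else {
		spin_lock_jnode(b);
		spin_lock_jnode(a);
	}
}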
20579 +
20580 +static inline int jnode_is_in_deleteset(const jnode * node)
20581 +{
20582 + return JF_ISSET(node, JNODE_RELOC);
20583 +}
20584 +
20585 +extern int init_jnodes(void);
20586 +extern void done_jnodes(void);
20587 +
20588 +/* Jnode routines */
20589 +extern jnode *jalloc(void);
20590 +extern void jfree(jnode * node) NONNULL;
20591 +extern jnode *jclone(jnode *);
20592 +extern jnode *jlookup(reiser4_tree * tree,
20593 + oid_t objectid, unsigned long ind) NONNULL;
20594 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20595 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20596 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20597 +void jnode_attach_page(jnode * node, struct page *pg);
20598 +
20599 +void unhash_unformatted_jnode(jnode *);
20600 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20601 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20602 +extern void jnode_make_dirty(jnode * node) NONNULL;
20603 +extern void jnode_make_clean(jnode * node) NONNULL;
20604 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20605 +extern void jnode_make_wander(jnode *) NONNULL;
20606 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20607 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20608 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20609 +
20610 +/**
20611 + * jnode_get_block
20612 + * @node: jnode to query
20613 + *
20614 + */
20615 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20616 +{
20617 + assert("nikita-528", node != NULL);
20618 +
20619 + return &node->blocknr;
20620 +}
20621 +
20622 +/**
20623 + * jnode_set_block
20624 + * @node: jnode to update
20625 + * @blocknr: new block nr
20626 + */
20627 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20628 +{
20629 + assert("nikita-2020", node != NULL);
20630 + assert("umka-055", blocknr != NULL);
20631 + node->blocknr = *blocknr;
20632 +}
20633 +
20634 +
20635 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
20636 + * jnode was emergency flushed---then block number chosen by eflush is
20637 + * used. */
20638 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20639 +{
20640 + assert("nikita-2768", node != NULL);
20641 + assert_spin_locked(&(node->guard));
20642 +
20643 + return jnode_get_block(node);
20644 +}
20645 +
20646 +/* Jnode flush interface. */
20647 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20648 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20649 +
20650 +/* FIXME-VS: these are used in plugin/item/extent.c */
20651 +
20652 +/* does extent_get_block have to be called */
20653 +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20654 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20655 +
20656 +/* the node should be converted during flush squalloc phase */
20657 +#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20658 +#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20659 +
20660 +/* Macros to convert from jnode to znode, znode to jnode. These are macros
20661 + because C doesn't allow overloading of const prototypes. */
20662 +#define ZJNODE(x) (& (x) -> zjnode)
20663 +#define JZNODE(x) \
20664 +({ \
20665 + typeof (x) __tmp_x; \
20666 + \
20667 + __tmp_x = (x); \
20668 + assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20669 + (znode*) __tmp_x; \
20670 +})
20671 +
20672 +extern int jnodes_tree_init(reiser4_tree * tree);
20673 +extern int jnodes_tree_done(reiser4_tree * tree);
20674 +
20675 +#if REISER4_DEBUG
20676 +
20677 +extern int znode_is_any_locked(const znode * node);
20678 +extern void jnode_list_remove(jnode * node);
20679 +
20680 +#else
20681 +
20682 +#define jnode_list_remove(node) noop
20683 +
20684 +#endif
20685 +
20686 +int znode_is_root(const znode * node) NONNULL;
20687 +
20688 +/* bump reference counter on @node */
20689 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20690 +{
20691 + assert("nikita-1911", node != NULL);
20692 +
20693 + atomic_inc(&node->x_count);
20694 + LOCK_CNT_INC(x_refs);
20695 +}
20696 +
20697 +static inline void dec_x_ref(jnode * node)
20698 +{
20699 + assert("nikita-3215", node != NULL);
20700 + assert("nikita-3216", atomic_read(&node->x_count) > 0);
20701 +
20702 + atomic_dec(&node->x_count);
20703 + assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20704 + LOCK_CNT_DEC(x_refs);
20705 +}
20706 +
20707 +/* jref() - increase counter of references to jnode/znode (x_count) */
20708 +static inline jnode *jref(jnode * node)
20709 +{
20710 + assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20711 + add_x_ref(node);
20712 + return node;
20713 +}
20714 +
20715 +/* get the page of jnode */
20716 +static inline struct page *jnode_page(const jnode * node)
20717 +{
20718 + return node->pg;
20719 +}
20720 +
20721 +/* return pointer to jnode data */
20722 +static inline char *jdata(const jnode * node)
20723 +{
20724 + assert("nikita-1415", node != NULL);
20725 + assert("nikita-3198", jnode_page(node) != NULL);
20726 + return node->data;
20727 +}
20728 +
20729 +static inline int jnode_is_loaded(const jnode * node)
20730 +{
20731 + assert("zam-506", node != NULL);
20732 + return atomic_read(&node->d_count) > 0;
20733 +}
20734 +
20735 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20736 +
20737 +static inline void jnode_set_reloc(jnode * node)
20738 +{
20739 + assert("nikita-2431", node != NULL);
20740 + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20741 + JF_SET(node, JNODE_RELOC);
20742 +}
20743 +
20744 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20745 +
20746 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20747 +
20748 +static inline int jload(jnode *node)
20749 +{
20750 + return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20751 +}
20752 +
20753 +extern int jinit_new(jnode *, gfp_t) NONNULL;
20754 +extern int jstartio(jnode *) NONNULL;
20755 +
20756 +extern void jdrop(jnode *) NONNULL;
20757 +extern int jwait_io(jnode *, int rw) NONNULL;
20758 +
20759 +void jload_prefetch(jnode *);
20760 +
20761 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20762 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
20763 +
20764 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
20765 +{
20766 + assert("nikita-2691", node != NULL);
20767 + return node->tree;
20768 +}
20769 +
20770 +extern void pin_jnode_data(jnode *);
20771 +extern void unpin_jnode_data(jnode *);
20772 +
20773 +static inline jnode_type jnode_get_type(const jnode * node)
20774 +{
20775 + static const unsigned long state_mask =
20776 + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20777 +
20778 + static jnode_type mask_to_type[] = {
20779 + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20780 +
20781 + /* 000 */
20782 + [0] = JNODE_FORMATTED_BLOCK,
20783 + /* 001 */
20784 + [1] = JNODE_UNFORMATTED_BLOCK,
20785 + /* 010 */
20786 + [2] = JNODE_BITMAP,
20787 + /* 011 */
20788 + [3] = LAST_JNODE_TYPE, /*invalid */
20789 + /* 100 */
20790 + [4] = JNODE_INODE,
20791 + /* 101 */
20792 + [5] = LAST_JNODE_TYPE,
20793 + /* 110 */
20794 + [6] = JNODE_IO_HEAD,
20795 + /* 111 */
20796 + [7] = LAST_JNODE_TYPE, /* invalid */
20797 + };
20798 +
20799 + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20800 +}
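
/*
 * Worked example (editorial): jnode_set_type() in jnode.c stores
 * type_to_mask[JNODE_BITMAP] == 2 (binary 010) at bits
 * JNODE_TYPE_1..JNODE_TYPE_3, i.e. state |= 2 << 13. Decoding here reverses
 * that: (state & state_mask) >> JNODE_TYPE_1 == 2, and mask_to_type[2] is
 * JNODE_BITMAP again. JNODE_FORMATTED_BLOCK encodes as 0, so the most
 * common type costs no bit manipulation on a freshly zeroed node.
 */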
20801 +
20802 +/* returns true if node is a znode */
20803 +static inline int jnode_is_znode(const jnode * node)
20804 +{
20805 + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20806 +}
20807 +
20808 +static inline int jnode_is_flushprepped(jnode * node)
20809 +{
20810 + assert("jmacd-78212", node != NULL);
20811 + assert_spin_locked(&(node->guard));
20812 + return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20813 + JF_ISSET(node, JNODE_OVRWR);
20814 +}
20815 +
20816 +/* Return true if @node has already been processed by the squeeze and allocate
20817 + process. This implies the block address has been finalized for the
20818 + duration of this atom (or it is clean and will remain in place). If this
20819 + returns true you may use the block number as a hint. */
20820 +static inline int jnode_check_flushprepped(jnode * node)
20821 +{
20822 + int result;
20823 +
20824 + /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20825 + spin_lock_jnode(node);
20826 + result = jnode_is_flushprepped(node);
20827 + spin_unlock_jnode(node);
20828 + return result;
20829 +}
20830 +
20831 +/* returns true if node is unformatted */
20832 +static inline int jnode_is_unformatted(const jnode * node)
20833 +{
20834 + assert("jmacd-0123", node != NULL);
20835 + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20836 +}
20837 +
20838 +/* returns true if node represents a cluster cache page */
20839 +static inline int jnode_is_cluster_page(const jnode * node)
20840 +{
20841 + assert("edward-50", node != NULL);
20842 + return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20843 +}
20844 +
20845 +/* returns true if node is an inode's builtin jnode */
20846 +static inline int jnode_is_inode(const jnode * node)
20847 +{
20848 + assert("vs-1240", node != NULL);
20849 + return jnode_get_type(node) == JNODE_INODE;
20850 +}
20851 +
20852 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20853 +{
20854 + assert("nikita-2367", type < LAST_JNODE_TYPE);
20855 + return jnode_plugin_by_id((reiser4_plugin_id) type);
20856 +}
20857 +
20858 +static inline jnode_plugin *jnode_ops(const jnode * node)
20859 +{
20860 + assert("nikita-2366", node != NULL);
20861 +
20862 + return jnode_ops_of(jnode_get_type(node));
20863 +}
20864 +
20865 +/* Get the index of a block. */
20866 +static inline unsigned long jnode_get_index(jnode * node)
20867 +{
20868 + return jnode_ops(node)->index(node);
20869 +}
20870 +
20871 +/* return true if "node" is the root */
20872 +static inline int jnode_is_root(const jnode * node)
20873 +{
20874 + return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20875 +}
20876 +
20877 +extern struct address_space *mapping_jnode(const jnode * node);
20878 +extern unsigned long index_jnode(const jnode * node);
20879 +
20880 +static inline void jput(jnode * node);
20881 +extern void jput_final(jnode * node);
20882 +
20883 +/* bump data counter on @node */
20884 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20885 +{
20886 + assert("nikita-1962", node != NULL);
20887 +
20888 + atomic_inc(&node->d_count);
20889 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
20890 + LOCK_CNT_INC(d_refs);
20891 +}
20892 +
20893 +/* jput() - decrement x_count reference counter on jnode.
20894 +
20895 + Count may drop to 0, jnode stays in cache until memory pressure causes the
20896 + eviction of its page. The c_count variable also ensures that children are
20897 + pressured out of memory before the parent. The jnode remains hashed as
20898 + long as the VM allows its page to stay in memory.
20899 +*/
20900 +static inline void jput(jnode * node)
20901 +{
20902 + assert("jmacd-509", node != NULL);
20903 + assert("jmacd-510", atomic_read(&node->x_count) > 0);
20904 + assert("zam-926", reiser4_schedulable());
20905 + LOCK_CNT_DEC(x_refs);
20906 +
20907 + rcu_read_lock();
20908 + /*
20909 + * we don't need any kind of lock here--jput_final() uses RCU.
20910 + */
20911 + if (unlikely(atomic_dec_and_test(&node->x_count))) {
20912 + jput_final(node);
20913 + } else
20914 + rcu_read_unlock();
20915 + assert("nikita-3473", reiser4_schedulable());
20916 +}
20917 +
20918 +extern void jrelse(jnode * node);
20919 +extern void jrelse_tail(jnode * node);
20920 +
20921 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20922 +
20923 +/* resolve race with jput */
20924 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20925 +{
20926 + if (unlikely(JF_ISSET(node, JNODE_RIP)))
20927 + node = jnode_rip_sync(tree, node);
20928 + return node;
20929 +}
20930 +
20931 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20932 +
20933 +#if REISER4_DEBUG
20934 +extern int jnode_invariant_f(const jnode *node, char const **msg);
20935 +#endif
20936 +
20937 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20938 +
20939 +/* __JNODE_H__ */
20940 +#endif
20941 +
20942 +/* Make Linus happy.
20943 + Local variables:
20944 + c-indentation-style: "K&R"
20945 + mode-name: "LC"
20946 + c-basic-offset: 8
20947 + tab-width: 8
20948 + fill-column: 120
20949 + End:
20950 +*/
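The x_count protocol above can be exercised outside the kernel. Below is a minimal userspace sketch, not part of the patch: C11 atomics stand in for the kernel's atomic_t, the RCU machinery is omitted, and, unlike a real jnode (which may stay cached at x_count == 0 until its page is evicted), the toy node is released as soon as the last reference is dropped.

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct toy_jnode {
		atomic_int x_count;		/* external references */
	};

	static struct toy_jnode *toy_jnew(void)
	{
		struct toy_jnode *node = malloc(sizeof *node);

		atomic_init(&node->x_count, 1);	/* creator holds one reference */
		return node;
	}

	static void toy_jref(struct toy_jnode *node)
	{
		atomic_fetch_add(&node->x_count, 1);
	}

	static void toy_jput(struct toy_jnode *node)
	{
		/* analogue of atomic_dec_and_test() followed by jput_final() */
		if (atomic_fetch_sub(&node->x_count, 1) == 1) {
			printf("last reference gone, releasing node\n");
			free(node);
		}
	}

	int main(void)
	{
		struct toy_jnode *node = toy_jnew();

		toy_jref(node);		/* a second user takes a reference */
		toy_jput(node);		/* ... and drops it */
		toy_jput(node);		/* creator's reference: node released */
		return 0;
	}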
20951 diff -urN linux-2.6.22.orig/fs/reiser4/kassign.c linux-2.6.22/fs/reiser4/kassign.c
20952 --- linux-2.6.22.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
20953 +++ linux-2.6.22/fs/reiser4/kassign.c 2007-07-29 00:25:34.880697512 +0400
20954 @@ -0,0 +1,661 @@
20955 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20956 + * reiser4/README */
20957 +
20958 +/* Key assignment policy implementation */
20959 +
20960 +/*
20961 + * In reiser4 every piece of file system data and meta-data has a key. Keys
20962 + * are used to store information in and retrieve it from reiser4 internal
20963 + * tree. In addition to this, keys define _ordering_ of all file system
20964 + * information: things having close keys are placed into the same or
20965 + * neighboring (in the tree order) nodes of the tree. As our block allocator
20966 + * tries to respect tree order (see flush.c), keys also define order in which
20967 + * things are laid out on the disk, and hence, affect performance directly.
20968 + *
20969 + * Obviously, assignment of keys to data and meta-data should be consistent
20970 + * across whole file system. Algorithm that calculates a key for a given piece
20971 + * of data or meta-data is referred to as "key assignment".
20972 + *
20973 + * Key assignment is too expensive to be implemented as a plugin (that is,
20974 + * with an ability to support different key assignment schemas in the same
20975 + * compiled kernel image). As a compromise, all key-assignment functions and
20976 + * data-structures are collected in this single file, so that modifications to
20977 + * key assignment algorithm can be localized. Additional changes may be
20978 + * required in key.[ch].
20979 + *
20980 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
20981 + * may guess, there is "Plan B" too.
20982 + *
20983 + */
20984 +
20985 +/*
20986 + * Additional complication with key assignment implementation is a requirement
20987 + * to support different key length.
20988 + */
20989 +
20990 +/*
20991 + * KEY ASSIGNMENT: PLAN A, LONG KEYS.
20992 + *
20993 + * DIRECTORY ITEMS
20994 + *
20995 + * | 60 | 4 | 7 |1| 56 | 64 | 64 |
20996 + * +--------------+---+---+-+-------------+------------------+-----------------+
20997 + * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
20998 + * +--------------+---+---+-+-------------+------------------+-----------------+
20999 + * | | | | |
21000 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21001 + *
21002 + * dirid objectid of directory this item is for
21003 + *
21004 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21005 + *
21006 + * H 1 if last 8 bytes of the key contain hash,
21007 + * 0 if last 8 bytes of the key contain prefix-3
21008 + *
21009 + * prefix-1 first 7 characters of file name.
21010 + * Padded by zeroes if name is not long enough.
21011 + *
21012 + * prefix-2 next 8 characters of the file name.
21013 + *
21014 + * prefix-3 next 8 characters of the file name.
21015 + *
21016 + * hash hash of the rest of file name (i.e., portion of file
21017 + * name not included into prefix-1 and prefix-2).
21018 + *
21019 + * File names not longer than 23 (== 7 + 8 + 8) characters are completely
21020 + * encoded in the key. Such file names are called "short". They are
21021 + * distinguished by the H bit being 0 in the key.
21022 + *
21023 + * Other file names are "long". For a long name, the H bit is 1, and the
21024 + * first 15 (== 7 + 8) characters are encoded in the prefix-1 and prefix-2
21025 + * portions of the key. The last 8 bytes of the key are occupied by a hash
21026 + * of the remaining characters of the name.
21027 + *
21028 + * This key assignment reaches following important goals:
21029 + *
21030 + * (1) directory entries are sorted in approximately lexicographical
21031 + * order.
21032 + *
21033 + * (2) collisions (when multiple directory items have the same key), while
21034 + * principally unavoidable in a tree with fixed length keys, are rare.
21035 + *
21036 + * STAT DATA
21037 + *
21038 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21039 + * +--------------+---+-----------------+---+--------------+-----------------+
21040 + * | locality id | 1 | ordering | 0 | objectid | 0 |
21041 + * +--------------+---+-----------------+---+--------------+-----------------+
21042 + * | | | | |
21043 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21044 + *
21045 + * locality id object id of a directory where first name was created for
21046 + * the object
21047 + *
21048 + * ordering copy of second 8-byte portion of the key of directory
21049 + * entry for the first name of this object. Ordering has a form
21050 + * {
21051 + * fibration :7;
21052 + * h :1;
21053 + * prefix1 :56;
21054 + * }
21055 + * see description of key for directory entry above.
21056 + *
21057 + * objectid object id for this object
21058 + *
21059 + * This key assignment policy is designed to keep stat-data in the same order
21060 + * as corresponding directory items, thus speeding up readdir/stat types of
21061 + * workload.
21062 + *
21063 + * FILE BODY
21064 + *
21065 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21066 + * +--------------+---+-----------------+---+--------------+-----------------+
21067 + * | locality id | 4 | ordering | 0 | objectid | offset |
21068 + * +--------------+---+-----------------+---+--------------+-----------------+
21069 + * | | | | |
21070 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21071 + *
21072 + * locality id object id of a directory where first name was created for
21073 + * the object
21074 + *
21075 + * ordering the same as in the key of stat-data for this object
21076 + *
21077 + * objectid object id for this object
21078 + *
21079 + * offset logical offset from the beginning of this file.
21080 + * Measured in bytes.
21081 + *
21082 + *
21083 + * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21084 + *
21085 + * DIRECTORY ITEMS
21086 + *
21087 + * | 60 | 4 | 7 |1| 56 | 64 |
21088 + * +--------------+---+---+-+-------------+-----------------+
21089 + * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21090 + * +--------------+---+---+-+-------------+-----------------+
21091 + * | | | |
21092 + * | 8 bytes | 8 bytes | 8 bytes |
21093 + *
21094 + * dirid objectid of directory this item is for
21095 + *
21096 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21097 + *
21098 + * H 1 if last 8 bytes of the key contain hash,
21099 + * 0 if last 8 bytes of the key contain prefix-2
21100 + *
21101 + * prefix-1 first 7 characters of file name.
21102 + * Padded by zeroes if name is not long enough.
21103 + *
21104 + * prefix-2 next 8 characters of the file name.
21105 + *
21106 + * hash hash of the rest of file name (i.e., portion of file
21107 + * name not included into prefix-1).
21108 + *
21109 + * File names not longer than 15 (== 7 + 8) characters are completely encoded
21110 + * in the key. Such file names are called "short". They are distinguished by
21111 + * the H bit being 0 in the key.
21112 + *
21113 + * Other file names are "long". For a long name, the H bit is 1, and the first
21114 + * 7 characters are encoded in the prefix-1 portion of the key. The last 8
21115 + * bytes of the key are occupied by a hash of the remaining name characters.
21116 + *
21117 + * STAT DATA
21118 + *
21119 + * | 60 | 4 | 4 | 60 | 64 |
21120 + * +--------------+---+---+--------------+-----------------+
21121 + * | locality id | 1 | 0 | objectid | 0 |
21122 + * +--------------+---+---+--------------+-----------------+
21123 + * | | | |
21124 + * | 8 bytes | 8 bytes | 8 bytes |
21125 + *
21126 + * locality id object id of a directory where first name was created for
21127 + * the object
21128 + *
21129 + * objectid object id for this object
21130 + *
21131 + * FILE BODY
21132 + *
21133 + * | 60 | 4 | 4 | 60 | 64 |
21134 + * +--------------+---+---+--------------+-----------------+
21135 + * | locality id | 4 | 0 | objectid | offset |
21136 + * +--------------+---+---+--------------+-----------------+
21137 + * | | | |
21138 + * | 8 bytes | 8 bytes | 8 bytes |
21139 + *
21140 + * locality id object id of a directory where first name was created for
21141 + * the object
21142 + *
21143 + * objectid object id for this object
21144 + *
21145 + * offset logical offset from the beginning of this file.
21146 + * Measured in bytes.
21147 + *
21148 + *
21149 + */
21150 +
21151 +#include "debug.h"
21152 +#include "key.h"
21153 +#include "kassign.h"
21154 +#include "vfs_ops.h"
21155 +#include "inode.h"
21156 +#include "super.h"
21157 +#include "dscale.h"
21158 +
21159 +#include <linux/types.h> /* for __u?? */
21160 +#include <linux/fs.h> /* for struct super_block, etc */
21161 +
21162 +/* bitmask for H bit (see comment at the beginning of this file) */
21163 +static const __u64 longname_mark = 0x0100000000000000ull;
21164 +/* bitmask for F and H portions of the key. */
21165 +static const __u64 fibration_mask = 0xff00000000000000ull;
21166 +
21167 +/* return true if name is not completely encoded in @key */
21168 +int is_longname_key(const reiser4_key * key)
21169 +{
21170 + __u64 highpart;
21171 +
21172 + assert("nikita-2863", key != NULL);
21173 + if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21174 + reiser4_print_key("oops", key);
21175 + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21176 +
21177 + if (REISER4_LARGE_KEY)
21178 + highpart = get_key_ordering(key);
21179 + else
21180 + highpart = get_key_objectid(key);
21181 +
21182 + return (highpart & longname_mark) ? 1 : 0;
21183 +}
21184 +
21185 +/* return true if @name is too long to be completely encoded in the key */
21186 +int is_longname(const char *name UNUSED_ARG, int len)
21187 +{
21188 + if (REISER4_LARGE_KEY)
21189 + return len > 23;
21190 + else
21191 + return len > 15;
21192 +}
21193 +
21194 +/* encode ascii string into __u64.
21195 +
21196 +   Put characters of @name into the result (@str) one after another, starting
21197 +   from the @start_idx-th highest (arithmetically) byte. This produces an
21198 +   endian-safe encoding; a plain memcpy(3) will not do.
21199 +
21200 +*/
21201 +static __u64 pack_string(const char *name /* string to encode */ ,
21202 + int start_idx /* highest byte in result from
21203 + * which to start encoding */ )
21204 +{
21205 + unsigned i;
21206 + __u64 str;
21207 +
21208 + str = 0;
21209 + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21210 + str <<= 8;
21211 + str |= (unsigned char)name[i];
21212 + }
21213 + str <<= (sizeof str - i - start_idx) << 3;
21214 + return str;
21215 +}
21216 +
21217 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21218 + * string encoded in it and stores result in @buf */
21219 +char * reiser4_unpack_string(__u64 value, char *buf)
21220 +{
21221 + do {
21222 + *buf = value >> (64 - 8);
21223 + if (*buf)
21224 + ++buf;
21225 + value <<= 8;
21226 + } while (value != 0);
21227 + *buf = 0;
21228 + return buf;
21229 +}
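A userspace round trip makes the packing concrete. In this sketch (not part of the patch; __u64 replaced by uint64_t, both functions copied from above), pack_string("reiser", 1) leaves the top byte clear for the F/H bits and yields 0x0072656973657200, which unpacks back to the original name:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t pack_string(const char *name, int start_idx)
	{
		unsigned i;
		uint64_t str = 0;

		for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
			str <<= 8;
			str |= (unsigned char)name[i];
		}
		str <<= (sizeof str - i - start_idx) << 3;
		return str;
	}

	static char *unpack_string(uint64_t value, char *buf)
	{
		do {
			*buf = value >> (64 - 8);
			if (*buf)
				++buf;
			value <<= 8;
		} while (value != 0);
		*buf = 0;
		return buf;
	}

	int main(void)
	{
		char buf[9];
		uint64_t v = pack_string("reiser", 1);

		printf("packed:   %#018llx\n", (unsigned long long)v);
		unpack_string(v, buf);
		printf("unpacked: %s\n", buf);	/* prints "reiser" */
		return 0;
	}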
21230 +
21231 +/* obtain name encoded in @key and store it in @buf */
21232 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21233 +{
21234 + char *c;
21235 +
21236 + assert("nikita-2868", !is_longname_key(key));
21237 +
21238 + c = buf;
21239 + if (REISER4_LARGE_KEY) {
21240 + c = reiser4_unpack_string(get_key_ordering(key) &
21241 + ~fibration_mask, c);
21242 + c = reiser4_unpack_string(get_key_fulloid(key), c);
21243 + } else
21244 + c = reiser4_unpack_string(get_key_fulloid(key) &
21245 + ~fibration_mask, c);
21246 + reiser4_unpack_string(get_key_offset(key), c);
21247 + return buf;
21248 +}
21249 +
21250 +/**
21251 + * complete_entry_key - calculate entry key by name
21252 + * @dir: directory where entry is (or will be) in
21253 + * @name: name to calculate key of
21254 + * @len: length of name
21255 + * @result: place to store result in
21256 + *
21257 + * Sets fields of entry key @result which depend on file name.
21258 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21259 + * objectid and offset. Otherwise, objectid and offset are set.
21260 + */
21261 +void complete_entry_key(const struct inode *dir, const char *name,
21262 + int len, reiser4_key *result)
21263 +{
21264 +#if REISER4_LARGE_KEY
21265 + __u64 ordering;
21266 + __u64 objectid;
21267 + __u64 offset;
21268 +
21269 + assert("nikita-1139", dir != NULL);
21270 + assert("nikita-1142", result != NULL);
21271 + assert("nikita-2867", strlen(name) == len);
21272 +
21273 +	/*
21274 +	 * key allocation algorithm for directory entries in case of large
21275 +	 * keys:
21276 +	 *
21277 +	 * If the name is not longer than 7 + 8 + 8 = 23 characters, put the
21278 +	 * first 7 characters into the ordering field of the key, the next 8
21279 +	 * characters (if any) into the objectid field and the next 8 (if
21280 +	 * any) into the offset field.
21281 +	 *
21282 +	 * If the file name is longer than 23 characters, put the first 7
21283 +	 * characters into the key's ordering, the next 8 into objectid, and
21284 +	 * a hash of the remaining characters into the offset field.
21285 +	 *
21286 +	 * To distinguish the two cases, set the otherwise unused high bit
21287 +	 * of the ordering field in the latter.
21288 +	 */
21289 +
21290 + /* [0-6] characters to ordering */
21291 + ordering = pack_string(name, 1);
21292 + if (len > 7) {
21293 + /* [7-14] characters to objectid */
21294 + objectid = pack_string(name + 7, 0);
21295 + if (len > 15) {
21296 + if (len <= 23) {
21297 +				/* [15-22] characters to offset */
21298 + offset = pack_string(name + 15, 0);
21299 + } else {
21300 +				/* record in the key that offset contains a hash */
21301 + ordering |= longname_mark;
21302 +
21303 + /* offset is the hash of the file name's tail. */
21304 + offset = inode_hash_plugin(dir)->hash(name + 15,
21305 + len - 15);
21306 + }
21307 + } else {
21308 + offset = 0ull;
21309 + }
21310 + } else {
21311 + objectid = 0ull;
21312 + offset = 0ull;
21313 + }
21314 +
21315 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21316 + ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21317 +
21318 + set_key_ordering(result, ordering);
21319 + set_key_fulloid(result, objectid);
21320 + set_key_offset(result, offset);
21321 + return;
21322 +
21323 +#else
21324 + __u64 objectid;
21325 + __u64 offset;
21326 +
21327 + assert("nikita-1139", dir != NULL);
21328 + assert("nikita-1142", result != NULL);
21329 + assert("nikita-2867", strlen(name) == len);
21330 +
21331 +	/*
21332 +	 * key allocation algorithm for directory entries in case of short
21333 +	 * keys:
21334 +	 *
21335 +	 * If the name is not longer than 7 + 8 = 15 characters, put the
21336 +	 * first 7 characters into the objectid field of the key and the
21337 +	 * next 8 characters (if any) into the offset field.
21338 +	 *
21339 +	 * If the file name is longer than 15 characters, put the first 7
21340 +	 * characters into the key's objectid and a hash of the remaining
21341 +	 * characters into the offset field.
21342 +	 *
21343 +	 * To distinguish the two cases, set the otherwise unused high bit
21344 +	 * of the objectid field in the latter.
21345 +	 */
21346 +
21347 + /* [0-6] characters to objectid */
21348 + objectid = pack_string(name, 1);
21349 + if (len > 7) {
21350 + if (len <= 15) {
21351 + /* [7-14] characters to offset */
21352 + offset = pack_string(name + 7, 0);
21353 + } else {
21354 +			/* record in the key that offset contains a hash */
21355 + objectid |= longname_mark;
21356 +
21357 +			/* offset is the hash of the file name's tail. */
21358 + offset = inode_hash_plugin(dir)->hash(name + 7,
21359 + len - 7);
21360 + }
21361 + } else
21362 + offset = 0ull;
21363 +
21364 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21365 + objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21366 +
21367 + set_key_fulloid(result, objectid);
21368 + set_key_offset(result, offset);
21369 + return;
21370 +#endif /* ! REISER4_LARGE_KEY */
21371 +}
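A worked example helps here. The sketch below replays the large-key branch in userspace; it is illustrative only: the fibration bits are omitted and inode_hash_plugin() is replaced by a toy hash, so only the field layout, not the exact values, matches the kernel function.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define LONGNAME_MARK 0x0100000000000000ull

	static uint64_t pack_string(const char *name, int start_idx)
	{
		unsigned i;
		uint64_t str = 0;

		for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
			str <<= 8;
			str |= (unsigned char)name[i];
		}
		str <<= (sizeof str - i - start_idx) << 3;
		return str;
	}

	/* stand-in for inode_hash_plugin(dir)->hash(); NOT the real hash */
	static uint64_t toy_hash(const char *s, int len)
	{
		uint64_t h = 0;

		while (len--)
			h = h * 131 + (unsigned char)*s++;
		return h;
	}

	static void entry_key_large(const char *name)
	{
		int len = strlen(name);
		uint64_t ordering = pack_string(name, 1);
		uint64_t objectid = 0, offset = 0;

		if (len > 7) {
			objectid = pack_string(name + 7, 0);
			if (len > 15) {
				if (len <= 23)
					offset = pack_string(name + 15, 0);
				else {
					ordering |= LONGNAME_MARK;
					offset = toy_hash(name + 15, len - 15);
				}
			}
		}
		/* fibration bits (F) are left zero in this sketch */
		printf("%-30s ord=%016llx oid=%016llx off=%016llx\n", name,
		       (unsigned long long)ordering,
		       (unsigned long long)objectid,
		       (unsigned long long)offset);
	}

	int main(void)
	{
		entry_key_large("short");	/* fits in ordering */
		entry_key_large("medium-name");	/* spills into objectid */
		entry_key_large("twenty-three-chars-name");	/* 23 chars */
		entry_key_large("a-definitely-longer-file-name"); /* hashed */
		return 0;
	}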
21372 +
21373 +/* true, if @key is the key of "." */
21374 +int is_dot_key(const reiser4_key * key /* key to check */ )
21375 +{
21376 + assert("nikita-1717", key != NULL);
21377 + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21378 + return
21379 + (get_key_ordering(key) == 0ull) &&
21380 + (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21381 +}
21382 +
21383 +/* build key for stat-data.
21384 +
21385 +   return key of stat-data of this object. This should become an sd plugin
21386 +   method in the future. For now, let it be here.
21387 +
21388 +*/
21389 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21390 + reiser4_key * result /* resulting key of @target
21391 + stat-data */ )
21392 +{
21393 + assert("nikita-261", result != NULL);
21394 +
21395 + reiser4_key_init(result);
21396 + set_key_locality(result, reiser4_inode_data(target)->locality_id);
21397 + set_key_ordering(result, get_inode_ordering(target));
21398 + set_key_objectid(result, get_inode_oid(target));
21399 + set_key_type(result, KEY_SD_MINOR);
21400 + set_key_offset(result, (__u64) 0);
21401 + return result;
21402 +}
21403 +
21404 +/* encode part of key into &obj_key_id
21405 +
21406 + This encodes into @id part of @key sufficient to restore @key later,
21407 + given that latter is key of object (key of stat-data).
21408 +
21409 + See &obj_key_id
21410 +*/
21411 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21412 + obj_key_id * id /* id where key is encoded in */ )
21413 +{
21414 + assert("nikita-1151", key != NULL);
21415 + assert("nikita-1152", id != NULL);
21416 +
21417 + memcpy(id, key, sizeof *id);
21418 + return 0;
21419 +}
21420 +
21421 +/* encode reference to @obj in @id.
21422 +
21423 + This is like build_obj_key_id() above, but takes inode as parameter. */
21424 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21425 + obj_key_id * id /* result */ )
21426 +{
21427 + reiser4_key sdkey;
21428 +
21429 + assert("nikita-1166", obj != NULL);
21430 + assert("nikita-1167", id != NULL);
21431 +
21432 + build_sd_key(obj, &sdkey);
21433 + build_obj_key_id(&sdkey, id);
21434 + return 0;
21435 +}
21436 +
21437 +/* decode @id back into @key
21438 +
21439 + Restore key of object stat-data from @id. This is dual to
21440 + build_obj_key_id() above.
21441 +*/
21442 +int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21443 + * from */ ,
21444 + reiser4_key * key /* result */ )
21445 +{
21446 + assert("nikita-1153", id != NULL);
21447 + assert("nikita-1154", key != NULL);
21448 +
21449 + reiser4_key_init(key);
21450 + memcpy(key, id, sizeof *id);
21451 + return 0;
21452 +}
21453 +
21454 +/* extract objectid of directory from key of directory entry within said
21455 + directory.
21456 + */
21457 +oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21458 + * directory
21459 + * entry */ )
21460 +{
21461 + assert("nikita-1314", de_key != NULL);
21462 + return get_key_locality(de_key);
21463 +}
21464 +
21465 +/* encode into @id key of directory entry.
21466 +
21467 + Encode into @id information sufficient to later distinguish directory
21468 + entries within the same directory. This is not whole key, because all
21469 + directory entries within directory item share locality which is equal
21470 + to objectid of their directory.
21471 +
21472 +*/
21473 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21474 + const struct qstr *name /* name to be given to @obj by
21475 + * directory entry being
21476 + * constructed */ ,
21477 + de_id * id /* short key of directory entry */ )
21478 +{
21479 + reiser4_key key;
21480 +
21481 + assert("nikita-1290", dir != NULL);
21482 + assert("nikita-1292", id != NULL);
21483 +
21484 + /* NOTE-NIKITA this is suboptimal. */
21485 + inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21486 + return build_de_id_by_key(&key, id);
21487 +}
21488 +
21489 +/* encode into @id key of directory entry.
21490 +
21491 + Encode into @id information sufficient to later distinguish directory
21492 + entries within the same directory. This is not whole key, because all
21493 + directory entries within directory item share locality which is equal
21494 + to objectid of their directory.
21495 +
21496 +*/
21497 +int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21498 + * entry */ ,
21499 + de_id * id /* short key of directory entry */ )
21500 +{
21501 + memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21502 + return 0;
21503 +}
21504 +
21505 +/* restore from @id key of directory entry.
21506 +
21507 + Function dual to build_de_id(): given @id and locality, build full
21508 + key of directory entry within directory item.
21509 +
21510 +*/
21511 +int extract_key_from_de_id(const oid_t locality /* locality of directory
21512 + * entry */ ,
21513 + const de_id * id /* directory entry id */ ,
21514 + reiser4_key * key /* result */ )
21515 +{
21516 + /* no need to initialise key here: all fields are overwritten */
21517 + memcpy(((__u64 *) key) + 1, id, sizeof *id);
21518 + set_key_locality(key, locality);
21519 + set_key_type(key, KEY_FILE_NAME_MINOR);
21520 + return 0;
21521 +}
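The round trip is easy to demonstrate. The sketch below is illustrative only: it models a large key as four plain uint64_t elements in cpu byte order (the kernel stores them little-endian); a de_id is simply elements 1..3 of the entry key, and element 0 is rebuilt from the directory's locality.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define KEY_ELEMENTS		4	/* large-key layout */
	#define KEY_LOCALITY_SHIFT	4
	#define KEY_FILE_NAME_MINOR	0

	int main(void)
	{
		uint64_t locality = 0xd1;	/* hypothetical directory oid */
		uint64_t key[KEY_ELEMENTS] = {
			(locality << KEY_LOCALITY_SHIFT) | KEY_FILE_NAME_MINOR,
			0x0061626364656667ull,	/* ordering: packed name */
			0x6869000000000000ull,	/* objectid: packed name */
			0x0ull			/* offset */
		};
		uint64_t de_id[KEY_ELEMENTS - 1];
		uint64_t restored[KEY_ELEMENTS];

		/* build_de_id_by_key(): drop element 0, keep the rest */
		memcpy(de_id, key + 1, sizeof de_id);

		/* extract_key_from_de_id(): restore elements, rebuild el. 0 */
		memcpy(restored + 1, de_id, sizeof de_id);
		restored[0] = (locality << KEY_LOCALITY_SHIFT) |
			      KEY_FILE_NAME_MINOR;

		printf("round trip %s\n",
		       memcmp(key, restored, sizeof key) == 0 ? "ok" : "BROKEN");
		return 0;
	}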
21522 +
21523 +/* compare two &de_id's */
21524 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21525 + const de_id * id2 /* second &de_id to compare */ )
21526 +{
21527 + /* NOTE-NIKITA ugly implementation */
21528 + reiser4_key k1;
21529 + reiser4_key k2;
21530 +
21531 + extract_key_from_de_id((oid_t) 0, id1, &k1);
21532 + extract_key_from_de_id((oid_t) 0, id2, &k2);
21533 + return keycmp(&k1, &k2);
21534 +}
21535 +
21536 +/* compare &de_id with key */
21537 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21538 + const reiser4_key * key /* key to compare */ )
21539 +{
21540 + cmp_t result;
21541 + reiser4_key *k1;
21542 +
21543 + k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21544 + result = KEY_DIFF_EL(k1, key, 1);
21545 + if (result == EQUAL_TO) {
21546 + result = KEY_DIFF_EL(k1, key, 2);
21547 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21548 + result = KEY_DIFF_EL(k1, key, 3);
21549 + }
21550 + }
21551 + return result;
21552 +}
21553 +
21554 +/*
21555 + * return number of bytes necessary to encode @inode identity.
21556 + */
21557 +int inode_onwire_size(const struct inode *inode)
21558 +{
21559 + int result;
21560 +
21561 + result = dscale_bytes(get_inode_oid(inode));
21562 + result += dscale_bytes(get_inode_locality(inode));
21563 +
21564 + /*
21565 + * ordering is large (it usually has highest bits set), so it makes
21566 + * little sense to dscale it.
21567 + */
21568 + if (REISER4_LARGE_KEY)
21569 + result += sizeof(get_inode_ordering(inode));
21570 + return result;
21571 +}
21572 +
21573 +/*
21574 + * encode @inode identity at @start
21575 + */
21576 +char *build_inode_onwire(const struct inode *inode, char *start)
21577 +{
21578 + start += dscale_write(start, get_inode_locality(inode));
21579 + start += dscale_write(start, get_inode_oid(inode));
21580 +
21581 + if (REISER4_LARGE_KEY) {
21582 + put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21583 + start += sizeof(get_inode_ordering(inode));
21584 + }
21585 + return start;
21586 +}
21587 +
21588 +/*
21589 + * extract key that was previously encoded by build_inode_onwire() at @addr
21590 + */
21591 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21592 +{
21593 + __u64 val;
21594 +
21595 + addr += dscale_read(addr, &val);
21596 + val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21597 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21598 + addr += dscale_read(addr, &val);
21599 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21600 +#if REISER4_LARGE_KEY
21601 + memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21602 + addr += sizeof key_id->ordering;
21603 +#endif
21604 + return addr;
21605 +}
21606 +
21607 +/* Make Linus happy.
21608 + Local variables:
21609 + c-indentation-style: "K&R"
21610 + mode-name: "LC"
21611 + c-basic-offset: 8
21612 + tab-width: 8
21613 + fill-column: 120
21614 + End:
21615 +*/
21616 diff -urN linux-2.6.22.orig/fs/reiser4/kassign.h linux-2.6.22/fs/reiser4/kassign.h
21617 --- linux-2.6.22.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21618 +++ linux-2.6.22/fs/reiser4/kassign.h 2007-07-29 00:25:34.880697512 +0400
21619 @@ -0,0 +1,110 @@
21620 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21621 + * reiser4/README */
21622 +
21623 +/* Key assignment policy interface. See kassign.c for details. */
21624 +
21625 +#if !defined( __KASSIGN_H__ )
21626 +#define __KASSIGN_H__
21627 +
21628 +#include "forward.h"
21629 +#include "key.h"
21630 +#include "dformat.h"
21631 +
21632 +#include <linux/types.h> /* for __u?? */
21633 +#include <linux/fs.h> /* for struct super_block, etc */
21634 +#include <linux/dcache.h> /* for struct qstr */
21635 +
21636 +/* key assignment functions */
21637 +
21638 +/* Information from which key of file stat-data can be uniquely
21639 + restored. This depends on key assignment policy for
21640 + stat-data. Currently it's enough to store object id and locality id
21641 + (60+60==120) bits, because minor packing locality and offset of
21642 + stat-data key are always known constants: KEY_SD_MINOR and 0
21643 + respectively. For simplicity 4 bits are wasted in each id, and just
21644 + two 64 bit integers are stored.
21645 +
21646 +   This field has to be byte-aligned, because we don't want to waste
21647 +   space in directory entries. There is another side to this coin, of
21648 +   course: we waste CPU and bus bandwidth instead, by copying data back
21649 +   and forth.
21650 +
21651 +   Next optimization: &obj_key_id is mainly used to address stat data from
21652 +   directory entries. Under the assumption that the majority of files have
21653 +   only one name (one hard link) from *the* parent directory, it seems
21654 +   reasonable to store only the objectid of the stat data and take its
21655 +   locality from the key of the directory item.
21656 +
21657 +   This requires a flag to be added to &obj_key_id to distinguish between
21658 +   these two cases. The remaining bits of the flag byte could then be used
21659 +   to store the file type.
21660 +
21661 + This optimization requires changes in directory item handling code.
21662 +
21663 +*/
21664 +typedef struct obj_key_id {
21665 + d8 locality[sizeof(__u64)];
21666 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21667 + )
21668 + d8 objectid[sizeof(__u64)];
21669 +}
21670 +obj_key_id;
21671 +
21672 +/* Information sufficient to uniquely identify directory entry within
21673 + compressed directory item.
21674 +
21675 + For alignment issues see &obj_key_id above.
21676 +*/
21677 +typedef struct de_id {
21678 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21679 + d8 objectid[sizeof(__u64)];
21680 + d8 offset[sizeof(__u64)];
21681 +}
21682 +de_id;
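Because both structures are built from byte arrays rather than __u64 fields, the compiler cannot insert alignment padding, which is what the byte-alignment remark above is about. A quick userspace sketch of the large-key variants (illustrative, with d8 mapped to uint8_t):

	#include <stdint.h>
	#include <stdio.h>

	typedef uint8_t d8;

	typedef struct obj_key_id {
		d8 locality[sizeof(uint64_t)];
		d8 ordering[sizeof(uint64_t)];	/* large keys only */
		d8 objectid[sizeof(uint64_t)];
	} obj_key_id;

	typedef struct de_id {
		d8 ordering[sizeof(uint64_t)];	/* large keys only */
		d8 objectid[sizeof(uint64_t)];
		d8 offset[sizeof(uint64_t)];
	} de_id;

	int main(void)
	{
		/* byte arrays keep both structs packed: no padding, so a
		   de_id can sit at any offset inside a directory item */
		printf("sizeof(obj_key_id) = %zu\n", sizeof(obj_key_id)); /* 24 */
		printf("sizeof(de_id)      = %zu\n", sizeof(de_id));	  /* 24 */
		return 0;
	}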
21683 +
21684 +extern int inode_onwire_size(const struct inode *obj);
21685 +extern char *build_inode_onwire(const struct inode *obj, char *area);
21686 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21687 +
21688 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21689 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21690 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21691 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21692 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
21693 + de_id * id);
21694 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21695 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21696 + reiser4_key * key);
21697 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21698 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21699 +
21700 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21701 +extern void build_entry_key_common(const struct inode *dir,
21702 + const struct qstr *name,
21703 + reiser4_key * result);
21704 +extern void build_entry_key_stable_entry(const struct inode *dir,
21705 + const struct qstr *name,
21706 + reiser4_key * result);
21707 +extern int is_dot_key(const reiser4_key * key);
21708 +extern reiser4_key *build_sd_key(const struct inode *target,
21709 + reiser4_key * result);
21710 +
21711 +extern int is_longname_key(const reiser4_key * key);
21712 +extern int is_longname(const char *name, int len);
21713 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21714 +extern char *reiser4_unpack_string(__u64 value, char *buf);
21715 +extern void complete_entry_key(const struct inode *dir, const char *name,
21716 + int len, reiser4_key *result);
21717 +
21718 +/* __KASSIGN_H__ */
21719 +#endif
21720 +
21721 +/* Make Linus happy.
21722 + Local variables:
21723 + c-indentation-style: "K&R"
21724 + mode-name: "LC"
21725 + c-basic-offset: 8
21726 + tab-width: 8
21727 + fill-column: 120
21728 + End:
21729 +*/
21730 diff -urN linux-2.6.22.orig/fs/reiser4/Kconfig linux-2.6.22/fs/reiser4/Kconfig
21731 --- linux-2.6.22.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21732 +++ linux-2.6.22/fs/reiser4/Kconfig 2007-07-29 00:25:34.880697512 +0400
21733 @@ -0,0 +1,32 @@
21734 +config REISER4_FS
21735 + tristate "Reiser4 (EXPERIMENTAL)"
21736 + depends on EXPERIMENTAL
21737 + select ZLIB_INFLATE
21738 + select ZLIB_DEFLATE
21739 + select CRYPTO
21740 + help
21741 + Reiser4 is a filesystem that performs all filesystem operations
21742 + as atomic transactions, which means that it either performs a
21743 +	  write, or it does not; a crash cannot leave a write partially
21744 +	  performed or corrupted.
21745 +
21746 + It stores files in dancing trees, which are like balanced trees but
21747 + faster. It packs small files together so that they share blocks
21748 + without wasting space. This means you can use it to store really
21749 + small files. It also means that it saves you disk space. It avoids
21750 + hassling you with anachronisms like having a maximum number of
21751 + inodes, and wasting space if you use less than that number.
21752 +
21753 + Reiser4 is a distinct filesystem type from reiserfs (V3).
21754 + It's therefore not possible to use reiserfs file systems
21755 + with reiser4.
21756 +
21757 + To learn more about reiser4, go to http://www.namesys.com
21758 +
21759 +config REISER4_DEBUG
21760 + bool "Enable reiser4 debug mode"
21761 + depends on REISER4_FS
21762 + help
21763 + Don't use this unless you are debugging reiser4.
21764 +
21765 + If unsure, say N.
21766 diff -urN linux-2.6.22.orig/fs/reiser4/key.c linux-2.6.22/fs/reiser4/key.c
21767 --- linux-2.6.22.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21768 +++ linux-2.6.22/fs/reiser4/key.c 2007-07-29 00:25:34.880697512 +0400
21769 @@ -0,0 +1,137 @@
21770 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21771 +
21772 +/* Key manipulations. */
21773 +
21774 +#include "debug.h"
21775 +#include "key.h"
21776 +#include "super.h"
21777 +#include "reiser4.h"
21778 +
21779 +#include <linux/types.h> /* for __u?? */
21780 +
21781 +/* Minimal possible key: all components are zero. It is presumed that this is
21782 + independent of key scheme. */
21783 +static const reiser4_key MINIMAL_KEY = {
21784 + .el = {
21785 + 0ull,
21786 + ON_LARGE_KEY(0ull,)
21787 + 0ull,
21788 + 0ull
21789 + }
21790 +};
21791 +
21792 +/* Maximal possible key: all components are ~0. It is presumed that this is
21793 + independent of key scheme. */
21794 +static const reiser4_key MAXIMAL_KEY = {
21795 + .el = {
21796 + __constant_cpu_to_le64(~0ull),
21797 + ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21798 + __constant_cpu_to_le64(~0ull),
21799 + __constant_cpu_to_le64(~0ull)
21800 + }
21801 +};
21802 +
21803 +/* Initialize key. */
21804 +void reiser4_key_init(reiser4_key * key /* key to init */ )
21805 +{
21806 + assert("nikita-1169", key != NULL);
21807 + memset(key, 0, sizeof *key);
21808 +}
21809 +
21810 +/* minimal possible key in the tree. Return pointer to the static storage. */
21811 +const reiser4_key *reiser4_min_key(void)
21812 +{
21813 + return &MINIMAL_KEY;
21814 +}
21815 +
21816 +/* maximum possible key in the tree. Return pointer to the static storage. */
21817 +const reiser4_key *reiser4_max_key(void)
21818 +{
21819 + return &MAXIMAL_KEY;
21820 +}
21821 +
21822 +#if REISER4_DEBUG
21823 +/* debugging aid: print symbolic name of key type */
21824 +static const char *type_name(unsigned int key_type /* key type */ )
21825 +{
21826 + switch (key_type) {
21827 + case KEY_FILE_NAME_MINOR:
21828 + return "file name";
21829 + case KEY_SD_MINOR:
21830 + return "stat data";
21831 + case KEY_ATTR_NAME_MINOR:
21832 + return "attr name";
21833 + case KEY_ATTR_BODY_MINOR:
21834 + return "attr body";
21835 + case KEY_BODY_MINOR:
21836 + return "file body";
21837 + default:
21838 + return "unknown";
21839 + }
21840 +}
21841 +
21842 +/* debugging aid: print human readable information about key */
21843 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
21844 + const reiser4_key * key /* key to print */ )
21845 +{
21846 + /* turn bold on */
21847 + /* printf ("\033[1m"); */
21848 + if (key == NULL)
21849 + printk("%s: null key\n", prefix);
21850 + else {
21851 + if (REISER4_LARGE_KEY)
21852 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21853 + get_key_locality(key),
21854 + get_key_type(key),
21855 + get_key_ordering(key),
21856 + get_key_band(key),
21857 + get_key_objectid(key), get_key_offset(key));
21858 + else
21859 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21860 + get_key_locality(key),
21861 + get_key_type(key),
21862 + get_key_band(key),
21863 + get_key_objectid(key), get_key_offset(key));
21864 + /*
21865 + * if this is a key of directory entry, try to decode part of
21866 + * a name stored in the key, and output it.
21867 + */
21868 + if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21869 + char buf[DE_NAME_BUF_LEN];
21870 + char *c;
21871 +
21872 + c = buf;
21873 + c = reiser4_unpack_string(get_key_ordering(key), c);
21874 + reiser4_unpack_string(get_key_fulloid(key), c);
21875 + printk("[%s", buf);
21876 + if (is_longname_key(key))
21877 + /*
21878 + * only part of the name is stored in the key.
21879 + */
21880 + printk("...]\n");
21881 + else {
21882 + /*
21883 + * whole name is stored in the key.
21884 + */
21885 + reiser4_unpack_string(get_key_offset(key), buf);
21886 + printk("%s]\n", buf);
21887 + }
21888 + } else {
21889 + printk("[%s]\n", type_name(get_key_type(key)));
21890 + }
21891 + }
21892 + /* turn bold off */
21893 + /* printf ("\033[m\017"); */
21894 +}
21895 +
21896 +#endif
21897 +
21898 +/* Make Linus happy.
21899 + Local variables:
21900 + c-indentation-style: "K&R"
21901 + mode-name: "LC"
21902 + c-basic-offset: 8
21903 + tab-width: 8
21904 + fill-column: 120
21905 + End:
21906 +*/
21907 diff -urN linux-2.6.22.orig/fs/reiser4/key.h linux-2.6.22/fs/reiser4/key.h
21908 --- linux-2.6.22.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21909 +++ linux-2.6.22/fs/reiser4/key.h 2007-07-29 00:25:34.884698547 +0400
21910 @@ -0,0 +1,384 @@
21911 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21912 +
21913 +/* Declarations of key-related data-structures and operations on keys. */
21914 +
21915 +#if !defined( __REISER4_KEY_H__ )
21916 +#define __REISER4_KEY_H__
21917 +
21918 +#include "dformat.h"
21919 +#include "forward.h"
21920 +#include "debug.h"
21921 +
21922 +#include <linux/types.h> /* for __u?? */
21923 +
21924 +/* Operations on keys in reiser4 tree */
21925 +
21926 +/* No access to any of these fields shall be done except via a
21927 + wrapping macro/function, and that wrapping macro/function shall
21928 +   convert to little endian order. Key comparison is done in cpu byte order. */
21929 +
21930 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21931 + which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21932 + within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
21933 + approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21934 + right one. */
21935 +
21936 +/* possible values for minor packing locality (4 bits required) */
21937 +typedef enum {
21938 + /* file name */
21939 + KEY_FILE_NAME_MINOR = 0,
21940 + /* stat-data */
21941 + KEY_SD_MINOR = 1,
21942 + /* file attribute name */
21943 + KEY_ATTR_NAME_MINOR = 2,
21944 + /* file attribute value */
21945 + KEY_ATTR_BODY_MINOR = 3,
21946 + /* file body (tail or extent) */
21947 + KEY_BODY_MINOR = 4,
21948 +} key_minor_locality;
21949 +
21950 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21951 + Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21952 + and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
21953 + segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21954 + block_alloc.c to check the node type when deciding where to allocate the node.
21955 +
21956 + The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
21957 + should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
21958 + current implementation tails have a different minor packing locality from extents, and no files have both extents and
21959 + tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
21960 +*/
21961 +
21962 +/* Arbitrary major packing localities can be assigned to objects using
21963 + the reiser4(filenameA/..packing<=some_number) system call.
21964 +
21965 + In reiser4, the creat() syscall creates a directory
21966 +
21967 + whose default flow (that which is referred to if the directory is
21968 + read as a file) is the traditional unix file body.
21969 +
21970 + whose directory plugin is the 'filedir'
21971 +
21972 + whose major packing locality is that of the parent of the object created.
21973 +
21974 + The static_stat item is a particular commonly used directory
21975 + compression (the one for normal unix files).
21976 +
21977 + The filedir plugin checks to see if the static_stat item exists.
21978 + There is a unique key for static_stat. If yes, then it uses the
21979 + static_stat item for all of the values that it contains. The
21980 + static_stat item contains a flag for each stat it contains which
21981 + indicates whether one should look outside the static_stat item for its
21982 + contents.
21983 +*/
21984 +
21985 +/* offset of fields in reiser4_key. Value of each element of this enum
21986 +   is the index within the key (thought of as an array of __u64's) where
21987 +   this field is. */
21988 +typedef enum {
21989 + /* major "locale", aka dirid. Sits in 1st element */
21990 + KEY_LOCALITY_INDEX = 0,
21991 + /* minor "locale", aka item type. Sits in 1st element */
21992 + KEY_TYPE_INDEX = 0,
21993 + ON_LARGE_KEY(KEY_ORDERING_INDEX,)
21994 + /* "object band". Sits in 2nd element */
21995 + KEY_BAND_INDEX,
21996 + /* objectid. Sits in 2nd element */
21997 + KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
21998 + /* full objectid. Sits in 2nd element */
21999 + KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22000 + /* Offset. Sits in 3rd element */
22001 + KEY_OFFSET_INDEX,
22002 + /* Name hash. Sits in 3rd element */
22003 + KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22004 + KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22005 + KEY_LAST_INDEX
22006 +} reiser4_key_field_index;
22007 +
22008 +/* key in reiser4 internal "balanced" tree. It is an array of three (four,
22009 +   with large keys) 64bit integers in disk byte order (little-endian by default). This
22010 + array is actually indexed by reiser4_key_field. Each __u64 within
22011 + this array is called "element". Logical key component encoded within
22012 + elements are called "fields".
22013 +
22014 +   We declare this as a union with a dummy second component to suppress
22015 +   inconvenient array<->pointer casts implied in C. */
22016 +union reiser4_key {
22017 + __le64 el[KEY_LAST_INDEX];
22018 + int pad;
22019 +};
22020 +
22021 +/* bitmasks showing where within reiser4_key particular key is stored. */
22022 +/* major locality occupies higher 60 bits of the first element */
22023 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22024 +
22025 +/* minor locality occupies lower 4 bits of the first element */
22026 +#define KEY_TYPE_MASK 0xfull
22027 +
22028 +/* controversial band occupies higher 4 bits of the 2nd element */
22029 +#define KEY_BAND_MASK 0xf000000000000000ull
22030 +
22031 +/* objectid occupies lower 60 bits of the 2nd element */
22032 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22033 +
22034 +/* full 64bit objectid*/
22035 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22036 +
22037 +/* offset occupies the whole 3rd element */
22038 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22039 +
22040 +/* ordering is whole second element */
22041 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22042 +
22043 +/* by how many bits a key element is shifted to extract or place a particular field */
22044 +typedef enum {
22045 + KEY_LOCALITY_SHIFT = 4,
22046 + KEY_TYPE_SHIFT = 0,
22047 + KEY_BAND_SHIFT = 60,
22048 + KEY_OBJECTID_SHIFT = 0,
22049 + KEY_FULLOID_SHIFT = 0,
22050 + KEY_OFFSET_SHIFT = 0,
22051 + KEY_ORDERING_SHIFT = 0,
22052 +} reiser4_key_field_shift;
22053 +
22054 +static inline __u64
22055 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22056 +{
22057 + assert("nikita-753", key != NULL);
22058 + assert("nikita-754", off < KEY_LAST_INDEX);
22059 + return le64_to_cpu(get_unaligned(&key->el[off]));
22060 +}
22061 +
22062 +static inline void
22063 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22064 +{
22065 + assert("nikita-755", key != NULL);
22066 + assert("nikita-756", off < KEY_LAST_INDEX);
22067 + put_unaligned(cpu_to_le64(value), &key->el[off]);
22068 +}
22069 +
22070 +/* macro to define getter and setter functions for field F with type T */
22071 +#define DEFINE_KEY_FIELD( L, U, T ) \
22072 +static inline T get_key_ ## L ( const reiser4_key *key ) \
22073 +{ \
22074 + assert( "nikita-750", key != NULL ); \
22075 + return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22076 + KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22077 +} \
22078 + \
22079 +static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22080 +{ \
22081 + __u64 el; \
22082 + \
22083 + assert( "nikita-752", key != NULL ); \
22084 + \
22085 + el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22086 + /* clear field bits in the key */ \
22087 + el &= ~KEY_ ## U ## _MASK; \
22088 + /* actually it should be \
22089 + \
22090 + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22091 + \
22092 + but we trust user to never pass values that wouldn't fit \
22093 + into field. Clearing extra bits is one operation, but this \
22094 + function is time-critical. \
22095 + But check this in assertion. */ \
22096 + assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22097 + ~KEY_ ## U ## _MASK ) == 0 ); \
22098 + el |= ( loc << KEY_ ## U ## _SHIFT ); \
22099 + set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22100 +}
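For a single field the generated pair looks as follows. This userspace sketch mirrors what DEFINE_KEY_FIELD(band, BAND, __u64) expands to, but it operates on a bare uint64_t element in cpu byte order instead of the on-disk little-endian key, and it drops the assertions:

	#include <stdint.h>
	#include <stdio.h>

	#define KEY_BAND_MASK  0xf000000000000000ull
	#define KEY_BAND_SHIFT 60

	static uint64_t get_band(uint64_t el)
	{
		return (el & KEY_BAND_MASK) >> KEY_BAND_SHIFT;
	}

	static uint64_t set_band(uint64_t el, uint64_t band)
	{
		el &= ~KEY_BAND_MASK;		/* clear old field bits */
		el |= band << KEY_BAND_SHIFT;	/* caller makes band fit */
		return el;
	}

	int main(void)
	{
		uint64_t el = 0x0123456789abcdefull;

		el = set_band(el, 0xa);
		printf("el = %016llx, band = %llx\n",
		       (unsigned long long)el,
		       (unsigned long long)get_band(el));
		/* prints: el = a123456789abcdef, band = a */
		return 0;
	}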
22101 +
22102 +typedef __u64 oid_t;
22103 +
22104 +/* define get_key_locality(), set_key_locality() */
22105 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22106 +/* define get_key_type(), set_key_type() */
22107 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22108 +/* define get_key_band(), set_key_band() */
22109 +DEFINE_KEY_FIELD(band, BAND, __u64);
22110 +/* define get_key_objectid(), set_key_objectid() */
22111 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22112 +/* define get_key_fulloid(), set_key_fulloid() */
22113 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22114 +/* define get_key_offset(), set_key_offset() */
22115 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22116 +#if (REISER4_LARGE_KEY)
22117 +/* define get_key_ordering(), set_key_ordering() */
22118 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22119 +#else
22120 +static inline __u64 get_key_ordering(const reiser4_key * key)
22121 +{
22122 + return 0;
22123 +}
22124 +
22125 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22126 +{
22127 +}
22128 +#endif
22129 +
22130 +/* key comparison result */
22131 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22132 + EQUAL_TO = 0, /* if keys are equal */
22133 + GREATER_THAN = +1 /* if first key is greater than second */
22134 +} cmp_t;
22135 +
22136 +void reiser4_key_init(reiser4_key * key);
22137 +
22138 +/* minimal possible key in the tree. Return pointer to the static storage. */
22139 +extern const reiser4_key *reiser4_min_key(void);
22140 +extern const reiser4_key *reiser4_max_key(void);
22141 +
22142 +/* helper macro for keycmp() */
22143 +#define KEY_DIFF(k1, k2, field) \
22144 +({ \
22145 + typeof (get_key_ ## field (k1)) f1; \
22146 + typeof (get_key_ ## field (k2)) f2; \
22147 + \
22148 + f1 = get_key_ ## field (k1); \
22149 + f2 = get_key_ ## field (k2); \
22150 + \
22151 + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22152 +})
22153 +
22154 +/* helper macro for keycmp() */
22155 +#define KEY_DIFF_EL(k1, k2, off) \
22156 +({ \
22157 + __u64 e1; \
22158 + __u64 e2; \
22159 + \
22160 + e1 = get_key_el(k1, off); \
22161 + e2 = get_key_el(k2, off); \
22162 + \
22163 + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22164 +})
22165 +
22166 +/* compare `k1' and `k2'. This function is a heart of "key allocation
22167 + policy". All you need to implement new policy is to add yet another
22168 + clause here. */
22169 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22170 + const reiser4_key * k2 /* second key to compare */ )
22171 +{
22172 + cmp_t result;
22173 +
22174 + /*
22175 + * This function is the heart of reiser4 tree-routines. Key comparison
22176 + * is among most heavily used operations in the file system.
22177 + */
22178 +
22179 + assert("nikita-439", k1 != NULL);
22180 + assert("nikita-440", k2 != NULL);
22181 +
22182 +	/* there is no actual branch here: the condition is a compile time
22183 +	 * constant, and constant folding and propagation ensure that only one
22184 +	 * branch is actually compiled in. */
22185 +
22186 + if (REISER4_PLANA_KEY_ALLOCATION) {
22187 + /* if physical order of fields in a key is identical
22188 + with logical order, we can implement key comparison
22189 + as three 64bit comparisons. */
22190 + /* logical order of fields in plan-a:
22191 + locality->type->objectid->offset. */
22192 + /* compare locality and type at once */
22193 + result = KEY_DIFF_EL(k1, k2, 0);
22194 + if (result == EQUAL_TO) {
22195 + /* compare objectid (and band if it's there) */
22196 + result = KEY_DIFF_EL(k1, k2, 1);
22197 + /* compare offset */
22198 + if (result == EQUAL_TO) {
22199 + result = KEY_DIFF_EL(k1, k2, 2);
22200 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22201 + result = KEY_DIFF_EL(k1, k2, 3);
22202 + }
22203 + }
22204 + }
22205 + } else if (REISER4_3_5_KEY_ALLOCATION) {
22206 + result = KEY_DIFF(k1, k2, locality);
22207 + if (result == EQUAL_TO) {
22208 + result = KEY_DIFF(k1, k2, objectid);
22209 + if (result == EQUAL_TO) {
22210 + result = KEY_DIFF(k1, k2, type);
22211 + if (result == EQUAL_TO)
22212 + result = KEY_DIFF(k1, k2, offset);
22213 + }
22214 + }
22215 + } else
22216 + impossible("nikita-441", "Unknown key allocation scheme!");
22217 + return result;
22218 +}
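The plan-A fast path relies on the physical field order matching the logical one, so the whole comparison degenerates into comparing at most four 64-bit elements in sequence. A userspace sketch of that reduction (cpu byte order assumed, unlike the on-disk keys):

	#include <stdint.h>
	#include <stdio.h>

	enum cmp { LESS_THAN = -1, EQUAL_TO = 0, GREATER_THAN = +1 };

	static enum cmp diff_el(uint64_t e1, uint64_t e2)
	{
		return (e1 < e2) ? LESS_THAN :
		       (e1 == e2) ? EQUAL_TO : GREATER_THAN;
	}

	static enum cmp key_cmp(const uint64_t *k1, const uint64_t *k2, int nel)
	{
		enum cmp result = EQUAL_TO;
		int i;

		for (i = 0; i < nel && result == EQUAL_TO; ++i)
			result = diff_el(k1[i], k2[i]);
		return result;
	}

	int main(void)
	{
		/* same locality/type, ordering and objectid; offsets differ */
		uint64_t a[4] = { 0x2904, 0x0, 0x2a, 0x1000 };
		uint64_t b[4] = { 0x2904, 0x0, 0x2a, 0x2000 };

		printf("cmp = %d\n", key_cmp(a, b, 4));	/* -1: a < b */
		return 0;
	}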
22219 +
22220 +/* true if @k1 equals @k2 */
22221 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22222 + const reiser4_key * k2 /* second key to compare */ )
22223 +{
22224 + assert("nikita-1879", k1 != NULL);
22225 + assert("nikita-1880", k2 != NULL);
22226 + return !memcmp(k1, k2, sizeof *k1);
22227 +}
22228 +
22229 +/* true if @k1 is less than @k2 */
22230 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22231 + const reiser4_key * k2 /* second key to compare */ )
22232 +{
22233 + assert("nikita-1952", k1 != NULL);
22234 + assert("nikita-1953", k2 != NULL);
22235 + return keycmp(k1, k2) == LESS_THAN;
22236 +}
22237 +
22238 +/* true if @k1 is less than or equal to @k2 */
22239 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22240 + const reiser4_key * k2 /* second key to compare */ )
22241 +{
22242 + assert("nikita-1954", k1 != NULL);
22243 + assert("nikita-1955", k2 != NULL);
22244 + return keycmp(k1, k2) != GREATER_THAN;
22245 +}
22246 +
22247 +/* true if @k1 is greater than @k2 */
22248 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22249 + const reiser4_key * k2 /* second key to compare */ )
22250 +{
22251 + assert("nikita-1959", k1 != NULL);
22252 + assert("nikita-1960", k2 != NULL);
22253 + return keycmp(k1, k2) == GREATER_THAN;
22254 +}
22255 +
22256 +/* true if @k1 is greater than or equal to @k2 */
22257 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22258 + const reiser4_key * k2 /* second key to compare */ )
22259 +{
22260 + assert("nikita-1956", k1 != NULL);
22261 + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22262 + * November 3: Laika */
22263 + return keycmp(k1, k2) != LESS_THAN;
22264 +}
22265 +
22266 +static inline void prefetchkey(reiser4_key * key)
22267 +{
22268 + prefetch(key);
22269 + prefetch(&key->el[KEY_CACHELINE_END]);
22270 +}
22271 +
22272 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22273 + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22274 +/* size of a buffer suitable to hold human readable key representation */
22275 +#define KEY_BUF_LEN (80)
22276 +
22277 +#if REISER4_DEBUG
22278 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22279 +#else
22280 +#define reiser4_print_key(p,k) noop
22281 +#endif
22282 +
22283 +/* __REISER4_KEY_H__ */
22284 +#endif
22285 +
22286 +/* Make Linus happy.
22287 + Local variables:
22288 + c-indentation-style: "K&R"
22289 + mode-name: "LC"
22290 + c-basic-offset: 8
22291 + tab-width: 8
22292 + fill-column: 120
22293 + End:
22294 +*/
22295 diff -urN linux-2.6.22.orig/fs/reiser4/ktxnmgrd.c linux-2.6.22/fs/reiser4/ktxnmgrd.c
22296 --- linux-2.6.22.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22297 +++ linux-2.6.22/fs/reiser4/ktxnmgrd.c 2007-07-29 00:25:34.884698547 +0400
22298 @@ -0,0 +1,215 @@
22299 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22300 +/* Transaction manager daemon. */
22301 +
22302 +/*
22303 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22304 + * needed/important for the following reasons:
22305 + *
22306 + * 1. in reiser4 atom is not committed immediately when last transaction
22307 + * handle closes, unless atom is either too old or too large (see
22308 + * atom_should_commit()). This is done to avoid committing too frequently,
22309 + * because:
22310 + *
22311 + * 2. sometimes we don't want to commit atom when closing last transaction
22312 + * handle even if it is old and fat enough. For example, because we are at
22313 + * this point under directory semaphore, and committing would stall all
22314 + * accesses to this directory.
22315 + *
22316 + * ktxnmgrd spends its time sleeping on a wait queue. When it wakes up,
22317 + * either due to a (tunable) timeout or because it was explicitly woken up
22318 + * by a call to ktxnmgrd_kick(), it scans the list of all atoms and commits
22319 + * the eligible ones.
22320 + *
22321 + */
22322 +
22323 +#include "debug.h"
22324 +#include "txnmgr.h"
22325 +#include "tree.h"
22326 +#include "ktxnmgrd.h"
22327 +#include "super.h"
22328 +#include "reiser4.h"
22329 +
22330 +#include <linux/sched.h> /* for struct task_struct */
22331 +#include <linux/wait.h>
22332 +#include <linux/suspend.h>
22333 +#include <linux/kernel.h>
22334 +#include <linux/writeback.h>
22335 +#include <linux/kthread.h>
22336 +#include <linux/freezer.h>
22337 +
22338 +static int scan_mgr(struct super_block *);
22339 +
22340 +/*
22341 + * change current->comm so that ps, top, and friends will see changed
22342 + * state. This serves no useful purpose whatsoever, but also costs nothing.
22343 + * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22344 + */
22345 +#define set_comm( state ) \
22346 + snprintf( current -> comm, sizeof( current -> comm ), \
22347 + "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22348 +
22349 +/**
22350 + * ktxnmgrd - kernel txnmgr daemon
22351 + * @arg: pointer to super block
22352 + *
22353 + * The background transaction manager daemon, started as a kernel thread during
22354 + * reiser4 initialization.
22355 + */
22356 +static int ktxnmgrd(void *arg)
22357 +{
22358 + struct super_block *super;
22359 + ktxnmgrd_context *ctx;
22360 + txn_mgr *mgr;
22361 + int done = 0;
22362 +
22363 + super = arg;
22364 + mgr = &get_super_private(super)->tmgr;
22365 +
22366 + /*
22367 + * do_fork() just copies task_struct into the new thread. ->fs_context
22368 + * shouldn't be copied of course. This shouldn't be a problem for the
22369 + * rest of the code though.
22370 + */
22371 + current->journal_info = NULL;
22372 + ctx = mgr->daemon;
22373 + while (1) {
22374 + try_to_freeze();
22375 + set_comm("wait");
22376 + {
22377 + DEFINE_WAIT(__wait);
22378 +
22379 + prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22380 + if (kthread_should_stop()) {
22381 + done = 1;
22382 + } else
22383 + schedule_timeout(ctx->timeout);
22384 + finish_wait(&ctx->wait, &__wait);
22385 + }
22386 + if (done)
22387 + break;
22388 + set_comm("run");
22389 + spin_lock(&ctx->guard);
22390 + /*
22391 + * wait timed out or ktxnmgrd was woken up by explicit request
22392 + * to commit something. Scan list of atoms in txnmgr and look
22393 + * for too old atoms.
22394 + */
22395 + do {
22396 +			ctx->rescan = 0;
      +			spin_unlock(&ctx->guard);
22397 +			scan_mgr(super);
22398 +			spin_lock(&ctx->guard);
22399 +			if (ctx->rescan) {
22400 +				/*
22401 +				 * the list could be modified while ctx
22402 +				 * spinlock was released, we have to repeat
22403 +				 * scanning from the beginning
22404 +				 */
22405 +				continue;
22406 + }
22407 + } while (ctx->rescan);
22408 + spin_unlock(&ctx->guard);
22409 + }
22410 + return 0;
22411 +}
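The control flow of this loop can be mimicked in userspace. The sketch below is an analogy only: a pthread condition variable with timeout stands in for the kernel wait queue, scan_mgr() is reduced to a printf, and just the wait/kick/rescan protocol is modeled. Build with -pthread.

	#include <pthread.h>
	#include <stdio.h>
	#include <time.h>

	static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t wait_cv = PTHREAD_COND_INITIALIZER;
	static int rescan = 1;
	static int stop;

	static void scan_mgr(void)
	{
		printf("scanning atoms...\n");	/* commit_some_atoms() */
	}

	static void *daemon_loop(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&guard);
		while (!stop) {
			struct timespec ts;

			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += 5;	/* ctx->timeout analogue */
			pthread_cond_timedwait(&wait_cv, &guard, &ts);
			do {
				rescan = 0;
				pthread_mutex_unlock(&guard);
				scan_mgr();	/* scan without the lock */
				pthread_mutex_lock(&guard);
				/* kicked meanwhile? then scan once more */
			} while (rescan && !stop);
		}
		pthread_mutex_unlock(&guard);
		return NULL;
	}

	static void kick(void)	/* ktxnmgrd_kick() analogue */
	{
		pthread_mutex_lock(&guard);
		rescan = 1;
		pthread_cond_signal(&wait_cv);
		pthread_mutex_unlock(&guard);
	}

	int main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, daemon_loop, NULL);
		kick();
		pthread_mutex_lock(&guard);
		stop = 1;
		pthread_cond_signal(&wait_cv);
		pthread_mutex_unlock(&guard);
		pthread_join(tid, NULL);
		return 0;
	}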
22412 +
22413 +#undef set_comm
22414 +
22415 +/**
22416 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22417 + * @super: pointer to super block
22418 + *
22419 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22420 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22421 + */
22422 +int reiser4_init_ktxnmgrd(struct super_block *super)
22423 +{
22424 + txn_mgr *mgr;
22425 + ktxnmgrd_context *ctx;
22426 +
22427 + mgr = &get_super_private(super)->tmgr;
22428 +
22429 + assert("zam-1014", mgr->daemon == NULL);
22430 +
22431 + ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22432 + if (ctx == NULL)
22433 + return RETERR(-ENOMEM);
22434 +
22435 + assert("nikita-2442", ctx != NULL);
22436 +
22437 + memset(ctx, 0, sizeof *ctx);
22438 + init_waitqueue_head(&ctx->wait);
22439 +
22440 + /*kcond_init(&ctx->startup);*/
22441 + spin_lock_init(&ctx->guard);
22442 + ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22443 + ctx->rescan = 1;
22444 + mgr->daemon = ctx;
22445 +
22446 + ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22447 + if (IS_ERR(ctx->tsk)) {
22448 + int ret = PTR_ERR(ctx->tsk);
22449 + mgr->daemon = NULL;
22450 + kfree(ctx);
22451 + return RETERR(ret);
22452 + }
22453 + return 0;
22454 +}
22455 +
22456 +void ktxnmgrd_kick(txn_mgr *mgr)
22457 +{
22458 + assert("nikita-3234", mgr != NULL);
22459 + assert("nikita-3235", mgr->daemon != NULL);
22460 + wake_up(&mgr->daemon->wait);
22461 +}
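+
+/* A hypothetical call site (illustrative; the helper name is assumed, not
+ * defined in this file): code that notices an atom which should be committed
+ * soon can nudge the daemon instead of committing synchronously:
+ *
+ *	if (atom_should_commit_asap(atom))
+ *		ktxnmgrd_kick(&get_super_private(super)->tmgr);
+ */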
22462 +
22463 +int is_current_ktxnmgrd(void)
22464 +{
22465 + return (get_current_super_private()->tmgr.daemon->tsk == current);
22466 +}
22467 +
22468 +/**
22469 + * scan_mgr - commit atoms which are to be committed
22470 + * @super: super block to commit atoms of
22471 + *
22472 + * Commits old atoms. A temporary reiser4 context is set up on the stack
+ * because the daemon thread does not otherwise run inside one.
22473 + */
22474 +static int scan_mgr(struct super_block *super)
22475 +{
22476 + int ret;
22477 + reiser4_context ctx;
22478 +
22479 + init_stack_context(&ctx, super);
22480 +
22481 + ret = commit_some_atoms(&get_super_private(super)->tmgr);
22482 +
22483 + reiser4_exit_context(&ctx);
22484 + return ret;
22485 +}
22486 +
22487 +/**
22488 + * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22489 + * @super: super block of the file system being unmounted
22490 + *
22491 + * This is called on umount. It stops ktxnmgrd and frees its context.
22492 + */
22493 +void reiser4_done_ktxnmgrd(struct super_block *super)
22494 +{
22495 + txn_mgr *mgr;
22496 +
22497 + mgr = &get_super_private(super)->tmgr;
22498 + assert("zam-1012", mgr->daemon != NULL);
22499 +
22500 + kthread_stop(mgr->daemon->tsk);
22501 + kfree(mgr->daemon);
22502 + mgr->daemon = NULL;
22503 +}
22504 +
22505 +/*
22506 + * Local variables:
22507 + * c-indentation-style: "K&R"
22508 + * mode-name: "LC"
22509 + * c-basic-offset: 8
22510 + * tab-width: 8
22511 + * fill-column: 120
22512 + * End:
22513 + */
22514 diff -urN linux-2.6.22.orig/fs/reiser4/ktxnmgrd.h linux-2.6.22/fs/reiser4/ktxnmgrd.h
22515 --- linux-2.6.22.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22516 +++ linux-2.6.22/fs/reiser4/ktxnmgrd.h 2007-07-29 00:25:34.884698547 +0400
22517 @@ -0,0 +1,52 @@
22518 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22519 + * reiser4/README */
22520 +
22521 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22522 +
22523 +#ifndef __KTXNMGRD_H__
22524 +#define __KTXNMGRD_H__
22525 +
22526 +#include "txnmgr.h"
22527 +
22528 +#include <linux/fs.h>
22529 +#include <linux/wait.h>
22530 +#include <linux/completion.h>
22531 +#include <linux/spinlock.h>
22532 +#include <asm/atomic.h>
22533 +#include <linux/sched.h> /* for struct task_struct */
22534 +
22535 +/* This structure keeps all data necessary to start up, shut down and
22536 + * communicate with ktxnmgrd. */
22537 +struct ktxnmgrd_context {
22538 + /* wait queue head on which ktxnmgrd sleeps */
22539 + wait_queue_head_t wait;
22540 + /* spin lock protecting all fields of this structure */
22541 + spinlock_t guard;
22542 + /* timeout of sleeping on ->wait */
22543 + signed long timeout;
22544 + /* kernel thread running ktxnmgrd */
22545 + struct task_struct *tsk;
22546 + /* list of all file systems served by this ktxnmgrd */
22547 + struct list_head queue;
22548 + /* should ktxnmgrd repeat scanning of atoms? */
22549 + unsigned int rescan:1;
22550 +};
22551 +
22552 +extern int reiser4_init_ktxnmgrd(struct super_block *);
22553 +extern void reiser4_done_ktxnmgrd(struct super_block *);
22554 +
22555 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22556 +extern int is_current_ktxnmgrd(void);
22557 +
22558 +/* __KTXNMGRD_H__ */
22559 +#endif
22560 +
22561 +/* Make Linus happy.
22562 + Local variables:
22563 + c-indentation-style: "K&R"
22564 + mode-name: "LC"
22565 + c-basic-offset: 8
22566 + tab-width: 8
22567 + fill-column: 120
22568 + End:
22569 +*/
22570 diff -urN linux-2.6.22.orig/fs/reiser4/lock.c linux-2.6.22/fs/reiser4/lock.c
22571 --- linux-2.6.22.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22572 +++ linux-2.6.22/fs/reiser4/lock.c 2007-07-29 00:25:34.884698547 +0400
22573 @@ -0,0 +1,1232 @@
22574 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22575 + * reiser4/README */
22576 +
22577 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22578 + order. V4 balances the tree from the bottom up, and searches the tree from
22579 + the top down, and that is really the way we want it, so tradition won't work
22580 + for us.
22581 +
22582 + Instead we have two lock orderings, a high priority lock ordering, and a low
22583 + priority lock ordering. Each node in the tree has a lock in its znode.
22584 +
22585 + Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22586 + has a set (maybe empty) of already locked nodes ("process locked set"). Each
22587 + process may have a pending lock request to a node locked by another process.
22588 + Note: we lock and unlock, but do not transfer locks: it is possible
22589 + transferring locks instead would save some bus locking....
22590 +
22591 + Deadlock occurs when we have a loop constructed from process locked sets and
22592 + lock request vectors.
22593 +
22594 + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22595 + memory is extended with "znodes" with which we connect nodes with their left
22596 + and right neighbors using sibling pointers stored in the znodes. When we
22597 + perform balancing operations we often go from left to right and from right to
22598 + left.
22599 +
22600 + +-P1-+ +-P3-+
22601 + |+--+| V1 |+--+|
22602 + ||N1|| -------> ||N3||
22603 + |+--+| |+--+|
22604 + +----+ +----+
22605 + ^ |
22606 + |V2 |V3
22607 + | v
22608 + +---------P2---------+
22609 + |+--+ +--+|
22610 + ||N2| -------- |N4||
22611 + |+--+ +--+|
22612 + +--------------------+
22613 +
22614 + We solve this by ensuring that only low priority processes lock in top to
22615 + bottom order and from right to left, and high priority processes lock from
22616 + bottom to top and left to right.
22617 +
22618 + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22619 + kill those damn busy loops.
22620 + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22621 + stage) cannot be ordered that way. There are no rules what nodes can belong
22622 + to the atom and what nodes cannot. We cannot define what is right or left
22623 + direction, what is top or bottom. We can take immediate parent or side
22624 + neighbor of one node, but nobody guarantees that, say, left neighbor node is
22625 + not a far right neighbor for other nodes from the same atom. It breaks
22626 + deadlock avoidance rules and hi-low priority locking cannot be applied for
22627 + atom locks.
22628 +
22629 +   How does this help to avoid deadlocks?
22630 +
22631 + Suppose we have a deadlock with n processes. Processes from one priority
22632 + class never deadlock because they take locks in one consistent
22633 + order.
22634 +
22635 + So, any possible deadlock loop must have low priority as well as high
22636 + priority processes. There are no other lock priority levels except low and
22637 + high. We know that any deadlock loop contains at least one node locked by a
22638 + low priority process and requested by a high priority process. If this
22639 + situation is caught and resolved it is sufficient to avoid deadlocks.
22640 +
22641 + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22642 +
22643 + The deadlock prevention algorithm is based on comparing
22644 + priorities of node owners (processes which keep znode locked) and
22645 + requesters (processes which want to acquire a lock on znode). We
22646 + implement a scheme where low-priority owners yield locks to
22647 + high-priority requesters. We created a signal passing system that
22648 + is used to ask low-priority processes to yield one or more locked
22649 + znodes.
22650 +
22651 + The condition when a znode needs to change its owners is described by the
22652 + following formula:
22653 +
22654 + #############################################
22655 + # #
22656 + # (number of high-priority requesters) > 0 #
22657 + # AND #
22658 +   #  (number of high-priority owners) == 0    #
22659 + # #
22660 + #############################################
22661 +
22662 + Note that a low-priority process delays node releasing if another
22663 + high-priority process owns this node. So, slightly more strictly speaking,
22664 + to have a deadlock capable cycle you must have a loop in which a high
22665 + priority process is waiting on a low priority process to yield a node, which
22666 + is slightly different from saying a high priority process is waiting on a
22667 + node owned by a low priority process.
22668 +
22669 + It is enough to avoid deadlocks if we prevent any low-priority process from
22670 + falling asleep if its locked set contains a node which satisfies the
22671 + deadlock condition.
22672 +
22673 +   That condition is checked, implicitly or explicitly, at all places where
22674 +   new high-priority requests may be added to or removed from a node's
22675 +   request queue, and wherever a high-priority process takes or releases a
22676 +   lock on a node. The main goal of these checks is never to miss the moment
22677 +   when a node gets "wrong owners", and to send "must-yield-this-lock"
22678 +   signals to its low-priority owners at that time.
22679 +
22680 + The information about received signals is stored in the per-process
22681 + structure (lock stack) and analyzed before a low-priority process goes to
22682 +   sleep but after a "fast" attempt to lock a node fails. Any signal wakes the
22683 +   sleeping process up and forces it to re-check the lock status and received
22684 +   signal info. If "must-yield-this-lock" signals were received, the locking
22685 +   primitive (longterm_lock_znode()) fails with the -E_DEADLOCK error code.
22686 +
22687 + V4 LOCKING DRAWBACKS
22688 +
22689 + If we have already balanced on one level, and we are propagating our changes
22690 + upward to a higher level, it could be very messy to surrender all locks on
22691 + the lower level because we put so much computational work into it, and
22692 + reverting them to their state before they were locked might be very complex.
22693 + We also don't want to acquire all locks before performing balancing because
22694 + that would either be almost as much work as the balancing, or it would be
22695 + too conservative and lock too much. We want balancing to be done only at
22696 + high priority. Yet, we might want to go to the left one node and use some
22697 + of its empty space... So we make one attempt at getting the node to the left
22698 +   using try_lock, and if it fails we do without it, because we didn't really
22699 +   need it; it was only a nice-to-have.
22700 +
22701 + LOCK STRUCTURES DESCRIPTION
22702 +
22703 + The following data structures are used in the reiser4 locking
22704 + implementation:
22705 +
22706 + All fields related to long-term locking are stored in znode->lock.
22707 +
22708 + The lock stack is a per thread object. It owns all znodes locked by the
22709 + thread. One znode may be locked by several threads in case of read lock or
22710 + one znode may be write locked by one thread several times. The special link
22711 + objects (lock handles) support n<->m relation between znodes and lock
22712 + owners.
22713 +
22714 + <Thread 1> <Thread 2>
22715 +
22716 + +---------+ +---------+
22717 + | LS1 | | LS2 |
22718 + +---------+ +---------+
22719 + ^ ^
22720 + |---------------+ +----------+
22721 + v v v v
22722 + +---------+ +---------+ +---------+ +---------+
22723 + | LH1 | | LH2 | | LH3 | | LH4 |
22724 + +---------+ +---------+ +---------+ +---------+
22725 + ^ ^ ^ ^
22726 + | +------------+ |
22727 + v v v
22728 + +---------+ +---------+ +---------+
22729 + | Z1 | | Z2 | | Z3 |
22730 + +---------+ +---------+ +---------+
22731 +
22732 +   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22733 +   picture above shows that lock stack LS1 has a list of two lock handles,
22734 +   LH1 and LH2, while lock stack LS2 has a list with lock handles LH3 and
22735 +   LH4 on it. Znode Z1 is locked by only one thread and has only one lock
22736 +   handle, LH1, on its list; the situation is similar for Z3, which is
22737 +   locked by thread 2 only. Z2 is locked (for read) twice by different
22738 +   threads, so two lock handles are on its list. Each lock handle represents
22739 +   a single relation of locking of a znode by a thread. Locking a znode
22740 +   establishes such a relation between the lock stack and the znode by
22741 +   adding a new lock handle to both lists: the lock stack links all lock
22742 +   handles for all znodes locked by that stack, and the znode's list groups
22743 +   all lock handles for all lock stacks which locked the znode.
22744 +
22745 +   Yet another relation may exist between a znode and lock owners. If the
22746 +   lock procedure cannot immediately take a lock on an object, it adds the
22747 +   lock owner to the special `requestors' list belonging to the znode. That
22748 +   list represents a queue of pending lock requests. Because one lock owner
22749 +   may request only one lock object at a time, this is a 1->n relation
22750 +   between lock objects and lock owners, implemented as described above.
22751 +   Full information (priority, pointers to lock and link objects) about each
22752 +   lock request is stored in the lock owner structure, in the `request' field.
22753 +
22754 +   SHORT-TERM LOCKING
22755 +
22756 + This is a list of primitive operations over lock stacks / lock handles /
22757 + znodes and locking descriptions for them.
22758 +
22759 +   1. locking / unlocking, which is done by two list insertions/deletions:
22760 +      one to/from the znode's list of lock handles, the other to/from the
22761 +      lock stack's list of lock handles. The first insertion is protected by
22762 +      the znode->lock.guard spinlock. The list owned by the lock stack can
22763 +      be modified only by the thread that owns the lock stack; nobody else
22764 +      can modify or read it, so there is nothing that needs to be protected
22765 +      by a spinlock or anything else.
22766 +
22767 + 2. adding/removing a lock request to/from znode requesters list. The rule is
22768 + that znode->lock.guard spinlock should be taken for this.
22769 +
22770 +   3. we can traverse the list of lock handles and use references to the
22771 +      lock stacks that locked a given znode while the znode->lock.guard
+      spinlock is taken.
22772 +
22773 +   4. If a lock stack is associated with a znode as a lock requestor or lock
22774 +      owner, its existence is guaranteed by the znode->lock.guard spinlock.
22775 +      Some of its (the lock stack's) fields must be protected from parallel
22776 +      access by two or more threads. Please look at the lock_stack structure
22777 +      definition for the info on how those fields are protected. */
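+
+/* A minimal usage sketch of the long-term locking API described above
+   (illustrative only; error handling trimmed):
+
+	lock_handle lh;
+	int ret;
+
+	init_lh(&lh);
+	ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
+				  ZNODE_LOCK_LOPRI);
+	if (ret == 0) {
+		... read the node ...
+		done_lh(&lh);
+	}
+
+   init_lh() and done_lh() are declared in lock.h. */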
22778 +
22779 +/* Znode lock and capturing intertwining. */
22780 +/* In the current implementation we capture formatted nodes before locking
22781 +   them. Take a look at longterm_lock_znode(): the reiser4_try_capture()
22782 +   request precedes the locking request; longterm_lock_znode() captures the
22783 +   znode unconditionally before even checking the locking conditions.
22784 +
22785 +   Another variant is to capture the znode after locking it. This was not
22786 +   tested, but at least one deadlock condition is believed to exist there. One
22787 +   thread has locked a znode (Node-1) and calls reiser4_try_capture() for it.
22788 + reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22789 + Second thread is a flushing thread, its current atom is the atom Node-1
22790 + belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22791 + is locked by the first thread. The described situation is a deadlock. */
22792 +
22793 +#include "debug.h"
22794 +#include "txnmgr.h"
22795 +#include "znode.h"
22796 +#include "jnode.h"
22797 +#include "tree.h"
22798 +#include "plugin/node/node.h"
22799 +#include "super.h"
22800 +
22801 +#include <linux/spinlock.h>
22802 +
22803 +#if REISER4_DEBUG
22804 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
22805 + znode_lock_request);
22806 +#endif
22807 +
22808 +/* Returns a lock owner associated with current thread */
22809 +lock_stack *get_current_lock_stack(void)
22810 +{
22811 + return &get_current_context()->stack;
22812 +}
22813 +
22814 +/* Wakes up all low priority owners informing them about possible deadlock */
22815 +static void wake_up_all_lopri_owners(znode * node)
22816 +{
22817 + lock_handle *handle;
22818 +
22819 + assert_spin_locked(&(node->lock.guard));
22820 + list_for_each_entry(handle, &node->lock.owners, owners_link) {
22821 + assert("nikita-1832", handle->node == node);
22822 + /* count this signal in owner->nr_signaled */
22823 + if (!handle->signaled) {
22824 + handle->signaled = 1;
22825 + atomic_inc(&handle->owner->nr_signaled);
22826 + /* Wake up a single process */
22827 + reiser4_wake_up(handle->owner);
22828 + }
22829 + }
22830 +}
22831 +
22832 +/* Adds a lock to a lock owner, which means creating a link to the lock and
22833 +   putting the link into the two lists all links are on (the doubly linked
22834 +   list that forms the lock_stack, and the doubly linked list of links
22835 +   attached to a lock).
22836 +*/
22837 +static inline void
22838 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
22839 +{
22840 + assert("jmacd-810", handle->owner == NULL);
22841 + assert_spin_locked(&(node->lock.guard));
22842 +
22843 + handle->owner = owner;
22844 + handle->node = node;
22845 +
22846 + assert("reiser4-4",
22847 + ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22848 +
22849 + /* add lock handle to the end of lock_stack's list of locks */
22850 + list_add_tail(&handle->locks_link, &owner->locks);
22851 + ON_DEBUG(owner->nr_locks++);
22852 + reiser4_ctx_gfp_mask_set();
22853 +
22854 + /* add lock handle to the head of znode's list of owners */
22855 + list_add(&handle->owners_link, &node->lock.owners);
22856 + handle->signaled = 0;
22857 +}
22858 +
22859 +/* Breaks a relation between a lock and its owner */
22860 +static inline void unlink_object(lock_handle * handle)
22861 +{
22862 + assert("zam-354", handle->owner != NULL);
22863 + assert("nikita-1608", handle->node != NULL);
22864 + assert_spin_locked(&(handle->node->lock.guard));
22865 + assert("nikita-1829", handle->owner == get_current_lock_stack());
22866 + assert("reiser4-5", handle->owner->nr_locks > 0);
22867 +
22868 + /* remove lock handle from lock_stack's list of locks */
22869 + list_del(&handle->locks_link);
22870 + ON_DEBUG(handle->owner->nr_locks--);
22871 + reiser4_ctx_gfp_mask_set();
22872 + assert("reiser4-6",
22873 + ergo(list_empty_careful(&handle->owner->locks),
22874 + handle->owner->nr_locks == 0));
22875 + /* remove lock handle from znode's list of owners */
22876 + list_del(&handle->owners_link);
22877 + /* indicates that lock handle is free now */
22878 + handle->node = NULL;
22879 +#if REISER4_DEBUG
22880 + INIT_LIST_HEAD(&handle->locks_link);
22881 + INIT_LIST_HEAD(&handle->owners_link);
22882 + handle->owner = NULL;
22883 +#endif
22884 +}
22885 +
22886 +/* Actually locks an object knowing that we are able to do this */
22887 +static void lock_object(lock_stack * owner)
22888 +{
22889 + struct lock_request *request;
22890 + znode *node;
22891 +
22892 + request = &owner->request;
22893 + node = request->node;
22894 + assert_spin_locked(&(node->lock.guard));
22895 + if (request->mode == ZNODE_READ_LOCK) {
22896 + node->lock.nr_readers++;
22897 + } else {
22898 +		/* check that we didn't switch from a read to a write lock */
22899 + assert("nikita-1840", node->lock.nr_readers <= 0);
22900 +		/* We allow recursive locking; a node can be locked several
22901 +		   times for write by the same process */
22902 + node->lock.nr_readers--;
22903 + }
22904 +
22905 + link_object(request->handle, owner, node);
22906 +
22907 + if (owner->curpri) {
22908 + node->lock.nr_hipri_owners++;
22909 + }
22910 +}
22911 +
22912 +/* Check for recursive write locking */
22913 +static int recursive(lock_stack * owner)
22914 +{
22915 + int ret;
22916 + znode *node;
22917 + lock_handle *lh;
22918 +
22919 + node = owner->request.node;
22920 +
22921 + /* Owners list is not empty for a locked node */
22922 + assert("zam-314", !list_empty_careful(&node->lock.owners));
22923 + assert("nikita-1841", owner == get_current_lock_stack());
22924 + assert_spin_locked(&(node->lock.guard));
22925 +
22926 + lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22927 + ret = (lh->owner == owner);
22928 +
22929 +	/* Recursive read locking should be done the usual way */
22930 + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22931 + /* mixing of read/write locks is not allowed */
22932 + assert("zam-341", !ret || znode_is_wlocked(node));
22933 +
22934 + return ret;
22935 +}
22936 +
22937 +#if REISER4_DEBUG
22938 +/* Returns true if the lock is held by the calling thread. */
22939 +int znode_is_any_locked(const znode * node)
22940 +{
22941 + lock_handle *handle;
22942 + lock_stack *stack;
22943 + int ret;
22944 +
22945 + if (!znode_is_locked(node)) {
22946 + return 0;
22947 + }
22948 +
22949 + stack = get_current_lock_stack();
22950 +
22951 + spin_lock_stack(stack);
22952 +
22953 + ret = 0;
22954 +
22955 + list_for_each_entry(handle, &stack->locks, locks_link) {
22956 + if (handle->node == node) {
22957 + ret = 1;
22958 + break;
22959 + }
22960 + }
22961 +
22962 + spin_unlock_stack(stack);
22963 +
22964 + return ret;
22965 +}
22966 +
22967 +#endif
22968 +
22969 +/* Returns true if a write lock is held by the calling thread. */
22970 +int znode_is_write_locked(const znode * node)
22971 +{
22972 + lock_stack *stack;
22973 + lock_handle *handle;
22974 +
22975 + assert("jmacd-8765", node != NULL);
22976 +
22977 + if (!znode_is_wlocked(node)) {
22978 + return 0;
22979 + }
22980 +
22981 + stack = get_current_lock_stack();
22982 +
22983 + /*
22984 + * When znode is write locked, all owner handles point to the same lock
22985 + * stack. Get pointer to lock stack from the first lock handle from
22986 + * znode's owner list
22987 + */
22988 + handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
22989 +
22990 + return (handle->owner == stack);
22991 +}
22992 +
22993 +/* This "deadlock" condition is the essential part of the reiser4 locking
22994 +   implementation. The condition is checked explicitly by calling
22995 +   check_deadlock_condition(), or implicitly at all places where the znode
22996 +   lock state (set of owners and request queue) is changed. The locking code
22997 +   is designed to use this condition to trigger the procedure of passing an
22998 +   object from its low-priority owner(s) to high-priority one(s).
22999 +
23000 +   The procedure posts an event (setting the lock_handle->signaled flag),
23001 +   counts this event in the nr_signaled field of the owner's lock stack,
23002 +   and wakes the owner's process up.
23003 +*/
23004 +static inline int check_deadlock_condition(znode * node)
23005 +{
23006 + assert_spin_locked(&(node->lock.guard));
23007 + return node->lock.nr_hipri_requests > 0
23008 + && node->lock.nr_hipri_owners == 0;
23009 +}
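+
+/* Example: one high-priority requestor queued on a node whose owners are
+   all low-priority satisfies the condition above, and those owners will be
+   signaled to yield the lock. */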
23010 +
23011 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23012 +{
23013 + zlock * lock = &node->lock;
23014 +
23015 + return mode == ZNODE_READ_LOCK &&
23016 + lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23017 +}
23018 +
23019 +/* checks lock/request compatibility */
23020 +static int can_lock_object(lock_stack * owner)
23021 +{
23022 + znode *node = owner->request.node;
23023 +
23024 + assert_spin_locked(&(node->lock.guard));
23025 +
23026 + /* See if the node is disconnected. */
23027 + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23028 + return RETERR(-EINVAL);
23029 +
23030 +	/* Do not ever try to take a lock if we are going in the low-priority
23031 +	   direction and the node has a high-priority request without any
23032 +	   high-priority owners. */
23033 + if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23034 + return RETERR(-E_REPEAT);
23035 + if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23036 + return RETERR(-E_REPEAT);
23037 + if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23038 + return RETERR(-E_REPEAT);
23039 + return 0;
23040 +}
23041 +
23042 +/* Sets high priority for the process. It clears the "signaled" flags because
23043 +   a znode locked by a high-priority process can't satisfy our "deadlock
23044 +   condition". */
23045 +static void set_high_priority(lock_stack * owner)
23046 +{
23047 + assert("nikita-1846", owner == get_current_lock_stack());
23048 + /* Do nothing if current priority is already high */
23049 + if (!owner->curpri) {
23050 + /* We don't need locking for owner->locks list, because, this
23051 + * function is only called with the lock stack of the current
23052 + * thread, and no other thread can play with owner->locks list
23053 + * and/or change ->node pointers of lock handles in this list.
23054 + *
23055 + * (Interrupts also are not involved.)
23056 + */
23057 + lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23058 + while (&owner->locks != &item->locks_link) {
23059 + znode *node = item->node;
23060 +
23061 + spin_lock_zlock(&node->lock);
23062 +
23063 + node->lock.nr_hipri_owners++;
23064 +
23065 +			/* we can safely set signaled to zero, because the
23066 +			   previous statement (nr_hipri_owners++) guarantees
23067 +			   that signaled will never be set again. */
23068 + item->signaled = 0;
23069 + spin_unlock_zlock(&node->lock);
23070 +
23071 + item = list_entry(item->locks_link.next, lock_handle, locks_link);
23072 + }
23073 + owner->curpri = 1;
23074 + atomic_set(&owner->nr_signaled, 0);
23075 + }
23076 +}
23077 +
23078 +/* Sets a low priority to the process. */
23079 +static void set_low_priority(lock_stack * owner)
23080 +{
23081 + assert("nikita-3075", owner == get_current_lock_stack());
23082 + /* Do nothing if current priority is already low */
23083 + if (owner->curpri) {
23084 + /* scan all locks (lock handles) held by @owner, which is
23085 + actually current thread, and check whether we are reaching
23086 + deadlock possibility anywhere.
23087 + */
23088 + lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23089 + while (&owner->locks != &handle->locks_link) {
23090 + znode *node = handle->node;
23091 + spin_lock_zlock(&node->lock);
23092 + /* this thread just was hipri owner of @node, so
23093 + nr_hipri_owners has to be greater than zero. */
23094 + assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23095 + node->lock.nr_hipri_owners--;
23096 +			/* If we have the deadlock condition, adjust the
23097 +			   nr_signaled field. It is enough to set the
23098 +			   "signaled" flag only for the current process; other
23099 +			   low-pri owners will be signaled and woken up after
23100 +			   the current process unlocks this object and a
23101 +			   high-priority requestor takes control. */
23102 + if (check_deadlock_condition(node)
23103 + && !handle->signaled) {
23104 + handle->signaled = 1;
23105 + atomic_inc(&owner->nr_signaled);
23106 + }
23107 + spin_unlock_zlock(&node->lock);
23108 + handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23109 + }
23110 + owner->curpri = 0;
23111 + }
23112 +}
23113 +
23114 +static void remove_lock_request(lock_stack * requestor)
23115 +{
23116 + zlock * lock = &requestor->request.node->lock;
23117 +
23118 + if (requestor->curpri) {
23119 + assert("nikita-1838", lock->nr_hipri_requests > 0);
23120 + lock->nr_hipri_requests--;
23121 + if (requestor->request.mode == ZNODE_WRITE_LOCK)
23122 + lock->nr_hipri_write_requests --;
23123 + }
23124 + list_del(&requestor->requestors_link);
23125 +}
23126 +
23127 +static void invalidate_all_lock_requests(znode * node)
23128 +{
23129 + lock_stack *requestor, *tmp;
23130 +
23131 + assert_spin_locked(&(node->lock.guard));
23132 +
23133 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23134 + remove_lock_request(requestor);
23135 + requestor->request.ret_code = -EINVAL;
23136 + reiser4_wake_up(requestor);
23137 + requestor->request.mode = ZNODE_NO_LOCK;
23138 + }
23139 +}
23140 +
23141 +static void dispatch_lock_requests(znode * node)
23142 +{
23143 + lock_stack *requestor, *tmp;
23144 +
23145 + assert_spin_locked(&(node->lock.guard));
23146 +
23147 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23148 + if (znode_is_write_locked(node))
23149 + break;
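+		/* can_lock_object() returns 0 when the lock can be granted,
+		   so the negated test below means "grantable" */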
23150 + if (!can_lock_object(requestor)) {
23151 + lock_object(requestor);
23152 + remove_lock_request(requestor);
23153 + requestor->request.ret_code = 0;
23154 + reiser4_wake_up(requestor);
23155 + requestor->request.mode = ZNODE_NO_LOCK;
23156 + }
23157 + }
23158 +}
23159 +
23160 +/* release long-term lock, acquired by longterm_lock_znode() */
23161 +void longterm_unlock_znode(lock_handle * handle)
23162 +{
23163 + znode *node = handle->node;
23164 + lock_stack *oldowner = handle->owner;
23165 + int hipri;
23166 + int readers;
23167 + int rdelta;
23168 + int youdie;
23169 +
23170 + /*
23171 + * this is time-critical and highly optimized code. Modify carefully.
23172 + */
23173 +
23174 + assert("jmacd-1021", handle != NULL);
23175 + assert("jmacd-1022", handle->owner != NULL);
23176 + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23177 +
23178 + assert("zam-130", oldowner == get_current_lock_stack());
23179 +
23180 + LOCK_CNT_DEC(long_term_locked_znode);
23181 +
23182 + /*
23183 + * to minimize amount of operations performed under lock, pre-compute
23184 + * all variables used within critical section. This makes code
23185 + * obscure.
23186 + */
23187 +
23188 + /* was this lock of hi or lo priority */
23189 + hipri = oldowner->curpri ? 1 : 0;
23190 + /* number of readers */
23191 + readers = node->lock.nr_readers;
23192 + /* +1 if write lock, -1 if read lock */
23193 + rdelta = (readers > 0) ? -1 : +1;
23194 + /* true if node is to die and write lock is released */
23195 + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23196 +
23197 + spin_lock_zlock(&node->lock);
23198 +
23199 + assert("zam-101", znode_is_locked(node));
23200 +
23201 + /* Adjust a number of high priority owners of this lock */
23202 + assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23203 + node->lock.nr_hipri_owners -= hipri;
23204 +
23205 + /* Handle znode deallocation on last write-lock release. */
23206 + if (znode_is_wlocked_once(node)) {
23207 + if (youdie) {
23208 + forget_znode(handle);
23209 + assert("nikita-2191", znode_invariant(node));
23210 + zput(node);
23211 + return;
23212 + }
23213 + }
23214 +
23215 + if (handle->signaled)
23216 + atomic_dec(&oldowner->nr_signaled);
23217 +
23218 + /* Unlocking means owner<->object link deletion */
23219 + unlink_object(handle);
23220 +
23221 + /* This is enough to be sure whether an object is completely
23222 + unlocked. */
23223 + node->lock.nr_readers += rdelta;
23224 +
23225 + /* If the node is locked it must have an owners list. Likewise, if
23226 + the node is unlocked it must have an empty owners list. */
23227 + assert("zam-319", equi(znode_is_locked(node),
23228 + !list_empty_careful(&node->lock.owners)));
23229 +
23230 +#if REISER4_DEBUG
23231 + if (!znode_is_locked(node))
23232 + ++node->times_locked;
23233 +#endif
23234 +
23235 + /* If there are pending lock requests we wake up a requestor */
23236 + if (!znode_is_wlocked(node))
23237 + dispatch_lock_requests(node);
23238 + if (check_deadlock_condition(node))
23239 + wake_up_all_lopri_owners(node);
23240 + spin_unlock_zlock(&node->lock);
23241 +
23242 + /* minus one reference from handle->node */
23243 + assert("nikita-2190", znode_invariant(node));
23244 + ON_DEBUG(check_lock_data());
23245 + ON_DEBUG(check_lock_node_data(node));
23246 + zput(node);
23247 +}
23248 +
23249 +/* final portion of longterm-lock */
23250 +static int
23251 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23252 +{
23253 + znode *node = owner->request.node;
23254 +
23255 + assert_spin_locked(&(node->lock.guard));
23256 +
23257 + /* If we broke with (ok == 0) it means we can_lock, now do it. */
23258 + if (ok == 0) {
23259 + lock_object(owner);
23260 + owner->request.mode = 0;
23261 + /* count a reference from lockhandle->node
23262 +
23263 + znode was already referenced at the entry to this function,
23264 + hence taking spin-lock here is not necessary (see comment
23265 + in the zref()).
23266 + */
23267 + zref(node);
23268 +
23269 + LOCK_CNT_INC(long_term_locked_znode);
23270 + }
23271 + spin_unlock_zlock(&node->lock);
23272 + ON_DEBUG(check_lock_data());
23273 + ON_DEBUG(check_lock_node_data(node));
23274 + return ok;
23275 +}
23276 +
23277 +/*
23278 + * version of longterm_lock_znode() optimized for the most common case: read
23279 + * lock without any special flags. This is the kind of lock that any tree
23280 + * traversal takes on the root node of the tree, which is very frequent.
23281 + */
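+/*
+ * Return convention (inferred from the caller in longterm_lock_znode()):
+ * a value <= 0 is the final result of the fast-path attempt; 1 means
+ * "fall back to the general longterm_lock_znode() path".
+ */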
23282 +static int longterm_lock_tryfast(lock_stack * owner)
23283 +{
23284 + int result;
23285 + znode *node;
23286 + zlock *lock;
23287 +
23288 + node = owner->request.node;
23289 + lock = &node->lock;
23290 +
23291 + assert("nikita-3340", reiser4_schedulable());
23292 + assert("nikita-3341", request_is_deadlock_safe(node,
23293 + ZNODE_READ_LOCK,
23294 + ZNODE_LOCK_LOPRI));
23295 + spin_lock_zlock(lock);
23296 + result = can_lock_object(owner);
23297 + spin_unlock_zlock(lock);
23298 +
23299 + if (likely(result != -EINVAL)) {
23300 + spin_lock_znode(node);
23301 + result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23302 + spin_unlock_znode(node);
23303 + spin_lock_zlock(lock);
23304 + if (unlikely(result != 0)) {
23305 + owner->request.mode = 0;
23306 + } else {
23307 + result = can_lock_object(owner);
23308 + if (unlikely(result == -E_REPEAT)) {
23309 + /* fall back to longterm_lock_znode() */
23310 + spin_unlock_zlock(lock);
23311 + return 1;
23312 + }
23313 + }
23314 + return lock_tail(owner, result, ZNODE_READ_LOCK);
23315 + } else
23316 + return 1;
23317 +}
23318 +
23319 +/* locks given lock object */
23320 +int longterm_lock_znode(
23321 + /* local link object (allocated by lock owner thread, usually on its own
23322 + * stack) */
23323 + lock_handle * handle,
23324 + /* znode we want to lock. */
23325 + znode * node,
23326 + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23327 + znode_lock_mode mode,
23328 + /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23329 + znode_lock_request request) {
23330 + int ret;
23331 + int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23332 + int non_blocking = 0;
23333 + int has_atom;
23334 + txn_capture cap_flags;
23335 + zlock *lock;
23336 + txn_handle *txnh;
23337 + tree_level level;
23338 +
23339 + /* Get current process context */
23340 + lock_stack *owner = get_current_lock_stack();
23341 +
23342 + /* Check that the lock handle is initialized and isn't already being
23343 + * used. */
23344 + assert("jmacd-808", handle->owner == NULL);
23345 + assert("nikita-3026", reiser4_schedulable());
23346 + assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23347 + assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23348 + /* long term locks are not allowed in the VM contexts (->writepage(),
23349 + * prune_{d,i}cache()).
23350 + *
23351 + * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23352 + * bug caused by d_splice_alias() only working for directories.
23353 + */
23354 + assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23355 + assert ("zam-1055", mode != ZNODE_NO_LOCK);
23356 +
23357 + cap_flags = 0;
23358 + if (request & ZNODE_LOCK_NONBLOCK) {
23359 + cap_flags |= TXN_CAPTURE_NONBLOCKING;
23360 + non_blocking = 1;
23361 + }
23362 +
23363 + if (request & ZNODE_LOCK_DONT_FUSE)
23364 + cap_flags |= TXN_CAPTURE_DONT_FUSE;
23365 +
23366 + /* If we are changing our process priority we must adjust a number
23367 + of high priority owners for each znode that we already lock */
23368 + if (hipri) {
23369 + set_high_priority(owner);
23370 + } else {
23371 + set_low_priority(owner);
23372 + }
23373 +
23374 + level = znode_get_level(node);
23375 +
23376 + /* Fill request structure with our values. */
23377 + owner->request.mode = mode;
23378 + owner->request.handle = handle;
23379 + owner->request.node = node;
23380 +
23381 + txnh = get_current_context()->trans;
23382 + lock = &node->lock;
23383 +
23384 + if (mode == ZNODE_READ_LOCK && request == 0) {
23385 + ret = longterm_lock_tryfast(owner);
23386 + if (ret <= 0)
23387 + return ret;
23388 + }
23389 +
23390 + has_atom = (txnh->atom != NULL);
23391 +
23392 + /* Synchronize on node's zlock guard lock. */
23393 + spin_lock_zlock(lock);
23394 +
23395 + if (znode_is_locked(node) &&
23396 + mode == ZNODE_WRITE_LOCK && recursive(owner))
23397 + return lock_tail(owner, 0, mode);
23398 +
23399 + for (;;) {
23400 +		/* Check the lock's availability: if it is unavailable we get
23401 +		   -E_REPEAT, 0 indicates "can lock", otherwise the node is
23402 +		   invalid (-EINVAL). */
23403 + ret = can_lock_object(owner);
23404 +
23405 + if (unlikely(ret == -EINVAL)) {
23406 + /* @node is dying. Leave it alone. */
23407 + break;
23408 + }
23409 +
23410 + if (unlikely(ret == -E_REPEAT && non_blocking)) {
23411 + /* either locking of @node by the current thread will
23412 + * lead to the deadlock, or lock modes are
23413 + * incompatible. */
23414 + break;
23415 + }
23416 +
23417 + assert("nikita-1844", (ret == 0)
23418 + || ((ret == -E_REPEAT) && !non_blocking));
23419 + /* If we can get the lock... Try to capture first before
23420 + taking the lock. */
23421 +
23422 + /* first handle commonest case where node and txnh are already
23423 + * in the same atom. */
23424 + /* safe to do without taking locks, because:
23425 + *
23426 + * 1. read of aligned word is atomic with respect to writes to
23427 + * this word
23428 + *
23429 + * 2. false negatives are handled in reiser4_try_capture().
23430 + *
23431 + * 3. false positives are impossible.
23432 + *
23433 + * PROOF: left as an exercise to the curious reader.
23434 + *
23435 + * Just kidding. Here is one:
23436 + *
23437 + * At the time T0 txnh->atom is stored in txnh_atom.
23438 + *
23439 + * At the time T1 node->atom is stored in node_atom.
23440 + *
23441 + * At the time T2 we observe that
23442 + *
23443 + * txnh_atom != NULL && node_atom == txnh_atom.
23444 + *
23445 + * Imagine that at this moment we acquire node and txnh spin
23446 + * lock in this order. Suppose that under spin lock we have
23447 + *
23448 + * node->atom != txnh->atom, (S1)
23449 + *
23450 + * at the time T3.
23451 + *
23452 + * txnh->atom != NULL still, because txnh is open by the
23453 + * current thread.
23454 + *
23455 + * Suppose node->atom == NULL, that is, node was un-captured
23456 + * between T1, and T3. But un-capturing of formatted node is
23457 + * always preceded by the call to reiser4_invalidate_lock(),
23458 + * which marks znode as JNODE_IS_DYING under zlock spin
23459 + * lock. Contradiction, because can_lock_object() above checks
23460 + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23461 + *
23462 + * Suppose that node->atom != node_atom, that is, atom, node
23463 + * belongs to was fused into another atom: node_atom was fused
23464 + * into node->atom. Atom of txnh was equal to node_atom at T2,
23465 + * which means that under spin lock, txnh->atom == node->atom,
23466 + * because txnh->atom can only follow fusion
23467 + * chain. Contradicts S1.
23468 + *
23469 + * The same for hypothesis txnh->atom != txnh_atom. Hence,
23470 + * node->atom == node_atom == txnh_atom == txnh->atom. Again
23471 + * contradicts S1. Hence S1 is false. QED.
23472 + *
23473 + */
23474 +
23475 + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23476 + ;
23477 + } else {
23478 + /*
23479 + * unlock zlock spin lock here. It is possible for
23480 + * longterm_unlock_znode() to sneak in here, but there
23481 + * is no harm: reiser4_invalidate_lock() will mark znode
23482 + * as JNODE_IS_DYING and this will be noted by
23483 + * can_lock_object() below.
23484 + */
23485 + spin_unlock_zlock(lock);
23486 + spin_lock_znode(node);
23487 + ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23488 + spin_unlock_znode(node);
23489 + spin_lock_zlock(lock);
23490 + if (unlikely(ret != 0)) {
23491 + /* In the failure case, the txnmgr releases
23492 + the znode's lock (or in some cases, it was
23493 + released a while ago). There's no need to
23494 +				   reacquire it, so we should return here
23495 +				   and avoid releasing the lock. */
23496 + owner->request.mode = 0;
23497 + break;
23498 + }
23499 +
23500 + /* Check the lock's availability again -- this is
23501 + because under some circumstances the capture code
23502 + has to release and reacquire the znode spinlock. */
23503 + ret = can_lock_object(owner);
23504 + }
23505 +
23506 + /* This time, a return of (ret == 0) means we can lock, so we
23507 + should break out of the loop. */
23508 + if (likely(ret != -E_REPEAT || non_blocking))
23509 + break;
23510 +
23511 + /* Lock is unavailable, we have to wait. */
23512 + ret = reiser4_prepare_to_sleep(owner);
23513 + if (unlikely(ret != 0))
23514 + break;
23515 +
23516 + assert_spin_locked(&(node->lock.guard));
23517 + if (hipri) {
23518 + /* If we are going in high priority direction then
23519 + increase high priority requests counter for the
23520 + node */
23521 + lock->nr_hipri_requests++;
23522 + if (mode == ZNODE_WRITE_LOCK)
23523 + lock->nr_hipri_write_requests ++;
23524 + /* If there are no high priority owners for a node,
23525 + then immediately wake up low priority owners, so
23526 + they can detect possible deadlock */
23527 + if (lock->nr_hipri_owners == 0)
23528 + wake_up_all_lopri_owners(node);
23529 + }
23530 + list_add_tail(&owner->requestors_link, &lock->requestors);
23531 +
23532 + /* Ok, here we have prepared a lock request, so unlock
23533 + a znode ... */
23534 + spin_unlock_zlock(lock);
23535 + /* ... and sleep */
23536 + reiser4_go_to_sleep(owner);
23537 + if (owner->request.mode == ZNODE_NO_LOCK)
23538 + goto request_is_done;
23539 + spin_lock_zlock(lock);
23540 + if (owner->request.mode == ZNODE_NO_LOCK) {
23541 + spin_unlock_zlock(lock);
23542 + request_is_done:
23543 + if (owner->request.ret_code == 0) {
23544 + LOCK_CNT_INC(long_term_locked_znode);
23545 + zref(node);
23546 + }
23547 + return owner->request.ret_code;
23548 + }
23549 + remove_lock_request(owner);
23550 + }
23551 +
23552 + return lock_tail(owner, ret, mode);
23553 +}
23554 +
23555 +/* Lock object invalidation means changing the lock object state to `INVALID'
23556 +   and waiting for all other processes to cancel their lock requests. */
23557 +void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23558 + * owner and lock
23559 + * object is being
23560 + * invalidated. */ )
23561 +{
23562 + znode *node = handle->node;
23563 + lock_stack *owner = handle->owner;
23564 +
23565 + assert("zam-325", owner == get_current_lock_stack());
23566 + assert("zam-103", znode_is_write_locked(node));
23567 + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23568 + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23569 + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23570 + assert("nikita-3097", znode_is_wlocked_once(node));
23571 + assert_spin_locked(&(node->lock.guard));
23572 +
23573 + if (handle->signaled)
23574 + atomic_dec(&owner->nr_signaled);
23575 +
23576 + ZF_SET(node, JNODE_IS_DYING);
23577 + unlink_object(handle);
23578 + node->lock.nr_readers = 0;
23579 +
23580 + invalidate_all_lock_requests(node);
23581 + spin_unlock_zlock(&node->lock);
23582 +}
23583 +
23584 +/* Initializes lock_stack. */
23585 +void init_lock_stack(lock_stack * owner /* pointer to
23586 + * allocated
23587 + * structure. */ )
23588 +{
23589 + INIT_LIST_HEAD(&owner->locks);
23590 + INIT_LIST_HEAD(&owner->requestors_link);
23591 + spin_lock_init(&owner->sguard);
23592 + owner->curpri = 1;
23593 + init_waitqueue_head(&owner->wait);
23594 +}
23595 +
23596 +/* Initializes lock object. */
23597 +void reiser4_init_lock(zlock * lock /* pointer on allocated
23598 + * uninitialized lock object
23599 + * structure. */ )
23600 +{
23601 + memset(lock, 0, sizeof(zlock));
23602 + spin_lock_init(&lock->guard);
23603 + INIT_LIST_HEAD(&lock->requestors);
23604 + INIT_LIST_HEAD(&lock->owners);
23605 +}
23606 +
23607 +/* Transfer a lock handle (presumably so that variables can be moved between stack and
23608 + heap locations). */
23609 +static void
23610 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23611 +{
23612 + znode *node = old->node;
23613 + lock_stack *owner = old->owner;
23614 + int signaled;
23615 +
23616 +	/* The locks list, modified by link_object(), is not protected by
23617 +	   anything. This is valid because only the current thread ever
23618 +	   modifies the locks list of its own lock_stack.
23619 + */
23620 + assert("nikita-1827", owner == get_current_lock_stack());
23621 + assert("nikita-1831", new->owner == NULL);
23622 +
23623 + spin_lock_zlock(&node->lock);
23624 +
23625 + signaled = old->signaled;
23626 + if (unlink_old) {
23627 + unlink_object(old);
23628 + } else {
23629 + if (node->lock.nr_readers > 0) {
23630 + node->lock.nr_readers += 1;
23631 + } else {
23632 + node->lock.nr_readers -= 1;
23633 + }
23634 + if (signaled) {
23635 + atomic_inc(&owner->nr_signaled);
23636 + }
23637 + if (owner->curpri) {
23638 + node->lock.nr_hipri_owners += 1;
23639 + }
23640 + LOCK_CNT_INC(long_term_locked_znode);
23641 +
23642 + zref(node);
23643 + }
23644 + link_object(new, owner, node);
23645 + new->signaled = signaled;
23646 +
23647 + spin_unlock_zlock(&node->lock);
23648 +}
23649 +
23650 +void move_lh(lock_handle * new, lock_handle * old)
23651 +{
23652 + move_lh_internal(new, old, /*unlink_old */ 1);
23653 +}
23654 +
23655 +void copy_lh(lock_handle * new, lock_handle * old)
23656 +{
23657 + move_lh_internal(new, old, /*unlink_old */ 0);
23658 +}
23659 +
23660 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23661 +int reiser4_check_deadlock(void)
23662 +{
23663 + lock_stack *owner = get_current_lock_stack();
23664 + return atomic_read(&owner->nr_signaled) != 0;
23665 +}
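+
+/* An assumed caller pattern (not spelled out in this file): when
+   longterm_lock_znode() fails with -E_DEADLOCK, the thread releases its
+   long-term locks (e.g. via done_lh()) until reiser4_check_deadlock()
+   returns false, and then restarts the whole operation. */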
23666 +
23667 +/* Before going to sleep we re-check "release lock" requests which might have
23668 +   come from high-priority threads. */
23669 +int reiser4_prepare_to_sleep(lock_stack * owner)
23670 +{
23671 + assert("nikita-1847", owner == get_current_lock_stack());
23672 +
23673 + /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23674 + * counted in nr_signaled */
23675 + if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23676 + assert("zam-959", !owner->curpri);
23677 + return RETERR(-E_DEADLOCK);
23678 + }
23679 + return 0;
23680 +}
23681 +
23682 +/* Wakes up a single thread */
23683 +void __reiser4_wake_up(lock_stack * owner)
23684 +{
23685 + atomic_set(&owner->wakeup, 1);
23686 + wake_up(&owner->wait);
23687 +}
23688 +
23689 +/* Puts a thread to sleep */
23690 +void reiser4_go_to_sleep(lock_stack * owner)
23691 +{
23692 +	/* Well, we might sleep here, so holding any spinlocks is a no-no */
23693 + assert("nikita-3027", reiser4_schedulable());
23694 +
23695 + wait_event(owner->wait, atomic_read(&owner->wakeup));
23696 + atomic_set(&owner->wakeup, 0);
23697 +}
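+
+/* Note on the wakeup protocol: the atomic ->wakeup flag makes the
+   wait_event()/wake_up() pair safe against lost wakeups. A wakeup posted
+   before the sleeper reaches wait_event() leaves the flag set, so the
+   sleeper does not block; the flag is cleared only after the sleep ends. */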
23698 +
23699 +int lock_stack_isclean(lock_stack * owner)
23700 +{
23701 + if (list_empty_careful(&owner->locks)) {
23702 + assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23703 + return 1;
23704 + }
23705 +
23706 + return 0;
23707 +}
23708 +
23709 +#if REISER4_DEBUG
23710 +
23711 +/*
23712 + * debugging functions
23713 + */
23714 +
23715 +static void list_check(struct list_head *head)
23716 +{
23717 + struct list_head *pos;
23718 +
23719 + list_for_each(pos, head)
23720 + assert("", (pos->prev != NULL && pos->next != NULL &&
23721 + pos->prev->next == pos && pos->next->prev == pos));
23722 +}
23723 +
23724 +/* check consistency of locking data structures hanging off @stack */
23725 +static void check_lock_stack(lock_stack * stack)
23726 +{
23727 + spin_lock_stack(stack);
23728 + /* check that stack->locks is not corrupted */
23729 + list_check(&stack->locks);
23730 + spin_unlock_stack(stack);
23731 +}
23732 +
23733 +/* check consistency of locking data structures */
23734 +void check_lock_data(void)
23735 +{
23736 + check_lock_stack(&get_current_context()->stack);
23737 +}
23738 +
23739 +/* check consistency of locking data structures for @node */
23740 +void check_lock_node_data(znode * node)
23741 +{
23742 + spin_lock_zlock(&node->lock);
23743 + list_check(&node->lock.owners);
23744 + list_check(&node->lock.requestors);
23745 + spin_unlock_zlock(&node->lock);
23746 +}
23747 +
23748 +/* check that the given lock request is deadlock-safe. This check is, of
23749 + * course, not exhaustive. */
23750 +static int
23751 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23752 + znode_lock_request request)
23753 +{
23754 + lock_stack *owner;
23755 +
23756 + owner = get_current_lock_stack();
23757 + /*
23758 + * check that hipri lock request is not issued when there are locked
23759 + * nodes at the higher levels.
23760 + */
23761 + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23762 + znode_get_level(node) != 0) {
23763 + lock_handle *item;
23764 +
23765 + list_for_each_entry(item, &owner->locks, locks_link) {
23766 + znode *other;
23767 +
23768 + other = item->node;
23769 +
23770 + if (znode_get_level(other) == 0)
23771 + continue;
23772 + if (znode_get_level(other) > znode_get_level(node))
23773 + return 0;
23774 + }
23775 + }
23776 + return 1;
23777 +}
23778 +
23779 +#endif
23780 +
23781 +/* return pointer to static storage with name of lock_mode. For
23782 + debugging */
23783 +const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23784 +{
23785 + if (lock == ZNODE_READ_LOCK)
23786 + return "read";
23787 + else if (lock == ZNODE_WRITE_LOCK)
23788 + return "write";
23789 + else {
23790 + static char buf[30];
23791 +
23792 + sprintf(buf, "unknown: %i", lock);
23793 + return buf;
23794 + }
23795 +}
23796 +
23797 +/* Make Linus happy.
23798 + Local variables:
23799 + c-indentation-style: "K&R"
23800 + mode-name: "LC"
23801 + c-basic-offset: 8
23802 + tab-width: 8
23803 + fill-column: 79
23804 + End:
23805 +*/
23806 diff -urN linux-2.6.22.orig/fs/reiser4/lock.h linux-2.6.22/fs/reiser4/lock.h
23807 --- linux-2.6.22.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23808 +++ linux-2.6.22/fs/reiser4/lock.h 2007-07-29 00:25:34.888699583 +0400
23809 @@ -0,0 +1,249 @@
23810 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23811 +
23812 +/* Long term locking data structures. See lock.c for details. */
23813 +
23814 +#ifndef __LOCK_H__
23815 +#define __LOCK_H__
23816 +
23817 +#include "forward.h"
23818 +#include "debug.h"
23819 +#include "dformat.h"
23820 +#include "key.h"
23821 +#include "coord.h"
23822 +#include "plugin/node/node.h"
23823 +#include "txnmgr.h"
23824 +#include "readahead.h"
23825 +
23826 +#include <linux/types.h>
23827 +#include <linux/spinlock.h>
23828 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23829 +#include <asm/atomic.h>
23830 +#include <linux/wait.h>
23831 +
23832 +/* Per-znode lock object */
23833 +struct zlock {
23834 + spinlock_t guard;
23835 + /* The number of readers if positive; the number of recursively taken
23836 + write locks if negative. Protected by zlock spin lock. */
23837 + int nr_readers;
23838 + /* A number of processes (lock_stacks) that have this object
23839 + locked with high priority */
23840 + unsigned nr_hipri_owners;
23841 + /* A number of attempts to lock znode in high priority direction */
23842 + unsigned nr_hipri_requests;
23843 +	/* A number of pending write-lock requests from high-priority
+	   processes */
23844 +	unsigned nr_hipri_write_requests;
23845 +	/* A linked list of lock_handle objects that contains pointers
+	   for all lock_stacks which have this lock object locked */
23846 +	struct list_head owners;
23847 + /* A linked list of lock_stacks that wait for this lock */
23848 + struct list_head requestors;
23849 +};
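+
+/* Example of the nr_readers encoding: three concurrent readers give
+   nr_readers == 3; one thread holding the write lock twice (recursively)
+   gives nr_readers == -2; zero means the znode is not long-term locked. */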
23850 +
23851 +static inline void spin_lock_zlock(zlock *lock)
23852 +{
23853 + /* check that zlock is not locked */
23854 + assert("", LOCK_CNT_NIL(spin_locked_zlock));
23855 + /* check that spinlocks of lower priorities are not held */
23856 + assert("", LOCK_CNT_NIL(spin_locked_stack));
23857 +
23858 + spin_lock(&lock->guard);
23859 +
23860 + LOCK_CNT_INC(spin_locked_zlock);
23861 + LOCK_CNT_INC(spin_locked);
23862 +}
23863 +
23864 +static inline void spin_unlock_zlock(zlock *lock)
23865 +{
23866 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23867 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23868 +
23869 + LOCK_CNT_DEC(spin_locked_zlock);
23870 + LOCK_CNT_DEC(spin_locked);
23871 +
23872 + spin_unlock(&lock->guard);
23873 +}
23874 +
23875 +#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23876 +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23877 +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23878 +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23879 +#define lock_can_be_rlocked(lock)   ((lock)->nr_readers >= 0)
23880 +#define lock_mode_compatible(lock, mode) \
23881 + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23882 + ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
23883 +
23884 +/* Since we have R/W znode locks we need additional bidirectional `link'
23885 + objects to implement n<->m relationship between lock owners and lock
23886 + objects. We call them `lock handles'.
23887 +
23888 + Locking: see lock.c/"SHORT-TERM LOCKING"
23889 +*/
23890 +struct lock_handle {
23891 +	/* This flag indicates that a signal to yield a lock was passed to
23892 +	   the lock owner and counted in owner->nr_signaled.
23893 +
23894 + Locking: this is accessed under spin lock on ->node.
23895 + */
23896 + int signaled;
23897 + /* A link to owner of a lock */
23898 + lock_stack *owner;
23899 + /* A link to znode locked */
23900 + znode *node;
23901 + /* A list of all locks for a process */
23902 + struct list_head locks_link;
23903 + /* A list of all owners for a znode */
23904 + struct list_head owners_link;
23905 +};
23906 +
23907 +struct lock_request {
23908 + /* A pointer to uninitialized link object */
23909 + lock_handle *handle;
23910 + /* A pointer to the object we want to lock */
23911 + znode *node;
23912 + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23913 + znode_lock_mode mode;
23914 +	/* where dispatch_lock_requests() stores the lock request result code */
23915 + int ret_code;
23916 +};
23917 +
23918 +/* A lock stack structure for accumulating locks owned by a process */
23919 +struct lock_stack {
23920 + /* A guard lock protecting a lock stack */
23921 + spinlock_t sguard;
23922 +	/* number of this stack's znodes requested by high-priority processes */
23923 + atomic_t nr_signaled;
23924 + /* Current priority of a process
23925 +
23926 + This is only accessed by the current thread and thus requires no
23927 + locking.
23928 + */
23929 + int curpri;
23930 + /* A list of all locks owned by this process. Elements can be added to
23931 + * this list only by the current thread. ->node pointers in this list
23932 + * can be only changed by the current thread. */
23933 + struct list_head locks;
23934 + /* When lock_stack waits for the lock, it puts itself on double-linked
23935 + requestors list of that lock */
23936 + struct list_head requestors_link;
23937 + /* Current lock request info.
23938 +
23939 + This is only accessed by the current thread and thus requires no
23940 + locking.
23941 + */
23942 + struct lock_request request;
23943 + /* the following two fields are the lock stack's
23944 + * synchronization object to use with the standard linux/wait.h
23945 + * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23946 + * usage details. */
23947 + wait_queue_head_t wait;
23948 + atomic_t wakeup;
23949 +#if REISER4_DEBUG
23950 + int nr_locks; /* number of lock handles in the above list */
23951 +#endif
23952 +};
23953 +
23954 +/*
23955 + User-visible znode locking functions
23956 +*/
23957 +
23958 +extern int longterm_lock_znode(lock_handle * handle,
23959 + znode * node,
23960 + znode_lock_mode mode,
23961 + znode_lock_request request);
23962 +
23963 +extern void longterm_unlock_znode(lock_handle * handle);
23964 +
23965 +extern int reiser4_check_deadlock(void);
23966 +
23967 +extern lock_stack *get_current_lock_stack(void);
23968 +
23969 +extern void init_lock_stack(lock_stack * owner);
23970 +extern void reiser4_init_lock(zlock * lock);
23971 +
23972 +static inline void init_lh(lock_handle *lh)
23973 +{
23974 +#if REISER4_DEBUG
23975 + memset(lh, 0, sizeof *lh);
23976 + INIT_LIST_HEAD(&lh->locks_link);
23977 + INIT_LIST_HEAD(&lh->owners_link);
23978 +#else
23979 + lh->node = NULL;
23980 +#endif
23981 +}
23982 +
23983 +static inline void done_lh(lock_handle *lh)
23984 +{
23985 + assert("zam-342", lh != NULL);
23986 + if (lh->node != NULL)
23987 + longterm_unlock_znode(lh);
23988 +}
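The expected calling pattern for a lock handle is: init_lh(), longterm_lock_znode(), use the node, done_lh(); done_lh() is safe even when the lock was never granted, because it tests ->node first. A hedged userspace sketch of that pattern, with stub types and functions invented purely for illustration:

#include <stdio.h>
#include <stddef.h>

/* invented stand-ins for znode, lock_handle and the locking calls */
struct znode_stub { const char *name; };
struct lh_stub { struct znode_stub *node; };

static void init_lh_stub(struct lh_stub *lh)
{
        lh->node = NULL;        /* mirrors the !REISER4_DEBUG init_lh() */
}

static int longterm_lock_stub(struct lh_stub *lh, struct znode_stub *z)
{
        lh->node = z;           /* a real lock may fail and leave ->node NULL */
        return 0;
}

static void longterm_unlock_stub(struct lh_stub *lh)
{
        lh->node = NULL;
}

static void done_lh_stub(struct lh_stub *lh)
{
        if (lh->node != NULL)   /* as in done_lh(): unlock only if granted */
                longterm_unlock_stub(lh);
}

int main(void)
{
        struct znode_stub z = { "some-node" };
        struct lh_stub lh;

        init_lh_stub(&lh);
        if (longterm_lock_stub(&lh, &z) == 0)
                printf("locked %s\n", z.name);
        done_lh_stub(&lh);      /* unconditional cleanup */
        return 0;
}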
23989 +
23990 +extern void move_lh(lock_handle * new, lock_handle * old);
23991 +extern void copy_lh(lock_handle * new, lock_handle * old);
23992 +
23993 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
23994 +extern void reiser4_go_to_sleep(lock_stack * owner);
23995 +extern void __reiser4_wake_up(lock_stack * owner);
23996 +
23997 +extern int lock_stack_isclean(lock_stack * owner);
23998 +
23999 +/* zlock object state check macros: only used in assertions. Both forms imply that the
24000 + lock is held by the current thread. */
24001 +extern int znode_is_write_locked(const znode *);
24002 +extern void reiser4_invalidate_lock(lock_handle *);
24003 +
24004 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24005 +#define spin_ordering_pred_stack(stack) \
24006 + (LOCK_CNT_NIL(spin_locked_stack) && \
24007 + LOCK_CNT_NIL(spin_locked_txnmgr) && \
24008 + LOCK_CNT_NIL(spin_locked_inode) && \
24009 + LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24010 + LOCK_CNT_NIL(spin_locked_super_eflush) )
24011 +
24012 +static inline void spin_lock_stack(lock_stack *stack)
24013 +{
24014 + assert("", spin_ordering_pred_stack(stack));
24015 + spin_lock(&(stack->sguard));
24016 + LOCK_CNT_INC(spin_locked_stack);
24017 + LOCK_CNT_INC(spin_locked);
24018 +}
24019 +
24020 +static inline void spin_unlock_stack(lock_stack *stack)
24021 +{
24022 + assert_spin_locked(&(stack->sguard));
24023 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24024 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24025 + LOCK_CNT_DEC(spin_locked_stack);
24026 + LOCK_CNT_DEC(spin_locked);
24027 + spin_unlock(&(stack->sguard));
24028 +}
24029 +
24030 +static inline void reiser4_wake_up(lock_stack * owner)
24031 +{
24032 + spin_lock_stack(owner);
24033 + __reiser4_wake_up(owner);
24034 + spin_unlock_stack(owner);
24035 +}
24036 +
24037 +const char *lock_mode_name(znode_lock_mode lock);
24038 +
24039 +#if REISER4_DEBUG
24040 +extern void check_lock_data(void);
24041 +extern void check_lock_node_data(znode * node);
24042 +#else
24043 +#define check_lock_data() noop
24044 +#define check_lock_node_data(node) noop
24045 +#endif
24046 +
24047 +/* __LOCK_H__ */
24048 +#endif
24049 +
24050 +/* Make Linus happy.
24051 + Local variables:
24052 + c-indentation-style: "K&R"
24053 + mode-name: "LC"
24054 + c-basic-offset: 8
24055 + tab-width: 8
24056 + fill-column: 120
24057 + End:
24058 +*/
24059 diff -urN linux-2.6.22.orig/fs/reiser4/Makefile linux-2.6.22/fs/reiser4/Makefile
24060 --- linux-2.6.22.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24061 +++ linux-2.6.22/fs/reiser4/Makefile 2007-07-29 00:25:34.888699583 +0400
24062 @@ -0,0 +1,99 @@
24063 +#
24064 +# reiser4/Makefile
24065 +#
24066 +
24067 +obj-$(CONFIG_REISER4_FS) += reiser4.o
24068 +
24069 +reiser4-y := \
24070 + debug.o \
24071 + jnode.o \
24072 + znode.o \
24073 + key.o \
24074 + pool.o \
24075 + tree_mod.o \
24076 + estimate.o \
24077 + carry.o \
24078 + carry_ops.o \
24079 + lock.o \
24080 + tree.o \
24081 + context.o \
24082 + tap.o \
24083 + coord.o \
24084 + block_alloc.o \
24085 + txnmgr.o \
24086 + kassign.o \
24087 + flush.o \
24088 + wander.o \
24089 + eottl.o \
24090 + search.o \
24091 + page_cache.o \
24092 + seal.o \
24093 + dscale.o \
24094 + flush_queue.o \
24095 + ktxnmgrd.o \
24096 + blocknrset.o \
24097 + super.o \
24098 + super_ops.o \
24099 + fsdata.o \
24100 + export_ops.o \
24101 + oid.o \
24102 + tree_walk.o \
24103 + inode.o \
24104 + vfs_ops.o \
24105 + as_ops.o \
24106 + entd.o \
24107 + readahead.o \
24108 + status_flags.o \
24109 + init_super.o \
24110 + safe_link.o \
24111 + \
24112 + plugin/plugin.o \
24113 + plugin/plugin_set.o \
24114 + plugin/node/node.o \
24115 + plugin/object.o \
24116 + plugin/cluster.o \
24117 + plugin/inode_ops.o \
24118 + plugin/inode_ops_rename.o \
24119 + plugin/file_ops.o \
24120 + plugin/file_ops_readdir.o \
24121 + plugin/file_plugin_common.o \
24122 + plugin/file/file.o \
24123 + plugin/file/tail_conversion.o \
24124 + plugin/file/file_conversion.o \
24125 + plugin/file/symlink.o \
24126 + plugin/file/cryptcompress.o \
24127 + plugin/dir_plugin_common.o \
24128 + plugin/dir/hashed_dir.o \
24129 + plugin/dir/seekable_dir.o \
24130 + plugin/node/node40.o \
24131 + \
24132 + plugin/crypto/cipher.o \
24133 + plugin/crypto/digest.o \
24134 + \
24135 + plugin/compress/minilzo.o \
24136 + plugin/compress/compress.o \
24137 + plugin/compress/compress_mode.o \
24138 + \
24139 + plugin/item/static_stat.o \
24140 + plugin/item/sde.o \
24141 + plugin/item/cde.o \
24142 + plugin/item/blackbox.o \
24143 + plugin/item/internal.o \
24144 + plugin/item/tail.o \
24145 + plugin/item/ctail.o \
24146 + plugin/item/extent.o \
24147 + plugin/item/extent_item_ops.o \
24148 + plugin/item/extent_file_ops.o \
24149 + plugin/item/extent_flush_ops.o \
24150 + \
24151 + plugin/hash.o \
24152 + plugin/fibration.o \
24153 + plugin/tail_policy.o \
24154 + plugin/item/item.o \
24155 + \
24156 + plugin/security/perm.o \
24157 + plugin/space/bitmap.o \
24158 + \
24159 + plugin/disk_format/disk_format40.o \
24160 + plugin/disk_format/disk_format.o
24161 +
24162 diff -urN linux-2.6.22.orig/fs/reiser4/oid.c linux-2.6.22/fs/reiser4/oid.c
24163 --- linux-2.6.22.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24164 +++ linux-2.6.22/fs/reiser4/oid.c 2007-07-29 00:25:34.888699583 +0400
24165 @@ -0,0 +1,141 @@
24166 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24167 +
24168 +#include "debug.h"
24169 +#include "super.h"
24170 +#include "txnmgr.h"
24171 +
24172 +/* We used to have an oid allocation plugin. It was removed because it
24173 + provided an unneeded level of abstraction. If it ever proves useful
24174 + again, look at yet_unneeded_abstractions/oid
24175 +*/
24176 +
24177 +/*
24178 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24179 + * are provided by disk format plugin that reads them from the disk during
24180 + * mount.
24181 + */
24182 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24183 +{
24184 + reiser4_super_info_data *sbinfo;
24185 +
24186 + sbinfo = get_super_private(super);
24187 +
24188 + sbinfo->next_to_use = next;
24189 + sbinfo->oids_in_use = nr_files;
24190 + return 0;
24191 +}
24192 +
24193 +/*
24194 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24195 + * runs out of oids.
24196 + */
24197 +oid_t oid_allocate(struct super_block * super)
24198 +{
24199 + reiser4_super_info_data *sbinfo;
24200 + oid_t oid;
24201 +
24202 + sbinfo = get_super_private(super);
24203 +
24204 + spin_lock_reiser4_super(sbinfo);
24205 + if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24206 + oid = sbinfo->next_to_use++;
24207 + sbinfo->oids_in_use++;
24208 + } else
24209 + oid = ABSOLUTE_MAX_OID;
24210 + spin_unlock_reiser4_super(sbinfo);
24211 + return oid;
24212 +}
24213 +
24214 +/*
24215 + * Tell oid allocator that @oid is now free.
24216 + */
24217 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24218 +{
24219 + reiser4_super_info_data *sbinfo;
24220 +
24221 + sbinfo = get_super_private(super);
24222 +
24223 + spin_lock_reiser4_super(sbinfo);
24224 + sbinfo->oids_in_use--;
24225 + spin_unlock_reiser4_super(sbinfo);
24226 + return 0;
24227 +}
24228 +
24229 +/*
24230 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24231 + * without actually allocating it. This is used by disk format plugin to save
24232 + * oid allocator state on the disk.
24233 + */
24234 +oid_t oid_next(const struct super_block * super)
24235 +{
24236 + reiser4_super_info_data *sbinfo;
24237 + oid_t oid;
24238 +
24239 + sbinfo = get_super_private(super);
24240 +
24241 + spin_lock_reiser4_super(sbinfo);
24242 + oid = sbinfo->next_to_use;
24243 + spin_unlock_reiser4_super(sbinfo);
24244 + return oid;
24245 +}
24246 +
24247 +/*
24248 + * returns number of currently used oids. This is used by statfs(2) to report
24249 + * number of "inodes" and by disk format plugin to save oid allocator state on
24250 + * the disk.
24251 + */
24252 +long oids_used(const struct super_block *super)
24253 +{
24254 + reiser4_super_info_data *sbinfo;
24255 + oid_t used;
24256 +
24257 + sbinfo = get_super_private(super);
24258 +
24259 + spin_lock_reiser4_super(sbinfo);
24260 + used = sbinfo->oids_in_use;
24261 + spin_unlock_reiser4_super(sbinfo);
24262 + if (used < (__u64) ((long)~0) >> 1)
24263 + return (long)used;
24264 + else
24265 + return (long)-1;
24266 +}
24267 +
24268 +/*
24269 + * Count oid as allocated in atom. This is done after call to oid_allocate()
24270 + * at the point when we are irrevocably committed to creation of the new file
24271 + * (i.e., when oid allocation cannot be any longer rolled back due to some
24272 + * error).
24273 + */
24274 +void oid_count_allocated(void)
24275 +{
24276 + txn_atom *atom;
24277 +
24278 + atom = get_current_atom_locked();
24279 + atom->nr_objects_created++;
24280 + spin_unlock_atom(atom);
24281 +}
24282 +
24283 +/*
24284 + * Count oid as free in atom. This is done after call to oid_release() at the
24285 + * point when we are irrevocably committed to the deletion of the file (i.e.,
24286 + * when oid release cannot be any longer rolled back due to some error).
24287 + */
24288 +void oid_count_released(void)
24289 +{
24290 + txn_atom *atom;
24291 +
24292 + atom = get_current_atom_locked();
24293 + atom->nr_objects_deleted++;
24294 + spin_unlock_atom(atom);
24295 +}
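Taken together, this allocator is a monotonically increasing counter plus an in-use count, both under the superblock spin lock; oid_release() decrements the count but never recycles the value. A minimal userspace model of that behaviour (a pthread mutex stands in for spin_lock_reiser4_super(), and MAX_OID for ABSOLUTE_MAX_OID):

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

#define MAX_OID UINT64_MAX      /* stand-in for ABSOLUTE_MAX_OID */

static struct {
        pthread_mutex_t guard;
        uint64_t next_to_use;
        uint64_t oids_in_use;
} alloc = { PTHREAD_MUTEX_INITIALIZER, 42, 0 };

static uint64_t oid_allocate_model(void)
{
        uint64_t oid;

        pthread_mutex_lock(&alloc.guard);
        if (alloc.next_to_use != MAX_OID) {
                oid = alloc.next_to_use++;
                alloc.oids_in_use++;
        } else
                oid = MAX_OID;  /* allocator exhausted */
        pthread_mutex_unlock(&alloc.guard);
        return oid;
}

static void oid_release_model(void)
{
        pthread_mutex_lock(&alloc.guard);
        alloc.oids_in_use--;    /* the oid value itself is never reused */
        pthread_mutex_unlock(&alloc.guard);
}

int main(void)
{
        uint64_t a = oid_allocate_model();
        uint64_t b = oid_allocate_model();

        oid_release_model();
        printf("a=%llu b=%llu in_use=%llu\n",
               (unsigned long long)a, (unsigned long long)b,
               (unsigned long long)alloc.oids_in_use);
        return 0;
}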
24296 +
24297 +/*
24298 + Local variables:
24299 + c-indentation-style: "K&R"
24300 + mode-name: "LC"
24301 + c-basic-offset: 8
24302 + tab-width: 8
24303 + fill-column: 120
24304 + scroll-step: 1
24305 + End:
24306 +*/
24307 diff -urN linux-2.6.22.orig/fs/reiser4/page_cache.c linux-2.6.22/fs/reiser4/page_cache.c
24308 --- linux-2.6.22.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24309 +++ linux-2.6.22/fs/reiser4/page_cache.c 2007-07-29 00:25:34.888699583 +0400
24310 @@ -0,0 +1,730 @@
24311 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24312 + * reiser4/README */
24313 +
24314 +/* Memory pressure hooks. Fake inodes handling. */
24315 +
24316 +/* GLOSSARY
24317 +
24318 + . Formatted and unformatted nodes.
24319 + Elements of reiser4 balanced tree to store data and metadata.
24320 + Unformatted nodes are pointed to by extent pointers. Such nodes
24321 + are used to store data of large objects. Unlike unformatted nodes,
24322 + formatted ones have associated format described by node4X plugin.
24323 +
24324 + . Jnode (or journal node)
24325 + The in-memory header which is used to track formatted and unformatted
24326 + nodes, bitmap nodes, etc. In particular, jnodes are used to track
24327 + transactional information associated with each block (see reiser4/jnode.c
24328 + for details).
24329 +
24330 + . Znode
24331 + The in-memory header which is used to track formatted nodes. Contains
24332 + embedded jnode (see reiser4/znode.c for details).
24333 +*/
24334 +
24335 +/* We store all file system meta data (and data, of course) in the page cache.
24336 +
24337 + What does this mean? Instead of using bread/brelse we create a special
24338 + "fake" inode (one per super block) and store the content of formatted
24339 + nodes in pages bound to this inode in the page cache. In newer kernels
24340 + bread() already uses an inode attached to the block device (bd_inode).
24341 + The advantage of having our own fake inode is that we can install appropriate methods in its
24342 + address_space operations. Such methods are called by VM on memory pressure
24343 + (or during background page flushing) and we can use them to react
24344 + appropriately.
24345 +
24346 + In initial version we only support one block per page. Support for multiple
24347 + blocks per page is complicated by relocation.
24348 +
24349 + To each page used by reiser4 a jnode is attached. A jnode is analogous to a
24350 + buffer head. The difference is that a jnode is bound to the page permanently:
24351 + jnode cannot be removed from memory until its backing page is.
24352 +
24353 + A jnode contains a pointer to its page (->pg field) and the page contains
24354 + a pointer back to the jnode in ->private. The jnode-to-page pointer is
24355 + protected by the jnode's spinlock; the page-to-jnode pointer by the page lock
24356 + (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24357 + lock. To go into reverse direction use jnode_lock_page() function that uses
24358 + standard try-lock-and-release device.
24359 +
24360 + Properties:
24361 +
24362 + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24363 + reference counter is increased.
24364 +
24365 + 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page
24366 + reference counter is decreased.
24367 +
24368 + 3. on jload() reference counter on jnode page is increased, page is
24369 + kmapped and `referenced'.
24370 +
24371 + 4. on jrelse() inverse operations are performed.
24372 +
24373 + 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24374 +
24375 + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24376 + historically.]
24377 +
24378 + [In the following discussion, `lock' invariably means long term lock on
24379 + znode.] (What about page locks?)
24380 +
24381 + There is some special class of deadlock possibilities related to memory
24382 + pressure. Locks acquired by other reiser4 threads are accounted for in
24383 + deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24384 + invoked additional hidden arc is added to the locking graph: thread that
24385 + tries to allocate memory waits for ->vm_writeback() to finish. If this
24386 + thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24387 + prevention is useless.
24388 +
24389 + Another related problem is possibility for ->vm_writeback() to run out of
24390 + memory itself. This is not a problem for ext2 and friends, because their
24391 + ->vm_writeback() don't allocate much memory, but reiser4 flush is
24392 + definitely able to allocate huge amounts of memory.
24393 +
24394 + It seems that there is no reliable way to cope with the problems above.
24395 + Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24396 + context) wouldn't perform any flushing itself, but rather should just wake
24397 + up some auxiliary thread dedicated for this purpose (or, the same thread
24398 + that does periodic commit of old atoms (ktxnmgrd.c)).
24399 +
24400 + Details:
24401 +
24402 + 1. Page is called `reclaimable' against particular reiser4 mount F if this
24403 + page can be ultimately released by try_to_free_pages() under presumptions
24404 + that:
24405 +
24406 + a. ->vm_writeback() for F is no-op, and
24407 +
24408 + b. none of the threads accessing F are making any progress, and
24409 +
24410 + c. other reiser4 mounts obey the same memory reservation protocol as F
24411 + (described below).
24412 +
24413 + For example, clean un-pinned page, or page occupied by ext2 data are
24414 + reclaimable against any reiser4 mount.
24415 +
24416 + When there is more than one reiser4 mount in a system, condition (c) makes
24417 + reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24418 +
24419 + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24420 +
24421 + The fake inode is used to bind formatted nodes, and each node is indexed within the
24422 + fake inode by its block number. If the block size is smaller than the page size, it
24423 + may happen that a block mapped to a page holding a formatted node is occupied
24424 + by an unformatted node or is unallocated. This leads to some complications,
24425 + because flushing the whole page could incorrectly overwrite an
24426 + unformatted node which, moreover, can be cached in some other place as
24427 + part of the file body. To avoid this, buffers for unformatted nodes are
24428 + never marked dirty. Also, pages in the fake inode are never marked dirty. This
24429 + rules out usage of ->writepage() as a memory pressure hook. Instead
24430 + ->releasepage() is used.
24431 +
24432 + Josh is concerned that page->buffer is going to die. This should not pose
24433 + significant problem though, because we need to add some data structures to
24434 + the page anyway (jnode) and all necessary book keeping can be put there.
24435 +
24436 +*/
24437 +
24438 +/* Life cycle of pages/nodes.
24439 +
24440 + jnode contains reference to page and page contains reference back to
24441 + jnode. This reference is counted in page ->count. Thus, page bound to jnode
24442 + cannot be released back into free pool.
24443 +
24444 + 1. Formatted nodes.
24445 +
24446 + 1. formatted node is represented by znode. When new znode is created its
24447 + ->pg pointer is NULL initially.
24448 +
24449 + 2. when node content is loaded into znode (by call to zload()) for the
24450 + first time, the following happens (in a call to ->read_node() or
24451 + ->allocate_node()):
24452 +
24453 + 1. new page is added to the page cache.
24454 +
24455 + 2. this page is attached to znode and its ->count is increased.
24456 +
24457 + 3. page is kmapped.
24458 +
24459 + 3. if more calls to zload() follow (without corresponding zrelses), page
24460 + counter is left intact and in its stead ->d_count is increased in znode.
24461 +
24462 + 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24463 + ->release_node() is called and page is kunmapped as result.
24464 +
24465 + 5. at some moment node can be captured by a transaction. Its ->x_count
24466 + is then increased by transaction manager.
24467 +
24468 + 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24469 + bit set) following will happen (also see comment at the top of znode.c):
24470 +
24471 + 1. when last lock is released, node will be uncaptured from
24472 + transaction. This releases the reference that the transaction manager acquired
24473 + at step 5.
24474 +
24475 + 2. when last reference is released, zput() detects that node is
24476 + actually deleted and calls ->delete_node()
24477 + operation. page_cache_delete_node() implementation detaches jnode from
24478 + page and releases page.
24479 +
24480 + 7. otherwise (node wasn't removed from the tree), last reference to
24481 + znode will be released after transaction manager committed transaction
24482 + node was in. This implies squallocing of this node (see
24483 + flush.c). Nothing special happens at this point. Znode is still in the
24484 + hash table and page is still attached to it.
24485 +
24486 + 8. znode is actually removed from the memory because of the memory
24487 + pressure, or during umount (znodes_tree_done()). Anyway, znode is
24488 + removed by the call to zdrop(). At this moment, page is detached from
24489 + znode and removed from the inode address space.
24490 +
24491 +*/
24492 +
24493 +#include "debug.h"
24494 +#include "dformat.h"
24495 +#include "key.h"
24496 +#include "txnmgr.h"
24497 +#include "jnode.h"
24498 +#include "znode.h"
24499 +#include "block_alloc.h"
24500 +#include "tree.h"
24501 +#include "vfs_ops.h"
24502 +#include "inode.h"
24503 +#include "super.h"
24504 +#include "entd.h"
24505 +#include "page_cache.h"
24506 +#include "ktxnmgrd.h"
24507 +
24508 +#include <linux/types.h>
24509 +#include <linux/fs.h>
24510 +#include <linux/mm.h> /* for struct page */
24511 +#include <linux/swap.h> /* for struct page */
24512 +#include <linux/pagemap.h>
24513 +#include <linux/bio.h>
24514 +#include <linux/writeback.h>
24515 +#include <linux/blkdev.h>
24516 +
24517 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24518 +
24519 +static struct address_space_operations formatted_fake_as_ops;
24520 +
24521 +static const oid_t fake_ino = 0x1;
24522 +static const oid_t bitmap_ino = 0x2;
24523 +static const oid_t cc_ino = 0x3;
24524 +
24525 +static void
24526 +init_fake_inode(struct super_block *super, struct inode *fake,
24527 + struct inode **pfake)
24528 +{
24529 + assert("nikita-2168", fake->i_state & I_NEW);
24530 + fake->i_mapping->a_ops = &formatted_fake_as_ops;
24531 + *pfake = fake;
24532 + /* NOTE-NIKITA something else? */
24533 + unlock_new_inode(fake);
24534 +}
24535 +
24536 +/**
24537 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24538 + * @super: super block to init fake inode for
24539 + *
24540 + * Initializes fake inode to which formatted nodes are bound in the page cache
24541 + * and inode for bitmaps.
24542 + */
24543 +int reiser4_init_formatted_fake(struct super_block *super)
24544 +{
24545 + struct inode *fake;
24546 + struct inode *bitmap;
24547 + struct inode *cc;
24548 + reiser4_super_info_data *sinfo;
24549 +
24550 + assert("nikita-1703", super != NULL);
24551 +
24552 + sinfo = get_super_private_nocheck(super);
24553 + fake = iget_locked(super, oid_to_ino(fake_ino));
24554 +
24555 + if (fake != NULL) {
24556 + init_fake_inode(super, fake, &sinfo->fake);
24557 +
24558 + bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24559 + if (bitmap != NULL) {
24560 + init_fake_inode(super, bitmap, &sinfo->bitmap);
24561 +
24562 + cc = iget_locked(super, oid_to_ino(cc_ino));
24563 + if (cc != NULL) {
24564 + init_fake_inode(super, cc, &sinfo->cc);
24565 + return 0;
24566 + } else {
24567 + iput(sinfo->fake);
24568 + iput(sinfo->bitmap);
24569 + sinfo->fake = NULL;
24570 + sinfo->bitmap = NULL;
24571 + }
24572 + } else {
24573 + iput(sinfo->fake);
24574 + sinfo->fake = NULL;
24575 + }
24576 + }
24577 + return RETERR(-ENOMEM);
24578 +}
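The three nested iget_locked() calls unwind partially constructed state by hand in each else branch. An equivalent, commonly used shape for this kind of multi-resource setup is goto-based unwinding; the sketch below only illustrates the idiom with invented resource helpers and is not a proposed rewrite of the function above:

#include <stdio.h>
#include <stdlib.h>

/* hypothetical resource constructor/destructor pair */
static void *get_res(const char *name)
{
        (void)name;
        return malloc(1);
}

static void put_res(void *r)
{
        free(r);
}

static int setup_three(void **a, void **b, void **c)
{
        *a = get_res("fake");
        if (*a == NULL)
                goto fail;
        *b = get_res("bitmap");
        if (*b == NULL)
                goto fail_a;
        *c = get_res("cc");
        if (*c == NULL)
                goto fail_b;
        return 0;               /* caller sees all-or-nothing */

fail_b:
        put_res(*b);
fail_a:
        put_res(*a);
fail:
        return -1;
}

int main(void)
{
        void *a, *b, *c;

        if (setup_three(&a, &b, &c) == 0) {
                printf("all three acquired\n");
                put_res(c);
                put_res(b);
                put_res(a);
        }
        return 0;
}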
24579 +
24580 +/**
24581 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24582 + * @super: super block to init fake inode for
24583 + *
24584 + * Releases inodes which were used as address spaces of bitmap and formatted
24585 + * nodes.
24586 + */
24587 +void reiser4_done_formatted_fake(struct super_block *super)
24588 +{
24589 + reiser4_super_info_data *sinfo;
24590 +
24591 + sinfo = get_super_private_nocheck(super);
24592 +
24593 + if (sinfo->fake != NULL) {
24594 + iput(sinfo->fake);
24595 + sinfo->fake = NULL;
24596 + }
24597 +
24598 + if (sinfo->bitmap != NULL) {
24599 + iput(sinfo->bitmap);
24600 + sinfo->bitmap = NULL;
24601 + }
24602 +
24603 + if (sinfo->cc != NULL) {
24604 + iput(sinfo->cc);
24605 + sinfo->cc = NULL;
24606 + }
24607 + return;
24608 +}
24609 +
24610 +void reiser4_wait_page_writeback(struct page *page)
24611 +{
24612 + assert("zam-783", PageLocked(page));
24613 +
24614 + do {
24615 + unlock_page(page);
24616 + wait_on_page_writeback(page);
24617 + lock_page(page);
24618 + } while (PageWriteback(page));
24619 +}
24620 +
24621 +/* return tree @page is in */
24622 +reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24623 +{
24624 + assert("nikita-2461", page != NULL);
24625 + return &get_super_private(page->mapping->host->i_sb)->tree;
24626 +}
24627 +
24628 +/* completion handler for single page bio-based read.
24629 +
24630 + mpage_end_io_read() would also do. But it's static.
24631 +
24632 +*/
24633 +static int
24634 +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24635 + int err UNUSED_ARG)
24636 +{
24637 + struct page *page;
24638 +
24639 + if (bio->bi_size != 0) {
24640 + warning("nikita-3332", "Truncated single page read: %i",
24641 + bio->bi_size);
24642 + return 1;
24643 + }
24644 +
24645 + page = bio->bi_io_vec[0].bv_page;
24646 +
24647 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24648 + SetPageUptodate(page);
24649 + } else {
24650 + ClearPageUptodate(page);
24651 + SetPageError(page);
24652 + }
24653 + unlock_page(page);
24654 + bio_put(bio);
24655 + return 0;
24656 +}
24657 +
24658 +/* completion handler for single page bio-based write.
24659 +
24660 + mpage_end_io_write() would also do. But it's static.
24661 +
24662 +*/
24663 +static int
24664 +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24665 + int err UNUSED_ARG)
24666 +{
24667 + struct page *page;
24668 +
24669 + if (bio->bi_size != 0) {
24670 + warning("nikita-3333", "Truncated single page write: %i",
24671 + bio->bi_size);
24672 + return 1;
24673 + }
24674 +
24675 + page = bio->bi_io_vec[0].bv_page;
24676 +
24677 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24678 + SetPageError(page);
24679 + end_page_writeback(page);
24680 + bio_put(bio);
24681 + return 0;
24682 +}
24683 +
24684 +/* ->readpage() method for formatted nodes */
24685 +static int formatted_readpage(struct file *f UNUSED_ARG,
24686 + struct page *page /* page to read */ )
24687 +{
24688 + assert("nikita-2412", PagePrivate(page) && jprivate(page));
24689 + return reiser4_page_io(page, jprivate(page), READ,
24690 + reiser4_ctx_gfp_mask_get());
24691 +}
24692 +
24693 +/**
24694 + * reiser4_page_io - submit single-page bio request
24695 + * @page: page to perform io for
24696 + * @node: jnode of page
24697 + * @rw: read or write
24698 + * @gfp: gfp mask for bio allocation
24699 + *
24700 + * Submits single page read or write.
24701 + */
24702 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24703 +{
24704 + struct bio *bio;
24705 + int result;
24706 +
24707 + assert("nikita-2094", page != NULL);
24708 + assert("nikita-2226", PageLocked(page));
24709 + assert("nikita-2634", node != NULL);
24710 + assert("nikita-2893", rw == READ || rw == WRITE);
24711 +
24712 + if (rw) {
24713 + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24714 + unlock_page(page);
24715 + return 0;
24716 + }
24717 + }
24718 +
24719 + bio = page_bio(page, node, rw, gfp);
24720 + if (!IS_ERR(bio)) {
24721 + if (rw == WRITE) {
24722 + set_page_writeback(page);
24723 + unlock_page(page);
24724 + }
24725 + reiser4_submit_bio(rw, bio);
24726 + result = 0;
24727 + } else {
24728 + unlock_page(page);
24729 + result = PTR_ERR(bio);
24730 + }
24731 +
24732 + return result;
24733 +}
24734 +
24735 +/* helper function to construct bio for page */
24736 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24737 +{
24738 + struct bio *bio;
24739 + assert("nikita-2092", page != NULL);
24740 + assert("nikita-2633", node != NULL);
24741 +
24742 + /* Simple implementation in the assumption that blocksize == pagesize.
24743 +
24744 + We only have to submit one block, but submit_bh() will allocate bio
24745 + anyway, so lets use all the bells-and-whistles of bio code.
24746 + */
24747 +
24748 + bio = bio_alloc(gfp, 1);
24749 + if (bio != NULL) {
24750 + int blksz;
24751 + struct super_block *super;
24752 + reiser4_block_nr blocknr;
24753 +
24754 + super = page->mapping->host->i_sb;
24755 + assert("nikita-2029", super != NULL);
24756 + blksz = super->s_blocksize;
24757 + assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24758 +
24759 + spin_lock_jnode(node);
24760 + blocknr = *jnode_get_io_block(node);
24761 + spin_unlock_jnode(node);
24762 +
24763 + assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24764 + assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24765 +
24766 + bio->bi_bdev = super->s_bdev;
24767 + /* fill bio->bi_sector before calling bio_add_page(), because
24768 + * q->merge_bvec_fn may want to inspect it (see
24769 + * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24770 + bio->bi_sector = blocknr * (blksz >> 9);
24771 +
24772 + if (!bio_add_page(bio, page, blksz, 0)) {
24773 + warning("nikita-3452",
24774 + "Single page bio cannot be constructed");
24775 + return ERR_PTR(RETERR(-EINVAL));
24776 + }
24777 +
24778 + /* bio -> bi_idx is filled by bio_init() */
24779 + bio->bi_end_io = (rw == READ) ?
24780 + end_bio_single_page_read : end_bio_single_page_write;
24781 +
24782 + return bio;
24783 + } else
24784 + return ERR_PTR(RETERR(-ENOMEM));
24785 +}
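A detail worth noting in page_bio(): bio->bi_sector is measured in 512-byte sectors, so the filesystem block number has to be scaled by blksz >> 9. A two-line userspace check of that arithmetic, assuming the block size equals a 4096-byte page as the assertion above requires:

#include <stdio.h>

int main(void)
{
        unsigned long long blocknr = 1000;      /* filesystem block number */
        int blksz = 4096;                       /* block size == page size */
        /* 512-byte sectors per block = blksz >> 9 = 8 */
        unsigned long long sector = blocknr * (blksz >> 9);

        printf("block %llu -> sector %llu\n", blocknr, sector); /* 8000 */
        return 0;
}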
24786 +
24787 +/* this function is internally called by jnode_make_dirty() */
24788 +int reiser4_set_page_dirty_internal(struct page *page)
24789 +{
24790 + struct address_space *mapping;
24791 +
24792 + mapping = page->mapping;
24793 + BUG_ON(mapping == NULL);
24794 +
24795 + if (!TestSetPageDirty(page)) {
24796 + if (mapping_cap_account_dirty(mapping))
24797 + inc_zone_page_state(page, NR_FILE_DIRTY);
24798 +
24799 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24800 + }
24801 +
24802 + /* a dirty page of the fake inode must be backed by a dirty jnode */
24803 + if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24804 + assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24805 + return 0;
24806 +}
24807 +
24808 +#if 0
24809 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24810 +{
24811 + if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24812 + return 1;
24813 + if (ctx->super != s)
24814 + return 1;
24815 + if (get_super_private(s)->entd.tsk == current)
24816 + return 0;
24817 + if (!lock_stack_isclean(&ctx->stack))
24818 + return 0;
24819 + if (ctx->trans->atom != NULL)
24820 + return 0;
24821 + return 1;
24822 +}
24823 +#endif
24824 +
24825 +/**
24826 + * reiser4_writepage - writepage of struct address_space_operations
24827 + * @page: page to write
24828 + * @wbc: writeback control
24829 + *
24830 + * Common memory pressure notification: the actual flushing is deferred
24831 + * to the ent daemon via write_page_by_ent().
24832 + */
24833 +int reiser4_writepage(struct page *page,
24834 + struct writeback_control *wbc)
24835 +{
24836 + struct super_block *s;
24837 + reiser4_context *ctx;
24838 +
24839 + assert("vs-828", PageLocked(page));
24840 +
24841 + s = page->mapping->host->i_sb;
24842 + ctx = get_current_context_check();
24843 +
24844 + //assert("", can_hit_entd(ctx, s));
24845 + return write_page_by_ent(page, wbc);
24846 +}
24847 +
24848 +/* ->set_page_dirty() method of formatted address_space */
24849 +static int formatted_set_page_dirty(struct page *page)
24850 +{
24851 + assert("nikita-2173", page != NULL);
24852 + BUG();
24853 + return __set_page_dirty_nobuffers(page);
24854 +}
24855 +
24856 +/* The writepages method of reiser4 address space operations is used to pull
24857 + pages dirtied via mmap into transactions. Only regular files can
24858 + have such pages. Fake inode is used to access formatted nodes via page
24859 + cache. As formatted nodes can never be mmaped, fake inode's writepages has
24860 + nothing to do */
24861 +static int
24862 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24863 +{
24864 + return 0;
24865 +}
24866 +
24867 +/* address space operations for the fake inode */
24868 +static struct address_space_operations formatted_fake_as_ops = {
24869 + /* Perform a writeback of a single page as a memory-freeing
24870 + * operation. */
24871 + .writepage = reiser4_writepage,
24872 + /* this is called to read formatted node */
24873 + .readpage = formatted_readpage,
24874 + /* ->sync_page() method of the fake inode address space operations.
24875 +
24876 + This is a most annoyingly misnamed method. It is actually called
24877 + from wait_on_page_bit() and lock_page(), and its purpose is to
24878 + start io by jabbing device drivers.
24879 + */
24881 + .sync_page = block_sync_page,
24882 + /* Write back some dirty pages from this mapping. Called during
24883 + sync (pdflush) */
24884 + .writepages = writepages_fake,
24885 + /* Set a page dirty */
24886 + .set_page_dirty = formatted_set_page_dirty,
24887 + /* used for read-ahead. Not applicable */
24888 + .readpages = NULL,
24889 + .prepare_write = NULL,
24890 + .commit_write = NULL,
24891 + .bmap = NULL,
24892 + /* called just before page is being detached from inode mapping and
24893 + removed from memory. Called on truncate, cut/squeeze, and
24894 + umount. */
24895 + .invalidatepage = reiser4_invalidatepage,
24896 + /* this is called by shrink_cache() so that the file system can try to
24897 + release objects (jnodes, buffers, journal heads) attached to the page,
24898 + possibly making the page itself free-able.
24899 + */
24900 + .releasepage = reiser4_releasepage,
24901 + .direct_IO = NULL
24902 +};
24903 +
24904 +/* called just before page is released (no longer used by reiser4). Callers:
24905 + jdelete() and extent2tail(). */
24906 +void reiser4_drop_page(struct page *page)
24907 +{
24908 + assert("nikita-2181", PageLocked(page));
24909 + clear_page_dirty_for_io(page);
24910 + ClearPageUptodate(page);
24911 +#if defined(PG_skipped)
24912 + ClearPageSkipped(page);
24913 +#endif
24914 + unlock_page(page);
24915 +}
24916 +
24917 +#define JNODE_GANG_SIZE (16)
24918 +
24919 +/* find all jnodes from range specified and invalidate them */
24920 +static int
24921 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24922 +{
24923 + reiser4_inode *info;
24924 + int truncated_jnodes;
24925 + reiser4_tree *tree;
24926 + unsigned long index;
24927 + unsigned long end;
24928 +
24929 + if (inode_file_plugin(inode) ==
24930 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24931 + /*
24932 + * No need to get rid of jnodes here: if the single jnode of
24933 + * page cluster did not have page, then it was found and killed
24934 + * before in
24935 + * truncate_complete_page_cluster()->jput()->jput_final(),
24936 + * otherwise it will be dropped by reiser4_invalidatepage()
24937 + */
24938 + return 0;
24939 + truncated_jnodes = 0;
24940 +
24941 + info = reiser4_inode_data(inode);
24942 + tree = reiser4_tree_by_inode(inode);
24943 +
24944 + index = from;
24945 + end = from + count;
24946 +
24947 + while (1) {
24948 + jnode *gang[JNODE_GANG_SIZE];
24949 + int taken;
24950 + int i;
24951 + jnode *node;
24952 +
24953 + assert("nikita-3466", index <= end);
24954 +
24955 + read_lock_tree(tree);
24956 + taken =
24957 + radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24958 + (void **)gang, index,
24959 + JNODE_GANG_SIZE);
24960 + for (i = 0; i < taken; ++i) {
24961 + node = gang[i];
24962 + if (index_jnode(node) < end)
24963 + jref(node);
24964 + else
24965 + gang[i] = NULL;
24966 + }
24967 + read_unlock_tree(tree);
24968 +
24969 + for (i = 0; i < taken; ++i) {
24970 + node = gang[i];
24971 + if (node != NULL) {
24972 + index = max(index, index_jnode(node));
24973 + spin_lock_jnode(node);
24974 + assert("edward-1457", node->pg == NULL);
24975 + /* this is always called after
24976 + truncate_inode_pages_range(). Therefore, here
24977 + the jnode can not have a page. New pages can not
24978 + be created because truncate_jnodes_range runs
24979 + under exclusive access to the file,
24980 + whereas new page creation requires
24981 + non-exclusive access */
24982 + JF_SET(node, JNODE_HEARD_BANSHEE);
24983 + reiser4_uncapture_jnode(node);
24984 + unhash_unformatted_jnode(node);
24985 + truncated_jnodes++;
24986 + jput(node);
24987 + } else
24988 + break;
24989 + }
24990 + if (i != taken || taken == 0)
24991 + break;
24992 + }
24993 + return truncated_jnodes;
24994 +}
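The loop above follows the usual radix-tree gang-lookup shape: take up to JNODE_GANG_SIZE entries per pass under the tree lock, reference them, drop the lock, process, and stop when a pass returns fewer entries than requested or hits the end of the range. The same control flow, modelled over a plain array in userspace with invented names:

#include <stdio.h>

#define GANG_SIZE 16

/* model: return up to max items starting at *start, advancing *start */
static int gang_lookup(const int *items, int nitems, int *start,
                       int *gang, int max)
{
        int taken = 0;

        while (*start < nitems && taken < max)
                gang[taken++] = items[(*start)++];
        return taken;
}

int main(void)
{
        int items[40], gang[GANG_SIZE];
        int i, pos = 0, passes = 0;

        for (i = 0; i < 40; i++)
                items[i] = i;

        while (1) {
                int taken = gang_lookup(items, 40, &pos, gang, GANG_SIZE);

                passes++;
                /* ... process gang[0..taken) here ... */
                if (taken < GANG_SIZE)  /* short read: range exhausted */
                        break;
        }
        printf("%d passes\n", passes);  /* 16+16+8 -> 3 passes */
        return 0;
}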
24995 +
24996 +/* Truncating files in reiser4: problems and solutions.
24997 +
24998 + VFS calls fs's truncate after it has called truncate_inode_pages()
24999 + to get rid of pages corresponding to part of file being truncated.
25000 + In reiser4 it may cause existence of unallocated extents which do
25001 + not have jnodes. Flush code does not expect that. Solution of this
25002 + problem is straightforward. As vfs's truncate is implemented using
25003 + setattr operation, it seems reasonable to have ->setattr() that
25004 + will cut file body. However, flush code also does not expect dirty
25005 + pages without parent items, so it is impossible to cut all items,
25006 + then truncate all pages in two steps. We resolve this problem by
25007 + cutting items one-by-one. Each such fine-grained step performed
25008 + under longterm znode lock calls at the end ->kill_hook() method of
25009 + a killed item to remove its bound pages and jnodes.
25010 +
25011 + The following function is a common part of mentioned kill hooks.
25012 + Also, this is called before tail-to-extent conversion (to avoid
25013 + managing several copies of the data).
25014 +*/
25015 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25016 + unsigned long count, int even_cows)
25017 +{
25018 + loff_t from_bytes, count_bytes;
25019 +
25020 + if (count == 0)
25021 + return;
25022 + from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25023 + count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25024 +
25025 + unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25026 + truncate_inode_pages_range(mapping, from_bytes,
25027 + from_bytes + count_bytes - 1);
25028 + truncate_jnodes_range(mapping->host, from, count);
25029 +}
25030 +
25031 +/*
25032 + * Local variables:
25033 + * c-indentation-style: "K&R"
25034 + * mode-name: "LC"
25035 + * c-basic-offset: 8
25036 + * tab-width: 8
25037 + * fill-column: 120
25038 + * scroll-step: 1
25039 + * End:
25040 + */
25041 diff -urN linux-2.6.22.orig/fs/reiser4/page_cache.h linux-2.6.22/fs/reiser4/page_cache.h
25042 --- linux-2.6.22.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25043 +++ linux-2.6.22/fs/reiser4/page_cache.h 2007-07-29 00:25:34.888699583 +0400
25044 @@ -0,0 +1,68 @@
25045 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25046 + * reiser4/README */
25047 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25048 +
25049 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25050 +#define __REISER4_PAGE_CACHE_H__
25051 +
25052 +#include "forward.h"
25053 +#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25054 +
25055 +#include <linux/fs.h> /* for struct super_block, address_space */
25056 +#include <linux/mm.h> /* for struct page */
25057 +#include <linux/pagemap.h> /* for lock_page() */
25058 +#include <linux/vmalloc.h> /* for __vmalloc() */
25059 +
25060 +extern int reiser4_init_formatted_fake(struct super_block *);
25061 +extern void reiser4_done_formatted_fake(struct super_block *);
25062 +
25063 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25064 +
25065 +extern int reiser4_set_page_dirty_internal(struct page *);
25066 +
25067 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25068 +
25069 +extern void reiser4_wait_page_writeback(struct page *);
25070 +static inline void lock_and_wait_page_writeback(struct page *page)
25071 +{
25072 + lock_page(page);
25073 + if (unlikely(PageWriteback(page)))
25074 + reiser4_wait_page_writeback(page);
25075 +}
25076 +
25077 +#define jprivate(page) ((jnode *)page_private(page))
25078 +
25079 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25080 +extern void reiser4_drop_page(struct page *);
25081 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25082 + unsigned long count, int even_cows);
25083 +extern void capture_reiser4_inodes(struct super_block *,
25084 + struct writeback_control *);
25085 +static inline void * reiser4_vmalloc (unsigned long size)
25086 +{
25087 + return __vmalloc(size,
25088 + reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25089 + PAGE_KERNEL);
25090 +}
25091 +
25092 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25093 +
25094 +#if REISER4_DEBUG
25095 +extern void print_page(const char *prefix, struct page *page);
25096 +#else
25097 +#define print_page(prf, p) noop
25098 +#endif
25099 +
25100 +/* __REISER4_PAGE_CACHE_H__ */
25101 +#endif
25102 +
25103 +/* Make Linus happy.
25104 + Local variables:
25105 + c-indentation-style: "K&R"
25106 + mode-name: "LC"
25107 + c-basic-offset: 8
25108 + tab-width: 8
25109 + fill-column: 120
25110 + scroll-step: 1
25111 + End:
25112 +*/
25113 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/cluster.c linux-2.6.22/fs/reiser4/plugin/cluster.c
25114 --- linux-2.6.22.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25115 +++ linux-2.6.22/fs/reiser4/plugin/cluster.c 2007-07-29 00:25:34.892700618 +0400
25116 @@ -0,0 +1,71 @@
25117 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25118 +
25119 +/* Contains reiser4 cluster plugins (see
25120 + http://www.namesys.com/cryptcompress_design.html
25121 + "Concepts of clustering" for details). */
25122 +
25123 +#include "plugin_header.h"
25124 +#include "plugin.h"
25125 +#include "../inode.h"
25126 +
25127 +static int change_cluster(struct inode *inode,
25128 + reiser4_plugin * plugin,
25129 + pset_member memb)
25130 +{
25131 + assert("edward-1324", inode != NULL);
25132 + assert("edward-1325", plugin != NULL);
25133 + assert("edward-1326", is_reiser4_inode(inode));
25134 + assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25135 +
25136 + /* Can't change the cluster plugin of already existing regular files. */
25137 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25138 + return RETERR(-EINVAL);
25139 +
25140 + /* If matches, nothing to change. */
25141 + if (inode_cluster_plugin(inode) != NULL &&
25142 + inode_cluster_plugin(inode)->h.id == plugin->h.id)
25143 + return 0;
25144 +
25145 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25146 + PSET_CLUSTER, plugin);
25147 +}
25148 +
25149 +static reiser4_plugin_ops cluster_plugin_ops = {
25150 + .init = NULL,
25151 + .load = NULL,
25152 + .save_len = NULL,
25153 + .save = NULL,
25154 + .change = &change_cluster
25155 +};
25156 +
25157 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25158 + [CLUSTER_ ## ID ## _ID] = { \
25159 + .h = { \
25160 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25161 + .id = CLUSTER_ ## ID ## _ID, \
25162 + .pops = &cluster_plugin_ops, \
25163 + .label = LABEL, \
25164 + .desc = DESC, \
25165 + .linkage = {NULL, NULL} \
25166 + }, \
25167 + .shift = SHIFT \
25168 + }
25169 +
25170 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25171 + SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25172 + SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25173 + SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25174 + SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25175 + SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25176 +};
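For readers tracing the macro: SUPPORT_CLUSTER(16, 64K, "64K", "Large") expands to a designated initializer at index CLUSTER_64K_ID whose .shift is 16, so the cluster size is 1 << 16 bytes. The same table-building idiom, reduced to a self-contained userspace example with invented types:

#include <stdio.h>

enum { CL_64K_ID, CL_4K_ID, LAST_CL_ID };

struct cl_plugin { const char *label; int shift; };

#define SUPPORT_CL(SHIFT, ID, LABEL) \
        [CL_ ## ID ## _ID] = { .label = LABEL, .shift = SHIFT }

static const struct cl_plugin cl_plugins[LAST_CL_ID] = {
        SUPPORT_CL(16, 64K, "64K"),
        SUPPORT_CL(12, 4K, "4K")
};

int main(void)
{
        int i;

        for (i = 0; i < LAST_CL_ID; i++)
                printf("%s: cluster = %u bytes\n",
                       cl_plugins[i].label, 1u << cl_plugins[i].shift);
        return 0;
}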
25177 +
25178 +/*
25179 + Local variables:
25180 + c-indentation-style: "K&R"
25181 + mode-name: "LC"
25182 + c-basic-offset: 8
25183 + tab-width: 8
25184 + fill-column: 120
25185 + scroll-step: 1
25186 + End:
25187 +*/
25188 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/cluster.h linux-2.6.22/fs/reiser4/plugin/cluster.h
25189 --- linux-2.6.22.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25190 +++ linux-2.6.22/fs/reiser4/plugin/cluster.h 2007-07-29 00:25:34.892700618 +0400
25191 @@ -0,0 +1,399 @@
25192 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25193 +
25194 +/* This file contains size/offset translators, modulators
25195 + and other helper functions. */
25196 +
25197 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25198 +#define __FS_REISER4_CLUSTER_H__
25199 +
25200 +#include "../inode.h"
25201 +
25202 +static inline int inode_cluster_shift(struct inode *inode)
25203 +{
25204 + assert("edward-92", inode != NULL);
25205 + assert("edward-93", reiser4_inode_data(inode) != NULL);
25206 +
25207 + return inode_cluster_plugin(inode)->shift;
25208 +}
25209 +
25210 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25211 +{
25212 + return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25213 +}
25214 +
25215 +/* cluster size in page units */
25216 +static inline unsigned cluster_nrpages(struct inode *inode)
25217 +{
25218 + return 1U << cluster_nrpages_shift(inode);
25219 +}
25220 +
25221 +static inline size_t inode_cluster_size(struct inode *inode)
25222 +{
25223 + assert("edward-96", inode != NULL);
25224 +
25225 + return 1U << inode_cluster_shift(inode);
25226 +}
25227 +
25228 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25229 +{
25230 + return idx >> cluster_nrpages_shift(inode);
25231 +}
25232 +
25233 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25234 +{
25235 + return idx << cluster_nrpages_shift(inode);
25236 +}
25237 +
25238 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25239 +{
25240 + return clust_to_pg(pg_to_clust(idx, inode), inode);
25241 +}
25242 +
25243 +static inline pgoff_t off_to_pg(loff_t off)
25244 +{
25245 + return (off >> PAGE_CACHE_SHIFT);
25246 +}
25247 +
25248 +static inline loff_t pg_to_off(pgoff_t idx)
25249 +{
25250 + return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25251 +}
25252 +
25253 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25254 +{
25255 + return off >> inode_cluster_shift(inode);
25256 +}
25257 +
25258 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25259 +{
25260 + return (loff_t) idx << inode_cluster_shift(inode);
25261 +}
25262 +
25263 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25264 +{
25265 + return clust_to_off(off_to_clust(off, inode), inode);
25266 +}
25267 +
25268 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25269 +{
25270 + return clust_to_pg(off_to_clust(off, inode), inode);
25271 +}
25272 +
25273 +static inline unsigned off_to_pgoff(loff_t off)
25274 +{
25275 + return off & (PAGE_CACHE_SIZE - 1);
25276 +}
25277 +
25278 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25279 +{
25280 + return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25281 +}
25282 +
25283 +static inline pgoff_t offset_in_clust(struct page * page)
25284 +{
25285 + assert("edward-1488", page != NULL);
25286 + assert("edward-1489", page->mapping != NULL);
25287 +
25288 + return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25289 +}
25290 +
25291 +static inline int first_page_in_cluster(struct page * page)
25292 +{
25293 + return offset_in_clust(page) == 0;
25294 +}
25295 +
25296 +static inline int last_page_in_cluster(struct page * page)
25297 +{
25298 + return offset_in_clust(page) ==
25299 + cluster_nrpages(page->mapping->host) - 1;
25300 +}
25301 +
25302 +static inline unsigned
25303 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25304 +{
25305 + return off_to_cloff(pg_to_off(idx), inode);
25306 +}
25307 +
25308 +/*********************** Size translators **************************/
25309 +
25310 +/* Translate linear size.
25311 + * New units are (1 << @blkbits) times larger than old ones.
25312 + * In other words, calculate the number of logical blocks occupied
25313 + * by @count elements
25314 + */
25315 +static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25316 +{
25317 + return (count + (1UL << blkbits) - 1) >> blkbits;
25318 +}
25319 +
25320 +/* size in pages */
25321 +static inline pgoff_t size_in_pages(loff_t size)
25322 +{
25323 + return size_in_blocks(size, PAGE_CACHE_SHIFT);
25324 +}
25325 +
25326 +/* size in logical clusters */
25327 +static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25328 +{
25329 + return size_in_blocks(size, inode_cluster_shift(inode));
25330 +}
25331 +
25332 +/* size in pages to the size in page clusters */
25333 +static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25334 +{
25335 + return size_in_blocks(size, cluster_nrpages_shift(inode));
25336 +}
25337 +
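All translators above are pure shift arithmetic over two constants: PAGE_CACHE_SHIFT and the per-inode cluster shift. With 4K pages (shift 12) and a 64K cluster (shift 16) the relations can be checked stand-alone; the constants below are local stand-ins, not the kernel macros:

#include <assert.h>
#include <stdio.h>

#define PG_SHIFT 12                     /* 4K pages */
#define CL_SHIFT 16                     /* 64K logical clusters */
#define NRPAGES_SHIFT (CL_SHIFT - PG_SHIFT)

int main(void)
{
        unsigned long long off = 70000; /* a byte offset inside cluster 1 */

        assert((off >> CL_SHIFT) == 1);          /* off_to_clust */
        assert((off >> PG_SHIFT) == 17);         /* off_to_pg */
        assert((17ULL >> NRPAGES_SHIFT) == 1);   /* pg_to_clust */
        assert((1ULL << NRPAGES_SHIFT) == 16);   /* cluster_nrpages */
        /* size_in_lc(): round a size up to whole clusters */
        assert(((off + (1ULL << CL_SHIFT) - 1) >> CL_SHIFT) == 2);
        printf("ok\n");
        return 0;
}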
25338 +/*********************** Size modulators ***************************/
25339 +
25340 +/*
25341 + Modulate linear size by nominated block size and offset.
25342 +
25343 + The "finite" function (which is zero almost everywhere).
25344 + How much is a height of the figure at a position @pos,
25345 + when trying to construct rectangle of height (1 << @blkbits),
25346 + and square @size.
25347 +
25348 + ******
25349 + *******
25350 + *******
25351 + *******
25352 + ----------> pos
25353 +*/
25354 +static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25355 +{
25356 + unsigned end = size >> blkbits;
25357 + if (pos < end)
25358 + return 1U << blkbits;
25359 + if (unlikely(pos > end))
25360 + return 0;
25361 + return size & ~(~0ull << blkbits);
25362 +}
25363 +
25364 +/* the same as above, but block size is page size */
25365 +static inline unsigned __mbp(loff_t size, pgoff_t pos)
25366 +{
25367 + return __mbb(size, pos, PAGE_CACHE_SHIFT);
25368 +}
25369 +
25370 +/* number of file's bytes in the nominated logical cluster */
25371 +static inline unsigned lbytes(cloff_t index, struct inode * inode)
25372 +{
25373 + return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25374 +}
25375 +
25376 +/* number of file's bytes in the nominated page */
25377 +static inline unsigned pbytes(pgoff_t index, struct inode * inode)
25378 +{
25379 + return __mbp(i_size_read(inode), index);
25380 +}
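In words, __mbb() answers: how many bytes of a file of length @size fall into block number @pos of size 1 << @blkbits? Full blocks yield the block size, the partial tail block yields the remainder, and blocks past EOF yield zero. A quick userspace check mirroring pbytes() with 4K pages:

#include <assert.h>
#include <stdio.h>

static unsigned mbb(unsigned long long size, unsigned long pos, int blkbits)
{
        unsigned long long end = size >> blkbits;

        if (pos < end)
                return 1u << blkbits;           /* fully occupied block */
        if (pos > end)
                return 0;                       /* past EOF */
        return (unsigned)(size & ~(~0ull << blkbits)); /* partial tail */
}

int main(void)
{
        /* file of 10000 bytes, 4K pages: pages 0,1 full, page 2 partial */
        assert(mbb(10000, 0, 12) == 4096);
        assert(mbb(10000, 1, 12) == 4096);
        assert(mbb(10000, 2, 12) == 10000 - 2 * 4096);  /* 1808 */
        assert(mbb(10000, 3, 12) == 0);
        printf("ok\n");
        return 0;
}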
25381 +
25382 +/* return true, if logical cluster is not occupied by the file */
25383 +static inline int new_logical_cluster(struct cluster_handle * clust,
25384 + struct inode *inode)
25385 +{
25386 + return clust_to_off(clust->index, inode) >= i_size_read(inode);
25387 +}
25388 +
25389 +/* return true, if pages @p1 and @p2 are of the same page cluster */
25390 +static inline int same_page_cluster(struct page * p1, struct page * p2)
25391 +{
25392 + assert("edward-1490", p1 != NULL);
25393 + assert("edward-1491", p2 != NULL);
25394 + assert("edward-1492", p1->mapping != NULL);
25395 + assert("edward-1493", p2->mapping != NULL);
25396 +
25397 + return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25398 + pg_to_clust(page_index(p2), p2->mapping->host));
25399 +}
25400 +
25401 +static inline int cluster_is_complete(struct cluster_handle * clust,
25402 + struct inode * inode)
25403 +{
25404 + return clust->tc.lsize == inode_cluster_size(inode);
25405 +}
25406 +
25407 +static inline void reiser4_slide_init(struct reiser4_slide * win)
25408 +{
25409 + assert("edward-1084", win != NULL);
25410 + memset(win, 0, sizeof *win);
25411 +}
25412 +
25413 +static inline tfm_action
25414 +cluster_get_tfm_act(struct tfm_cluster * tc)
25415 +{
25416 + assert("edward-1356", tc != NULL);
25417 + return tc->act;
25418 +}
25419 +
25420 +static inline void
25421 +cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act)
25422 +{
25423 + assert("edward-1356", tc != NULL);
25424 + tc->act = act;
25425 +}
25426 +
25427 +static inline void cluster_init_act(struct cluster_handle * clust,
25428 + tfm_action act,
25429 + struct reiser4_slide * window)
25430 +{
25431 + assert("edward-84", clust != NULL);
25432 + memset(clust, 0, sizeof *clust);
25433 + cluster_set_tfm_act(&clust->tc, act);
25434 + clust->dstat = INVAL_DISK_CLUSTER;
25435 + clust->win = window;
25436 +}
25437 +
25438 +static inline void cluster_init_read(struct cluster_handle * clust,
25439 + struct reiser4_slide * window)
25440 +{
25441 + cluster_init_act (clust, TFMA_READ, window);
25442 +}
25443 +
25444 +static inline void cluster_init_write(struct cluster_handle * clust,
25445 + struct reiser4_slide * window)
25446 +{
25447 + cluster_init_act (clust, TFMA_WRITE, window);
25448 +}
25449 +
25450 +/* true if @p1 and @p2 are items of the same disk cluster */
25451 +static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2)
25452 +{
25453 + /* drop this if you have other items to aggregate */
25454 + assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25455 +
25456 + return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25457 +}
25458 +
25459 +static inline int dclust_get_extension_dsize(hint_t * hint)
25460 +{
25461 + return hint->ext_coord.extension.ctail.dsize;
25462 +}
25463 +
25464 +static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25465 +{
25466 + hint->ext_coord.extension.ctail.dsize = dsize;
25467 +}
25468 +
25469 +static inline int dclust_get_extension_shift(hint_t * hint)
25470 +{
25471 + return hint->ext_coord.extension.ctail.shift;
25472 +}
25473 +
25474 +static inline int dclust_get_extension_ncount(hint_t * hint)
25475 +{
25476 + return hint->ext_coord.extension.ctail.ncount;
25477 +}
25478 +
25479 +static inline void dclust_inc_extension_ncount(hint_t * hint)
25480 +{
25481 + hint->ext_coord.extension.ctail.ncount ++;
25482 +}
25483 +
25484 +static inline void dclust_init_extension(hint_t * hint)
25485 +{
25486 + memset(&hint->ext_coord.extension.ctail, 0,
25487 + sizeof(hint->ext_coord.extension.ctail));
25488 +}
25489 +
25490 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25491 +{
25492 + assert("edward-1451", hint_is_valid(hint));
25493 + return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25494 +}
25495 +
25496 +static inline void coord_set_between_clusters(coord_t * coord)
25497 +{
25498 +#if REISER4_DEBUG
25499 + int result;
25500 + result = zload(coord->node);
25501 + assert("edward-1296", !result);
25502 +#endif
25503 + if (!coord_is_between_items(coord)) {
25504 + coord->between = AFTER_ITEM;
25505 + coord->unit_pos = 0;
25506 + }
25507 +#if REISER4_DEBUG
25508 + zrelse(coord->node);
25509 +#endif
25510 +}
25511 +
25512 +int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25513 +int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25514 + znode_lock_mode mode);
25515 +int checkout_logical_cluster(struct cluster_handle *, jnode *, struct inode *);
25516 +int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25517 +void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25518 + int even_cows);
25519 +void invalidate_hint_cluster(struct cluster_handle * clust);
25520 +void put_hint_cluster(struct cluster_handle * clust, struct inode *inode,
25521 + znode_lock_mode mode);
25522 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode,
25523 + znode_lock_mode lock_mode);
25524 +void reset_cluster_params(struct cluster_handle * clust);
25525 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
25526 + int count);
25527 +int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust,
25528 + rw_op rw);
25529 +void __put_page_cluster(int from, int to, struct page ** pages,
25530 + struct inode * inode);
25531 +void put_page_cluster(struct cluster_handle * clust,
25532 + struct inode * inode, rw_op rw);
25533 +void put_cluster_handle(struct cluster_handle * clust);
25534 +int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id);
25535 +int tfm_cluster_is_uptodate(struct tfm_cluster * tc);
25536 +void tfm_cluster_set_uptodate(struct tfm_cluster * tc);
25537 +void tfm_cluster_clr_uptodate(struct tfm_cluster * tc);
25538 +
25539 +/* move cluster handle to the target position
25540 + specified by the page of index @pgidx */
25541 +static inline void move_cluster_forward(struct cluster_handle * clust,
25542 + struct inode *inode,
25543 + pgoff_t pgidx)
25544 +{
25545 + assert("edward-1297", clust != NULL);
25546 + assert("edward-1298", inode != NULL);
25547 +
25548 + reset_cluster_params(clust);
25549 + if (clust->index_valid &&
25550 + /* Hole in the indices. Hint became invalid and can not be
25551 + used by find_cluster_item() even if seal/node versions
25552 + coincide */
25553 + pg_to_clust(pgidx, inode) != clust->index + 1) {
25554 + reiser4_unset_hint(clust->hint);
25555 + invalidate_hint_cluster(clust);
25556 + }
25557 + clust->index = pg_to_clust(pgidx, inode);
25558 + clust->index_valid = 1;
25559 +}
25560 +
25561 +static inline int alloc_clust_pages(struct cluster_handle * clust,
25562 + struct inode *inode)
25563 +{
25564 + assert("edward-791", clust != NULL);
25565 + assert("edward-792", inode != NULL);
25566 + clust->pages =
25567 + kmalloc(sizeof(*clust->pages) << cluster_nrpages_shift(inode),
25568 + reiser4_ctx_gfp_mask_get());
25569 + if (!clust->pages)
25570 + return -ENOMEM;
25571 + return 0;
25572 +}
25573 +
25574 +static inline void free_clust_pages(struct cluster_handle * clust)
25575 +{
25576 + kfree(clust->pages);
25577 +}
25578 +
25579 +#endif /* __FS_REISER4_CLUSTER_H__ */
25580 +
25581 +/* Make Linus happy.
25582 + Local variables:
25583 + c-indentation-style: "K&R"
25584 + mode-name: "LC"
25585 + c-basic-offset: 8
25586 + tab-width: 8
25587 + fill-column: 120
25588 + scroll-step: 1
25589 + End:
25590 +*/
25591 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.22/fs/reiser4/plugin/compress/compress.c
25592 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25593 +++ linux-2.6.22/fs/reiser4/plugin/compress/compress.c 2007-07-29 00:25:34.892700618 +0400
25594 @@ -0,0 +1,381 @@
25595 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25596 +/* reiser4 compression transform plugins */
25597 +
25598 +#include "../../debug.h"
25599 +#include "../../inode.h"
25600 +#include "../plugin.h"
25601 +#include "minilzo.h"
25602 +
25603 +#include <linux/zlib.h>
25604 +#include <linux/types.h>
25605 +#include <linux/hardirq.h>
25606 +
25607 +static int change_compression(struct inode *inode,
25608 + reiser4_plugin * plugin,
25609 + pset_member memb)
25610 +{
25611 + assert("edward-1316", inode != NULL);
25612 + assert("edward-1317", plugin != NULL);
25613 + assert("edward-1318", is_reiser4_inode(inode));
25614 + assert("edward-1319",
25615 + plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25616 +
25617 + /* cannot change compression plugin of already existing regular object */
25618 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25619 + return RETERR(-EINVAL);
25620 +
25621 + /* If matches, nothing to change. */
25622 + if (inode_compression_plugin(inode) != NULL &&
25623 + inode_compression_plugin(inode)->h.id == plugin->h.id)
25624 + return 0;
25625 +
25626 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25627 + PSET_COMPRESSION, plugin);
25628 +}
25629 +
25630 +static reiser4_plugin_ops compression_plugin_ops = {
25631 + .init = NULL,
25632 + .load = NULL,
25633 + .save_len = NULL,
25634 + .save = NULL,
25635 + .change = &change_compression
25636 +};
25637 +
25638 +/******************************************************************************/
25639 +/* gzip1 compression */
25640 +/******************************************************************************/
25641 +
25642 +#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25643 +#define GZIP1_DEF_WINBITS 15
25644 +#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25645 +
25646 +static int gzip1_init(void)
25647 +{
25648 + int ret = -EINVAL;
25649 +#if REISER4_ZLIB
25650 + ret = 0;
25651 +#endif
25652 + if (ret == -EINVAL)
25653 + warning("edward-1337", "Zlib not compiled into kernel");
25654 + return ret;
25655 +}
25656 +
25657 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25658 +{
25659 + return 0;
25660 +}
25661 +
25662 +static coa_t gzip1_alloc(tfm_action act)
25663 +{
25664 + coa_t coa = NULL;
25665 +#if REISER4_ZLIB
25666 + int ret = 0;
25667 + switch (act) {
25668 + case TFMA_WRITE: /* compress */
25669 + coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25670 + if (!coa) {
25671 + ret = -ENOMEM;
25672 + break;
25673 + }
25674 + memset(coa, 0, zlib_deflate_workspacesize());
25675 + break;
25676 + case TFMA_READ: /* decompress */
25677 + coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25678 + if (!coa) {
25679 + ret = -ENOMEM;
25680 + break;
25681 + }
25682 + memset(coa, 0, zlib_inflate_workspacesize());
25683 + break;
25684 + default:
25685 + impossible("edward-767",
25686 + "trying to alloc workspace for unknown tfm action");
25687 + }
25688 + if (ret) {
25689 + warning("edward-768",
25690 + "alloc workspace for gzip1 (tfm action = %d) failed\n",
25691 + act);
25692 + return ERR_PTR(ret);
25693 + }
25694 +#endif
25695 + return coa;
25696 +}
25697 +
25698 +static void gzip1_free(coa_t coa, tfm_action act)
25699 +{
25700 + assert("edward-769", coa != NULL);
25701 +
25702 + switch (act) {
25703 + case TFMA_WRITE: /* compress */
25704 + vfree(coa);
25705 + break;
25706 + case TFMA_READ: /* decompress */
25707 + vfree(coa);
25708 + break;
25709 + default:
25710 + impossible("edward-770", "unknown tfm action");
25711 + }
25712 + return;
25713 +}
25714 +
25715 +static int gzip1_min_size_deflate(void)
25716 +{
25717 + return 64;
25718 +}
25719 +
25720 +static void
25721 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25722 + __u8 * dst_first, unsigned *dst_len)
25723 +{
25724 +#if REISER4_ZLIB
25725 + int ret = 0;
25726 + struct z_stream_s stream;
25727 +
25728 + memset(&stream, 0, sizeof(stream));
25729 +
25730 + assert("edward-842", coa != NULL);
25731 + assert("edward-875", src_len != 0);
25732 +
25733 + stream.workspace = coa;
25734 + ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25735 + -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25736 + Z_DEFAULT_STRATEGY);
25737 + if (ret != Z_OK) {
25738 + warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25739 + goto rollback;
25740 + }
25741 + ret = zlib_deflateReset(&stream);
25742 + if (ret != Z_OK) {
25743 + warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25744 + goto rollback;
25745 + }
25746 + stream.next_in = src_first;
25747 + stream.avail_in = src_len;
25748 + stream.next_out = dst_first;
25749 + stream.avail_out = *dst_len;
25750 +
25751 + ret = zlib_deflate(&stream, Z_FINISH);
25752 + if (ret != Z_STREAM_END) {
25753 + if (ret != Z_OK)
25754 + warning("edward-773",
25755 + "zlib_deflate returned %d\n", ret);
25756 + goto rollback;
25757 + }
25758 + *dst_len = stream.total_out;
25759 + return;
25760 + rollback:
25761 + *dst_len = src_len;
25762 +#endif
25763 + return;
25764 +}
25765 +
25766 +static void
25767 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25768 + __u8 * dst_first, unsigned *dst_len)
25769 +{
25770 +#if REISER4_ZLIB
25771 + int ret = 0;
25772 + struct z_stream_s stream;
25773 +
25774 + memset(&stream, 0, sizeof(stream));
25775 +
25776 + assert("edward-843", coa != NULL);
25777 + assert("edward-876", src_len != 0);
25778 +
25779 + stream.workspace = coa;
25780 + ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25781 + if (ret != Z_OK) {
25782 + warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25783 + return;
25784 + }
25785 + ret = zlib_inflateReset(&stream);
25786 + if (ret != Z_OK) {
25787 + warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25788 + return;
25789 + }
25790 +
25791 + stream.next_in = src_first;
25792 + stream.avail_in = src_len;
25793 + stream.next_out = dst_first;
25794 + stream.avail_out = *dst_len;
25795 +
25796 + ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25797 + /*
25798 + * Work around a bug in zlib, which sometimes wants to taste an extra
25799 + * byte when being used in the (undocumented) raw deflate mode.
25800 + * (From USAGI).
25801 + */
25802 + if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25803 + u8 zerostuff = 0;
25804 + stream.next_in = &zerostuff;
25805 + stream.avail_in = 1;
25806 + ret = zlib_inflate(&stream, Z_FINISH);
25807 + }
25808 + if (ret != Z_STREAM_END) {
25809 + warning("edward-776", "zlib_inflate returned %d\n", ret);
25810 + return;
25811 + }
25812 + *dst_len = stream.total_out;
25813 +#endif
25814 + return;
25815 +}
25816 +
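For illustration, a minimal userspace sketch of the raw-deflate convention used by gzip1_compress()/gzip1_decompress() above: a negative windowBits value selects headerless ("raw") streams, matching the -GZIP1_DEF_WINBITS argument. This uses stock zlib.h rather than the kernel zlib_* wrappers; the helper name is hypothetical and error handling is pared down. A matching decompressor would open with inflateInit2(&s, -15).

#include <zlib.h>
#include <string.h>

static int raw_deflate_sketch(const unsigned char *src, unsigned src_len,
			      unsigned char *dst, unsigned *dst_len)
{
	z_stream s;

	memset(&s, 0, sizeof(s));
	/* negative windowBits = raw deflate; memLevel 8 is the usual default */
	if (deflateInit2(&s, Z_BEST_SPEED, Z_DEFLATED, -15, 8,
			 Z_DEFAULT_STRATEGY) != Z_OK)
		return -1;
	s.next_in = (unsigned char *)src;
	s.avail_in = src_len;
	s.next_out = dst;
	s.avail_out = *dst_len;
	if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
		/* output did not fit: treat the data as incompressible */
		deflateEnd(&s);
		return -1;
	}
	*dst_len = s.total_out;
	return deflateEnd(&s) == Z_OK ? 0 : -1;
}
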
25817 +/******************************************************************************/
25818 +/* lzo1 compression */
25819 +/******************************************************************************/
25820 +
25821 +static int lzo1_init(void)
25822 +{
25823 + int ret;
25824 + ret = lzo_init();
25825 + if (ret != LZO_E_OK)
25826 + warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
25827 + return ret;
25828 +}
25829 +
25830 +static int lzo1_overrun(unsigned in_len)
25831 +{
25832 + return in_len / 64 + 16 + 3;
25833 +}
25834 +
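A worked example of the headroom this bound provides, as a hypothetical standalone program (64 KiB being the default reiser4 logical cluster size):

#include <stdio.h>

/* the same bound as lzo1_overrun() above */
static unsigned overrun(unsigned in_len)
{
	return in_len / 64 + 16 + 3;
}

int main(void)
{
	unsigned cluster = 64 * 1024;	/* 65536-byte logical cluster */

	/* 65536/64 + 16 + 3 = 1043 extra bytes, so a 66579-byte
	 * destination stream cannot be overrun */
	printf("%u\n", cluster + overrun(cluster));
	return 0;
}
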
25835 +#define LZO_HEAP_SIZE(size) \
25836 + sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
25837 +
25838 +static coa_t lzo1_alloc(tfm_action act)
25839 +{
25840 + int ret = 0;
25841 + coa_t coa = NULL;
25842 +
25843 + switch (act) {
25844 + case TFMA_WRITE: /* compress */
25845 + coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25846 + if (!coa) {
25847 + ret = -ENOMEM;
25848 + break;
25849 + }
25850 + memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25851 + case TFMA_READ: /* decompress: needs no workspace; write case falls through */
25852 + break;
25853 + default:
25854 + impossible("edward-877",
25855 + "trying to alloc workspace for unknown tfm action");
25856 + }
25857 + if (ret) {
25858 + warning("edward-878",
25859 + "alloc workspace for lzo1 (tfm action = %d) failed\n",
25860 + act);
25861 + return ERR_PTR(ret);
25862 + }
25863 + return coa;
25864 +}
25865 +
25866 +static void lzo1_free(coa_t coa, tfm_action act)
25867 +{
25868 + assert("edward-879", coa != NULL);
25869 +
25870 + switch (act) {
25871 + case TFMA_WRITE: /* compress */
25872 + vfree(coa);
25873 + break;
25874 + case TFMA_READ: /* decompress */
25875 + impossible("edward-1304",
25876 + "trying to free non-allocated workspace");
25877 + default:
25878 + impossible("edward-880", "unknown tfm action");
25879 + }
25880 + return;
25881 +}
25882 +
25883 +static int lzo1_min_size_deflate(void)
25884 +{
25885 + return 256;
25886 +}
25887 +
25888 +static void
25889 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25890 + __u8 * dst_first, unsigned *dst_len)
25891 +{
25892 + int result;
25893 +
25894 + assert("edward-846", coa != NULL);
25895 + assert("edward-847", src_len != 0);
25896 +
25897 + result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
25898 + if (result != LZO_E_OK) {
25899 + warning("edward-849", "lzo1x_1_compress failed\n");
25900 + goto out;
25901 + }
25902 + if (*dst_len >= src_len) {
25903 + /* incompressible data: fall back to the uncompressed cluster
25904 + (warning "edward-850" intentionally disabled) */
25904 + goto out;
25905 + }
25906 + return;
25907 + out:
25908 + *dst_len = src_len;
25909 + return;
25910 +}
25911 +
25912 +static void
25913 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25914 + __u8 * dst_first, unsigned *dst_len)
25915 +{
25916 + int result;
25917 +
25918 + assert("edward-851", coa == NULL);
25919 + assert("edward-852", src_len != 0);
25920 +
25921 + result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
25922 + if (result != LZO_E_OK)
25923 + warning("edward-853", "lzo1x_1_decompress failed\n");
25924 + return;
25925 +}
25926 +
25927 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25928 + [LZO1_COMPRESSION_ID] = {
25929 + .h = {
25930 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25931 + .id = LZO1_COMPRESSION_ID,
25932 + .pops = &compression_plugin_ops,
25933 + .label = "lzo1",
25934 + .desc = "lzo1 compression transform",
25935 + .linkage = {NULL, NULL}
25936 + },
25937 + .init = lzo1_init,
25938 + .overrun = lzo1_overrun,
25939 + .alloc = lzo1_alloc,
25940 + .free = lzo1_free,
25941 + .min_size_deflate = lzo1_min_size_deflate,
25942 + .checksum = reiser4_adler32,
25943 + .compress = lzo1_compress,
25944 + .decompress = lzo1_decompress
25945 + },
25946 + [GZIP1_COMPRESSION_ID] = {
25947 + .h = {
25948 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25949 + .id = GZIP1_COMPRESSION_ID,
25950 + .pops = &compression_plugin_ops,
25951 + .label = "gzip1",
25952 + .desc = "gzip1 compression transform",
25953 + .linkage = {NULL, NULL}
25954 + },
25955 + .init = gzip1_init,
25956 + .overrun = gzip1_overrun,
25957 + .alloc = gzip1_alloc,
25958 + .free = gzip1_free,
25959 + .min_size_deflate = gzip1_min_size_deflate,
25960 + .checksum = reiser4_adler32,
25961 + .compress = gzip1_compress,
25962 + .decompress = gzip1_decompress
25963 + }
25964 +};
25965 +
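A hypothetical caller-side sketch of how this table is dispatched (the real call sites live in the cryptcompress file plugin; src, dst and src_len below are assumed to be set up by the caller, and dst must include overrun() headroom):

/* hypothetical dispatch through the table above */
compression_plugin *cplug = &compression_plugins[LZO1_COMPRESSION_ID];
unsigned dst_len = src_len + cplug->overrun(src_len);	/* dst headroom */
coa_t wksp = cplug->alloc(TFMA_WRITE);

if (wksp != NULL && !IS_ERR(wksp)) {
	cplug->compress(wksp, src, src_len, dst, &dst_len);
	/* dst_len >= src_len on return signals incompressible data */
	cplug->free(wksp, TFMA_WRITE);
}
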
25966 +/*
25967 + Local variables:
25968 + c-indentation-style: "K&R"
25969 + mode-name: "LC"
25970 + c-basic-offset: 8
25971 + tab-width: 8
25972 + fill-column: 120
25973 + scroll-step: 1
25974 + End:
25975 +*/
25976 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.22/fs/reiser4/plugin/compress/compress.h
25977 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
25978 +++ linux-2.6.22/fs/reiser4/plugin/compress/compress.h 2007-07-29 00:25:34.892700618 +0400
25979 @@ -0,0 +1,43 @@
25980 +#if !defined( __FS_REISER4_COMPRESS_H__ )
25981 +#define __FS_REISER4_COMPRESS_H__
25982 +
25983 +#include <linux/types.h>
25984 +#include <linux/string.h>
25985 +
25986 +/* transform direction */
25987 +typedef enum {
25988 + TFMA_READ, /* decrypt, decompress */
25989 + TFMA_WRITE, /* encrypt, compress */
25990 + TFMA_LAST
25991 +} tfm_action;
25992 +
25993 +/* supported compression algorithms */
25994 +typedef enum {
25995 + LZO1_COMPRESSION_ID,
25996 + GZIP1_COMPRESSION_ID,
25997 + LAST_COMPRESSION_ID,
25998 +} reiser4_compression_id;
25999 +
26000 +/* the same as pgoff_t, but units are page clusters */
26001 +typedef unsigned long cloff_t;
26002 +
26003 +/* working data of a (de)compression algorithm */
26004 +typedef void *coa_t;
26005 +
26006 +/* table for all supported (de)compression algorithms */
26007 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26008 +
26009 +__u32 reiser4_adler32(char *data, __u32 len);
26010 +
26011 +#endif /* __FS_REISER4_COMPRESS_H__ */
26012 +
26013 +/* Make Linus happy.
26014 + Local variables:
26015 + c-indentation-style: "K&R"
26016 + mode-name: "LC"
26017 + c-basic-offset: 8
26018 + tab-width: 8
26019 + fill-column: 120
26020 + scroll-step: 1
26021 + End:
26022 +*/
26023 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.22/fs/reiser4/plugin/compress/compress_mode.c
26024 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
26025 +++ linux-2.6.22/fs/reiser4/plugin/compress/compress_mode.c 2007-07-29 00:25:34.892700618 +0400
26026 @@ -0,0 +1,162 @@
26027 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26028 +/* This file contains Reiser4 compression mode plugins.
26029 +
26030 + A compression mode plugin is a set of handlers called by the
26031 + compressor at flush time. They implement heuristics, including
26032 + ones that avoid compressing incompressible data; see
26033 + http://www.namesys.com/cryptcompress_design.html for more details.
26034 +*/
26035 +#include "../../inode.h"
26036 +#include "../plugin.h"
26037 +
26038 +static int should_deflate_none(struct inode * inode, cloff_t index)
26039 +{
26040 + return 0;
26041 +}
26042 +
26043 +static int should_deflate_common(struct inode * inode, cloff_t index)
26044 +{
26045 + return compression_is_on(cryptcompress_inode_data(inode));
26046 +}
26047 +
26048 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
26049 +{
26050 + turn_off_compression(cryptcompress_inode_data(inode));
26051 + return 0;
26052 +}
26053 +
26054 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
26055 +{
26056 + struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26057 +
26058 + assert("edward-1462",
26059 + get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26060 + get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26061 +
26062 + turn_off_compression(info);
26063 + if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26064 + set_lattice_factor(info, get_lattice_factor(info) << 1);
26065 + return 0;
26066 +}
26067 +
26068 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
26069 +{
26070 + turn_on_compression(cryptcompress_inode_data(inode));
26071 + set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26072 + return 0;
26073 +}
26074 +
26075 +/* "Check on dynamic lattice" is the adaptive compression mode that
26076 + defines the following behavior:
26077 +
26078 + Compression is on: try to compress everything, and turn it
26079 + off whenever a cluster turns out to be incompressible.
26080 +
26081 + Compression is off: try to compress only clusters of index
26082 + k * FACTOR (k = 0, 1, 2, ...) and turn it back on if one of
26083 + them is compressible. If incompressible, increase FACTOR */
26084 +
26085 +/* check if @index belongs to one-dimensional lattice
26086 + of sparse factor @factor */
26087 +static int is_on_lattice(cloff_t index, int factor)
26088 +{
26089 + return (factor ? index % factor == 0: index == 0);
26090 +}
26091 +
26092 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
26093 +{
26094 + return should_deflate_common(inode, index) ||
26095 + is_on_lattice(index,
26096 + get_lattice_factor
26097 + (cryptcompress_inode_data(inode)));
26098 +}
26099 +
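A userspace sketch of this policy; the factor bounds are hypothetical stand-ins for MIN_LATTICE_FACTOR/MAX_LATTICE_FACTOR and the flag stands in for compression_is_on(), whose real state lives in cryptcompress_info:

static int lat_factor = 2;	/* stand-in for MIN_LATTICE_FACTOR */
static int lat_on = 1;		/* stand-in for compression_is_on() */

/* mirrors should_deflate_lattd(): while off, probe lattice points only */
static int lattd_should_deflate(unsigned long index)
{
	return lat_on ||
	       (lat_factor ? index % lat_factor == 0 : index == 0);
}

/* mirrors accept_hook_lattd() / discard_hook_lattd() */
static void lattd_feedback(int compressible)
{
	if (compressible) {
		lat_on = 1;
		lat_factor = 2;
	} else {
		lat_on = 0;
		if (lat_factor < 1024)	/* stand-in for MAX_LATTICE_FACTOR */
			lat_factor <<= 1;
	}
}
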
26100 +/* compression mode_plugins */
26101 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26102 + [NONE_COMPRESSION_MODE_ID] = {
26103 + .h = {
26104 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26105 + .id = NONE_COMPRESSION_MODE_ID,
26106 + .pops = NULL,
26107 + .label = "none",
26108 + .desc = "Compress nothing",
26109 + .linkage = {NULL, NULL}
26110 + },
26111 + .should_deflate = should_deflate_none,
26112 + .accept_hook = NULL,
26113 + .discard_hook = NULL
26114 + },
26115 + /* Check-on-dynamic-lattice adaptive compression mode */
26116 + [LATTD_COMPRESSION_MODE_ID] = {
26117 + .h = {
26118 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26119 + .id = LATTD_COMPRESSION_MODE_ID,
26120 + .pops = NULL,
26121 + .label = "lattd",
26122 + .desc = "Check on dynamic lattice",
26123 + .linkage = {NULL, NULL}
26124 + },
26125 + .should_deflate = should_deflate_lattd,
26126 + .accept_hook = accept_hook_lattd,
26127 + .discard_hook = discard_hook_lattd
26128 + },
26129 + /* Check-ultimately compression mode:
26130 + Turn off compression forever as soon as we meet
26131 + incompressible data */
26132 + [ULTIM_COMPRESSION_MODE_ID] = {
26133 + .h = {
26134 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26135 + .id = ULTIM_COMPRESSION_MODE_ID,
26136 + .pops = NULL,
26137 + .label = "ultim",
26138 + .desc = "Check ultimately",
26139 + .linkage = {NULL, NULL}
26140 + },
26141 + .should_deflate = should_deflate_common,
26142 + .accept_hook = NULL,
26143 + .discard_hook = discard_hook_ultim
26144 + },
26145 + /* Force-to-compress-everything compression mode */
26146 + [FORCE_COMPRESSION_MODE_ID] = {
26147 + .h = {
26148 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26149 + .id = FORCE_COMPRESSION_MODE_ID,
26150 + .pops = NULL,
26151 + .label = "force",
26152 + .desc = "Force to compress everything",
26153 + .linkage = {NULL, NULL}
26154 + },
26155 + .should_deflate = NULL,
26156 + .accept_hook = NULL,
26157 + .discard_hook = NULL
26158 + },
26159 + /* Convert-to-extent compression mode.
26160 + In this mode items will be converted to extents and management
26161 + will be passed to the (classic) unix file plugin as soon as ->write()
26162 + detects that the first complete logical cluster (of index #0) is
26163 + incompressible. */
26164 + [CONVX_COMPRESSION_MODE_ID] = {
26165 + .h = {
26166 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26167 + .id = CONVX_COMPRESSION_MODE_ID,
26168 + .pops = NULL,
26169 + .label = "conv",
26170 + .desc = "Convert to extent",
26171 + .linkage = {NULL, NULL}
26172 + },
26173 + .should_deflate = should_deflate_common,
26174 + .accept_hook = NULL,
26175 + .discard_hook = NULL
26176 + }
26177 +};
26178 +
26179 +/*
26180 + Local variables:
26181 + c-indentation-style: "K&R"
26182 + mode-name: "LC"
26183 + c-basic-offset: 8
26184 + tab-width: 8
26185 + fill-column: 120
26186 + scroll-step: 1
26187 + End:
26188 +*/
26189 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/lzoconf.h linux-2.6.22/fs/reiser4/plugin/compress/lzoconf.h
26190 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/lzoconf.h 1970-01-01 03:00:00.000000000 +0300
26191 +++ linux-2.6.22/fs/reiser4/plugin/compress/lzoconf.h 2007-07-29 00:25:34.896701653 +0400
26192 @@ -0,0 +1,216 @@
26193 +/* lzoconf.h -- configuration for the LZO real-time data compression library
26194 + adapted for the reiser4 compression transform plugin.
26195 +
26196 + This file is part of the LZO real-time data compression library
26197 + and not included in any proprietary licenses of reiser4.
26198 +
26199 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26200 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26201 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26202 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26203 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26204 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26205 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26206 + All Rights Reserved.
26207 +
26208 + The LZO library is free software; you can redistribute it and/or
26209 + modify it under the terms of the GNU General Public License as
26210 + published by the Free Software Foundation; either version 2 of
26211 + the License, or (at your option) any later version.
26212 +
26213 + The LZO library is distributed in the hope that it will be useful,
26214 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26215 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26216 + GNU General Public License for more details.
26217 +
26218 + You should have received a copy of the GNU General Public License
26219 + along with the LZO library; see the file COPYING.
26220 + If not, write to the Free Software Foundation, Inc.,
26221 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26222 +
26223 + Markus F.X.J. Oberhumer
26224 + <markus@oberhumer.com>
26225 + http://www.oberhumer.com/opensource/lzo/
26226 + */
26227 +
26228 +#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26229 +
26230 +#ifndef __LZOCONF_H
26231 +#define __LZOCONF_H
26232 +
26233 +#define LZO_VERSION 0x1080
26234 +#define LZO_VERSION_STRING "1.08"
26235 +#define LZO_VERSION_DATE "Jul 12 2002"
26236 +
26237 +/* internal Autoconf configuration file - only used when building LZO */
26238 +
26239 +/***********************************************************************
26240 +// LZO requires a conforming <limits.h>
26241 +************************************************************************/
26242 +
26243 +#define CHAR_BIT 8
26244 +#define USHRT_MAX 0xffff
26245 +
26246 +/* work around a cpp bug under hpux 10.20 */
26247 +#define LZO_0xffffffffL 4294967295ul
26248 +
26249 +/***********************************************************************
26250 +// architecture defines
26251 +************************************************************************/
26252 +
26253 +#if !defined(__LZO_i386)
26254 +# if defined(__i386__) || defined(__386__) || defined(_M_IX86)
26255 +# define __LZO_i386
26256 +# endif
26257 +#endif
26258 +
26259 +/* memory checkers */
26260 +#if !defined(__LZO_CHECKER)
26261 +# if defined(__BOUNDS_CHECKING_ON)
26262 +# define __LZO_CHECKER
26263 +# elif defined(__CHECKER__)
26264 +# define __LZO_CHECKER
26265 +# elif defined(__INSURE__)
26266 +# define __LZO_CHECKER
26267 +# elif defined(__PURIFY__)
26268 +# define __LZO_CHECKER
26269 +# endif
26270 +#endif
26271 +
26272 +/***********************************************************************
26273 +// integral and pointer types
26274 +************************************************************************/
26275 +
26276 +/* Integral types with 32 bits or more */
26277 +#if !defined(LZO_UINT32_MAX)
26278 +# if (UINT_MAX >= LZO_0xffffffffL)
26279 + typedef unsigned int lzo_uint32;
26280 + typedef int lzo_int32;
26281 +# define LZO_UINT32_MAX UINT_MAX
26282 +# define LZO_INT32_MAX INT_MAX
26283 +# define LZO_INT32_MIN INT_MIN
26284 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26285 + typedef unsigned long lzo_uint32;
26286 + typedef long lzo_int32;
26287 +# define LZO_UINT32_MAX ULONG_MAX
26288 +# define LZO_INT32_MAX LONG_MAX
26289 +# define LZO_INT32_MIN LONG_MIN
26290 +# else
26291 +# error "lzo_uint32"
26292 +# endif
26293 +#endif
26294 +
26295 +/* lzo_uint is used like size_t */
26296 +#if !defined(LZO_UINT_MAX)
26297 +# if (UINT_MAX >= LZO_0xffffffffL)
26298 + typedef unsigned int lzo_uint;
26299 + typedef int lzo_int;
26300 +# define LZO_UINT_MAX UINT_MAX
26301 +# define LZO_INT_MAX INT_MAX
26302 +# define LZO_INT_MIN INT_MIN
26303 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26304 + typedef unsigned long lzo_uint;
26305 + typedef long lzo_int;
26306 +# define LZO_UINT_MAX ULONG_MAX
26307 +# define LZO_INT_MAX LONG_MAX
26308 +# define LZO_INT_MIN LONG_MIN
26309 +# else
26310 +# error "lzo_uint"
26311 +# endif
26312 +#endif
26313 +
26314 + typedef int lzo_bool;
26315 +
26316 +/***********************************************************************
26317 +// memory models
26318 +************************************************************************/
26319 +
26320 +/* Memory model that allows access to memory at offsets of lzo_uint. */
26321 +#if !defined(__LZO_MMODEL)
26322 +# if (LZO_UINT_MAX <= UINT_MAX)
26323 +# define __LZO_MMODEL
26324 +# else
26325 +# error "__LZO_MMODEL"
26326 +# endif
26327 +#endif
26328 +
26329 +/* no typedef here because of const-pointer issues */
26330 +#define lzo_byte unsigned char __LZO_MMODEL
26331 +#define lzo_bytep unsigned char __LZO_MMODEL *
26332 +#define lzo_charp char __LZO_MMODEL *
26333 +#define lzo_voidp void __LZO_MMODEL *
26334 +#define lzo_shortp short __LZO_MMODEL *
26335 +#define lzo_ushortp unsigned short __LZO_MMODEL *
26336 +#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26337 +#define lzo_int32p lzo_int32 __LZO_MMODEL *
26338 +#define lzo_uintp lzo_uint __LZO_MMODEL *
26339 +#define lzo_intp lzo_int __LZO_MMODEL *
26340 +#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26341 +#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26342 +
26343 +#ifndef lzo_sizeof_dict_t
26344 +# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26345 +#endif
26346 +
26347 +typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26348 + lzo_byte * dst, lzo_uintp dst_len,
26349 + lzo_voidp wrkmem);
26350 +
26351 +
26352 +/***********************************************************************
26353 +// error codes and prototypes
26354 +************************************************************************/
26355 +
26356 +/* Error codes for the compression/decompression functions. Negative
26357 + * values are errors, positive values will be used for special but
26358 + * normal events.
26359 + */
26360 +#define LZO_E_OK 0
26361 +#define LZO_E_ERROR (-1)
26362 +#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26363 +#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26364 +#define LZO_E_INPUT_OVERRUN (-4)
26365 +#define LZO_E_OUTPUT_OVERRUN (-5)
26366 +#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26367 +#define LZO_E_EOF_NOT_FOUND (-7)
26368 +#define LZO_E_INPUT_NOT_CONSUMED (-8)
26369 +
26370 +/* lzo_init() should be the first function you call.
26371 + * Check the return code!
26372 + *
26373 + * lzo_init() is a macro to allow checking that the library and the
26374 + * compiler's view of various types are consistent.
26375 + */
26376 +#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26377 + (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26378 + (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26379 + (int)sizeof(lzo_compress_t))
26380 + extern int __lzo_init2(unsigned, int, int, int, int, int, int,
26381 + int, int, int);
26382 +
26383 +/* checksum functions */
26384 +extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
26385 + lzo_uint _len);
26386 +/* misc. */
26387 + typedef union {
26388 + lzo_bytep p;
26389 + lzo_uint u;
26390 + } __lzo_pu_u;
26391 + typedef union {
26392 + lzo_bytep p;
26393 + lzo_uint32 u32;
26394 + } __lzo_pu32_u;
26395 + typedef union {
26396 + void *vp;
26397 + lzo_bytep bp;
26398 + lzo_uint32 u32;
26399 + long l;
26400 + } lzo_align_t;
26401 +
26402 +#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26403 + ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26404 +
26405 +/* deprecated - only for backward compatibility */
26406 +#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26407 +
26408 +#endif /* already included */
26409 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.22/fs/reiser4/plugin/compress/Makefile
26410 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26411 +++ linux-2.6.22/fs/reiser4/plugin/compress/Makefile 2007-07-29 00:25:34.896701653 +0400
26412 @@ -0,0 +1,6 @@
26413 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26414 +
26415 +compress_plugins-objs := \
26416 + compress.o \
26417 + minilzo.o \
26418 + compress_mode.o
26419 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.c linux-2.6.22/fs/reiser4/plugin/compress/minilzo.c
26420 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.c 1970-01-01 03:00:00.000000000 +0300
26421 +++ linux-2.6.22/fs/reiser4/plugin/compress/minilzo.c 2007-07-29 00:25:34.900702689 +0400
26422 @@ -0,0 +1,1967 @@
26423 +/* minilzo.c -- mini subset of the LZO real-time data compression library
26424 + adapted for the reiser4 compression transform plugin.
26425 +
26426 + This file is part of the LZO real-time data compression library
26427 + and not included in any proprietary licenses of reiser4.
26428 +
26429 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26430 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26431 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26432 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26433 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26434 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26435 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26436 + All Rights Reserved.
26437 +
26438 + The LZO library is free software; you can redistribute it and/or
26439 + modify it under the terms of the GNU General Public License as
26440 + published by the Free Software Foundation; either version 2 of
26441 + the License, or (at your option) any later version.
26442 +
26443 + The LZO library is distributed in the hope that it will be useful,
26444 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26445 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26446 + GNU General Public License for more details.
26447 +
26448 + You should have received a copy of the GNU General Public License
26449 + along with the LZO library; see the file COPYING.
26450 + If not, write to the Free Software Foundation, Inc.,
26451 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26452 +
26453 + Markus F.X.J. Oberhumer
26454 + <markus@oberhumer.com>
26455 + http://www.oberhumer.com/opensource/lzo/
26456 + */
26457 +
26458 +/*
26459 + * NOTE:
26460 + * the full LZO package can be found at
26461 + * http://www.oberhumer.com/opensource/lzo/
26462 + */
26463 +
26464 +#include "../../debug.h" /* for reiser4 assert macro -edward */
26465 +
26466 +#define __LZO_IN_MINILZO
26467 +#define LZO_BUILD
26468 +
26469 +#include "minilzo.h"
26470 +
26471 +#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26472 +# error "version mismatch in miniLZO source files"
26473 +#endif
26474 +
26475 +#ifndef __LZO_CONF_H
26476 +#define __LZO_CONF_H
26477 +
26478 +# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26479 +# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26480 +
26481 +# define HAVE_MEMCMP
26482 +# define HAVE_MEMCPY
26483 +# define HAVE_MEMMOVE
26484 +# define HAVE_MEMSET
26485 +
26486 +#undef NDEBUG
26487 +#if !defined(LZO_DEBUG)
26488 +# define NDEBUG
26489 +#endif
26490 +#if defined(LZO_DEBUG) || !defined(NDEBUG)
26491 +# if !defined(NO_STDIO_H)
26492 +# include <stdio.h>
26493 +# endif
26494 +#endif
26495 +
26496 +#if !defined(LZO_COMPILE_TIME_ASSERT)
26497 +# define LZO_COMPILE_TIME_ASSERT(expr) \
26498 + { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26499 +#endif
26500 +
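This is the classic negative-array-size trick: when the expression is false, the typedef'd array gets length -1 and the compiler rejects it, so the check costs nothing at run time. For example (used at statement scope, as in basic_integral_check() further down):

LZO_COMPILE_TIME_ASSERT(sizeof(char) == 1)	/* [1]:  compiles */
LZO_COMPILE_TIME_ASSERT(sizeof(char) == 2)	/* [-1]: compile error */
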
26501 +#if !defined(LZO_UNUSED)
26502 +# if 1
26503 +# define LZO_UNUSED(var) ((void)&var)
26504 +# elif 0
26505 +# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26506 +# else
26507 +# define LZO_UNUSED(parm) (parm = parm)
26508 +# endif
26509 +#endif
26510 +
26511 +#if defined(NO_MEMCMP)
26512 +# undef HAVE_MEMCMP
26513 +#endif
26514 +
26515 +#if !defined(HAVE_MEMSET)
26516 +# undef memset
26517 +# define memset lzo_memset
26518 +#endif
26519 +
26520 +# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26521 +
26522 +#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26523 +#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26524 +#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26525 +#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26526 +
26527 +#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26528 +
26529 +#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26530 +
26531 +#define LZO_SIZE(bits) (1u << (bits))
26532 +#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26533 +
26534 +#define LZO_LSIZE(bits) (1ul << (bits))
26535 +#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26536 +
26537 +#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26538 +#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26539 +
26540 +#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26541 +#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26542 +
26543 +#if !defined(SIZEOF_UNSIGNED)
26544 +# if (UINT_MAX == 0xffff)
26545 +# define SIZEOF_UNSIGNED 2
26546 +# elif (UINT_MAX == LZO_0xffffffffL)
26547 +# define SIZEOF_UNSIGNED 4
26548 +# elif (UINT_MAX >= LZO_0xffffffffL)
26549 +# define SIZEOF_UNSIGNED 8
26550 +# else
26551 +# error "SIZEOF_UNSIGNED"
26552 +# endif
26553 +#endif
26554 +
26555 +#if !defined(SIZEOF_UNSIGNED_LONG)
26556 +# if (ULONG_MAX == LZO_0xffffffffL)
26557 +# define SIZEOF_UNSIGNED_LONG 4
26558 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26559 +# define SIZEOF_UNSIGNED_LONG 8
26560 +# else
26561 +# error "SIZEOF_UNSIGNED_LONG"
26562 +# endif
26563 +#endif
26564 +
26565 +#if !defined(SIZEOF_SIZE_T)
26566 +# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26567 +#endif
26568 +#if !defined(SIZE_T_MAX)
26569 +# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26570 +#endif
26571 +
26572 +#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26573 +# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26574 +# define LZO_UNALIGNED_OK_2
26575 +# endif
26576 +# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26577 +# define LZO_UNALIGNED_OK_4
26578 +# endif
26579 +#endif
26580 +
26581 +#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26582 +# if !defined(LZO_UNALIGNED_OK)
26583 +# define LZO_UNALIGNED_OK
26584 +# endif
26585 +#endif
26586 +
26587 +#if defined(__LZO_NO_UNALIGNED)
26588 +# undef LZO_UNALIGNED_OK
26589 +# undef LZO_UNALIGNED_OK_2
26590 +# undef LZO_UNALIGNED_OK_4
26591 +#endif
26592 +
26593 +#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26594 +# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26595 +#endif
26596 +#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26597 +# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26598 +#endif
26599 +
26600 +#if defined(__LZO_NO_ALIGNED)
26601 +# undef LZO_ALIGNED_OK_4
26602 +#endif
26603 +
26604 +#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26605 +# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26606 +#endif
26607 +
26608 +#define LZO_LITTLE_ENDIAN 1234
26609 +#define LZO_BIG_ENDIAN 4321
26610 +#define LZO_PDP_ENDIAN 3412
26611 +
26612 +#if !defined(LZO_BYTE_ORDER)
26613 +# if defined(MFX_BYTE_ORDER)
26614 +# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26615 +# elif defined(__LZO_i386)
26616 +# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26617 +# elif defined(BYTE_ORDER)
26618 +# define LZO_BYTE_ORDER BYTE_ORDER
26619 +# elif defined(__BYTE_ORDER)
26620 +# define LZO_BYTE_ORDER __BYTE_ORDER
26621 +# endif
26622 +#endif
26623 +
26624 +#if defined(LZO_BYTE_ORDER)
26625 +# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26626 + (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26627 +# error "invalid LZO_BYTE_ORDER"
26628 +# endif
26629 +#endif
26630 +
26631 +#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
26632 +# error "LZO_BYTE_ORDER is not defined"
26633 +#endif
26634 +
26635 +#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
26636 +
26637 +#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
26638 +# if defined(__GNUC__) && defined(__i386__)
26639 +# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
26640 +# define LZO_OPTIMIZE_GNUC_i386
26641 +# endif
26642 +# endif
26643 +#endif
26644 +
26645 +extern const lzo_uint32 _lzo_crc32_table[256];
26646 +
26647 +#define _LZO_STRINGIZE(x) #x
26648 +#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
26649 +
26650 +#define _LZO_CONCAT2(a,b) a ## b
26651 +#define _LZO_CONCAT3(a,b,c) a ## b ## c
26652 +#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
26653 +#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
26654 +
26655 +#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
26656 +#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
26657 +#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
26658 +#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
26659 +
26660 +#ifndef __LZO_PTR_H
26661 +#define __LZO_PTR_H
26662 +
26663 +#if !defined(lzo_ptrdiff_t)
26664 +# if (UINT_MAX >= LZO_0xffffffffL)
26665 +typedef ptrdiff_t lzo_ptrdiff_t;
26666 +# else
26667 +typedef long lzo_ptrdiff_t;
26668 +# endif
26669 +#endif
26670 +
26671 +#if !defined(__LZO_HAVE_PTR_T)
26672 +# if defined(lzo_ptr_t)
26673 +# define __LZO_HAVE_PTR_T
26674 +# endif
26675 +#endif
26676 +#if !defined(__LZO_HAVE_PTR_T)
26677 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
26678 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
26679 +typedef unsigned long lzo_ptr_t;
26680 +typedef long lzo_sptr_t;
26681 +# define __LZO_HAVE_PTR_T
26682 +# endif
26683 +# endif
26684 +#endif
26685 +#if !defined(__LZO_HAVE_PTR_T)
26686 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
26687 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
26688 +typedef unsigned int lzo_ptr_t;
26689 +typedef int lzo_sptr_t;
26690 +# define __LZO_HAVE_PTR_T
26691 +# endif
26692 +# endif
26693 +#endif
26694 +#if !defined(__LZO_HAVE_PTR_T)
26695 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
26696 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
26697 +typedef unsigned short lzo_ptr_t;
26698 +typedef short lzo_sptr_t;
26699 +# define __LZO_HAVE_PTR_T
26700 +# endif
26701 +# endif
26702 +#endif
26703 +#if !defined(__LZO_HAVE_PTR_T)
26704 +# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
26705 +# error "no suitable type for lzo_ptr_t"
26706 +# else
26707 +typedef unsigned long lzo_ptr_t;
26708 +typedef long lzo_sptr_t;
26709 +# define __LZO_HAVE_PTR_T
26710 +# endif
26711 +#endif
26712 +
26713 +#define PTR(a) ((lzo_ptr_t) (a))
26714 +#define PTR_LINEAR(a) PTR(a)
26715 +#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
26716 +#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
26717 +#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
26718 +#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
26719 +
26720 +#define PTR_LT(a,b) (PTR(a) < PTR(b))
26721 +#define PTR_GE(a,b) (PTR(a) >= PTR(b))
26722 +#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
26723 +#define pd(a,b) ((lzo_uint) ((a)-(b)))
26724 +
26725 +typedef union {
26726 + char a_char;
26727 + unsigned char a_uchar;
26728 + short a_short;
26729 + unsigned short a_ushort;
26730 + int a_int;
26731 + unsigned int a_uint;
26732 + long a_long;
26733 + unsigned long a_ulong;
26734 + lzo_int a_lzo_int;
26735 + lzo_uint a_lzo_uint;
26736 + lzo_int32 a_lzo_int32;
26737 + lzo_uint32 a_lzo_uint32;
26738 + ptrdiff_t a_ptrdiff_t;
26739 + lzo_ptrdiff_t a_lzo_ptrdiff_t;
26740 + lzo_ptr_t a_lzo_ptr_t;
26741 + lzo_voidp a_lzo_voidp;
26742 + void *a_void_p;
26743 + lzo_bytep a_lzo_bytep;
26744 + lzo_bytepp a_lzo_bytepp;
26745 + lzo_uintp a_lzo_uintp;
26746 + lzo_uint *a_lzo_uint_p;
26747 + lzo_uint32p a_lzo_uint32p;
26748 + lzo_uint32 *a_lzo_uint32_p;
26749 + unsigned char *a_uchar_p;
26750 + char *a_char_p;
26751 +} lzo_full_align_t;
26752 +
26753 +#endif
26754 +#define LZO_DETERMINISTIC
26755 +#define LZO_DICT_USE_PTR
26756 +# define lzo_dict_t const lzo_bytep
26757 +# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
26758 +#if !defined(lzo_moff_t)
26759 +#define lzo_moff_t lzo_uint
26760 +#endif
26761 +#endif
26762 +static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
26763 +{
26764 + return PTR_LINEAR(ptr);
26765 +}
26766 +
26767 +static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
26768 +{
26769 + lzo_ptr_t p, s, n;
26770 +
26771 + assert("lzo-01", size > 0);
26772 +
26773 + p = __lzo_ptr_linear(ptr);
26774 + s = (lzo_ptr_t) (size - 1);
26775 + n = (((p + s) / size) * size) - p;
26776 +
26777 + assert("lzo-02", (long)n >= 0);
26778 + assert("lzo-03", n <= s);
26779 +
26780 + return (unsigned)n;
26781 +}
26782 +
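A hypothetical use of __lzo_align_gap() through the LZO_PTR_ALIGN_UP() macro from lzoconf.h, mirroring what ptr_check() below does with its _wrkmem buffer:

char raw[64 + sizeof(lzo_align_t)];	/* deliberately over-sized */

/* skip up to sizeof(lzo_align_t) - 1 leading bytes so wrk is aligned */
lzo_byte *wrk = LZO_PTR_ALIGN_UP((lzo_byte *)raw, sizeof(lzo_align_t));
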
26783 +#ifndef __LZO_UTIL_H
26784 +#define __LZO_UTIL_H
26785 +
26786 +#ifndef __LZO_CONF_H
26787 +#endif
26788 +
26789 +#if 1 && defined(HAVE_MEMCPY)
26790 +#define MEMCPY8_DS(dest,src,len) \
26791 + memcpy(dest,src,len); \
26792 + dest += len; \
26793 + src += len
26794 +#endif
26795 +
26796 +#if !defined(MEMCPY8_DS)
26797 +
26798 +#define MEMCPY8_DS(dest,src,len) \
26799 + { register lzo_uint __l = (len) / 8; \
26800 + do { \
26801 + *dest++ = *src++; \
26802 + *dest++ = *src++; \
26803 + *dest++ = *src++; \
26804 + *dest++ = *src++; \
26805 + *dest++ = *src++; \
26806 + *dest++ = *src++; \
26807 + *dest++ = *src++; \
26808 + *dest++ = *src++; \
26809 + } while (--__l > 0); }
26810 +
26811 +#endif
26812 +
26813 +#define MEMCPY_DS(dest,src,len) \
26814 + do *dest++ = *src++; \
26815 + while (--len > 0)
26816 +
26817 +#define MEMMOVE_DS(dest,src,len) \
26818 + do *dest++ = *src++; \
26819 + while (--len > 0)
26820 +
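Note that MEMMOVE_DS is the same byte-forward loop as MEMCPY_DS rather than a call to memmove(): LZO expands runs by copying a region that overlaps its own output, which only works when bytes are copied in ascending order. A small hypothetical illustration:

/* Expanding a run via an overlapping forward copy: with dest one byte
 * ahead of src, every iteration re-reads the byte written by the
 * previous one. */
unsigned char b[8] = "A";		/* "A\0\0\0\0\0\0\0" */
unsigned char *src = b, *dest = b + 1;
unsigned len = 4;

MEMMOVE_DS(dest, src, len);		/* b now begins "AAAAA" */
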
26821 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
26822 +
26823 +#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
26824 +
26825 +#else
26826 +
26827 +#define BZERO8_PTR(s,l,n) \
26828 + lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
26829 +
26830 +#endif
26831 +#endif
26832 +
26833 +/* If you use the LZO library in a product, you *must* keep this
26834 + * copyright string in the executable of your product.
26835 + */
26836 +
26837 +static const lzo_byte __lzo_copyright[] =
26838 +#if !defined(__LZO_IN_MINILZO)
26839 + LZO_VERSION_STRING;
26840 +#else
26841 + "\n\n\n"
26842 + "LZO real-time data compression library.\n"
26843 + "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
26844 + "<markus.oberhumer@jk.uni-linz.ac.at>\n"
26845 + "http://www.oberhumer.com/opensource/lzo/\n"
26846 + "\n"
26847 + "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
26848 + "LZO build date: " __DATE__ " " __TIME__ "\n\n"
26849 + "LZO special compilation options:\n"
26850 +#ifdef __cplusplus
26851 + " __cplusplus\n"
26852 +#endif
26853 +#if defined(__PIC__)
26854 + " __PIC__\n"
26855 +#elif defined(__pic__)
26856 + " __pic__\n"
26857 +#endif
26858 +#if (UINT_MAX < LZO_0xffffffffL)
26859 + " 16BIT\n"
26860 +#endif
26861 +#if defined(__LZO_STRICT_16BIT)
26862 + " __LZO_STRICT_16BIT\n"
26863 +#endif
26864 +#if (UINT_MAX > LZO_0xffffffffL)
26865 + " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
26866 +#endif
26867 +#if (ULONG_MAX > LZO_0xffffffffL)
26868 + " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
26869 +#endif
26870 +#if defined(LZO_BYTE_ORDER)
26871 + " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
26872 +#endif
26873 +#if defined(LZO_UNALIGNED_OK_2)
26874 + " LZO_UNALIGNED_OK_2\n"
26875 +#endif
26876 +#if defined(LZO_UNALIGNED_OK_4)
26877 + " LZO_UNALIGNED_OK_4\n"
26878 +#endif
26879 +#if defined(LZO_ALIGNED_OK_4)
26880 + " LZO_ALIGNED_OK_4\n"
26881 +#endif
26882 +#if defined(LZO_DICT_USE_PTR)
26883 + " LZO_DICT_USE_PTR\n"
26884 +#endif
26885 +#if defined(__LZO_QUERY_COMPRESS)
26886 + " __LZO_QUERY_COMPRESS\n"
26887 +#endif
26888 +#if defined(__LZO_QUERY_DECOMPRESS)
26889 + " __LZO_QUERY_DECOMPRESS\n"
26890 +#endif
26891 +#if defined(__LZO_IN_MINILZO)
26892 + " __LZO_IN_MINILZO\n"
26893 +#endif
26894 + "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
26895 +#if defined(__GNUC__) && defined(__VERSION__)
26896 + " by gcc " __VERSION__
26897 +#elif defined(__BORLANDC__)
26898 + " by Borland C " _LZO_MEXPAND(__BORLANDC__)
26899 +#elif defined(_MSC_VER)
26900 + " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
26901 +#elif defined(__PUREC__)
26902 + " by Pure C " _LZO_MEXPAND(__PUREC__)
26903 +#elif defined(__SC__)
26904 + " by Symantec C " _LZO_MEXPAND(__SC__)
26905 +#elif defined(__TURBOC__)
26906 + " by Turbo C " _LZO_MEXPAND(__TURBOC__)
26907 +#elif defined(__WATCOMC__)
26908 + " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
26909 +#endif
26910 + " $\n"
26911 + "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
26912 +#endif
26913 +
26914 +#define LZO_BASE 65521u
26915 +#define LZO_NMAX 5552
26916 +
26917 +#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
26918 +#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
26919 +#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
26920 +#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
26921 +#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
26922 +
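These are the unrolled inner steps of the classic Adler-32 checksum: s1 accumulates the bytes, s2 accumulates the running s1, both reduced mod LZO_BASE = 65521; LZO_NMAX = 5552 is the longest run that can be summed before the modulo must be applied to avoid 32-bit overflow. A naive sketch of a driver loop (reducing every 16 bytes instead of every LZO_NMAX bytes, which is slower but obviously safe):

static lzo_uint32 adler32_sketch(lzo_uint32 adler,
				 const lzo_byte *buf, lzo_uint len)
{
	lzo_uint32 s1 = adler & 0xffff;
	lzo_uint32 s2 = (adler >> 16) & 0xffff;

	while (len >= 16) {
		LZO_DO16(buf, 0);	/* sixteen s1/s2 update steps */
		buf += 16;
		len -= 16;
		s1 %= LZO_BASE;
		s2 %= LZO_BASE;
	}
	while (len-- > 0) {
		s1 += *buf++;
		s2 += s1;
		s1 %= LZO_BASE;
		s2 %= LZO_BASE;
	}
	return (s2 << 16) | s1;
}
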
26923 +# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
26924 +# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
26925 +
26926 +#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
26927 +
26928 +static lzo_bool schedule_insns_bug(void);
26929 +static lzo_bool strength_reduce_bug(int *);
26930 +
26931 +# define __lzo_assert(x) ((x) ? 1 : 0)
26932 +
26933 +#undef COMPILE_TIME_ASSERT
26934 +
26935 +# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
26936 +
26937 +static lzo_bool basic_integral_check(void)
26938 +{
26939 + lzo_bool r = 1;
26940 +
26941 + COMPILE_TIME_ASSERT(CHAR_BIT == 8);
26942 + COMPILE_TIME_ASSERT(sizeof(char) == 1);
26943 + COMPILE_TIME_ASSERT(sizeof(short) >= 2);
26944 + COMPILE_TIME_ASSERT(sizeof(long) >= 4);
26945 + COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
26946 + COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
26947 +
26948 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
26949 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
26950 +
26951 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
26952 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
26953 +#if defined(__LZO_STRICT_16BIT)
26954 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
26955 +#else
26956 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
26957 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
26958 +#endif
26959 +
26960 +#if (USHRT_MAX == 65535u)
26961 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
26962 +#elif (USHRT_MAX == LZO_0xffffffffL)
26963 + COMPILE_TIME_ASSERT(sizeof(short) == 4);
26964 +#elif (USHRT_MAX >= LZO_0xffffffffL)
26965 + COMPILE_TIME_ASSERT(sizeof(short) > 4);
26966 +#endif
26967 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
26968 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
26969 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
26970 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
26971 + COMPILE_TIME_ASSERT(IS_SIGNED(short));
26972 + COMPILE_TIME_ASSERT(IS_SIGNED(int));
26973 + COMPILE_TIME_ASSERT(IS_SIGNED(long));
26974 +
26975 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
26976 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
26977 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
26978 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
26979 +
26980 + COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
26981 + COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
26982 + COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
26983 + COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
26984 + COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
26985 + COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
26986 + LZO_UTYPE_MAX(sizeof(lzo_uint32)));
26987 + COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
26988 +
26989 + r &= __lzo_assert(LZO_BYTE(257) == 1);
26990 +
26991 + return r;
26992 +}
26993 +
26994 +static lzo_bool basic_ptr_check(void)
26995 +{
26996 + lzo_bool r = 1;
26997 +
26998 + COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
26999 + COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27000 +
27001 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27002 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27003 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27004 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27005 +
27006 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27007 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27008 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27009 +
27010 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27011 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27012 +
27013 + COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27014 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27015 +
27016 +#if defined(SIZEOF_CHAR_P)
27017 + COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27018 +#endif
27019 +#if defined(SIZEOF_PTRDIFF_T)
27020 + COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27021 +#endif
27022 +
27023 + COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27024 + COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27025 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27026 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27027 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27028 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27029 +
27030 + return r;
27031 +}
27032 +
27033 +static lzo_bool ptr_check(void)
27034 +{
27035 + lzo_bool r = 1;
27036 + int i;
27037 + char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
27038 + lzo_bytep wrkmem;
27039 + lzo_bytepp dict;
27040 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27041 + long d;
27042 + lzo_full_align_t a;
27043 + lzo_full_align_t u;
27044 +
27045 + for (i = 0; i < (int)sizeof(x); i++)
27046 + x[i] = LZO_BYTE(i);
27047 +
27048 + wrkmem =
27049 + LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27050 +
27051 + u.a_lzo_bytep = wrkmem;
27052 + dict = u.a_lzo_bytepp;
27053 +
27054 + d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27055 + r &= __lzo_assert(d >= 0);
27056 + r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27057 +
27058 + memset(&a, 0, sizeof(a));
27059 + r &= __lzo_assert(a.a_lzo_voidp == NULL);
27060 +
27061 + memset(&a, 0xff, sizeof(a));
27062 + r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27063 + r &= __lzo_assert(a.a_uint == UINT_MAX);
27064 + r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27065 + r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27066 + r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27067 +
27068 + if (r == 1) {
27069 + for (i = 0; i < 8; i++)
27070 + r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27071 + (const lzo_voidp)(&wrkmem[i * sizeof(lzo_byte *)]));
27075 + }
27076 +
27077 + memset(&a, 0, sizeof(a));
27078 + r &= __lzo_assert(a.a_char_p == NULL);
27079 + r &= __lzo_assert(a.a_lzo_bytep == NULL);
27080 + r &= __lzo_assert(NULL == (void *)0);
27081 + if (r == 1) {
27082 + for (i = 0; i < 10; i++)
27083 + dict[i] = wrkmem;
27084 + BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27085 + r &= __lzo_assert(dict[0] == wrkmem);
27086 + for (i = 1; i < 9; i++)
27087 + r &= __lzo_assert(dict[i] == NULL);
27088 + r &= __lzo_assert(dict[9] == wrkmem);
27089 + }
27090 +
27091 + if (r == 1) {
27092 + unsigned k = 1;
27093 + const unsigned n = (unsigned)sizeof(lzo_uint32);
27094 + lzo_byte *p0;
27095 + lzo_byte *p1;
27096 +
27097 + k += __lzo_align_gap(&x[k], n);
27098 + p0 = (lzo_bytep) & x[k];
27099 +#if defined(PTR_LINEAR)
27100 + r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27101 +#else
27102 + r &= __lzo_assert(n == 4);
27103 + r &= __lzo_assert(PTR_ALIGNED_4(p0));
27104 +#endif
27105 +
27106 + r &= __lzo_assert(k >= 1);
27107 + p1 = (lzo_bytep) & x[1];
27108 + r &= __lzo_assert(PTR_GE(p0, p1));
27109 +
27110 + r &= __lzo_assert(k < 1 + n);
27111 + p1 = (lzo_bytep) & x[1 + n];
27112 + r &= __lzo_assert(PTR_LT(p0, p1));
27113 +
27114 + if (r == 1) {
27115 + lzo_uint32 v0, v1;
27116 +
27117 + u.a_uchar_p = &x[k];
27118 + v0 = *u.a_lzo_uint32_p;
27119 + u.a_uchar_p = &x[k + n];
27120 + v1 = *u.a_lzo_uint32_p;
27121 +
27122 + r &= __lzo_assert(v0 > 0);
27123 + r &= __lzo_assert(v1 > 0);
27124 + }
27125 + }
27126 +
27127 + return r;
27128 +}
27129 +
27130 +static int _lzo_config_check(void)
27131 +{
27132 + lzo_bool r = 1;
27133 + int i;
27134 + union {
27135 + lzo_uint32 a;
27136 + unsigned short b;
27137 + lzo_uint32 aa[4];
27138 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27139 + } u;
27140 +
27141 + COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27142 + COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27143 + < 0);
27144 +
27145 + r &= basic_integral_check();
27146 + r &= basic_ptr_check();
27147 + if (r != 1)
27148 + return LZO_E_ERROR;
27149 +
27150 + u.a = 0;
27151 + u.b = 0;
27152 + for (i = 0; i < (int)sizeof(u.x); i++)
27153 + u.x[i] = LZO_BYTE(i);
27154 +
27155 +#if defined(LZO_BYTE_ORDER)
27156 + if (r == 1) {
27157 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27158 + lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27159 + unsigned short b = (unsigned short)(u.b & 0xffff);
27160 + r &= __lzo_assert(a == 0x03020100L);
27161 + r &= __lzo_assert(b == 0x0100);
27162 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27163 + lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27164 + unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27165 + r &= __lzo_assert(a == 0x00010203L);
27166 + r &= __lzo_assert(b == 0x0001);
27167 +# else
27168 +# error "invalid LZO_BYTE_ORDER"
27169 +# endif
27170 + }
27171 +#endif
27172 +
27173 +#if defined(LZO_UNALIGNED_OK_2)
27174 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
27175 + if (r == 1) {
27176 + unsigned short b[4];
27177 +
27178 + for (i = 0; i < 4; i++)
27179 + b[i] = *(const unsigned short *)&u.x[i];
27180 +
27181 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27182 + r &= __lzo_assert(b[0] == 0x0100);
27183 + r &= __lzo_assert(b[1] == 0x0201);
27184 + r &= __lzo_assert(b[2] == 0x0302);
27185 + r &= __lzo_assert(b[3] == 0x0403);
27186 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27187 + r &= __lzo_assert(b[0] == 0x0001);
27188 + r &= __lzo_assert(b[1] == 0x0102);
27189 + r &= __lzo_assert(b[2] == 0x0203);
27190 + r &= __lzo_assert(b[3] == 0x0304);
27191 +# endif
27192 + }
27193 +#endif
27194 +
27195 +#if defined(LZO_UNALIGNED_OK_4)
27196 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27197 + if (r == 1) {
27198 + lzo_uint32 a[4];
27199 +
27200 + for (i = 0; i < 4; i++)
27201 + a[i] = *(const lzo_uint32 *)&u.x[i];
27202 +
27203 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27204 + r &= __lzo_assert(a[0] == 0x03020100L);
27205 + r &= __lzo_assert(a[1] == 0x04030201L);
27206 + r &= __lzo_assert(a[2] == 0x05040302L);
27207 + r &= __lzo_assert(a[3] == 0x06050403L);
27208 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27209 + r &= __lzo_assert(a[0] == 0x00010203L);
27210 + r &= __lzo_assert(a[1] == 0x01020304L);
27211 + r &= __lzo_assert(a[2] == 0x02030405L);
27212 + r &= __lzo_assert(a[3] == 0x03040506L);
27213 +# endif
27214 + }
27215 +#endif
27216 +
27217 +#if defined(LZO_ALIGNED_OK_4)
27218 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27219 +#endif
27220 +
27221 + COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27222 +
27223 + if (r == 1) {
27224 + r &= __lzo_assert(!schedule_insns_bug());
27225 + }
27226 +
27227 + if (r == 1) {
27228 + static int x[3];
27229 + static unsigned xn = 3;
27230 + register unsigned j;
27231 +
27232 + for (j = 0; j < xn; j++)
27233 + x[j] = (int)j - 3;
27234 + r &= __lzo_assert(!strength_reduce_bug(x));
27235 + }
27236 +
27237 + if (r == 1) {
27238 + r &= ptr_check();
27239 + }
27240 +
27241 + return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27242 +}
27243 +
27244 +static lzo_bool schedule_insns_bug(void)
27245 +{
27246 +#if defined(__LZO_CHECKER)
27247 + return 0;
27248 +#else
27249 + const int clone[] = { 1, 2, 0 };
27250 + const int *q;
27251 + q = clone;
27252 + return (*q) ? 0 : 1;
27253 +#endif
27254 +}
27255 +
27256 +static lzo_bool strength_reduce_bug(int *x)
27257 +{
27258 + return x[0] != -3 || x[1] != -2 || x[2] != -1;
27259 +}
27260 +
27261 +#undef COMPILE_TIME_ASSERT
27262 +
27263 +int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27264 + int s6, int s7, int s8, int s9)
27265 +{
27266 + int r;
27267 +
27268 + if (v == 0)
27269 + return LZO_E_ERROR;
27270 +
27271 + r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27272 + (s2 == -1 || s2 == (int)sizeof(int)) &&
27273 + (s3 == -1 || s3 == (int)sizeof(long)) &&
27274 + (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27275 + (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27276 + (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27277 + (s7 == -1 || s7 == (int)sizeof(char *)) &&
27278 + (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27279 + (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27280 + if (!r)
27281 + return LZO_E_ERROR;
27282 +
27283 + r = _lzo_config_check();
27284 + if (r != LZO_E_OK)
27285 + return r;
27286 +
27287 + return r;
27288 +}
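The size checks above are normally reached through a wrapper macro that feeds __lzo_init2() the sizes the caller was compiled with, so the runtime can verify that the headers match the compiled library; a -1 argument skips that particular check. A sketch of such a wrapper (in the stock LZO library this is the lzo_init() macro from lzoconf.h; the my_lzo_init name and the assumption that LZO_VERSION is defined there are the editor's):

#define my_lzo_init() \
	__lzo_init2(LZO_VERSION, (int)sizeof(short), (int)sizeof(int), \
		    (int)sizeof(long), (int)sizeof(lzo_uint32), \
		    (int)sizeof(lzo_uint), (int)lzo_sizeof_dict_t, \
		    (int)sizeof(char *), (int)sizeof(lzo_voidp), \
		    (int)sizeof(lzo_compress_t))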
27289 +
27290 +#define do_compress _lzo1x_1_do_compress
27291 +
27292 +#define LZO_NEED_DICT_H
27293 +#define D_BITS 14
27294 +#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27295 +#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27296 +
27297 +#ifndef __LZO_CONFIG1X_H
27298 +#define __LZO_CONFIG1X_H
27299 +
27300 +#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27301 +# define LZO1X
27302 +#endif
27303 +
27304 +#define LZO_EOF_CODE
27305 +#undef LZO_DETERMINISTIC
27306 +
27307 +#define M1_MAX_OFFSET 0x0400
27308 +#ifndef M2_MAX_OFFSET
27309 +#define M2_MAX_OFFSET 0x0800
27310 +#endif
27311 +#define M3_MAX_OFFSET 0x4000
27312 +#define M4_MAX_OFFSET 0xbfff
27313 +
27314 +#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27315 +
27316 +#define M1_MIN_LEN 2
27317 +#define M1_MAX_LEN 2
27318 +#define M2_MIN_LEN 3
27319 +#ifndef M2_MAX_LEN
27320 +#define M2_MAX_LEN 8
27321 +#endif
27322 +#define M3_MIN_LEN 3
27323 +#define M3_MAX_LEN 33
27324 +#define M4_MIN_LEN 3
27325 +#define M4_MAX_LEN 9
27326 +
27327 +#define M1_MARKER 0
27328 +#define M2_MARKER 64
27329 +#define M3_MARKER 32
27330 +#define M4_MARKER 16
27331 +
27332 +#ifndef MIN_LOOKAHEAD
27333 +#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27334 +#endif
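The M1..M4 constants above describe the four LZO1X instruction classes: the marker sits in the high bits of the first byte of each instruction, and the offset/length limits bound what each class can encode. A minimal dispatch sketch (editor's illustration, not part of the patch, mirroring the comparisons the decompressor further below performs):

static int lzo1x_instruction_class(unsigned char t)
{
	if (t >= M2_MARKER)	/* 64..255: M2 match, offset <= M2_MAX_OFFSET */
		return 2;
	if (t >= M3_MARKER)	/* 32..63:  M3 match, offset <= M3_MAX_OFFSET */
		return 3;
	if (t >= M4_MARKER)	/* 16..31:  M4 match, offset > M3_MAX_OFFSET;
				   a zero stored offset is the EOF code */
		return 4;
	return 1;		/* 0..15: literal-run length, or an M1-style
				   short match directly after a literal run */
}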
27335 +
27336 +#if defined(LZO_NEED_DICT_H)
27337 +
27338 +#ifndef LZO_HASH
27339 +#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27340 +#endif
27341 +#define DL_MIN_LEN M2_MIN_LEN
27342 +
27343 +#ifndef __LZO_DICT_H
27344 +#define __LZO_DICT_H
27345 +
27346 +#if !defined(D_BITS) && defined(DBITS)
27347 +# define D_BITS DBITS
27348 +#endif
27349 +#if !defined(D_BITS)
27350 +# error "D_BITS is not defined"
27351 +#endif
27352 +#if (D_BITS < 16)
27353 +# define D_SIZE LZO_SIZE(D_BITS)
27354 +# define D_MASK LZO_MASK(D_BITS)
27355 +#else
27356 +# define D_SIZE LZO_USIZE(D_BITS)
27357 +# define D_MASK LZO_UMASK(D_BITS)
27358 +#endif
27359 +#define D_HIGH ((D_MASK >> 1) + 1)
27360 +
27361 +#if !defined(DD_BITS)
27362 +# define DD_BITS 0
27363 +#endif
27364 +#define DD_SIZE LZO_SIZE(DD_BITS)
27365 +#define DD_MASK LZO_MASK(DD_BITS)
27366 +
27367 +#if !defined(DL_BITS)
27368 +# define DL_BITS (D_BITS - DD_BITS)
27369 +#endif
27370 +#if (DL_BITS < 16)
27371 +# define DL_SIZE LZO_SIZE(DL_BITS)
27372 +# define DL_MASK LZO_MASK(DL_BITS)
27373 +#else
27374 +# define DL_SIZE LZO_USIZE(DL_BITS)
27375 +# define DL_MASK LZO_UMASK(DL_BITS)
27376 +#endif
27377 +
27378 +#if (D_BITS != DL_BITS + DD_BITS)
27379 +# error "D_BITS does not match"
27380 +#endif
27381 +#if (D_BITS < 8 || D_BITS > 18)
27382 +# error "invalid D_BITS"
27383 +#endif
27384 +#if (DL_BITS < 8 || DL_BITS > 20)
27385 +# error "invalid DL_BITS"
27386 +#endif
27387 +#if (DD_BITS < 0 || DD_BITS > 6)
27388 +# error "invalid DD_BITS"
27389 +#endif
27390 +
27391 +#if !defined(DL_MIN_LEN)
27392 +# define DL_MIN_LEN 3
27393 +#endif
27394 +#if !defined(DL_SHIFT)
27395 +# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27396 +#endif
27397 +
27398 +#define LZO_HASH_GZIP 1
27399 +#define LZO_HASH_GZIP_INCREMENTAL 2
27400 +#define LZO_HASH_LZO_INCREMENTAL_A 3
27401 +#define LZO_HASH_LZO_INCREMENTAL_B 4
27402 +
27403 +#if !defined(LZO_HASH)
27404 +# error "choose a hashing strategy"
27405 +#endif
27406 +
27407 +#if (DL_MIN_LEN == 3)
27408 +# define _DV2_A(p,shift1,shift2) \
27409 + (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27410 +# define _DV2_B(p,shift1,shift2) \
27411 + (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27412 +# define _DV3_B(p,shift1,shift2,shift3) \
27413 + ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27414 +#elif (DL_MIN_LEN == 2)
27415 +# define _DV2_A(p,shift1,shift2) \
27416 + (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27417 +# define _DV2_B(p,shift1,shift2) \
27418 + (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27419 +#else
27420 +# error "invalid DL_MIN_LEN"
27421 +#endif
27422 +#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27423 +#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27424 +#define DA2(p,s1,s2) \
27425 + (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27426 +#define DS2(p,s1,s2) \
27427 + (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27428 +#define DX2(p,s1,s2) \
27429 + (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27430 +#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27431 +#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27432 +#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27433 +#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27434 +#define DM(v) DMS(v,0)
27435 +
27436 +#if (LZO_HASH == LZO_HASH_GZIP)
27437 +# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27438 +
27439 +#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27440 +# define __LZO_HASH_INCREMENTAL
27441 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27442 +# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27443 +# define _DINDEX(dv,p) (dv)
27444 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27445 +
27446 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27447 +# define __LZO_HASH_INCREMENTAL
27448 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27449 +# define DVAL_NEXT(dv,p) \
27450 + dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27451 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27452 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27453 +
27454 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27455 +# define __LZO_HASH_INCREMENTAL
27456 +# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27457 +# define DVAL_NEXT(dv,p) \
27458 + dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27459 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27460 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27461 +
27462 +#else
27463 +# error "choose a hashing strategy"
27464 +#endif
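The incremental strategies hash DL_MIN_LEN (here 3) input bytes and, instead of rehashing all three bytes at every position, update the previous value as the window slides: DVAL_FIRST computes the hash from scratch and DVAL_NEXT folds one byte out and one byte in (DVAL_ASSERT below verifies the two agree). A self-contained sketch of strategy B written out in plain C (editor's illustration; the test string is arbitrary, the shift of 5 matches the macros above):

#include <stdio.h>

static unsigned dv_first(const unsigned char *p)	/* _DV_B(p,5) */
{
	return ((((unsigned)p[2] << 5) ^ p[1]) << 5) ^ p[0];
}

int main(void)
{
	const unsigned char s[] = "abcdef";
	unsigned dv = dv_first(s);
	int i;

	for (i = 1; i + 2 < (int)sizeof(s) - 1; i++) {
		/* DVAL_NEXT, strategy B: drop s[i-1], fold in s[i+2] */
		dv ^= s[i - 1];
		dv = (dv >> 5) ^ ((unsigned)s[i + 2] << 10);
		printf("dv(%d) = %u, from scratch = %u\n",
		       i, dv, dv_first(s + i));	/* the two always agree */
	}
	return 0;
}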
27465 +
27466 +#ifndef DINDEX
27467 +#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27468 +#endif
27469 +#if !defined(DINDEX1) && defined(D_INDEX1)
27470 +#define DINDEX1 D_INDEX1
27471 +#endif
27472 +#if !defined(DINDEX2) && defined(D_INDEX2)
27473 +#define DINDEX2 D_INDEX2
27474 +#endif
27475 +
27476 +#if !defined(__LZO_HASH_INCREMENTAL)
27477 +# define DVAL_FIRST(dv,p) ((void) 0)
27478 +# define DVAL_NEXT(dv,p) ((void) 0)
27479 +# define DVAL_LOOKAHEAD 0
27480 +#endif
27481 +
27482 +#if !defined(DVAL_ASSERT)
27483 +#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27484 +static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
27485 +{
27486 + lzo_uint32 df;
27487 + DVAL_FIRST(df, (p));
27488 + assert(DINDEX(dv, p) == DINDEX(df, p));
27489 +}
27490 +#else
27491 +# define DVAL_ASSERT(dv,p) ((void) 0)
27492 +#endif
27493 +#endif
27494 +
27495 +# define DENTRY(p,in) (p)
27496 +# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27497 +
27498 +#if (DD_BITS == 0)
27499 +
27500 +# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27501 +# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27502 +# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27503 +
27504 +#else
27505 +
27506 +# define UPDATE_D(dict,drun,dv,p,in) \
27507 + dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27508 +# define UPDATE_I(dict,drun,index,p,in) \
27509 + dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27510 +# define UPDATE_P(ptr,drun,p,in) \
27511 + (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
27512 +
27513 +#endif
27514 +
27515 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
27516 + (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
27517 +
27518 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
27519 + (BOUNDS_CHECKING_OFF_IN_EXPR( \
27520 + (PTR_LT(m_pos,in) || \
27521 + (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
27522 + m_off > max_offset) ))
27523 +
27524 +#if defined(LZO_DETERMINISTIC)
27525 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
27526 +#else
27527 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
27528 +#endif
27529 +#endif
27530 +#endif
27531 +#endif
27532 +#define DO_COMPRESS lzo1x_1_compress
27533 +static
27534 +lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
27535 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27536 +{
27537 + register const lzo_byte *ip;
27538 + lzo_byte *op;
27539 + const lzo_byte *const in_end = in + in_len;
27540 + const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
27541 + const lzo_byte *ii;
27542 + lzo_dict_p const dict = (lzo_dict_p) wrkmem;
27543 +
27544 + op = out;
27545 + ip = in;
27546 + ii = ip;
27547 +
27548 + ip += 4;
27549 + for (;;) {
27550 + register const lzo_byte *m_pos;
27551 +
27552 + lzo_moff_t m_off;
27553 + lzo_uint m_len;
27554 + lzo_uint dindex;
27555 +
27556 + DINDEX1(dindex, ip);
27557 + GINDEX(m_pos, m_off, dict, dindex, in);
27558 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27559 + goto literal;
27560 +#if 1
27561 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27562 + goto try_match;
27563 + DINDEX2(dindex, ip);
27564 +#endif
27565 + GINDEX(m_pos, m_off, dict, dindex, in);
27566 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27567 + goto literal;
27568 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27569 + goto try_match;
27570 + goto literal;
27571 +
27572 + try_match:
27573 +#if 1 && defined(LZO_UNALIGNED_OK_2)
27574 + if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
27575 +#else
27576 + if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
27577 +#endif
27578 + ;
27579 + } else {
27580 + if (m_pos[2] == ip[2]) {
27581 + goto match;
27582 + } else {
27583 + ;
27584 + }
27585 + }
27586 +
27587 + literal:
27588 + UPDATE_I(dict, 0, dindex, ip, in);
27589 + ++ip;
27590 + if (ip >= ip_end)
27591 + break;
27592 + continue;
27593 +
27594 + match:
27595 + UPDATE_I(dict, 0, dindex, ip, in);
27596 + if (pd(ip, ii) > 0) {
27597 + register lzo_uint t = pd(ip, ii);
27598 +
27599 + if (t <= 3) {
27600 + assert("lzo-04", op - 2 > out);
27601 + op[-2] |= LZO_BYTE(t);
27602 + } else if (t <= 18)
27603 + *op++ = LZO_BYTE(t - 3);
27604 + else {
27605 + register lzo_uint tt = t - 18;
27606 +
27607 + *op++ = 0;
27608 + while (tt > 255) {
27609 + tt -= 255;
27610 + *op++ = 0;
27611 + }
27612 + assert("lzo-05", tt > 0);
27613 + *op++ = LZO_BYTE(tt);
27614 + }
27615 + do
27616 + *op++ = *ii++;
27617 + while (--t > 0);
27618 + }
27619 +
27620 + assert("lzo-06", ii == ip);
27621 + ip += 3;
27622 + if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
27623 + || m_pos[6] != *ip++ || m_pos[7] != *ip++
27624 + || m_pos[8] != *ip++
27625 +#ifdef LZO1Y
27626 + || m_pos[9] != *ip++ || m_pos[10] != *ip++
27627 + || m_pos[11] != *ip++ || m_pos[12] != *ip++
27628 + || m_pos[13] != *ip++ || m_pos[14] != *ip++
27629 +#endif
27630 + ) {
27631 + --ip;
27632 + m_len = ip - ii;
27633 + assert("lzo-07", m_len >= 3);
27634 + assert("lzo-08", m_len <= M2_MAX_LEN);
27635 +
27636 + if (m_off <= M2_MAX_OFFSET) {
27637 + m_off -= 1;
27638 +#if defined(LZO1X)
27639 + *op++ =
27640 + LZO_BYTE(((m_len -
27641 + 1) << 5) | ((m_off & 7) << 2));
27642 + *op++ = LZO_BYTE(m_off >> 3);
27643 +#elif defined(LZO1Y)
27644 + *op++ =
27645 + LZO_BYTE(((m_len +
27646 + 1) << 4) | ((m_off & 3) << 2));
27647 + *op++ = LZO_BYTE(m_off >> 2);
27648 +#endif
27649 + } else if (m_off <= M3_MAX_OFFSET) {
27650 + m_off -= 1;
27651 + *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
27652 + goto m3_m4_offset;
27653 + } else
27654 +#if defined(LZO1X)
27655 + {
27656 + m_off -= 0x4000;
27657 + assert("lzo-09", m_off > 0);
27658 + assert("lzo-10", m_off <= 0x7fff);
27659 + *op++ = LZO_BYTE(M4_MARKER |
27660 + ((m_off & 0x4000) >> 11) |
27661 + (m_len - 2));
27662 + goto m3_m4_offset;
27663 + }
27664 +#elif defined(LZO1Y)
27665 + goto m4_match;
27666 +#endif
27667 + } else {
27668 + {
27669 + const lzo_byte *end = in_end;
27670 + const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
27671 + while (ip < end && *m == *ip)
27672 + m++, ip++;
27673 + m_len = (ip - ii);
27674 + }
27675 + assert("lzo-11", m_len > M2_MAX_LEN);
27676 +
27677 + if (m_off <= M3_MAX_OFFSET) {
27678 + m_off -= 1;
27679 + if (m_len <= 33)
27680 + *op++ =
27681 + LZO_BYTE(M3_MARKER | (m_len - 2));
27682 + else {
27683 + m_len -= 33;
27684 + *op++ = M3_MARKER | 0;
27685 + goto m3_m4_len;
27686 + }
27687 + } else {
27688 +#if defined(LZO1Y)
27689 + m4_match:
27690 +#endif
27691 + m_off -= 0x4000;
27692 + assert("lzo-12", m_off > 0);
27693 + assert("lzo-13", m_off <= 0x7fff);
27694 + if (m_len <= M4_MAX_LEN)
27695 + *op++ = LZO_BYTE(M4_MARKER |
27696 + ((m_off & 0x4000) >>
27697 + 11) | (m_len - 2));
27698 + else {
27699 + m_len -= M4_MAX_LEN;
27700 + *op++ =
27701 + LZO_BYTE(M4_MARKER |
27702 + ((m_off & 0x4000) >> 11));
27703 + m3_m4_len:
27704 + while (m_len > 255) {
27705 + m_len -= 255;
27706 + *op++ = 0;
27707 + }
27708 + assert("lzo-14", m_len > 0);
27709 + *op++ = LZO_BYTE(m_len);
27710 + }
27711 + }
27712 +
27713 + m3_m4_offset:
27714 + *op++ = LZO_BYTE((m_off & 63) << 2);
27715 + *op++ = LZO_BYTE(m_off >> 6);
27716 + }
27717 +
27718 + ii = ip;
27719 + if (ip >= ip_end)
27720 + break;
27721 + }
27722 +
27723 + *out_len = op - out;
27724 + return pd(in_end, ii);
27725 +}
27726 +
27727 +int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
27728 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27729 +{
27730 + lzo_byte *op = out;
27731 + lzo_uint t;
27732 +
27733 +#if defined(__LZO_QUERY_COMPRESS)
27734 + if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27735 + return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
27736 + D_SIZE, lzo_sizeof(lzo_dict_t));
27737 +#endif
27738 +
27739 + if (in_len <= M2_MAX_LEN + 5)
27740 + t = in_len;
27741 + else {
27742 + t = do_compress(in, in_len, op, out_len, wrkmem);
27743 + op += *out_len;
27744 + }
27745 +
27746 + if (t > 0) {
27747 + const lzo_byte *ii = in + in_len - t;
27748 +
27749 + if (op == out && t <= 238)
27750 + *op++ = LZO_BYTE(17 + t);
27751 + else if (t <= 3)
27752 + op[-2] |= LZO_BYTE(t);
27753 + else if (t <= 18)
27754 + *op++ = LZO_BYTE(t - 3);
27755 + else {
27756 + lzo_uint tt = t - 18;
27757 +
27758 + *op++ = 0;
27759 + while (tt > 255) {
27760 + tt -= 255;
27761 + *op++ = 0;
27762 + }
27763 + assert("lzo-15", tt > 0);
27764 + *op++ = LZO_BYTE(tt);
27765 + }
27766 + do
27767 + *op++ = *ii++;
27768 + while (--t > 0);
27769 + }
27770 +
27771 + *op++ = M4_MARKER | 1;
27772 + *op++ = 0;
27773 + *op++ = 0;
27774 +
27775 + *out_len = op - out;
27776 + return LZO_E_OK;
27777 +}
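The three bytes appended at the end above -- M4_MARKER | 1 followed by two zero bytes, i.e. 0x11 0x00 0x00 -- are the stream terminator: in the decompressor's t >= 16 branch a stored offset of zero leaves m_pos == op, which takes the eof_found path, and the lzo-21 assertion checks that the remaining low state bits are exactly the 1 supplied here. For reference (editor's illustration, not part of the patch):

static const unsigned char lzo1x_eof_code[3] = {
	M4_MARKER | 1, 0, 0	/* 0x11 0x00 0x00 */
};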
27778 +
27779 +#undef do_compress
27780 +#undef DO_COMPRESS
27781 +#undef LZO_HASH
27782 +
27783 +#undef LZO_TEST_DECOMPRESS_OVERRUN
27784 +#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
27785 +#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
27786 +#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27787 +#undef DO_DECOMPRESS
27788 +#define DO_DECOMPRESS lzo1x_decompress
27789 +
27790 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
27791 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27792 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
27793 +# endif
27794 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27795 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
27796 +# endif
27797 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27798 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27799 +# endif
27800 +#endif
27801 +
27802 +#undef TEST_IP
27803 +#undef TEST_OP
27804 +#undef TEST_LOOKBEHIND
27805 +#undef NEED_IP
27806 +#undef NEED_OP
27807 +#undef HAVE_TEST_IP
27808 +#undef HAVE_TEST_OP
27809 +#undef HAVE_NEED_IP
27810 +#undef HAVE_NEED_OP
27811 +#undef HAVE_ANY_IP
27812 +#undef HAVE_ANY_OP
27813 +
27814 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27815 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
27816 +# define TEST_IP (ip < ip_end)
27817 +# endif
27818 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
27819 +# define NEED_IP(x) \
27820 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
27821 +# endif
27822 +#endif
27823 +
27824 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27825 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
27826 +# define TEST_OP (op <= op_end)
27827 +# endif
27828 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
27829 +# undef TEST_OP
27830 +# define NEED_OP(x) \
27831 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
27832 +# endif
27833 +#endif
27834 +
27835 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27836 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
27837 +#else
27838 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
27839 +#endif
27840 +
27841 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
27842 +# define TEST_IP (ip < ip_end)
27843 +#endif
27844 +
27845 +#if defined(TEST_IP)
27846 +# define HAVE_TEST_IP
27847 +#else
27848 +# define TEST_IP 1
27849 +#endif
27850 +#if defined(TEST_OP)
27851 +# define HAVE_TEST_OP
27852 +#else
27853 +# define TEST_OP 1
27854 +#endif
27855 +
27856 +#if defined(NEED_IP)
27857 +# define HAVE_NEED_IP
27858 +#else
27859 +# define NEED_IP(x) ((void) 0)
27860 +#endif
27861 +#if defined(NEED_OP)
27862 +# define HAVE_NEED_OP
27863 +#else
27864 +# define NEED_OP(x) ((void) 0)
27865 +#endif
27866 +
27867 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
27868 +# define HAVE_ANY_IP
27869 +#endif
27870 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
27871 +# define HAVE_ANY_OP
27872 +#endif
27873 +
27874 +#undef __COPY4
27875 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
27876 +
27877 +#undef COPY4
27878 +#if defined(LZO_UNALIGNED_OK_4)
27879 +# define COPY4(dst,src) __COPY4(dst,src)
27880 +#elif defined(LZO_ALIGNED_OK_4)
27881 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
27882 +#endif
27883 +
27884 +#if defined(DO_DECOMPRESS)
27885 +int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
27886 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27887 +#endif
27888 +{
27889 + register lzo_byte *op;
27890 + register const lzo_byte *ip;
27891 + register lzo_uint t;
27892 +#if defined(COPY_DICT)
27893 + lzo_uint m_off;
27894 + const lzo_byte *dict_end;
27895 +#else
27896 + register const lzo_byte *m_pos;
27897 +#endif
27898 +
27899 + const lzo_byte *const ip_end = in + in_len;
27900 +#if defined(HAVE_ANY_OP)
27901 + lzo_byte *const op_end = out + *out_len;
27902 +#endif
27903 +#if defined(LZO1Z)
27904 + lzo_uint last_m_off = 0;
27905 +#endif
27906 +
27907 + LZO_UNUSED(wrkmem);
27908 +
27909 +#if defined(__LZO_QUERY_DECOMPRESS)
27910 + if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27911 + return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
27912 + 0, 0);
27913 +#endif
27914 +
27915 +#if defined(COPY_DICT)
27916 + if (dict) {
27917 + if (dict_len > M4_MAX_OFFSET) {
27918 + dict += dict_len - M4_MAX_OFFSET;
27919 + dict_len = M4_MAX_OFFSET;
27920 + }
27921 + dict_end = dict + dict_len;
27922 + } else {
27923 + dict_len = 0;
27924 + dict_end = NULL;
27925 + }
27926 +#endif
27927 +
27928 + *out_len = 0;
27929 +
27930 + op = out;
27931 + ip = in;
27932 +
27933 + if (*ip > 17) {
27934 + t = *ip++ - 17;
27935 + if (t < 4)
27936 + goto match_next;
27937 + assert("lzo-16", t > 0);
27938 + NEED_OP(t);
27939 + NEED_IP(t + 1);
27940 + do
27941 + *op++ = *ip++;
27942 + while (--t > 0);
27943 + goto first_literal_run;
27944 + }
27945 +
27946 + while (TEST_IP && TEST_OP) {
27947 + t = *ip++;
27948 + if (t >= 16)
27949 + goto match;
27950 + if (t == 0) {
27951 + NEED_IP(1);
27952 + while (*ip == 0) {
27953 + t += 255;
27954 + ip++;
27955 + NEED_IP(1);
27956 + }
27957 + t += 15 + *ip++;
27958 + }
27959 + assert("lzo-17", t > 0);
27960 + NEED_OP(t + 3);
27961 + NEED_IP(t + 4);
27962 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
27963 +#if !defined(LZO_UNALIGNED_OK_4)
27964 + if (PTR_ALIGNED2_4(op, ip)) {
27965 +#endif
27966 + COPY4(op, ip);
27967 + op += 4;
27968 + ip += 4;
27969 + if (--t > 0) {
27970 + if (t >= 4) {
27971 + do {
27972 + COPY4(op, ip);
27973 + op += 4;
27974 + ip += 4;
27975 + t -= 4;
27976 + } while (t >= 4);
27977 + if (t > 0)
27978 + do
27979 + *op++ = *ip++;
27980 + while (--t > 0);
27981 + } else
27982 + do
27983 + *op++ = *ip++;
27984 + while (--t > 0);
27985 + }
27986 +#if !defined(LZO_UNALIGNED_OK_4)
27987 + } else
27988 +#endif
27989 +#endif
27990 +#if !defined(LZO_UNALIGNED_OK_4)
27991 + {
27992 + *op++ = *ip++;
27993 + *op++ = *ip++;
27994 + *op++ = *ip++;
27995 + do
27996 + *op++ = *ip++;
27997 + while (--t > 0);
27998 + }
27999 +#endif
28000 +
28001 + first_literal_run:
28002 +
28003 + t = *ip++;
28004 + if (t >= 16)
28005 + goto match;
28006 +#if defined(COPY_DICT)
28007 +#if defined(LZO1Z)
28008 + m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28009 + last_m_off = m_off;
28010 +#else
28011 + m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28012 +#endif
28013 + NEED_OP(3);
28014 + t = 3;
28015 + COPY_DICT(t, m_off)
28016 +#else
28017 +#if defined(LZO1Z)
28018 + t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28019 + m_pos = op - t;
28020 + last_m_off = t;
28021 +#else
28022 + m_pos = op - (1 + M2_MAX_OFFSET);
28023 + m_pos -= t >> 2;
28024 + m_pos -= *ip++ << 2;
28025 +#endif
28026 + TEST_LOOKBEHIND(m_pos, out);
28027 + NEED_OP(3);
28028 + *op++ = *m_pos++;
28029 + *op++ = *m_pos++;
28030 + *op++ = *m_pos;
28031 +#endif
28032 + goto match_done;
28033 +
28034 + while (TEST_IP && TEST_OP) {
28035 + match:
28036 + if (t >= 64) {
28037 +#if defined(COPY_DICT)
28038 +#if defined(LZO1X)
28039 + m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28040 + t = (t >> 5) - 1;
28041 +#elif defined(LZO1Y)
28042 + m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28043 + t = (t >> 4) - 3;
28044 +#elif defined(LZO1Z)
28045 + m_off = t & 0x1f;
28046 + if (m_off >= 0x1c)
28047 + m_off = last_m_off;
28048 + else {
28049 + m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28050 + last_m_off = m_off;
28051 + }
28052 + t = (t >> 5) - 1;
28053 +#endif
28054 +#else
28055 +#if defined(LZO1X)
28056 + m_pos = op - 1;
28057 + m_pos -= (t >> 2) & 7;
28058 + m_pos -= *ip++ << 3;
28059 + t = (t >> 5) - 1;
28060 +#elif defined(LZO1Y)
28061 + m_pos = op - 1;
28062 + m_pos -= (t >> 2) & 3;
28063 + m_pos -= *ip++ << 2;
28064 + t = (t >> 4) - 3;
28065 +#elif defined(LZO1Z)
28066 + {
28067 + lzo_uint off = t & 0x1f;
28068 + m_pos = op;
28069 + if (off >= 0x1c) {
28070 + assert(last_m_off > 0);
28071 + m_pos -= last_m_off;
28072 + } else {
28073 + off =
28074 + 1 + (off << 6) +
28075 + (*ip++ >> 2);
28076 + m_pos -= off;
28077 + last_m_off = off;
28078 + }
28079 + }
28080 + t = (t >> 5) - 1;
28081 +#endif
28082 + TEST_LOOKBEHIND(m_pos, out);
28083 + assert("lzo-18", t > 0);
28084 + NEED_OP(t + 3 - 1);
28085 + goto copy_match;
28086 +#endif
28087 + } else if (t >= 32) {
28088 + t &= 31;
28089 + if (t == 0) {
28090 + NEED_IP(1);
28091 + while (*ip == 0) {
28092 + t += 255;
28093 + ip++;
28094 + NEED_IP(1);
28095 + }
28096 + t += 31 + *ip++;
28097 + }
28098 +#if defined(COPY_DICT)
28099 +#if defined(LZO1Z)
28100 + m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28101 + last_m_off = m_off;
28102 +#else
28103 + m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28104 +#endif
28105 +#else
28106 +#if defined(LZO1Z)
28107 + {
28108 + lzo_uint off =
28109 + 1 + (ip[0] << 6) + (ip[1] >> 2);
28110 + m_pos = op - off;
28111 + last_m_off = off;
28112 + }
28113 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28114 + m_pos = op - 1;
28115 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28116 +#else
28117 + m_pos = op - 1;
28118 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28119 +#endif
28120 +#endif
28121 + ip += 2;
28122 + } else if (t >= 16) {
28123 +#if defined(COPY_DICT)
28124 + m_off = (t & 8) << 11;
28125 +#else
28126 + m_pos = op;
28127 + m_pos -= (t & 8) << 11;
28128 +#endif
28129 + t &= 7;
28130 + if (t == 0) {
28131 + NEED_IP(1);
28132 + while (*ip == 0) {
28133 + t += 255;
28134 + ip++;
28135 + NEED_IP(1);
28136 + }
28137 + t += 7 + *ip++;
28138 + }
28139 +#if defined(COPY_DICT)
28140 +#if defined(LZO1Z)
28141 + m_off += (ip[0] << 6) + (ip[1] >> 2);
28142 +#else
28143 + m_off += (ip[0] >> 2) + (ip[1] << 6);
28144 +#endif
28145 + ip += 2;
28146 + if (m_off == 0)
28147 + goto eof_found;
28148 + m_off += 0x4000;
28149 +#if defined(LZO1Z)
28150 + last_m_off = m_off;
28151 +#endif
28152 +#else
28153 +#if defined(LZO1Z)
28154 + m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28155 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28156 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28157 +#else
28158 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28159 +#endif
28160 + ip += 2;
28161 + if (m_pos == op)
28162 + goto eof_found;
28163 + m_pos -= 0x4000;
28164 +#if defined(LZO1Z)
28165 + last_m_off = op - m_pos;
28166 +#endif
28167 +#endif
28168 + } else {
28169 +#if defined(COPY_DICT)
28170 +#if defined(LZO1Z)
28171 + m_off = 1 + (t << 6) + (*ip++ >> 2);
28172 + last_m_off = m_off;
28173 +#else
28174 + m_off = 1 + (t >> 2) + (*ip++ << 2);
28175 +#endif
28176 + NEED_OP(2);
28177 + t = 2;
28178 + COPY_DICT(t, m_off)
28179 +#else
28180 +#if defined(LZO1Z)
28181 + t = 1 + (t << 6) + (*ip++ >> 2);
28182 + m_pos = op - t;
28183 + last_m_off = t;
28184 +#else
28185 + m_pos = op - 1;
28186 + m_pos -= t >> 2;
28187 + m_pos -= *ip++ << 2;
28188 +#endif
28189 + TEST_LOOKBEHIND(m_pos, out);
28190 + NEED_OP(2);
28191 + *op++ = *m_pos++;
28192 + *op++ = *m_pos;
28193 +#endif
28194 + goto match_done;
28195 + }
28196 +
28197 +#if defined(COPY_DICT)
28198 +
28199 + NEED_OP(t + 3 - 1);
28200 + t += 3 - 1;
28201 + COPY_DICT(t, m_off)
28202 +#else
28203 +
28204 + TEST_LOOKBEHIND(m_pos, out);
28205 + assert("lzo-19", t > 0);
28206 + NEED_OP(t + 3 - 1);
28207 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28208 +#if !defined(LZO_UNALIGNED_OK_4)
28209 + if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28210 + assert((op - m_pos) >= 4);
28211 +#else
28212 + if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28213 +#endif
28214 + COPY4(op, m_pos);
28215 + op += 4;
28216 + m_pos += 4;
28217 + t -= 4 - (3 - 1);
28218 + do {
28219 + COPY4(op, m_pos);
28220 + op += 4;
28221 + m_pos += 4;
28222 + t -= 4;
28223 + } while (t >= 4);
28224 + if (t > 0)
28225 + do
28226 + *op++ = *m_pos++;
28227 + while (--t > 0);
28228 + } else
28229 +#endif
28230 + {
28231 + copy_match:
28232 + *op++ = *m_pos++;
28233 + *op++ = *m_pos++;
28234 + do
28235 + *op++ = *m_pos++;
28236 + while (--t > 0);
28237 + }
28238 +
28239 +#endif
28240 +
28241 + match_done:
28242 +#if defined(LZO1Z)
28243 + t = ip[-1] & 3;
28244 +#else
28245 + t = ip[-2] & 3;
28246 +#endif
28247 + if (t == 0)
28248 + break;
28249 +
28250 + match_next:
28251 + assert("lzo-20", t > 0);
28252 + NEED_OP(t);
28253 + NEED_IP(t + 1);
28254 + do
28255 + *op++ = *ip++;
28256 + while (--t > 0);
28257 + t = *ip++;
28258 + }
28259 + }
28260 +
28261 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28262 + *out_len = op - out;
28263 + return LZO_E_EOF_NOT_FOUND;
28264 +#endif
28265 +
28266 + eof_found:
28267 + assert("lzo-21", t == 1);
28268 + *out_len = op - out;
28269 + return (ip == ip_end ? LZO_E_OK :
28270 + (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28271 +
28272 +#if defined(HAVE_NEED_IP)
28273 + input_overrun:
28274 + *out_len = op - out;
28275 + return LZO_E_INPUT_OVERRUN;
28276 +#endif
28277 +
28278 +#if defined(HAVE_NEED_OP)
28279 + output_overrun:
28280 + *out_len = op - out;
28281 + return LZO_E_OUTPUT_OVERRUN;
28282 +#endif
28283 +
28284 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28285 + lookbehind_overrun:
28286 + *out_len = op - out;
28287 + return LZO_E_LOOKBEHIND_OVERRUN;
28288 +#endif
28289 +}
28290 +
28291 +#define LZO_TEST_DECOMPRESS_OVERRUN
28292 +#undef DO_DECOMPRESS
28293 +#define DO_DECOMPRESS lzo1x_decompress_safe
28294 +
28295 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28296 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28297 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28298 +# endif
28299 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28300 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28301 +# endif
28302 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28303 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28304 +# endif
28305 +#endif
28306 +
28307 +#undef TEST_IP
28308 +#undef TEST_OP
28309 +#undef TEST_LOOKBEHIND
28310 +#undef NEED_IP
28311 +#undef NEED_OP
28312 +#undef HAVE_TEST_IP
28313 +#undef HAVE_TEST_OP
28314 +#undef HAVE_NEED_IP
28315 +#undef HAVE_NEED_OP
28316 +#undef HAVE_ANY_IP
28317 +#undef HAVE_ANY_OP
28318 +
28319 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28320 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28321 +# define TEST_IP (ip < ip_end)
28322 +# endif
28323 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28324 +# define NEED_IP(x) \
28325 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28326 +# endif
28327 +#endif
28328 +
28329 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28330 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28331 +# define TEST_OP (op <= op_end)
28332 +# endif
28333 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28334 +# undef TEST_OP
28335 +# define NEED_OP(x) \
28336 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28337 +# endif
28338 +#endif
28339 +
28340 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28341 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28342 +#else
28343 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28344 +#endif
28345 +
28346 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28347 +# define TEST_IP (ip < ip_end)
28348 +#endif
28349 +
28350 +#if defined(TEST_IP)
28351 +# define HAVE_TEST_IP
28352 +#else
28353 +# define TEST_IP 1
28354 +#endif
28355 +#if defined(TEST_OP)
28356 +# define HAVE_TEST_OP
28357 +#else
28358 +# define TEST_OP 1
28359 +#endif
28360 +
28361 +#if defined(NEED_IP)
28362 +# define HAVE_NEED_IP
28363 +#else
28364 +# define NEED_IP(x) ((void) 0)
28365 +#endif
28366 +#if defined(NEED_OP)
28367 +# define HAVE_NEED_OP
28368 +#else
28369 +# define NEED_OP(x) ((void) 0)
28370 +#endif
28371 +
28372 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28373 +# define HAVE_ANY_IP
28374 +#endif
28375 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28376 +# define HAVE_ANY_OP
28377 +#endif
28378 +
28379 +#undef __COPY4
28380 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28381 +
28382 +#undef COPY4
28383 +#if defined(LZO_UNALIGNED_OK_4)
28384 +# define COPY4(dst,src) __COPY4(dst,src)
28385 +#elif defined(LZO_ALIGNED_OK_4)
28386 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28387 +#endif
28388 +
28389 +/***** End of minilzo.c *****/
28390 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.h linux-2.6.22/fs/reiser4/plugin/compress/minilzo.h
28391 --- linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.h 1970-01-01 03:00:00.000000000 +0300
28392 +++ linux-2.6.22/fs/reiser4/plugin/compress/minilzo.h 2007-07-29 00:25:34.900702689 +0400
28393 @@ -0,0 +1,70 @@
28394 +/* minilzo.h -- mini subset of the LZO real-time data compression library
28395 +   adapted for the reiser4 compression transform plugin.
28396 +
28397 + This file is part of the LZO real-time data compression library
28398 +   and is not included in any proprietary license of reiser4.
28399 +
28400 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28401 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28402 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28403 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28404 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28405 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28406 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28407 + All Rights Reserved.
28408 +
28409 + The LZO library is free software; you can redistribute it and/or
28410 + modify it under the terms of the GNU General Public License as
28411 + published by the Free Software Foundation; either version 2 of
28412 + the License, or (at your option) any later version.
28413 +
28414 + The LZO library is distributed in the hope that it will be useful,
28415 + but WITHOUT ANY WARRANTY; without even the implied warranty of
28416 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28417 + GNU General Public License for more details.
28418 +
28419 + You should have received a copy of the GNU General Public License
28420 + along with the LZO library; see the file COPYING.
28421 + If not, write to the Free Software Foundation, Inc.,
28422 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28423 +
28424 + Markus F.X.J. Oberhumer
28425 + <markus@oberhumer.com>
28426 + http://www.oberhumer.com/opensource/lzo/
28427 + */
28428 +
28429 +/*
28430 + * NOTE:
28431 + * the full LZO package can be found at
28432 + * http://www.oberhumer.com/opensource/lzo/
28433 + */
28434 +
28435 +#ifndef __MINILZO_H
28436 +#define __MINILZO_H
28437 +
28438 +#define MINILZO_VERSION 0x1080
28439 +
28440 +#include "lzoconf.h"
28441 +
28442 +/* Memory required for the wrkmem parameter.
28443 + * When the required size is 0, you can also pass a NULL pointer.
28444 + */
28445 +
28446 +#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28447 +#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28448 +#define LZO1X_MEM_DECOMPRESS (0)
28449 +
28450 +/* compression */
28451 +extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28452 + lzo_byte * dst, lzo_uintp dst_len,
28453 + lzo_voidp wrkmem);
28454 +/* decompression */
28455 +extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28456 + lzo_byte * dst, lzo_uintp dst_len,
28457 + lzo_voidp wrkmem /* NOT USED */);
28458 +/* safe decompression with overrun testing */
28459 +extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
28460 + lzo_byte * dst, lzo_uintp dst_len,
28461 + lzo_voidp wrkmem /* NOT USED */ );
28462 +
28463 +#endif /* already included */
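A round-trip usage sketch for the three entry points declared above (editor's illustration, not taken from the patch). It assumes a sleepable context for vmalloc() and that tmp carries the commonly quoted LZO worst-case slack of src_len + src_len/16 + 64 + 3 bytes; per the comment above, the decompressors need no work memory, so NULL is passed:

#include <linux/vmalloc.h>
#include <linux/string.h>

static int lzo1x_roundtrip(const lzo_byte *src, lzo_uint src_len,
			   lzo_byte *tmp, lzo_byte *dst)
{
	lzo_uint tmp_len = 0;
	lzo_uint dst_len = src_len;	/* in: dst capacity, out: bytes written */
	lzo_voidp wrkmem = vmalloc(LZO1X_1_MEM_COMPRESS);
	int ret;

	if (wrkmem == NULL)
		return LZO_E_ERROR;
	ret = lzo1x_1_compress(src, src_len, tmp, &tmp_len, wrkmem);
	vfree(wrkmem);
	if (ret != LZO_E_OK)
		return ret;
	/* the safe variant bounds-checks input, output and lookbehind */
	ret = lzo1x_decompress_safe(tmp, tmp_len, dst, &dst_len, NULL);
	if (ret != LZO_E_OK || dst_len != src_len)
		return LZO_E_ERROR;
	return memcmp(src, dst, src_len) ? LZO_E_ERROR : LZO_E_OK;
}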
28464 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.22/fs/reiser4/plugin/crypto/cipher.c
28465 --- linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
28466 +++ linux-2.6.22/fs/reiser4/plugin/crypto/cipher.c 2007-07-29 00:25:34.900702689 +0400
28467 @@ -0,0 +1,37 @@
28468 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
28469 + licensing governed by reiser4/README */
28470 +/* Reiser4 cipher transform plugins */
28471 +
28472 +#include "../../debug.h"
28473 +#include "../plugin.h"
28474 +
28475 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
28476 + [NONE_CIPHER_ID] = {
28477 + .h = {
28478 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
28479 + .id = NONE_CIPHER_ID,
28480 + .pops = NULL,
28481 + .label = "none",
28482 + .desc = "no cipher transform",
28483 + .linkage = {NULL, NULL}
28484 + },
28485 + .alloc = NULL,
28486 + .free = NULL,
28487 + .scale = NULL,
28488 + .align_stream = NULL,
28489 + .setkey = NULL,
28490 + .encrypt = NULL,
28491 + .decrypt = NULL
28492 + }
28493 +};
28494 +
28495 +/* Make Linus happy.
28496 + Local variables:
28497 + c-indentation-style: "K&R"
28498 + mode-name: "LC"
28499 + c-basic-offset: 8
28500 + tab-width: 8
28501 + fill-column: 120
28502 + scroll-step: 1
28503 + End:
28504 +*/
28505 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.22/fs/reiser4/plugin/crypto/cipher.h
28506 --- linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
28507 +++ linux-2.6.22/fs/reiser4/plugin/crypto/cipher.h 2007-07-29 00:25:34.900702689 +0400
28508 @@ -0,0 +1,55 @@
28509 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28510 +/* This file contains definitions for the objects operated on
28511 + by the reiser4 key manager, which is something like a keyring
28512 + wrapped by an appropriate reiser4 plugin */
28513 +
28514 +#if !defined( __FS_REISER4_CRYPT_H__ )
28515 +#define __FS_REISER4_CRYPT_H__
28516 +
28517 +#include <linux/crypto.h>
28518 +
28519 +/* key info imported from user space */
28520 +struct reiser4_crypto_data {
28521 + int keysize; /* uninstantiated key size */
28522 + __u8 * key; /* uninstantiated key */
28523 + int keyid_size; /* size of passphrase */
28524 + __u8 * keyid; /* passphrase */
28525 +};
28526 +
28527 +/* This object contains all the infrastructure needed to implement
28528 + a cipher transform. It is managed (allocated, inherited,
28529 + validated, bound to the host inode, etc.) by the reiser4 key manager.
28530 +
28531 + This info can be allocated in two cases:
28532 + 1. when importing a key from user space;
28533 + 2. when reading an inode from disk */
28534 +struct reiser4_crypto_info {
28535 + struct inode * host;
28536 + struct crypto_hash * digest;
28537 + struct crypto_blkcipher * cipher;
28538 +#if 0
28539 + cipher_key_plugin * kplug; /* key manager */
28540 +#endif
28541 + __u8 * keyid; /* key fingerprint, created by the digest plugin
28542 + from the uninstantiated key and the passphrase;
28543 + supposed to be stored in disk stat-data */
28544 + int inst; /* indicates whether the cipher key is
28545 + instantiated (case 1 above) */
28546 + int keysize; /* uninstantiated key size (bytes), supposed
28547 + to be stored in disk stat-data */
28548 + int keyload_count; /* number of objects that have this
28549 + crypto-stat attached */
28550 +};
28551 +
28552 +#endif /* __FS_REISER4_CRYPT_H__ */
28553 +
28554 +/*
28555 + Local variables:
28556 + c-indentation-style: "K&R"
28557 + mode-name: "LC"
28558 + c-basic-offset: 8
28559 + tab-width: 8
28560 + fill-column: 120
28561 + scroll-step: 1
28562 + End:
28563 +*/
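A hypothetical sketch of case 1 above, importing key material from user space into struct reiser4_crypto_data (editor's illustration: only the struct and its fields come from the header; the helper, its signature, and the simplified error handling are invented):

#include <linux/slab.h>
#include <linux/errno.h>
#include <asm/uaccess.h>

static int import_crypto_data(struct reiser4_crypto_data *data,
			      const __u8 __user *key, int keysize,
			      const __u8 __user *pass, int passlen)
{
	data->key = kmalloc(keysize, GFP_KERNEL);
	data->keyid = kmalloc(passlen, GFP_KERNEL);
	if (data->key == NULL || data->keyid == NULL)
		return -ENOMEM;	/* real code would free whichever succeeded */
	if (copy_from_user(data->key, key, keysize) ||
	    copy_from_user(data->keyid, pass, passlen))
		return -EFAULT;
	data->keysize = keysize;
	data->keyid_size = passlen;
	return 0;
}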
28564 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.22/fs/reiser4/plugin/crypto/digest.c
28565 --- linux-2.6.22.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
28566 +++ linux-2.6.22/fs/reiser4/plugin/crypto/digest.c 2007-07-29 00:25:34.900702689 +0400
28567 @@ -0,0 +1,58 @@
28568 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28569 +
28570 +/* reiser4 digest transform plugin (used by the cryptcompress object plugin) */
28571 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
28572 +#include "../../debug.h"
28573 +#include "../plugin_header.h"
28574 +#include "../plugin.h"
28575 +#include "../file/cryptcompress.h"
28576 +
28577 +#include <linux/types.h>
28578 +
28579 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
28580 +
28581 +static struct crypto_hash * alloc_sha256 (void)
28582 +{
28583 +#if REISER4_SHA256
28584 + return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
28585 +#else
28586 + warning("edward-1418", "sha256 unsupported");
28587 + return ERR_PTR(-EINVAL);
28588 +#endif
28589 +}
28590 +
28591 +static void free_sha256 (struct crypto_hash * tfm)
28592 +{
28593 +#if REISER4_SHA256
28594 + crypto_free_hash(tfm);
28595 +#endif
28596 + return;
28597 +}
28598 +
28599 +/* digest plugins */
28600 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
28601 + [SHA256_32_DIGEST_ID] = {
28602 + .h = {
28603 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
28604 + .id = SHA256_32_DIGEST_ID,
28605 + .pops = NULL,
28606 + .label = "sha256_32",
28607 + .desc = "sha256_32 digest transform",
28608 + .linkage = {NULL, NULL}
28609 + },
28610 + .fipsize = sizeof(__u32),
28611 + .alloc = alloc_sha256,
28612 + .free = free_sha256
28613 + }
28614 +};
28615 +
28616 +/*
28617 + Local variables:
28618 + c-indentation-style: "K&R"
28619 + mode-name: "LC"
28620 + c-basic-offset: 8
28621 + tab-width: 8
28622 + fill-column: 120
28623 + scroll-step: 1
28624 + End:
28625 +*/
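The .fipsize field above says that only sizeof(__u32) bytes of the digest are used as the key fingerprint. A hypothetical caller sketch (editor's illustration; the helper is invented, but crypto_alloc_hash/crypto_hash_digest are the 2.6.22-era kernel hash API that alloc_sha256() above relies on):

#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/err.h>
#include <linux/string.h>

static int make_fingerprint(const u8 *data, unsigned int len, __u32 *fip)
{
	struct crypto_hash *tfm;
	struct hash_desc desc;
	struct scatterlist sg;
	u8 out[32];	/* full sha256 digest */
	int ret;

	tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);
	desc.tfm = tfm;
	desc.flags = 0;
	sg_init_one(&sg, (u8 *)data, len);
	ret = crypto_hash_digest(&desc, &sg, len, out);
	if (ret == 0)
		memcpy(fip, out, sizeof(__u32));	/* .fipsize bytes */
	crypto_free_hash(tfm);
	return ret;
}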
28626 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.22/fs/reiser4/plugin/dir/dir.h
28627 --- linux-2.6.22.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
28628 +++ linux-2.6.22/fs/reiser4/plugin/dir/dir.h 2007-07-29 00:25:34.900702689 +0400
28629 @@ -0,0 +1,36 @@
28630 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28631 + * reiser4/README */
28632 +
28633 +/* this file contains declarations of methods implementing directory plugins */
28634 +
28635 +#if !defined( __REISER4_DIR_H__ )
28636 +#define __REISER4_DIR_H__
28637 +
28638 +/*#include "../../key.h"
28639 +
28640 +#include <linux/fs.h>*/
28641 +
28642 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
28643 +
28644 +/* "hashed" directory methods of dir plugin */
28645 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
28646 + reiser4_key *);
28647 +
28648 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
28649 +
28650 +/* "seekable" directory methods of dir plugin */
28651 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
28652 + reiser4_key *);
28653 +
28654 +/* __REISER4_DIR_H__ */
28655 +#endif
28656 +
28657 +/*
28658 + Local variables:
28659 + c-indentation-style: "K&R"
28660 + mode-name: "LC"
28661 + c-basic-offset: 8
28662 + tab-width: 8
28663 + fill-column: 120
28664 + End:
28665 +*/
28666 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.22/fs/reiser4/plugin/dir/hashed_dir.c
28667 --- linux-2.6.22.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
28668 +++ linux-2.6.22/fs/reiser4/plugin/dir/hashed_dir.c 2007-07-29 00:25:34.900702689 +0400
28669 @@ -0,0 +1,81 @@
28670 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28671 + * reiser4/README */
28672 +
28673 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
28674 + names to files. */
28675 +
28676 +/*
28677 + * A hashed directory logically consists of persistent directory
28678 + * entries. A directory entry is a pair of a file name and the stat-data key
28679 + * of the file that has this name in the given directory.
28680 + *
28681 + * Directory entries are stored in the tree in the form of directory
28682 + * items. A directory item should implement the dir_entry_ops portion of the
28683 + * item plugin interface (see plugin/item/item.h). The hashed directory
28684 + * interacts with the directory item plugin exclusively through dir_entry_ops operations.
28685 + *
28686 + * Currently there are two implementations of directory items: "simple
28687 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
28688 + * (plugin/item/cde.[ch]) with the latter being the default.
28689 + *
28690 + * There is, however, one delicate way in which the directory code interacts
28691 + * with the item plugin: key assignment policy. A key for a directory item is
28692 + * chosen by the directory code and, as described in kassign.c, this key contains
28693 + * a portion of the file name. The directory item uses this knowledge to avoid
28694 + * storing that portion of the file name twice: in the key and in the item body.
28695 + *
28696 + */
28697 +
28698 +#include "../../inode.h"
28699 +
28700 +void complete_entry_key(const struct inode *, const char *name,
28701 + int len, reiser4_key * result);
28702 +
28703 +/* this is the implementation of the build_entry_key method of the dir
28704 + plugin for HASHED_DIR_PLUGIN_ID
28705 + */
28706 +void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
28707 + * (or will be) in.*/
28708 + const struct qstr *qname, /* name of file referenced
28709 + * by this entry */
28710 + reiser4_key * result /* resulting key of directory
28711 + * entry */ )
28712 +{
28713 + const char *name;
28714 + int len;
28715 +
28716 + assert("nikita-1139", dir != NULL);
28717 + assert("nikita-1140", qname != NULL);
28718 + assert("nikita-1141", qname->name != NULL);
28719 + assert("nikita-1142", result != NULL);
28720 +
28721 + name = qname->name;
28722 + len = qname->len;
28723 +
28724 + assert("nikita-2867", strlen(name) == len);
28725 +
28726 + reiser4_key_init(result);
28727 + /* locality of directory entry's key is objectid of parent
28728 + directory */
28729 + set_key_locality(result, get_inode_oid(dir));
28730 + /* minor packing locality is constant */
28731 + set_key_type(result, KEY_FILE_NAME_MINOR);
28732 + /* dot is a special case --- we always want it to be the first entry
28733 + in a directory. Actually, we just want it to be the smallest
28734 + directory entry.
28735 + */
28736 + if (len == 1 && name[0] == '.')
28737 + return;
28738 +
28739 + /* initialize part of entry key which depends on file name */
28740 + complete_entry_key(dir, name, len, result);
28741 +}
28742 +
28743 +/* Local variables:
28744 + c-indentation-style: "K&R"
28745 + mode-name: "LC"
28746 + c-basic-offset: 8
28747 + tab-width: 8
28748 + fill-column: 120
28749 + End:
28750 +*/
28751 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.22/fs/reiser4/plugin/dir/Makefile
28752 --- linux-2.6.22.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
28753 +++ linux-2.6.22/fs/reiser4/plugin/dir/Makefile 2007-07-29 00:25:34.900702689 +0400
28754 @@ -0,0 +1,5 @@
28755 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
28756 +
28757 +dir_plugins-objs := \
28758 + hashed_dir.o \
28759 + seekable_dir.o
28760 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.22/fs/reiser4/plugin/dir/seekable_dir.c
28761 --- linux-2.6.22.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
28762 +++ linux-2.6.22/fs/reiser4/plugin/dir/seekable_dir.c 2007-07-29 00:25:34.904703724 +0400
28763 @@ -0,0 +1,46 @@
28764 +/* Copyright 2005 by Hans Reiser, licensing governed by
28765 + * reiser4/README */
28766 +
28767 +#include "../../inode.h"
28768 +
28769 +/* this is the implementation of the build_entry_key method of the dir
28770 + plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID.
28771 + This is for directories where we want readdir() to be repeatable and
28772 + restartable even with a 32-bit user-level struct dirent (readdir(3)).
28773 +*/
28774 +void
28775 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
28776 + reiser4_key * result)
28777 +{
28778 + oid_t objectid;
28779 +
28780 + assert("nikita-2283", dir != NULL);
28781 + assert("nikita-2284", name != NULL);
28782 + assert("nikita-2285", name->name != NULL);
28783 + assert("nikita-2286", result != NULL);
28784 +
28785 + reiser4_key_init(result);
28786 + /* locality of directory entry's key is objectid of parent
28787 + directory */
28788 + set_key_locality(result, get_inode_oid(dir));
28789 + /* minor packing locality is constant */
28790 + set_key_type(result, KEY_FILE_NAME_MINOR);
28791 + /* dot is a special case --- we always want it to be the first entry
28792 + in a directory. Actually, we just want it to be the smallest
28793 + directory entry.
28794 + */
28795 + if ((name->len == 1) && (name->name[0] == '.'))
28796 + return;
28797 +
28798 + /* objectid of key is 31 lowest bits of hash. */
28799 + objectid =
28800 + inode_hash_plugin(dir)->hash(name->name,
28801 + (int)name->len) & 0x7fffffff;
28802 +
28803 + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
28804 + set_key_objectid(result, objectid);
28805 +
28806 + /* offset is always 0. */
28807 + set_key_offset(result, (__u64) 0);
28808 + return;
28809 +}
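Masking the name hash with 0x7fffffff above keeps the objectid inside KEY_OBJECTID_MASK (the nikita-2303 assertion) and, because readdir positions are derived from this key, keeps the cookie non-negative even as a 32-bit signed offset. A worked example (editor's illustration; the hash value and the __u64 return type assumed for ->hash() are invented):

static oid_t objectid_from_hash(__u64 h)
{
	/* e.g. h = 0x123456789abcdef0: low 32 bits are 0x9abcdef0,
	   clearing bit 31 gives 0x1abcdef0 */
	return h & 0x7fffffff;
}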
28810 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.22/fs/reiser4/plugin/dir_plugin_common.c
28811 --- linux-2.6.22.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
28812 +++ linux-2.6.22/fs/reiser4/plugin/dir_plugin_common.c 2007-07-29 00:25:34.904703724 +0400
28813 @@ -0,0 +1,872 @@
28814 +/* Copyright 2005 by Hans Reiser, licensing governed by
28815 + reiser4/README */
28816 +
28817 +/* this file contains typical implementations for most of the methods of
28818 + the directory plugin
28819 +*/
28820 +
28821 +#include "../inode.h"
28822 +
28823 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
28824 + lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
28825 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
28826 +void check_light_weight(struct inode *inode, struct inode *parent);
28827 +
28828 +/* this is the common implementation of the get_parent method of the dir
28829 + plugin; it is used by the NFS kernel server to "climb" up the directory
28830 + tree to check permissions
28831 + */
28832 +struct dentry *get_parent_common(struct inode *child)
28833 +{
28834 + struct super_block *s;
28835 + struct inode *parent;
28836 + struct dentry dotdot;
28837 + struct dentry *dentry;
28838 + reiser4_key key;
28839 + int result;
28840 +
28841 + /*
28842 + * lookup dotdot entry.
28843 + */
28844 +
28845 + s = child->i_sb;
28846 + memset(&dotdot, 0, sizeof(dotdot));
28847 + dotdot.d_name.name = "..";
28848 + dotdot.d_name.len = 2;
28849 + dotdot.d_op = &get_super_private(s)->ops.dentry;
28850 +
28851 + result = reiser4_lookup_name(child, &dotdot, &key);
28852 + if (result != 0)
28853 + return ERR_PTR(result);
28854 +
28855 + parent = reiser4_iget(s, &key, 1);
28856 + if (!IS_ERR(parent)) {
28857 + /*
28858 + * FIXME-NIKITA dubious: attributes are inherited from @child
28859 + * to @parent. But:
28860 + *
28861 +	 * (*) this is the only thing we can do
28862 +	 *
28863 +	 * (*) attributes of a light-weight object are inherited
28864 +	 * from the parent through which the object was first looked up,
28865 +	 * so it is ambiguous anyway.
28866 + *
28867 + */
28868 + check_light_weight(parent, child);
28869 + reiser4_iget_complete(parent);
28870 + dentry = d_alloc_anon(parent);
28871 + if (dentry == NULL) {
28872 + iput(parent);
28873 + dentry = ERR_PTR(RETERR(-ENOMEM));
28874 + } else
28875 + dentry->d_op = &get_super_private(s)->ops.dentry;
28876 + } else if (PTR_ERR(parent) == -ENOENT)
28877 + dentry = ERR_PTR(RETERR(-ESTALE));
28878 + else
28879 + dentry = (void *)parent;
28880 + return dentry;
28881 +}
28882 +
28883 +/* this is the common implementation of the is_name_acceptable method of the
28884 + dir plugin
28885 + */
28886 +int is_name_acceptable_common(const struct inode *inode, /* directory to check */
28887 + const char *name UNUSED_ARG, /* name to check */
28888 + int len /* @name's length */ )
28889 +{
28890 + assert("nikita-733", inode != NULL);
28891 + assert("nikita-734", name != NULL);
28892 + assert("nikita-735", len > 0);
28893 +
28894 + return len <= reiser4_max_filename_len(inode);
28895 +}
28896 +
28897 +/* there is no common implementation of the build_entry_key method of the dir
28898 + plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
28899 + plugin/dir/seekable_dir.c:build_entry_key_seekable() for examples
28900 +*/
28901 +
28902 +/* this is the common implementation of the build_readdir_key method of the
28903 + dir plugin;
28904 + see reiser4_readdir_common for more details
28905 +*/
28906 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
28907 + reiser4_key * result /* where to store key */ )
28908 +{
28909 + reiser4_file_fsdata *fdata;
28910 + struct inode *inode;
28911 +
28912 + assert("nikita-1361", dir != NULL);
28913 + assert("nikita-1362", result != NULL);
28914 + assert("nikita-1363", dir->f_dentry != NULL);
28915 + inode = dir->f_dentry->d_inode;
28916 + assert("nikita-1373", inode != NULL);
28917 +
28918 + fdata = reiser4_get_file_fsdata(dir);
28919 + if (IS_ERR(fdata))
28920 + return PTR_ERR(fdata);
28921 + assert("nikita-1364", fdata != NULL);
28922 + return extract_key_from_de_id(get_inode_oid(inode),
28923 + &fdata->dir.readdir.position.
28924 + dir_entry_key, result);
28925 +
28926 +}
28927 +
28928 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
28929 + int adj);
28930 +
28931 +/* this is the common implementation of the add_entry method of the dir plugin
28932 +*/
28933 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
28934 + * in */
28935 + struct dentry *where, /* new name */
28936 + reiser4_object_create_data * data, /* parameters of
28937 + * new object */
28938 + reiser4_dir_entry_desc * entry /* parameters of
28939 + * new directory
28940 + * entry */)
28941 +{
28942 + int result;
28943 + coord_t *coord;
28944 + lock_handle lh;
28945 + struct reiser4_dentry_fsdata *fsdata;
28946 + reiser4_block_nr reserve;
28947 +
28948 + assert("nikita-1114", object != NULL);
28949 + assert("nikita-1250", where != NULL);
28950 +
28951 + fsdata = reiser4_get_dentry_fsdata(where);
28952 + if (unlikely(IS_ERR(fsdata)))
28953 + return PTR_ERR(fsdata);
28954 +
28955 + reserve = inode_dir_plugin(object)->estimate.add_entry(object);
28956 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
28957 + return RETERR(-ENOSPC);
28958 +
28959 + init_lh(&lh);
28960 + coord = &fsdata->dec.entry_coord;
28961 + coord_clear_iplug(coord);
28962 +
28963 +	/* check for this entry in a directory. This is a plugin method. */
28964 + result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
28965 + entry);
28966 + if (likely(result == -ENOENT)) {
28967 + /* add new entry. Just pass control to the directory
28968 + item plugin. */
28969 + assert("nikita-1709", inode_dir_item_plugin(object));
28970 + assert("nikita-2230", coord->node == lh.node);
28971 + reiser4_seal_done(&fsdata->dec.entry_seal);
28972 + result =
28973 + inode_dir_item_plugin(object)->s.dir.add_entry(object,
28974 + coord, &lh,
28975 + where,
28976 + entry);
28977 + if (result == 0) {
28978 + reiser4_adjust_dir_file(object, where,
28979 + fsdata->dec.pos + 1, +1);
28980 + INODE_INC_FIELD(object, i_size);
28981 + }
28982 + } else if (result == 0) {
28983 + assert("nikita-2232", coord->node == lh.node);
28984 + result = RETERR(-EEXIST);
28985 + }
28986 + done_lh(&lh);
28987 +
28988 + return result;
28989 +}
28990 +
28991 +/**
28992 + * rem_entry - remove entry from directory item
28993 + * @dir:
28994 + * @dentry:
28995 + * @entry:
28996 + * @coord:
28997 + * @lh:
28998 + *
28999 + * Checks that coordinate @coord is set properly and calls the item plugin
29000 + * method to cut the entry.
29001 + */
29002 +static int
29003 +rem_entry(struct inode *dir, struct dentry *dentry,
29004 + reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29005 +{
29006 + item_plugin *iplug;
29007 + struct inode *child;
29008 +
29009 + iplug = inode_dir_item_plugin(dir);
29010 + child = dentry->d_inode;
29011 + assert("nikita-3399", child != NULL);
29012 +
29013 + /* check that we are really destroying an entry for @child */
29014 + if (REISER4_DEBUG) {
29015 + int result;
29016 + reiser4_key key;
29017 +
29018 + result = iplug->s.dir.extract_key(coord, &key);
29019 + if (result != 0)
29020 + return result;
29021 + if (get_key_objectid(&key) != get_inode_oid(child)) {
29022 + warning("nikita-3397",
29023 + "rem_entry: %#llx != %#llx\n",
29024 + get_key_objectid(&key),
29025 + (unsigned long long)get_inode_oid(child));
29026 + return RETERR(-EIO);
29027 + }
29028 + }
29029 + return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29030 +}
29031 +
29032 +/**
29033 + * reiser4_rem_entry_common - remove entry from a directory
29034 + * @dir: directory to remove entry from
29035 + * @where: name that is being removed
29036 + * @entry: description of entry being removed
29037 + *
29038 + * This is common implementation of rem_entry method of dir plugin.
29039 + */
29040 +int reiser4_rem_entry_common(struct inode *dir,
29041 + struct dentry *dentry,
29042 + reiser4_dir_entry_desc *entry)
29043 +{
29044 + int result;
29045 + coord_t *coord;
29046 + lock_handle lh;
29047 + struct reiser4_dentry_fsdata *fsdata;
29048 + __u64 tograb;
29049 +
29050 + assert("nikita-1124", dir != NULL);
29051 + assert("nikita-1125", dentry != NULL);
29052 +
29053 + tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29054 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29055 + if (result != 0)
29056 + return RETERR(-ENOSPC);
29057 +
29058 + init_lh(&lh);
29059 +
29060 +	/* check for this entry in a directory. This is a plugin method. */
29061 + result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29062 + fsdata = reiser4_get_dentry_fsdata(dentry);
29063 + if (IS_ERR(fsdata)) {
29064 + done_lh(&lh);
29065 + return PTR_ERR(fsdata);
29066 + }
29067 +
29068 + coord = &fsdata->dec.entry_coord;
29069 +
29070 + assert("nikita-3404",
29071 + get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29072 + dir->i_size <= 1);
29073 +
29074 + coord_clear_iplug(coord);
29075 + if (result == 0) {
29076 + /* remove entry. Just pass control to the directory item
29077 + plugin. */
29078 + assert("vs-542", inode_dir_item_plugin(dir));
29079 + reiser4_seal_done(&fsdata->dec.entry_seal);
29080 + reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29081 + result =
29082 + WITH_COORD(coord,
29083 + rem_entry(dir, dentry, entry, coord, &lh));
29084 + if (result == 0) {
29085 + if (dir->i_size >= 1)
29086 + INODE_DEC_FIELD(dir, i_size);
29087 + else {
29088 + warning("nikita-2509", "Dir %llu is runt",
29089 + (unsigned long long)
29090 + get_inode_oid(dir));
29091 + result = RETERR(-EIO);
29092 + }
29093 +
29094 + assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
29095 + dentry->d_inode->i_size != 2 ||
29096 + inode_dir_plugin(dentry->d_inode) == NULL);
29097 + }
29098 + }
29099 + done_lh(&lh);
29100 +
29101 + return result;
29102 +}
29103 +
29104 +static reiser4_block_nr estimate_init(struct inode *parent,
29105 + struct inode *object);
29106 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
29107 +
29108 +/* this is common implementation of init method of dir plugin
29109 + create "." and ".." entries
29110 +*/
29111 +int reiser4_dir_init_common(struct inode *object, /* new directory */
29112 + struct inode *parent, /* parent directory */
29113 + reiser4_object_create_data * data /* info passed
29114 + * to us, this
29115 + * is filled by
29116 + * reiser4()
29117 + * syscall in
29118 + * particular */)
29119 +{
29120 + reiser4_block_nr reserve;
29121 +
29122 + assert("nikita-680", object != NULL);
29123 + assert("nikita-681", S_ISDIR(object->i_mode));
29124 + assert("nikita-682", parent != NULL);
29125 + assert("nikita-684", data != NULL);
29126 + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29127 + assert("nikita-687", object->i_mode & S_IFDIR);
29128 +
29129 + reserve = estimate_init(parent, object);
29130 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29131 + return RETERR(-ENOSPC);
29132 +
29133 + return create_dot_dotdot(object, parent);
29134 +}
29135 +
29136 +/* this is common implementation of done method of dir plugin
29137 + remove "." entry
29138 +*/
29139 +int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
29140 +{
29141 + int result;
29142 + reiser4_block_nr reserve;
29143 + struct dentry goodby_dots;
29144 + reiser4_dir_entry_desc entry;
29145 +
29146 + assert("nikita-1449", object != NULL);
29147 +
29148 + if (reiser4_inode_get_flag(object, REISER4_NO_SD))
29149 + return 0;
29150 +
29151 + /* of course, this can be rewritten to sweep everything in one
29152 + reiser4_cut_tree(). */
29153 + memset(&entry, 0, sizeof entry);
29154 +
29155 + /* FIXME: this done method is called from reiser4_delete_dir_common,
29156 + * which has already reserved space */
29157 + reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29158 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29159 + return RETERR(-ENOSPC);
29160 +
29161 + memset(&goodby_dots, 0, sizeof goodby_dots);
29162 + entry.obj = goodby_dots.d_inode = object;
29163 + goodby_dots.d_name.name = ".";
29164 + goodby_dots.d_name.len = 1;
29165 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29166 + reiser4_free_dentry_fsdata(&goodby_dots);
29167 + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29168 + /* only worth a warning
29169 +
29170 + "values of \ eB\ f will give rise to dom!\n"
29171 + -- v6src/s2/mv.c:89
29172 + */
29173 + warning("nikita-2252", "Cannot remove dot of %lli: %i",
29174 + (unsigned long long)get_inode_oid(object), result);
29175 + return 0;
29176 +}
29177 +
29178 +/* this is common implementation of attach method of dir plugin
29179 +*/
29180 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
29181 + struct inode *parent UNUSED_ARG)
29182 +{
29183 + assert("nikita-2647", child != NULL);
29184 + assert("nikita-2648", parent != NULL);
29185 +
29186 + return 0;
29187 +}
29188 +
29189 +/* this is common implementation of detach method of dir plugin
29190 + remove "..", decrease nlink on parent
29191 +*/
29192 +int reiser4_detach_common(struct inode *object, struct inode *parent)
29193 +{
29194 + int result;
29195 + struct dentry goodby_dots;
29196 + reiser4_dir_entry_desc entry;
29197 +
29198 + assert("nikita-2885", object != NULL);
29199 + assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
29200 +
29201 + memset(&entry, 0, sizeof entry);
29202 +
29203 + /* NOTE-NIKITA this only works if @parent is -the- parent of
29204 + @object, viz. object whose key is stored in dotdot
29205 + entry. Wouldn't work with hard-links on directories. */
29206 + memset(&goodby_dots, 0, sizeof goodby_dots);
29207 + entry.obj = goodby_dots.d_inode = parent;
29208 + goodby_dots.d_name.name = "..";
29209 + goodby_dots.d_name.len = 2;
29210 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29211 + reiser4_free_dentry_fsdata(&goodby_dots);
29212 + if (result == 0) {
29213 + /* the dot should be the only entry remaining at this time... */
29214 + assert("nikita-3400",
29215 + object->i_size == 1 && object->i_nlink <= 2);
29216 +#if 0
29217 + /* and, together with the only name a directory can have, they
29218 + * provide the last 2 remaining references. If we get
29219 + * here as part of error handling during mkdir, @object
29220 + * possibly has no name yet, so its nlink == 1. If we get here
29221 + * from rename (targeting an empty directory), it already has
29222 + * no name, so its nlink == 1. */
29223 + assert("nikita-3401",
29224 + object->i_nlink == 2 || object->i_nlink == 1);
29225 +#endif
29226 +
29227 + /* decrement nlink of the directory the removed ".."
29228 + pointed to */
29229 + reiser4_del_nlink(parent, NULL, 0);
29230 + }
29231 + return result;
29232 +}
29233 +
29234 +/* this is common implementation of estimate.add_entry method of
29235 + dir plugin
29236 + estimation for adding an entry, assuming the entry is inserted as a
29237 + unit into an existing item
29238 +*/
29239 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29240 +{
29241 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
29242 +}
29243 +
29244 +/* this is common implementation of estimate.rem_entry method of dir
29245 + plugin
29246 +*/
29247 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29248 +{
29249 + return estimate_one_item_removal(reiser4_tree_by_inode(inode));
29250 +}
29251 +
29252 +/* this is common implementation of estimate.unlink method of dir
29253 + plugin
29254 +*/
29255 +reiser4_block_nr
29256 +dir_estimate_unlink_common(const struct inode * parent,
29257 + const struct inode * object)
29258 +{
29259 + reiser4_block_nr res;
29260 +
29261 + /* hashed_rem_entry(object) */
29262 + res = inode_dir_plugin(object)->estimate.rem_entry(object);
29263 + /* del_nlink(parent) */
29264 + res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29265 +
29266 + return res;
29267 +}
29268 +
29269 +/*
29270 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29271 + * methods: if @inode is a light-weight file, setup its credentials
29272 + * that are not stored in the stat-data in this case
29273 + */
29274 +void check_light_weight(struct inode *inode, struct inode *parent)
29275 +{
29276 + if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29277 + inode->i_uid = parent->i_uid;
29278 + inode->i_gid = parent->i_gid;
29279 + /* clear the light-weight flag. If the inode is later read
29280 + via any other name, the [ug]id won't change. */
29281 + reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29282 + }
29283 +}
29284 +
29285 +/* looks for the name specified in @dentry in directory @parent; if the name
29286 + is found, the key of the object the entry points to is stored in @key */
29287 +int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
29288 + * name in */
29289 + struct dentry *dentry, /* name to look for */
29290 + reiser4_key * key /* place to store key */ )
29291 +{
29292 + int result;
29293 + coord_t *coord;
29294 + lock_handle lh;
29295 + const char *name;
29296 + int len;
29297 + reiser4_dir_entry_desc entry;
29298 + struct reiser4_dentry_fsdata *fsdata;
29299 +
29300 + assert("nikita-1247", parent != NULL);
29301 + assert("nikita-1248", dentry != NULL);
29302 + assert("nikita-1123", dentry->d_name.name != NULL);
29303 + assert("vs-1486",
29304 + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29305 +
29306 + name = dentry->d_name.name;
29307 + len = dentry->d_name.len;
29308 +
29309 + if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29310 + /* some arbitrary error code to return */
29311 + return RETERR(-ENAMETOOLONG);
29312 +
29313 + fsdata = reiser4_get_dentry_fsdata(dentry);
29314 + if (IS_ERR(fsdata))
29315 + return PTR_ERR(fsdata);
29316 +
29317 + coord = &fsdata->dec.entry_coord;
29318 + coord_clear_iplug(coord);
29319 + init_lh(&lh);
29320 +
29321 + /* find entry in a directory. This is a plugin method. */
29322 + result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
29323 + &entry);
29324 + if (result == 0) {
29325 + /* entry was found, extract object key from it. */
29326 + result =
29327 + WITH_COORD(coord,
29328 + item_plugin_by_coord(coord)->s.dir.
29329 + extract_key(coord, key));
29330 + }
29331 + done_lh(&lh);
29332 + return result;
29333 +
29334 +}
29335 +
29336 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
29337 +static reiser4_block_nr
29338 +estimate_init(struct inode *parent, struct inode *object)
29339 +{
29340 + reiser4_block_nr res = 0;
29341 +
29342 + assert("vpf-321", parent != NULL);
29343 + assert("vpf-322", object != NULL);
29344 +
29345 + /* hashed_add_entry(object) */
29346 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29347 + /* reiser4_add_nlink(object) */
29348 + res += inode_file_plugin(object)->estimate.update(object);
29349 + /* hashed_add_entry(object) */
29350 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29351 + /* reiser4_add_nlink(parent) */
29352 + res += inode_file_plugin(parent)->estimate.update(parent);
29353 +
29354 + return res;
29355 +}
29356 +
29357 +/* helper function for reiser4_dir_init_common(). Create "." and ".." */
29358 +static int create_dot_dotdot(struct inode *object /* object to create dot and
29359 + * dotdot for */ ,
29360 + struct inode *parent /* parent of @object */)
29361 +{
29362 + int result;
29363 + struct dentry dots_entry;
29364 + reiser4_dir_entry_desc entry;
29365 +
29366 + assert("nikita-688", object != NULL);
29367 + assert("nikita-689", S_ISDIR(object->i_mode));
29368 + assert("nikita-691", parent != NULL);
29369 +
29370 + /* We store dot and dotdot as normal directory entries. This is
29371 + not necessary, because almost all information stored in them
29372 + is already in the stat-data of the directory; the only thing
29373 + missing is the objectid of the grand-parent directory, which can
29374 + easily be added there as an extension.
29375 +
29376 + But it is done the way it is done, because not storing dot
29377 + and dotdot will lead to the following complications:
29378 +
29379 + . special case handling in ->lookup().
29380 + . addition of another extension to the sd.
29381 + . dependency on key allocation policy for stat data.
29382 +
29383 + */
29384 +
29385 + memset(&entry, 0, sizeof entry);
29386 + memset(&dots_entry, 0, sizeof dots_entry);
29387 + entry.obj = dots_entry.d_inode = object;
29388 + dots_entry.d_name.name = ".";
29389 + dots_entry.d_name.len = 1;
29390 + result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
29391 + reiser4_free_dentry_fsdata(&dots_entry);
29392 +
29393 + if (result == 0) {
29394 + result = reiser4_add_nlink(object, object, 0);
29395 + if (result == 0) {
29396 + entry.obj = dots_entry.d_inode = parent;
29397 + dots_entry.d_name.name = "..";
29398 + dots_entry.d_name.len = 2;
29399 + result = reiser4_add_entry_common(object,
29400 + &dots_entry, NULL, &entry);
29401 + reiser4_free_dentry_fsdata(&dots_entry);
29402 + /* if creation of ".." failed, iput() will delete
29403 + object with ".". */
29404 + if (result == 0) {
29405 + result = reiser4_add_nlink(parent, object, 0);
29406 + if (result != 0)
29407 + /*
29408 + * if we failed to bump i_nlink, try
29409 + * to remove ".."
29410 + */
29411 + reiser4_detach_common(object, parent);
29412 + }
29413 + }
29414 + }
29415 +
29416 + if (result != 0) {
29417 + /*
29418 + * in the case of error, at least update the stat-data so that
29419 + * ->i_nlink updates are not left lingering.
29420 + */
29421 + reiser4_update_sd(object);
29422 + reiser4_update_sd(parent);
29423 + }
29424 +
29425 + return result;
29426 +}
29427 +
29428 +/*
29429 + * return 0 iff @coord contains a directory entry for the file with the name
29430 + * @name.
29431 + */
29432 +static int
29433 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
29434 +{
29435 + item_plugin *iplug;
29436 + char buf[DE_NAME_BUF_LEN];
29437 +
29438 + iplug = item_plugin_by_coord(coord);
29439 + if (iplug == NULL) {
29440 + warning("nikita-1135", "Cannot get item plugin");
29441 + print_coord("coord", coord, 1);
29442 + return RETERR(-EIO);
29443 + } else if (item_id_by_coord(coord) !=
29444 + item_id_by_plugin(inode_dir_item_plugin(dir))) {
29445 + /* item id of the current item does not match the id of the
29446 + items the directory is built of */
29447 + warning("nikita-1136", "Wrong item plugin");
29448 + print_coord("coord", coord, 1);
29449 + return RETERR(-EIO);
29450 + }
29451 + assert("nikita-1137", iplug->s.dir.extract_name);
29452 +
29453 + /* Compare name stored in this entry with name we are looking for.
29454 +
29455 + NOTE-NIKITA Here should go code for support of something like
29456 + unicode, code tables, etc.
29457 + */
29458 + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
29459 +}
29460 +
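A note on the return convention: check_item() yields 0 on a name match and
exactly 1 otherwise, because !!strcmp() collapses strcmp()'s arbitrary nonzero
result into 1. reiser4_find_entry() below relies on this to tell "found" (0)
from "hash collision, keep scanning" (positive) from errors (negative). A tiny
standalone illustration in plain C (not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* strcmp() returns an arbitrary sign/magnitude; !! maps it to {0, 1} */
            printf("%d\n", !!strcmp("foo", "foo")); /* 0: names match */
            printf("%d\n", !!strcmp("foo", "bar")); /* 1: names differ */
            return 0;
    }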
29461 +static int
29462 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
29463 +{
29464 + return WITH_COORD(coord, check_item(dir, coord, name->name));
29465 +}
29466 +
29467 +/*
29468 + * argument package used by entry_actor to scan entries with identical keys.
29469 + */
29470 +struct entry_actor_args {
29471 + /* name we are looking for */
29472 + const char *name;
29473 + /* key of directory entry. entry_actor() scans through sequence of
29474 + * items/units having the same key */
29475 + reiser4_key *key;
29476 + /* how many entries with duplicate keys have been scanned so far. */
29477 + int non_uniq;
29478 +#if REISER4_USE_COLLISION_LIMIT
29479 + /* scan limit */
29480 + int max_non_uniq;
29481 +#endif
29482 + /* return parameter: set to true if ->name wasn't found */
29483 + int not_found;
29484 + /* what type of lock to take when moving to the next node during
29485 + * scan */
29486 + znode_lock_mode mode;
29487 +
29488 + /* last coord that was visited during scan */
29489 + coord_t last_coord;
29490 + /* last node locked during scan */
29491 + lock_handle last_lh;
29492 + /* inode of directory */
29493 + const struct inode *inode;
29494 +};
29495 +
29496 +/* Function called by reiser4_find_entry() to look for given name
29497 + in the directory. */
29498 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
29499 + coord_t * coord /* current coord */ ,
29500 + lock_handle * lh /* current lock handle */ ,
29501 + void *entry_actor_arg /* argument to scan */ )
29502 +{
29503 + reiser4_key unit_key;
29504 + struct entry_actor_args *args;
29505 +
29506 + assert("nikita-1131", tree != NULL);
29507 + assert("nikita-1132", coord != NULL);
29508 + assert("nikita-1133", entry_actor_arg != NULL);
29509 +
29510 + args = entry_actor_arg;
29511 + ++args->non_uniq;
29512 +#if REISER4_USE_COLLISION_LIMIT
29513 + if (args->non_uniq > args->max_non_uniq) {
29514 + args->not_found = 1;
29515 + /* hash collision overflow. */
29516 + return RETERR(-EBUSY);
29517 + }
29518 +#endif
29519 +
29520 + /*
29521 + * did we just reach the end of the sequence of items/units with
29522 + * identical keys?
29523 + */
29524 + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
29525 + assert("nikita-1791",
29526 + keylt(args->key, unit_key_by_coord(coord, &unit_key)));
29527 + args->not_found = 1;
29528 + args->last_coord.between = AFTER_UNIT;
29529 + return 0;
29530 + }
29531 +
29532 + coord_dup(&args->last_coord, coord);
29533 + /*
29534 + * did the scan just move to the next node?
29535 + */
29536 + if (args->last_lh.node != lh->node) {
29537 + int lock_result;
29538 +
29539 + /*
29540 + * if so, lock new node with the mode requested by the caller
29541 + */
29542 + done_lh(&args->last_lh);
29543 + assert("nikita-1896", znode_is_any_locked(lh->node));
29544 + lock_result = longterm_lock_znode(&args->last_lh, lh->node,
29545 + args->mode, ZNODE_LOCK_HIPRI);
29546 + if (lock_result != 0)
29547 + return lock_result;
29548 + }
29549 + return check_item(args->inode, coord, args->name);
29550 +}
29551 +
29552 +/* Look for given @name within directory @dir.
29553 +
29554 + This is called during lookup, creation and removal of directory
29555 + entries and on reiser4_rename_common
29556 +
29557 + First calculate key that directory entry for @name would have. Search
29558 + for this key in the tree. If such key is found, scan all items with
29559 + the same key, checking name in each directory entry along the way.
29560 +*/
29561 +int reiser4_find_entry(struct inode *dir, /* directory to scan */
29562 + struct dentry *de, /* name to search for */
29563 + lock_handle * lh, /* resulting lock handle */
29564 + znode_lock_mode mode, /* required lock mode */
29565 + reiser4_dir_entry_desc * entry /* parameters of found
29566 + directory entry */)
29567 +{
29568 + const struct qstr *name;
29569 + seal_t *seal;
29570 + coord_t *coord;
29571 + int result;
29572 + __u32 flags;
29573 + struct de_location *dec;
29574 + struct reiser4_dentry_fsdata *fsdata;
29575 +
29576 + assert("nikita-1130", lh != NULL);
29577 + assert("nikita-1128", dir != NULL);
29578 +
29579 + name = &de->d_name;
29580 + assert("nikita-1129", name != NULL);
29581 +
29582 + /* dentry private data doesn't require a lock, because dentry
29583 + manipulations are protected by i_mutex on the parent.
29584 +
29585 + This is not so for inodes, because there is no -the- parent in
29586 + inode case.
29587 + */
29588 + fsdata = reiser4_get_dentry_fsdata(de);
29589 + if (IS_ERR(fsdata))
29590 + return PTR_ERR(fsdata);
29591 + dec = &fsdata->dec;
29592 +
29593 + coord = &dec->entry_coord;
29594 + coord_clear_iplug(coord);
29595 + seal = &dec->entry_seal;
29596 + /* compose key of directory entry for @name */
29597 + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
29598 +
29599 + if (reiser4_seal_is_set(seal)) {
29600 + /* check seal */
29601 + result = reiser4_seal_validate(seal, coord, &entry->key,
29602 + lh, mode, ZNODE_LOCK_LOPRI);
29603 + if (result == 0) {
29604 + /* key was found. Check that it is really item we are
29605 + looking for. */
29606 + result = check_entry(dir, coord, name);
29607 + if (result == 0)
29608 + return 0;
29609 + }
29610 + }
29611 + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
29612 + /*
29613 + * find place in the tree where directory item should be located.
29614 + */
29615 + result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
29616 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
29617 + flags, NULL /*ra_info */ );
29618 + if (result == CBK_COORD_FOUND) {
29619 + struct entry_actor_args arg;
29620 +
29621 + /* fast path: no hash collisions */
29622 + result = check_entry(dir, coord, name);
29623 + if (result == 0) {
29624 + reiser4_seal_init(seal, coord, &entry->key);
29625 + dec->pos = 0;
29626 + } else if (result > 0) {
29627 + /* Iterate through all units with the same keys. */
29628 + arg.name = name->name;
29629 + arg.key = &entry->key;
29630 + arg.not_found = 0;
29631 + arg.non_uniq = 0;
29632 +#if REISER4_USE_COLLISION_LIMIT
29633 + arg.max_non_uniq = max_hash_collisions(dir);
29634 + assert("nikita-2851", arg.max_non_uniq > 1);
29635 +#endif
29636 + arg.mode = mode;
29637 + arg.inode = dir;
29638 + coord_init_zero(&arg.last_coord);
29639 + init_lh(&arg.last_lh);
29640 +
29641 + result = reiser4_iterate_tree
29642 + (reiser4_tree_by_inode(dir),
29643 + coord, lh,
29644 + entry_actor, &arg, mode, 1);
29645 + /* if the end of the tree or of an extent was reached
29646 + during scanning */
29647 + if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
29648 + /* step back */
29649 + done_lh(lh);
29650 +
29651 + result = zload(arg.last_coord.node);
29652 + if (result == 0) {
29653 + coord_clear_iplug(&arg.last_coord);
29654 + coord_dup(coord, &arg.last_coord);
29655 + move_lh(lh, &arg.last_lh);
29656 + result = RETERR(-ENOENT);
29657 + zrelse(arg.last_coord.node);
29658 + --arg.non_uniq;
29659 + }
29660 + }
29661 +
29662 + done_lh(&arg.last_lh);
29663 + if (result == 0)
29664 + reiser4_seal_init(seal, coord, &entry->key);
29665 +
29666 + if (result == 0 || result == -ENOENT) {
29667 + assert("nikita-2580", arg.non_uniq > 0);
29668 + dec->pos = arg.non_uniq - 1;
29669 + }
29670 + }
29671 + } else
29672 + dec->pos = -1;
29673 + return result;
29674 +}
29675 +
29676 +/*
29677 + Local variables:
29678 + c-indentation-style: "K&R"
29679 + mode-name: "LC"
29680 + c-basic-offset: 8
29681 + tab-width: 8
29682 + fill-column: 120
29683 + scroll-step: 1
29684 + End:
29685 +*/
29686 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.c
29687 --- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
29688 +++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.c 2007-07-29 00:25:34.904703724 +0400
29689 @@ -0,0 +1,655 @@
29690 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29691 +
29692 +#include "../../debug.h"
29693 +#include "../../dformat.h"
29694 +#include "../../key.h"
29695 +#include "../node/node.h"
29696 +#include "../space/space_allocator.h"
29697 +#include "disk_format40.h"
29698 +#include "../plugin.h"
29699 +#include "../../txnmgr.h"
29700 +#include "../../jnode.h"
29701 +#include "../../tree.h"
29702 +#include "../../super.h"
29703 +#include "../../wander.h"
29704 +#include "../../inode.h"
29705 +#include "../../ktxnmgrd.h"
29706 +#include "../../status_flags.h"
29707 +
29708 +#include <linux/types.h> /* for __u?? */
29709 +#include <linux/fs.h> /* for struct super_block */
29710 +#include <linux/buffer_head.h>
29711 +
29712 +/* reiser 4.0 default disk layout */
29713 +
29714 +/* Number of free blocks needed to perform release_format40 when fs gets
29715 + mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
29716 + & tx record. */
29717 +#define RELEASE_RESERVED 4
29718 +
29719 +/* The greatest supported format40 version number */
29720 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
29721 +
29722 +/* This flag indicates that backup should be updated
29723 + (the update is performed by fsck) */
29724 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
29725 +
29726 +/* functions to access fields of format40_disk_super_block */
29727 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
29728 +{
29729 + return le64_to_cpu(get_unaligned(&sb->block_count));
29730 +}
29731 +
29732 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
29733 +{
29734 + return le64_to_cpu(get_unaligned(&sb->free_blocks));
29735 +}
29736 +
29737 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
29738 +{
29739 + return le64_to_cpu(get_unaligned(&sb->root_block));
29740 +}
29741 +
29742 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
29743 +{
29744 + return le16_to_cpu(get_unaligned(&sb->tree_height));
29745 +}
29746 +
29747 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
29748 +{
29749 + return le64_to_cpu(get_unaligned(&sb->file_count));
29750 +}
29751 +
29752 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
29753 +{
29754 + return le64_to_cpu(get_unaligned(&sb->oid));
29755 +}
29756 +
29757 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
29758 +{
29759 + return le32_to_cpu(get_unaligned(&sb->mkfs_id));
29760 +}
29761 +
29762 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
29763 +{
29764 + return le64_to_cpu(get_unaligned(&sb->flags));
29765 +}
29766 +
29767 +static __u32 get_format40_version(const format40_disk_super_block * sb)
29768 +{
29769 + return le32_to_cpu(get_unaligned(&sb->version)) &
29770 + ~FORMAT40_UPDATE_BACKUP;
29771 +}
29772 +
29773 +static int update_backup_version(const format40_disk_super_block * sb)
29774 +{
29775 + return (le32_to_cpu(get_unaligned(&sb->version)) &
29776 + FORMAT40_UPDATE_BACKUP);
29777 +}
29778 +
29779 +static int update_disk_version(const format40_disk_super_block * sb)
29780 +{
29781 + return (get_format40_version(sb) < FORMAT40_VERSION);
29782 +}
29783 +
29784 +static int incomplete_compatibility(const format40_disk_super_block * sb)
29785 +{
29786 + return (get_format40_version(sb) > FORMAT40_VERSION);
29787 +}
29788 +
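Taken together, the accessors above show that the 32-bit version field carries
two things: bit 31 is the FORMAT40_UPDATE_BACKUP flag and the low 31 bits are
the format version proper. A minimal standalone sketch of the same masking
(plain C; the sample value is hypothetical):

    #include <stdio.h>
    #include <stdint.h>

    #define UPDATE_BACKUP (UINT32_C(1) << 31)  /* mirrors FORMAT40_UPDATE_BACKUP */

    int main(void)
    {
            /* hypothetical on-disk value: version 4 with a stale backup */
            uint32_t on_disk = UPDATE_BACKUP | 4;

            uint32_t version = on_disk & ~UPDATE_BACKUP;        /* get_format40_version() */
            int backup_stale = (on_disk & UPDATE_BACKUP) != 0;  /* update_backup_version() */

            printf("version=%u backup_stale=%d\n", version, backup_stale); /* 4 1 */
            return 0;
    }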
29789 +static format40_super_info *get_sb_info(struct super_block *super)
29790 +{
29791 + return &get_super_private(super)->u.format40;
29792 +}
29793 +
29794 +static int consult_diskmap(struct super_block *s)
29795 +{
29796 + format40_super_info *info;
29797 + journal_location *jloc;
29798 +
29799 + info = get_sb_info(s);
29800 + jloc = &get_super_private(s)->jloc;
29801 + /* Default format-specific locations, if there is nothing in
29802 + * diskmap */
29803 + jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
29804 + jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
29805 + info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
29806 +#ifdef CONFIG_REISER4_BADBLOCKS
29807 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
29808 + &jloc->footer);
29809 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
29810 + &jloc->header);
29811 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
29812 + &info->loc.super);
29813 +#endif
29814 + return 0;
29815 +}
29816 +
29817 +/* find any valid super block of disk_format40 (even if the first
29818 + super block is destroyed); this will change the block numbers of the
29819 + actual journal header/footer (jh/jf) if needed */
29820 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
29821 + *s)
29822 +{
29823 + struct buffer_head *super_bh;
29824 + format40_disk_super_block *disk_sb;
29825 + format40_super_info *info;
29826 +
29827 + assert("umka-487", s != NULL);
29828 +
29829 + info = get_sb_info(s);
29830 +
29831 + super_bh = sb_bread(s, info->loc.super);
29832 + if (super_bh == NULL)
29833 + return ERR_PTR(RETERR(-EIO));
29834 +
29835 + disk_sb = (format40_disk_super_block *) super_bh->b_data;
29836 + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
29837 + brelse(super_bh);
29838 + return ERR_PTR(RETERR(-EINVAL));
29839 + }
29840 +
29841 + reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
29842 + reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
29843 + le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29844 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29845 +
29846 + return super_bh;
29847 +}
29848 +
29849 +/* find the most recent version of super block. This is called after journal is
29850 + replayed */
29851 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
29852 +{
29853 + /* Here the most recent superblock copy has to be read. However, as
29854 + journal replay isn't complete, we use the
29855 + find_a_disk_format40_super_block() function. */
29856 + return find_a_disk_format40_super_block(s);
29857 +}
29858 +
29859 +static int get_super_jnode(struct super_block *s)
29860 +{
29861 + reiser4_super_info_data *sbinfo = get_super_private(s);
29862 + jnode *sb_jnode;
29863 + int ret;
29864 +
29865 + sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
29866 +
29867 + ret = jload(sb_jnode);
29868 +
29869 + if (ret) {
29870 + reiser4_drop_io_head(sb_jnode);
29871 + return ret;
29872 + }
29873 +
29874 + pin_jnode_data(sb_jnode);
29875 + jrelse(sb_jnode);
29876 +
29877 + sbinfo->u.format40.sb_jnode = sb_jnode;
29878 +
29879 + return 0;
29880 +}
29881 +
29882 +static void done_super_jnode(struct super_block *s)
29883 +{
29884 + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
29885 +
29886 + if (sb_jnode) {
29887 + unpin_jnode_data(sb_jnode);
29888 + reiser4_drop_io_head(sb_jnode);
29889 + }
29890 +}
29891 +
29892 +typedef enum format40_init_stage {
29893 + NONE_DONE = 0,
29894 + CONSULT_DISKMAP,
29895 + FIND_A_SUPER,
29896 + INIT_JOURNAL_INFO,
29897 + INIT_STATUS,
29898 + JOURNAL_REPLAY,
29899 + READ_SUPER,
29900 + KEY_CHECK,
29901 + INIT_OID,
29902 + INIT_TREE,
29903 + JOURNAL_RECOVER,
29904 + INIT_SA,
29905 + INIT_JNODE,
29906 + ALL_DONE
29907 +} format40_init_stage;
29908 +
29909 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
29910 +{
29911 + format40_disk_super_block *sb_copy;
29912 +
29913 + sb_copy = kmalloc(sizeof(format40_disk_super_block),
29914 + reiser4_ctx_gfp_mask_get());
29915 + if (sb_copy == NULL)
29916 + return ERR_PTR(RETERR(-ENOMEM));
29917 + memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
29918 + sizeof(format40_disk_super_block));
29919 + return sb_copy;
29920 +}
29921 +
29922 +static int check_key_format(const format40_disk_super_block *sb_copy)
29923 +{
29924 + if (!equi(REISER4_LARGE_KEY,
29925 + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
29926 + warning("nikita-3228", "Key format mismatch. "
29927 + "Only %s keys are supported.",
29928 + REISER4_LARGE_KEY ? "large" : "small");
29929 + return RETERR(-EINVAL);
29930 + }
29931 + return 0;
29932 +}
29933 +
29934 +/**
29935 + * try_init_format40 - try to initialize a format40 filesystem
29936 + * @super: super block of the filesystem being mounted
29937 + * @stage: set to the last initialization stage completed, for unwinding
29938 + *
29939 + */
29940 +static int try_init_format40(struct super_block *super,
29941 + format40_init_stage *stage)
29942 +{
29943 + int result;
29944 + struct buffer_head *super_bh;
29945 + reiser4_super_info_data *sbinfo;
29946 + format40_disk_super_block *sb_copy;
29947 + tree_level height;
29948 + reiser4_block_nr root_block;
29949 + node_plugin *nplug;
29950 +
29951 + assert("vs-475", super != NULL);
29952 + assert("vs-474", get_super_private(super));
29953 +
29954 + *stage = NONE_DONE;
29955 +
29956 + result = consult_diskmap(super);
29957 + if (result)
29958 + return result;
29959 + *stage = CONSULT_DISKMAP;
29960 +
29961 + super_bh = find_a_disk_format40_super_block(super);
29962 + if (IS_ERR(super_bh))
29963 + return PTR_ERR(super_bh);
29964 + brelse(super_bh);
29965 + *stage = FIND_A_SUPER;
29966 +
29967 + /* ok, we are sure that filesystem format is a format40 format */
29968 +
29969 + /* map jnodes for journal control blocks (header, footer) to disk */
29970 + result = reiser4_init_journal_info(super);
29971 + if (result)
29972 + return result;
29973 + *stage = INIT_JOURNAL_INFO;
29974 +
29975 + /* ok, we are sure that filesystem format is a format40 format */
29976 + /* Now check its state */
29977 + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
29978 + if (result != 0 && result != -EINVAL)
29979 + /* -EINVAL means there is no magic, so probably just old
29980 + * fs. */
29981 + return result;
29982 + *stage = INIT_STATUS;
29983 +
29984 + result = reiser4_status_query(NULL, NULL);
29985 + if (result == REISER4_STATUS_MOUNT_WARN)
29986 + notice("vpf-1363", "Warning: mounting %s with errors.",
29987 + super->s_id);
29988 + if (result == REISER4_STATUS_MOUNT_RO)
29989 + notice("vpf-1364", "Warning: mounting %s with fatal errors,"
29990 + " forcing read-only mount.", super->s_id);
29991 + result = reiser4_journal_replay(super);
29992 + if (result)
29993 + return result;
29994 + *stage = JOURNAL_REPLAY;
29995 +
29996 + super_bh = read_super_block(super);
29997 + if (IS_ERR(super_bh))
29998 + return PTR_ERR(super_bh);
29999 + *stage = READ_SUPER;
30000 +
30001 + /* allocate and make a copy of format40_disk_super_block */
30002 + sb_copy = copy_sb(super_bh);
30003 + brelse(super_bh);
30004 +
30005 + if (IS_ERR(sb_copy))
30006 + return PTR_ERR(sb_copy);
30007 + printk("reiser4: %s: found disk format 4.0.%u.\n",
30008 + super->s_id,
30009 + get_format40_version(sb_copy));
30010 + if (incomplete_compatibility(sb_copy))
30011 + printk("reiser4: Warning: The last completely supported "
30012 + "version of disk format40 is %u. Some objects of "
30013 + "the semantic tree can be unaccessible.\n",
30014 + FORMAT40_VERSION);
30015 + /* make sure that key format of kernel and filesystem match */
30016 + result = check_key_format(sb_copy);
30017 + if (result) {
30018 + kfree(sb_copy);
30019 + return result;
30020 + }
30021 + *stage = KEY_CHECK;
30022 +
30023 + result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30024 + get_format40_oid(sb_copy));
30025 + if (result) {
30026 + kfree(sb_copy);
30027 + return result;
30028 + }
30029 + *stage = INIT_OID;
30030 +
30031 + /* get things necessary to init reiser4_tree */
30032 + root_block = get_format40_root_block(sb_copy);
30033 + height = get_format40_tree_height(sb_copy);
30034 + nplug = node_plugin_by_id(NODE40_ID);
30035 +
30036 + /* initialize reiser4_super_info_data */
30037 + sbinfo = get_super_private(super);
30038 + assert("", sbinfo->tree.super == super);
30039 + /* init reiser4_tree for the filesystem */
30040 + result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
30041 + if (result) {
30042 + kfree(sb_copy);
30043 + return result;
30044 + }
30045 + *stage = INIT_TREE;
30046 +
30047 + /*
30048 + * initialize reiser4_super_info_data with data from format40 super
30049 + * block
30050 + */
30051 + sbinfo->default_uid = 0;
30052 + sbinfo->default_gid = 0;
30053 + sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
30054 + /* number of blocks in filesystem and reserved space */
30055 + reiser4_set_block_count(super, get_format40_block_count(sb_copy));
30056 + sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
30057 + sbinfo->version = get_format40_version(sb_copy);
30058 +
30059 + if (update_backup_version(sb_copy))
30060 + printk("reiser4: Warning: metadata backup is not updated. "
30061 + "Please run 'fsck.reiser4 --fix' on %s.\n",
30062 + super->s_id);
30063 + kfree(sb_copy); /* free only after the last use of sb_copy */
30064 +
30065 + sbinfo->fsuid = 0;
30066 + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
30067 + * are not supported */
30068 + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
30069 + * layout 40 are
30070 + * of one
30071 + * plugin */
30072 + /* sbinfo->tmgr is initialized already */
30073 +
30074 + /* recover sb data which were logged separately from sb block */
30075 +
30076 + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
30077 + * oid_init_allocator() and reiser4_set_free_blocks() with new
30078 + * data. What's the reason to call them above? */
30079 + result = reiser4_journal_recover_sb_data(super);
30080 + if (result != 0)
30081 + return result;
30082 + *stage = JOURNAL_RECOVER;
30083 +
30084 + /*
30085 + * Set the number of used blocks. The number of used blocks is stored
30086 + * neither in the on-disk super block nor in the journal footer blocks.
30087 + * At this moment the actual values of the total block and free block
30088 + * counters are set in the reiser4 super block (in-memory structure),
30089 + * so we can calculate the number of used blocks from them.
30090 + */
30091 + reiser4_set_data_blocks(super,
30092 + reiser4_block_count(super) -
30093 + reiser4_free_blocks(super));
30094 +
30095 +#if REISER4_DEBUG
30096 + sbinfo->min_blocks_used = 16 /* reserved area */ +
30097 + 2 /* super blocks */ +
30098 + 2 /* journal footer and header */ ;
30099 +#endif
30100 +
30101 + /* init disk space allocator */
30102 + result = sa_init_allocator(reiser4_get_space_allocator(super),
30103 + super, NULL);
30104 + if (result)
30105 + return result;
30106 + *stage = INIT_SA;
30107 +
30108 + result = get_super_jnode(super);
30109 + if (result == 0)
30110 + *stage = ALL_DONE;
30111 + return result;
30112 +}
30113 +
30114 +/* plugin->u.format.get_ready */
30115 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30116 +{
30117 + int result;
30118 + format40_init_stage stage;
30119 +
30120 + result = try_init_format40(s, &stage);
30121 + switch (stage) {
30122 + case ALL_DONE:
30123 + assert("nikita-3458", result == 0);
30124 + break;
30125 + case INIT_JNODE:
30126 + done_super_jnode(s);
30127 + case INIT_SA:
30128 + sa_destroy_allocator(reiser4_get_space_allocator(s), s);
30129 + case JOURNAL_RECOVER:
30130 + case INIT_TREE:
30131 + reiser4_done_tree(&get_super_private(s)->tree);
30132 + case INIT_OID:
30133 + case KEY_CHECK:
30134 + case READ_SUPER:
30135 + case JOURNAL_REPLAY:
30136 + case INIT_STATUS:
30137 + reiser4_status_finish();
30138 + case INIT_JOURNAL_INFO:
30139 + reiser4_done_journal_info(s);
30140 + case FIND_A_SUPER:
30141 + case CONSULT_DISKMAP:
30142 + case NONE_DONE:
30143 + break;
30144 + default:
30145 + impossible("nikita-3457", "init stage: %i", stage);
30146 + }
30147 +
30148 + if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30149 + return RETERR(-ENOSPC);
30150 +
30151 + return result;
30152 +}
30153 +
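The error path above leans on deliberate case fall-through: try_init_format40()
records the last stage that completed, and the switch unwinds from that stage
downward, undoing each step in reverse order. A minimal standalone sketch of
the same pattern (hypothetical two-stage init, not reiser4 code):

    #include <stdio.h>
    #include <stdlib.h>

    enum stage { NONE, STAGE_A, STAGE_B, ALL };

    static int try_init(enum stage *s)
    {
            *s = NONE;
            /* step A succeeds... */
            *s = STAGE_A;
            /* ...step B fails: the caller sees how far we got */
            return -1;
    }

    int main(void)
    {
            enum stage s;
            int ret = try_init(&s);

            if (ret != 0) {
                    switch (s) {      /* unwind: each case falls through */
                    case ALL:
                    case STAGE_B:
                            puts("undo B");
                            /* fall through */
                    case STAGE_A:
                            puts("undo A");
                            /* fall through */
                    case NONE:
                            break;
                    }
            }
            return ret ? EXIT_FAILURE : EXIT_SUCCESS;
    }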
30154 +static void pack_format40_super(const struct super_block *s, char *data)
30155 +{
30156 + format40_disk_super_block *super_data =
30157 + (format40_disk_super_block *) data;
30158 +
30159 + reiser4_super_info_data *sbinfo = get_super_private(s);
30160 +
30161 + assert("zam-591", data != NULL);
30162 +
30163 + put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30164 + &super_data->free_blocks);
30165 +
30166 + put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
30167 + &super_data->root_block);
30168 +
30169 + put_unaligned(cpu_to_le64(oid_next(s)),
30170 + &super_data->oid);
30171 +
30172 + put_unaligned(cpu_to_le64(oids_used(s)),
30173 + &super_data->file_count);
30174 +
30175 + put_unaligned(cpu_to_le16(sbinfo->tree.height),
30176 + &super_data->tree_height);
30177 +
30178 + if (update_disk_version(super_data)) {
30179 + __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
30180 +
30181 + put_unaligned(cpu_to_le32(version), &super_data->version);
30182 + }
30183 +}
30184 +
30185 +/* plugin->u.format.log_super
30186 + return a jnode which should be added to transaction when the super block
30187 + gets logged */
30188 +jnode *log_super_format40(struct super_block *s)
30189 +{
30190 + jnode *sb_jnode;
30191 +
30192 + sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30193 +
30194 + jload(sb_jnode);
30195 +
30196 + pack_format40_super(s, jdata(sb_jnode));
30197 +
30198 + jrelse(sb_jnode);
30199 +
30200 + return sb_jnode;
30201 +}
30202 +
30203 +/* plugin->u.format.release */
30204 +int release_format40(struct super_block *s)
30205 +{
30206 + int ret;
30207 + reiser4_super_info_data *sbinfo;
30208 +
30209 + sbinfo = get_super_private(s);
30210 + assert("zam-579", sbinfo != NULL);
30211 +
30212 + if (!rofs_super(s)) {
30213 + ret = reiser4_capture_super_block(s);
30214 + if (ret != 0)
30215 + warning("vs-898",
30216 + "reiser4_capture_super_block failed: %d",
30217 + ret);
30218 +
30219 + ret = txnmgr_force_commit_all(s, 1);
30220 + if (ret != 0)
30221 + warning("jmacd-74438", "txn_force failed: %d", ret);
30222 +
30223 + all_grabbed2free();
30224 + }
30225 +
30226 + sa_destroy_allocator(&sbinfo->space_allocator, s);
30227 + reiser4_done_journal_info(s);
30228 + done_super_jnode(s);
30229 +
30230 + rcu_barrier();
30231 + reiser4_done_tree(&sbinfo->tree);
30232 + /* call finish_rcu(), because some znode were "released" in
30233 + * reiser4_done_tree(). */
30234 + rcu_barrier();
30235 +
30236 + return 0;
30237 +}
30238 +
30239 +#define FORMAT40_ROOT_LOCALITY 41
30240 +#define FORMAT40_ROOT_OBJECTID 42
30241 +
30242 +/* plugin->u.format.root_dir_key */
30243 +const reiser4_key *root_dir_key_format40(const struct super_block *super
30244 + UNUSED_ARG)
30245 +{
30246 + static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30247 + .el = {
30248 + __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30249 +#if REISER4_LARGE_KEY
30250 + ON_LARGE_KEY(0ull,)
30251 +#endif
30252 + __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30253 + 0ull
30254 + }
30255 + };
30256 +
30257 + return &FORMAT40_ROOT_DIR_KEY;
30258 +}
30259 +
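The first key element packs a locality id into the high 60 bits and a 4-bit
key minor into the low bits, so the root directory key starts with
(41 << 4) | KEY_SD_MINOR. A standalone sketch of that packing (plain C; the
KEY_SD_MINOR value of 1 is an assumption about the key scheme, not verified
here):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const uint64_t locality = 41;   /* FORMAT40_ROOT_LOCALITY */
            const uint64_t objectid = 42;   /* FORMAT40_ROOT_OBJECTID */
            const uint64_t sd_minor = 1;    /* assumed value of KEY_SD_MINOR */

            /* high 60 bits: locality id; low 4 bits: key minor */
            uint64_t el0 = (locality << 4) | sd_minor;

            printf("el[0]=%#llx objectid=%llu\n",
                   (unsigned long long)el0, (unsigned long long)objectid);
            return 0;
    }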
30260 +/* plugin->u.format.check_open.
30261 + Check the opened object for validity. For now it checks only that the oid
30262 + and locality are valid; this can be improved later, and its behavior may
30263 + depend on the mount options. */
30264 +int check_open_format40(const struct inode *object)
30265 +{
30266 + oid_t max, oid;
30267 +
30268 + max = oid_next(object->i_sb) - 1;
30269 +
30270 + /* Check the oid. */
30271 + oid = get_inode_oid(object);
30272 + if (oid > max) {
30273 + warning("vpf-1360", "The object with the oid %llu "
30274 + "greater then the max used oid %llu found.",
30275 + (unsigned long long)oid, (unsigned long long)max);
30276 +
30277 + return RETERR(-EIO);
30278 + }
30279 +
30280 + /* Check the locality. */
30281 + oid = reiser4_inode_data(object)->locality_id;
30282 + if (oid > max) {
30283 + warning("vpf-1361", "The object with the locality %llu "
30284 + "greater then the max used oid %llu found.",
30285 + (unsigned long long)oid, (unsigned long long)max);
30286 +
30287 + return RETERR(-EIO);
30288 + }
30289 +
30290 + return 0;
30291 +}
30292 +
30293 +/* plugin->u.format.version_update.
30294 + Perform all version update operations from the on-disk
30295 + format40_disk_super_block.version on disk to FORMAT40_VERSION.
30296 + */
30297 +int version_update_format40(struct super_block *super) {
30298 + txn_handle * trans;
30299 + lock_handle lh;
30300 + txn_atom *atom;
30301 + int ret;
30302 +
30303 + /* Nothing to do on a RO mount or if the on-disk version is not older. */
30304 + if (super->s_flags & MS_RDONLY)
30305 + return 0;
30306 +
30307 + if (get_super_private(super)->version >= FORMAT40_VERSION)
30308 + return 0;
30309 +
30310 + printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
30311 + "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
30312 + "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
30313 +
30314 + /* Mark the uber znode dirty to call log_super on write_logs. */
30315 + init_lh(&lh);
30316 + ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
30317 + ZNODE_LOCK_HIPRI, &lh);
30318 + if (ret != 0)
30319 + return ret;
30320 +
30321 + znode_make_dirty(lh.node);
30322 + done_lh(&lh);
30323 +
30324 + /* Update the backup blocks. */
30325 +
30326 + /* Force write_logs immediately. */
30327 + trans = get_current_context()->trans;
30328 + atom = get_current_atom_locked();
30329 + assert("vpf-1906", atom != NULL);
30330 +
30331 + spin_lock_txnh(trans);
30332 + return force_commit_atom(trans);
30333 +}
30334 +
30335 +/* Make Linus happy.
30336 + Local variables:
30337 + c-indentation-style: "K&R"
30338 + mode-name: "LC"
30339 + c-basic-offset: 8
30340 + tab-width: 8
30341 + fill-column: 120
30342 + scroll-step: 1
30343 + End:
30344 +*/
30345 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.h
30346 --- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
30347 +++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.h 2007-07-29 00:25:34.908704759 +0400
30348 @@ -0,0 +1,109 @@
30349 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30350 +
30351 +/* this file contains:
30352 + - definition of the on-disk super block of the standard disk layout for
30353 + reiser 4.0 (layout 40)
30354 + - definition of layout 40 specific portion of in-core super block
30355 + - declarations of functions implementing methods of layout plugin
30356 + for layout 40
30357 + - declarations of functions used to get/set fields in layout 40 super block
30358 +*/
30359 +
30360 +#ifndef __DISK_FORMAT40_H__
30361 +#define __DISK_FORMAT40_H__
30362 +
30363 +/* magic for default reiser4 layout */
30364 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30365 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30366 +
30367 +#include "../../dformat.h"
30368 +
30369 +#include <linux/fs.h> /* for struct super_block */
30370 +
30371 +typedef enum {
30372 + FORMAT40_LARGE_KEYS
30373 +} format40_flags;
30374 +
30375 +/* ondisk super block for format 40. It is 512 bytes long */
30376 +typedef struct format40_disk_super_block {
30377 + /* 0 */ d64 block_count;
30378 + /* number of blocks in a filesystem */
30379 + /* 8 */ d64 free_blocks;
30380 + /* number of free blocks */
30381 + /* 16 */ d64 root_block;
30382 + /* filesystem tree root block */
30383 + /* 24 */ d64 oid;
30384 + /* smallest free objectid */
30385 + /* 32 */ d64 file_count;
30386 + /* number of files in a filesystem */
30387 + /* 40 */ d64 flushes;
30388 + /* number of times super block was
30389 + flushed. Needed if format 40
30390 + will have several super blocks */
30391 + /* 48 */ d32 mkfs_id;
30392 + /* unique identifier of fs */
30393 + /* 52 */ char magic[16];
30394 + /* magic string ReIsEr40FoRmAt */
30395 + /* 68 */ d16 tree_height;
30396 + /* height of filesystem tree */
30397 + /* 70 */ d16 formatting_policy;
30398 + /* not used anymore */
30399 + /* 72 */ d64 flags;
30400 + /* 80 */ d32 version;
30401 + /* on-disk format version number
30402 + initially assigned by mkfs as the greatest format40
30403 + version number supported by reiser4progs and updated
30404 + at mount time in accordance with the greatest format40
30405 + version number supported by the kernel.
30406 + It is used by fsck to catch possible corruption and
30407 + for various compatibility issues */
30408 + /* 84 */ char not_used[428];
30409 +} format40_disk_super_block;
30410 +
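The offsets annotated in the comments (0, 8, ..., 84) plus the 428-byte pad add
up to exactly 512 bytes, matching the "512 bytes long" claim above. A standalone
mirror with fixed-width types can verify the layout at compile time (a sketch
assuming d64/d32/d16 are 8/4/2-byte little-endian scalars, as dformat.h defines
them):

    #include <stddef.h>
    #include <stdint.h>

    /* standalone mirror of format40_disk_super_block, for layout checking only */
    struct sb40 {
            uint64_t block_count;        /* 0 */
            uint64_t free_blocks;        /* 8 */
            uint64_t root_block;         /* 16 */
            uint64_t oid;                /* 24 */
            uint64_t file_count;         /* 32 */
            uint64_t flushes;            /* 40 */
            uint32_t mkfs_id;            /* 48 */
            char magic[16];              /* 52 */
            uint16_t tree_height;        /* 68 */
            uint16_t formatting_policy;  /* 70 */
            uint64_t flags;              /* 72 */
            uint32_t version;            /* 80 */
            char not_used[428];          /* 84 */
    } __attribute__((packed));

    _Static_assert(offsetof(struct sb40, magic) == 52, "magic at byte 52");
    _Static_assert(offsetof(struct sb40, version) == 80, "version at byte 80");
    _Static_assert(sizeof(struct sb40) == 512, "super block is one 512-byte block");

    int main(void) { return 0; }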
30411 +/* format 40 specific part of reiser4_super_info_data */
30412 +typedef struct format40_super_info {
30413 +/* format40_disk_super_block actual_sb; */
30414 + jnode *sb_jnode;
30415 + struct {
30416 + reiser4_block_nr super;
30417 + } loc;
30418 +} format40_super_info;
30419 +
30420 +/* Defines for journal header and footer respectively. */
30421 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
30422 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
30423 +
30424 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
30425 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
30426 +
30427 +#define FORMAT40_STATUS_BLOCKNR \
30428 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
30429 +
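With the reiser4 master super block at byte offset 65536 (assuming that is the
value of REISER4_MASTER_OFFSET) and 4 KiB pages, these fixed locations work out
to blocks 19, 20 and 21. A quick standalone check of the arithmetic (page size
varies by architecture; 4096 is the common case):

    #include <stdio.h>

    int main(void)
    {
            const unsigned long master_offset = 65536; /* assumed REISER4_MASTER_OFFSET */
            const unsigned long page_size = 4096;      /* typical PAGE_CACHE_SIZE */
            unsigned long base = master_offset / page_size; /* 16 */

            printf("journal header: block %lu\n", base + 3); /* 19 */
            printf("journal footer: block %lu\n", base + 4); /* 20 */
            printf("status block:   block %lu\n", base + 5); /* 21 */
            return 0;
    }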
30430 +/* Diskmap declarations */
30431 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
30432 +#define FORMAT40_SUPER 1
30433 +#define FORMAT40_JH 2
30434 +#define FORMAT40_JF 3
30435 +
30436 +/* declarations of functions implementing methods of layout plugin for
30437 + format 40. The functions themselves are in disk_format40.c */
30438 +extern int init_format_format40(struct super_block *, void *data);
30439 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
30440 +extern int release_format40(struct super_block *s);
30441 +extern jnode *log_super_format40(struct super_block *s);
30442 +extern int check_open_format40(const struct inode *object);
30443 +extern int version_update_format40(struct super_block *super);
30444 +
30445 +/* __DISK_FORMAT40_H__ */
30446 +#endif
30447 +
30448 +/* Make Linus happy.
30449 + Local variables:
30450 + c-indentation-style: "K&R"
30451 + mode-name: "LC"
30452 + c-basic-offset: 8
30453 + tab-width: 8
30454 + fill-column: 120
30455 + scroll-step: 1
30456 + End:
30457 +*/
30458 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.c
30459 --- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
30460 +++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.c 2007-07-29 00:25:34.908704759 +0400
30461 @@ -0,0 +1,38 @@
30462 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30463 +
30464 +#include "../../debug.h"
30465 +#include "../plugin_header.h"
30466 +#include "disk_format40.h"
30467 +#include "disk_format.h"
30468 +#include "../plugin.h"
30469 +
30470 +/* initialization of disk layout plugins */
30471 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30472 + [FORMAT40_ID] = {
30473 + .h = {
30474 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30475 + .id = FORMAT40_ID,
30476 + .pops = NULL,
30477 + .label = "reiser40",
30478 + .desc = "standard disk layout for reiser40",
30479 + .linkage = {NULL, NULL}
30480 + },
30481 + .init_format = init_format_format40,
30482 + .root_dir_key = root_dir_key_format40,
30483 + .release = release_format40,
30484 + .log_super = log_super_format40,
30485 + .check_open = check_open_format40,
30486 + .version_update = version_update_format40
30487 + }
30488 +};
30489 +
30490 +/* Make Linus happy.
30491 + Local variables:
30492 + c-indentation-style: "K&R"
30493 + mode-name: "LC"
30494 + c-basic-offset: 8
30495 + tab-width: 8
30496 + fill-column: 120
30497 + scroll-step: 1
30498 + End:
30499 +*/
30500 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.h
30501 --- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
30502 +++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.h 2007-07-29 00:25:34.908704759 +0400
30503 @@ -0,0 +1,27 @@
30504 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30505 +
30506 +/* identifiers for disk layouts, they are also used as indexes in array of disk
30507 + plugins */
30508 +
30509 +#if !defined( __REISER4_DISK_FORMAT_H__ )
30510 +#define __REISER4_DISK_FORMAT_H__
30511 +
30512 +typedef enum {
30513 + /* standard reiser4 disk layout plugin id */
30514 + FORMAT40_ID,
30515 + LAST_FORMAT_ID
30516 +} disk_format_id;
30517 +
30518 +/* __REISER4_DISK_FORMAT_H__ */
30519 +#endif
30520 +
30521 +/* Make Linus happy.
30522 + Local variables:
30523 + c-indentation-style: "K&R"
30524 + mode-name: "LC"
30525 + c-basic-offset: 8
30526 + tab-width: 8
30527 + fill-column: 120
30528 + scroll-step: 1
30529 + End:
30530 +*/
30531 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.22/fs/reiser4/plugin/disk_format/Makefile
30532 --- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
30533 +++ linux-2.6.22/fs/reiser4/plugin/disk_format/Makefile 2007-07-29 00:25:34.908704759 +0400
30534 @@ -0,0 +1,5 @@
30535 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
30536 +
30537 +df_plugins-objs := \
30538 + disk_format40.o \
30539 + disk_format.o
30540 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/fibration.c linux-2.6.22/fs/reiser4/plugin/fibration.c
30541 --- linux-2.6.22.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
30542 +++ linux-2.6.22/fs/reiser4/plugin/fibration.c 2007-07-29 00:25:34.908704759 +0400
30543 @@ -0,0 +1,175 @@
30544 +/* Copyright 2004 by Hans Reiser, licensing governed by
30545 + * reiser4/README */
30546 +
30547 +/* Directory fibrations */
30548 +
30549 +/*
30550 + * Suppose we have a directory tree with sources of some project. During
30551 + * compilation .o files are created within this tree. This makes access
30552 + * to the original source files less efficient, because source files are
30553 + * now "diluted" by object files: default directory plugin uses prefix
30554 + * of a file name as a part of the key for directory entry (and this
30555 + * part is also inherited by the key of file body). This means that
30556 + * foo.o will be located close to foo.c and foo.h in the tree.
30557 + *
30558 + * To avoid this effect the directory plugin fills the highest 7
30559 + * (originally unused) bits of the second component of the directory
30560 + * entry key with a bit-pattern depending on the file name (see
30561 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
30562 + * "fibre". Fibre of the file name key is inherited by key of stat data
30563 + * and keys of file body (in the case of REISER4_LARGE_KEY).
30564 + *
30565 + * Fibre for a given file is chosen by per-directory fibration
30566 + * plugin. Names within given fibre are ordered lexicographically.
30567 + */
30568 +
30569 +#include "../debug.h"
30570 +#include "plugin_header.h"
30571 +#include "plugin.h"
30572 +#include "../super.h"
30573 +#include "../inode.h"
30574 +
30575 +#include <linux/types.h>
30576 +
30577 +static const int fibre_shift = 57;
30578 +
30579 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
30580 +
30581 +/*
30582 + * Trivial fibration: all files of directory are just ordered
30583 + * lexicographically.
30584 + */
30585 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
30586 +{
30587 + return FIBRE_NO(0);
30588 +}
30589 +
30590 +/*
30591 + * dot-o fibration: place .o files after all others.
30592 + */
30593 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
30594 +{
30595 + /* special treatment for .*\.o */
30596 + if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
30597 + return FIBRE_NO(1);
30598 + else
30599 + return FIBRE_NO(0);
30600 +}
30601 +
30602 +/*
30603 + * ext.1 fibration: subdivide directory into 128 fibres, one for each
30604 + * 7-bit extension character (file "foo.h" goes into fibre "h"), plus
30605 + * default fibre for the rest.
30606 + */
30607 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
30608 +{
30609 + if (len > 2 && name[len - 2] == '.')
30610 + return FIBRE_NO(name[len - 1]);
30611 + else
30612 + return FIBRE_NO(0);
30613 +}
30614 +
30615 +/*
30616 + * ext.3 fibration: try to separate files with different 3-character
30617 + * extensions from each other.
30618 + */
30619 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
30620 +{
30621 + if (len > 4 && name[len - 4] == '.')
30622 + return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
30623 + else
30624 + return FIBRE_NO(0);
30625 +}
30626 +
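Since fibre_shift is 57, each fibre value occupies the top 7 bits of the 64-bit
key component, leaving the low 57 bits for the lexicographic part of the name.
A standalone sketch mirroring FIBRE_NO() and fibre_ext_1() above (plain C, not
part of the patch):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define FIBRE_SHIFT 57
    #define FIBRE_NO(n) (((uint64_t)(n)) << FIBRE_SHIFT)

    /* mirrors fibre_ext_1(): the fibre is the single-character extension */
    static uint64_t ext_1(const char *name)
    {
            size_t len = strlen(name);

            if (len > 2 && name[len - 2] == '.')
                    return FIBRE_NO(name[len - 1]);
            return FIBRE_NO(0);
    }

    int main(void)
    {
            /* "foo.h" and "bar.h" share a fibre; "foo.c" sorts into another */
            printf("%016llx\n", (unsigned long long)ext_1("foo.h"));  /* 'h' << 57 */
            printf("%016llx\n", (unsigned long long)ext_1("foo.c"));  /* 'c' << 57 */
            printf("%016llx\n", (unsigned long long)ext_1("README")); /* fibre 0 */
            return 0;
    }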
30627 +static int change_fibration(struct inode *inode,
30628 + reiser4_plugin * plugin,
30629 + pset_member memb)
30630 +{
30631 + int result;
30632 +
30633 + assert("nikita-3503", inode != NULL);
30634 + assert("nikita-3504", plugin != NULL);
30635 +
30636 + assert("nikita-3505", is_reiser4_inode(inode));
30637 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
30638 + assert("nikita-3507",
30639 + plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
30640 +
30641 + result = 0;
30642 + if (inode_fibration_plugin(inode) == NULL ||
30643 + inode_fibration_plugin(inode)->h.id != plugin->h.id) {
30644 + if (is_dir_empty(inode) == 0)
30645 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
30646 + PSET_FIBRATION, plugin);
30647 + else
30648 + result = RETERR(-ENOTEMPTY);
30649 +
30650 + }
30651 + return result;
30652 +}
30653 +
30654 +static reiser4_plugin_ops fibration_plugin_ops = {
30655 + .init = NULL,
30656 + .load = NULL,
30657 + .save_len = NULL,
30658 + .save = NULL,
30659 + .change = change_fibration
30660 +};
30661 +
30662 +/* fibration plugins */
30663 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
30664 + [FIBRATION_LEXICOGRAPHIC] = {
30665 + .h = {
30666 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30667 + .id = FIBRATION_LEXICOGRAPHIC,
30668 + .pops = &fibration_plugin_ops,
30669 + .label = "lexicographic",
30670 + .desc = "no fibration",
30671 + .linkage = {NULL, NULL}
30672 + },
30673 + .fibre = fibre_trivial
30674 + },
30675 + [FIBRATION_DOT_O] = {
30676 + .h = {
30677 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30678 + .id = FIBRATION_DOT_O,
30679 + .pops = &fibration_plugin_ops,
30680 + .label = "dot-o",
30681 + .desc = "fibrate .o files separately",
30682 + .linkage = {NULL, NULL}
30683 + },
30684 + .fibre = fibre_dot_o
30685 + },
30686 + [FIBRATION_EXT_1] = {
30687 + .h = {
30688 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30689 + .id = FIBRATION_EXT_1,
30690 + .pops = &fibration_plugin_ops,
30691 + .label = "ext-1",
30692 + .desc = "fibrate file by single character extension",
30693 + .linkage = {NULL, NULL}
30694 + },
30695 + .fibre = fibre_ext_1
30696 + },
30697 + [FIBRATION_EXT_3] = {
30698 + .h = {
30699 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30700 + .id = FIBRATION_EXT_3,
30701 + .pops = &fibration_plugin_ops,
30702 + .label = "ext-3",
30703 + .desc = "fibrate file by three character extension",
30704 + .linkage = {NULL, NULL}
30705 + },
30706 + .fibre = fibre_ext_3
30707 + }
30708 +};
30709 +
30710 +/*
30711 + * Local variables:
30712 + * c-indentation-style: "K&R"
30713 + * mode-name: "LC"
30714 + * c-basic-offset: 8
30715 + * tab-width: 8
30716 + * fill-column: 79
30717 + * End:
30718 + */
30719 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/fibration.h linux-2.6.22/fs/reiser4/plugin/fibration.h
30720 --- linux-2.6.22.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
30721 +++ linux-2.6.22/fs/reiser4/plugin/fibration.h 2007-07-29 00:25:34.908704759 +0400
30722 @@ -0,0 +1,37 @@
30723 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
30724 +
30725 +/* Fibration plugin used by hashed directory plugin to segment content
30726 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
30727 +
30728 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
30729 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
30730 +
30731 +#include "plugin_header.h"
30732 +
30733 +typedef struct fibration_plugin {
30734 + /* generic fields */
30735 + plugin_header h;
30736 +
30737 + __u64(*fibre) (const struct inode * dir, const char *name, int len);
30738 +} fibration_plugin;
30739 +
30740 +typedef enum {
30741 + FIBRATION_LEXICOGRAPHIC,
30742 + FIBRATION_DOT_O,
30743 + FIBRATION_EXT_1,
30744 + FIBRATION_EXT_3,
30745 + LAST_FIBRATION_ID
30746 +} reiser4_fibration_id;
30747 +
30748 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
30749 +#endif
30750 +
30751 +/* Make Linus happy.
30752 + Local variables:
30753 + c-indentation-style: "K&R"
30754 + mode-name: "LC"
30755 + c-basic-offset: 8
30756 + tab-width: 8
30757 + fill-column: 120
30758 + End:
30759 +*/
30760 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.c
30761 --- linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
30762 +++ linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.c 2007-07-29 00:25:34.916706830 +0400
30763 @@ -0,0 +1,3832 @@
30764 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
30765 + reiser4/README */
30766 +/*
30767 + * Written by Edward Shishkin.
30768 + *
30769 + * Implementations of inode/file/address_space operations
30770 + * specific to the cryptcompress file plugin, which manages
30771 + * regular files built of compressed and/or encrypted bodies.
30772 + * See http://dev.namesys.com/CryptcompressPlugin for details.
30773 + */
30774 +
30775 +#include "../../inode.h"
30776 +#include "../cluster.h"
30777 +#include "../object.h"
30778 +#include "../../tree_walk.h"
30779 +#include "cryptcompress.h"
30780 +
30781 +#include <asm/scatterlist.h>
30782 +#include <linux/pagevec.h>
30783 +#include <asm/uaccess.h>
30784 +#include <linux/swap.h>
30785 +#include <linux/writeback.h>
30786 +#include <linux/random.h>
30787 +
30788 +/*
30789 + Managing primary and secondary caches by Reiser4
30790 + cryptcompress file plugin. Synchronization scheme.
30791 +
30792 +
30793 + +------------------+
30794 + +------------------->| tfm stream |
30795 + | | (compressed data)|
30796 + flush | +------------------+
30797 + +-----------------+ |
30798 + |(->)longterm lock| V
30799 +--+ writepages() | | +-***-+ reiser4 +---+
30800 + | | +--+ | *** | storage tree | |
30801 + | | | +-***-+ (primary cache)| |
30802 +u | write() (secondary| cache) V / | \ | |
30803 +s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
30804 +e | | | |page cluster | | | **disk cluster** | | i |
30805 +r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
30806 + | read() ^ ^ | | k |
30807 + | | (->)longterm lock| | page_io()| |
30808 + | | +------+ | |
30809 +--+ readpages() | | +---+
30810 + | V
30811 + | +------------------+
30812 + +--------------------| tfm stream |
30813 + | (plain text) |
30814 + +------------------+
30815 +*/
30816 +
30817 +/* get cryptcompress specific portion of inode */
30818 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
30819 +{
30820 + return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
30821 +}
30822 +
30823 +/* plugin->u.file.init_inode_data */
30824 +void init_inode_data_cryptcompress(struct inode *inode,
30825 + reiser4_object_create_data * crd,
30826 + int create)
30827 +{
30828 + struct cryptcompress_info *data;
30829 +
30830 + data = cryptcompress_inode_data(inode);
30831 + assert("edward-685", data != NULL);
30832 +
30833 + memset(data, 0, sizeof(*data));
30834 +
30835 + mutex_init(&data->checkin_mutex);
30836 + data->trunc_index = ULONG_MAX;
30837 + turn_on_compression(data);
30838 + set_lattice_factor(data, MIN_LATTICE_FACTOR);
30839 + init_inode_ordering(inode, crd, create);
30840 +}
30841 +
30842 +/* The following is part of the reiser4 cipher key manager,
30843 + which is called when opening/creating a cryptcompress file */
30844 +
30845 +/* get/set cipher key info */
30846 +struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
30847 +{
30848 + assert("edward-90", inode != NULL);
30849 + assert("edward-91", reiser4_inode_data(inode) != NULL);
30850 + return cryptcompress_inode_data(inode)->crypt;
30851 +}
30852 +
30853 +static void set_inode_crypto_info (struct inode * inode,
30854 + struct reiser4_crypto_info * info)
30855 +{
30856 + cryptcompress_inode_data(inode)->crypt = info;
30857 +}
30858 +
30859 +/* allocate a cipher key info */
30860 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
30861 +{
30862 + struct reiser4_crypto_info * info;
30863 + int fipsize;
30864 +
30865 + info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
30866 + if (!info)
30867 + return ERR_PTR(-ENOMEM);
30868 + memset(info, 0, sizeof (*info));
30869 + fipsize = inode_digest_plugin(inode)->fipsize;
30870 + info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
30871 + if (!info->keyid) {
30872 + kfree(info);
30873 + return ERR_PTR(-ENOMEM);
30874 + }
30875 + info->host = inode;
30876 + return info;
30877 +}
30878 +
30879 +#if 0
30880 +/* allocate/free low-level info for cipher and digest
30881 + transforms */
30882 +static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
30883 +{
30884 + struct crypto_blkcipher * ctfm = NULL;
30885 + struct crypto_hash * dtfm = NULL;
30886 + cipher_plugin * cplug = inode_cipher_plugin(info->host);
30887 + digest_plugin * dplug = inode_digest_plugin(info->host);
30888 +
30889 + if (cplug->alloc) {
30890 + ctfm = cplug->alloc();
30891 + if (IS_ERR(ctfm)) {
30892 + warning("edward-1364",
30893 + "Can not allocate info for %s\n",
30894 + cplug->h.desc);
30895 + return RETERR(PTR_ERR(ctfm));
30896 + }
30897 + }
30898 + info_set_cipher(info, ctfm);
30899 + if (dplug->alloc) {
30900 + dtfm = dplug->alloc();
30901 + if (IS_ERR(dtfm)) {
30902 + warning("edward-1365",
30903 + "Can not allocate info for %s\n",
30904 + dplug->h.desc);
30905 + goto unhappy_with_digest;
30906 + }
30907 + }
30908 + info_set_digest(info, dtfm);
30909 + return 0;
30910 + unhappy_with_digest:
30911 + if (cplug->free) {
30912 + cplug->free(ctfm);
30913 + info_set_cipher(info, NULL);
30914 + }
30915 + return RETERR(PTR_ERR(dtfm));
30916 +}
30917 +#endif
30918 +
30919 +static void
30920 +free_crypto_tfms(struct reiser4_crypto_info * info)
30921 +{
30922 + assert("edward-1366", info != NULL);
30923 + if (!info_get_cipher(info)) {
30924 + assert("edward-1601", !info_get_digest(info));
30925 + return;
30926 + }
30927 + inode_cipher_plugin(info->host)->free(info_get_cipher(info));
30928 + info_set_cipher(info, NULL);
30929 + inode_digest_plugin(info->host)->free(info_get_digest(info));
30930 + info_set_digest(info, NULL);
30931 + return;
30932 +}
30933 +
30934 +#if 0
30935 +/* create a key fingerprint for disk stat-data */
30936 +static int create_keyid (struct reiser4_crypto_info * info,
30937 + struct reiser4_crypto_data * data)
30938 +{
30939 + int ret = -ENOMEM;
30940 + size_t blk, pad;
30941 + __u8 * dmem;
30942 + __u8 * cmem;
30943 + struct hash_desc ddesc;
30944 + struct blkcipher_desc cdesc;
30945 + struct scatterlist sg;
30946 +
30947 + assert("edward-1367", info != NULL);
30948 + assert("edward-1368", info->keyid != NULL);
30949 +
30950 + ddesc.tfm = info_get_digest(info);
30951 + ddesc.flags = 0;
30952 + cdesc.tfm = info_get_cipher(info);
30953 + cdesc.flags = 0;
30954 +
30955 + dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
30956 + reiser4_ctx_gfp_mask_get());
30957 + if (!dmem)
30958 + goto exit1;
30959 +
30960 + blk = crypto_blkcipher_blocksize(cdesc.tfm);
30961 +
30962 + pad = data->keyid_size % blk;
30963 + pad = (pad ? blk - pad : 0);
30964 +
30965 + cmem = kmalloc((size_t)data->keyid_size + pad,
30966 + reiser4_ctx_gfp_mask_get());
30967 + if (!cmem)
30968 + goto exit2;
30969 + memcpy(cmem, data->keyid, data->keyid_size);
30970 + memset(cmem + data->keyid_size, 0, pad);
30971 +
30972 + sg.page = virt_to_page(cmem);
30973 + sg.offset = offset_in_page(cmem);
30974 + sg.length = data->keyid_size + pad;
30975 +
30976 + ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
30977 + data->keyid_size + pad);
30978 + if (ret) {
30979 + warning("edward-1369",
30980 + "encryption failed flags=%x\n", cdesc.flags);
30981 + goto exit3;
30982 + }
30983 + ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
30984 + if (ret) {
30985 + warning("edward-1602",
30986 + "digest failed flags=%x\n", ddesc.flags);
30987 + goto exit3;
30988 + }
30989 + memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
30990 + exit3:
30991 + kfree(cmem);
30992 + exit2:
30993 + kfree(dmem);
30994 + exit1:
30995 + return ret;
30996 +}
30997 +#endif
30998 +
30999 +static void destroy_keyid(struct reiser4_crypto_info * info)
31000 +{
31001 + assert("edward-1370", info != NULL);
31002 + assert("edward-1371", info->keyid != NULL);
31003 + kfree(info->keyid);
31004 + return;
31005 +}
31006 +
31007 +static void __free_crypto_info (struct inode * inode)
31008 +{
31009 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
31010 + assert("edward-1372", info != NULL);
31011 +
31012 + free_crypto_tfms(info);
31013 + destroy_keyid(info);
31014 + kfree(info);
31015 +}
31016 +
31017 +#if 0
31018 +static void instantiate_crypto_info(struct reiser4_crypto_info * info)
31019 +{
31020 + assert("edward-1373", info != NULL);
31021 + assert("edward-1374", info->inst == 0);
31022 + info->inst = 1;
31023 +}
31024 +#endif
31025 +
31026 +static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
31027 +{
31028 + assert("edward-1375", info != NULL);
31029 + info->inst = 0;
31030 +}
31031 +
31032 +static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
31033 +{
31034 + return info->inst;
31035 +}
31036 +
31037 +static int inode_has_cipher_key(struct inode * inode)
31038 +{
31039 + assert("edward-1376", inode != NULL);
31040 + return inode_crypto_info(inode) &&
31041 + is_crypto_info_instantiated(inode_crypto_info(inode));
31042 +}
31043 +
31044 +static void free_crypto_info (struct inode * inode)
31045 +{
31046 + uninstantiate_crypto_info(inode_crypto_info(inode));
31047 + __free_crypto_info(inode);
31048 +}
31049 +
31050 +static int need_cipher(struct inode * inode)
31051 +{
31052 + return inode_cipher_plugin(inode) !=
31053 + cipher_plugin_by_id(NONE_CIPHER_ID);
31054 +}
31055 +
31056 +/* Parse @data which contains an (uninstantiated) cipher key imported
31057 + from user space, create a low-level cipher info and attach it to
31058 + the @object. On success the info contains an instantiated key */
31059 +#if 0
31060 +struct reiser4_crypto_info * create_crypto_info(struct inode * object,
31061 + struct reiser4_crypto_data * data)
31062 +{
31063 + int ret;
31064 + struct reiser4_crypto_info * info;
31065 +
31066 + assert("edward-1377", data != NULL);
31067 + assert("edward-1378", need_cipher(object));
31068 +
31069 + if (inode_file_plugin(object) !=
31070 + file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31071 + return ERR_PTR(-EINVAL);
31072 +
31073 + info = reiser4_alloc_crypto_info(object);
31074 + if (IS_ERR(info))
31075 + return info;
31076 + ret = alloc_crypto_tfms(info);
31077 + if (ret)
31078 + goto err;
31079 + /* instantiating a key */
31080 + ret = crypto_blkcipher_setkey(info_get_cipher(info),
31081 + data->key,
31082 + data->keysize);
31083 + if (ret) {
31084 + warning("edward-1379",
31085 + "setkey failed flags=%x",
31086 + crypto_blkcipher_get_flags(info_get_cipher(info)));
31087 + goto err;
31088 + }
31089 + info->keysize = data->keysize;
31090 + ret = create_keyid(info, data);
31091 + if (ret)
31092 + goto err;
31093 + instantiate_crypto_info(info);
31094 + return info;
31095 + err:
31096 + __free_crypto_info(object);
31097 + return ERR_PTR(ret);
31098 +}
31099 +#endif
31100 +
31101 +/* increment/decrement a load counter when
31102 + attaching/detaching the crypto-stat to/from an object */
31103 +static void load_crypto_info(struct reiser4_crypto_info * info)
31104 +{
31105 + assert("edward-1380", info != NULL);
31106 + inc_keyload_count(info);
31107 +}
31108 +
31109 +static void unload_crypto_info(struct inode * inode)
31110 +{
31111 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
31112 + assert("edward-1381", info->keyload_count > 0);
31113 +
31114 + dec_keyload_count(inode_crypto_info(inode));
31115 + if (info->keyload_count == 0)
31116 + /* final release */
31117 + free_crypto_info(inode);
31118 +}
31119 +
31120 +/* attach/detach an existing crypto-stat */
31121 +void reiser4_attach_crypto_info(struct inode * inode,
31122 + struct reiser4_crypto_info * info)
31123 +{
31124 + assert("edward-1382", inode != NULL);
31125 + assert("edward-1383", info != NULL);
31126 + assert("edward-1384", inode_crypto_info(inode) == NULL);
31127 +
31128 + set_inode_crypto_info(inode, info);
31129 + load_crypto_info(info);
31130 +}
31131 +
31132 +/* returns true if a crypto-stat can be attached to the @host */
31133 +#if REISER4_DEBUG
31134 +static int host_allows_crypto_info(struct inode * host)
31135 +{
31136 + int ret;
31137 + file_plugin * fplug = inode_file_plugin(host);
31138 +
31139 + switch (fplug->h.id) {
31140 + case CRYPTCOMPRESS_FILE_PLUGIN_ID:
31141 + ret = 1;
31142 + break;
31143 + default:
31144 + ret = 0;
31145 + }
31146 + return ret;
31147 +}
31148 +#endif /* REISER4_DEBUG */
31149 +
31150 +static void reiser4_detach_crypto_info(struct inode * inode)
31151 +{
31152 + assert("edward-1385", inode != NULL);
31153 + assert("edward-1386", host_allows_crypto_info(inode));
31154 +
31155 + if (inode_crypto_info(inode))
31156 + unload_crypto_info(inode);
31157 + set_inode_crypto_info(inode, NULL);
31158 +}
31159 +
31160 +#if 0
31161 +
31162 +/* compare fingerprints of @child and @parent */
31163 +static int keyid_eq(struct reiser4_crypto_info * child,
31164 + struct reiser4_crypto_info * parent)
31165 +{
31166 + return !memcmp(child->keyid,
31167 + parent->keyid,
31168 + info_digest_plugin(parent)->fipsize);
31169 +}
31170 +
31171 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
31172 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
31173 +{
31174 + if (!need_cipher(child))
31175 + return 0;
31176 + /* the child is created */
31177 + if (!inode_crypto_info(child))
31178 + return 1;
31179 + /* the child is looked up */
31180 + if (!inode_crypto_info(parent))
31181 + return 0;
31182 + return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31183 + inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31184 + inode_crypto_info(child)->keysize ==
31185 + inode_crypto_info(parent)->keysize &&
31186 + keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
31187 +}
31188 +#endif
31189 +
31190 +/* helper functions for ->create() method of the cryptcompress plugin */
31191 +static int inode_set_crypto(struct inode * object)
31192 +{
31193 + reiser4_inode * info;
31194 + if (!inode_crypto_info(object)) {
31195 + if (need_cipher(object))
31196 + return RETERR(-EINVAL);
31197 + /* the file is not to be encrypted */
31198 + return 0;
31199 + }
31200 + info = reiser4_inode_data(object);
31201 + info->extmask |= (1 << CRYPTO_STAT);
31202 + return 0;
31203 +}
31204 +
31205 +static int inode_init_compression(struct inode * object)
31206 +{
31207 + int result = 0;
31208 + assert("edward-1461", object != NULL);
31209 + if (inode_compression_plugin(object)->init)
31210 + result = inode_compression_plugin(object)->init();
31211 + return result;
31212 +}
31213 +
31214 +static int inode_check_cluster(struct inode * object)
31215 +{
31216 + assert("edward-696", object != NULL);
31217 +
31218 + if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
31219 + warning("edward-1320", "Can not support '%s' "
31220 + "logical clusters (less then page size)",
31221 + inode_cluster_plugin(object)->h.label);
31222 + return RETERR(-EINVAL);
31223 + }
31224 + if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))) {
31225 + warning("edward-1463", "Can not support '%s' "
31226 + "logical clusters (too big for transform)",
31227 + inode_cluster_plugin(object)->h.label);
31228 + return RETERR(-EINVAL);
31229 + }
31230 + return 0;
31231 +}
31232 +
31233 +/* ->destroy_inode() method of the cryptcompress plugin */
31234 +void destroy_inode_cryptcompress(struct inode * inode)
31235 +{
31236 + assert("edward-1464", INODE_PGCOUNT(inode) == 0);
31237 + reiser4_detach_crypto_info(inode);
31238 + return;
31239 +}
31240 +
31241 +/* ->create() method of the cryptcompress plugin
31242 +
31243 +. install plugins
31244 +. attach crypto info if specified
31245 +. attach compression info if specified
31246 +. attach cluster info
31247 +*/
31248 +int create_cryptcompress(struct inode *object, struct inode *parent,
31249 + reiser4_object_create_data * data)
31250 +{
31251 + int result;
31252 + reiser4_inode *info;
31253 +
31254 + assert("edward-23", object != NULL);
31255 + assert("edward-24", parent != NULL);
31256 + assert("edward-30", data != NULL);
31257 + assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
31258 + assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
31259 +
31260 + info = reiser4_inode_data(object);
31261 +
31262 + assert("edward-29", info != NULL);
31263 +
31264 + /* set file bit */
31265 + info->plugin_mask |= (1 << PSET_FILE);
31266 +
31267 + /* set crypto */
31268 + result = inode_set_crypto(object);
31269 + if (result)
31270 + goto error;
31271 + /* set compression */
31272 + result = inode_init_compression(object);
31273 + if (result)
31274 + goto error;
31275 + /* set cluster */
31276 + result = inode_check_cluster(object);
31277 + if (result)
31278 + goto error;
31279 +
31280 + /* save everything in disk stat-data */
31281 + result = write_sd_by_inode_common(object);
31282 + if (!result)
31283 + return 0;
31284 + error:
31285 + reiser4_detach_crypto_info(object);
31286 + return result;
31287 +}
31288 +
31289 +/* ->open_object() method of the cryptcompress plugin */
31290 +int open_object_cryptcompress(struct inode * inode, struct file * file)
31291 +{
31292 + int result;
31293 + struct inode * parent;
31294 +
31295 + assert("edward-1394", inode != NULL);
31296 + assert("edward-1395", file != NULL);
31297 + assert("edward-1396", file != NULL);
31298 + assert("edward-1397", file->f_dentry->d_inode == inode);
31299 + assert("edward-1398", file->f_dentry->d_parent != NULL);
31300 + assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31301 + assert("edward-698",
31302 + inode_file_plugin(inode) ==
31303 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31304 + result = inode_check_cluster(inode);
31305 + if (result)
31306 + return result;
31307 + result = inode_init_compression(inode);
31308 + if (result)
31309 + return result;
31310 + if (!need_cipher(inode))
31311 + /* the file is not to be ciphered */
31312 + return 0;
31313 + parent = file->f_dentry->d_parent->d_inode;
31314 + if (!inode_has_cipher_key(inode))
31315 + return RETERR(-EINVAL);
31316 + return 0;
31317 +}
31318 +
31319 +/* returns the block size, an attribute of the cipher algorithm */
31320 +static unsigned int
31321 +cipher_blocksize(struct inode * inode)
31322 +{
31323 + assert("edward-758", need_cipher(inode));
31324 + assert("edward-1400", inode_crypto_info(inode) != NULL);
31325 + return crypto_blkcipher_blocksize
31326 + (info_get_cipher(inode_crypto_info(inode)));
31327 +}
31328 +
31329 +/* returns offset translated by scale factor of the crypto-algorithm */
31330 +static loff_t inode_scaled_offset (struct inode * inode,
31331 + const loff_t src_off /* input offset */)
31332 +{
31333 + assert("edward-97", inode != NULL);
31334 +
31335 + if (!need_cipher(inode) ||
31336 + src_off == get_key_offset(reiser4_min_key()) ||
31337 + src_off == get_key_offset(reiser4_max_key()))
31338 + return src_off;
31339 +
31340 + return inode_cipher_plugin(inode)->scale(inode,
31341 + cipher_blocksize(inode),
31342 + src_off);
31343 +}
31344 +
31345 +/* returns disk cluster size */
31346 +size_t inode_scaled_cluster_size(struct inode * inode)
31347 +{
31348 + assert("edward-110", inode != NULL);
31349 +
31350 + return inode_scaled_offset(inode, inode_cluster_size(inode));
31351 +}
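
The scaling policy itself belongs to the cipher plugin's ->scale() method. As a plausible illustration (an assumption, not the actual plugin code), take a policy that rounds the plaintext offset up to a whole number of cipher blocks:

#include <stdio.h>

/* round a plaintext offset up to a whole number of cipher blocks */
static long long scale_round_up(long long off, unsigned blocksize)
{
	return ((off + blocksize - 1) / blocksize) * blocksize;
}

int main(void)
{
	/* with a 16-byte cipher block, a 64KB cluster keeps its size,
	   while an unaligned offset moves to the next block boundary */
	printf("%lld\n", scale_round_up(65536, 16));	/* 65536 */
	printf("%lld\n", scale_round_up(100, 16));	/* 112 */
	return 0;
}
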
31352 +
31353 +/* set number of cluster pages */
31354 +static void set_cluster_nrpages(struct cluster_handle * clust,
31355 + struct inode *inode)
31356 +{
31357 + struct reiser4_slide * win;
31358 +
31359 + assert("edward-180", clust != NULL);
31360 + assert("edward-1040", inode != NULL);
31361 +
31362 + clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
31363 + win = clust->win;
31364 + if (!win) {
31365 + clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
31366 + return;
31367 + }
31368 + assert("edward-1176", clust->op != LC_INVAL);
31369 + assert("edward-1064", win->off + win->count + win->delta != 0);
31370 +
31371 + if (win->stat == HOLE_WINDOW &&
31372 + win->off == 0 && win->count == inode_cluster_size(inode)) {
31373 + /* special case: writing a "fake" logical cluster */
31374 + clust->nr_pages = 0;
31375 + return;
31376 + }
31377 + clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
31378 + lbytes(clust->index, inode)));
31379 + return;
31380 +}
31381 +
31382 +/* plugin->key_by_inode()
31383 + build key of a disk cluster */
31384 +int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
31385 + reiser4_key * key)
31386 +{
31387 + assert("edward-64", inode != 0);
31388 +
31389 + if (likely(off != get_key_offset(reiser4_max_key())))
31390 + off = off_to_clust_to_off(off, inode);
31391 + if (inode_crypto_info(inode))
31392 + off = inode_scaled_offset(inode, off);
31393 +
31394 + key_by_inode_and_offset_common(inode, 0, key);
31395 + set_key_offset(key, (__u64)off);
31396 + return 0;
31397 +}
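
A worked example of the offset arithmetic above, assuming a 64KB logical cluster (in reiser4 the cluster size is a per-inode attribute):

#include <stdio.h>

int main(void)
{
	unsigned long long cluster_size = 65536;	/* assumed: 64KB */
	unsigned long long off = 70000;
	/* off_to_clust_to_off(): round down to the cluster start */
	unsigned long long key_off = (off / cluster_size) * cluster_size;

	printf("%llu\n", key_off);	/* 65536: the key of cluster 1 */
	return 0;
}
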
31398 +
31399 +/* plugin->flow_by_inode() */
31400 +/* flow is used to read/write disk clusters */
31401 +int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
31402 + int user, /* 1: @buf is of user space,
31403 + 0: kernel space */
31404 + loff_t size, /* @buf size */
31405 + loff_t off, /* offset to start io from */
31406 + rw_op op, /* READ or WRITE */
31407 + flow_t * f /* resulting flow */)
31408 +{
31409 + assert("edward-436", f != NULL);
31410 + assert("edward-149", inode != NULL);
31411 + assert("edward-150", inode_file_plugin(inode) != NULL);
31412 + assert("edward-1465", user == 0); /* we use flow to read/write
31413 + disk clusters located in
31414 + kernel space */
31415 + f->length = size;
31416 + memcpy(&f->data, &buf, sizeof(buf));
31417 + f->user = user;
31418 + f->op = op;
31419 +
31420 + return key_by_inode_cryptcompress(inode, off, &f->key);
31421 +}
31422 +
31423 +static int
31424 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
31425 + znode_lock_mode lock_mode)
31426 +{
31427 + coord_t *coord;
31428 +
31429 + assert("edward-704", hint != NULL);
31430 + assert("edward-1089", !hint_is_valid(hint));
31431 + assert("edward-706", hint->lh.owner == NULL);
31432 +
31433 + coord = &hint->ext_coord.coord;
31434 +
31435 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
31436 + /* hint either not set or set by different operation */
31437 + return RETERR(-E_REPEAT);
31438 +
31439 + if (get_key_offset(key) != hint->offset)
31440 + /* hint is set for different key */
31441 + return RETERR(-E_REPEAT);
31442 +
31443 + assert("edward-707", reiser4_schedulable());
31444 +
31445 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
31446 + key, &hint->lh, lock_mode,
31447 + ZNODE_LOCK_LOPRI);
31448 +}
31449 +
31450 +/* reserve disk space when writing a logical cluster */
31451 +static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
31452 +{
31453 + int result = 0;
31454 +
31455 + assert("edward-965", reiser4_schedulable());
31456 + assert("edward-439", inode != NULL);
31457 + assert("edward-440", clust != NULL);
31458 + assert("edward-441", clust->pages != NULL);
31459 +
31460 + if (clust->nr_pages == 0) {
31461 + assert("edward-1152", clust->win != NULL);
31462 + assert("edward-1153", clust->win->stat == HOLE_WINDOW);
31463 + /* don't reserve disk space for fake logical cluster */
31464 + return 0;
31465 + }
31466 + assert("edward-442", jprivate(clust->pages[0]) != NULL);
31467 +
31468 + result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
31469 + estimate_update_cluster(inode),
31470 + BA_CAN_COMMIT);
31471 + if (result)
31472 + return result;
31473 + clust->reserved = 1;
31474 + grabbed2cluster_reserved(estimate_insert_cluster(inode) +
31475 + estimate_update_cluster(inode));
31476 +#if REISER4_DEBUG
31477 + clust->reserved_prepped = estimate_update_cluster(inode);
31478 + clust->reserved_unprepped = estimate_insert_cluster(inode);
31479 +#endif
31480 + /* there can be space grabbed by txnmgr_force_commit_all */
31481 + return 0;
31482 +}
31483 +
31484 +/* free reserved disk space if writing a logical cluster fails */
31485 +static void free_reserved4cluster(struct inode *inode,
31486 + struct cluster_handle *ch, int count)
31487 +{
31488 + assert("edward-967", ch->reserved == 1);
31489 +
31490 + cluster_reserved2free(count);
31491 + ch->reserved = 0;
31492 +}
31493 +
31494 +/* The core search procedure of the cryptcompress plugin.
31495 + If the returned value is not cbk_errored, the current znode is locked */
31496 +static int find_cluster_item(hint_t * hint,
31497 + const reiser4_key * key, /* key of the item we are
31498 + looking for */
31499 + znode_lock_mode lock_mode /* which lock */ ,
31500 + ra_info_t * ra_info, lookup_bias bias, __u32 flags)
31501 +{
31502 + int result;
31503 + reiser4_key ikey;
31504 + int went_right = 0;
31505 + coord_t *coord = &hint->ext_coord.coord;
31506 + coord_t orig = *coord;
31507 +
31508 + assert("edward-152", hint != NULL);
31509 +
31510 + if (!hint_is_valid(hint)) {
31511 + result = cryptcompress_hint_validate(hint, key, lock_mode);
31512 + if (result == -E_REPEAT)
31513 + goto traverse_tree;
31514 + else if (result) {
31515 + assert("edward-1216", 0);
31516 + return result;
31517 + }
31518 + hint_set_valid(hint);
31519 + }
31520 + assert("edward-709", znode_is_any_locked(coord->node));
31521 +
31522 + /* An in-place lookup is going on here; it means we just need to
31523 + check if the next item of the @coord matches the @key */
31524 +
31525 + if (equal_to_rdk(coord->node, key)) {
31526 + result = goto_right_neighbor(coord, &hint->lh);
31527 + if (result == -E_NO_NEIGHBOR) {
31528 + assert("edward-1217", 0);
31529 + return RETERR(-EIO);
31530 + }
31531 + if (result)
31532 + return result;
31533 + assert("edward-1218", equal_to_ldk(coord->node, key));
31534 + went_right = 1;
31535 + } else {
31536 + coord->item_pos++;
31537 + coord->unit_pos = 0;
31538 + coord->between = AT_UNIT;
31539 + }
31540 + result = zload(coord->node);
31541 + if (result)
31542 + return result;
31543 + assert("edward-1219", !node_is_empty(coord->node));
31544 +
31545 + if (!coord_is_existing_item(coord)) {
31546 + zrelse(coord->node);
31547 + goto not_found;
31548 + }
31549 + item_key_by_coord(coord, &ikey);
31550 + zrelse(coord->node);
31551 + if (!keyeq(key, &ikey))
31552 + goto not_found;
31553 + /* Ok, item is found, update node counts */
31554 + if (went_right)
31555 + dclust_inc_extension_ncount(hint);
31556 + return CBK_COORD_FOUND;
31557 +
31558 + not_found:
31559 + assert("edward-1220", coord->item_pos > 0);
31560 + //coord->item_pos--;
31561 + /* roll back */
31562 + *coord = orig;
31563 + ON_DEBUG(coord_update_v(coord));
31564 + return CBK_COORD_NOTFOUND;
31565 +
31566 + traverse_tree:
31567 + assert("edward-713", hint->lh.owner == NULL);
31568 + assert("edward-714", reiser4_schedulable());
31569 +
31570 + reiser4_unset_hint(hint);
31571 + dclust_init_extension(hint);
31572 + coord_init_zero(coord);
31573 + result = coord_by_key(current_tree, key, coord, &hint->lh,
31574 + lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
31575 + CBK_UNIQUE | flags, ra_info);
31576 + if (cbk_errored(result))
31577 + return result;
31578 + if (result == CBK_COORD_FOUND)
31579 + dclust_inc_extension_ncount(hint);
31580 + hint_set_valid(hint);
31581 + return result;
31582 +}
31583 +
31584 +/* This function is called by deflate[inflate] manager when
31585 + creating a transformed/plain stream to check if we should
31586 + create/cut some overhead. If this returns true, then @oh
31587 + contains the size of this overhead.
31588 + */
31589 +static int need_cut_or_align(struct inode * inode,
31590 + struct cluster_handle * ch, rw_op rw, int * oh)
31591 +{
31592 + struct tfm_cluster * tc = &ch->tc;
31593 + switch (rw) {
31594 + case WRITE_OP: /* estimate align */
31595 + *oh = tc->len % cipher_blocksize(inode);
31596 + if (*oh != 0)
31597 + return 1;
31598 + break;
31599 + case READ_OP: /* estimate cut */
31600 + *oh = *(tfm_output_data(ch) + tc->len - 1);
31601 + break;
31602 + default:
31603 + impossible("edward-1401", "bad option");
31604 + }
31605 + return (tc->len != tc->lsize);
31606 +}
31607 +
31608 +/* create/cut an overhead of transformed/plain stream */
31609 +static void align_or_cut_overhead(struct inode * inode,
31610 + struct cluster_handle * ch, rw_op rw)
31611 +{
31612 + int oh;
31613 + cipher_plugin * cplug = inode_cipher_plugin(inode);
31614 +
31615 + assert("edward-1402", need_cipher(inode));
31616 +
31617 + if (!need_cut_or_align(inode, ch, rw, &oh))
31618 + return;
31619 + switch (rw) {
31620 + case WRITE_OP: /* do align */
31621 + ch->tc.len +=
31622 + cplug->align_stream(tfm_input_data(ch) +
31623 + ch->tc.len, ch->tc.len,
31624 + cipher_blocksize(inode));
31625 + *(tfm_input_data(ch) + ch->tc.len - 1) =
31626 + cipher_blocksize(inode) - oh;
31627 + break;
31628 + case READ_OP: /* do cut */
31629 + assert("edward-1403", oh <= cipher_blocksize(inode));
31630 + ch->tc.len -= oh;
31631 + break;
31632 + default:
31633 + impossible("edward-1404", "bad option");
31634 + }
31635 + return;
31636 +}
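
The same pad-and-record arithmetic, restated as a self-contained user-space sketch. Unlike the kernel code, which pads only when the stream is not already block-aligned, this sketch always pads; the 16-byte block size is an arbitrary example:

#include <assert.h>
#include <string.h>

enum { BLK = 16 };	/* example cipher block size */

/* pad to a block boundary; the last byte records the overhead size */
static size_t align_write(unsigned char *buf, size_t len)
{
	size_t pad = BLK - len % BLK;	/* 1..BLK */

	memset(buf + len, 0, pad);
	buf[len + pad - 1] = (unsigned char)pad;	/* control byte */
	return len + pad;
}

/* drop the overhead recorded by align_write() */
static size_t cut_read(const unsigned char *buf, size_t len)
{
	return len - buf[len - 1];
}

int main(void)
{
	unsigned char buf[64] = "hello";
	size_t n = align_write(buf, 5);

	assert(n % BLK == 0);		/* 16 */
	assert(cut_read(buf, n) == 5);	/* original length restored */
	return 0;
}
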
31637 +
31638 +static unsigned max_cipher_overhead(struct inode * inode)
31639 +{
31640 + if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
31641 + return 0;
31642 + return cipher_blocksize(inode);
31643 +}
31644 +
31645 +static int deflate_overhead(struct inode *inode)
31646 +{
31647 + return (inode_compression_plugin(inode)->
31648 + checksum ? DC_CHECKSUM_SIZE : 0);
31649 +}
31650 +
31651 +static unsigned deflate_overrun(struct inode * inode, int ilen)
31652 +{
31653 + return coa_overrun(inode_compression_plugin(inode), ilen);
31654 +}
31655 +
31656 +/* Estimating compressibility of a logical cluster by various
31657 + policies represented by compression mode plugin.
31658 + If this returns false, then the compressor won't be called for
31659 + the cluster of index @index.
31660 +*/
31661 +static int should_compress(struct tfm_cluster * tc, cloff_t index,
31662 + struct inode *inode)
31663 +{
31664 + compression_plugin *cplug = inode_compression_plugin(inode);
31665 + compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
31666 +
31667 + assert("edward-1321", tc->len != 0);
31668 + assert("edward-1322", cplug != NULL);
31669 + assert("edward-1323", mplug != NULL);
31670 +
31671 + return /* estimate by size */
31672 + (cplug->min_size_deflate ?
31673 + tc->len >= cplug->min_size_deflate() :
31674 + 1) &&
31675 + /* estimate by compression mode plugin */
31676 + (mplug->should_deflate ?
31677 + mplug->should_deflate(inode, index) :
31678 + 1);
31679 +}
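
A sketch of the two gates above with hypothetical stand-ins for the plugin callbacks (the real policies come from the compression and compression-mode plugins):

#include <stdio.h>

/* hypothetical stand-ins for the plugin callbacks */
static int min_size_deflate(void)		{ return 256; }
static int should_deflate(unsigned long index)	{ return index % 2 == 0; }

static int should_compress(int len, unsigned long index)
{
	return len >= min_size_deflate() && should_deflate(index);
}

int main(void)
{
	printf("%d\n", should_compress(4096, 0));	/* 1 */
	printf("%d\n", should_compress(100, 0));	/* 0: too small */
	printf("%d\n", should_compress(4096, 1));	/* 0: mode says no */
	return 0;
}
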
31680 +
31681 +/* Evaluating the results of a compression transform.
31682 + Returns true if we should accept these results */
31683 +static int save_compressed(int size_before, int size_after, struct inode *inode)
31684 +{
31685 + return (size_after + deflate_overhead(inode) +
31686 + max_cipher_overhead(inode) < size_before);
31687 +}
31688 +
31689 +/* Infer the result of the evaluation above from the stream lengths */
31690 +static int need_inflate(struct cluster_handle * ch, struct inode * inode,
31691 + int encrypted /* is cluster encrypted */ )
31692 +{
31693 + struct tfm_cluster * tc = &ch->tc;
31694 +
31695 + assert("edward-142", tc != 0);
31696 + assert("edward-143", inode != NULL);
31697 +
31698 + return tc->len <
31699 + (encrypted ?
31700 + inode_scaled_offset(inode, tc->lsize) :
31701 + tc->lsize);
31702 +}
31703 +
31704 +/* If results of compression were accepted, then we add
31705 + a checksum to catch possible disk cluster corruption.
31706 + The following is a format of the data stored in disk clusters:
31707 +
31708 + data This is (transformed) logical cluster.
31709 + cipher_overhead This is created by ->align() method
31710 + of cipher plugin. May be absent.
31711 + checksum (4) This is created by ->checksum method
31712 + of compression plugin to check
31713 + integrity. May be absent.
31714 +
31715 + Crypto overhead format:
31716 +
31717 + data
31718 + control_byte (1) contains aligned overhead size:
31719 + 1 <= overhead <= cipher_blksize
31720 +*/
31721 +/* Append a checksum at the end of a transformed stream */
31722 +static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
31723 +{
31724 + __u32 checksum;
31725 +
31726 + assert("edward-1309", tc != NULL);
31727 + assert("edward-1310", tc->len > 0);
31728 + assert("edward-1311", cplug->checksum != NULL);
31729 +
31730 + checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
31731 + put_unaligned(cpu_to_le32(checksum),
31732 + (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
31733 + tc->len += (int)DC_CHECKSUM_SIZE;
31734 +}
31735 +
31736 +/* Check a disk cluster checksum.
31737 + Returns 0 if checksum is correct, otherwise returns 1 */
31738 +static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
31739 +{
31740 + assert("edward-1312", tc != NULL);
31741 + assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
31742 + assert("edward-1314", cplug->checksum != NULL);
31743 +
31744 + if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
31745 + tc->len - (int)DC_CHECKSUM_SIZE) !=
31746 + le32_to_cpu(get_unaligned((d32 *)
31747 + (tfm_stream_data(tc, INPUT_STREAM)
31748 + + tc->len - (int)DC_CHECKSUM_SIZE)))) {
31749 + warning("edward-156",
31750 + "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
31751 + (int)le32_to_cpu
31752 + (get_unaligned((d32 *)
31753 + (tfm_stream_data(tc, INPUT_STREAM) +
31754 + tc->len - (int)DC_CHECKSUM_SIZE))),
31755 + (int)cplug->checksum
31756 + (tfm_stream_data(tc, INPUT_STREAM),
31757 + tc->len - (int)DC_CHECKSUM_SIZE));
31758 + return 1;
31759 + }
31760 + tc->len -= (int)DC_CHECKSUM_SIZE;
31761 + return 0;
31762 +}
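
A toy user-space analogue of dc_set_checksum()/dc_check_checksum(): append a little-endian 32-bit checksum after the payload, then verify and strip it on read. The additive checksum here is a placeholder; the real code uses the compression plugin's ->checksum() method:

#include <stdint.h>
#include <stdio.h>

static uint32_t toy_checksum(const unsigned char *p, size_t n)
{
	uint32_t s = 0;

	while (n--)
		s = s * 31 + *p++;
	return s;
}

static size_t set_checksum(unsigned char *buf, size_t len)
{
	uint32_t c = toy_checksum(buf, len);

	buf[len + 0] = c & 0xff;	/* little-endian, as cpu_to_le32() */
	buf[len + 1] = (c >> 8) & 0xff;
	buf[len + 2] = (c >> 16) & 0xff;
	buf[len + 3] = (c >> 24) & 0xff;
	return len + 4;			/* cf. DC_CHECKSUM_SIZE */
}

static int check_checksum(unsigned char *buf, size_t len, size_t *newlen)
{
	uint32_t stored = buf[len - 4] |
			  ((uint32_t)buf[len - 3] << 8) |
			  ((uint32_t)buf[len - 2] << 16) |
			  ((uint32_t)buf[len - 1] << 24);

	if (toy_checksum(buf, len - 4) != stored)
		return 1;		/* corrupted: advise fsck */
	*newlen = len - 4;
	return 0;
}

int main(void)
{
	unsigned char buf[32] = "payload";
	size_t n = set_checksum(buf, 7), m = 0;

	printf("%d\n", check_checksum(buf, n, &m));	/* 0; m == 7 */
	return 0;
}
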
31763 +
31764 +/* get input/output stream for some transform action */
31765 +int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
31766 + tfm_stream_id id)
31767 +{
31768 + size_t size = inode_scaled_cluster_size(inode);
31769 +
31770 + assert("edward-901", tc != NULL);
31771 + assert("edward-1027", inode_compression_plugin(inode) != NULL);
31772 +
31773 + if (cluster_get_tfm_act(tc) == TFMA_WRITE)
31774 + size += deflate_overrun(inode, inode_cluster_size(inode));
31775 +
31776 + if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
31777 + alternate_streams(tc);
31778 + if (!get_tfm_stream(tc, id))
31779 + return alloc_tfm_stream(tc, size, id);
31780 +
31781 + assert("edward-902", tfm_stream_is_set(tc, id));
31782 +
31783 + if (tfm_stream_size(tc, id) < size)
31784 + return realloc_tfm_stream(tc, size, id);
31785 + return 0;
31786 +}
31787 +
31788 +/* Common deflate manager */
31789 +int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
31790 +{
31791 + int result = 0;
31792 + int compressed = 0;
31793 + int encrypted = 0;
31794 + struct tfm_cluster * tc = &clust->tc;
31795 + compression_plugin * coplug;
31796 +
31797 + assert("edward-401", inode != NULL);
31798 + assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
31799 + assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
31800 + assert("edward-498", !tfm_cluster_is_uptodate(tc));
31801 +
31802 + coplug = inode_compression_plugin(inode);
31803 + if (should_compress(tc, clust->index, inode)) {
31804 + /* try to compress, discard bad results */
31805 + __u32 dst_len;
31806 + compression_mode_plugin * mplug =
31807 + inode_compression_mode_plugin(inode);
31808 + assert("edward-602", coplug != NULL);
31809 + assert("edward-1423", coplug->compress != NULL);
31810 +
31811 + result = grab_coa(tc, coplug);
31812 + if (result) {
31813 + warning("edward-1424",
31814 + "alloc_coa failed with ret=%d, skipped compression",
31815 + result);
31816 + goto cipher;
31817 + }
31818 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31819 + if (result) {
31820 + warning("edward-1425",
31821 + "alloc stream failed with ret=%d, skipped compression",
31822 + result);
31823 + goto cipher;
31824 + }
31825 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
31826 + coplug->compress(get_coa(tc, coplug->h.id, tc->act),
31827 + tfm_input_data(clust), tc->len,
31828 + tfm_output_data(clust), &dst_len);
31829 + /* make sure we didn't overwrite extra bytes */
31830 + assert("edward-603",
31831 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
31832 +
31833 + /* evaluate results of compression transform */
31834 + if (save_compressed(tc->len, dst_len, inode)) {
31835 + /* good result, accept */
31836 + tc->len = dst_len;
31837 + if (mplug->accept_hook != NULL) {
31838 + result = mplug->accept_hook(inode, clust->index);
31839 + if (result)
31840 + warning("edward-1426",
31841 + "accept_hook failed with ret=%d",
31842 + result);
31843 + }
31844 + compressed = 1;
31845 + }
31846 + else {
31847 + /* bad result, discard */
31848 +#if 0
31849 + if (cluster_is_complete(clust, inode))
31850 + warning("edward-1496",
31851 + "incompressible cluster %lu (inode %llu)",
31852 + clust->index,
31853 + (unsigned long long)get_inode_oid(inode));
31854 +#endif
31855 + if (mplug->discard_hook != NULL &&
31856 + cluster_is_complete(clust, inode)) {
31857 + result = mplug->discard_hook(inode,
31858 + clust->index);
31859 + if (result)
31860 + warning("edward-1427",
31861 + "discard_hook failed with ret=%d",
31862 + result);
31863 + }
31864 + }
31865 + }
31866 + cipher:
31867 + if (need_cipher(inode)) {
31868 + cipher_plugin * ciplug;
31869 + struct blkcipher_desc desc;
31870 + struct scatterlist src;
31871 + struct scatterlist dst;
31872 +
31873 + ciplug = inode_cipher_plugin(inode);
31874 + desc.tfm = info_get_cipher(inode_crypto_info(inode));
31875 + desc.flags = 0;
31876 + if (compressed)
31877 + alternate_streams(tc);
31878 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31879 + if (result)
31880 + return result;
31881 +
31882 + align_or_cut_overhead(inode, clust, WRITE_OP);
31883 + src.page = virt_to_page(tfm_input_data(clust));
31884 + src.offset = offset_in_page(tfm_input_data(clust));
31885 + src.length = tc->len;
31886 +
31887 + dst.page = virt_to_page(tfm_output_data(clust));
31888 + dst.offset = offset_in_page(tfm_output_data(clust));
31889 + dst.length = tc->len;
31890 +
31891 + result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
31892 + if (result) {
31893 + warning("edward-1405",
31894 + "encryption failed flags=%x\n", desc.flags);
31895 + return result;
31896 + }
31897 + encrypted = 1;
31898 + }
31899 + if (compressed && coplug->checksum != NULL)
31900 + dc_set_checksum(coplug, tc);
31901 + if (!compressed && !encrypted)
31902 + alternate_streams(tc);
31903 + return result;
31904 +}
31905 +
31906 +/* Common inflate manager. */
31907 +int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
31908 +{
31909 + int result = 0;
31910 + int transformed = 0;
31911 + struct tfm_cluster * tc = &clust->tc;
31912 + compression_plugin * coplug;
31913 +
31914 + assert("edward-905", inode != NULL);
31915 + assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
31916 + assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
31917 + assert("edward-1349", tc->act == TFMA_READ);
31918 + assert("edward-907", !tfm_cluster_is_uptodate(tc));
31919 +
31920 + /* Handle a checksum (if any) */
31921 + coplug = inode_compression_plugin(inode);
31922 + if (need_inflate(clust, inode, need_cipher(inode)) &&
31923 + coplug->checksum != NULL) {
31924 + result = dc_check_checksum(coplug, tc);
31925 + if (unlikely(result)) {
31926 + warning("edward-1460",
31927 + "Inode %llu: disk cluster %lu looks corrupted",
31928 + (unsigned long long)get_inode_oid(inode),
31929 + clust->index);
31930 + return RETERR(-EIO);
31931 + }
31932 + }
31933 + if (need_cipher(inode)) {
31934 + cipher_plugin * ciplug;
31935 + struct blkcipher_desc desc;
31936 + struct scatterlist src;
31937 + struct scatterlist dst;
31938 +
31939 + ciplug = inode_cipher_plugin(inode);
31940 + desc.tfm = info_get_cipher(inode_crypto_info(inode));
31941 + desc.flags = 0;
31942 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31943 + if (result)
31944 + return result;
31945 + assert("edward-909", tfm_cluster_is_set(tc));
31946 +
31947 + src.page = virt_to_page(tfm_input_data(clust));
31948 + src.offset = offset_in_page(tfm_input_data(clust));
31949 + src.length = tc->len;
31950 +
31951 + dst.page = virt_to_page(tfm_output_data(clust));
31952 + dst.offset = offset_in_page(tfm_output_data(clust));
31953 + dst.length = tc->len;
31954 +
31955 + result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
31956 + if (result) {
31957 + warning("edward-1600", "decrypt failed flags=%x\n",
31958 + desc.flags);
31959 + return result;
31960 + }
31961 + align_or_cut_overhead(inode, clust, READ_OP);
31962 + transformed = 1;
31963 + }
31964 + if (need_inflate(clust, inode, 0)) {
31965 + unsigned dst_len = inode_cluster_size(inode);
31966 + if (transformed)
31967 + alternate_streams(tc);
31968 +
31969 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31970 + if (result)
31971 + return result;
31972 + assert("edward-1305", coplug->decompress != NULL);
31973 + assert("edward-910", tfm_cluster_is_set(tc));
31974 +
31975 + coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
31976 + tfm_input_data(clust), tc->len,
31977 + tfm_output_data(clust), &dst_len);
31978 + /* check length */
31979 + tc->len = dst_len;
31980 + assert("edward-157", dst_len == tc->lsize);
31981 + transformed = 1;
31982 + }
31983 + if (!transformed)
31984 + alternate_streams(tc);
31985 + return result;
31986 +}
31987 +
31988 +/* This is the implementation of the ->readpage() method of struct
31989 + address_space_operations for the cryptcompress plugin. */
31990 +int readpage_cryptcompress(struct file *file, struct page *page)
31991 +{
31992 + reiser4_context *ctx;
31993 + struct cluster_handle clust;
31994 + item_plugin *iplug;
31995 + int result;
31996 +
31997 + assert("edward-88", PageLocked(page));
31998 + assert("vs-976", !PageUptodate(page));
31999 + assert("edward-89", page->mapping && page->mapping->host);
32000 +
32001 + ctx = reiser4_init_context(page->mapping->host->i_sb);
32002 + if (IS_ERR(ctx)) {
32003 + unlock_page(page);
32004 + return PTR_ERR(ctx);
32005 + }
32006 + assert("edward-113",
32007 + ergo(file != NULL,
32008 + page->mapping == file->f_dentry->d_inode->i_mapping));
32009 +
32010 + if (PageUptodate(page)) {
32011 + warning("edward-1338", "page is already uptodate\n");
32012 + unlock_page(page);
32013 + reiser4_exit_context(ctx);
32014 + return 0;
32015 + }
32016 + cluster_init_read(&clust, NULL);
32017 + clust.file = file;
32018 + iplug = item_plugin_by_id(CTAIL_ID);
32019 + if (!iplug->s.file.readpage) {
32020 + unlock_page(page);
32021 + put_cluster_handle(&clust);
32022 + reiser4_exit_context(ctx);
32023 + return -EINVAL;
32024 + }
32025 + result = iplug->s.file.readpage(&clust, page);
32026 +
32027 + put_cluster_handle(&clust);
32028 + reiser4_txn_restart(ctx);
32029 + reiser4_exit_context(ctx);
32030 + return result;
32031 +}
32032 +
32033 +/* number of pages to check in */
32034 +static int get_new_nrpages(struct cluster_handle * clust)
32035 +{
32036 + switch (clust->op) {
32037 + case LC_APPOV:
32038 + return clust->nr_pages;
32039 + case LC_TRUNC:
32040 + assert("edward-1179", clust->win != NULL);
32041 + return size_in_pages(clust->win->off + clust->win->count);
32042 + default:
32043 + impossible("edward-1180", "bad page cluster option");
32044 + return 0;
32045 + }
32046 +}
32047 +
32048 +static void set_cluster_pages_dirty(struct cluster_handle * clust,
32049 + struct inode * inode)
32050 +{
32051 + int i;
32052 + struct page *pg;
32053 + int nrpages = get_new_nrpages(clust);
32054 +
32055 + for (i = 0; i < nrpages; i++) {
32056 +
32057 + pg = clust->pages[i];
32058 + assert("edward-968", pg != NULL);
32059 + lock_page(pg);
32060 + assert("edward-1065", PageUptodate(pg));
32061 + reiser4_set_page_dirty_internal(pg);
32062 + unlock_page(pg);
32063 + mark_page_accessed(pg);
32064 + }
32065 +}
32066 +
32067 +/* Grab a page cluster for read/write operations.
32068 + Attach a jnode for write operations (when preparing for modifications, which
32069 + are supposed to be committed).
32070 +
32071 + We allocate only one jnode per page cluster; this jnode is bound to the
32072 + first page of this cluster, so we have an extra reference that will be
32073 + put as soon as the jnode is evicted from memory; other references will be
32074 + cleaned up at flush time (assuming the page cluster checkin succeeded).
32075 +*/
32076 +int grab_page_cluster(struct inode * inode,
32077 + struct cluster_handle * clust, rw_op rw)
32078 +{
32079 + int i;
32080 + int result = 0;
32081 + jnode *node = NULL;
32082 +
32083 + assert("edward-182", clust != NULL);
32084 + assert("edward-183", clust->pages != NULL);
32085 + assert("edward-1466", clust->node == NULL);
32086 + assert("edward-1428", inode != NULL);
32087 + assert("edward-1429", inode->i_mapping != NULL);
32088 + assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32089 +
32090 + if (clust->nr_pages == 0)
32091 + return 0;
32092 +
32093 + for (i = 0; i < clust->nr_pages; i++) {
32094 +
32095 + assert("edward-1044", clust->pages[i] == NULL);
32096 +
32097 + clust->pages[i] =
32098 + find_or_create_page(inode->i_mapping,
32099 + clust_to_pg(clust->index, inode) + i,
32100 + reiser4_ctx_gfp_mask_get());
32101 + if (!clust->pages[i]) {
32102 + result = RETERR(-ENOMEM);
32103 + break;
32104 + }
32105 + if (i == 0 && rw == WRITE_OP) {
32106 + node = jnode_of_page(clust->pages[i]);
32107 + if (IS_ERR(node)) {
32108 + result = PTR_ERR(node);
32109 + unlock_page(clust->pages[i]);
32110 + break;
32111 + }
32112 + JF_SET(node, JNODE_CLUSTER_PAGE);
32113 + assert("edward-920", jprivate(clust->pages[0]));
32114 + }
32115 + INODE_PGCOUNT_INC(inode);
32116 + unlock_page(clust->pages[i]);
32117 + }
32118 + if (unlikely(result)) {
32119 + while (i) {
32120 + put_cluster_page(clust->pages[--i]);
32121 + INODE_PGCOUNT_DEC(inode);
32122 + }
32123 + if (node && !IS_ERR(node))
32124 + jput(node);
32125 + return result;
32126 + }
32127 + clust->node = node;
32128 + return 0;
32129 +}
32130 +
32131 +static void truncate_page_cluster_range(struct inode * inode,
32132 + struct page ** pages,
32133 + cloff_t index,
32134 + int from, int count,
32135 + int even_cows)
32136 +{
32137 + assert("edward-1467", count > 0);
32138 + reiser4_invalidate_pages(inode->i_mapping,
32139 + clust_to_pg(index, inode) + from,
32140 + count, even_cows);
32141 +}
32142 +
32143 +/* Put @count pages starting from @from offset */
32144 +void __put_page_cluster(int from, int count,
32145 + struct page ** pages, struct inode * inode)
32146 +{
32147 + int i;
32148 + assert("edward-1468", pages != NULL);
32149 + assert("edward-1469", inode != NULL);
32150 + assert("edward-1470", from >= 0 && count >= 0);
32151 +
32152 + for (i = 0; i < count; i++) {
32153 + assert("edward-1471", pages[from + i] != NULL);
32154 + assert("edward-1472",
32155 + pages[from + i]->index == pages[from]->index + i);
32156 +
32157 + put_cluster_page(pages[from + i]);
32158 + INODE_PGCOUNT_DEC(inode);
32159 + }
32160 +}
32161 +
32162 +/*
32163 + * This is the dual of grab_page_cluster;
32164 + * however, if @rw == WRITE_OP, then we call this function
32165 + * only if something failed before the page cluster was checked in.
32166 + */
32167 +void put_page_cluster(struct cluster_handle * clust,
32168 + struct inode * inode, rw_op rw)
32169 +{
32170 + assert("edward-445", clust != NULL);
32171 + assert("edward-922", clust->pages != NULL);
32172 + assert("edward-446",
32173 + ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
32174 +
32175 + __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
32176 + if (rw == WRITE_OP) {
32177 + if (unlikely(clust->node)) {
32178 + assert("edward-447",
32179 + clust->node == jprivate(clust->pages[0]));
32180 + jput(clust->node);
32181 + clust->node = NULL;
32182 + }
32183 + }
32184 +}
32185 +
32186 +#if REISER4_DEBUG
32187 +int cryptcompress_inode_ok(struct inode *inode)
32188 +{
32189 + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
32190 + return 0;
32191 + if (!cluster_shift_ok(inode_cluster_shift(inode)))
32192 + return 0;
32193 + return 1;
32194 +}
32195 +
32196 +static int window_ok(struct reiser4_slide * win, struct inode *inode)
32197 +{
32198 + assert("edward-1115", win != NULL);
32199 + assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32200 +
32201 + return (win->off != inode_cluster_size(inode)) &&
32202 + (win->off + win->count + win->delta <= inode_cluster_size(inode));
32203 +}
32204 +
32205 +static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
32206 +{
32207 + assert("edward-279", clust != NULL);
32208 +
32209 + if (!clust->pages)
32210 + return 0;
32211 + return (clust->win ? window_ok(clust->win, inode) : 1);
32212 +}
32213 +#if 0
32214 +static int pages_truncate_ok(struct inode *inode, pgoff_t start)
32215 +{
32216 + int found;
32217 + struct page * page;
32218 +
32219 + found = find_get_pages(inode->i_mapping, start, 1, &page);
32220 + if (found)
32221 + put_cluster_page(page);
32222 + return !found;
32223 +}
32224 +#else
32225 +#define pages_truncate_ok(inode, start) 1
32226 +#endif
32227 +
32228 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
32229 +{
32230 + jnode *node;
32231 + node = jlookup(current_tree, get_inode_oid(inode),
32232 + clust_to_pg(index, inode));
32233 + if (likely(!node))
32234 + return 1;
32235 + jput(node);
32236 + return 0;
32237 +}
32238 +
32239 +static int find_fake_appended(struct inode *inode, cloff_t * index);
32240 +
32241 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
32242 +{
32243 + int result;
32244 + cloff_t raidx;
32245 +
32246 + result = find_fake_appended(inode, &raidx);
32247 + return !result && (aidx == raidx);
32248 +}
32249 +#endif
32250 +
32251 +/* guess next window stat */
32252 +static inline window_stat next_window_stat(struct reiser4_slide * win)
32253 +{
32254 + assert("edward-1130", win != NULL);
32255 + return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32256 + HOLE_WINDOW : DATA_WINDOW);
32257 +}
32258 +
32259 +/* guess and set next cluster index and window params */
32260 +static void move_update_window(struct inode * inode,
32261 + struct cluster_handle * clust,
32262 + loff_t file_off, loff_t to_file)
32263 +{
32264 + struct reiser4_slide * win;
32265 +
32266 + assert("edward-185", clust != NULL);
32267 + assert("edward-438", clust->pages != NULL);
32268 + assert("edward-281", cluster_ok(clust, inode));
32269 +
32270 + win = clust->win;
32271 + if (!win)
32272 + return;
32273 +
32274 + switch (win->stat) {
32275 + case DATA_WINDOW:
32276 + /* increment */
32277 + clust->index++;
32278 + win->stat = DATA_WINDOW;
32279 + win->off = 0;
32280 + win->count = min((loff_t)inode_cluster_size(inode), to_file);
32281 + break;
32282 + case HOLE_WINDOW:
32283 + switch (next_window_stat(win)) {
32284 + case HOLE_WINDOW:
32285 + /* skip */
32286 + clust->index = off_to_clust(file_off, inode);
32287 + win->stat = HOLE_WINDOW;
32288 + win->off = 0;
32289 + win->count = off_to_cloff(file_off, inode);
32290 + win->delta = min((loff_t)(inode_cluster_size(inode) -
32291 + win->count), to_file);
32292 + break;
32293 + case DATA_WINDOW:
32294 + /* stay */
32295 + win->stat = DATA_WINDOW;
32296 + /* off + count + delta is an invariant here */
32297 + win->off = win->off + win->count;
32298 + win->count = win->delta;
32299 + win->delta = 0;
32300 + break;
32301 + default:
32302 + impossible("edward-282", "wrong next window state");
32303 + }
32304 + break;
32305 + default:
32306 + impossible("edward-283", "wrong current window state");
32307 + }
32308 + assert("edward-1068", cluster_ok(clust, inode));
32309 +}
32310 +
32311 +static int update_sd_cryptcompress(struct inode *inode)
32312 +{
32313 + int result = 0;
32314 +
32315 + assert("edward-978", reiser4_schedulable());
32316 +
32317 + result = reiser4_grab_space_force(/* one for stat data update */
32318 + estimate_update_common(inode),
32319 + BA_CAN_COMMIT);
32320 + if (result)
32321 + return result;
32322 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
32323 + result = reiser4_update_sd(inode);
32324 +
32325 + return result;
32326 +}
32327 +
32328 +static void uncapture_cluster_jnode(jnode * node)
32329 +{
32330 + txn_atom *atom;
32331 +
32332 + assert_spin_locked(&(node->guard));
32333 +
32334 + atom = jnode_get_atom(node);
32335 + if (atom == NULL) {
32336 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
32337 + spin_unlock_jnode(node);
32338 + return;
32339 + }
32340 + reiser4_uncapture_block(node);
32341 + spin_unlock_atom(atom);
32342 + jput(node);
32343 +}
32344 +
32345 +static void put_found_pages(struct page **pages, int nr)
32346 +{
32347 + int i;
32348 + for (i = 0; i < nr; i++) {
32349 + assert("edward-1045", pages[i] != NULL);
32350 + put_cluster_page(pages[i]);
32351 + }
32352 +}
32353 +
32354 +/* Lifecycle of a logical cluster in the system.
32355 + *
32356 + *
32357 + * Logical cluster of a cryptcompress file is represented in the system by
32358 + * . page cluster (in memory, primary cache, contains plain text);
32359 + * . disk cluster (in memory, secondary cache, contains transformed text).
32360 + * The primary cache reduces the number of transform operations (compression,
32361 + * encryption), i.e. it implements a transform-caching strategy.
32362 + * The secondary cache reduces the number of I/O operations, i.e. the usual
32363 + * write-caching strategy. A page cluster is a set of pages, i.e. the mapping
32364 + * of a logical cluster to the primary cache. A disk cluster is a set of
32365 + * items of the same type defined by some reiser4 item plugin id.
32366 + *
32367 + * 1. Performing modifications
32368 + *
32369 + * Every modification of a cryptcompress file is considered as a set of
32370 + * operations performed on the file's logical clusters. Every such "atomic"
32371 + * modification truncates, appends and/or overwrites some bytes of a
32372 + * logical cluster in the primary cache, with subsequent synchronization
32373 + * with the secondary cache (at flush time). Disk clusters,
32374 + * which live in the secondary cache, are supposed to be synchronized with
32375 + * disk. The mechanism of synchronization of primary and secondary caches
32376 + * includes so-called checkin/checkout technique described below.
32377 + *
32378 + * 2. Submitting modifications
32379 + *
32380 + * Each page cluster has associated jnode (a special in-memory header to
32381 + * keep a track of transactions in reiser4), which is attached to its first
32382 + * page when grabbing page cluster for modifications (see grab_page_cluster).
32383 + * Submitting modifications (see checkin_logical_cluster) proceeds per
32384 + * logical cluster and includes:
32385 + * . checkin_cluster_size;
32386 + * . checkin_page_cluster.
32387 + * checkin_cluster_size() resolves to a file size update (which completely
32388 + * defines the new size of the logical cluster, i.e. the number of the
32389 + * file's bytes in that logical cluster).
32390 + * checkin_page_cluster() captures jnode of a page cluster and installs
32391 + * jnode's dirty flag (if needed) to indicate that modifications are
32392 + * successfully checked in.
32393 + *
32394 + * 3. Checking out modifications
32395 + *
32396 + * Proceeds per logical cluster at flush time (see checkout_logical_cluster).
32397 + * This is the time of synchronizing primary and secondary caches.
32398 + * checkout_logical_cluster() includes:
32399 + * . checkout_page_cluster (retrieving checked in pages).
32400 + * . uncapture jnode (including clear dirty flag and unlock)
32401 + *
32402 + * 4. Committing modifications
32403 + *
32404 + * This completes the synchronization of the primary and secondary caches.
32405 + * When checking out a page cluster (the phase above), its pages are locked/
32406 + * flushed/unlocked one by one in ascending order of their indexes into a
32407 + * contiguous stream, which is then transformed (compressed, encrypted),
32408 + * chopped up into items and committed to disk as a disk cluster.
32409 + *
32410 + * 5. Managing page references
32411 + *
32412 + * Every checked-in page has a special additional "control" reference,
32413 + * which is dropped at checkout. We need this to prevent pages from being
32414 + * unexpectedly evicted from memory before checkout. Control references
32415 + * are managed so that they do not accumulate with every checkin:
32416 + *
32417 + *            0
32418 + * checkin -> 1
32419 + *            0 <- checkout
32420 + * checkin -> 1
32421 + * checkin -> 1
32422 + * checkin -> 1
32423 + *            0 <- checkout
32424 + * ...
32425 + *
32426 + * Every page cluster has its own unique "cluster lock". Updating/dropping
32427 + * references is serialized via this lock. The number of checked-in cluster
32428 + * pages is calculated from i_size under the cluster lock. The file size
32429 + * is also updated under the cluster lock at every checkin action (except
32430 + * in the cases of appending/truncating fake logical clusters).
32431 + *
32432 + * Proof of correctness:
32433 + *
32434 + * Since we update the file size under the cluster lock, holding the lock
32435 + * of a non-fake logical cluster guarantees the expected number of
32436 + * checked-in pages. On the other hand, appending/truncating fake logical
32437 + * clusters doesn't change the number of checked-in pages of any cluster.
32438 + *
32439 + * NOTE-EDWARD: As the cluster lock we use the guard (spinlock_t) of the
32440 + * cluster's jnode. Currently, I don't see any reason to create a special
32441 + * lock for these needs.
32442 + */
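+
+/*
+ * Editorial sketch of the lifecycle described above (illustration only,
+ * not part of the control flow; names refer to functions defined in
+ * this file, error handling omitted):
+ *
+ *   modification path:                 flush path:
+ *     grab_page_cluster()                checkout_logical_cluster()
+ *       attach jnode to first page         grab_tfm_stream(),
+ *     ... modify pages ...                 lock_cluster(node)
+ *     checkin_logical_cluster()            checkout_page_cluster()
+ *       lock_cluster(node)                   drop control references,
+ *       checkin_cluster_size()               uncapture jnode and unlock,
+ *       checkin_page_cluster()               copy page data to the stream
+ *         capture jnode, make it dirty,
+ *         unlock_cluster(node)
+ */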
32443 +
32444 +static inline void lock_cluster(jnode * node)
32445 +{
32446 + spin_lock_jnode(node);
32447 +}
32448 +
32449 +static inline void unlock_cluster(jnode * node)
32450 +{
32451 + spin_unlock_jnode(node);
32452 +}
32453 +
32454 +static inline void unlock_cluster_uncapture(jnode * node)
32455 +{
32456 + uncapture_cluster_jnode(node);
32457 +}
32458 +
32459 +/* Set new file size by window. Cluster lock is required. */
32460 +static void checkin_file_size(struct cluster_handle * clust,
32461 + struct inode * inode)
32462 +{
32463 + loff_t new_size;
32464 + struct reiser4_slide * win;
32465 +
32466 + assert("edward-1181", clust != NULL);
32467 + assert("edward-1182", inode != NULL);
32468 + assert("edward-1473", clust->pages != NULL);
32469 + assert("edward-1474", clust->pages[0] != NULL);
32470 + assert("edward-1475", jprivate(clust->pages[0]) != NULL);
32471 + assert_spin_locked(&(jprivate(clust->pages[0])->guard));
32472 +
32474 + win = clust->win;
32475 + assert("edward-1183", win != NULL);
32476 +
32477 + new_size = clust_to_off(clust->index, inode) + win->off;
32478 +
32479 + switch (clust->op) {
32480 + case LC_APPOV:
32481 + if (new_size + win->count <= i_size_read(inode))
32482 + /* overwrite only */
32483 + return;
32484 + new_size += win->count;
32485 + break;
32486 + case LC_TRUNC:
32487 + break;
32488 + default:
32489 + impossible("edward-1184", "bad page cluster option");
32490 + break;
32491 + }
32492 + inode_check_scale_nolock(inode, i_size_read(inode), new_size);
32493 + i_size_write(inode, new_size);
32494 + return;
32495 +}
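+
+/*
+ * Editorial example for checkin_file_size() (hypothetical numbers,
+ * assuming a 64K logical cluster): appending 100 bytes at offset 65600
+ * of a 65600-byte file gives clust->index = 1 and win->off = 64, so
+ * new_size = clust_to_off(1, inode) + 64 = 65600; in the LC_APPOV case
+ * new_size + win->count = 65700 exceeds i_size, so i_size is advanced
+ * to 65700.
+ */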
32496 +
32497 +static inline void checkin_cluster_size(struct cluster_handle * clust,
32498 + struct inode * inode)
32499 +{
32500 + if (clust->win)
32501 + checkin_file_size(clust, inode);
32502 +}
32503 +
32504 +static int checkin_page_cluster(struct cluster_handle * clust,
32505 + struct inode * inode)
32506 +{
32507 + int result;
32508 + jnode * node;
32509 + int old_nrpages = clust->old_nrpages;
32510 + int new_nrpages = get_new_nrpages(clust);
32511 +
32512 + node = clust->node;
32513 +
32514 + assert("edward-221", node != NULL);
32515 + assert("edward-971", clust->reserved == 1);
32516 + assert("edward-1263",
32517 + clust->reserved_prepped == estimate_update_cluster(inode));
32518 + assert("edward-1264", clust->reserved_unprepped == 0);
32519 +
32520 + if (JF_ISSET(node, JNODE_DIRTY)) {
32521 + /*
32522 + * page cluster was checked in, but not yet
32523 + * checked out, so release related resources
32524 + */
32525 + free_reserved4cluster(inode, clust,
32526 + estimate_update_cluster(inode));
32527 + __put_page_cluster(0, clust->old_nrpages,
32528 + clust->pages, inode);
32529 + } else {
32530 + result = capture_cluster_jnode(node);
32531 + if (unlikely(result)) {
32532 + unlock_cluster(node);
32533 + return result;
32534 + }
32535 + jnode_make_dirty_locked(node);
32536 + clust->reserved = 0;
32537 + }
32538 + unlock_cluster(node);
32539 +
32540 + if (new_nrpages < old_nrpages) {
32541 +		/* truncate one or more complete pages */
32542 + __put_page_cluster(new_nrpages,
32543 + old_nrpages - new_nrpages,
32544 + clust->pages, inode);
32545 + truncate_page_cluster_range(inode,
32546 + clust->pages, clust->index,
32547 + new_nrpages,
32548 + old_nrpages - new_nrpages,
32549 + 0);
32550 + }
32551 +#if REISER4_DEBUG
32552 + clust->reserved_prepped -= estimate_update_cluster(inode);
32553 +#endif
32554 + return 0;
32555 +}
32556 +
32557 +/* Submit modifications of a logical cluster */
32558 +static int checkin_logical_cluster(struct cluster_handle * clust,
32559 + struct inode *inode)
32560 +{
32561 + int result = 0;
32562 + jnode * node;
32563 +
32564 + node = clust->node;
32565 +
32566 + assert("edward-1035", node != NULL);
32567 + assert("edward-1029", clust != NULL);
32568 + assert("edward-1030", clust->reserved == 1);
32569 + assert("edward-1031", clust->nr_pages != 0);
32570 + assert("edward-1032", clust->pages != NULL);
32571 + assert("edward-1033", clust->pages[0] != NULL);
32572 + assert("edward-1446", jnode_is_cluster_page(node));
32573 + assert("edward-1476", node == jprivate(clust->pages[0]));
32574 +
32575 + lock_cluster(node);
32576 + checkin_cluster_size(clust, inode);
32577 + /* this will unlock cluster */
32578 + result = checkin_page_cluster(clust, inode);
32579 + jput(node);
32580 + clust->node = NULL;
32581 + return result;
32582 +}
32583 +
32584 +/*
32585 + * Retrieve the size of the logical cluster that was checked in
32586 + * during the latest modifying session (cluster lock is required)
32587 + */
32588 +static inline void checkout_cluster_size(struct cluster_handle * clust,
32589 + struct inode * inode)
32590 +{
32591 + struct tfm_cluster *tc = &clust->tc;
32592 +
32593 + tc->len = lbytes(clust->index, inode);
32594 + assert("edward-1478", tc->len != 0);
32595 +}
32596 +
32597 +/*
32598 + * Retrieve a page cluster with the latest submitted modifications
32599 + * and flush its pages to the previously allocated contiguous stream.
32600 + */
32601 +static void checkout_page_cluster(struct cluster_handle * clust,
32602 + jnode * node, struct inode * inode)
32603 +{
32604 + int i;
32605 + int found;
32606 + int to_put;
32607 + struct tfm_cluster *tc = &clust->tc;
32608 +
32609 +	/* find and put the checked-in pages: the cluster is locked,
32610 +	 * so we must find exactly the expected number (to_put) of pages
32611 + */
32612 + to_put = size_in_pages(lbytes(clust->index, inode));
32613 + found = find_get_pages(inode->i_mapping,
32614 + clust_to_pg(clust->index, inode),
32615 + to_put, clust->pages);
32616 + BUG_ON(found != to_put);
32617 +
32618 + __put_page_cluster(0, to_put, clust->pages, inode);
32619 + unlock_cluster_uncapture(node);
32620 +
32621 + /* Flush found pages.
32622 + *
32623 +	 * Note that we don't disable modifications while flushing;
32624 +	 * moreover, some of the found pages can be truncated, as we
32625 +	 * have released the cluster lock.
32626 + */
32627 + for (i = 0; i < found; i++) {
32628 + int in_page;
32629 + char * data;
32630 + assert("edward-1479",
32631 + clust->pages[i]->index == clust->pages[0]->index + i);
32632 +
32633 + lock_page(clust->pages[i]);
32634 + if (!PageUptodate(clust->pages[i])) {
32635 + /* page was truncated */
32636 + assert("edward-1480",
32637 + i_size_read(inode) <= page_offset(clust->pages[i]));
32638 + assert("edward-1481",
32639 + clust->pages[i]->mapping != inode->i_mapping);
32640 + unlock_page(clust->pages[i]);
32641 + break;
32642 + }
32643 + /* Update the number of bytes in the logical cluster,
32644 +		 * as it could have been partially truncated. Note that only
32645 +		 * a partial truncate is possible here (a complete truncate
32646 +		 * cannot occur, as it is performed via ->kill_hook() called
32647 +		 * by cut_file_items(), and the latter must wait for the
32648 +		 * znode locked with the parent coord).
32649 + */
32650 + checkout_cluster_size(clust, inode);
32651 +
32652 +		/* this can be zero, as the new file size is
32653 +		   checked in before the pages are truncated */
32654 + in_page = __mbp(tc->len, i);
32655 +
32656 + data = kmap(clust->pages[i]);
32657 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
32658 + data, in_page);
32659 + kunmap(clust->pages[i]);
32660 +
32661 + if (PageDirty(clust->pages[i]))
32662 + cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
32663 +
32664 + unlock_page(clust->pages[i]);
32665 +
32666 + if (in_page < PAGE_CACHE_SIZE)
32667 + /* end of the file */
32668 + break;
32669 + }
32670 + put_found_pages(clust->pages, found); /* find_get_pages */
32671 + tc->lsize = tc->len;
32672 + return;
32673 +}
32674 +
32675 +/* Check out modifications of a logical cluster */
32676 +int checkout_logical_cluster(struct cluster_handle * clust,
32677 + jnode * node, struct inode *inode)
32678 +{
32679 + int result;
32680 + struct tfm_cluster *tc = &clust->tc;
32681 +
32682 + assert("edward-980", node != NULL);
32683 + assert("edward-236", inode != NULL);
32684 + assert("edward-237", clust != NULL);
32685 + assert("edward-240", !clust->win);
32686 + assert("edward-241", reiser4_schedulable());
32687 + assert("edward-718", cryptcompress_inode_ok(inode));
32688 +
32689 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
32690 + if (result) {
32691 + warning("edward-1430", "alloc stream failed with ret=%d",
32692 + result);
32693 + return RETERR(-E_REPEAT);
32694 + }
32695 + lock_cluster(node);
32696 +
32697 + if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
32698 + /* race with another flush */
32699 + warning("edward-982",
32700 + "checking out logical cluster %lu of inode %llu: "
32701 + "jnode is not dirty", clust->index,
32702 + (unsigned long long)get_inode_oid(inode));
32703 + unlock_cluster(node);
32704 + return RETERR(-E_REPEAT);
32705 + }
32706 + cluster_reserved2grabbed(estimate_update_cluster(inode));
32707 +
32708 + /* this will unlock cluster */
32709 + checkout_page_cluster(clust, node, inode);
32710 + return 0;
32711 +}
32712 +
32713 +/* set hint for the cluster of the index @index */
32714 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
32715 + cloff_t index, znode_lock_mode mode)
32716 +{
32717 + reiser4_key key;
32718 + assert("edward-722", cryptcompress_inode_ok(inode));
32719 + assert("edward-723",
32720 + inode_file_plugin(inode) ==
32721 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
32722 +
32723 + inode_file_plugin(inode)->key_by_inode(inode,
32724 + clust_to_off(index, inode),
32725 + &key);
32726 +
32727 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
32728 + hint->offset = get_key_offset(&key);
32729 + hint->mode = mode;
32730 +}
32731 +
32732 +void invalidate_hint_cluster(struct cluster_handle * clust)
32733 +{
32734 + assert("edward-1291", clust != NULL);
32735 + assert("edward-1292", clust->hint != NULL);
32736 +
32737 + done_lh(&clust->hint->lh);
32738 + hint_clr_valid(clust->hint);
32739 +}
32740 +
32741 +void put_hint_cluster(struct cluster_handle * clust, struct inode *inode,
32742 + znode_lock_mode mode)
32743 +{
32744 + assert("edward-1286", clust != NULL);
32745 + assert("edward-1287", clust->hint != NULL);
32746 +
32747 + set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
32748 + invalidate_hint_cluster(clust);
32749 +}
32750 +
32751 +static int balance_dirty_page_cluster(struct cluster_handle * clust,
32752 + struct inode *inode, loff_t off,
32753 + loff_t to_file)
32754 +{
32755 + int result;
32756 + struct cryptcompress_info * info;
32757 +
32758 + assert("edward-724", inode != NULL);
32759 + assert("edward-725", cryptcompress_inode_ok(inode));
32760 +
32761 + /* set next window params */
32762 + move_update_window(inode, clust, off, to_file);
32763 +
32764 + result = update_sd_cryptcompress(inode);
32765 + if (result)
32766 + return result;
32767 + assert("edward-726", clust->hint->lh.owner == NULL);
32768 + info = cryptcompress_inode_data(inode);
32769 +
32770 + mutex_unlock(&info->checkin_mutex);
32771 + reiser4_throttle_write(inode);
32772 + mutex_lock(&info->checkin_mutex);
32773 + return 0;
32774 +}
32775 +
32776 +/* write zeroes to the page cluster, process it, and maybe try to capture
32777 +   its pages */
32778 +static int write_hole(struct inode *inode, struct cluster_handle * clust,
32779 + loff_t file_off, loff_t to_file)
32780 +{
32781 + int result = 0;
32782 + unsigned cl_off, cl_count = 0;
32783 + unsigned to_pg, pg_off;
32784 + struct reiser4_slide * win;
32785 +
32786 + assert("edward-190", clust != NULL);
32787 + assert("edward-1069", clust->win != NULL);
32788 + assert("edward-191", inode != NULL);
32789 + assert("edward-727", cryptcompress_inode_ok(inode));
32790 + assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
32791 + assert("edward-1154",
32792 + ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
32793 +
32794 + win = clust->win;
32795 +
32796 + assert("edward-1070", win != NULL);
32797 + assert("edward-201", win->stat == HOLE_WINDOW);
32798 + assert("edward-192", cluster_ok(clust, inode));
32799 +
32800 + if (win->off == 0 && win->count == inode_cluster_size(inode)) {
32801 +		/* This part of the hole will be represented by a "fake"
32802 +		 * logical cluster, i.e. one which doesn't have a disk
32803 +		 * cluster until someone modifies this logical cluster
32804 +		 * and makes it dirty.
32805 +		 * So just move the window forward here.
32806 + */
32807 + move_update_window(inode, clust, file_off, to_file);
32808 + return 0;
32809 + }
32810 + cl_count = win->count; /* number of zeroes to write */
32811 + cl_off = win->off;
32812 + pg_off = off_to_pgoff(win->off);
32813 +
32814 + while (cl_count) {
32815 + struct page *page;
32816 + page = clust->pages[off_to_pg(cl_off)];
32817 +
32818 + assert("edward-284", page != NULL);
32819 +
32820 + to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
32821 + lock_page(page);
32822 + zero_user_page(page, pg_off, to_pg, KM_USER0);
32823 + SetPageUptodate(page);
32824 + reiser4_set_page_dirty_internal(page);
32825 + mark_page_accessed(page);
32826 + unlock_page(page);
32827 +
32828 + cl_off += to_pg;
32829 + cl_count -= to_pg;
32830 + pg_off = 0;
32831 + }
32832 + if (!win->delta) {
32833 + /* only zeroes in this window, try to capture
32834 + */
32835 + result = checkin_logical_cluster(clust, inode);
32836 + if (result)
32837 + return result;
32838 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
32839 + result =
32840 + balance_dirty_page_cluster(clust, inode, file_off, to_file);
32841 + } else
32842 + move_update_window(inode, clust, file_off, to_file);
32843 + return result;
32844 +}
32845 +
32846 +/*
32847 + The main disk search procedure for cryptcompress plugin, which
32848 + . scans all items of disk cluster with the lock mode @mode
32849 + . maybe reads each one (if @read)
32850 + . maybe makes its znode dirty (if write lock mode was specified)
32851 +
32852 + NOTE-EDWARD: Callers should handle the case when disk cluster
32853 + is incomplete (-EIO)
32854 +*/
32855 +int find_disk_cluster(struct cluster_handle * clust,
32856 + struct inode *inode, int read, znode_lock_mode mode)
32857 +{
32858 + flow_t f;
32859 + hint_t *hint;
32860 + int result = 0;
32861 + int was_grabbed;
32862 + ra_info_t ra_info;
32863 + file_plugin *fplug;
32864 + item_plugin *iplug;
32865 + struct tfm_cluster *tc;
32866 + struct cryptcompress_info * info;
32867 +
32868 + assert("edward-138", clust != NULL);
32869 + assert("edward-728", clust->hint != NULL);
32870 + assert("edward-226", reiser4_schedulable());
32871 + assert("edward-137", inode != NULL);
32872 + assert("edward-729", cryptcompress_inode_ok(inode));
32873 +
32874 + hint = clust->hint;
32875 + fplug = inode_file_plugin(inode);
32876 + was_grabbed = get_current_context()->grabbed_blocks;
32877 + info = cryptcompress_inode_data(inode);
32878 + tc = &clust->tc;
32879 +
32880 + assert("edward-462", !tfm_cluster_is_uptodate(tc));
32881 + assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
32882 +
32883 + dclust_init_extension(hint);
32884 +
32885 + /* set key of the first disk cluster item */
32886 + fplug->flow_by_inode(inode,
32887 + (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
32888 + 0 /* kernel space */ ,
32889 + inode_scaled_cluster_size(inode),
32890 + clust_to_off(clust->index, inode), READ_OP, &f);
32891 + if (mode == ZNODE_WRITE_LOCK) {
32892 + /* reserve for flush to make dirty all the leaf nodes
32893 + which contain disk cluster */
32894 + result =
32895 + reiser4_grab_space_force(estimate_dirty_cluster(inode),
32896 + BA_CAN_COMMIT);
32897 + if (result)
32898 + goto out;
32899 + }
32900 +
32901 + ra_info.key_to_stop = f.key;
32902 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32903 +
32904 + while (f.length) {
32905 + result = find_cluster_item(hint, &f.key, mode,
32906 + NULL, FIND_EXACT,
32907 + (mode == ZNODE_WRITE_LOCK ?
32908 + CBK_FOR_INSERT : 0));
32909 + switch (result) {
32910 + case CBK_COORD_NOTFOUND:
32911 + result = 0;
32912 + if (inode_scaled_offset
32913 + (inode, clust_to_off(clust->index, inode)) ==
32914 + get_key_offset(&f.key)) {
32915 +				/* the first item was not found; this is
32916 +				   treated as an absent disk cluster */
32917 + clust->dstat = FAKE_DISK_CLUSTER;
32918 + goto out;
32919 + }
32920 + /* we are outside the cluster, stop search here */
32921 + assert("edward-146",
32922 + f.length != inode_scaled_cluster_size(inode));
32923 + goto ok;
32924 + case CBK_COORD_FOUND:
32925 + assert("edward-148",
32926 + hint->ext_coord.coord.between == AT_UNIT);
32927 + assert("edward-460",
32928 + hint->ext_coord.coord.unit_pos == 0);
32929 +
32930 + coord_clear_iplug(&hint->ext_coord.coord);
32931 + result = zload_ra(hint->ext_coord.coord.node, &ra_info);
32932 + if (unlikely(result))
32933 + goto out;
32934 + iplug = item_plugin_by_coord(&hint->ext_coord.coord);
32935 + assert("edward-147",
32936 + item_id_by_coord(&hint->ext_coord.coord) ==
32937 + CTAIL_ID);
32938 +
32939 + result = iplug->s.file.read(NULL, &f, hint);
32940 + if (result) {
32941 + zrelse(hint->ext_coord.coord.node);
32942 + goto out;
32943 + }
32944 + if (mode == ZNODE_WRITE_LOCK) {
32945 +			/* Don't dirty more nodes than was
32946 +			   estimated (see the comments before
32947 +			   estimate_dirty_cluster). Missed nodes will be
32948 +			   read in at flush time if they are evicted from
32949 +			   memory */
32950 + if (dclust_get_extension_ncount(hint) <=
32951 + estimate_dirty_cluster(inode))
32952 + znode_make_dirty(hint->ext_coord.coord.node);
32953 +
32954 + znode_set_convertible(hint->ext_coord.coord.
32955 + node);
32956 + }
32957 + zrelse(hint->ext_coord.coord.node);
32958 + break;
32959 + default:
32960 + goto out;
32961 + }
32962 + }
32963 + ok:
32964 + /* at least one item was found */
32965 + /* NOTE-EDWARD: Callers should handle the case
32966 + when disk cluster is incomplete (-EIO) */
32967 + tc->len = inode_scaled_cluster_size(inode) - f.length;
32968 + tc->lsize = lbytes(clust->index, inode);
32969 + assert("edward-1196", tc->len > 0);
32970 + assert("edward-1406", tc->lsize > 0);
32971 +
32972 + if (hint_is_unprepped_dclust(clust->hint)) {
32973 + clust->dstat = UNPR_DISK_CLUSTER;
32974 + } else if (clust->index == info->trunc_index) {
32975 + clust->dstat = TRNC_DISK_CLUSTER;
32976 + } else {
32977 + clust->dstat = PREP_DISK_CLUSTER;
32978 + dclust_set_extension_dsize(clust->hint, tc->len);
32979 + }
32980 + out:
32981 + assert("edward-1339",
32982 + get_current_context()->grabbed_blocks >= was_grabbed);
32983 + grabbed2free(get_current_context(),
32984 + get_current_super_private(),
32985 + get_current_context()->grabbed_blocks - was_grabbed);
32986 + return result;
32987 +}
32988 +
32989 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
32990 + znode_lock_mode lock_mode)
32991 +{
32992 + reiser4_key key;
32993 + ra_info_t ra_info;
32994 +
32995 + assert("edward-730", reiser4_schedulable());
32996 + assert("edward-731", clust != NULL);
32997 + assert("edward-732", inode != NULL);
32998 +
32999 + if (hint_is_valid(clust->hint)) {
33000 + assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33001 + assert("edward-1294",
33002 + znode_is_write_locked(clust->hint->lh.node));
33003 + /* already have a valid locked position */
33004 + return (clust->dstat ==
33005 + FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33006 + CBK_COORD_FOUND);
33007 + }
33008 + key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33009 + &key);
33010 + ra_info.key_to_stop = key;
33011 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
33012 +
33013 + return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33014 + CBK_FOR_INSERT);
33015 +}
33016 +
33017 +/* Read the needed cluster pages before modifying.
33018 +   On success, @clust->hint contains the locked position in the tree.
33019 +   Also:
33020 +   . find and set the disk cluster state;
33021 +   . make the disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33022 +*/
33023 +static int read_some_cluster_pages(struct inode * inode,
33024 + struct cluster_handle * clust)
33025 +{
33026 + int i;
33027 + int result = 0;
33028 + item_plugin *iplug;
33029 + struct reiser4_slide * win = clust->win;
33030 + znode_lock_mode mode = ZNODE_WRITE_LOCK;
33031 +
33032 + iplug = item_plugin_by_id(CTAIL_ID);
33033 +
33034 + assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33035 +
33036 +#if REISER4_DEBUG
33037 + if (clust->nr_pages == 0) {
33038 + /* start write hole from fake disk cluster */
33039 + assert("edward-1117", win != NULL);
33040 + assert("edward-1118", win->stat == HOLE_WINDOW);
33041 + assert("edward-1119", new_logical_cluster(clust, inode));
33042 + }
33043 +#endif
33044 + if (new_logical_cluster(clust, inode)) {
33045 + /*
33046 +		   a new page cluster is about to be written, so there is nothing to read
33047 + */
33048 + assert("edward-734", reiser4_schedulable());
33049 + assert("edward-735", clust->hint->lh.owner == NULL);
33050 +
33051 + if (clust->nr_pages) {
33052 + int off;
33053 + struct page * pg;
33054 + assert("edward-1419", clust->pages != NULL);
33055 + pg = clust->pages[clust->nr_pages - 1];
33056 + assert("edward-1420", pg != NULL);
33057 + off = off_to_pgoff(win->off+win->count+win->delta);
33058 + if (off) {
33059 + lock_page(pg);
33060 + zero_user_page(pg, off, PAGE_CACHE_SIZE - off,
33061 + KM_USER0);
33062 + unlock_page(pg);
33063 + }
33064 + }
33065 + clust->dstat = FAKE_DISK_CLUSTER;
33066 + return 0;
33067 + }
33068 + /*
33069 +	   Here we should search for the disk cluster to figure out its real
33070 +	   state. There is also one more important reason to do the disk
33071 +	   search: we need to make the disk cluster _dirty_ if it exists.
33072 + */
33073 +
33074 +	/* if a window is specified, read only those pages
33075 +	   that will be modified partially */
33076 +
33077 + for (i = 0; i < clust->nr_pages; i++) {
33078 + struct page *pg = clust->pages[i];
33079 +
33080 + lock_page(pg);
33081 + if (PageUptodate(pg)) {
33082 + unlock_page(pg);
33083 + continue;
33084 + }
33085 + unlock_page(pg);
33086 +
33087 + if (win &&
33088 + i >= size_in_pages(win->off) &&
33089 + i < off_to_pg(win->off + win->count + win->delta))
33090 + /* page will be completely overwritten */
33091 + continue;
33092 +
33093 + if (win && (i == clust->nr_pages - 1) &&
33094 + /* the last page is
33095 + partially modified,
33096 + not uptodate .. */
33097 + (size_in_pages(i_size_read(inode)) <= pg->index)) {
33098 + /* .. and appended,
33099 + so set zeroes to the rest */
33100 + int offset;
33101 + lock_page(pg);
33102 + assert("edward-1260",
33103 + size_in_pages(win->off + win->count +
33104 + win->delta) - 1 == i);
33105 +
33106 + offset =
33107 + off_to_pgoff(win->off + win->count + win->delta);
33108 + zero_user_page(pg, offset, PAGE_CACHE_SIZE - offset,
33109 + KM_USER0);
33110 + unlock_page(pg);
33111 + /* still not uptodate */
33112 + break;
33113 + }
33114 + lock_page(pg);
33115 + result = do_readpage_ctail(inode, clust, pg, mode);
33116 +
33117 + assert("edward-1526", ergo(!result, PageUptodate(pg)));
33118 + unlock_page(pg);
33119 + if (result) {
33120 + warning("edward-219", "do_readpage_ctail failed");
33121 + goto out;
33122 + }
33123 + }
33124 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
33125 +		/* the disk cluster is unclaimed, but we need to make its znodes
33126 +		 * dirty so that flush update will convert its content
33127 + */
33128 + result = find_disk_cluster(clust, inode,
33129 + 0 /* do not read items */,
33130 + mode);
33131 + }
33132 + out:
33133 + tfm_cluster_clr_uptodate(&clust->tc);
33134 + return result;
33135 +}
33136 +
33137 +static int should_create_unprepped_cluster(struct cluster_handle * clust,
33138 + struct inode * inode)
33139 +{
33140 + assert("edward-737", clust != NULL);
33141 +
33142 + switch (clust->dstat) {
33143 + case PREP_DISK_CLUSTER:
33144 + case UNPR_DISK_CLUSTER:
33145 + return 0;
33146 + case FAKE_DISK_CLUSTER:
33147 + if (clust->win &&
33148 + clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33149 + assert("edward-1172",
33150 + new_logical_cluster(clust, inode));
33151 + return 0;
33152 + }
33153 + return 1;
33154 + default:
33155 + impossible("edward-1173", "bad disk cluster state");
33156 + return 0;
33157 + }
33158 +}
33159 +
33160 +static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
33161 + struct inode *inode)
33162 +{
33163 + int result;
33164 +
33165 + assert("edward-1123", reiser4_schedulable());
33166 + assert("edward-737", clust != NULL);
33167 + assert("edward-738", inode != NULL);
33168 + assert("edward-739", cryptcompress_inode_ok(inode));
33169 + assert("edward-1053", clust->hint != NULL);
33170 +
33171 + if (!should_create_unprepped_cluster(clust, inode)) {
33172 + if (clust->reserved) {
33173 + cluster_reserved2free(estimate_insert_cluster(inode));
33174 +#if REISER4_DEBUG
33175 + assert("edward-1267",
33176 + clust->reserved_unprepped ==
33177 + estimate_insert_cluster(inode));
33178 + clust->reserved_unprepped -=
33179 + estimate_insert_cluster(inode);
33180 +#endif
33181 + }
33182 + return 0;
33183 + }
33184 + assert("edward-1268", clust->reserved);
33185 + cluster_reserved2grabbed(estimate_insert_cluster(inode));
33186 +#if REISER4_DEBUG
33187 + assert("edward-1441",
33188 + clust->reserved_unprepped == estimate_insert_cluster(inode));
33189 + clust->reserved_unprepped -= estimate_insert_cluster(inode);
33190 +#endif
33191 + result = ctail_insert_unprepped_cluster(clust, inode);
33192 + if (result)
33193 + return result;
33194 +
33195 + inode_add_bytes(inode, inode_cluster_size(inode));
33196 +
33197 + assert("edward-743", cryptcompress_inode_ok(inode));
33198 + assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33199 +
33200 + clust->dstat = UNPR_DISK_CLUSTER;
33201 + return 0;
33202 +}
33203 +
33204 +/* . Grab page cluster for read, write, setattr, etc. operations;
33205 + * . Truncate its complete pages, if needed;
33206 + */
33207 +int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
33208 + rw_op rw)
33209 +{
33210 + assert("edward-177", inode != NULL);
33211 + assert("edward-741", cryptcompress_inode_ok(inode));
33212 + assert("edward-740", clust->pages != NULL);
33213 +
33214 + set_cluster_nrpages(clust, inode);
33215 + reset_cluster_pgset(clust, cluster_nrpages(inode));
33216 + return grab_page_cluster(inode, clust, rw);
33217 +}
33218 +
33219 +/* Truncate the complete page cluster of index @index.
33220 + * This is called by the ->kill_hook() method of the item
33221 + * plugin when deleting a disk cluster of that index.
33222 + */
33223 +void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
33224 + int even_cows)
33225 +{
33226 + int found;
33227 + int nr_pages;
33228 + jnode *node;
33229 + struct page *pages[MAX_CLUSTER_NRPAGES];
33230 +
33231 + node = jlookup(current_tree, get_inode_oid(inode),
33232 + clust_to_pg(index, inode));
33233 + nr_pages = size_in_pages(lbytes(index, inode));
33234 + assert("edward-1483", nr_pages != 0);
33235 + if (!node)
33236 + goto truncate;
33237 + found = find_get_pages(inode->i_mapping,
33238 + clust_to_pg(index, inode),
33239 + cluster_nrpages(inode), pages);
33240 + if (!found) {
33241 + assert("edward-1484", jnode_truncate_ok(inode, index));
33242 + return;
33243 + }
33244 + lock_cluster(node);
33245 +
33246 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
33247 + && index == 0)
33248 + /* converting to unix_file is in progress */
33249 + JF_CLR(node, JNODE_CLUSTER_PAGE);
33250 + if (JF_ISSET(node, JNODE_DIRTY)) {
33251 + /*
33252 +		 * @nr_pages were checked in, but not yet checked out -
33253 +		 * we need to release them. (There can also be pages
33254 +		 * attached to the page cache by read(), etc. - don't take
33255 +		 * them into account.)
33256 + */
33257 + assert("edward-1198", found >= nr_pages);
33258 +
33259 +		/* free the disk space grabbed for disk cluster conversion */
33260 + cluster_reserved2grabbed(estimate_update_cluster(inode));
33261 + grabbed2free(get_current_context(),
33262 + get_current_super_private(),
33263 + estimate_update_cluster(inode));
33264 + __put_page_cluster(0, nr_pages, pages, inode);
33265 +
33266 + /* This will clear dirty bit, uncapture and unlock jnode */
33267 + unlock_cluster_uncapture(node);
33268 + } else
33269 + unlock_cluster(node);
33270 + jput(node); /* jlookup */
33271 + put_found_pages(pages, found); /* find_get_pages */
33272 + truncate:
33273 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
33274 + index == 0)
33275 + return;
33276 + truncate_page_cluster_range(inode, pages, index, 0,
33277 + cluster_nrpages(inode),
33278 + even_cows);
33279 + assert("edward-1201",
33280 + ergo(!reiser4_inode_get_flag(inode,
33281 + REISER4_FILE_CONV_IN_PROGRESS),
33282 + jnode_truncate_ok(inode, index)));
33283 + return;
33284 +}
33285 +
33286 +/*
33287 + * Set up the cluster handle @clust of a logical cluster before
33288 + * modifications which are supposed to be committed:
33289 + *
33290 + * . grab cluster pages;
33291 + * . reserve disk space;
33292 + * . maybe read pages from disk and set the disk cluster dirty;
33293 + * . maybe write a hole and check in the (partially zeroed) logical cluster;
33294 + * . create an 'unprepped' disk cluster for a new or fake logical one.
33295 + */
33296 +static int prepare_logical_cluster(struct inode *inode,
33297 + loff_t file_off, /* write position
33298 + in the file */
33299 + loff_t to_file, /* bytes of users data
33300 + to write to the file */
33301 + struct cluster_handle * clust,
33302 + logical_cluster_op op)
33303 +{
33304 + int result = 0;
33305 + struct reiser4_slide * win = clust->win;
33306 +
33307 + reset_cluster_params(clust);
33308 + cluster_set_tfm_act(&clust->tc, TFMA_READ);
33309 +#if REISER4_DEBUG
33310 + clust->ctx = get_current_context();
33311 +#endif
33312 + assert("edward-1190", op != LC_INVAL);
33313 +
33314 + clust->op = op;
33315 +
33316 + result = prepare_page_cluster(inode, clust, WRITE_OP);
33317 + if (result)
33318 + return result;
33319 + assert("edward-1447",
33320 + ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
33321 + assert("edward-1448",
33322 + ergo(clust->nr_pages != 0,
33323 + jnode_is_cluster_page(jprivate(clust->pages[0]))));
33324 +
33325 + result = reserve4cluster(inode, clust);
33326 + if (result)
33327 + goto err1;
33328 + result = read_some_cluster_pages(inode, clust);
33329 + if (result) {
33330 + free_reserved4cluster(inode,
33331 + clust,
33332 + estimate_update_cluster(inode) +
33333 + estimate_insert_cluster(inode));
33334 + goto err1;
33335 + }
33336 + assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33337 +
33338 + result = cryptcompress_make_unprepped_cluster(clust, inode);
33339 + if (result)
33340 + goto err2;
33341 + if (win && win->stat == HOLE_WINDOW) {
33342 + result = write_hole(inode, clust, file_off, to_file);
33343 + if (result)
33344 + goto err2;
33345 + }
33346 + return 0;
33347 + err2:
33348 + free_reserved4cluster(inode, clust,
33349 + estimate_update_cluster(inode));
33350 + err1:
33351 + put_page_cluster(clust, inode, WRITE_OP);
33352 + assert("edward-1125", result == -ENOSPC);
33353 + return result;
33354 +}
33355 +
33356 +/* set window by two offsets */
33357 +static void set_window(struct cluster_handle * clust,
33358 + struct reiser4_slide * win, struct inode *inode,
33359 + loff_t o1, loff_t o2)
33360 +{
33361 + assert("edward-295", clust != NULL);
33362 + assert("edward-296", inode != NULL);
33363 + assert("edward-1071", win != NULL);
33364 + assert("edward-297", o1 <= o2);
33365 +
33366 + clust->index = off_to_clust(o1, inode);
33367 +
33368 + win->off = off_to_cloff(o1, inode);
33369 + win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
33370 + o2 - o1);
33371 + win->delta = 0;
33372 +
33373 + clust->win = win;
33374 +}
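+
+/*
+ * Editorial example (hypothetical numbers, assuming a 64K logical
+ * cluster): set_window(clust, win, inode, 70000, 100000) yields
+ *   clust->index = 1        (70000 / 65536),
+ *   win->off     = 4464     (70000 % 65536),
+ *   win->count   = min(65536 - 4464, 30000) = 30000,
+ * i.e. the window covers bytes [4464, 34464) of logical cluster 1.
+ */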
33375 +
33376 +static int set_cluster_by_window(struct inode *inode,
33377 + struct cluster_handle * clust,
33378 + struct reiser4_slide * win, size_t length,
33379 + loff_t file_off)
33380 +{
33381 + int result;
33382 +
33383 + assert("edward-197", clust != NULL);
33384 + assert("edward-1072", win != NULL);
33385 + assert("edward-198", inode != NULL);
33386 +
33387 + result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33388 + if (result)
33389 + return result;
33390 +
33391 + if (file_off > i_size_read(inode)) {
33392 + /* Uhmm, hole in cryptcompress file... */
33393 + loff_t hole_size;
33394 + hole_size = file_off - inode->i_size;
33395 +
33396 + set_window(clust, win, inode, inode->i_size, file_off);
33397 + win->stat = HOLE_WINDOW;
33398 + if (win->off + hole_size < inode_cluster_size(inode))
33399 + /* there is also user's data to append to the hole */
33400 + win->delta = min(inode_cluster_size(inode) -
33401 + (win->off + win->count), length);
33402 + return 0;
33403 + }
33404 + set_window(clust, win, inode, file_off, file_off + length);
33405 + win->stat = DATA_WINDOW;
33406 + return 0;
33407 +}
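+
+/*
+ * Editorial example for the hole case above (hypothetical numbers,
+ * assuming a 64K logical cluster): with i_size = 1000, file_off = 2000
+ * and length = 500, the hole window covers bytes [1000, 2000) of
+ * logical cluster 0 (win->off = 1000, win->count = 1000,
+ * stat = HOLE_WINDOW), and win->delta = min(65536 - 2000, 500) = 500
+ * bytes of user data to be written right after the hole.
+ */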
33408 +
33409 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
33410 + int count)
33411 +{
33412 + int result = 0;
33413 + int (*setting_actor)(struct cluster_handle * clust, int count);
33414 +
33415 + assert("edward-1358", clust != NULL);
33416 + assert("edward-1359", page != NULL);
33417 + assert("edward-1360", page->mapping != NULL);
33418 + assert("edward-1361", page->mapping->host != NULL);
33419 +
33420 + setting_actor =
33421 + (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33422 + result = setting_actor(clust, count);
33423 + clust->index = pg_to_clust(page->index, page->mapping->host);
33424 + return result;
33425 +}
33426 +
33427 +/* reset all the params that do not get updated */
33428 +void reset_cluster_params(struct cluster_handle * clust)
33429 +{
33430 + assert("edward-197", clust != NULL);
33431 +
33432 + clust->dstat = INVAL_DISK_CLUSTER;
33433 + clust->tc.uptodate = 0;
33434 + clust->tc.len = 0;
33435 +}
33436 +
33437 +static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
33438 + const char __user *buf, size_t to_write,
33439 + loff_t pos, int *conv_occured)
33440 +{
33441 + int i;
33442 + hint_t *hint;
33443 + int result = 0;
33444 + size_t count;
33445 + struct reiser4_slide win;
33446 + struct cluster_handle clust;
33447 + struct cryptcompress_info * info;
33448 +
33449 + assert("edward-161", reiser4_schedulable());
33450 + assert("edward-748", cryptcompress_inode_ok(inode));
33451 + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33452 + assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33453 +
33454 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33455 + if (hint == NULL)
33456 + return RETERR(-ENOMEM);
33457 +
33458 + result = load_file_hint(file, hint);
33459 + if (result) {
33460 + kfree(hint);
33461 + return result;
33462 + }
33463 + count = to_write;
33464 +
33465 + reiser4_slide_init(&win);
33466 + cluster_init_read(&clust, &win);
33467 + clust.hint = hint;
33468 + info = cryptcompress_inode_data(inode);
33469 +
33470 + mutex_lock(&info->checkin_mutex);
33471 +
33472 + result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
33473 + if (result)
33474 + goto out;
33475 +
33476 + if (next_window_stat(&win) == HOLE_WINDOW) {
33477 +		/* write the hole in this iteration,
33478 +		   separately from the loop below */
33479 + result = write_conversion_hook(file, inode,
33480 + pos,
33481 + &clust,
33482 + NULL);
33483 + if (result)
33484 + goto out;
33485 + result = prepare_logical_cluster(inode, pos, count, &clust,
33486 + LC_APPOV);
33487 + if (result)
33488 + goto out;
33489 + }
33490 + do {
33491 + const char __user * src;
33492 + unsigned page_off, to_page;
33493 +
33494 + assert("edward-750", reiser4_schedulable());
33495 +
33496 + result = write_conversion_hook(file, inode,
33497 + pos + to_write - count,
33498 + &clust,
33499 + conv_occured);
33500 + if (result || *conv_occured)
33501 + goto out;
33502 + result = prepare_logical_cluster(inode, pos, count, &clust,
33503 + LC_APPOV);
33504 + if (result)
33505 + goto out;
33506 +
33507 + assert("edward-751", cryptcompress_inode_ok(inode));
33508 + assert("edward-204", win.stat == DATA_WINDOW);
33509 + assert("edward-1288", hint_is_valid(clust.hint));
33510 + assert("edward-752",
33511 + znode_is_write_locked(hint->ext_coord.coord.node));
33512 + put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33513 +
33514 + /* set write position in page */
33515 + page_off = off_to_pgoff(win.off);
33516 +
33517 + /* copy user's data to cluster pages */
33518 + for (i = off_to_pg(win.off), src = buf;
33519 + i < size_in_pages(win.off + win.count);
33520 + i++, src += to_page) {
33521 + to_page = __mbp(win.off + win.count, i) - page_off;
33522 + assert("edward-1039",
33523 + page_off + to_page <= PAGE_CACHE_SIZE);
33524 + assert("edward-287", clust.pages[i] != NULL);
33525 +
33526 + fault_in_pages_readable(src, to_page);
33527 +
33528 + lock_page(clust.pages[i]);
33529 + result =
33530 + __copy_from_user((char *)kmap(clust.pages[i]) +
33531 + page_off, src, to_page);
33532 + kunmap(clust.pages[i]);
33533 + if (unlikely(result)) {
33534 + unlock_page(clust.pages[i]);
33535 + result = -EFAULT;
33536 + goto err2;
33537 + }
33538 + SetPageUptodate(clust.pages[i]);
33539 + reiser4_set_page_dirty_internal(clust.pages[i]);
33540 + flush_dcache_page(clust.pages[i]);
33541 + mark_page_accessed(clust.pages[i]);
33542 + unlock_page(clust.pages[i]);
33543 + page_off = 0;
33544 + }
33545 + assert("edward-753", cryptcompress_inode_ok(inode));
33546 +
33547 + result = checkin_logical_cluster(&clust, inode);
33548 + if (result)
33549 + goto err2;
33550 +
33551 + buf += win.count;
33552 + count -= win.count;
33553 +
33554 + result = balance_dirty_page_cluster(&clust, inode, 0, count);
33555 + if (result)
33556 + goto err1;
33557 + assert("edward-755", hint->lh.owner == NULL);
33558 + reset_cluster_params(&clust);
33559 + continue;
33560 + err2:
33561 + put_page_cluster(&clust, inode, WRITE_OP);
33562 + err1:
33563 + if (clust.reserved)
33564 + free_reserved4cluster(inode,
33565 + &clust,
33566 + estimate_update_cluster(inode));
33567 + break;
33568 + } while (count);
33569 + out:
33570 + /*
33571 + * NOTE: at this point file may have
33572 + * another (unix-file) plugin installed
33573 + */
33574 + done_lh(&hint->lh);
33575 + if (result == -EEXIST)
33576 + warning("edward-1407", "write returns EEXIST!\n");
33577 +
33578 + put_cluster_handle(&clust);
33579 + save_file_hint(file, hint);
33580 + kfree(hint);
33581 + /*
33582 +	 * don't release the cryptcompress-specific
33583 +	 * checkin_mutex if conversion occurred
33584 + */
33585 + if (*conv_occured == 0)
33586 + mutex_unlock(&info->checkin_mutex);
33587 + if (buf) {
33588 +		/* if nothing was written, there must be an error */
33589 + assert("edward-195", ergo((to_write == count),
33590 + (result < 0 || *conv_occured)));
33591 + return (to_write - count) ? (to_write - count) : result;
33592 + }
33593 + return result;
33594 +}
33595 +
33596 +/**
33597 + * write_cryptcompress - write of struct file_operations
33598 + * @file: file to write to
33599 + * @buf: address of user-space buffer
33600 + * @count: number of bytes to write
33601 + * @off: position in file to write to
33602 + * @conv: set to a non-zero value if file conversion occurred during write
33603 + *
33604 + * This is the implementation of the VFS write method of struct
33605 + * file_operations for the cryptcompress plugin.
33605 + */
33606 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
33607 + size_t count, loff_t *off, int *conv)
33608 +{
33609 + ssize_t result;
33610 + struct inode *inode;
33611 + reiser4_context *ctx;
33612 + loff_t pos = *off;
33613 + struct cryptcompress_info *info;
33614 +
33615 + assert("edward-1449", *conv == 0);
33616 +
33617 + inode = file->f_dentry->d_inode;
33618 + assert("edward-196", cryptcompress_inode_ok(inode));
33619 +
33620 + info = cryptcompress_inode_data(inode);
33621 +
33622 + ctx = reiser4_init_context(inode->i_sb);
33623 + if (IS_ERR(ctx))
33624 + return PTR_ERR(ctx);
33625 +
33626 + mutex_lock(&inode->i_mutex);
33627 +
33628 + result = generic_write_checks(file, &pos, &count, 0);
33629 + if (unlikely(result != 0))
33630 + goto out;
33631 + if (unlikely(count == 0))
33632 + goto out;
33633 + result = remove_suid(file->f_dentry);
33634 + if (unlikely(result != 0))
33635 + goto out;
33636 + /* remove_suid might create a transaction */
33637 + reiser4_txn_restart(ctx);
33638 +
33639 + result = do_write_cryptcompress(file, inode, buf, count, pos, conv);
33640 +
33641 + if (result < 0)
33642 + goto out;
33643 + /* update position in a file */
33644 + *off = pos + result;
33645 + out:
33646 + mutex_unlock(&inode->i_mutex);
33647 +
33648 + context_set_commit_async(ctx);
33649 + reiser4_exit_context(ctx);
33650 + return result;
33651 +}
33652 +
33653 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
33654 + struct list_head *pages, unsigned nr_pages)
33655 +{
33656 + reiser4_context * ctx;
33657 + int ret;
33658 +
33659 + ctx = reiser4_init_context(mapping->host->i_sb);
33660 + if (IS_ERR(ctx)) {
33661 + ret = PTR_ERR(ctx);
33662 + goto err;
33663 + }
33664 + /* cryptcompress file can be built of ctail items only */
33665 + ret = readpages_ctail(file, mapping, pages);
33666 + reiser4_txn_restart(ctx);
33667 + reiser4_exit_context(ctx);
33668 + if (ret) {
33669 +err:
33670 + put_pages_list(pages);
33671 + }
33672 + return ret;
33673 +}
33674 +
33675 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
33676 +{
33677 + /* reserve one block to update stat data item */
33678 + assert("edward-1193",
33679 + inode_file_plugin(inode)->estimate.update ==
33680 + estimate_update_common);
33681 + return estimate_update_common(inode);
33682 +}
33683 +
33684 +/**
33685 + * read_cryptcompress - read of struct file_operations
33686 + * @file: file to read from
33687 + * @buf: address of user-space buffer
33688 + * @size: number of bytes to read
33689 + * @off: position in file to read from
33690 + *
33691 + * This is the implementation of the VFS read method of struct
33692 + * file_operations for the cryptcompress plugin.
33693 + */
33694 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
33695 + loff_t * off)
33696 +{
33697 + ssize_t result;
33698 + struct inode *inode;
33699 + reiser4_context *ctx;
33700 + struct cryptcompress_info *info;
33701 + reiser4_block_nr needed;
33702 +
33703 + inode = file->f_dentry->d_inode;
33704 + assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
33705 +
33706 + ctx = reiser4_init_context(inode->i_sb);
33707 + if (IS_ERR(ctx))
33708 + return PTR_ERR(ctx);
33709 +
33710 + info = cryptcompress_inode_data(inode);
33711 + needed = cryptcompress_estimate_read(inode);
33712 +
33713 + result = reiser4_grab_space(needed, BA_CAN_COMMIT);
33714 + if (result != 0) {
33715 + reiser4_exit_context(ctx);
33716 + return result;
33717 + }
33718 + result = do_sync_read(file, buf, size, off);
33719 +
33720 + context_set_commit_async(ctx);
33721 + reiser4_exit_context(ctx);
33722 +
33723 + return result;
33724 +}
33725 +
33726 +/* Look for a disk cluster and keep lookup result in @found.
33727 + * If @index > 0, then find the disk cluster of index (@index - 1);
33728 + * if @index == 0, then find the rightmost disk cluster.
33729 + * Keep the incremented index of the found disk cluster in @found.
33730 + * @found == 0 means that the disk cluster was not found (in the latter
33731 + * case (@index == 0) it means that the file doesn't have disk clusters).
33732 + */
33733 +static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
33734 + cloff_t index)
33735 +{
33736 + int result;
33737 + reiser4_key key;
33738 + loff_t offset;
33739 + hint_t *hint;
33740 + lock_handle *lh;
33741 + lookup_bias bias;
33742 + coord_t *coord;
33743 + item_plugin *iplug;
33744 +
33745 + assert("edward-1131", inode != NULL);
33746 + assert("edward-95", cryptcompress_inode_ok(inode));
33747 +
33748 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33749 + if (hint == NULL)
33750 + return RETERR(-ENOMEM);
33751 + hint_init_zero(hint);
33752 + lh = &hint->lh;
33753 +
33754 + bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
33755 + offset =
33756 + (index ? clust_to_off(index, inode) -
33757 + 1 : get_key_offset(reiser4_max_key()));
33758 +
33759 + key_by_inode_cryptcompress(inode, offset, &key);
33760 +
33761 + /* find the last item of this object */
33762 + result =
33763 + find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
33764 + bias, 0);
33765 + if (cbk_errored(result)) {
33766 + done_lh(lh);
33767 + kfree(hint);
33768 + return result;
33769 + }
33770 + if (result == CBK_COORD_NOTFOUND) {
33771 + /* no real disk clusters */
33772 + done_lh(lh);
33773 + kfree(hint);
33774 + *found = 0;
33775 + return 0;
33776 + }
33777 + /* disk cluster is found */
33778 + coord = &hint->ext_coord.coord;
33779 + coord_clear_iplug(coord);
33780 + result = zload(coord->node);
33781 + if (unlikely(result)) {
33782 + done_lh(lh);
33783 + kfree(hint);
33784 + return result;
33785 + }
33786 + iplug = item_plugin_by_coord(coord);
33787 + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
33788 + assert("edward-1202", ctail_ok(coord));
33789 +
33790 + item_key_by_coord(coord, &key);
33791 + *found = off_to_clust(get_key_offset(&key), inode) + 1;
33792 +
33793 + assert("edward-1132", ergo(index, index == *found));
33794 +
33795 + zrelse(coord->node);
33796 + done_lh(lh);
33797 + kfree(hint);
33798 + return 0;
33799 +}
33800 +
33801 +static int find_fake_appended(struct inode *inode, cloff_t * index)
33802 +{
33803 + return lookup_disk_cluster(inode, index,
33804 + 0 /* find last real one */ );
33805 +}
33806 +
33807 +/* Set the left coord when the unit is not found after node_lookup().
33808 + This takes into account that there can be holes in a sequence
33809 + of disk clusters */
33810 +
33811 +static void adjust_left_coord(coord_t * left_coord)
33812 +{
33813 + switch (left_coord->between) {
33814 + case AFTER_UNIT:
33815 + left_coord->between = AFTER_ITEM;
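+		/* fall through: AFTER_UNIT is mapped to AFTER_ITEM */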
33816 + case AFTER_ITEM:
33817 + case BEFORE_UNIT:
33818 + break;
33819 + default:
33820 + impossible("edward-1204", "bad left coord to cut");
33821 + }
33822 + return;
33823 +}
33824 +
33825 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
33826 +int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
33827 + const reiser4_key * to_key,
33828 + reiser4_key * smallest_removed,
33829 + struct inode *object, int truncate,
33830 + int *progress)
33831 +{
33832 + lock_handle next_node_lock;
33833 + coord_t left_coord;
33834 + int result;
33835 +
33836 + assert("edward-1158", tap->coord->node != NULL);
33837 + assert("edward-1159", znode_is_write_locked(tap->coord->node));
33838 + assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
33839 +
33840 + *progress = 0;
33841 + init_lh(&next_node_lock);
33842 +
33843 + while (1) {
33844 + znode *node; /* node from which items are cut */
33845 + node_plugin *nplug; /* node plugin for @node */
33846 +
33847 + node = tap->coord->node;
33848 +
33849 + /* Move next_node_lock to the next node on the left. */
33850 + result =
33851 + reiser4_get_left_neighbor(&next_node_lock, node,
33852 + ZNODE_WRITE_LOCK,
33853 + GN_CAN_USE_UPPER_LEVELS);
33854 + if (result != 0 && result != -E_NO_NEIGHBOR)
33855 + break;
33856 +		/* FIXME-EDWARD: Check whether we can delete the node as a whole. */
33857 + result = reiser4_tap_load(tap);
33858 + if (result)
33859 + return result;
33860 +
33861 + /* Prepare the second (right) point for cut_node() */
33862 + if (*progress)
33863 + coord_init_last_unit(tap->coord, node);
33864 +
33865 + else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
33866 + /* set rightmost unit for the items without lookup method */
33867 + tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
33868 +
33869 + nplug = node->nplug;
33870 +
33871 + assert("edward-1161", nplug);
33872 + assert("edward-1162", nplug->lookup);
33873 +
33874 + /* left_coord is leftmost unit cut from @node */
33875 + result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
33876 +
33877 + if (IS_CBKERR(result))
33878 + break;
33879 +
33880 + if (result == CBK_COORD_NOTFOUND)
33881 + adjust_left_coord(&left_coord);
33882 +
33883 + /* adjust coordinates so that they are set to existing units */
33884 + if (coord_set_to_right(&left_coord)
33885 + || coord_set_to_left(tap->coord)) {
33886 + result = 0;
33887 + break;
33888 + }
33889 +
33890 + if (coord_compare(&left_coord, tap->coord) ==
33891 + COORD_CMP_ON_RIGHT) {
33892 + /* keys from @from_key to @to_key are not in the tree */
33893 + result = 0;
33894 + break;
33895 + }
33896 +
33897 + /* cut data from one node */
33898 + *smallest_removed = *reiser4_min_key();
33899 + result = kill_node_content(&left_coord,
33900 + tap->coord,
33901 + from_key,
33902 + to_key,
33903 + smallest_removed,
33904 + next_node_lock.node,
33905 + object, truncate);
33906 + reiser4_tap_relse(tap);
33907 +
33908 + if (result)
33909 + break;
33910 +
33911 + ++(*progress);
33912 +
33913 + /* Check whether all items with keys >= from_key were removed
33914 + * from the tree. */
33915 + if (keyle(smallest_removed, from_key))
33916 + /* result = 0; */
33917 + break;
33918 +
33919 + if (next_node_lock.node == NULL)
33920 + break;
33921 +
33922 + result = reiser4_tap_move(tap, &next_node_lock);
33923 + done_lh(&next_node_lock);
33924 + if (result)
33925 + break;
33926 +
33927 + /* Break long cut_tree operation (deletion of a large file) if
33928 + * atom requires commit. */
33929 + if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
33930 + && current_atom_should_commit()) {
33931 + result = -E_REPEAT;
33932 + break;
33933 + }
33934 + }
33935 + done_lh(&next_node_lock);
33936 + return result;
33937 +}
33938 +
33939 +/* Append or expand hole in two steps:
33940 + * 1) write zeroes into the rightmost page of the rightmost non-fake
33941 + *    logical cluster;
33942 + * 2) expand the hole via fake logical clusters (just increase i_size)
33943 + */
33944 +static int cryptcompress_append_hole(struct inode *inode /* with old size */,
33945 + loff_t new_size)
33946 +{
33947 + int result = 0;
33948 + hint_t *hint;
33949 + lock_handle *lh;
33950 + loff_t hole_size;
33951 + int nr_zeroes;
33952 + struct reiser4_slide win;
33953 + struct cluster_handle clust;
33954 +
33955 + assert("edward-1133", inode->i_size < new_size);
33956 + assert("edward-1134", reiser4_schedulable());
33957 + assert("edward-1135", cryptcompress_inode_ok(inode));
33958 + assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
33959 + assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
33960 +
33961 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33962 + if (hint == NULL)
33963 + return RETERR(-ENOMEM);
33964 + hint_init_zero(hint);
33965 + lh = &hint->lh;
33966 +
33967 + reiser4_slide_init(&win);
33968 + cluster_init_read(&clust, &win);
33969 + clust.hint = hint;
33970 +
33971 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33972 + if (result)
33973 + goto out;
33974 + if (off_to_cloff(inode->i_size, inode) == 0)
33975 + goto append_fake;
33976 + hole_size = new_size - inode->i_size;
33977 + nr_zeroes =
33978 + inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
33979 + if (hole_size < nr_zeroes)
33980 + nr_zeroes = hole_size;
33981 + set_window(&clust, &win, inode, inode->i_size,
33982 + inode->i_size + nr_zeroes);
33983 + win.stat = HOLE_WINDOW;
33984 +
33985 + assert("edward-1137",
33986 + clust.index == off_to_clust(inode->i_size, inode));
33987 +
33988 + result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
33989 +
33990 + assert("edward-1271", !result || result == -ENOSPC);
33991 + if (result)
33992 + goto out;
33993 + assert("edward-1139",
33994 + clust.dstat == PREP_DISK_CLUSTER ||
33995 + clust.dstat == UNPR_DISK_CLUSTER);
33996 +
33997 + assert("edward-1431", hole_size >= nr_zeroes);
33998 + if (hole_size == nr_zeroes)
33999 + /* nothing to append anymore */
34000 + goto out;
34001 + append_fake:
34002 + INODE_SET_SIZE(inode, new_size);
34003 + out:
34004 + done_lh(lh);
34005 + kfree(hint);
34006 + put_cluster_handle(&clust);
34007 + return result;
34008 +}
34009 +
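+/*
+ * Editorial note: the mask test below checks whether the key offset is
+ * aligned to a logical cluster boundary. E.g. with a 64K cluster,
+ * offset 131072 satisfies (131072 & 65535) == 0, so the file size is
+ * updated; offset 131073 does not, so the helper returns 0.
+ */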
34010 +static int
34011 +update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34012 +{
34013 + return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34014 + ? 0 : reiser4_update_file_size(inode, key, update_sd));
34015 +}
34016 +
34017 +/* Prune cryptcompress file in two steps:
34018 + * 1) cut all nominated logical clusters except the leftmost one, which
34019 + *    is to be partially truncated. Note that there can be "holes"
34020 + *    represented by fake logical clusters;
34021 + * 2) set zeroes in and capture the leftmost partially truncated logical
34022 + *    cluster, if it is not fake; otherwise prune the fake logical
34023 + *    cluster (just decrease i_size).
34024 + */
34025 +static int prune_cryptcompress(struct inode *inode, loff_t new_size,
34026 + int update_sd, cloff_t aidx)
34027 +{
34028 + int result = 0;
34029 + unsigned nr_zeroes;
34030 + loff_t to_prune;
34031 + loff_t old_size;
34032 + cloff_t ridx;
34033 +
34034 + hint_t *hint;
34035 + lock_handle *lh;
34036 + struct reiser4_slide win;
34037 + struct cluster_handle clust;
34038 +
34039 + assert("edward-1140", inode->i_size >= new_size);
34040 + assert("edward-1141", reiser4_schedulable());
34041 + assert("edward-1142", cryptcompress_inode_ok(inode));
34042 + assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34043 +
34044 + old_size = inode->i_size;
34045 +
34046 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34047 + if (hint == NULL)
34048 + return RETERR(-ENOMEM);
34049 + hint_init_zero(hint);
34050 + lh = &hint->lh;
34051 +
34052 + reiser4_slide_init(&win);
34053 + cluster_init_read(&clust, &win);
34054 + clust.hint = hint;
34055 +
34056 +	/* calculate the index of the leftmost logical cluster
34057 +	   that will be completely truncated */
34058 + ridx = size_in_lc(new_size, inode);
34059 +
34060 + /* truncate all disk clusters starting from @ridx */
34061 + assert("edward-1174", ridx <= aidx);
34062 + old_size = inode->i_size;
34063 + if (ridx != aidx) {
34064 + struct cryptcompress_info * info;
34065 + info = cryptcompress_inode_data(inode);
34066 + result = cut_file_items(inode,
34067 + clust_to_off(ridx, inode),
34068 + update_sd,
34069 + clust_to_off(aidx, inode),
34070 + update_cryptcompress_size);
34071 + info->trunc_index = ULONG_MAX;
34072 + if (result)
34073 + goto out;
34074 + }
34075 + /*
34076 + * there can be pages of fake logical clusters, truncate them
34077 + */
34078 + truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
34079 + assert("edward-1524",
34080 + pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
34081 + /*
34082 + * now perform partial truncate of last logical cluster
34083 + */
34084 + if (!off_to_cloff(new_size, inode)) {
34085 + /* no partial truncate is needed */
34086 + assert("edward-1145", inode->i_size == new_size);
34087 + goto truncate_fake;
34088 + }
34089 + assert("edward-1146", new_size < inode->i_size);
34090 +
34091 + to_prune = inode->i_size - new_size;
34092 +
34093 + /* check if the last logical cluster is fake */
34094 + result = lookup_disk_cluster(inode, &aidx, ridx);
34095 + if (result)
34096 + goto out;
34097 + if (!aidx)
34098 + /* yup, this is a fake one */
34099 + goto truncate_fake;
34100 +
34101 + assert("edward-1148", aidx == ridx);
34102 +
34103 + /* do partial truncate of the last page cluster,
34104 + and try to capture it */
34105 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34106 + if (result)
34107 + goto out;
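+ /* number of bytes to zero out from the new end of file
+  * up to the end of its page */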
34108 + nr_zeroes = (off_to_pgoff(new_size) ?
34109 + PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34110 + set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34111 + win.stat = HOLE_WINDOW;
34112 +
34113 + assert("edward-1149", clust.index == ridx - 1);
34114 +
34115 + result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
34116 + if (result)
34117 + goto out;
34118 + assert("edward-1151",
34119 + clust.dstat == PREP_DISK_CLUSTER ||
34120 + clust.dstat == UNPR_DISK_CLUSTER);
34121 +
34122 + assert("edward-1191", inode->i_size == new_size);
34123 + assert("edward-1206", body_truncate_ok(inode, ridx));
34124 + truncate_fake:
34125 + /* drop all the pages that don't have jnodes (i.e. pages
34126 + which cannot be truncated by cut_file_items() because
34127 + of holes represented by fake disk clusters) including
34128 + the pages of partially truncated cluster which was
34129 + released by prepare_logical_cluster() */
34130 + INODE_SET_SIZE(inode, new_size);
34131 + truncate_inode_pages(inode->i_mapping, new_size);
34132 + out:
34133 + assert("edward-1334", !result || result == -ENOSPC);
34134 + assert("edward-1497",
34135 + pages_truncate_ok(inode, size_in_pages(new_size)));
34136 +
34137 + done_lh(lh);
34138 + kfree(hint);
34139 + put_cluster_handle(&clust);
34140 + return result;
34141 +}
34142 +
34143 +/* Prepare cryptcompress file for truncate:
34144 + * prune or append rightmost fake logical clusters (if any)
34145 + */
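+/* Here "fake" bytes are bytes accounted in i_size that have no disk
+ * clusters behind them (a hole at the end of file); they can be
+ * appended or pruned by merely adjusting i_size. */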
34146 +static int start_truncate_fake(struct inode *inode, cloff_t aidx,
34147 + loff_t new_size, int update_sd)
34148 +{
34149 + int result = 0;
34150 + int bytes;
34151 +
34152 + if (new_size > inode->i_size) {
34153 + /* append */
34154 + if (inode->i_size < clust_to_off(aidx, inode))
34155 + /* no fake bytes */
34156 + return 0;
34157 + bytes = new_size - inode->i_size;
34158 + INODE_SET_SIZE(inode, inode->i_size + bytes);
34159 + } else {
34160 + /* prune */
34161 + if (inode->i_size <= clust_to_off(aidx, inode))
34162 + /* no fake bytes */
34163 + return 0;
34164 + bytes = inode->i_size -
34165 + max(new_size, clust_to_off(aidx, inode));
34166 + if (!bytes)
34167 + return 0;
34168 + INODE_SET_SIZE(inode, inode->i_size - bytes);
34169 + /* In the case of fake prune we need to drop the page
34170 + cluster. There are only 2 cases for a partially
34171 + truncated page:
34172 + 1. If it is dirty, then it is anonymous (it was
34173 + dirtied via mmap) and will be captured later
34174 + via ->capture().
34175 + 2. If it is clean, then it is filled with zeroes.
34176 + In both cases there is no need to dirty and capture it here.
34177 + */
34178 + truncate_inode_pages(inode->i_mapping, inode->i_size);
34179 + }
34180 + if (update_sd)
34181 + result = update_sd_cryptcompress(inode);
34182 + return result;
34183 +}
34184 +
34185 +/* This is called in setattr_cryptcompress when it is used to truncate,
34186 + * and in delete_object_cryptcompress */
34187 +static int cryptcompress_truncate(struct inode *inode, /* i_size holds old size */
34188 + loff_t new_size, /* new size */
34189 + int update_sd)
34190 +{
34191 + int result;
34192 + cloff_t aidx;
34193 +
34194 + result = find_fake_appended(inode, &aidx);
34195 + if (result)
34196 + return result;
34197 + assert("edward-1208",
34198 + ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34199 +
34200 + result = start_truncate_fake(inode, aidx, new_size, update_sd);
34201 + if (result)
34202 + return result;
34203 + if (inode->i_size == new_size)
34204 + /* nothing to truncate anymore */
34205 + return 0;
34206 + result = (inode->i_size < new_size ?
34207 + cryptcompress_append_hole(inode, new_size) :
34208 + prune_cryptcompress(inode, new_size, update_sd, aidx));
34209 + if (!result && update_sd)
34210 + result = update_sd_cryptcompress(inode);
34211 + return result;
34212 +}
34213 +
34214 +/* Capture an anonymous page cluster. (A page cluster is
34215 + * anonymous if it contains at least one anonymous page.)
34216 + */
34217 +static int capture_anon_page_cluster(struct cluster_handle * clust,
34218 + struct inode * inode)
34219 +{
34220 + int result;
34221 +
34222 + assert("edward-1073", clust != NULL);
34223 + assert("edward-1074", inode != NULL);
34224 + assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34225 +
34226 + result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
34227 + if (result)
34228 + return result;
34229 + set_cluster_pages_dirty(clust, inode);
34230 + result = checkin_logical_cluster(clust, inode);
34231 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34232 + if (unlikely(result))
34233 + put_page_cluster(clust, inode, WRITE_OP);
34234 + return result;
34235 +}
34236 +
34237 +/* Starting from @index find tagged pages of the same page cluster.
34238 + * Clear the tag for each of them. Return number of found pages.
34239 + */
34240 +static int find_anon_page_cluster(struct address_space * mapping,
34241 + pgoff_t * index, struct page ** pages)
34242 +{
34243 + int i = 0;
34244 + int found;
34245 + write_lock_irq(&mapping->tree_lock);
34246 + do {
34247 + /* looking for one page */
34248 + found = radix_tree_gang_lookup_tag(&mapping->page_tree,
34249 + (void **)&pages[i],
34250 + *index, 1,
34251 + PAGECACHE_TAG_REISER4_MOVED);
34252 + if (!found)
34253 + break;
34254 + if (!same_page_cluster(pages[0], pages[i]))
34255 + break;
34256 +
34257 + /* found */
34258 + page_cache_get(pages[i]);
34259 + *index = pages[i]->index + 1;
34260 +
34261 + radix_tree_tag_clear(&mapping->page_tree,
34262 + pages[i]->index,
34263 + PAGECACHE_TAG_REISER4_MOVED);
34264 + if (last_page_in_cluster(pages[i++]))
34265 + break;
34266 + } while (1);
34267 + write_unlock_irq(&mapping->tree_lock);
34268 + return i;
34269 +}
34270 +
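+/* cap on the number of pages captured in one pass of ->writepages() */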
34271 +#define MAX_PAGES_TO_CAPTURE (1024)
34272 +
34273 +/* Capture anonymous page clusters */
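+/* Scan the mapping starting from *index for pages tagged as
+ * REISER4_MOVED, and check the page clusters they belong to into the
+ * transaction manager. Returns the number of captured pages, or an
+ * error. */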
34274 +static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
34275 + int to_capture)
34276 +{
34277 + int count = 0;
34278 + int found = 0;
34279 + int result = 0;
34280 + hint_t *hint;
34281 + lock_handle *lh;
34282 + struct inode * inode;
34283 + struct cluster_handle clust;
34284 + struct page * pages[MAX_CLUSTER_NRPAGES];
34285 +
34286 + assert("edward-1127", mapping != NULL);
34287 + assert("edward-1128", mapping->host != NULL);
34288 + assert("edward-1440", mapping->host->i_mapping == mapping);
34289 +
34290 + inode = mapping->host;
34291 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34292 + if (hint == NULL)
34293 + return RETERR(-ENOMEM);
34294 + hint_init_zero(hint);
34295 + lh = &hint->lh;
34296 +
34297 + cluster_init_read(&clust, NULL);
34298 + clust.hint = hint;
34299 +
34300 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34301 + if (result)
34302 + goto out;
34303 +
34304 + while (to_capture > 0) {
34305 + found = find_anon_page_cluster(mapping, index, pages);
34306 + if (!found) {
34307 + *index = (pgoff_t) - 1;
34308 + break;
34309 + }
34310 + move_cluster_forward(&clust, inode, pages[0]->index);
34311 + result = capture_anon_page_cluster(&clust, inode);
34312 +
34313 + put_found_pages(pages, found); /* find_anon_page_cluster */
34314 + if (result)
34315 + break;
34316 + to_capture -= clust.nr_pages;
34317 + count += clust.nr_pages;
34318 + }
34319 + if (result) {
34320 + warning("edward-1077",
34321 + "Capture failed (inode %llu, result=%i, captured=%d)\n",
34322 + (unsigned long long)get_inode_oid(inode), result, count);
34323 + } else {
34324 + assert("edward-1078", ergo(found > 0, count > 0));
34325 + if (to_capture <= 0)
34326 + /* there may be left more pages */
34327 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
34328 + result = count;
34329 + }
34330 + out:
34331 + done_lh(lh);
34332 + kfree(hint);
34333 + put_cluster_handle(&clust);
34334 + return result;
34335 +}
34336 +
34337 +/* Returns true if inode's mapping has dirty pages
34338 + which do not belong to any atom */
34339 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
34340 +{
34341 + int result;
34342 + read_lock_irq(&inode->i_mapping->tree_lock);
34343 + result = radix_tree_tagged(&inode->i_mapping->page_tree,
34344 + PAGECACHE_TAG_REISER4_MOVED);
34345 + read_unlock_irq(&inode->i_mapping->tree_lock);
34346 + return result;
34347 +}
34348 +
34349 +/* This is an implementation of the vfs's ->writepages() method of struct
34350 +   address_space_operations */
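+/*
+ * The work is done in batches: take checkin_mutex, capture up to
+ * @to_capture anonymous page clusters, and, in WB_SYNC_ALL mode,
+ * force commit of all atoms before starting the next batch.
+ */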
34351 +int writepages_cryptcompress(struct address_space *mapping,
34352 + struct writeback_control *wbc)
34353 +{
34354 + int result = 0;
34355 + long to_capture;
34356 + pgoff_t nrpages;
34357 + pgoff_t index = 0;
34358 + struct inode *inode;
34359 + struct cryptcompress_info *info;
34360 +
34361 + inode = mapping->host;
34362 + if (!cryptcompress_inode_has_anon_pages(inode))
34363 + goto end;
34364 + info = cryptcompress_inode_data(inode);
34365 + nrpages = size_in_pages(i_size_read(inode));
34366 +
34367 + if (wbc->sync_mode != WB_SYNC_ALL)
34368 + to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
34369 + else
34370 + to_capture = MAX_PAGES_TO_CAPTURE;
34371 + do {
34372 + reiser4_context *ctx;
34373 +
34374 + ctx = reiser4_init_context(inode->i_sb);
34375 + if (IS_ERR(ctx)) {
34376 + result = PTR_ERR(ctx);
34377 + break;
34378 + }
34379 + /* avoid recursive calls to ->sync_inodes */
34380 + ctx->nobalance = 1;
34381 +
34382 + assert("edward-1079",
34383 + lock_stack_isclean(get_current_lock_stack()));
34384 +
34385 + reiser4_txn_restart_current();
34386 +
34387 + if (get_current_context()->entd) {
34388 + if (mutex_trylock(&info->checkin_mutex) == 0) {
34389 + /* the mutex might be occupied by
34390 + entd caller */
34391 + result = RETERR(-EBUSY);
34392 + reiser4_exit_context(ctx);
34393 + break;
34394 + }
34395 + } else
34396 + mutex_lock(&info->checkin_mutex);
34397 +
34398 + result = capture_anon_pages(inode->i_mapping, &index,
34399 + to_capture);
34400 + mutex_unlock(&info->checkin_mutex);
34401 +
34402 + if (result < 0) {
34403 + reiser4_exit_context(ctx);
34404 + break;
34405 + }
34406 + wbc->nr_to_write -= result;
34407 + if (wbc->sync_mode != WB_SYNC_ALL) {
34408 + reiser4_exit_context(ctx);
34409 + break;
34410 + }
34411 + result = txnmgr_force_commit_all(inode->i_sb, 0);
34412 + reiser4_exit_context(ctx);
34413 + } while (result >= 0 && index < nrpages);
34414 +
34415 + end:
34416 + if (is_in_reiser4_context()) {
34417 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34418 + /* there are already pages to flush, flush them out,
34419 + do not delay until end of reiser4_sync_inodes */
34420 + reiser4_writeout(inode->i_sb, wbc);
34421 + get_current_context()->nr_captured = 0;
34422 + }
34423 + }
34424 + return result;
34425 +}
34426 +
34427 +/* plugin->u.file.mmap */
34428 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34429 +{
34430 + int result;
34431 + struct inode *inode;
34432 + reiser4_context *ctx;
34433 +
34434 + inode = file->f_dentry->d_inode;
34435 + ctx = reiser4_init_context(inode->i_sb);
34436 + if (IS_ERR(ctx))
34437 + return PTR_ERR(ctx);
34438 + /*
34439 + * generic_file_mmap will do update_atime. Grab space for stat data
34440 + * update.
34441 + */
34442 + result = reiser4_grab_space_force
34443 + (inode_file_plugin(inode)->estimate.update(inode),
34444 + BA_CAN_COMMIT);
34445 + if (result) {
34446 + reiser4_exit_context(ctx);
34447 + return result;
34448 + }
34449 + result = generic_file_mmap(file, vma);
34450 + reiser4_exit_context(ctx);
34451 + return result;
34452 +}
34453 +
34454 +/* plugin->u.file.release */
34455 +/* plugin->u.file.get_block */
34456 +
34457 +/* this is an implementation of the delete method of the file plugin for
34458 + * cryptcompress objects
34459 + */
34460 +int delete_object_cryptcompress(struct inode *inode)
34461 +{
34462 + int result;
34463 + struct cryptcompress_info * info;
34464 +
34465 + assert("edward-429", inode->i_nlink == 0);
34466 +
34467 + reiser4_txn_restart_current();
34468 + info = cryptcompress_inode_data(inode);
34469 +
34470 + mutex_lock(&info->checkin_mutex);
34471 + result = cryptcompress_truncate(inode, 0, 0);
34472 + mutex_unlock(&info->checkin_mutex);
34473 +
34474 + if (result) {
34475 + warning("edward-430",
34476 + "cannot truncate cryptcompress file %lli: %i",
34477 + (unsigned long long)get_inode_oid(inode),
34478 + result);
34479 + }
34480 + truncate_inode_pages(inode->i_mapping, 0);
34481 + assert("edward-1487", pages_truncate_ok(inode, 0));
34482 + /* and remove stat data */
34483 + return reiser4_delete_object_common(inode);
34484 +}
34485 +
34486 +/* plugin->u.file.setattr method
34487 + This implements actual truncate (see comments in reiser4/page_cache.c) */
34488 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
34489 +{
34490 + int result;
34491 + struct inode *inode;
34492 + struct cryptcompress_info * info;
34493 +
34494 + inode = dentry->d_inode;
34495 + info = cryptcompress_inode_data(inode);
34496 +
34497 + if (attr->ia_valid & ATTR_SIZE) {
34498 + if (i_size_read(inode) != attr->ia_size) {
34499 + reiser4_context *ctx;
34500 + loff_t old_size;
34501 +
34502 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
34503 + if (IS_ERR(ctx))
34504 + return PTR_ERR(ctx);
34505 +
34506 + old_size = i_size_read(inode);
34507 + inode_check_scale(inode, old_size, attr->ia_size);
34508 +
34509 + mutex_lock(&info->checkin_mutex);
34510 + result = cryptcompress_truncate(inode,
34511 + attr->ia_size,
34512 + 1/* update sd */);
34513 + mutex_unlock(&info->checkin_mutex);
34514 + if (result) {
34515 + warning("edward-1192",
34516 + "truncate_cryptcompress failed: oid %lli, "
34517 + "old size %lld, new size %lld, retval %d",
34518 + (unsigned long long)
34519 + get_inode_oid(inode), old_size,
34520 + attr->ia_size, result);
34521 + }
34522 + context_set_commit_async(ctx);
34523 + reiser4_exit_context(ctx);
34524 + } else
34525 + result = 0;
34526 + } else
34527 + result = reiser4_setattr_common(dentry, attr);
34528 + return result;
34529 +}
34530 +
34531 +/* sendfile_cryptcompress - sendfile of struct file_operations */
34532 +ssize_t
34533 +sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
34534 + read_actor_t actor, void *target)
34535 +{
34536 + reiser4_context *ctx;
34537 + ssize_t result;
34538 + struct inode *inode;
34539 + struct cryptcompress_info *info;
34540 +
34541 + inode = file->f_dentry->d_inode;
34542 + ctx = reiser4_init_context(inode->i_sb);
34543 + if (IS_ERR(ctx))
34544 + return PTR_ERR(ctx);
34545 + /*
34546 +	 * generic_file_sendfile may want to call update_atime. Grab space for
34547 + * stat data update
34548 + */
34549 + result = reiser4_grab_space(estimate_update_common(inode),
34550 + BA_CAN_COMMIT);
34551 + if (result)
34552 + goto exit;
34553 + info = cryptcompress_inode_data(inode);
34554 +
34555 + result = generic_file_sendfile(file, ppos, count, actor, target);
34556 + exit:
34557 + reiser4_exit_context(ctx);
34558 + return result;
34559 +}
34560 +
34561 +/*
34562 + * release_cryptcompress - release of struct file_operations
34563 + * @inode: inode of released file
34564 + * @file: file to release
34565 + */
34566 +int release_cryptcompress(struct inode *inode, struct file *file)
34567 +{
34568 + reiser4_context *ctx = reiser4_init_context(inode->i_sb);
34569 +
34570 + if (IS_ERR(ctx))
34571 + return PTR_ERR(ctx);
34572 + reiser4_free_file_fsdata(file);
34573 + reiser4_exit_context(ctx);
34574 + return 0;
34575 +}
34576 +
34577 +#if 0
34578 +int prepare_write_cryptcompress(struct file *file, struct page *page,
34579 + unsigned from, unsigned to)
34580 +{
34581 + return prepare_write_common(file, page, from, to);
34582 +}
34583 +#endif /* 0 */
34584 +
34585 +
34586 +/*
34587 + Local variables:
34588 + c-indentation-style: "K&R"
34589 + mode-name: "LC"
34590 + c-basic-offset: 8
34591 + tab-width: 8
34592 + fill-column: 80
34593 + scroll-step: 1
34594 + End:
34595 +*/
34596 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.h
34597 --- linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
34598 +++ linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.h 2007-07-29 00:26:21.804839975 +0400
34599 @@ -0,0 +1,607 @@
34600 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
34601 +/* See http://www.namesys.com/cryptcompress_design.html */
34602 +
34603 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
34604 +#define __FS_REISER4_CRYPTCOMPRESS_H__
34605 +
34606 +#include "../../page_cache.h"
34607 +#include "../compress/compress.h"
34608 +#include "../crypto/cipher.h"
34609 +
34610 +#include <linux/pagemap.h>
34611 +
34612 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
34613 +#define MAX_CLUSTER_SHIFT 16
34614 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
34615 +#define DC_CHECKSUM_SIZE 4
34616 +
34617 +#define MIN_LATTICE_FACTOR 1
34618 +#define MAX_LATTICE_FACTOR 32
34619 +
34620 +/* this mask contains all non-standard plugins that might
34621 + be present in reiser4-specific part of inode managed by
34622 + cryptcompress file plugin */
34623 +#define cryptcompress_mask \
34624 + ((1 << PSET_FILE) | \
34625 + (1 << PSET_CLUSTER) | \
34626 + (1 << PSET_CIPHER) | \
34627 + (1 << PSET_DIGEST) | \
34628 + (1 << PSET_COMPRESSION) | \
34629 + (1 << PSET_COMPRESSION_MODE))
34630 +
34631 +#if REISER4_DEBUG
34632 +static inline int cluster_shift_ok(int shift)
34633 +{
34634 + return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
34635 +}
34636 +#endif
34637 +
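+/* debugging-only counter of pages grabbed on behalf of a
+ * cryptcompress inode; compiled out when REISER4_DEBUG is off */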
34638 +#if REISER4_DEBUG
34639 +#define INODE_PGCOUNT(inode) \
34640 +({ \
34641 + assert("edward-1530", inode_file_plugin(inode) == \
34642 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
34643 + atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
34644 + })
34645 +#define INODE_PGCOUNT_INC(inode) \
34646 +do { \
34647 + assert("edward-1531", inode_file_plugin(inode) == \
34648 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
34649 + atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
34650 +} while (0)
34651 +#define INODE_PGCOUNT_DEC(inode) \
34652 +do { \
34653 + if (inode_file_plugin(inode) == \
34654 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
34655 + atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
34656 +} while (0)
34657 +#else
34658 +#define INODE_PGCOUNT(inode) (0)
34659 +#define INODE_PGCOUNT_INC(inode)
34660 +#define INODE_PGCOUNT_DEC(inode)
34661 +#endif /* REISER4_DEBUG */
34662 +
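+/* a plain memory buffer used as the input or output of a data
+ * transform (see tfm_stream_id below) */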
34663 +struct tfm_stream {
34664 + __u8 *data;
34665 + size_t size;
34666 +};
34667 +
34668 +typedef enum {
34669 + INPUT_STREAM,
34670 + OUTPUT_STREAM,
34671 + LAST_STREAM
34672 +} tfm_stream_id;
34673 +
34674 +typedef struct tfm_stream * tfm_unit[LAST_STREAM];
34675 +
34676 +static inline __u8 *ts_data(struct tfm_stream * stm)
34677 +{
34678 + assert("edward-928", stm != NULL);
34679 + return stm->data;
34680 +}
34681 +
34682 +static inline size_t ts_size(struct tfm_stream * stm)
34683 +{
34684 + assert("edward-929", stm != NULL);
34685 + return stm->size;
34686 +}
34687 +
34688 +static inline void set_ts_size(struct tfm_stream * stm, size_t size)
34689 +{
34690 + assert("edward-930", stm != NULL);
34691 +
34692 + stm->size = size;
34693 +}
34694 +
34695 +static inline int alloc_ts(struct tfm_stream ** stm)
34696 +{
34697 + assert("edward-931", stm);
34698 + assert("edward-932", *stm == NULL);
34699 +
34700 + *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get());
34701 + if (*stm == NULL)
34702 + return -ENOMEM;
34703 + memset(*stm, 0, sizeof **stm);
34704 + return 0;
34705 +}
34706 +
34707 +static inline void free_ts(struct tfm_stream * stm)
34708 +{
34709 + assert("edward-933", !ts_data(stm));
34710 + assert("edward-934", !ts_size(stm));
34711 +
34712 + kfree(stm);
34713 +}
34714 +
34715 +static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
34716 +{
34717 + assert("edward-935", !ts_data(stm));
34718 + assert("edward-936", !ts_size(stm));
34719 + assert("edward-937", size != 0);
34720 +
34721 + stm->data = reiser4_vmalloc(size);
34722 + if (!stm->data)
34723 + return -ENOMEM;
34724 + set_ts_size(stm, size);
34725 + return 0;
34726 +}
34727 +
34728 +static inline void free_ts_data(struct tfm_stream * stm)
34729 +{
34730 + assert("edward-938", equi(ts_data(stm), ts_size(stm)));
34731 +
34732 + if (ts_data(stm))
34733 + vfree(ts_data(stm));
34734 + memset(stm, 0, sizeof *stm);
34735 +}
34736 +
34737 +/* Write modes for item conversion in flush convert phase */
34738 +typedef enum {
34739 + CRC_APPEND_ITEM = 1,
34740 + CRC_OVERWRITE_ITEM = 2,
34741 + CRC_CUT_ITEM = 3
34742 +} cryptcompress_write_mode_t;
34743 +
34744 +typedef enum {
34745 + LC_INVAL = 0, /* invalid value */
34746 + LC_APPOV = 1, /* append and/or overwrite */
34747 + LC_TRUNC = 2 /* truncate */
34748 +} logical_cluster_op;
34749 +
34750 +/* Transform cluster.
34751 + * Intermediate state between page cluster and disk cluster.
34752 + * It is used for data transforms (compression/encryption).
34753 + */
34754 +struct tfm_cluster {
34755 + coa_set coa; /* compression algorithms info */
34756 + tfm_unit tun; /* plain and transformed streams */
34757 + tfm_action act;
34758 + int uptodate;
34759 + int lsize; /* number of bytes in logical cluster */
34760 + int len; /* length of the transform stream */
34761 +};
34762 +
34763 +static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
34764 + tfm_action act)
34765 +{
34766 + return tc->coa[id][act];
34767 +}
34768 +
34769 +static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
34770 + tfm_action act, coa_t coa)
34771 +{
34772 + tc->coa[id][act] = coa;
34773 +}
34774 +
34775 +static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
34776 +{
34777 + coa_t coa;
34778 +
34779 + coa = cplug->alloc(tc->act);
34780 + if (IS_ERR(coa))
34781 + return PTR_ERR(coa);
34782 + set_coa(tc, cplug->h.id, tc->act, coa);
34783 + return 0;
34784 +}
34785 +
34786 +static inline int
34787 +grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
34788 +{
34789 + return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
34790 + alloc_coa(tc, cplug) : 0);
34791 +}
34792 +
34793 +static inline void free_coa_set(struct tfm_cluster * tc)
34794 +{
34795 + tfm_action j;
34796 + reiser4_compression_id i;
34797 + compression_plugin *cplug;
34798 +
34799 + assert("edward-810", tc != NULL);
34800 +
34801 + for (j = 0; j < TFMA_LAST; j++)
34802 + for (i = 0; i < LAST_COMPRESSION_ID; i++) {
34803 + if (!get_coa(tc, i, j))
34804 + continue;
34805 + cplug = compression_plugin_by_id(i);
34806 + assert("edward-812", cplug->free != NULL);
34807 + cplug->free(get_coa(tc, i, j), j);
34808 + set_coa(tc, i, j, 0);
34809 + }
34810 + return;
34811 +}
34812 +
34813 +static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
34814 + tfm_stream_id id)
34815 +{
34816 + return tc->tun[id];
34817 +}
34818 +
34819 +static inline void set_tfm_stream(struct tfm_cluster * tc,
34820 + tfm_stream_id id, struct tfm_stream * ts)
34821 +{
34822 + tc->tun[id] = ts;
34823 +}
34824 +
34825 +static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
34826 +{
34827 + return ts_data(get_tfm_stream(tc, id));
34828 +}
34829 +
34830 +static inline void set_tfm_stream_data(struct tfm_cluster * tc,
34831 + tfm_stream_id id, __u8 * data)
34832 +{
34833 + get_tfm_stream(tc, id)->data = data;
34834 +}
34835 +
34836 +static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
34837 +{
34838 + return ts_size(get_tfm_stream(tc, id));
34839 +}
34840 +
34841 +static inline void
34842 +set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
34843 +{
34844 + get_tfm_stream(tc, id)->size = size;
34845 +}
34846 +
34847 +static inline int
34848 +alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
34849 +{
34850 + assert("edward-939", tc != NULL);
34851 + assert("edward-940", !get_tfm_stream(tc, id));
34852 +
34853 + tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
34854 + reiser4_ctx_gfp_mask_get());
34855 + if (!tc->tun[id])
34856 + return -ENOMEM;
34857 + return alloc_ts_data(get_tfm_stream(tc, id), size);
34858 +}
34859 +
34860 +static inline int
34861 +realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
34862 +{
34863 + assert("edward-941", tfm_stream_size(tc, id) < size);
34864 + free_ts_data(get_tfm_stream(tc, id));
34865 + return alloc_ts_data(get_tfm_stream(tc, id), size);
34866 +}
34867 +
34868 +static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
34869 +{
34870 + free_ts_data(get_tfm_stream(tc, id));
34871 + free_ts(get_tfm_stream(tc, id));
34872 + set_tfm_stream(tc, id, 0);
34873 +}
34874 +
34875 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
34876 +{
34877 + return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
34878 +}
34879 +
34880 +static inline void free_tfm_unit(struct tfm_cluster * tc)
34881 +{
34882 + tfm_stream_id id;
34883 + for (id = 0; id < LAST_STREAM; id++) {
34884 + if (!get_tfm_stream(tc, id))
34885 + continue;
34886 + free_tfm_stream(tc, id);
34887 + }
34888 +}
34889 +
34890 +static inline void put_tfm_cluster(struct tfm_cluster * tc)
34891 +{
34892 + assert("edward-942", tc != NULL);
34893 + free_coa_set(tc);
34894 + free_tfm_unit(tc);
34895 +}
34896 +
34897 +static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
34898 +{
34899 + assert("edward-943", tc != NULL);
34900 + assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
34901 + return (tc->uptodate == 1);
34902 +}
34903 +
34904 +static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
34905 +{
34906 + assert("edward-945", tc != NULL);
34907 + assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
34908 + tc->uptodate = 1;
34909 + return;
34910 +}
34911 +
34912 +static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
34913 +{
34914 + assert("edward-947", tc != NULL);
34915 + assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
34916 + tc->uptodate = 0;
34917 + return;
34918 +}
34919 +
34920 +static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
34921 +{
34922 + return (get_tfm_stream(tc, id) &&
34923 + tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
34924 +}
34925 +
34926 +static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
34927 +{
34928 + int i;
34929 + for (i = 0; i < LAST_STREAM; i++)
34930 + if (!tfm_stream_is_set(tc, i))
34931 + return 0;
34932 + return 1;
34933 +}
34934 +
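+/* Swap the input and output streams, so that the result of one
+ * transform step (e.g. compression) can feed the next one (e.g.
+ * encryption) without copying. */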
34935 +static inline void alternate_streams(struct tfm_cluster * tc)
34936 +{
34937 + struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
34938 +
34939 + set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
34940 + set_tfm_stream(tc, OUTPUT_STREAM, tmp);
34941 +}
34942 +
34943 +/* Set of states to indicate a kind of data
34944 + * that will be written to the window */
34945 +typedef enum {
34946 + DATA_WINDOW, /* user's data */
34947 + HOLE_WINDOW /* zeroes (such kind of data can be written
34948 + * if we start to write from offset > i_size) */
34949 +} window_stat;
34950 +
34951 +/* Window (of logical cluster size) discretely sliding along a file.
34952 + * Is used to locate hole region in a logical cluster to be properly
34953 + * represented on disk.
34954 + * We split a write to cryptcompress file into writes to its logical
34955 + * clusters. Before writing to a logical cluster we set a window, i.e.
34956 + * calculate values of the following fields:
34957 + */
34958 +struct reiser4_slide {
34959 + unsigned off; /* offset to write from */
34960 + unsigned count; /* number of bytes to write */
34961 + unsigned delta; /* number of bytes to append to the hole */
34962 + window_stat stat; /* what kind of data will be written starting
34963 + from @off */
34964 +};
34965 +
34966 +/* Possible states of a disk cluster */
34967 +typedef enum {
34968 + INVAL_DISK_CLUSTER, /* unknown state */
34969 + PREP_DISK_CLUSTER, /* disk cluster got converted by flush
34970 + * at least 1 time */
34971 + UNPR_DISK_CLUSTER, /* disk cluster just created and should be
34972 + * converted by flush */
34973 +	FAKE_DISK_CLUSTER,	/* disk cluster exists neither in memory
34974 + * nor on disk */
34975 + TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
34976 +} disk_cluster_stat;
34977 +
34978 +/* The following structure represents various stages of the same logical
34979 + * cluster of index @index:
34980 + * . fixed slide
34981 + * . page cluster (stage in primary cache)
34982 + * . transform cluster (transition stage)
34983 + * . disk cluster (stage in secondary cache)
34984 + * This structure is used in transition and synchronizing operations, e.g.
34985 + * transform cluster is a transition state when synchronizing page cluster
34986 + * and disk cluster.
34987 + * FIXME: Encapsulate page cluster, disk cluster.
34988 + */
34989 +struct cluster_handle {
34990 + cloff_t index; /* offset in a file (unit is a cluster size) */
34991 + int index_valid; /* for validating the index above, if needed */
34992 + struct file *file; /* host file */
34993 +
34994 + /* logical cluster */
34995 + struct reiser4_slide *win; /* sliding window to locate holes */
34996 + logical_cluster_op op; /* logical cluster operation (truncate or
34997 + append/overwrite) */
34998 + /* transform cluster */
34999 + struct tfm_cluster tc; /* contains all needed info to synchronize
35000 +				   page cluster and disk cluster */
35001 + /* page cluster */
35002 + int nr_pages; /* number of pages of current checkin action */
35003 + int old_nrpages; /* number of pages of last checkin action */
35004 + struct page **pages; /* attached pages */
35005 + jnode * node; /* jnode for capture */
35006 +
35007 + /* disk cluster */
35008 + hint_t *hint; /* current position in the tree */
35009 + disk_cluster_stat dstat; /* state of the current disk cluster */
35010 + int reserved; /* is space for disk cluster reserved */
35011 +#if REISER4_DEBUG
35012 + reiser4_context *ctx;
35013 + int reserved_prepped;
35014 + int reserved_unprepped;
35015 +#endif
35016 +
35017 +};
35018 +
35019 +static inline __u8 * tfm_input_data (struct cluster_handle * clust)
35020 +{
35021 + return tfm_stream_data(&clust->tc, INPUT_STREAM);
35022 +}
35023 +
35024 +static inline __u8 * tfm_output_data (struct cluster_handle * clust)
35025 +{
35026 + return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35027 +}
35028 +
35029 +static inline int reset_cluster_pgset(struct cluster_handle * clust,
35030 + int nrpages)
35031 +{
35032 + assert("edward-1057", clust->pages != NULL);
35033 + memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35034 + return 0;
35035 +}
35036 +
35037 +static inline int alloc_cluster_pgset(struct cluster_handle * clust,
35038 + int nrpages)
35039 +{
35040 + assert("edward-949", clust != NULL);
35041 + assert("edward-1362", clust->pages == NULL);
35042 + assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35043 +
35044 + clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
35045 + reiser4_ctx_gfp_mask_get());
35046 + if (!clust->pages)
35047 + return RETERR(-ENOMEM);
35048 + return 0;
35049 +}
35050 +
35051 +static inline void free_cluster_pgset(struct cluster_handle * clust)
35052 +{
35053 + assert("edward-951", clust->pages != NULL);
35054 + kfree(clust->pages);
35055 + clust->pages = NULL;
35056 +}
35057 +
35058 +static inline void put_cluster_handle(struct cluster_handle * clust)
35059 +{
35060 + assert("edward-435", clust != NULL);
35061 +
35062 + put_tfm_cluster(&clust->tc);
35063 + if (clust->pages)
35064 + free_cluster_pgset(clust);
35065 + memset(clust, 0, sizeof *clust);
35066 +}
35067 +
35068 +static inline void inc_keyload_count(struct reiser4_crypto_info * data)
35069 +{
35070 + assert("edward-1410", data != NULL);
35071 + data->keyload_count++;
35072 +}
35073 +
35074 +static inline void dec_keyload_count(struct reiser4_crypto_info * data)
35075 +{
35076 + assert("edward-1411", data != NULL);
35077 + assert("edward-1412", data->keyload_count > 0);
35078 + data->keyload_count--;
35079 +}
35080 +
35081 +static inline int capture_cluster_jnode(jnode * node)
35082 +{
35083 + return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
35084 +}
35085 +
35086 +/* cryptcompress specific part of reiser4_inode */
35087 +struct cryptcompress_info {
35088 + struct mutex checkin_mutex; /* This is to serialize
35089 + * checkin_logical_cluster operations */
35090 + cloff_t trunc_index; /* Index of the leftmost truncated disk
35091 + * cluster (to resolve races with read) */
35092 + struct reiser4_crypto_info *crypt;
35093 + /*
35094 + * the following 2 fields are controlled by compression mode plugin
35095 + */
35096 + int compress_toggle; /* Current status of compressibility */
35097 + int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
35098 + * a compression_toggle to keep the factor
35099 + */
35100 +#if REISER4_DEBUG
35101 + atomic_t pgcount; /* number of grabbed pages */
35102 +#endif
35103 +};
35104 +
35105 +static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
35106 +{
35107 + info->compress_toggle = val;
35108 +}
35109 +
35110 +static inline int get_compression_toggle (struct cryptcompress_info * info)
35111 +{
35112 + return info->compress_toggle;
35113 +}
35114 +
35115 +static inline int compression_is_on(struct cryptcompress_info * info)
35116 +{
35117 + return get_compression_toggle(info) == 1;
35118 +}
35119 +
35120 +static inline void turn_on_compression(struct cryptcompress_info * info)
35121 +{
35122 + set_compression_toggle(info, 1);
35123 +}
35124 +
35125 +static inline void turn_off_compression(struct cryptcompress_info * info)
35126 +{
35127 + set_compression_toggle(info, 0);
35128 +}
35129 +
35130 +static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
35131 +{
35132 + info->lattice_factor = val;
35133 +}
35134 +
35135 +static inline int get_lattice_factor(struct cryptcompress_info * info)
35136 +{
35137 + return info->lattice_factor;
35138 +}
35139 +
35140 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
35141 +int equal_to_rdk(znode *, const reiser4_key *);
35142 +int goto_right_neighbor(coord_t *, lock_handle *);
35143 +int cryptcompress_inode_ok(struct inode *inode);
35144 +int coord_is_unprepped_ctail(const coord_t * coord);
35145 +extern int ctail_read_disk_cluster (struct cluster_handle *, struct inode *,
35146 + struct page *, znode_lock_mode mode);
35147 +extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
35148 + struct page * page, znode_lock_mode mode);
35149 +extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
35150 + struct inode * inode);
35151 +extern int readpages_cryptcompress(struct file*, struct address_space*,
35152 + struct list_head*, unsigned);
35153 +int bind_cryptcompress(struct inode *child, struct inode *parent);
35154 +void destroy_inode_cryptcompress(struct inode * inode);
35155 +int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
35156 + rw_op rw);
35157 +int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
35158 + struct cluster_handle * clust, int * progress);
35159 +struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
35160 +void inherit_crypto_info_common(struct inode * parent, struct inode * object,
35161 + int (*can_inherit)(struct inode * child,
35162 + struct inode * parent));
35163 +void reiser4_attach_crypto_info(struct inode * inode,
35164 + struct reiser4_crypto_info * info);
35165 +void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
35166 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
35167 +
35168 +static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
35169 +{
35170 + return info->cipher;
35171 +}
35172 +
35173 +static inline void info_set_cipher(struct reiser4_crypto_info * info,
35174 + struct crypto_blkcipher * tfm)
35175 +{
35176 + info->cipher = tfm;
35177 +}
35178 +
35179 +static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
35180 +{
35181 + return info->digest;
35182 +}
35183 +
35184 +static inline void info_set_digest(struct reiser4_crypto_info * info,
35185 + struct crypto_hash * tfm)
35186 +{
35187 + info->digest = tfm;
35188 +}
35189 +
35190 +static inline void put_cluster_page(struct page * page)
35191 +{
35192 + page_cache_release(page);
35193 +}
35194 +
35195 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35196 +
35197 +/* Make Linus happy.
35198 + Local variables:
35199 + c-indentation-style: "K&R"
35200 + mode-name: "LC"
35201 + c-basic-offset: 8
35202 + tab-width: 8
35203 + fill-column: 120
35204 + scroll-step: 1
35205 + End:
35206 +*/
35207 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/file.c linux-2.6.22/fs/reiser4/plugin/file/file.c
35208 --- linux-2.6.22.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
35209 +++ linux-2.6.22/fs/reiser4/plugin/file/file.c 2007-07-29 00:25:34.924708901 +0400
35210 @@ -0,0 +1,2817 @@
35211 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35212 + * reiser4/README */
35213 +
35214 +/*
35215 + * this file contains implementations of inode/file/address_space/file plugin
35216 + * operations specific for "unix file plugin" (plugin id is
35217 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35218 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
35219 + * no items but stat data)
35220 + */
35221 +
35222 +#include "../../inode.h"
35223 +#include "../../super.h"
35224 +#include "../../tree_walk.h"
35225 +#include "../../carry.h"
35226 +#include "../../page_cache.h"
35227 +#include "../../ioctl.h"
35228 +#include "../object.h"
35229 +#include "../cluster.h"
35230 +#include "../../safe_link.h"
35231 +
35232 +#include <linux/writeback.h>
35233 +#include <linux/pagevec.h>
35234 +#include <linux/syscalls.h>
35235 +
35236 +
35237 +static int unpack(struct file *file, struct inode *inode, int forever);
35238 +static void drop_access(struct unix_file_info *);
35239 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35240 + znode_lock_mode lock_mode);
35241 +
35242 +/* Get exclusive access and make sure that file is not partially
35243 + * converted (It may happen that another process is doing tail
35244 + * conversion. If so, wait until it completes)
35245 + */
35246 +static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
35247 + struct inode *inode)
35248 +{
35249 + do {
35250 + get_exclusive_access(uf_info);
35251 + if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
35252 + break;
35253 + drop_exclusive_access(uf_info);
35254 + schedule();
35255 + } while (1);
35256 +}
35257 +
35258 +/* get unix file plugin specific portion of inode */
35259 +struct unix_file_info *unix_file_inode_data(const struct inode *inode)
35260 +{
35261 + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35262 +}
35263 +
35264 +/**
35265 + * equal_to_rdk - compare key and znode's right delimiting key
35266 + * @node: node whose right delimiting key to compare with @key
35267 + * @key: key to compare with @node's right delimiting key
35268 + *
35269 + * Returns true if @key is equal to right delimiting key of @node.
35270 + */
35271 +int equal_to_rdk(znode *node, const reiser4_key *key)
35272 +{
35273 + int result;
35274 +
35275 + read_lock_dk(znode_get_tree(node));
35276 + result = keyeq(key, znode_get_rd_key(node));
35277 + read_unlock_dk(znode_get_tree(node));
35278 + return result;
35279 +}
35280 +
35281 +#if REISER4_DEBUG
35282 +
35283 +/**
35284 + * equal_to_ldk - compare key and znode's left delimiting key
35285 + * @node: node whose left delimiting key to compare with @key
35286 + * @key: key to compare with @node's left delimiting key
35287 + *
35288 + * Returns true if @key is equal to left delimiting key of @node.
35289 + */
35290 +int equal_to_ldk(znode *node, const reiser4_key *key)
35291 +{
35292 + int result;
35293 +
35294 + read_lock_dk(znode_get_tree(node));
35295 + result = keyeq(key, znode_get_ld_key(node));
35296 + read_unlock_dk(znode_get_tree(node));
35297 + return result;
35298 +}
35299 +
35300 +/**
35301 + * check_coord - check whether coord corresponds to key
35302 + * @coord: coord to check
35303 + * @key: key @coord has to correspond to
35304 + *
35305 + * Returns true if @coord is set as if it was set as result of lookup with @key
35306 + * in coord->node.
35307 + */
35308 +static int check_coord(const coord_t *coord, const reiser4_key *key)
35309 +{
35310 + coord_t twin;
35311 +
35312 + node_plugin_by_node(coord->node)->lookup(coord->node, key,
35313 + FIND_MAX_NOT_MORE_THAN, &twin);
35314 + return coords_equal(coord, &twin);
35315 +}
35316 +
35317 +#endif /* REISER4_DEBUG */
35318 +
35319 +/**
35320 + * init_uf_coord - initialize extended coord
35321 + * @uf_coord: extended coord to initialize
35322 + * @lh: lock handle to attach to @uf_coord
35323 + *
35324 + * Zeroes @uf_coord, attaches @lh and marks the coord extension invalid.
35325 + */
35326 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35327 +{
35328 + coord_init_zero(&uf_coord->coord);
35329 + coord_clear_iplug(&uf_coord->coord);
35330 + uf_coord->lh = lh;
35331 + init_lh(lh);
35332 + memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35333 + uf_coord->valid = 0;
35334 +}
35335 +
35336 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35337 +{
35338 + assert("vs-1333", uf_coord->valid == 0);
35339 +
35340 + if (coord_is_between_items(&uf_coord->coord))
35341 + return;
35342 +
35343 + assert("vs-1348",
35344 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35345 + init_coord_extension);
35346 +
35347 + item_body_by_coord(&uf_coord->coord);
35348 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35349 + init_coord_extension(uf_coord, offset);
35350 +}
35351 +
35352 +/**
35353 + * goto_right_neighbor - lock right neighbor, drop current node lock
35354 + * @coord: coord to re-set to the first unit of the right neighbor
35355 + * @lh: lock handle to move to the lock on the right neighbor
35356 + *
35357 + * Obtain lock on right neighbor and drop lock on current node.
35358 + */
35359 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35360 +{
35361 + int result;
35362 + lock_handle lh_right;
35363 +
35364 + assert("vs-1100", znode_is_locked(coord->node));
35365 +
35366 + init_lh(&lh_right);
35367 + result = reiser4_get_right_neighbor(&lh_right, coord->node,
35368 + znode_is_wlocked(coord->node) ?
35369 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35370 + GN_CAN_USE_UPPER_LEVELS);
35371 + if (result) {
35372 + done_lh(&lh_right);
35373 + return result;
35374 + }
35375 +
35376 + /*
35377 + * we hold two longterm locks on neighboring nodes. Unlock left of
35378 + * them
35379 + */
35380 + done_lh(lh);
35381 +
35382 + coord_init_first_unit_nocheck(coord, lh_right.node);
35383 + move_lh(lh, &lh_right);
35384 +
35385 + return 0;
35386 +
35387 +}
35388 +
35389 +/**
35390 + * set_file_state
35391 + * @uf_info: unix file plugin specific part of the inode
35392 + * @cbk_result: result of the tree lookup
35393 + * @level: level at which the lookup stopped
35394 + *
35395 + * This is used by find_file_item and find_file_state to determine
35396 + * the real state of the file
35397 + */
35398 +static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
35399 + tree_level level)
35400 +{
35401 + if (cbk_errored(cbk_result))
35402 + /* error happened in find_file_item */
35403 + return;
35404 +
35405 + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35406 +
35407 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35408 + if (cbk_result == CBK_COORD_NOTFOUND)
35409 + uf_info->container = UF_CONTAINER_EMPTY;
35410 + else if (level == LEAF_LEVEL)
35411 + uf_info->container = UF_CONTAINER_TAILS;
35412 + else
35413 + uf_info->container = UF_CONTAINER_EXTENTS;
35414 + } else {
35415 + /*
35416 + * file state is known, check whether it is set correctly if
35417 + * file is not being tail converted
35418 + */
35419 + if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35420 + REISER4_PART_IN_CONV)) {
35421 + assert("vs-1162",
35422 + ergo(level == LEAF_LEVEL &&
35423 + cbk_result == CBK_COORD_FOUND,
35424 + uf_info->container == UF_CONTAINER_TAILS));
35425 + assert("vs-1165",
35426 + ergo(level == TWIG_LEVEL &&
35427 + cbk_result == CBK_COORD_FOUND,
35428 + uf_info->container == UF_CONTAINER_EXTENTS));
35429 + }
35430 + }
35431 +}
35432 +
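+/* Look up the position of @key in the tree without using a cached
+ * hint; used when no seal/coord from a previous operation is valid. */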
35433 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35434 + const reiser4_key *key, znode_lock_mode lock_mode,
35435 + struct inode *inode)
35436 +{
35437 + return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
35438 + FIND_MAX_NOT_MORE_THAN,
35439 + TWIG_LEVEL, LEAF_LEVEL,
35440 + (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35441 + (CBK_UNIQUE | CBK_FOR_INSERT),
35442 + NULL /* ra_info */ );
35443 +}
35444 +
35445 +/**
35446 + * find_file_item - look for file item in the tree
35447 + * @hint: provides coordinate, lock handle, seal
35448 + * @key: key for search
35449 + * @lock_mode: mode of lock to put on returned node
35451 + * @inode: inode of the host file
35452 + *
35453 + * This finds position in the tree corresponding to @key. It first tries to use
35454 + * @hint's seal if it is set.
35455 + */
35456 +int find_file_item(hint_t *hint, const reiser4_key *key,
35457 + znode_lock_mode lock_mode,
35458 + struct inode *inode)
35459 +{
35460 + int result;
35461 + coord_t *coord;
35462 + lock_handle *lh;
35463 +
35464 + assert("nikita-3030", reiser4_schedulable());
35465 + assert("vs-1707", hint != NULL);
35466 + assert("vs-47", inode != NULL);
35467 +
35468 + coord = &hint->ext_coord.coord;
35469 + lh = hint->ext_coord.lh;
35470 + init_lh(lh);
35471 +
35472 + result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35473 + if (!result) {
35474 + if (coord->between == AFTER_UNIT &&
35475 + equal_to_rdk(coord->node, key)) {
35476 + result = goto_right_neighbor(coord, lh);
35477 + if (result == -E_NO_NEIGHBOR)
35478 + return RETERR(-EIO);
35479 + if (result)
35480 + return result;
35481 + assert("vs-1152", equal_to_ldk(coord->node, key));
35482 + /*
35483 + * we moved to different node. Invalidate coord
35484 + * extension, zload is necessary to init it again
35485 + */
35486 + hint->ext_coord.valid = 0;
35487 + }
35488 +
35489 + set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35490 + znode_get_level(coord->node));
35491 +
35492 + return CBK_COORD_FOUND;
35493 + }
35494 +
35495 + coord_init_zero(coord);
35496 + result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35497 + set_file_state(unix_file_inode_data(inode), result,
35498 + znode_get_level(coord->node));
35499 +
35500 + /* FIXME: we might already have coord extension initialized */
35501 + hint->ext_coord.valid = 0;
35502 + return result;
35503 +}
35504 +
35505 +/* plugin->u.file.write_flow = NULL
35506 + plugin->u.file.read_flow = NULL */
35507 +
35508 +void hint_init_zero(hint_t * hint)
35509 +{
35510 + memset(hint, 0, sizeof(*hint));
35511 + init_lh(&hint->lh);
35512 + hint->ext_coord.lh = &hint->lh;
35513 +}
35514 +
35515 +static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
35516 +{
35517 + int result;
35518 + reiser4_key key;
35519 + coord_t coord;
35520 + lock_handle lh;
35521 +
35522 + assert("vs-1628", ea_obtained(uf_info));
35523 +
35524 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35525 + key_by_inode_and_offset_common(inode, 0, &key);
35526 + init_lh(&lh);
35527 + result = find_file_item_nohint(&coord, &lh, &key,
35528 + ZNODE_READ_LOCK, inode);
35529 + set_file_state(uf_info, result, znode_get_level(coord.node));
35530 + done_lh(&lh);
35531 + if (!cbk_errored(result))
35532 + result = 0;
35533 + } else
35534 + result = 0;
35535 + assert("vs-1074",
35536 + ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35537 + reiser4_txn_restart_current();
35538 + return result;
35539 +}
35540 +
35541 +/* estimate and reserve space needed to truncate a page which gets partially truncated: one block for the page
35542 +   itself, a stat data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item)
35543 +   which may happen if the page corresponds to a hole extent and an unallocated one has to be created */
35544 +static int reserve_partial_page(reiser4_tree * tree)
35545 +{
35546 + grab_space_enable();
35547 + return reiser4_grab_reserved(reiser4_get_current_sb(),
35548 + 1 +
35549 + 2 * estimate_one_insert_into_item(tree),
35550 + BA_CAN_COMMIT);
35551 +}
35552 +
35553 +/* estimate and reserve space needed to cut one item and update one stat data */
35554 +static int reserve_cut_iteration(reiser4_tree * tree)
35555 +{
35556 + __u64 estimate = estimate_one_item_removal(tree)
35557 + + estimate_one_insert_into_item(tree);
35558 +
35559 + assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
35560 +
35561 + grab_space_enable();
35562 + /* We need to double our estimate now that we can delete more than one
35563 + node. */
35564 + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
35565 + BA_CAN_COMMIT);
35566 +}
35567 +
35568 +int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
35569 + int update_sd)
35570 +{
35571 + int result = 0;
35572 +
35573 + INODE_SET_SIZE(inode, get_key_offset(key));
35574 + if (update_sd) {
35575 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
35576 + result = reiser4_update_sd(inode);
35577 + }
35578 + return result;
35579 +}
35580 +
35581 +/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
35582 + and update file stat data on every single cut from the tree */
35583 +int
35584 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
35585 + loff_t cur_size, int (*update_actor) (struct inode *,
35586 + reiser4_key *, int))
35587 +{
35588 + reiser4_key from_key, to_key;
35589 + reiser4_key smallest_removed;
35590 + file_plugin *fplug = inode_file_plugin(inode);
35591 + int result;
35592 + int progress = 0;
35593 +
35594 + assert("vs-1248",
35595 + fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
35596 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35597 +
35598 + fplug->key_by_inode(inode, new_size, &from_key);
35599 + to_key = from_key;
35600 + set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
35601 + /* this loop normally runs just once */
35602 + while (1) {
35603 + result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
35604 + if (result)
35605 + break;
35606 +
35607 + result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
35608 + &smallest_removed, inode, 1,
35609 + &progress);
35610 + if (result == -E_REPEAT) {
35611 + /* -E_REPEAT is a signal to interrupt a long file truncation process */
35612 + if (progress) {
35613 + result =
35614 + update_actor(inode, &smallest_removed,
35615 + update_sd);
35616 + if (result)
35617 + break;
35618 + }
35619 +
35620 +			/* the below does up(sbinfo->delete_mutex). Do not get fooled */
35621 + reiser4_release_reserved(inode->i_sb);
35622 +
35623 + /* reiser4_cut_tree_object() was interrupted probably because
35624 + * current atom requires commit, we have to release
35625 + * transaction handle to allow atom commit. */
35626 + reiser4_txn_restart_current();
35627 + continue;
35628 + }
35629 + if (result
35630 + && !(result == CBK_COORD_NOTFOUND && new_size == 0
35631 + && inode->i_size == 0))
35632 + break;
35633 +
35634 + set_key_offset(&smallest_removed, new_size);
35635 + /* Final sd update after the file gets its correct size */
35636 + result = update_actor(inode, &smallest_removed, update_sd);
35637 + break;
35638 + }
35639 +
35640 +	/* the below does up(sbinfo->delete_mutex). Do not get fooled */
35641 + reiser4_release_reserved(inode->i_sb);
35642 +
35643 + return result;
35644 +}
35645 +
35646 +int find_or_create_extent(struct page *page);
35647 +
35648 +/* part of truncate_file_body: it is called when truncate is used to make file
35649 + shorter */
35650 +static int shorten_file(struct inode *inode, loff_t new_size)
35651 +{
35652 + int result;
35653 + struct page *page;
35654 + int padd_from;
35655 + unsigned long index;
35656 + struct unix_file_info *uf_info;
35657 +
35658 + /*
35659 + * all items of ordinary reiser4 file are grouped together. That is why
35660 + * we can use reiser4_cut_tree. Plan B files (for instance) can not be
35661 + * truncated that simply
35662 + */
35663 + result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
35664 + get_key_offset(reiser4_max_key()),
35665 + reiser4_update_file_size);
35666 + if (result)
35667 + return result;
35668 +
35669 + uf_info = unix_file_inode_data(inode);
35670 + assert("vs-1105", new_size == inode->i_size);
35671 + if (new_size == 0) {
35672 + uf_info->container = UF_CONTAINER_EMPTY;
35673 + return 0;
35674 + }
35675 +
35676 + result = find_file_state(inode, uf_info);
35677 + if (result)
35678 + return result;
35679 + if (uf_info->container == UF_CONTAINER_TAILS)
35680 + /*
35681 + * No need to worry about zeroing last page after new file
35682 + * end
35683 + */
35684 + return 0;
35685 +
35686 + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
35687 + if (!padd_from)
35688 + /* file is truncated to page boundary */
35689 + return 0;
35690 +
35691 + result = reserve_partial_page(reiser4_tree_by_inode(inode));
35692 + if (result) {
35693 + reiser4_release_reserved(inode->i_sb);
35694 + return result;
35695 + }
35696 +
35697 + /* last page is partially truncated - zero its content */
35698 + index = (inode->i_size >> PAGE_CACHE_SHIFT);
35699 + page = read_mapping_page(inode->i_mapping, index, NULL);
35700 + if (IS_ERR(page)) {
35701 + /*
35702 + * the below does up(sbinfo->delete_mutex). Do not get
35703 + * confused
35704 + */
35705 + reiser4_release_reserved(inode->i_sb);
35706 + if (likely(PTR_ERR(page) == -EINVAL)) {
35707 + /* looks like file is built of tail items */
35708 + return 0;
35709 + }
35710 + return PTR_ERR(page);
35711 + }
35712 + wait_on_page_locked(page);
35713 + if (!PageUptodate(page)) {
35714 + page_cache_release(page);
35715 + /*
35716 + * the below does up(sbinfo->delete_mutex). Do not get
35717 + * confused
35718 + */
35719 + reiser4_release_reserved(inode->i_sb);
35720 + return RETERR(-EIO);
35721 + }
35722 +
35723 + /*
35724 +	 * if the page corresponds to a hole extent unit - an unallocated one
35725 +	 * will be created here. This is not necessary
35726 + */
35727 + result = find_or_create_extent(page);
35728 +
35729 + /*
35730 + * FIXME: cut_file_items has already updated inode. Probably it would
35731 + * be better to update it here when file is really truncated
35732 + */
35733 + if (result) {
35734 + page_cache_release(page);
35735 + /*
35736 + * the below does up(sbinfo->delete_mutex). Do not get
35737 + * confused
35738 + */
35739 + reiser4_release_reserved(inode->i_sb);
35740 + return result;
35741 + }
35742 +
35743 + lock_page(page);
35744 + assert("vs-1066", PageLocked(page));
35745 + zero_user_page(page, padd_from, PAGE_CACHE_SIZE - padd_from, KM_USER0);
35746 + unlock_page(page);
35747 + page_cache_release(page);
35748 + /* the below does up(sbinfo->delete_mutex). Do not get confused */
35749 + reiser4_release_reserved(inode->i_sb);
35750 + return 0;
35751 +}
35752 +
35753 +/**
35754 + * should_have_notail
35755 + * @uf_info:
35756 + * @new_size:
35757 + *
35758 + * Calls formatting plugin to see whether file of size @new_size has to be
35759 + * stored in unformatted nodes or in tail items. 0 is returned in the latter case.
35760 + */
35761 +static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
35762 +{
35763 + if (!uf_info->tplug)
35764 + return 1;
35765 + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
35766 + new_size);
35767 +
35768 +}
35769 +
35770 +/**
35771 + * truncate_file_body - change length of file
35772 + * @inode: inode of file
35773 + * @new_size: new file length
35774 + *
35775 + * Adjusts items file @inode is built of to match @new_size. It may either cut
35776 + * items or add them to represent a hole at the end of file. The caller has to
35777 + * obtain exclusive access to the file.
35778 + */
35779 +static int truncate_file_body(struct inode *inode, loff_t new_size)
35780 +{
35781 + int result;
35782 +
35783 + if (inode->i_size < new_size) {
35784 + /* expanding truncate */
35785 + struct dentry dentry;
35786 + struct file file;
35787 + struct unix_file_info *uf_info;
35788 +
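+		/*
+		 * reiser4_write_extent()/reiser4_write_tail() below operate
+		 * on a struct file, so a minimal throwaway one is faked on
+		 * the stack for the expanding truncate
+		 */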
35789 + dentry.d_inode = inode;
35790 + file.f_dentry = &dentry;
35791 + file.private_data = NULL;
35792 + file.f_pos = new_size;
35794 + uf_info = unix_file_inode_data(inode);
35795 + result = find_file_state(inode, uf_info);
35796 + if (result)
35797 + return result;
35798 +
35799 + if (should_have_notail(uf_info, new_size)) {
35800 + /*
35801 + * file of size @new_size has to be built of
35802 + * extents. If it is built of tails - convert to
35803 + * extents
35804 + */
35805 + if (uf_info->container == UF_CONTAINER_TAILS) {
35806 + /*
35807 +				 * if file is being converted by another process
35808 + * - wait until it completes
35809 + */
35810 + while (1) {
35811 + if (reiser4_inode_get_flag(inode,
35812 + REISER4_PART_IN_CONV)) {
35813 + drop_exclusive_access(uf_info);
35814 + schedule();
35815 + get_exclusive_access(uf_info);
35816 + continue;
35817 + }
35818 + break;
35819 + }
35820 +
35821 + if (uf_info->container == UF_CONTAINER_TAILS) {
35822 + result = tail2extent(uf_info);
35823 + if (result)
35824 + return result;
35825 + }
35826 + }
35827 + result = reiser4_write_extent(&file, NULL, 0,
35828 + &new_size);
35829 + if (result)
35830 + return result;
35831 + uf_info->container = UF_CONTAINER_EXTENTS;
35832 + } else {
35833 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
35834 + result = reiser4_write_extent(&file, NULL, 0,
35835 + &new_size);
35836 + if (result)
35837 + return result;
35838 + } else {
35839 + result = reiser4_write_tail(&file, NULL, 0,
35840 + &new_size);
35841 + if (result)
35842 + return result;
35843 + uf_info->container = UF_CONTAINER_TAILS;
35844 + }
35845 + }
35846 + BUG_ON(result > 0);
35847 + INODE_SET_FIELD(inode, i_size, new_size);
35848 + file_update_time(&file);
35849 + result = reiser4_update_sd(inode);
35850 + BUG_ON(result != 0);
35851 + reiser4_free_file_fsdata(&file);
35852 + } else
35853 + result = shorten_file(inode, new_size);
35854 + return result;
35855 +}
35856 +
35857 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
35858 +
35859 +/**
35860 + * load_file_hint - copy hint from struct file to local variable
35861 + * @file: file to get hint from
35862 + * @hint: structure to fill
35863 + *
35864 + * Reiser4 specific portion of struct file may contain information (hint)
35865 + * stored on exiting from previous read or write. That information includes
35866 + * seal of znode and coord within that znode where previous read or write
35867 + * stopped. This function copies that information to @hint if it was stored or
35868 + * initializes @hint with zeros otherwise.
35869 + */
35870 +int load_file_hint(struct file *file, hint_t *hint)
35871 +{
35872 + reiser4_file_fsdata *fsdata;
35873 +
35874 + if (file) {
35875 + fsdata = reiser4_get_file_fsdata(file);
35876 + if (IS_ERR(fsdata))
35877 + return PTR_ERR(fsdata);
35878 +
35879 + spin_lock_inode(file->f_dentry->d_inode);
35880 + if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
35881 + *hint = fsdata->reg.hint;
35882 + init_lh(&hint->lh);
35883 + hint->ext_coord.lh = &hint->lh;
35884 + spin_unlock_inode(file->f_dentry->d_inode);
35885 + /*
35886 + * force re-validation of the coord on the first
35887 + * iteration of the read/write loop.
35888 + */
35889 + hint->ext_coord.valid = 0;
35890 + assert("nikita-19892", coords_equal(&hint->seal.coord1,
35891 + &hint->ext_coord.
35892 + coord));
35893 + return 0;
35894 + }
35895 + memset(&fsdata->reg.hint, 0, sizeof(hint_t));
35896 + spin_unlock_inode(file->f_dentry->d_inode);
35897 + }
35898 + hint_init_zero(hint);
35899 + return 0;
35900 +}
35901 +
35902 +/**
35903 + * save_file_hint - copy hint to the reiser4 private part of struct file
35904 + * @file: file to save hint in
35905 + * @hint: hint to save
35906 + *
35907 + * This copies @hint to the reiser4 private part of struct file. It can help
35908 + * speed up future accesses to the file.
35909 + */
35910 +void save_file_hint(struct file *file, const hint_t *hint)
35911 +{
35912 + reiser4_file_fsdata *fsdata;
35913 +
35914 + assert("edward-1337", hint != NULL);
35915 +
35916 + if (!file || !reiser4_seal_is_set(&hint->seal))
35917 + return;
35918 + fsdata = reiser4_get_file_fsdata(file);
35919 + assert("vs-965", !IS_ERR(fsdata));
35920 + assert("nikita-19891",
35921 + coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
35922 + assert("vs-30", hint->lh.owner == NULL);
35923 + spin_lock_inode(file->f_dentry->d_inode);
35924 + fsdata->reg.hint = *hint;
35925 + spin_unlock_inode(file->f_dentry->d_inode);
35926 + return;
35927 +}
35928 +
35929 +void reiser4_unset_hint(hint_t * hint)
35930 +{
35931 + assert("vs-1315", hint);
35932 + hint->ext_coord.valid = 0;
35933 + reiser4_seal_done(&hint->seal);
35934 + done_lh(&hint->lh);
35935 +}
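+
+/*
+ * A minimal usage sketch of the hint machinery (an illustration, not part
+ * of the original code): a read path loads the hint, seeds the tree lookup
+ * with it, and saves or unsets it depending on the outcome:
+ *
+ *	hint_t hint;
+ *
+ *	result = load_file_hint(file, &hint);
+ *	if (result)
+ *		return result;
+ *	result = find_file_item(&hint, &key, ZNODE_READ_LOCK, inode);
+ *	if (result == CBK_COORD_FOUND)
+ *		save_file_hint(file, &hint);
+ *	else
+ *		reiser4_unset_hint(&hint);
+ */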
35936 +
35937 +/* coord must already be set properly, so that reiser4_set_hint
35938 +   has nothing to do */
35939 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
35940 + znode_lock_mode mode)
35941 +{
35942 + ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
35943 + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
35944 +
35945 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
35946 + hint->offset = get_key_offset(key);
35947 + hint->mode = mode;
35948 + done_lh(&hint->lh);
35949 +}
35950 +
35951 +int hint_is_set(const hint_t * hint)
35952 +{
35953 + return reiser4_seal_is_set(&hint->seal);
35954 +}
35955 +
35956 +#if REISER4_DEBUG
35957 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
35958 +{
35959 + return (get_key_locality(k1) == get_key_locality(k2) &&
35960 + get_key_type(k1) == get_key_type(k2) &&
35961 + get_key_band(k1) == get_key_band(k2) &&
35962 + get_key_ordering(k1) == get_key_ordering(k2) &&
35963 + get_key_objectid(k1) == get_key_objectid(k2));
35964 +}
35965 +#endif
35966 +
35967 +static int
35968 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35969 + znode_lock_mode lock_mode)
35970 +{
35971 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
35972 + /* hint either not set or set by different operation */
35973 + return RETERR(-E_REPEAT);
35974 +
35975 + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
35976 +
35977 + if (check_key && get_key_offset(key) != hint->offset)
35978 + /* hint is set for different key */
35979 + return RETERR(-E_REPEAT);
35980 +
35981 + assert("vs-31", hint->ext_coord.lh == &hint->lh);
35982 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
35983 + hint->ext_coord.lh, lock_mode,
35984 + ZNODE_LOCK_LOPRI);
35985 +}
35986 +
35987 +/**
35988 + * find_or_create_extent - make sure an extent for a page exists
35989 + * @page: page to handle
35990 + *
35991 + * Looks for a place at the twig level for the extent corresponding to @page,
35992 + * calls extent's writepage method to create an unallocated extent if it does
35993 + * not exist yet, initializes jnode and captures the page.
35994 + */
35995 +int find_or_create_extent(struct page *page)
35996 +{
35997 + int result;
35998 + struct inode *inode;
35999 + int plugged_hole;
36000 +
36001 + jnode *node;
36002 +
36003 + assert("vs-1065", page->mapping && page->mapping->host);
36004 + inode = page->mapping->host;
36005 +
36006 + lock_page(page);
36007 + node = jnode_of_page(page);
36008 + if (IS_ERR(node)) {
36009 + unlock_page(page);
36010 + return PTR_ERR(node);
36011 + }
36012 + JF_SET(node, JNODE_WRITE_PREPARED);
36013 + unlock_page(page);
36014 +
36015 + if (node->blocknr == 0) {
36016 + plugged_hole = 0;
36017 + result = reiser4_update_extent(inode, node, page_offset(page),
36018 + &plugged_hole);
36019 + if (result) {
36020 + JF_CLR(node, JNODE_WRITE_PREPARED);
36021 + jput(node);
36022 + warning("", "reiser4_update_extent failed: %d", result);
36023 + return result;
36024 + }
36025 + if (plugged_hole)
36026 + reiser4_update_sd(inode);
36027 + } else {
36028 + spin_lock_jnode(node);
36029 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
36030 + BUG_ON(result != 0);
36031 + jnode_make_dirty_locked(node);
36032 + spin_unlock_jnode(node);
36033 + }
36034 +
36035 + BUG_ON(node->atom == NULL);
36036 + JF_CLR(node, JNODE_WRITE_PREPARED);
36037 + jput(node);
36038 +
36039 + if (get_current_context()->entd) {
36040 + entd_context *ent = get_entd_context(node->tree->super);
36041 +
36042 + if (ent->cur_request->page == page)
36043 + ent->cur_request->node = node;
36044 + }
36045 + return 0;
36046 +}
36047 +
36048 +/**
36049 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
36050 + * @inode: inode to check
36051 + *
36052 + * Returns true if inode's mapping has dirty pages which do not belong to any
36053 + * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
36054 + * tree or were eflushed and can be found via jnodes tagged
36055 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
36056 + */
36057 +static int has_anonymous_pages(struct inode *inode)
36058 +{
36059 + int result;
36060 +
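+
+	/* only the page tree tag is consulted here; eflushed jnodes are
+	   not checked */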
36061 + read_lock_irq(&inode->i_mapping->tree_lock);
36062 + result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36063 + read_unlock_irq(&inode->i_mapping->tree_lock);
36064 + return result;
36065 +}
36066 +
36067 +/**
36068 + * capture_page_and_create_extent -
36069 + * @page: page to be captured
36070 + *
36071 + * Grabs space for extent creation and stat data update and calls function to
36072 + * do actual work.
36073 + */
36074 +static int capture_page_and_create_extent(struct page *page)
36075 +{
36076 + int result;
36077 + struct inode *inode;
36078 +
36079 + assert("vs-1084", page->mapping && page->mapping->host);
36080 + inode = page->mapping->host;
36081 + assert("vs-1139",
36082 + unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36083 + /* page belongs to file */
36084 + assert("vs-1393",
36085 + inode->i_size > page_offset(page));
36086 +
36087 + /* page capture may require extent creation (if it does not exist yet)
36088 + and stat data's update (number of blocks changes on extent
36089 + creation) */
36090 + grab_space_enable();
36091 + result = reiser4_grab_space(2 * estimate_one_insert_into_item
36092 + (reiser4_tree_by_inode(inode)),
36093 + BA_CAN_COMMIT);
36094 + if (likely(!result))
36095 + result = find_or_create_extent(page);
36096 +
36097 + if (result != 0)
36098 + SetPageError(page);
36099 + return result;
36100 +}
36101 +
36102 +/* this is the implementation of the commit_write method of struct
36103 +   address_space_operations for the unix file plugin */
36104 +int
36105 +commit_write_unix_file(struct file *file, struct page *page,
36106 + unsigned from, unsigned to)
36107 +{
36108 + reiser4_context *ctx;
36109 + struct inode *inode;
36110 + int result;
36111 +
36112 + assert("umka-3101", file != NULL);
36113 + assert("umka-3102", page != NULL);
36114 + assert("umka-3093", PageLocked(page));
36115 +
36116 + SetPageUptodate(page);
36117 +
36118 + inode = page->mapping->host;
36119 + ctx = reiser4_init_context(page->mapping->host->i_sb);
36120 + if (IS_ERR(ctx))
36121 + return PTR_ERR(ctx);
36122 + page_cache_get(page);
36123 + unlock_page(page);
36124 + result = capture_page_and_create_extent(page);
36125 + lock_page(page);
36126 + page_cache_release(page);
36127 +
36128 + /* don't commit transaction under inode semaphore */
36129 + context_set_commit_async(ctx);
36130 + reiser4_exit_context(ctx);
36131 + return result;
36132 +}
36133 +
36134 +/*
36135 + * Support for "anonymous" pages and jnodes.
36136 + *
36137 + * When a file is write-accessed through mmap, pages can be dirtied from user
36138 + * level. In this case the kernel is not notified until one of the following happens:
36139 + *
36140 + * (1) msync()
36141 + *
36142 + * (2) truncate() (either explicit or through unlink)
36143 + *
36144 + * (3) VM scanner starts reclaiming mapped pages, dirtying them before
36145 + * starting write-back.
36146 + *
36147 + * As a result of (3) ->writepage may be called on a dirty page without
36148 + * jnode. Such a page is called "anonymous" in reiser4. Certain workloads
36149 + * (iozone) generate a huge number of anonymous pages. Emergency flush handles
36150 + * this situation by creating jnode for anonymous page, starting IO on the
36151 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
36152 + * memory. Such jnode is also called anonymous.
36153 + *
36154 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36155 + * tree. This is done by capture_anonymous_*() functions below.
36156 + */
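+
+/*
+ * For illustration only (a typical user-level sequence, assumed, not part
+ * of the kernel code; path and len are hypothetical): anonymous pages
+ * appear when a program dirties a shared mapping and the kernel hears
+ * about it only later:
+ *
+ *	int fd = open("/mnt/reiser4/f", O_RDWR);
+ *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ *	memset(p, 0xab, len);	  <- pages dirtied behind the kernel's back
+ *	msync(p, len, MS_SYNC);	  <- case (1): reiser4 must capture them now
+ */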
36157 +
36158 +/**
36159 + * capture_anonymous_page - involve page into transaction
36160 + * @page: page to deal with
36161 + *
36162 + * Takes care that @page has corresponding metadata in the tree, creates jnode
36163 + * for @page and captures it. On success 1 is returned.
36164 + */
36165 +static int capture_anonymous_page(struct page *page)
36166 +{
36167 + int result;
36168 +
36169 + if (PageWriteback(page))
36170 + /* FIXME: do nothing? */
36171 + return 0;
36172 +
36173 + result = capture_page_and_create_extent(page);
36174 + if (result == 0) {
36175 + result = 1;
36176 + } else
36177 + warning("nikita-3329",
36178 + "Cannot capture anon page: %i", result);
36179 +
36180 + return result;
36181 +}
36182 +
36183 +/**
36184 + * capture_anonymous_pages - find and capture pages dirtied via mmap
36185 + * @mapping: address space where to look for pages
36186 + * @index: start index
36187 + * @to_capture: maximum number of pages to capture
36188 + *
36189 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36190 + * captures (involves into atom) them, returns number of captured pages,
36191 + * updates @index to next page after the last captured one.
36192 + */
36193 +static int
36194 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36195 + unsigned int to_capture)
36196 +{
36197 + int result;
36198 + struct pagevec pvec;
36199 + unsigned int i, count;
36200 + int nr;
36201 +
36202 + pagevec_init(&pvec, 0);
36203 + count = min(pagevec_space(&pvec), to_capture);
36204 + nr = 0;
36205 +
36206 + /* find pages tagged MOVED */
36207 + write_lock_irq(&mapping->tree_lock);
36208 + pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36209 + (void **)pvec.pages, *index, count,
36210 + PAGECACHE_TAG_REISER4_MOVED);
36211 + if (pagevec_count(&pvec) == 0) {
36212 + /*
36213 + * there are no pages tagged MOVED in mapping->page_tree
36214 + * starting from *index
36215 + */
36216 + write_unlock_irq(&mapping->tree_lock);
36217 + *index = (pgoff_t)-1;
36218 + return 0;
36219 + }
36220 +
36221 + /* clear MOVED tag for all found pages */
36222 + for (i = 0; i < pagevec_count(&pvec); i++) {
36223 + void *p;
36224 +
36225 + page_cache_get(pvec.pages[i]);
36226 + p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36227 + PAGECACHE_TAG_REISER4_MOVED);
36228 + assert("vs-49", p == pvec.pages[i]);
36229 + }
36230 + write_unlock_irq(&mapping->tree_lock);
36231 +
36233 + *index = pvec.pages[i - 1]->index + 1;
36234 +
36235 + for (i = 0; i < pagevec_count(&pvec); i++) {
36236 + /*
36237 + * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36238 + * reiser4_set_page_dirty_internal which is called when jnode is
36239 + * captured
36240 + */
36241 + result = capture_anonymous_page(pvec.pages[i]);
36242 + if (result == 1)
36243 + nr++;
36244 + else {
36245 + if (result < 0) {
36246 + warning("vs-1454",
36247 + "failed to capture page: "
36248 + "result=%d, captured=%d\n",
36249 + result, i);
36250 +
36251 + /*
36252 + * set MOVED tag on all pages which were left
36253 + * uncaptured
36254 + */
36255 + write_lock_irq(&mapping->tree_lock);
36256 + for (; i < pagevec_count(&pvec); i++) {
36257 + radix_tree_tag_set(&mapping->page_tree,
36258 + pvec.pages[i]->index,
36259 + PAGECACHE_TAG_REISER4_MOVED);
36260 + }
36261 + write_unlock_irq(&mapping->tree_lock);
36262 +
36263 + pagevec_release(&pvec);
36264 + return result;
36265 + } else {
36266 + /*
36267 + * result == 0. capture_anonymous_page returns
36268 + * 0 for a page under writeback. Set MOVED tag on
36269 + * that page
36270 + */
36271 + write_lock_irq(&mapping->tree_lock);
36272 + radix_tree_tag_set(&mapping->page_tree,
36273 + pvec.pages[i]->index,
36274 + PAGECACHE_TAG_REISER4_MOVED);
36275 + write_unlock_irq(&mapping->tree_lock);
36276 + if (i == 0)
36277 + *index = pvec.pages[0]->index;
36278 + else
36279 + *index = pvec.pages[i - 1]->index + 1;
36280 + }
36281 + }
36282 + }
36283 + pagevec_release(&pvec);
36284 + return nr;
36285 +}
36286 +
36287 +/**
36288 + * capture_anonymous_jnodes - find and capture anonymous jnodes
36289 + * @mapping: address space where to look for jnodes
36290 + * @from: start index
36291 + * @to: end index
36292 + * @to_capture: maximum number of jnodes to capture
36293 + *
36294 + * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36295 + * the range of indexes @from-@to and captures them, returns number of captured
36296 + * jnodes, updates @from to next jnode after the last captured one.
36297 + */
36298 +static int
36299 +capture_anonymous_jnodes(struct address_space *mapping,
36300 + pgoff_t *from, pgoff_t to, int to_capture)
36301 +{
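+	/*
+	 * anonymous jnodes are not captured by this implementation: just
+	 * advance the index past the range and report zero captured
+	 */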
36302 + *from = to;
36303 + return 0;
36304 +}
36305 +
36306 +/*
36307 + * Commit atom of the jnode of a page.
36308 + */
36309 +static int sync_page(struct page *page)
36310 +{
36311 + int result;
36312 + do {
36313 + jnode *node;
36314 + txn_atom *atom;
36315 +
36316 + lock_page(page);
36317 + node = jprivate(page);
36318 + if (node != NULL) {
36319 + spin_lock_jnode(node);
36320 + atom = jnode_get_atom(node);
36321 + spin_unlock_jnode(node);
36322 + } else
36323 + atom = NULL;
36324 + unlock_page(page);
36325 + result = reiser4_sync_atom(atom);
36326 + } while (result == -E_REPEAT);
36327 + /*
36328 + * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36329 + * handle the case where more pages get added to the atom while we are
36330 + * syncing it?
36331 + */
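+	/*
+	 * Presumably reiser4_sync_atom() returns -E_REPEAT when the atom
+	 * fused with another one or otherwise changed before it could be
+	 * committed, so the loop re-reads the page's current atom and
+	 * retries.
+	 */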
36332 + assert("nikita-3485", ergo(result == 0,
36333 + get_current_context()->trans->atom == NULL));
36334 + return result;
36335 +}
36336 +
36337 +/*
36338 + * Commit atoms of all pages of the inode:
36339 + * call sync_page for each page found in mapping's page tree
36340 + */
36341 +static int sync_page_list(struct inode *inode)
36342 +{
36343 + int result;
36344 + struct address_space *mapping;
36345 + unsigned long from; /* start index for radix_tree_gang_lookup */
36346 + unsigned int found; /* return value for radix_tree_gang_lookup */
36347 +
36348 + mapping = inode->i_mapping;
36349 + from = 0;
36350 + result = 0;
36351 + read_lock_irq(&mapping->tree_lock);
36352 + while (result == 0) {
36353 + struct page *page;
36354 +
36355 + found =
36356 + radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36357 + from, 1);
36358 + assert("", found < 2);
36359 + if (found == 0)
36360 + break;
36361 +
36362 + /* page cannot leave the radix tree because it is protected from
36363 + truncation by inode->i_mutex taken by sys_fsync */
36364 + page_cache_get(page);
36365 + read_unlock_irq(&mapping->tree_lock);
36366 +
36367 + from = page->index + 1;
36368 +
36369 + result = sync_page(page);
36370 +
36371 + page_cache_release(page);
36372 + read_lock_irq(&mapping->tree_lock);
36373 + }
36374 +
36375 + read_unlock_irq(&mapping->tree_lock);
36376 + return result;
36377 +}
36378 +
36379 +static int commit_file_atoms(struct inode *inode)
36380 +{
36381 + int result;
36382 + struct unix_file_info *uf_info;
36383 +
36384 + uf_info = unix_file_inode_data(inode);
36385 +
36386 + get_exclusive_access(uf_info);
36387 + /*
36388 + * find what items file is made from
36389 + */
36390 + result = find_file_state(inode, uf_info);
36391 + drop_exclusive_access(uf_info);
36392 + if (result != 0)
36393 + return result;
36394 +
36395 + /*
36396 + * file state cannot change because we are under ->i_mutex
36397 + */
36398 + switch (uf_info->container) {
36399 + case UF_CONTAINER_EXTENTS:
36400 + /* find_file_state might open or join an atom */
36401 + reiser4_txn_restart_current();
36402 + result =
36403 + /*
36404 + * when we are called by
36405 + * filemap_fdatawrite->
36406 + * do_writepages()->
36407 + * reiser4_writepages()
36408 + *
36409 + * inode->i_mapping->dirty_pages are spliced into
36410 + * ->io_pages, leaving ->dirty_pages dirty.
36411 + *
36412 + * When we are called from
36413 + * reiser4_fsync()->sync_unix_file(), we have to
36414 + * commit atoms of all pages on the ->dirty_list.
36415 + *
36416 + * So for simplicity we just commit ->io_pages and
36417 + * ->dirty_pages.
36418 + */
36419 + sync_page_list(inode);
36420 + break;
36421 + case UF_CONTAINER_TAILS:
36422 + /*
36423 + * NOTE-NIKITA probably we can be smarter for tails. For now
36424 + * just commit all existing atoms.
36425 + */
36426 + result = txnmgr_force_commit_all(inode->i_sb, 0);
36427 + break;
36428 + case UF_CONTAINER_EMPTY:
36429 + result = 0;
36430 + break;
36431 + case UF_CONTAINER_UNKNOWN:
36432 + default:
36433 + result = -EIO;
36434 + break;
36435 + }
36436 +
36437 + /*
36438 + * commit current transaction: there can be captured nodes from
36439 + * find_file_state() and finish_conversion().
36440 + */
36441 + reiser4_txn_restart_current();
36442 + return result;
36443 +}
36444 +
36445 +/**
36446 + * writepages_unix_file - writepages of struct address_space_operations
36447 + * @mapping: address space to write pages of
36448 + * @wbc: writeback control
36449 + *
36450 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36451 + * pages which are dirtied via mmap. Anonymous jnodes are ones which were
36452 + * created by reiser4_writepage.
36453 + */
36454 +int writepages_unix_file(struct address_space *mapping,
36455 + struct writeback_control *wbc)
36456 +{
36457 + int result;
36458 + struct unix_file_info *uf_info;
36459 + pgoff_t pindex, jindex, nr_pages;
36460 + long to_capture;
36461 + struct inode *inode;
36462 +
36463 + inode = mapping->host;
36464 + if (!has_anonymous_pages(inode)) {
36465 + result = 0;
36466 + goto end;
36467 + }
36468 + jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
36469 + result = 0;
36470 + nr_pages = size_in_pages(i_size_read(inode));
36471 +
36472 + uf_info = unix_file_inode_data(inode);
36473 +
36474 + do {
36475 + reiser4_context *ctx;
36476 +
36477 + if (wbc->sync_mode != WB_SYNC_ALL)
36478 + to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36479 + else
36480 + to_capture = CAPTURE_APAGE_BURST;
36481 +
36482 + ctx = reiser4_init_context(inode->i_sb);
36483 + if (IS_ERR(ctx)) {
36484 + result = PTR_ERR(ctx);
36485 + break;
36486 + }
36487 + /* avoid recursive calls to ->sync_inodes */
36488 + ctx->nobalance = 1;
36489 + assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36490 + assert("", LOCK_CNT_NIL(inode_sem_w));
36491 + assert("", LOCK_CNT_NIL(inode_sem_r));
36492 +
36493 + reiser4_txn_restart_current();
36494 +
36495 + /* we have to get nonexclusive access to the file */
36496 + if (get_current_context()->entd) {
36497 + /*
36498 + * use nonblocking version of nonexclusive_access to
36499 + * avoid deadlock which might look like the following:
36500 + * process P1 holds NEA on file F1 and called entd to
36501 + * reclaim some memory. Entd works for P1 and is going
36502 + * to capture pages of file F2. To do that entd has to
36503 + * get NEA to F2. F2 is held by process P2 which also
36504 + * called entd. But entd is serving P1 at the moment
36505 + * and P2 has to wait. Process P3 is trying to get EA
36506 + * to file F2. The pending EA request to file F2 makes
36507 + * it impossible for entd to get NEA to file F2.
36508 + * Neither of these processes can continue. Using the
36509 + * nonblocking version of getting NEA is supposed to
36510 + * avoid this deadlock.
36511 + */
36512 + if (try_to_get_nonexclusive_access(uf_info) == 0) {
36513 + result = RETERR(-EBUSY);
36514 + reiser4_exit_context(ctx);
36515 + break;
36516 + }
36517 + } else
36518 + get_nonexclusive_access(uf_info);
36519 +
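+		/*
+		 * pindex tracks progress through anonymous pages, jindex
+		 * through anonymous jnodes; jindex trails pindex so that
+		 * jnodes with indices in [jindex, pindex) are captured
+		 * after the corresponding pages
+		 */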
36520 + while (to_capture > 0) {
36521 + pgoff_t start;
36522 +
36523 + assert("vs-1727", jindex <= pindex);
36524 + if (pindex == jindex) {
36525 + start = pindex;
36526 + result =
36527 + capture_anonymous_pages(inode->i_mapping,
36528 + &pindex,
36529 + to_capture);
36530 + if (result <= 0)
36531 + break;
36532 + to_capture -= result;
36533 + wbc->nr_to_write -= result;
36534 + if (start + result == pindex) {
36535 + jindex = pindex;
36536 + continue;
36537 + }
36538 + if (to_capture <= 0)
36539 + break;
36540 + }
36541 + /* deal with anonymous jnodes between jindex and pindex */
36542 + result =
36543 + capture_anonymous_jnodes(inode->i_mapping, &jindex,
36544 + pindex, to_capture);
36545 + if (result < 0)
36546 + break;
36547 + to_capture -= result;
36548 + get_current_context()->nr_captured += result;
36549 +
36550 + if (jindex == (pgoff_t)-1) {
36551 + assert("vs-1728", pindex == (pgoff_t)-1);
36552 + break;
36553 + }
36554 + }
36555 + if (to_capture <= 0)
36556 + /* there may be more pages left */
36557 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
36558 +
36559 + drop_nonexclusive_access(uf_info);
36560 + if (result < 0) {
36561 + /* error happened */
36562 + reiser4_exit_context(ctx);
36563 + return result;
36564 + }
36565 + if (wbc->sync_mode != WB_SYNC_ALL) {
36566 + reiser4_exit_context(ctx);
36567 + return 0;
36568 + }
36569 + result = commit_file_atoms(inode);
36570 + reiser4_exit_context(ctx);
36571 + if (pindex >= nr_pages && jindex == pindex)
36572 + break;
36573 + } while (1);
36574 +
36575 + end:
36576 + if (is_in_reiser4_context()) {
36577 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
36578 + /*
36579 + * there are already pages to flush, flush them out, do
36580 + * not delay until end of reiser4_sync_inodes
36581 + */
36582 + reiser4_writeout(inode->i_sb, wbc);
36583 + get_current_context()->nr_captured = 0;
36584 + }
36585 + }
36586 + return result;
36587 +}
36588 +
36589 +/*
36590 + * ->sync() method for unix file.
36591 + *
36592 + * We are trying to be smart here. Instead of committing all atoms (original
36593 + * solution), we scan dirty pages of this file and commit all atoms they are
36594 + * part of.
36595 + *
36596 + * Situation is complicated by anonymous pages: i.e., extent-less pages
36597 + * dirtied through mmap. Fortunately sys_fsync() first calls
36598 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
36599 + * all missing extents and capture anonymous pages.
36600 + */
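+
+/*
+ * Assumed call chain on fsync(2) of a reiser4 file, sketched for
+ * illustration:
+ *
+ *	sys_fsync()
+ *	  filemap_fdatawrite()
+ *	    writepages_unix_file()   (captures anonymous pages, inserts
+ *	                              missing extents)
+ *	  sync_unix_file()           (updates stat data, forces the current
+ *	                              atom to commit)
+ */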
36601 +int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
36602 +{
36603 + reiser4_context *ctx;
36604 + txn_atom *atom;
36605 + reiser4_block_nr reserve;
36606 +
36607 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
36608 + if (IS_ERR(ctx))
36609 + return PTR_ERR(ctx);
36610 +
36611 + reserve = estimate_update_common(dentry->d_inode);
36612 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
36613 + reiser4_exit_context(ctx);
36614 + return RETERR(-ENOSPC);
36615 + }
36616 + write_sd_by_inode_common(dentry->d_inode);
36617 +
36618 + atom = get_current_atom_locked();
36619 + spin_lock_txnh(ctx->trans);
36620 + force_commit_atom(ctx->trans);
36621 + reiser4_exit_context(ctx);
36622 + return 0;
36623 +}
36624 +
36625 +/**
36626 + * readpage_unix_file - readpage of struct address_space_operations
36627 + * @file: file to read
36628 + * @page: page to read data into
36629 + *
36630 + * Compose a key and search for item containing information about @page
36631 + * data. If item is found - its readpage method is called.
36632 + */
36633 +int readpage_unix_file(struct file *file, struct page *page)
36634 +{
36635 + reiser4_context *ctx;
36636 + int result;
36637 + struct inode *inode;
36638 + reiser4_key key;
36639 + item_plugin *iplug;
36640 + hint_t *hint;
36641 + lock_handle *lh;
36642 + coord_t *coord;
36643 +
36644 + assert("vs-1062", PageLocked(page));
36645 + assert("vs-976", !PageUptodate(page));
36646 + assert("vs-1061", page->mapping && page->mapping->host);
36647 +
36648 + if (page->mapping->host->i_size <= page_offset(page)) {
36649 + /* page is out of file already */
36650 + unlock_page(page);
36651 + return -EINVAL;
36652 + }
36653 +
36654 + inode = page->mapping->host;
36655 + ctx = reiser4_init_context(inode->i_sb);
36656 + if (IS_ERR(ctx)) {
36657 + unlock_page(page);
36658 + return PTR_ERR(ctx);
36659 + }
36660 +
36661 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36662 + if (hint == NULL) {
36663 + unlock_page(page);
36664 + reiser4_exit_context(ctx);
36665 + return RETERR(-ENOMEM);
36666 + }
36667 +
36668 + result = load_file_hint(file, hint);
36669 + if (result) {
36670 + kfree(hint);
36671 + unlock_page(page);
36672 + reiser4_exit_context(ctx);
36673 + return result;
36674 + }
36675 + lh = &hint->lh;
36676 +
36677 + /* get key of first byte of the page */
36678 + key_by_inode_and_offset_common(inode, page_offset(page), &key);
36679 +
36680 + /* look for file metadata corresponding to first byte of page */
36681 + page_cache_get(page);
36682 + unlock_page(page);
36683 + result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
36684 + lock_page(page);
36685 + page_cache_release(page);
36686 +
36687 + if (page->mapping == NULL) {
36688 + /*
36689 + * readpage allows truncate to run concurrently. Page was
36690 + * truncated while it was not locked
36691 + */
36692 + done_lh(lh);
36693 + kfree(hint);
36694 + unlock_page(page);
36695 + reiser4_txn_restart(ctx);
36696 + reiser4_exit_context(ctx);
36697 + return -EINVAL;
36698 + }
36699 +
36700 + if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
36701 + if (result == CBK_COORD_FOUND &&
36702 + hint->ext_coord.coord.between != AT_UNIT)
36703 + /* file is truncated */
36704 + result = -EINVAL;
36705 + done_lh(lh);
36706 + kfree(hint);
36707 + unlock_page(page);
36708 + reiser4_txn_restart(ctx);
36709 + reiser4_exit_context(ctx);
36710 + return result;
36711 + }
36712 +
36713 + /*
36714 + * item corresponding to page is found. It can not be removed because
36715 + * znode lock is held
36716 + */
36717 + if (PageUptodate(page)) {
36718 + done_lh(lh);
36719 + kfree(hint);
36720 + unlock_page(page);
36721 + reiser4_txn_restart(ctx);
36722 + reiser4_exit_context(ctx);
36723 + return 0;
36724 + }
36725 +
36726 + coord = &hint->ext_coord.coord;
36727 + result = zload(coord->node);
36728 + if (result) {
36729 + done_lh(lh);
36730 + kfree(hint);
36731 + unlock_page(page);
36732 + reiser4_txn_restart(ctx);
36733 + reiser4_exit_context(ctx);
36734 + return result;
36735 + }
36736 +
36737 + validate_extended_coord(&hint->ext_coord, page_offset(page));
36738 +
36739 + if (!coord_is_existing_unit(coord)) {
36740 + /* this indicates corruption */
36741 + warning("vs-280",
36742 + "Looking for page %lu of file %llu (size %lli). "
36743 + "No file items found (%d). File is corrupted?\n",
36744 + page->index, (unsigned long long)get_inode_oid(inode),
36745 + inode->i_size, result);
36746 + zrelse(coord->node);
36747 + done_lh(lh);
36748 + kfree(hint);
36749 + unlock_page(page);
36750 + reiser4_txn_restart(ctx);
36751 + reiser4_exit_context(ctx);
36752 + return RETERR(-EIO);
36753 + }
36754 +
36755 + /*
36756 + * get plugin of the found item and use its readpage method,
36757 + * if it has one
36758 + */
36759 + iplug = item_plugin_by_coord(coord);
36760 + if (iplug->s.file.readpage)
36761 + result = iplug->s.file.readpage(coord, page);
36762 + else
36763 + result = RETERR(-EINVAL);
36764 +
36765 + if (!result) {
36766 + set_key_offset(&key,
36767 + (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
36768 + /* FIXME should call reiser4_set_hint() */
36769 + reiser4_unset_hint(hint);
36770 + } else {
36771 + unlock_page(page);
36772 + reiser4_unset_hint(hint);
36773 + }
36774 + assert("vs-979",
36775 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
36776 + assert("vs-9791", ergo(result != 0, !PageLocked(page)));
36777 +
36778 + zrelse(coord->node);
36779 + done_lh(lh);
36780 +
36781 + save_file_hint(file, hint);
36782 + kfree(hint);
36783 +
36784 + /*
36785 + * FIXME: explain why it is needed. HINT: page allocation in write can
36786 + * not be done when atom is not NULL because reiser4_writepage can not
36787 + * kick entd and has to eflush
36788 + */
36789 + reiser4_txn_restart(ctx);
36790 + reiser4_exit_context(ctx);
36791 + return result;
36792 +}
36793 +
36794 +struct uf_readpages_context {
36795 + lock_handle lh;
36796 + coord_t coord;
36797 +};
36798 +
36799 +/* A callback function for readpages_unix_file/read_cache_pages.
36800 + * If the file is built of tails, then an error is returned.
36801 + *
36802 + * @data -- a pointer to reiser4_readpages_context object,
36803 + * to save the twig lock and the coord between
36804 + * read_cache_page iterations.
36805 + * @page -- page to start read.
36806 + */
36807 +static int uf_readpages_filler(void * data, struct page * page)
36808 +{
36809 + struct uf_readpages_context *rc = data;
36810 + jnode * node;
36811 + int ret = 0;
36812 + reiser4_extent *ext;
36813 + __u64 ext_index;
36814 + int cbk_done = 0;
36815 + struct address_space * mapping = page->mapping;
36816 +
36817 + if (PageUptodate(page)) {
36818 + unlock_page(page);
36819 + return 0;
36820 + }
36821 + page_cache_get(page);
36822 +
36823 + if (rc->lh.node == 0) {
36824 + /* no twig lock - have to do tree search. */
36825 + reiser4_key key;
36826 + repeat:
36827 + unlock_page(page);
36828 + key_by_inode_and_offset_common(
36829 + mapping->host, page_offset(page), &key);
36830 + ret = coord_by_key(
36831 + &get_super_private(mapping->host->i_sb)->tree,
36832 + &key, &rc->coord, &rc->lh,
36833 + ZNODE_READ_LOCK, FIND_EXACT,
36834 + TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
36835 + if (unlikely(ret))
36836 + goto exit;
36837 + lock_page(page);
36838 + cbk_done = 1;
36839 + }
36840 + ret = zload(rc->coord.node);
36841 + if (unlikely(ret))
36842 + goto unlock;
36843 + if (!coord_is_existing_item(&rc->coord) ||
36844 + !item_is_extent(&rc->coord)) {
36845 + zrelse(rc->coord.node);
36846 + ret = RETERR(-EIO);
36847 + goto unlock;
36848 + }
36849 + ext = extent_by_coord(&rc->coord);
36850 + ext_index = extent_unit_index(&rc->coord);
36851 + if (page->index < ext_index ||
36852 + page->index >= ext_index + extent_get_width(ext)) {
36853 + /* the page index doesn't belong to the extent unit
36854 + which the coord points to - release the lock and
36855 + repeat with tree search. */
36856 + zrelse(rc->coord.node);
36857 + done_lh(&rc->lh);
36858 + /* we can be here after a CBK call only in case of
36859 + corruption of the tree or a bug in the tree lookup algorithm. */
36860 + if (unlikely(cbk_done)) {
36861 + ret = RETERR(-EIO);
36862 + goto unlock;
36863 + }
36864 + goto repeat;
36865 + }
36866 + node = jnode_of_page(page);
36867 + if (unlikely(IS_ERR(node))) {
36868 + zrelse(rc->coord.node);
36869 + ret = PTR_ERR(node);
36870 + goto unlock;
36871 + }
36872 + ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
36873 + jput(node);
36874 + zrelse(rc->coord.node);
36875 + if (likely(!ret))
36876 + goto exit;
36877 + unlock:
36878 + unlock_page(page);
36879 + exit:
36880 + page_cache_release(page);
36881 + return ret;
36882 +}
36883 +
36884 +/**
36885 + * readpages_unix_file - called by the readahead code, starts reading for each
36886 + * page of given list of pages
36887 + */
36888 +int readpages_unix_file(
36889 + struct file *file, struct address_space *mapping,
36890 + struct list_head *pages, unsigned nr_pages)
36891 +{
36892 + reiser4_context *ctx;
36893 + struct uf_readpages_context rc;
36894 + int ret;
36895 +
36896 + ctx = reiser4_init_context(mapping->host->i_sb);
36897 + if (IS_ERR(ctx)) {
36898 + put_pages_list(pages);
36899 + return PTR_ERR(ctx);
36900 + }
36901 + init_lh(&rc.lh);
36902 + ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
36903 + done_lh(&rc.lh);
36904 + context_set_commit_async(ctx);
36905 + /* close the transaction to protect further page allocation from deadlocks */
36906 + reiser4_txn_restart(ctx);
36907 + reiser4_exit_context(ctx);
36908 + return ret;
36909 +}
36910 +
36911 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
36912 + loff_t count UNUSED_ARG)
36913 +{
36914 + /* We should reserve one block for the update of the stat data
36915 + item */
36916 + assert("vs-1249",
36917 + inode_file_plugin(inode)->estimate.update ==
36918 + estimate_update_common);
36919 + return estimate_update_common(inode);
36920 +}
36921 +
36922 +/* this is called with nonexclusive access obtained, file's container can not change */
36923 +static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from */
36924 + char __user *buf, /* address of user-space buffer */
36925 + size_t count, /* number of bytes to read */
36926 + loff_t *off)
36927 +{
36928 + int result;
36929 + struct inode *inode;
36930 + flow_t flow;
36931 + int (*read_f) (struct file *, flow_t *, hint_t *);
36932 + coord_t *coord;
36933 + znode *loaded;
36934 +
36935 + inode = file->f_dentry->d_inode;
36936 +
36937 + /* build flow */
36938 + assert("vs-1250",
36939 + inode_file_plugin(inode)->flow_by_inode ==
36940 + flow_by_inode_unix_file);
36941 + result =
36942 + flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
36943 + *off, READ_OP, &flow);
36944 + if (unlikely(result))
36945 + return result;
36946 +
36947 + /* get seal and coord sealed with it from reiser4 private data
36948 + of struct file. The coord will tell us where our last read
36949 + of this file finished, and the seal will help to determine
36950 + if that location is still valid.
36951 + */
36952 + coord = &hint->ext_coord.coord;
36953 + while (flow.length && result == 0) {
36954 + result =
36955 + find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
36956 + if (cbk_errored(result))
36957 + /* error happened */
36958 + break;
36959 +
36960 + if (coord->between != AT_UNIT) {
36961 + /* there were no items corresponding to given offset */
36962 + done_lh(hint->ext_coord.lh);
36963 + break;
36964 + }
36965 +
36966 + loaded = coord->node;
36967 + result = zload(loaded);
36968 + if (unlikely(result)) {
36969 + done_lh(hint->ext_coord.lh);
36970 + break;
36971 + }
36972 +
36973 + if (hint->ext_coord.valid == 0)
36974 + validate_extended_coord(&hint->ext_coord,
36975 + get_key_offset(&flow.key));
36976 +
36977 + assert("vs-4", hint->ext_coord.valid == 1);
36978 + assert("vs-33", hint->ext_coord.lh == &hint->lh);
36979 + /* call item's read method */
36980 + read_f = item_plugin_by_coord(coord)->s.file.read;
36981 + result = read_f(file, &flow, hint);
36982 + zrelse(loaded);
36983 + done_lh(hint->ext_coord.lh);
36984 + }
36985 +
36986 + return (count - flow.length) ? (count - flow.length) : result;
36987 +}
36988 +
36989 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
36990 +
36991 +/**
36992 + * read_unix_file - read of struct file_operations
36993 + * @file: file to read from
36994 + * @buf: address of user-space buffer
36995 + * @read_amount: number of bytes to read
36996 + * @off: position in file to read from
36997 + *
36998 + * This is implementation of vfs's read method of struct file_operations for
36999 + * unix file plugin.
37000 + */
37001 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37002 + loff_t *off)
37003 +{
37004 + reiser4_context *ctx;
37005 + ssize_t result;
37006 + struct inode *inode;
37007 + struct unix_file_info *uf_info;
37008 +
37009 + if (unlikely(read_amount == 0))
37010 + return 0;
37011 +
37012 + assert("umka-072", file != NULL);
37013 + assert("umka-074", off != NULL);
37014 + inode = file->f_dentry->d_inode;
37015 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37016 +
37017 + ctx = reiser4_init_context(inode->i_sb);
37018 + if (IS_ERR(ctx))
37019 + return PTR_ERR(ctx);
37020 + uf_info = unix_file_inode_data(inode);
37021 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37022 + get_exclusive_access(uf_info);
37023 + result = find_file_state(inode, uf_info);
37024 + if (unlikely(result != 0))
37025 + goto out;
37026 + } else
37027 + get_nonexclusive_access(uf_info);
37028 + result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
37029 + BA_CAN_COMMIT);
37030 + if (unlikely(result != 0))
37031 + goto out;
37032 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
37033 + result = do_sync_read(file, buf, read_amount, off);
37034 + } else if (uf_info->container == UF_CONTAINER_TAILS ||
37035 + reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
37036 + reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37037 + result = read_unix_file_container_tails(file, buf, read_amount, off);
37038 + } else {
37039 + assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
37040 + result = 0;
37041 + }
37042 +out:
37043 + drop_access(uf_info);
37044 + context_set_commit_async(ctx);
37045 + reiser4_exit_context(ctx);
37046 + return result;
37047 +}
37048 +
37049 +static ssize_t read_unix_file_container_tails(
37050 + struct file *file, char __user *buf, size_t read_amount, loff_t *off)
37051 +{
37052 + int result;
37053 + struct inode *inode;
37054 + hint_t *hint;
37055 + struct unix_file_info *uf_info;
37056 + size_t count, left;
37057 + ssize_t read; /* signed: read_file() may return a negative error */
37057 + loff_t size;
37058 +
37059 + assert("umka-072", file != NULL);
37060 + assert("umka-074", off != NULL);
37061 + inode = file->f_dentry->d_inode;
37062 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37063 +
37064 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
37065 + if (hint == NULL)
37066 + return RETERR(-ENOMEM);
37067 +
37068 + result = load_file_hint(file, hint);
37069 + if (result) {
37070 + kfree(hint);
37071 + return result;
37072 + }
37073 +
37074 + left = read_amount;
37075 + count = 0;
37076 + uf_info = unix_file_inode_data(inode);
37077 + while (left > 0) {
37078 + reiser4_txn_restart_current();
37079 + size = i_size_read(inode);
37080 + if (*off >= size)
37081 + /* position to read from is past the end of file */
37082 + break;
37083 + if (*off + left > size)
37084 + left = size - *off;
37085 + /* faultin user page */
37086 + result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
37087 + if (result)
37088 + return RETERR(-EFAULT);
37089 +
37090 + read = read_file(hint, file, buf,
37091 + left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37092 + off);
37093 + if (read < 0) {
37094 + result = read;
37095 + break;
37096 + }
37097 + left -= read;
37098 + buf += read;
37099 +
37100 + /* update position in a file */
37101 + *off += read;
37102 + /* total number of read bytes */
37103 + count += read;
37104 + }
37105 + done_lh(&hint->lh);
37106 + save_file_hint(file, hint);
37107 + kfree(hint);
37108 + if (count)
37109 + file_accessed(file);
37110 + /* return number of read bytes or error code if nothing is read */
37111 + return count ? count : result;
37112 +}
37113 +
37114 +/* This function takes care of @file's pages. First of all it checks if
37115 +   the filesystem is readonly and if so gets out. Otherwise, it throws out
37116 +   all pages of a file that was mapped for read, is going to be mapped for
37117 +   write and consists of tails. This is done in order not to maintain two
37118 +   copies of the data (one in the page cache and one in the tails
37119 +   themselves) when mapping files consisting of tails.
37120 +
37121 +   Here tail2extent conversion is also performed if it is allowed and the
37122 +   file is going to be written to or mapped for write. This function may be
37123 +   called from write_unix_file() or mmap_unix_file(). */
37124 +static int check_pages_unix_file(struct file *file, struct inode *inode)
37125 +{
37126 + reiser4_invalidate_pages(inode->i_mapping, 0,
37127 + (inode->i_size + PAGE_CACHE_SIZE -
37128 + 1) >> PAGE_CACHE_SHIFT, 0);
37129 + return unpack(file, inode, 0 /* not forever */ );
37130 +}
37131 +
37132 +/**
37133 + * mmap_unix_file - mmap of struct file_operations
37134 + * @file: file to mmap
37135 + * @vma: virtual memory area
37136 + *
37137 + * This is implementation of vfs's mmap method of struct file_operations for
37138 + * unix file plugin. It converts the file to extents if necessary. Sets
37139 + * reiser4_inode's flag - REISER4_HAS_MMAP.
37140 + */
37141 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37142 +{
37143 + reiser4_context *ctx;
37144 + int result;
37145 + struct inode *inode;
37146 + struct unix_file_info *uf_info;
37147 + reiser4_block_nr needed;
37148 +
37149 + inode = file->f_dentry->d_inode;
37150 + ctx = reiser4_init_context(inode->i_sb);
37151 + if (IS_ERR(ctx))
37152 + return PTR_ERR(ctx);
37153 +
37154 + uf_info = unix_file_inode_data(inode);
37155 +
37156 + get_exclusive_access_careful(uf_info, inode);
37157 +
37158 + if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37159 + /*
37160 + * we need file built of extent items. If it is still built of
37161 + * tail items we have to convert it. Find what items the file
37162 + * is built of
37163 + */
37164 + result = find_file_state(inode, uf_info);
37165 + if (result != 0) {
37166 + drop_exclusive_access(uf_info);
37167 + reiser4_exit_context(ctx);
37168 + return result;
37169 + }
37170 +
37171 + assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37172 + uf_info->container == UF_CONTAINER_EXTENTS ||
37173 + uf_info->container == UF_CONTAINER_EMPTY));
37174 + if (uf_info->container == UF_CONTAINER_TAILS) {
37175 + /*
37176 + * invalidate all pages and convert file from tails to
37177 + * extents
37178 + */
37179 + result = check_pages_unix_file(file, inode);
37180 + if (result) {
37181 + drop_exclusive_access(uf_info);
37182 + reiser4_exit_context(ctx);
37183 + return result;
37184 + }
37185 + }
37186 + }
37187 +
37188 + /*
37189 + * generic_file_mmap will do update_atime. Grab space for stat data
37190 + * update.
37191 + */
37192 + needed = inode_file_plugin(inode)->estimate.update(inode);
37193 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37194 + if (result) {
37195 + drop_exclusive_access(uf_info);
37196 + reiser4_exit_context(ctx);
37197 + return result;
37198 + }
37199 +
37200 + result = generic_file_mmap(file, vma);
37201 + if (result == 0) {
37202 + /* mark file as having mapping. */
37203 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37204 + }
37205 +
37206 + drop_exclusive_access(uf_info);
37207 + reiser4_exit_context(ctx);
37208 + return result;
37209 +}
37210 +
37211 +/**
37212 + * find_first_item
37213 + * @inode: inode of the file
37214 + *
37215 + * Finds the file item which is responsible for the first byte in the file.
37216 + */
37217 +static int find_first_item(struct inode *inode)
37218 +{
37219 + coord_t coord;
37220 + lock_handle lh;
37221 + reiser4_key key;
37222 + int result;
37223 +
37224 + coord_init_zero(&coord);
37225 + init_lh(&lh);
37226 + inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37227 + result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37228 + inode);
37229 + if (result == CBK_COORD_FOUND) {
37230 + if (coord.between == AT_UNIT) {
37231 + result = zload(coord.node);
37232 + if (result == 0) {
37233 + result = item_id_by_coord(&coord);
37234 + zrelse(coord.node);
37235 + if (result != EXTENT_POINTER_ID &&
37236 + result != FORMATTING_ID)
37237 + result = RETERR(-EIO);
37238 + }
37239 + } else
37240 + result = RETERR(-EIO);
37241 + }
37242 + done_lh(&lh);
37243 + return result;
37244 +}
37245 +
37246 +/**
37247 + * open_unix_file
37248 + * @inode: inode of the file being opened
37249 + * @file: file to open
37250 + *
37251 + * If the filesystem is not readonly - complete an interrupted tail
37252 + * conversion if there was one
37253 + */
37254 +int open_unix_file(struct inode *inode, struct file *file)
37255 +{
37256 + int result;
37257 + reiser4_context *ctx;
37258 + struct unix_file_info *uf_info;
37259 +
37260 + if (IS_RDONLY(inode))
37261 + return 0;
37262 +
37263 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
37264 + return 0;
37265 +
37266 + ctx = reiser4_init_context(inode->i_sb);
37267 + if (IS_ERR(ctx))
37268 + return PTR_ERR(ctx);
37269 +
37270 + uf_info = unix_file_inode_data(inode);
37271 +
37272 + get_exclusive_access_careful(uf_info, inode);
37273 +
37274 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37275 + /*
37276 + * other process completed the conversion
37277 + */
37278 + drop_exclusive_access(uf_info);
37279 + reiser4_exit_context(ctx);
37280 + return 0;
37281 + }
37282 +
37283 + /*
37284 +	 * the file was left in a semi-converted state after an unclean shutdown,
37285 +	 * or another thread doing the conversion dropped exclusive access while
37286 +	 * balancing dirty pages. Complete the conversion
37287 + */
37288 + result = find_first_item(inode);
37289 + if (result == EXTENT_POINTER_ID)
37290 + /*
37291 + * first item is extent, therefore there was incomplete
37292 + * tail2extent conversion. Complete it
37293 + */
37294 + result = tail2extent(unix_file_inode_data(inode));
37295 + else if (result == FORMATTING_ID)
37296 + /*
37297 + * first item is formatting item, therefore there was
37298 + * incomplete extent2tail conversion. Complete it
37299 + */
37300 + result = extent2tail(unix_file_inode_data(inode));
37301 + else
37302 + result = -EIO;
37303 +
37304 + assert("vs-1712",
37305 + ergo(result == 0,
37306 + (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
37307 + !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
37308 + drop_exclusive_access(uf_info);
37309 + reiser4_exit_context(ctx);
37310 + return result;
37311 +}
37312 +
37313 +#define NEITHER_OBTAINED 0
37314 +#define EA_OBTAINED 1
37315 +#define NEA_OBTAINED 2
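+
+/* EA stands for exclusive access, NEA for nonexclusive access; these flags
+   record which kind of access write_unix_file currently holds */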
37316 +
37317 +static void drop_access(struct unix_file_info *uf_info)
37318 +{
37319 + if (uf_info->exclusive_use)
37320 + drop_exclusive_access(uf_info);
37321 + else
37322 + drop_nonexclusive_access(uf_info);
37323 +}
37324 +
37325 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37326 + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37327 +
37328 +/**
37329 + * write_unix_file - write of struct file_operations
37330 + * @file: file to write to
37331 + * @buf: address of user-space buffer
37332 + * @count: number of bytes to write
37333 + * @pos: position in file to write to
37334 + *
37335 + * This is implementation of vfs's write method of struct file_operations for
37336 + * unix file plugin.
37337 + */
37338 +ssize_t write_unix_file(struct file *file, const char __user *buf,
37339 + size_t count, loff_t *pos)
37340 +{
37341 + int result;
37342 + reiser4_context *ctx;
37343 + struct inode *inode;
37344 + struct unix_file_info *uf_info;
37345 + ssize_t written;
37346 + int try_free_space;
37347 + int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37348 + size_t left;
37349 + ssize_t (*write_op)(struct file *, const char __user *, size_t,
37350 + loff_t *pos);
37351 + int ea;
37352 + loff_t new_size;
37353 +
37354 + inode = file->f_dentry->d_inode;
37355 + ctx = reiser4_init_context(inode->i_sb);
37356 + if (IS_ERR(ctx))
37357 + return PTR_ERR(ctx);
37358 +
37359 + mutex_lock(&inode->i_mutex);
37360 +
37361 + assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37362 + assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
37363 +
37364 + /* check amount of bytes to write and writing position */
37365 + result = generic_write_checks(file, pos, &count, 0);
37366 + if (result) {
37367 + mutex_unlock(&inode->i_mutex);
37368 + context_set_commit_async(ctx);
37369 + reiser4_exit_context(ctx);
37370 + return result;
37371 + }
37372 +
37373 + result = remove_suid(file->f_dentry);
37374 + if (result) {
37375 + mutex_unlock(&inode->i_mutex);
37376 + context_set_commit_async(ctx);
37377 + reiser4_exit_context(ctx);
37378 + return result;
37379 + }
37380 + /* remove_suid might create a transaction */
37381 + reiser4_txn_restart(ctx);
37382 +
37383 + uf_info = unix_file_inode_data(inode);
37384 +
37385 + current->backing_dev_info = inode->i_mapping->backing_dev_info;
37386 + written = 0;
37387 + try_free_space = 0;
37388 + left = count;
37389 + ea = NEITHER_OBTAINED;
37390 +
37391 + new_size = i_size_read(inode);
37392 + if (*pos + count > new_size)
37393 + new_size = *pos + count;
37394 +
37395 + while (left) {
37396 + if (left < to_write)
37397 + to_write = left;
37398 +
37399 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37400 + get_exclusive_access(uf_info);
37401 + ea = EA_OBTAINED;
37402 + if (uf_info->container != UF_CONTAINER_EMPTY) {
37403 + /* file was made non-empty by another process */
37404 + drop_exclusive_access(uf_info);
37405 + ea = NEITHER_OBTAINED;
37406 + continue;
37407 + }
37408 + } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37409 + /*
37410 + * get exclusive access directly just to not have to
37411 + * re-obtain it if the file turns out to be empty
37412 + */
37413 + get_exclusive_access(uf_info);
37414 + ea = EA_OBTAINED;
37415 + result = find_file_state(inode, uf_info);
37416 + if (result) {
37417 + drop_exclusive_access(uf_info);
37418 + ea = NEITHER_OBTAINED;
37419 + break;
37420 + }
37421 + } else {
37422 + get_nonexclusive_access(uf_info);
37423 + ea = NEA_OBTAINED;
37424 + }
37425 +
37426 + /* either EA or NEA is obtained. Choose item write method */
37427 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
37428 + /* file is built of extent items */
37429 + write_op = reiser4_write_extent;
37430 + } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37431 + /* file is empty */
37432 + if (should_have_notail(uf_info, new_size))
37433 + write_op = reiser4_write_extent;
37434 + else
37435 + write_op = reiser4_write_tail;
37436 + } else {
37437 + /* file is built of tail items */
37438 + if (should_have_notail(uf_info, new_size)) {
37439 + if (ea == NEA_OBTAINED) {
37440 + drop_nonexclusive_access(uf_info);
37441 + get_exclusive_access(uf_info);
37442 + ea = EA_OBTAINED;
37443 + }
37444 + if (uf_info->container == UF_CONTAINER_TAILS) {
37445 + /*
37446 + * if file is being converted by another
37447 + * process - wait until it completes
37448 + */
37449 + while (1) {
37450 + if (reiser4_inode_get_flag(inode,
37451 + REISER4_PART_IN_CONV)) {
37452 + drop_exclusive_access(uf_info);
37453 + schedule();
37454 + get_exclusive_access(uf_info);
37455 + continue;
37456 + }
37457 + break;
37458 + }
37459 + if (uf_info->container == UF_CONTAINER_TAILS) {
37460 + result = tail2extent(uf_info);
37461 + if (result)
37462 + break;
37463 + }
37464 + }
37465 + drop_exclusive_access(uf_info);
37466 + ea = NEITHER_OBTAINED;
37467 + continue;
37468 + }
37469 + write_op = reiser4_write_tail;
37470 + }
37471 +
37472 + written = write_op(file, buf, to_write, pos);
37473 + if (written == -ENOSPC && try_free_space) {
37474 + drop_access(uf_info);
37475 + txnmgr_force_commit_all(inode->i_sb, 0);
37476 + try_free_space = 0;
37477 + continue;
37478 + }
37479 + if (written < 0) {
37480 + drop_access(uf_info);
37481 + result = written;
37482 + break;
37483 + }
37484 + /* something was written. */
37485 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37486 + assert("", ea == EA_OBTAINED);
37487 + uf_info->container =
37488 + (write_op == reiser4_write_extent) ?
37489 + UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37490 + } else {
37491 + assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37492 + write_op == reiser4_write_extent));
37493 + assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37494 + write_op == reiser4_write_tail));
37495 + }
37496 + if (*pos + written > inode->i_size)
37497 + INODE_SET_FIELD(inode, i_size, *pos + written);
37498 + file_update_time(file);
37499 + result = reiser4_update_sd(inode);
37500 + if (result) {
37501 + mutex_unlock(&inode->i_mutex);
37502 + current->backing_dev_info = NULL;
37503 + drop_access(uf_info);
37504 + context_set_commit_async(ctx);
37505 + reiser4_exit_context(ctx);
37506 + return result;
37507 + }
37508 + drop_access(uf_info);
37509 + ea = NEITHER_OBTAINED;
37510 + reiser4_txn_restart(ctx);
37511 + current->journal_info = NULL;
37512 + /*
37513 + * tell VM how many pages were dirtied. Maybe number of pages
37514 + * which were already dirty should not be counted
37515 + */
37516 + balance_dirty_pages_ratelimited_nr(inode->i_mapping,
37517 + (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
37518 + current->journal_info = ctx;
37519 +
37520 + left -= written;
37521 + buf += written;
37522 + *pos += written;
37523 + }
37524 +
37525 + mutex_unlock(&inode->i_mutex);
37526 +
37527 + if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37528 + reiser4_txn_restart_current();
37529 + grab_space_enable();
37530 + result = sync_unix_file(file, file->f_dentry,
37531 + 0 /* data and stat data */ );
37532 + if (result)
37533 + warning("reiser4-7", "failed to sync file %llu",
37534 + (unsigned long long)get_inode_oid(inode));
37535 + }
37536 +
37537 + current->backing_dev_info = NULL;
37538 +
37539 + reiser4_exit_context(ctx);
37540 +
37541 + /*
37542 +	 * return the number of written bytes, or the error code if
37543 +	 * nothing was written. Note that this is not handled correctly
37544 +	 * when sync_unix_file returns an error
37545 + */
37546 + return (count - left) ? (count - left) : result;
37547 +}
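
The -ENOSPC branch above is a retry-once pattern: force all atoms to commit (releasing grabbed-but-unused blocks), then repeat the failed iteration a single time. A minimal userspace sketch of the same shape, assuming POSIX write(); reclaim_space() is a hypothetical stand-in for txnmgr_force_commit_all():

	#include <errno.h>
	#include <unistd.h>

	static int reclaim_space(void) { return 0; }  /* stand-in for a commit/flush */

	static ssize_t write_with_retry(int fd, const void *buf, size_t len)
	{
		int try_free_space = 1;       /* one reclaim attempt, as in the patch */

		for (;;) {
			ssize_t n = write(fd, buf, len);

			if (n >= 0 || errno != ENOSPC || !try_free_space)
				return n;
			reclaim_space();      /* commit to free reserved blocks */
			try_free_space = 0;   /* then retry the write exactly once */
		}
	}
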
37548 +
37549 +/**
37550 + * release_unix_file - release method of struct file_operations
37551 + * @inode: inode of released file
37552 + * @file: file to release
37553 + *
37554 + * Implementation of release method of struct file_operations for unix file
37555 + * plugin. If the last reference to the inode is released, convert all extent
37556 + * items into tail items if necessary. Frees reiser4-specific file data.
37557 + */
37558 +int release_unix_file(struct inode *inode, struct file *file)
37559 +{
37560 + reiser4_context *ctx;
37561 + struct unix_file_info *uf_info;
37562 + int result;
37563 + int in_reiser4;
37564 +
37565 + in_reiser4 = is_in_reiser4_context();
37566 +
37567 + ctx = reiser4_init_context(inode->i_sb);
37568 + if (IS_ERR(ctx))
37569 + return PTR_ERR(ctx);
37570 +
37571 + result = 0;
37572 + if (in_reiser4 == 0) {
37573 + uf_info = unix_file_inode_data(inode);
37574 +
37575 + get_exclusive_access_careful(uf_info, inode);
37576 + if (atomic_read(&file->f_dentry->d_count) == 1 &&
37577 + uf_info->container == UF_CONTAINER_EXTENTS &&
37578 + !should_have_notail(uf_info, inode->i_size) &&
37579 + !rofs_inode(inode)) {
37580 + result = extent2tail(uf_info);
37581 + if (result != 0) {
37582 + warning("nikita-3233",
37583 + "Failed (%d) to convert in %s (%llu)",
37584 + result, __FUNCTION__,
37585 + (unsigned long long)
37586 + get_inode_oid(inode));
37587 + }
37588 + }
37589 + drop_exclusive_access(uf_info);
37590 + } else {
37591 + /*
37592 +	   we are within reiser4 context already. How is that
37593 +	   possible? Simple:
37594 +
37595 + (gdb) bt
37596 + #0 get_exclusive_access ()
37597 + #2 0xc01e56d3 in release_unix_file ()
37598 + #3 0xc01c3643 in reiser4_release ()
37599 + #4 0xc014cae0 in __fput ()
37600 + #5 0xc013ffc3 in remove_vm_struct ()
37601 + #6 0xc0141786 in exit_mmap ()
37602 + #7 0xc0118480 in mmput ()
37603 + #8 0xc0133205 in oom_kill ()
37604 + #9 0xc01332d1 in out_of_memory ()
37605 + #10 0xc013bc1d in try_to_free_pages ()
37606 + #11 0xc013427b in __alloc_pages ()
37607 + #12 0xc013f058 in do_anonymous_page ()
37608 + #13 0xc013f19d in do_no_page ()
37609 + #14 0xc013f60e in handle_mm_fault ()
37610 + #15 0xc01131e5 in do_page_fault ()
37611 + #16 0xc0104935 in error_code ()
37612 + #17 0xc025c0c6 in __copy_to_user_ll ()
37613 + #18 0xc01d496f in reiser4_read_tail ()
37614 + #19 0xc01e4def in read_unix_file ()
37615 + #20 0xc01c3504 in reiser4_read ()
37616 + #21 0xc014bd4f in vfs_read ()
37617 + #22 0xc014bf66 in sys_read ()
37618 + */
37619 + warning("vs-44", "out of memory?");
37620 + }
37621 +
37622 + reiser4_free_file_fsdata(file);
37623 +
37624 + reiser4_exit_context(ctx);
37625 + return result;
37626 +}
37627 +
37628 +static void set_file_notail(struct inode *inode)
37629 +{
37630 + reiser4_inode *state;
37631 + formatting_plugin *tplug;
37632 +
37633 + state = reiser4_inode_data(inode);
37634 + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
37635 + force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
37636 +}
37637 +
37638 +/* if file is built of tails - convert it to extents */
37639 +static int unpack(struct file *filp, struct inode *inode, int forever)
37640 +{
37641 + int result = 0;
37642 + struct unix_file_info *uf_info;
37643 +
37644 + uf_info = unix_file_inode_data(inode);
37645 + assert("vs-1628", ea_obtained(uf_info));
37646 +
37647 + result = find_file_state(inode, uf_info);
37648 + if (result)
37649 + return result;
37650 + assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
37651 +
37652 + if (uf_info->container == UF_CONTAINER_TAILS) {
37653 + /*
37654 +		 * if file is being converted by another process - wait until it
37655 + * completes
37656 + */
37657 + while (1) {
37658 + if (reiser4_inode_get_flag(inode,
37659 + REISER4_PART_IN_CONV)) {
37660 + drop_exclusive_access(uf_info);
37661 + schedule();
37662 + get_exclusive_access(uf_info);
37663 + continue;
37664 + }
37665 + break;
37666 + }
37667 + if (uf_info->container == UF_CONTAINER_TAILS) {
37668 + result = tail2extent(uf_info);
37669 + if (result)
37670 + return result;
37671 + }
37672 + }
37673 + if (forever) {
37674 +		/* save new formatting plugin in stat data */
37675 + __u64 tograb;
37676 +
37677 + set_file_notail(inode);
37678 +
37679 + grab_space_enable();
37680 + tograb = inode_file_plugin(inode)->estimate.update(inode);
37681 +		result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
37682 +		result = result ?: reiser4_update_sd(inode);
37683 + }
37684 +
37685 + return result;
37686 +}
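
Both unpack() and the write path above use the same wait discipline for a concurrent conversion: drop the exclusive lock so the converter can finish, yield the CPU, retake the lock, and re-check the state (the container may have changed while the lock was released, hence the second UF_CONTAINER_TAILS test). A rough pthreads analogy, not the kernel code itself:

	#include <pthread.h>
	#include <sched.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static int in_conversion;             /* stand-in for REISER4_PART_IN_CONV */

	/* Called with 'lock' held; returns with 'lock' held, no converter running. */
	static void wait_for_conversion(void)
	{
		while (in_conversion) {
			pthread_mutex_unlock(&lock);  /* let the converter progress */
			sched_yield();                /* analogue of schedule() */
			pthread_mutex_lock(&lock);    /* retake, then re-check */
		}
		/* caller must now re-examine the file state, exactly as the patch
		   re-tests uf_info->container after this loop */
	}
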
37687 +
37688 +/* implementation of vfs' ioctl method of struct file_operations for unix file
37689 + plugin
37690 +*/
37691 +int
37692 +ioctl_unix_file(struct inode *inode, struct file *filp,
37693 + unsigned int cmd, unsigned long arg UNUSED_ARG)
37694 +{
37695 + reiser4_context *ctx;
37696 + int result;
37697 +
37698 + ctx = reiser4_init_context(inode->i_sb);
37699 + if (IS_ERR(ctx))
37700 + return PTR_ERR(ctx);
37701 +
37702 + switch (cmd) {
37703 + case REISER4_IOC_UNPACK:
37704 + get_exclusive_access(unix_file_inode_data(inode));
37705 + result = unpack(filp, inode, 1 /* forever */ );
37706 + drop_exclusive_access(unix_file_inode_data(inode));
37707 + break;
37708 +
37709 + default:
37710 + result = RETERR(-ENOSYS);
37711 + break;
37712 + }
37713 + reiser4_exit_context(ctx);
37714 + return result;
37715 +}
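
From userspace the unpack path is reached through an ioctl on an open file descriptor. A hedged sketch: the ioctl number must match what fs/reiser4/ioctl.h defines, and the _IOW(0xCD, 1, long) value below is an assumption, not taken from this patch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#ifndef REISER4_IOC_UNPACK
	#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long) /* assumed; see fs/reiser4/ioctl.h */
	#endif

	int main(int argc, char **argv)
	{
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDWR);
		if (fd < 0)
			return perror("open"), 1;
		if (ioctl(fd, REISER4_IOC_UNPACK) < 0) /* tails -> extents, forever */
			perror("ioctl(REISER4_IOC_UNPACK)");
		close(fd);
		return 0;
	}
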
37716 +
37717 +/* implementation of vfs' bmap method of struct address_space_operations for unix
37718 + file plugin
37719 +*/
37720 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
37721 +{
37722 + reiser4_context *ctx;
37723 + sector_t result;
37724 + reiser4_key key;
37725 + coord_t coord;
37726 + lock_handle lh;
37727 + struct inode *inode;
37728 + item_plugin *iplug;
37729 + sector_t block;
37730 +
37731 + inode = mapping->host;
37732 +
37733 + ctx = reiser4_init_context(inode->i_sb);
37734 + if (IS_ERR(ctx))
37735 + return PTR_ERR(ctx);
37736 + key_by_inode_and_offset_common(inode,
37737 + (loff_t) lblock * current_blocksize,
37738 + &key);
37739 +
37740 + init_lh(&lh);
37741 + result =
37742 + find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
37743 + if (cbk_errored(result)) {
37744 + done_lh(&lh);
37745 + reiser4_exit_context(ctx);
37746 + return result;
37747 + }
37748 +
37749 + result = zload(coord.node);
37750 + if (result) {
37751 + done_lh(&lh);
37752 + reiser4_exit_context(ctx);
37753 + return result;
37754 + }
37755 +
37756 + iplug = item_plugin_by_coord(&coord);
37757 + if (iplug->s.file.get_block) {
37758 + result = iplug->s.file.get_block(&coord, lblock, &block);
37759 + if (result == 0)
37760 + result = block;
37761 + } else
37762 + result = RETERR(-EINVAL);
37763 +
37764 + zrelse(coord.node);
37765 + done_lh(&lh);
37766 + reiser4_exit_context(ctx);
37767 + return result;
37768 +}
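
bmap is what the FIBMAP ioctl ends up calling, so the mapping computed above can be observed from userspace (typically requiring root). A small probe program:

	#include <fcntl.h>
	#include <linux/fs.h>   /* FIBMAP */
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd, blk;

		if (argc < 3)
			return fprintf(stderr, "usage: %s <file> <lblock>\n", argv[0]), 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return perror("open"), 1;
		blk = atoi(argv[2]);                 /* logical block number in */
		if (ioctl(fd, FIBMAP, &blk) < 0)     /* may fail for tail-packed files */
			return perror("FIBMAP"), 1;
		printf("physical block: %d\n", blk); /* physical block number out */
		close(fd);
		return 0;
	}
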
37769 +
37770 +/**
37771 + * flow_by_inode_unix_file - initialize structure flow
37772 + * @inode: inode of file for which read or write is about to happen
37773 + * @buf: buffer to perform read to or write from
37774 + * @user: flag showing whether @buf is user space or kernel space
37775 + * @size: size of buffer @buf
37776 + * @off: start offset for read or write
37777 + * @op: READ or WRITE
37778 + * @flow: flow to initialize
37779 + *
37780 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
37781 + */
37782 +int flow_by_inode_unix_file(struct inode *inode,
37783 + const char __user *buf, int user,
37784 + loff_t size, loff_t off,
37785 + rw_op op, flow_t *flow)
37786 +{
37787 + assert("nikita-1100", inode != NULL);
37788 +
37789 + flow->length = size;
37790 + memcpy(&flow->data, &buf, sizeof(buf));
37791 + flow->user = user;
37792 + flow->op = op;
37793 + assert("nikita-1931", inode_file_plugin(inode) != NULL);
37794 + assert("nikita-1932",
37795 + inode_file_plugin(inode)->key_by_inode ==
37796 + key_by_inode_and_offset_common);
37797 + /* calculate key of write position and insert it into flow->key */
37798 + return key_by_inode_and_offset_common(inode, off, &flow->key);
37799 +}
37800 +
37801 +/* plugin->u.file.set_plug_in_sd = NULL
37802 + plugin->u.file.set_plug_in_inode = NULL
37803 + plugin->u.file.create_blank_sd = NULL */
37804 +/* plugin->u.file.delete */
37805 +/*
37806 + plugin->u.file.add_link = reiser4_add_link_common
37807 + plugin->u.file.rem_link = NULL */
37808 +
37809 +/* plugin->u.file.owns_item
37810 + this is common_file_owns_item with assertion */
37811 +/* Audited by: green(2002.06.15) */
37812 +int
37813 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
37814 + const coord_t * coord /* coord to check */ )
37815 +{
37816 + int result;
37817 +
37818 + result = owns_item_common(inode, coord);
37819 + if (!result)
37820 + return 0;
37821 + if (!plugin_of_group(item_plugin_by_coord(coord),
37822 + UNIX_FILE_METADATA_ITEM_TYPE))
37823 + return 0;
37824 + assert("vs-547",
37825 + item_id_by_coord(coord) == EXTENT_POINTER_ID ||
37826 + item_id_by_coord(coord) == FORMATTING_ID);
37827 + return 1;
37828 +}
37829 +
37830 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
37831 +{
37832 + int result;
37833 + int s_result;
37834 + loff_t old_size;
37835 + reiser4_tree *tree;
37836 +
37837 + inode_check_scale(inode, inode->i_size, attr->ia_size);
37838 +
37839 + old_size = inode->i_size;
37840 + tree = reiser4_tree_by_inode(inode);
37841 +
37842 + result = safe_link_grab(tree, BA_CAN_COMMIT);
37843 + if (result == 0)
37844 + result = safe_link_add(inode, SAFE_TRUNCATE);
37845 + if (result == 0)
37846 + result = truncate_file_body(inode, attr->ia_size);
37847 + if (result)
37848 + warning("vs-1588", "truncate_file failed: oid %lli, "
37849 + "old size %lld, new size %lld, retval %d",
37850 + (unsigned long long)get_inode_oid(inode),
37851 + old_size, attr->ia_size, result);
37852 +
37853 + s_result = safe_link_grab(tree, BA_CAN_COMMIT);
37854 + if (s_result == 0)
37855 + s_result =
37856 + safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
37857 + if (s_result != 0) {
37858 + warning("nikita-3417", "Cannot kill safelink %lli: %i",
37859 + (unsigned long long)get_inode_oid(inode), s_result);
37860 + }
37861 + safe_link_release(tree);
37862 + return result;
37863 +}
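
setattr_truncate() brackets the actual truncation with a "safe link": an on-disk intent record that lets mount-time recovery finish or undo a truncate interrupted by a crash. The same idea in a deliberately simplified userspace form, with an illustrative journal path (not reiser4's mechanism, which stores the link in the tree itself):

	#include <stdio.h>
	#include <unistd.h>

	static int truncate_with_intent(const char *victim, long new_size)
	{
		FILE *intent = fopen("/var/tmp/intent.log", "w"); /* hypothetical journal */
		int err;

		if (!intent)
			return -1;
		fprintf(intent, "TRUNCATE %s %ld\n", victim, new_size); /* safe_link_add() */
		fclose(intent);

		err = truncate(victim, new_size);   /* the crash-interruptible work */

		if (err == 0)
			err = remove("/var/tmp/intent.log");  /* safe_link_del() */
		return err;
	}
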
37864 +
37865 +/* plugin->u.file.setattr method */
37866 +/* This calls inode_setattr and if truncate is in effect it also takes
37867 + exclusive inode access to avoid races */
37868 +int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
37869 + struct iattr *attr /* change description */ )
37870 +{
37871 + int result;
37872 +
37873 + if (attr->ia_valid & ATTR_SIZE) {
37874 + reiser4_context *ctx;
37875 + struct unix_file_info *uf_info;
37876 +
37877 + /* truncate does reservation itself and requires exclusive
37878 + access obtained */
37879 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
37880 + if (IS_ERR(ctx))
37881 + return PTR_ERR(ctx);
37882 +
37883 + uf_info = unix_file_inode_data(dentry->d_inode);
37884 + get_exclusive_access_careful(uf_info, dentry->d_inode);
37885 + result = setattr_truncate(dentry->d_inode, attr);
37886 + drop_exclusive_access(uf_info);
37887 + context_set_commit_async(ctx);
37888 + reiser4_exit_context(ctx);
37889 + } else
37890 + result = reiser4_setattr_common(dentry, attr);
37891 +
37892 + return result;
37893 +}
37894 +
37895 +/* plugin->u.file.init_inode_data */
37896 +void
37897 +init_inode_data_unix_file(struct inode *inode,
37898 + reiser4_object_create_data * crd, int create)
37899 +{
37900 + struct unix_file_info *data;
37901 +
37902 + data = unix_file_inode_data(inode);
37903 + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
37904 + init_rwsem(&data->latch);
37905 + data->tplug = inode_formatting_plugin(inode);
37906 + data->exclusive_use = 0;
37907 +
37908 +#if REISER4_DEBUG
37909 + data->ea_owner = NULL;
37910 + atomic_set(&data->nr_neas, 0);
37911 +#endif
37912 + init_inode_ordering(inode, crd, create);
37913 +}
37914 +
37915 +/**
37916 + * delete_object_unix_file - delete_object of file_plugin
37917 + * @inode: inode to be deleted
37918 + *
37919 + * Truncates file to length 0, removes stat data and safe link.
37920 + */
37921 +int delete_object_unix_file(struct inode *inode)
37922 +{
37923 + struct unix_file_info *uf_info;
37924 + int result;
37925 +
37926 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
37927 + return 0;
37928 +
37929 +	/* truncate file body first */
37930 + uf_info = unix_file_inode_data(inode);
37931 + get_exclusive_access(uf_info);
37932 + result = truncate_file_body(inode, 0 /* size */ );
37933 + drop_exclusive_access(uf_info);
37934 +
37935 + if (result)
37936 + warning("", "failed to truncate file (%llu) on removal: %d",
37937 +			(unsigned long long)get_inode_oid(inode), result);
37938 +
37939 + /* remove stat data and safe link */
37940 + return reiser4_delete_object_common(inode);
37941 +}
37942 +
37943 +/**
37944 + * sendfile_unix_file - sendfile of struct file_operations
37945 + * @file: file to be sent
37946 + * @ppos: position to start from
37947 + * @count: number of bytes to send
37948 + * @actor: function to copy data
37949 + * @target: where to copy read data
37950 + *
37951 + * Reads @count bytes from @file and calls @actor for every page read. This is
37952 + * needed for loopback device support.
37953 + */
37954 +ssize_t
37955 +sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
37956 + read_actor_t actor, void *target)
37957 +{
37958 + reiser4_context *ctx;
37959 + ssize_t result;
37960 + struct inode *inode;
37961 + struct unix_file_info *uf_info;
37962 +
37963 + inode = file->f_dentry->d_inode;
37964 + ctx = reiser4_init_context(inode->i_sb);
37965 + if (IS_ERR(ctx))
37966 + return PTR_ERR(ctx);
37967 +
37968 + /*
37969 +	 * generic_file_sendfile may want to call update_atime. Grab space for
37970 + * stat data update
37971 + */
37972 + result = reiser4_grab_space(estimate_update_common(inode),
37973 + BA_CAN_COMMIT);
37974 + if (result)
37975 + goto error;
37976 + mutex_lock(&inode->i_mutex);
37977 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37978 + mutex_unlock(&inode->i_mutex);
37979 +
37980 + uf_info = unix_file_inode_data(inode);
37981 + get_nonexclusive_access(uf_info);
37982 + result = generic_file_sendfile(file, ppos, count, actor, target);
37983 + drop_nonexclusive_access(uf_info);
37984 + error:
37985 + reiser4_exit_context(ctx);
37986 + return result;
37987 +}
37988 +
37989 +int
37990 +prepare_write_unix_file(struct file *file, struct page *page,
37991 + unsigned from, unsigned to)
37992 +{
37993 + reiser4_context *ctx;
37994 + struct unix_file_info *uf_info;
37995 + int ret;
37996 +
37997 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37998 + if (IS_ERR(ctx))
37999 + return PTR_ERR(ctx);
38000 +
38001 + uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38002 + get_exclusive_access(uf_info);
38003 + ret = find_file_state(file->f_dentry->d_inode, uf_info);
38004 + if (ret == 0) {
38005 + if (uf_info->container == UF_CONTAINER_TAILS)
38006 + ret = -EINVAL;
38007 + else
38008 + ret = do_prepare_write(file, page, from, to);
38009 + }
38010 + drop_exclusive_access(uf_info);
38011 +
38012 + /* don't commit transaction under inode semaphore */
38013 + context_set_commit_async(ctx);
38014 + reiser4_exit_context(ctx);
38015 + return ret;
38016 +}
38017 +
38018 +/*
38019 + * Local variables:
38020 + * c-indentation-style: "K&R"
38021 + * mode-name: "LC"
38022 + * c-basic-offset: 8
38023 + * tab-width: 8
38024 + * fill-column: 79
38025 + * scroll-step: 1
38026 + * End:
38027 + */
38028 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.22/fs/reiser4/plugin/file/file_conversion.c
38029 --- linux-2.6.22.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
38030 +++ linux-2.6.22/fs/reiser4/plugin/file/file_conversion.c 2007-07-29 00:25:34.928709936 +0400
38031 @@ -0,0 +1,609 @@
38032 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
38033 + licensing governed by reiser4/README */
38034 +
38035 +/* This file contains hooks that convert (*) cryptcompress files to unix-files,
38036 + and a set of protected (**) methods of a cryptcompress file plugin to perform
38037 + such conversion.
38038 +
38039 +(*)
38040 + The conversion is performed for incompressible files to reduce cpu and memory
38041 +   usage. If the first logical cluster (64K by default) of a file is
38042 +   incompressible, then we decide that the whole file is incompressible.
38043 + The conversion can be enabled via installing a special compression mode
38044 + plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for
38045 + details).
38046 +
38047 +(**)
38048 + The protection means serialization of critical sections (readers and writers
38049 + of @pset->file)
38050 +*/
38051 +
38052 +#include "../../inode.h"
38053 +#include "../cluster.h"
38054 +#include "file.h"
38055 +
38056 +#define conversion_enabled(inode) \
38057 + (inode_compression_mode_plugin(inode) == \
38058 + compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
38059 +
38060 +
38061 +/* The above-mentioned sections (readers and writers of @pset->file) are
38062 +   not permanently critical: a cryptcompress file can be converted only
38063 +   if the conversion is enabled (see the macro above). And we don't
38064 +   convert unix files at all.
38065 +   The following helper macro is a sanity check to decide if we
38066 +   need to protect such a section.
38067 +*/
38068 +#define should_protect(inode) \
38069 + (inode_file_plugin(inode) == \
38070 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
38071 + conversion_enabled(inode))
38072 +
38073 +/* All protected methods have prefix "prot" in their names.
38074 + It is convenient to construct them by usual (unprotected) ones
38075 + using the following common macros:
38076 +*/
38077 +
38078 +/* Macro for passive protection.
38079 + method_cryptcompress contains only readers */
38080 +#define PROT_PASSIVE(type, method, args) \
38081 +({ \
38082 + type _result; \
38083 + struct rw_semaphore * guard = \
38084 + &reiser4_inode_data(inode)->conv_sem; \
38085 + \
38086 + if (should_protect(inode)) { \
38087 + down_read(guard); \
38088 + if (!should_protect(inode)) \
38089 + up_read(guard); \
38090 + } \
38091 + if (inode_file_plugin(inode) == \
38092 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38093 + _result = method ## _unix_file args; \
38094 + else \
38095 + _result = method ## _cryptcompress args; \
38096 + if (should_protect(inode)) \
38097 + up_read(guard); \
38098 + _result; \
38099 +})
38100 +
38101 +#define PROT_PASSIVE_VOID(method, args) \
38102 +({ \
38103 + struct rw_semaphore * guard = \
38104 + &reiser4_inode_data(inode)->conv_sem; \
38105 + \
38106 + if (should_protect(inode)) { \
38107 + down_read(guard); \
38108 + if (!should_protect(inode)) \
38109 + up_read(guard); \
38110 + } \
38111 + if (inode_file_plugin(inode) == \
38112 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38113 + method ## _unix_file args; \
38114 + else \
38115 + method ## _cryptcompress args; \
38116 + if (should_protect(inode)) \
38117 + up_read(guard); \
38118 +})
38119 +
38120 +/* Macro for active protection.
38121 + active_expr contains readers and writers; after its
38122 + evaluation conversion should be disabled */
38123 +#define PROT_ACTIVE(type, method, args, active_expr) \
38124 +({ \
38125 + type _result = 0; \
38126 + struct rw_semaphore * guard = \
38127 + &reiser4_inode_data(inode)->conv_sem; \
38128 + reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
38129 + if (IS_ERR(ctx)) \
38130 + return PTR_ERR(ctx); \
38131 + \
38132 + if (should_protect(inode)) { \
38133 + down_write(guard); \
38134 + if (should_protect(inode)) \
38135 + _result = active_expr; \
38136 + up_write(guard); \
38137 + } \
38138 + if (_result == 0) { \
38139 + if (inode_file_plugin(inode) == \
38140 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38141 + _result = method ## _unix_file args; \
38142 + else \
38143 + _result = method ## _cryptcompress args; \
38144 + } \
38145 + reiser4_exit_context(ctx); \
38146 + _result; \
38147 +})
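
All three macros share one discipline: test should_protect() without the lock, take conv_sem, then test again, because a conversion may have completed in between; dispatch then goes to whichever plugin currently owns the file. A condensed pthreads sketch of the passive (read-side) variant, assuming a simple flag in place of the plugin-id test:

	#include <pthread.h>

	static pthread_rwlock_t conv_sem = PTHREAD_RWLOCK_INITIALIZER;
	static int still_cryptcompress = 1;  /* stand-in for should_protect(inode) */

	static long read_cc(void) { return 1; }  /* placeholder plugin methods */
	static long read_uf(void) { return 2; }

	static long protected_read(void)
	{
		int locked = 0;
		long ret;

		if (still_cryptcompress) {
			pthread_rwlock_rdlock(&conv_sem);
			if (still_cryptcompress)
				locked = 1;  /* conversion still possible: stay locked */
			else
				pthread_rwlock_unlock(&conv_sem); /* converted meanwhile */
		}
		ret = still_cryptcompress ? read_cc() : read_uf();
		if (locked)
			pthread_rwlock_unlock(&conv_sem);
		return ret;
	}
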
38148 +
38149 +/* Pass management to the unix-file plugin with "notail" policy */
38150 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
38151 +{
38152 + int result;
38153 + reiser4_inode *info;
38154 + struct unix_file_info * uf;
38155 + info = reiser4_inode_data(inode);
38156 +
38157 + result = aset_set_unsafe(&info->pset,
38158 + PSET_FILE,
38159 + (reiser4_plugin *)
38160 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
38161 + if (result)
38162 + return result;
38163 + result = aset_set_unsafe(&info->pset,
38164 + PSET_FORMATTING,
38165 + (reiser4_plugin *)
38166 + formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
38167 + if (result)
38168 + return result;
38169 + /* get rid of non-standard plugins */
38170 + info->plugin_mask &= ~cryptcompress_mask;
38171 + /* get rid of plugin stat-data extension */
38172 + info->extmask &= ~(1 << PLUGIN_STAT);
38173 +
38174 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
38175 +
38176 + /* FIXME use init_inode_data_unix_file() instead,
38177 +	   but avoid init_inode_ordering() */
38178 + /* Init unix-file specific part of inode */
38179 + uf = unix_file_inode_data(inode);
38180 + uf->container = UF_CONTAINER_UNKNOWN;
38181 + init_rwsem(&uf->latch);
38182 + uf->tplug = inode_formatting_plugin(inode);
38183 + uf->exclusive_use = 0;
38184 +#if REISER4_DEBUG
38185 + uf->ea_owner = NULL;
38186 + atomic_set(&uf->nr_neas, 0);
38187 +#endif
38188 + inode->i_op =
38189 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops;
38190 + inode->i_fop =
38191 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops;
38192 + inode->i_mapping->a_ops =
38193 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops;
38194 + file->f_op = inode->i_fop;
38195 + return 0;
38196 +}
38197 +
38198 +#if REISER4_DEBUG
38199 +static int disabled_conversion_inode_ok(struct inode * inode)
38200 +{
38201 + __u64 extmask = reiser4_inode_data(inode)->extmask;
38202 + __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
38203 +
38204 + return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
38205 + (extmask & (1 << UNIX_STAT)) &&
38206 + (extmask & (1 << LARGE_TIMES_STAT)) &&
38207 + (extmask & (1 << PLUGIN_STAT)) &&
38208 + (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
38209 +}
38210 +#endif
38211 +
38212 +/* Assign another mode that will control
38213 + compression at flush time only */
38214 +static int disable_conversion_no_update_sd(struct inode * inode)
38215 +{
38216 + int result;
38217 + result =
38218 + force_plugin_pset(inode,
38219 + PSET_COMPRESSION_MODE,
38220 + (reiser4_plugin *)compression_mode_plugin_by_id
38221 + (LATTD_COMPRESSION_MODE_ID));
38222 + assert("edward-1500",
38223 + ergo(!result, disabled_conversion_inode_ok(inode)));
38224 + return result;
38225 +}
38226 +
38227 +/* Disable future attempts to check/convert. This function is called by
38228 + conversion hooks. */
38229 +static int disable_conversion(struct inode * inode)
38230 +{
38231 + return disable_conversion_no_update_sd(inode);
38232 +}
38233 +
38234 +static int check_position(struct inode * inode,
38235 + loff_t pos /* position in the file to write from */,
38236 + struct cluster_handle * clust,
38237 + int * check_compress)
38238 +{
38239 + assert("edward-1505", conversion_enabled(inode));
38240 + /*
38241 +	 * if file size were more than cluster size, then compressibility
38242 +	 * status would already have been figured out (i.e. compression
38243 +	 * was disabled, or the file plugin was converted to unix_file)
38244 + */
38245 + assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
38246 +
38247 + if (pos > inode->i_size)
38248 + /* first logical cluster will contain a (partial) hole */
38249 + return disable_conversion(inode);
38250 + if (pos < inode_cluster_size(inode))
38251 + /* writing to the first logical cluster */
38252 + return 0;
38253 + /*
38254 + * here we have:
38255 + * cluster_size <= pos <= i_size <= cluster_size,
38256 + * and, hence, pos == i_size == cluster_size
38257 + */
38258 + assert("edward-1498",
38259 + pos == inode->i_size &&
38260 + pos == inode_cluster_size(inode));
38261 +
38262 + *check_compress = 1;
38263 + return 0;
38264 +}
38265 +
38266 +static void start_check_compressibility(struct inode * inode,
38267 + struct cluster_handle * clust,
38268 + hint_t * hint)
38269 +{
38270 + assert("edward-1507", clust->index == 1);
38271 + assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
38272 + assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
38273 +
38274 + hint_init_zero(hint);
38275 + clust->hint = hint;
38276 + clust->index --;
38277 + clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
38278 +
38279 + /* first logical cluster (of index #0) must be complete */
38280 + assert("edward-1510", lbytes(clust->index, inode) ==
38281 + inode_cluster_size(inode));
38282 +}
38283 +
38284 +static void finish_check_compressibility(struct inode * inode,
38285 + struct cluster_handle * clust,
38286 + hint_t * hint)
38287 +{
38288 + reiser4_unset_hint(clust->hint);
38289 + clust->hint = hint;
38290 + clust->index ++;
38291 +}
38292 +
38293 +#if REISER4_DEBUG
38294 +static int prepped_dclust_ok(hint_t * hint)
38295 +{
38296 + reiser4_key key;
38297 + coord_t * coord = &hint->ext_coord.coord;
38298 +
38299 + item_key_by_coord(coord, &key);
38300 + return (item_id_by_coord(coord) == CTAIL_ID &&
38301 + !coord_is_unprepped_ctail(coord) &&
38302 + (get_key_offset(&key) + nr_units_ctail(coord) ==
38303 + dclust_get_extension_dsize(hint)));
38304 +}
38305 +#endif
38306 +
38307 +#define fifty_percent(size) ((size) >> 1)
38308 +/* evaluation of data compressibility */
38309 +#define data_is_compressible(osize, isize)			\
38310 +	(osize < fifty_percent(isize))
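
The heuristic, then: compress the first complete logical cluster, and treat the file as compressible only if the output shrinks below half the input. The same rule in a standalone form, using zlib as an arbitrary illustrative compressor (the kernel side uses whatever compression plugin the file carries); build with -lz:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <zlib.h>

	#define CLUSTER_SIZE (64 * 1024)      /* default logical cluster size */

	static int looks_compressible(const unsigned char *in, uLong isize)
	{
		uLongf osize = compressBound(isize);
		unsigned char *out = malloc(osize);
		int ok = 0;

		if (out && compress(out, &osize, in, isize) == Z_OK)
			ok = osize < isize / 2;   /* the fifty-percent rule above */
		free(out);
		return ok;
	}

	int main(void)
	{
		static unsigned char buf[CLUSTER_SIZE];

		memset(buf, 'A', sizeof(buf));    /* trivially compressible input */
		printf("compressible: %d\n", looks_compressible(buf, sizeof(buf)));
		return 0;
	}
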
38311 +
38312 +/* This is called only once per file life.
38313 + Read first logical cluster (of index #0) and estimate its compressibility.
38314 + Save estimation result in @compressible */
38315 +static int read_check_compressibility(struct inode * inode,
38316 + struct cluster_handle * clust,
38317 + int * compressible)
38318 +{
38319 + int i;
38320 + int result;
38321 + __u32 dst_len;
38322 + hint_t tmp_hint;
38323 + hint_t * cur_hint = clust->hint;
38324 +
38325 + start_check_compressibility(inode, clust, &tmp_hint);
38326 +
38327 + reset_cluster_pgset(clust, cluster_nrpages(inode));
38328 + result = grab_page_cluster(inode, clust, READ_OP);
38329 + if (result)
38330 + return result;
38331 + /* Read page cluster here */
38332 + for (i = 0; i < clust->nr_pages; i++) {
38333 + struct page *page = clust->pages[i];
38334 + lock_page(page);
38335 + result = do_readpage_ctail(inode, clust, page,
38336 + ZNODE_READ_LOCK);
38337 + unlock_page(page);
38338 + if (result)
38339 + goto error;
38340 + }
38341 + tfm_cluster_clr_uptodate(&clust->tc);
38342 +
38343 + cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
38344 +
38345 + if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
38346 +		/* length of compressed data is known, no need to compress */
38347 + assert("edward-1511",
38348 + znode_is_any_locked(tmp_hint.lh.node));
38349 + assert("edward-1512",
38350 + WITH_DATA(tmp_hint.ext_coord.coord.node,
38351 + prepped_dclust_ok(&tmp_hint)));
38352 + dst_len = dclust_get_extension_dsize(&tmp_hint);
38353 + }
38354 + else {
38355 + struct tfm_cluster * tc = &clust->tc;
38356 + compression_plugin * cplug = inode_compression_plugin(inode);
38357 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
38358 + if (result)
38359 + goto error;
38360 + for (i = 0; i < clust->nr_pages; i++) {
38361 + char *data;
38362 + lock_page(clust->pages[i]);
38363 + BUG_ON(!PageUptodate(clust->pages[i]));
38364 + data = kmap(clust->pages[i]);
38365 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
38366 + data, PAGE_CACHE_SIZE);
38367 + kunmap(clust->pages[i]);
38368 + unlock_page(clust->pages[i]);
38369 + }
38370 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
38371 + if (result)
38372 + goto error;
38373 + result = grab_coa(tc, cplug);
38374 + if (result)
38375 + goto error;
38376 + tc->len = tc->lsize = lbytes(clust->index, inode);
38377 + assert("edward-1513", tc->len == inode_cluster_size(inode));
38378 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
38379 + cplug->compress(get_coa(tc, cplug->h.id, tc->act),
38380 + tfm_input_data(clust), tc->len,
38381 + tfm_output_data(clust), &dst_len);
38382 + assert("edward-1514",
38383 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
38384 + }
38385 + finish_check_compressibility(inode, clust, cur_hint);
38386 + *compressible = data_is_compressible(dst_len,
38387 + inode_cluster_size(inode));
38388 + return 0;
38389 + error:
38390 + put_page_cluster(clust, inode, READ_OP);
38391 + return result;
38392 +}
38393 +
38394 +/* Cut disk cluster of index @idx */
38395 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
38396 +{
38397 + reiser4_key from, to;
38398 + assert("edward-1515", inode_file_plugin(inode) ==
38399 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
38400 + key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
38401 + to = from;
38402 + set_key_offset(&to,
38403 + get_key_offset(&from) + inode_cluster_size(inode) - 1);
38404 + return reiser4_cut_tree(reiser4_tree_by_inode(inode),
38405 + &from, &to, inode, 0);
38406 +}
38407 +
38408 +static int reserve_cryptcompress2unixfile(struct inode *inode)
38409 +{
38410 + reiser4_block_nr unformatted_nodes;
38411 + reiser4_tree *tree;
38412 +
38413 + tree = reiser4_tree_by_inode(inode);
38414 +
38415 + /* number of unformatted nodes which will be created */
38416 + unformatted_nodes = cluster_nrpages(inode); /* N */
38417 +
38418 + /*
38419 +	 * space required for one iteration of cryptcompress->unixfile conversion:
38420 + *
38421 + * 1. kill ctail items
38422 + *
38423 + * 2. insert N unformatted nodes
38424 + *
38425 + * 3. insert N (worst-case single-block
38426 + * extents) extent units.
38427 + *
38428 + * 4. drilling to the leaf level by coord_by_key()
38429 + *
38430 + * 5. possible update of stat-data
38431 + *
38432 + */
38433 + grab_space_enable();
38434 + return reiser4_grab_space
38435 + (2 * tree->height +
38436 + unformatted_nodes +
38437 + unformatted_nodes * estimate_one_insert_into_item(tree) +
38438 + 1 + estimate_one_insert_item(tree) +
38439 + inode_file_plugin(inode)->estimate.update(inode),
38440 + BA_CAN_COMMIT);
38441 +}
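
To make the formula concrete, here is the same arithmetic with purely illustrative numbers (tree height, insert estimates and stat-data cost are made up; the real values come from the estimate.* methods at runtime). With 4K pages and 64K clusters, N = 16:

	#include <stdio.h>

	int main(void)
	{
		unsigned height   = 4;           /* assumed tree->height */
		unsigned N        = 16;          /* cluster_nrpages: 64K / 4K */
		unsigned ins_unit = height + 1;  /* hypothetical estimate_one_insert_into_item() */
		unsigned ins_item = height + 2;  /* hypothetical estimate_one_insert_item() */
		unsigned sd       = 1;           /* hypothetical stat-data update cost */

		/* mirrors the reiser4_grab_space() expression above */
		unsigned blocks = 2 * height + N + N * ins_unit + 1 + ins_item + sd;

		printf("blocks to grab: %u\n", blocks); /* 8 + 16 + 80 + 1 + 6 + 1 = 112 */
		return 0;
	}
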
38442 +
38443 +/* clear the flag that indicates conversion is in progress and update
38444 +   stat-data with new (unix-file specific) info */
38445 +static int complete_file_conversion(struct inode *inode)
38446 +{
38447 + int result;
38448 +
38449 + grab_space_enable();
38450 + result =
38451 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
38452 + BA_CAN_COMMIT);
38453 + if (result == 0) {
38454 + reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38455 + result = reiser4_update_sd(inode);
38456 + }
38457 + if (result)
38458 + warning("edward-1452",
38459 + "Converting %llu to unix-file: update sd failed (%i)",
38460 + (unsigned long long)get_inode_oid(inode), result);
38461 + return 0;
38462 +}
38463 +
38464 +
38465 +/* do conversion */
38466 +static int cryptcompress2unixfile(struct file * file, struct inode * inode,
38467 + struct cluster_handle * clust)
38468 +{
38469 + int i;
38470 + int result = 0;
38471 + struct cryptcompress_info *cr_info;
38472 + struct unix_file_info *uf_info;
38473 +
38474 + assert("edward-1516", clust->pages[0]->index == 0);
38475 + assert("edward-1517", clust->hint != NULL);
38476 +
38477 +	/* release all cryptcompress-specific resources */
38478 + cr_info = cryptcompress_inode_data(inode);
38479 + result = reserve_cryptcompress2unixfile(inode);
38480 + if (result)
38481 + goto out;
38482 + reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38483 + reiser4_unset_hint(clust->hint);
38484 + result = cut_disk_cluster(inode, 0);
38485 + if (result)
38486 + goto out;
38487 +	/* captured jnode of cluster and associated resources (pages,
38488 + reserved disk space) were released by ->kill_hook() method
38489 + of the item plugin */
38490 +
38491 + result = __cryptcompress2unixfile(file, inode);
38492 + if (result)
38493 + goto out;
38494 + /* At this point file is managed by unix file plugin */
38495 +
38496 + uf_info = unix_file_inode_data(inode);
38497 +
38498 + assert("edward-1518",
38499 + ergo(jprivate(clust->pages[0]),
38500 + !jnode_is_cluster_page(jprivate(clust->pages[0]))));
38501 + for(i = 0; i < clust->nr_pages; i++) {
38502 + assert("edward-1519", clust->pages[i]);
38503 + assert("edward-1520", PageUptodate(clust->pages[i]));
38504 +
38505 + result = find_or_create_extent(clust->pages[i]);
38506 + if (result)
38507 + break;
38508 + }
38509 + if (!result) {
38510 + uf_info->container = UF_CONTAINER_EXTENTS;
38511 + complete_file_conversion(inode);
38512 + }
38513 + out:
38514 + all_grabbed2free();
38515 + if (result)
38516 + warning("edward-1453", "Failed to convert file %llu: ret=%i",
38517 + (unsigned long long)get_inode_oid(inode), result);
38518 + return result;
38519 +}
38520 +
38521 +/* Check, then perform or disable conversion if needed */
38522 +int write_conversion_hook(struct file * file, struct inode * inode, loff_t pos,
38523 + struct cluster_handle * clust, int * progress)
38524 +{
38525 + int result;
38526 + int check_compress = 0;
38527 + int compressible = 0;
38528 +
38529 + if (!conversion_enabled(inode))
38530 + return 0;
38531 + result = check_position(inode, pos, clust, &check_compress);
38532 + if (result || !check_compress)
38533 + return result;
38534 + result = read_check_compressibility(inode, clust, &compressible);
38535 + if (result)
38536 + return result;
38537 +
38538 + /* At this point page cluster is grabbed and uptodate */
38539 + if (!compressible) {
38540 + result = cryptcompress2unixfile(file, inode, clust);
38541 + if (result == 0)
38542 + *progress = 1;
38543 + }
38544 + else
38545 + result = disable_conversion(inode);
38546 +
38547 + reiser4_txn_restart_current();
38548 + put_page_cluster(clust, inode, READ_OP);
38549 + return result;
38550 +}
38551 +
38552 +static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
38553 +{
38554 + return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
38555 +}
38556 +
38557 +/* Protected methods of cryptcompress file plugin constructed
38558 + by the macros above */
38559 +
38560 +/* Wrappers with active protection for:
38561 + . write_cryptcompress;
38562 + . setattr_cryptcompress;
38563 +*/
38564 +
38565 +ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf,
38566 + size_t count, loff_t *off)
38567 +{
38568 + int prot = 0;
38569 + int conv = 0;
38570 + ssize_t written_cr = 0;
38571 + ssize_t written_uf = 0;
38572 + struct inode * inode = file->f_dentry->d_inode;
38573 + struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
38574 +
38575 + if (should_protect(inode)) {
38576 + prot = 1;
38577 + down_write(guard);
38578 + }
38579 + written_cr = write_cryptcompress(file, buf, count, off, &conv);
38580 + if (prot)
38581 + up_write(guard);
38582 + if (written_cr < 0)
38583 + return written_cr;
38584 + if (conv)
38585 + written_uf = write_unix_file(file, buf + written_cr,
38586 + count - written_cr, off);
38587 + return written_cr + (written_uf < 0 ? 0 : written_uf);
38588 +}
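
The control flow above is worth restating: the cryptcompress write may convert the file part-way through, in which case it reports how much it wrote and sets @conv, and the remainder of the buffer is pushed through the unix-file write path. Schematically, with illustrative names:

	typedef long (*write_fn)(const char *buf, unsigned long len, int *conv);

	static long write_possibly_converting(write_fn cryptcompress, write_fn unixfile,
					      const char *buf, unsigned long count)
	{
		int conv = 0;
		long written = cryptcompress(buf, count, &conv);

		if (written < 0)
			return written;            /* error before any conversion */
		if (conv) {
			/* file became a unix-file mid-write: write the rest through it */
			long more = unixfile(buf + written, count - written, &conv);

			if (more > 0)
				written += more;   /* a late error is swallowed, as above */
		}
		return written;
	}
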
38589 +
38590 +int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
38591 +{
38592 + struct inode * inode = dentry->d_inode;
38593 + return PROT_ACTIVE(int, setattr, (dentry, attr),
38594 + setattr_conversion_hook(inode, attr));
38595 +}
38596 +
38597 +/* Wrappers with passive protection for:
38598 +   . read_cryptcompress;
38599 + . mmap_cryptcompress;
38600 + . release_cryptcompress;
38601 + . sendfile_cryptcompress;
38602 + . delete_object_cryptcompress.
38603 +*/
38604 +ssize_t prot_read_cryptcompress(struct file * file, char __user * buf,
38605 + size_t size, loff_t * off)
38606 +{
38607 + struct inode * inode = file->f_dentry->d_inode;
38608 + return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
38609 +}
38610 +
38611 +int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
38612 +{
38613 + struct inode *inode = file->f_dentry->d_inode;
38614 + return PROT_PASSIVE(int, mmap, (file, vma));
38615 +}
38616 +
38617 +int prot_release_cryptcompress(struct inode *inode, struct file *file)
38618 +{
38619 + return PROT_PASSIVE(int, release, (inode, file));
38620 +}
38621 +
38622 +ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos,
38623 + size_t count, read_actor_t actor,
38624 + void *target)
38625 +{
38626 + struct inode * inode = file->f_dentry->d_inode;
38627 + return PROT_PASSIVE(ssize_t, sendfile,
38628 + (file, ppos, count, actor, target));
38629 +}
38630 +
38631 +/*
38632 + Local variables:
38633 + c-indentation-style: "K&R"
38634 + mode-name: "LC"
38635 + c-basic-offset: 8
38636 + tab-width: 8
38637 + fill-column: 80
38638 + scroll-step: 1
38639 + End:
38640 +*/
38641 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/file.h linux-2.6.22/fs/reiser4/plugin/file/file.h
38642 --- linux-2.6.22.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
38643 +++ linux-2.6.22/fs/reiser4/plugin/file/file.h 2007-07-29 00:25:34.928709936 +0400
38644 @@ -0,0 +1,272 @@
38645 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38646 + * reiser4/README */
38647 +
38648 +/* this file contains declarations of methods implementing
38649 + file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
38650 + and SYMLINK_FILE_PLUGIN_ID) */
38651 +
38652 +#if !defined( __REISER4_FILE_H__ )
38653 +#define __REISER4_FILE_H__
38654 +
38655 +/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38656 +
38657 +/* inode operations */
38658 +int setattr_unix_file(struct dentry *, struct iattr *);
38659 +
38660 +/* file operations */
38661 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38662 + loff_t *off);
38663 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38664 + loff_t * off);
38665 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38666 + unsigned long arg);
38667 +int mmap_unix_file(struct file *, struct vm_area_struct *);
38668 +int open_unix_file(struct inode *, struct file *);
38669 +int release_unix_file(struct inode *, struct file *);
38670 +int sync_unix_file(struct file *, struct dentry *, int datasync);
38671 +ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38672 + read_actor_t, void *target);
38673 +
38674 +/* address space operations */
38675 +int readpage_unix_file(struct file *, struct page *);
38676 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
38677 +int writepages_unix_file(struct address_space *, struct writeback_control *);
38678 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38679 + unsigned to);
38680 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
38681 + unsigned to);
38682 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38683 +
38684 +/* file plugin operations */
38685 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38686 + int user, loff_t, loff_t, rw_op, flow_t *);
38687 +int owns_item_unix_file(const struct inode *, const coord_t *);
38688 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38689 + int create);
38690 +int delete_object_unix_file(struct inode *);
38691 +
38692 +/*
38693 + * all writes into a unix file are performed by item write methods. The write
38694 + * method of the unix file plugin only decides which item plugin (extent or
38695 + * tail) to call and in which mode (one from the enum below)
38696 + */
38697 +typedef enum {
38698 + FIRST_ITEM = 1,
38699 + APPEND_ITEM = 2,
38700 + OVERWRITE_ITEM = 3
38701 +} write_mode_t;
38702 +
38703 +/* unix file may be in one of the following states */
38704 +typedef enum {
38705 + UF_CONTAINER_UNKNOWN = 0,
38706 + UF_CONTAINER_TAILS = 1,
38707 + UF_CONTAINER_EXTENTS = 2,
38708 + UF_CONTAINER_EMPTY = 3
38709 +} file_container_t;
38710 +
38711 +struct formatting_plugin;
38712 +struct inode;
38713 +
38714 +/* unix file plugin specific part of reiser4 inode */
38715 +struct unix_file_info {
38716 + /*
38717 + * this read-write lock protects file containerization change. Accesses
38718 + * which do not change file containerization (see file_container_t)
38719 + * (read, readpage, writepage, write (until tail conversion is
38720 + * involved)) take read-lock. Accesses which modify file
38721 + * containerization (truncate, conversion from tail to extent and back)
38722 + * take write-lock.
38723 + */
38724 + struct rw_semaphore latch;
38725 + /* this enum specifies which items are used to build the file */
38726 + file_container_t container;
38727 + /*
38728 + * plugin which controls when file is to be converted to extents and
38729 + * back to tail
38730 + */
38731 + struct formatting_plugin *tplug;
38732 + /* if this is set, file is in exclusive use */
38733 + int exclusive_use;
38734 +#if REISER4_DEBUG
38735 + /* pointer to task struct of thread owning exclusive access to file */
38736 + void *ea_owner;
38737 + atomic_t nr_neas;
38738 + void *last_reader;
38739 +#endif
38740 +};
38741 +
38742 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38743 +void get_exclusive_access(struct unix_file_info *);
38744 +void drop_exclusive_access(struct unix_file_info *);
38745 +void get_nonexclusive_access(struct unix_file_info *);
38746 +void drop_nonexclusive_access(struct unix_file_info *);
38747 +int try_to_get_nonexclusive_access(struct unix_file_info *);
38748 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38749 + struct inode *);
38750 +int find_file_item_nohint(coord_t *, lock_handle *,
38751 + const reiser4_key *, znode_lock_mode,
38752 + struct inode *);
38753 +
38754 +int load_file_hint(struct file *, hint_t *);
38755 +void save_file_hint(struct file *, const hint_t *);
38756 +
38757 +#include "../item/extent.h"
38758 +#include "../item/tail.h"
38759 +#include "../item/ctail.h"
38760 +
38761 +struct uf_coord {
38762 + coord_t coord;
38763 + lock_handle *lh;
38764 + int valid;
38765 + union {
38766 + struct extent_coord_extension extent;
38767 + struct tail_coord_extension tail;
38768 + struct ctail_coord_extension ctail;
38769 + } extension;
38770 +};
38771 +
38772 +#include "../../forward.h"
38773 +#include "../../seal.h"
38774 +#include "../../lock.h"
38775 +
38776 +/*
38777 + * This structure is used to speed up file operations (reads and writes). A
38778 + * hint is a suggestion about where a key resolved to last time. A seal
38779 + * indicates whether a node has been modified since a hint was last recorded.
38780 + * You check the seal, and if the seal is still valid, you can use the hint
38781 + * without traversing the tree again.
38782 + */
38783 +struct hint {
38784 + seal_t seal; /* a seal over last file item accessed */
38785 + uf_coord_t ext_coord;
38786 + loff_t offset;
38787 + znode_lock_mode mode;
38788 + lock_handle lh;
38789 +};
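
The seal/hint pair amounts to a cached tree position plus a cheap validity test. A minimal sketch of the idea using a per-node generation counter (an analogy only; the real seal_t also records the node's block number and is checked under lock):

	struct node { unsigned long version; };   /* bumped on every modification */

	struct seal { struct node *node; unsigned long version; };

	static inline void seal_make(struct seal *s, struct node *n)
	{
		s->node = n;
		s->version = n->version;
	}

	/* If the node changed since the seal was made, the cached coord (hint)
	   can no longer be trusted and a fresh tree traversal is needed. */
	static inline int seal_is_valid(const struct seal *s)
	{
		return s->node->version == s->version;
	}
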
38790 +
38791 +static inline int hint_is_valid(hint_t * hint)
38792 +{
38793 + return hint->ext_coord.valid;
38794 +}
38795 +
38796 +static inline void hint_set_valid(hint_t * hint)
38797 +{
38798 + hint->ext_coord.valid = 1;
38799 +}
38800 +
38801 +static inline void hint_clr_valid(hint_t * hint)
38802 +{
38803 + hint->ext_coord.valid = 0;
38804 +}
38805 +
38806 +int load_file_hint(struct file *, hint_t *);
38807 +void save_file_hint(struct file *, const hint_t *);
38808 +void hint_init_zero(hint_t *);
38809 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38810 +int hint_is_set(const hint_t *);
38811 +void reiser4_unset_hint(hint_t *);
38812 +
38813 +int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
38814 +int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38815 + loff_t cur_size, int (*update_actor) (struct inode *,
38816 + reiser4_key *, int));
38817 +#if REISER4_DEBUG
38818 +
38819 +/* return 1 if exclusive access is obtained, 0 otherwise */
38820 +static inline int ea_obtained(struct unix_file_info * uf_info)
38821 +{
38822 + int ret;
38823 +
38824 + ret = down_read_trylock(&uf_info->latch);
38825 + if (ret)
38826 + up_read(&uf_info->latch);
38827 + return !ret;
38828 +}
38829 +
38830 +#endif
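
ea_obtained() relies on a small trylock trick: if a read-trylock fails, a writer must hold the semaphore, i.e. someone has exclusive access. The same probe with pthreads; like the original it is only safe as a debug-time assertion, since the answer can be stale the moment it is returned:

	#include <pthread.h>

	static int write_locked(pthread_rwlock_t *l)
	{
		if (pthread_rwlock_tryrdlock(l) == 0) {
			pthread_rwlock_unlock(l);
			return 0;    /* no writer at the instant we probed */
		}
		return 1;            /* tryrdlock refused: a writer (most likely) holds it */
	}
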
38831 +
38832 +/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38833 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
38834 + reiser4_object_create_data *);
38835 +void destroy_inode_symlink(struct inode *);
38836 +
38837 +/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID
38838 + file plugin */
38839 +
38840 +/* inode operations */
38841 +int setattr_cryptcompress(struct dentry *, struct iattr *);
38842 +int prot_setattr_cryptcompress(struct dentry *, struct iattr *);
38843 +
38844 +/* file operations */
38845 +ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38846 + loff_t * off);
38847 +ssize_t prot_read_cryptcompress(struct file *, char __user *buf,
38848 + size_t read_amount, loff_t * off);
38849 +
38850 +ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38851 + loff_t * off, int * conv);
38852 +ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38853 + loff_t * off);
38854 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38855 +int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *);
38856 +ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38857 + read_actor_t actor, void *target);
38858 +ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38859 + read_actor_t actor, void *target);
38860 +
38861 +int release_cryptcompress(struct inode *, struct file *);
38862 +int prot_release_cryptcompress(struct inode *, struct file *);
38863 +
38864 +/* address space operations */
38865 +extern int readpage_cryptcompress(struct file *, struct page *);
38866 +extern int writepages_cryptcompress(struct address_space *,
38867 + struct writeback_control *);
38868 +/* file plugin operations */
38869 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38870 + int user, loff_t, loff_t, rw_op, flow_t *);
38871 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38872 +int create_cryptcompress(struct inode *, struct inode *,
38873 + reiser4_object_create_data *);
38874 +int delete_object_cryptcompress(struct inode *);
38875 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38876 + int create);
38877 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38878 + const reiser4_key * to_key,
38879 + reiser4_key * smallest_removed,
38880 + struct inode *object, int truncate,
38881 + int *progress);
38882 +void destroy_inode_cryptcompress(struct inode *);
38883 +int open_object_cryptcompress(struct inode * inode, struct file * file);
38884 +
38885 +extern reiser4_plugin_ops cryptcompress_plugin_ops;
38886 +
38887 +#define WRITE_GRANULARITY 32
38888 +
38889 +int tail2extent(struct unix_file_info *);
38890 +int extent2tail(struct unix_file_info *);
38891 +
38892 +int goto_right_neighbor(coord_t *, lock_handle *);
38893 +int find_or_create_extent(struct page *);
38894 +int equal_to_ldk(znode *, const reiser4_key *);
38895 +
38896 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38897 +
38898 +static inline int cbk_errored(int cbk_result)
38899 +{
38900 + return (cbk_result != CBK_COORD_NOTFOUND
38901 + && cbk_result != CBK_COORD_FOUND);
38902 +}
38903 +
38904 +/* __REISER4_FILE_H__ */
38905 +#endif
38906 +
38907 +/*
38908 + * Local variables:
38909 + * c-indentation-style: "K&R"
38910 + * mode-name: "LC"
38911 + * c-basic-offset: 8
38912 + * tab-width: 8
38913 + * fill-column: 79
38914 + * scroll-step: 1
38915 + * End:
38916 +*/
38917 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/Makefile linux-2.6.22/fs/reiser4/plugin/file/Makefile
38918 --- linux-2.6.22.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
38919 +++ linux-2.6.22/fs/reiser4/plugin/file/Makefile 2007-07-29 00:25:34.928709936 +0400
38920 @@ -0,0 +1,7 @@
38921 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
38922 +
38923 +file_plugins-objs := \
38924 + file.o \
38925 + tail_conversion.o \
38926 + symlink.o \
38927 + cryptcompress.o
38928 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.22/fs/reiser4/plugin/file/symfile.c
38929 --- linux-2.6.22.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
38930 +++ linux-2.6.22/fs/reiser4/plugin/file/symfile.c 2007-07-29 00:25:34.928709936 +0400
38931 @@ -0,0 +1,87 @@
38932 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38933 +
38934 +/* Symfiles are a generalization of Unix symlinks.
38935 +
38936 + A symfile when read behaves as though you took its contents and
38937 + substituted them into the reiser4 naming system as the right hand side
38938 + of an assignment, and then read that which you had assigned to it.
38939 +
38940 + A key issue for symfiles is how to implement writes through to
38941 + subfiles. In general, one must have some method of determining what
38942 + of that which is written to the symfile is written to what subfile.
38943 + This can be done by use of custom plugin methods written by users, or
38944 + by using a few general methods we provide for those willing to endure
38945 + the insertion of delimiters into what is read.
38946 +
38947 + Writing to symfiles without delimiters to denote what is written to
38948 + what subfile is not supported by any plugins we provide in this
38949 + release. Our most sophisticated support for writes is that embodied
38950 + by the invert plugin (see invert.c).
38951 +
38952 + A read only version of the /etc/passwd file might be
38953 + constructed as a symfile whose contents are as follows:
38954 +
38955 + /etc/passwd/userlines/*
38956 +
38957 + or
38958 +
38959 + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
38960 +
38961 + or
38962 +
38963 + /etc/passwd/userlines/(demidov+edward+reiser+root)
38964 +
38965 + A symfile with contents
38966 +
38967 + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
38968 +
38969 + will return when read
38970 +
38971 + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
38972 +
38973 + and write of what has been read will not be possible to implement as
38974 + an identity operation because there are no delimiters denoting the
38975 + boundaries of what is to be written to what subfile.
38976 +
38977 + Note that one could make this a read/write symfile if one specified
38978 +   delimiters, and the write method understood that those delimiters
38979 +   delimited what was written to which subfile.
38980 +
38981 + So, specifying the symfile in a manner that allows writes:
38982 +
38983 + /etc/passwd/userlines/demidov+"(
38984 + )+/etc/passwd/userlines/edward+"(
38985 + )+/etc/passwd/userlines/reiser+"(
38986 + )+/etc/passwd/userlines/root+"(
38987 + )
38988 +
38989 + or
38990 +
38991 + /etc/passwd/userlines/(demidov+"(
38992 + )+edward+"(
38993 + )+reiser+"(
38994 + )+root+"(
38995 + ))
38996 +
38997 + and the file demidov might be specified as:
38998 +
38999 + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39000 +
39001 + or
39002 +
39003 + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39004 +
39005 + Notice that if the file demidov has a carriage return in it, the
39006 + parsing fails, but then if you put carriage returns in the wrong place
39007 + in a normal /etc/passwd file it breaks things also.
39008 +
39009 + Note that it is forbidden to have no text between two interpolations
39010 + if one wants to be able to define what parts of a write go to what
39011 + subfiles referenced in an interpolation.
39012 +
39013 + If one wants to be able to add new lines by writing to the file, one
39014 + must either write a custom plugin for /etc/passwd that knows how to
39015 + name an added line, or one must use an invert, or one must use a more
39016 + sophisticated symfile syntax that we are not planning to write for
39017 + version 4.0.
39018 +*/
39019 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.22/fs/reiser4/plugin/file/symlink.c
39020 --- linux-2.6.22.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
39021 +++ linux-2.6.22/fs/reiser4/plugin/file/symlink.c 2007-07-29 00:25:34.928709936 +0400
39022 @@ -0,0 +1,95 @@
39023 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39024 +
39025 +#include "../../inode.h"
39026 +
39027 +#include <linux/types.h>
39028 +#include <linux/fs.h>
39029 +
39030 +/* file plugin methods specific for symlink files
39031 + (SYMLINK_FILE_PLUGIN_ID) */
39032 +
39033 +/* this is implementation of create_object method of file plugin for
39034 + SYMLINK_FILE_PLUGIN_ID
39035 + */
39036 +
39037 +/**
39038 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39039 + * @symlink: inode of symlink object
39040 + * @dir: inode of parent directory
39041 + * @info: parameters of new object
39042 + *
39043 + * Inserts stat data with symlink extension into the tree.
39044 + */
39045 +int reiser4_create_symlink(struct inode *symlink,
39046 + struct inode *dir UNUSED_ARG,
39047 + reiser4_object_create_data *data /* info passed to us
39048 + * this is filled by
39049 + * reiser4() syscall
39050 + * in particular */)
39051 +{
39052 + int result;
39053 +
39054 + assert("nikita-680", symlink != NULL);
39055 + assert("nikita-681", S_ISLNK(symlink->i_mode));
39056 + assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
39057 + assert("nikita-682", dir != NULL);
39058 + assert("nikita-684", data != NULL);
39059 + assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39060 +
39061 + /*
39062 + * stat data of symlink has symlink extension in which we store
39063 + * symlink content, that is, path symlink is pointing to.
39064 + */
39065 + reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39066 +
39067 + assert("vs-838", symlink->i_private == NULL);
39068 + symlink->i_private = (void *)data->name;
39069 +
39070 + assert("vs-843", symlink->i_size == 0);
39071 + INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39072 +
39073 + /* insert stat data appended with data->name */
39074 + result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39075 + if (result) {
39076 + /* FIXME-VS: Make sure that symlink->i_private is not attached
39077 + to kmalloced data */
39078 + INODE_SET_FIELD(symlink, i_size, 0);
39079 + } else {
39080 + assert("vs-849", symlink->i_private
39081 + && reiser4_inode_get_flag(symlink,
39082 + REISER4_GENERIC_PTR_USED));
39083 + assert("vs-850",
39084 + !memcmp((char *)symlink->i_private, data->name,
39085 + (size_t) symlink->i_size + 1));
39086 + }
39087 + return result;
39088 +}
39089 +
39090 +/* this is implementation of destroy_inode method of file plugin for
39091 + SYMLINK_FILE_PLUGIN_ID
39092 + */
39093 +void destroy_inode_symlink(struct inode *inode)
39094 +{
39095 + assert("edward-799",
39096 + inode_file_plugin(inode) ==
39097 + file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39098 + assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39099 + assert("edward-801", reiser4_inode_get_flag(inode,
39100 + REISER4_GENERIC_PTR_USED));
39101 + assert("vs-839", S_ISLNK(inode->i_mode));
39102 +
39103 + kfree(inode->i_private);
39104 + inode->i_private = NULL;
39105 + reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39106 +}
39107 +
39108 +/*
39109 + Local variables:
39110 + c-indentation-style: "K&R"
39111 + mode-name: "LC"
39112 + c-basic-offset: 8
39113 + tab-width: 8
39114 + fill-column: 80
39115 + scroll-step: 1
39116 + End:
39117 +*/
39118 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.22/fs/reiser4/plugin/file/tail_conversion.c
39119 --- linux-2.6.22.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
39120 +++ linux-2.6.22/fs/reiser4/plugin/file/tail_conversion.c 2007-07-29 00:25:34.932710971 +0400
39121 @@ -0,0 +1,729 @@
39122 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39123 +
39124 +#include "../../inode.h"
39125 +#include "../../super.h"
39126 +#include "../../page_cache.h"
39127 +#include "../../carry.h"
39128 +#include "../../safe_link.h"
39129 +#include "../../vfs_ops.h"
39130 +
39131 +#include <linux/writeback.h>
39132 +
39133 +/* this file contains:
39134 + tail2extent and extent2tail */
39135 +
39136 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39137 +void get_exclusive_access(struct unix_file_info * uf_info)
39138 +{
39139 + assert("nikita-3028", reiser4_schedulable());
39140 + assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39141 + assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39142 + /*
39143 + * "deadlock avoidance": sometimes we commit a transaction under
39144 + * rw-semaphore on a file. Such commit can deadlock with another
39145 + * thread that captured some block (hence preventing atom from being
39146 + * committed) and waits on rw-semaphore.
39147 + */
39148 + reiser4_txn_restart_current();
39149 + LOCK_CNT_INC(inode_sem_w);
39150 + down_write(&uf_info->latch);
39151 + uf_info->exclusive_use = 1;
39152 + assert("vs-1713", uf_info->ea_owner == NULL);
39153 + assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39154 + ON_DEBUG(uf_info->ea_owner = current);
39155 +}
39156 +
39157 +void drop_exclusive_access(struct unix_file_info * uf_info)
39158 +{
39159 + assert("vs-1714", uf_info->ea_owner == current);
39160 + assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39161 + ON_DEBUG(uf_info->ea_owner = NULL);
39162 + uf_info->exclusive_use = 0;
39163 + up_write(&uf_info->latch);
39164 + assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39165 + assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39166 + LOCK_CNT_DEC(inode_sem_w);
39167 + reiser4_txn_restart_current();
39168 +}
39169 +
39170 +/**
39171 + * nea_grabbed - bookkeeping done when file semaphore is down_read-ed
39172 + * @uf_info: unix file specific part of the inode
39173 + *
39174 + * This is called when nonexclusive access is obtained on a file. Everything
39175 + * it does is for debugging purposes only.
39176 + */
39177 +static void nea_grabbed(struct unix_file_info *uf_info)
39178 +{
39179 +#if REISER4_DEBUG
39180 + LOCK_CNT_INC(inode_sem_r);
39181 + assert("vs-1716", uf_info->ea_owner == NULL);
39182 + atomic_inc(&uf_info->nr_neas);
39183 + uf_info->last_reader = current;
39184 +#endif
39185 +}
39186 +
39187 +/**
39188 + * get_nonexclusive_access - get nonexclusive access to a file
39189 + * @uf_info: unix file specific part of inode to obtain access to
39190 + *
39191 + * Nonexclusive access is obtained on a file before read, write, readpage.
39192 + */
39193 +void get_nonexclusive_access(struct unix_file_info *uf_info)
39194 +{
39195 + assert("nikita-3029", reiser4_schedulable());
39196 + assert("nikita-3361", get_current_context()->trans->atom == NULL);
39197 +
39198 + down_read(&uf_info->latch);
39199 + nea_grabbed(uf_info);
39200 +}
39201 +
39202 +/**
39203 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39204 + * @uf_info: unix file specific part of inode to obtain access to
39205 + *
39206 + * Non-blocking version of nonexclusive access obtaining.
39207 + */
39208 +int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
39209 +{
39210 + int result;
39211 +
39212 + result = down_read_trylock(&uf_info->latch);
39213 + if (result)
39214 + nea_grabbed(uf_info);
39215 + return result;
39216 +}
39217 +
39218 +void drop_nonexclusive_access(struct unix_file_info * uf_info)
39219 +{
39220 + assert("vs-1718", uf_info->ea_owner == NULL);
39221 + assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
39222 + ON_DEBUG(atomic_dec(&uf_info->nr_neas));
39223 +
39224 + up_read(&uf_info->latch);
39225 +
39226 + LOCK_CNT_DEC(inode_sem_r);
39227 + reiser4_txn_restart_current();
39228 +}
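+
+/*
+ * A minimal usage sketch of the access protocol above (illustrative only;
+ * the real callers live elsewhere in the unix-file plugin):
+ *
+ *	get_exclusive_access(uf_info);      <- file state change
+ *	tail2extent(uf_info);
+ *	drop_exclusive_access(uf_info);
+ *
+ *	get_nonexclusive_access(uf_info);   <- read, write, readpage
+ *	...access file body...
+ *	drop_nonexclusive_access(uf_info);
+ */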
39229 +
39230 +/* part of tail2extent. Cut all items covering @count bytes starting from
39231 + @offset */
39232 +/* Audited by: green(2002.06.15) */
39233 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
39234 +{
39235 + reiser4_key from, to;
39236 +
39237 + /* AUDIT: How about putting an assertion here, what would check
39238 + all provided range is covered by tail items only? */
39239 + /* key of first byte in the range to be cut */
39240 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39241 +
39242 + /* key of last byte in that range */
39243 + to = from;
39244 + set_key_offset(&to, (__u64) (offset + count - 1));
39245 +
39246 + /* cut everything between those keys */
39247 + return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
39248 + inode, 0);
39249 +}
39250 +
39251 +static void release_all_pages(struct page **pages, unsigned nr_pages)
39252 +{
39253 + unsigned i;
39254 +
39255 + for (i = 0; i < nr_pages; i++) {
39256 + if (pages[i] == NULL) {
39257 + unsigned j;
39258 + for (j = i + 1; j < nr_pages; j++)
39259 + assert("vs-1620", pages[j] == NULL);
39260 + break;
39261 + }
39262 + page_cache_release(pages[i]);
39263 + pages[i] = NULL;
39264 + }
39265 +}
39266 +
39267 +/* part of tail2extent. Replace tail items with an extent item. The content
39268 + of the tail items being cut (@count bytes) has already been copied into the
39269 + pages. find_or_create_extent() is called to create extents corresponding to
39270 + those pages */
39271 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
39272 +{
39273 + int result;
39274 + unsigned i;
39275 + STORE_COUNTERS;
39276 +
39277 + if (nr_pages == 0)
39278 + return 0;
39279 +
39280 + assert("vs-596", pages[0]);
39281 +
39282 + /* cut copied items */
39283 + result = cut_formatting_items(inode, page_offset(pages[0]), count);
39284 + if (result)
39285 + return result;
39286 +
39287 + CHECK_COUNTERS;
39288 +
39289 + /* put into tree replacement for just removed items: extent item, namely */
39290 + for (i = 0; i < nr_pages; i++) {
39291 + result = add_to_page_cache_lru(pages[i], inode->i_mapping,
39292 + pages[i]->index,
39293 + mapping_gfp_mask(inode->
39294 + i_mapping));
39295 + if (result)
39296 + break;
39297 + unlock_page(pages[i]);
39298 + result = find_or_create_extent(pages[i]);
39299 + if (result)
39300 + break;
39301 + SetPageUptodate(pages[i]);
39302 + }
39303 + return result;
39304 +}
39305 +
39306 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
39307 + * items */
39308 +
39309 +static int reserve_tail2extent_iteration(struct inode *inode)
39310 +{
39311 + reiser4_block_nr unformatted_nodes;
39312 + reiser4_tree *tree;
39313 +
39314 + tree = reiser4_tree_by_inode(inode);
39315 +
39316 + /* number of unformatted nodes which will be created */
39317 + unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
39318 +
39319 + /*
39320 + * space required for one iteration of tail->extent conversion:
39321 + *
39322 + * 1. kill N tail items
39323 + *
39324 + * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
39325 + *
39326 + * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
39327 + * extents) extent units.
39328 + *
39329 + * 4. drilling to the leaf level by coord_by_key()
39330 + *
39331 + * 5. possible update of stat-data
39332 + *
39333 + */
39334 + grab_space_enable();
39335 + return reiser4_grab_space
39336 + (2 * tree->height +
39337 + TAIL2EXTENT_PAGE_NUM +
39338 + TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
39339 + 1 + estimate_one_insert_item(tree) +
39340 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39341 +}
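+
+/*
+ * Worked example of the reservation above (numbers hypothetical): with
+ * tree->height == 4 and TAIL2EXTENT_PAGE_NUM == 3 the grab is
+ * 2*4 + 3 + 3*I + 1 + S + U blocks, where I is
+ * estimate_one_insert_into_item(tree), S is estimate_one_insert_item(tree)
+ * and U is the stat-data update estimate for the inode.
+ */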
39342 +
39343 +/* clear the stat-data flag indicating that a conversion is in progress */
39344 +static int complete_conversion(struct inode *inode)
39345 +{
39346 + int result;
39347 +
39348 + grab_space_enable();
39349 + result =
39350 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39351 + BA_CAN_COMMIT);
39352 + if (result == 0) {
39353 + reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
39354 + result = reiser4_update_sd(inode);
39355 + }
39356 + if (result)
39357 + warning("vs-1696", "Failed to clear converting bit of %llu: %i",
39358 + (unsigned long long)get_inode_oid(inode), result);
39359 + return 0;
39360 +}
39361 +
39362 +/**
39363 + * find_start
39364 + * @inode: inode of the file being converted
39365 + * @id: item plugin id to look for
39366 + * @offset: returns the offset at which the previous conversion stopped
39367 + *
39368 + * this is used by tail2extent and extent2tail to detect where a previous
39369 + * unfinished conversion stopped
39370 + */
39371 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
39372 +{
39373 + int result;
39374 + lock_handle lh;
39375 + coord_t coord;
39376 + struct unix_file_info *ufo;
39377 + int found;
39378 + reiser4_key key;
39379 +
39380 + ufo = unix_file_inode_data(inode);
39381 + init_lh(&lh);
39382 + result = 0;
39383 + found = 0;
39384 + inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
39385 + do {
39386 + init_lh(&lh);
39387 + result = find_file_item_nohint(&coord, &lh, &key,
39388 + ZNODE_READ_LOCK, inode);
39389 +
39390 + if (result == CBK_COORD_FOUND) {
39391 + if (coord.between == AT_UNIT) {
39392 + /*coord_clear_iplug(&coord); */
39393 + result = zload(coord.node);
39394 + if (result == 0) {
39395 + if (item_id_by_coord(&coord) == id)
39396 + found = 1;
39397 + else
39398 + item_plugin_by_coord(&coord)->s.
39399 + file.append_key(&coord,
39400 + &key);
39401 + zrelse(coord.node);
39402 + }
39403 + } else
39404 + result = RETERR(-ENOENT);
39405 + }
39406 + done_lh(&lh);
39407 + } while (result == 0 && !found);
39408 + *offset = get_key_offset(&key);
39409 + return result;
39410 +}
39411 +
39412 +/**
39413 + * tail2extent
39414 + * @uf_info: unix file specific part of the inode
39415 + *
39416 + * Converts a file built of formatting (tail) items into one built of extents.
39417 + */
39418 +int tail2extent(struct unix_file_info *uf_info)
39419 +{
39420 + int result;
39421 + reiser4_key key; /* key of next byte to be moved to page */
39422 + char *p_data; /* data of page */
39423 + unsigned page_off = 0, /* offset within the page where to copy data */
39424 + count; /* number of bytes of item which can be
39425 + * copied to page */
39426 + struct page *pages[TAIL2EXTENT_PAGE_NUM];
39427 + struct page *page;
39428 + int done; /* set to 1 when all file is read */
39429 + char *item;
39430 + int i;
39431 + struct inode *inode;
39432 + int first_iteration;
39433 + int bytes;
39434 + __u64 offset;
39435 +
39436 + assert("nikita-3362", ea_obtained(uf_info));
39437 + inode = unix_file_info_to_inode(uf_info);
39438 + assert("nikita-3412", !IS_RDONLY(inode));
39439 + assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
39440 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
39441 +
39442 + offset = 0;
39443 + first_iteration = 1;
39444 + result = 0;
39445 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
39446 + /*
39447 + * the file is marked on disk as having a conversion which did
39448 + * not complete due to either a crash or some error. Find the
39449 + * offset at which the tail conversion stopped
39450 + */
39451 + result = find_start(inode, FORMATTING_ID, &offset);
39452 + if (result == -ENOENT) {
39453 + /* no tail items found, everything is converted */
39454 + uf_info->container = UF_CONTAINER_EXTENTS;
39455 + complete_conversion(inode);
39456 + return 0;
39457 + } else if (result != 0)
39458 + /* some other error */
39459 + return result;
39460 + first_iteration = 0;
39461 + }
39462 +
39463 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
39464 +
39465 + /* get key of first byte of a file */
39466 + inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
39467 +
39468 + done = 0;
39469 + while (done == 0) {
39470 + memset(pages, 0, sizeof(pages));
39471 + result = reserve_tail2extent_iteration(inode);
39472 + if (result != 0)
39473 + goto out;
39474 + if (first_iteration) {
39475 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
39476 + reiser4_update_sd(inode);
39477 + first_iteration = 0;
39478 + }
39479 + bytes = 0;
39480 + for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
39481 + assert("vs-598",
39482 + (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
39483 + page = alloc_page(reiser4_ctx_gfp_mask_get());
39484 + if (!page) {
39485 + result = RETERR(-ENOMEM);
39486 + goto error;
39487 + }
39488 +
39489 + page->index =
39490 + (unsigned long)(get_key_offset(&key) >>
39491 + PAGE_CACHE_SHIFT);
39492 + /*
39493 + * usually, one who is going to longterm lock a znode (as
39494 + * find_file_item does, for instance) must not hold
39495 + * locked pages. However, there is an exception for
39496 + * the tail2extent case. Pages appearing here are not
39497 + * reachable by anyone else, they are clean, they do
39498 + * not have jnodes attached, so keeping them locked does
39499 + * not risk a deadlock
39500 + */
39501 + assert("vs-983", !PagePrivate(page));
39502 + reiser4_invalidate_pages(inode->i_mapping, page->index,
39503 + 1, 0);
39504 +
39505 + for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
39506 + coord_t coord;
39507 + lock_handle lh;
39508 +
39509 + /* get next item */
39510 + /* FIXME: we might want to readahead here */
39511 + init_lh(&lh);
39512 + result =
39513 + find_file_item_nohint(&coord, &lh, &key,
39514 + ZNODE_READ_LOCK,
39515 + inode);
39516 + if (result != CBK_COORD_FOUND) {
39517 + /*
39518 + * an error happened or no items of the file
39519 + * were found
39520 + */
39521 + done_lh(&lh);
39522 + page_cache_release(page);
39523 + goto error;
39524 + }
39525 +
39526 + if (coord.between == AFTER_UNIT) {
39527 + /*
39528 + * end of file is reached. Pad the page
39529 + * with zeros
39530 + */
39531 + done_lh(&lh);
39532 + done = 1;
39533 + p_data = kmap_atomic(page, KM_USER0);
39534 + memset(p_data + page_off, 0,
39535 + PAGE_CACHE_SIZE - page_off);
39536 + kunmap_atomic(p_data, KM_USER0);
39537 + break;
39538 + }
39539 +
39540 + result = zload(coord.node);
39541 + if (result) {
39542 + page_cache_release(page);
39543 + done_lh(&lh);
39544 + goto error;
39545 + }
39546 + assert("vs-856", coord.between == AT_UNIT);
39547 + item = ((char *)item_body_by_coord(&coord)) +
39548 + coord.unit_pos;
39549 +
39550 + /* how many bytes to copy */
39551 + count =
39552 + item_length_by_coord(&coord) -
39553 + coord.unit_pos;
39554 + /* limit length of copy to end of page */
39555 + if (count > PAGE_CACHE_SIZE - page_off)
39556 + count = PAGE_CACHE_SIZE - page_off;
39557 +
39558 + /*
39559 + * copy item (as much as will fit starting from
39560 + * the beginning of the item) into the page
39561 + */
39562 + p_data = kmap_atomic(page, KM_USER0);
39563 + memcpy(p_data + page_off, item, count);
39564 + kunmap_atomic(p_data, KM_USER0);
39565 +
39566 + page_off += count;
39567 + bytes += count;
39568 + set_key_offset(&key,
39569 + get_key_offset(&key) + count);
39570 +
39571 + zrelse(coord.node);
39572 + done_lh(&lh);
39573 + } /* end of loop which fills one page with the content
39574 + * of formatting items */
39575 +
39576 + if (page_off) {
39577 + /* something was copied into page */
39578 + pages[i] = page;
39579 + } else {
39580 + page_cache_release(page);
39581 + assert("vs-1648", done == 1);
39582 + break;
39583 + }
39584 + } /* end of loop through pages of one conversion iteration */
39585 +
39586 + if (i > 0) {
39587 + result = replace(inode, pages, i, bytes);
39588 + release_all_pages(pages, sizeof_array(pages));
39589 + if (result)
39590 + goto error;
39591 + /*
39592 + * We have to drop exclusive access to avoid a deadlock
39593 + * which may happen because capture_unix_file, called by
39594 + * reiser4_writepages, requires non-exclusive
39595 + * access to the file. It is safe to drop EA in the middle
39596 + * of tail2extent conversion because write_unix_file,
39597 + * setattr_unix_file(truncate), mmap_unix_file and
39598 + * release_unix_file(extent2tail) check whether a conversion
39599 + * is in progress (see comments before
39600 + * get_exclusive_access_careful()).
39601 + * Other processes that acquire non-exclusive access
39602 + * (read_unix_file, reiser4_writepages, etc) should work
39603 + * on partially converted files.
39604 + */
39605 + drop_exclusive_access(uf_info);
39606 + /* throttle the conversion */
39607 + reiser4_throttle_write(inode);
39608 + get_exclusive_access(uf_info);
39609 +
39610 + /*
39611 + * nobody is allowed to complete conversion but a
39612 + * process which started it
39613 + */
39614 + assert("", reiser4_inode_get_flag(inode,
39615 + REISER4_PART_MIXED));
39616 + }
39617 + }
39618 +
39619 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
39620 +
39621 + if (result == 0) {
39622 + /* file is converted to extent items */
39623 + assert("vs-1697", reiser4_inode_get_flag(inode,
39624 + REISER4_PART_MIXED));
39625 +
39626 + uf_info->container = UF_CONTAINER_EXTENTS;
39627 + complete_conversion(inode);
39628 + } else {
39629 + /*
39630 + * conversion is not complete. Inode was already marked as
39631 + * REISER4_PART_MIXED and stat-data were updated at the first
39632 + * iteration of the loop above.
39633 + */
39634 + error:
39635 + release_all_pages(pages, sizeof_array(pages));
39636 + warning("nikita-2282", "Partial conversion of %llu: %i",
39637 + (unsigned long long)get_inode_oid(inode), result);
39638 + }
39639 +
39640 + out:
39641 + return result;
39642 +}
39643 +
39644 +static int reserve_extent2tail_iteration(struct inode *inode)
39645 +{
39646 + reiser4_tree *tree;
39647 +
39648 + tree = reiser4_tree_by_inode(inode);
39649 + /*
39650 + * reserve blocks for (in this order):
39651 + *
39652 + * 1. removal of extent item
39653 + *
39654 + * 2. insertion of tail by insert_flow()
39655 + *
39656 + * 3. drilling to the leaf level by coord_by_key()
39657 + *
39658 + * 4. possible update of stat-data
39659 + */
39660 + grab_space_enable();
39661 + return reiser4_grab_space
39662 + (estimate_one_item_removal(tree) +
39663 + estimate_insert_flow(tree->height) +
39664 + 1 + estimate_one_insert_item(tree) +
39665 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39666 +}
39667 +
39668 +/* for every page of the file: read the page, cut the part of the extent
39669 + pointing to this page, put the data of the page into the tree as a tail item */
39670 +int extent2tail(struct unix_file_info *uf_info)
39671 +{
39672 + int result;
39673 + struct inode *inode;
39674 + struct page *page;
39675 + unsigned long num_pages, i;
39676 + unsigned long start_page;
39677 + reiser4_key from;
39678 + reiser4_key to;
39679 + unsigned count;
39680 + __u64 offset;
39681 +
39682 + assert("nikita-3362", ea_obtained(uf_info));
39683 + inode = unix_file_info_to_inode(uf_info);
39684 + assert("nikita-3412", !IS_RDONLY(inode));
39685 + assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
39686 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
39687 +
39688 + offset = 0;
39689 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
39690 + /*
39691 + * the file is marked on disk as having a conversion which did
39692 + * not complete due to either a crash or some error. Find the
39693 + * offset at which the conversion stopped
39694 + */
39695 + result = find_start(inode, EXTENT_POINTER_ID, &offset);
39696 + if (result == -ENOENT) {
39697 + /* no extent found, everything is converted */
39698 + uf_info->container = UF_CONTAINER_TAILS;
39699 + complete_conversion(inode);
39700 + return 0;
39701 + } else if (result != 0)
39702 + /* some other error */
39703 + return result;
39704 + }
39705 +
39706 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
39707 +
39708 + /* number of pages in the file */
39709 + num_pages =
39710 + (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
39711 + start_page = offset >> PAGE_CACHE_SHIFT;
39712 +
39713 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39714 + to = from;
39715 +
39716 + result = 0;
39717 + for (i = 0; i < num_pages; i++) {
39718 + __u64 start_byte;
39719 +
39720 + result = reserve_extent2tail_iteration(inode);
39721 + if (result != 0)
39722 + break;
39723 + if (i == 0 && offset == 0) {
39724 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
39725 + reiser4_update_sd(inode);
39726 + }
39727 +
39728 + page = read_mapping_page(inode->i_mapping,
39729 + (unsigned)(i + start_page), NULL);
39730 + if (IS_ERR(page)) {
39731 + result = PTR_ERR(page);
39732 + break;
39733 + }
39734 +
39735 + wait_on_page_locked(page);
39736 +
39737 + if (!PageUptodate(page)) {
39738 + page_cache_release(page);
39739 + result = RETERR(-EIO);
39740 + break;
39741 + }
39742 +
39743 + /* cut part of file we have read */
39744 + start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
39745 + set_key_offset(&from, start_byte);
39746 + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
39747 + /*
39748 + * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
39749 + * commits during over-long truncates. But
39750 + * extent->tail conversion should be performed in one
39751 + * transaction.
39752 + */
39753 + result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
39754 + &to, inode, 0);
39755 +
39756 + if (result) {
39757 + page_cache_release(page);
39758 + break;
39759 + }
39760 +
39761 + /* put page data into tree via tail_write */
39762 + count = PAGE_CACHE_SIZE;
39763 + if ((i == (num_pages - 1)) &&
39764 + (inode->i_size & ~PAGE_CACHE_MASK))
39765 + /* the last page can be incomplete */
39766 + count = (inode->i_size & ~PAGE_CACHE_MASK);
39767 + while (count) {
39768 + struct dentry dentry;
39769 + struct file file;
39770 + loff_t pos;
39771 +
39772 + dentry.d_inode = inode;
39773 + file.f_dentry = &dentry;
39774 + file.private_data = NULL;
39775 + file.f_pos = start_byte;
39777 + pos = start_byte;
39778 + result = reiser4_write_tail(&file,
39779 + (char __user *)kmap(page),
39780 + count, &pos);
39781 + reiser4_free_file_fsdata(&file);
39782 + if (result <= 0) {
39783 + warning("", "reiser4_write_tail failed");
39784 + page_cache_release(page);
39785 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
39786 + return result;
39787 + }
39788 + count -= result;
39789 + }
39790 +
39791 + /* release page */
39792 + lock_page(page);
39793 + /* page is already detached from jnode and mapping. */
39794 + assert("vs-1086", page->mapping == NULL);
39795 + assert("nikita-2690",
39796 + (!PagePrivate(page) && jprivate(page) == 0));
39797 + /* waiting for writeback completion with page lock held is
39798 + * perfectly valid. */
39799 + wait_on_page_writeback(page);
39800 + reiser4_drop_page(page);
39801 + /* release reference taken by read_cache_page() above */
39802 + page_cache_release(page);
39803 +
39804 + drop_exclusive_access(uf_info);
39805 + /* throttle the conversion */
39806 + reiser4_throttle_write(inode);
39807 + get_exclusive_access(uf_info);
39808 + /*
39809 + * nobody is allowed to complete conversion but a process which
39810 + * started it
39811 + */
39812 + assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
39813 + }
39814 +
39815 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
39816 +
39817 + if (i == num_pages) {
39818 + /* file is converted to formatted items */
39819 + assert("vs-1698", reiser4_inode_get_flag(inode,
39820 + REISER4_PART_MIXED));
39821 + assert("vs-1260",
39822 + inode_has_no_jnodes(reiser4_inode_data(inode)));
39823 +
39824 + uf_info->container = UF_CONTAINER_TAILS;
39825 + complete_conversion(inode);
39826 + return 0;
39827 + }
39828 + /*
39829 + * conversion is not complete. Inode was already marked as
39830 + * REISER4_PART_MIXED and stat-data were updated at the first
39831 + * iteration of the loop above.
39832 + */
39833 + warning("nikita-2282",
39834 + "Partial conversion of %llu: %lu of %lu: %i",
39835 + (unsigned long long)get_inode_oid(inode), i,
39836 + num_pages, result);
39837 +
39838 + return result;
39839 +}
39840 +
39841 +/*
39842 + * Local variables:
39843 + * c-indentation-style: "K&R"
39844 + * mode-name: "LC"
39845 + * c-basic-offset: 8
39846 + * tab-width: 8
39847 + * fill-column: 79
39848 + * scroll-step: 1
39849 + * End:
39850 + */
39851 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file_ops.c linux-2.6.22/fs/reiser4/plugin/file_ops.c
39852 --- linux-2.6.22.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300
39853 +++ linux-2.6.22/fs/reiser4/plugin/file_ops.c 2007-07-29 00:25:34.932710971 +0400
39854 @@ -0,0 +1,168 @@
39855 +/* Copyright 2005 by Hans Reiser, licensing governed by
39856 + reiser4/README */
39857 +
39858 +/* this file contains typical implementations for some of the methods of
39859 + struct file_operations and of struct address_space_operations
39860 +*/
39861 +
39862 +#include "../inode.h"
39863 +#include "object.h"
39864 +
39865 +/* file operations */
39866 +
39867 +/* implementation of vfs's llseek method of struct file_operations for
39868 + typical directory can be found in file_ops_readdir.c
39869 +*/
39870 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
39871 +
39872 +/* implementation of vfs's readdir method of struct file_operations for
39873 + typical directory can be found in file_ops_readdir.c
39874 +*/
39875 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
39876 +
39877 +/**
39878 + * reiser4_release_dir_common - release method of struct file_operations
39879 + * @inode: inode of released file
39880 + * @file: file to release
39881 + *
39882 + * Implementation of release method of struct file_operations for typical
39883 + * directory. All it does is free reiser4-specific file data.
39884 +*/
39885 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
39886 +{
39887 + reiser4_context *ctx;
39888 +
39889 + ctx = reiser4_init_context(inode->i_sb);
39890 + if (IS_ERR(ctx))
39891 + return PTR_ERR(ctx);
39892 + reiser4_free_file_fsdata(file);
39893 + reiser4_exit_context(ctx);
39894 + return 0;
39895 +}
39896 +
39897 +/* this is common implementation of vfs's fsync method of struct
39898 + file_operations
39899 +*/
39900 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
39901 +{
39902 + reiser4_context *ctx;
39903 + int result;
39904 +
39905 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
39906 + if (IS_ERR(ctx))
39907 + return PTR_ERR(ctx);
39908 + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
39909 +
39910 + context_set_commit_async(ctx);
39911 + reiser4_exit_context(ctx);
39912 + return result;
39913 +}
39914 +
39915 +/* this is common implementation of vfs's sendfile method of struct
39916 + file_operations
39917 +
39918 + Reads @count bytes from @file and calls @actor for every page read. This is
39919 + needed for loopback device support.
39920 +*/
39921 +#if 0
39922 +ssize_t
39923 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
39924 + read_actor_t actor, void *target)
39925 +{
39926 + reiser4_context *ctx;
39927 + ssize_t result;
39928 +
39929 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
39930 + if (IS_ERR(ctx))
39931 + return PTR_ERR(ctx);
39932 + result = generic_file_sendfile(file, ppos, count, actor, target);
39933 + reiser4_exit_context(ctx);
39934 + return result;
39935 +}
39936 +#endif /* 0 */
39937 +
39938 +/* address space operations */
39939 +
39940 +/* this is common implementation of vfs's prepare_write method of struct
39941 + address_space_operations
39942 +*/
39943 +int
39944 +prepare_write_common(struct file *file, struct page *page, unsigned from,
39945 + unsigned to)
39946 +{
39947 + reiser4_context *ctx;
39948 + int result;
39949 +
39950 + ctx = reiser4_init_context(page->mapping->host->i_sb);
39951 + result = do_prepare_write(file, page, from, to);
39952 +
39953 + /* don't commit transaction under inode semaphore */
39954 + context_set_commit_async(ctx);
39955 + reiser4_exit_context(ctx);
39956 +
39957 + return result;
39958 +}
39959 +
39960 +/* this is helper for prepare_write_common and prepare_write_unix_file
39961 + */
39962 +int
39963 +do_prepare_write(struct file *file, struct page *page, unsigned from,
39964 + unsigned to)
39965 +{
39966 + int result;
39967 + file_plugin *fplug;
39968 + struct inode *inode;
39969 +
39970 + assert("umka-3099", file != NULL);
39971 + assert("umka-3100", page != NULL);
39972 + assert("umka-3095", PageLocked(page));
39973 +
39974 + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
39975 + return 0;
39976 +
39977 + inode = page->mapping->host;
39978 + fplug = inode_file_plugin(inode);
39979 +
39980 + if (page->mapping->a_ops->readpage == NULL)
39981 + return RETERR(-EINVAL);
39982 +
39983 + result = page->mapping->a_ops->readpage(file, page);
39984 + if (result != 0) {
39985 + SetPageError(page);
39986 + ClearPageUptodate(page);
39987 + /* All reiser4 readpage() implementations should return the
39988 + * page locked in case of error. */
39989 + assert("nikita-3472", PageLocked(page));
39990 + } else {
39991 + /*
39992 + * ->readpage() either:
39993 + *
39994 + * 1. starts IO against @page. @page is locked for IO in
39995 + * this case.
39996 + *
39997 + * 2. doesn't start IO. @page is unlocked.
39998 + *
39999 + * In either case, page should be locked.
40000 + */
40001 + lock_page(page);
40002 + /*
40003 + * IO (if any) is completed at this point. Check for IO
40004 + * errors.
40005 + */
40006 + if (!PageUptodate(page))
40007 + result = RETERR(-EIO);
40008 + }
40009 + assert("umka-3098", PageLocked(page));
40010 + return result;
40011 +}
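+
+/*
+ * The contract assumed by callers of do_prepare_write(), in short: it returns
+ * with @page still locked; 0 means the write may proceed, i.e. either the
+ * whole page is about to be overwritten or the page has been read and is
+ * uptodate, so a partial write will not clobber valid data, e.g.:
+ *
+ *	result = do_prepare_write(file, page, from, to);
+ *	if (result == 0)
+ *		...safe to copy bytes [from, to) into the page...
+ */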
40012 +
40013 +/*
40014 + * Local variables:
40015 + * c-indentation-style: "K&R"
40016 + * mode-name: "LC"
40017 + * c-basic-offset: 8
40018 + * tab-width: 8
40019 + * fill-column: 79
40020 + * scroll-step: 1
40021 + * End:
40022 + */
40023 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.22/fs/reiser4/plugin/file_ops_readdir.c
40024 --- linux-2.6.22.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300
40025 +++ linux-2.6.22/fs/reiser4/plugin/file_ops_readdir.c 2007-07-29 00:25:34.932710971 +0400
40026 @@ -0,0 +1,658 @@
40027 +/* Copyright 2005 by Hans Reiser, licensing governed by
40028 + * reiser4/README */
40029 +
40030 +#include "../inode.h"
40031 +
40032 +/* return true iff @coord points to a valid directory item that is part of
40033 + * @inode directory. */
40034 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40035 +{
40036 + return plugin_of_group(item_plugin_by_coord(coord),
40037 + DIR_ENTRY_ITEM_TYPE) &&
40038 + inode_file_plugin(inode)->owns_item(inode, coord);
40039 +}
40040 +
40041 +/* compare two logical positions within the same directory */
40042 +static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2)
40043 +{
40044 + cmp_t result;
40045 +
40046 + assert("nikita-2534", p1 != NULL);
40047 + assert("nikita-2535", p2 != NULL);
40048 +
40049 + result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40050 + if (result == EQUAL_TO) {
40051 + int diff;
40052 +
40053 + diff = p1->pos - p2->pos;
40054 + result =
40055 + (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40056 + }
40057 + return result;
40058 +}
40059 +
40060 +/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
40061 + * necessary. */
40062 +static void
40063 +adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot,
40064 + const struct dir_pos * mod_point, int adj)
40065 +{
40066 + struct dir_pos *pos;
40067 +
40068 + /*
40069 + * new directory entry was added (adj == +1) or removed (adj == -1) at
40070 + * the @mod_point. Directory file descriptor @dir is doing readdir and
40071 + * is currently positioned at @readdir_spot. Latter has to be updated
40072 + * to maintain stable readdir.
40073 + */
40074 + /* directory is positioned to the beginning. */
40075 + if (readdir_spot->entry_no == 0)
40076 + return;
40077 +
40078 + pos = &readdir_spot->position;
40079 + switch (dir_pos_cmp(mod_point, pos)) {
40080 + case LESS_THAN:
40081 + /* @mod_point is _before_ @readdir_spot, that is, entry was
40082 + * added/removed on the left (in key order) of current
40083 + * position. */
40084 + /* logical number of directory entry readdir is "looking" at
40085 + * changes */
40086 + readdir_spot->entry_no += adj;
40087 + assert("nikita-2577",
40088 + ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
40089 + if (de_id_cmp(&pos->dir_entry_key,
40090 + &mod_point->dir_entry_key) == EQUAL_TO) {
40091 + assert("nikita-2575", mod_point->pos < pos->pos);
40092 + /*
40093 + * if entry added/removed has the same key as current
40094 + * for readdir, update counter of duplicate keys in
40095 + * @readdir_spot.
40096 + */
40097 + pos->pos += adj;
40098 + }
40099 + break;
40100 + case GREATER_THAN:
40101 + /* directory is modified after @pos: nothing to do. */
40102 + break;
40103 + case EQUAL_TO:
40104 + /* cannot insert an entry readdir is looking at, because it
40105 + already exists. */
40106 + assert("nikita-2576", adj < 0);
40107 + /* directory entry to which @pos points to is being
40108 + removed.
40109 +
40110 + NOTE-NIKITA: Right thing to do is to update @pos to point
40111 + to the next entry. This is complex (we are under spin-lock
40112 + for one thing). Just rewind it to the beginning. Next
40113 + readdir will have to scan the beginning of
40114 + directory. Proper solution is to use semaphore in
40115 + spin lock's stead and use rewind_right() here.
40116 +
40117 + NOTE-NIKITA: now, semaphore is used, so...
40118 + */
40119 + memset(readdir_spot, 0, sizeof *readdir_spot);
40120 + }
40121 +}
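+
+/*
+ * Example with hypothetical numbers: a descriptor doing readdir sits at
+ * entry_no == 10. Insertion of a name that sorts before the current position
+ * (adj == +1) bumps entry_no to 11, so the next getdents(2) continues at the
+ * same name; a removal to the left (adj == -1) symmetrically gives 9.
+ * Modifications to the right of the position change nothing.
+ */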
40122 +
40123 +/* scan all file-descriptors for this directory and adjust their
40124 + positions respectively. Should be used by implementations of
40125 + add_entry and rem_entry of dir plugin */
40126 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
40127 + int offset, int adj)
40128 +{
40129 + reiser4_file_fsdata *scan;
40130 + struct dir_pos mod_point;
40131 +
40132 + assert("nikita-2536", dir != NULL);
40133 + assert("nikita-2538", de != NULL);
40134 + assert("nikita-2539", adj != 0);
40135 +
40136 + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40137 + mod_point.pos = offset;
40138 +
40139 + spin_lock_inode(dir);
40140 +
40141 + /*
40142 + * new entry was added/removed in directory @dir. Scan all file
40143 + * descriptors for @dir that are currently involved into @readdir and
40144 + * update them.
40145 + */
40146 +
40147 + list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40148 + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40149 +
40150 + spin_unlock_inode(dir);
40151 +}
40152 +
40153 +/*
40154 + * traverse tree to start/continue readdir from the readdir position @pos.
40155 + */
40156 +static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap)
40157 +{
40158 + reiser4_key key;
40159 + int result;
40160 + struct inode *inode;
40161 +
40162 + assert("nikita-2554", pos != NULL);
40163 +
40164 + inode = dir->f_dentry->d_inode;
40165 + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40166 + if (result != 0)
40167 + return result;
40168 + result = reiser4_object_lookup(inode,
40169 + &key,
40170 + tap->coord,
40171 + tap->lh,
40172 + tap->mode,
40173 + FIND_EXACT,
40174 + LEAF_LEVEL, LEAF_LEVEL,
40175 + 0, &tap->ra_info);
40176 + if (result == CBK_COORD_FOUND)
40177 + result = rewind_right(tap, (int)pos->position.pos);
40178 + else {
40179 + tap->coord->node = NULL;
40180 + done_lh(tap->lh);
40181 + result = RETERR(-EIO);
40182 + }
40183 + return result;
40184 +}
40185 +
40186 +/*
40187 + * handling of non-unique keys: calculate at what ordinal position within
40188 + * sequence of directory items with identical keys @pos is.
40189 + */
40190 +static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap)
40191 +{
40192 + int result;
40193 + coord_t coord;
40194 + lock_handle lh;
40195 + tap_t scan;
40196 + de_id *did;
40197 + reiser4_key de_key;
40198 +
40199 + coord_init_zero(&coord);
40200 + init_lh(&lh);
40201 + reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40202 + reiser4_tap_copy(&scan, tap);
40203 + reiser4_tap_load(&scan);
40204 + pos->position.pos = 0;
40205 +
40206 + did = &pos->position.dir_entry_key;
40207 +
40208 + if (is_valid_dir_coord(inode, scan.coord)) {
40209 +
40210 + build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40211 +
40212 + while (1) {
40213 +
40214 + result = go_prev_unit(&scan);
40215 + if (result != 0)
40216 + break;
40217 +
40218 + if (!is_valid_dir_coord(inode, scan.coord)) {
40219 + result = -EINVAL;
40220 + break;
40221 + }
40222 +
40223 + /* get key of directory entry */
40224 + unit_key_by_coord(scan.coord, &de_key);
40225 + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
40226 + /* duplicate-sequence is over */
40227 + break;
40228 + }
40229 + pos->position.pos++;
40230 + }
40231 + } else
40232 + result = RETERR(-ENOENT);
40233 + reiser4_tap_relse(&scan);
40234 + reiser4_tap_done(&scan);
40235 + return result;
40236 +}
40237 +
40238 +/*
40239 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
40240 + */
40241 +static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap)
40242 +{
40243 + __u64 destination;
40244 + __s64 shift;
40245 + int result;
40246 + struct inode *inode;
40247 + loff_t dirpos;
40248 +
40249 + assert("nikita-2553", dir != NULL);
40250 + assert("nikita-2548", pos != NULL);
40251 + assert("nikita-2551", tap->coord != NULL);
40252 + assert("nikita-2552", tap->lh != NULL);
40253 +
40254 + dirpos = reiser4_get_dir_fpos(dir);
40255 + shift = dirpos - pos->fpos;
40256 + /* this is logical directory entry within @dir which we are rewinding
40257 + * to */
40258 + destination = pos->entry_no + shift;
40259 +
40260 + inode = dir->f_dentry->d_inode;
40261 + if (dirpos < 0)
40262 + return RETERR(-EINVAL);
40263 + else if (destination == 0ll || dirpos == 0) {
40264 + /* rewind to the beginning of directory */
40265 + memset(pos, 0, sizeof *pos);
40266 + return dir_go_to(dir, pos, tap);
40267 + } else if (destination >= inode->i_size)
40268 + return RETERR(-ENOENT);
40269 +
40270 + if (shift < 0) {
40271 + /* I am afraid of negative numbers */
40272 + shift = -shift;
40273 + /* rewinding to the left */
40274 + if (shift <= (int)pos->position.pos) {
40275 + /* destination is within sequence of entries with
40276 + duplicate keys. */
40277 + result = dir_go_to(dir, pos, tap);
40278 + } else {
40279 + shift -= pos->position.pos;
40280 + while (1) {
40281 + /* repetitions: deadlock is possible when
40282 + going to the left. */
40283 + result = dir_go_to(dir, pos, tap);
40284 + if (result == 0) {
40285 + result = rewind_left(tap, shift);
40286 + if (result == -E_DEADLOCK) {
40287 + reiser4_tap_done(tap);
40288 + continue;
40289 + }
40290 + }
40291 + break;
40292 + }
40293 + }
40294 + } else {
40295 + /* rewinding to the right */
40296 + result = dir_go_to(dir, pos, tap);
40297 + if (result == 0)
40298 + result = rewind_right(tap, shift);
40299 + }
40300 + if (result == 0) {
40301 + result = set_pos(inode, pos, tap);
40302 + if (result == 0) {
40303 + /* update pos->position.pos */
40304 + pos->entry_no = destination;
40305 + pos->fpos = dirpos;
40306 + }
40307 + }
40308 + return result;
40309 +}
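+
+/*
+ * Example with hypothetical numbers: if the previous readdir left
+ * pos->fpos == 7 at pos->entry_no == 7 and the user seekdir()s to offset 3,
+ * then shift == -4 and destination == 3, so the tap is rewound four entries
+ * to the left of the remembered position.
+ */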
40310 +
40311 +/*
40312 + * Function that is called by reiser4_readdir_common() on each directory entry while
40313 + * doing readdir. ->filldir callback may block, so we had to release long term
40314 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
40315 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
40316 + *
40317 + * Whether node is unlocked in case of any other error is undefined. It is
40318 + * guaranteed to be still locked if success (0) is returned.
40319 + *
40320 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
40321 + * unlocked.
40322 + */
40323 +static int
40324 +feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap,
40325 + filldir_t filldir, void *dirent)
40326 +{
40327 + item_plugin *iplug;
40328 + char *name;
40329 + reiser4_key sd_key;
40330 + int result;
40331 + char buf[DE_NAME_BUF_LEN];
40332 + char name_buf[32];
40333 + char *local_name;
40334 + unsigned file_type;
40335 + seal_t seal;
40336 + coord_t *coord;
40337 + reiser4_key entry_key;
40338 +
40339 + coord = tap->coord;
40340 + iplug = item_plugin_by_coord(coord);
40341 +
40342 + /* pointer to name within the node */
40343 + name = iplug->s.dir.extract_name(coord, buf);
40344 + assert("nikita-1371", name != NULL);
40345 +
40346 + /* key of object the entry points to */
40347 + if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
40348 + return RETERR(-EIO);
40349 +
40350 + /* we must release longterm znode lock before calling filldir to avoid
40351 + deadlock which may happen if filldir causes page fault. So, copy
40352 + name to intermediate buffer */
40353 + if (strlen(name) + 1 > sizeof(name_buf)) {
40354 + local_name = kmalloc(strlen(name) + 1,
40355 + reiser4_ctx_gfp_mask_get());
40356 + if (local_name == NULL)
40357 + return RETERR(-ENOMEM);
40358 + } else
40359 + local_name = name_buf;
40360 +
40361 + strcpy(local_name, name);
40362 + file_type = iplug->s.dir.extract_file_type(coord);
40363 +
40364 + unit_key_by_coord(coord, &entry_key);
40365 + reiser4_seal_init(&seal, coord, &entry_key);
40366 +
40367 + longterm_unlock_znode(tap->lh);
40368 +
40369 + /*
40370 + * send information about directory entry to the ->filldir() filler
40371 + * supplied to us by caller (VFS).
40372 + *
40373 + * ->filldir is entitled to do weird things. For example, ->filldir
40374 + * supplied by knfsd re-enters file system. Make sure no locks are
40375 + * held.
40376 + */
40377 + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
40378 +
40379 + reiser4_txn_restart_current();
40380 + result = filldir(dirent, name, (int)strlen(name),
40381 + /* offset of this entry */
40382 + f->f_pos,
40383 + /* inode number of object bounden by this entry */
40384 + oid_to_uino(get_key_objectid(&sd_key)), file_type);
40385 + if (local_name != name_buf)
40386 + kfree(local_name);
40387 + if (result < 0)
40388 + /* ->filldir() is satisfied. (no space in buffer, IOW) */
40389 + result = 1;
40390 + else
40391 + result = reiser4_seal_validate(&seal, coord, &entry_key,
40392 + tap->lh, tap->mode,
40393 + ZNODE_LOCK_HIPRI);
40394 + return result;
40395 +}
40396 +
40397 +static void move_entry(struct readdir_pos * pos, coord_t * coord)
40398 +{
40399 + reiser4_key de_key;
40400 + de_id *did;
40401 +
40402 + /* update @pos */
40403 + ++pos->entry_no;
40404 + did = &pos->position.dir_entry_key;
40405 +
40406 + /* get key of directory entry */
40407 + unit_key_by_coord(coord, &de_key);
40408 +
40409 + if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
40410 + /* we are within sequence of directory entries
40411 + with duplicate keys. */
40412 + ++pos->position.pos;
40413 + else {
40414 + pos->position.pos = 0;
40415 + build_de_id_by_key(&de_key, did);
40416 + }
40417 + ++pos->fpos;
40418 +}
40419 +
40420 +/*
40421 + * STATELESS READDIR
40422 + *
40423 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
40424 + * into reiser4_file_fsdata on each directory modification (name insertion and
40425 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
40426 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
40427 + * across client READDIR requests for the same directory.
40428 + *
40429 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
40430 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
40431 + * find detached reiser4_file_fsdata corresponding to previous readdir
40432 + * request. In other words, additional state is maintained on the
40433 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
40434 + *
40435 + * To efficiently detect when our ->readdir() method is called by NFS server,
40436 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
40437 + * file_is_stateless() function).
40438 + *
40439 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
40440 + * bits of NFS readdir cookie: when first readdir request comes to the given
40441 + * directory from the given client, cookie is set to 0. This situation is
40442 + * detected, global cid_counter is incremented, and stored in highest bits of
40443 + * all direntry offsets returned to the client, including last one. As the
40444 + * only valid readdir cookie is one obtained as direntry->offset, we are
40445 + * guaranteed that next readdir request (continuing current one) will have
40446 + * current cid in the highest bits of starting readdir cookie. All d_cursors
40447 + * are hashed into per-super-block hash table by (oid, cid) key.
40448 + *
40449 + * In addition d_cursors are placed into per-super-block radix tree where they
40450 + * are keyed by oid alone. This is necessary to efficiently remove them during
40451 + * rmdir.
40452 + *
40453 + * Finally, currently unused d_cursors are linked into a special list. This list
40454 + * is used by d_cursor_shrink to reclaim d_cursors under memory pressure.
40455 + *
40456 + */
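+
+/*
+ * A rough sketch of the cookie encoding described above (CID_SHIFT is an
+ * illustrative assumption; the actual constants live in the d_cursor code):
+ *
+ *	cookie   = (cid << CID_SHIFT) | entry_no;
+ *	cid      = cookie >> CID_SHIFT;
+ *	entry_no = cookie & ((1ULL << CID_SHIFT) - 1);
+ */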
40457 +
40458 +/*
40459 + * prepare for readdir.
40460 + */
40461 +static int dir_readdir_init(struct file *f, tap_t * tap,
40462 + struct readdir_pos ** pos)
40463 +{
40464 + struct inode *inode;
40465 + reiser4_file_fsdata *fsdata;
40466 + int result;
40467 +
40468 + assert("nikita-1359", f != NULL);
40469 + inode = f->f_dentry->d_inode;
40470 + assert("nikita-1360", inode != NULL);
40471 +
40472 + if (!S_ISDIR(inode->i_mode))
40473 + return RETERR(-ENOTDIR);
40474 +
40475 + /* try to find detached readdir state */
40476 + result = reiser4_attach_fsdata(f, inode);
40477 + if (result != 0)
40478 + return result;
40479 +
40480 + fsdata = reiser4_get_file_fsdata(f);
40481 + assert("nikita-2571", fsdata != NULL);
40482 + if (IS_ERR(fsdata))
40483 + return PTR_ERR(fsdata);
40484 +
40485 + /* add file descriptor to the readdir list hanging off the directory
40486 + * inode. This list is used to scan "readdirs-in-progress" while
40487 + * inserting or removing names in the directory. */
40488 + spin_lock_inode(inode);
40489 + if (list_empty_careful(&fsdata->dir.linkage))
40490 + list_add(&fsdata->dir.linkage, get_readdir_list(inode));
40491 + *pos = &fsdata->dir.readdir;
40492 + spin_unlock_inode(inode);
40493 +
40494 + /* move @tap to the current position */
40495 + return dir_rewind(f, *pos, tap);
40496 +}
40497 +
40498 +/* this is implementation of vfs's llseek method of struct file_operations for
40499 + typical directory
40500 + See comment before reiser4_readdir_common() for explanation.
40501 +*/
40502 +loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
40503 +{
40504 + reiser4_context *ctx;
40505 + loff_t result;
40506 + struct inode *inode;
40507 +
40508 + inode = file->f_dentry->d_inode;
40509 +
40510 + ctx = reiser4_init_context(inode->i_sb);
40511 + if (IS_ERR(ctx))
40512 + return PTR_ERR(ctx);
40513 +
40514 + mutex_lock(&inode->i_mutex);
40515 +
40516 + /* update ->f_pos */
40517 + result = default_llseek(file, off, origin);
40518 + if (result >= 0) {
40519 + int ff;
40520 + coord_t coord;
40521 + lock_handle lh;
40522 + tap_t tap;
40523 + struct readdir_pos *pos;
40524 +
40525 + coord_init_zero(&coord);
40526 + init_lh(&lh);
40527 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40528 +
40529 + ff = dir_readdir_init(file, &tap, &pos);
40530 + reiser4_detach_fsdata(file);
40531 + if (ff != 0)
40532 + result = (loff_t) ff;
40533 + reiser4_tap_done(&tap);
40534 + }
40535 + reiser4_detach_fsdata(file);
40536 + mutex_unlock(&inode->i_mutex);
40537 +
40538 + reiser4_exit_context(ctx);
40539 + return result;
40540 +}
40541 +
40542 +/* this is common implementation of vfs's readdir method of struct
40543 + file_operations
40544 +
40545 + readdir problems:
40546 +
40547 + readdir(2)/getdents(2) interface is based on implicit assumption that
40548 + readdir can be restarted from any particular point by supplying file system
40549 + with off_t-full of data. That is, file system fills ->d_off field in struct
40550 + dirent and later user passes ->d_off to the seekdir(3), which is, actually,
40551 + implemented by glibc as lseek(2) on directory.
40552 +
40553 + Reiser4 cannot restart readdir from 64 bits of data, because the two last
40554 + components of the key of a directory entry are unknown, which gives 128 bits:
40555 + the locality and type fields in the key of a directory entry are always known;
40556 + to start readdir() from a given point, the objectid and offset fields have to
40557 + be filled in.
40558 +
40559 + Traditional UNIX API for scanning through directory
40560 + (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
40561 + assumption that a directory is structured very much like a regular file; in
40562 + particular, it is implied that each name within a given directory (directory
40563 + entry) can be uniquely identified by a scalar offset and that such offset is
40564 + stable across the life-time of the name it identifies.
40565 +
40566 + This is manifestly not so for reiser4. In reiser4 the only stable unique
40567 + identifier for a directory entry is its key, which doesn't fit into the
40568 + seekdir/telldir API.
40569 +
40570 + solution:
40571 +
40572 + Within each file descriptor participating in readdir-ing of directory
40573 + plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
40574 + the "current" directory entry that file descriptor looks at. It contains a
40575 + key of directory entry (plus some additional info to deal with non-unique
40576 + keys that we wouldn't dwell onto here) and a logical position of this
40577 + directory entry starting from the beginning of the directory, that is
40578 + ordinal number of this entry in the readdir order.
40579 +
40580 + Obviously this logical position is not stable in the face of directory
40581 + modifications. To work around this, on each addition or removal of directory
40582 + entry all file descriptors for directory inode are scanned and their
40583 + readdir_pos are updated accordingly (adjust_dir_pos()).
40584 +*/
40585 +int reiser4_readdir_common(struct file *f /* directory file being read */,
40586 + void *dirent /* opaque data passed to us by VFS */,
40587 + filldir_t filld /* filler function passed to us
40588 + * by VFS */)
40589 +{
40590 + reiser4_context *ctx;
40591 + int result;
40592 + struct inode *inode;
40593 + coord_t coord;
40594 + lock_handle lh;
40595 + tap_t tap;
40596 + struct readdir_pos *pos;
40597 +
40598 + assert("nikita-1359", f != NULL);
40599 + inode = f->f_dentry->d_inode;
40600 + assert("nikita-1360", inode != NULL);
40601 +
40602 + if (!S_ISDIR(inode->i_mode))
40603 + return RETERR(-ENOTDIR);
40604 +
40605 + ctx = reiser4_init_context(inode->i_sb);
40606 + if (IS_ERR(ctx))
40607 + return PTR_ERR(ctx);
40608 +
40609 + coord_init_zero(&coord);
40610 + init_lh(&lh);
40611 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40612 +
40613 + reiser4_readdir_readahead_init(inode, &tap);
40614 +
40615 + repeat:
40616 + result = dir_readdir_init(f, &tap, &pos);
40617 + if (result == 0) {
40618 + result = reiser4_tap_load(&tap);
40619 + /* scan entries one by one feeding them to @filld */
40620 + while (result == 0) {
40621 + coord_t *coord;
40622 +
40623 + coord = tap.coord;
40624 + assert("nikita-2572", coord_is_existing_unit(coord));
40625 + assert("nikita-3227", is_valid_dir_coord(inode, coord));
40626 +
40627 + result = feed_entry(f, pos, &tap, filld, dirent);
40628 + if (result > 0) {
40629 + break;
40630 + } else if (result == 0) {
40631 + ++f->f_pos;
40632 + result = go_next_unit(&tap);
40633 + if (result == -E_NO_NEIGHBOR ||
40634 + result == -ENOENT) {
40635 + result = 0;
40636 + break;
40637 + } else if (result == 0) {
40638 + if (is_valid_dir_coord(inode, coord))
40639 + move_entry(pos, coord);
40640 + else
40641 + break;
40642 + }
40643 + } else if (result == -E_REPEAT) {
40644 + /* feed_entry() had to restart. */
40645 + ++f->f_pos;
40646 + reiser4_tap_relse(&tap);
40647 + goto repeat;
40648 + } else
40649 + warning("vs-1617",
40650 + "reiser4_readdir_common: unexpected error %d",
40651 + result);
40652 + }
40653 + reiser4_tap_relse(&tap);
40654 +
40655 + if (result >= 0)
40656 + f->f_version = inode->i_version;
40657 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
40658 + result = 0;
40659 + reiser4_tap_done(&tap);
40660 + reiser4_detach_fsdata(f);
40661 +
40662 + /* try to update directory's atime */
40663 + if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
40664 + BA_CAN_COMMIT) != 0)
40665 + warning("", "failed to update atime on readdir: %llu",
40666 + get_inode_oid(inode));
40667 + else
40668 + file_accessed(f);
40669 +
40670 + context_set_commit_async(ctx);
40671 + reiser4_exit_context(ctx);
40672 +
40673 + return (result <= 0) ? result : 0;
40674 +}
40675 +
40676 +/*
40677 + * Local variables:
40678 + * c-indentation-style: "K&R"
40679 + * mode-name: "LC"
40680 + * c-basic-offset: 8
40681 + * tab-width: 8
40682 + * fill-column: 79
40683 + * End:
40684 + */
40685 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.22/fs/reiser4/plugin/file_plugin_common.c
40686 --- linux-2.6.22.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
40687 +++ linux-2.6.22/fs/reiser4/plugin/file_plugin_common.c 2007-07-29 00:25:34.936712007 +0400
40688 @@ -0,0 +1,1007 @@
40689 +/* Copyright 2005 by Hans Reiser, licensing governed by
40690 + reiser4/README */
40691 +
40692 +/* this file contains typical implementations for most of methods of
40693 + file plugin
40694 +*/
40695 +
40696 +#include "../inode.h"
40697 +#include "object.h"
40698 +#include "../safe_link.h"
40699 +
40700 +#include <linux/quotaops.h>
40701 +
40702 +static int insert_new_sd(struct inode *inode);
40703 +static int update_sd(struct inode *inode);
40704 +
40705 +/* this is common implementation of write_sd_by_inode method of file plugin
40706 + either insert stat data or update it
40707 + */
40708 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
40709 +{
40710 + int result;
40711 +
40712 + assert("nikita-730", inode != NULL);
40713 +
40714 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
40715 + /* object doesn't have stat-data yet */
40716 + result = insert_new_sd(inode);
40717 + else
40718 + result = update_sd(inode);
40719 + if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
40720 + /* Don't issue warnings about "name is too long" */
40721 + warning("nikita-2221", "Failed to save sd for %llu: %i",
40722 + (unsigned long long)get_inode_oid(inode), result);
40723 + return result;
40724 +}
40725 +
40726 +/* this is common implementation of key_by_inode method of file plugin
40727 + */
40728 +int
40729 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
40730 + reiser4_key * key)
40731 +{
40732 + reiser4_key_init(key);
40733 + set_key_locality(key, reiser4_inode_data(inode)->locality_id);
40734 + set_key_ordering(key, get_inode_ordering(inode));
40735 + set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
40736 + set_key_type(key, KEY_BODY_MINOR);
40737 + set_key_offset(key, (__u64) off);
40738 + return 0;
40739 +}
40740 +
40741 +/* this is common implementation of set_plug_in_inode method of file plugin
40742 + */
40743 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
40744 + struct inode *parent /* parent object */ ,
40745 + reiser4_object_create_data * data /* creational
40746 + * data */ )
40747 +{
40748 + __u64 mask;
40749 +
40750 + object->i_mode = data->mode;
40751 + /* this should be plugin decision */
40752 + object->i_uid = current->fsuid;
40753 + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
40754 +
40755 + /* support for BSD style group-id assignment. See mount's manual page
40756 + description of bsdgroups ext2 mount options for more details */
40757 + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
40758 + object->i_gid = parent->i_gid;
40759 + else if (parent->i_mode & S_ISGID) {
40760 + /* parent directory has the sgid bit set */
40761 + object->i_gid = parent->i_gid;
40762 + if (S_ISDIR(object->i_mode))
40763 + /* sgid is inherited by sub-directories */
40764 + object->i_mode |= S_ISGID;
40765 + } else
40766 + object->i_gid = current->fsgid;
40767 +
40768 + /* this object doesn't have stat-data yet */
40769 + reiser4_inode_set_flag(object, REISER4_NO_SD);
40770 +#if 0
40771 + /* this is now called after all inode plugins are initialized:
40772 + do_create_vfs_child after adjust_to_parent */
40773 + /* setup inode and file-operations for this inode */
40774 + setup_inode_ops(object, data);
40775 +#endif
40776 + object->i_nlink = 0;
40777 + reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
40778 + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
40779 + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
40780 + mask |= (1 << LARGE_TIMES_STAT);
40781 +
40782 + reiser4_inode_data(object)->extmask = mask;
40783 + return 0;
40784 +}
40785 +
40786 +/* this is common implementation of adjust_to_parent method of file plugin for
40787 + regular files
40788 + */
40789 +int adjust_to_parent_common(struct inode *object /* new object */ ,
40790 + struct inode *parent /* parent directory */ ,
40791 + struct inode *root /* root directory */ )
40792 +{
40793 + assert("nikita-2165", object != NULL);
40794 + if (parent == NULL)
40795 + parent = root;
40796 + assert("nikita-2069", parent != NULL);
40797 +
40798 + /*
40799 + * inherit missing plugins from parent
40800 + */
40801 +
40802 + grab_plugin_pset(object, parent, PSET_FILE);
40803 + grab_plugin_pset(object, parent, PSET_SD);
40804 + grab_plugin_pset(object, parent, PSET_FORMATTING);
40805 + grab_plugin_pset(object, parent, PSET_PERM);
40806 + return 0;
40807 +}
40808 +
40809 +/* this is common implementation of adjust_to_parent method of file plugin for
40810 + typical directories
40811 + */
40812 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
40813 + struct inode *parent /* parent directory */ ,
40814 + struct inode *root /* root directory */ )
40815 +{
40816 + int result = 0;
40817 + pset_member memb;
40818 +
40819 + assert("nikita-2166", object != NULL);
40820 + if (parent == NULL)
40821 + parent = root;
40822 + assert("nikita-2167", parent != NULL);
40823 +
40824 + /*
40825 + * inherit missing plugins from parent
40826 + */
40827 + for (memb = 0; memb < PSET_LAST; ++memb) {
40828 + result = grab_plugin_pset(object, parent, memb);
40829 + if (result != 0)
40830 + break;
40831 + }
40832 + return result;
40833 +}
40834 +
40835 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
40836 + struct inode *parent /* parent directory */,
40837 + struct inode *root /* root directory */)
40838 +{
40839 + int result;
40840 + result = adjust_to_parent_common(object, parent, root);
40841 + if (result)
40842 + return result;
40843 + assert("edward-1416", parent != NULL);
40844 +
40845 + grab_plugin_pset(object, parent, PSET_CLUSTER);
40846 + grab_plugin_pset(object, parent, PSET_CIPHER);
40847 + grab_plugin_pset(object, parent, PSET_DIGEST);
40848 + grab_plugin_pset(object, parent, PSET_COMPRESSION);
40849 + grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
40850 +
40851 + return 0;
40852 +}
40853 +
40854 +/* this is common implementation of create_object method of file plugin
40855 + */
40856 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
40857 + reiser4_object_create_data * data)
40858 +{
40859 + reiser4_block_nr reserve;
40860 + assert("nikita-744", object != NULL);
40861 + assert("nikita-745", parent != NULL);
40862 + assert("nikita-747", data != NULL);
40863 + assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
40864 +
40865 + reserve = estimate_create_common(object);
40866 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
40867 + return RETERR(-ENOSPC);
40868 + return write_sd_by_inode_common(object);
40869 +}
40870 +
40871 +static int common_object_delete_no_reserve(struct inode *inode);
40872 +
40873 +/**
40874 + * reiser4_delete_object_common - delete_object of file_plugin
40875 + * @inode: inode to be deleted
40876 + *
40877 + * This is the common implementation of the delete_object method of
40878 + * file_plugin. It applies to objects whose deletion consists of removing
40879 + * two items: stat data and safe-link.
40880 + */
40881 +int reiser4_delete_object_common(struct inode *inode)
40882 +{
40883 + int result;
40884 +
40885 + assert("nikita-1477", inode != NULL);
40886 + /* FIXME: if file body deletion failed (i/o error, for instance),
40887 + inode->i_size can be != 0 here */
40888 + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
40889 + assert("nikita-3421", inode->i_nlink == 0);
40890 +
40891 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
40892 + reiser4_block_nr reserve;
40893 +
40894 + /* grab space which is needed to remove 2 items from the tree:
40895 + stat data and safe-link */
40896 + reserve = 2 *
40897 + estimate_one_item_removal(reiser4_tree_by_inode(inode));
40898 + if (reiser4_grab_space_force(reserve,
40899 + BA_RESERVED | BA_CAN_COMMIT))
40900 + return RETERR(-ENOSPC);
40901 + result = common_object_delete_no_reserve(inode);
40902 + } else
40903 + result = 0;
40904 + return result;
40905 +}
40906 +
40907 +/**
40908 + * reiser4_delete_dir_common - delete_object of file_plugin
40909 + * @inode: inode to be deleted
40910 + *
40911 + * This is common implementation of delete_object method of file_plugin for
40912 + * typical directory. It calls done method of dir_plugin to remove "." and
40913 + * removes stat data and safe-link.
40914 + */
40915 +int reiser4_delete_dir_common(struct inode *inode)
40916 +{
40917 + int result;
40918 + dir_plugin *dplug;
40919 +
40920 + assert("", (get_current_context() &&
40921 + get_current_context()->trans->atom == NULL));
40922 +
40923 + dplug = inode_dir_plugin(inode);
40924 + assert("vs-1101", dplug && dplug->done);
40925 +
40926 + /* kill cursors which might be attached to inode */
40927 + reiser4_kill_cursors(inode);
40928 +
40929 + /* grab space enough for removing two items */
40930 + if (reiser4_grab_space
40931 + (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
40932 + BA_RESERVED | BA_CAN_COMMIT))
40933 + return RETERR(-ENOSPC);
40934 +
40935 + result = dplug->done(inode);
40936 + if (!result)
40937 + result = common_object_delete_no_reserve(inode);
40938 + return result;
40939 +}
40940 +
40941 +/* this is common implementation of add_link method of file plugin
40942 + */
40943 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
40944 +{
40945 + /*
40946 + * increment ->i_nlink and update ->i_ctime
40947 + */
40948 +
40949 + INODE_INC_FIELD(object, i_nlink);
40950 + object->i_ctime = CURRENT_TIME;
40951 + return 0;
40952 +}
40953 +
40954 +/* this is common implementation of rem_link method of file plugin
40955 + */
40956 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
40957 +{
40958 + assert("nikita-2021", object != NULL);
40959 + assert("nikita-2163", object->i_nlink > 0);
40960 +
40961 + /*
40962 + * decrement ->i_nlink and update ->i_ctime
40963 + */
40964 +
40965 + INODE_DEC_FIELD(object, i_nlink);
40966 + object->i_ctime = CURRENT_TIME;
40967 + return 0;
40968 +}
40969 +
40970 +/* this is common implementation of rem_link method of file plugin for typical
40971 + directory
40972 +*/
40973 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
40974 +{
40975 + assert("nikita-20211", object != NULL);
40976 + assert("nikita-21631", object->i_nlink > 0);
40977 +
40978 + /*
40979 + * decrement ->i_nlink and update ->i_ctime
40980 + */
40981 + INODE_DEC_FIELD(object, i_nlink);
40982 + if (object->i_nlink == 1)
40983 + INODE_DEC_FIELD(object, i_nlink);
40984 + object->i_ctime = CURRENT_TIME;
40985 + return 0;
40986 +}
40987 +
40988 +/* this is common implementation of owns_item method of file plugin
40989 + compare objectids of keys in inode and coord */
40990 +int owns_item_common(const struct inode *inode, /* object to check
40991 + * against */
40992 + const coord_t * coord /* coord to check */ )
40993 +{
40994 + reiser4_key item_key;
40995 + reiser4_key file_key;
40996 +
40997 + assert("nikita-760", inode != NULL);
40998 + assert("nikita-761", coord != NULL);
40999 +
41000 + return coord_is_existing_item(coord) &&
41001 + (get_key_objectid(build_sd_key(inode, &file_key)) ==
41002 + get_key_objectid(item_key_by_coord(coord, &item_key)));
41003 +}
41004 +
41005 +/* this is common implementation of owns_item method of file plugin
41006 + for typical directory
41007 +*/
41008 +int owns_item_common_dir(const struct inode *inode, /* object to check against */
41009 + const coord_t * coord /* coord of item to check */ )
41010 +{
41011 + reiser4_key item_key;
41012 +
41013 + assert("nikita-1335", inode != NULL);
41014 + assert("nikita-1334", coord != NULL);
41015 +
41016 + if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
41017 + return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41018 + get_inode_oid(inode);
41019 + else
41020 + return owns_item_common(inode, coord);
41021 +}
41022 +
41023 +/* this is common implementation of can_add_link method of file plugin
41024 +   checks whether yet another hard link to this object can be added
41025 +*/
41026 +int can_add_link_common(const struct inode *object /* object to check */ )
41027 +{
41028 + assert("nikita-732", object != NULL);
41029 +
41030 + /* inode->i_nlink is unsigned int, so just check for integer
41031 + overflow */
41032 + return object->i_nlink + 1 != 0;
41033 +}
41034 +
41035 +/* this is common implementation of can_rem_link method of file plugin for
41036 + typical directory
41037 +*/
41038 +int can_rem_link_common_dir(const struct inode *inode)
41039 +{
41040 + /* is_dir_empty() returns 0 if dir is empty */
41041 + return !is_dir_empty(inode);
41042 +}
41043 +
41044 +/* this is common implementation of detach method of file plugin for typical
41045 + directory
41046 +*/
41047 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
41048 +{
41049 + dir_plugin *dplug;
41050 +
41051 + dplug = inode_dir_plugin(child);
41052 + assert("nikita-2883", dplug != NULL);
41053 + assert("nikita-2884", dplug->detach != NULL);
41054 + return dplug->detach(child, parent);
41055 +}
41056 +
41057 +/* this is common implementation of bind method of file plugin for typical
41058 + directory
41059 +*/
41060 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
41061 +{
41062 + dir_plugin *dplug;
41063 +
41064 + dplug = inode_dir_plugin(child);
41065 + assert("nikita-2646", dplug != NULL);
41066 + return dplug->attach(child, parent);
41067 +}
41068 +
41069 +static int process_truncate(struct inode *, __u64 size);
41070 +
41071 +/* this is common implementation of safelink method of file plugin
41072 + */
41073 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41074 +{
41075 + int result;
41076 +
41077 + assert("vs-1705", get_current_context()->trans->atom == NULL);
41078 + if (link == SAFE_UNLINK)
41079 + /* nothing to do. iput() in the caller (process_safelink) will
41080 + * finish with the file */
41081 + result = 0;
41082 + else if (link == SAFE_TRUNCATE)
41083 + result = process_truncate(object, value);
41084 + else {
41085 + warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41086 + result = RETERR(-EIO);
41087 + }
41088 + return result;
41089 +}
41090 +
41091 +/* this is common implementation of estimate.create method of file plugin
41092 + can be used when object creation involves insertion of one item (usually stat
41093 + data) into tree
41094 +*/
41095 +reiser4_block_nr estimate_create_common(const struct inode * object)
41096 +{
41097 + return estimate_one_insert_item(reiser4_tree_by_inode(object));
41098 +}
41099 +
41100 +/* this is common implementation of estimate.create method of file plugin for
41101 + typical directory
41102 + can be used when directory creation involves insertion of two items (usually
41103 + stat data and item containing "." and "..") into tree
41104 +*/
41105 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41106 +{
41107 + return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
41108 +}
41109 +
41110 +/* this is common implementation of estimate.update method of file plugin
41111 + can be used when stat data update does not do more than inserting a unit
41112 + into a stat data item which is probably true for most cases
41113 +*/
41114 +reiser4_block_nr estimate_update_common(const struct inode * inode)
41115 +{
41116 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
41117 +}
41118 +
41119 +/* this is common implementation of estimate.unlink method of file plugin
41120 + */
41121 +reiser4_block_nr
41122 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
41123 + const struct inode * parent UNUSED_ARG)
41124 +{
41125 + return 0;
41126 +}
41127 +
41128 +/* this is common implementation of estimate.unlink method of file plugin for
41129 + typical directory
41130 +*/
41131 +reiser4_block_nr
41132 +estimate_unlink_common_dir(const struct inode * object,
41133 + const struct inode * parent)
41134 +{
41135 + dir_plugin *dplug;
41136 +
41137 + dplug = inode_dir_plugin(object);
41138 + assert("nikita-2888", dplug != NULL);
41139 + assert("nikita-2887", dplug->estimate.unlink != NULL);
41140 + return dplug->estimate.unlink(object, parent);
41141 +}
41142 +
41143 +char *wire_write_common(struct inode *inode, char *start)
41144 +{
41145 + return build_inode_onwire(inode, start);
41146 +}
41147 +
41148 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41149 +{
41150 + return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41151 +}
41152 +
41153 +struct dentry *wire_get_common(struct super_block *sb,
41154 + reiser4_object_on_wire * obj)
41155 +{
41156 + struct inode *inode;
41157 + struct dentry *dentry;
41158 + reiser4_key key;
41159 +
41160 + extract_key_from_id(&obj->u.std.key_id, &key);
41161 + inode = reiser4_iget(sb, &key, 1);
41162 + if (!IS_ERR(inode)) {
41163 + reiser4_iget_complete(inode);
41164 + dentry = d_alloc_anon(inode);
41165 + if (dentry == NULL) {
41166 + iput(inode);
41167 + dentry = ERR_PTR(-ENOMEM);
41168 + } else
41169 + dentry->d_op = &get_super_private(sb)->ops.dentry;
41170 + } else if (PTR_ERR(inode) == -ENOENT)
41171 + /*
41172 + * inode wasn't found at the key encoded in the file
41173 + * handle. Hence, file handle is stale.
41174 + */
41175 + dentry = ERR_PTR(RETERR(-ESTALE));
41176 + else
41177 + dentry = (void *)inode;
41178 + return dentry;
41179 +}
41180 +
41181 +int wire_size_common(struct inode *inode)
41182 +{
41183 + return inode_onwire_size(inode);
41184 +}
41185 +
41186 +void wire_done_common(reiser4_object_on_wire * obj)
41187 +{
41188 + /* nothing to do */
41189 +}
41190 +
41191 +/* helper function to print errors */
41192 +static void key_warning(const reiser4_key * key /* key to print */ ,
41193 + const struct inode *inode,
41194 + int code /* error code to print */ )
41195 +{
41196 + assert("nikita-716", key != NULL);
41197 +
41198 + if (code != -ENOMEM) {
41199 + warning("nikita-717", "Error for inode %llu (%i)",
41200 + (unsigned long long)get_key_objectid(key), code);
41201 + reiser4_print_key("for key", key);
41202 + }
41203 +}
41204 +
41205 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41206 +#if REISER4_DEBUG
41207 +static void
41208 +check_inode_seal(const struct inode *inode,
41209 + const coord_t * coord, const reiser4_key * key)
41210 +{
41211 + reiser4_key unit_key;
41212 +
41213 + unit_key_by_coord(coord, &unit_key);
41214 + assert("nikita-2752",
41215 + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41216 + assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41217 +}
41218 +
41219 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41220 +{
41221 + reiser4_key ukey;
41222 +
41223 + coord_clear_iplug(coord);
41224 + if (zload(coord->node))
41225 + return;
41226 +
41227 + if (!coord_is_existing_unit(coord) ||
41228 + !item_plugin_by_coord(coord) ||
41229 + !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41230 + (znode_get_level(coord->node) != LEAF_LEVEL) ||
41231 + !item_is_statdata(coord)) {
41232 + warning("nikita-1901", "Conspicuous seal");
41233 + reiser4_print_key("key", key);
41234 + print_coord("coord", coord, 1);
41235 + impossible("nikita-2877", "no way");
41236 + }
41237 + zrelse(coord->node);
41238 +}
41239 +
41240 +#else
41241 +#define check_inode_seal(inode, coord, key) noop
41242 +#define check_sd_coord(coord, key) noop
41243 +#endif
41244 +
41245 +/* insert new stat-data into tree. Called with inode state
41246 + locked. Return inode state locked. */
41247 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41248 +{
41249 + int result;
41250 + reiser4_key key;
41251 + coord_t coord;
41252 + reiser4_item_data data;
41253 + char *area;
41254 + reiser4_inode *ref;
41255 + lock_handle lh;
41256 + oid_t oid;
41257 +
41258 + assert("nikita-723", inode != NULL);
41259 + assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
41260 +
41261 + ref = reiser4_inode_data(inode);
41262 + spin_lock_inode(inode);
41263 +
41264 + if (ref->plugin_mask != 0)
41265 + /* inode has non-standard plugins */
41266 + inode_set_extension(inode, PLUGIN_STAT);
41267 + /*
41268 + * prepare specification of new item to be inserted
41269 + */
41270 +
41271 + data.iplug = inode_sd_plugin(inode);
41272 + data.length = data.iplug->s.sd.save_len(inode);
41273 + spin_unlock_inode(inode);
41274 +
41275 + data.data = NULL;
41276 + data.user = 0;
41277 +/* could be optimized for the case where there is only one node format in
41278 + * use in the filesystem; probably there are lots of such
41279 + * places we could optimize for only one node layout.... -Hans */
41280 + if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
41281 + /* This is a silly check, but we don't know the actual node
41282 + the insertion will go into. */
41283 + return RETERR(-ENAMETOOLONG);
41284 + }
41285 + oid = oid_allocate(inode->i_sb);
41286 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41287 + if (oid == ABSOLUTE_MAX_OID)
41288 + return RETERR(-EOVERFLOW);
41289 +
41290 + set_inode_oid(inode, oid);
41291 +
41292 + coord_init_zero(&coord);
41293 + init_lh(&lh);
41294 +
41295 + result = insert_by_key(reiser4_tree_by_inode(inode),
41296 + build_sd_key(inode, &key), &data, &coord, &lh,
41297 + /* stat data lives on a leaf level */
41298 + LEAF_LEVEL, CBK_UNIQUE);
41299 +
41300 + /* we don't need to re-check whether somebody inserted
41301 + stat-data while we were doing io: if somebody did,
41302 + insert_by_key() would have returned an error. */
41303 + /* but what _is_ possible is that plugin for inode's stat-data,
41304 + list of non-standard plugins or their state would change
41305 + during io, so that stat-data wouldn't fit into sd. To avoid
41306 + this race we keep inode_state lock. This lock has to be
41307 + taken each time you access inode in a way that would cause
41308 + changes in sd size: changing plugins etc.
41309 + */
41310 +
41311 + if (result == IBK_INSERT_OK) {
41312 + coord_clear_iplug(&coord);
41313 + result = zload(coord.node);
41314 + if (result == 0) {
41315 + /* have we really inserted stat data? */
41316 + assert("nikita-725", item_is_statdata(&coord));
41317 +
41318 + /* inode was just created. It is inserted into hash
41319 + table, but no directory entry was yet inserted into
41320 + parent. So, inode is inaccessible through
41321 + ->lookup(). All places that directly grab inode
41322 + from hash-table (like old knfsd), should check
41323 + IMMUTABLE flag that is set by common_create_child.
41324 + */
41325 + assert("nikita-3240", data.iplug != NULL);
41326 + assert("nikita-3241", data.iplug->s.sd.save != NULL);
41327 + area = item_body_by_coord(&coord);
41328 + result = data.iplug->s.sd.save(inode, &area);
41329 + znode_make_dirty(coord.node);
41330 + if (result == 0) {
41331 + /* object has stat-data now */
41332 + reiser4_inode_clr_flag(inode, REISER4_NO_SD);
41333 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41334 + /* initialise stat-data seal */
41335 + reiser4_seal_init(&ref->sd_seal, &coord, &key);
41336 + ref->sd_coord = coord;
41337 + check_inode_seal(inode, &coord, &key);
41338 + } else if (result != -ENOMEM)
41339 + /*
41340 + * convert any other error code to -EIO to
41341 + * avoid confusing user level with unexpected
41342 + * errors.
41343 + */
41344 + result = RETERR(-EIO);
41345 + zrelse(coord.node);
41346 + }
41347 + }
41348 + done_lh(&lh);
41349 +
41350 + if (result != 0)
41351 + key_warning(&key, inode, result);
41352 + else
41353 + oid_count_allocated();
41354 +
41355 + return result;
41356 +}
41357 +
41358 +/* find sd of inode in a tree, deal with errors */
41359 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41360 + znode_lock_mode lock_mode /* lock mode */ ,
41361 + coord_t * coord /* resulting coord */ ,
41362 + lock_handle * lh /* resulting lock handle */ ,
41363 + const reiser4_key * key /* resulting key */ ,
41364 + int silent)
41365 +{
41366 + int result;
41367 + __u32 flags;
41368 +
41369 + assert("nikita-1692", inode != NULL);
41370 + assert("nikita-1693", coord != NULL);
41371 + assert("nikita-1694", key != NULL);
41372 +
41373 + /* look for the object's stat data in a tree.
41374 + This returns, in "node", a pointer to a locked znode and, in
41375 + "pos", the position of the item found in the node. Both are only
41376 + valid if coord_found is returned. */
41377 + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41378 + flags |= CBK_UNIQUE;
41379 + /*
41380 + * traverse tree to find stat data. We cannot use vroot here, because
41381 + * it only covers _body_ of the file, and stat data don't belong
41382 + * there.
41383 + */
41384 + result = coord_by_key(reiser4_tree_by_inode(inode),
41385 + key,
41386 + coord,
41387 + lh,
41388 + lock_mode,
41389 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41390 + if (REISER4_DEBUG && result == 0)
41391 + check_sd_coord(coord, key);
41392 +
41393 + if (result != 0 && !silent)
41394 + key_warning(key, inode, result);
41395 + return result;
41396 +}
41397 +
41398 +static int
41399 +locate_inode_sd(struct inode *inode,
41400 + reiser4_key * key, coord_t * coord, lock_handle * lh)
41401 +{
41402 + reiser4_inode *state;
41403 + seal_t seal;
41404 + int result;
41405 +
41406 + assert("nikita-3483", inode != NULL);
41407 +
41408 + state = reiser4_inode_data(inode);
41409 + spin_lock_inode(inode);
41410 + *coord = state->sd_coord;
41411 + coord_clear_iplug(coord);
41412 + seal = state->sd_seal;
41413 + spin_unlock_inode(inode);
41414 +
41415 + build_sd_key(inode, key);
41416 + if (reiser4_seal_is_set(&seal)) {
41417 + /* first, try to use seal */
41418 + result = reiser4_seal_validate(&seal,
41419 + coord,
41420 + key,
41421 + lh, ZNODE_WRITE_LOCK,
41422 + ZNODE_LOCK_LOPRI);
41423 + if (result == 0)
41424 + check_sd_coord(coord, key);
41425 + } else
41426 + result = -E_REPEAT;
41427 +
41428 + if (result != 0) {
41429 + coord_init_zero(coord);
41430 + result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41431 + }
41432 + return result;
41433 +}
41434 +
41435 +#if REISER4_DEBUG
41436 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
41437 +{
41438 + return (get_key_locality(k1) == get_key_locality(k2) &&
41439 + get_key_type(k1) == get_key_type(k2) &&
41440 + get_key_band(k1) == get_key_band(k2) &&
41441 + get_key_ordering(k1) == get_key_ordering(k2) &&
41442 + get_key_objectid(k1) == get_key_objectid(k2));
41443 +}
41444 +
41445 +#include "../tree_walk.h"
41446 +
41447 +/* make some checks before and after stat-data resize operation */
41448 +static int check_sd_resize(struct inode * inode, coord_t * coord,
41449 + int length, int progress /* 1 means after resize */)
41450 +{
41451 + int ret = 0;
41452 + lock_handle left_lock;
41453 + coord_t left_coord;
41454 + reiser4_key left_key;
41455 + reiser4_key key;
41456 +
41457 + if (inode_file_plugin(inode) !=
41458 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
41459 + return 0;
41460 + if (!length)
41461 + return 0;
41462 + if (coord->item_pos != 0)
41463 + return 0;
41464 +
41465 + init_lh(&left_lock);
41466 + ret = reiser4_get_left_neighbor(&left_lock,
41467 + coord->node,
41468 + ZNODE_WRITE_LOCK,
41469 + GN_CAN_USE_UPPER_LEVELS);
41470 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
41471 + ret == -ENOENT || ret == -EINVAL
41472 + || ret == -E_DEADLOCK) {
41473 + ret = 0;
41474 + goto exit;
41475 + }
41476 + ret = zload(left_lock.node);
41477 + if (ret)
41478 + goto exit;
41479 + coord_init_last_unit(&left_coord, left_lock.node);
41480 + item_key_by_coord(&left_coord, &left_key);
41481 + item_key_by_coord(coord, &key);
41482 +
41483 + if (all_but_offset_key_eq(&key, &left_key))
41484 + /* corruption occurred */
41485 + ret = 1;
41486 + zrelse(left_lock.node);
41487 + exit:
41488 + done_lh(&left_lock);
41489 + return ret;
41490 +}
41491 +#endif
41492 +
41493 +/* update stat-data at @coord */
41494 +static int
41495 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41496 + lock_handle * lh)
41497 +{
41498 + int result;
41499 + reiser4_item_data data;
41500 + char *area;
41501 + reiser4_inode *state;
41502 + znode *loaded;
41503 +
41504 + state = reiser4_inode_data(inode);
41505 +
41506 + coord_clear_iplug(coord);
41507 + result = zload(coord->node);
41508 + if (result != 0)
41509 + return result;
41510 + loaded = coord->node;
41511 +
41512 + spin_lock_inode(inode);
41513 + assert("nikita-728", inode_sd_plugin(inode) != NULL);
41514 + data.iplug = inode_sd_plugin(inode);
41515 +
41516 + /* if inode has non-standard plugins, add appropriate stat data
41517 + * extension */
41518 + if (state->extmask & (1 << PLUGIN_STAT)) {
41519 + if (state->plugin_mask == 0)
41520 + inode_clr_extension(inode, PLUGIN_STAT);
41521 + } else if (state->plugin_mask != 0)
41522 + inode_set_extension(inode, PLUGIN_STAT);
41523 +
41524 + if (state->extmask & (1 << HEIR_STAT)) {
41525 + if (state->heir_mask == 0)
41526 + inode_clr_extension(inode, HEIR_STAT);
41527 + } else if (state->heir_mask != 0)
41528 + inode_set_extension(inode, HEIR_STAT);
41529 +
41530 + /* data.length is how much space to add to (or remove
41531 + from if negative) sd */
41532 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41533 + /* recalculate stat-data length */
41534 + data.length =
41535 + data.iplug->s.sd.save_len(inode) -
41536 + item_length_by_coord(coord);
41537 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41538 + } else
41539 + data.length = 0;
41540 + spin_unlock_inode(inode);
41541 +
41542 + /* if on-disk stat data is of different length than required
41543 + for this inode, resize it */
41544 +
41545 + if (data.length != 0) {
41546 + data.data = NULL;
41547 + data.user = 0;
41548 +
41549 + assert("edward-1441",
41550 + !check_sd_resize(inode, coord,
41551 + data.length, 0/* before resize */));
41552 +
41553 + /* insertion code requires that insertion point (coord) was
41554 + * between units. */
41555 + coord->between = AFTER_UNIT;
41556 + result = reiser4_resize_item(coord, &data, key, lh,
41557 + COPI_DONT_SHIFT_LEFT);
41558 + if (result != 0) {
41559 + key_warning(key, inode, result);
41560 + zrelse(loaded);
41561 + return result;
41562 + }
41563 + if (loaded != coord->node) {
41564 + /* reiser4_resize_item moved coord to another node.
41565 + Zload it */
41566 + zrelse(loaded);
41567 + coord_clear_iplug(coord);
41568 + result = zload(coord->node);
41569 + if (result != 0)
41570 + return result;
41571 + loaded = coord->node;
41572 + }
41573 + assert("edward-1442",
41574 + !check_sd_resize(inode, coord,
41575 + data.length, 1/* after resize */));
41576 + }
41577 + area = item_body_by_coord(coord);
41578 + spin_lock_inode(inode);
41579 + result = data.iplug->s.sd.save(inode, &area);
41580 + znode_make_dirty(coord->node);
41581 +
41582 + /* re-initialise stat-data seal */
41583 +
41584 + /*
41585 + * coord.between was possibly skewed from AT_UNIT when stat-data size
41586 + * was changed and new extensions were pasted into item.
41587 + */
41588 + coord->between = AT_UNIT;
41589 + reiser4_seal_init(&state->sd_seal, coord, key);
41590 + state->sd_coord = *coord;
41591 + spin_unlock_inode(inode);
41592 + check_inode_seal(inode, coord, key);
41593 + zrelse(loaded);
41594 + return result;
41595 +}
41596 +
41597 +/* Update existing stat-data in a tree. Called with inode state locked. Return
41598 + inode state locked. */
41599 +static int update_sd(struct inode *inode /* inode to update sd for */ )
41600 +{
41601 + int result;
41602 + reiser4_key key;
41603 + coord_t coord;
41604 + lock_handle lh;
41605 +
41606 + assert("nikita-726", inode != NULL);
41607 +
41608 + /* no stat-data, nothing to update?! */
41609 + assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
41610 +
41611 + init_lh(&lh);
41612 +
41613 + result = locate_inode_sd(inode, &key, &coord, &lh);
41614 + if (result == 0)
41615 + result = update_sd_at(inode, &coord, &key, &lh);
41616 + done_lh(&lh);
41617 +
41618 + return result;
41619 +}
41620 +
41621 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
41622 +   Remove object stat data. Space for that must be reserved by the caller beforehand.
41623 +*/
41624 +static int
41625 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
41626 +{
41627 + int result;
41628 +
41629 + assert("nikita-1477", inode != NULL);
41630 +
41631 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41632 + reiser4_key sd_key;
41633 +
41634 + DQUOT_FREE_INODE(inode);
41635 + DQUOT_DROP(inode);
41636 +
41637 + build_sd_key(inode, &sd_key);
41638 + result =
41639 + reiser4_cut_tree(reiser4_tree_by_inode(inode),
41640 + &sd_key, &sd_key, NULL, 0);
41641 + if (result == 0) {
41642 + reiser4_inode_set_flag(inode, REISER4_NO_SD);
41643 + result = oid_release(inode->i_sb, get_inode_oid(inode));
41644 + if (result == 0) {
41645 + oid_count_released();
41646 +
41647 + result = safe_link_del(reiser4_tree_by_inode(inode),
41648 + get_inode_oid(inode),
41649 + SAFE_UNLINK);
41650 + }
41651 + }
41652 + } else
41653 + result = 0;
41654 + return result;
41655 +}
41656 +
41657 +/* helper for safelink_common */
41658 +static int process_truncate(struct inode *inode, __u64 size)
41659 +{
41660 + int result;
41661 + struct iattr attr;
41662 + file_plugin *fplug;
41663 + reiser4_context *ctx;
41664 + struct dentry dentry;
41665 +
41666 + assert("vs-21", is_in_reiser4_context());
41667 + ctx = reiser4_init_context(inode->i_sb);
41668 + assert("vs-22", !IS_ERR(ctx));
41669 +
41670 + attr.ia_size = size;
41671 + attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
41672 + fplug = inode_file_plugin(inode);
41673 +
41674 + mutex_lock(&inode->i_mutex);
41675 + assert("vs-1704", get_current_context()->trans->atom == NULL);
41676 + dentry.d_inode = inode;
41677 + result = inode->i_op->setattr(&dentry, &attr);
41678 + mutex_unlock(&inode->i_mutex);
41679 +
41680 + context_set_commit_async(ctx);
41681 + reiser4_exit_context(ctx);
41682 +
41683 + return result;
41684 +}
41685 +
41686 +/*
41687 + Local variables:
41688 + c-indentation-style: "K&R"
41689 + mode-name: "LC"
41690 + c-basic-offset: 8
41691 + tab-width: 8
41692 + fill-column: 80
41693 + scroll-step: 1
41694 + End:
41695 +*/
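A recurring idiom in file_plugin_common.c above: every tree modification is preceded by a worst-case space estimate (the estimate.* methods) and a reservation via reiser4_grab_space(), so an operation fails cleanly with -ENOSPC before touching the tree rather than halfway through. A sketch of the idiom using the names from this file; remove_items() is hypothetical:

    static int delete_two_items_example(struct inode *inode)
    {
            reiser4_block_nr reserve;

            /* worst case: removing two items (stat data and safe-link) */
            reserve = 2 * estimate_one_item_removal(reiser4_tree_by_inode(inode));
            if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
                    return RETERR(-ENOSPC);
            /* space is reserved now, so the removal itself cannot run
               out of disk space halfway through */
            return remove_items(inode);  /* hypothetical */
    }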
41696 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/hash.c linux-2.6.22/fs/reiser4/plugin/hash.c
41697 --- linux-2.6.22.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
41698 +++ linux-2.6.22/fs/reiser4/plugin/hash.c 2007-07-29 00:25:34.936712007 +0400
41699 @@ -0,0 +1,353 @@
41700 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
41701 + * reiser4/README */
41702 +
41703 +/* Hash functions */
41704 +
41705 +#include "../debug.h"
41706 +#include "plugin_header.h"
41707 +#include "plugin.h"
41708 +#include "../super.h"
41709 +#include "../inode.h"
41710 +
41711 +#include <linux/types.h>
41712 +
41713 +/* old rupasov (yura) hash */
41714 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
41715 + int len /* @name's length */ )
41716 +{
41717 + int i;
41718 + int j;
41719 + int pow;
41720 + __u64 a;
41721 + __u64 c;
41722 +
41723 + assert("nikita-672", name != NULL);
41724 + assert("nikita-673", len >= 0);
41725 +
41726 + for (pow = 1, i = 1; i < len; ++i)
41727 + pow = pow * 10;
41728 +
41729 + if (len == 1)
41730 + a = name[0] - 48;
41731 + else
41732 + a = (name[0] - 48) * pow;
41733 +
41734 + for (i = 1; i < len; ++i) {
41735 + c = name[i] - 48;
41736 + for (pow = 1, j = i; j < len - 1; ++j)
41737 + pow = pow * 10;
41738 + a = a + c * pow;
41739 + }
41740 + for (; i < 40; ++i) {
41741 + c = '0' - 48;
41742 + for (pow = 1, j = i; j < len - 1; ++j)
41743 + pow = pow * 10;
41744 + a = a + c * pow;
41745 + }
41746 +
41747 + for (; i < 256; ++i) {
41748 + c = i;
41749 + for (pow = 1, j = i; j < len - 1; ++j)
41750 + pow = pow * 10;
41751 + a = a + c * pow;
41752 + }
41753 +
41754 + a = a << 7;
41755 + return a;
41756 +}
41757 +
41758 +/* r5 hash */
41759 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
41760 + int len UNUSED_ARG /* @name's length */ )
41761 +{
41762 + __u64 a = 0;
41763 +
41764 + assert("nikita-674", name != NULL);
41765 + assert("nikita-675", len >= 0);
41766 +
41767 + while (*name) {
41768 + a += *name << 4;
41769 + a += *name >> 4;
41770 + a *= 11;
41771 + name++;
41772 + }
41773 + return a;
41774 +}
41775 +
41776 +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
41777 + H0 = Key
41778 + Hi = E Mi(Hi-1) + Hi-1
41779 +
41780 + (see Applied Cryptography, 2nd edition, p448).
41781 +
41782 + Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
41783 +
41784 + Jeremy has agreed to the contents of reiserfs/README. -Hans
41785 +
41786 + This code was blindly upgraded to __u64 by s/__u32/__u64/g.
41787 +*/
41788 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
41789 + int len /* @name's length */ )
41790 +{
41791 + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
41792 +
41793 + __u64 h0 = k[0], h1 = k[1];
41794 + __u64 a, b, c, d;
41795 + __u64 pad;
41796 + int i;
41797 +
41798 + assert("nikita-676", name != NULL);
41799 + assert("nikita-677", len >= 0);
41800 +
41801 +#define DELTA 0x9E3779B9u
41802 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
41803 +#define PARTROUNDS 6 /* 6 gets complete mixing */
41804 +
41805 +/* a, b, c, d - data; h0, h1 - accumulated hash */
41806 +#define TEACORE(rounds) \
41807 + do { \
41808 + __u64 sum = 0; \
41809 + int n = rounds; \
41810 + __u64 b0, b1; \
41811 + \
41812 + b0 = h0; \
41813 + b1 = h1; \
41814 + \
41815 + do \
41816 + { \
41817 + sum += DELTA; \
41818 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
41819 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
41820 + } while(--n); \
41821 + \
41822 + h0 += b0; \
41823 + h1 += b1; \
41824 + } while(0)
41825 +
41826 + pad = (__u64) len | ((__u64) len << 8);
41827 + pad |= pad << 16;
41828 +
41829 + while (len >= 16) {
41830 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41831 + 16 | (__u64) name[3] << 24;
41832 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41833 + 16 | (__u64) name[7] << 24;
41834 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41835 + 16 | (__u64) name[11] << 24;
41836 + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
41837 + << 16 | (__u64) name[15] << 24;
41838 +
41839 + TEACORE(PARTROUNDS);
41840 +
41841 + len -= 16;
41842 + name += 16;
41843 + }
41844 +
41845 + if (len >= 12) {
41846 + //assert(len < 16);
41847 + if (len >= 16)
41848 + *(int *)0 = 0;
41849 +
41850 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41851 + 16 | (__u64) name[3] << 24;
41852 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41853 + 16 | (__u64) name[7] << 24;
41854 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41855 + 16 | (__u64) name[11] << 24;
41856 +
41857 + d = pad;
41858 + for (i = 12; i < len; i++) {
41859 + d <<= 8;
41860 + d |= name[i];
41861 + }
41862 + } else if (len >= 8) {
41863 + //assert(len < 12);
41864 + if (len >= 12)
41865 + *(int *)0 = 0;
41866 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41867 + 16 | (__u64) name[3] << 24;
41868 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41869 + 16 | (__u64) name[7] << 24;
41870 +
41871 + c = d = pad;
41872 + for (i = 8; i < len; i++) {
41873 + c <<= 8;
41874 + c |= name[i];
41875 + }
41876 + } else if (len >= 4) {
41877 + //assert(len < 8);
41878 + if (len >= 8)
41879 + *(int *)0 = 0;
41880 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41881 + 16 | (__u64) name[3] << 24;
41882 +
41883 + b = c = d = pad;
41884 + for (i = 4; i < len; i++) {
41885 + b <<= 8;
41886 + b |= name[i];
41887 + }
41888 + } else {
41889 + //assert(len < 4);
41890 + if (len >= 4)
41891 + *(int *)0 = 0;
41892 + a = b = c = d = pad;
41893 + for (i = 0; i < len; i++) {
41894 + a <<= 8;
41895 + a |= name[i];
41896 + }
41897 + }
41898 +
41899 + TEACORE(FULLROUNDS);
41900 +
41901 +/* return 0;*/
41902 + return h0 ^ h1;
41903 +
41904 +}
41905 +
41906 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
41907 +
41908 + See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
41909 +
41910 + Excerpts:
41911 +
41912 + FNV hashes are designed to be fast while maintaining a low collision
41913 + rate.
41914 +
41915 + [This version also seems to preserve lexicographical order locally.]
41916 +
41917 + FNV hash algorithms and source code have been released into the public
41918 + domain.
41919 +
41920 +*/
41921 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
41922 + int len UNUSED_ARG /* @name's length */ )
41923 +{
41924 + unsigned long long a = 0xcbf29ce484222325ull;
41925 + const unsigned long long fnv_64_prime = 0x100000001b3ull;
41926 +
41927 + assert("nikita-678", name != NULL);
41928 + assert("nikita-679", len >= 0);
41929 +
41930 + /* FNV-1 hash each octet in the buffer */
41931 + for (; *name; ++name) {
41932 + /* multiply by the 64 bit FNV magic prime mod 2^64 */
41933 + a *= fnv_64_prime;
41934 + /* xor the bottom with the current octet */
41935 + a ^= (unsigned long long)(*name);
41936 + }
41937 + /* return our new hash value */
41938 + return a;
41939 +}
41940 +
41941 +/* degenerate hash function used to simplify testing of non-unique key
41942 + handling */
41943 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
41944 + int len UNUSED_ARG /* @name's length */ )
41945 +{
41946 + return 0xc0c0c0c010101010ull;
41947 +}
41948 +
41949 +static int change_hash(struct inode *inode,
41950 + reiser4_plugin * plugin,
41951 + pset_member memb)
41952 +{
41953 + int result;
41954 +
41955 + assert("nikita-3503", inode != NULL);
41956 + assert("nikita-3504", plugin != NULL);
41957 +
41958 + assert("nikita-3505", is_reiser4_inode(inode));
41959 + assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
41960 +
41961 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
41962 + return RETERR(-EINVAL);
41963 +
41964 + result = 0;
41965 + if (inode_hash_plugin(inode) == NULL ||
41966 + inode_hash_plugin(inode)->h.id != plugin->h.id) {
41967 + if (is_dir_empty(inode) == 0)
41968 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
41969 + PSET_HASH, plugin);
41970 + else
41971 + result = RETERR(-ENOTEMPTY);
41972 +
41973 + }
41974 + return result;
41975 +}
41976 +
41977 +static reiser4_plugin_ops hash_plugin_ops = {
41978 + .init = NULL,
41979 + .load = NULL,
41980 + .save_len = NULL,
41981 + .save = NULL,
41982 + .change = change_hash
41983 +};
41984 +
41985 +/* hash plugins */
41986 +hash_plugin hash_plugins[LAST_HASH_ID] = {
41987 + [RUPASOV_HASH_ID] = {
41988 + .h = {
41989 + .type_id = REISER4_HASH_PLUGIN_TYPE,
41990 + .id = RUPASOV_HASH_ID,
41991 + .pops = &hash_plugin_ops,
41992 + .label = "rupasov",
41993 + .desc = "Original Yura's hash",
41994 + .linkage = {NULL, NULL}
41995 + },
41996 + .hash = hash_rupasov
41997 + },
41998 + [R5_HASH_ID] = {
41999 + .h = {
42000 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42001 + .id = R5_HASH_ID,
42002 + .pops = &hash_plugin_ops,
42003 + .label = "r5",
42004 + .desc = "r5 hash",
42005 + .linkage = {NULL, NULL}
42006 + },
42007 + .hash = hash_r5
42008 + },
42009 + [TEA_HASH_ID] = {
42010 + .h = {
42011 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42012 + .id = TEA_HASH_ID,
42013 + .pops = &hash_plugin_ops,
42014 + .label = "tea",
42015 + .desc = "tea hash",
42016 + .linkage = {NULL, NULL}
42017 + },
42018 + .hash = hash_tea
42019 + },
42020 + [FNV1_HASH_ID] = {
42021 + .h = {
42022 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42023 + .id = FNV1_HASH_ID,
42024 + .pops = &hash_plugin_ops,
42025 + .label = "fnv1",
42026 + .desc = "fnv1 hash",
42027 + .linkage = {NULL, NULL}
42028 + },
42029 + .hash = hash_fnv1
42030 + },
42031 + [DEGENERATE_HASH_ID] = {
42032 + .h = {
42033 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42034 + .id = DEGENERATE_HASH_ID,
42035 + .pops = &hash_plugin_ops,
42036 + .label = "degenerate hash",
42037 + .desc = "Degenerate hash: only for testing",
42038 + .linkage = {NULL, NULL}
42039 + },
42040 + .hash = hash_deg
42041 + }
42042 +};
42043 +
42044 +/* Make Linus happy.
42045 + Local variables:
42046 + c-indentation-style: "K&R"
42047 + mode-name: "LC"
42048 + c-basic-offset: 8
42049 + tab-width: 8
42050 + fill-column: 120
42051 + End:
42052 +*/
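The directory-entry hashes above are easy to exercise outside the kernel; hash_fnv1 in particular reduces to a few lines of portable C. A self-contained userspace rendition (not part of the patch) using the same 64-bit offset basis and prime:

    #include <stdio.h>

    /* FNV-1: multiply by the 64-bit FNV prime, then xor in the next octet */
    static unsigned long long fnv1(const unsigned char *name)
    {
            unsigned long long a = 0xcbf29ce484222325ull;
            const unsigned long long fnv_64_prime = 0x100000001b3ull;

            for (; *name; ++name) {
                    a *= fnv_64_prime;
                    a ^= (unsigned long long)(*name);
            }
            return a;
    }

    int main(void)
    {
            /* names differing in a single octet hash far apart */
            printf("%016llx\n", fnv1((const unsigned char *)"foo"));
            printf("%016llx\n", fnv1((const unsigned char *)"fop"));
            return 0;
    }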
42053 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.22/fs/reiser4/plugin/inode_ops.c
42054 --- linux-2.6.22.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
42055 +++ linux-2.6.22/fs/reiser4/plugin/inode_ops.c 2007-07-29 00:25:34.936712007 +0400
42056 @@ -0,0 +1,897 @@
42057 +/*
42058 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42059 + */
42060 +
42061 +/*
42062 + * this file contains typical implementations for most of methods of struct
42063 + * inode_operations
42064 + */
42065 +
42066 +#include "../inode.h"
42067 +#include "../safe_link.h"
42068 +
42069 +#include <linux/quotaops.h>
42070 +#include <linux/namei.h>
42071 +
42072 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42073 + reiser4_object_create_data *data);
42074 +
42075 +/**
42076 + * reiser4_create_common - create of inode operations
42077 + * @parent: inode of parent directory
42078 + * @dentry: dentry of new object to create
42079 + * @mode: the permissions to use
42080 + * @nameidata:
42081 + *
42082 + * This is common implementation of vfs's create method of struct
42083 + * inode_operations.
42084 + * Creates regular file using file plugin from parent directory plugin set.
42085 + */
42086 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
42087 + int mode, struct nameidata *nameidata)
42088 +{
42089 + reiser4_object_create_data data;
42090 + file_plugin *fplug;
42091 +
42092 + memset(&data, 0, sizeof data);
42093 + data.mode = S_IFREG | mode;
42094 + fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
42095 + if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
42096 + warning("vpf-1900", "'%s' is not a regular file plugin.",
42097 + fplug->h.label);
42098 + return RETERR(-EIO);
42099 + }
42100 + data.id = fplug->h.id;
42101 + return create_vfs_object(parent, dentry, &data);
42102 +}
42103 +
42104 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42105 +void check_light_weight(struct inode *inode, struct inode *parent);
42106 +
42107 +/**
42108 + * reiser4_lookup_common - lookup of inode operations
42109 + * @parent: inode of directory to lookup into
42110 + * @dentry: name to look for
42111 + * @nameidata:
42112 + *
42113 + * This is common implementation of vfs's lookup method of struct
42114 + * inode_operations.
42115 + */
42116 +struct dentry *reiser4_lookup_common(struct inode *parent,
42117 + struct dentry *dentry,
42118 + struct nameidata *nameidata)
42119 +{
42120 + reiser4_context *ctx;
42121 + int result;
42122 + struct dentry *new;
42123 + struct inode *inode;
42124 + reiser4_dir_entry_desc entry;
42125 +
42126 + ctx = reiser4_init_context(parent->i_sb);
42127 + if (IS_ERR(ctx))
42128 + return (struct dentry *)ctx;
42129 +
42130 + /* set up operations on dentry. */
42131 + dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42132 +
42133 + result = reiser4_lookup_name(parent, dentry, &entry.key);
42134 + if (result) {
42135 + context_set_commit_async(ctx);
42136 + reiser4_exit_context(ctx);
42137 + if (result == -ENOENT) {
42138 + /* object not found */
42139 + if (!IS_DEADDIR(parent))
42140 + d_add(dentry, NULL);
42141 + return NULL;
42142 + }
42143 + return ERR_PTR(result);
42144 + }
42145 +
42146 + inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42147 + if (IS_ERR(inode)) {
42148 + context_set_commit_async(ctx);
42149 + reiser4_exit_context(ctx);
42150 + return ERR_PTR(PTR_ERR(inode));
42151 + }
42152 +
42153 + /* success */
42154 + check_light_weight(inode, parent);
42155 + new = d_splice_alias(inode, dentry);
42156 + reiser4_iget_complete(inode);
42157 +
42158 + /* prevent balance_dirty_pages() from being called: we don't want to
42159 + * do this under directory i_mutex. */
42160 + context_set_commit_async(ctx);
42161 + reiser4_exit_context(ctx);
42162 + return new;
42163 +}
42164 +
42165 +static reiser4_block_nr common_estimate_link(struct inode *parent,
42166 + struct inode *object);
42167 +int reiser4_update_dir(struct inode *);
42168 +
42169 +/**
42170 + * reiser4_link_common - link of inode operations
42171 + * @existing: dentry of object which is to get new name
42172 + * @parent: directory where new name is to be created
42173 + * @newname: new name
42174 + *
42175 + * This is common implementation of vfs's link method of struct
42176 + * inode_operations.
42177 + */
42178 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
42179 + struct dentry *newname)
42180 +{
42181 + reiser4_context *ctx;
42182 + int result;
42183 + struct inode *object;
42184 + dir_plugin *parent_dplug;
42185 + reiser4_dir_entry_desc entry;
42186 + reiser4_object_create_data data;
42187 + reiser4_block_nr reserve;
42188 +
42189 + ctx = reiser4_init_context(parent->i_sb);
42190 + if (IS_ERR(ctx))
42191 + return PTR_ERR(ctx);
42192 +
42193 + assert("nikita-1431", existing != NULL);
42194 + assert("nikita-1432", parent != NULL);
42195 + assert("nikita-1433", newname != NULL);
42196 +
42197 + object = existing->d_inode;
42198 + assert("nikita-1434", object != NULL);
42199 +
42200 + /* check for race with create_object() */
42201 + if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
42202 + context_set_commit_async(ctx);
42203 + reiser4_exit_context(ctx);
42204 + return RETERR(-E_REPEAT);
42205 + }
42206 +
42207 + parent_dplug = inode_dir_plugin(parent);
42208 +
42209 + memset(&entry, 0, sizeof entry);
42210 + entry.obj = object;
42211 +
42212 + data.mode = object->i_mode;
42213 + data.id = inode_file_plugin(object)->h.id;
42214 +
42215 + reserve = common_estimate_link(parent, existing->d_inode);
42216 + if ((__s64) reserve < 0) {
42217 + context_set_commit_async(ctx);
42218 + reiser4_exit_context(ctx);
42219 + return reserve;
42220 + }
42221 +
42222 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42223 + context_set_commit_async(ctx);
42224 + reiser4_exit_context(ctx);
42225 + return RETERR(-ENOSPC);
42226 + }
42227 +
42228 + /*
42229 + * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42230 + * means that link(2) can race against unlink(2) or rename(2), and
42231 + * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42232 + *
42233 + * For such inode we have to undo special processing done in
42234 + * reiser4_unlink() viz. creation of safe-link.
42235 + */
42236 + if (unlikely(object->i_nlink == 0)) {
42237 + result = safe_link_del(reiser4_tree_by_inode(object),
42238 + get_inode_oid(object), SAFE_UNLINK);
42239 + if (result != 0) {
42240 + context_set_commit_async(ctx);
42241 + reiser4_exit_context(ctx);
42242 + return result;
42243 + }
42244 + }
42245 +
42246 + /* increment nlink of @existing and update its stat data */
42247 + result = reiser4_add_nlink(object, parent, 1);
42248 + if (result == 0) {
42249 + /* add entry to the parent */
42250 + result =
42251 + parent_dplug->add_entry(parent, newname, &data, &entry);
42252 + if (result != 0) {
42253 + /* failed to add entry to the parent, decrement nlink
42254 + of @existing */
42255 + reiser4_del_nlink(object, parent, 1);
42256 + /*
42257 + * now, if that failed, we have a file with a too-big
42258 + * nlink (a space leak), which is much better than a
42259 + * directory entry pointing to nowhere
42260 + */
42261 + }
42262 + }
42263 + if (result == 0) {
42264 + atomic_inc(&object->i_count);
42265 + /*
42266 + * Upon successful completion, link() shall mark for update
42267 + * the st_ctime field of the file. Also, the st_ctime and
42268 + * st_mtime fields of the directory that contains the new
42269 + * entry shall be marked for update. --SUS
42270 + */
42271 + result = reiser4_update_dir(parent);
42272 + }
42273 + if (result == 0)
42274 + d_instantiate(newname, existing->d_inode);
42275 +
42276 + context_set_commit_async(ctx);
42277 + reiser4_exit_context(ctx);
42278 + return result;
42279 +}
42280 +
42281 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42282 +
42283 +/**
42284 + * reiser4_unlink_common - unlink of inode operations
42285 + * @parent: inode of directory to remove name from
42286 + * @victim: name to be removed
42287 + *
42288 + * This is common implementation of vfs's unlink method of struct
42289 + * inode_operations.
42290 + */
42291 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
42292 +{
42293 + reiser4_context *ctx;
42294 + int result;
42295 + struct inode *object;
42296 + file_plugin *fplug;
42297 +
42298 + ctx = reiser4_init_context(parent->i_sb);
42299 + if (IS_ERR(ctx))
42300 + return PTR_ERR(ctx);
42301 +
42302 + object = victim->d_inode;
42303 + fplug = inode_file_plugin(object);
42304 + assert("nikita-2882", fplug->detach != NULL);
42305 +
42306 + result = unlink_check_and_grab(parent, victim);
42307 + if (result != 0) {
42308 + context_set_commit_async(ctx);
42309 + reiser4_exit_context(ctx);
42310 + return result;
42311 + }
42312 +
42313 + result = fplug->detach(object, parent);
42314 + if (result == 0) {
42315 + dir_plugin *parent_dplug;
42316 + reiser4_dir_entry_desc entry;
42317 +
42318 + parent_dplug = inode_dir_plugin(parent);
42319 + memset(&entry, 0, sizeof entry);
42320 +
42321 + /* first, delete directory entry */
42322 + result = parent_dplug->rem_entry(parent, victim, &entry);
42323 + if (result == 0) {
42324 + /*
42325 + * if name was removed successfully, we _have_ to
42326 + * return 0 from this function, because the upper level
42327 + * callers (vfs_{rmdir,unlink}) expect this.
42328 + *
42329 + * now that directory entry is removed, update
42330 + * stat-data
42331 + */
42332 + reiser4_del_nlink(object, parent, 1);
42333 + /*
42334 + * Upon successful completion, unlink() shall mark for
42335 + * update the st_ctime and st_mtime fields of the
42336 + * parent directory. Also, if the file's link count is
42337 + * not 0, the st_ctime field of the file shall be
42338 + * marked for update. --SUS
42339 + */
42340 + reiser4_update_dir(parent);
42341 + /* add safe-link for this file */
42342 + if (object->i_nlink == 0)
42343 + safe_link_add(object, SAFE_UNLINK);
42344 + }
42345 + }
42346 +
42347 + if (unlikely(result != 0)) {
42348 + if (result != -ENOMEM)
42349 + warning("nikita-3398", "Cannot unlink %llu (%i)",
42350 + (unsigned long long)get_inode_oid(object),
42351 + result);
42352 + /* if operation failed commit pending inode modifications to
42353 + * the stat-data */
42354 + reiser4_update_sd(object);
42355 + reiser4_update_sd(parent);
42356 + }
42357 +
42358 + reiser4_release_reserved(object->i_sb);
42359 +
42360 + /* @object's i_ctime was updated by the ->rem_link() method. */
42361 +
42362 + /* @victim may already have been removed from the disk by this time.
42363 + The inode is then marked so that iput() won't try to remove stat
42364 + data. But the inode itself is still there.
42365 + */
42366 +
42367 + /*
42368 + * we cannot release directory semaphore here, because name has
42369 + * already been deleted, but dentry (@victim) still exists. Prevent
42370 + * balance_dirty_pages() from being called on exiting this context: we
42371 + * don't want to do this under directory i_mutex.
42372 + */
42373 + context_set_commit_async(ctx);
42374 + reiser4_exit_context(ctx);
42375 + return result;
42376 +}
42377 +
42378 +/**
42379 + * reiser4_symlink_common - symlink of inode operations
42380 + * @parent: inode of parent directory
42381 + * @dentry: dentry of object to be created
42382 + * @linkname: string symlink is to contain
42383 + *
42384 + * This is common implementation of vfs's symlink method of struct
42385 + * inode_operations.
42386 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
42387 + */
42388 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
42389 + const char *linkname)
42390 +{
42391 + reiser4_object_create_data data;
42392 +
42393 + memset(&data, 0, sizeof data);
42394 + data.name = linkname;
42395 + data.id = SYMLINK_FILE_PLUGIN_ID;
42396 + data.mode = S_IFLNK | S_IRWXUGO;
42397 + return create_vfs_object(parent, dentry, &data);
42398 +}
42399 +
42400 +/**
42401 + * reiser4_mkdir_common - mkdir of inode operations
42402 + * @parent: inode of parent directory
42403 + * @dentry: dentry of object to be created
42404 + * @mode: the permissions to use
42405 + *
42406 + * This is common implementation of vfs's mkdir method of struct
42407 + * inode_operations.
42408 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
42409 + */
42410 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42411 +{
42412 + reiser4_object_create_data data;
42413 +
42414 + memset(&data, 0, sizeof data);
42415 + data.mode = S_IFDIR | mode;
42416 + data.id = DIRECTORY_FILE_PLUGIN_ID;
42417 + return create_vfs_object(parent, dentry, &data);
42418 +}
42419 +
42420 +/**
42421 + * reiser4_mknod_common - mknod of inode operations
42422 + * @parent: inode of parent directory
42423 + * @dentry: dentry of object to be created
42424 + * @mode: the permissions to use and file type
42425 + * @rdev: minor and major of new device file
42426 + *
42427 + * This is common implementation of vfs's mknod method of struct
42428 + * inode_operations.
42429 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
42430 + */
42431 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
42432 + int mode, dev_t rdev)
42433 +{
42434 + reiser4_object_create_data data;
42435 +
42436 + memset(&data, 0, sizeof data);
42437 + data.mode = mode;
42438 + data.rdev = rdev;
42439 + data.id = SPECIAL_FILE_PLUGIN_ID;
42440 + return create_vfs_object(parent, dentry, &data);
42441 +}
42442 +
42443 +/*
42444 + * implementation of vfs's rename method of struct inode_operations for typical
42445 + * directory is in inode_ops_rename.c
42446 + */
42447 +
42448 +/**
42449 + * reiser4_follow_link_common - follow_link of inode operations
42450 + * @dentry: dentry of symlink
42451 + * @data:
42452 + *
42453 + * This is common implementation of vfs's followlink method of struct
42454 + * inode_operations.
42455 + * Assumes that inode's i_private points to the content of symbolic link.
42456 + */
42457 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
42458 +{
42459 + assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42460 +
42461 + if (!dentry->d_inode->i_private
42462 + || !reiser4_inode_get_flag(dentry->d_inode,
42463 + REISER4_GENERIC_PTR_USED))
42464 + return ERR_PTR(RETERR(-EINVAL));
42465 + nd_set_link(nd, dentry->d_inode->i_private);
42466 + return NULL;
42467 +}
42468 +
42469 +/**
42470 + * reiser4_permission_common - permission of inode operations
42471 + * @inode: inode to check permissions for
42472 + * @mask: mode bits to check permissions for
42473 + * @nameidata:
42474 + *
42475 + * Uses generic function to check for rwx permissions.
42476 + */
42477 +int reiser4_permission_common(struct inode *inode, int mask,
42478 + struct nameidata *nameidata)
42479 +{
42480 + return generic_permission(inode, mask, NULL);
42481 +}
42482 +
42483 +static int setattr_reserve(reiser4_tree *);
42484 +
42485 +/* this is the common implementation of vfs's setattr method of struct
42486 + inode_operations
42487 +*/
42488 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
42489 +{
42490 + reiser4_context *ctx;
42491 + struct inode *inode;
42492 + int result;
42493 +
42494 + inode = dentry->d_inode;
42495 + result = inode_change_ok(inode, attr);
42496 + if (result)
42497 + return result;
42498 +
42499 + ctx = reiser4_init_context(inode->i_sb);
42500 + if (IS_ERR(ctx))
42501 + return PTR_ERR(ctx);
42502 +
42503 + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42504 +
42505 + /*
42506 + * grab disk space and call standard inode_setattr().
42507 + */
42508 + result = setattr_reserve(reiser4_tree_by_inode(inode));
42509 + if (!result) {
42510 + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42511 + || (attr->ia_valid & ATTR_GID
42512 + && attr->ia_gid != inode->i_gid)) {
42513 + result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
42514 + if (result) {
42515 + context_set_commit_async(ctx);
42516 + reiser4_exit_context(ctx);
42517 + return result;
42518 + }
42519 + }
42520 + result = inode_setattr(inode, attr);
42521 + if (!result)
42522 + reiser4_update_sd(inode);
42523 + }
42524 +
42525 + context_set_commit_async(ctx);
42526 + reiser4_exit_context(ctx);
42527 + return result;
42528 +}
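reiser4_setattr_common() follows the same wrapper pattern as every VFS entry point in this file: open a reiser4 context on the superblock, reserve disk space, do the work, request an asynchronous commit and close the context. A skeletal sketch of that pattern (illustrative only; the function name is hypothetical):

	static int example_reiser4_entry_point(struct inode *inode)
	{
		reiser4_context *ctx;
		int result;

		ctx = reiser4_init_context(inode->i_sb);	/* attach to an atom */
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
		result = 0;	/* ... grab space, then modify the tree ... */
		context_set_commit_async(ctx);	/* commit lazily, not synchronously */
		reiser4_exit_context(ctx);
		return result;
	}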
42529 +
42530 +/* this is the common implementation of vfs's getattr method of struct
42531 + inode_operations
42532 +*/
42533 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
42534 + struct dentry *dentry, struct kstat *stat)
42535 +{
42536 + struct inode *obj;
42537 +
42538 + assert("nikita-2298", dentry != NULL);
42539 + assert("nikita-2299", stat != NULL);
42540 + assert("nikita-2300", dentry->d_inode != NULL);
42541 +
42542 + obj = dentry->d_inode;
42543 +
42544 + stat->dev = obj->i_sb->s_dev;
42545 + stat->ino = oid_to_uino(get_inode_oid(obj));
42546 + stat->mode = obj->i_mode;
42547 +	/* don't confuse userland with a huge nlink. This is not entirely
42548 +	 * correct, because nlink_t is not necessarily a 16 bit signed type. */
42549 + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42550 + stat->uid = obj->i_uid;
42551 + stat->gid = obj->i_gid;
42552 + stat->rdev = obj->i_rdev;
42553 + stat->atime = obj->i_atime;
42554 + stat->mtime = obj->i_mtime;
42555 + stat->ctime = obj->i_ctime;
42556 + stat->size = obj->i_size;
42557 + stat->blocks =
42558 + (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42559 + /* "preferred" blocksize for efficient file system I/O */
42560 + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42561 +
42562 + return 0;
42563 +}
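The stat->blocks expression above is a round-up to fixed-size units; assuming VFS_BLKSIZE is 512 (VFS_BLKSIZE_BITS == 9), which matches the st_blocks convention, a file occupying 1000 bytes reports two blocks:

	/* sketch of the rounding, under the 512-byte assumption above */
	static inline __u64 bytes_to_stat_blocks(__u64 bytes)
	{
		return (bytes + 511) >> 9;	/* e.g. (1000 + 511) >> 9 == 2 */
	}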
42564 +
42565 +/* Estimate the maximum number of nodes which might be allocated or changed on
42566 +   typical new object creation. Typical creation consists of calling the create
42567 +   method of the file plugin, adding a directory entry to the parent and updating
42568 +   the parent directory's stat data.
42569 +*/
42570 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
42571 + struct inode *object
42572 + /* object */ )
42573 +{
42574 + assert("vpf-309", parent != NULL);
42575 + assert("vpf-307", object != NULL);
42576 +
42577 + return
42578 + /* object creation estimation */
42579 + inode_file_plugin(object)->estimate.create(object) +
42580 + /* stat data of parent directory estimation */
42581 + inode_file_plugin(parent)->estimate.update(parent) +
42582 + /* adding entry estimation */
42583 + inode_dir_plugin(parent)->estimate.add_entry(parent) +
42584 + /* to undo in the case of failure */
42585 + inode_dir_plugin(parent)->estimate.rem_entry(parent);
42586 +}
42587 +
42588 +/* Create child in directory.
42589 +
42590 + . get object's plugin
42591 + . get fresh inode
42592 + . initialize inode
42593 + . add object's stat-data
42594 + . initialize object's directory
42595 + . add entry to the parent
42596 + . instantiate dentry
42597 +
42598 +*/
42599 +static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
42600 + object */
42601 + struct inode **retobj)
42602 +{
42603 + int result;
42604 +
42605 +	struct dentry *dentry;	/* new name */
42606 +	struct inode *parent;	/* parent object */
42607 +
42608 + dir_plugin *par_dir; /* directory plugin on the parent */
42609 + dir_plugin *obj_dir; /* directory plugin on the new object */
42610 + file_plugin *obj_plug; /* object plugin on the new object */
42611 + struct inode *object; /* new object */
42612 + reiser4_block_nr reserve;
42613 +
42614 + reiser4_dir_entry_desc entry; /* new directory entry */
42615 +
42616 + assert("nikita-1420", data != NULL);
42617 + parent = data->parent;
42618 + dentry = data->dentry;
42619 +
42620 + assert("nikita-1418", parent != NULL);
42621 + assert("nikita-1419", dentry != NULL);
42622 +
42623 + /* check, that name is acceptable for parent */
42624 + par_dir = inode_dir_plugin(parent);
42625 + if (par_dir->is_name_acceptable &&
42626 + !par_dir->is_name_acceptable(parent,
42627 + dentry->d_name.name,
42628 + (int)dentry->d_name.len))
42629 + return RETERR(-ENAMETOOLONG);
42630 +
42631 + result = 0;
42632 + obj_plug = file_plugin_by_id((int)data->id);
42633 + if (obj_plug == NULL) {
42634 + warning("nikita-430", "Cannot find plugin %i", data->id);
42635 + return RETERR(-ENOENT);
42636 + }
42637 + object = new_inode(parent->i_sb);
42638 + if (object == NULL)
42639 + return RETERR(-ENOMEM);
42640 + /* we'll update i_nlink below */
42641 + object->i_nlink = 0;
42642 + /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
42643 + * to simplify error handling: if some error occurs before i_ino is
42644 + * initialized with oid, i_ino should already be set to some
42645 + * distinguished value. */
42646 + object->i_ino = 0;
42647 +
42648 + /* So that on error iput will be called. */
42649 + *retobj = object;
42650 +
42651 + if (DQUOT_ALLOC_INODE(object)) {
42652 + DQUOT_DROP(object);
42653 + object->i_flags |= S_NOQUOTA;
42654 + return RETERR(-EDQUOT);
42655 + }
42656 +
42657 + memset(&entry, 0, sizeof entry);
42658 + entry.obj = object;
42659 +
42660 + set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
42661 + file_plugin_to_plugin(obj_plug));
42662 + result = obj_plug->set_plug_in_inode(object, parent, data);
42663 + if (result) {
42664 + warning("nikita-431", "Cannot install plugin %i on %llx",
42665 + data->id, (unsigned long long)get_inode_oid(object));
42666 + DQUOT_FREE_INODE(object);
42667 + object->i_flags |= S_NOQUOTA;
42668 + return result;
42669 + }
42670 +
42671 + /* reget plugin after installation */
42672 + obj_plug = inode_file_plugin(object);
42673 +
42674 + if (obj_plug->create_object == NULL) {
42675 + DQUOT_FREE_INODE(object);
42676 + object->i_flags |= S_NOQUOTA;
42677 + return RETERR(-EPERM);
42678 + }
42679 +
42680 +	/* if any of the hash, tail, sd or permission plugins for the newly
42681 +	   created object are not set yet, set them here, inheriting them from
42682 +	   the parent directory
42683 + */
42684 + assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
42685 + result = obj_plug->adjust_to_parent(object,
42686 + parent,
42687 + object->i_sb->s_root->d_inode);
42688 + if (result == 0)
42689 + result = finish_pset(object);
42690 + if (result != 0) {
42691 + warning("nikita-432", "Cannot inherit from %llx to %llx",
42692 + (unsigned long long)get_inode_oid(parent),
42693 + (unsigned long long)get_inode_oid(object));
42694 + DQUOT_FREE_INODE(object);
42695 + object->i_flags |= S_NOQUOTA;
42696 + return result;
42697 + }
42698 +
42699 + /* setup inode and file-operations for this inode */
42700 + setup_inode_ops(object, data);
42701 +
42702 + /* call file plugin's method to initialize plugin specific part of
42703 + * inode */
42704 + if (obj_plug->init_inode_data)
42705 + obj_plug->init_inode_data(object, data, 1 /*create */ );
42706 +
42707 + /* obtain directory plugin (if any) for new object. */
42708 + obj_dir = inode_dir_plugin(object);
42709 + if (obj_dir != NULL && obj_dir->init == NULL) {
42710 + DQUOT_FREE_INODE(object);
42711 + object->i_flags |= S_NOQUOTA;
42712 + return RETERR(-EPERM);
42713 + }
42714 +
42715 + reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
42716 +
42717 + reserve = estimate_create_vfs_object(parent, object);
42718 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42719 + DQUOT_FREE_INODE(object);
42720 + object->i_flags |= S_NOQUOTA;
42721 + return RETERR(-ENOSPC);
42722 + }
42723 +
42724 +	/* mark inode `immutable'. We disable changes to the file being
42725 +	   created until a valid directory entry for it is inserted. Otherwise,
42726 +	   if the file were expanded and insertion of the directory entry failed,
42727 +	   we would have to remove the file, but we only allotted enough space
42728 +	   in the transaction to remove an _empty_ file. The 3.x code used to
42729 +	   remove the stat data in a different transaction, thus possibly leaking
42730 +	   disk space on crash. This all only matters if it is possible to access
42731 +	   a file without a name, for example, by inode number.
42732 +	 */
42733 + reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
42734 +
42735 +	/* create an empty object; this includes allocation of a new objectid.
42736 +	   For directories this implies creation of dot and dotdot */
42737 + assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
42738 +
42739 + /* mark inode as `loaded'. From this point onward
42740 + reiser4_delete_inode() will try to remove its stat-data. */
42741 + reiser4_inode_set_flag(object, REISER4_LOADED);
42742 +
42743 + result = obj_plug->create_object(object, parent, data);
42744 + if (result != 0) {
42745 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
42746 + if (result != -ENAMETOOLONG && result != -ENOMEM)
42747 + warning("nikita-2219",
42748 + "Failed to create sd for %llu",
42749 + (unsigned long long)get_inode_oid(object));
42750 + DQUOT_FREE_INODE(object);
42751 + object->i_flags |= S_NOQUOTA;
42752 + return result;
42753 + }
42754 +
42755 + if (obj_dir != NULL)
42756 + result = obj_dir->init(object, parent, data);
42757 + if (result == 0) {
42758 + assert("nikita-434", !reiser4_inode_get_flag(object,
42759 + REISER4_NO_SD));
42760 + /* insert inode into VFS hash table */
42761 + insert_inode_hash(object);
42762 + /* create entry */
42763 + result = par_dir->add_entry(parent, dentry, data, &entry);
42764 + if (result == 0) {
42765 + result = reiser4_add_nlink(object, parent, 0);
42766 + /* If O_CREAT is set and the file did not previously
42767 + exist, upon successful completion, open() shall
42768 + mark for update the st_atime, st_ctime, and
42769 + st_mtime fields of the file and the st_ctime and
42770 + st_mtime fields of the parent directory. --SUS
42771 + */
42772 + /* @object times are already updated by
42773 + reiser4_add_nlink() */
42774 + if (result == 0)
42775 + reiser4_update_dir(parent);
42776 + if (result != 0)
42777 + /* cleanup failure to add nlink */
42778 + par_dir->rem_entry(parent, dentry, &entry);
42779 + }
42780 + if (result != 0)
42781 + /* cleanup failure to add entry */
42782 + obj_plug->detach(object, parent);
42783 + } else if (result != -ENOMEM)
42784 + warning("nikita-2219", "Failed to initialize dir for %llu: %i",
42785 + (unsigned long long)get_inode_oid(object), result);
42786 +
42787 + /*
42788 + * update stat-data, committing all pending modifications to the inode
42789 + * fields.
42790 + */
42791 + reiser4_update_sd(object);
42792 + if (result != 0) {
42793 + DQUOT_FREE_INODE(object);
42794 + object->i_flags |= S_NOQUOTA;
42795 + /* if everything was ok (result == 0), parent stat-data is
42796 +		 * already updated above by reiser4_update_dir(parent) */
42797 + reiser4_update_sd(parent);
42798 + /* failure to create entry, remove object */
42799 + obj_plug->delete_object(object);
42800 + }
42801 +
42802 + /* file has name now, clear immutable flag */
42803 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
42804 +
42805 + /* on error, iput() will call ->delete_inode(). We should keep track
42806 +	   of the existence of stat-data for this inode and avoid attempts to
42807 + remove it in reiser4_delete_inode(). This is accomplished through
42808 + REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
42809 + */
42810 + return result;
42811 +}
42812 +
42813 +/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
42814 + reiser4_mknod and reiser4_symlink
42815 +*/
42816 +static int
42817 +create_vfs_object(struct inode *parent,
42818 + struct dentry *dentry, reiser4_object_create_data * data)
42819 +{
42820 + reiser4_context *ctx;
42821 + int result;
42822 + struct inode *child;
42823 +
42824 + ctx = reiser4_init_context(parent->i_sb);
42825 + if (IS_ERR(ctx))
42826 + return PTR_ERR(ctx);
42827 + context_set_commit_async(ctx);
42828 +
42829 + data->parent = parent;
42830 + data->dentry = dentry;
42831 + child = NULL;
42832 + result = do_create_vfs_child(data, &child);
42833 + if (unlikely(result != 0)) {
42834 + if (child != NULL) {
42835 + reiser4_make_bad_inode(child);
42836 + iput(child);
42837 + }
42838 + } else
42839 + d_instantiate(dentry, child);
42840 +
42841 + reiser4_exit_context(ctx);
42842 + return result;
42843 +}
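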
42844 +
42845 +/* helper for link_common. Estimate disk space necessary to add a link
42846 + from @parent to @object
42847 +*/
42848 +static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
42849 + struct inode *object
42850 +						     /* object to which the new link is being created */
42851 + )
42852 +{
42853 + reiser4_block_nr res = 0;
42854 + file_plugin *fplug;
42855 + dir_plugin *dplug;
42856 +
42857 + assert("vpf-317", object != NULL);
42858 + assert("vpf-318", parent != NULL);
42859 +
42860 + fplug = inode_file_plugin(object);
42861 + dplug = inode_dir_plugin(parent);
42862 + /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
42863 + /* reiser4_add_nlink(object) */
42864 + res += fplug->estimate.update(object);
42865 + /* add_entry(parent) */
42866 + res += dplug->estimate.add_entry(parent);
42867 + /* reiser4_del_nlink(object) */
42868 + res += fplug->estimate.update(object);
42869 + /* update_dir(parent) */
42870 + res += inode_file_plugin(parent)->estimate.update(parent);
42871 + /* safe-link */
42872 + res += estimate_one_item_removal(reiser4_tree_by_inode(object));
42873 +
42874 + return res;
42875 +}
42876 +
42877 +/* Estimate disk space necessary to remove a link between @parent and
42878 + @object.
42879 +*/
42880 +static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
42881 + struct inode *object
42882 +					/* object from which a link is being removed */
42883 + )
42884 +{
42885 + reiser4_block_nr res = 0;
42886 + file_plugin *fplug;
42887 + dir_plugin *dplug;
42888 +
42889 + assert("vpf-317", object != NULL);
42890 + assert("vpf-318", parent != NULL);
42891 +
42892 + fplug = inode_file_plugin(object);
42893 + dplug = inode_dir_plugin(parent);
42894 +
42895 + /* rem_entry(parent) */
42896 + res += dplug->estimate.rem_entry(parent);
42897 + /* reiser4_del_nlink(object) */
42898 + res += fplug->estimate.update(object);
42899 + /* update_dir(parent) */
42900 + res += inode_file_plugin(parent)->estimate.update(parent);
42901 + /* fplug->unlink */
42902 + res += fplug->estimate.unlink(object, parent);
42903 + /* safe-link */
42904 + res += estimate_one_insert_item(reiser4_tree_by_inode(object));
42905 +
42906 + return res;
42907 +}
42908 +
42909 +/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
42910 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
42911 +{
42912 + file_plugin *fplug;
42913 + struct inode *child;
42914 + int result;
42915 +
42916 + result = 0;
42917 + child = victim->d_inode;
42918 + fplug = inode_file_plugin(child);
42919 +
42920 + /* check for race with create_object() */
42921 + if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
42922 + return RETERR(-E_REPEAT);
42923 + /* object being deleted should have stat data */
42924 + assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
42925 +
42926 + /* ask object plugin */
42927 + if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
42928 + return RETERR(-ENOTEMPTY);
42929 +
42930 + result = (int)estimate_unlink(parent, child);
42931 + if (result < 0)
42932 + return result;
42933 +
42934 + return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
42935 +}
42936 +
42937 +/* helper for reiser4_setattr_common */
42938 +static int setattr_reserve(reiser4_tree * tree)
42939 +{
42940 + assert("vs-1096", is_grab_enabled(get_current_context()));
42941 + return reiser4_grab_space(estimate_one_insert_into_item(tree),
42942 + BA_CAN_COMMIT);
42943 +}
42944 +
42945 +/* helper function. Standards require that for many file-system operations
42946 +   on success the ctime and mtime of the parent directory are to be updated. */
42947 +int reiser4_update_dir(struct inode *dir)
42948 +{
42949 + assert("nikita-2525", dir != NULL);
42950 +
42951 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
42952 + return reiser4_update_sd(dir);
42953 +}
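Taken together, these *_common methods are meant to be slotted into struct inode_operations tables; the actual tables are built elsewhere in this patch, per file plugin. A hypothetical wiring under the 2.6.22 signatures could look like:

	static const struct inode_operations example_dir_ops = {
		.mkdir		= reiser4_mkdir_common,
		.symlink	= reiser4_symlink_common,
		.mknod		= reiser4_mknod_common,
		.rename		= reiser4_rename_common,	/* see inode_ops_rename.c below */
		.follow_link	= reiser4_follow_link_common,
		.permission	= reiser4_permission_common,
		.setattr	= reiser4_setattr_common,
		.getattr	= reiser4_getattr_common,
	};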
42954 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.22/fs/reiser4/plugin/inode_ops_rename.c
42955 --- linux-2.6.22.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
42956 +++ linux-2.6.22/fs/reiser4/plugin/inode_ops_rename.c 2007-07-29 00:25:34.940713042 +0400
42957 @@ -0,0 +1,914 @@
42958 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
42959 + * reiser4/README */
42960 +
42961 +#include "../inode.h"
42962 +#include "../safe_link.h"
42963 +
42964 +static const char *possible_leak = "Possible disk space leak.";
42965 +
42966 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
42967 +
42968 + Helper function called from hashed_rename() */
42969 +static int replace_name(struct inode *to_inode, /* inode where @from_coord is
42970 + * to be re-targeted at */
42971 + struct inode *from_dir, /* directory where @from_coord
42972 + * lives */
42973 + struct inode *from_inode, /* inode @from_coord
42974 + * originally point to */
42975 + coord_t * from_coord, /* where directory entry is in
42976 + * the tree */
42977 + lock_handle * from_lh /* lock handle on @from_coord */ )
42978 +{
42979 + item_plugin *from_item;
42980 + int result;
42981 + znode *node;
42982 +
42983 + coord_clear_iplug(from_coord);
42984 + node = from_coord->node;
42985 + result = zload(node);
42986 + if (result != 0)
42987 + return result;
42988 + from_item = item_plugin_by_coord(from_coord);
42989 + if (plugin_of_group(item_plugin_by_coord(from_coord),
42990 + DIR_ENTRY_ITEM_TYPE))
42991 + {
42992 + reiser4_key to_key;
42993 +
42994 + build_sd_key(to_inode, &to_key);
42995 +
42996 + /* everything is found and prepared to change directory entry
42997 + at @from_coord to point to @to_inode.
42998 +
42999 + @to_inode is just about to get new name, so bump its link
43000 + counter.
43001 +
43002 + */
43003 + result = reiser4_add_nlink(to_inode, from_dir, 0);
43004 + if (result != 0) {
43005 + /* Don't issue warning: this may be plain -EMLINK */
43006 + zrelse(node);
43007 + return result;
43008 + }
43009 +
43010 + result =
43011 + from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43012 + if (result != 0) {
43013 + reiser4_del_nlink(to_inode, from_dir, 0);
43014 + zrelse(node);
43015 + return result;
43016 + }
43017 +
43018 + /* @from_inode just lost its name, he-he.
43019 +
43020 +		   If @from_inode was a directory, it contained a dotdot entry
43021 +		   pointing to @from_dir. @from_dir's i_nlink will be decreased
43022 +		   when iput() is called on @from_inode.
43023 +
43024 +		   If the file-system is not ADG (hard-links are
43025 +		   supported on directories), iput(from_inode) will not remove
43026 +		   @from_inode, and thus the above is incorrect, but hard-links on
43027 +		   directories are problematic in many other respects.
43028 + */
43029 + result = reiser4_del_nlink(from_inode, from_dir, 0);
43030 + if (result != 0) {
43031 + warning("nikita-2330",
43032 + "Cannot remove link from source: %i. %s",
43033 + result, possible_leak);
43034 + }
43035 + /* Has to return success, because entry is already
43036 + * modified. */
43037 + result = 0;
43038 +
43039 +		/* NOTE-NIKITA consider calling a plugin method instead of
43040 + accessing inode fields directly. */
43041 + from_dir->i_mtime = CURRENT_TIME;
43042 + } else {
43043 + warning("nikita-2326", "Unexpected item type");
43044 + result = RETERR(-EIO);
43045 + }
43046 + zrelse(node);
43047 + return result;
43048 +}
43049 +
43050 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43051 +
43052 + Helper function used by hashed_rename(). */
43053 +static int add_name(struct inode *inode, /* inode where @coord is to be
43054 + * re-targeted at */
43055 + struct inode *dir, /* directory where @coord lives */
43056 + struct dentry *name, /* new name */
43057 + coord_t * coord, /* where directory entry is in the tree */
43058 + lock_handle * lh, /* lock handle on @coord */
43059 + int is_dir /* true, if @inode is directory */ )
43060 +{
43061 + int result;
43062 + reiser4_dir_entry_desc entry;
43063 +
43064 + assert("nikita-2333", lh->node == coord->node);
43065 + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43066 +
43067 + memset(&entry, 0, sizeof entry);
43068 + entry.obj = inode;
43069 + /* build key of directory entry description */
43070 + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43071 +
43072 +	/* ext2 does this in a different order: it first inserts the new
43073 +	   entry, then increases the directory nlink. We don't want to do
43074 +	   this, because reiser4_add_nlink() calls the ->add_link() plugin
43075 +	   method, which can fail for whatever reason, leaving us with
43076 +	   cleanup problems.
43077 + */
43078 + /* @inode is getting new name */
43079 + reiser4_add_nlink(inode, dir, 0);
43080 + /* create @new_name in @new_dir pointing to
43081 + @old_inode */
43082 + result = WITH_COORD(coord,
43083 + inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43084 + coord,
43085 + lh,
43086 + name,
43087 + &entry));
43088 + if (result != 0) {
43089 + int result2;
43090 + result2 = reiser4_del_nlink(inode, dir, 0);
43091 + if (result2 != 0) {
43092 + warning("nikita-2327",
43093 + "Cannot drop link on %lli %i. %s",
43094 + (unsigned long long)get_inode_oid(inode),
43095 + result2, possible_leak);
43096 + }
43097 + } else
43098 + INODE_INC_FIELD(dir, i_size);
43099 + return result;
43100 +}
43101 +
43102 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43103 + struct dentry *old_name, /* old name */
43104 + struct inode *new_dir, /* directory where @new is located */
43105 + struct dentry *new_name /* new name */ )
43106 +{
43107 + reiser4_block_nr res1, res2;
43108 + dir_plugin *p_parent_old, *p_parent_new;
43109 + file_plugin *p_child_old, *p_child_new;
43110 +
43111 + assert("vpf-311", old_dir != NULL);
43112 + assert("vpf-312", new_dir != NULL);
43113 + assert("vpf-313", old_name != NULL);
43114 + assert("vpf-314", new_name != NULL);
43115 +
43116 + p_parent_old = inode_dir_plugin(old_dir);
43117 + p_parent_new = inode_dir_plugin(new_dir);
43118 + p_child_old = inode_file_plugin(old_name->d_inode);
43119 + if (new_name->d_inode)
43120 + p_child_new = inode_file_plugin(new_name->d_inode);
43121 + else
43122 + p_child_new = NULL;
43123 +
43124 + /* find_entry - can insert one leaf. */
43125 + res1 = res2 = 1;
43126 +
43127 + /* replace_name */
43128 + {
43129 + /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43130 + res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43131 + /* update key */
43132 + res1 += 1;
43133 + /* reiser4_del_nlink(p_child_new) */
43134 + if (p_child_new)
43135 + res1 += p_child_new->estimate.update(new_name->d_inode);
43136 + }
43137 +
43138 + /* else add_name */
43139 + {
43140 + /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43141 + res2 +=
43142 + 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43143 + /* reiser4_add_nlink(p_parent_old) */
43144 + res2 += p_child_old->estimate.update(old_name->d_inode);
43145 + /* add_entry(p_parent_new) */
43146 + res2 += p_parent_new->estimate.add_entry(new_dir);
43147 + /* reiser4_del_nlink(p_parent_old) */
43148 + res2 += p_child_old->estimate.update(old_name->d_inode);
43149 + }
43150 +
43151 + res1 = res1 < res2 ? res2 : res1;
43152 +
43153 + /* reiser4_write_sd(p_parent_new) */
43154 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43155 +
43156 + /* reiser4_write_sd(p_child_new) */
43157 + if (p_child_new)
43158 + res1 += p_child_new->estimate.update(new_name->d_inode);
43159 +
43160 + /* hashed_rem_entry(p_parent_old) */
43161 + res1 += p_parent_old->estimate.rem_entry(old_dir);
43162 +
43163 + /* reiser4_del_nlink(p_child_old) */
43164 + res1 += p_child_old->estimate.update(old_name->d_inode);
43165 +
43166 + /* replace_name */
43167 + {
43168 + /* reiser4_add_nlink(p_parent_dir_new) */
43169 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43170 + /* update_key */
43171 + res1 += 1;
43172 + /* reiser4_del_nlink(p_parent_new) */
43173 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43174 + /* reiser4_del_nlink(p_parent_old) */
43175 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43176 + }
43177 +
43178 + /* reiser4_write_sd(p_parent_old) */
43179 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43180 +
43181 + /* reiser4_write_sd(p_child_old) */
43182 + res1 += p_child_old->estimate.update(old_name->d_inode);
43183 +
43184 + return res1;
43185 +}
43186 +
43187 +static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43188 + struct dentry *old_name, /* old name */
43189 + struct inode *new_dir, /* directory where @new is located */
43190 + struct dentry *new_name
43191 + /* new name */ )
43192 +{
43193 + reiser4_block_nr reserve;
43194 +
43195 + reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43196 +
43197 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43198 + return RETERR(-ENOSPC);
43199 +
43200 + return 0;
43201 +}
43202 +
43203 +/* check whether @old_inode and @new_inode can be moved within file system
43204 + * tree. This singles out attempts to rename pseudo-files, for example. */
43205 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
43206 + struct inode *new_dir, struct inode *new_inode)
43207 +{
43208 + file_plugin *fplug;
43209 + dir_plugin *dplug;
43210 +
43211 + assert("nikita-3370", old_inode != NULL);
43212 +
43213 + dplug = inode_dir_plugin(new_dir);
43214 + fplug = inode_file_plugin(old_inode);
43215 +
43216 + if (dplug == NULL)
43217 + return RETERR(-ENOTDIR);
43218 + else if (new_dir->i_op->create == NULL)
43219 + return RETERR(-EPERM);
43220 + else if (!fplug->can_add_link(old_inode))
43221 + return RETERR(-EMLINK);
43222 + else if (new_inode != NULL) {
43223 + fplug = inode_file_plugin(new_inode);
43224 + if (fplug->can_rem_link != NULL &&
43225 + !fplug->can_rem_link(new_inode))
43226 + return RETERR(-EBUSY);
43227 + }
43228 + return 0;
43229 +}
43230 +
43231 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
43232 + znode_lock_mode, reiser4_dir_entry_desc *);
43233 +int reiser4_update_dir(struct inode *);
43234 +
43235 +/* this is the common implementation of vfs's rename method of struct
43236 + inode_operations
43237 + See comments in the body.
43238 +
43239 +   It is arguable that this function could be made generic, so that it
43240 + will be applicable to any kind of directory plugin that deals with
43241 + directories composed out of directory entries. The only obstacle
43242 + here is that we don't have any data-type to represent directory
43243 + entry. This should be re-considered when more than one different
43244 + directory plugin will be implemented.
43245 +*/
43246 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
43247 + * is located */ ,
43248 + struct dentry *old_name /* old name */ ,
43249 + struct inode *new_dir /* directory where @new
43250 + * is located */ ,
43251 + struct dentry *new_name /* new name */ )
43252 +{
43253 + /* From `The Open Group Base Specifications Issue 6'
43254 +
43255 + If either the old or new argument names a symbolic link, rename()
43256 + shall operate on the symbolic link itself, and shall not resolve
43257 + the last component of the argument. If the old argument and the new
43258 + argument resolve to the same existing file, rename() shall return
43259 + successfully and perform no other action.
43260 +
43261 + [this is done by VFS: vfs_rename()]
43262 +
43263 + If the old argument points to the pathname of a file that is not a
43264 + directory, the new argument shall not point to the pathname of a
43265 + directory.
43266 +
43267 + [checked by VFS: vfs_rename->may_delete()]
43268 +
43269 + If the link named by the new argument exists, it shall
43270 + be removed and old renamed to new. In this case, a link named new
43271 + shall remain visible to other processes throughout the renaming
43272 + operation and refer either to the file referred to by new or old
43273 + before the operation began.
43274 +
43275 + [we should assure this]
43276 +
43277 + Write access permission is required for
43278 + both the directory containing old and the directory containing new.
43279 +
43280 + [checked by VFS: vfs_rename->may_delete(), may_create()]
43281 +
43282 + If the old argument points to the pathname of a directory, the new
43283 + argument shall not point to the pathname of a file that is not a
43284 + directory.
43285 +
43286 + [checked by VFS: vfs_rename->may_delete()]
43287 +
43288 + If the directory named by the new argument exists, it
43289 + shall be removed and old renamed to new. In this case, a link named
43290 + new shall exist throughout the renaming operation and shall refer
43291 + either to the directory referred to by new or old before the
43292 + operation began.
43293 +
43294 + [we should assure this]
43295 +
43296 + If new names an existing directory, it shall be
43297 + required to be an empty directory.
43298 +
43299 + [we should check this]
43300 +
43301 + If the old argument points to a pathname of a symbolic link, the
43302 + symbolic link shall be renamed. If the new argument points to a
43303 + pathname of a symbolic link, the symbolic link shall be removed.
43304 +
43305 + The new pathname shall not contain a path prefix that names
43306 + old. Write access permission is required for the directory
43307 + containing old and the directory containing new. If the old
43308 + argument points to the pathname of a directory, write access
43309 + permission may be required for the directory named by old, and, if
43310 + it exists, the directory named by new.
43311 +
43312 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
43313 +
43314 + If the link named by the new argument exists and the file's link
43315 + count becomes 0 when it is removed and no process has the file
43316 + open, the space occupied by the file shall be freed and the file
43317 + shall no longer be accessible. If one or more processes have the
43318 + file open when the last link is removed, the link shall be removed
43319 + before rename() returns, but the removal of the file contents shall
43320 + be postponed until all references to the file are closed.
43321 +
43322 + [iput() handles this, but we can do this manually, a la
43323 + reiser4_unlink()]
43324 +
43325 + Upon successful completion, rename() shall mark for update the
43326 + st_ctime and st_mtime fields of the parent directory of each file.
43327 +
43328 + [N/A]
43329 +
43330 + */
43331 + reiser4_context *ctx;
43332 + int result;
43333 + int is_dir; /* is @old_name directory */
43334 +
43335 + struct inode *old_inode;
43336 + struct inode *new_inode;
43337 + coord_t *new_coord;
43338 +
43339 + struct reiser4_dentry_fsdata *new_fsdata;
43340 + dir_plugin *dplug;
43341 + file_plugin *fplug;
43342 +
43343 + reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43344 + lock_handle *new_lh, *dotdot_lh;
43345 + struct dentry *dotdot_name;
43346 + struct reiser4_dentry_fsdata *dataonstack;
43347 +
43348 + ctx = reiser4_init_context(old_dir->i_sb);
43349 + if (IS_ERR(ctx))
43350 + return PTR_ERR(ctx);
43351 +
43352 + old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43353 + sizeof(*dotdot_name) + sizeof(*dataonstack),
43354 + reiser4_ctx_gfp_mask_get());
43355 + if (old_entry == NULL) {
43356 + context_set_commit_async(ctx);
43357 + reiser4_exit_context(ctx);
43358 + return RETERR(-ENOMEM);
43359 + }
43360 + memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43361 + sizeof(*dotdot_name) + sizeof(*dataonstack));
43362 +
43363 + new_entry = old_entry + 1;
43364 + dotdot_entry = old_entry + 2;
43365 + new_lh = (lock_handle *)(old_entry + 3);
43366 + dotdot_lh = new_lh + 1;
43367 + dotdot_name = (struct dentry *)(new_lh + 2);
43368 + dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
43369 +
43370 + assert("nikita-2318", old_dir != NULL);
43371 + assert("nikita-2319", new_dir != NULL);
43372 + assert("nikita-2320", old_name != NULL);
43373 + assert("nikita-2321", new_name != NULL);
43374 +
43375 + old_inode = old_name->d_inode;
43376 + new_inode = new_name->d_inode;
43377 +
43378 + dplug = inode_dir_plugin(old_dir);
43379 + fplug = NULL;
43380 +
43381 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
43382 + if (IS_ERR(new_fsdata)) {
43383 + kfree(old_entry);
43384 + context_set_commit_async(ctx);
43385 + reiser4_exit_context(ctx);
43386 + return PTR_ERR(new_fsdata);
43387 + }
43388 +
43389 + new_coord = &new_fsdata->dec.entry_coord;
43390 + coord_clear_iplug(new_coord);
43391 +
43392 + is_dir = S_ISDIR(old_inode->i_mode);
43393 +
43394 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43395 +
43396 +	/* if the target is an existing directory and it's not empty---return error.
43397 +
43398 +	   This check is done separately, because is_dir_empty() requires
43399 +	   tree traversal and has to be done before locks are taken.
43400 + */
43401 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43402 + kfree(old_entry);
43403 + context_set_commit_async(ctx);
43404 + reiser4_exit_context(ctx);
43405 + return RETERR(-ENOTEMPTY);
43406 + }
43407 +
43408 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
43409 + if (result != 0) {
43410 + kfree(old_entry);
43411 + context_set_commit_async(ctx);
43412 + reiser4_exit_context(ctx);
43413 + return result;
43414 + }
43415 +
43416 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
43417 + new_dir, new_name);
43418 + if (result != 0) {
43419 + kfree(old_entry);
43420 + context_set_commit_async(ctx);
43421 + reiser4_exit_context(ctx);
43422 + return result;
43423 + }
43424 +
43425 + init_lh(new_lh);
43426 +
43427 + /* find entry for @new_name */
43428 + result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
43429 + new_entry);
43430 +
43431 + if (IS_CBKERR(result)) {
43432 + done_lh(new_lh);
43433 + kfree(old_entry);
43434 + context_set_commit_async(ctx);
43435 + reiser4_exit_context(ctx);
43436 + return result;
43437 + }
43438 +
43439 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
43440 +
43441 + /* add or replace name for @old_inode as @new_name */
43442 + if (new_inode != NULL) {
43443 + /* target (@new_name) exists. */
43444 + /* Not clear what to do with objects that are
43445 + both directories and files at the same time. */
43446 + if (result == CBK_COORD_FOUND) {
43447 + result = replace_name(old_inode,
43448 + new_dir,
43449 + new_inode, new_coord, new_lh);
43450 + if (result == 0)
43451 + fplug = inode_file_plugin(new_inode);
43452 + } else if (result == CBK_COORD_NOTFOUND) {
43453 + /* VFS told us that @new_name is bound to existing
43454 + inode, but we failed to find directory entry. */
43455 + warning("nikita-2324", "Target not found");
43456 + result = RETERR(-ENOENT);
43457 + }
43458 + } else {
43459 +		/* target (@new_name) doesn't exist. */
43460 + if (result == CBK_COORD_NOTFOUND)
43461 + result = add_name(old_inode,
43462 + new_dir,
43463 + new_name, new_coord, new_lh, is_dir);
43464 + else if (result == CBK_COORD_FOUND) {
43465 + /* VFS told us that @new_name is "negative" dentry,
43466 + but we found directory entry. */
43467 + warning("nikita-2331", "Target found unexpectedly");
43468 + result = RETERR(-EIO);
43469 + }
43470 + }
43471 +
43472 + assert("nikita-3462", ergo(result == 0,
43473 + old_inode->i_nlink >= 2 + !!is_dir));
43474 +
43475 + /* We are done with all modifications to the @new_dir, release lock on
43476 + node. */
43477 + done_lh(new_lh);
43478 +
43479 + if (fplug != NULL) {
43480 + /* detach @new_inode from name-space */
43481 + result = fplug->detach(new_inode, new_dir);
43482 + if (result != 0)
43483 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
43484 + (unsigned long long)get_inode_oid(new_inode),
43485 + result, possible_leak);
43486 + }
43487 +
43488 + if (new_inode != NULL)
43489 + reiser4_update_sd(new_inode);
43490 +
43491 + if (result == 0) {
43492 + old_entry->obj = old_inode;
43493 +
43494 + dplug->build_entry_key(old_dir,
43495 + &old_name->d_name, &old_entry->key);
43496 +
43497 + /* At this stage new name was introduced for
43498 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43499 + counters were updated.
43500 +
43501 + We want to remove @old_name now. If @old_inode wasn't
43502 + directory this is simple.
43503 + */
43504 + result = dplug->rem_entry(old_dir, old_name, old_entry);
43505 + if (result != 0 && result != -ENOMEM) {
43506 + warning("nikita-2335",
43507 + "Cannot remove old name: %i", result);
43508 + } else {
43509 + result = reiser4_del_nlink(old_inode, old_dir, 0);
43510 + if (result != 0 && result != -ENOMEM) {
43511 + warning("nikita-2337",
43512 + "Cannot drop link on old: %i", result);
43513 + }
43514 + }
43515 +
43516 + if (result == 0 && is_dir) {
43517 + /* @old_inode is directory. We also have to update
43518 + dotdot entry. */
43519 + coord_t *dotdot_coord;
43520 +
43521 +			memset(dataonstack, 0, sizeof *dataonstack);
43522 +			memset(dotdot_entry, 0, sizeof *dotdot_entry);
43523 +			dotdot_entry->obj = old_dir;
43524 +			memset(dotdot_name, 0, sizeof *dotdot_name);
43525 + dotdot_name->d_name.name = "..";
43526 + dotdot_name->d_name.len = 2;
43527 + /*
43528 + * allocate ->d_fsdata on the stack to avoid using
43529 + * reiser4_get_dentry_fsdata(). Locking is not needed,
43530 + * because dentry is private to the current thread.
43531 + */
43532 + dotdot_name->d_fsdata = dataonstack;
43533 + init_lh(dotdot_lh);
43534 +
43535 + dotdot_coord = &dataonstack->dec.entry_coord;
43536 + coord_clear_iplug(dotdot_coord);
43537 +
43538 + result = reiser4_find_entry(old_inode, dotdot_name,
43539 + dotdot_lh, ZNODE_WRITE_LOCK,
43540 + dotdot_entry);
43541 + if (result == 0) {
43542 + /* replace_name() decreases i_nlink on
43543 + * @old_dir */
43544 + result = replace_name(new_dir,
43545 + old_inode,
43546 + old_dir,
43547 + dotdot_coord, dotdot_lh);
43548 + } else
43549 + result = RETERR(-EIO);
43550 + done_lh(dotdot_lh);
43551 + }
43552 + }
43553 + reiser4_update_dir(new_dir);
43554 + reiser4_update_dir(old_dir);
43555 + reiser4_update_sd(old_inode);
43556 + if (result == 0) {
43557 + file_plugin *fplug;
43558 +
43559 + if (new_inode != NULL) {
43560 + /* add safe-link for target file (in case we removed
43561 +			 * last reference to the poor fellow) */
43562 + fplug = inode_file_plugin(new_inode);
43563 + if (new_inode->i_nlink == 0)
43564 + result = safe_link_add(new_inode, SAFE_UNLINK);
43565 + }
43566 + }
43567 + kfree(old_entry);
43568 + context_set_commit_async(ctx);
43569 + reiser4_exit_context(ctx);
43570 + return result;
43571 +}
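reiser4_rename_common() above replaces seven separate stack or heap temporaries with a single zeroed kmalloc() that it carves into pieces by pointer arithmetic. A minimal sketch of that idiom using a struct instead of raw offsets (hypothetical type; a struct also guarantees proper alignment of each member, which raw carving has to be careful about):

	struct rename_scratch {
		reiser4_dir_entry_desc entry[3];	/* old, new, dotdot */
		lock_handle lh[2];			/* new, dotdot */
		struct dentry dotdot_name;
		struct reiser4_dentry_fsdata fsdata;
	};

	static struct rename_scratch *alloc_rename_scratch(void)
	{
		/* one zeroed allocation instead of seven separate ones */
		return kzalloc(sizeof(struct rename_scratch),
			       reiser4_ctx_gfp_mask_get());
	}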
43572 +
43573 +#if 0
43574 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
43575 + * is located */ ,
43576 + struct dentry *old_name /* old name */ ,
43577 + struct inode *new_dir /* directory where @new
43578 + * is located */ ,
43579 + struct dentry *new_name /* new name */ )
43580 +{
43581 + /* From `The Open Group Base Specifications Issue 6'
43582 +
43583 + If either the old or new argument names a symbolic link, rename()
43584 + shall operate on the symbolic link itself, and shall not resolve
43585 + the last component of the argument. If the old argument and the new
43586 + argument resolve to the same existing file, rename() shall return
43587 + successfully and perform no other action.
43588 +
43589 + [this is done by VFS: vfs_rename()]
43590 +
43591 + If the old argument points to the pathname of a file that is not a
43592 + directory, the new argument shall not point to the pathname of a
43593 + directory.
43594 +
43595 + [checked by VFS: vfs_rename->may_delete()]
43596 +
43597 + If the link named by the new argument exists, it shall
43598 + be removed and old renamed to new. In this case, a link named new
43599 + shall remain visible to other processes throughout the renaming
43600 + operation and refer either to the file referred to by new or old
43601 + before the operation began.
43602 +
43603 + [we should assure this]
43604 +
43605 + Write access permission is required for
43606 + both the directory containing old and the directory containing new.
43607 +
43608 + [checked by VFS: vfs_rename->may_delete(), may_create()]
43609 +
43610 + If the old argument points to the pathname of a directory, the new
43611 + argument shall not point to the pathname of a file that is not a
43612 + directory.
43613 +
43614 + [checked by VFS: vfs_rename->may_delete()]
43615 +
43616 + If the directory named by the new argument exists, it
43617 + shall be removed and old renamed to new. In this case, a link named
43618 + new shall exist throughout the renaming operation and shall refer
43619 + either to the directory referred to by new or old before the
43620 + operation began.
43621 +
43622 + [we should assure this]
43623 +
43624 + If new names an existing directory, it shall be
43625 + required to be an empty directory.
43626 +
43627 + [we should check this]
43628 +
43629 + If the old argument points to a pathname of a symbolic link, the
43630 + symbolic link shall be renamed. If the new argument points to a
43631 + pathname of a symbolic link, the symbolic link shall be removed.
43632 +
43633 + The new pathname shall not contain a path prefix that names
43634 + old. Write access permission is required for the directory
43635 + containing old and the directory containing new. If the old
43636 + argument points to the pathname of a directory, write access
43637 + permission may be required for the directory named by old, and, if
43638 + it exists, the directory named by new.
43639 +
43640 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
43641 +
43642 + If the link named by the new argument exists and the file's link
43643 + count becomes 0 when it is removed and no process has the file
43644 + open, the space occupied by the file shall be freed and the file
43645 + shall no longer be accessible. If one or more processes have the
43646 + file open when the last link is removed, the link shall be removed
43647 + before rename() returns, but the removal of the file contents shall
43648 + be postponed until all references to the file are closed.
43649 +
43650 + [iput() handles this, but we can do this manually, a la
43651 + reiser4_unlink()]
43652 +
43653 + Upon successful completion, rename() shall mark for update the
43654 + st_ctime and st_mtime fields of the parent directory of each file.
43655 +
43656 + [N/A]
43657 +
43658 + */
43659 + reiser4_context *ctx;
43660 + int result;
43661 + int is_dir; /* is @old_name directory */
43662 + struct inode *old_inode;
43663 + struct inode *new_inode;
43664 + reiser4_dir_entry_desc old_entry;
43665 + reiser4_dir_entry_desc new_entry;
43666 + coord_t *new_coord;
43667 + struct reiser4_dentry_fsdata *new_fsdata;
43668 + lock_handle new_lh;
43669 + dir_plugin *dplug;
43670 + file_plugin *fplug;
43671 +
43672 + ctx = reiser4_init_context(old_dir->i_sb);
43673 + if (IS_ERR(ctx))
43674 + return PTR_ERR(ctx);
43675 +
43676 + assert("nikita-2318", old_dir != NULL);
43677 + assert("nikita-2319", new_dir != NULL);
43678 + assert("nikita-2320", old_name != NULL);
43679 + assert("nikita-2321", new_name != NULL);
43680 +
43681 + old_inode = old_name->d_inode;
43682 + new_inode = new_name->d_inode;
43683 +
43684 + dplug = inode_dir_plugin(old_dir);
43685 + fplug = NULL;
43686 +
43687 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
43688 + if (IS_ERR(new_fsdata)) {
43689 + result = PTR_ERR(new_fsdata);
43690 + goto exit;
43691 + }
43692 +
43693 + new_coord = &new_fsdata->dec.entry_coord;
43694 + coord_clear_iplug(new_coord);
43695 +
43696 + is_dir = S_ISDIR(old_inode->i_mode);
43697 +
43698 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43699 +
43700 +	/* if the target is an existing directory and it's not empty---return error.
43701 +
43702 +	   This check is done separately, because is_dir_empty() requires
43703 +	   tree traversal and has to be done before locks are taken.
43704 + */
43705 +	if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43706 +		result = RETERR(-ENOTEMPTY); goto exit; /* don't leak ctx */ }
43707 +
43708 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
43709 + if (result != 0)
43710 + goto exit;
43711 +
43712 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
43713 + new_dir, new_name);
43714 + if (result != 0)
43715 + goto exit;
43716 +
43717 + init_lh(&new_lh);
43718 +
43719 + /* find entry for @new_name */
43720 + result = reiser4_find_entry(new_dir, new_name, &new_lh,
43721 + ZNODE_WRITE_LOCK, &new_entry);
43722 +
43723 + if (IS_CBKERR(result)) {
43724 + done_lh(&new_lh);
43725 + goto exit;
43726 + }
43727 +
43728 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
43729 +
43730 + /* add or replace name for @old_inode as @new_name */
43731 + if (new_inode != NULL) {
43732 + /* target (@new_name) exists. */
43733 + /* Not clear what to do with objects that are
43734 + both directories and files at the same time. */
43735 + if (result == CBK_COORD_FOUND) {
43736 + result = replace_name(old_inode,
43737 + new_dir,
43738 + new_inode, new_coord, &new_lh);
43739 + if (result == 0)
43740 + fplug = inode_file_plugin(new_inode);
43741 + } else if (result == CBK_COORD_NOTFOUND) {
43742 + /* VFS told us that @new_name is bound to existing
43743 + inode, but we failed to find directory entry. */
43744 + warning("nikita-2324", "Target not found");
43745 + result = RETERR(-ENOENT);
43746 + }
43747 + } else {
43748 +		/* target (@new_name) doesn't exist. */
43749 + if (result == CBK_COORD_NOTFOUND)
43750 + result = add_name(old_inode,
43751 + new_dir,
43752 + new_name, new_coord, &new_lh, is_dir);
43753 + else if (result == CBK_COORD_FOUND) {
43754 + /* VFS told us that @new_name is "negative" dentry,
43755 + but we found directory entry. */
43756 + warning("nikita-2331", "Target found unexpectedly");
43757 + result = RETERR(-EIO);
43758 + }
43759 + }
43760 +
43761 + assert("nikita-3462", ergo(result == 0,
43762 + old_inode->i_nlink >= 2 + !!is_dir));
43763 +
43764 + /* We are done with all modifications to the @new_dir, release lock on
43765 + node. */
43766 + done_lh(&new_lh);
43767 +
43768 + if (fplug != NULL) {
43769 + /* detach @new_inode from name-space */
43770 + result = fplug->detach(new_inode, new_dir);
43771 + if (result != 0)
43772 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
43773 + (unsigned long long)get_inode_oid(new_inode),
43774 + result, possible_leak);
43775 + }
43776 +
43777 + if (new_inode != NULL)
43778 + reiser4_update_sd(new_inode);
43779 +
43780 + if (result == 0) {
43781 + memset(&old_entry, 0, sizeof old_entry);
43782 + old_entry.obj = old_inode;
43783 +
43784 + dplug->build_entry_key(old_dir,
43785 + &old_name->d_name, &old_entry.key);
43786 +
43787 + /* At this stage new name was introduced for
43788 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43789 + counters were updated.
43790 +
43791 + We want to remove @old_name now. If @old_inode wasn't
43792 + directory this is simple.
43793 + */
43794 + result = dplug->rem_entry(old_dir, old_name, &old_entry);
43795 + /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
43796 + if (result != 0 && result != -ENOMEM) {
43797 + warning("nikita-2335",
43798 + "Cannot remove old name: %i", result);
43799 + } else {
43800 + result = reiser4_del_nlink(old_inode, old_dir, 0);
43801 + if (result != 0 && result != -ENOMEM) {
43802 + warning("nikita-2337",
43803 + "Cannot drop link on old: %i", result);
43804 + }
43805 + }
43806 +
43807 + if (result == 0 && is_dir) {
43808 + /* @old_inode is directory. We also have to update
43809 + dotdot entry. */
43810 + coord_t *dotdot_coord;
43811 + lock_handle dotdot_lh;
43812 + struct dentry dotdot_name;
43813 + reiser4_dir_entry_desc dotdot_entry;
43814 + struct reiser4_dentry_fsdata dataonstack;
43815 + struct reiser4_dentry_fsdata *fsdata;
43816 +
43817 + memset(&dataonstack, 0, sizeof dataonstack);
43818 + memset(&dotdot_entry, 0, sizeof dotdot_entry);
43819 + dotdot_entry.obj = old_dir;
43820 + memset(&dotdot_name, 0, sizeof dotdot_name);
43821 + dotdot_name.d_name.name = "..";
43822 + dotdot_name.d_name.len = 2;
43823 + /*
43824 + * allocate ->d_fsdata on the stack to avoid using
43825 + * reiser4_get_dentry_fsdata(). Locking is not needed,
43826 + * because dentry is private to the current thread.
43827 + */
43828 + dotdot_name.d_fsdata = &dataonstack;
43829 + init_lh(&dotdot_lh);
43830 +
43831 + fsdata = &dataonstack;
43832 + dotdot_coord = &fsdata->dec.entry_coord;
43833 + coord_clear_iplug(dotdot_coord);
43834 +
43835 + result = reiser4_find_entry(old_inode,
43836 + &dotdot_name,
43837 + &dotdot_lh,
43838 + ZNODE_WRITE_LOCK,
43839 + &dotdot_entry);
43840 + if (result == 0) {
43841 + /* replace_name() decreases i_nlink on
43842 + * @old_dir */
43843 + result = replace_name(new_dir,
43844 + old_inode,
43845 + old_dir,
43846 + dotdot_coord, &dotdot_lh);
43847 + } else
43848 + result = RETERR(-EIO);
43849 + done_lh(&dotdot_lh);
43850 + }
43851 + }
43852 + reiser4_update_dir(new_dir);
43853 + reiser4_update_dir(old_dir);
43854 + reiser4_update_sd(old_inode);
43855 + if (result == 0) {
43856 + file_plugin *fplug;
43857 +
43858 + if (new_inode != NULL) {
43859 + /* add safe-link for target file (in case we removed
43860 +			 * last reference to the poor fellow) */
43861 + fplug = inode_file_plugin(new_inode);
43862 + if (new_inode->i_nlink == 0)
43863 + result = safe_link_add(new_inode, SAFE_UNLINK);
43864 + }
43865 + }
43866 + exit:
43867 + context_set_commit_async(ctx);
43868 + reiser4_exit_context(ctx);
43869 + return result;
43870 +}
43871 +#endif
43872 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/acl.h linux-2.6.22/fs/reiser4/plugin/item/acl.h
43873 --- linux-2.6.22.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
43874 +++ linux-2.6.22/fs/reiser4/plugin/item/acl.h 2007-07-29 00:25:34.940713042 +0400
43875 @@ -0,0 +1,66 @@
43876 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43877 +
43878 +/* Directory entry. */
43879 +
43880 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
43881 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
43882 +
43883 +#include "../../forward.h"
43884 +#include "../../dformat.h"
43885 +#include "../../kassign.h"
43886 +#include "../../key.h"
43887 +
43888 +#include <linux/fs.h>
43889 +#include <linux/dcache.h> /* for struct dentry */
43890 +
43891 +typedef struct directory_entry_format {
43892 +	/* key of the object's stat-data. It is not necessary to store the
43893 +	   whole key here, because it is always a stat-data key, so the minor
43894 +	   packing locality and offset could be omitted. But that would rely
43895 +	   on a particular key allocation scheme for stat-data, so, for
43896 +	   extensibility's sake, the whole key can be stored here.
43897 +
43898 +	   We store the key as an array of bytes, because we don't want 8-byte
43899 +	   alignment of directory entries.
43900 +	 */
43901 + obj_key_id id;
43902 + /* file name. Null terminated string. */
43903 + d8 name[0];
43904 +} directory_entry_format;
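Since name[] is a zero-length array and the name is stored NUL-terminated, the on-disk footprint of one entry is the fixed header plus the name and its terminator. A sketch of the arithmetic (the real length logic lives in the sde/cde item plugins):

	static inline size_t dir_entry_len(const char *name)
	{
		/* sizeof(directory_entry_format) covers only the obj_key_id header */
		return sizeof(directory_entry_format) + strlen(name) + 1;
	}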
43905 +
43906 +void print_de(const char *prefix, coord_t * coord);
43907 +int extract_key_de(const coord_t * coord, reiser4_key * key);
43908 +int update_key_de(const coord_t * coord, const reiser4_key * key,
43909 + lock_handle * lh);
43910 +char *extract_name_de(const coord_t * coord, char *buf);
43911 +unsigned extract_file_type_de(const coord_t * coord);
43912 +int add_entry_de(struct inode *dir, coord_t * coord,
43913 + lock_handle * lh, const struct dentry *name,
43914 + reiser4_dir_entry_desc * entry);
43915 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
43916 + lock_handle * lh, reiser4_dir_entry_desc * entry);
43917 +int max_name_len_de(const struct inode *dir);
43918 +
43919 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
43920 +
43921 +char *extract_dent_name(const coord_t * coord,
43922 + directory_entry_format * dent, char *buf);
43923 +
43924 +#if REISER4_LARGE_KEY
43925 +#define DE_NAME_BUF_LEN (24)
43926 +#else
43927 +#define DE_NAME_BUF_LEN (16)
43928 +#endif
43929 +
43930 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
43931 +#endif
43932 +
43933 +/* Make Linus happy.
43934 + Local variables:
43935 + c-indentation-style: "K&R"
43936 + mode-name: "LC"
43937 + c-basic-offset: 8
43938 + tab-width: 8
43939 + fill-column: 120
43940 + End:
43941 +*/
43942 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.22/fs/reiser4/plugin/item/blackbox.c
43943 --- linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
43944 +++ linux-2.6.22/fs/reiser4/plugin/item/blackbox.c 2007-07-29 00:25:34.940713042 +0400
43945 @@ -0,0 +1,142 @@
43946 +/* Copyright 2003 by Hans Reiser, licensing governed by
43947 + * reiser4/README */
43948 +
43949 +/* Black box item implementation */
43950 +
43951 +#include "../../forward.h"
43952 +#include "../../debug.h"
43953 +#include "../../dformat.h"
43954 +#include "../../kassign.h"
43955 +#include "../../coord.h"
43956 +#include "../../tree.h"
43957 +#include "../../lock.h"
43958 +
43959 +#include "blackbox.h"
43960 +#include "item.h"
43961 +#include "../plugin.h"
43962 +
43963 +int
43964 +store_black_box(reiser4_tree * tree,
43965 + const reiser4_key * key, void *data, int length)
43966 +{
43967 + int result;
43968 + reiser4_item_data idata;
43969 + coord_t coord;
43970 + lock_handle lh;
43971 +
43972 + memset(&idata, 0, sizeof idata);
43973 +
43974 + idata.data = data;
43975 + idata.user = 0;
43976 + idata.length = length;
43977 + idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
43978 +
43979 + init_lh(&lh);
43980 + result = insert_by_key(tree, key,
43981 + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
43982 +
43983 + assert("nikita-3413",
43984 + ergo(result == 0,
43985 + WITH_COORD(&coord,
43986 + item_length_by_coord(&coord) == length)));
43987 +
43988 + done_lh(&lh);
43989 + return result;
43990 +}
43991 +
43992 +int
43993 +load_black_box(reiser4_tree * tree,
43994 + reiser4_key * key, void *data, int length, int exact)
43995 +{
43996 + int result;
43997 + coord_t coord;
43998 + lock_handle lh;
43999 +
44000 + init_lh(&lh);
44001 + result = coord_by_key(tree, key,
44002 + &coord, &lh, ZNODE_READ_LOCK,
44003 + exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44004 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44005 +
44006 + if (result == 0) {
44007 + int ilen;
44008 +
44009 + result = zload(coord.node);
44010 + if (result == 0) {
44011 + ilen = item_length_by_coord(&coord);
44012 + if (ilen <= length) {
44013 + memcpy(data, item_body_by_coord(&coord), ilen);
44014 + unit_key_by_coord(&coord, key);
44015 + } else if (exact) {
44016 + /*
44017 + * item is larger than buffer provided by the
44018 + * user. Only issue a warning if @exact is
44019 + * set. If @exact is false, we are iterating
44020 + * over all safe-links and here we are reaching
44021 + * the end of the iteration.
44022 + */
44023 + warning("nikita-3415",
44024 + "Wrong black box length: %i > %i",
44025 + ilen, length);
44026 + result = RETERR(-EIO);
44027 + }
44028 + zrelse(coord.node);
44029 + }
44030 + }
44031 +
44032 + done_lh(&lh);
44033 + return result;
44034 +
44035 +}
44036 +
44037 +int
44038 +update_black_box(reiser4_tree * tree,
44039 + const reiser4_key * key, void *data, int length)
44040 +{
44041 + int result;
44042 + coord_t coord;
44043 + lock_handle lh;
44044 +
44045 + init_lh(&lh);
44046 + result = coord_by_key(tree, key,
44047 + &coord, &lh, ZNODE_READ_LOCK,
44048 + FIND_EXACT,
44049 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44050 + if (result == 0) {
44051 + int ilen;
44052 +
44053 + result = zload(coord.node);
44054 + if (result == 0) {
44055 + ilen = item_length_by_coord(&coord);
44056 + if (length <= ilen) {
44057 + memcpy(item_body_by_coord(&coord), data,
44058 + length);
44059 + } else {
44060 + warning("nikita-3437",
44061 + "Wrong black box length: %i < %i",
44062 + ilen, length);
44063 + result = RETERR(-EIO);
44064 + }
44065 + zrelse(coord.node);
44066 + }
44067 + }
44068 +
44069 + done_lh(&lh);
44070 + return result;
44071 +
44072 +}
44073 +
44074 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44075 +{
44076 + return reiser4_cut_tree(tree, key, key, NULL, 1);
44077 +}
44078 +
44079 +/* Make Linus happy.
44080 + Local variables:
44081 + c-indentation-style: "K&R"
44082 + mode-name: "LC"
44083 + c-basic-offset: 8
44084 + tab-width: 8
44085 + fill-column: 120
44086 + End:
44087 +*/
44088 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.22/fs/reiser4/plugin/item/blackbox.h
44089 --- linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
44090 +++ linux-2.6.22/fs/reiser4/plugin/item/blackbox.h 2007-07-29 00:25:34.940713042 +0400
44091 @@ -0,0 +1,33 @@
44092 +/* Copyright 2003 by Hans Reiser, licensing governed by
44093 + * reiser4/README */
44094 +
44095 +/* "Black box" entry to fixed-width contain user supplied data */
44096 +
44097 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44098 +#define __FS_REISER4_BLACK_BOX_H__
44099 +
44100 +#include "../../forward.h"
44101 +#include "../../dformat.h"
44102 +#include "../../kassign.h"
44103 +#include "../../key.h"
44104 +
44105 +extern int store_black_box(reiser4_tree * tree,
44106 + const reiser4_key * key, void *data, int length);
44107 +extern int load_black_box(reiser4_tree * tree,
44108 + reiser4_key * key, void *data, int length, int exact);
44109 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44110 +extern int update_black_box(reiser4_tree * tree,
44111 + const reiser4_key * key, void *data, int length);
44112 +
44113 +/* __FS_REISER4_BLACK_BOX_H__ */
44114 +#endif
44115 +
44116 +/* Make Linus happy.
44117 + Local variables:
44118 + c-indentation-style: "K&R"
44119 + mode-name: "LC"
44120 + c-basic-offset: 8
44121 + tab-width: 8
44122 + fill-column: 120
44123 + End:
44124 +*/
44125 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/cde.c linux-2.6.22/fs/reiser4/plugin/item/cde.c
44126 --- linux-2.6.22.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
44127 +++ linux-2.6.22/fs/reiser4/plugin/item/cde.c 2007-07-29 00:25:34.944714077 +0400
44128 @@ -0,0 +1,1008 @@
44129 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44130 +
44131 +/* Directory entry implementation */
44132 +
44133 +/* DESCRIPTION:
44134 +
44135 + This is the "compound" directory item plugin implementation. This directory
44136 + item type is compound (as opposed to the "simple directory item" in
44137 + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44138 + entries.
44139 +
44140 + The reason behind this decision is disk space efficiency: all directory
44141 + entries inside the same directory have an identical fragment in their
44142 + keys. This, of course, depends on the key assignment policy. In our default key
44143 + assignment policy, all directory entries have the same locality, which is
44144 + equal to the object id of their directory.
44145 +
44146 + Composing a directory item out of several directory entries for the same
44147 + directory allows us to store said key fragment only once. That is, what is
44148 + implemented here is an ad hoc form of key compression (stem compression),
44149 + because general key compression is not supposed to be implemented in
44150 + v4.0.
44151 +
44152 + Another decision that was made regarding all directory item plugins is
44153 + that they will store entry keys unaligned. This, again, is for the sake of
44154 + disk space efficiency.
44155 +
44156 + It should be noted that storing keys unaligned increases CPU consumption,
44157 + at least on some architectures.
44158 +
44159 + Internal on-disk structure of the compound directory item is the following:
44160 +
44161 + HEADER cde_item_format. Here number of entries is stored.
44162 + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44163 + ENTRY_HEADER_1 offset of entry body are stored.
44164 + ENTRY_HEADER_2 (basically two last parts of key)
44165 + ...
44166 + ENTRY_HEADER_N
44167 + ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44168 + ENTRY_BODY_1 NUL-terminated name are stored.
44169 + ENTRY_BODY_2 (part of stat data key in the
44170 + sense that since all SDs have
44171 + zero offset, this offset is not
44172 + stored on disk).
44173 + ...
44174 + ENTRY_BODY_N
44175 +
44176 + When it comes to balancing, each directory entry in a compound directory
44177 + item is a unit, that is, something that can be cut from one item and pasted
44178 + into another item of the same type. Handling of unit cut and paste is the
44179 + major reason for the complexity of the code below.
44180 +
44181 +*/
44182 +
44183 +#include "../../forward.h"
44184 +#include "../../debug.h"
44185 +#include "../../dformat.h"
44186 +#include "../../kassign.h"
44187 +#include "../../key.h"
44188 +#include "../../coord.h"
44189 +#include "sde.h"
44190 +#include "cde.h"
44191 +#include "item.h"
44192 +#include "../node/node.h"
44193 +#include "../plugin.h"
44194 +#include "../../znode.h"
44195 +#include "../../carry.h"
44196 +#include "../../tree.h"
44197 +#include "../../inode.h"
44198 +
44199 +#include <linux/fs.h> /* for struct inode */
44200 +#include <linux/dcache.h> /* for struct dentry */
44201 +#include <linux/quotaops.h>
44202 +
44203 +#if 0
44204 +#define CHECKME(coord) \
44205 +({ \
44206 + const char *message; \
44207 + coord_t dup; \
44208 + \
44209 + coord_dup_nocheck(&dup, (coord)); \
44210 + dup.unit_pos = 0; \
44211 + assert("nikita-2871", cde_check(&dup, &message) == 0); \
44212 +})
44213 +#else
44214 +#define CHECKME(coord) noop
44215 +#endif
44216 +
44217 +/* return body of compound directory item at @coord */
44218 +static inline cde_item_format *formatted_at(const coord_t * coord)
44219 +{
44220 + assert("nikita-1282", coord != NULL);
44221 + return item_body_by_coord(coord);
44222 +}
44223 +
44224 +/* return entry header at @coord */
44225 +static inline cde_unit_header *header_at(const coord_t *
44226 + coord /* coord of item */ ,
44227 + int idx /* index of unit */ )
44228 +{
44229 + assert("nikita-1283", coord != NULL);
44230 + return &formatted_at(coord)->entry[idx];
44231 +}
44232 +
44233 +/* return number of units in compound directory item at @coord */
44234 +static int units(const coord_t * coord /* coord of item */ )
44235 +{
44236 + return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44237 +}
44238 +
44239 +/* return offset of the body of @idx-th entry in @coord */
44240 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44241 + int idx /* index of unit */ )
44242 +{
44243 + if (idx < units(coord))
44244 + return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44245 + else if (idx == units(coord))
44246 + return item_length_by_coord(coord);
44247 + else
44248 + impossible("nikita-1308", "Wrong idx");
44249 + return 0;
44250 +}
44251 +
44252 +/* set offset of the body of @idx-th entry in @coord */
44253 +static void set_offset(const coord_t * coord /* coord of item */ ,
44254 + int idx /* index of unit */ ,
44255 + unsigned int offset /* new offset */ )
44256 +{
44257 + put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44258 +}
44259 +
44260 +static void adj_offset(const coord_t * coord /* coord of item */ ,
44261 + int idx /* index of unit */ ,
44262 + int delta /* offset change */ )
44263 +{
44264 + d16 *doffset;
44265 + __u16 offset;
44266 +
44267 + doffset = &header_at(coord, idx)->offset;
44268 + offset = le16_to_cpu(get_unaligned(doffset));
44269 + offset += delta;
44270 + put_unaligned(cpu_to_le16((__u16) offset), doffset);
44271 +}
44272 +
44273 +/* return pointer to @offset-th byte from the beginning of @coord */
44274 +static char *address(const coord_t * coord /* coord of item */ ,
44275 + int offset)
44276 +{
44277 + return ((char *)item_body_by_coord(coord)) + offset;
44278 +}
44279 +
44280 +/* return pointer to the body of @idx-th entry in @coord */
44281 +static directory_entry_format *entry_at(const coord_t * coord /* coord of
44282 + * item */ ,
44283 + int idx /* index of unit */ )
44284 +{
44285 + return (directory_entry_format *) address(coord,
44286 + (int)offset_of(coord, idx));
44287 +}
44288 +
44289 +/* return number of unit referenced by @coord */
44290 +static int idx_of(const coord_t * coord /* coord of item */ )
44291 +{
44292 + assert("nikita-1285", coord != NULL);
44293 + return coord->unit_pos;
44294 +}
44295 +
44296 +/* find position where entry with @entry_key would be inserted into @coord */
44297 +static int find(const coord_t * coord /* coord of item */ ,
44298 + const reiser4_key * entry_key /* key to look for */ ,
44299 + cmp_t * last /* result of last comparison */ )
44300 +{
44301 + int entries;
44302 +
44303 + int left;
44304 + int right;
44305 +
44306 + cde_unit_header *header;
44307 +
44308 + assert("nikita-1295", coord != NULL);
44309 + assert("nikita-1296", entry_key != NULL);
44310 + assert("nikita-1297", last != NULL);
44311 +
44312 + entries = units(coord);
44313 + left = 0;
44314 + right = entries - 1;
44315 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44316 + int median;
44317 +
44318 + median = (left + right) >> 1;
44319 +
44320 + header = header_at(coord, median);
44321 + *last = de_id_key_cmp(&header->hash, entry_key);
44322 + switch (*last) {
44323 + case LESS_THAN:
44324 + left = median;
44325 + break;
44326 + case GREATER_THAN:
44327 + right = median;
44328 + break;
44329 + case EQUAL_TO:{
44330 + do {
44331 + median--;
44332 + header--;
44333 + } while (median >= 0 &&
44334 + de_id_key_cmp(&header->hash,
44335 + entry_key) == EQUAL_TO);
44336 + return median + 1;
44337 + }
44338 + }
44339 + }
44340 + header = header_at(coord, left);
44341 + for (; left < entries; ++left, ++header) {
44342 + prefetch(header + 1);
44343 + *last = de_id_key_cmp(&header->hash, entry_key);
44344 + if (*last != LESS_THAN)
44345 + break;
44346 + }
44347 + if (left < entries)
44348 + return left;
44349 + else
44350 + return RETERR(-ENOENT);
44351 +
44352 +}
44353 +
44354 +/* expand @coord so as to accommodate insertion of @no new entries starting
44355 + from @pos, with total body size @size. */
44356 +static int expand_item(const coord_t * coord /* coord of item */ ,
44357 + int pos /* unit position */ , int no /* number of new
44358 + * units*/ ,
44359 + int size /* total size of new units' data */ ,
44360 + unsigned int data_size /* free space already reserved
44361 + * in the item for insertion */ )
44362 +{
44363 + int entries;
44364 + cde_unit_header *header;
44365 + char *dent;
44366 + int i;
44367 +
44368 + assert("nikita-1310", coord != NULL);
44369 + assert("nikita-1311", pos >= 0);
44370 + assert("nikita-1312", no > 0);
44371 + assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44372 + assert("nikita-1343",
44373 + item_length_by_coord(coord) >=
44374 + (int)(size + data_size + no * sizeof *header));
44375 +
44376 + entries = units(coord);
44377 +
44378 + if (pos == entries)
44379 + dent = address(coord, size);
44380 + else
44381 + dent = (char *)entry_at(coord, pos);
44382 + /* place where the new header will be */
44383 + header = header_at(coord, pos);
44384 + /* free space for new entry headers */
44385 + memmove(header + no, header,
44386 + (unsigned)(address(coord, size) - (char *)header));
44387 + /* if adding to the end, initialise the first new header */
44388 + if (pos == entries) {
44389 + set_offset(coord, pos, (unsigned)size);
44390 + }
44391 +
44392 + /* adjust entry pointer and size */
44393 + dent = dent + no * sizeof *header;
44394 + size += no * sizeof *header;
44395 + /* free space for new entries */
44396 + memmove(dent + data_size, dent,
44397 + (unsigned)(address(coord, size) - dent));
44398 +
44399 + /* increase counter */
44400 + entries += no;
44401 + put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44402 +
44403 + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44404 + bytes. */
44405 + for (i = 0; i <= pos; ++i)
44406 + adj_offset(coord, i, no * sizeof *header);
44407 + /* [ pos + no ... +\infty ) entries were shifted by ( no *
44408 + sizeof *header + data_size ) bytes */
44409 + for (i = pos + no; i < entries; ++i)
44410 + adj_offset(coord, i, no * sizeof *header + data_size);
44411 + return 0;
44412 +}
44413 +
44414 +/* insert new @entry into item */
44415 +static int expand(const coord_t * coord /* coord of item */ ,
44416 + struct cde_entry * entry /* entry to insert */ ,
44417 + int len /* length of @entry data */ ,
44418 + int *pos /* position to insert */ ,
44419 + reiser4_dir_entry_desc * dir_entry /* parameters for new
44420 + * entry */ )
44421 +{
44422 + cmp_t cmp_res;
44423 + int datasize;
44424 +
44425 + *pos = find(coord, &dir_entry->key, &cmp_res);
44426 + if (*pos < 0)
44427 + *pos = units(coord);
44428 +
44429 + datasize = sizeof(directory_entry_format);
44430 + if (is_longname(entry->name->name, entry->name->len))
44431 + datasize += entry->name->len + 1;
44432 +
44433 + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44434 + datasize);
44435 + return 0;
44436 +}
44437 +
44438 +/* paste body of @entry into item */
44439 +static int paste_entry(const coord_t * coord /* coord of item */ ,
44440 + struct cde_entry * entry /* new entry */ ,
44441 + int pos /* position to insert */ ,
44442 + reiser4_dir_entry_desc * dir_entry /* parameters for
44443 + * new entry */ )
44444 +{
44445 + cde_unit_header *header;
44446 + directory_entry_format *dent;
44447 + const char *name;
44448 + int len;
44449 +
44450 + header = header_at(coord, pos);
44451 + dent = entry_at(coord, pos);
44452 +
44453 + build_de_id_by_key(&dir_entry->key, &header->hash);
44454 + build_inode_key_id(entry->obj, &dent->id);
44455 + /* AUDIT unsafe strcpy() operation! It should be replaced with the
44456 + much less CPU-hungry
44457 + memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
44458 +
44459 + More importantly, there should be a way to figure out the
44460 + amount of space in dent -> name and be able to check that we are
44461 + not going to overwrite more than we are supposed to */
44462 + name = entry->name->name;
44463 + len = entry->name->len;
44464 + if (is_longname(name, len)) {
44465 + strcpy((unsigned char *)dent->name, name);
44466 + put_unaligned(0, &dent->name[len]);
44467 + }
44468 + return 0;
44469 +}
44470 +
44471 +/* estimate how much space is necessary in the item to insert/paste the set
44472 + of entries described in @data. */
44473 +int estimate_cde(const coord_t * coord /* coord of item */ ,
44474 + const reiser4_item_data * data /* parameters for new item */ )
44475 +{
44476 + struct cde_entry_data *e;
44477 + int result;
44478 + int i;
44479 +
44480 + e = (struct cde_entry_data *) data->data;
44481 +
44482 + assert("nikita-1288", e != NULL);
44483 + assert("nikita-1289", e->num_of_entries >= 0);
44484 +
44485 + if (coord == NULL)
44486 + /* insert */
44487 + result = sizeof(cde_item_format);
44488 + else
44489 + /* paste */
44490 + result = 0;
44491 +
44492 + result += e->num_of_entries *
44493 + (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44494 + for (i = 0; i < e->num_of_entries; ++i) {
44495 + const char *name;
44496 + int len;
44497 +
44498 + name = e->entry[i].name->name;
44499 + len = e->entry[i].name->len;
44500 + assert("nikita-2054", strlen(name) == len);
44501 + if (is_longname(name, len))
44502 + result += len + 1;
44503 + }
44504 + ((reiser4_item_data *) data)->length = result;
44505 + return result;
44506 +}
44507 +
44508 +/* ->nr_units() method for this item plugin. */
44509 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44510 +{
44511 + return units(coord);
44512 +}
44513 +
44514 +/* ->unit_key() method for this item plugin. */
44515 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44516 + reiser4_key * key /* resulting key */ )
44517 +{
44518 + assert("nikita-1452", coord != NULL);
44519 + assert("nikita-1345", idx_of(coord) < units(coord));
44520 + assert("nikita-1346", key != NULL);
44521 +
44522 + item_key_by_coord(coord, key);
44523 + extract_key_from_de_id(extract_dir_id_from_key(key),
44524 + &header_at(coord, idx_of(coord))->hash, key);
44525 + return key;
44526 +}
44527 +
44528 +/* mergeable_cde(): implementation of ->mergeable() item method.
44529 +
44530 + Two directory items are mergeable iff they are from the same
44531 + directory. That simple.
44532 +
44533 +*/
44534 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44535 + const coord_t * p2 /* coord of second item */ )
44536 +{
44537 + reiser4_key k1;
44538 + reiser4_key k2;
44539 +
44540 + assert("nikita-1339", p1 != NULL);
44541 + assert("nikita-1340", p2 != NULL);
44542 +
44543 + return
44544 + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
44545 + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
44546 + extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
44547 +
44548 +}
44549 +
44550 +/* ->max_key_inside() method for this item plugin. */
44551 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
44552 + reiser4_key * result /* resulting key */ )
44553 +{
44554 + assert("nikita-1342", coord != NULL);
44555 +
44556 + item_key_by_coord(coord, result);
44557 + set_key_ordering(result, get_key_ordering(reiser4_max_key()));
44558 + set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
44559 + set_key_offset(result, get_key_offset(reiser4_max_key()));
44560 + return result;
44561 +}
44562 +
44563 +/* @data contains data which are to be put into tree */
44564 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
44565 + const reiser4_key * key /* key to check */ ,
44566 + const reiser4_item_data * data /* parameters of new
44567 + * item/unit being
44568 + * created */ )
44569 +{
44570 + reiser4_key item_key;
44571 +
44572 + /* FIXME-VS: do not rely on anything but iplug field of @data. Only
44573 + data->iplug is initialized */
44574 + assert("vs-457", data && data->iplug);
44575 +/* assert( "vs-553", data -> user == 0 );*/
44576 + item_key_by_coord(coord, &item_key);
44577 +
44578 + return (item_plugin_by_coord(coord) == data->iplug) &&
44579 + (extract_dir_id_from_key(&item_key) ==
44580 + extract_dir_id_from_key(key));
44581 +}
44582 +
44583 +#if REISER4_DEBUG
44584 +/* cde_check ->check() method for compressed directory items
44585 +
44586 + used for debugging; every item should have here the most complete
44587 + possible check of the consistency of the item that its
44588 + inventor can construct
44589 +*/
44590 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
44591 + const char **error /* where to store error message */)
44592 +{
44593 + int i;
44594 + int result;
44595 + char *item_start;
44596 + char *item_end;
44597 + reiser4_key key;
44598 +
44599 + coord_t c;
44600 +
44601 + assert("nikita-1357", coord != NULL);
44602 + assert("nikita-1358", error != NULL);
44603 +
44604 + if (!ergo(coord->item_pos != 0,
44605 + is_dot_key(item_key_by_coord(coord, &key)))) {
44606 + *error = "CDE doesn't start with dot";
44607 + return -1;
44608 + }
44609 + item_start = item_body_by_coord(coord);
44610 + item_end = item_start + item_length_by_coord(coord);
44611 +
44612 + coord_dup(&c, coord);
44613 + result = 0;
44614 + for (i = 0; i < units(coord); ++i) {
44615 + directory_entry_format *entry;
44616 +
44617 + if ((char *)(header_at(coord, i) + 1) >
44618 + item_end - units(coord) * sizeof *entry) {
44619 + *error = "CDE header is out of bounds";
44620 + result = -1;
44621 + break;
44622 + }
44623 + entry = entry_at(coord, i);
44624 + if ((char *)entry < item_start + sizeof(cde_item_format)) {
44625 + *error = "CDE header is too low";
44626 + result = -1;
44627 + break;
44628 + }
44629 + if ((char *)(entry + 1) > item_end) {
44630 + *error = "CDE header is too high";
44631 + result = -1;
44632 + break;
44633 + }
44634 + }
44635 +
44636 + return result;
44637 +}
44638 +#endif
44639 +
44640 +/* ->init() method for this item plugin. */
44641 +int init_cde(coord_t * coord /* coord of item */ ,
44642 + coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
44643 + UNUSED_ARG)
44644 +{
44645 + put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
44646 + return 0;
44647 +}
44648 +
44649 +/* ->lookup() method for this item plugin. */
44650 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
44651 + lookup_bias bias /* search bias */ ,
44652 + coord_t * coord /* coord of item to lookup in */ )
44653 +{
44654 + cmp_t last_comp;
44655 + int pos;
44656 +
44657 + reiser4_key utmost_key;
44658 +
44659 + assert("nikita-1293", coord != NULL);
44660 + assert("nikita-1294", key != NULL);
44661 +
44662 + CHECKME(coord);
44663 +
44664 + if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
44665 + coord->unit_pos = 0;
44666 + coord->between = BEFORE_UNIT;
44667 + return CBK_COORD_NOTFOUND;
44668 + }
44669 + pos = find(coord, key, &last_comp);
44670 + if (pos >= 0) {
44671 + coord->unit_pos = (int)pos;
44672 + switch (last_comp) {
44673 + case EQUAL_TO:
44674 + coord->between = AT_UNIT;
44675 + return CBK_COORD_FOUND;
44676 + case GREATER_THAN:
44677 + coord->between = BEFORE_UNIT;
44678 + return RETERR(-ENOENT);
44679 + case LESS_THAN:
44680 + default:
44681 + impossible("nikita-1298", "Broken find");
44682 + return RETERR(-EIO);
44683 + }
44684 + } else {
44685 + coord->unit_pos = units(coord) - 1;
44686 + coord->between = AFTER_UNIT;
44687 + return (bias ==
44688 + FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
44689 + CBK_COORD_NOTFOUND;
44690 + }
44691 +}
44692 +
44693 +/* ->paste() method for this item plugin. */
44694 +int paste_cde(coord_t * coord /* coord of item */ ,
44695 + reiser4_item_data * data /* parameters of new unit being
44696 + * inserted */ ,
44697 + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
44698 +{
44699 + struct cde_entry_data *e;
44700 + int result;
44701 + int i;
44702 +
44703 + CHECKME(coord);
44704 + e = (struct cde_entry_data *) data->data;
44705 +
44706 + result = 0;
44707 + for (i = 0; i < e->num_of_entries; ++i) {
44708 + int pos;
44709 + int phantom_size;
44710 +
44711 + phantom_size = data->length;
44712 + if (units(coord) == 0)
44713 + phantom_size -= sizeof(cde_item_format);
44714 +
44715 + result =
44716 + expand(coord, e->entry + i, phantom_size, &pos, data->arg);
44717 + if (result != 0)
44718 + break;
44719 + result = paste_entry(coord, e->entry + i, pos, data->arg);
44720 + if (result != 0)
44721 + break;
44722 + }
44723 + CHECKME(coord);
44724 + return result;
44725 +}
44726 +
44727 +/* amount of space occupied by all entries starting from @idx, both headers
44728 + and bodies. */
44729 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
44730 + int idx /* index of unit */ )
44731 +{
44732 + assert("nikita-1299", coord != NULL);
44733 + assert("nikita-1300", idx < (int)units(coord));
44734 +
44735 + return sizeof(cde_item_format) +
44736 + (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
44737 + idx + 1) -
44738 + offset_of(coord, 0);
44739 +}
44740 +
44741 +/* how many units of @source (but not more than @want) can be merged with
44742 + the item in the @target node. If pend == append, we try to append the last
44743 + item of @target with the first units of @source. If pend == prepend, we try
44744 + to "prepend" the first item in @target with the last units of @source. The
44745 + @target node has @free_space bytes of free space. The total size of those
44746 + units is returned via @size */
44747 +int can_shift_cde(unsigned free_space /* free space in item */ ,
44748 + coord_t * coord /* coord of source item */ ,
44749 + znode * target /* target node */ ,
44750 + shift_direction pend /* shift direction */ ,
44751 + unsigned *size /* resulting number of shifted bytes */ ,
44752 + unsigned want /* maximal number of bytes to shift */ )
44753 +{
44754 + int shift;
44755 +
44756 + CHECKME(coord);
44757 + if (want == 0) {
44758 + *size = 0;
44759 + return 0;
44760 + }
44761 +
44762 + /* pend == SHIFT_LEFT <==> shifting to the left */
44763 + if (pend == SHIFT_LEFT) {
44764 + for (shift = min((int)want - 1, units(coord)); shift >= 0;
44765 + --shift) {
44766 + *size = part_size(coord, shift);
44767 + if (target != NULL)
44768 + *size -= sizeof(cde_item_format);
44769 + if (*size <= free_space)
44770 + break;
44771 + }
44772 + shift = shift + 1;
44773 + } else {
44774 + int total_size;
44775 +
44776 + assert("nikita-1301", pend == SHIFT_RIGHT);
44777 +
44778 + total_size = item_length_by_coord(coord);
44779 + for (shift = units(coord) - want - 1; shift < units(coord) - 1;
44780 + ++shift) {
44781 + *size = total_size - part_size(coord, shift);
44782 + if (target == NULL)
44783 + *size += sizeof(cde_item_format);
44784 + if (*size <= free_space)
44785 + break;
44786 + }
44787 + shift = units(coord) - shift - 1;
44788 + }
44789 + if (shift == 0)
44790 + *size = 0;
44791 + CHECKME(coord);
44792 + return shift;
44793 +}
44794 +
44795 +/* ->copy_units() method for this item plugin. */
44796 +void copy_units_cde(coord_t * target /* coord of target item */ ,
44797 + coord_t * source /* coord of source item */ ,
44798 + unsigned from /* starting unit */ ,
44799 + unsigned count /* how many units to copy */ ,
44800 + shift_direction where_is_free_space /* shift direction */ ,
44801 + unsigned free_space /* free space in item */ )
44802 +{
44803 + char *header_from;
44804 + char *header_to;
44805 +
44806 + char *entry_from;
44807 + char *entry_to;
44808 +
44809 + int pos_in_target;
44810 + int data_size;
44811 + int data_delta;
44812 + int i;
44813 +
44814 + assert("nikita-1303", target != NULL);
44815 + assert("nikita-1304", source != NULL);
44816 + assert("nikita-1305", (int)from < units(source));
44817 + assert("nikita-1307", (int)(from + count) <= units(source));
44818 +
44819 + if (where_is_free_space == SHIFT_LEFT) {
44820 + assert("nikita-1453", from == 0);
44821 + pos_in_target = units(target);
44822 + } else {
44823 + assert("nikita-1309", (int)(from + count) == units(source));
44824 + pos_in_target = 0;
44825 + memmove(item_body_by_coord(target),
44826 + (char *)item_body_by_coord(target) + free_space,
44827 + item_length_by_coord(target) - free_space);
44828 + }
44829 +
44830 + CHECKME(target);
44831 + CHECKME(source);
44832 +
44833 + /* expand @target */
44834 + data_size =
44835 + offset_of(source, (int)(from + count)) - offset_of(source,
44836 + (int)from);
44837 +
44838 + if (units(target) == 0)
44839 + free_space -= sizeof(cde_item_format);
44840 +
44841 + expand_item(target, pos_in_target, (int)count,
44842 + (int)(item_length_by_coord(target) - free_space),
44843 + (unsigned)data_size);
44844 +
44845 + /* copy first @count units of @source into @target */
44846 + data_delta =
44847 + offset_of(target, pos_in_target) - offset_of(source, (int)from);
44848 +
44849 + /* copy entries */
44850 + entry_from = (char *)entry_at(source, (int)from);
44851 + entry_to = (char *)entry_at(source, (int)(from + count));
44852 + memmove(entry_at(target, pos_in_target), entry_from,
44853 + (unsigned)(entry_to - entry_from));
44854 +
44855 + /* copy headers */
44856 + header_from = (char *)header_at(source, (int)from);
44857 + header_to = (char *)header_at(source, (int)(from + count));
44858 + memmove(header_at(target, pos_in_target), header_from,
44859 + (unsigned)(header_to - header_from));
44860 +
44861 + /* update offsets */
44862 + for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
44863 + adj_offset(target, i, data_delta);
44864 + CHECKME(target);
44865 + CHECKME(source);
44866 +}
44867 +
44868 +/* ->cut_units() method for this item plugin. */
44869 +int cut_units_cde(coord_t * coord /* coord of item */ ,
44870 + pos_in_node_t from /* start unit pos */ ,
44871 + pos_in_node_t to /* stop unit pos */ ,
44872 + struct carry_cut_data *cdata UNUSED_ARG,
44873 + reiser4_key * smallest_removed, reiser4_key * new_first)
44874 +{
44875 + char *header_from;
44876 + char *header_to;
44877 +
44878 + char *entry_from;
44879 + char *entry_to;
44880 +
44881 + int size;
44882 + int entry_delta;
44883 + int header_delta;
44884 + int i;
44885 +
44886 + unsigned count;
44887 +
44888 + CHECKME(coord);
44889 +
44890 + count = to - from + 1;
44891 +
44892 + assert("nikita-1454", coord != NULL);
44893 + assert("nikita-1455", (int)(from + count) <= units(coord));
44894 +
44895 + if (smallest_removed)
44896 + unit_key_by_coord(coord, smallest_removed);
44897 +
44898 + if (new_first) {
44899 + coord_t next;
44900 +
44901 + /* not everything is cut from item head */
44902 + assert("vs-1527", from == 0);
44903 + assert("vs-1528", to < units(coord) - 1);
44904 +
44905 + coord_dup(&next, coord);
44906 + next.unit_pos++;
44907 + unit_key_by_coord(&next, new_first);
44908 + }
44909 +
44910 + size = item_length_by_coord(coord);
44911 + if (count == (unsigned)units(coord)) {
44912 + return size;
44913 + }
44914 +
44915 + header_from = (char *)header_at(coord, (int)from);
44916 + header_to = (char *)header_at(coord, (int)(from + count));
44917 +
44918 + entry_from = (char *)entry_at(coord, (int)from);
44919 + entry_to = (char *)entry_at(coord, (int)(from + count));
44920 +
44921 + /* move headers */
44922 + memmove(header_from, header_to,
44923 + (unsigned)(address(coord, size) - header_to));
44924 +
44925 + header_delta = header_to - header_from;
44926 +
44927 + entry_from -= header_delta;
44928 + entry_to -= header_delta;
44929 + size -= header_delta;
44930 +
44931 + /* copy entries */
44932 + memmove(entry_from, entry_to,
44933 + (unsigned)(address(coord, size) - entry_to));
44934 +
44935 + entry_delta = entry_to - entry_from;
44936 + size -= entry_delta;
44937 +
44938 + /* update offsets */
44939 +
44940 + for (i = 0; i < (int)from; ++i)
44941 + adj_offset(coord, i, -header_delta);
44942 +
44943 + for (i = from; i < units(coord) - (int)count; ++i)
44944 + adj_offset(coord, i, -header_delta - entry_delta);
44945 +
44946 + put_unaligned(cpu_to_le16((__u16) units(coord) - count),
44947 + &formatted_at(coord)->num_of_entries);
44948 +
44949 + if (from == 0) {
44950 + /* entries were removed from the head - move the rest to the right */
44951 + memmove((char *)item_body_by_coord(coord) +
44952 + header_delta + entry_delta, item_body_by_coord(coord),
44953 + (unsigned)size);
44954 + if (REISER4_DEBUG)
44955 + memset(item_body_by_coord(coord), 0,
44956 + (unsigned)header_delta + entry_delta);
44957 + } else {
44958 + /* freed space is already at the end of item */
44959 + if (REISER4_DEBUG)
44960 + memset((char *)item_body_by_coord(coord) + size, 0,
44961 + (unsigned)header_delta + entry_delta);
44962 + }
44963 +
44964 + return header_delta + entry_delta;
44965 +}
44966 +
44967 +int kill_units_cde(coord_t * coord /* coord of item */ ,
44968 + pos_in_node_t from /* start unit pos */ ,
44969 + pos_in_node_t to /* stop unit pos */ ,
44970 + struct carry_kill_data *kdata UNUSED_ARG,
44971 + reiser4_key * smallest_removed, reiser4_key * new_first)
44972 +{
44973 + return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
44974 +}
44975 +
44976 +/* ->s.dir.extract_key() method for this item plugin. */
44977 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
44978 + reiser4_key * key /* resulting key */ )
44979 +{
44980 + directory_entry_format *dent;
44981 +
44982 + assert("nikita-1155", coord != NULL);
44983 + assert("nikita-1156", key != NULL);
44984 +
44985 + dent = entry_at(coord, idx_of(coord));
44986 + return extract_key_from_id(&dent->id, key);
44987 +}
44988 +
44989 +int
44990 +update_key_cde(const coord_t * coord, const reiser4_key * key,
44991 + lock_handle * lh UNUSED_ARG)
44992 +{
44993 + directory_entry_format *dent;
44994 + obj_key_id obj_id;
44995 + int result;
44996 +
44997 + assert("nikita-2344", coord != NULL);
44998 + assert("nikita-2345", key != NULL);
44999 +
45000 + dent = entry_at(coord, idx_of(coord));
45001 + result = build_obj_key_id(key, &obj_id);
45002 + if (result == 0) {
45003 + dent->id = obj_id;
45004 + znode_make_dirty(coord->node);
45005 + }
45006 + return 0;
45007 +}
45008 +
45009 +/* ->s.dir.extract_name() method for this item plugin. */
45010 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45011 +{
45012 + directory_entry_format *dent;
45013 +
45014 + assert("nikita-1157", coord != NULL);
45015 +
45016 + dent = entry_at(coord, idx_of(coord));
45017 + return extract_dent_name(coord, dent, buf);
45018 +}
45019 +
45020 +static int cde_bytes(int pasting, const reiser4_item_data * data)
45021 +{
45022 + int result;
45023 +
45024 + result = data->length;
45025 + if (!pasting)
45026 + result -= sizeof(cde_item_format);
45027 + return result;
45028 +}
45029 +
45030 +/* ->s.dir.add_entry() method for this item plugin */
45031 +int add_entry_cde(struct inode *dir /* directory object */ ,
45032 + coord_t * coord /* coord of item */ ,
45033 + lock_handle * lh /* lock handle for insertion */ ,
45034 + const struct dentry *name /* name to insert */ ,
45035 + reiser4_dir_entry_desc * dir_entry /* parameters of new
45036 + * directory entry */ )
45037 +{
45038 + reiser4_item_data data;
45039 + struct cde_entry entry;
45040 + struct cde_entry_data edata;
45041 + int result;
45042 +
45043 + assert("nikita-1656", coord->node == lh->node);
45044 + assert("nikita-1657", znode_is_write_locked(coord->node));
45045 +
45046 + edata.num_of_entries = 1;
45047 + edata.entry = &entry;
45048 +
45049 + entry.dir = dir;
45050 + entry.obj = dir_entry->obj;
45051 + entry.name = &name->d_name;
45052 +
45053 + data.data = (char *)&edata;
45054 + data.user = 0; /* &edata is not user space */
45055 + data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45056 + data.arg = dir_entry;
45057 + assert("nikita-1302", data.iplug != NULL);
45058 +
45059 + result = is_dot_key(&dir_entry->key);
45060 + data.length = estimate_cde(result ? coord : NULL, &data);
45061 +
45062 + /* NOTE-NIKITA quota plugin? */
45063 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45064 + return RETERR(-EDQUOT);
45065 +
45066 + if (result)
45067 + result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45068 + else
45069 + result = reiser4_resize_item(coord, &data, &dir_entry->key,
45070 + lh, 0);
45071 + return result;
45072 +}
45073 +
45074 +/* ->s.dir.rem_entry() */
45075 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
45076 + const struct qstr *name, coord_t * coord /* coord of item */ ,
45077 + lock_handle * lh UNUSED_ARG /* lock handle for
45078 + * removal */ ,
45079 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45080 + * directory entry
45081 + * being removed */ )
45082 +{
45083 + coord_t shadow;
45084 + int result;
45085 + int length;
45086 + ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45087 +
45088 + assert("nikita-2870", strlen(name->name) == name->len);
45089 + assert("nikita-2869",
45090 + !strcmp(name->name, extract_name_cde(coord, buf)));
45091 +
45092 + length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45093 + if (is_longname(name->name, name->len))
45094 + length += name->len + 1;
45095 +
45096 + if (inode_get_bytes(dir) < length) {
45097 + warning("nikita-2628", "Dir is broke: %llu: %llu",
45098 + (unsigned long long)get_inode_oid(dir),
45099 + inode_get_bytes(dir));
45100 +
45101 + return RETERR(-EIO);
45102 + }
45103 +
45104 + /* cut_node() is supposed to take pointers to _different_
45105 + coords, because it will modify them without respect to
45106 + possible aliasing. To work around this, create temporary copy
45107 + of @coord.
45108 + */
45109 + coord_dup(&shadow, coord);
45110 + result =
45111 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45112 + if (result == 0) {
45113 + /* NOTE-NIKITA quota plugin? */
45114 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
45115 + }
45116 + return result;
45117 +}
45118 +
45119 +/* ->s.dir.max_name_len() method for this item plugin */
45120 +int max_name_len_cde(const struct inode *dir /* directory */ )
45121 +{
45122 + return
45123 + reiser4_tree_by_inode(dir)->nplug->max_item_size() -
45124 + sizeof(directory_entry_format) - sizeof(cde_item_format) -
45125 + sizeof(cde_unit_header) - 2;
45126 +}
45127 +
45128 +/* Make Linus happy.
45129 + Local variables:
45130 + c-indentation-style: "K&R"
45131 + mode-name: "LC"
45132 + c-basic-offset: 8
45133 + tab-width: 8
45134 + fill-column: 120
45135 + End:
45136 +*/
45137 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/cde.h linux-2.6.22/fs/reiser4/plugin/item/cde.h
45138 --- linux-2.6.22.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
45139 +++ linux-2.6.22/fs/reiser4/plugin/item/cde.h 2007-07-29 00:25:34.944714077 +0400
45140 @@ -0,0 +1,87 @@
45141 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45142 +
45143 +/* Compound directory item. See cde.c for description. */
45144 +
45145 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45146 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45147 +
45148 +#include "../../forward.h"
45149 +#include "../../kassign.h"
45150 +#include "../../dformat.h"
45151 +
45152 +#include <linux/fs.h> /* for struct inode */
45153 +#include <linux/dcache.h> /* for struct dentry, etc */
45154 +
45155 +typedef struct cde_unit_header {
45156 + de_id hash;
45157 + d16 offset;
45158 +} cde_unit_header;
45159 +
45160 +typedef struct cde_item_format {
45161 + d16 num_of_entries;
45162 + cde_unit_header entry[0];
45163 +} cde_item_format;
45164 +
45165 +struct cde_entry {
45166 + const struct inode *dir;
45167 + const struct inode *obj;
45168 + const struct qstr *name;
45169 +};
45170 +
45171 +struct cde_entry_data {
45172 + int num_of_entries;
45173 + struct cde_entry *entry;
45174 +};
45175 +
45176 +/* plugin->item.b.* */
45177 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45178 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45179 + const reiser4_item_data *);
45180 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
45181 +pos_in_node_t nr_units_cde(const coord_t * coord);
45182 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45183 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45184 +void print_cde(const char *prefix, coord_t * coord);
45185 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45186 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45187 + coord_t * coord);
45188 +int paste_cde(coord_t * coord, reiser4_item_data * data,
45189 + carry_plugin_info * info UNUSED_ARG);
45190 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45191 + shift_direction pend, unsigned *size, unsigned want);
45192 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45193 + unsigned count, shift_direction where_is_free_space,
45194 + unsigned free_space);
45195 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45196 + struct carry_cut_data *, reiser4_key * smallest_removed,
45197 + reiser4_key * new_first);
45198 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45199 + struct carry_kill_data *, reiser4_key * smallest_removed,
45200 + reiser4_key * new_first);
45202 +int reiser4_check_cde(const coord_t * coord, const char **error);
45203 +
45204 +/* plugin->u.item.s.dir.* */
45205 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
45206 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
45207 + lock_handle * lh);
45208 +char *extract_name_cde(const coord_t * coord, char *buf);
45209 +int add_entry_cde(struct inode *dir, coord_t * coord,
45210 + lock_handle * lh, const struct dentry *name,
45211 + reiser4_dir_entry_desc * entry);
45212 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45213 + lock_handle * lh, reiser4_dir_entry_desc * entry);
45214 +int max_name_len_cde(const struct inode *dir);
45215 +
45216 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45217 +#endif
45218 +
45219 +/* Make Linus happy.
45220 + Local variables:
45221 + c-indentation-style: "K&R"
45222 + mode-name: "LC"
45223 + c-basic-offset: 8
45224 + tab-width: 8
45225 + fill-column: 120
45226 + End:
45227 +*/
45228 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.22/fs/reiser4/plugin/item/ctail.c
45229 --- linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
45230 +++ linux-2.6.22/fs/reiser4/plugin/item/ctail.c 2007-07-29 00:25:34.948715113 +0400
45231 @@ -0,0 +1,1614 @@
45232 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45233 +
45234 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
45235 +
45236 +/* DESCRIPTION:
45237 +
45238 +Each cryptcompress object is stored on disk as a set of clusters sliced
45239 +into ctails.
45240 +
45241 +Internal on-disk structure:
45242 +
45243 + HEADER (1) Here the disk cluster shift is stored
45244 + BODY
45245 +*/
45246 +
45247 +#include "../../forward.h"
45248 +#include "../../debug.h"
45249 +#include "../../dformat.h"
45250 +#include "../../kassign.h"
45251 +#include "../../key.h"
45252 +#include "../../coord.h"
45253 +#include "item.h"
45254 +#include "../node/node.h"
45255 +#include "../plugin.h"
45256 +#include "../object.h"
45257 +#include "../../znode.h"
45258 +#include "../../carry.h"
45259 +#include "../../tree.h"
45260 +#include "../../inode.h"
45261 +#include "../../super.h"
45262 +#include "../../context.h"
45263 +#include "../../page_cache.h"
45264 +#include "../cluster.h"
45265 +#include "../../flush.h"
45266 +#include "../../tree_walk.h"
45267 +
45268 +#include <linux/pagevec.h>
45269 +#include <linux/swap.h>
45270 +#include <linux/fs.h>
45271 +
45272 +/* return body of ctail item at @coord */
45273 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45274 +{
45275 + assert("edward-60", coord != NULL);
45276 + return item_body_by_coord(coord);
45277 +}
45278 +
45279 +static int cluster_shift_by_coord(const coord_t * coord)
45280 +{
45281 + return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45282 +}
45283 +
45284 +static inline void dclust_set_extension_shift(hint_t * hint)
45285 +{
45286 + assert("edward-1270",
45287 + item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
45288 + hint->ext_coord.extension.ctail.shift =
45289 + cluster_shift_by_coord(&hint->ext_coord.coord);
45290 +}
45291 +
45292 +static loff_t off_by_coord(const coord_t * coord)
45293 +{
45294 + reiser4_key key;
45295 + return get_key_offset(item_key_by_coord(coord, &key));
45296 +}
45297 +
45298 +int coord_is_unprepped_ctail(const coord_t * coord)
45299 +{
45300 + assert("edward-1233", coord != NULL);
45301 + assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45302 + assert("edward-1235",
45303 + ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45304 + nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45305 +
45306 + return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45307 +}
45308 +
45309 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45310 +{
45311 + int shift;
45312 +
45313 + if (inode != NULL) {
45314 + shift = inode_cluster_shift(inode);
45315 + assert("edward-1236",
45316 + ergo(!coord_is_unprepped_ctail(coord),
45317 + shift == cluster_shift_by_coord(coord)));
45318 + } else {
45319 + assert("edward-1237", !coord_is_unprepped_ctail(coord));
45320 + shift = cluster_shift_by_coord(coord);
45321 + }
45322 + return off_by_coord(coord) >> shift;
45323 +}
45324 +
45325 +static int disk_cluster_size(const coord_t * coord)
45326 +{
45327 + assert("edward-1156",
45328 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45329 + /* calculation of the disk cluster size
45330 + is meaningless if the ctail is unprepped */
45331 + assert("edward-1238", !coord_is_unprepped_ctail(coord));
45332 +
45333 + return 1 << cluster_shift_by_coord(coord);
45334 +}
45335 +
45336 +/* true if the key is of first disk cluster item */
45337 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45338 +{
45339 + assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45340 +
45341 + return coord_is_unprepped_ctail(coord) ||
45342 + ((get_key_offset(key) &
45343 + ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45344 +}
45345 +
45346 +static char *first_unit(coord_t * coord)
45347 +{
45348 + /* FIXME: warning: pointer of type `void *' used in arithmetic */
45349 + return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45350 +}
45351 +
45352 +/* plugin->u.item.b.max_key_inside :
45353 + tail_max_key_inside */
45354 +
45355 +/* plugin->u.item.b.can_contain_key */
45356 +int
45357 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45358 + const reiser4_item_data * data)
45359 +{
45360 + reiser4_key item_key;
45361 +
45362 + if (item_plugin_by_coord(coord) != data->iplug)
45363 + return 0;
45364 +
45365 + item_key_by_coord(coord, &item_key);
45366 + if (get_key_locality(key) != get_key_locality(&item_key) ||
45367 + get_key_objectid(key) != get_key_objectid(&item_key))
45368 + return 0;
45369 + if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45370 + get_key_offset(key))
45371 + return 0;
45372 + if (is_disk_cluster_key(key, coord))
45373 + return 0;
45374 + return 1;
45375 +}
45376 +
45377 +/* plugin->u.item.b.mergeable */
45378 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45379 +{
45380 + reiser4_key key1, key2;
45381 +
45382 + assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45383 + assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
45384 + UNIX_FILE_METADATA_ITEM_TYPE));
45385 +
45386 + if (item_id_by_coord(p2) != CTAIL_ID) {
45387 + /* second item is of another type */
45388 + return 0;
45389 + }
45390 +
45391 + item_key_by_coord(p1, &key1);
45392 + item_key_by_coord(p2, &key2);
45393 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
45394 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
45395 + get_key_type(&key1) != get_key_type(&key2)) {
45396 + /* items of different objects */
45397 + return 0;
45398 + }
45399 + if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45400 + /* not adjacent items */
45401 + return 0;
45402 + if (is_disk_cluster_key(&key2, p2))
45403 + return 0;
45404 + return 1;
45405 +}
45406 +
45407 +/* plugin->u.item.b.nr_units */
45408 +pos_in_node_t nr_units_ctail(const coord_t * coord)
45409 +{
45410 + return (item_length_by_coord(coord) -
45411 + sizeof(ctail_formatted_at(coord)->cluster_shift));
45412 +}
45413 +
45414 +/* plugin->u.item.b.estimate:
45415 + estimate how much space is needed to insert/paste @data->length bytes
45416 + into ctail at @coord */
45417 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
45418 + const reiser4_item_data *
45419 + data /* parameters for new item */ )
45420 +{
45421 + if (coord == NULL)
45422 + /* insert */
45423 + return (sizeof(ctail_item_format) + data->length);
45424 + else
45425 + /* paste */
45426 + return data->length;
45427 +}
45428 +
45429 +/* ->init() method for this item plugin. */
45430 +int init_ctail(coord_t * to /* coord of item */ ,
45431 + coord_t * from /* old_item */ ,
45432 + reiser4_item_data * data /* structure used for insertion */ )
45433 +{
45434 + int cluster_shift; /* cpu value to convert */
45435 +
45436 + if (data) {
45437 + assert("edward-463", data->length > sizeof(ctail_item_format));
45438 + cluster_shift = *((int *)(data->arg));
45439 + data->length -= sizeof(ctail_item_format);
45440 + } else {
45441 + assert("edward-464", from != NULL);
45442 + assert("edward-855", ctail_ok(from));
45443 + cluster_shift = (int)(cluster_shift_by_coord(from));
45444 + }
45445 + put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45446 + assert("edward-856", ctail_ok(to));
45447 + return 0;
45448 +}
45449 +
45450 +/* plugin->u.item.b.lookup:
45451 + NULL: We are looking for item keys only */
45452 +
45453 +#if REISER4_DEBUG
45454 +int ctail_ok(const coord_t * coord)
45455 +{
45456 + return coord_is_unprepped_ctail(coord) ||
45457 + cluster_shift_ok(cluster_shift_by_coord(coord));
45458 +}
45459 +
45460 +/* plugin->u.item.b.check */
45461 +int check_ctail(const coord_t * coord, const char **error)
45462 +{
45463 + if (!ctail_ok(coord)) {
45464 + if (error)
45465 + *error = "bad cluster shift in ctail";
45466 + return 1;
45467 + }
45468 + return 0;
45469 +}
45470 +#endif
45471 +
45472 +/* plugin->u.item.b.paste */
45473 +int
45474 +paste_ctail(coord_t * coord, reiser4_item_data * data,
45475 + carry_plugin_info * info UNUSED_ARG)
45476 +{
45477 + unsigned old_nr_units;
45478 +
45479 + assert("edward-268", data->data != NULL);
45480 + /* copy only from kernel space */
45481 + assert("edward-66", data->user == 0);
45482 +
45483 + old_nr_units =
45484 + item_length_by_coord(coord) - sizeof(ctail_item_format) -
45485 + data->length;
45486 +
45487 + /* ctail items never get pasted in the middle */
45488 +
45489 + if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45490 +
45491 + /* paste at the beginning when creating a new item */
45492 + assert("edward-450",
45493 + item_length_by_coord(coord) ==
45494 + data->length + sizeof(ctail_item_format));
45495 + assert("edward-451", old_nr_units == 0);
45496 + } else if (coord->unit_pos == old_nr_units - 1
45497 + && coord->between == AFTER_UNIT) {
45498 +
45499 + /* paste at the end */
45500 + coord->unit_pos++;
45501 + } else
45502 + impossible("edward-453", "bad paste position");
45503 +
45504 + memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45505 +
45506 + assert("edward-857", ctail_ok(coord));
45507 +
45508 + return 0;
45509 +}
45510 +
45511 +/* plugin->u.item.b.fast_paste */
45512 +
45513 +/* plugin->u.item.b.can_shift
45514 + number of units is returned via return value, number of bytes via @size. For
45515 + ctail items they coincide */
45516 +int
45517 +can_shift_ctail(unsigned free_space, coord_t * source,
45518 + znode * target, shift_direction direction UNUSED_ARG,
45519 + unsigned *size /* number of bytes */ , unsigned want)
45520 +{
45521 + /* make sure that we do not want to shift more than we have */
45522 + assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45523 +
45524 + *size = min(want, free_space);
45525 +
45526 + if (!target) {
45527 + /* new item will be created */
45528 + if (*size <= sizeof(ctail_item_format)) {
45529 + *size = 0;
45530 + return 0;
45531 + }
45532 + return *size - sizeof(ctail_item_format);
45533 + }
45534 + return *size;
45535 +}
45536 +
45537 +/* plugin->u.item.b.copy_units
45538 + cooperates with ->can_shift() */
45539 +void
45540 +copy_units_ctail(coord_t * target, coord_t * source,
45541 + unsigned from, unsigned count /* units */ ,
45542 + shift_direction where_is_free_space,
45543 + unsigned free_space /* bytes */ )
45544 +{
45545 + /* make sure that item @target is expanded already */
45546 + assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45547 + assert("edward-70", free_space == count || free_space == count + 1);
45548 +
45549 + assert("edward-858", ctail_ok(source));
45550 +
45551 + if (where_is_free_space == SHIFT_LEFT) {
45552 + /* append item @target with the first @count bytes of @source:
45553 + this restriction came from ordinary tails */
45554 + assert("edward-71", from == 0);
45555 + assert("edward-860", ctail_ok(target));
45556 +
45557 + memcpy(first_unit(target) + nr_units_ctail(target) - count,
45558 + first_unit(source), count);
45559 + } else {
45560 + /* the target item has already been moved to the right */
45561 + reiser4_key key;
45562 +
45563 + assert("edward-72", nr_units_ctail(source) == from + count);
45564 +
45565 + if (free_space == count) {
45566 + init_ctail(target, source, NULL);
45567 + } else {
45568 + /* new item has been created */
45569 + assert("edward-862", ctail_ok(target));
45570 + }
45571 + memcpy(first_unit(target), first_unit(source) + from, count);
45572 +
45573 + assert("edward-863", ctail_ok(target));
45574 +
45575 + /* new units are inserted before the first unit in an item;
45576 + therefore, we have to update the item key */
45577 + item_key_by_coord(source, &key);
45578 + set_key_offset(&key, get_key_offset(&key) + from);
45579 +
45580 + node_plugin_by_node(target->node)->update_item_key(target, &key,
45581 + NULL /*info */);
45582 + }
45583 +}
45584 +
45585 +/* plugin->u.item.b.create_hook */
45586 +int create_hook_ctail(const coord_t * coord, void *arg)
45587 +{
45588 + assert("edward-864", znode_is_loaded(coord->node));
45589 +
45590 + znode_set_convertible(coord->node);
45591 + return 0;
45592 +}
45593 +
45594 +/* plugin->u.item.b.kill_hook */
45595 +int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
45596 + pos_in_node_t count, carry_kill_data * kdata)
45597 +{
45598 + struct inode *inode;
45599 +
45600 + assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
45601 + assert("edward-291", znode_is_write_locked(coord->node));
45602 +
45603 + inode = kdata->inode;
45604 + if (inode) {
45605 + reiser4_key key;
45606 + struct cryptcompress_info * info;
45607 + cloff_t index;
45608 +
45609 + item_key_by_coord(coord, &key);
45610 + info = cryptcompress_inode_data(inode);
45611 + index = off_to_clust(get_key_offset(&key), inode);
45612 +
45613 + if (from == 0) {
45614 + info->trunc_index = index;
45615 + if (is_disk_cluster_key(&key, coord)) {
45616 + /*
45617 + * first item of disk cluster is to be killed
45618 + */
45619 + truncate_complete_page_cluster(
45620 + inode, index, kdata->params.truncate);
45621 + inode_sub_bytes(inode,
45622 + inode_cluster_size(inode));
45623 + }
45624 + }
45625 + }
45626 + return 0;
45627 +}
45628 +
45629 +/* for shift_hook_ctail(),
45630 + return true if the first disk cluster item has dirty child
45631 +*/
45632 +static int ctail_convertible(const coord_t * coord)
45633 +{
45634 + int result;
45635 + reiser4_key key;
45636 + jnode *child = NULL;
45637 +
45638 + assert("edward-477", coord != NULL);
45639 + assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
45640 +
45641 + if (coord_is_unprepped_ctail(coord))
45642 + /* unprepped ctail should be converted */
45643 + return 1;
45644 +
45645 + item_key_by_coord(coord, &key);
45646 + child = jlookup(current_tree,
45647 + get_key_objectid(&key),
45648 + off_to_pg(off_by_coord(coord)));
45649 + if (!child)
45650 + return 0;
45651 + result = JF_ISSET(child, JNODE_DIRTY);
45652 + jput(child);
45653 + return result;
45654 +}
45655 +
45656 +/* FIXME-EDWARD */
45657 +/* plugin->u.item.b.shift_hook */
45658 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
45659 + unsigned from UNUSED_ARG /* start unit */ ,
45660 + unsigned count UNUSED_ARG /* stop unit */ ,
45661 + znode * old_node /* old parent */ )
45662 +{
45663 + assert("edward-479", item != NULL);
45664 + assert("edward-480", item->node != old_node);
45665 +
45666 + if (!znode_convertible(old_node) || znode_convertible(item->node))
45667 + return 0;
45668 + if (ctail_convertible(item))
45669 + znode_set_convertible(item->node);
45670 + return 0;
45671 +}
45672 +
45673 +static int
45674 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45675 + int cut, void *p, reiser4_key * smallest_removed,
45676 + reiser4_key * new_first)
45677 +{
45678 + pos_in_node_t count; /* number of units to cut */
45679 + char *item;
45680 +
45681 + count = to - from + 1;
45682 + item = item_body_by_coord(coord);
45683 +
45684 + assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
45685 +
45686 + if (smallest_removed) {
45687 + /* store smallest key removed */
45688 + item_key_by_coord(coord, smallest_removed);
45689 + set_key_offset(smallest_removed,
45690 + get_key_offset(smallest_removed) + from);
45691 + }
45692 +
45693 + if (new_first) {
45694 + assert("vs-1531", from == 0);
45695 +
45696 + item_key_by_coord(coord, new_first);
45697 + set_key_offset(new_first,
45698 + get_key_offset(new_first) + from + count);
45699 + }
45700 +
45701 + if (!cut)
45702 + kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
45703 +
45704 + if (from == 0) {
45705 + if (count != nr_units_ctail(coord)) {
45706 +			/* part of the item is removed, so move the free space
45707 +			   to the beginning of the item and update the item key */
45708 + reiser4_key key;
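+			/*
+			 * Relocate the item header just past the removed
+			 * units, so that once @count bytes are cut from
+			 * the front of the item the header again
+			 * immediately precedes the surviving units.
+			 */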
45709 + memcpy(item + to + 1, item, sizeof(ctail_item_format));
45710 + item_key_by_coord(coord, &key);
45711 + set_key_offset(&key, get_key_offset(&key) + count);
45712 + node_plugin_by_node(coord->node)->update_item_key(coord,
45713 + &key,
45714 + NULL);
45715 + } else {
45716 +			/* cut_units should not be called to cut everything */
45717 + assert("vs-1532", ergo(cut, 0));
45718 +			/* whole item is cut, so more than the amount of space
45719 +			   occupied by units got freed */
45720 + count += sizeof(ctail_item_format);
45721 + }
45722 + if (REISER4_DEBUG)
45723 + memset(item, 0, count);
45724 + } else if (REISER4_DEBUG)
45725 + memset(item + sizeof(ctail_item_format) + from, 0, count);
45726 + return count;
45727 +}
45728 +
45729 +/* plugin->u.item.b.cut_units */
45730 +int
45731 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45732 + carry_cut_data * cdata, reiser4_key * smallest_removed,
45733 + reiser4_key * new_first)
45734 +{
45735 + return cut_or_kill_ctail_units(item, from, to, 1, NULL,
45736 + smallest_removed, new_first);
45737 +}
45738 +
45739 +/* plugin->u.item.b.kill_units */
45740 +int
45741 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45742 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
45743 + reiser4_key * new_first)
45744 +{
45745 + return cut_or_kill_ctail_units(item, from, to, 0, kdata,
45746 + smallest_removed, new_first);
45747 +}
45748 +
45749 +/* plugin->u.item.s.file.read */
45750 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
45751 +{
45752 + uf_coord_t *uf_coord;
45753 + coord_t *coord;
45754 +
45755 + uf_coord = &hint->ext_coord;
45756 + coord = &uf_coord->coord;
45757 + assert("edward-127", f->user == 0);
45758 + assert("edward-129", coord && coord->node);
45759 + assert("edward-130", coord_is_existing_unit(coord));
45760 + assert("edward-132", znode_is_loaded(coord->node));
45761 +
45762 + /* start read only from the beginning of ctail */
45763 + assert("edward-133", coord->unit_pos == 0);
45764 + /* read only whole ctails */
45765 + assert("edward-135", nr_units_ctail(coord) <= f->length);
45766 +
45767 + assert("edward-136", reiser4_schedulable());
45768 + assert("edward-886", ctail_ok(coord));
45769 +
45770 + if (f->data)
45771 + memcpy(f->data, (char *)first_unit(coord),
45772 + (size_t) nr_units_ctail(coord));
45773 +
45774 + dclust_set_extension_shift(hint);
45775 + mark_page_accessed(znode_page(coord->node));
45776 + move_flow_forward(f, nr_units_ctail(coord));
45777 +
45778 + return 0;
45779 +}
45780 +
45781 +/**
45782 + * Prepare a transform stream with the plain text for page @page,
45783 + * taking synchronization issues into account.
45784 + */
45785 +int ctail_read_disk_cluster(struct cluster_handle * clust, struct inode * inode,
45786 + struct page * page, znode_lock_mode mode)
45787 +{
45788 + int result;
45789 +
45790 +	assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
45791 + assert("edward-671", clust->hint != NULL);
45792 + assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
45793 + assert("edward-672", cryptcompress_inode_ok(inode));
45794 + assert("edward-1527", PageLocked(page));
45795 +
45796 + unlock_page(page);
45797 +
45798 + /* set input stream */
45799 + result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
45800 + if (result) {
45801 + lock_page(page);
45802 + return result;
45803 + }
45804 + result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
45805 + lock_page(page);
45806 + if (result)
45807 + return result;
45808 + /*
45809 + * at this point we have locked position in the tree
45810 + */
45811 + assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
45812 +
45813 + if (page->mapping != inode->i_mapping) {
45814 + /* page was truncated */
45815 + reiser4_unset_hint(clust->hint);
45816 + reset_cluster_params(clust);
45817 + return AOP_TRUNCATED_PAGE;
45818 + }
45819 + if (PageUptodate(page)) {
45820 + /* disk cluster can be obsolete, don't use it! */
45821 + reiser4_unset_hint(clust->hint);
45822 + reset_cluster_params(clust);
45823 + return 0;
45824 + }
45825 + if (clust->dstat == FAKE_DISK_CLUSTER ||
45826 + clust->dstat == UNPR_DISK_CLUSTER ||
45827 + clust->dstat == TRNC_DISK_CLUSTER) {
45828 + /*
45829 + * this information about disk cluster will be valid
45830 + * as long as we keep the position in the tree locked
45831 + */
45832 + tfm_cluster_set_uptodate(&clust->tc);
45833 + return 0;
45834 + }
45835 + /* now prepare output stream.. */
45836 + result = grab_coa(&clust->tc, inode_compression_plugin(inode));
45837 + if (result)
45838 + return result;
45839 + /* ..and fill this with plain text */
45840 + result = reiser4_inflate_cluster(clust, inode);
45841 + if (result)
45842 + return result;
45843 + /*
45844 + * The stream is ready! It won't be obsolete as
45845 + * long as we keep last disk cluster item locked.
45846 + */
45847 + tfm_cluster_set_uptodate(&clust->tc);
45848 + return 0;
45849 +}
45850 +
45851 +/*
45852 + * fill one page with plain text.
45853 + */
45854 +int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
45855 + struct page *page, znode_lock_mode mode)
45856 +{
45857 + int ret;
45858 + unsigned cloff;
45859 + char *data;
45860 + size_t to_page;
45861 + struct tfm_cluster * tc = &clust->tc;
45862 +
45863 + assert("edward-212", PageLocked(page));
45864 +
45865 + if (unlikely(page->mapping != inode->i_mapping))
45866 + return AOP_TRUNCATED_PAGE;
45867 + if (PageUptodate(page))
45868 + goto exit;
45869 + to_page = pbytes(page_index(page), inode);
45870 + if (to_page == 0) {
45871 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
45872 + SetPageUptodate(page);
45873 + goto exit;
45874 + }
45875 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
45876 + clust->index = pg_to_clust(page->index, inode);
45877 +
45878 + /* this will unlock/lock the page */
45879 + ret = ctail_read_disk_cluster(clust, inode, page, mode);
45880 +
45881 + assert("edward-212", PageLocked(page));
45882 + if (ret)
45883 + return ret;
45884 +
45885 + /* refresh bytes */
45886 + to_page = pbytes(page_index(page), inode);
45887 + if (to_page == 0) {
45888 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
45889 + SetPageUptodate(page);
45890 + goto exit;
45891 + }
45892 + }
45893 + if (PageUptodate(page))
45894 +		/* somebody else filled it already */
45895 + goto exit;
45896 +
45897 + assert("edward-119", tfm_cluster_is_uptodate(tc));
45898 + assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
45899 +
45900 + switch (clust->dstat) {
45901 + case UNPR_DISK_CLUSTER:
45902 + BUG_ON(1);
45903 + case TRNC_DISK_CLUSTER:
45904 + /*
45905 + * Race with truncate!
45906 + * We resolve it in favour of the last one (the only way,
45907 + * as in this case plain text is unrecoverable)
45908 + */
45909 + case FAKE_DISK_CLUSTER:
45910 + /* fill the page by zeroes */
45911 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
45912 + SetPageUptodate(page);
45913 + break;
45914 + case PREP_DISK_CLUSTER:
45915 + /* fill page by transformed stream with plain text */
45916 + assert("edward-1058", !PageUptodate(page));
45917 + assert("edward-120", tc->len <= inode_cluster_size(inode));
45918 +
45919 + /* page index in this logical cluster */
45920 + cloff = pg_to_off_to_cloff(page->index, inode);
45921 +
45922 + data = kmap(page);
45923 + memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
45924 + memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
45925 + flush_dcache_page(page);
45926 + kunmap(page);
45927 + SetPageUptodate(page);
45928 + break;
45929 + default:
45930 + impossible("edward-1169", "bad disk cluster state");
45931 + }
45932 + exit:
45933 + return 0;
45934 +}
45935 +
45936 +/* plugin->u.item.s.file.readpage */
45937 +int readpage_ctail(void *vp, struct page *page)
45938 +{
45939 + int result;
45940 + hint_t * hint;
45941 + struct cluster_handle * clust = vp;
45942 +
45943 + assert("edward-114", clust != NULL);
45944 + assert("edward-115", PageLocked(page));
45945 + assert("edward-116", !PageUptodate(page));
45946 + assert("edward-118", page->mapping && page->mapping->host);
45947 + assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
45948 +
45949 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
45950 + if (hint == NULL) {
45951 + unlock_page(page);
45952 + return RETERR(-ENOMEM);
45953 + }
45954 + clust->hint = hint;
45955 + result = load_file_hint(clust->file, hint);
45956 + if (result) {
45957 + kfree(hint);
45958 + unlock_page(page);
45959 + return result;
45960 + }
45961 + assert("vs-25", hint->ext_coord.lh == &hint->lh);
45962 +
45963 + result = do_readpage_ctail(page->mapping->host, clust, page,
45964 + ZNODE_READ_LOCK);
45965 + assert("edward-213", PageLocked(page));
45966 + assert("edward-1163", ergo(!result, PageUptodate(page)));
45967 +
45968 + unlock_page(page);
45969 + done_lh(&hint->lh);
45970 + hint->ext_coord.valid = 0;
45971 + save_file_hint(clust->file, hint);
45972 + kfree(hint);
45973 + tfm_cluster_clr_uptodate(&clust->tc);
45974 +
45975 + return result;
45976 +}
45977 +
45978 +/* Helper function for ->readpages() */
45979 +static int ctail_read_page_cluster(struct cluster_handle * clust,
45980 + struct inode *inode)
45981 +{
45982 + int i;
45983 + int result;
45984 + assert("edward-779", clust != NULL);
45985 + assert("edward-1059", clust->win == NULL);
45986 + assert("edward-780", inode != NULL);
45987 +
45988 + result = prepare_page_cluster(inode, clust, READ_OP);
45989 + if (result)
45990 + return result;
45991 +
45992 + assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
45993 +
45994 + for (i = 0; i < clust->nr_pages; i++) {
45995 + struct page *page = clust->pages[i];
45996 + lock_page(page);
45997 + result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
45998 + unlock_page(page);
45999 + if (result)
46000 + break;
46001 + }
46002 + tfm_cluster_clr_uptodate(&clust->tc);
46003 + put_page_cluster(clust, inode, READ_OP);
46004 + return result;
46005 +}
46006 +
46007 +/* filler for read_cache_pages() */
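+/*
+ * Note: since ctail_read_page_cluster() reads the whole page cluster of
+ * each nominated page, later pages of the same cluster typically arrive
+ * here already uptodate and are skipped.
+ */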
46008 +static int ctail_readpages_filler(void * data, struct page * page)
46009 +{
46010 + int ret = 0;
46011 + struct cluster_handle * clust = data;
46012 + struct inode * inode = clust->file->f_dentry->d_inode;
46013 +
46014 + assert("edward-1525", page->mapping == inode->i_mapping);
46015 +
46016 + if (PageUptodate(page)) {
46017 + unlock_page(page);
46018 + return 0;
46019 + }
46020 + if (pbytes(page_index(page), inode) == 0) {
46021 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46022 + SetPageUptodate(page);
46023 + unlock_page(page);
46024 + return 0;
46025 + }
46026 + move_cluster_forward(clust, inode, page->index);
46027 + unlock_page(page);
46028 + /*
46029 + * read the whole page cluster
46030 + */
46031 + ret = ctail_read_page_cluster(clust, inode);
46032 +
46033 + assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
46034 + return ret;
46035 +}
46036 +
46037 +/*
46038 + * We populate a bit more than upper readahead suggests:
46039 + * with each nominated page we read the whole page cluster
46040 + * this page belongs to.
46041 + */
46042 +int readpages_ctail(struct file *file, struct address_space *mapping,
46043 + struct list_head *pages)
46044 +{
46045 + int ret = 0;
46046 + hint_t *hint;
46047 + struct cluster_handle clust;
46048 + struct inode *inode = mapping->host;
46049 +
46050 + assert("edward-1521", inode == file->f_dentry->d_inode);
46051 +
46052 + cluster_init_read(&clust, NULL);
46053 + clust.file = file;
46054 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46055 + if (hint == NULL) {
46056 + warning("vs-28", "failed to allocate hint");
46057 + ret = RETERR(-ENOMEM);
46058 + goto exit1;
46059 + }
46060 + clust.hint = hint;
46061 + ret = load_file_hint(clust.file, hint);
46062 + if (ret) {
46063 + warning("edward-1522", "failed to load hint");
46064 + goto exit2;
46065 + }
46066 + assert("vs-26", hint->ext_coord.lh == &hint->lh);
46067 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46068 + if (ret) {
46069 + warning("edward-1523", "failed to alloc pgset");
46070 + goto exit3;
46071 + }
46072 + ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
46073 +
46074 + assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46075 + exit3:
46076 + done_lh(&hint->lh);
46077 + save_file_hint(file, hint);
46078 + hint->ext_coord.valid = 0;
46079 + exit2:
46080 + kfree(hint);
46081 + exit1:
46082 + put_cluster_handle(&clust);
46083 + return ret;
46084 +}
46085 +
46086 +/*
46087 + plugin->u.item.s.file.append_key
46088 + key of the first item of the next disk cluster
46089 +*/
46090 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46091 +{
46092 + assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46093 + assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46094 +
46095 + item_key_by_coord(coord, key);
46096 + set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
46097 + << cluster_shift_by_coord(coord));
46098 + return key;
46099 +}
46100 +
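+/*
+ * Insert an "unprepped" disk cluster: a single ctail item with the
+ * magic cluster shift UCTAIL_SHIFT and UCTAIL_NR_UNITS zeroed units
+ * (see the disk format comments in ctail.h).
+ */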
46101 +static int insert_unprepped_ctail(struct cluster_handle * clust,
46102 + struct inode *inode)
46103 +{
46104 + int result;
46105 + char buf[UCTAIL_NR_UNITS];
46106 + reiser4_item_data data;
46107 + reiser4_key key;
46108 + int shift = (int)UCTAIL_SHIFT;
46109 +
46110 + memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46111 + result = key_by_inode_cryptcompress(inode,
46112 + clust_to_off(clust->index, inode),
46113 + &key);
46114 + if (result)
46115 + return result;
46116 + data.user = 0;
46117 + data.iplug = item_plugin_by_id(CTAIL_ID);
46118 + data.arg = &shift;
46119 + data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46120 + data.data = buf;
46121 +
46122 + result = insert_by_coord(&clust->hint->ext_coord.coord,
46123 + &data, &key, clust->hint->ext_coord.lh, 0);
46124 + return result;
46125 +}
46126 +
46127 +static int
46128 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46129 + struct inode *inode)
46130 +{
46131 + int result;
46132 + carry_pool *pool;
46133 + carry_level *lowest_level;
46134 + reiser4_item_data *data;
46135 + carry_op *op;
46136 + int cluster_shift = inode_cluster_shift(inode);
46137 +
46138 + pool =
46139 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46140 + sizeof(*data));
46141 + if (IS_ERR(pool))
46142 + return PTR_ERR(pool);
46143 + lowest_level = (carry_level *) (pool + 1);
46144 + init_carry_level(lowest_level, pool);
46145 + data = (reiser4_item_data *) (lowest_level + 3);
46146 +
46147 + assert("edward-466", coord->between == AFTER_ITEM
46148 + || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46149 + || coord->between == EMPTY_NODE
46150 + || coord->between == BEFORE_UNIT);
46151 +
46152 + if (coord->between == AFTER_UNIT) {
46153 + coord->unit_pos = 0;
46154 + coord->between = AFTER_ITEM;
46155 + }
46156 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46157 + 0 /* operate directly on coord -> node */);
46158 + if (IS_ERR(op) || (op == NULL)) {
46159 + done_carry_pool(pool);
46160 + return RETERR(op ? PTR_ERR(op) : -EIO);
46161 + }
46162 + data->user = 0;
46163 + data->iplug = item_plugin_by_id(CTAIL_ID);
46164 + data->arg = &cluster_shift;
46165 +
46166 + data->length = 0;
46167 + data->data = NULL;
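+	/*
+	 * The item data above only supplies the item plugin and the
+	 * cluster shift; the bytes to insert are carried by the flow @f.
+	 */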
46168 +
46169 + op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46170 + op->u.insert_flow.insert_point = coord;
46171 + op->u.insert_flow.flow = f;
46172 + op->u.insert_flow.data = data;
46173 + op->u.insert_flow.new_nodes = 0;
46174 +
46175 + lowest_level->track_type = CARRY_TRACK_CHANGE;
46176 + lowest_level->tracked = lh;
46177 +
46178 + result = reiser4_carry(lowest_level, NULL);
46179 + done_carry_pool(pool);
46180 +
46181 + return result;
46182 +}
46183 +
46184 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46185 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
46186 + lock_handle * lh, flow_t * f,
46187 + struct inode *inode)
46188 +{
46189 + int ret;
46190 + coord_t pos;
46191 + lock_handle lock;
46192 +
46193 + assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46194 + assert("edward-484", coord->between == AT_UNIT
46195 + || coord->between == AFTER_ITEM);
46196 + assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46197 +
46198 + coord_dup(&pos, coord);
46199 + pos.unit_pos = 0;
46200 + pos.between = AFTER_ITEM;
46201 +
46202 + init_lh(&lock);
46203 + copy_lh(&lock, lh);
46204 +
46205 + ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
46206 + done_lh(&lock);
46207 + assert("edward-1347", znode_is_write_locked(lh->node));
46208 + assert("edward-1228", !ret);
46209 + return ret;
46210 +}
46211 +
46212 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46213 +static int overwrite_ctail(coord_t * coord, flow_t * f)
46214 +{
46215 + unsigned count;
46216 +
46217 + assert("edward-269", f->user == 0);
46218 + assert("edward-270", f->data != NULL);
46219 + assert("edward-271", f->length > 0);
46220 + assert("edward-272", coord_is_existing_unit(coord));
46221 + assert("edward-273", coord->unit_pos == 0);
46222 + assert("edward-274", znode_is_write_locked(coord->node));
46223 + assert("edward-275", reiser4_schedulable());
46224 + assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46225 + assert("edward-1243", ctail_ok(coord));
46226 +
46227 + count = nr_units_ctail(coord);
46228 +
46229 + if (count > f->length)
46230 + count = f->length;
46231 + memcpy(first_unit(coord), f->data, count);
46232 + move_flow_forward(f, count);
46233 + coord->unit_pos += count;
46234 + return 0;
46235 +}
46236 +
46237 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46238 + cut ctail (part or whole) starting from next unit position */
46239 +static int cut_ctail(coord_t * coord)
46240 +{
46241 + coord_t stop;
46242 +
46243 + assert("edward-435", coord->between == AT_UNIT &&
46244 + coord->item_pos < coord_num_items(coord) &&
46245 + coord->unit_pos <= coord_num_units(coord));
46246 +
46247 + if (coord->unit_pos == coord_num_units(coord))
46248 + /* nothing to cut */
46249 + return 0;
46250 + coord_dup(&stop, coord);
46251 + stop.unit_pos = coord_last_unit_pos(coord);
46252 +
46253 + return cut_node_content(coord, &stop, NULL, NULL, NULL);
46254 +}
46255 +
46256 +int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
46257 + struct inode * inode)
46258 +{
46259 + int result;
46260 + assert("edward-1244", inode != NULL);
46261 + assert("edward-1245", clust->hint != NULL);
46262 + assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46263 + assert("edward-1247", clust->reserved == 1);
46264 +
46265 + result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46266 + if (cbk_errored(result))
46267 + return result;
46268 + assert("edward-1249", result == CBK_COORD_NOTFOUND);
46269 + assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46270 +
46271 + assert("edward-1295",
46272 + clust->hint->ext_coord.lh->node ==
46273 + clust->hint->ext_coord.coord.node);
46274 +
46275 + coord_set_between_clusters(&clust->hint->ext_coord.coord);
46276 +
46277 + result = insert_unprepped_ctail(clust, inode);
46278 + all_grabbed2free();
46279 +
46280 + assert("edward-1251", !result);
46281 + assert("edward-1252", cryptcompress_inode_ok(inode));
46282 + assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46283 + assert("edward-1254",
46284 + reiser4_clustered_blocks(reiser4_get_current_sb()));
46285 + assert("edward-1255",
46286 + znode_convertible(clust->hint->ext_coord.coord.node));
46287 +
46288 + return result;
46289 +}
46290 +
46291 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
46292 +{
46293 + int result = 0;
46294 + struct convert_item_info * info;
46295 +
46296 + assert("edward-468", pos != NULL);
46297 + assert("edward-469", pos->sq != NULL);
46298 + assert("edward-845", item_convert_data(pos) != NULL);
46299 +
46300 + info = item_convert_data(pos);
46301 + assert("edward-679", info->flow.data != NULL);
46302 +
46303 + switch (mode) {
46304 + case CRC_APPEND_ITEM:
46305 + assert("edward-1229", info->flow.length != 0);
46306 + assert("edward-1256",
46307 + cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46308 + result =
46309 + insert_cryptcompress_flow_in_place(&pos->coord,
46310 + &pos->lock,
46311 + &info->flow,
46312 + info->inode);
46313 + break;
46314 + case CRC_OVERWRITE_ITEM:
46315 + assert("edward-1230", info->flow.length != 0);
46316 + overwrite_ctail(&pos->coord, &info->flow);
46317 + if (info->flow.length != 0)
46318 + break;
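+		/* fall through: the flow is exhausted, cut the rest */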
46319 + case CRC_CUT_ITEM:
46320 + assert("edward-1231", info->flow.length == 0);
46321 + result = cut_ctail(&pos->coord);
46322 + break;
46323 + default:
46324 + result = RETERR(-EIO);
46325 + impossible("edward-244", "bad convert mode");
46326 + }
46327 + return result;
46328 +}
46329 +
46330 +/* plugin->u.item.f.scan */
46331 +int scan_ctail(flush_scan * scan)
46332 +{
46333 + int result = 0;
46334 + struct page *page;
46335 + struct inode *inode;
46336 + jnode *node = scan->node;
46337 +
46338 + assert("edward-227", scan->node != NULL);
46339 + assert("edward-228", jnode_is_cluster_page(scan->node));
46340 + assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46341 +
46342 + page = jnode_page(node);
46343 + inode = page->mapping->host;
46344 +
46345 + if (!reiser4_scanning_left(scan))
46346 + return result;
46347 + if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46348 + znode_make_dirty(scan->parent_lock.node);
46349 +
46350 + if (!znode_convertible(scan->parent_lock.node)) {
46351 + if (JF_ISSET(scan->node, JNODE_DIRTY))
46352 + znode_set_convertible(scan->parent_lock.node);
46353 + else {
46354 + warning("edward-681",
46355 + "cluster page is already processed");
46356 + return -EAGAIN;
46357 + }
46358 + }
46359 + return result;
46360 +}
46361 +
46362 +/* When this returns true, the leftmost dirty child has been attached at pos->child */
46363 +static int should_attach_convert_idata(flush_pos_t * pos)
46364 +{
46365 + int result;
46366 + assert("edward-431", pos != NULL);
46367 + assert("edward-432", pos->child == NULL);
46368 + assert("edward-619", znode_is_write_locked(pos->coord.node));
46369 + assert("edward-470",
46370 + item_plugin_by_coord(&pos->coord) ==
46371 + item_plugin_by_id(CTAIL_ID));
46372 +
46373 + /* check for leftmost child */
46374 + utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46375 +
46376 + if (!pos->child)
46377 + return 0;
46378 + spin_lock_jnode(pos->child);
46379 + result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46380 + pos->child->atom == ZJNODE(pos->coord.node)->atom);
46381 + spin_unlock_jnode(pos->child);
46382 + if (!result && pos->child) {
46383 +		/* the existing child is not to be attached; release it */
46384 + jput(pos->child);
46385 + pos->child = NULL;
46386 + }
46387 + return result;
46388 +}
46389 +
46390 +/* plugin->init_convert_data() */
46391 +static int
46392 +init_convert_data_ctail(struct convert_item_info * idata, struct inode *inode)
46393 +{
46394 + assert("edward-813", idata != NULL);
46395 + assert("edward-814", inode != NULL);
46396 +
46397 + idata->inode = inode;
46398 + idata->d_cur = DC_FIRST_ITEM;
46399 + idata->d_next = DC_INVALID_STATE;
46400 +
46401 + return 0;
46402 +}
46403 +
46404 +static int alloc_item_convert_data(struct convert_info * sq)
46405 +{
46406 + assert("edward-816", sq != NULL);
46407 + assert("edward-817", sq->itm == NULL);
46408 +
46409 + sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
46410 + if (sq->itm == NULL)
46411 + return RETERR(-ENOMEM);
46412 + return 0;
46413 +}
46414 +
46415 +static void free_item_convert_data(struct convert_info * sq)
46416 +{
46417 + assert("edward-818", sq != NULL);
46418 + assert("edward-819", sq->itm != NULL);
46419 + assert("edward-820", sq->iplug != NULL);
46420 +
46421 + kfree(sq->itm);
46422 + sq->itm = NULL;
46423 + return;
46424 +}
46425 +
46426 +static int alloc_convert_data(flush_pos_t * pos)
46427 +{
46428 + assert("edward-821", pos != NULL);
46429 + assert("edward-822", pos->sq == NULL);
46430 +
46431 + pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
46432 + if (!pos->sq)
46433 + return RETERR(-ENOMEM);
46434 + memset(pos->sq, 0, sizeof(*pos->sq));
46435 + cluster_init_write(&pos->sq->clust, NULL);
46436 + return 0;
46437 +}
46438 +
46439 +void free_convert_data(flush_pos_t * pos)
46440 +{
46441 + struct convert_info *sq;
46442 +
46443 + assert("edward-823", pos != NULL);
46444 + assert("edward-824", pos->sq != NULL);
46445 +
46446 + sq = pos->sq;
46447 + if (sq->itm)
46448 + free_item_convert_data(sq);
46449 + put_cluster_handle(&sq->clust);
46450 + kfree(pos->sq);
46451 + pos->sq = NULL;
46452 + return;
46453 +}
46454 +
46455 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46456 +{
46457 + struct convert_info *sq;
46458 +
46459 + assert("edward-825", pos != NULL);
46460 + assert("edward-826", pos->sq != NULL);
46461 + assert("edward-827", item_convert_data(pos) != NULL);
46462 + assert("edward-828", inode != NULL);
46463 +
46464 + sq = pos->sq;
46465 +
46466 + memset(sq->itm, 0, sizeof(*sq->itm));
46467 +
46468 + /* iplug->init_convert_data() */
46469 + return init_convert_data_ctail(sq->itm, inode);
46470 +}
46471 +
46472 +/* create and attach disk cluster info used by 'convert' phase of the flush
46473 + squalloc() */
46474 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46475 +{
46476 + int ret = 0;
46477 + struct convert_item_info *info;
46478 + struct cluster_handle *clust;
46479 + file_plugin *fplug = inode_file_plugin(inode);
46480 + compression_plugin *cplug = inode_compression_plugin(inode);
46481 +
46482 + assert("edward-248", pos != NULL);
46483 + assert("edward-249", pos->child != NULL);
46484 + assert("edward-251", inode != NULL);
46485 + assert("edward-682", cryptcompress_inode_ok(inode));
46486 + assert("edward-252",
46487 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
46488 + assert("edward-473",
46489 + item_plugin_by_coord(&pos->coord) ==
46490 + item_plugin_by_id(CTAIL_ID));
46491 +
46492 + if (!pos->sq) {
46493 + ret = alloc_convert_data(pos);
46494 + if (ret)
46495 + return ret;
46496 + }
46497 + clust = &pos->sq->clust;
46498 + ret = grab_coa(&clust->tc, cplug);
46499 + if (ret)
46500 + goto err;
46501 + ret = set_cluster_by_page(clust,
46502 + jnode_page(pos->child),
46503 + MAX_CLUSTER_NRPAGES);
46504 + if (ret)
46505 + goto err;
46506 +
46507 + assert("edward-829", pos->sq != NULL);
46508 + assert("edward-250", item_convert_data(pos) == NULL);
46509 +
46510 + pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46511 +
46512 + ret = alloc_item_convert_data(pos->sq);
46513 + if (ret)
46514 + goto err;
46515 + ret = init_item_convert_data(pos, inode);
46516 + if (ret)
46517 + goto err;
46518 + info = item_convert_data(pos);
46519 +
46520 + ret = checkout_logical_cluster(clust, pos->child, inode);
46521 + if (ret)
46522 + goto err;
46523 +
46524 + reiser4_deflate_cluster(clust, inode);
46525 + inc_item_convert_count(pos);
46526 +
46527 + /* prepare flow for insertion */
46528 + fplug->flow_by_inode(info->inode,
46529 + (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46530 + 0 /* kernel space */ ,
46531 + clust->tc.len,
46532 + clust_to_off(clust->index, inode),
46533 + WRITE_OP, &info->flow);
46534 + jput(pos->child);
46535 +
46536 + assert("edward-683", cryptcompress_inode_ok(inode));
46537 + return 0;
46538 + err:
46539 + jput(pos->child);
46540 + free_convert_data(pos);
46541 + return ret;
46542 +}
46543 +
46544 +/* clear up disk cluster info */
46545 +static void detach_convert_idata(struct convert_info * sq)
46546 +{
46547 + struct convert_item_info *info;
46548 +
46549 + assert("edward-253", sq != NULL);
46550 + assert("edward-840", sq->itm != NULL);
46551 +
46552 + info = sq->itm;
46553 + assert("edward-255", info->inode != NULL);
46554 + assert("edward-1212", info->flow.length == 0);
46555 +
46556 + free_item_convert_data(sq);
46557 + return;
46558 +}
46559 +
46560 +/* plugin->u.item.f.utmost_child */
46561 +
46562 +/* This function sets *child to the leftmost child of the first disk
46563 +   cluster item if that child exists, and to NULL in other cases.
46564 +   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46565 +
46566 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
46567 +{
46568 + reiser4_key key;
46569 +
46570 +	assert("edward-257", coord != NULL);
46571 +	assert("edward-258", child != NULL);
46572 +	assert("edward-259", side == LEFT_SIDE);
46573 +	assert("edward-260",
46574 +	       item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46575 +
46576 +	item_key_by_coord(coord, &key);
46577 +
46578 + if (!is_disk_cluster_key(&key, coord))
46579 + *child = NULL;
46580 + else
46581 + *child = jlookup(current_tree,
46582 + get_key_objectid(item_key_by_coord
46583 + (coord, &key)),
46584 + off_to_pg(get_key_offset(&key)));
46585 + return 0;
46586 +}
46587 +
46588 +/* Returns true if @p2 is the item following @p1
46589 +   in the _same_ disk cluster.
46590 +   A disk cluster is a set of items. If ->clustered() != NULL,
46591 +   the whole disk cluster should be read/modified along with each item
46592 +*/
46593 +
46594 +/* Go rightward and check for the next disk cluster item; set
46595 + * d_next to DC_CHAINED_ITEM if such an item exists.
46596 + * If the current position is the last item, go to the right neighbor.
46597 + * Skip empty nodes. Note that a right neighbor may not be in
46598 + * the slum because of races; if so, make it dirty and
46599 + * convertible.
46600 + */
46601 +static int next_item_dc_stat(flush_pos_t * pos)
46602 +{
46603 + int ret = 0;
46604 + int stop = 0;
46605 + znode *cur;
46606 + coord_t coord;
46607 + lock_handle lh;
46608 + lock_handle right_lock;
46609 +
46610 + assert("edward-1232", !node_is_empty(pos->coord.node));
46611 + assert("edward-1014",
46612 + pos->coord.item_pos < coord_num_items(&pos->coord));
46613 + assert("edward-1015", chaining_data_present(pos));
46614 + assert("edward-1017",
46615 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
46616 +
46617 + item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
46618 +
46619 + if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
46620 + return ret;
46621 + if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
46622 + return ret;
46623 +
46624 +	/* Check the next slum item.
46625 +	 * Note that it cannot be killed by a concurrent truncate,
46626 +	 * as truncate would need the lock we are holding.
46627 +	 */
46628 + init_lh(&right_lock);
46629 + cur = pos->coord.node;
46630 +
46631 + while (!stop) {
46632 + init_lh(&lh);
46633 + ret = reiser4_get_right_neighbor(&lh,
46634 + cur,
46635 + ZNODE_WRITE_LOCK,
46636 + GN_CAN_USE_UPPER_LEVELS);
46637 + if (ret)
46638 + break;
46639 + ret = zload(lh.node);
46640 + if (ret) {
46641 + done_lh(&lh);
46642 + break;
46643 + }
46644 + coord_init_before_first_item(&coord, lh.node);
46645 +
46646 + if (node_is_empty(lh.node)) {
46647 + znode_make_dirty(lh.node);
46648 + znode_set_convertible(lh.node);
46649 + stop = 0;
46650 + } else if (same_disk_cluster(&pos->coord, &coord)) {
46651 +
46652 + item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
46653 +
46654 + if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
46655 + /*
46656 + warning("edward-1024",
46657 + "next slum item mergeable, "
46658 + "but znode %p isn't dirty\n",
46659 + lh.node);
46660 + */
46661 + znode_make_dirty(lh.node);
46662 + }
46663 + if (!znode_convertible(lh.node)) {
46664 + /*
46665 + warning("edward-1272",
46666 + "next slum item mergeable, "
46667 + "but znode %p isn't convertible\n",
46668 + lh.node);
46669 + */
46670 + znode_set_convertible(lh.node);
46671 + }
46672 + stop = 1;
46673 + } else
46674 + stop = 1;
46675 + zrelse(lh.node);
46676 + done_lh(&right_lock);
46677 + copy_lh(&right_lock, &lh);
46678 + done_lh(&lh);
46679 + cur = right_lock.node;
46680 + }
46681 + done_lh(&right_lock);
46682 +
46683 + if (ret == -E_NO_NEIGHBOR)
46684 + ret = 0;
46685 + return ret;
46686 +}
46687 +
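+/*
+ * Pick the conversion mode; roughly:
+ *
+ *	flow left?	position		mode
+ *	yes		first/chained item	CRC_OVERWRITE_ITEM
+ *	yes		past the cluster	CRC_APPEND_ITEM
+ *	no		first/chained item	CRC_CUT_ITEM
+ *	no		past the cluster	done (returns 1)
+ */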
46688 +static int
46689 +assign_convert_mode(struct convert_item_info * idata,
46690 + cryptcompress_write_mode_t * mode)
46691 +{
46692 + int result = 0;
46693 +
46694 + assert("edward-1025", idata != NULL);
46695 +
46696 + if (idata->flow.length) {
46697 + /* append or overwrite */
46698 + switch (idata->d_cur) {
46699 + case DC_FIRST_ITEM:
46700 + case DC_CHAINED_ITEM:
46701 + *mode = CRC_OVERWRITE_ITEM;
46702 + break;
46703 + case DC_AFTER_CLUSTER:
46704 + *mode = CRC_APPEND_ITEM;
46705 + break;
46706 + default:
46707 + impossible("edward-1018", "wrong current item state");
46708 + }
46709 + } else {
46710 + /* cut or invalidate */
46711 + switch (idata->d_cur) {
46712 + case DC_FIRST_ITEM:
46713 + case DC_CHAINED_ITEM:
46714 + *mode = CRC_CUT_ITEM;
46715 + break;
46716 + case DC_AFTER_CLUSTER:
46717 + result = 1;
46718 + break;
46719 + default:
46720 + impossible("edward-1019", "wrong current item state");
46721 + }
46722 + }
46723 + return result;
46724 +}
46725 +
46726 +/* plugin->u.item.f.convert */
46727 +/* write ctail in guessed mode */
46728 +int convert_ctail(flush_pos_t * pos)
46729 +{
46730 + int result;
46731 + int nr_items;
46732 + cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
46733 +
46734 + assert("edward-1020", pos != NULL);
46735 + assert("edward-1213", coord_num_items(&pos->coord) != 0);
46736 + assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
46737 + assert("edward-1258", ctail_ok(&pos->coord));
46738 + assert("edward-261", pos->coord.node != NULL);
46739 +
46740 + nr_items = coord_num_items(&pos->coord);
46741 + if (!chaining_data_present(pos)) {
46742 + if (should_attach_convert_idata(pos)) {
46743 + /* attach convert item info */
46744 + struct inode *inode;
46745 +
46746 + assert("edward-264", pos->child != NULL);
46747 + assert("edward-265", jnode_page(pos->child) != NULL);
46748 + assert("edward-266",
46749 + jnode_page(pos->child)->mapping != NULL);
46750 +
46751 + inode = jnode_page(pos->child)->mapping->host;
46752 +
46753 + assert("edward-267", inode != NULL);
46754 +
46755 +			/* attach item convert info by the child and release it */
46756 + result = attach_convert_idata(pos, inode);
46757 + pos->child = NULL;
46758 + if (result == -E_REPEAT) {
46759 +				/* jnode became clean, or there are no dirty
46760 +				   pages (nothing to update in the disk cluster) */
46761 + warning("edward-1021",
46762 + "convert_ctail: nothing to attach");
46763 + return 0;
46764 + }
46765 + if (result != 0)
46766 + return result;
46767 + } else
46768 + /* unconvertible */
46769 + return 0;
46770 + } else {
46771 + /* use old convert info */
46772 +
46773 + struct convert_item_info *idata;
46774 +
46775 + idata = item_convert_data(pos);
46776 +
46777 + result = assign_convert_mode(idata, &mode);
46778 + if (result) {
46779 + /* disk cluster is over,
46780 + nothing to update anymore */
46781 + detach_convert_idata(pos->sq);
46782 + return 0;
46783 + }
46784 + }
46785 +
46786 + assert("edward-433", chaining_data_present(pos));
46787 + assert("edward-1022",
46788 + pos->coord.item_pos < coord_num_items(&pos->coord));
46789 +
46790 + /* check if next item is of current disk cluster */
46791 + result = next_item_dc_stat(pos);
46792 + if (result) {
46793 + detach_convert_idata(pos->sq);
46794 + return result;
46795 + }
46796 + result = do_convert_ctail(pos, mode);
46797 + if (result) {
46798 + detach_convert_idata(pos->sq);
46799 + return result;
46800 + }
46801 + switch (mode) {
46802 + case CRC_CUT_ITEM:
46803 + assert("edward-1214", item_convert_data(pos)->flow.length == 0);
46804 + assert("edward-1215",
46805 + coord_num_items(&pos->coord) == nr_items ||
46806 + coord_num_items(&pos->coord) == nr_items - 1);
46807 + if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
46808 + break;
46809 + if (coord_num_items(&pos->coord) != nr_items) {
46810 + /* the item was killed, no more chained items */
46811 + detach_convert_idata(pos->sq);
46812 + if (!node_is_empty(pos->coord.node))
46813 + /* make sure the next item will be scanned */
46814 + coord_init_before_item(&pos->coord);
46815 + break;
46816 + }
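+		/* fall through */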
46817 + case CRC_APPEND_ITEM:
46818 + assert("edward-434", item_convert_data(pos)->flow.length == 0);
46819 + detach_convert_idata(pos->sq);
46820 + break;
46821 + case CRC_OVERWRITE_ITEM:
46822 + if (coord_is_unprepped_ctail(&pos->coord)) {
46823 +			/* convert unprepped ctail to prepped one */
46824 + int shift;
46825 + shift =
46826 + inode_cluster_shift(item_convert_data(pos)->inode);
46827 + assert("edward-1259", cluster_shift_ok(shift));
46828 + put_unaligned((d8)shift,
46829 + &ctail_formatted_at(&pos->coord)->
46830 + cluster_shift);
46831 + }
46832 + break;
46833 + }
46834 + return result;
46835 +}
46836 +
46837 +/* Make Linus happy.
46838 + Local variables:
46839 + c-indentation-style: "K&R"
46840 + mode-name: "LC"
46841 + c-basic-offset: 8
46842 + tab-width: 8
46843 + fill-column: 120
46844 + End:
46845 +*/
46846 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.22/fs/reiser4/plugin/item/ctail.h
46847 --- linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
46848 +++ linux-2.6.22/fs/reiser4/plugin/item/ctail.h 2007-07-29 00:25:34.948715113 +0400
46849 @@ -0,0 +1,102 @@
46850 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46851 +
46852 +/* Ctail items are fragments (or bodies) of a special type that provide
46853 +   optimal storage of encrypted and/or compressed files. */
46854 +
46855 +
46856 +#if !defined( __FS_REISER4_CTAIL_H__ )
46857 +#define __FS_REISER4_CTAIL_H__
46858 +
46859 +/* Disk format of ctail item */
46860 +typedef struct ctail_item_format {
46861 + /* packed shift;
46862 + if its value is different from UCTAIL_SHIFT (see below), then
46863 + size of disk cluster is calculated as (1 << cluster_shift) */
46864 + d8 cluster_shift;
46865 + /* ctail body */
46866 + d8 body[0];
46867 +} __attribute__ ((packed)) ctail_item_format;
46868 +
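+/* For example, cluster_shift == 12 describes a prepped disk cluster
+   of (1 << 12) == 4K bytes */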
46869 +/* "Unprepped" disk cluster is represented by a single ctail item
46870 + with the following "magic" attributes: */
46871 +/* "magic" cluster_shift */
46872 +#define UCTAIL_SHIFT 0xff
46873 +/* How many units unprepped ctail item has */
46874 +#define UCTAIL_NR_UNITS 1
46875 +
46876 +/* The following is a set of various item states in a disk cluster.
46877 + Disk cluster is a set of items whose keys belong to the interval
46878 + [dc_key , dc_key + disk_cluster_size - 1] */
46879 +typedef enum {
46880 + DC_INVALID_STATE = 0,
46881 + DC_FIRST_ITEM = 1,
46882 + DC_CHAINED_ITEM = 2,
46883 + DC_AFTER_CLUSTER = 3
46884 +} dc_item_stat;
46885 +
46886 +/* ctail-specific extension.
46887 +   In particular this describes parameters of the disk cluster an item belongs to */
46888 +struct ctail_coord_extension {
46889 + int shift; /* this contains cluster_shift extracted from
46890 + ctail_item_format (above), or UCTAIL_SHIFT
46891 + (the last one is the "magic" of unprepped disk clusters)*/
46892 + int dsize; /* size of a prepped disk cluster */
46893 + int ncount; /* count of nodes occupied by a disk cluster */
46894 +};
46895 +
46896 +struct cut_list;
46897 +
46898 +/* plugin->item.b.* */
46899 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
46900 + const reiser4_item_data *);
46901 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
46902 +pos_in_node_t nr_units_ctail(const coord_t * coord);
46903 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
46904 +void print_ctail(const char *prefix, coord_t * coord);
46905 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
46906 +
46907 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
46908 + carry_plugin_info * info UNUSED_ARG);
46909 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
46910 +int can_shift_ctail(unsigned free_space, coord_t * coord,
46911 + znode * target, shift_direction pend, unsigned *size,
46912 + unsigned want);
46913 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
46914 + unsigned count, shift_direction where_is_free_space,
46915 + unsigned free_space);
46916 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46917 + carry_cut_data *, reiser4_key * smallest_removed,
46918 + reiser4_key * new_first);
46919 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46920 + carry_kill_data *, reiser4_key * smallest_removed,
46921 + reiser4_key * new_first);
46922 +int ctail_ok(const coord_t * coord);
46923 +int check_ctail(const coord_t * coord, const char **error);
46924 +
46925 +/* plugin->u.item.s.* */
46926 +int read_ctail(struct file *, flow_t *, hint_t *);
46927 +int readpage_ctail(void *, struct page *);
46928 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
46929 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
46930 +int create_hook_ctail(const coord_t * coord, void *arg);
46931 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
46932 + carry_kill_data *);
46933 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
46934 +
46935 +/* plugin->u.item.f */
46936 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
46937 +int scan_ctail(flush_scan *);
46938 +int convert_ctail(flush_pos_t *);
46939 +size_t inode_scaled_cluster_size(struct inode *);
46940 +
46941 +#endif /* __FS_REISER4_CTAIL_H__ */
46942 +
46943 +/* Make Linus happy.
46944 + Local variables:
46945 + c-indentation-style: "K&R"
46946 + mode-name: "LC"
46947 + c-basic-offset: 8
46948 + tab-width: 8
46949 + fill-column: 120
46950 + End:
46951 +*/
46952 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent.c linux-2.6.22/fs/reiser4/plugin/item/extent.c
46953 --- linux-2.6.22.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
46954 +++ linux-2.6.22/fs/reiser4/plugin/item/extent.c 2007-07-29 00:25:34.948715113 +0400
46955 @@ -0,0 +1,197 @@
46956 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46957 +
46958 +#include "item.h"
46959 +#include "../../key.h"
46960 +#include "../../super.h"
46961 +#include "../../carry.h"
46962 +#include "../../inode.h"
46963 +#include "../../page_cache.h"
46964 +#include "../../flush.h"
46965 +#include "../object.h"
46966 +
46967 +/* prepare structure reiser4_item_data. It is used to put @nr_extents extent units into the tree */
46968 +/* Audited by: green(2002.06.13) */
46969 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
46970 + int nr_extents)
46971 +{
46972 + data->data = ext_unit;
46973 + /* data->data is kernel space */
46974 + data->user = 0;
46975 + data->length = sizeof(reiser4_extent) * nr_extents;
46976 + data->arg = NULL;
46977 + data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
46978 + return data;
46979 +}
46980 +
46981 +/* how many bytes are addressed by @nr first extents of the extent item */
46982 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
46983 +{
46984 + pos_in_node_t i;
46985 + reiser4_block_nr blocks;
46986 + reiser4_extent *ext;
46987 +
46988 + ext = item_body_by_coord(coord);
46989 + assert("vs-263", nr <= nr_units_extent(coord));
46990 +
46991 + blocks = 0;
46992 + for (i = 0; i < nr; i++, ext++) {
46993 + blocks += extent_get_width(ext);
46994 + }
46995 +
46996 + return blocks * current_blocksize;
46997 +}
46998 +
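+/*
+ * The extent start encodes the unit state: 0 is a hole, 1 is an
+ * unallocated extent (blocks still have fake numbers), anything else
+ * is the first block of an allocated extent.
+ */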
46999 +extent_state state_of_extent(reiser4_extent * ext)
47000 +{
47001 + switch ((int)extent_get_start(ext)) {
47002 + case 0:
47003 + return HOLE_EXTENT;
47004 + case 1:
47005 + return UNALLOCATED_EXTENT;
47006 + default:
47007 + break;
47008 + }
47009 + return ALLOCATED_EXTENT;
47010 +}
47011 +
47012 +int extent_is_unallocated(const coord_t * item)
47013 +{
47014 + assert("jmacd-5133", item_is_extent(item));
47015 +
47016 + return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47017 +}
47018 +
47019 +/* set extent's start and width */
47020 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
47021 + reiser4_block_nr width)
47022 +{
47023 + extent_set_start(ext, start);
47024 + extent_set_width(ext, width);
47025 +}
47026 +
47027 +/**
47028 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
47029 + * @h: replace handle carrying the coord of the extent unit to be
47030 + *     overwritten, its lock handle, the paste key, the replacement
47031 + *     extent (h->overwrite) and the new extents to paste
47032 + * @return_inserted_position: see below
47033 + *
47034 + * Overwrites one extent unit and pastes one or two more after the
47035 + * overwritten one. If @return_inserted_position is 1, h->coord and
47036 + * h->lh are returned set to the first of the newly inserted units;
47037 + * if it is 0, they are returned set to the extent which was overwritten.
47041 + */
47042 +int reiser4_replace_extent(struct replace_handle *h,
47043 + int return_inserted_position)
47044 +{
47045 + int result;
47046 + znode *orig_znode;
47047 + /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47048 +
47049 + assert("vs-990", coord_is_existing_unit(h->coord));
47050 + assert("vs-1375", znode_is_write_locked(h->coord->node));
47051 + assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47052 + assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47053 + assert("vs-1427", ergo(h->nr_new_extents == 2,
47054 + extent_get_width(&h->new_extents[1]) != 0));
47055 +
47056 + /* compose structure for paste */
47057 + init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47058 +
47059 + coord_dup(&h->coord_after, h->coord);
47060 + init_lh(&h->lh_after);
47061 + copy_lh(&h->lh_after, h->lh);
47062 + reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47063 + reiser4_tap_monitor(&h->watch);
47064 +
47065 + ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47066 + orig_znode = h->coord->node;
47067 +
47068 +#if REISER4_DEBUG
47069 + /* make sure that key is set properly */
47070 + unit_key_by_coord(h->coord, &h->tmp);
47071 + set_key_offset(&h->tmp,
47072 + get_key_offset(&h->tmp) +
47073 + extent_get_width(&h->overwrite) * current_blocksize);
47074 + assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47075 +#endif
47076 +
47077 + /* set insert point after unit to be replaced */
47078 + h->coord->between = AFTER_UNIT;
47079 +
47080 + result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47081 + &h->paste_key, &h->item, h->flags);
47082 + if (!result) {
47083 + /* now we have to replace the unit after which new units were
47084 + inserted. Its position is tracked by @watch */
47085 + reiser4_extent *ext;
47086 + znode *node;
47087 +
47088 + node = h->coord_after.node;
47089 + if (node != orig_znode) {
47090 + coord_clear_iplug(&h->coord_after);
47091 + result = zload(node);
47092 + }
47093 +
47094 + if (likely(!result)) {
47095 + ext = extent_by_coord(&h->coord_after);
47096 +
47097 + assert("vs-987", znode_is_loaded(node));
47098 + assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47099 +
47100 + /* overwrite extent unit */
47101 + memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47102 + znode_make_dirty(node);
47103 +
47104 + if (node != orig_znode)
47105 + zrelse(node);
47106 +
47107 + if (return_inserted_position == 0) {
47108 + /* coord and lh are to be set to overwritten
47109 + extent */
47110 + assert("vs-1662",
47111 + WITH_DATA(node, !memcmp(&h->overwrite,
47112 + extent_by_coord(
47113 + &h->coord_after),
47114 + sizeof(reiser4_extent))));
47115 +
47116 + *h->coord = h->coord_after;
47117 + done_lh(h->lh);
47118 + copy_lh(h->lh, &h->lh_after);
47119 + } else {
47120 + /* h->coord and h->lh are to be set to first of
47121 + inserted units */
47122 + assert("vs-1663",
47123 + WITH_DATA(h->coord->node,
47124 + !memcmp(&h->new_extents[0],
47125 + extent_by_coord(h->coord),
47126 + sizeof(reiser4_extent))));
47127 + assert("vs-1664", h->lh->node == h->coord->node);
47128 + }
47129 + }
47130 + }
47131 + reiser4_tap_done(&h->watch);
47132 +
47133 + return result;
47134 +}
47135 +
47136 +lock_handle *znode_lh(znode *node)
47137 +{
47138 + assert("vs-1371", znode_is_write_locked(node));
47139 + assert("vs-1372", znode_is_wlocked_once(node));
47140 + return list_entry(node->lock.owners.next, lock_handle, owners_link);
47141 +}
47142 +
47143 +/*
47144 + * Local variables:
47145 + * c-indentation-style: "K&R"
47146 + * mode-name: "LC"
47147 + * c-basic-offset: 8
47148 + * tab-width: 8
47149 + * fill-column: 79
47150 + * scroll-step: 1
47151 + * End:
47152 + */
47153 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.22/fs/reiser4/plugin/item/extent_file_ops.c
47154 --- linux-2.6.22.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
47155 +++ linux-2.6.22/fs/reiser4/plugin/item/extent_file_ops.c 2007-07-29 00:25:34.952716148 +0400
47156 @@ -0,0 +1,1453 @@
47157 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47158 +
47159 +#include "item.h"
47160 +#include "../../inode.h"
47161 +#include "../../page_cache.h"
47162 +#include "../object.h"
47163 +
47164 +#include <linux/quotaops.h>
47165 +#include <linux/swap.h>
47166 +
47167 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47168 +{
47169 + reiser4_extent *ext;
47170 +
47171 + ext = (reiser4_extent *) (zdata(node) + offset);
47172 + return ext;
47173 +}
47174 +
47175 +/**
47176 + * check_uf_coord - verify coord extension
47177 + * @uf_coord:
47178 + * @key:
47179 + *
47180 + * Makes sure that all fields of @uf_coord are set properly. If @key is
47181 + * specified - check whether @uf_coord is set correspondingly.
47182 + */
47183 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47184 +{
47185 +#if REISER4_DEBUG
47186 + const coord_t *coord;
47187 + const struct extent_coord_extension *ext_coord;
47188 + reiser4_extent *ext;
47189 +
47190 + coord = &uf_coord->coord;
47191 + ext_coord = &uf_coord->extension.extent;
47192 + ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47193 +
47194 + assert("",
47195 + WITH_DATA(coord->node,
47196 + (uf_coord->valid == 1 &&
47197 + coord_is_iplug_set(coord) &&
47198 + item_is_extent(coord) &&
47199 + ext_coord->nr_units == nr_units_extent(coord) &&
47200 + ext == extent_by_coord(coord) &&
47201 + ext_coord->width == extent_get_width(ext) &&
47202 + coord->unit_pos < ext_coord->nr_units &&
47203 + ext_coord->pos_in_unit < ext_coord->width &&
47204 + memcmp(ext, &ext_coord->extent,
47205 + sizeof(reiser4_extent)) == 0)));
47206 + if (key) {
47207 + reiser4_key coord_key;
47208 +
47209 + unit_key_by_coord(&uf_coord->coord, &coord_key);
47210 + set_key_offset(&coord_key,
47211 + get_key_offset(&coord_key) +
47212 + (uf_coord->extension.extent.
47213 + pos_in_unit << PAGE_CACHE_SHIFT));
47214 + assert("", keyeq(key, &coord_key));
47215 + }
47216 +#endif
47217 +}
47218 +
47219 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47220 +{
47221 + check_uf_coord(uf_coord, NULL);
47222 +
47223 + return ext_by_offset(uf_coord->coord.node,
47224 + uf_coord->extension.extent.ext_offset);
47225 +}
47226 +
47227 +#if REISER4_DEBUG
47228 +
47229 +/**
47230 + * offset_is_in_unit - check whether an offset falls inside an extent unit
47231 + * @coord: coord of the extent unit
47232 + * @off: file offset to check
47233 + *
47234 + * Returns 1 if offset @off is inside of the extent unit pointed to by @coord
47235 + */
47237 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
47238 +{
47239 + reiser4_key unit_key;
47240 + __u64 unit_off;
47241 + reiser4_extent *ext;
47242 +
47243 + ext = extent_by_coord(coord);
47244 +
47245 + unit_key_extent(coord, &unit_key);
47246 + unit_off = get_key_offset(&unit_key);
47247 + if (off < unit_off)
47248 + return 0;
47249 + if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47250 + return 0;
47251 + return 1;
47252 +}
47253 +
47254 +static int
47255 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47256 +{
47257 + reiser4_key item_key;
47258 +
47259 + assert("vs-771", coord_is_existing_unit(coord));
47260 + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47261 + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47262 +
47263 + return offset_is_in_unit(coord, get_key_offset(key));
47264 +}
47265 +
47266 +#endif
47267 +
47268 +/**
47269 + * can_append - check whether @key continues the item at @coord
47270 + * @key: key to check
47271 + * @coord: coord of an extent item
47272 + *
47273 + * Returns 1 if @key is equal to the append key of the item @coord is set to
47274 + */
47275 +static int can_append(const reiser4_key *key, const coord_t *coord)
47276 +{
47277 + reiser4_key append_key;
47278 +
47279 + return keyeq(key, append_key_extent(coord, &append_key));
47280 +}
47281 +
47282 +/**
47283 + * append_hole - append the last file item with a hole extent
47284 + * @coord: coord of the last item of the file
47285 + * @lh: lock handle of the twig node
47286 + * @key: key of the first byte to be written past the hole
47287 + *
47288 + */
47289 +static int append_hole(coord_t *coord, lock_handle *lh,
47290 + const reiser4_key *key)
47291 +{
47292 + reiser4_key append_key;
47293 + reiser4_block_nr hole_width;
47294 + reiser4_extent *ext, new_ext;
47295 + reiser4_item_data idata;
47296 +
47297 + /* last item of file may have to be appended with hole */
47298 + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47299 + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47300 +
47301 + /* key of first byte which is not addressed by this extent */
47302 + append_key_extent(coord, &append_key);
47303 +
47304 + assert("", keyle(&append_key, key));
47305 +
47306 + /*
47307 + * extent item has to be appended with hole. Calculate length of that
47308 + * hole
47309 + */
47310 + hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47311 + current_blocksize - 1) >> current_blocksize_bits);
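+	/* i.e. the byte gap rounded up to a whole number of blocks */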
47312 + assert("vs-954", hole_width > 0);
47313 +
47314 + /* set coord after last unit */
47315 + coord_init_after_item_end(coord);
47316 +
47317 + /* get last extent in the item */
47318 + ext = extent_by_coord(coord);
47319 + if (state_of_extent(ext) == HOLE_EXTENT) {
47320 + /*
47321 + * last extent of a file is hole extent. Widen that extent by
47322 + * @hole_width blocks. Note that we do not worry about
47323 + * overflowing - extent width is 64 bits
47324 + */
47325 + reiser4_set_extent(ext, HOLE_EXTENT_START,
47326 + extent_get_width(ext) + hole_width);
47327 + znode_make_dirty(coord->node);
47328 + return 0;
47329 + }
47330 +
47331 + /* append last item of the file with hole extent unit */
47332 + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47333 + state_of_extent(ext) == UNALLOCATED_EXTENT));
47334 +
47335 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47336 + init_new_extent(&idata, &new_ext, 1);
47337 + return insert_into_item(coord, lh, &append_key, &idata, 0);
47338 +}
47339 +
47340 +/**
47341 + * check_jnodes - debugging-only coverage check
47342 + * @twig: longterm locked twig node
47343 + * @key: key of the first of @count consecutive pages
47344 + * @count: number of pages that must be addressed by @twig
47345 + */
47346 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
47347 +{
47348 +#if REISER4_DEBUG
47349 + coord_t c;
47350 + reiser4_key node_key, jnode_key;
47351 +
47352 + jnode_key = *key;
47353 +
47354 + assert("", twig != NULL);
47355 + assert("", znode_get_level(twig) == TWIG_LEVEL);
47356 + assert("", znode_is_write_locked(twig));
47357 +
47358 + zload(twig);
47359 + /* get the smallest key in twig node */
47360 + coord_init_first_unit(&c, twig);
47361 + unit_key_by_coord(&c, &node_key);
47362 + assert("", keyle(&node_key, &jnode_key));
47363 +
47364 + coord_init_last_unit(&c, twig);
47365 + unit_key_by_coord(&c, &node_key);
47366 + if (item_plugin_by_coord(&c)->s.file.append_key)
47367 + item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
47368 + set_key_offset(&jnode_key,
47369 + get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
47370 + assert("", keylt(&jnode_key, &node_key));
47371 + zrelse(twig);
47372 +#endif
47373 +}
47374 +
47375 +/**
47376 + * append_last_extent - append last file item
47377 + * @uf_coord: coord to start insertion from
47378 + * @jnodes: array of jnodes
47379 + * @count: number of jnodes in the array
47380 + *
47381 + * There is already at least one extent item of file @inode in the tree. Append
47382 + * the last of them with unallocated extent unit of width @count. Assign
47383 + * fake block numbers to jnodes corresponding to the inserted extent.
47384 + */
47385 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47386 + jnode **jnodes, int count)
47387 +{
47388 + int result;
47389 + reiser4_extent new_ext;
47390 + reiser4_item_data idata;
47391 + coord_t *coord;
47392 + struct extent_coord_extension *ext_coord;
47393 + reiser4_extent *ext;
47394 + reiser4_block_nr block;
47395 + jnode *node;
47396 + int i;
47397 +
47398 + coord = &uf_coord->coord;
47399 + ext_coord = &uf_coord->extension.extent;
47400 + ext = ext_by_ext_coord(uf_coord);
47401 +
47402 + /* check correctness of position in the item */
47403 + assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
47404 + assert("vs-1311", coord->between == AFTER_UNIT);
47405 + assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
47406 +
47407 + if (!can_append(key, coord)) {
47408 + /* hole extent has to be inserted */
47409 + result = append_hole(coord, uf_coord->lh, key);
47410 + uf_coord->valid = 0;
47411 + return result;
47412 + }
47413 +
47414 + if (count == 0)
47415 + return 0;
47416 +
47417 + assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
47418 +
47419 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
47420 + count);
47421 + BUG_ON(result != 0);
47422 +
47423 + switch (state_of_extent(ext)) {
47424 + case UNALLOCATED_EXTENT:
47425 + /*
47426 + * last extent unit of the file is unallocated one. Increase
47427 + * its width by @count
47428 + */
47429 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
47430 + extent_get_width(ext) + count);
47431 + znode_make_dirty(coord->node);
47432 +
47433 + /* update coord extension */
47434 + ext_coord->width += count;
47435 + ON_DEBUG(extent_set_width
47436 + (&uf_coord->extension.extent.extent,
47437 + ext_coord->width));
47438 + break;
47439 +
47440 + case HOLE_EXTENT:
47441 + case ALLOCATED_EXTENT:
47442 + /*
47443 + * last extent unit of the file is either hole or allocated
47444 + * one. Append one unallocated extent of width @count
47445 + */
47446 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47447 + init_new_extent(&idata, &new_ext, 1);
47448 + result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
47449 + uf_coord->valid = 0;
47450 + if (result)
47451 + return result;
47452 + break;
47453 +
47454 + default:
47455 + return RETERR(-EIO);
47456 + }
47457 +
47458 + /*
47459 + * make sure that we hold long term locked twig node containing all
47460 + * jnodes we are about to capture
47461 + */
47462 + check_jnodes(uf_coord->lh->node, key, count);
47463 +
47464 + /*
47465 + * assign fake block numbers to all jnodes. FIXME: make sure whether
47466 + * twig node containing inserted extent item is locked
47467 + */
47468 + block = fake_blocknr_unformatted(count);
47469 + for (i = 0; i < count; i ++, block ++) {
47470 + node = jnodes[i];
47471 + spin_lock_jnode(node);
47472 + JF_SET(node, JNODE_CREATED);
47473 + jnode_set_block(node, &block);
47474 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47475 + BUG_ON(result != 0);
47476 + jnode_make_dirty_locked(node);
47477 + spin_unlock_jnode(node);
47478 + }
47479 + return count;
47480 +}
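+
+/*
+ * To summarize append_last_extent: when @key is not contiguous with the item,
+ * append_hole() pads the item first; otherwise, if the last unit is already
+ * an unallocated extent it is simply widened by @count, and if it is a hole
+ * or an allocated extent a new unallocated unit of width @count is pasted
+ * after it. In both extending cases the jnodes then receive fake block
+ * numbers from fake_blocknr_unformatted().
+ */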
47481 +
47482 +/**
47483 + * insert_first_hole - insert hole extent into tree
47484 + * @coord: coord to insert at; must be set between items
47485 + * @lh: lock handle
47486 + * @key: key of the write position
47487 + *
47488 + * Inserts a hole extent keyed at file offset 0 whose width covers @key.
47489 + */
47490 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
47491 + const reiser4_key *key)
47492 +{
47493 + reiser4_extent new_ext;
47494 + reiser4_item_data idata;
47495 + reiser4_key item_key;
47496 + reiser4_block_nr hole_width;
47497 +
47498 + /* @coord must be set for inserting of new item */
47499 + assert("vs-711", coord_is_between_items(coord));
47500 +
47501 + item_key = *key;
47502 + set_key_offset(&item_key, 0ull);
47503 +
47504 + hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
47505 + current_blocksize_bits);
47506 + assert("vs-710", hole_width > 0);
47507 +
47508 + /* compose body of hole extent and insert item into tree */
47509 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47510 + init_new_extent(&idata, &new_ext, 1);
47511 + return insert_extent_by_coord(coord, &idata, &item_key, lh);
47512 +}
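+
+/*
+ * Illustration (assuming a 4096-byte block size): the first write to an empty
+ * file at offset 10000 inserts a hole item keyed at offset 0 with
+ * hole_width = (10000 + 4095) >> 12 = 3 blocks.
+ */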
47513 +
47514 +
47515 +/**
47516 + * insert_first_extent - insert first file item
47517 + * @uf_coord: coord to start insertion from
47518 + * @key: key of the first byte to be written
47519 + * @jnodes: array of jnodes
47520 + * @count: number of jnodes in the array
47521 + * @inode: inode of file
47522 + *
47523 + * There are no items of file @inode in the tree yet. Insert an unallocated
47524 + * extent of width @count into the tree, or a hole extent if the write does
47525 + * not start at the beginning of the file. Assign fake block numbers to
47526 + * jnodes corresponding to the inserted extent. Returns number of jnodes or error code.
47527 + */
47528 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47529 + jnode **jnodes, int count,
47530 + struct inode *inode)
47531 +{
47532 + int result;
47533 + int i;
47534 + reiser4_extent new_ext;
47535 + reiser4_item_data idata;
47536 + reiser4_block_nr block;
47537 + struct unix_file_info *uf_info;
47538 + jnode *node;
47539 +
47540 + /* first extent insertion starts at leaf level */
47541 + assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
47542 + assert("vs-711", coord_is_between_items(&uf_coord->coord));
47543 +
47544 + if (get_key_offset(key) != 0) {
47545 + result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
47546 + uf_coord->valid = 0;
47547 + uf_info = unix_file_inode_data(inode);
47548 +
47549 + /*
47550 + * first item insertion is only possible when writing to empty
47551 + * file or performing tail conversion
47552 + */
47553 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47554 + (reiser4_inode_get_flag(inode,
47555 + REISER4_PART_MIXED) &&
47556 + reiser4_inode_get_flag(inode,
47557 + REISER4_PART_IN_CONV))));
47558 + /* if file was empty - update its state */
47559 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47560 + uf_info->container = UF_CONTAINER_EXTENTS;
47561 + return result;
47562 + }
47563 +
47564 + if (count == 0)
47565 + return 0;
47566 +
47567 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47568 + BUG_ON(result != 0);
47569 +
47570 + /*
47571 + * prepare for tree modification: compose body of item and item data
47572 + * structure needed for insertion
47573 + */
47574 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47575 + init_new_extent(&idata, &new_ext, 1);
47576 +
47577 + /* insert extent item into the tree */
47578 + result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47579 + uf_coord->lh);
47580 + if (result)
47581 + return result;
47582 +
47583 + /*
47584 + * make sure that we hold long term locked twig node containing all
47585 + * jnodes we are about to capture
47586 + */
47587 + check_jnodes(uf_coord->lh->node, key, count);
47588 + /*
47589 + * assign fake block numbers to all jnodes, capture and mark them dirty
47590 + */
47591 + block = fake_blocknr_unformatted(count);
47592 + for (i = 0; i < count; i ++, block ++) {
47593 + node = jnodes[i];
47594 + spin_lock_jnode(node);
47595 + JF_SET(node, JNODE_CREATED);
47596 + jnode_set_block(node, &block);
47597 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47598 + BUG_ON(result != 0);
47599 + jnode_make_dirty_locked(node);
47600 + spin_unlock_jnode(node);
47601 + }
47602 +
47603 + /*
47604 + * invalidate coordinate, research must be performed to continue
47605 + * because write will continue on twig level
47606 + */
47607 + uf_coord->valid = 0;
47608 + return count;
47609 +}
47610 +
47611 +/**
47612 + * plug_hole - replace part of a hole extent with an unallocated extent
47613 + * @uf_coord: coord of the hole extent unit
47614 + * @key: key of the block to plug
47615 + * @how: returns a code describing how the hole was plugged
47616 + *
47617 + * Creates an unallocated extent of width 1 within a hole. In the worst case
47618 + * two additional extents can be created.
47620 + */
47621 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47622 +{
47623 + struct replace_handle rh;
47624 + reiser4_extent *ext;
47625 + reiser4_block_nr width, pos_in_unit;
47626 + coord_t *coord;
47627 + struct extent_coord_extension *ext_coord;
47628 + int return_inserted_position;
47629 +
47630 + check_uf_coord(uf_coord, key);
47631 +
47632 + rh.coord = coord_by_uf_coord(uf_coord);
47633 + rh.lh = uf_coord->lh;
47634 + rh.flags = 0;
47635 +
47636 + coord = coord_by_uf_coord(uf_coord);
47637 + ext_coord = ext_coord_by_uf_coord(uf_coord);
47638 + ext = ext_by_ext_coord(uf_coord);
47639 +
47640 + width = ext_coord->width;
47641 + pos_in_unit = ext_coord->pos_in_unit;
47642 +
47643 + *how = 0;
47644 + if (width == 1) {
47645 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47646 + znode_make_dirty(coord->node);
47647 + /* update uf_coord */
47648 + ON_DEBUG(ext_coord->extent = *ext);
47649 + *how = 1;
47650 + return 0;
47651 + } else if (pos_in_unit == 0) {
47652 + /* we deal with first element of extent */
47653 + if (coord->unit_pos) {
47654 + /* there is an extent to the left */
47655 + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
47656 + /*
47657 + * left neighboring unit is an unallocated
47658 + * extent. Increase its width and decrease
47659 + * width of hole
47660 + */
47661 + extent_set_width(ext - 1,
47662 + extent_get_width(ext - 1) + 1);
47663 + extent_set_width(ext, width - 1);
47664 + znode_make_dirty(coord->node);
47665 +
47666 + /* update coord extension */
47667 + coord->unit_pos--;
47668 + ext_coord->width = extent_get_width(ext - 1);
47669 + ext_coord->pos_in_unit = ext_coord->width - 1;
47670 + ext_coord->ext_offset -= sizeof(reiser4_extent);
47671 + ON_DEBUG(ext_coord->extent =
47672 + *extent_by_coord(coord));
47673 + *how = 2;
47674 + return 0;
47675 + }
47676 + }
47677 + /* extent for replace */
47678 + reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
47679 + /* extent to be inserted */
47680 + reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
47681 + width - 1);
47682 + rh.nr_new_extents = 1;
47683 +
47684 + /* have reiser4_replace_extent to return with @coord and
47685 + @uf_coord->lh set to unit which was replaced */
47686 + return_inserted_position = 0;
47687 + *how = 3;
47688 + } else if (pos_in_unit == width - 1) {
47689 + /* we deal with last element of extent */
47690 + if (coord->unit_pos < nr_units_extent(coord) - 1) {
47691 + /* there is an extent unit to the right */
47692 + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
47693 + /*
47694 + * right neighboring unit is an unallocated
47695 + * extent. Increase its width and decrease
47696 + * width of hole
47697 + */
47698 + extent_set_width(ext + 1,
47699 + extent_get_width(ext + 1) + 1);
47700 + extent_set_width(ext, width - 1);
47701 + znode_make_dirty(coord->node);
47702 +
47703 + /* update coord extension */
47704 + coord->unit_pos++;
47705 + ext_coord->width = extent_get_width(ext + 1);
47706 + ext_coord->pos_in_unit = 0;
47707 + ext_coord->ext_offset += sizeof(reiser4_extent);
47708 + ON_DEBUG(ext_coord->extent =
47709 + *extent_by_coord(coord));
47710 + *how = 4;
47711 + return 0;
47712 + }
47713 + }
47714 + /* extent for replace */
47715 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
47716 + /* extent to be inserted */
47717 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47718 + 1);
47719 + rh.nr_new_extents = 1;
47720 +
47721 + /* have reiser4_replace_extent to return with @coord and
47722 + @uf_coord->lh set to unit which was inserted */
47723 + return_inserted_position = 1;
47724 + *how = 5;
47725 + } else {
47726 + /* extent for replace */
47727 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
47728 + pos_in_unit);
47729 + /* extents to be inserted */
47730 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47731 + 1);
47732 + reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
47733 + width - pos_in_unit - 1);
47734 + rh.nr_new_extents = 2;
47735 +
47736 + /* have reiser4_replace_extent to return with @coord and
47737 + @uf_coord->lh set to first of units which were inserted */
47738 + return_inserted_position = 1;
47739 + *how = 6;
47740 + }
47741 + unit_key_by_coord(coord, &rh.paste_key);
47742 + set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
47743 + extent_get_width(&rh.overwrite) * current_blocksize);
47744 +
47745 + uf_coord->valid = 0;
47746 + return reiser4_replace_extent(&rh, return_inserted_position);
47747 +}
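+
+/*
+ * Summary of the *how codes set above, derived from the branches of
+ * plug_hole():
+ *   1 - width-1 hole converted to an unallocated unit in place;
+ *   2 - first block of the hole glued to an unallocated unit on the left;
+ *   3 - hole replaced by [unallocated(1), hole(width - 1)];
+ *   4 - last block of the hole glued to an unallocated unit on the right;
+ *   5 - hole replaced by [hole(width - 1), unallocated(1)];
+ *   6 - hole replaced by [hole(pos), unallocated(1), hole(width - pos - 1)].
+ */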
47748 +
47749 +/**
47750 + * overwrite_one_block - assign a block number to one jnode
47751 + * @uf_coord: coord of the extent unit @node falls into
47752 + * @key: key of the block
47753 + * @node: jnode to assign a block number to
47754 + * @hole_plugged: set to 1 if a hole had to be plugged
47755 + *
47756 + * If @node corresponds to a hole extent, create an unallocated extent for it
47757 + * and assign a fake block number; if it corresponds to an allocated extent, take the block number from the extent.
47758 + */
47759 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
47760 + jnode *node, int *hole_plugged)
47761 +{
47762 + int result;
47763 + struct extent_coord_extension *ext_coord;
47764 + reiser4_extent *ext;
47765 + reiser4_block_nr block;
47766 + int how;
47767 +
47768 + assert("vs-1312", uf_coord->coord.between == AT_UNIT);
47769 +
47770 + result = 0;
47771 + ext_coord = ext_coord_by_uf_coord(uf_coord);
47772 + ext = ext_by_ext_coord(uf_coord);
47773 + assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
47774 +
47775 + switch (state_of_extent(ext)) {
47776 + case ALLOCATED_EXTENT:
47777 + block = extent_get_start(ext) + ext_coord->pos_in_unit;
47778 + break;
47779 +
47780 + case HOLE_EXTENT:
47781 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
47782 + BUG_ON(result != 0);
47783 + result = plug_hole(uf_coord, key, &how);
47784 + if (result)
47785 + return result;
47786 + block = fake_blocknr_unformatted(1);
47787 + if (hole_plugged)
47788 + *hole_plugged = 1;
47789 + JF_SET(node, JNODE_CREATED);
47790 + break;
47791 +
47792 + default:
47793 + return RETERR(-EIO);
47794 + }
47795 +
47796 + jnode_set_block(node, &block);
47797 + return 0;
47798 +}
47799 +
47800 +/**
47801 + * move_coord - move coordinate forward
47802 + * @uf_coord:
47803 + *
47804 + * Move coordinate one data block pointer forward. Return 1 if coord is set to
47805 + * the last one already or is invalid.
47806 + */
47807 +static int move_coord(uf_coord_t *uf_coord)
47808 +{
47809 + struct extent_coord_extension *ext_coord;
47810 +
47811 + if (uf_coord->valid == 0)
47812 + return 1;
47813 + ext_coord = &uf_coord->extension.extent;
47814 + ext_coord->pos_in_unit ++;
47815 + if (ext_coord->pos_in_unit < ext_coord->width)
47816 + /* coordinate moved within the unit */
47817 + return 0;
47818 +
47819 + /* end of unit is reached. Try to move to next unit */
47820 + ext_coord->pos_in_unit = 0;
47821 + uf_coord->coord.unit_pos ++;
47822 + if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
47823 + /* coordinate moved to next unit */
47824 + ext_coord->ext_offset += sizeof(reiser4_extent);
47825 + ext_coord->width =
47826 + extent_get_width(ext_by_offset
47827 + (uf_coord->coord.node,
47828 + ext_coord->ext_offset));
47829 + ON_DEBUG(ext_coord->extent =
47830 + *ext_by_offset(uf_coord->coord.node,
47831 + ext_coord->ext_offset));
47832 + return 0;
47833 + }
47834 + /* end of item is reached */
47835 + uf_coord->valid = 0;
47836 + return 1;
47837 +}
47838 +
47839 +/**
47840 + * overwrite_extent - assign block numbers within existing extents
47841 + * @uf_coord: coord of the extent unit to start at
+ * @key: key of the first byte to overwrite
+ * @jnodes: array of jnodes
+ * @count: number of jnodes in the array
+ * @plugged_hole: set to 1 if a hole had to be plugged
47842 + *
47843 + * Returns number of handled jnodes.
47844 + */
47845 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47846 + jnode **jnodes, int count, int *plugged_hole)
47847 +{
47848 + int result;
47849 + reiser4_key k;
47850 + int i;
47851 + jnode *node;
47852 +
47853 + k = *key;
47854 + for (i = 0; i < count; i ++) {
47855 + node = jnodes[i];
47856 + if (*jnode_get_block(node) == 0) {
47857 + result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
47858 + if (result)
47859 + return result;
47860 + }
47861 + /*
47862 + * make sure that we hold long term locked twig node containing
47863 + * all jnodes we are about to capture
47864 + */
47865 + check_jnodes(uf_coord->lh->node, &k, 1);
47866 + /*
47867 +		 * capture jnodes (block numbers were assigned above by
47868 +		 * overwrite_one_block) and mark them dirty
47869 + */
47870 + spin_lock_jnode(node);
47871 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47872 + BUG_ON(result != 0);
47873 + jnode_make_dirty_locked(node);
47874 + spin_unlock_jnode(node);
47875 +
47876 + if (uf_coord->valid == 0)
47877 + return i + 1;
47878 +
47879 + check_uf_coord(uf_coord, &k);
47880 +
47881 + if (move_coord(uf_coord)) {
47882 + /*
47883 + * failed to move to the next node pointer. Either end
47884 +			 * of file or end of twig node is reached. In the latter
47885 +			 * case we might go to the right neighbor.
47886 + */
47887 + uf_coord->valid = 0;
47888 + return i + 1;
47889 + }
47890 + set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
47891 + }
47892 +
47893 + return count;
47894 +}
47895 +
47896 +/**
47897 + * reiser4_update_extent - make extent metadata point to one jnode
47898 + * @inode: inode of file
47899 + * @node: jnode of the page
47900 + * @pos: offset in the file
47901 + * @plugged_hole: set to 1 if a hole had to be plugged
47902 + *
47903 + */
47904 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
47905 + int *plugged_hole)
47906 +{
47907 + int result;
47908 + znode *loaded;
47909 + uf_coord_t uf_coord;
47910 + coord_t *coord;
47911 + lock_handle lh;
47912 + reiser4_key key;
47913 +
47914 + assert("", reiser4_lock_counters()->d_refs == 0);
47915 +
47916 + key_by_inode_and_offset_common(inode, pos, &key);
47917 +
47918 + init_uf_coord(&uf_coord, &lh);
47919 + coord = &uf_coord.coord;
47920 + result = find_file_item_nohint(coord, &lh, &key,
47921 + ZNODE_WRITE_LOCK, inode);
47922 + if (IS_CBKERR(result)) {
47923 + assert("", reiser4_lock_counters()->d_refs == 0);
47924 + return result;
47925 + }
47926 +
47927 + result = zload(coord->node);
47928 + BUG_ON(result != 0);
47929 + loaded = coord->node;
47930 +
47931 + if (coord->between == AFTER_UNIT) {
47932 + /*
47933 + * append existing extent item with unallocated extent of width
47934 + * nr_jnodes
47935 + */
47936 + init_coord_extension_extent(&uf_coord,
47937 + get_key_offset(&key));
47938 + result = append_last_extent(&uf_coord, &key,
47939 + &node, 1);
47940 + } else if (coord->between == AT_UNIT) {
47941 + /*
47942 + * overwrite
47943 + * not optimal yet. Will be optimized if new write will show
47944 + * performance win.
47945 + */
47946 + init_coord_extension_extent(&uf_coord,
47947 + get_key_offset(&key));
47948 + result = overwrite_extent(&uf_coord, &key,
47949 + &node, 1, plugged_hole);
47950 + } else {
47951 + /*
47952 + * there are no items of this file in the tree yet. Create
47953 + * first item of the file inserting one unallocated extent of
47954 + * width nr_jnodes
47955 + */
47956 + result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
47957 + }
47958 + assert("", result == 1 || result < 0);
47959 + zrelse(loaded);
47960 + done_lh(&lh);
47961 + assert("", reiser4_lock_counters()->d_refs == 0);
47962 + return (result == 1) ? 0 : result;
47963 +}
47964 +
47965 +/**
47966 + * update_extents - update file body with an array of jnodes
47967 + * @file: file to write to
47968 + * @jnodes: array of jnodes
47969 + * @count: number of jnodes in the array
47970 + * @pos: offset in the file to write at
47971 + *
47972 + */
47973 +static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
47974 +{
47975 + struct inode *inode;
47976 + struct hint hint;
47977 + reiser4_key key;
47978 + int result;
47979 + znode *loaded;
47980 +
47981 + result = load_file_hint(file, &hint);
47982 + BUG_ON(result != 0);
47983 +
47984 + inode = file->f_dentry->d_inode;
47985 + if (count != 0)
47986 + /*
47987 + * count == 0 is special case: expanding truncate
47988 + */
47989 + pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
47990 + key_by_inode_and_offset_common(inode, pos, &key);
47991 +
47992 + assert("", reiser4_lock_counters()->d_refs == 0);
47993 +
47994 + do {
47995 + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
47996 + if (IS_CBKERR(result)) {
47997 + assert("", reiser4_lock_counters()->d_refs == 0);
47998 + return result;
47999 + }
48000 +
48001 + result = zload(hint.ext_coord.coord.node);
48002 + BUG_ON(result != 0);
48003 + loaded = hint.ext_coord.coord.node;
48004 +
48005 + if (hint.ext_coord.coord.between == AFTER_UNIT) {
48006 + /*
48007 + * append existing extent item with unallocated extent
48008 + * of width nr_jnodes
48009 + */
48010 + if (hint.ext_coord.valid == 0)
48011 + /* NOTE: get statistics on this */
48012 + init_coord_extension_extent(&hint.ext_coord,
48013 + get_key_offset(&key));
48014 + result = append_last_extent(&hint.ext_coord, &key,
48015 + jnodes, count);
48016 + } else if (hint.ext_coord.coord.between == AT_UNIT) {
48017 + /*
48018 + * overwrite
48019 + * not optimal yet. Will be optimized if new write will
48020 + * show performance win.
48021 + */
48022 + if (hint.ext_coord.valid == 0)
48023 + /* NOTE: get statistics on this */
48024 + init_coord_extension_extent(&hint.ext_coord,
48025 + get_key_offset(&key));
48026 + result = overwrite_extent(&hint.ext_coord, &key,
48027 + jnodes, count, NULL);
48028 + } else {
48029 + /*
48030 + * there are no items of this file in the tree
48031 + * yet. Create first item of the file inserting one
48032 +			 * unallocated extent of width nr_jnodes
48033 + */
48034 + result = insert_first_extent(&hint.ext_coord, &key,
48035 + jnodes, count, inode);
48036 + }
48037 + zrelse(loaded);
48038 + if (result < 0) {
48039 + done_lh(hint.ext_coord.lh);
48040 + break;
48041 + }
48042 +
48043 + jnodes += result;
48044 + count -= result;
48045 + set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48046 +
48047 + /* seal and unlock znode */
48048 + if (hint.ext_coord.valid)
48049 + reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48050 + else
48051 + reiser4_unset_hint(&hint);
48052 +
48053 + } while (count > 0);
48054 +
48055 + save_file_hint(file, &hint);
48056 + assert("", reiser4_lock_counters()->d_refs == 0);
48057 + return result;
48058 +}
48059 +
48060 +/**
48061 + * write_extent_reserve_space - reserve space for extent write operation
48062 + * @inode:
48063 + *
48064 + * Estimates and reserves space which may be required for writing
48065 + * WRITE_GRANULARITY pages of file.
48066 + */
48067 +static int write_extent_reserve_space(struct inode *inode)
48068 +{
48069 + __u64 count;
48070 + reiser4_tree *tree;
48071 +
48072 + /*
48073 + * to write WRITE_GRANULARITY pages to a file by extents we have to
48074 + * reserve disk space for:
48075 +
48076 + * 1. find_file_item may have to insert empty node to the tree (empty
48077 + * leaf node between two extent items). This requires 1 block and
48078 + * number of blocks which are necessary to perform insertion of an
48079 + * internal item into twig level.
48080 +
48081 + * 2. for each of written pages there might be needed 1 block and
48082 + * number of blocks which might be necessary to perform insertion of or
48083 + * paste to an extent item.
48084 +
48085 + * 3. stat data update
48086 + */
48087 + tree = reiser4_tree_by_inode(inode);
48088 + count = estimate_one_insert_item(tree) +
48089 + WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48090 + estimate_one_insert_item(tree);
48091 + grab_space_enable();
48092 + return reiser4_grab_space(count, 0 /* flags */);
48093 +}
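+
+/*
+ * Worked example with purely hypothetical numbers: if
+ * estimate_one_insert_item() returned 26, estimate_one_insert_into_item()
+ * returned 5 and WRITE_GRANULARITY were 32, the reservation would be
+ * 26 + 32 * (1 + 5) + 26 = 244 blocks.
+ */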
48094 +
48095 +/*
48096 + * filemap_copy_from_user no longer exists in generic code, because it
48097 + * is deadlocky (copying from user while holding the page lock is bad).
48098 + * As a temporary fix for reiser4, just define it here.
48099 + */
48100 +static inline size_t
48101 +filemap_copy_from_user(struct page *page, unsigned long offset,
48102 + const char __user *buf, unsigned bytes)
48103 +{
48104 + char *kaddr;
48105 + int left;
48106 +
48107 + kaddr = kmap_atomic(page, KM_USER0);
48108 + left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
48109 + kunmap_atomic(kaddr, KM_USER0);
48110 +
48111 + if (left != 0) {
48112 + /* Do it the slow way */
48113 + kaddr = kmap(page);
48114 + left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
48115 + kunmap(page);
48116 + }
48117 + return bytes - left;
48118 +}
48119 +
48120 +/**
48121 + * reiser4_write_extent - write method of extent item plugin
48122 + * @file: file to write to
48123 + * @buf: address of user-space buffer
48124 + * @count: number of bytes to write
48125 + * @pos: position in file to write to
48126 + *
48127 + */
48128 +ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
48129 + size_t count, loff_t *pos)
48130 +{
48131 + int have_to_update_extent;
48132 + int nr_pages, nr_dirty;
48133 + struct page *page;
48134 + jnode *jnodes[WRITE_GRANULARITY + 1];
48135 + struct inode *inode;
48136 + unsigned long index;
48137 + unsigned long end;
48138 + int i;
48139 + int to_page, page_off;
48140 + size_t left, written;
48141 + int result = 0;
48142 +
48143 + inode = file->f_dentry->d_inode;
48144 + if (write_extent_reserve_space(inode))
48145 + return RETERR(-ENOSPC);
48146 +
48147 + if (count == 0) {
48148 + /* truncate case */
48149 + update_extents(file, jnodes, 0, *pos);
48150 + return 0;
48151 + }
48152 +
48153 + BUG_ON(get_current_context()->trans->atom != NULL);
48154 +
48155 + left = count;
48156 + index = *pos >> PAGE_CACHE_SHIFT;
48157 + /* calculate number of pages which are to be written */
48158 + end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48159 + nr_pages = end - index + 1;
48160 + nr_dirty = 0;
48161 + assert("", nr_pages <= WRITE_GRANULARITY + 1);
48162 +
48163 + /* get pages and jnodes */
48164 + for (i = 0; i < nr_pages; i ++) {
48165 + page = find_or_create_page(inode->i_mapping, index + i,
48166 + reiser4_ctx_gfp_mask_get());
48167 + if (page == NULL) {
48168 + nr_pages = i;
48169 + result = RETERR(-ENOMEM);
48170 + goto out;
48171 + }
48172 +
48173 + jnodes[i] = jnode_of_page(page);
48174 + if (IS_ERR(jnodes[i])) {
48175 + unlock_page(page);
48176 + page_cache_release(page);
48177 + nr_pages = i;
48178 + result = RETERR(-ENOMEM);
48179 + goto out;
48180 + }
48181 + /* prevent jnode and page from disconnecting */
48182 + JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48183 + unlock_page(page);
48184 + }
48185 +
48186 + BUG_ON(get_current_context()->trans->atom != NULL);
48187 +
48188 + have_to_update_extent = 0;
48189 +
48190 + page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48191 + for (i = 0; i < nr_pages; i ++) {
48192 + to_page = PAGE_CACHE_SIZE - page_off;
48193 + if (to_page > left)
48194 + to_page = left;
48195 + page = jnode_page(jnodes[i]);
48196 + if (page_offset(page) < inode->i_size &&
48197 + !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48198 + /*
48199 + * the above is not optimal for partial write to last
48200 + * page of file when file size is not at boundary of
48201 + * page
48202 + */
48203 + lock_page(page);
48204 + if (!PageUptodate(page)) {
48205 + result = readpage_unix_file(NULL, page);
48206 + BUG_ON(result != 0);
48207 + /* wait for read completion */
48208 + lock_page(page);
48209 + BUG_ON(!PageUptodate(page));
48210 + } else
48211 + result = 0;
48212 + unlock_page(page);
48213 + }
48214 +
48215 + BUG_ON(get_current_context()->trans->atom != NULL);
48216 + fault_in_pages_readable(buf, to_page);
48217 + BUG_ON(get_current_context()->trans->atom != NULL);
48218 +
48219 + lock_page(page);
48220 + if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
48221 + simple_prepare_write(file, page, page_off,
48222 + page_off + to_page);
48223 +
48224 + written = filemap_copy_from_user(page, page_off, buf, to_page);
48225 + if (unlikely(written != to_page)) {
48226 + unlock_page(page);
48227 + result = RETERR(-EFAULT);
48228 + break;
48229 + }
48230 +
48231 + flush_dcache_page(page);
48232 + reiser4_set_page_dirty_internal(page);
48233 + unlock_page(page);
48234 + nr_dirty++;
48235 +
48236 + mark_page_accessed(page);
48237 + SetPageUptodate(page);
48238 +
48239 + if (jnodes[i]->blocknr == 0)
48240 + have_to_update_extent ++;
48241 +
48242 + page_off = 0;
48243 + buf += to_page;
48244 + left -= to_page;
48245 + BUG_ON(get_current_context()->trans->atom != NULL);
48246 + }
48247 +
48248 + if (have_to_update_extent) {
48249 + update_extents(file, jnodes, nr_dirty, *pos);
48250 + } else {
48251 + for (i = 0; i < nr_dirty; i ++) {
48252 + int ret;
48253 + spin_lock_jnode(jnodes[i]);
48254 + ret = reiser4_try_capture(jnodes[i],
48255 + ZNODE_WRITE_LOCK, 0);
48256 + BUG_ON(ret != 0);
48257 + jnode_make_dirty_locked(jnodes[i]);
48258 + spin_unlock_jnode(jnodes[i]);
48259 + }
48260 + }
48261 +out:
48262 + for (i = 0; i < nr_pages; i ++) {
48263 + page_cache_release(jnode_page(jnodes[i]));
48264 + JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48265 + jput(jnodes[i]);
48266 + }
48267 +
48268 +	/* the only errors handled so far are ENOMEM and
48269 + EFAULT on copy_from_user */
48270 +
48271 + return (count - left) ? (count - left) : result;
48272 +}
48273 +
48274 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48275 + struct page *page)
48276 +{
48277 + jnode *j;
48278 + struct address_space *mapping;
48279 + unsigned long index;
48280 + oid_t oid;
48281 + reiser4_block_nr block;
48282 +
48283 + mapping = page->mapping;
48284 + oid = get_inode_oid(mapping->host);
48285 + index = page->index;
48286 +
48287 + switch (state_of_extent(ext)) {
48288 + case HOLE_EXTENT:
48289 + /*
48290 + * it is possible to have hole page with jnode, if page was
48291 + * eflushed previously.
48292 + */
48293 + j = jfind(mapping, index);
48294 + if (j == NULL) {
48295 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
48296 + SetPageUptodate(page);
48297 + unlock_page(page);
48298 + return 0;
48299 + }
48300 + spin_lock_jnode(j);
48301 + if (!jnode_page(j)) {
48302 + jnode_attach_page(j, page);
48303 + } else {
48304 + BUG_ON(jnode_page(j) != page);
48305 + assert("vs-1504", jnode_page(j) == page);
48306 + }
48307 + block = *jnode_get_io_block(j);
48308 + spin_unlock_jnode(j);
48309 + if (block == 0) {
48310 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
48311 + SetPageUptodate(page);
48312 + unlock_page(page);
48313 + jput(j);
48314 + return 0;
48315 + }
48316 + break;
48317 +
48318 + case ALLOCATED_EXTENT:
48319 + j = jnode_of_page(page);
48320 + if (IS_ERR(j))
48321 + return PTR_ERR(j);
48322 + if (*jnode_get_block(j) == 0) {
48323 + reiser4_block_nr blocknr;
48324 +
48325 + blocknr = extent_get_start(ext) + pos;
48326 + jnode_set_block(j, &blocknr);
48327 + } else
48328 + assert("vs-1403",
48329 + j->blocknr == extent_get_start(ext) + pos);
48330 + break;
48331 +
48332 + case UNALLOCATED_EXTENT:
48333 + j = jfind(mapping, index);
48334 + assert("nikita-2688", j);
48335 + assert("vs-1426", jnode_page(j) == NULL);
48336 +
48337 + spin_lock_jnode(j);
48338 + jnode_attach_page(j, page);
48339 + spin_unlock_jnode(j);
48340 + break;
48341 +
48342 + default:
48343 + warning("vs-957", "wrong extent\n");
48344 + return RETERR(-EIO);
48345 + }
48346 +
48347 + BUG_ON(j == 0);
48348 + reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
48349 + jput(j);
48350 + return 0;
48351 +}
48352 +
48353 +/* Implements plugin->u.item.s.file.read operation for extent items. */
48354 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
48355 +{
48356 + int result;
48357 + struct page *page;
48358 + unsigned long cur_page, next_page;
48359 + unsigned long page_off, count;
48360 + struct address_space *mapping;
48361 + loff_t file_off;
48362 + uf_coord_t *uf_coord;
48363 + coord_t *coord;
48364 + struct extent_coord_extension *ext_coord;
48365 + unsigned long nr_pages;
48366 + char *kaddr;
48367 +
48368 + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
48369 + assert("vs-572", flow->user == 1);
48370 + assert("vs-1351", flow->length > 0);
48371 +
48372 + uf_coord = &hint->ext_coord;
48373 +
48374 + check_uf_coord(uf_coord, NULL);
48375 + assert("vs-33", uf_coord->lh == &hint->lh);
48376 +
48377 + coord = &uf_coord->coord;
48378 + assert("vs-1119", znode_is_rlocked(coord->node));
48379 + assert("vs-1120", znode_is_loaded(coord->node));
48380 + assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
48381 +
48382 + mapping = file->f_dentry->d_inode->i_mapping;
48383 + ext_coord = &uf_coord->extension.extent;
48384 +
48385 + /* offset in a file to start read from */
48386 + file_off = get_key_offset(&flow->key);
48387 + /* offset within the page to start read from */
48388 + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
48389 + /* bytes which can be read from the page which contains file_off */
48390 + count = PAGE_CACHE_SIZE - page_off;
48391 +
48392 + /* index of page containing offset read is to start from */
48393 + cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
48394 + next_page = cur_page;
48395 + /* number of pages flow spans over */
48396 + nr_pages =
48397 + ((file_off + flow->length + PAGE_CACHE_SIZE -
48398 + 1) >> PAGE_CACHE_SHIFT) - cur_page;
48399 +
48400 + /* we start having twig node read locked. However, we do not want to
48401 +	   keep that lock all the time readahead works. So, set a seal and
48402 + release twig node. */
48403 + reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
48404 + /* &hint->lh is done-ed */
48405 +
48406 + do {
48407 + reiser4_txn_restart_current();
48408 + page = read_mapping_page(mapping, cur_page, file);
48409 + if (IS_ERR(page))
48410 + return PTR_ERR(page);
48411 + lock_page(page);
48412 + if (!PageUptodate(page)) {
48413 + unlock_page(page);
48414 + page_cache_release(page);
48415 + warning("jmacd-97178", "extent_read: page is not up to date");
48416 + return RETERR(-EIO);
48417 + }
48418 + mark_page_accessed(page);
48419 + unlock_page(page);
48420 +
48421 + /* If users can be writing to this page using arbitrary virtual
48422 + addresses, take care about potential aliasing before reading
48423 + the page on the kernel side.
48424 + */
48425 + if (mapping_writably_mapped(mapping))
48426 + flush_dcache_page(page);
48427 +
48428 + assert("nikita-3034", reiser4_schedulable());
48429 +
48430 + /* number of bytes which are to be read from the page */
48431 + if (count > flow->length)
48432 + count = flow->length;
48433 +
48434 + result = fault_in_pages_writeable(flow->data, count);
48435 + if (result) {
48436 + page_cache_release(page);
48437 + return RETERR(-EFAULT);
48438 + }
48439 +
48440 + kaddr = kmap_atomic(page, KM_USER0);
48441 + result = __copy_to_user_inatomic(flow->data,
48442 + kaddr + page_off, count);
48443 + kunmap_atomic(kaddr, KM_USER0);
48444 + if (result != 0) {
48445 + kaddr = kmap(page);
48446 + result = __copy_to_user(flow->data, kaddr + page_off, count);
48447 + kunmap(page);
48448 + if (unlikely(result))
48449 + return RETERR(-EFAULT);
48450 + }
48451 +
48452 + page_cache_release(page);
48453 +
48454 + /* increase key (flow->key), update user area pointer (flow->data) */
48455 + move_flow_forward(flow, count);
48456 +
48457 + page_off = 0;
48458 + cur_page ++;
48459 + count = PAGE_CACHE_SIZE;
48460 + nr_pages--;
48461 + } while (flow->length);
48462 +
48463 + return 0;
48464 +}
48465 +
48466 +/*
48467 + plugin->s.file.readpage
48468 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
48469 + or
48470 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
48471 +
48472 + At the beginning: coord->node is read locked, zloaded, page is
48473 +   locked, coord is set to an existing unit inside of the extent item (it does not necessarily match page->index)
48474 +*/
48475 +int reiser4_readpage_extent(void *vp, struct page *page)
48476 +{
48477 + uf_coord_t *uf_coord = vp;
48478 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
48479 + ON_DEBUG(reiser4_key key);
48480 +
48481 + assert("vs-1040", PageLocked(page));
48482 + assert("vs-1050", !PageUptodate(page));
48483 + assert("vs-1039", page->mapping && page->mapping->host);
48484 +
48485 + assert("vs-1044", znode_is_loaded(coord->node));
48486 + assert("vs-758", item_is_extent(coord));
48487 + assert("vs-1046", coord_is_existing_unit(coord));
48488 + assert("vs-1045", znode_is_rlocked(coord->node));
48489 + assert("vs-1047",
48490 + page->mapping->host->i_ino ==
48491 + get_key_objectid(item_key_by_coord(coord, &key)));
48492 + check_uf_coord(uf_coord, NULL);
48493 +
48494 + return reiser4_do_readpage_extent(
48495 + ext_by_ext_coord(uf_coord),
48496 + uf_coord->extension.extent.pos_in_unit, page);
48497 +}
48498 +
48499 +/**
48500 + * get_block_address_extent
48501 + * @coord:
48502 + * @block:
48503 + * @result:
48504 + *
48505 + *
48506 + */
48507 +int get_block_address_extent(const coord_t *coord, sector_t block,
48508 + sector_t *result)
48509 +{
48510 + reiser4_extent *ext;
48511 +
48512 + if (!coord_is_existing_unit(coord))
48513 + return RETERR(-EINVAL);
48514 +
48515 + ext = extent_by_coord(coord);
48516 +
48517 + if (state_of_extent(ext) != ALLOCATED_EXTENT)
48518 + /* FIXME: bad things may happen if it is unallocated extent */
48519 + *result = 0;
48520 + else {
48521 + reiser4_key key;
48522 +
48523 + unit_key_by_coord(coord, &key);
48524 + assert("vs-1645",
48525 + block >= get_key_offset(&key) >> current_blocksize_bits);
48526 + assert("vs-1646",
48527 + block <
48528 + (get_key_offset(&key) >> current_blocksize_bits) +
48529 + extent_get_width(ext));
48530 + *result =
48531 + extent_get_start(ext) + (block -
48532 + (get_key_offset(&key) >>
48533 + current_blocksize_bits));
48534 + }
48535 + return 0;
48536 +}
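+
+/*
+ * Illustration (assuming a 4096-byte block size): if the unit's key offset is
+ * 8192 (file block 2) and the unit is an allocated extent with start 1000 and
+ * width 5, then file block 4 maps to disk block 1000 + (4 - 2) = 1002.
+ */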
48537 +
48538 +/*
48539 + plugin->u.item.s.file.append_key
48540 +   key of the first byte which is next after the last byte addressed by this extent
48541 +*/
48542 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
48543 +{
48544 + item_key_by_coord(coord, key);
48545 + set_key_offset(key,
48546 + get_key_offset(key) + reiser4_extent_size(coord,
48547 + nr_units_extent
48548 + (coord)));
48549 +
48550 + assert("vs-610", get_key_offset(key)
48551 + && (get_key_offset(key) & (current_blocksize - 1)) == 0);
48552 + return key;
48553 +}
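+
+/*
+ * Illustration (assuming a 4096-byte block size): for an extent item keyed at
+ * offset 0 with two units of widths 2 and 3, reiser4_extent_size() is
+ * 5 * 4096, so the append key has offset 20480.
+ */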
48554 +
48555 +/* plugin->u.item.s.file.init_coord_extension */
48556 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
48557 +{
48558 + coord_t *coord;
48559 + struct extent_coord_extension *ext_coord;
48560 + reiser4_key key;
48561 + loff_t offset;
48562 +
48563 + assert("vs-1295", uf_coord->valid == 0);
48564 +
48565 + coord = &uf_coord->coord;
48566 + assert("vs-1288", coord_is_iplug_set(coord));
48567 + assert("vs-1327", znode_is_loaded(coord->node));
48568 +
48569 + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
48570 + return;
48571 +
48572 + ext_coord = &uf_coord->extension.extent;
48573 + ext_coord->nr_units = nr_units_extent(coord);
48574 + ext_coord->ext_offset =
48575 + (char *)extent_by_coord(coord) - zdata(coord->node);
48576 + ext_coord->width = extent_get_width(extent_by_coord(coord));
48577 + ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
48578 + uf_coord->valid = 1;
48579 +
48580 + /* pos_in_unit is the only uninitialized field in extended coord */
48581 + if (coord->between == AFTER_UNIT) {
48582 + assert("vs-1330",
48583 + coord->unit_pos == nr_units_extent(coord) - 1);
48584 +
48585 + ext_coord->pos_in_unit = ext_coord->width - 1;
48586 + } else {
48587 + /* AT_UNIT */
48588 + unit_key_by_coord(coord, &key);
48589 + offset = get_key_offset(&key);
48590 +
48591 + assert("vs-1328", offset <= lookuped);
48592 + assert("vs-1329",
48593 + lookuped <
48594 + offset + ext_coord->width * current_blocksize);
48595 + ext_coord->pos_in_unit =
48596 + ((lookuped - offset) >> current_blocksize_bits);
48597 + }
48598 +}
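+
+/*
+ * Illustration of the AT_UNIT branch (assuming a 4096-byte block size): if
+ * the unit starts at key offset 8192 and the lookup was for offset 17000,
+ * then pos_in_unit = (17000 - 8192) >> 12 = 2.
+ */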
48599 +
48600 +/*
48601 + * Local variables:
48602 + * c-indentation-style: "K&R"
48603 + * mode-name: "LC"
48604 + * c-basic-offset: 8
48605 + * tab-width: 8
48606 + * fill-column: 79
48607 + * scroll-step: 1
48608 + * End:
48609 + */
48610 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.22/fs/reiser4/plugin/item/extent_flush_ops.c
48611 --- linux-2.6.22.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
48612 +++ linux-2.6.22/fs/reiser4/plugin/item/extent_flush_ops.c 2007-07-29 00:25:34.964719254 +0400
48613 @@ -0,0 +1,1028 @@
48614 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48615 +
48616 +#include "item.h"
48617 +#include "../../tree.h"
48618 +#include "../../jnode.h"
48619 +#include "../../super.h"
48620 +#include "../../flush.h"
48621 +#include "../../carry.h"
48622 +#include "../object.h"
48623 +
48624 +#include <linux/pagemap.h>
48625 +
48626 +static reiser4_block_nr extent_unit_start(const coord_t * item);
48627 +
48628 +/* Return either first or last extent (depending on @side) of the item
48629 + @coord is set to. Set @pos_in_unit either to first or to last block
48630 + of extent. */
48631 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
48632 + reiser4_block_nr * pos_in_unit)
48633 +{
48634 + reiser4_extent *ext;
48635 +
48636 + if (side == LEFT_SIDE) {
48637 + /* get first extent of item */
48638 + ext = extent_item(coord);
48639 + *pos_in_unit = 0;
48640 + } else {
48641 + /* get last extent of item and last position within it */
48642 + assert("vs-363", side == RIGHT_SIDE);
48643 + ext = extent_item(coord) + coord_last_unit_pos(coord);
48644 + *pos_in_unit = extent_get_width(ext) - 1;
48645 + }
48646 +
48647 + return ext;
48648 +}
48649 +
48650 +/* item_plugin->f.utmost_child */
48651 +/* Return the child. Coord is set to extent item. Find jnode corresponding
48652 + either to first or to last unformatted node pointed by the item */
48653 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
48654 +{
48655 + reiser4_extent *ext;
48656 + reiser4_block_nr pos_in_unit;
48657 +
48658 + ext = extent_utmost_ext(coord, side, &pos_in_unit);
48659 +
48660 + switch (state_of_extent(ext)) {
48661 + case HOLE_EXTENT:
48662 + *childp = NULL;
48663 + return 0;
48664 + case ALLOCATED_EXTENT:
48665 + case UNALLOCATED_EXTENT:
48666 + break;
48667 + default:
48668 + /* this should never happen */
48669 + assert("vs-1417", 0);
48670 + }
48671 +
48672 + {
48673 + reiser4_key key;
48674 + reiser4_tree *tree;
48675 + unsigned long index;
48676 +
48677 + if (side == LEFT_SIDE) {
48678 + /* get key of first byte addressed by the extent */
48679 + item_key_by_coord(coord, &key);
48680 + } else {
48681 +			/* get key of the byte next after the last byte addressed by the extent */
48682 + append_key_extent(coord, &key);
48683 + }
48684 +
48685 + assert("vs-544",
48686 + (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
48687 + /* index of first or last (depending on @side) page addressed
48688 + by the extent */
48689 + index =
48690 + (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
48691 + if (side == RIGHT_SIDE)
48692 + index--;
48693 +
48694 + tree = coord->node->zjnode.tree;
48695 + *childp = jlookup(tree, get_key_objectid(&key), index);
48696 + }
48697 +
48698 + return 0;
48699 +}
48700 +
48701 +/* item_plugin->f.utmost_child_real_block */
48702 +/* Return the child's block, if allocated. */
48703 +int
48704 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
48705 + reiser4_block_nr * block)
48706 +{
48707 + reiser4_extent *ext;
48708 +
48709 + ext = extent_by_coord(coord);
48710 +
48711 + switch (state_of_extent(ext)) {
48712 + case ALLOCATED_EXTENT:
48713 + *block = extent_get_start(ext);
48714 + if (side == RIGHT_SIDE)
48715 + *block += extent_get_width(ext) - 1;
48716 + break;
48717 + case HOLE_EXTENT:
48718 + case UNALLOCATED_EXTENT:
48719 + *block = 0;
48720 + break;
48721 + default:
48722 + /* this should never happen */
48723 + assert("vs-1418", 0);
48724 + }
48725 +
48726 + return 0;
48727 +}
48728 +
48729 +/* item_plugin->f.scan */
48730 +/* Performs leftward (or, symmetrically, rightward) scanning starting from an unformatted node and its parent coordinate.
48731 + This scan continues, advancing the parent coordinate, until either it encounters a
48732 + formatted child or it finishes scanning this node.
48733 +
48734 + If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
48735 +   not sure this last property (same atom) is enforced, but it should be the case since
48736 + one atom must write the parent and the others must read the parent, thus fusing?). In
48737 + any case, the code below asserts this case for unallocated extents. Unallocated
48738 + extents are thus optimized because we can skip to the endpoint when scanning.
48739 +
48740 +   Control then returns to the caller of reiser4_scan_extent, which handles these
48741 +   terminating conditions, e.g., by loading the next twig.
48742 +*/
48743 +int reiser4_scan_extent(flush_scan * scan)
48744 +{
48745 + coord_t coord;
48746 + jnode *neighbor;
48747 + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
48748 + reiser4_block_nr unit_start;
48749 + __u64 oid;
48750 + reiser4_key key;
48751 + int ret = 0, allocated, incr;
48752 + reiser4_tree *tree;
48753 +
48754 + if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
48755 + scan->stop = 1;
48756 + return 0; /* Race with truncate, this node is already
48757 + * truncated. */
48758 + }
48759 +
48760 + coord_dup(&coord, &scan->parent_coord);
48761 +
48762 + assert("jmacd-1404", !reiser4_scan_finished(scan));
48763 + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
48764 + assert("jmacd-1406", jnode_is_unformatted(scan->node));
48765 +
48766 + /* The scan_index variable corresponds to the current page index of the
48767 + unformatted block scan position. */
48768 + scan_index = index_jnode(scan->node);
48769 +
48770 + assert("jmacd-7889", item_is_extent(&coord));
48771 +
48772 + repeat:
48773 + /* objectid of file */
48774 + oid = get_key_objectid(item_key_by_coord(&coord, &key));
48775 +
48776 + allocated = !extent_is_unallocated(&coord);
48777 + /* Get the values of this extent unit: */
48778 + unit_index = extent_unit_index(&coord);
48779 + unit_width = extent_unit_width(&coord);
48780 + unit_start = extent_unit_start(&coord);
48781 +
48782 + assert("jmacd-7187", unit_width > 0);
48783 + assert("jmacd-7188", scan_index >= unit_index);
48784 + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
48785 +
48786 + /* Depending on the scan direction, we set different maximum values for scan_index
48787 + (scan_max) and the number of nodes that would be passed if the scan goes the
48788 + entire way (scan_dist). Incr is an integer reflecting the incremental
48789 + direction of scan_index. */
48790 + if (reiser4_scanning_left(scan)) {
48791 + scan_max = unit_index;
48792 + scan_dist = scan_index - unit_index;
48793 + incr = -1;
48794 + } else {
48795 + scan_max = unit_index + unit_width - 1;
48796 +		scan_dist = scan_max - scan_index;
48797 + incr = +1;
48798 + }
48799 +
48800 + tree = coord.node->zjnode.tree;
48801 +
48802 + /* If the extent is allocated we have to check each of its blocks. If the extent
48803 + is unallocated we can skip to the scan_max. */
48804 + if (allocated) {
48805 + do {
48806 + neighbor = jlookup(tree, oid, scan_index);
48807 + if (neighbor == NULL)
48808 + goto stop_same_parent;
48809 +
48810 + if (scan->node != neighbor
48811 + && !reiser4_scan_goto(scan, neighbor)) {
48812 + /* @neighbor was jput() by reiser4_scan_goto */
48813 + goto stop_same_parent;
48814 + }
48815 +
48816 + ret = scan_set_current(scan, neighbor, 1, &coord);
48817 + if (ret != 0) {
48818 + goto exit;
48819 + }
48820 +
48821 + /* reference to @neighbor is stored in @scan, no need
48822 + to jput(). */
48823 + scan_index += incr;
48824 +
48825 + } while (incr + scan_max != scan_index);
48826 +
48827 + } else {
48828 + /* Optimized case for unallocated extents, skip to the end. */
48829 + neighbor = jlookup(tree, oid, scan_max /*index */ );
48830 + if (neighbor == NULL) {
48831 + /* Race with truncate */
48832 + scan->stop = 1;
48833 + ret = 0;
48834 + goto exit;
48835 + }
48836 +
48837 + assert("zam-1043",
48838 + reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
48839 +
48840 + ret = scan_set_current(scan, neighbor, scan_dist, &coord);
48841 + if (ret != 0) {
48842 + goto exit;
48843 + }
48844 + }
48845 +
48846 + if (coord_sideof_unit(&coord, scan->direction) == 0
48847 + && item_is_extent(&coord)) {
48848 + /* Continue as long as there are more extent units. */
48849 +
48850 + scan_index =
48851 + extent_unit_index(&coord) +
48852 + (reiser4_scanning_left(scan) ?
48853 + extent_unit_width(&coord) - 1 : 0);
48854 + goto repeat;
48855 + }
48856 +
48857 + if (0) {
48858 + stop_same_parent:
48859 +
48860 + /* If we are scanning left and we stop in the middle of an allocated
48861 +		   extent, we know the preceder immediately. */
48862 + /* middle of extent is (scan_index - unit_index) != 0. */
48863 + if (reiser4_scanning_left(scan) &&
48864 + (scan_index - unit_index) != 0) {
48865 + /* FIXME(B): Someone should step-through and verify that this preceder
48866 + calculation is indeed correct. */
48867 + /* @unit_start is starting block (number) of extent
48868 + unit. Flush stopped at the @scan_index block from
48869 + the beginning of the file, which is (scan_index -
48870 + unit_index) block within extent.
48871 + */
48872 + if (unit_start) {
48873 + /* skip preceder update when we are at hole */
48874 + scan->preceder_blk =
48875 + unit_start + scan_index - unit_index;
48876 + check_preceder(scan->preceder_blk);
48877 + }
48878 + }
48879 +
48880 + /* In this case, we leave coord set to the parent of scan->node. */
48881 + scan->stop = 1;
48882 +
48883 + } else {
48884 + /* In this case, we are still scanning, coord is set to the next item which is
48885 + either off-the-end of the node or not an extent. */
48886 + assert("jmacd-8912", scan->stop == 0);
48887 + assert("jmacd-7812",
48888 + (coord_is_after_sideof_unit(&coord, scan->direction)
48889 + || !item_is_extent(&coord)));
48890 + }
48891 +
48892 + ret = 0;
48893 + exit:
48894 + return ret;
48895 +}
48896 +
48897 +/* ask block allocator for some blocks */
48898 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
48899 + reiser4_block_nr wanted_count,
48900 + reiser4_block_nr *first_allocated,
48901 + reiser4_block_nr *allocated,
48902 + block_stage_t block_stage)
48903 +{
48904 + *allocated = wanted_count;
48905 + preceder->max_dist = 0; /* scan whole disk, if needed */
48906 +
48907 + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
48908 + preceder->block_stage = block_stage;
48909 +
48910 + /* FIXME: we do not handle errors here now */
48911 + check_me("vs-420",
48912 + reiser4_alloc_blocks(preceder, first_allocated, allocated,
48913 + BA_PERMANENT) == 0);
48914 + /* update flush_pos's preceder to last allocated block number */
48915 + preceder->blk = *first_allocated + *allocated - 1;
48916 +}
48917 +
48918 +/* when, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one
48919 +   unallocated extent will have to be replaced with a set of allocated extents. In this case insert_into_item will be
48920 +   called, which may have to add new nodes into the tree. Space for that is taken from the inviolable reserve (5%). */
48921 +static reiser4_block_nr reserve_replace(void)
48922 +{
48923 + reiser4_block_nr grabbed, needed;
48924 +
48925 + grabbed = get_current_context()->grabbed_blocks;
48926 + needed = estimate_one_insert_into_item(current_tree);
48927 + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
48928 + return grabbed;
48929 +}
48930 +
48931 +static void free_replace_reserved(reiser4_block_nr grabbed)
48932 +{
48933 + reiser4_context *ctx;
48934 +
48935 + ctx = get_current_context();
48936 + grabbed2free(ctx, get_super_private(ctx->super),
48937 + ctx->grabbed_blocks - grabbed);
48938 +}
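+
+/*
+ * The two helpers above are used as a bracketing pair around
+ * reiser4_replace_extent(), as in split_allocated_extent() and conv_extent()
+ * below:
+ *
+ *	grabbed = reserve_replace();
+ *	result = reiser4_replace_extent(h, return_inserted_position);
+ *	free_replace_reserved(grabbed);
+ */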
48939 +
48940 +/* Block offset of first block addressed by unit */
48941 +__u64 extent_unit_index(const coord_t * item)
48942 +{
48943 + reiser4_key key;
48944 +
48945 + assert("vs-648", coord_is_existing_unit(item));
48946 + unit_key_by_coord(item, &key);
48947 + return get_key_offset(&key) >> current_blocksize_bits;
48948 +}
48949 +
48950 +/* AUDIT shouldn't the return value be of reiser4_block_nr type?
48951 + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
48952 +__u64 extent_unit_width(const coord_t * item)
48953 +{
48954 + assert("vs-649", coord_is_existing_unit(item));
48955 + return width_by_coord(item);
48956 +}
48957 +
48958 +/* Starting block location of this unit */
48959 +static reiser4_block_nr extent_unit_start(const coord_t * item)
48960 +{
48961 + return extent_get_start(extent_by_coord(item));
48962 +}
48963 +
48964 +/**
48965 + * split_allocated_extent -
48966 + * @coord:
48967 + * @pos_in_unit:
48968 + *
48969 + * replace allocated extent with two allocated extents
48970 + */
48971 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
48972 +{
48973 + int result;
48974 + struct replace_handle *h;
48975 + reiser4_extent *ext;
48976 + reiser4_block_nr grabbed;
48977 +
48978 + ext = extent_by_coord(coord);
48979 + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
48980 + assert("vs-1411", extent_get_width(ext) > pos_in_unit);
48981 +
48982 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
48983 + if (h == NULL)
48984 + return RETERR(-ENOMEM);
48985 + h->coord = coord;
48986 + h->lh = znode_lh(coord->node);
48987 + h->pkey = &h->key;
48988 + unit_key_by_coord(coord, h->pkey);
48989 + set_key_offset(h->pkey,
48990 + (get_key_offset(h->pkey) +
48991 + pos_in_unit * current_blocksize));
48992 + reiser4_set_extent(&h->overwrite, extent_get_start(ext),
48993 + pos_in_unit);
48994 + reiser4_set_extent(&h->new_extents[0],
48995 + extent_get_start(ext) + pos_in_unit,
48996 + extent_get_width(ext) - pos_in_unit);
48997 + h->nr_new_extents = 1;
48998 + h->flags = COPI_DONT_SHIFT_LEFT;
48999 + h->paste_key = h->key;
49000 +
49001 + /* reserve space for extent unit paste, @grabbed is reserved before */
49002 + grabbed = reserve_replace();
49003 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49004 + extent */);
49005 + /* restore reserved */
49006 + free_replace_reserved(grabbed);
49007 + kfree(h);
49008 + return result;
49009 +}
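+
+/*
+ * Illustration: splitting an allocated extent [start = 200, width = 10] at
+ * pos_in_unit = 4 overwrites it with [200, 4] and inserts [204, 6] after it,
+ * leaving @coord set to the overwritten part.
+ */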
49010 +
49011 +/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
49012 + one). Return 1 if it succeeded, 0 - otherwise */
49013 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49014 + reiser4_extent *replace)
49015 +{
49016 + assert("vs-1415", extent_by_coord(coord) == ext);
49017 +
49018 + if (coord->unit_pos == 0
49019 + || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49020 +		/* previous extent either does not exist or is not an allocated extent */
49021 + return 0;
49022 + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49023 + extent_get_start(replace))
49024 + return 0;
49025 +
49026 + /* we can glue, widen previous unit */
49027 + extent_set_width(ext - 1,
49028 + extent_get_width(ext - 1) + extent_get_width(replace));
49029 +
49030 + if (extent_get_width(ext) != extent_get_width(replace)) {
49031 + /* make current extent narrower */
49032 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
49033 + extent_set_start(ext,
49034 + extent_get_start(ext) +
49035 + extent_get_width(replace));
49036 + extent_set_width(ext,
49037 + extent_get_width(ext) -
49038 + extent_get_width(replace));
49039 + } else {
49040 + /* current extent completely glued with its left neighbor, remove it */
49041 + coord_t from, to;
49042 +
49043 + coord_dup(&from, coord);
49044 + from.unit_pos = nr_units_extent(coord) - 1;
49045 + coord_dup(&to, &from);
49046 +
49047 +		/* currently a cut from an extent item can only cut from the beginning or from the end, so move the
49048 +		   unit being removed to the end of the item first */
49049 + memmove(ext, ext + 1,
49050 + (from.unit_pos -
49051 + coord->unit_pos) * sizeof(reiser4_extent));
49052 + /* wipe part of item which is going to be cut, so that node_check will not be confused */
49053 + cut_node_content(&from, &to, NULL, NULL, NULL);
49054 + }
49055 + znode_make_dirty(coord->node);
49056 + /* move coord back */
49057 + coord->unit_pos--;
49058 + return 1;
49059 +}
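+
+/*
+ * Illustration: with a left neighbor allocated as [start = 100, width = 5]
+ * and @replace = [start = 105, width = 2] replacing a current unit of width
+ * 3, the neighbor is widened to [100, 7] and the current unit is narrowed to
+ * width 1; had the widths been equal, the current unit would be removed
+ * instead.
+ */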
49060 +
49061 +/**
49062 + * conv_extent - replace extent with up to two extents
49063 + * @coord: coordinate of extent to be replaced
49064 + * @replace: extent to overwrite the one @coord is set to
49065 + *
49066 + * Overwrites the extent @coord is set to and pastes one extent unit after the
49067 + * overwritten one if @replace is shorter than the initial extent
49068 + */
49069 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
49070 +{
49071 + int result;
49072 + struct replace_handle *h;
49073 + reiser4_extent *ext;
49074 + reiser4_block_nr start, width, new_width;
49075 + reiser4_block_nr grabbed;
49076 + extent_state state;
49077 +
49078 + ext = extent_by_coord(coord);
49079 + state = state_of_extent(ext);
49080 + start = extent_get_start(ext);
49081 + width = extent_get_width(ext);
49082 + new_width = extent_get_width(replace);
49083 +
49084 + assert("vs-1458", (state == UNALLOCATED_EXTENT ||
49085 + state == ALLOCATED_EXTENT));
49086 + assert("vs-1459", width >= new_width);
49087 +
49088 + if (try_to_merge_with_left(coord, ext, replace)) {
49089 + /* merged @replace with left neighbor. Current unit is either
49090 + removed or narrowed */
49091 + return 0;
49092 + }
49093 +
49094 + if (width == new_width) {
49095 + /* replace current extent with @replace */
49096 + *ext = *replace;
49097 + znode_make_dirty(coord->node);
49098 + return 0;
49099 + }
49100 +
49101 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49102 + if (h == NULL)
49103 + return RETERR(-ENOMEM);
49104 + h->coord = coord;
49105 + h->lh = znode_lh(coord->node);
49106 + h->pkey = &h->key;
49107 + unit_key_by_coord(coord, h->pkey);
49108 + set_key_offset(h->pkey,
49109 + (get_key_offset(h->pkey) + new_width * current_blocksize));
49110 + h->overwrite = *replace;
49111 +
49112 + /* replace @ext with @replace and padding extent */
49113 + reiser4_set_extent(&h->new_extents[0],
49114 + (state == ALLOCATED_EXTENT) ?
49115 + (start + new_width) :
49116 + UNALLOCATED_EXTENT_START,
49117 + width - new_width);
49118 + h->nr_new_extents = 1;
49119 + h->flags = COPI_DONT_SHIFT_LEFT;
49120 + h->paste_key = h->key;
49121 +
49122 + /* reserve space for extent unit paste, @grabbed is reserved before */
49123 + grabbed = reserve_replace();
49124 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49125 + extent */);
49126 +
49127 + /* restore reserved */
49128 + free_replace_reserved(grabbed);
49129 + kfree(h);
49130 + return result;
49131 +}
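+
+/*
+ * A sketch of conv_extent with made-up numbers: when no merge with the left
+ * neighbor is possible, converting the allocated extent [start = 200,
+ * width = 10] with @replace = [500, 4] overwrites the unit with [500, 4]
+ * and pastes [204, 6] after it; for an unallocated extent the pasted
+ * remainder would be [UNALLOCATED_EXTENT_START, 6].
+ */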
49132 +
49133 +/**
49134 + * assign_real_blocknrs
49135 + * @flush_pos: flush position
49136 + * @oid: objectid of the file whose jnodes get block numbers assigned
49137 + * @index: index of the first jnode in the range
49138 + * @count: number of jnodes to assign block numbers to
49139 + * @first: start of allocated block range
49140 + *
49141 + * Assigns consecutive block numbers starting at @first to each of @count
49142 + * jnodes, beginning at index @index. Jnodes are looked up with jlookup.
49143 + */
49144 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
49145 + unsigned long index, reiser4_block_nr count,
49146 + reiser4_block_nr first)
49147 +{
49148 + unsigned long i;
49149 + reiser4_tree *tree;
49150 + txn_atom *atom;
49151 + int nr;
49152 +
49153 + atom = atom_locked_by_fq(flush_pos->fq);
49154 + assert("vs-1468", atom);
49155 + BUG_ON(atom == NULL);
49156 +
49157 + nr = 0;
49158 + tree = current_tree;
49159 + for (i = 0; i < count; ++i, ++index) {
49160 + jnode *node;
49161 +
49162 + node = jlookup(tree, oid, index);
49163 + assert("", node != NULL);
49164 + BUG_ON(node == NULL);
49165 +
49166 + spin_lock_jnode(node);
49167 + assert("", !jnode_is_flushprepped(node));
49168 + assert("vs-1475", node->atom == atom);
49169 + assert("vs-1476", atomic_read(&node->x_count) > 0);
49170 +
49171 + JF_CLR(node, JNODE_FLUSH_RESERVED);
49172 + jnode_set_block(node, &first);
49173 + unformatted_make_reloc(node, flush_pos->fq);
49174 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
49175 + FQ_LIST, 0));
49176 + spin_unlock_jnode(node);
49177 + first++;
49178 +
49179 + atomic_dec(&node->x_count);
49180 +		nr++;
49181 + }
49182 +
49183 + spin_unlock_atom(atom);
49184 + return;
49185 +}
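+
+/*
+ * For example (hypothetical values): a call with @index = 7, @count = 3 and
+ * @first = 5000 assigns block numbers 5000, 5001 and 5002 to the jnodes
+ * with indices 7, 8 and 9 of file @oid, moving each of them to the reloc
+ * set.
+ */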
49186 +
49187 +/**
49188 + * make_node_ovrwr - assign node to overwrite set
49189 + * @jnodes: overwrite set list head
49190 + * @node: jnode to add to the overwrite set
49191 + *
49192 + * Sets the OVRWR jnode state bit and puts @node at the end of the list head
49193 + * @jnodes, which accumulates nodes before they get moved to the overwrite
49194 + * set list of the atom.
49195 + */
49196 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
49197 +{
49198 + spin_lock_jnode(node);
49199 +
49200 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
49201 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
49202 +
49203 + JF_SET(node, JNODE_OVRWR);
49204 + list_move_tail(&node->capture_link, jnodes);
49205 + ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
49206 +
49207 + spin_unlock_jnode(node);
49208 +}
49209 +
49210 +/**
49211 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
49212 + * @flush_pos: flush position
49213 + * @oid: objectid of file jnodes belong to
49214 + * @index: starting index
49215 + * @width: extent width
49216 + *
49217 + * Puts nodes of one extent (file objectid @oid, extent width @width) into the
49218 + * atom's overwrite set, starting from the one with index @index. If the end of
49219 + * the slum is detected (a node is not found or is flushprepped), iteration
49220 + * stops and the flush position's state is set to POS_INVALID.
49221 + */
49222 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
49223 + unsigned long index, reiser4_block_nr width)
49224 +{
49225 + unsigned long i;
49226 + reiser4_tree *tree;
49227 + jnode *node;
49228 + txn_atom *atom;
49229 + LIST_HEAD(jnodes);
49230 +
49231 + tree = current_tree;
49232 +
49233 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49234 + assert("vs-1478", atom);
49235 +
49236 + for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
49237 + node = jlookup(tree, oid, index);
49238 + if (!node) {
49239 + flush_pos->state = POS_INVALID;
49240 + break;
49241 + }
49242 + if (jnode_check_flushprepped(node)) {
49243 + flush_pos->state = POS_INVALID;
49244 + atomic_dec(&node->x_count);
49245 + break;
49246 + }
49247 + if (node->atom != atom) {
49248 + flush_pos->state = POS_INVALID;
49249 + atomic_dec(&node->x_count);
49250 + break;
49251 + }
49252 + make_node_ovrwr(&jnodes, node);
49253 + atomic_dec(&node->x_count);
49254 + }
49255 +
49256 + list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
49257 + spin_unlock_atom(atom);
49258 +}
49259 +
49260 +/**
49261 + * allocated_extent_slum_size
49262 + * @flush_pos: flush position
49263 + * @oid: objectid of the file the jnodes belong to
49264 + * @index: index of the first jnode to check
49265 + * @count: maximal number of jnodes to check
49266 + *
49267 + * Returns number of consecutive not-flushprepped jnodes starting at @index.
49268 + */
49269 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
49270 + unsigned long index, unsigned long count)
49271 +{
49272 + unsigned long i;
49273 + reiser4_tree *tree;
49274 + txn_atom *atom;
49275 + int nr;
49276 +
49277 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49278 + assert("vs-1468", atom);
49279 +
49280 + nr = 0;
49281 + tree = current_tree;
49282 + for (i = 0; i < count; ++i, ++index) {
49283 + jnode *node;
49284 +
49285 + node = jlookup(tree, oid, index);
49286 + if (!node)
49287 + break;
49288 +
49289 + if (jnode_check_flushprepped(node)) {
49290 + atomic_dec(&node->x_count);
49291 + break;
49292 + }
49293 +
49294 + if (node->atom != atom) {
49295 + /*
49296 + * this is possible on overwrite: extent_write may
49297 + * capture several unformatted nodes without capturing
49298 + * any formatted nodes.
49299 + */
49300 + atomic_dec(&node->x_count);
49301 + break;
49302 + }
49303 +
49304 + assert("vs-1476", atomic_read(&node->x_count) > 1);
49305 + atomic_dec(&node->x_count);
49306 +		nr++;
49307 + }
49308 +
49309 + spin_unlock_atom(atom);
49310 + return nr;
49311 +}
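+
+/*
+ * For example (made-up numbers): if the extent covers jnode indices 10..19
+ * and only jnodes 10..14 are still captured by the atom and not
+ * flushprepped, a call with @index = 10 and @count = 10 returns 5.
+ */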
49312 +
49313 +/**
49314 + * reiser4_alloc_extent
49315 + * @flush_pos: flush position
49316 + *
49317 + * This is called by handle_pos_on_twig to process the extent unit
49318 + * flush_pos->coord is set to. It prepares a sequence of not-flushprepped
49319 + * nodes (a slum) for flushing, assuming that the slum starts at position
49320 + * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
49321 + * set if flush_pos->leaf_relocate is set to 1, and to the overwrite set
49322 + * otherwise.
49323 + */
49324 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
49325 +{
49326 + coord_t *coord;
49327 + reiser4_extent *ext;
49328 + reiser4_extent replace_ext;
49329 + oid_t oid;
49330 + reiser4_block_nr protected;
49331 + reiser4_block_nr start;
49332 + __u64 index;
49333 + __u64 width;
49334 + extent_state state;
49335 + int result;
49336 + reiser4_block_nr first_allocated;
49337 + __u64 allocated;
49338 + reiser4_key key;
49339 + block_stage_t block_stage;
49340 +
49341 + assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
49342 + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
49343 + && item_is_extent(&flush_pos->coord));
49344 +
49345 + coord = &flush_pos->coord;
49346 +
49347 + ext = extent_by_coord(coord);
49348 + state = state_of_extent(ext);
49349 + if (state == HOLE_EXTENT) {
49350 + flush_pos->state = POS_INVALID;
49351 + return 0;
49352 + }
49353 +
49354 + item_key_by_coord(coord, &key);
49355 + oid = get_key_objectid(&key);
49356 + index = extent_unit_index(coord) + flush_pos->pos_in_unit;
49357 + start = extent_get_start(ext);
49358 + width = extent_get_width(ext);
49359 +
49360 + assert("vs-1457", width > flush_pos->pos_in_unit);
49361 +
49362 + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
49363 + /* relocate */
49364 + if (flush_pos->pos_in_unit) {
49365 + /* split extent unit into two */
49366 + result =
49367 + split_allocated_extent(coord,
49368 + flush_pos->pos_in_unit);
49369 + flush_pos->pos_in_unit = 0;
49370 + return result;
49371 + }
49372 +
49373 + /* limit number of nodes to allocate */
49374 + if (flush_pos->nr_to_write < width)
49375 + width = flush_pos->nr_to_write;
49376 +
49377 + if (state == ALLOCATED_EXTENT) {
49378 + /*
49379 +			 * none of the protected nodes is flushprepped,
49380 +			 * therefore they are all counted as flush_reserved
49381 + */
49382 + block_stage = BLOCK_FLUSH_RESERVED;
49383 + protected = allocated_extent_slum_size(flush_pos, oid,
49384 + index, width);
49385 + if (protected == 0) {
49386 + flush_pos->state = POS_INVALID;
49387 + flush_pos->pos_in_unit = 0;
49388 + return 0;
49389 + }
49390 + } else {
49391 + block_stage = BLOCK_UNALLOCATED;
49392 + protected = width;
49393 + }
49394 +
49395 + /*
49396 + * look at previous unit if possible. If it is allocated, make
49397 + * preceder more precise
49398 + */
49399 + if (coord->unit_pos &&
49400 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49401 + reiser4_pos_hint(flush_pos)->blk =
49402 + extent_get_start(ext - 1) +
49403 + extent_get_width(ext - 1);
49404 +
49405 + /* allocate new block numbers for protected nodes */
49406 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49407 + protected,
49408 + &first_allocated, &allocated,
49409 + block_stage);
49410 +
49411 + if (state == ALLOCATED_EXTENT)
49412 + /*
49413 + * on relocating - free nodes which are going to be
49414 + * relocated
49415 + */
49416 + reiser4_dealloc_blocks(&start, &allocated,
49417 + BLOCK_ALLOCATED, BA_DEFER);
49418 +
49419 + /* assign new block numbers to protected nodes */
49420 + assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
49421 +
49422 + /* prepare extent which will replace current one */
49423 + reiser4_set_extent(&replace_ext, first_allocated, allocated);
49424 +
49425 + /* adjust extent item */
49426 + result = conv_extent(coord, &replace_ext);
49427 + if (result != 0 && result != -ENOMEM) {
49428 + warning("vs-1461",
49429 + "Failed to allocate extent. Should not happen\n");
49430 + return result;
49431 + }
49432 +
49433 + /*
49434 + * break flush: we prepared for flushing as many blocks as we
49435 + * were asked for
49436 + */
49437 + if (flush_pos->nr_to_write == allocated)
49438 + flush_pos->state = POS_INVALID;
49439 + } else {
49440 + /* overwrite */
49441 + mark_jnodes_overwrite(flush_pos, oid, index, width);
49442 + }
49443 + flush_pos->pos_in_unit = 0;
49444 + return 0;
49445 +}
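+
+/*
+ * The relocate branch above thus runs, in order: split_allocated_extent()
+ * when the slum starts mid-unit, allocated_extent_slum_size() to size the
+ * slum of an already allocated extent, extent_allocate_blocks() to get new
+ * blocks, reiser4_dealloc_blocks() to defer-free the old ones, then
+ * assign_real_blocknrs() and conv_extent() to renumber the jnodes and
+ * rewrite the extent unit itself.
+ */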
49446 +
49447 +/* return 0 if @key is glueable to the item @coord is set to, 1 otherwise */
49448 +static int must_insert(const coord_t *coord, const reiser4_key *key)
49449 +{
49450 + reiser4_key last;
49451 +
49452 + if (item_id_by_coord(coord) == EXTENT_POINTER_ID
49453 + && keyeq(append_key_extent(coord, &last), key))
49454 + return 0;
49455 + return 1;
49456 +}
49457 +
49458 +/* copy extent @copy_ext to the end of @node. This may either insert a new item after the last one, append the last
49459 +   item, or widen the last unit of the last item */
49460 +static int put_unit_to_end(znode *node, const reiser4_key *key,
49461 + reiser4_extent *copy_ext)
49462 +{
49463 + int result;
49464 + coord_t coord;
49465 + cop_insert_flag flags;
49466 + reiser4_extent *last_ext;
49467 + reiser4_item_data data;
49468 +
49469 + /* set coord after last unit in an item */
49470 + coord_init_last_unit(&coord, node);
49471 + coord.between = AFTER_UNIT;
49472 +
49473 + flags =
49474 + COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
49475 + if (must_insert(&coord, key)) {
49476 + result =
49477 + insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
49478 + key, NULL /*lh */ , flags);
49479 +
49480 + } else {
49481 + /* try to glue with last unit */
49482 + last_ext = extent_by_coord(&coord);
49483 + if (state_of_extent(last_ext) &&
49484 + extent_get_start(last_ext) + extent_get_width(last_ext) ==
49485 + extent_get_start(copy_ext)) {
49486 + /* widen last unit of node */
49487 + extent_set_width(last_ext,
49488 + extent_get_width(last_ext) +
49489 + extent_get_width(copy_ext));
49490 + znode_make_dirty(node);
49491 + return 0;
49492 + }
49493 +
49494 + /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
49495 + result =
49496 + insert_into_item(&coord, NULL /*lh */ , key,
49497 + init_new_extent(&data, copy_ext, 1),
49498 + flags);
49499 + }
49500 +
49501 + assert("vs-438", result == 0 || result == -E_NODE_FULL);
49502 + return result;
49503 +}
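+
+/*
+ * For example (hypothetical numbers): if the last unit of @node is
+ * [start = 100, width = 4] and @copy_ext is [104, 2], the unit is simply
+ * widened to [100, 6]; otherwise @copy_ext is inserted or pasted as a new
+ * unit, and -E_NODE_FULL is returned when the node has no room for it.
+ */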
49504 +
49505 +/* @coord is set to extent unit */
49506 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
49507 + flush_pos_t *flush_pos,
49508 + reiser4_key *stop_key)
49509 +{
49510 + reiser4_extent *ext;
49511 + __u64 index;
49512 + __u64 width;
49513 + reiser4_block_nr start;
49514 + extent_state state;
49515 + oid_t oid;
49516 + reiser4_block_nr first_allocated;
49517 + __u64 allocated;
49518 + __u64 protected;
49519 + reiser4_extent copy_extent;
49520 + reiser4_key key;
49521 + int result;
49522 + block_stage_t block_stage;
49523 +
49524 + assert("vs-1457", flush_pos->pos_in_unit == 0);
49525 + assert("vs-1467", coord_is_leftmost_unit(coord));
49526 + assert("vs-1467", item_is_extent(coord));
49527 +
49528 + ext = extent_by_coord(coord);
49529 + index = extent_unit_index(coord);
49530 + start = extent_get_start(ext);
49531 + width = extent_get_width(ext);
49532 + state = state_of_extent(ext);
49533 + unit_key_by_coord(coord, &key);
49534 + oid = get_key_objectid(&key);
49535 +
49536 + if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
49537 + (state == UNALLOCATED_EXTENT)) {
49538 + /* relocate */
49539 + if (state == ALLOCATED_EXTENT) {
49540 +			/* none of the protected nodes is flushprepped,
49541 +			 * therefore they are all counted as flush_reserved */
49542 + block_stage = BLOCK_FLUSH_RESERVED;
49543 + protected = allocated_extent_slum_size(flush_pos, oid,
49544 + index, width);
49545 + if (protected == 0) {
49546 + flush_pos->state = POS_INVALID;
49547 + flush_pos->pos_in_unit = 0;
49548 + return 0;
49549 + }
49550 + } else {
49551 + block_stage = BLOCK_UNALLOCATED;
49552 + protected = width;
49553 + }
49554 +
49555 + /*
49556 + * look at previous unit if possible. If it is allocated, make
49557 + * preceder more precise
49558 + */
49559 + if (coord->unit_pos &&
49560 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49561 + reiser4_pos_hint(flush_pos)->blk =
49562 + extent_get_start(ext - 1) +
49563 + extent_get_width(ext - 1);
49564 +
49565 + /* allocate new block numbers for protected nodes */
49566 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49567 + protected,
49568 + &first_allocated, &allocated,
49569 + block_stage);
49570 +
49571 + /* prepare extent which will be copied to left */
49572 + reiser4_set_extent(&copy_extent, first_allocated, allocated);
49573 +
49574 + result = put_unit_to_end(left, &key, &copy_extent);
49575 + if (result == -E_NODE_FULL) {
49576 + int target_block_stage;
49577 +
49578 + /* free blocks which were just allocated */
49579 + target_block_stage =
49580 + (state ==
49581 + ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
49582 + BLOCK_UNALLOCATED;
49583 + reiser4_dealloc_blocks(&first_allocated, &allocated,
49584 + target_block_stage,
49585 + BA_PERMANENT);
49586 +
49587 + /* rewind the preceder. */
49588 + flush_pos->preceder.blk = first_allocated;
49589 + check_preceder(flush_pos->preceder.blk);
49590 +
49591 + return SQUEEZE_TARGET_FULL;
49592 + }
49593 +
49594 + if (state == ALLOCATED_EXTENT) {
49595 + /* free nodes which were relocated */
49596 + reiser4_dealloc_blocks(&start, &allocated,
49597 + BLOCK_ALLOCATED, BA_DEFER);
49598 + }
49599 +
49600 + /* assign new block numbers to protected nodes */
49601 + assign_real_blocknrs(flush_pos, oid, index, allocated,
49602 + first_allocated);
49603 +
49604 + set_key_offset(&key,
49605 + get_key_offset(&key) +
49606 + (allocated << current_blocksize_bits));
49607 + } else {
49608 + /*
49609 +		 * overwrite: try to copy the unit as it is to the left
49610 +		 * neighbor and make all leading not-flushprepped nodes
49611 +		 * overwrite nodes
49611 + */
49612 + reiser4_set_extent(&copy_extent, start, width);
49613 + result = put_unit_to_end(left, &key, &copy_extent);
49614 + if (result == -E_NODE_FULL)
49615 + return SQUEEZE_TARGET_FULL;
49616 +
49617 + if (state != HOLE_EXTENT)
49618 + mark_jnodes_overwrite(flush_pos, oid, index, width);
49619 + set_key_offset(&key,
49620 + get_key_offset(&key) +
49621 + (width << current_blocksize_bits));
49622 + }
49623 + *stop_key = key;
49624 + return SQUEEZE_CONTINUE;
49625 +}
49626 +
49627 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
49628 +{
49629 + return key_by_inode_and_offset_common(inode, off, key);
49630 +}
49631 +
49632 +/*
49633 + * Local variables:
49634 + * c-indentation-style: "K&R"
49635 + * mode-name: "LC"
49636 + * c-basic-offset: 8
49637 + * tab-width: 8
49638 + * fill-column: 79
49639 + * scroll-step: 1
49640 + * End:
49641 + */
49642 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent.h linux-2.6.22/fs/reiser4/plugin/item/extent.h
49643 --- linux-2.6.22.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
49644 +++ linux-2.6.22/fs/reiser4/plugin/item/extent.h 2007-07-29 00:25:34.968720289 +0400
49645 @@ -0,0 +1,231 @@
49646 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49647 +
49648 +#ifndef __REISER4_EXTENT_H__
49649 +#define __REISER4_EXTENT_H__
49650 +
49651 +/* on disk extent */
49652 +typedef struct {
49653 + reiser4_dblock_nr start;
49654 + reiser4_dblock_nr width;
49655 +} reiser4_extent;
49656 +
49657 +struct extent_stat {
49658 + int unallocated_units;
49659 + int unallocated_blocks;
49660 + int allocated_units;
49661 + int allocated_blocks;
49662 + int hole_units;
49663 + int hole_blocks;
49664 +};
49665 +
49666 +/* extents in an extent item can be holes, unallocated extents or allocated
49667 +   extents */
49668 +typedef enum {
49669 + HOLE_EXTENT,
49670 + UNALLOCATED_EXTENT,
49671 + ALLOCATED_EXTENT
49672 +} extent_state;
49673 +
49674 +#define HOLE_EXTENT_START 0
49675 +#define UNALLOCATED_EXTENT_START 1
49676 +#define UNALLOCATED_EXTENT_START2 2
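+
+/*
+ * An extent's state is encoded in its start field: the reserved values
+ * above denote holes and unallocated extents, while a start holding a real
+ * disk block number denotes an allocated extent of width consecutive
+ * blocks.
+ */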
49677 +
49678 +struct extent_coord_extension {
49679 + reiser4_block_nr pos_in_unit;
49680 + reiser4_block_nr width; /* width of current unit */
49681 + pos_in_node_t nr_units; /* number of units */
49682 + int ext_offset; /* offset from the beginning of zdata() */
49683 + unsigned long expected_page;
49684 +#if REISER4_DEBUG
49685 + reiser4_extent extent;
49686 +#endif
49687 +};
49688 +
49689 +/* macros to set/get fields of on-disk extent */
49690 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
49691 +{
49692 + return le64_to_cpu(ext->start);
49693 +}
49694 +
49695 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
49696 +{
49697 + return le64_to_cpu(ext->width);
49698 +}
49699 +
49700 +extern __u64 reiser4_current_block_count(void);
49701 +
49702 +static inline void
49703 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
49704 +{
49705 + cassert(sizeof(ext->start) == 8);
49706 + assert("nikita-2510",
49707 + ergo(start > 1, start < reiser4_current_block_count()));
49708 + put_unaligned(cpu_to_le64(start), &ext->start);
49709 +}
49710 +
49711 +static inline void
49712 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
49713 +{
49714 + cassert(sizeof(ext->width) == 8);
49715 + assert("", width > 0);
49716 + put_unaligned(cpu_to_le64(width), &ext->width);
49717 + assert("nikita-2511",
49718 + ergo(extent_get_start(ext) > 1,
49719 + extent_get_start(ext) + width <=
49720 + reiser4_current_block_count()));
49721 +}
49722 +
49723 +#define extent_item(coord) \
49724 +({ \
49725 + assert("nikita-3143", item_is_extent(coord)); \
49726 + ((reiser4_extent *)item_body_by_coord (coord)); \
49727 +})
49728 +
49729 +#define extent_by_coord(coord) \
49730 +({ \
49731 + assert("nikita-3144", item_is_extent(coord)); \
49732 + (extent_item (coord) + (coord)->unit_pos); \
49733 +})
49734 +
49735 +#define width_by_coord(coord) \
49736 +({ \
49737 + assert("nikita-3145", item_is_extent(coord)); \
49738 + extent_get_width (extent_by_coord(coord)); \
49739 +})
49740 +
49741 +struct carry_cut_data;
49742 +struct carry_kill_data;
49743 +
49744 +/* plugin->u.item.b.* */
49745 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
49746 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49747 + const reiser4_item_data *);
49748 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
49749 +pos_in_node_t nr_units_extent(const coord_t *);
49750 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
49751 +void init_coord_extent(coord_t *);
49752 +int init_extent(coord_t *, reiser4_item_data *);
49753 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
49754 +int can_shift_extent(unsigned free_space,
49755 + coord_t * source, znode * target, shift_direction,
49756 + unsigned *size, unsigned want);
49757 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
49758 + unsigned count, shift_direction where_is_free_space,
49759 + unsigned free_space);
49760 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
49761 + struct carry_kill_data *);
49762 +int create_hook_extent(const coord_t * coord, void *arg);
49763 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49764 + struct carry_cut_data *, reiser4_key * smallest_removed,
49765 + reiser4_key * new_first);
49766 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49767 + struct carry_kill_data *, reiser4_key * smallest_removed,
49768 + reiser4_key * new_first);
49769 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
49770 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
49771 +void print_extent(const char *, coord_t *);
49772 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
49773 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
49774 + reiser4_block_nr * block);
49775 +void item_stat_extent(const coord_t * coord, void *vp);
49776 +int reiser4_check_extent(const coord_t * coord, const char **error);
49777 +
49778 +/* plugin->u.item.s.file.* */
49779 +ssize_t reiser4_write_extent(struct file *, const char __user *,
49780 + size_t, loff_t *);
49781 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
49782 +int reiser4_readpage_extent(void *, struct page *);
49783 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
49784 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
49785 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
49786 +int get_block_address_extent(const coord_t *, sector_t block,
49787 + sector_t * result);
49788 +
49789 +/* these are used in flush.c
49790 + FIXME-VS: should they be somewhere in item_plugin? */
49791 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
49792 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
49793 + reiser4_key * stop_key);
49794 +
49795 +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
49796 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
49797 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
49798 +
49799 +/* plugin->u.item.f. */
49800 +int reiser4_scan_extent(flush_scan * scan);
49801 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
49802 +
49803 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
49804 + int nr_extents);
49805 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
49806 +extent_state state_of_extent(reiser4_extent * ext);
49807 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
49808 + reiser4_block_nr width);
49809 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
49810 + int *plugged_hole);
49811 +
49812 +#include "../../coord.h"
49813 +#include "../../lock.h"
49814 +#include "../../tap.h"
49815 +
49816 +struct replace_handle {
49817 + /* these are to be set before calling reiser4_replace_extent */
49818 + coord_t *coord;
49819 + lock_handle *lh;
49820 + reiser4_key key;
49821 + reiser4_key *pkey;
49822 + reiser4_extent overwrite;
49823 + reiser4_extent new_extents[2];
49824 + int nr_new_extents;
49825 + unsigned flags;
49826 +
49827 + /* these are used by reiser4_replace_extent */
49828 + reiser4_item_data item;
49829 + coord_t coord_after;
49830 + lock_handle lh_after;
49831 + tap_t watch;
49832 + reiser4_key paste_key;
49833 +#if REISER4_DEBUG
49834 + reiser4_extent orig_ext;
49835 + reiser4_key tmp;
49836 +#endif
49837 +};
49838 +
49839 +/* this structure is kmalloced before calling make_extent to avoid excessive
49840 + stack consumption on plug_hole->reiser4_replace_extent */
49841 +struct make_extent_handle {
49842 + uf_coord_t *uf_coord;
49843 + reiser4_block_nr blocknr;
49844 + int created;
49845 + struct inode *inode;
49846 + union {
49847 + struct {
49848 + } append;
49849 + struct replace_handle replace;
49850 + } u;
49851 +};
49852 +
49853 +int reiser4_replace_extent(struct replace_handle *,
49854 + int return_inserted_position);
49855 +lock_handle *znode_lh(znode *);
49856 +
49857 +/* the reiser4 repacker support */
49858 +struct repacker_cursor;
49859 +extern int process_extent_backward_for_repacking(tap_t *,
49860 + struct repacker_cursor *);
49861 +extern int mark_extent_for_repacking(tap_t *, int);
49862 +
49863 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
49864 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
49865 +
49866 +/* __REISER4_EXTENT_H__ */
49867 +#endif
49868 +/*
49869 + Local variables:
49870 + c-indentation-style: "K&R"
49871 + mode-name: "LC"
49872 + c-basic-offset: 8
49873 + tab-width: 8
49874 + fill-column: 120
49875 + End:
49876 +*/
49877 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.22/fs/reiser4/plugin/item/extent_item_ops.c
49878 --- linux-2.6.22.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
49879 +++ linux-2.6.22/fs/reiser4/plugin/item/extent_item_ops.c 2007-07-29 00:25:34.968720289 +0400
49880 @@ -0,0 +1,889 @@
49881 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49882 +
49883 +#include "item.h"
49884 +#include "../../inode.h"
49885 +#include "../../tree_walk.h" /* check_sibling_list() */
49886 +#include "../../page_cache.h"
49887 +#include "../../carry.h"
49888 +
49889 +#include <linux/quotaops.h>
49890 +
49891 +/* item_plugin->b.max_key_inside */
49892 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
49893 +{
49894 + item_key_by_coord(coord, key);
49895 + set_key_offset(key, get_key_offset(reiser4_max_key()));
49896 + return key;
49897 +}
49898 +
49899 +/* item_plugin->b.can_contain_key
49900 +   this checks whether @key of @data matches the position set by @coord */
49901 +int
49902 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49903 + const reiser4_item_data * data)
49904 +{
49905 + reiser4_key item_key;
49906 +
49907 + if (item_plugin_by_coord(coord) != data->iplug)
49908 + return 0;
49909 +
49910 + item_key_by_coord(coord, &item_key);
49911 + if (get_key_locality(key) != get_key_locality(&item_key) ||
49912 + get_key_objectid(key) != get_key_objectid(&item_key) ||
49913 + get_key_ordering(key) != get_key_ordering(&item_key))
49914 + return 0;
49915 +
49916 + return 1;
49917 +}
49918 +
49919 +/* item_plugin->b.mergeable
49920 + first item is of extent type */
49921 +/* Audited by: green(2002.06.13) */
49922 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
49923 +{
49924 + reiser4_key key1, key2;
49925 +
49926 + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
49927 + /* FIXME-VS: Which is it? Assert or return 0 */
49928 + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
49929 + return 0;
49930 + }
49931 +
49932 + item_key_by_coord(p1, &key1);
49933 + item_key_by_coord(p2, &key2);
49934 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
49935 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
49936 + get_key_ordering(&key1) != get_key_ordering(&key2) ||
49937 + get_key_type(&key1) != get_key_type(&key2))
49938 + return 0;
49939 + if (get_key_offset(&key1) +
49940 + reiser4_extent_size(p1, nr_units_extent(p1)) !=
49941 + get_key_offset(&key2))
49942 + return 0;
49943 + return 1;
49944 +}
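+
+/*
+ * A worked example with made-up numbers: an extent item whose key offset is
+ * 0 and whose units cover 5 blocks of 4096 bytes is mergeable with a
+ * following item with matching key fields only if that item's key offset is
+ * exactly 5 * 4096 = 20480, i.e. it continues the first item byte-wise.
+ */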
49945 +
49946 +/* item_plugin->b.nr_units */
49947 +pos_in_node_t nr_units_extent(const coord_t * coord)
49948 +{
49949 +	/* length of extent item has to be a multiple of extent size */
49950 + assert("vs-1424",
49951 + (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
49952 + return item_length_by_coord(coord) / sizeof(reiser4_extent);
49953 +}
49954 +
49955 +/* item_plugin->b.lookup */
49956 +lookup_result
49957 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
49958 + coord_t * coord)
49959 +{ /* znode and item_pos are
49960 + set to an extent item to
49961 + look through */
49962 + reiser4_key item_key;
49963 + reiser4_block_nr lookuped, offset;
49964 + unsigned i, nr_units;
49965 + reiser4_extent *ext;
49966 + unsigned blocksize;
49967 + unsigned char blocksize_bits;
49968 +
49969 + item_key_by_coord(coord, &item_key);
49970 + offset = get_key_offset(&item_key);
49971 +
49972 + /* key we are looking for must be greater than key of item @coord */
49973 + assert("vs-414", keygt(key, &item_key));
49974 +
49975 + assert("umka-99945",
49976 + !keygt(key, max_key_inside_extent(coord, &item_key)));
49977 +
49978 + ext = extent_item(coord);
49979 + assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
49980 +
49981 + blocksize = current_blocksize;
49982 + blocksize_bits = current_blocksize_bits;
49983 +
49984 + /* offset we are looking for */
49985 + lookuped = get_key_offset(key);
49986 +
49987 + nr_units = nr_units_extent(coord);
49988 +	/* go through all extents until the one which addresses the given offset */
49989 + for (i = 0; i < nr_units; i++, ext++) {
49990 + offset += (extent_get_width(ext) << blocksize_bits);
49991 + if (offset > lookuped) {
49992 + /* desired byte is somewhere in this extent */
49993 + coord->unit_pos = i;
49994 + coord->between = AT_UNIT;
49995 + return CBK_COORD_FOUND;
49996 + }
49997 + }
49998 +
49999 + /* set coord after last unit */
50000 + coord->unit_pos = nr_units - 1;
50001 + coord->between = AFTER_UNIT;
50002 + return CBK_COORD_FOUND;
50003 +}
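+
+/*
+ * A lookup sketch with made-up numbers: in an item with key offset 0 and
+ * units of widths [2, 3] on a 4096-byte blocksize, looking up offset 12288
+ * walks past the first unit (which ends at byte 8191) and stops at unit 1,
+ * since 8192 + 3 * 4096 = 20480 > 12288; coord->unit_pos becomes 1, AT_UNIT.
+ */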
50004 +
50005 +/* item_plugin->b.paste
50006 +   the item @coord is set to has been extended by @data->length bytes of free
50007 +   space. data->data contains data to be pasted into the item at position
50008 + @coord->in_item.unit_pos. It must fit into that free space.
50009 + @coord must be set between units.
50010 +*/
50011 +int
50012 +paste_extent(coord_t * coord, reiser4_item_data * data,
50013 + carry_plugin_info * info UNUSED_ARG)
50014 +{
50015 + unsigned old_nr_units;
50016 + reiser4_extent *ext;
50017 + int item_length;
50018 +
50019 + ext = extent_item(coord);
50020 + item_length = item_length_by_coord(coord);
50021 + old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50022 +
50023 + /* this is also used to copy extent into newly created item, so
50024 + old_nr_units could be 0 */
50025 + assert("vs-260", item_length >= data->length);
50026 +
50027 + /* make sure that coord is set properly */
50028 + assert("vs-35",
50029 + ((!coord_is_existing_unit(coord))
50030 + || (!old_nr_units && !coord->unit_pos)));
50031 +
50032 + /* first unit to be moved */
50033 + switch (coord->between) {
50034 + case AFTER_UNIT:
50035 + coord->unit_pos++;
50036 + case BEFORE_UNIT:
50037 + coord->between = AT_UNIT;
50038 + break;
50039 + case AT_UNIT:
50040 + assert("vs-331", !old_nr_units && !coord->unit_pos);
50041 + break;
50042 + default:
50043 + impossible("vs-330", "coord is set improperly");
50044 + }
50045 +
50046 + /* prepare space for new units */
50047 + memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50048 + ext + coord->unit_pos,
50049 + (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50050 +
50051 + /* copy new data from kernel space */
50052 + assert("vs-556", data->user == 0);
50053 + memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50054 +
50055 + /* after paste @coord is set to first of pasted units */
50056 + assert("vs-332", coord_is_existing_unit(coord));
50057 + assert("vs-333",
50058 + !memcmp(data->data, extent_by_coord(coord),
50059 + (unsigned)data->length));
50060 + return 0;
50061 +}
50062 +
50063 +/* item_plugin->b.can_shift */
50064 +int
50065 +can_shift_extent(unsigned free_space, coord_t * source,
50066 + znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50067 + unsigned *size, unsigned want)
50068 +{
50069 + *size = item_length_by_coord(source);
50070 + if (*size > free_space)
50071 + /* never split a unit of extent item */
50072 + *size = free_space - free_space % sizeof(reiser4_extent);
50073 +
50074 +	/* we can shift *size bytes; calculate how many we actually want to shift */
50075 + if (*size > want * sizeof(reiser4_extent))
50076 + *size = want * sizeof(reiser4_extent);
50077 +
50078 + if (*size % sizeof(reiser4_extent) != 0)
50079 + impossible("vs-119", "Wrong extent size: %i %zd", *size,
50080 + sizeof(reiser4_extent));
50081 + return *size / sizeof(reiser4_extent);
50082 +
50083 +}
50084 +
50085 +/* item_plugin->b.copy_units */
50086 +void
50087 +copy_units_extent(coord_t * target, coord_t * source,
50088 + unsigned from, unsigned count,
50089 + shift_direction where_is_free_space, unsigned free_space)
50090 +{
50091 + char *from_ext, *to_ext;
50092 +
50093 + assert("vs-217", free_space == count * sizeof(reiser4_extent));
50094 +
50095 + from_ext = item_body_by_coord(source);
50096 + to_ext = item_body_by_coord(target);
50097 +
50098 + if (where_is_free_space == SHIFT_LEFT) {
50099 + assert("vs-215", from == 0);
50100 +
50101 + /* At this moment, item length was already updated in the item
50102 + header by shifting code, hence nr_units_extent() will
50103 + return "new" number of units---one we obtain after copying
50104 + units.
50105 + */
50106 + to_ext +=
50107 + (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50108 + } else {
50109 + reiser4_key key;
50110 + coord_t coord;
50111 +
50112 + assert("vs-216",
50113 + from + count == coord_last_unit_pos(source) + 1);
50114 +
50115 + from_ext += item_length_by_coord(source) - free_space;
50116 +
50117 + /* new units are inserted before first unit in an item,
50118 + therefore, we have to update item key */
50119 + coord = *source;
50120 + coord.unit_pos = from;
50121 + unit_key_extent(&coord, &key);
50122 +
50123 + node_plugin_by_node(target->node)->update_item_key(target, &key,
50124 + NULL /*info */);
50125 + }
50126 +
50127 + memcpy(to_ext, from_ext, free_space);
50128 +}
50129 +
50130 +/* item_plugin->b.create_hook
50131 + @arg is znode of leaf node for which we need to update right delimiting key */
50132 +int create_hook_extent(const coord_t * coord, void *arg)
50133 +{
50134 + coord_t *child_coord;
50135 + znode *node;
50136 + reiser4_key key;
50137 + reiser4_tree *tree;
50138 +
50139 + if (!arg)
50140 + return 0;
50141 +
50142 + child_coord = arg;
50143 + tree = znode_get_tree(coord->node);
50144 +
50145 + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50146 +
50147 + write_lock_tree(tree);
50148 + write_lock_dk(tree);
50149 +	/* find a node on the leaf level whose right delimiting key has to
50150 +	   be updated */
50151 + if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50152 + assert("vs-411", znode_is_left_connected(child_coord->node));
50153 + node = child_coord->node->left;
50154 + } else {
50155 + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50156 + node = child_coord->node;
50157 + assert("nikita-3314", node != NULL);
50158 + }
50159 +
50160 + if (node != NULL) {
50161 + znode_set_rd_key(node, item_key_by_coord(coord, &key));
50162 +
50163 + assert("nikita-3282", check_sibling_list(node));
50164 + /* break sibling links */
50165 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50166 + ON_DEBUG(node->right->left_version =
50167 + atomic_inc_return(&delim_key_version);
50168 + node->right_version =
50169 + atomic_inc_return(&delim_key_version););
50170 +
50171 + node->right->left = NULL;
50172 + node->right = NULL;
50173 + }
50174 + }
50175 + write_unlock_dk(tree);
50176 + write_unlock_tree(tree);
50177 + return 0;
50178 +}
50179 +
50180 +#define ITEM_TAIL_KILLED 0
50181 +#define ITEM_HEAD_KILLED 1
50182 +#define ITEM_KILLED 2
50183 +
50184 +/* item_plugin->b.kill_hook
50185 + this is called when @count units starting from @from-th one are going to be removed
50186 + */
50187 +int
50188 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50189 + struct carry_kill_data *kdata)
50190 +{
50191 + reiser4_extent *ext;
50192 + reiser4_block_nr start, length;
50193 + const reiser4_key *pfrom_key, *pto_key;
50194 + struct inode *inode;
50195 + reiser4_tree *tree;
50196 + pgoff_t from_off, to_off, offset, skip;
50197 + int retval;
50198 +
50199 + /* these are located in memory kmalloc-ed by kill_node_content */
50200 + reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50201 + coord_t *dup, *next;
50202 +
50203 + assert("zam-811", znode_is_write_locked(coord->node));
50204 + assert("nikita-3315", kdata != NULL);
50205 + assert("vs-34", kdata->buf != NULL);
50206 +
50207 + /* map structures to kdata->buf */
50208 + min_item_key = (reiser4_key *) (kdata->buf);
50209 + max_item_key = min_item_key + 1;
50210 + from_key = max_item_key + 1;
50211 + to_key = from_key + 1;
50212 + key = to_key + 1;
50213 + dup = (coord_t *) (key + 1);
50214 + next = dup + 1;
50215 +
50216 + item_key_by_coord(coord, min_item_key);
50217 + max_item_key_by_coord(coord, max_item_key);
50218 +
50219 + if (kdata->params.from_key) {
50220 + pfrom_key = kdata->params.from_key;
50221 + pto_key = kdata->params.to_key;
50222 + } else {
50223 + assert("vs-1549", from == coord->unit_pos);
50224 + unit_key_by_coord(coord, from_key);
50225 + pfrom_key = from_key;
50226 +
50227 + coord_dup(dup, coord);
50228 + dup->unit_pos = from + count - 1;
50229 + max_unit_key_by_coord(dup, to_key);
50230 + pto_key = to_key;
50231 + }
50232 +
50233 + if (!keylt(pto_key, max_item_key)) {
50234 + if (!keygt(pfrom_key, min_item_key)) {
50235 + znode *left, *right;
50236 +
50237 + /* item is to be removed completely */
50238 + assert("nikita-3316", kdata->left != NULL
50239 + && kdata->right != NULL);
50240 +
50241 + left = kdata->left->node;
50242 + right = kdata->right->node;
50243 +
50244 + tree = current_tree;
50245 + /* we have to do two things:
50246 + *
50247 + * 1. link left and right formatted neighbors of
50248 + * extent being removed, and
50249 + *
50250 + * 2. update their delimiting keys.
50251 + *
50252 + * atomicity of these operations is protected by
50253 + * taking dk-lock and tree-lock.
50254 + */
50255 + /* if neighbors of item being removed are znodes -
50256 + * link them */
50257 + write_lock_tree(tree);
50258 + write_lock_dk(tree);
50259 + link_left_and_right(left, right);
50260 + if (left) {
50261 + /* update right delimiting key of left
50262 + * neighbor of extent item */
50263 + /*coord_t next;
50264 + reiser4_key key; */
50265 +
50266 + coord_dup(next, coord);
50267 +
50268 + if (coord_next_item(next))
50269 + *key = *znode_get_rd_key(coord->node);
50270 + else
50271 + item_key_by_coord(next, key);
50272 + znode_set_rd_key(left, key);
50273 + }
50274 + write_unlock_dk(tree);
50275 + write_unlock_tree(tree);
50276 +
50277 + from_off =
50278 + get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50279 + to_off =
50280 + (get_key_offset(max_item_key) +
50281 + 1) >> PAGE_CACHE_SHIFT;
50282 + retval = ITEM_KILLED;
50283 + } else {
50284 + /* tail of item is to be removed */
50285 + from_off =
50286 + (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50287 + 1) >> PAGE_CACHE_SHIFT;
50288 + to_off =
50289 + (get_key_offset(max_item_key) +
50290 + 1) >> PAGE_CACHE_SHIFT;
50291 + retval = ITEM_TAIL_KILLED;
50292 + }
50293 + } else {
50294 + /* head of item is to be removed */
50295 + assert("vs-1571", keyeq(pfrom_key, min_item_key));
50296 + assert("vs-1572",
50297 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50298 + 0);
50299 + assert("vs-1573",
50300 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50301 + 1)) == 0);
50302 +
50303 + if (kdata->left->node) {
50304 + /* update right delimiting key of left neighbor of extent item */
50305 + /*reiser4_key key; */
50306 +
50307 + *key = *pto_key;
50308 + set_key_offset(key, get_key_offset(pto_key) + 1);
50309 +
50310 + write_lock_dk(current_tree);
50311 + znode_set_rd_key(kdata->left->node, key);
50312 + write_unlock_dk(current_tree);
50313 + }
50314 +
50315 + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50316 + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50317 + retval = ITEM_HEAD_KILLED;
50318 + }
50319 +
50320 + inode = kdata->inode;
50321 + assert("vs-1545", inode != NULL);
50322 + if (inode != NULL)
50323 + /* take care of pages and jnodes corresponding to part of item being killed */
50324 + reiser4_invalidate_pages(inode->i_mapping, from_off,
50325 + to_off - from_off,
50326 + kdata->params.truncate);
50327 +
50328 + ext = extent_item(coord) + from;
50329 + offset =
50330 + (get_key_offset(min_item_key) +
50331 + reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
50332 +
50333 + assert("vs-1551", from_off >= offset);
50334 + assert("vs-1552", from_off - offset <= extent_get_width(ext));
50335 + skip = from_off - offset;
50336 + offset = from_off;
50337 +
50338 + while (offset < to_off) {
50339 + length = extent_get_width(ext) - skip;
50340 + if (state_of_extent(ext) == HOLE_EXTENT) {
50341 + skip = 0;
50342 + offset += length;
50343 + ext++;
50344 + continue;
50345 + }
50346 +
50347 + if (offset + length > to_off) {
50348 + length = to_off - offset;
50349 + }
50350 +
50351 + DQUOT_FREE_BLOCK_NODIRTY(inode, length);
50352 +
50353 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50354 +			/* free fake-allocated space of jnodes corresponding to this unallocated extent */
50355 + fake_allocated2free(length, 0 /* unformatted */ );
50356 +
50357 + skip = 0;
50358 + offset += length;
50359 + ext++;
50360 + continue;
50361 + }
50362 +
50363 + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
50364 +
50365 + if (length != 0) {
50366 + start = extent_get_start(ext) + skip;
50367 +
50368 + /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
50369 + immediately */
50370 + reiser4_dealloc_blocks(&start, &length,
50371 + 0 /* not used */ ,
50372 + BA_DEFER
50373 + /* unformatted with defer */ );
50374 + }
50375 + skip = 0;
50376 + offset += length;
50377 + ext++;
50378 + }
50379 + return retval;
50380 +}
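+
+/*
+ * To summarize the hook above: it classifies the kill as ITEM_KILLED,
+ * ITEM_TAIL_KILLED or ITEM_HEAD_KILLED, fixes the delimiting keys of the
+ * neighbors, invalidates the affected pages, and then walks the killed
+ * units, defer-freeing allocated blocks and releasing fake-allocated ones.
+ */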
50381 +
50382 +/* item_plugin->b.kill_units */
50383 +int
50384 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50385 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
50386 + reiser4_key * new_first)
50387 +{
50388 + reiser4_extent *ext;
50389 + reiser4_key item_key;
50390 + pos_in_node_t count;
50391 + reiser4_key from_key, to_key;
50392 + const reiser4_key *pfrom_key, *pto_key;
50393 + loff_t off;
50394 + int result;
50395 +
50396 + assert("vs-1541",
50397 + ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
50398 + || (kdata->params.from_key != NULL
50399 + && kdata->params.to_key != NULL)));
50400 +
50401 + if (kdata->params.from_key) {
50402 + pfrom_key = kdata->params.from_key;
50403 + pto_key = kdata->params.to_key;
50404 + } else {
50405 + coord_t dup;
50406 +
50407 + /* calculate key range of kill */
50408 + assert("vs-1549", from == coord->unit_pos);
50409 + unit_key_by_coord(coord, &from_key);
50410 + pfrom_key = &from_key;
50411 +
50412 + coord_dup(&dup, coord);
50413 + dup.unit_pos = to;
50414 + max_unit_key_by_coord(&dup, &to_key);
50415 + pto_key = &to_key;
50416 + }
50417 +
50418 + item_key_by_coord(coord, &item_key);
50419 +
50420 +#if REISER4_DEBUG
50421 + {
50422 + reiser4_key max_item_key;
50423 +
50424 + max_item_key_by_coord(coord, &max_item_key);
50425 +
50426 + if (new_first) {
50427 + /* head of item is to be cut */
50428 + assert("vs-1542", keyeq(pfrom_key, &item_key));
50429 + assert("vs-1538", keylt(pto_key, &max_item_key));
50430 + } else {
50431 + /* tail of item is to be cut */
50432 + assert("vs-1540", keygt(pfrom_key, &item_key));
50433 + assert("vs-1543", !keylt(pto_key, &max_item_key));
50434 + }
50435 + }
50436 +#endif
50437 +
50438 + if (smallest_removed)
50439 + *smallest_removed = *pfrom_key;
50440 +
50441 + if (new_first) {
50442 + /* item head is cut. Item key will change. This new key is calculated here */
50443 + assert("vs-1556",
50444 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50445 + (PAGE_CACHE_SIZE - 1));
50446 + *new_first = *pto_key;
50447 + set_key_offset(new_first, get_key_offset(new_first) + 1);
50448 + }
50449 +
50450 + count = to - from + 1;
50451 + result = kill_hook_extent(coord, from, count, kdata);
50452 + if (result == ITEM_TAIL_KILLED) {
50453 + assert("vs-1553",
50454 + get_key_offset(pfrom_key) >=
50455 + get_key_offset(&item_key) +
50456 + reiser4_extent_size(coord, from));
50457 + off =
50458 + get_key_offset(pfrom_key) -
50459 + (get_key_offset(&item_key) +
50460 + reiser4_extent_size(coord, from));
50461 + if (off) {
50462 + /* unit @from is to be cut partially. Its width decreases */
50463 + ext = extent_item(coord) + from;
50464 + extent_set_width(ext,
50465 + (off + PAGE_CACHE_SIZE -
50466 + 1) >> PAGE_CACHE_SHIFT);
50467 + count--;
50468 + }
50469 + } else {
50470 + __u64 max_to_offset;
50471 + __u64 rest;
50472 +
50473 + assert("vs-1575", result == ITEM_HEAD_KILLED);
50474 + assert("", from == 0);
50475 + assert("",
50476 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50477 + 1)) == 0);
50478 + assert("",
50479 + get_key_offset(pto_key) + 1 >
50480 + get_key_offset(&item_key) +
50481 + reiser4_extent_size(coord, to));
50482 + max_to_offset =
50483 + get_key_offset(&item_key) +
50484 + reiser4_extent_size(coord, to + 1) - 1;
50485 + assert("", get_key_offset(pto_key) <= max_to_offset);
50486 +
50487 + rest =
50488 + (max_to_offset -
50489 + get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
50490 + if (rest) {
50491 + /* unit @to is to be cut partially */
50492 + ext = extent_item(coord) + to;
50493 +
50494 + assert("", extent_get_width(ext) > rest);
50495 +
50496 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50497 + extent_set_start(ext,
50498 + extent_get_start(ext) +
50499 + (extent_get_width(ext) -
50500 + rest));
50501 +
50502 + extent_set_width(ext, rest);
50503 + count--;
50504 + }
50505 + }
50506 + return count * sizeof(reiser4_extent);
50507 +}
50508 +
50509 +/* item_plugin->b.cut_units
50510 + this is too similar to kill_units_extent */
50511 +int
50512 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50513 + struct carry_cut_data *cdata, reiser4_key * smallest_removed,
50514 + reiser4_key * new_first)
50515 +{
50516 + reiser4_extent *ext;
50517 + reiser4_key item_key;
50518 + pos_in_node_t count;
50519 + reiser4_key from_key, to_key;
50520 + const reiser4_key *pfrom_key, *pto_key;
50521 + loff_t off;
50522 +
50523 + assert("vs-1541",
50524 + ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
50525 + || (cdata->params.from_key != NULL
50526 + && cdata->params.to_key != NULL)));
50527 +
50528 + if (cdata->params.from_key) {
50529 + pfrom_key = cdata->params.from_key;
50530 + pto_key = cdata->params.to_key;
50531 + } else {
50532 + coord_t dup;
50533 +
50534 + /* calculate key range of kill */
50535 + coord_dup(&dup, coord);
50536 + dup.unit_pos = from;
50537 + unit_key_by_coord(&dup, &from_key);
50538 +
50539 + dup.unit_pos = to;
50540 + max_unit_key_by_coord(&dup, &to_key);
50541 +
50542 + pfrom_key = &from_key;
50543 + pto_key = &to_key;
50544 + }
50545 +
50546 + assert("vs-1555",
50547 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
50548 + assert("vs-1556",
50549 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50550 + (PAGE_CACHE_SIZE - 1));
50551 +
50552 + item_key_by_coord(coord, &item_key);
50553 +
50554 +#if REISER4_DEBUG
50555 + {
50556 + reiser4_key max_item_key;
50557 +
50558 + assert("vs-1584",
50559 + get_key_locality(pfrom_key) ==
50560 + get_key_locality(&item_key));
50561 + assert("vs-1585",
50562 + get_key_type(pfrom_key) == get_key_type(&item_key));
50563 + assert("vs-1586",
50564 + get_key_objectid(pfrom_key) ==
50565 + get_key_objectid(&item_key));
50566 + assert("vs-1587",
50567 + get_key_ordering(pfrom_key) ==
50568 + get_key_ordering(&item_key));
50569 +
50570 + max_item_key_by_coord(coord, &max_item_key);
50571 +
50572 + if (new_first != NULL) {
50573 + /* head of item is to be cut */
50574 + assert("vs-1542", keyeq(pfrom_key, &item_key));
50575 + assert("vs-1538", keylt(pto_key, &max_item_key));
50576 + } else {
50577 + /* tail of item is to be cut */
50578 + assert("vs-1540", keygt(pfrom_key, &item_key));
50579 + assert("vs-1543", keyeq(pto_key, &max_item_key));
50580 + }
50581 + }
50582 +#endif
50583 +
50584 + if (smallest_removed)
50585 + *smallest_removed = *pfrom_key;
50586 +
50587 + if (new_first) {
50588 + /* item head is cut. Item key will change. This new key is calculated here */
50589 + *new_first = *pto_key;
50590 + set_key_offset(new_first, get_key_offset(new_first) + 1);
50591 + }
50592 +
50593 + count = to - from + 1;
50594 +
50595 + assert("vs-1553",
50596 + get_key_offset(pfrom_key) >=
50597 + get_key_offset(&item_key) + reiser4_extent_size(coord, from));
50598 + off =
50599 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50600 + reiser4_extent_size(coord, from));
50601 + if (off) {
50602 + /* tail of unit @from is to be cut partially. Its width decreases */
50603 + assert("vs-1582", new_first == NULL);
50604 + ext = extent_item(coord) + from;
50605 + extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
50606 + count--;
50607 + }
50608 +
50609 + assert("vs-1554",
50610 + get_key_offset(pto_key) <=
50611 + get_key_offset(&item_key) +
50612 + reiser4_extent_size(coord, to + 1) - 1);
50613 + off =
50614 + (get_key_offset(&item_key) +
50615 + reiser4_extent_size(coord, to + 1) - 1) -
50616 + get_key_offset(pto_key);
50617 + if (off) {
50618 + /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
50619 + and width decreased. */
50620 + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
50621 + ext = extent_item(coord) + to;
50622 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50623 + extent_set_start(ext,
50624 + extent_get_start(ext) +
50625 + (extent_get_width(ext) -
50626 + (off >> PAGE_CACHE_SHIFT)));
50627 +
50628 + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
50629 + count--;
50630 + }
50631 + return count * sizeof(reiser4_extent);
50632 +}
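+
+/*
+ * A cut sketch with made-up numbers (4096-byte blocks): if unit @from is
+ * [start = 300, width = 5] covering file bytes 0..20479 and the cut begins
+ * at byte 8192, the unit keeps its head and shrinks to [300, 2]; if instead
+ * the cut ends at byte 8191, the unit keeps its tail and becomes [302, 3].
+ */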
50633 +
50634 +/* item_plugin->b.unit_key */
50635 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
50636 +{
50637 + assert("vs-300", coord_is_existing_unit(coord));
50638 +
50639 + item_key_by_coord(coord, key);
50640 + set_key_offset(key,
50641 + (get_key_offset(key) +
50642 + reiser4_extent_size(coord, coord->unit_pos)));
50643 +
50644 + return key;
50645 +}
50646 +
50647 +/* item_plugin->b.max_unit_key */
50648 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
50649 +{
50650 + assert("vs-300", coord_is_existing_unit(coord));
50651 +
50652 + item_key_by_coord(coord, key);
50653 + set_key_offset(key,
50654 + (get_key_offset(key) +
50655 + reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
50656 + return key;
50657 +}
50658 +
50659 +/* item_plugin->b.estimate
50660 + item_plugin->b.item_data_by_flow */
50661 +
50662 +#if REISER4_DEBUG
50663 +
50664 +/* item_plugin->b.check
50665 +   used for debugging; every item should have here the most complete
50666 +   possible consistency check of the item that its author can
50667 +   construct
50668 +*/
50669 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
50670 + const char **error /* where to store error message */)
50671 +{
50672 + reiser4_extent *ext, *first;
50673 + unsigned i, j;
50674 + reiser4_block_nr start, width, blk_cnt;
50675 + unsigned num_units;
50676 + reiser4_tree *tree;
50677 + oid_t oid;
50678 + reiser4_key key;
50679 + coord_t scan;
50680 +
50681 + assert("vs-933", REISER4_DEBUG);
50682 +
50683 + if (znode_get_level(coord->node) != TWIG_LEVEL) {
50684 + *error = "Extent on the wrong level";
50685 + return -1;
50686 + }
50687 + if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
50688 + *error = "Wrong item size";
50689 + return -1;
50690 + }
50691 + ext = first = extent_item(coord);
50692 + blk_cnt = reiser4_block_count(reiser4_get_current_sb());
50693 + num_units = coord_num_units(coord);
50694 + tree = znode_get_tree(coord->node);
50695 + item_key_by_coord(coord, &key);
50696 + oid = get_key_objectid(&key);
50697 + coord_dup(&scan, coord);
50698 +
50699 + for (i = 0; i < num_units; ++i, ++ext) {
50700 + __u64 index;
50701 +
50702 + scan.unit_pos = i;
50703 + index = extent_unit_index(&scan);
50704 +
50705 +#if 0
50706 + /* check that all jnodes are present for the unallocated
50707 + * extent */
50708 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50709 + for (j = 0; j < extent_get_width(ext); j++) {
50710 + jnode *node;
50711 +
50712 + node = jlookup(tree, oid, index + j);
50713 + if (node == NULL) {
50714 + print_coord("scan", &scan, 0);
50715 + *error = "Jnode missing";
50716 + return -1;
50717 + }
50718 + jput(node);
50719 + }
50720 + }
50721 +#endif
50722 +
50723 + start = extent_get_start(ext);
50724 + if (start < 2)
50725 + continue;
50726 +		/* extent is an allocated one */
50727 + width = extent_get_width(ext);
50728 + if (start >= blk_cnt) {
50729 + *error = "Start too large";
50730 + return -1;
50731 + }
50732 + if (start + width > blk_cnt) {
50733 + *error = "End too large";
50734 + return -1;
50735 + }
50736 + /* make sure that this extent does not overlap with other
50737 +		   allocated extents */
50738 + for (j = 0; j < i; j++) {
50739 + if (state_of_extent(first + j) != ALLOCATED_EXTENT)
50740 + continue;
50741 + if (!
50742 + ((extent_get_start(ext) >=
50743 + extent_get_start(first + j) +
50744 + extent_get_width(first + j))
50745 + || (extent_get_start(ext) +
50746 + extent_get_width(ext) <=
50747 + extent_get_start(first + j)))) {
50748 + *error = "Extent overlaps with others";
50749 + return -1;
50750 + }
50751 + }
50752 +
50753 + }
50754 +
50755 + return 0;
50756 +}
50757 +
50758 +#endif /* REISER4_DEBUG */
50759 +
50760 +/*
50761 + Local variables:
50762 + c-indentation-style: "K&R"
50763 + mode-name: "LC"
50764 + c-basic-offset: 8
50765 + tab-width: 8
50766 + fill-column: 120
50767 + scroll-step: 1
50768 + End:
50769 +*/
50770 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/internal.c linux-2.6.22/fs/reiser4/plugin/item/internal.c
50771 --- linux-2.6.22.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
50772 +++ linux-2.6.22/fs/reiser4/plugin/item/internal.c 2007-07-29 00:25:34.968720289 +0400
50773 @@ -0,0 +1,396 @@
50774 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50775 +
50776 +/* Implementation of internal-item plugin methods. */
50777 +
50778 +#include "../../forward.h"
50779 +#include "../../debug.h"
50780 +#include "../../dformat.h"
50781 +#include "../../key.h"
50782 +#include "../../coord.h"
50783 +#include "internal.h"
50784 +#include "item.h"
50785 +#include "../node/node.h"
50786 +#include "../plugin.h"
50787 +#include "../../jnode.h"
50788 +#include "../../znode.h"
50789 +#include "../../tree_walk.h"
50790 +#include "../../tree_mod.h"
50791 +#include "../../tree.h"
50792 +#include "../../super.h"
50793 +#include "../../block_alloc.h"
50794 +
50795 +/* see internal.h for explanation */
50796 +
50797 +/* plugin->u.item.b.mergeable */
50798 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
50799 + const coord_t * p2 UNUSED_ARG /* second item */ )
50800 +{
50801 + /* internal items are not mergeable */
50802 + return 0;
50803 +}
50804 +
50805 +/* ->lookup() method for internal items */
50806 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
50807 + lookup_bias bias UNUSED_ARG /* lookup bias */ ,
50808 + coord_t * coord /* coord of item */ )
50809 +{
50810 + reiser4_key ukey;
50811 +
50812 + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
50813 + default:
50814 + impossible("", "keycmp()?!");
50815 + case LESS_THAN:
50816 +		/* FIXME-VS: AFTER_ITEM used to be here. But with the new coord the
50817 +		   item plugin cannot be obtained this way; falls through to EQUAL_TO */
50818 + assert("vs-681", coord->unit_pos == 0);
50819 + coord->between = AFTER_UNIT;
50820 + case EQUAL_TO:
50821 + return CBK_COORD_FOUND;
50822 + case GREATER_THAN:
50823 + return CBK_COORD_NOTFOUND;
50824 + }
50825 +}
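The switch in lookup_internal() deliberately falls through from LESS_THAN to EQUAL_TO: an internal item whose key is less than or equal to the search key still points at the child subtree that may contain it, so only a strictly greater key yields CBK_COORD_NOTFOUND. A toy model of that rule, with integer keys standing in for reiser4_key (hypothetical, not part of the patch):

#include <stdio.h>

/* An internal item whose key is <= the search key may lead to the
 * child holding it, so the lookup reports "found" and descends;
 * only a strictly greater key sends the search elsewhere. */
enum toy_result { TOY_FOUND, TOY_NOT_FOUND };

static enum toy_result toy_lookup_internal(long unit_key, long search_key)
{
	return unit_key <= search_key ? TOY_FOUND : TOY_NOT_FOUND;
}

int main(void)
{
	printf("%d\n", toy_lookup_internal(10, 15)); /* 0: descend here */
	printf("%d\n", toy_lookup_internal(20, 15)); /* 1: key too large */
	return 0;
}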
50826 +
50827 +/* return body of internal item at @coord */
50828 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
50829 + * item */ )
50830 +{
50831 + assert("nikita-607", coord != NULL);
50832 + assert("nikita-1650",
50833 + item_plugin_by_coord(coord) ==
50834 + item_plugin_by_id(NODE_POINTER_ID));
50835 + return (internal_item_layout *) item_body_by_coord(coord);
50836 +}
50837 +
50838 +void reiser4_update_internal(const coord_t * coord,
50839 + const reiser4_block_nr * blocknr)
50840 +{
50841 + internal_item_layout *item = internal_at(coord);
50842 + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
50843 +
50844 + put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
50845 +}
50846 +
50847 +/* return child block number stored in the internal item at @coord */
50848 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
50849 +{
50850 + assert("nikita-608", coord != NULL);
50851 + return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
50852 +}
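reiser4_update_internal() and pointer_at() above store the child pointer on disk as a little-endian 64-bit value, possibly unaligned, hence the put_unaligned(cpu_to_le64(...)) / le64_to_cpu(get_unaligned(...)) pairs. A userspace sketch of the same decode using explicit byte arithmetic instead of the kernel helpers (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Assemble a 64-bit value least-significant byte first, which is
 * what le64_to_cpu(get_unaligned(...)) does regardless of host
 * endianness or pointer alignment. */
static uint64_t le64_decode(const unsigned char *p)
{
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	unsigned char disk[8] = { 0x2a, 0, 0, 0, 0, 0, 0, 0 };

	printf("block %llu\n", (unsigned long long)le64_decode(disk)); /* 42 */
	return 0;
}

Assembling least-significant byte first gives the same result on any host, which is exactly what the kernel helpers guarantee.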
50853 +
50854 +/* get znode pointed to by internal @item */
50855 +static znode *znode_at(const coord_t * item /* coord of item */ ,
50856 + znode * parent /* parent node */ )
50857 +{
50858 + return child_znode(item, parent, 1, 0);
50859 +}
50860 +
50861 +/* store pointer from internal item into "block". Implementation of
50862 + ->down_link() method */
50863 +void down_link_internal(const coord_t * coord /* coord of item */ ,
50864 + const reiser4_key * key UNUSED_ARG /* key to get
50865 + * pointer for */ ,
50866 + reiser4_block_nr * block /* resulting block number */ )
50867 +{
50868 + ON_DEBUG(reiser4_key item_key);
50869 +
50870 + assert("nikita-609", coord != NULL);
50871 + assert("nikita-611", block != NULL);
50872 + assert("nikita-612", (key == NULL) ||
50873 + /* twig horrors */
50874 + (znode_get_level(coord->node) == TWIG_LEVEL)
50875 + || keyle(item_key_by_coord(coord, &item_key), key));
50876 +
50877 + *block = pointer_at(coord);
50878 + assert("nikita-2960", reiser4_blocknr_is_sane(block));
50879 +}
50880 +
50881 +/* Get the child's block number, or 0 if the block is unallocated. */
50882 +int
50883 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
50884 + reiser4_block_nr * block)
50885 +{
50886 + assert("jmacd-2059", coord != NULL);
50887 +
50888 + *block = pointer_at(coord);
50889 + assert("nikita-2961", reiser4_blocknr_is_sane(block));
50890 +
50891 + if (reiser4_blocknr_is_fake(block)) {
50892 + *block = 0;
50893 + }
50894 +
50895 + return 0;
50896 +}
50897 +
50898 +/* Return the child. */
50899 +int
50900 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
50901 + jnode ** childp)
50902 +{
50903 + reiser4_block_nr block = pointer_at(coord);
50904 + znode *child;
50905 +
50906 + assert("jmacd-2059", childp != NULL);
50907 + assert("nikita-2962", reiser4_blocknr_is_sane(&block));
50908 +
50909 + child = zlook(znode_get_tree(coord->node), &block);
50910 +
50911 + if (IS_ERR(child)) {
50912 + return PTR_ERR(child);
50913 + }
50914 +
50915 + *childp = ZJNODE(child);
50916 +
50917 + return 0;
50918 +}
50919 +
50920 +#if REISER4_DEBUG
50921 +
50922 +static void check_link(znode * left, znode * right)
50923 +{
50924 + znode *scan;
50925 +
50926 + for (scan = left; scan != right; scan = scan->right) {
50927 + if (ZF_ISSET(scan, JNODE_RIP))
50928 + break;
50929 + if (znode_is_right_connected(scan) && scan->right != NULL) {
50930 + if (ZF_ISSET(scan->right, JNODE_RIP))
50931 + break;
50932 + assert("nikita-3285",
50933 + znode_is_left_connected(scan->right));
50934 + assert("nikita-3265",
50935 + ergo(scan != left,
50936 + ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
50937 + assert("nikita-3284", scan->right->left == scan);
50938 + } else
50939 + break;
50940 + }
50941 +}
50942 +
50943 +int check__internal(const coord_t * coord, const char **error)
50944 +{
50945 + reiser4_block_nr blk;
50946 + znode *child;
50947 + coord_t cpy;
50948 +
50949 + blk = pointer_at(coord);
50950 + if (!reiser4_blocknr_is_sane(&blk)) {
50951 + *error = "Invalid pointer";
50952 + return -1;
50953 + }
50954 + coord_dup(&cpy, coord);
50955 + child = znode_at(&cpy, cpy.node);
50956 + if (child != NULL) {
50957 + znode *left_child;
50958 + znode *right_child;
50959 +
50960 + left_child = right_child = NULL;
50961 +
50962 + assert("nikita-3256", znode_invariant(child));
50963 + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
50964 + left_child = znode_at(&cpy, cpy.node);
50965 + if (left_child != NULL) {
50966 + read_lock_tree(znode_get_tree(child));
50967 + check_link(left_child, child);
50968 + read_unlock_tree(znode_get_tree(child));
50969 + zput(left_child);
50970 + }
50971 + }
50972 + coord_dup(&cpy, coord);
50973 + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
50974 + right_child = znode_at(&cpy, cpy.node);
50975 + if (right_child != NULL) {
50976 + read_lock_tree(znode_get_tree(child));
50977 + check_link(child, right_child);
50978 + read_unlock_tree(znode_get_tree(child));
50979 + zput(right_child);
50980 + }
50981 + }
50982 + zput(child);
50983 + }
50984 + return 0;
50985 +}
50986 +
50987 +#endif /* REISER4_DEBUG */
50988 +
50989 +/* return true only if this item really points to "block" */
50990 +/* Audited by: green(2002.06.14) */
50991 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
50992 + const reiser4_block_nr * block /* block number to
50993 + * check */ )
50994 +{
50995 + assert("nikita-613", coord != NULL);
50996 + assert("nikita-614", block != NULL);
50997 +
50998 + return pointer_at(coord) == *block;
50999 +}
51000 +
51001 +/* hook called by the ->create_item() method of the node plugin after a
51002 +   new internal item has been created.
51003 +
51004 +   This is the point where the pointer to the new node is inserted into the
51005 +   tree. Initialize the parent pointer in the child znode, insert the child
51006 +   into the sibling list and the slum.
51007 +*/
51008 +int create_hook_internal(const coord_t * item /* coord of item */ ,
51009 + void *arg /* child's left neighbor, if any */ )
51010 +{
51011 + znode *child;
51012 + __u64 child_ptr;
51013 +
51014 + assert("nikita-1252", item != NULL);
51015 + assert("nikita-1253", item->node != NULL);
51016 + assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51017 + assert("nikita-1450", item->unit_pos == 0);
51018 +
51019 + /*
51020 +	 * When preparing the insertion, build_child_ptr_data sets the
51021 +	 * pointer to the data to be inserted to the jnode's blocknr, which
51022 +	 * is in cpu byte order. The node's create_item simply copies those
51023 +	 * bytes, so the child pointer ends up in cpu byte order. Convert
51024 +	 * the content of the internal item to little endian byte order.
51025 + */
51026 + child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51027 + reiser4_update_internal(item, &child_ptr);
51028 +
51029 + child = znode_at(item, item->node);
51030 + if (child != NULL && !IS_ERR(child)) {
51031 + znode *left;
51032 + int result = 0;
51033 + reiser4_tree *tree;
51034 +
51035 + left = arg;
51036 + tree = znode_get_tree(item->node);
51037 + write_lock_tree(tree);
51038 + write_lock_dk(tree);
51039 + assert("nikita-1400", (child->in_parent.node == NULL)
51040 + || (znode_above_root(child->in_parent.node)));
51041 + ++item->node->c_count;
51042 + coord_to_parent_coord(item, &child->in_parent);
51043 + sibling_list_insert_nolock(child, left);
51044 +
51045 + assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51046 + ZF_CLR(child, JNODE_ORPHAN);
51047 +
51048 + if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51049 + znode_get_rd_key(child))) {
51050 + znode_set_rd_key(child, znode_get_rd_key(left));
51051 + }
51052 + write_unlock_dk(tree);
51053 + write_unlock_tree(tree);
51054 + zput(child);
51055 + return result;
51056 + } else {
51057 + if (child == NULL)
51058 + child = ERR_PTR(-EIO);
51059 + return PTR_ERR(child);
51060 + }
51061 +}
51062 +
51063 +/* hook called by ->cut_and_kill() method of node plugin just before internal
51064 + item is removed.
51065 +
51066 +   This is the point where an empty node is removed from the tree. Clear
51067 +   the parent pointer in the child, and mark the node for pending deletion.
51068 +
51069 +   The node will actually be deleted later, in several stages:
51070 +
51071 +   . when the last lock on this node is released, the node is removed from
51072 +     the sibling list and its lock is invalidated
51073 +
51074 +   . when the last reference to this node is dropped, the bitmap is updated
51075 +     and the node is actually removed from memory.
51076 +
51077 +*/
51078 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
51079 + pos_in_node_t from UNUSED_ARG /* start unit */ ,
51080 + pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51081 + struct carry_kill_data *p UNUSED_ARG)
51082 +{
51083 + znode *child;
51084 +
51085 + assert("nikita-1222", item != NULL);
51086 + assert("nikita-1224", from == 0);
51087 + assert("nikita-1225", count == 1);
51088 +
51089 + child = znode_at(item, item->node);
51090 + if (IS_ERR(child))
51091 + return PTR_ERR(child);
51092 + else if (node_is_empty(child)) {
51093 + reiser4_tree *tree;
51094 +
51095 + assert("nikita-1397", znode_is_write_locked(child));
51096 + assert("nikita-1398", child->c_count == 0);
51097 + assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51098 +
51099 + tree = znode_get_tree(item->node);
51100 + write_lock_tree(tree);
51101 + init_parent_coord(&child->in_parent, NULL);
51102 + --item->node->c_count;
51103 + write_unlock_tree(tree);
51104 + zput(child);
51105 + return 0;
51106 + } else {
51107 + warning("nikita-1223",
51108 + "Cowardly refuse to remove link to non-empty node");
51109 + zput(child);
51110 + return RETERR(-EIO);
51111 + }
51112 +}
51113 +
51114 +/* hook called by ->shift() node plugin method when internal item was just
51115 + moved from one node to another.
51116 +
51117 + Update parent pointer in child and c_counts in old and new parent
51118 +
51119 +*/
51120 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
51121 + unsigned from UNUSED_ARG /* start unit */ ,
51122 + unsigned count UNUSED_ARG /* stop unit */ ,
51123 + znode * old_node /* old parent */ )
51124 +{
51125 + znode *child;
51126 + znode *new_node;
51127 + reiser4_tree *tree;
51128 +
51129 + assert("nikita-1276", item != NULL);
51130 + assert("nikita-1277", from == 0);
51131 + assert("nikita-1278", count == 1);
51132 + assert("nikita-1451", item->unit_pos == 0);
51133 +
51134 + new_node = item->node;
51135 + assert("nikita-2132", new_node != old_node);
51136 + tree = znode_get_tree(item->node);
51137 + child = child_znode(item, old_node, 1, 0);
51138 + if (child == NULL)
51139 + return 0;
51140 + if (!IS_ERR(child)) {
51141 + write_lock_tree(tree);
51142 + ++new_node->c_count;
51143 + assert("nikita-1395", znode_parent(child) == old_node);
51144 + assert("nikita-1396", old_node->c_count > 0);
51145 + coord_to_parent_coord(item, &child->in_parent);
51146 + assert("nikita-1781", znode_parent(child) == new_node);
51147 + assert("nikita-1782",
51148 + check_tree_pointer(item, child) == NS_FOUND);
51149 + --old_node->c_count;
51150 + write_unlock_tree(tree);
51151 + zput(child);
51152 + return 0;
51153 + } else
51154 + return PTR_ERR(child);
51155 +}
51156 +
51157 +/* plugin->u.item.b.max_key_inside - not defined */
51158 +
51159 +/* plugin->u.item.b.nr_units - item.c:single_unit */
51160 +
51161 +/* Make Linus happy.
51162 + Local variables:
51163 + c-indentation-style: "K&R"
51164 + mode-name: "LC"
51165 + c-basic-offset: 8
51166 + tab-width: 8
51167 + fill-column: 120
51168 + End:
51169 +*/
51170 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/internal.h linux-2.6.22/fs/reiser4/plugin/item/internal.h
51171 --- linux-2.6.22.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
51172 +++ linux-2.6.22/fs/reiser4/plugin/item/internal.h 2007-07-29 00:25:34.968720289 +0400
51173 @@ -0,0 +1,57 @@
51174 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51175 +/* Internal item contains down-link to the child of the internal/twig
51176 + node in a tree. It is internal items that are actually used during
51177 + tree traversal. */
51178 +
51179 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51180 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51181 +
51182 +#include "../../forward.h"
51183 +#include "../../dformat.h"
51184 +
51185 +/* on-disk layout of internal item */
51186 +typedef struct internal_item_layout {
51187 + /* 0 */ reiser4_dblock_nr pointer;
51188 + /* 4 */
51189 +} internal_item_layout;
51190 +
51191 +struct cut_list;
51192 +
51193 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
51194 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51195 + coord_t * coord);
51196 +/* store pointer from internal item into "block". Implementation of
51197 + ->down_link() method */
51198 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51199 + reiser4_block_nr * block);
51200 +extern int has_pointer_to_internal(const coord_t * coord,
51201 + const reiser4_block_nr * block);
51202 +extern int create_hook_internal(const coord_t * item, void *arg);
51203 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51204 + pos_in_node_t count, struct carry_kill_data *);
51205 +extern int shift_hook_internal(const coord_t * item, unsigned from,
51206 + unsigned count, znode * old_node);
51207 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
51208 +
51209 +extern int utmost_child_internal(const coord_t * coord, sideof side,
51210 + jnode ** child);
51211 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51212 + reiser4_block_nr * block);
51213 +
51214 +extern void reiser4_update_internal(const coord_t * coord,
51215 + const reiser4_block_nr * blocknr);
51216 +/* FIXME: reiserfs has check_internal */
51217 +extern int check__internal(const coord_t * coord, const char **error);
51218 +
51219 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51220 +#endif
51221 +
51222 +/* Make Linus happy.
51223 + Local variables:
51224 + c-indentation-style: "K&R"
51225 + mode-name: "LC"
51226 + c-basic-offset: 8
51227 + tab-width: 8
51228 + fill-column: 120
51229 + End:
51230 +*/
51231 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/item.c linux-2.6.22/fs/reiser4/plugin/item/item.c
51232 --- linux-2.6.22.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
51233 +++ linux-2.6.22/fs/reiser4/plugin/item/item.c 2007-07-29 00:25:34.972721325 +0400
51234 @@ -0,0 +1,719 @@
51235 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51236 +
51237 +/* definition of item plugins. */
51238 +
51239 +#include "../../forward.h"
51240 +#include "../../debug.h"
51241 +#include "../../key.h"
51242 +#include "../../coord.h"
51243 +#include "../plugin_header.h"
51244 +#include "sde.h"
51245 +#include "internal.h"
51246 +#include "item.h"
51247 +#include "static_stat.h"
51248 +#include "../plugin.h"
51249 +#include "../../znode.h"
51250 +#include "../../tree.h"
51251 +#include "../../context.h"
51252 +#include "ctail.h"
51253 +
51254 +/* return pointer to item body */
51255 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51256 +{
51257 + assert("nikita-324", coord != NULL);
51258 + assert("nikita-325", coord->node != NULL);
51259 + assert("nikita-326", znode_is_loaded(coord->node));
51260 + assert("nikita-3200", coord->offset == INVALID_OFFSET);
51261 +
51262 + coord->offset =
51263 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
51264 + zdata(coord->node);
51265 + ON_DEBUG(coord->body_v = coord->node->times_locked);
51266 +}
51267 +
51268 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51269 +{
51270 + return zdata(coord->node) + coord->offset;
51271 +}
51272 +
51273 +#if REISER4_DEBUG
51274 +
51275 +int item_body_is_valid(const coord_t * coord)
51276 +{
51277 + return
51278 + coord->offset ==
51279 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
51280 + zdata(coord->node);
51281 +}
51282 +
51283 +#endif
51284 +
51285 +/* return length of item at @coord */
51286 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51287 +{
51288 + int len;
51289 +
51290 + assert("nikita-327", coord != NULL);
51291 + assert("nikita-328", coord->node != NULL);
51292 + assert("nikita-329", znode_is_loaded(coord->node));
51293 +
51294 + len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51295 + return len;
51296 +}
51297 +
51298 +void obtain_item_plugin(const coord_t * coord)
51299 +{
51300 + assert("nikita-330", coord != NULL);
51301 + assert("nikita-331", coord->node != NULL);
51302 + assert("nikita-332", znode_is_loaded(coord->node));
51303 +
51304 + coord_set_iplug((coord_t *) coord,
51305 + node_plugin_by_node(coord->node)->
51306 + plugin_by_coord(coord));
51307 + assert("nikita-2479",
51308 + coord_iplug(coord) ==
51309 + node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51310 +}
51311 +
51312 +/* return id of item */
51313 +/* Audited by: green(2002.06.15) */
51314 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
51315 +{
51316 + assert("vs-539", coord != NULL);
51317 + assert("vs-538", coord->node != NULL);
51318 + assert("vs-537", znode_is_loaded(coord->node));
51319 + assert("vs-536", item_plugin_by_coord(coord) != NULL);
51320 + assert("vs-540",
51321 + item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
51322 +
51323 + return item_id_by_plugin(item_plugin_by_coord(coord));
51324 +}
51325 +
51326 +/* return key of item at @coord */
51327 +/* Audited by: green(2002.06.15) */
51328 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
51329 + reiser4_key * key /* result */ )
51330 +{
51331 + assert("nikita-338", coord != NULL);
51332 + assert("nikita-339", coord->node != NULL);
51333 + assert("nikita-340", znode_is_loaded(coord->node));
51334 +
51335 + return node_plugin_by_node(coord->node)->key_at(coord, key);
51336 +}
51337 +
51338 +/* this returns max key in the item */
51339 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
51340 + reiser4_key * key /* result */ )
51341 +{
51342 + coord_t last;
51343 +
51344 + assert("nikita-338", coord != NULL);
51345 + assert("nikita-339", coord->node != NULL);
51346 + assert("nikita-340", znode_is_loaded(coord->node));
51347 +
51348 + /* make coord pointing to last item's unit */
51349 + coord_dup(&last, coord);
51350 + last.unit_pos = coord_num_units(&last) - 1;
51351 + assert("vs-1560", coord_is_existing_unit(&last));
51352 +
51353 + max_unit_key_by_coord(&last, key);
51354 + return key;
51355 +}
51356 +
51357 +/* return key of unit at @coord */
51358 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51359 + reiser4_key * key /* result */ )
51360 +{
51361 + assert("nikita-772", coord != NULL);
51362 + assert("nikita-774", coord->node != NULL);
51363 + assert("nikita-775", znode_is_loaded(coord->node));
51364 +
51365 + if (item_plugin_by_coord(coord)->b.unit_key != NULL)
51366 + return item_plugin_by_coord(coord)->b.unit_key(coord, key);
51367 + else
51368 + return item_key_by_coord(coord, key);
51369 +}
51370 +
51371 +/* return the biggest key contained in the unit at @coord */
51372 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51373 + reiser4_key * key /* result */ )
51374 +{
51375 + assert("nikita-772", coord != NULL);
51376 + assert("nikita-774", coord->node != NULL);
51377 + assert("nikita-775", znode_is_loaded(coord->node));
51378 +
51379 + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
51380 + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
51381 + else
51382 + return unit_key_by_coord(coord, key);
51383 +}
51384 +
51385 +/* ->max_key_inside() method for items consisting of exactly one key (like
51386 + stat-data) */
51387 +static reiser4_key *max_key_inside_single_key(const coord_t *
51388 + coord /* coord of item */ ,
51389 + reiser4_key *
51390 + result /* resulting key */ )
51391 +{
51392 + assert("nikita-604", coord != NULL);
51393 +
51394 + /* coord -> key is starting key of this item and it has to be already
51395 + filled in */
51396 + return unit_key_by_coord(coord, result);
51397 +}
51398 +
51399 +/* ->nr_units() method for items consisting of exactly one unit always */
51400 +pos_in_node_t
51401 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
51402 +{
51403 + return 1;
51404 +}
51405 +
51406 +static int
51407 +paste_no_paste(coord_t * coord UNUSED_ARG,
51408 + reiser4_item_data * data UNUSED_ARG,
51409 + carry_plugin_info * info UNUSED_ARG)
51410 +{
51411 + return 0;
51412 +}
51413 +
51414 +/* default ->fast_paste() method */
51415 +static int
51416 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
51417 +{
51418 + return 1;
51419 +}
51420 +
51421 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
51422 + const reiser4_key * key /* key to check */ ,
51423 + const reiser4_item_data * data /* parameters of item
51424 + * being created */ )
51425 +{
51426 + item_plugin *iplug;
51427 + reiser4_key min_key_in_item;
51428 + reiser4_key max_key_in_item;
51429 +
51430 + assert("nikita-1658", item != NULL);
51431 + assert("nikita-1659", key != NULL);
51432 +
51433 + iplug = item_plugin_by_coord(item);
51434 + if (iplug->b.can_contain_key != NULL)
51435 + return iplug->b.can_contain_key(item, key, data);
51436 + else {
51437 + assert("nikita-1681", iplug->b.max_key_inside != NULL);
51438 + item_key_by_coord(item, &min_key_in_item);
51439 + iplug->b.max_key_inside(item, &max_key_in_item);
51440 +
51441 + /* can contain key if
51442 + min_key_in_item <= key &&
51443 + key <= max_key_in_item
51444 + */
51445 + return keyle(&min_key_in_item, key)
51446 + && keyle(key, &max_key_in_item);
51447 + }
51448 +}
51449 +
51450 +/* ->mergeable() method for non-mergeable items */
51451 +static int
51452 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
51453 +{
51454 + return 0;
51455 +}
51456 +
51457 +/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
51458 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
51459 + const coord_t * i2 /* coord of second item */ )
51460 +{
51461 + item_plugin *iplug;
51462 + reiser4_key k1;
51463 + reiser4_key k2;
51464 +
51465 + assert("nikita-1336", i1 != NULL);
51466 + assert("nikita-1337", i2 != NULL);
51467 +
51468 + iplug = item_plugin_by_coord(i1);
51469 + assert("nikita-1338", iplug != NULL);
51470 +
51471 + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
51472 + shifting code when nodes are in "suspended" state. */
51473 + assert("nikita-1663",
51474 + keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
51475 +
51476 + if (iplug->b.mergeable != NULL) {
51477 + return iplug->b.mergeable(i1, i2);
51478 + } else if (iplug->b.max_key_inside != NULL) {
51479 + iplug->b.max_key_inside(i1, &k1);
51480 + item_key_by_coord(i2, &k2);
51481 +
51482 + /* mergeable if ->max_key_inside() >= key of i2; */
51483 + return keyge(iplug->b.max_key_inside(i1, &k1),
51484 + item_key_by_coord(i2, &k2));
51485 + } else {
51486 + item_key_by_coord(i1, &k1);
51487 + item_key_by_coord(i2, &k2);
51488 +
51489 + return
51490 + (get_key_locality(&k1) == get_key_locality(&k2)) &&
51491 + (get_key_objectid(&k1) == get_key_objectid(&k2))
51492 + && (iplug == item_plugin_by_coord(i2));
51493 + }
51494 +}
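The ->max_key_inside() fallback in are_items_mergeable() reduces to a one-dimensional comparison: an item can absorb its right neighbour when the largest key it may ever hold reaches the neighbour's first key. A toy model with integer keys (hypothetical; real reiser4 keys are multi-component):

#include <stdio.h>

/* Toy model of the are_items_mergeable() fallback: an item
 * covering keys [first_key, max_inside] may merge with a
 * neighbour whose first key is <= max_inside. */
struct toy_item {
	long first_key;   /* key of first unit */
	long max_inside;  /* largest key the item may ever hold */
};

static int toy_mergeable(const struct toy_item *i1, const struct toy_item *i2)
{
	return i1->max_inside >= i2->first_key;
}

int main(void)
{
	struct toy_item a = { .first_key = 0, .max_inside = 99 };
	struct toy_item b = { .first_key = 100, .max_inside = 199 };

	printf("%d\n", toy_mergeable(&a, &b)); /* 0: gap between key ranges */
	b.first_key = 99;
	printf("%d\n", toy_mergeable(&a, &b)); /* 1: ranges meet */
	return 0;
}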
51495 +
51496 +int item_is_extent(const coord_t * item)
51497 +{
51498 + assert("vs-482", coord_is_existing_item(item));
51499 + return item_id_by_coord(item) == EXTENT_POINTER_ID;
51500 +}
51501 +
51502 +int item_is_tail(const coord_t * item)
51503 +{
51504 + assert("vs-482", coord_is_existing_item(item));
51505 + return item_id_by_coord(item) == FORMATTING_ID;
51506 +}
51507 +
51508 +#if REISER4_DEBUG
51509 +
51510 +int item_is_statdata(const coord_t * item)
51511 +{
51512 + assert("vs-516", coord_is_existing_item(item));
51513 + return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
51514 +}
51515 +
51516 +int item_is_ctail(const coord_t * item)
51517 +{
51518 + assert("edward-xx", coord_is_existing_item(item));
51519 + return item_id_by_coord(item) == CTAIL_ID;
51520 +}
51521 +
51522 +#endif /* REISER4_DEBUG */
51523 +
51524 +static int change_item(struct inode *inode,
51525 + reiser4_plugin * plugin,
51526 + pset_member memb)
51527 +{
51528 + /* cannot change constituent item (sd, or dir_item) */
51529 + return RETERR(-EINVAL);
51530 +}
51531 +
51532 +static reiser4_plugin_ops item_plugin_ops = {
51533 + .init = NULL,
51534 + .load = NULL,
51535 + .save_len = NULL,
51536 + .save = NULL,
51537 + .change = change_item
51538 +};
51539 +
51540 +item_plugin item_plugins[LAST_ITEM_ID] = {
51541 + [STATIC_STAT_DATA_ID] = {
51542 + .h = {
51543 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51544 + .id = STATIC_STAT_DATA_ID,
51545 + .groups = (1 << STAT_DATA_ITEM_TYPE),
51546 + .pops = &item_plugin_ops,
51547 + .label = "sd",
51548 + .desc = "stat-data",
51549 + .linkage = {NULL, NULL}
51550 + },
51551 + .b = {
51552 + .max_key_inside = max_key_inside_single_key,
51553 + .can_contain_key = NULL,
51554 + .mergeable = not_mergeable,
51555 + .nr_units = nr_units_single_unit,
51556 + .lookup = NULL,
51557 + .init = NULL,
51558 + .paste = paste_no_paste,
51559 + .fast_paste = NULL,
51560 + .can_shift = NULL,
51561 + .copy_units = NULL,
51562 + .create_hook = NULL,
51563 + .kill_hook = NULL,
51564 + .shift_hook = NULL,
51565 + .cut_units = NULL,
51566 + .kill_units = NULL,
51567 + .unit_key = NULL,
51568 + .max_unit_key = NULL,
51569 + .estimate = NULL,
51570 + .item_data_by_flow = NULL,
51571 +#if REISER4_DEBUG
51572 + .check = NULL
51573 +#endif
51574 + },
51575 + .f = {
51576 + .utmost_child = NULL,
51577 + .utmost_child_real_block = NULL,
51578 + .update = NULL,
51579 + .scan = NULL,
51580 + .convert = NULL
51581 + },
51582 + .s = {
51583 + .sd = {
51584 + .init_inode = init_inode_static_sd,
51585 + .save_len = save_len_static_sd,
51586 + .save = save_static_sd
51587 + }
51588 + }
51589 + },
51590 + [SIMPLE_DIR_ENTRY_ID] = {
51591 + .h = {
51592 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51593 + .id = SIMPLE_DIR_ENTRY_ID,
51594 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51595 + .pops = &item_plugin_ops,
51596 + .label = "de",
51597 + .desc = "directory entry",
51598 + .linkage = {NULL, NULL}
51599 + },
51600 + .b = {
51601 + .max_key_inside = max_key_inside_single_key,
51602 + .can_contain_key = NULL,
51603 + .mergeable = NULL,
51604 + .nr_units = nr_units_single_unit,
51605 + .lookup = NULL,
51606 + .init = NULL,
51607 + .paste = NULL,
51608 + .fast_paste = NULL,
51609 + .can_shift = NULL,
51610 + .copy_units = NULL,
51611 + .create_hook = NULL,
51612 + .kill_hook = NULL,
51613 + .shift_hook = NULL,
51614 + .cut_units = NULL,
51615 + .kill_units = NULL,
51616 + .unit_key = NULL,
51617 + .max_unit_key = NULL,
51618 + .estimate = NULL,
51619 + .item_data_by_flow = NULL,
51620 +#if REISER4_DEBUG
51621 + .check = NULL
51622 +#endif
51623 + },
51624 + .f = {
51625 + .utmost_child = NULL,
51626 + .utmost_child_real_block = NULL,
51627 + .update = NULL,
51628 + .scan = NULL,
51629 + .convert = NULL
51630 + },
51631 + .s = {
51632 + .dir = {
51633 + .extract_key = extract_key_de,
51634 + .update_key = update_key_de,
51635 + .extract_name = extract_name_de,
51636 + .extract_file_type = extract_file_type_de,
51637 + .add_entry = add_entry_de,
51638 + .rem_entry = rem_entry_de,
51639 + .max_name_len = max_name_len_de
51640 + }
51641 + }
51642 + },
51643 + [COMPOUND_DIR_ID] = {
51644 + .h = {
51645 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51646 + .id = COMPOUND_DIR_ID,
51647 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51648 + .pops = &item_plugin_ops,
51649 + .label = "cde",
51650 + .desc = "compressed directory entry",
51651 + .linkage = {NULL, NULL}
51652 + },
51653 + .b = {
51654 + .max_key_inside = max_key_inside_cde,
51655 + .can_contain_key = can_contain_key_cde,
51656 + .mergeable = mergeable_cde,
51657 + .nr_units = nr_units_cde,
51658 + .lookup = lookup_cde,
51659 + .init = init_cde,
51660 + .paste = paste_cde,
51661 + .fast_paste = agree_to_fast_op,
51662 + .can_shift = can_shift_cde,
51663 + .copy_units = copy_units_cde,
51664 + .create_hook = NULL,
51665 + .kill_hook = NULL,
51666 + .shift_hook = NULL,
51667 + .cut_units = cut_units_cde,
51668 + .kill_units = kill_units_cde,
51669 + .unit_key = unit_key_cde,
51670 + .max_unit_key = unit_key_cde,
51671 + .estimate = estimate_cde,
51672 + .item_data_by_flow = NULL,
51673 +#if REISER4_DEBUG
51674 + .check = reiser4_check_cde
51675 +#endif
51676 + },
51677 + .f = {
51678 + .utmost_child = NULL,
51679 + .utmost_child_real_block = NULL,
51680 + .update = NULL,
51681 + .scan = NULL,
51682 + .convert = NULL
51683 + },
51684 + .s = {
51685 + .dir = {
51686 + .extract_key = extract_key_cde,
51687 + .update_key = update_key_cde,
51688 + .extract_name = extract_name_cde,
51689 + .extract_file_type = extract_file_type_de,
51690 + .add_entry = add_entry_cde,
51691 + .rem_entry = rem_entry_cde,
51692 + .max_name_len = max_name_len_cde
51693 + }
51694 + }
51695 + },
51696 + [NODE_POINTER_ID] = {
51697 + .h = {
51698 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51699 + .id = NODE_POINTER_ID,
51700 + .groups = (1 << INTERNAL_ITEM_TYPE),
51701 + .pops = NULL,
51702 + .label = "internal",
51703 + .desc = "internal item",
51704 + .linkage = {NULL, NULL}
51705 + },
51706 + .b = {
51707 + .max_key_inside = NULL,
51708 + .can_contain_key = NULL,
51709 + .mergeable = mergeable_internal,
51710 + .nr_units = nr_units_single_unit,
51711 + .lookup = lookup_internal,
51712 + .init = NULL,
51713 + .paste = NULL,
51714 + .fast_paste = NULL,
51715 + .can_shift = NULL,
51716 + .copy_units = NULL,
51717 + .create_hook = create_hook_internal,
51718 + .kill_hook = kill_hook_internal,
51719 + .shift_hook = shift_hook_internal,
51720 + .cut_units = NULL,
51721 + .kill_units = NULL,
51722 + .unit_key = NULL,
51723 + .max_unit_key = NULL,
51724 + .estimate = NULL,
51725 + .item_data_by_flow = NULL,
51726 +#if REISER4_DEBUG
51727 + .check = check__internal
51728 +#endif
51729 + },
51730 + .f = {
51731 + .utmost_child = utmost_child_internal,
51732 + .utmost_child_real_block =
51733 + utmost_child_real_block_internal,
51734 + .update = reiser4_update_internal,
51735 + .scan = NULL,
51736 + .convert = NULL
51737 + },
51738 + .s = {
51739 + .internal = {
51740 + .down_link = down_link_internal,
51741 + .has_pointer_to = has_pointer_to_internal
51742 + }
51743 + }
51744 + },
51745 + [EXTENT_POINTER_ID] = {
51746 + .h = {
51747 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51748 + .id = EXTENT_POINTER_ID,
51749 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51750 + .pops = NULL,
51751 + .label = "extent",
51752 + .desc = "extent item",
51753 + .linkage = {NULL, NULL}
51754 + },
51755 + .b = {
51756 + .max_key_inside = max_key_inside_extent,
51757 + .can_contain_key = can_contain_key_extent,
51758 + .mergeable = mergeable_extent,
51759 + .nr_units = nr_units_extent,
51760 + .lookup = lookup_extent,
51761 + .init = NULL,
51762 + .paste = paste_extent,
51763 + .fast_paste = agree_to_fast_op,
51764 + .can_shift = can_shift_extent,
51765 + .create_hook = create_hook_extent,
51766 + .copy_units = copy_units_extent,
51767 + .kill_hook = kill_hook_extent,
51768 + .shift_hook = NULL,
51769 + .cut_units = cut_units_extent,
51770 + .kill_units = kill_units_extent,
51771 + .unit_key = unit_key_extent,
51772 + .max_unit_key = max_unit_key_extent,
51773 + .estimate = NULL,
51774 + .item_data_by_flow = NULL,
51775 +#if REISER4_DEBUG
51776 + .check = reiser4_check_extent
51777 +#endif
51778 + },
51779 + .f = {
51780 + .utmost_child = utmost_child_extent,
51781 + .utmost_child_real_block =
51782 + utmost_child_real_block_extent,
51783 + .update = NULL,
51784 + .scan = reiser4_scan_extent,
51785 + .convert = NULL,
51786 + .key_by_offset = key_by_offset_extent
51787 + },
51788 + .s = {
51789 + .file = {
51790 + .write = reiser4_write_extent,
51791 + .read = reiser4_read_extent,
51792 + .readpage = reiser4_readpage_extent,
51793 + .get_block = get_block_address_extent,
51794 + .append_key = append_key_extent,
51795 + .init_coord_extension =
51796 + init_coord_extension_extent
51797 + }
51798 + }
51799 + },
51800 + [FORMATTING_ID] = {
51801 + .h = {
51802 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51803 + .id = FORMATTING_ID,
51804 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51805 + .pops = NULL,
51806 + .label = "body",
51807 + .desc = "body (or tail?) item",
51808 + .linkage = {NULL, NULL}
51809 + },
51810 + .b = {
51811 + .max_key_inside = max_key_inside_tail,
51812 + .can_contain_key = can_contain_key_tail,
51813 + .mergeable = mergeable_tail,
51814 + .nr_units = nr_units_tail,
51815 + .lookup = lookup_tail,
51816 + .init = NULL,
51817 + .paste = paste_tail,
51818 + .fast_paste = agree_to_fast_op,
51819 + .can_shift = can_shift_tail,
51820 + .create_hook = NULL,
51821 + .copy_units = copy_units_tail,
51822 + .kill_hook = kill_hook_tail,
51823 + .shift_hook = NULL,
51824 + .cut_units = cut_units_tail,
51825 + .kill_units = kill_units_tail,
51826 + .unit_key = unit_key_tail,
51827 + .max_unit_key = unit_key_tail,
51828 + .estimate = NULL,
51829 + .item_data_by_flow = NULL,
51830 +#if REISER4_DEBUG
51831 + .check = NULL
51832 +#endif
51833 + },
51834 + .f = {
51835 + .utmost_child = NULL,
51836 + .utmost_child_real_block = NULL,
51837 + .update = NULL,
51838 + .scan = NULL,
51839 + .convert = NULL
51840 + },
51841 + .s = {
51842 + .file = {
51843 + .write = reiser4_write_tail,
51844 + .read = reiser4_read_tail,
51845 + .readpage = readpage_tail,
51846 + .get_block = get_block_address_tail,
51847 + .append_key = append_key_tail,
51848 + .init_coord_extension =
51849 + init_coord_extension_tail
51850 + }
51851 + }
51852 + },
51853 + [CTAIL_ID] = {
51854 + .h = {
51855 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51856 + .id = CTAIL_ID,
51857 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51858 + .pops = NULL,
51859 + .label = "ctail",
51860 + .desc = "cryptcompress tail item",
51861 + .linkage = {NULL, NULL}
51862 + },
51863 + .b = {
51864 + .max_key_inside = max_key_inside_tail,
51865 + .can_contain_key = can_contain_key_ctail,
51866 + .mergeable = mergeable_ctail,
51867 + .nr_units = nr_units_ctail,
51868 + .lookup = NULL,
51869 + .init = init_ctail,
51870 + .paste = paste_ctail,
51871 + .fast_paste = agree_to_fast_op,
51872 + .can_shift = can_shift_ctail,
51873 + .create_hook = create_hook_ctail,
51874 + .copy_units = copy_units_ctail,
51875 + .kill_hook = kill_hook_ctail,
51876 + .shift_hook = shift_hook_ctail,
51877 + .cut_units = cut_units_ctail,
51878 + .kill_units = kill_units_ctail,
51879 + .unit_key = unit_key_tail,
51880 + .max_unit_key = unit_key_tail,
51881 + .estimate = estimate_ctail,
51882 + .item_data_by_flow = NULL,
51883 +#if REISER4_DEBUG
51884 + .check = check_ctail
51885 +#endif
51886 + },
51887 + .f = {
51888 + .utmost_child = utmost_child_ctail,
51889 + /* FIXME-EDWARD: write this */
51890 + .utmost_child_real_block = NULL,
51891 + .update = NULL,
51892 + .scan = scan_ctail,
51893 + .convert = convert_ctail
51894 + },
51895 + .s = {
51896 + .file = {
51897 + .write = NULL,
51898 + .read = read_ctail,
51899 + .readpage = readpage_ctail,
51900 + .get_block = get_block_address_tail,
51901 + .append_key = append_key_ctail,
51902 + .init_coord_extension =
51903 + init_coord_extension_tail
51904 + }
51905 + }
51906 + },
51907 + [BLACK_BOX_ID] = {
51908 + .h = {
51909 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
51910 + .id = BLACK_BOX_ID,
51911 + .groups = (1 << OTHER_ITEM_TYPE),
51912 + .pops = NULL,
51913 + .label = "blackbox",
51914 + .desc = "black box item",
51915 + .linkage = {NULL, NULL}
51916 + },
51917 + .b = {
51918 + .max_key_inside = NULL,
51919 + .can_contain_key = NULL,
51920 + .mergeable = not_mergeable,
51921 + .nr_units = nr_units_single_unit,
51922 +			/* no need for a ->lookup method */
51923 + .lookup = NULL,
51924 + .init = NULL,
51925 + .paste = NULL,
51926 + .fast_paste = NULL,
51927 + .can_shift = NULL,
51928 + .copy_units = NULL,
51929 + .create_hook = NULL,
51930 + .kill_hook = NULL,
51931 + .shift_hook = NULL,
51932 + .cut_units = NULL,
51933 + .kill_units = NULL,
51934 + .unit_key = NULL,
51935 + .max_unit_key = NULL,
51936 + .estimate = NULL,
51937 + .item_data_by_flow = NULL,
51938 +#if REISER4_DEBUG
51939 + .check = NULL
51940 +#endif
51941 + }
51942 + }
51943 +};
51944 +
51945 +/* Make Linus happy.
51946 + Local variables:
51947 + c-indentation-style: "K&R"
51948 + mode-name: "LC"
51949 + c-basic-offset: 8
51950 + tab-width: 8
51951 + fill-column: 120
51952 + End:
51953 +*/
51954 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/item.h linux-2.6.22/fs/reiser4/plugin/item/item.h
51955 --- linux-2.6.22.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
51956 +++ linux-2.6.22/fs/reiser4/plugin/item/item.h 2007-07-29 00:25:34.972721325 +0400
51957 @@ -0,0 +1,397 @@
51958 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51959 +
51960 +/* first read balance.c comments before reading this */
51961 +
51962 +/* An item_plugin implements all of the operations required for
51963 + balancing that are item specific. */
51964 +
51965 +/* an item plugin also implements other operations that are specific to that
51966 + item. These go into the item specific operations portion of the item
51967 + handler, and all of the item specific portions of the item handler are put
51968 + into a union. */
51969 +
51970 +#if !defined( __REISER4_ITEM_H__ )
51971 +#define __REISER4_ITEM_H__
51972 +
51973 +#include "../../forward.h"
51974 +#include "../plugin_header.h"
51975 +#include "../../dformat.h"
51976 +#include "../../seal.h"
51977 +#include "../../plugin/file/file.h"
51978 +
51979 +#include <linux/fs.h> /* for struct file, struct inode */
51980 +#include <linux/mm.h> /* for struct page */
51981 +#include <linux/dcache.h> /* for struct dentry */
51982 +
51983 +typedef enum {
51984 + STAT_DATA_ITEM_TYPE,
51985 + DIR_ENTRY_ITEM_TYPE,
51986 + INTERNAL_ITEM_TYPE,
51987 + UNIX_FILE_METADATA_ITEM_TYPE,
51988 + OTHER_ITEM_TYPE
51989 +} item_type_id;
51990 +
51991 +/* this is the part of each item plugin that all items are expected to
51992 + support or at least explicitly fail to support by setting the
51993 + pointer to null. */
51994 +struct balance_ops {
51995 + /* operations called by balancing
51996 +
51997 + It is interesting to consider that some of these item
51998 + operations could be given sources or targets that are not
51999 + really items in nodes. This could be ok/useful.
52000 +
52001 + */
52002 + /* maximal key that can _possibly_ be occupied by this item
52003 +
52004 +	   When inserting, once the node ->lookup() method (called by
52005 +	   coord_by_key()) reaches an item after binary search,
52006 +	   the ->max_key_inside() item plugin method is used to determine
52007 +	   whether the new data should be pasted into the existing item
52008 +	   (new_key<=max_key_inside()) or a new item has to be created
52009 + (new_key>max_key_inside()).
52010 +
52011 + For items that occupy exactly one key (like stat-data)
52012 + this method should return this key. For items that can
52013 + grow indefinitely (extent, directory item) this should
52014 + return reiser4_max_key().
52015 +
52016 +	   For example, for an extent with the key
52017 +
52018 +	   (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52019 +
52020 +	   ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
52021 + */
52022 + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52023 +
52024 + /* true if item @coord can merge data at @key. */
52025 + int (*can_contain_key) (const coord_t *, const reiser4_key *,
52026 + const reiser4_item_data *);
52027 + /* mergeable() - check items for mergeability
52028 +
52029 + Optional method. Returns true if two items can be merged.
52030 +
52031 + */
52032 + int (*mergeable) (const coord_t *, const coord_t *);
52033 +
52034 + /* number of atomic things in an item.
52035 + NOTE FOR CONTRIBUTORS: use a generic method
52036 + nr_units_single_unit() for solid (atomic) items, as
52037 + tree operations use it as a criterion of solidness
52038 + (see is_solid_item macro) */
52039 + pos_in_node_t(*nr_units) (const coord_t *);
52040 +
52041 +	/* search within the item for a unit within the item, and return
52042 +	   a pointer to it. This can be used to calculate how many bytes
52043 +	   to shrink an item by, using pointer arithmetic against the
52044 +	   start of the item body, provided the item's data are contiguous
52045 +	   in the node; if the item's data are not contiguous in the node,
52046 +	   all sorts of other things are probably going to break as
52047 +	   well. */
52048 + lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52049 +	/* method called by node_plugin->create_item() to initialise new
52050 + item */
52051 + int (*init) (coord_t * target, coord_t * from,
52052 + reiser4_item_data * data);
52053 + /* method called (e.g., by reiser4_resize_item()) to place new data
52054 + into item when it grows */
52055 + int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52056 +	/* return true if paste into @coord is allowed to skip
52057 +	   carry. That is, if such a paste would not require any
52058 +	   changes at the parent level
52059 +	*/
52060 + int (*fast_paste) (const coord_t *);
52061 + /* how many but not more than @want units of @source can be
52062 + shifted into @target node. If pend == append - we try to
52063 + append last item of @target by first units of @source. If
52064 + pend == prepend - we try to "prepend" first item in @target
52065 + by last units of @source. @target node has @free_space
52066 + bytes of free space. Total size of those units are returned
52067 + via @size.
52068 +
52069 +	   @target is not NULL if shifting into a mergeable item, and
52070 +	   NULL if a new item will be created during shifting.
52071 + */
52072 + int (*can_shift) (unsigned free_space, coord_t *,
52073 + znode *, shift_direction, unsigned *size,
52074 + unsigned want);
52075 +
52076 + /* starting off @from-th unit of item @source append or
52077 + prepend @count units to @target. @target has been already
52078 + expanded by @free_space bytes. That must be exactly what is
52079 + needed for those items in @target. If @where_is_free_space
52080 + == SHIFT_LEFT - free space is at the end of @target item,
52081 +	   otherwise - it is at the beginning of it. */
52082 + void (*copy_units) (coord_t *, coord_t *,
52083 + unsigned from, unsigned count,
52084 + shift_direction where_is_free_space,
52085 + unsigned free_space);
52086 +
52087 + int (*create_hook) (const coord_t *, void *);
52088 + /* do whatever is necessary to do when @count units starting
52089 + from @from-th one are removed from the tree */
52090 + /* FIXME-VS: this is used to be here for, in particular,
52091 + extents and items of internal type to free blocks they point
52092 + to at the same time with removing items from a
52093 + tree. Problems start, however, when dealloc_block fails due
52094 + to some reason. Item gets removed, but blocks it pointed to
52095 + are not freed. It is not clear how to fix this for items of
52096 + internal type because a need to remove internal item may
52097 + appear in the middle of balancing, and there is no way to
52098 + undo changes made. OTOH, if space allocator involves
52099 + balancing to perform dealloc_block - this will probably
52100 + break balancing due to deadlock issues
52101 + */
52102 + int (*kill_hook) (const coord_t *, pos_in_node_t from,
52103 + pos_in_node_t count, struct carry_kill_data *);
52104 + int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52105 + znode * _node);
52106 +
52107 + /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
52108 + including boundaries. When units are cut from item beginning - move space which gets freed to head of
52109 + item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
52110 + item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
52111 + @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
52112 + */
52113 + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52114 + struct carry_cut_data *,
52115 + reiser4_key * smallest_removed,
52116 + reiser4_key * new_first_key);
52117 +
52118 + /* like cut_units, except that these units are removed from the
52119 + tree, not only from a node */
52120 + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52121 + struct carry_kill_data *,
52122 + reiser4_key * smallest_removed,
52123 + reiser4_key * new_first);
52124 +
52125 +	/* return the key of the unit at @coord.
52126 +	   If @coord is not set to a certain unit -
52127 +	   ERR_PTR(-ENOENT) is returned */
52128 + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52129 + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52130 + /* estimate how much space is needed for paste @data into item at
52131 + @coord. if @coord==0 - estimate insertion, otherwise - estimate
52132 + pasting
52133 + */
52134 + int (*estimate) (const coord_t *, const reiser4_item_data *);
52135 +
52136 + /* converts flow @f to item data. @coord == 0 on insert */
52137 + int (*item_data_by_flow) (const coord_t *, const flow_t *,
52138 + reiser4_item_data *);
52139 +
52140 + /*void (*show) (struct seq_file *, coord_t *); */
52141 +
52142 +#if REISER4_DEBUG
52143 + /* used for debugging, every item should have here the most
52144 + complete possible check of the consistency of the item that
52145 + the inventor can construct */
52146 + int (*check) (const coord_t *, const char **error);
52147 +#endif
52148 +
52149 +};
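To make the ->max_key_inside() contract described above concrete, the paste-vs-insert rule it drives can be modelled with plain integer keys (a hedged sketch, not the real API):

#include <stdio.h>

/* Toy rendering of the rule in the ->max_key_inside() comment:
 * paste into the existing item when new_key <= max_key_inside,
 * otherwise create a new item. Integer keys replace reiser4_key. */
enum toy_decision { TOY_PASTE, TOY_CREATE };

static enum toy_decision classify_insert(long new_key, long max_key_inside)
{
	return new_key <= max_key_inside ? TOY_PASTE : TOY_CREATE;
}

int main(void)
{
	/* stat-data: max_key_inside is the item's own key */
	printf("%d\n", classify_insert(5, 4));           /* 1: create new item */
	/* extent: max_key_inside is effectively unbounded */
	printf("%d\n", classify_insert(5, 0x7fffffffL)); /* 0: paste into item */
	return 0;
}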
52150 +
52151 +struct flush_ops {
52152 + /* return the right or left child of @coord, only if it is in memory */
52153 + int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52154 +
52155 + /* return whether the right or left child of @coord has a non-fake
52156 + block number. */
52157 + int (*utmost_child_real_block) (const coord_t *, sideof side,
52158 + reiser4_block_nr *);
52159 + /* relocate child at @coord to the @block */
52160 + void (*update) (const coord_t *, const reiser4_block_nr *);
52161 +	/* count unformatted nodes per item for the leaf relocation policy, etc. */
52162 + int (*scan) (flush_scan * scan);
52163 + /* convert item by flush */
52164 + int (*convert) (flush_pos_t * pos);
52165 + /* backward mapping from jnode offset to a key. */
52166 + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52167 +};
52168 +
52169 +/* operations specific to the directory item */
52170 +struct dir_entry_iops {
52171 + /* extract stat-data key from directory entry at @coord and place it
52172 + into @key. */
52173 + int (*extract_key) (const coord_t *, reiser4_key * key);
52174 + /* update object key in item. */
52175 + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52176 + /* extract name from directory entry at @coord and return it */
52177 + char *(*extract_name) (const coord_t *, char *buf);
52178 + /* extract file type (DT_* stuff) from directory entry at @coord and
52179 + return it */
52180 + unsigned (*extract_file_type) (const coord_t *);
52181 + int (*add_entry) (struct inode * dir,
52182 + coord_t *, lock_handle *,
52183 + const struct dentry * name,
52184 + reiser4_dir_entry_desc * entry);
52185 + int (*rem_entry) (struct inode * dir, const struct qstr * name,
52186 + coord_t *, lock_handle *,
52187 + reiser4_dir_entry_desc * entry);
52188 + int (*max_name_len) (const struct inode * dir);
52189 +};
52190 +
52191 +/* operations specific to the items that regular (unix) file metadata is built of */
52192 +struct file_iops {
52193 + int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52194 + int (*read) (struct file *, flow_t *, hint_t *);
52195 + int (*readpage) (void *, struct page *);
52196 + int (*get_block) (const coord_t *, sector_t, sector_t *);
52197 + /*
52198 + * key of first byte which is not addressed by the item @coord is set
52199 + * to.
52200 + * For example, for extent item with the key
52201 + *
52202 + * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52203 + *
52204 + * ->append_key is
52205 + *
52206 + * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52207 + */
52208 + reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52209 +
52210 + void (*init_coord_extension) (uf_coord_t *, loff_t);
52211 +};
52212 +
52213 +/* operations specific to items of stat data type */
52214 +struct sd_iops {
52215 + int (*init_inode) (struct inode * inode, char *sd, int len);
52216 + int (*save_len) (struct inode * inode);
52217 + int (*save) (struct inode * inode, char **area);
52218 +};
52219 +
52220 +/* operations specific to internal item */
52221 +struct internal_iops {
52222 +	/* all that tree traversal wants to know from an internal item is
52223 +	   where to go next. */
52224 + void (*down_link) (const coord_t * coord,
52225 + const reiser4_key * key, reiser4_block_nr * block);
52226 + /* check that given internal item contains given pointer. */
52227 + int (*has_pointer_to) (const coord_t * coord,
52228 + const reiser4_block_nr * block);
52229 +};
52230 +
52231 +struct item_plugin {
52232 + /* generic fields */
52233 + plugin_header h;
52234 + /* methods common for all item types */
52235 + struct balance_ops b; /* balance operations */
52236 + struct flush_ops f; /* flush operates with items via this methods */
52237 +
52238 + /* methods specific to particular type of item */
52239 + union {
52240 + struct dir_entry_iops dir;
52241 + struct file_iops file;
52242 + struct sd_iops sd;
52243 + struct internal_iops internal;
52244 + } s;
52245 +};
52246 +
52247 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
52248 +
52249 +static inline item_id item_id_by_plugin(item_plugin * plugin)
52250 +{
52251 + return plugin->h.id;
52252 +}
52253 +
52254 +static inline char get_iplugid(item_plugin * iplug)
52255 +{
52256 + assert("nikita-2838", iplug != NULL);
52257 + assert("nikita-2839", iplug->h.id < 0xff);
52258 + return (char)item_id_by_plugin(iplug);
52259 +}
52260 +
52261 +extern unsigned long znode_times_locked(const znode * z);
52262 +
52263 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52264 +{
52265 + assert("nikita-2837", coord != NULL);
52266 + assert("nikita-2838", iplug != NULL);
52267 + coord->iplugid = get_iplugid(iplug);
52268 + ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
52269 +}
52270 +
52271 +static inline item_plugin *coord_iplug(const coord_t * coord)
52272 +{
52273 + assert("nikita-2833", coord != NULL);
52274 + assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
52275 + assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52276 + return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52277 + coord->iplugid);
52278 +}
52279 +
52280 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52281 + const reiser4_item_data *);
52282 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52283 +extern int item_is_extent(const coord_t *);
52284 +extern int item_is_tail(const coord_t *);
52285 +extern int item_is_statdata(const coord_t * item);
52286 +extern int item_is_ctail(const coord_t *);
52287 +
52288 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52289 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
52290 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52291 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52292 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52293 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52294 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52295 + reiser4_key * key);
52296 +extern void obtain_item_plugin(const coord_t * coord);
52297 +
52298 +#if defined(REISER4_DEBUG)
52299 +extern int znode_is_loaded(const znode * node);
52300 +#endif
52301 +
52302 +/* return plugin of item at @coord */
52303 +static inline item_plugin *item_plugin_by_coord(const coord_t *
52304 + coord /* coord to query */ )
52305 +{
52306 + assert("nikita-330", coord != NULL);
52307 + assert("nikita-331", coord->node != NULL);
52308 + assert("nikita-332", znode_is_loaded(coord->node));
52309 +
52310 + if (unlikely(!coord_is_iplug_set(coord)))
52311 + obtain_item_plugin(coord);
52312 + return coord_iplug(coord);
52313 +}
52314 +
52315 +/* this returns true if item is of internal type */
52316 +static inline int item_is_internal(const coord_t * item)
52317 +{
52318 + assert("vs-483", coord_is_existing_item(item));
52319 + return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
52320 +}
52321 +
52322 +extern void item_body_by_coord_hard(coord_t * coord);
52323 +extern void *item_body_by_coord_easy(const coord_t * coord);
52324 +#if REISER4_DEBUG
52325 +extern int item_body_is_valid(const coord_t * coord);
52326 +#endif
52327 +
52328 +/* return pointer to item body */
52329 +static inline void *item_body_by_coord(const coord_t *
52330 + coord /* coord to query */ )
52331 +{
52332 + assert("nikita-324", coord != NULL);
52333 + assert("nikita-325", coord->node != NULL);
52334 + assert("nikita-326", znode_is_loaded(coord->node));
52335 +
52336 + if (coord->offset == INVALID_OFFSET)
52337 + item_body_by_coord_hard((coord_t *) coord);
52338 + assert("nikita-3201", item_body_is_valid(coord));
52339 + assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
52340 + return item_body_by_coord_easy(coord);
52341 +}
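item_body_by_coord() above is a compute-on-first-use cache: INVALID_OFFSET marks coord->offset as unset, item_body_by_coord_hard() fills it once, and every later call takes the cheap pointer addition in item_body_by_coord_easy(). A generic sketch of the same pattern (hypothetical stand-in types, not part of the patch):

#include <stdio.h>
#include <stddef.h>

#define INVALID_OFF ((size_t)-1)

/* An invalid sentinel marks the field as not-yet-computed; the
 * slow path fills it exactly once and the fast path just adds
 * the cached offset. */
struct cursor {
	const char *base;  /* start of node data */
	size_t offset;     /* cached item offset, INVALID_OFF if unset */
};

static size_t locate_slow(const struct cursor *c)
{
	/* stands in for node_plugin_by_node()->item_by_coord() */
	return 16;
}

static const char *body(struct cursor *c)
{
	if (c->offset == INVALID_OFF)
		c->offset = locate_slow(c); /* slow path, runs once */
	return c->base + c->offset;         /* fast path */
}

int main(void)
{
	char node[64] = "0123456789abcdefBODY";
	struct cursor c = { .base = node, .offset = INVALID_OFF };

	printf("%s\n", body(&c)); /* prints "BODY", computes offset */
	printf("%s\n", body(&c)); /* second call hits the cache */
	return 0;
}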
52342 +
52343 +/* __REISER4_ITEM_H__ */
52344 +#endif
52345 +/* Make Linus happy.
52346 + Local variables:
52347 + c-indentation-style: "K&R"
52348 + mode-name: "LC"
52349 + c-basic-offset: 8
52350 + tab-width: 8
52351 + fill-column: 120
52352 + scroll-step: 1
52353 + End:
52354 +*/
52355 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/Makefile linux-2.6.22/fs/reiser4/plugin/item/Makefile
52356 --- linux-2.6.22.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
52357 +++ linux-2.6.22/fs/reiser4/plugin/item/Makefile 2007-07-29 00:25:34.972721325 +0400
52358 @@ -0,0 +1,18 @@
52359 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
52360 +
52361 +item_plugins-objs := \
52362 + item.o \
52363 + static_stat.o \
52364 + sde.o \
52365 + cde.o \
52366 + blackbox.o \
52367 + internal.o \
52368 + tail.o \
52369 + ctail.o \
52370 + extent.o \
52371 + extent_item_ops.o \
52372 + extent_file_ops.o \
52373 + extent_flush_ops.o
52374 +
52375 +
52376 +
52377 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/sde.c linux-2.6.22/fs/reiser4/plugin/item/sde.c
52378 --- linux-2.6.22.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
52379 +++ linux-2.6.22/fs/reiser4/plugin/item/sde.c 2007-07-29 00:25:34.972721325 +0400
52380 @@ -0,0 +1,190 @@
52381 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52382 +
52383 +/* Directory entry implementation */
52384 +#include "../../forward.h"
52385 +#include "../../debug.h"
52386 +#include "../../dformat.h"
52387 +#include "../../kassign.h"
52388 +#include "../../coord.h"
52389 +#include "sde.h"
52390 +#include "item.h"
52391 +#include "../plugin.h"
52392 +#include "../../znode.h"
52393 +#include "../../carry.h"
52394 +#include "../../tree.h"
52395 +#include "../../inode.h"
52396 +
52397 +#include <linux/fs.h> /* for struct inode */
52398 +#include <linux/dcache.h> /* for struct dentry */
52399 +#include <linux/quotaops.h>
52400 +
52401 +/* ->extract_key() method of simple directory item plugin. */
52402 +int extract_key_de(const coord_t * coord /* coord of item */ ,
52403 + reiser4_key * key /* resulting key */ )
52404 +{
52405 + directory_entry_format *dent;
52406 +
52407 + assert("nikita-1458", coord != NULL);
52408 + assert("nikita-1459", key != NULL);
52409 +
52410 + dent = (directory_entry_format *) item_body_by_coord(coord);
52411 + assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
52412 + return extract_key_from_id(&dent->id, key);
52413 +}
52414 +
52415 +int
52416 +update_key_de(const coord_t * coord, const reiser4_key * key,
52417 + lock_handle * lh UNUSED_ARG)
52418 +{
52419 + directory_entry_format *dent;
52420 + obj_key_id obj_id;
52421 + int result;
52422 +
52423 + assert("nikita-2342", coord != NULL);
52424 + assert("nikita-2343", key != NULL);
52425 +
52426 + dent = (directory_entry_format *) item_body_by_coord(coord);
52427 + result = build_obj_key_id(key, &obj_id);
52428 + if (result == 0) {
52429 + dent->id = obj_id;
52430 + znode_make_dirty(coord->node);
52431 + }
52432 +	return result; /* propagate build_obj_key_id() failure */
52433 +}
52434 +
52435 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
52436 + char *buf)
52437 +{
52438 + reiser4_key key;
52439 +
52440 + unit_key_by_coord(coord, &key);
52441 + if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
52442 + reiser4_print_address("oops", znode_get_block(coord->node));
52443 + if (!is_longname_key(&key)) {
52444 + if (is_dot_key(&key))
52445 + return (char *)".";
52446 + else
52447 + return extract_name_from_key(&key, buf);
52448 + } else
52449 + return (char *)dent->name;
52450 +}
52451 +
52452 +/* ->extract_name() method of simple directory item plugin. */
52453 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
52454 +{
52455 + directory_entry_format *dent;
52456 +
52457 + assert("nikita-1460", coord != NULL);
52458 +
52459 + dent = (directory_entry_format *) item_body_by_coord(coord);
52460 + return extract_dent_name(coord, dent, buf);
52461 +}
52462 +
52463 +/* ->extract_file_type() method of simple directory item plugin. */
52464 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
52465 + * item */ )
52466 +{
52467 + assert("nikita-1764", coord != NULL);
52468 + /* we don't store file type in the directory entry yet.
52469 +
52470 + But see comments at kassign.h:obj_key_id
52471 + */
52472 + return DT_UNKNOWN;
52473 +}
52474 +
52475 +int add_entry_de(struct inode *dir /* directory of item */ ,
52476 + coord_t * coord /* coord of item */ ,
52477 + lock_handle * lh /* insertion lock handle */ ,
52478 + const struct dentry *de /* name to add */ ,
52479 + reiser4_dir_entry_desc * entry /* parameters of new directory
52480 + * entry */ )
52481 +{
52482 + reiser4_item_data data;
52483 + directory_entry_format *dent;
52484 + int result;
52485 + const char *name;
52486 + int len;
52487 + int longname;
52488 +
52489 + name = de->d_name.name;
52490 + len = de->d_name.len;
52491 + assert("nikita-1163", strlen(name) == len);
52492 +
52493 + longname = is_longname(name, len);
52494 +
52495 + data.length = sizeof *dent;
52496 + if (longname)
52497 + data.length += len + 1;
52498 + data.data = NULL;
52499 + data.user = 0;
52500 + data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
52501 +
52502 + /* NOTE-NIKITA quota plugin */
52503 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
52504 + return -EDQUOT;
52505 +
52506 + result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
52507 + if (result != 0)
52508 + return result;
52509 +
52510 + dent = (directory_entry_format *) item_body_by_coord(coord);
52511 + build_inode_key_id(entry->obj, &dent->id);
52512 + if (longname) {
52513 + memcpy(dent->name, name, len);
52514 + put_unaligned(0, &dent->name[len]);
52515 + }
52516 + return 0;
52517 +}
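/* Editor's sketch: add_entry_de() above sizes a variable-length entry --
   a fixed header (the stat-data key id) plus, for long names only, the
   name body and its trailing NUL; short names are encoded entirely in
   the entry's key. A simplified userspace model (the 15-byte threshold
   and all names here are invented; the real test is is_longname()): */

#include <stdio.h>
#include <string.h>

struct toy_dent_header { unsigned char id[16]; }; /* stands in for obj_key_id */

#define TOY_MAX_SHORT_NAME 15

static size_t toy_entry_size(const char *name)
{
	size_t len = strlen(name);
	size_t size = sizeof(struct toy_dent_header);

	if (len > TOY_MAX_SHORT_NAME)	/* long name: stored in the body */
		size += len + 1;	/* plus the trailing NUL */
	return size;
}

int main(void)
{
	printf("%zu\n", toy_entry_size("short"));		     /* 16 */
	printf("%zu\n", toy_entry_size("a-rather-long-file-name")); /* 40 */
	return 0;
}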
52518 +
52519 +int rem_entry_de(struct inode *dir /* directory of item */ ,
52520 + const struct qstr *name UNUSED_ARG,
52521 + coord_t * coord /* coord of item */ ,
52522 + lock_handle * lh UNUSED_ARG /* lock handle for
52523 + * removal */ ,
52524 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52525 + * directory entry
52526 + * being removed */ )
52527 +{
52528 + coord_t shadow;
52529 + int result;
52530 + int length;
52531 +
52532 + length = item_length_by_coord(coord);
52533 + if (inode_get_bytes(dir) < length) {
52534 +		warning("nikita-2627", "Dir is broken: %llu: %llu",
52535 + (unsigned long long)get_inode_oid(dir),
52536 + inode_get_bytes(dir));
52537 +
52538 + return RETERR(-EIO);
52539 + }
52540 +
52541 + /* cut_node() is supposed to take pointers to _different_
52542 +	   coords, because it will modify them without regard to
52543 + possible aliasing. To work around this, create temporary copy
52544 + of @coord.
52545 + */
52546 + coord_dup(&shadow, coord);
52547 + result =
52548 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
52549 + if (result == 0) {
52550 + /* NOTE-NIKITA quota plugin */
52551 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
52552 + }
52553 + return result;
52554 +}
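/* Editor's note: the coord_dup() above exists because the callee mutates
   both coord arguments independently. The hazard in miniature (toy types
   and names invented; only the aliasing point carries over): */

#include <stdio.h>

struct toy_range { int from, to; };

/* clobbers both arguments; correct only if they do not alias */
static void toy_cut(struct toy_range *a, struct toy_range *b)
{
	a->to = a->from;	/* would corrupt *b if a == b */
	b->from = b->to;
}

int main(void)
{
	struct toy_range r = { 1, 9 };
	struct toy_range shadow = r;	/* temporary copy, like coord_dup() */

	toy_cut(&r, &shadow);		/* safe: two distinct objects */
	printf("%d %d\n", r.to, shadow.from);	/* 1 9 */
	return 0;
}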
52555 +
52556 +int max_name_len_de(const struct inode *dir)
52557 +{
52558 + return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
52559 + sizeof(directory_entry_format) - 2;
52560 +}
52561 +
52562 +/* Make Linus happy.
52563 + Local variables:
52564 + c-indentation-style: "K&R"
52565 + mode-name: "LC"
52566 + c-basic-offset: 8
52567 + tab-width: 8
52568 + fill-column: 120
52569 + End:
52570 +*/
52571 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/sde.h linux-2.6.22/fs/reiser4/plugin/item/sde.h
52572 --- linux-2.6.22.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
52573 +++ linux-2.6.22/fs/reiser4/plugin/item/sde.h 2007-07-29 00:25:34.976722360 +0400
52574 @@ -0,0 +1,66 @@
52575 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52576 +
52577 +/* Directory entry. */
52578 +
52579 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
52580 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
52581 +
52582 +#include "../../forward.h"
52583 +#include "../../dformat.h"
52584 +#include "../../kassign.h"
52585 +#include "../../key.h"
52586 +
52587 +#include <linux/fs.h>
52588 +#include <linux/dcache.h> /* for struct dentry */
52589 +
52590 +typedef struct directory_entry_format {
52591 + /* key of object stat-data. It's not necessary to store whole
52592 + key here, because it's always key of stat-data, so minor
52593 + packing locality and offset can be omitted here. But this
52594 + relies on particular key allocation scheme for stat-data, so,
52595 +	   for extensibility's sake, the whole key can be stored here.
52596 +
52597 + We store key as array of bytes, because we don't want 8-byte
52598 + alignment of dir entries.
52599 + */
52600 + obj_key_id id;
52601 + /* file name. Null terminated string. */
52602 + d8 name[0];
52603 +} directory_entry_format;
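/* Editor's sketch: the comment above stores the key as a byte array so
   directory entries need no 8-byte alignment; readers then go through
   unaligned accessors. A portable userspace equivalent of reading a
   little-endian 64-bit field from unaligned storage (function name
   invented): */

#include <stdint.h>
#include <stdio.h>

static uint64_t toy_read_le64(const unsigned char *p)
{
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)	/* assemble byte by byte: no */
		v = (v << 8) | p[i];	/* alignment or endianness traps */
	return v;
}

int main(void)
{
	unsigned char raw[9] = { 0, 0x2a, 0, 0, 0, 0, 0, 0, 0 };

	/* read from an odd (unaligned) address */
	printf("%llu\n", (unsigned long long)toy_read_le64(raw + 1)); /* 42 */
	return 0;
}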
52604 +
52605 +void print_de(const char *prefix, coord_t * coord);
52606 +int extract_key_de(const coord_t * coord, reiser4_key * key);
52607 +int update_key_de(const coord_t * coord, const reiser4_key * key,
52608 + lock_handle * lh);
52609 +char *extract_name_de(const coord_t * coord, char *buf);
52610 +unsigned extract_file_type_de(const coord_t * coord);
52611 +int add_entry_de(struct inode *dir, coord_t * coord,
52612 + lock_handle * lh, const struct dentry *name,
52613 + reiser4_dir_entry_desc * entry);
52614 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
52615 + lock_handle * lh, reiser4_dir_entry_desc * entry);
52616 +int max_name_len_de(const struct inode *dir);
52617 +
52618 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
52619 +
52620 +char *extract_dent_name(const coord_t * coord,
52621 + directory_entry_format * dent, char *buf);
52622 +
52623 +#if REISER4_LARGE_KEY
52624 +#define DE_NAME_BUF_LEN (24)
52625 +#else
52626 +#define DE_NAME_BUF_LEN (16)
52627 +#endif
52628 +
52629 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
52630 +#endif
52631 +
52632 +/* Make Linus happy.
52633 + Local variables:
52634 + c-indentation-style: "K&R"
52635 + mode-name: "LC"
52636 + c-basic-offset: 8
52637 + tab-width: 8
52638 + fill-column: 120
52639 + End:
52640 +*/
52641 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.22/fs/reiser4/plugin/item/static_stat.c
52642 --- linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
52643 +++ linux-2.6.22/fs/reiser4/plugin/item/static_stat.c 2007-07-29 00:25:34.976722360 +0400
52644 @@ -0,0 +1,1107 @@
52645 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52646 +
52647 +/* stat data manipulation. */
52648 +
52649 +#include "../../forward.h"
52650 +#include "../../super.h"
52651 +#include "../../vfs_ops.h"
52652 +#include "../../inode.h"
52653 +#include "../../debug.h"
52654 +#include "../../dformat.h"
52655 +#include "../object.h"
52656 +#include "../plugin.h"
52657 +#include "../plugin_header.h"
52658 +#include "static_stat.h"
52659 +#include "item.h"
52660 +
52661 +#include <linux/types.h>
52662 +#include <linux/fs.h>
52663 +
52664 +/* see static_stat.h for explanation */
52665 +
52666 +/* helper function used while we are dumping/loading inode/plugin state
52667 + to/from the stat-data. */
52668 +
52669 +static void move_on(int *length /* space remaining in stat-data */ ,
52670 + char **area /* current coord in stat data */ ,
52671 + int size_of /* how many bytes to move forward */ )
52672 +{
52673 + assert("nikita-615", length != NULL);
52674 + assert("nikita-616", area != NULL);
52675 +
52676 + *length -= size_of;
52677 + *area += size_of;
52678 +
52679 + assert("nikita-617", *length >= 0);
52680 +}
52681 +
52682 +/* helper function used while loading inode/plugin state from stat-data.
52683 + Complain if there is less space in stat-data than was expected.
52684 + Can only happen on disk corruption. */
52685 +static int not_enough_space(struct inode *inode /* object being processed */ ,
52686 + const char *where /* error message */ )
52687 +{
52688 + assert("nikita-618", inode != NULL);
52689 +
52690 + warning("nikita-619", "Not enough space in %llu while loading %s",
52691 + (unsigned long long)get_inode_oid(inode), where);
52692 +
52693 + return RETERR(-EINVAL);
52694 +}
52695 +
52696 +/* helper function used while loading inode/plugin state from
52697 + stat-data. Call it if invalid plugin id was found. */
52698 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
52699 + struct inode *inode /* object being processed */ )
52700 +{
52701 + warning("nikita-620", "Unknown plugin %i in %llu",
52702 + id, (unsigned long long)get_inode_oid(inode));
52703 +
52704 + return RETERR(-EINVAL);
52705 +}
52706 +
52707 +/* this is installed as ->init_inode() method of
52708 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
52709 + Copies data from on-disk stat-data format into inode.
52710 + Handles stat-data extensions. */
52711 +/* was sd_load */
52712 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
52713 + char *sd /* stat-data body */ ,
52714 + int len /* length of stat-data */ )
52715 +{
52716 + int result;
52717 + int bit;
52718 + int chunk;
52719 + __u16 mask;
52720 + __u64 bigmask;
52721 + reiser4_stat_data_base *sd_base;
52722 + reiser4_inode *state;
52723 +
52724 + assert("nikita-625", inode != NULL);
52725 + assert("nikita-626", sd != NULL);
52726 +
52727 + result = 0;
52728 + sd_base = (reiser4_stat_data_base *) sd;
52729 + state = reiser4_inode_data(inode);
52730 + mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
52731 + bigmask = mask;
52732 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
52733 +
52734 + move_on(&len, &sd, sizeof *sd_base);
52735 + for (bit = 0, chunk = 0;
52736 + mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
52737 + ++bit, mask >>= 1) {
52738 + if (((bit + 1) % 16) != 0) {
52739 + /* handle extension */
52740 + sd_ext_plugin *sdplug;
52741 +
52742 + if (bit >= LAST_SD_EXTENSION) {
52743 + warning("vpf-1904",
52744 + "No such extension %i in inode %llu",
52745 + bit,
52746 + (unsigned long long)
52747 + get_inode_oid(inode));
52748 +
52749 + result = RETERR(-EINVAL);
52750 + break;
52751 + }
52752 +
52753 + sdplug = sd_ext_plugin_by_id(bit);
52754 + if (sdplug == NULL) {
52755 + warning("nikita-627",
52756 + "No such extension %i in inode %llu",
52757 + bit,
52758 + (unsigned long long)
52759 + get_inode_oid(inode));
52760 +
52761 + result = RETERR(-EINVAL);
52762 + break;
52763 + }
52764 + if (mask & 1) {
52765 + assert("nikita-628", sdplug->present);
52766 + /* alignment is not supported in node layout
52767 + plugin yet.
52768 + result = align( inode, &len, &sd,
52769 + sdplug -> alignment );
52770 + if( result != 0 )
52771 + return result; */
52772 + result = sdplug->present(inode, &sd, &len);
52773 + } else if (sdplug->absent != NULL)
52774 + result = sdplug->absent(inode);
52775 + if (result)
52776 + break;
52777 + /* else, we are looking at the last bit in 16-bit
52778 + portion of bitmask */
52779 + } else if (mask & 1) {
52780 + /* next portion of bitmask */
52781 + if (len < (int)sizeof(d16)) {
52782 + warning("nikita-629",
52783 + "No space for bitmap in inode %llu",
52784 + (unsigned long long)
52785 + get_inode_oid(inode));
52786 +
52787 + result = RETERR(-EINVAL);
52788 + break;
52789 + }
52790 + mask = le16_to_cpu(get_unaligned((d16 *)sd));
52791 + bigmask <<= 16;
52792 + bigmask |= mask;
52793 + move_on(&len, &sd, sizeof(d16));
52794 + ++chunk;
52795 + if (chunk == 3) {
52796 + if (!(mask & 0x8000)) {
52797 + /* clear last bit */
52798 + mask &= ~0x8000;
52799 + continue;
52800 + }
52801 + /* too much */
52802 + warning("nikita-630",
52803 + "Too many extensions in %llu",
52804 + (unsigned long long)
52805 + get_inode_oid(inode));
52806 +
52807 + result = RETERR(-EINVAL);
52808 + break;
52809 + }
52810 + } else
52811 + /* bitmask exhausted */
52812 + break;
52813 + }
52814 + state->extmask = bigmask;
52815 + /* common initialisations */
52816 + if (len - (bit / 16 * sizeof(d16)) > 0) {
52817 + /* alignment in save_len_static_sd() is taken into account
52818 + -edward */
52819 + warning("nikita-631", "unused space in inode %llu",
52820 + (unsigned long long)get_inode_oid(inode));
52821 + }
52822 +
52823 + return result;
52824 +}
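/* Editor's sketch: the loop above walks a chained extension mask -- each
   16-bit word carries presence bits plus a high continuation bit saying
   that another mask word is embedded further into the stat-data body.
   A simplified stand-alone decoder (extension numbering is flattened
   here and plugin dispatch is omitted, so this illustrates the layout
   rather than reimplementing the function): */

#include <stdint.h>
#include <stdio.h>

#define TOY_CONTINUATION 0x8000u

static void toy_list_extensions(const uint16_t *word, int nwords)
{
	int w, ext = 0;

	for (w = 0; w < nwords; w++) {
		uint16_t mask = word[w];
		int b;

		for (b = 0; b < 15; b++, ext++)
			if (mask & (1u << b))
				printf("extension %d present\n", ext);
		if (!(mask & TOY_CONTINUATION))
			return;		/* no further mask words */
	}
}

int main(void)
{
	/* ext 0 and 1 present, continuation set; next word: ext 15 */
	uint16_t sd[] = { 0x8003u, 0x0001u };

	toy_list_extensions(sd, 2);
	return 0;
}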
52825 +
52826 +/* estimates size of stat-data required to store inode.
52827 + Installed as ->save_len() method of
52828 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
52829 +/* was sd_len */
52830 +int save_len_static_sd(struct inode *inode /* object being processed */ )
52831 +{
52832 + unsigned int result;
52833 + __u64 mask;
52834 + int bit;
52835 +
52836 + assert("nikita-632", inode != NULL);
52837 +
52838 + result = sizeof(reiser4_stat_data_base);
52839 + mask = reiser4_inode_data(inode)->extmask;
52840 + for (bit = 0; mask != 0; ++bit, mask >>= 1) {
52841 + if (mask & 1) {
52842 + sd_ext_plugin *sdplug;
52843 +
52844 + sdplug = sd_ext_plugin_by_id(bit);
52845 + assert("nikita-633", sdplug != NULL);
52846 +			/* no alignment support
52847 + result +=
52848 + round_up( result, sdplug -> alignment ) - result; */
52849 + result += sdplug->save_len(inode);
52850 + }
52851 + }
52852 + result += bit / 16 * sizeof(d16);
52853 + return result;
52854 +}
52855 +
52856 +/* saves inode into stat-data.
52857 + Installed as ->save() method of
52858 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
52859 +/* was sd_save */
52860 +int save_static_sd(struct inode *inode /* object being processed */ ,
52861 + char **area /* where to save stat-data */ )
52862 +{
52863 + int result;
52864 + __u64 emask;
52865 + int bit;
52866 + unsigned int len;
52867 + reiser4_stat_data_base *sd_base;
52868 +
52869 + assert("nikita-634", inode != NULL);
52870 + assert("nikita-635", area != NULL);
52871 +
52872 + result = 0;
52873 + emask = reiser4_inode_data(inode)->extmask;
52874 + sd_base = (reiser4_stat_data_base *) * area;
52875 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
52876 + /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
52877 +
52878 + *area += sizeof *sd_base;
52879 + len = 0xffffffffu;
52880 + for (bit = 0; emask != 0; ++bit, emask >>= 1) {
52881 + if (emask & 1) {
52882 + if ((bit + 1) % 16 != 0) {
52883 + sd_ext_plugin *sdplug;
52884 + sdplug = sd_ext_plugin_by_id(bit);
52885 + assert("nikita-636", sdplug != NULL);
52886 + /* no alignment support yet
52887 + align( inode, &len, area,
52888 + sdplug -> alignment ); */
52889 + result = sdplug->save(inode, area);
52890 + if (result)
52891 + break;
52892 + } else {
52893 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
52894 + (d16 *)(*area));
52895 + /*cputod16((unsigned)(emask & 0xffff),
52896 + (d16 *) * area);*/
52897 + *area += sizeof(d16);
52898 + }
52899 + }
52900 + }
52901 + return result;
52902 +}
52903 +
52904 +/* stat-data extension handling functions. */
52905 +
52906 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
52907 + char **area /* position in stat-data */ ,
52908 + int *len /* remaining length */ )
52909 +{
52910 + if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
52911 + reiser4_light_weight_stat *sd_lw;
52912 +
52913 + sd_lw = (reiser4_light_weight_stat *) * area;
52914 +
52915 + inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
52916 + inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
52917 + inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
52918 + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
52919 + inode->i_mode &= ~S_IFIFO;
52920 + warning("", "partially converted file is encountered");
52921 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
52922 + }
52923 + move_on(len, area, sizeof *sd_lw);
52924 + return 0;
52925 + } else
52926 + return not_enough_space(inode, "lw sd");
52927 +}
52928 +
52929 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
52930 + * processed */ )
52931 +{
52932 + return sizeof(reiser4_light_weight_stat);
52933 +}
52934 +
52935 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
52936 + char **area /* position in stat-data */ )
52937 +{
52938 + reiser4_light_weight_stat *sd;
52939 + mode_t delta;
52940 +
52941 + assert("nikita-2705", inode != NULL);
52942 + assert("nikita-2706", area != NULL);
52943 + assert("nikita-2707", *area != NULL);
52944 +
52945 + sd = (reiser4_light_weight_stat *) * area;
52946 +
52947 + delta = (reiser4_inode_get_flag(inode,
52948 + REISER4_PART_MIXED) ? S_IFIFO : 0);
52949 + put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
52950 + put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
52951 + put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
52952 + *area += sizeof *sd;
52953 + return 0;
52954 +}
52955 +
52956 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
52957 + char **area /* position in stat-data */ ,
52958 + int *len /* remaining length */ )
52959 +{
52960 + assert("nikita-637", inode != NULL);
52961 + assert("nikita-638", area != NULL);
52962 + assert("nikita-639", *area != NULL);
52963 + assert("nikita-640", len != NULL);
52964 + assert("nikita-641", *len > 0);
52965 +
52966 + if (*len >= (int)sizeof(reiser4_unix_stat)) {
52967 + reiser4_unix_stat *sd;
52968 +
52969 + sd = (reiser4_unix_stat *) * area;
52970 +
52971 + inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
52972 + inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
52973 + inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
52974 + inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
52975 + inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
52976 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
52977 + inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
52978 + else
52979 + inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
52980 + move_on(len, area, sizeof *sd);
52981 + return 0;
52982 + } else
52983 + return not_enough_space(inode, "unix sd");
52984 +}
52985 +
52986 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
52987 +{
52988 + inode->i_uid = get_super_private(inode->i_sb)->default_uid;
52989 + inode->i_gid = get_super_private(inode->i_sb)->default_gid;
52990 + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
52991 + inode_set_bytes(inode, inode->i_size);
52992 + /* mark inode as lightweight, so that caller (lookup_common) will
52993 + complete initialisation by copying [ug]id from a parent. */
52994 + reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
52995 + return 0;
52996 +}
52997 +
52998 +/* Audited by: green(2002.06.14) */
52999 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53000 + * processed */ )
53001 +{
53002 + return sizeof(reiser4_unix_stat);
53003 +}
53004 +
53005 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
53006 + char **area /* position in stat-data */ )
53007 +{
53008 + reiser4_unix_stat *sd;
53009 +
53010 + assert("nikita-642", inode != NULL);
53011 + assert("nikita-643", area != NULL);
53012 + assert("nikita-644", *area != NULL);
53013 +
53014 + sd = (reiser4_unix_stat *) * area;
53015 + put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53016 + put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53017 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53018 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53019 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53020 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53021 + put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53022 + else
53023 + put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53024 + *area += sizeof *sd;
53025 + return 0;
53026 +}
53027 +
53028 +static int
53029 +present_large_times_sd(struct inode *inode /* object being processed */ ,
53030 + char **area /* position in stat-data */ ,
53031 + int *len /* remaining length */ )
53032 +{
53033 + if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53034 + reiser4_large_times_stat *sd_lt;
53035 +
53036 + sd_lt = (reiser4_large_times_stat *) * area;
53037 +
53038 + inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53039 + inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53040 + inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53041 +
53042 + move_on(len, area, sizeof *sd_lt);
53043 + return 0;
53044 + } else
53045 + return not_enough_space(inode, "large times sd");
53046 +}
53047 +
53048 +static int
53049 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
53050 + /* object being processed */ )
53051 +{
53052 + return sizeof(reiser4_large_times_stat);
53053 +}
53054 +
53055 +static int
53056 +save_large_times_sd(struct inode *inode /* object being processed */ ,
53057 + char **area /* position in stat-data */ )
53058 +{
53059 + reiser4_large_times_stat *sd;
53060 +
53061 + assert("nikita-2817", inode != NULL);
53062 + assert("nikita-2818", area != NULL);
53063 + assert("nikita-2819", *area != NULL);
53064 +
53065 + sd = (reiser4_large_times_stat *) * area;
53066 +
53067 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53068 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53069 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53070 +
53071 + *area += sizeof *sd;
53072 + return 0;
53073 +}
53074 +
53075 +/* symlink stat data extension */
53076 +
53077 +/* allocate memory for symlink target and attach it to inode->i_private */
53078 +static int
53079 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
53080 +{
53081 + assert("vs-845", inode->i_private == NULL);
53082 + assert("vs-846", !reiser4_inode_get_flag(inode,
53083 + REISER4_GENERIC_PTR_USED));
53084 + /* FIXME-VS: this is prone to deadlock. Not more than other similar
53085 + places, though */
53086 + inode->i_private = kmalloc((size_t) len + 1,
53087 + reiser4_ctx_gfp_mask_get());
53088 + if (!inode->i_private)
53089 + return RETERR(-ENOMEM);
53090 +
53091 + memcpy((char *)(inode->i_private), target, (size_t) len);
53092 + ((char *)(inode->i_private))[len] = 0;
53093 + reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53094 + return 0;
53095 +}
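/* Editor's sketch: symlink_target_to_inode() above is the standard
   "copy a length-delimited string and NUL-terminate it" idiom, with
   kmalloc() and inode->i_private in place of the malloc() and return
   value used in this userspace rendition (all names invented): */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *toy_copy_target(const char *target, size_t len)
{
	char *buf = malloc(len + 1);	/* kernel: kmalloc(len + 1, ...) */

	if (buf == NULL)
		return NULL;		/* kernel: RETERR(-ENOMEM) */
	memcpy(buf, target, len);
	buf[len] = 0;			/* stored body is NUL-terminated */
	return buf;
}

int main(void)
{
	char *t = toy_copy_target("/some/target/path", 17);

	if (t != NULL) {
		puts(t);
		free(t);
	}
	return 0;
}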
53096 +
53097 +/* this is called on read_inode. There is nothing to do here actually, apart
53098 +   from some sanity checks */
53099 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
53100 +{
53101 + int result;
53102 + int length;
53103 + reiser4_symlink_stat *sd;
53104 +
53105 + length = (int)inode->i_size;
53106 + /*
53107 +	 * *len is the number of bytes in the stat data item from *area to the end of
53108 +	 * the item. It must be at least the symlink length + 1 for the terminating 0
53109 + */
53110 + if (length > *len)
53111 + return not_enough_space(inode, "symlink");
53112 +
53113 + if (*(*area + length) != 0) {
53114 + warning("vs-840", "Symlink is not zero terminated");
53115 + return RETERR(-EIO);
53116 + }
53117 +
53118 + sd = (reiser4_symlink_stat *) * area;
53119 + result = symlink_target_to_inode(inode, sd->body, length);
53120 +
53121 + move_on(len, area, length + 1);
53122 + return result;
53123 +}
53124 +
53125 +static int save_len_symlink_sd(struct inode *inode)
53126 +{
53127 + return inode->i_size + 1;
53128 +}
53129 +
53130 +/* this is called on create and on update of stat data. On update there is
53131 +   nothing to do except advance @area */
53132 +static int save_symlink_sd(struct inode *inode, char **area)
53133 +{
53134 + int result;
53135 + int length;
53136 + reiser4_symlink_stat *sd;
53137 +
53138 + length = (int)inode->i_size;
53139 + /* inode->i_size must be set already */
53140 + assert("vs-841", length);
53141 +
53142 + result = 0;
53143 + sd = (reiser4_symlink_stat *) * area;
53144 + if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53145 + const char *target;
53146 +
53147 + target = (const char *)(inode->i_private);
53148 + inode->i_private = NULL;
53149 +
53150 + result = symlink_target_to_inode(inode, target, length);
53151 +
53152 + /* copy symlink to stat data */
53153 + memcpy(sd->body, target, (size_t) length);
53154 + (*area)[length] = 0;
53155 + } else {
53156 + /* there is nothing to do in update but move area */
53157 + assert("vs-844",
53158 + !memcmp(inode->i_private, sd->body,
53159 + (size_t) length + 1));
53160 + }
53161 +
53162 + *area += (length + 1);
53163 + return result;
53164 +}
53165 +
53166 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
53167 + char **area /* position in stat-data */ ,
53168 + int *len /* remaining length */ )
53169 +{
53170 + assert("nikita-645", inode != NULL);
53171 + assert("nikita-646", area != NULL);
53172 + assert("nikita-647", *area != NULL);
53173 + assert("nikita-648", len != NULL);
53174 + assert("nikita-649", *len > 0);
53175 +
53176 + if (*len >= (int)sizeof(reiser4_flags_stat)) {
53177 + reiser4_flags_stat *sd;
53178 +
53179 + sd = (reiser4_flags_stat *) * area;
53180 + inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53181 + move_on(len, area, sizeof *sd);
53182 + return 0;
53183 + } else
53184 + return not_enough_space(inode, "generation and attrs");
53185 +}
53186 +
53187 +/* Audited by: green(2002.06.14) */
53188 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53189 + * processed */ )
53190 +{
53191 + return sizeof(reiser4_flags_stat);
53192 +}
53193 +
53194 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
53195 + char **area /* position in stat-data */ )
53196 +{
53197 + reiser4_flags_stat *sd;
53198 +
53199 + assert("nikita-650", inode != NULL);
53200 + assert("nikita-651", area != NULL);
53201 + assert("nikita-652", *area != NULL);
53202 +
53203 + sd = (reiser4_flags_stat *) * area;
53204 + put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53205 + *area += sizeof *sd;
53206 + return 0;
53207 +}
53208 +
53209 +static int absent_plugin_sd(struct inode *inode);
53210 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53211 + char **area /* position in stat-data */ ,
53212 + int *len /* remaining length */,
53213 + int is_pset /* 1 if plugin set, 0 if heir set. */)
53214 +{
53215 + reiser4_plugin_stat *sd;
53216 + reiser4_plugin *plugin;
53217 + reiser4_inode *info;
53218 + int i;
53219 + __u16 mask;
53220 + int result;
53221 + int num_of_plugins;
53222 +
53223 + assert("nikita-653", inode != NULL);
53224 + assert("nikita-654", area != NULL);
53225 + assert("nikita-655", *area != NULL);
53226 + assert("nikita-656", len != NULL);
53227 + assert("nikita-657", *len > 0);
53228 +
53229 + if (*len < (int)sizeof(reiser4_plugin_stat))
53230 + return not_enough_space(inode, "plugin");
53231 +
53232 + sd = (reiser4_plugin_stat *) * area;
53233 + info = reiser4_inode_data(inode);
53234 +
53235 + mask = 0;
53236 + num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53237 + move_on(len, area, sizeof *sd);
53238 + result = 0;
53239 + for (i = 0; i < num_of_plugins; ++i) {
53240 + reiser4_plugin_slot *slot;
53241 + reiser4_plugin_type type;
53242 + pset_member memb;
53243 +
53244 + slot = (reiser4_plugin_slot *) * area;
53245 + if (*len < (int)sizeof *slot)
53246 + return not_enough_space(inode, "additional plugin");
53247 +
53248 + memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53249 + type = aset_member_to_type_unsafe(memb);
53250 +
53251 + if (type == REISER4_PLUGIN_TYPES) {
53252 + warning("nikita-3502",
53253 + "wrong %s member (%i) for %llu", is_pset ?
53254 + "pset" : "hset", memb,
53255 + (unsigned long long)get_inode_oid(inode));
53256 + return RETERR(-EINVAL);
53257 + }
53258 + plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
53259 + type, &slot->id);
53260 + if (plugin == NULL)
53261 + return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53262 +
53263 + /* plugin is loaded into inode, mark this into inode's
53264 + bitmask of loaded non-standard plugins */
53265 + if (!(mask & (1 << memb))) {
53266 + mask |= (1 << memb);
53267 + } else {
53268 + warning("nikita-658", "duplicate plugin for %llu",
53269 + (unsigned long long)get_inode_oid(inode));
53270 + return RETERR(-EINVAL);
53271 + }
53272 + move_on(len, area, sizeof *slot);
53273 + /* load plugin data, if any */
53274 + if (plugin->h.pops != NULL && plugin->h.pops->load)
53275 + result = plugin->h.pops->load(inode, plugin, area, len);
53276 + else
53277 + result = aset_set_unsafe(is_pset ? &info->pset :
53278 + &info->hset, memb, plugin);
53279 + if (result)
53280 + return result;
53281 + }
53282 + if (is_pset) {
53283 + /* if object plugin wasn't loaded from stat-data, guess it by
53284 + mode bits */
53285 + plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53286 + if (plugin == NULL)
53287 + result = absent_plugin_sd(inode);
53288 + info->plugin_mask = mask;
53289 + } else
53290 + info->heir_mask = mask;
53291 +
53292 + return result;
53293 +}
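/* Editor's sketch: the on-disk layout walked by present_plugin_sd() is a
   16-bit slot count followed by packed (member, id) pairs, each possibly
   trailed by plugin-private state (pops->load(), omitted here). A
   simplified parser; endianness conversion is skipped, so it assumes a
   little-endian host, and every name is invented: */

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_slot { uint16_t memb; uint16_t id; };	/* 4 bytes, packed */

static int toy_parse_slots(const unsigned char *area, int len)
{
	uint16_t n;
	int i;

	if (len < (int)sizeof n)
		return -1;			/* not enough space */
	memcpy(&n, area, sizeof n);
	area += sizeof n; len -= sizeof n;

	for (i = 0; i < n; i++) {
		struct toy_slot s;

		if (len < (int)sizeof s)
			return -1;		/* truncated slot */
		memcpy(&s, area, sizeof s);
		printf("member %u -> plugin id %u\n", s.memb, s.id);
		area += sizeof s; len -= sizeof s;
	}
	return 0;
}

int main(void)
{
	/* one slot: member 2, plugin id 7 (little-endian bytes) */
	unsigned char sd[] = { 1, 0, 2, 0, 7, 0 };

	return toy_parse_slots(sd, (int)sizeof sd);
}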
53294 +
53295 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
53296 + return present_plugin_sd(inode, area, len, 1 /* pset */);
53297 +}
53298 +
53299 +/* Determine object plugin for @inode based on i_mode.
53300 +
53301 + Many objects in the reiser4 file system are controlled by standard object
53302 + plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
53303 +
53304 + For such files we don't explicitly store the plugin id in the object's stat
53305 + data. Rather, the required plugin is guessed from the mode bits, where the
53306 + file "type" is encoded (see stat(2)).
53307 +*/
53308 +static int
53309 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53310 +{
53311 + int fplug_id;
53312 + int dplug_id;
53313 + reiser4_inode *info;
53314 +
53315 + assert("nikita-736", inode != NULL);
53316 +
53317 + dplug_id = fplug_id = -1;
53318 +
53319 + switch (inode->i_mode & S_IFMT) {
53320 + case S_IFSOCK:
53321 + case S_IFBLK:
53322 + case S_IFCHR:
53323 + case S_IFIFO:
53324 + fplug_id = SPECIAL_FILE_PLUGIN_ID;
53325 + break;
53326 + case S_IFLNK:
53327 + fplug_id = SYMLINK_FILE_PLUGIN_ID;
53328 + break;
53329 + case S_IFDIR:
53330 + fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53331 + dplug_id = HASHED_DIR_PLUGIN_ID;
53332 + break;
53333 + default:
53334 + warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53335 + return RETERR(-EIO);
53336 + case S_IFREG:
53337 + fplug_id = UNIX_FILE_PLUGIN_ID;
53338 + break;
53339 + }
53340 + info = reiser4_inode_data(inode);
53341 + set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
53342 + plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
53343 + set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
53344 + plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
53345 + return 0;
53346 +}
53347 +
53348 +/* Audited by: green(2002.06.14) */
53349 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
53350 +{
53351 + int result;
53352 +
53353 + assert("nikita-659", inode != NULL);
53354 +
53355 + result = guess_plugin_by_mode(inode);
53356 + /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
53357 + but setup_inode_ops() will call make_bad_inode().
53358 +	   Another, more logical but a bit more complex solution is to add
53359 + "bad-file plugin". */
53360 + /* FIXME-VS: activate was called here */
53361 + return result;
53362 +}
53363 +
53364 +/* helper function for plugin_sd_save_len(): calculate how much space
53365 + required to save state of given plugin */
53366 +/* Audited by: green(2002.06.14) */
53367 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
53368 + struct inode *inode /* object being processed */ ,
53369 + pset_member memb,
53370 + int len, int is_pset)
53371 +{
53372 + reiser4_inode *info;
53373 + assert("nikita-661", inode != NULL);
53374 +
53375 + if (plugin == NULL)
53376 + return len;
53377 +
53378 + info = reiser4_inode_data(inode);
53379 + if (is_pset ?
53380 + info->plugin_mask & (1 << memb) :
53381 + info->heir_mask & (1 << memb)) {
53382 + len += sizeof(reiser4_plugin_slot);
53383 + if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
53384 + /* non-standard plugin, call method */
53385 + /* commented as it is incompatible with alignment
53386 + * policy in save_plug() -edward */
53387 + /* len = round_up(len, plugin->h.pops->alignment); */
53388 + len += plugin->h.pops->save_len(inode, plugin);
53389 + }
53390 + }
53391 + return len;
53392 +}
53393 +
53394 +/* calculate how much space is required to save state of all plugins,
53395 + associated with inode */
53396 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
53397 + int is_pset)
53398 +{
53399 + int len;
53400 + int last;
53401 + reiser4_inode *state;
53402 + pset_member memb;
53403 +
53404 + assert("nikita-663", inode != NULL);
53405 +
53406 + state = reiser4_inode_data(inode);
53407 +
53408 + /* common case: no non-standard plugins */
53409 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53410 + return 0;
53411 + len = sizeof(reiser4_plugin_stat);
53412 + last = PSET_LAST;
53413 +
53414 + for (memb = 0; memb < last; ++memb) {
53415 + len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
53416 + inode, memb, len, is_pset);
53417 + }
53418 + assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
53419 + return len;
53420 +}
53421 +
53422 +static int save_len_pset_sd(struct inode *inode) {
53423 + return save_len_plugin_sd(inode, 1 /* pset */);
53424 +}
53425 +
53426 +/* helper function for plugin_sd_save(): save plugin, associated with
53427 + inode. */
53428 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
53429 + struct inode *inode /* object being processed */ ,
53430 + int memb /* what element of pset is saved */ ,
53431 + char **area /* position in stat-data */ ,
53432 + int *count /* incremented if plugin were actually saved. */,
53433 + int is_pset /* 1 for plugin set, 0 for heir set */)
53434 +{
53435 + reiser4_plugin_slot *slot;
53436 + int fake_len;
53437 + int result;
53438 +
53439 + assert("nikita-665", inode != NULL);
53440 + assert("nikita-666", area != NULL);
53441 + assert("nikita-667", *area != NULL);
53442 +
53443 + if (plugin == NULL)
53444 + return 0;
53445 +
53446 + if (is_pset ?
53447 + !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
53448 + !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
53449 + return 0;
53450 + slot = (reiser4_plugin_slot *) * area;
53451 + put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
53452 + put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
53453 + fake_len = (int)0xffff;
53454 + move_on(&fake_len, area, sizeof *slot);
53455 + ++*count;
53456 + result = 0;
53457 + if (plugin->h.pops != NULL) {
53458 + if (plugin->h.pops->save != NULL)
53459 + result = plugin->h.pops->save(inode, plugin, area);
53460 + }
53461 + return result;
53462 +}
53463 +
53464 +/* save state of all non-standard plugins associated with inode */
53465 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
53466 + char **area /* position in stat-data */,
53467 + int is_pset /* 1 for pset, 0 for hset */)
53468 +{
53469 + int fake_len;
53470 + int result = 0;
53471 + int num_of_plugins;
53472 + reiser4_plugin_stat *sd;
53473 + reiser4_inode *state;
53474 + pset_member memb;
53475 +
53476 + assert("nikita-669", inode != NULL);
53477 + assert("nikita-670", area != NULL);
53478 + assert("nikita-671", *area != NULL);
53479 +
53480 + state = reiser4_inode_data(inode);
53481 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53482 + return 0;
53483 + sd = (reiser4_plugin_stat *) * area;
53484 + fake_len = (int)0xffff;
53485 + move_on(&fake_len, area, sizeof *sd);
53486 +
53487 + num_of_plugins = 0;
53488 + for (memb = 0; memb < PSET_LAST; ++memb) {
53489 + result = save_plug(aset_get(is_pset ? state->pset : state->hset,
53490 + memb),
53491 + inode, memb, area, &num_of_plugins, is_pset);
53492 + if (result != 0)
53493 + break;
53494 + }
53495 +
53496 + put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
53497 + return result;
53498 +}
53499 +
53500 +static int save_pset_sd(struct inode *inode, char **area) {
53501 + return save_plugin_sd(inode, area, 1 /* pset */);
53502 +}
53503 +
53504 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
53505 + return present_plugin_sd(inode, area, len, 0 /* hset */);
53506 +}
53507 +
53508 +static int save_len_hset_sd(struct inode *inode) {
53509 +	return save_len_plugin_sd(inode, 0 /* hset */);
53510 +}
53511 +
53512 +static int save_hset_sd(struct inode *inode, char **area) {
53513 + return save_plugin_sd(inode, area, 0 /* hset */);
53514 +}
53515 +
53516 +/* helper function for crypto_sd_present(), crypto_sd_save.
53517 + Extract crypto info from stat-data and attach it to inode */
53518 +static int extract_crypto_info (struct inode * inode,
53519 + reiser4_crypto_stat * sd)
53520 +{
53521 + struct reiser4_crypto_info * info;
53522 + assert("edward-11", !inode_crypto_info(inode));
53523 + assert("edward-1413",
53524 + !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
53525 + /* create and attach a crypto-stat without secret key loaded */
53526 + info = reiser4_alloc_crypto_info(inode);
53527 + if (IS_ERR(info))
53528 + return PTR_ERR(info);
53529 + info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
53530 + memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
53531 + reiser4_attach_crypto_info(inode, info);
53532 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53533 + return 0;
53534 +}
53535 +
53536 +/* crypto stat-data extension */
53537 +
53538 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
53539 +{
53540 + int result;
53541 + reiser4_crypto_stat *sd;
53542 + digest_plugin *dplug = inode_digest_plugin(inode);
53543 +
53544 + assert("edward-06", dplug != NULL);
53545 + assert("edward-684", dplug->fipsize);
53546 + assert("edward-07", area != NULL);
53547 + assert("edward-08", *area != NULL);
53548 + assert("edward-09", len != NULL);
53549 + assert("edward-10", *len > 0);
53550 +
53551 + if (*len < (int)sizeof(reiser4_crypto_stat)) {
53552 + return not_enough_space(inode, "crypto-sd");
53553 + }
53554 +	/* *len is the number of bytes in the stat data item from *area to the end
53555 +	   of the item. It must not be less than the size of this extension */
53556 + assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
53557 +
53558 + sd = (reiser4_crypto_stat *) * area;
53559 + result = extract_crypto_info(inode, sd);
53560 + move_on(len, area, sizeof(*sd) + dplug->fipsize);
53561 +
53562 + return result;
53563 +}
53564 +
53565 +static int save_len_crypto_sd(struct inode *inode)
53566 +{
53567 + return sizeof(reiser4_crypto_stat) +
53568 + inode_digest_plugin(inode)->fipsize;
53569 +}
53570 +
53571 +static int save_crypto_sd(struct inode *inode, char **area)
53572 +{
53573 + int result = 0;
53574 + reiser4_crypto_stat *sd;
53575 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
53576 + digest_plugin *dplug = inode_digest_plugin(inode);
53577 +
53578 + assert("edward-12", dplug != NULL);
53579 + assert("edward-13", area != NULL);
53580 + assert("edward-14", *area != NULL);
53581 + assert("edward-15", info != NULL);
53582 + assert("edward-1414", info->keyid != NULL);
53583 + assert("edward-1415", info->keysize != 0);
53584 + assert("edward-76", reiser4_inode_data(inode) != NULL);
53585 +
53586 + if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
53587 + /* file is just created */
53588 + sd = (reiser4_crypto_stat *) *area;
53589 + /* copy everything but private key to the disk stat-data */
53590 + put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
53591 + memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
53592 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53593 + }
53594 + *area += (sizeof(*sd) + dplug->fipsize);
53595 + return result;
53596 +}
53597 +
53598 +static int eio(struct inode *inode, char **area, int *len)
53599 +{
53600 + return RETERR(-EIO);
53601 +}
53602 +
53603 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
53604 + [LIGHT_WEIGHT_STAT] = {
53605 + .h = {
53606 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53607 + .id = LIGHT_WEIGHT_STAT,
53608 + .pops = NULL,
53609 + .label = "light-weight sd",
53610 + .desc = "sd for light-weight files",
53611 + .linkage = {NULL,NULL}
53612 + },
53613 + .present = present_lw_sd,
53614 + .absent = NULL,
53615 + .save_len = save_len_lw_sd,
53616 + .save = save_lw_sd,
53617 + .alignment = 8
53618 + },
53619 + [UNIX_STAT] = {
53620 + .h = {
53621 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53622 + .id = UNIX_STAT,
53623 + .pops = NULL,
53624 + .label = "unix-sd",
53625 + .desc = "unix stat-data fields",
53626 + .linkage = {NULL,NULL}
53627 + },
53628 + .present = present_unix_sd,
53629 + .absent = absent_unix_sd,
53630 + .save_len = save_len_unix_sd,
53631 + .save = save_unix_sd,
53632 + .alignment = 8
53633 + },
53634 + [LARGE_TIMES_STAT] = {
53635 + .h = {
53636 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53637 + .id = LARGE_TIMES_STAT,
53638 + .pops = NULL,
53639 + .label = "64time-sd",
53640 + .desc = "nanosecond resolution for times",
53641 + .linkage = {NULL,NULL}
53642 + },
53643 + .present = present_large_times_sd,
53644 + .absent = NULL,
53645 + .save_len = save_len_large_times_sd,
53646 + .save = save_large_times_sd,
53647 + .alignment = 8
53648 + },
53649 + [SYMLINK_STAT] = {
53650 + /* stat data of symlink has this extension */
53651 + .h = {
53652 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53653 + .id = SYMLINK_STAT,
53654 + .pops = NULL,
53655 + .label = "symlink-sd",
53656 + .desc =
53657 + "stat data is appended with symlink name",
53658 + .linkage = {NULL,NULL}
53659 + },
53660 + .present = present_symlink_sd,
53661 + .absent = NULL,
53662 + .save_len = save_len_symlink_sd,
53663 + .save = save_symlink_sd,
53664 + .alignment = 8
53665 + },
53666 + [PLUGIN_STAT] = {
53667 + .h = {
53668 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53669 + .id = PLUGIN_STAT,
53670 + .pops = NULL,
53671 + .label = "plugin-sd",
53672 + .desc = "plugin stat-data fields",
53673 + .linkage = {NULL,NULL}
53674 + },
53675 + .present = present_pset_sd,
53676 + .absent = absent_plugin_sd,
53677 + .save_len = save_len_pset_sd,
53678 + .save = save_pset_sd,
53679 + .alignment = 8
53680 + },
53681 + [HEIR_STAT] = {
53682 + .h = {
53683 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53684 + .id = HEIR_STAT,
53685 + .pops = NULL,
53686 + .label = "heir-plugin-sd",
53687 + .desc = "heir plugin stat-data fields",
53688 + .linkage = {NULL,NULL}
53689 + },
53690 + .present = present_hset_sd,
53691 + .absent = NULL,
53692 + .save_len = save_len_hset_sd,
53693 + .save = save_hset_sd,
53694 + .alignment = 8
53695 + },
53696 + [FLAGS_STAT] = {
53697 + .h = {
53698 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53699 + .id = FLAGS_STAT,
53700 + .pops = NULL,
53701 + .label = "flags-sd",
53702 + .desc = "inode bit flags",
53703 + .linkage = {NULL, NULL}
53704 + },
53705 + .present = present_flags_sd,
53706 + .absent = NULL,
53707 + .save_len = save_len_flags_sd,
53708 + .save = save_flags_sd,
53709 + .alignment = 8
53710 + },
53711 + [CAPABILITIES_STAT] = {
53712 + .h = {
53713 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53714 + .id = CAPABILITIES_STAT,
53715 + .pops = NULL,
53716 + .label = "capabilities-sd",
53717 + .desc = "capabilities",
53718 + .linkage = {NULL, NULL}
53719 + },
53720 + .present = eio,
53721 + .absent = NULL,
53722 + .save_len = save_len_flags_sd,
53723 + .save = save_flags_sd,
53724 + .alignment = 8
53725 + },
53726 + [CRYPTO_STAT] = {
53727 + .h = {
53728 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53729 + .id = CRYPTO_STAT,
53730 + .pops = NULL,
53731 + .label = "crypto-sd",
53732 + .desc = "secret key size and id",
53733 + .linkage = {NULL, NULL}
53734 + },
53735 + .present = present_crypto_sd,
53736 + .absent = NULL,
53737 + .save_len = save_len_crypto_sd,
53738 + .save = save_crypto_sd,
53739 + .alignment = 8
53740 + }
53741 +};
53742 +
53743 +/* Make Linus happy.
53744 + Local variables:
53745 + c-indentation-style: "K&R"
53746 + mode-name: "LC"
53747 + c-basic-offset: 8
53748 + tab-width: 8
53749 + fill-column: 120
53750 + End:
53751 +*/
53752 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.22/fs/reiser4/plugin/item/static_stat.h
53753 --- linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
53754 +++ linux-2.6.22/fs/reiser4/plugin/item/static_stat.h 2007-07-29 00:25:34.976722360 +0400
53755 @@ -0,0 +1,224 @@
53756 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53757 +
53758 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
53759 +
53760 +When every file has at least the fields needed by the stat() syscall, it
53761 +is more compact to store those fields in this statically laid out
53762 +struct.
53763 +
53764 +If this item does not exist, then all stats are dynamically resolved.
53765 +At the moment, we either resolve all stats dynamically or all of them
53766 +statically. If you think this is not fully optimal, and the rest of
53767 +reiser4 is working, then fix it...:-)
53768 +
53769 +*/
53770 +
53771 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
53772 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
53773 +
53774 +#include "../../forward.h"
53775 +#include "../../dformat.h"
53776 +
53777 +#include <linux/fs.h> /* for struct inode */
53778 +
53779 +/* Stat data layout: goals and implementation.
53780 +
53781 + We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
53782 + them, including not having semantic metadata attached to them.
53783 +
53784 + There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
53785 + want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
53786 + sized structure because the statically sized structure knows without recording it what the names and lengths of the
53787 + attributes are.
53788 +
53789 + This leads to a natural compromise, which is to special case those files which have simply the standard unix file
53790 + attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
53791 + file in their use of file attributes.
53792 +
53793 + Yet this compromise deserves to be compromised a little.
53794 +
53795 + We accommodate the case where you have no more than the standard unix file attributes by using an "extension
53796 +   bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
53797 +
53798 +   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
53799 +   from the parent directory (such as uid, gid) or initialised to some sane values.
53800 +
53801 + To capitalize on existing code infrastructure, extensions are
53802 + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
53803 + Each stat-data extension plugin implements four methods:
53804 +
53805 + ->present() called by sd_load() when this extension is found in stat-data
53806 + ->absent() called by sd_load() when this extension is not found in stat-data
53807 + ->save_len() called by sd_len() to calculate total length of stat-data
53808 + ->save() called by sd_save() to store extension data into stat-data
53809 +
53810 + Implementation is in fs/reiser4/plugin/item/static_stat.c
53811 +*/
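/* Editor's sketch: the four-method interface described above, reduced to
   a stand-alone round trip -- absent() supplies defaults, save_len()
   sizes the extension, save() serialises it, present() reads it back.
   The toy inode and the single "mode" extension are invented for
   illustration: */

#include <stdio.h>
#include <string.h>

struct toy_inode { unsigned mode; };

struct toy_sd_ext_ops {
	int (*present)(struct toy_inode *, char **area, int *len);
	int (*absent)(struct toy_inode *);
	int (*save_len)(struct toy_inode *);
	int (*save)(struct toy_inode *, char **area);
};

static int toy_present(struct toy_inode *i, char **a, int *len)
{
	if (*len < (int)sizeof i->mode)
		return -1;			/* cf. not_enough_space() */
	memcpy(&i->mode, *a, sizeof i->mode);
	*a += sizeof i->mode;
	*len -= sizeof i->mode;
	return 0;
}

static int toy_absent(struct toy_inode *i) { i->mode = 0644; return 0; }

static int toy_save_len(struct toy_inode *i) { return sizeof i->mode; }

static int toy_save(struct toy_inode *i, char **a)
{
	memcpy(*a, &i->mode, sizeof i->mode);
	*a += sizeof i->mode;
	return 0;
}

static struct toy_sd_ext_ops toy_mode_ext = {
	toy_present, toy_absent, toy_save_len, toy_save
};

int main(void)
{
	struct toy_inode in;
	char buf[16], *area = buf;
	int len;

	toy_mode_ext.absent(&in);		/* defaults */
	len = toy_mode_ext.save_len(&in);
	toy_mode_ext.save(&in, &area);		/* write out... */
	area = buf;
	toy_mode_ext.present(&in, &area, &len);	/* ...and read back */
	printf("mode %o\n", in.mode);		/* mode 644 */
	return 0;
}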
53812 +
53813 +/* stat-data extension. Please order this by presumed frequency of use */
53814 +typedef enum {
53815 + /* support for light-weight files */
53816 + LIGHT_WEIGHT_STAT,
53817 + /* data required to implement unix stat(2) call. Layout is in
53818 + reiser4_unix_stat. If this is not present, file is light-weight */
53819 + UNIX_STAT,
53820 + /* this contains additional set of 32bit [anc]time fields to implement
53821 +	   nanosecond resolution. Layout is in reiser4_large_times_stat. Use
53822 +	   of this extension is governed by the 32bittimes mount option. */
53823 + LARGE_TIMES_STAT,
53824 + /* stat data has link name included */
53825 + SYMLINK_STAT,
53826 + /* on-disk slots of non-standard plugins for main plugin table
53827 + (@reiser4_inode->pset), that is, plugins that cannot be deduced
53828 +	   from file mode bits; for example, aggregation, interpolation, etc. */
53829 + PLUGIN_STAT,
53830 + /* this extension contains persistent inode flags. These flags are
53831 +	   single bits: immutable, append-only, etc. Layout is in
53832 + reiser4_flags_stat. */
53833 + FLAGS_STAT,
53834 + /* this extension contains capabilities sets, associated with this
53835 + file. Layout is in reiser4_capabilities_stat */
53836 + CAPABILITIES_STAT,
53837 + /* this extension contains size and public id of the secret key.
53838 + Layout is in reiser4_crypto_stat */
53839 + CRYPTO_STAT,
53840 + /* on-disk slots of non-default plugins for inheritance, which
53841 + are extracted to special plugin table (@reiser4_inode->hset).
53842 + By default, children of the object will inherit plugins from
53843 + its main plugin table (pset). */
53844 + HEIR_STAT,
53845 + LAST_SD_EXTENSION,
53846 + /*
53847 + * init_inode_static_sd() iterates over extension mask until all
53848 + * non-zero bits are processed. This means, that neither ->present(),
53849 + * nor ->absent() methods will be called for stat-data extensions that
53850 +	 * go after the last present extension. But for some basic extensions we
53851 +	 * want either the ->absent() or the ->present() method to be called,
53852 +	 * because these extensions set up something in the inode even when not
53853 + * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
53854 + * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
53855 + * ->present(), or ->absent() method will be called, independently of
53856 + * what other extensions are present.
53857 + */
53858 + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
53859 +} sd_ext_bits;
53860 +
53861 +/* minimal stat-data. This makes it possible to support light-weight files. */
53862 +typedef struct reiser4_stat_data_base {
53863 + /* 0 */ __le16 extmask;
53864 + /* 2 */
53865 +} PACKED reiser4_stat_data_base;
53866 +
53867 +typedef struct reiser4_light_weight_stat {
53868 + /* 0 */ __le16 mode;
53869 + /* 2 */ __le32 nlink;
53870 + /* 6 */ __le64 size;
53871 + /* size in bytes */
53872 + /* 14 */
53873 +} PACKED reiser4_light_weight_stat;
53874 +
53875 +typedef struct reiser4_unix_stat {
53876 + /* owner id */
53877 + /* 0 */ __le32 uid;
53878 + /* group id */
53879 + /* 4 */ __le32 gid;
53880 + /* access time */
53881 + /* 8 */ __le32 atime;
53882 + /* modification time */
53883 + /* 12 */ __le32 mtime;
53884 + /* change time */
53885 + /* 16 */ __le32 ctime;
53886 + union {
53887 + /* minor:major for device files */
53888 + /* 20 */ __le64 rdev;
53889 + /* bytes used by file */
53890 + /* 20 */ __le64 bytes;
53891 + } u;
53892 + /* 28 */
53893 +} PACKED reiser4_unix_stat;
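/* Editor's note: the byte offsets in the comments above (uid at 0, gid
   at 4, ..., the union at 20, 28 bytes total) rely on the PACKED
   attribute. They can be sanity-checked in userspace with a packed
   mirror of the struct (GCC/Clang attribute syntax; the mirror is an
   illustration, not the real type): */

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_unix_stat {
	uint32_t uid;
	uint32_t gid;
	uint32_t atime;
	uint32_t mtime;
	uint32_t ctime;
	union {
		uint64_t rdev;
		uint64_t bytes;
	} u;
} __attribute__((packed));

int main(void)
{
	/* prints: 0 4 8 12 16 20, total 28 */
	printf("%zu %zu %zu %zu %zu %zu, total %zu\n",
	       offsetof(struct toy_unix_stat, uid),
	       offsetof(struct toy_unix_stat, gid),
	       offsetof(struct toy_unix_stat, atime),
	       offsetof(struct toy_unix_stat, mtime),
	       offsetof(struct toy_unix_stat, ctime),
	       offsetof(struct toy_unix_stat, u),
	       sizeof(struct toy_unix_stat));
	return 0;
}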
53894 +
53895 +/* symlink stored as part of inode */
53896 +typedef struct reiser4_symlink_stat {
53897 + char body[0];
53898 +} PACKED reiser4_symlink_stat;
53899 +
53900 +typedef struct reiser4_plugin_slot {
53901 + /* 0 */ __le16 pset_memb;
53902 + /* 2 */ __le16 id;
53903 + /* 4 *//* here plugin stores its persistent state */
53904 +} PACKED reiser4_plugin_slot;
53905 +
53906 +/* stat-data extension for files with non-standard plugin. */
53907 +typedef struct reiser4_plugin_stat {
53908 + /* number of additional plugins, associated with this object */
53909 + /* 0 */ __le16 plugins_no;
53910 + /* 2 */ reiser4_plugin_slot slot[0];
53911 + /* 2 */
53912 +} PACKED reiser4_plugin_stat;
53913 +
53914 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
53915 + * bit mask. If need arise, this can be replaced with variable width
53916 + * bitmask. */
53917 +typedef struct reiser4_flags_stat {
53918 + /* 0 */ __le32 flags;
53919 + /* 4 */
53920 +} PACKED reiser4_flags_stat;
53921 +
53922 +typedef struct reiser4_capabilities_stat {
53923 + /* 0 */ __le32 effective;
53924 + /* 8 */ __le32 permitted;
53925 + /* 16 */
53926 +} PACKED reiser4_capabilities_stat;
53927 +
53928 +typedef struct reiser4_cluster_stat {
53929 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
53930 + /* 0 */ d8 cluster_shift;
53931 + /* 1 */
53932 +} PACKED reiser4_cluster_stat;
53933 +
53934 +typedef struct reiser4_crypto_stat {
53935 + /* secret key size, bits */
53936 + /* 0 */ d16 keysize;
53937 + /* secret key id */
53938 + /* 2 */ d8 keyid[0];
53939 + /* 2 */
53940 +} PACKED reiser4_crypto_stat;
53941 +
53942 +typedef struct reiser4_large_times_stat {
53943 + /* access time */
53944 + /* 0 */ d32 atime;
53945 + /* modification time */
53946 + /* 4 */ d32 mtime;
53947 + /* change time */
53948 + /* 8 */ d32 ctime;
53949 + /* 12 */
53950 +} PACKED reiser4_large_times_stat;
53951 +
53952 +/* this structure is filled by sd_item_stat */
53953 +typedef struct sd_stat {
53954 + int dirs;
53955 + int files;
53956 + int others;
53957 +} sd_stat;
53958 +
53959 +/* plugin->item.common.* */
53960 +extern void print_sd(const char *prefix, coord_t * coord);
53961 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
53962 +
53963 +/* plugin->item.s.sd.* */
53964 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
53965 +extern int save_len_static_sd(struct inode *inode);
53966 +extern int save_static_sd(struct inode *inode, char **area);
53967 +
53968 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
53969 +#endif
53970 +
53971 +/* Make Linus happy.
53972 + Local variables:
53973 + c-indentation-style: "K&R"
53974 + mode-name: "LC"
53975 + c-basic-offset: 8
53976 + tab-width: 8
53977 + fill-column: 120
53978 + End:
53979 +*/
53980 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/tail.c linux-2.6.22/fs/reiser4/plugin/item/tail.c
53981 --- linux-2.6.22.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
53982 +++ linux-2.6.22/fs/reiser4/plugin/item/tail.c 2007-07-29 00:25:34.980723395 +0400
53983 @@ -0,0 +1,809 @@
53984 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53985 +
53986 +#include "item.h"
53987 +#include "../../inode.h"
53988 +#include "../../page_cache.h"
53989 +#include "../../carry.h"
53990 +#include "../../vfs_ops.h"
53991 +
53992 +#include <linux/quotaops.h>
53993 +#include <asm/uaccess.h>
53994 +#include <linux/swap.h>
53995 +#include <linux/writeback.h>
53996 +
53997 +/* plugin->u.item.b.max_key_inside */
53998 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
53999 +{
54000 + item_key_by_coord(coord, key);
54001 + set_key_offset(key, get_key_offset(reiser4_max_key()));
54002 + return key;
54003 +}
54004 +
54005 +/* plugin->u.item.b.can_contain_key */
54006 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54007 + const reiser4_item_data *data)
54008 +{
54009 + reiser4_key item_key;
54010 +
54011 + if (item_plugin_by_coord(coord) != data->iplug)
54012 + return 0;
54013 +
54014 + item_key_by_coord(coord, &item_key);
54015 + if (get_key_locality(key) != get_key_locality(&item_key) ||
54016 + get_key_objectid(key) != get_key_objectid(&item_key))
54017 + return 0;
54018 +
54019 + return 1;
54020 +}
54021 +
54022 +/* plugin->u.item.b.mergeable
54023 + first item is of tail type */
54024 +/* Audited by: green(2002.06.14) */
54025 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
54026 +{
54027 + reiser4_key key1, key2;
54028 +
54029 + assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
54030 + UNIX_FILE_METADATA_ITEM_TYPE));
54031 + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54032 +
54033 + if (item_id_by_coord(p2) != FORMATTING_ID) {
54034 + /* second item is of another type */
54035 + return 0;
54036 + }
54037 +
54038 + item_key_by_coord(p1, &key1);
54039 + item_key_by_coord(p2, &key2);
54040 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
54041 + get_key_objectid(&key1) != get_key_objectid(&key2)
54042 + || get_key_type(&key1) != get_key_type(&key2)) {
54043 + /* items of different objects */
54044 + return 0;
54045 + }
54046 + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54047 + /* not adjacent items */
54048 + return 0;
54049 + }
54050 + return 1;
54051 +}
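/* Illustrative sketch, not part of the patch: the adjacency test used by
 * mergeable_tail() above. In a tail item one unit is one byte, so an item
 * keyed at offset 0 holding 100 units ends exactly where an item keyed at
 * offset 100 begins, and the two can be merged. The helper is hypothetical. */
static inline int example_tails_adjacent(__u64 off1, pos_in_node_t len1,
					 __u64 off2)
{
	/* same condition mergeable_tail() checks via the item keys */
	return off1 + len1 == off2;
}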
54052 +
54053 +/* plugin->u.item.b.print
54054 + plugin->u.item.b.check */
54055 +
54056 +/* plugin->u.item.b.nr_units */
54057 +pos_in_node_t nr_units_tail(const coord_t * coord)
54058 +{
54059 + return item_length_by_coord(coord);
54060 +}
54061 +
54062 +/* plugin->u.item.b.lookup */
54063 +lookup_result
54064 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54065 +{
54066 + reiser4_key item_key;
54067 + __u64 lookuped, offset;
54068 + unsigned nr_units;
54069 +
54070 + item_key_by_coord(coord, &item_key);
54071 + offset = get_key_offset(&item_key);
54072 + nr_units = nr_units_tail(coord);
54073 +
54074 + /* key we are looking for must be greater than key of item @coord */
54075 + assert("vs-416", keygt(key, &item_key));
54076 +
54077 + /* offset we are looking for */
54078 + lookuped = get_key_offset(key);
54079 +
54080 + if (lookuped >= offset && lookuped < offset + nr_units) {
54081 + /* byte we are looking for is in this item */
54082 + coord->unit_pos = lookuped - offset;
54083 + coord->between = AT_UNIT;
54084 + return CBK_COORD_FOUND;
54085 + }
54086 +
54087 + /* set coord after last unit */
54088 + coord->unit_pos = nr_units - 1;
54089 + coord->between = AFTER_UNIT;
54090 + return bias ==
54091 + FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54092 +}
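/* Illustrative usage, not part of the patch: with the FIND_MAX_NOT_MORE_THAN
 * bias, lookup_tail() above reports success even when the wanted byte lies
 * past the item (leaving @coord after the last unit), whereas an exact bias
 * would report CBK_COORD_NOTFOUND. The helper name is hypothetical. */
static int example_lookup_tail(const reiser4_key *key, coord_t *coord)
{
	/* non-zero if the byte is inside the item, or, because of the bias,
	   if it lies beyond the item's last byte */
	return lookup_tail(key, FIND_MAX_NOT_MORE_THAN, coord) ==
	    CBK_COORD_FOUND;
}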
54093 +
54094 +/* plugin->u.item.b.paste */
54095 +int
54096 +paste_tail(coord_t *coord, reiser4_item_data *data,
54097 + carry_plugin_info *info UNUSED_ARG)
54098 +{
54099 + unsigned old_item_length;
54100 + char *item;
54101 +
54102 + /* length the item had before the resize was performed */
54103 + old_item_length = item_length_by_coord(coord) - data->length;
54104 +
54105 + /* tail items never get pasted in the middle */
54106 + assert("vs-363",
54107 + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54108 + (coord->unit_pos == old_item_length - 1 &&
54109 + coord->between == AFTER_UNIT) ||
54110 + (coord->unit_pos == 0 && old_item_length == 0
54111 + && coord->between == AT_UNIT));
54112 +
54113 + item = item_body_by_coord(coord);
54114 + if (coord->unit_pos == 0)
54115 + /* make space for pasted data when pasting at the beginning of
54116 + the item */
54117 + memmove(item + data->length, item, old_item_length);
54118 +
54119 + if (coord->between == AFTER_UNIT)
54120 + coord->unit_pos++;
54121 +
54122 + if (data->data) {
54123 + assert("vs-554", data->user == 0 || data->user == 1);
54124 + if (data->user) {
54125 + assert("nikita-3035", reiser4_schedulable());
54126 + /* copy from user space */
54127 + if (__copy_from_user(item + coord->unit_pos,
54128 + (const char __user *)data->data,
54129 + (unsigned)data->length))
54130 + return RETERR(-EFAULT);
54131 + } else
54132 + /* copy from kernel space */
54133 + memcpy(item + coord->unit_pos, data->data,
54134 + (unsigned)data->length);
54135 + } else {
54136 + memset(item + coord->unit_pos, 0, (unsigned)data->length);
54137 + }
54138 + return 0;
54139 +}
54140 +
54141 +/* plugin->u.item.b.fast_paste */
54142 +
54143 +/* plugin->u.item.b.can_shift
54144 + number of units is returned via return value, number of bytes via @size. For
54145 + tail items they coincide */
54146 +int
54147 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54148 + znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54149 + unsigned *size, unsigned want)
54150 +{
54151 + /* make sure that we do not want to shift more than we have */
54152 + assert("vs-364", want > 0
54153 + && want <= (unsigned)item_length_by_coord(source));
54154 +
54155 + *size = min(want, free_space);
54156 + return *size;
54157 +}
54158 +
54159 +/* plugin->u.item.b.copy_units */
54160 +void
54161 +copy_units_tail(coord_t * target, coord_t * source,
54162 + unsigned from, unsigned count,
54163 + shift_direction where_is_free_space,
54164 + unsigned free_space UNUSED_ARG)
54165 +{
54166 + /* make sure that item @target is expanded already */
54167 + assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54168 + assert("vs-370", free_space >= count);
54169 +
54170 + if (where_is_free_space == SHIFT_LEFT) {
54171 + /* append item @target with @count first bytes of @source */
54172 + assert("vs-365", from == 0);
54173 +
54174 + memcpy((char *)item_body_by_coord(target) +
54175 + item_length_by_coord(target) - count,
54176 + (char *)item_body_by_coord(source), count);
54177 + } else {
54178 + /* target item is moved to right already */
54179 + reiser4_key key;
54180 +
54181 + assert("vs-367",
54182 + (unsigned)item_length_by_coord(source) == from + count);
54183 +
54184 + memcpy((char *)item_body_by_coord(target),
54185 + (char *)item_body_by_coord(source) + from, count);
54186 +
54187 + /* new units are inserted before first unit in an item,
54188 + therefore, we have to update item key */
54189 + item_key_by_coord(source, &key);
54190 + set_key_offset(&key, get_key_offset(&key) + from);
54191 +
54192 + node_plugin_by_node(target->node)->update_item_key(target, &key,
54193 + NULL /*info */);
54194 + }
54195 +}
54196 +
54197 +/* plugin->u.item.b.create_hook */
54198 +
54199 +/* item_plugin->b.kill_hook
54200 + this is called when @count units starting from @from-th one are going to be removed
54201 + */
54202 +int
54203 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54204 + pos_in_node_t count, struct carry_kill_data *kdata)
54205 +{
54206 + reiser4_key key;
54207 + loff_t start, end;
54208 +
54209 + assert("vs-1577", kdata);
54210 + assert("vs-1579", kdata->inode);
54211 +
54212 + item_key_by_coord(coord, &key);
54213 + start = get_key_offset(&key) + from;
54214 + end = start + count;
54215 + fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54216 + return 0;
54217 +}
54218 +
54219 +/* plugin->u.item.b.shift_hook */
54220 +
54221 +/* helper for kill_units_tail and cut_units_tail */
54222 +static int
54223 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54224 + reiser4_key * smallest_removed, reiser4_key * new_first)
54225 +{
54226 + pos_in_node_t count;
54227 +
54228 + /* this method is only called to remove part of item */
54229 + assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54230 + /* tail items are never cut from the middle of an item */
54231 + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54232 + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54233 +
54234 + count = to - from + 1;
54235 +
54236 + if (smallest_removed) {
54237 + /* store smallest key removed */
54238 + item_key_by_coord(coord, smallest_removed);
54239 + set_key_offset(smallest_removed,
54240 + get_key_offset(smallest_removed) + from);
54241 + }
54242 + if (new_first) {
54243 + /* head of item is cut */
54244 + assert("vs-1529", from == 0);
54245 +
54246 + item_key_by_coord(coord, new_first);
54247 + set_key_offset(new_first,
54248 + get_key_offset(new_first) + from + count);
54249 + }
54250 +
54251 + if (REISER4_DEBUG)
54252 + memset((char *)item_body_by_coord(coord) + from, 0, count);
54253 + return count;
54254 +}
54255 +
54256 +/* plugin->u.item.b.cut_units */
54257 +int
54258 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54259 + struct carry_cut_data *cdata UNUSED_ARG,
54260 + reiser4_key * smallest_removed, reiser4_key * new_first)
54261 +{
54262 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54263 +}
54264 +
54265 +/* plugin->u.item.b.kill_units */
54266 +int
54267 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54268 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54269 + reiser4_key * new_first)
54270 +{
54271 + kill_hook_tail(coord, from, to - from + 1, kdata);
54272 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54273 +}
54274 +
54275 +/* plugin->u.item.b.unit_key */
54276 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54277 +{
54278 + assert("vs-375", coord_is_existing_unit(coord));
54279 +
54280 + item_key_by_coord(coord, key);
54281 + set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54282 +
54283 + return key;
54284 +}
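/* Illustrative usage, not part of the patch: for a tail item keyed at file
 * offset 1000, unit_key_tail() above yields offset 1005 for unit 5, while
 * append_key_tail() (defined further below in this file) yields offset 1000
 * plus the item length. The helper is hypothetical. */
static void example_tail_keys(const coord_t *coord, reiser4_key *ukey,
			      reiser4_key *akey)
{
	unit_key_tail(coord, ukey);   /* key of the byte at coord->unit_pos */
	append_key_tail(coord, akey); /* key of the first byte past the item */
}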
54285 +
54286 +/* plugin->u.item.b.estimate
54287 + plugin->u.item.b.item_data_by_flow */
54288 +
54289 +/* tail readpage function. It is called from readpage_tail(). */
54290 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54291 +{
54292 + tap_t tap;
54293 + int result;
54294 + coord_t coord;
54295 + lock_handle lh;
54296 + int count, mapped;
54297 + struct inode *inode;
54298 + char *pagedata;
54299 +
54300 + /* save the passed coord so that the tap does not move it. */
54301 + init_lh(&lh);
54302 + copy_lh(&lh, uf_coord->lh);
54303 + inode = page->mapping->host;
54304 + coord_dup(&coord, &uf_coord->coord);
54305 +
54306 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54307 +
54308 + if ((result = reiser4_tap_load(&tap)))
54309 + goto out_tap_done;
54310 +
54311 + /* lookup until page is filled up. */
54312 + for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54313 + /* number of bytes to be copied to page */
54314 + count = item_length_by_coord(&coord) - coord.unit_pos;
54315 + if (count > PAGE_CACHE_SIZE - mapped)
54316 + count = PAGE_CACHE_SIZE - mapped;
54317 +
54318 + /* map @page into kernel address space and get data address */
54319 + pagedata = kmap_atomic(page, KM_USER0);
54320 +
54321 + /* copy tail item to page */
54322 + memcpy(pagedata + mapped,
54323 + ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54324 + count);
54325 + mapped += count;
54326 +
54327 + flush_dcache_page(page);
54328 +
54329 + /* unmap page from kernel address space */
54330 + kunmap_atomic(pagedata, KM_USER0);
54331 +
54332 + /* Getting next tail item. */
54333 + if (mapped < PAGE_CACHE_SIZE) {
54334 + /*
54335 + * unlock the page to avoid keeping it locked
54336 + * during tree lookup, which takes long-term locks
54337 + */
54338 + unlock_page(page);
54339 +
54340 + /* getting right neighbour. */
54341 + result = go_dir_el(&tap, RIGHT_SIDE, 0);
54342 +
54343 + /* lock page back */
54344 + lock_page(page);
54345 + if (PageUptodate(page)) {
54346 + /*
54347 + * another thread read the page, we have
54348 + * nothing to do
54349 + */
54350 + result = 0;
54351 + goto out_unlock_page;
54352 + }
54353 +
54354 + if (result) {
54355 + if (result == -E_NO_NEIGHBOR) {
54356 + /*
54357 + * right neighbor is not a formatted
54358 + * node
54359 + */
54360 + result = 0;
54361 + goto done;
54362 + } else {
54363 + goto out_tap_relse;
54364 + }
54365 + } else {
54366 + if (!inode_file_plugin(inode)->
54367 + owns_item(inode, &coord)) {
54368 + /* item of another file is found */
54369 + result = 0;
54370 + goto done;
54371 + }
54372 + }
54373 + }
54374 + }
54375 +
54376 + done:
54377 + if (mapped != PAGE_CACHE_SIZE)
54378 + zero_user_page(page, mapped, PAGE_CACHE_SIZE - mapped,
54379 + KM_USER0);
54380 + SetPageUptodate(page);
54381 + out_unlock_page:
54382 + unlock_page(page);
54383 + out_tap_relse:
54384 + reiser4_tap_relse(&tap);
54385 + out_tap_done:
54386 + reiser4_tap_done(&tap);
54387 + return result;
54388 +}
54389 +
54390 +/*
54391 + plugin->s.file.readpage
54392 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54393 + or
54394 + filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
54395 +
54396 + At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to an existing unit inside a
54397 + tail item. */
54398 +int readpage_tail(void *vp, struct page *page)
54399 +{
54400 + uf_coord_t *uf_coord = vp;
54401 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
54402 + ON_DEBUG(reiser4_key key);
54403 +
54404 + assert("umka-2515", PageLocked(page));
54405 + assert("umka-2516", !PageUptodate(page));
54406 + assert("umka-2517", !jprivate(page) && !PagePrivate(page));
54407 + assert("umka-2518", page->mapping && page->mapping->host);
54408 +
54409 + assert("umka-2519", znode_is_loaded(coord->node));
54410 + assert("umka-2520", item_is_tail(coord));
54411 + assert("umka-2521", coord_is_existing_unit(coord));
54412 + assert("umka-2522", znode_is_rlocked(coord->node));
54413 + assert("umka-2523",
54414 + page->mapping->host->i_ino ==
54415 + get_key_objectid(item_key_by_coord(coord, &key)));
54416 +
54417 + return do_readpage_tail(uf_coord, page);
54418 +}
54419 +
54420 +/**
54421 + * overwrite_tail
54422 + * @flow: flow with user data and the key to write at
54423 + * @coord: coord of the tail item unit to start overwriting at
54424 + *
54425 + * Overwrites a tail item or its part with user data. Returns number of bytes
54426 + * written or error code.
54427 + */
54428 +static int overwrite_tail(flow_t *flow, coord_t *coord)
54429 +{
54430 + unsigned count;
54431 +
54432 + assert("vs-570", flow->user == 1);
54433 + assert("vs-946", flow->data);
54434 + assert("vs-947", coord_is_existing_unit(coord));
54435 + assert("vs-948", znode_is_write_locked(coord->node));
54436 + assert("nikita-3036", reiser4_schedulable());
54437 +
54438 + count = item_length_by_coord(coord) - coord->unit_pos;
54439 + if (count > flow->length)
54440 + count = flow->length;
54441 +
54442 + if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
54443 + (const char __user *)flow->data, count))
54444 + return RETERR(-EFAULT);
54445 +
54446 + znode_make_dirty(coord->node);
54447 + return count;
54448 +}
54449 +
54450 +/**
54451 + * insert_first_tail
54452 + * @inode: inode of the file being written
54453 + * @flow: flow with user data and target key
54454 + * @coord: coord where the first item is to be inserted
54455 + * @lh: lock handle of the insertion point
54456 + *
54457 + * Returns number of bytes written or error code.
54458 + */
54459 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
54460 + coord_t *coord, lock_handle *lh)
54461 +{
54462 + int result;
54463 + loff_t to_write;
54464 + struct unix_file_info *uf_info;
54465 +
54466 + if (get_key_offset(&flow->key) != 0) {
54467 + /*
54468 + * the file is empty and the write does not start at its
54469 + * beginning, so create a hole at the start of the file. On
54470 + * success insert_flow returns 0 written bytes, which is what
54471 + * we have to return when padding a file with holes
54472 + */
54473 + flow->data = NULL;
54474 + flow->length = get_key_offset(&flow->key);
54475 + set_key_offset(&flow->key, 0);
54476 + /*
54477 + * holes in files built of tails are stored as if they were
54478 + * real data consisting entirely of zeros, therefore we have
54479 + * to allocate quota here as well
54480 + */
54481 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54482 + return RETERR(-EDQUOT);
54483 + result = reiser4_insert_flow(coord, lh, flow);
54484 + if (flow->length)
54485 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54486 +
54487 + uf_info = unix_file_inode_data(inode);
54488 +
54489 + /*
54490 + * first item insertion is only possible when writing to empty
54491 + * file or performing tail conversion
54492 + */
54493 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
54494 + (reiser4_inode_get_flag(inode,
54495 + REISER4_PART_MIXED) &&
54496 + reiser4_inode_get_flag(inode,
54497 + REISER4_PART_IN_CONV))));
54498 + /* if file was empty - update its state */
54499 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
54500 + uf_info->container = UF_CONTAINER_TAILS;
54501 + return result;
54502 + }
54503 +
54504 + /* check quota before appending data */
54505 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54506 + return RETERR(-EDQUOT);
54507 +
54508 + to_write = flow->length;
54509 + result = reiser4_insert_flow(coord, lh, flow);
54510 + if (flow->length)
54511 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54512 + return (to_write - flow->length) ? (to_write - flow->length) : result;
54513 +}
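/* Illustrative sketch, not part of the patch: how insert_first_tail() above
 * encodes a hole. A flow with a NULL data pointer makes the insertion path
 * store zeroes (see the memset branch in paste_tail()), so padding an empty
 * file up to @offset reduces to the following. Helper name is hypothetical. */
static void example_describe_hole(flow_t *flow, loff_t offset)
{
	flow->data = NULL;              /* no user data: zeroes are inserted */
	flow->length = offset;          /* hole covers bytes [0, offset) */
	set_key_offset(&flow->key, 0);  /* hole starts at the file beginning */
}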
54514 +
54515 +/**
54516 + * append_tail
54517 + * @inode: inode of the file being appended to
54518 + * @flow: flow with user data and target key
54519 + * @coord: coord set after the last unit of the file's last item
54520 + * @lh: lock handle of the insertion point
54521 + *
54522 + * Returns number of bytes written or error code.
54523 + */
54524 +static ssize_t append_tail(struct inode *inode,
54525 + flow_t *flow, coord_t *coord, lock_handle *lh)
54526 +{
54527 + int result;
54528 + reiser4_key append_key;
54529 + loff_t to_write;
54530 +
54531 + if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
54532 + flow->data = NULL;
54533 + flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
54534 + set_key_offset(&flow->key, get_key_offset(&append_key));
54535 + /*
54536 + * holes in files built of tails are stored as if they were
54537 + * real data consisting entirely of zeros, therefore we have
54538 + * to allocate quota here as well
54539 + */
54540 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54541 + return RETERR(-EDQUOT);
54542 + result = reiser4_insert_flow(coord, lh, flow);
54543 + if (flow->length)
54544 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54545 + return result;
54546 + }
54547 +
54548 + /* check quota before appending data */
54549 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54550 + return RETERR(-EDQUOT);
54551 +
54552 + to_write = flow->length;
54553 + result = reiser4_insert_flow(coord, lh, flow);
54554 + if (flow->length)
54555 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54556 + return (to_write - flow->length) ? (to_write - flow->length) : result;
54557 +}
54558 +
54559 +/**
54560 + * write_tail_reserve_space - reserve space for tail write operation
54561 + * @inode: inode of the file being written
54562 + *
54563 + * Estimates and reserves space which may be required for writing one flow to a
54564 + * file.
54565 + */
54566 +static int write_tail_reserve_space(struct inode *inode)
54567 +{
54568 + __u64 count;
54569 + reiser4_tree *tree;
54570 +
54571 + /*
54572 + * to write one flow to a file by tails we have to reserve disk space for:
54573 + *
54574 + * 1. find_file_item may have to insert an empty node into the tree (an
54575 + * empty leaf node between two extent items). This requires 1 block plus
54576 + * the number of blocks necessary to insert an internal item into the
54577 + * twig level.
54578 + *
54579 + * 2. flow insertion
54580 + *
54581 + * 3. stat data update
54582 + */
54583 + tree = reiser4_tree_by_inode(inode);
54584 + count = estimate_one_insert_item(tree) +
54585 + estimate_insert_flow(tree->height) +
54586 + estimate_one_insert_item(tree);
54587 + grab_space_enable();
54588 + return reiser4_grab_space(count, 0 /* flags */);
54589 +}
54590 +
54591 +#define PAGE_PER_FLOW 4
54592 +
54593 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
54594 +{
54595 + loff_t faulted;
54596 + int to_fault;
54597 +
54598 + if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
54599 + count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
54600 + faulted = 0;
54601 + while (count > 0) {
54602 + to_fault = PAGE_CACHE_SIZE;
54603 + if (count < to_fault)
54604 + to_fault = count;
54605 + fault_in_pages_readable(buf + faulted, to_fault);
54606 + count -= to_fault;
54607 + faulted += to_fault;
54608 + }
54609 + return faulted;
54610 +}
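/* Illustrative sketch, not part of the patch: the cap applied at the top of
 * faultin_user_pages() above. A single flow is limited to PAGE_PER_FLOW
 * pages, so with 4K pages a 100K write is consumed at most 16K per call of
 * the write method. The helper name is hypothetical. */
static inline size_t example_flow_cap(size_t count)
{
	if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
		count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
	return count;
}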
54611 +
54612 +/**
54613 + * reiser4_write_tail - write method of tail item plugin
54614 + * @file: file to write to
54615 + * @buf: address of user-space buffer
54616 + * @count: number of bytes to write
54617 + * @pos: position in file to write to
54618 + *
54619 + * Returns number of written bytes or error code.
54620 + */
54621 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
54622 + size_t count, loff_t *pos)
54623 +{
54624 + struct inode *inode;
54625 + struct hint hint;
54626 + int result;
54627 + flow_t flow;
54628 + coord_t *coord;
54629 + lock_handle *lh;
54630 + znode *loaded;
54631 +
54632 + inode = file->f_dentry->d_inode;
54633 +
54634 + if (write_tail_reserve_space(inode))
54635 + return RETERR(-ENOSPC);
54636 +
54637 + result = load_file_hint(file, &hint);
54638 + BUG_ON(result != 0);
54639 +
54640 + flow.length = faultin_user_pages(buf, count);
54641 + flow.user = 1;
54642 + memcpy(&flow.data, &buf, sizeof(buf));
54643 + flow.op = WRITE_OP;
54644 + key_by_inode_and_offset_common(inode, *pos, &flow.key);
54645 +
54646 + result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
54647 + if (IS_CBKERR(result))
54648 + return result;
54649 +
54650 + coord = &hint.ext_coord.coord;
54651 + lh = hint.ext_coord.lh;
54652 +
54653 + result = zload(coord->node);
54654 + BUG_ON(result != 0);
54655 + loaded = coord->node;
54656 +
54657 + if (coord->between == AFTER_UNIT) {
54658 + /* append with data or hole */
54659 + result = append_tail(inode, &flow, coord, lh);
54660 + } else if (coord->between == AT_UNIT) {
54661 + /* overwrite */
54662 + result = overwrite_tail(&flow, coord);
54663 + } else {
54664 + /* no items of this file yet. insert data or hole */
54665 + result = insert_first_tail(inode, &flow, coord, lh);
54666 + }
54667 + zrelse(loaded);
54668 + if (result < 0) {
54669 + done_lh(lh);
54670 + return result;
54671 + }
54672 +
54673 + /* seal and unlock znode; ext_coord.valid is forced to 0 here, so the reiser4_set_hint() branch below is never taken */
54674 + hint.ext_coord.valid = 0;
54675 + if (hint.ext_coord.valid)
54676 + reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
54677 + else
54678 + reiser4_unset_hint(&hint);
54679 +
54680 + save_file_hint(file, &hint);
54681 + return result;
54682 +}
54683 +
54684 +#if REISER4_DEBUG
54685 +
54686 +static int
54687 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
54688 +{
54689 + reiser4_key item_key;
54690 +
54691 + assert("vs-1356", coord_is_existing_unit(coord));
54692 + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
54693 + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
54694 + return get_key_offset(key) ==
54695 + get_key_offset(&item_key) + coord->unit_pos;
54696 +
54697 +}
54698 +
54699 +#endif
54700 +
54701 +/* plugin->u.item.s.file.read */
54702 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
54703 +{
54704 + unsigned count;
54705 + int item_length;
54706 + coord_t *coord;
54707 + uf_coord_t *uf_coord;
54708 +
54709 + uf_coord = &hint->ext_coord;
54710 + coord = &uf_coord->coord;
54711 +
54712 + assert("vs-571", f->user == 1);
54713 + assert("vs-571", f->data);
54714 + assert("vs-967", coord && coord->node);
54715 + assert("vs-1117", znode_is_rlocked(coord->node));
54716 + assert("vs-1118", znode_is_loaded(coord->node));
54717 +
54718 + assert("nikita-3037", reiser4_schedulable());
54719 + assert("vs-1357", coord_matches_key_tail(coord, &f->key));
54720 +
54721 + /* calculate number of bytes to read off the item */
54722 + item_length = item_length_by_coord(coord);
54723 + count = item_length - coord->unit_pos;
54724 + if (count > f->length)
54725 + count = f->length;
54726 +
54727 + /* the user page has to be faulted in beforehand so that a major page
54728 + * fault does not occur here while a long-term lock is held */
54729 + if (__copy_to_user((char __user *)f->data,
54730 + ((char *)item_body_by_coord(coord) + coord->unit_pos),
54731 + count))
54732 + return RETERR(-EFAULT);
54733 +
54734 + /* probably mark_page_accessed() should only be called if
54735 + * coord->unit_pos is zero. */
54736 + mark_page_accessed(znode_page(coord->node));
54737 + move_flow_forward(f, count);
54738 +
54739 + coord->unit_pos += count;
54740 + if (item_length == coord->unit_pos) {
54741 + coord->unit_pos--;
54742 + coord->between = AFTER_UNIT;
54743 + }
54744 + reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
54745 + return 0;
54746 +}
54747 +
54748 +/*
54749 + plugin->u.item.s.file.append_key
54750 + key of the first byte following the last byte addressed by this item
54751 +*/
54752 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
54753 +{
54754 + item_key_by_coord(coord, key);
54755 + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
54756 + return key;
54757 +}
54758 +
54759 +/* plugin->u.item.s.file.init_coord_extension */
54760 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
54761 +{
54762 + uf_coord->valid = 1;
54763 +}
54764 +
54765 +/*
54766 + plugin->u.item.s.file.get_block
54767 +*/
54768 +int
54769 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
54770 +{
54771 + assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
54772 +
54773 + if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
54774 + /* if the node hasn't obtained its block number yet, return 0.
54775 + * Let's avoid upsetting users with cosmic numbers beyond
54776 + * the device capacity. */
54777 + *block = 0;
54778 + else
54779 + *block = *znode_get_block(coord->node);
54780 + return 0;
54781 +}
54782 +
54783 +/*
54784 + * Local variables:
54785 + * c-indentation-style: "K&R"
54786 + * mode-name: "LC"
54787 + * c-basic-offset: 8
54788 + * tab-width: 8
54789 + * fill-column: 79
54790 + * scroll-step: 1
54791 + * End:
54792 + */
54793 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/tail.h linux-2.6.22/fs/reiser4/plugin/item/tail.h
54794 --- linux-2.6.22.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
54795 +++ linux-2.6.22/fs/reiser4/plugin/item/tail.h 2007-07-29 00:25:34.980723395 +0400
54796 @@ -0,0 +1,58 @@
54797 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54798 +
54799 +#if !defined( __REISER4_TAIL_H__ )
54800 +#define __REISER4_TAIL_H__
54801 +
54802 +struct tail_coord_extension {
54803 + int not_used;
54804 +};
54805 +
54806 +struct cut_list;
54807 +
54808 +/* plugin->u.item.b.* */
54809 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
54810 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
54811 + const reiser4_item_data *);
54812 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
54813 +pos_in_node_t nr_units_tail(const coord_t *);
54814 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
54815 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
54816 +int can_shift_tail(unsigned free_space, coord_t * source,
54817 + znode * target, shift_direction, unsigned *size,
54818 + unsigned want);
54819 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
54820 + unsigned count, shift_direction, unsigned free_space);
54821 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
54822 + struct carry_kill_data *);
54823 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
54824 + struct carry_cut_data *, reiser4_key * smallest_removed,
54825 + reiser4_key * new_first);
54826 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
54827 + struct carry_kill_data *, reiser4_key * smallest_removed,
54828 + reiser4_key * new_first);
54829 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
54830 +
54831 +/* plugin->u.item.s.* */
54832 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
54833 + size_t count, loff_t *pos);
54834 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
54835 +int readpage_tail(void *vp, struct page *page);
54836 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
54837 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
54838 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
54839 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
54840 + hint_t *, int back_to_dirty, int set_hint);
54841 +
54842 +/* __REISER4_TAIL_H__ */
54843 +#endif
54844 +
54845 +/* Make Linus happy.
54846 + Local variables:
54847 + c-indentation-style: "K&R"
54848 + mode-name: "LC"
54849 + c-basic-offset: 8
54850 + tab-width: 8
54851 + fill-column: 120
54852 + scroll-step: 1
54853 + End:
54854 +*/
54855 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/Makefile linux-2.6.22/fs/reiser4/plugin/Makefile
54856 --- linux-2.6.22.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
54857 +++ linux-2.6.22/fs/reiser4/plugin/Makefile 2007-07-29 00:25:34.980723395 +0400
54858 @@ -0,0 +1,26 @@
54859 +obj-$(CONFIG_REISER4_FS) += plugins.o
54860 +
54861 +plugins-objs := \
54862 + plugin.o \
54863 + plugin_set.o \
54864 + object.o \
54865 + inode_ops.o \
54866 + inode_ops_rename.o \
54867 + file_ops.o \
54868 + file_ops_readdir.o \
54869 + file_plugin_common.o \
54870 + dir_plugin_common.o \
54871 + digest.o \
54872 + hash.o \
54873 + fibration.o \
54874 + tail_policy.o \
54875 + regular.o
54876 +
54877 +obj-$(CONFIG_REISER4_FS) += item/
54878 +obj-$(CONFIG_REISER4_FS) += file/
54879 +obj-$(CONFIG_REISER4_FS) += dir/
54880 +obj-$(CONFIG_REISER4_FS) += node/
54881 +obj-$(CONFIG_REISER4_FS) += compress/
54882 +obj-$(CONFIG_REISER4_FS) += space/
54883 +obj-$(CONFIG_REISER4_FS) += disk_format/
54884 +obj-$(CONFIG_REISER4_FS) += security/
54885 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/Makefile linux-2.6.22/fs/reiser4/plugin/node/Makefile
54886 --- linux-2.6.22.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
54887 +++ linux-2.6.22/fs/reiser4/plugin/node/Makefile 2007-07-29 00:25:34.980723395 +0400
54888 @@ -0,0 +1,5 @@
54889 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
54890 +
54891 +node_plugins-objs := \
54892 + node.o \
54893 + node40.o
54894 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node40.c linux-2.6.22/fs/reiser4/plugin/node/node40.c
54895 --- linux-2.6.22.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
54896 +++ linux-2.6.22/fs/reiser4/plugin/node/node40.c 2007-07-29 00:25:34.988725466 +0400
54897 @@ -0,0 +1,2924 @@
54898 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54899 +
54900 +#include "../../debug.h"
54901 +#include "../../key.h"
54902 +#include "../../coord.h"
54903 +#include "../plugin_header.h"
54904 +#include "../item/item.h"
54905 +#include "node.h"
54906 +#include "node40.h"
54907 +#include "../plugin.h"
54908 +#include "../../jnode.h"
54909 +#include "../../znode.h"
54910 +#include "../../pool.h"
54911 +#include "../../carry.h"
54912 +#include "../../tap.h"
54913 +#include "../../tree.h"
54914 +#include "../../super.h"
54915 +#include "../../reiser4.h"
54916 +
54917 +#include <asm/uaccess.h>
54918 +#include <linux/types.h>
54919 +#include <linux/prefetch.h>
54920 +
54921 +/* leaf 40 format (left column: node header fields; right column: item
54922 + header fields):
54923 + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item_head 0 ]
54924 +  plugin_id (16)          key
54925 +  free_space (16)         pluginid (16)
54926 +  free_space_start (16)   offset (16)
54927 +  level (8)
54928 +  num_items (16)
54929 +  magic (32)
54930 +  flush_time (32)
54931 +*/
54932 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
54933 +/* magic number that is stored in ->magic field of node header */
54934 +static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
54935 +
54936 +static int prepare_for_update(znode * left, znode * right,
54937 + carry_plugin_info * info);
54938 +
54939 +/* header of node of reiser40 format is at the beginning of node */
54940 +static inline node40_header *node40_node_header(const znode * node /* node to
54941 + * query */ )
54942 +{
54943 + assert("nikita-567", node != NULL);
54944 + assert("nikita-568", znode_page(node) != NULL);
54945 + assert("nikita-569", zdata(node) != NULL);
54946 + return (node40_header *) zdata(node);
54947 +}
54948 +
54949 +/* functions to get/set fields of node40_header */
54950 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
54951 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
54952 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
54953 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
54954 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
54955 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
54956 +
54957 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
54958 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
54959 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
54960 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
54961 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
54962 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
54963 +
54964 +/* plugin field of node header should be read/set by
54965 + plugin_by_disk_id/save_disk_plugin */
54966 +
54967 +/* array of item headers is at the end of node */
54968 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
54969 +{
54970 + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
54971 +}
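/* Illustrative sketch, not part of the patch: item bodies grow rightward
 * from the node header while item headers grow leftward from the node end,
 * so the layout invariant below holds at all times (it is trivially true
 * right after init_node40()). The helper name is hypothetical. */
static inline int example_node40_layout_ok(const znode *node)
{
	node40_header *nh = node40_node_header(node);

	return nh40_get_free_space_start(nh) + nh40_get_free_space(nh) +
	    nh40_get_num_items(nh) * sizeof(item_header40) ==
	    znode_size(node);
}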
54972 +
54973 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
54974 + */
54975 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
54976 +{
54977 + return (item_header40 *) (zdata(coord->node) +
54978 + znode_size(coord->node)) - (coord->item_pos) -
54979 + 1;
54980 +}
54981 +
54982 +/* functions to get/set fields of item_header40 */
54983 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
54984 +
54985 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
54986 +
54987 +/* plugin field of item header should be read/set by
54988 + plugin_by_disk_id/save_disk_plugin */
54989 +
54990 +/* plugin methods */
54991 +
54992 +/* plugin->u.node.item_overhead
54993 + look for description of this method in plugin/node/node.h */
54994 +size_t
54995 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
54996 +{
54997 + return sizeof(item_header40);
54998 +}
54999 +
55000 +/* plugin->u.node.free_space
55001 + look for description of this method in plugin/node/node.h */
55002 +size_t free_space_node40(znode * node)
55003 +{
55004 + assert("nikita-577", node != NULL);
55005 + assert("nikita-578", znode_is_loaded(node));
55006 + assert("nikita-579", zdata(node) != NULL);
55007 +
55008 + return nh40_get_free_space(node40_node_header(node));
55009 +}
55010 +
55011 +/* private inline version of node40_num_of_items() for use in this file. This
55012 + is necessary, because address of node40_num_of_items() is taken and it is
55013 + never inlined as a result. */
55014 +static inline short node40_num_of_items_internal(const znode * node)
55015 +{
55016 + return nh40_get_num_items(node40_node_header(node));
55017 +}
55018 +
55019 +#if REISER4_DEBUG
55020 +static inline void check_num_items(const znode * node)
55021 +{
55022 + assert("nikita-2749",
55023 + node40_num_of_items_internal(node) == node->nr_items);
55024 + assert("nikita-2746", znode_is_write_locked(node));
55025 +}
55026 +#else
55027 +#define check_num_items(node) noop
55028 +#endif
55029 +
55030 +/* plugin->u.node.num_of_items
55031 + look for description of this method in plugin/node/node.h */
55032 +int num_of_items_node40(const znode * node)
55033 +{
55034 + return node40_num_of_items_internal(node);
55035 +}
55036 +
55037 +static void
55038 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
55039 +{
55040 + assert("nikita-2751", node != NULL);
55041 + assert("nikita-2750", nh == node40_node_header(node));
55042 +
55043 + check_num_items(node);
55044 + nh40_set_num_items(nh, value);
55045 + node->nr_items = value;
55046 + check_num_items(node);
55047 +}
55048 +
55049 +/* plugin->u.node.item_by_coord
55050 + look for description of this method in plugin/node/node.h */
55051 +char *item_by_coord_node40(const coord_t * coord)
55052 +{
55053 + item_header40 *ih;
55054 + char *p;
55055 +
55056 + /* @coord is set to existing item */
55057 + assert("nikita-596", coord != NULL);
55058 + assert("vs-255", coord_is_existing_item(coord));
55059 +
55060 + ih = node40_ih_at_coord(coord);
55061 + p = zdata(coord->node) + ih40_get_offset(ih);
55062 + return p;
55063 +}
55064 +
55065 +/* plugin->u.node.length_by_coord
55066 + look for description of this method in plugin/node/node.h */
55067 +int length_by_coord_node40(const coord_t * coord)
55068 +{
55069 + item_header40 *ih;
55070 + int result;
55071 +
55072 + /* @coord is set to existing item */
55073 + assert("vs-256", coord != NULL);
55074 + assert("vs-257", coord_is_existing_item(coord));
55075 +
55076 + ih = node40_ih_at_coord(coord);
55077 + if ((int)coord->item_pos ==
55078 + node40_num_of_items_internal(coord->node) - 1)
55079 + result =
55080 + nh40_get_free_space_start(node40_node_header(coord->node)) -
55081 + ih40_get_offset(ih);
55082 + else
55083 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55084 +
55085 + return result;
55086 +}
55087 +
55088 +static pos_in_node_t
55089 +node40_item_length(const znode * node, pos_in_node_t item_pos)
55090 +{
55091 + item_header40 *ih;
55092 + pos_in_node_t result;
55093 +
55094 + /* @coord is set to existing item */
55095 + assert("vs-256", node != NULL);
55096 + assert("vs-257", node40_num_of_items_internal(node) > item_pos);
55097 +
55098 + ih = node40_ih_at(node, item_pos);
55099 + if (item_pos == node40_num_of_items_internal(node) - 1)
55100 + result =
55101 + nh40_get_free_space_start(node40_node_header(node)) -
55102 + ih40_get_offset(ih);
55103 + else
55104 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55105 +
55106 + return result;
55107 +}
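/* Illustrative sketch, not part of the patch: as node40_item_length() above
 * shows, item lengths are implicit in the header offsets. Two headers with
 * body offsets 24 and 80 give the first item 56 bytes; the last item ends at
 * free_space_start. The helper is hypothetical. */
static inline pos_in_node_t example_length_from_offsets(unsigned this_offset,
							unsigned next_offset)
{
	return next_offset - this_offset; /* e.g. 80 - 24 == 56 bytes */
}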
55108 +
55109 +/* plugin->u.node.plugin_by_coord
55110 + look for description of this method in plugin/node/node.h */
55111 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
55112 +{
55113 + item_header40 *ih;
55114 + item_plugin *result;
55115 +
55116 + /* @coord is set to existing item */
55117 + assert("vs-258", coord != NULL);
55118 + assert("vs-259", coord_is_existing_item(coord));
55119 +
55120 + ih = node40_ih_at_coord(coord);
55121 + /* pass NULL instead of the current tree. This is a time-critical call. */
55122 + result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
55123 + return result;
55124 +}
55125 +
55126 +/* plugin->u.node.key_at
55127 + look for description of this method in plugin/node/node.h */
55128 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
55129 +{
55130 + item_header40 *ih;
55131 +
55132 + assert("nikita-1765", coord_is_existing_item(coord));
55133 +
55134 + /* @coord is set to existing item */
55135 + ih = node40_ih_at_coord(coord);
55136 + memcpy(key, &ih->key, sizeof(reiser4_key));
55137 + return key;
55138 +}
55139 +
55140 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
55141 +
55142 +#define NODE_INCSTAT(n, counter) \
55143 + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
55144 +
55145 +#define NODE_ADDSTAT(n, counter, val) \
55146 + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
55147 +
55148 +/* plugin->u.node.lookup
55149 + look for description of this method in plugin/node/node.h */
55150 +node_search_result lookup_node40(znode * node /* node to query */ ,
55151 + const reiser4_key * key /* key to look for */ ,
55152 + lookup_bias bias /* search bias */ ,
55153 + coord_t * coord /* resulting coord */ )
55154 +{
55155 + int left;
55156 + int right;
55157 + int found;
55158 + int items;
55159 +
55160 + item_header40 *lefth;
55161 + item_header40 *righth;
55162 +
55163 + item_plugin *iplug;
55164 + item_header40 *bstop;
55165 + item_header40 *ih;
55166 + cmp_t order;
55167 +
55168 + assert("nikita-583", node != NULL);
55169 + assert("nikita-584", key != NULL);
55170 + assert("nikita-585", coord != NULL);
55171 + assert("nikita-2693", znode_is_any_locked(node));
55172 + cassert(REISER4_SEQ_SEARCH_BREAK > 2);
55173 +
55174 + items = node_num_items(node);
55175 +
55176 + if (unlikely(items == 0)) {
55177 + coord_init_first_unit(coord, node);
55178 + return NS_NOT_FOUND;
55179 + }
55180 +
55181 + /* binary search for item that can contain given key */
55182 + left = 0;
55183 + right = items - 1;
55184 + coord->node = node;
55185 + coord_clear_iplug(coord);
55186 + found = 0;
55187 +
55188 + lefth = node40_ih_at(node, left);
55189 + righth = node40_ih_at(node, right);
55190 +
55191 + /* It is known that for small arrays sequential search is on average
55192 + more efficient than binary. This is because sequential search is
55193 + coded as a tight loop that can be better optimized by compilers,
55194 + and for small array sizes the gain from this optimization makes
55195 + sequential search the winner. Another, maybe more important, reason
55196 + is that sequential search is more CPU-cache friendly, whereas
55197 + binary search effectively destroys CPU caching.
55198 +
55199 + Critical here is the notion of "smallness". Reasonable value of
55200 + REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
55201 + fs/reiser4/ulevel/ulevel.c:test_search().
55202 +
55203 + Don't try to further optimize sequential search by scanning from
55204 + right to left in attempt to use more efficient loop termination
55205 + condition (comparison with 0). This doesn't work.
55206 +
55207 + */
55208 +
55209 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
55210 + int median;
55211 + item_header40 *medianh;
55212 +
55213 + median = (left + right) / 2;
55214 + medianh = node40_ih_at(node, median);
55215 +
55216 + assert("nikita-1084", median >= 0);
55217 + assert("nikita-1085", median < items);
55218 + switch (keycmp(key, &medianh->key)) {
55219 + case LESS_THAN:
55220 + right = median;
55221 + righth = medianh;
55222 + break;
55223 + default:
55224 + wrong_return_value("nikita-586", "keycmp");
55225 + case GREATER_THAN:
55226 + left = median;
55227 + lefth = medianh;
55228 + break;
55229 + case EQUAL_TO:
55230 + do {
55231 + --median;
55232 + /* headers are ordered from right to left */
55233 + ++medianh;
55234 + } while (median >= 0 && keyeq(key, &medianh->key));
55235 + right = left = median + 1;
55236 + ih = lefth = righth = medianh - 1;
55237 + found = 1;
55238 + break;
55239 + }
55240 + }
55241 + /* sequential scan. Item headers, and, therefore, keys are stored at
55242 + the rightmost part of a node from right to left. We are trying to
55243 + access memory from left to right, and hence, scan in _descending_
55244 + order of item numbers.
55245 + */
55246 + if (!found) {
55247 + for (left = right, ih = righth; left >= 0; ++ih, --left) {
55248 + cmp_t comparison;
55249 +
55250 + prefetchkey(&(ih + 1)->key);
55251 + comparison = keycmp(&ih->key, key);
55252 + if (comparison == GREATER_THAN)
55253 + continue;
55254 + if (comparison == EQUAL_TO) {
55255 + found = 1;
55256 + do {
55257 + --left;
55258 + ++ih;
55259 + } while (left >= 0 && keyeq(&ih->key, key));
55260 + ++left;
55261 + --ih;
55262 + } else {
55263 + assert("nikita-1256", comparison == LESS_THAN);
55264 + }
55265 + break;
55266 + }
55267 + if (unlikely(left < 0))
55268 + left = 0;
55269 + }
55270 +
55271 + assert("nikita-3212", right >= left);
55272 + assert("nikita-3214",
55273 + equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
55274 +
55275 + coord_set_item_pos(coord, left);
55276 + coord->unit_pos = 0;
55277 + coord->between = AT_UNIT;
55278 +
55279 + /* key < leftmost key in the node, or the node is corrupted
55280 + and keys are not sorted */
55281 + bstop = node40_ih_at(node, (unsigned)left);
55282 + order = keycmp(&bstop->key, key);
55283 + if (unlikely(order == GREATER_THAN)) {
55284 + if (unlikely(left != 0)) {
55285 + /* screw up */
55286 + warning("nikita-587", "Key less than %i key in a node",
55287 + left);
55288 + reiser4_print_key("key", key);
55289 + reiser4_print_key("min", &bstop->key);
55290 + print_coord_content("coord", coord);
55291 + return RETERR(-EIO);
55292 + } else {
55293 + coord->between = BEFORE_UNIT;
55294 + return NS_NOT_FOUND;
55295 + }
55296 + }
55297 + /* left <= key, ok */
55298 + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
55299 +
55300 + if (unlikely(iplug == NULL)) {
55301 + warning("nikita-588", "Unknown plugin %i",
55302 + le16_to_cpu(get_unaligned(&bstop->plugin_id)));
55303 + reiser4_print_key("key", key);
55304 + print_coord_content("coord", coord);
55305 + return RETERR(-EIO);
55306 + }
55307 +
55308 + coord_set_iplug(coord, iplug);
55309 +
55310 + /* if exact key from item header was found by binary search, no
55311 + further checks are necessary. */
55312 + if (found) {
55313 + assert("nikita-1259", order == EQUAL_TO);
55314 + return NS_FOUND;
55315 + }
55316 + if (iplug->b.max_key_inside != NULL) {
55317 + reiser4_key max_item_key;
55318 +
55319 + /* key > max_item_key --- outside of an item */
55320 + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
55321 + coord->unit_pos = 0;
55322 + coord->between = AFTER_ITEM;
55323 + /* FIXME-VS: the key we are looking for does not fit into
55324 + the found item, so return NS_NOT_FOUND. Without that
55325 + the following case does not work: there is an extent of
55326 + file 10000, 10001. File 10000, 10002 has just been
55327 + created. When writing to position 0 in that file,
55328 + traverse_tree would stop here on the twig level,
55329 + whereas we want it to go down to the leaf level
55330 + */
55331 + return NS_NOT_FOUND;
55332 + }
55333 + }
55334 +
55335 + if (iplug->b.lookup != NULL) {
55336 + return iplug->b.lookup(key, bias, coord);
55337 + } else {
55338 + assert("nikita-1260", order == LESS_THAN);
55339 + coord->between = AFTER_UNIT;
55340 + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
55341 + }
55342 +}
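/* Illustrative usage, not part of the patch: resolving a key inside one node
 * with the lookup method above. The node must be at least read locked, as
 * lookup_node40() asserts. The helper name is hypothetical. */
static int example_node40_contains(znode *node, const reiser4_key *key)
{
	coord_t coord;

	return lookup_node40(node, key, FIND_EXACT, &coord) == NS_FOUND;
}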
55343 +
55344 +#undef NODE_ADDSTAT
55345 +#undef NODE_INCSTAT
55346 +
55347 +/* plugin->u.node.estimate
55348 + look for description of this method in plugin/node/node.h */
55349 +size_t estimate_node40(znode * node)
55350 +{
55351 + size_t free;
55352 +
55353 + assert("nikita-597", node != NULL);
55354 +
55355 + free = free_space_node40(node);
55356 + /* guard against unsigned underflow when free space is smaller
55357 + than an item header */
55358 + return (free > sizeof(item_header40)) ? free - sizeof(item_header40) : 0;
55359 +}
55359 +
55360 +/* plugin->u.node.check
55361 + look for description of this method in plugin/node/node.h */
55362 +int check_node40(const znode * node /* node to check */ ,
55363 + __u32 flags /* check flags */ ,
55364 + const char **error /* where to store error message */ )
55365 +{
55366 + int nr_items;
55367 + int i;
55368 + reiser4_key prev;
55369 + unsigned old_offset;
55370 + tree_level level;
55371 + coord_t coord;
55372 + int result;
55373 +
55374 + assert("nikita-580", node != NULL);
55375 + assert("nikita-581", error != NULL);
55376 + assert("nikita-2948", znode_is_loaded(node));
55377 +
55378 + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
55379 + return 0;
55380 +
55381 + assert("nikita-582", zdata(node) != NULL);
55382 +
55383 + nr_items = node40_num_of_items_internal(node);
55384 + if (nr_items < 0) {
55385 + *error = "Negative number of items";
55386 + return -1;
55387 + }
55388 +
55389 + if (flags & REISER4_NODE_DKEYS)
55390 + prev = *znode_get_ld_key((znode *) node);
55391 + else
55392 + prev = *reiser4_min_key();
55393 +
55394 + old_offset = 0;
55395 + coord_init_zero(&coord);
55396 + coord.node = (znode *) node;
55397 + coord.unit_pos = 0;
55398 + coord.between = AT_UNIT;
55399 + level = znode_get_level(node);
55400 + for (i = 0; i < nr_items; i++) {
55401 + item_header40 *ih;
55402 + reiser4_key unit_key;
55403 + unsigned j;
55404 +
55405 + ih = node40_ih_at(node, (unsigned)i);
55406 + coord_set_item_pos(&coord, i);
55407 + if ((ih40_get_offset(ih) >=
55408 + znode_size(node) - nr_items * sizeof(item_header40)) ||
55409 + (ih40_get_offset(ih) < sizeof(node40_header))) {
55410 + *error = "Offset is out of bounds";
55411 + return -1;
55412 + }
55413 + if (ih40_get_offset(ih) <= old_offset) {
55414 + *error = "Offsets are in wrong order";
55415 + return -1;
55416 + }
55417 + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
55418 + *error = "Wrong offset of first item";
55419 + return -1;
55420 + }
55421 + old_offset = ih40_get_offset(ih);
55422 +
55423 + if (keygt(&prev, &ih->key)) {
55424 + *error = "Keys are in wrong order";
55425 + return -1;
55426 + }
55427 + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
55428 + *error = "Wrong key of first unit";
55429 + return -1;
55430 + }
55431 + prev = ih->key;
55432 + for (j = 0; j < coord_num_units(&coord); ++j) {
55433 + coord.unit_pos = j;
55434 + unit_key_by_coord(&coord, &unit_key);
55435 + if (keygt(&prev, &unit_key)) {
55436 + *error = "Unit keys are in wrong order";
55437 + return -1;
55438 + }
55439 + prev = unit_key;
55440 + }
55441 + coord.unit_pos = 0;
55442 + if (level != TWIG_LEVEL && item_is_extent(&coord)) {
55443 + *error = "extent on the wrong level";
55444 + return -1;
55445 + }
55446 + if (level == LEAF_LEVEL && item_is_internal(&coord)) {
55447 + *error = "internal item on the wrong level";
55448 + return -1;
55449 + }
55450 + if (level != LEAF_LEVEL &&
55451 + !item_is_internal(&coord) && !item_is_extent(&coord)) {
55452 + *error = "wrong item on the internal level";
55453 + return -1;
55454 + }
55455 + if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
55456 + *error = "non-internal item on the internal level";
55457 + return -1;
55458 + }
55459 +#if REISER4_DEBUG
55460 + if (item_plugin_by_coord(&coord)->b.check
55461 + && item_plugin_by_coord(&coord)->b.check(&coord, error))
55462 + return -1;
55463 +#endif
55464 + if (i) {
55465 + coord_t prev_coord;
55466 + /* two neighboring items must not be mergeable */
55467 + coord_dup(&prev_coord, &coord);
55468 + coord_prev_item(&prev_coord);
55469 + if (are_items_mergeable(&prev_coord, &coord)) {
55470 + *error = "mergeable items in one node";
55471 + return -1;
55472 + }
55473 +
55474 + }
55475 + }
55476 +
55477 + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
55478 + coord_t coord;
55479 + item_plugin *iplug;
55480 +
55481 + coord_init_last_unit(&coord, node);
55482 + iplug = item_plugin_by_coord(&coord);
55483 + if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
55484 + iplug->s.file.append_key != NULL) {
55485 + reiser4_key mkey;
55486 +
55487 + iplug->s.file.append_key(&coord, &mkey);
55488 + set_key_offset(&mkey, get_key_offset(&mkey) - 1);
55489 + read_lock_dk(current_tree);
55490 + result = keygt(&mkey, znode_get_rd_key((znode *) node));
55491 + read_unlock_dk(current_tree);
55492 + if (result) {
55493 + *error = "key of rightmost item is too large";
55494 + return -1;
55495 + }
55496 + }
55497 + }
55498 + if (flags & REISER4_NODE_DKEYS) {
55499 + read_lock_tree(current_tree);
55500 + read_lock_dk(current_tree);
55501 +
55502 + flags |= REISER4_NODE_TREE_STABLE;
55503 +
55504 + if (keygt(&prev, znode_get_rd_key((znode *) node))) {
55505 + if (flags & REISER4_NODE_TREE_STABLE) {
55506 + *error = "Last key is greater than rdkey";
55507 + read_unlock_dk(current_tree);
55508 + read_unlock_tree(current_tree);
55509 + return -1;
55510 + }
55511 + }
55512 + if (keygt
55513 + (znode_get_ld_key((znode *) node),
55514 + znode_get_rd_key((znode *) node))) {
55515 + *error = "ldkey is greater than rdkey";
55516 + read_unlock_dk(current_tree);
55517 + read_unlock_tree(current_tree);
55518 + return -1;
55519 + }
55520 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
55521 + (node->left != NULL) &&
55522 + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
55523 + ergo(flags & REISER4_NODE_TREE_STABLE,
55524 + !keyeq(znode_get_rd_key(node->left),
55525 + znode_get_ld_key((znode *) node)))
55526 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55527 + keygt(znode_get_rd_key(node->left),
55528 + znode_get_ld_key((znode *) node)))) {
55529 + *error = "left rdkey or ldkey is wrong";
55530 + read_unlock_dk(current_tree);
55531 + read_unlock_tree(current_tree);
55532 + return -1;
55533 + }
55534 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
55535 + (node->right != NULL) &&
55536 + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
55537 + ergo(flags & REISER4_NODE_TREE_STABLE,
55538 + !keyeq(znode_get_rd_key((znode *) node),
55539 + znode_get_ld_key(node->right)))
55540 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55541 + keygt(znode_get_rd_key((znode *) node),
55542 + znode_get_ld_key(node->right)))) {
55543 + *error = "rdkey or right ldkey is wrong";
55544 + read_unlock_dk(current_tree);
55545 + read_unlock_tree(current_tree);
55546 + return -1;
55547 + }
55548 +
55549 + read_unlock_dk(current_tree);
55550 + read_unlock_tree(current_tree);
55551 + }
55552 +
55553 + return 0;
55554 +}
55555 +
55556 +/* plugin->u.node.parse
55557 + look for description of this method in plugin/node/node.h */
55558 +int parse_node40(znode * node /* node to parse */ )
55559 +{
55560 + node40_header *header;
55561 + int result;
55562 + d8 level;
55563 +
55564 + header = node40_node_header((znode *) node);
55565 + result = -EIO;
55566 + level = nh40_get_level(header);
55567 + if (unlikely(((__u8) znode_get_level(node)) != level))
55568 + warning("nikita-494", "Wrong level found in node: %i != %i",
55569 + znode_get_level(node), level);
55570 + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
55571 + warning("nikita-495",
55572 + "Wrong magic in tree node: want %x, got %x",
55573 + REISER4_NODE_MAGIC, nh40_get_magic(header));
55574 + else {
55575 + node->nr_items = node40_num_of_items_internal(node);
55576 + result = 0;
55577 + }
55578 + return RETERR(result);
55579 +}
55580 +
55581 +/* plugin->u.node.init
55582 + look for description of this method in plugin/node/node.h */
55583 +int init_node40(znode * node /* node to initialise */ )
55584 +{
55585 + node40_header *header;
55586 +
55587 + assert("nikita-570", node != NULL);
55588 + assert("nikita-572", zdata(node) != NULL);
55589 +
55590 + header = node40_node_header(node);
55591 + memset(header, 0, sizeof(node40_header));
55592 + nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
55593 + nh40_set_free_space_start(header, sizeof(node40_header));
55594 + /* sane hypothesis: 0 in CPU format is 0 in disk format */
55595 + /* items: 0 */
55596 + save_plugin_id(node_plugin_to_plugin(node->nplug),
55597 + &header->common_header.plugin_id);
55598 + nh40_set_level(header, znode_get_level(node));
55599 + nh40_set_magic(header, REISER4_NODE_MAGIC);
55600 + node->nr_items = 0;
55601 + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
55602 +
55603 + /* flags: 0 */
55604 + return 0;
55605 +}
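+/*
+ * A sketch of the node40 layout that init_node40() establishes
+ * (widths are illustrative only):
+ *
+ *   +---------------+----------------+............+-----------------+
+ *   | node40_header | item bodies -> | free space | <- item headers |
+ *   +---------------+----------------+............+-----------------+
+ *
+ * Item bodies grow upward from sizeof(node40_header) while the array
+ * of item_header40 structs grows downward from the end of the node;
+ * free space is the gap between them, initially
+ * znode_size(node) - sizeof(node40_header), as set above.
+ */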
55606 +
55607 +#ifdef GUESS_EXISTS
55608 +int guess_node40(const znode * node /* node to guess plugin of */ )
55609 +{
55610 + node40_header *nethack;
55611 +
55612 + assert("nikita-1058", node != NULL);
55613 + nethack = node40_node_header(node);
55614 + return
55615 + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
55616 + (plugin_by_disk_id(znode_get_tree(node),
55617 + REISER4_NODE_PLUGIN_TYPE,
55618 + &nethack->common_header.plugin_id)->h.id ==
55619 + NODE40_ID);
55620 +}
55621 +#endif
55622 +
55623 +/* plugin->u.node.change_item_size
55624 + look for description of this method in plugin/node/node.h */
55625 +void change_item_size_node40(coord_t * coord, int by)
55626 +{
55627 + node40_header *nh;
55628 + item_header40 *ih;
55629 + char *item_data;
55630 + int item_length;
55631 + unsigned i;
55632 +
55633 +	/* make sure that @coord is a coord of an existing item */
55634 + assert("vs-210", coord_is_existing_item(coord));
55635 +
55636 + nh = node40_node_header(coord->node);
55637 +
55638 + item_data = item_by_coord_node40(coord);
55639 + item_length = length_by_coord_node40(coord);
55640 +
55641 + /* move item bodies */
55642 + ih = node40_ih_at_coord(coord);
55643 + memmove(item_data + item_length + by, item_data + item_length,
55644 + nh40_get_free_space_start(node40_node_header(coord->node)) -
55645 + (ih40_get_offset(ih) + item_length));
55646 +
55647 + /* update offsets of moved items */
55648 + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
55649 + ih = node40_ih_at(coord->node, i);
55650 + ih40_set_offset(ih, ih40_get_offset(ih) + by);
55651 + }
55652 +
55653 + /* update node header */
55654 + nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
55655 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
55656 +}
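+/*
+ * Worked example for change_item_size_node40() with hypothetical
+ * numbers: in a node holding items 0..2, growing item 1 by @by == 8
+ * moves everything between the end of item 1 and free_space_start
+ * right by 8 bytes, bumps the offset of item 2 by 8, shrinks free
+ * space by 8 and advances free_space_start by 8.
+ */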
55657 +
55658 +static int should_notify_parent(const znode * node)
55659 +{
55660 + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
55661 + return !disk_addr_eq(znode_get_block(node),
55662 + &znode_get_tree(node)->root_block);
55663 +}
55664 +
55665 +/* plugin->u.node.create_item
55666 + look for description of this method in plugin/node/node.h */
55667 +int
55668 +create_item_node40(coord_t *target, const reiser4_key *key,
55669 + reiser4_item_data *data, carry_plugin_info *info)
55670 +{
55671 + node40_header *nh;
55672 + item_header40 *ih;
55673 + unsigned offset;
55674 + unsigned i;
55675 +
55676 + nh = node40_node_header(target->node);
55677 +
55678 + assert("vs-212", coord_is_between_items(target));
55679 + /* node must have enough free space */
55680 + assert("vs-254",
55681 + free_space_node40(target->node) >=
55682 + data->length + sizeof(item_header40));
55683 + assert("vs-1410", data->length >= 0);
55684 +
55685 + if (coord_set_to_right(target))
55686 +		/* there are no items to the right of @target, so the new
55687 +		   item will be inserted after the last one */
55688 + coord_set_item_pos(target, nh40_get_num_items(nh));
55689 +
55690 + if (target->item_pos < nh40_get_num_items(nh)) {
55691 + /* there are items to be moved to prepare space for new
55692 + item */
55693 + ih = node40_ih_at_coord(target);
55694 + /* new item will start at this offset */
55695 + offset = ih40_get_offset(ih);
55696 +
55697 + memmove(zdata(target->node) + offset + data->length,
55698 + zdata(target->node) + offset,
55699 + nh40_get_free_space_start(nh) - offset);
55700 + /* update headers of moved items */
55701 + for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
55702 + ih = node40_ih_at(target->node, i);
55703 + ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
55704 + }
55705 +
55706 + /* @ih is set to item header of the last item, move item headers */
55707 + memmove(ih - 1, ih,
55708 + sizeof(item_header40) * (nh40_get_num_items(nh) -
55709 + target->item_pos));
55710 + } else {
55711 + /* new item will start at this offset */
55712 + offset = nh40_get_free_space_start(nh);
55713 + }
55714 +
55715 + /* make item header for the new item */
55716 + ih = node40_ih_at_coord(target);
55717 + memcpy(&ih->key, key, sizeof(reiser4_key));
55718 + ih40_set_offset(ih, offset);
55719 + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
55720 +
55721 + /* update node header */
55722 + nh40_set_free_space(nh,
55723 + nh40_get_free_space(nh) - data->length -
55724 + sizeof(item_header40));
55725 + nh40_set_free_space_start(nh,
55726 + nh40_get_free_space_start(nh) + data->length);
55727 + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
55728 +
55729 +	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
55730 + target->unit_pos = 0;
55731 + target->between = AT_UNIT;
55732 + coord_clear_iplug(target);
55733 +
55734 + /* initialize item */
55735 + if (data->iplug->b.init != NULL) {
55736 + data->iplug->b.init(target, NULL, data);
55737 + }
55738 + /* copy item body */
55739 + if (data->iplug->b.paste != NULL) {
55740 + data->iplug->b.paste(target, data, info);
55741 + } else if (data->data != NULL) {
55742 + if (data->user) {
55743 +			/* AUDIT: Should we really not check that the pointer
55744 +			   from userspace is valid and the data bytes are
55745 +			   available? How would we return -EFAULT or the like
55746 +			   without this check? */
55747 + assert("nikita-3038", reiser4_schedulable());
55748 + /* copy data from user space */
55749 + __copy_from_user(zdata(target->node) + offset,
55750 + (const char __user *)data->data,
55751 + (unsigned)data->length);
55752 + } else
55753 + /* copy from kernel space */
55754 + memcpy(zdata(target->node) + offset, data->data,
55755 + (unsigned)data->length);
55756 + }
55757 +
55758 + if (target->item_pos == 0) {
55759 + /* left delimiting key has to be updated */
55760 + prepare_for_update(NULL, target->node, info);
55761 + }
55762 +
55763 + if (item_plugin_by_coord(target)->b.create_hook != NULL) {
55764 + item_plugin_by_coord(target)->b.create_hook(target, data->arg);
55765 + }
55766 +
55767 + return 0;
55768 +}
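+/*
+ * Free-space accounting example for create_item_node40(), hypothetical
+ * numbers: inserting an item with data->length == 40 consumes the 40
+ * body bytes plus sizeof(item_header40) for the new header, so free
+ * space drops by 40 + sizeof(item_header40), while free_space_start
+ * advances by only 40, because item headers live at the end of the
+ * node.
+ */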
55769 +
55770 +/* plugin->u.node.update_item_key
55771 + look for description of this method in plugin/node/node.h */
55772 +void
55773 +update_item_key_node40(coord_t * target, const reiser4_key * key,
55774 + carry_plugin_info * info)
55775 +{
55776 + item_header40 *ih;
55777 +
55778 + ih = node40_ih_at_coord(target);
55779 + memcpy(&ih->key, key, sizeof(reiser4_key));
55780 +
55781 + if (target->item_pos == 0) {
55782 + prepare_for_update(NULL, target->node, info);
55783 + }
55784 +}
55785 +
55786 +/* these bits encode the cut mode */
55787 +#define CMODE_TAIL 1
55788 +#define CMODE_WHOLE 2
55789 +#define CMODE_HEAD 4
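+/*
+ * The cut mode is a bitmask of the flags above; for example, a cut
+ * that removes the tail of one item and then several following items
+ * completely is encoded as (CMODE_TAIL | CMODE_WHOLE). parse_cut()
+ * below computes the mode for a given key range.
+ */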
55790 +
55791 +struct cut40_info {
55792 + int mode;
55793 + pos_in_node_t tail_removed; /* position of item which gets tail removed */
55794 +	pos_in_node_t first_removed;	/* position of the leftmost item removed completely */
55795 + pos_in_node_t removed_count; /* number of items removed completely */
55796 + pos_in_node_t head_removed; /* position of item which gets head removed */
55797 +
55798 + pos_in_node_t freed_space_start;
55799 + pos_in_node_t freed_space_end;
55800 + pos_in_node_t first_moved;
55801 + pos_in_node_t head_removed_location;
55802 +};
55803 +
55804 +static void init_cinfo(struct cut40_info *cinfo)
55805 +{
55806 + cinfo->mode = 0;
55807 + cinfo->tail_removed = MAX_POS_IN_NODE;
55808 + cinfo->first_removed = MAX_POS_IN_NODE;
55809 + cinfo->removed_count = MAX_POS_IN_NODE;
55810 + cinfo->head_removed = MAX_POS_IN_NODE;
55811 + cinfo->freed_space_start = MAX_POS_IN_NODE;
55812 + cinfo->freed_space_end = MAX_POS_IN_NODE;
55813 + cinfo->first_moved = MAX_POS_IN_NODE;
55814 + cinfo->head_removed_location = MAX_POS_IN_NODE;
55815 +}
55816 +
55817 +/* complete cut_node40/kill_node40: remove the gap created by the cut/kill */
55818 +static void compact(znode * node, struct cut40_info *cinfo)
55819 +{
55820 + node40_header *nh;
55821 + item_header40 *ih;
55822 + pos_in_node_t freed;
55823 + pos_in_node_t pos, nr_items;
55824 +
55825 + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
55826 + cinfo->freed_space_end != MAX_POS_IN_NODE &&
55827 + cinfo->first_moved != MAX_POS_IN_NODE));
55828 + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
55829 +
55830 + nh = node40_node_header(node);
55831 + nr_items = nh40_get_num_items(nh);
55832 +
55833 +	/* close the gap left by the removal */
55834 + memmove(zdata(node) + cinfo->freed_space_start,
55835 + zdata(node) + cinfo->freed_space_end,
55836 + nh40_get_free_space_start(nh) - cinfo->freed_space_end);
55837 +
55838 + /* update item headers of moved items - change their locations */
55839 + pos = cinfo->first_moved;
55840 + ih = node40_ih_at(node, pos);
55841 + if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
55842 + assert("vs-1580", pos == cinfo->head_removed);
55843 + ih40_set_offset(ih, cinfo->head_removed_location);
55844 + pos++;
55845 + ih--;
55846 + }
55847 +
55848 + freed = cinfo->freed_space_end - cinfo->freed_space_start;
55849 + for (; pos < nr_items; pos++, ih--) {
55850 + assert("vs-1581", ih == node40_ih_at(node, pos));
55851 + ih40_set_offset(ih, ih40_get_offset(ih) - freed);
55852 + }
55853 +
55854 +	/* free space start moved to left */
55855 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
55856 +
55857 + if (cinfo->removed_count != MAX_POS_IN_NODE) {
55858 + /* number of items changed. Remove item headers of those items */
55859 + ih = node40_ih_at(node, nr_items - 1);
55860 + memmove(ih + cinfo->removed_count, ih,
55861 + sizeof(item_header40) * (nr_items -
55862 + cinfo->removed_count -
55863 + cinfo->first_removed));
55864 + freed += sizeof(item_header40) * cinfo->removed_count;
55865 + node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
55866 + }
55867 +
55868 + /* total amount of free space increased */
55869 + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
55870 +}
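+/*
+ * Worked example for compact(), hypothetical numbers: with
+ * freed_space_start == 200, freed_space_end == 260 and
+ * free_space_start == 500, the memmove above slides bytes [260, 500)
+ * down to offset 200, the offsets of the moved items drop by
+ * freed == 60 and free_space_start becomes 440. If whole items were
+ * removed, their item headers are squeezed out as well and freed grows
+ * by sizeof(item_header40) per removed item before being returned to
+ * free space.
+ */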
55871 +
55872 +int shrink_item_node40(coord_t * coord, int delta)
55873 +{
55874 + node40_header *nh;
55875 + item_header40 *ih;
55876 + pos_in_node_t pos;
55877 + pos_in_node_t nr_items;
55878 + char *end;
55879 + znode *node;
55880 + int off;
55881 +
55882 + assert("nikita-3487", coord != NULL);
55883 + assert("nikita-3488", delta >= 0);
55884 +
55885 + node = coord->node;
55886 + nh = node40_node_header(node);
55887 + nr_items = nh40_get_num_items(nh);
55888 +
55889 + ih = node40_ih_at_coord(coord);
55890 + assert("nikita-3489", delta <= length_by_coord_node40(coord));
55891 + off = ih40_get_offset(ih) + length_by_coord_node40(coord);
55892 + end = zdata(node) + off;
55893 +
55894 +	/* close the gap left by the removal */
55895 + memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
55896 +
55897 + /* update item headers of moved items - change their locations */
55898 + pos = coord->item_pos + 1;
55899 + ih = node40_ih_at(node, pos);
55900 + for (; pos < nr_items; pos++, ih--) {
55901 + assert("nikita-3490", ih == node40_ih_at(node, pos));
55902 + ih40_set_offset(ih, ih40_get_offset(ih) - delta);
55903 + }
55904 +
55905 + /* free space start moved to left */
55906 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
55907 + /* total amount of free space increased */
55908 + nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
55909 + /*
55910 +	 * This method does _not_ change the number of items. Hence, it
55911 +	 * cannot make the node empty. Nor does it remove items at all,
55912 +	 * which means that no keys have to be updated either.
55913 + */
55914 + return 0;
55915 +}
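+/*
+ * Example for shrink_item_node40(), hypothetical numbers: shrinking an
+ * item by @delta == 16 moves everything between the item's old end and
+ * free_space_start 16 bytes to the left, decreases the offsets of all
+ * following items by 16 and returns the 16 bytes to free space, while
+ * the item count and all keys stay unchanged.
+ */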
55916 +
55917 +/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There are
55918 +   two types of cut. The first is when a unit is removed from the middle of an item; in this case the function returns
55919 +   1. Everything else fits into the second case: 0 or 1 items get their tail cut, 0 or more items are removed
55920 +   completely, and 0 or 1 items get their head cut. The function returns 0 in this case */
55921 +static int
55922 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
55923 +{
55924 + reiser4_key left_key, right_key;
55925 + reiser4_key min_from_key, max_to_key;
55926 + const reiser4_key *from_key, *to_key;
55927 +
55928 + init_cinfo(cinfo);
55929 +
55930 + /* calculate minimal key stored in first item of items to be cut (params->from) */
55931 + item_key_by_coord(params->from, &min_from_key);
55932 + /* and max key stored in last item of items to be cut (params->to) */
55933 + max_item_key_by_coord(params->to, &max_to_key);
55934 +
55935 + /* if cut key range is not defined in input parameters - define it using cut coord range */
55936 + if (params->from_key == NULL) {
55937 + assert("vs-1513", params->to_key == NULL);
55938 + unit_key_by_coord(params->from, &left_key);
55939 + from_key = &left_key;
55940 + max_unit_key_by_coord(params->to, &right_key);
55941 + to_key = &right_key;
55942 + } else {
55943 + from_key = params->from_key;
55944 + to_key = params->to_key;
55945 + }
55946 +
55947 + if (params->from->item_pos == params->to->item_pos) {
55948 + if (keylt(&min_from_key, from_key)
55949 + && keylt(to_key, &max_to_key))
55950 + return 1;
55951 +
55952 + if (keygt(from_key, &min_from_key)) {
55953 +			/* tail of item is to be cut */
55954 + cinfo->tail_removed = params->from->item_pos;
55955 + cinfo->mode |= CMODE_TAIL;
55956 + } else if (keylt(to_key, &max_to_key)) {
55957 + /* head of item is to be cut */
55958 + cinfo->head_removed = params->from->item_pos;
55959 + cinfo->mode |= CMODE_HEAD;
55960 + } else {
55961 + /* item is removed completely */
55962 + cinfo->first_removed = params->from->item_pos;
55963 + cinfo->removed_count = 1;
55964 + cinfo->mode |= CMODE_WHOLE;
55965 + }
55966 + } else {
55967 + cinfo->first_removed = params->from->item_pos + 1;
55968 + cinfo->removed_count =
55969 + params->to->item_pos - params->from->item_pos - 1;
55970 +
55971 + if (keygt(from_key, &min_from_key)) {
55972 + /* first item is not cut completely */
55973 + cinfo->tail_removed = params->from->item_pos;
55974 + cinfo->mode |= CMODE_TAIL;
55975 + } else {
55976 + cinfo->first_removed--;
55977 + cinfo->removed_count++;
55978 + }
55979 + if (keylt(to_key, &max_to_key)) {
55980 + /* last item is not cut completely */
55981 + cinfo->head_removed = params->to->item_pos;
55982 + cinfo->mode |= CMODE_HEAD;
55983 + } else {
55984 + cinfo->removed_count++;
55985 + }
55986 + if (cinfo->removed_count)
55987 + cinfo->mode |= CMODE_WHOLE;
55988 + }
55989 +
55990 + return 0;
55991 +}
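+/*
+ * Worked example for parse_cut(), hypothetical positions: a cut that
+ * starts in the middle of item 2 and extends through the maximal key
+ * of item 4 yields tail_removed == 2, first_removed == 3,
+ * removed_count == 2 and mode == (CMODE_TAIL | CMODE_WHOLE), and the
+ * function returns 0. Only a cut that both starts and ends strictly
+ * inside a single item returns 1.
+ */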
55992 +
55993 +static void
55994 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
55995 + carry_kill_data * kdata)
55996 +{
55997 + coord_t coord;
55998 + item_plugin *iplug;
55999 + pos_in_node_t pos;
56000 +
56001 + coord.node = node;
56002 + coord.unit_pos = 0;
56003 + coord.between = AT_UNIT;
56004 + for (pos = 0; pos < count; pos++) {
56005 + coord_set_item_pos(&coord, from + pos);
56006 + coord.unit_pos = 0;
56007 + coord.between = AT_UNIT;
56008 + iplug = item_plugin_by_coord(&coord);
56009 + if (iplug->b.kill_hook) {
56010 + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
56011 + kdata);
56012 + }
56013 + }
56014 +}
56015 +
56016 +/* this is used to kill item partially */
56017 +static pos_in_node_t
56018 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56019 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
56020 +{
56021 + struct carry_kill_data *kdata;
56022 + item_plugin *iplug;
56023 +
56024 + kdata = data;
56025 + iplug = item_plugin_by_coord(coord);
56026 +
56027 + assert("vs-1524", iplug->b.kill_units);
56028 + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
56029 + new_first_key);
56030 +}
56031 +
56032 +/* call item plugin to kill tail of item */
56033 +static pos_in_node_t
56034 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56035 +{
56036 + struct carry_kill_data *kdata;
56037 + pos_in_node_t to;
56038 +
56039 + kdata = data;
56040 + to = coord_last_unit_pos(coord);
56041 + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
56042 + NULL);
56043 +}
56044 +
56045 +/* call item plugin to kill head of item */
56046 +static pos_in_node_t
56047 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56048 + reiser4_key * new_first_key)
56049 +{
56050 + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
56051 + new_first_key);
56052 +}
56053 +
56054 +/* this is used to cut item partially */
56055 +static pos_in_node_t
56056 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56057 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
56058 +{
56059 + carry_cut_data *cdata;
56060 + item_plugin *iplug;
56061 +
56062 + cdata = data;
56063 + iplug = item_plugin_by_coord(coord);
56064 + assert("vs-302", iplug->b.cut_units);
56065 + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
56066 + new_first_key);
56067 +}
56068 +
56069 +/* call item plugin to cut tail of item */
56070 +static pos_in_node_t
56071 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56072 +{
56073 + carry_cut_data *cdata;
56074 + pos_in_node_t to;
56075 +
56076 + cdata = data;
56077 + to = coord_last_unit_pos(cdata->params.from);
56078 + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
56079 +}
56080 +
56081 +/* call item plugin to cut head of item */
56082 +static pos_in_node_t
56083 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56084 + reiser4_key * new_first_key)
56085 +{
56086 + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
56087 + new_first_key);
56088 +}
56089 +
56090 +/* this returns 1 if the key of the first item changed, 0 if it did not */
56091 +static int
56092 +prepare_for_compact(struct cut40_info *cinfo,
56093 + const struct cut_kill_params *params, int is_cut,
56094 + void *data, carry_plugin_info * info)
56095 +{
56096 + znode *node;
56097 + item_header40 *ih;
56098 + pos_in_node_t freed;
56099 + pos_in_node_t item_pos;
56100 + coord_t coord;
56101 + reiser4_key new_first_key;
56102 + pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
56103 + void *, reiser4_key *, reiser4_key *);
56104 + pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
56105 + pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
56106 + reiser4_key *);
56107 + int retval;
56108 +
56109 + retval = 0;
56110 +
56111 + node = params->from->node;
56112 +
56113 + assert("vs-184", node == params->to->node);
56114 + assert("vs-312", !node_is_empty(node));
56115 + assert("vs-297",
56116 + coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
56117 +
56118 + if (is_cut) {
56119 + kill_units_f = cut_units;
56120 + kill_tail_f = cut_tail;
56121 + kill_head_f = cut_head;
56122 + } else {
56123 + kill_units_f = kill_units;
56124 + kill_tail_f = kill_tail;
56125 + kill_head_f = kill_head;
56126 + }
56127 +
56128 + if (parse_cut(cinfo, params) == 1) {
56129 + /* cut from the middle of item */
56130 + freed =
56131 + kill_units_f(params->from, params->from->unit_pos,
56132 + params->to->unit_pos, data,
56133 + params->smallest_removed, NULL);
56134 +
56135 + item_pos = params->from->item_pos;
56136 + ih = node40_ih_at(node, item_pos);
56137 + cinfo->freed_space_start =
56138 + ih40_get_offset(ih) + node40_item_length(node,
56139 + item_pos) - freed;
56140 + cinfo->freed_space_end = cinfo->freed_space_start + freed;
56141 + cinfo->first_moved = item_pos + 1;
56142 + } else {
56143 + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
56144 + cinfo->first_removed != MAX_POS_IN_NODE ||
56145 + cinfo->head_removed != MAX_POS_IN_NODE));
56146 +
56147 + switch (cinfo->mode) {
56148 + case CMODE_TAIL:
56149 + /* one item gets cut partially from its end */
56150 + assert("vs-1562",
56151 + cinfo->tail_removed == params->from->item_pos);
56152 +
56153 + freed =
56154 + kill_tail_f(params->from, data,
56155 + params->smallest_removed);
56156 +
56157 + item_pos = cinfo->tail_removed;
56158 + ih = node40_ih_at(node, item_pos);
56159 + cinfo->freed_space_start =
56160 + ih40_get_offset(ih) + node40_item_length(node,
56161 + item_pos) -
56162 + freed;
56163 + cinfo->freed_space_end =
56164 + cinfo->freed_space_start + freed;
56165 + cinfo->first_moved = cinfo->tail_removed + 1;
56166 + break;
56167 +
56168 + case CMODE_WHOLE:
56169 + /* one or more items get removed completely */
56170 + assert("vs-1563",
56171 + cinfo->first_removed == params->from->item_pos);
56172 + assert("vs-1564", cinfo->removed_count > 0
56173 + && cinfo->removed_count != MAX_POS_IN_NODE);
56174 +
56175 + /* call kill hook for all items removed completely */
56176 + if (is_cut == 0)
56177 + call_kill_hooks(node, cinfo->first_removed,
56178 + cinfo->removed_count, data);
56179 +
56180 + item_pos = cinfo->first_removed;
56181 + ih = node40_ih_at(node, item_pos);
56182 +
56183 + if (params->smallest_removed)
56184 + memcpy(params->smallest_removed, &ih->key,
56185 + sizeof(reiser4_key));
56186 +
56187 + cinfo->freed_space_start = ih40_get_offset(ih);
56188 +
56189 + item_pos += (cinfo->removed_count - 1);
56190 + ih -= (cinfo->removed_count - 1);
56191 + cinfo->freed_space_end =
56192 + ih40_get_offset(ih) + node40_item_length(node,
56193 + item_pos);
56194 + cinfo->first_moved = item_pos + 1;
56195 + if (cinfo->first_removed == 0)
56196 + /* key of first item of the node changes */
56197 + retval = 1;
56198 + break;
56199 +
56200 + case CMODE_HEAD:
56201 + /* one item gets cut partially from its head */
56202 + assert("vs-1565",
56203 + cinfo->head_removed == params->from->item_pos);
56204 +
56205 + freed =
56206 + kill_head_f(params->to, data,
56207 + params->smallest_removed,
56208 + &new_first_key);
56209 +
56210 + item_pos = cinfo->head_removed;
56211 + ih = node40_ih_at(node, item_pos);
56212 + cinfo->freed_space_start = ih40_get_offset(ih);
56213 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56214 + cinfo->first_moved = cinfo->head_removed + 1;
56215 +
56216 + /* item head is removed, therefore, item key changed */
56217 + coord.node = node;
56218 + coord_set_item_pos(&coord, item_pos);
56219 + coord.unit_pos = 0;
56220 + coord.between = AT_UNIT;
56221 + update_item_key_node40(&coord, &new_first_key, NULL);
56222 + if (item_pos == 0)
56223 + /* key of first item of the node changes */
56224 + retval = 1;
56225 + break;
56226 +
56227 + case CMODE_TAIL | CMODE_WHOLE:
56228 + /* one item gets cut from its end and one or more items get removed completely */
56229 + assert("vs-1566",
56230 + cinfo->tail_removed == params->from->item_pos);
56231 + assert("vs-1567",
56232 + cinfo->first_removed == cinfo->tail_removed + 1);
56233 + assert("vs-1564", cinfo->removed_count > 0
56234 + && cinfo->removed_count != MAX_POS_IN_NODE);
56235 +
56236 + freed =
56237 + kill_tail_f(params->from, data,
56238 + params->smallest_removed);
56239 +
56240 + item_pos = cinfo->tail_removed;
56241 + ih = node40_ih_at(node, item_pos);
56242 + cinfo->freed_space_start =
56243 + ih40_get_offset(ih) + node40_item_length(node,
56244 + item_pos) -
56245 + freed;
56246 +
56247 + /* call kill hook for all items removed completely */
56248 + if (is_cut == 0)
56249 + call_kill_hooks(node, cinfo->first_removed,
56250 + cinfo->removed_count, data);
56251 +
56252 + item_pos += cinfo->removed_count;
56253 + ih -= cinfo->removed_count;
56254 + cinfo->freed_space_end =
56255 + ih40_get_offset(ih) + node40_item_length(node,
56256 + item_pos);
56257 + cinfo->first_moved = item_pos + 1;
56258 + break;
56259 +
56260 + case CMODE_WHOLE | CMODE_HEAD:
56261 + /* one or more items get removed completely and one item gets cut partially from its head */
56262 + assert("vs-1568",
56263 + cinfo->first_removed == params->from->item_pos);
56264 + assert("vs-1564", cinfo->removed_count > 0
56265 + && cinfo->removed_count != MAX_POS_IN_NODE);
56266 + assert("vs-1569",
56267 + cinfo->head_removed ==
56268 + cinfo->first_removed + cinfo->removed_count);
56269 +
56270 + /* call kill hook for all items removed completely */
56271 + if (is_cut == 0)
56272 + call_kill_hooks(node, cinfo->first_removed,
56273 + cinfo->removed_count, data);
56274 +
56275 + item_pos = cinfo->first_removed;
56276 + ih = node40_ih_at(node, item_pos);
56277 +
56278 + if (params->smallest_removed)
56279 + memcpy(params->smallest_removed, &ih->key,
56280 + sizeof(reiser4_key));
56281 +
56282 + freed =
56283 + kill_head_f(params->to, data, NULL, &new_first_key);
56284 +
56285 + cinfo->freed_space_start = ih40_get_offset(ih);
56286 +
56287 + ih = node40_ih_at(node, cinfo->head_removed);
56288 +			/* this is the most complex case. The item which got its head removed and the items which
56289 +			   are to be moved intact change their locations differently. */
56290 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56291 + cinfo->first_moved = cinfo->head_removed;
56292 + cinfo->head_removed_location = cinfo->freed_space_start;
56293 +
56294 + /* item head is removed, therefore, item key changed */
56295 + coord.node = node;
56296 + coord_set_item_pos(&coord, cinfo->head_removed);
56297 + coord.unit_pos = 0;
56298 + coord.between = AT_UNIT;
56299 + update_item_key_node40(&coord, &new_first_key, NULL);
56300 +
56301 + assert("vs-1579", cinfo->first_removed == 0);
56302 + /* key of first item of the node changes */
56303 + retval = 1;
56304 + break;
56305 +
56306 + case CMODE_TAIL | CMODE_HEAD:
56307 +		/* one item gets cut from its end and its neighbor gets cut from its head */
56308 + impossible("vs-1576", "this can not happen currently");
56309 + break;
56310 +
56311 + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
56312 + impossible("vs-1577", "this can not happen currently");
56313 + break;
56314 + default:
56315 + impossible("vs-1578", "unexpected cut mode");
56316 + break;
56317 + }
56318 + }
56319 + return retval;
56320 +}
56321 +
56322 +/* plugin->u.node.kill
56323 + return value is number of items removed completely */
56324 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
56325 +{
56326 + znode *node;
56327 + struct cut40_info cinfo;
56328 + int first_key_changed;
56329 +
56330 + node = kdata->params.from->node;
56331 +
56332 + first_key_changed =
56333 + prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
56334 + info);
56335 + compact(node, &cinfo);
56336 +
56337 + if (info) {
56338 + /* it is not called by node40_shift, so we have to take care
56339 + of changes on upper levels */
56340 + if (node_is_empty(node)
56341 + && !(kdata->flags & DELETE_RETAIN_EMPTY))
56342 +			/* all contents of the node were deleted */
56343 + prepare_removal_node40(node, info);
56344 + else if (first_key_changed) {
56345 + prepare_for_update(NULL, node, info);
56346 + }
56347 + }
56348 +
56349 + coord_clear_iplug(kdata->params.from);
56350 + coord_clear_iplug(kdata->params.to);
56351 +
56352 + znode_make_dirty(node);
56353 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56354 +}
56355 +
56356 +/* plugin->u.node.cut
56357 + return value is number of items removed completely */
56358 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
56359 +{
56360 + znode *node;
56361 + struct cut40_info cinfo;
56362 + int first_key_changed;
56363 +
56364 + node = cdata->params.from->node;
56365 +
56366 + first_key_changed =
56367 +	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
56368 + info);
56369 + compact(node, &cinfo);
56370 +
56371 + if (info) {
56372 + /* it is not called by node40_shift, so we have to take care
56373 + of changes on upper levels */
56374 + if (node_is_empty(node))
56375 +			/* all contents of the node were deleted */
56376 + prepare_removal_node40(node, info);
56377 + else if (first_key_changed) {
56378 + prepare_for_update(NULL, node, info);
56379 + }
56380 + }
56381 +
56382 + coord_clear_iplug(cdata->params.from);
56383 + coord_clear_iplug(cdata->params.to);
56384 +
56385 + znode_make_dirty(node);
56386 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56387 +}
56388 +
56389 +/* this structure is used by shift method of node40 plugin */
56390 +struct shift_params {
56391 +	shift_direction pend;	/* when @pend == SHIFT_LEFT we are shifting
56392 +				   to the left, when @pend == SHIFT_RIGHT - to the right */
56393 + coord_t wish_stop; /* when shifting to left this is last unit we
56394 + want shifted, when shifting to right - this
56395 + is set to unit we want to start shifting
56396 + from */
56397 + znode *target;
56398 + int everything; /* it is set to 1 if everything we have to shift is
56399 + shifted, 0 - otherwise */
56400 +
56401 + /* FIXME-VS: get rid of read_stop */
56402 +
56403 + /* these are set by estimate_shift */
56404 + coord_t real_stop; /* this will be set to last unit which will be
56405 + really shifted */
56406 +
56407 +	/* coordinate in source node, before the operation, of the unit which
56408 +	   becomes first after a shift to left or last after a shift to right */
56409 + union {
56410 + coord_t future_first;
56411 + coord_t future_last;
56412 + } u;
56413 +
56414 + unsigned merging_units; /* number of units of first item which have to
56415 + be merged with last item of target node */
56416 + unsigned merging_bytes; /* number of bytes in those units */
56417 +
56418 + unsigned entire; /* items shifted in their entirety */
56419 + unsigned entire_bytes; /* number of bytes in those items */
56420 +
56421 + unsigned part_units; /* number of units of partially copied item */
56422 + unsigned part_bytes; /* number of bytes in those units */
56423 +
56424 + unsigned shift_bytes; /* total number of bytes in items shifted (item
56425 + headers not included) */
56426 +
56427 +};
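+/*
+ * Example of a filled-in shift_params for a shift to the left,
+ * hypothetical numbers: merging_units == 2 (merging_bytes == 30)
+ * merged into the last item of @target, entire == 3 items copied
+ * whole (entire_bytes == 300) and part_units == 1 (part_bytes == 20)
+ * copied as a new item give shift_bytes == 30 + 300 + 20 == 350,
+ * matching the invariant shift_bytes == merging_bytes + entire_bytes +
+ * part_bytes checked by assertion "vs-185" in copy() below.
+ */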
56428 +
56429 +static int item_creation_overhead(coord_t *item)
56430 +{
56431 + return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
56432 +}
56433 +
56434 +/* how many units are there in @source starting from source->unit_pos
56435 + but not further than @stop_coord */
56436 +static int
56437 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
56438 +{
56439 + if (pend == SHIFT_LEFT) {
56440 + assert("vs-181", source->unit_pos == 0);
56441 + } else {
56442 + assert("vs-182",
56443 + source->unit_pos == coord_last_unit_pos(source));
56444 + }
56445 +
56446 +		/* @source and @stop_coord are in different items */
56447 + /* @source and @stop_coord are different items */
56448 + return coord_last_unit_pos(source) + 1;
56449 + }
56450 +
56451 + if (pend == SHIFT_LEFT) {
56452 + return stop_coord->unit_pos + 1;
56453 + } else {
56454 + return source->unit_pos - stop_coord->unit_pos + 1;
56455 + }
56456 +}
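+/*
+ * Example, hypothetical coords: for SHIFT_LEFT with @source and
+ * @stop_coord in the same item and stop_coord->unit_pos == 3,
+ * wanted_units() returns 4, i.e. units 0..3 are wanted. When they are
+ * in different items, all units of @source are wanted.
+ */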
56457 +
56458 +/* this calculates what can be copied from @shift->wish_stop.node to
56459 + @shift->target */
56460 +static void
56461 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
56462 +{
56463 + unsigned target_free_space, size;
56464 +	pos_in_node_t stop_item;	/* item which the estimation should not consider */
56465 + unsigned want; /* number of units of item we want shifted */
56466 + coord_t source; /* item being estimated */
56467 + item_plugin *iplug;
56468 +
56469 + /* shifting to left/right starts from first/last units of
56470 + @shift->wish_stop.node */
56471 + if (shift->pend == SHIFT_LEFT) {
56472 + coord_init_first_unit(&source, shift->wish_stop.node);
56473 + } else {
56474 + coord_init_last_unit(&source, shift->wish_stop.node);
56475 + }
56476 + shift->real_stop = source;
56477 +
56478 +	/* free space in target node */
56479 + target_free_space = znode_free_space(shift->target);
56480 +
56481 + shift->everything = 0;
56482 + if (!node_is_empty(shift->target)) {
56483 + /* target node is not empty, check for boundary items
56484 + mergeability */
56485 + coord_t to;
56486 +
56487 + /* item we try to merge @source with */
56488 + if (shift->pend == SHIFT_LEFT) {
56489 + coord_init_last_unit(&to, shift->target);
56490 + } else {
56491 + coord_init_first_unit(&to, shift->target);
56492 + }
56493 +
56494 + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
56495 + &source) :
56496 + are_items_mergeable(&source, &to)) {
56497 + /* how many units of @source do we want to merge to
56498 + item @to */
56499 + want =
56500 + wanted_units(&source, &shift->wish_stop,
56501 + shift->pend);
56502 +
56503 + /* how many units of @source we can merge to item
56504 + @to */
56505 + iplug = item_plugin_by_coord(&source);
56506 + if (iplug->b.can_shift != NULL)
56507 + shift->merging_units =
56508 + iplug->b.can_shift(target_free_space,
56509 + &source, shift->target,
56510 + shift->pend, &size,
56511 + want);
56512 + else {
56513 + shift->merging_units = 0;
56514 + size = 0;
56515 + }
56516 + shift->merging_bytes = size;
56517 + shift->shift_bytes += size;
56518 + /* update stop coord to be set to last unit of @source
56519 + we can merge to @target */
56520 + if (shift->merging_units)
56521 + /* at least one unit can be shifted */
56522 + shift->real_stop.unit_pos =
56523 + (shift->merging_units - source.unit_pos -
56524 + 1) * shift->pend;
56525 + else {
56526 + /* nothing can be shifted */
56527 + if (shift->pend == SHIFT_LEFT)
56528 + coord_init_before_first_item(&shift->
56529 + real_stop,
56530 + source.
56531 + node);
56532 + else
56533 + coord_init_after_last_item(&shift->
56534 + real_stop,
56535 + source.node);
56536 + }
56537 + assert("nikita-2081", shift->real_stop.unit_pos + 1);
56538 +
56539 + if (shift->merging_units != want) {
56540 +				/* we could not copy as many as we wanted,
56541 +				   so there is no reason to keep estimating
56542 +				   any longer */
56543 + return;
56544 + }
56545 +
56546 + target_free_space -= size;
56547 + coord_add_item_pos(&source, shift->pend);
56548 + }
56549 + }
56550 +
56551 +	/* item from which nothing is to be shifted; estimation stops there */
56552 + stop_item = shift->wish_stop.item_pos + shift->pend;
56553 +
56554 +	/* calculate how many items can be copied whole into the given
56555 +	   free space */
56556 + for (; source.item_pos != stop_item;
56557 + coord_add_item_pos(&source, shift->pend)) {
56558 + if (shift->pend == SHIFT_RIGHT)
56559 + source.unit_pos = coord_last_unit_pos(&source);
56560 +
56561 + /* how many units of @source do we want to copy */
56562 + want = wanted_units(&source, &shift->wish_stop, shift->pend);
56563 +
56564 + if (want == coord_last_unit_pos(&source) + 1) {
56565 + /* we want this item to be copied entirely */
56566 + size =
56567 + item_length_by_coord(&source) +
56568 + item_creation_overhead(&source);
56569 + if (size <= target_free_space) {
56570 +				/* item fits into target node as a whole */
56571 + target_free_space -= size;
56572 + shift->shift_bytes +=
56573 + size - item_creation_overhead(&source);
56574 + shift->entire_bytes +=
56575 + size - item_creation_overhead(&source);
56576 + shift->entire++;
56577 +
56578 + /* update shift->real_stop coord to be set to
56579 + last unit of @source we can merge to
56580 + @target */
56581 + shift->real_stop = source;
56582 + if (shift->pend == SHIFT_LEFT)
56583 + shift->real_stop.unit_pos =
56584 + coord_last_unit_pos(&shift->
56585 + real_stop);
56586 + else
56587 + shift->real_stop.unit_pos = 0;
56588 + continue;
56589 + }
56590 + }
56591 +
56592 +		/* we reach here only for an item which does not fit into
56593 +		   target node in its entirety. This item may be either
56594 +		   partially shifted, or not shifted at all. We will have to
56595 +		   create a new item in the target node, so decrease the
56596 +		   amount of free space by the item creation overhead. We can
56597 +		   also reach here if the stop coord is in this item */
56598 + if (target_free_space >=
56599 + (unsigned)item_creation_overhead(&source)) {
56600 + target_free_space -= item_creation_overhead(&source);
56601 + iplug = item_plugin_by_coord(&source);
56602 + if (iplug->b.can_shift) {
56603 + shift->part_units = iplug->b.can_shift(target_free_space,
56604 + &source,
56605 + NULL, /* target */
56606 + shift->pend,
56607 + &size,
56608 + want);
56609 + } else {
56610 + target_free_space = 0;
56611 + shift->part_units = 0;
56612 + size = 0;
56613 + }
56614 + } else {
56615 + target_free_space = 0;
56616 + shift->part_units = 0;
56617 + size = 0;
56618 + }
56619 + shift->part_bytes = size;
56620 + shift->shift_bytes += size;
56621 +
56622 + /* set @shift->real_stop to last unit of @source we can merge
56623 + to @shift->target */
56624 + if (shift->part_units) {
56625 + shift->real_stop = source;
56626 + shift->real_stop.unit_pos =
56627 + (shift->part_units - source.unit_pos -
56628 + 1) * shift->pend;
56629 + assert("nikita-2082", shift->real_stop.unit_pos + 1);
56630 + }
56631 +
56632 + if (want != shift->part_units)
56633 +			/* not everything wanted was shifted */
56634 + return;
56635 + break;
56636 + }
56637 +
56638 + shift->everything = 1;
56639 +}
56640 +
56641 +static void
56642 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
56643 + shift_direction dir, unsigned free_space)
56644 +{
56645 + item_plugin *iplug;
56646 +
56647 + assert("nikita-1463", target != NULL);
56648 + assert("nikita-1464", source != NULL);
56649 + assert("nikita-1465", from + count <= coord_num_units(source));
56650 +
56651 + iplug = item_plugin_by_coord(source);
56652 + assert("nikita-1468", iplug == item_plugin_by_coord(target));
56653 + iplug->b.copy_units(target, source, from, count, dir, free_space);
56654 +
56655 + if (dir == SHIFT_RIGHT) {
56656 +		/* FIXME-VS: this looks unnecessary. update_item_key was
56657 +		   already called by the copy_units method */
56658 + reiser4_key split_key;
56659 +
56660 + assert("nikita-1469", target->unit_pos == 0);
56661 +
56662 + unit_key_by_coord(target, &split_key);
56663 + node_plugin_by_coord(target)->update_item_key(target,
56664 + &split_key, NULL);
56665 + }
56666 +}
56667 +
56668 +/* copy part of @shift->real_stop.node starting either from its beginning or
56669 + from its end and ending at @shift->real_stop to either the end or the
56670 + beginning of @shift->target */
56671 +static void copy(struct shift_params *shift)
56672 +{
56673 + node40_header *nh;
56674 + coord_t from;
56675 + coord_t to;
56676 + item_header40 *from_ih, *to_ih;
56677 + int free_space_start;
56678 + int new_items;
56679 + unsigned old_items;
56680 + int old_offset;
56681 + unsigned i;
56682 +
56683 + nh = node40_node_header(shift->target);
56684 + free_space_start = nh40_get_free_space_start(nh);
56685 + old_items = nh40_get_num_items(nh);
56686 + new_items = shift->entire + (shift->part_units ? 1 : 0);
56687 + assert("vs-185",
56688 + shift->shift_bytes ==
56689 + shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
56690 +
56691 + from = shift->wish_stop;
56692 +
56693 + coord_init_first_unit(&to, shift->target);
56694 +
56695 + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
56696 + hence to.between is set to EMPTY_NODE above. Looks like we want it
56697 + to be AT_UNIT.
56698 +
56699 + Oh, wonders of ->betweeness...
56700 +
56701 + */
56702 + to.between = AT_UNIT;
56703 +
56704 + if (shift->pend == SHIFT_LEFT) {
56705 + /* copying to left */
56706 +
56707 + coord_set_item_pos(&from, 0);
56708 + from_ih = node40_ih_at(from.node, 0);
56709 +
56710 + coord_set_item_pos(&to,
56711 + node40_num_of_items_internal(to.node) - 1);
56712 + if (shift->merging_units) {
56713 + /* expand last item, so that plugin methods will see
56714 + correct data */
56715 + free_space_start += shift->merging_bytes;
56716 + nh40_set_free_space_start(nh,
56717 + (unsigned)free_space_start);
56718 + nh40_set_free_space(nh,
56719 + nh40_get_free_space(nh) -
56720 + shift->merging_bytes);
56721 +
56722 + /* appending last item of @target */
56723 + copy_units(&to, &from, 0, /* starting from 0-th unit */
56724 + shift->merging_units, SHIFT_LEFT,
56725 + shift->merging_bytes);
56726 + coord_inc_item_pos(&from);
56727 + from_ih--;
56728 + coord_inc_item_pos(&to);
56729 + }
56730 +
56731 + to_ih = node40_ih_at(shift->target, old_items);
56732 + if (shift->entire) {
56733 + /* copy @entire items entirely */
56734 +
56735 + /* copy item headers */
56736 + memcpy(to_ih - shift->entire + 1,
56737 + from_ih - shift->entire + 1,
56738 + shift->entire * sizeof(item_header40));
56739 + /* update item header offset */
56740 + old_offset = ih40_get_offset(from_ih);
56741 +			/* AUDIT: Looks like if we calculated old_offset + free_space_start here instead of just old_offset, we could perform one "add" operation fewer per iteration */
56742 + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
56743 + ih40_set_offset(to_ih,
56744 + ih40_get_offset(from_ih) -
56745 + old_offset + free_space_start);
56746 +
56747 + /* copy item bodies */
56748 + memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
56749 + shift->entire_bytes);
56750 +
56751 + coord_add_item_pos(&from, (int)shift->entire);
56752 + coord_add_item_pos(&to, (int)shift->entire);
56753 + }
56754 +
56755 + nh40_set_free_space_start(nh,
56756 + free_space_start +
56757 + shift->shift_bytes -
56758 + shift->merging_bytes);
56759 + nh40_set_free_space(nh,
56760 + nh40_get_free_space(nh) -
56761 + (shift->shift_bytes - shift->merging_bytes +
56762 + sizeof(item_header40) * new_items));
56763 +
56764 + /* update node header */
56765 + node40_set_num_items(shift->target, nh, old_items + new_items);
56766 + assert("vs-170",
56767 + nh40_get_free_space(nh) < znode_size(shift->target));
56768 +
56769 + if (shift->part_units) {
56770 + /* copy heading part (@part units) of @source item as
56771 + a new item into @target->node */
56772 +
56773 + /* copy item header of partially copied item */
56774 + coord_set_item_pos(&to,
56775 + node40_num_of_items_internal(to.node)
56776 + - 1);
56777 + memcpy(to_ih, from_ih, sizeof(item_header40));
56778 + ih40_set_offset(to_ih,
56779 + nh40_get_free_space_start(nh) -
56780 + shift->part_bytes);
56781 + if (item_plugin_by_coord(&to)->b.init)
56782 + item_plugin_by_coord(&to)->b.init(&to, &from,
56783 + NULL);
56784 + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
56785 + shift->part_bytes);
56786 + }
56787 +
56788 + } else {
56789 + /* copying to right */
56790 +
56791 + coord_set_item_pos(&from,
56792 + node40_num_of_items_internal(from.node) - 1);
56793 + from_ih = node40_ih_at_coord(&from);
56794 +
56795 + coord_set_item_pos(&to, 0);
56796 +
56797 + /* prepare space for new items */
56798 + memmove(zdata(to.node) + sizeof(node40_header) +
56799 + shift->shift_bytes,
56800 + zdata(to.node) + sizeof(node40_header),
56801 + free_space_start - sizeof(node40_header));
56802 + /* update item headers of moved items */
56803 + to_ih = node40_ih_at(to.node, 0);
56804 + /* first item gets @merging_bytes longer. free space appears
56805 + at its beginning */
56806 + if (!node_is_empty(to.node))
56807 + ih40_set_offset(to_ih,
56808 + ih40_get_offset(to_ih) +
56809 + shift->shift_bytes -
56810 + shift->merging_bytes);
56811 +
56812 + for (i = 1; i < old_items; i++)
56813 + ih40_set_offset(to_ih - i,
56814 + ih40_get_offset(to_ih - i) +
56815 + shift->shift_bytes);
56816 +
56817 + /* move item headers to make space for new items */
56818 + memmove(to_ih - old_items + 1 - new_items,
56819 + to_ih - old_items + 1,
56820 + sizeof(item_header40) * old_items);
56821 + to_ih -= (new_items - 1);
56822 +
56823 + nh40_set_free_space_start(nh,
56824 + free_space_start +
56825 + shift->shift_bytes);
56826 + nh40_set_free_space(nh,
56827 + nh40_get_free_space(nh) -
56828 + (shift->shift_bytes +
56829 + sizeof(item_header40) * new_items));
56830 +
56831 + /* update node header */
56832 + node40_set_num_items(shift->target, nh, old_items + new_items);
56833 + assert("vs-170",
56834 + nh40_get_free_space(nh) < znode_size(shift->target));
56835 +
56836 + if (shift->merging_units) {
56837 + coord_add_item_pos(&to, new_items);
56838 + to.unit_pos = 0;
56839 + to.between = AT_UNIT;
56840 + /* prepend first item of @to */
56841 + copy_units(&to, &from,
56842 + coord_last_unit_pos(&from) -
56843 + shift->merging_units + 1,
56844 + shift->merging_units, SHIFT_RIGHT,
56845 + shift->merging_bytes);
56846 + coord_dec_item_pos(&from);
56847 + from_ih++;
56848 + }
56849 +
56850 + if (shift->entire) {
56851 + /* copy @entire items entirely */
56852 +
56853 + /* copy item headers */
56854 + memcpy(to_ih, from_ih,
56855 + shift->entire * sizeof(item_header40));
56856 +
56857 + /* update item header offset */
56858 + old_offset =
56859 + ih40_get_offset(from_ih + shift->entire - 1);
56860 +			/* AUDIT: the old_offset + sizeof (node40_header) + shift->part_bytes calculation can be moved out of the loop. */
56861 + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
56862 + ih40_set_offset(to_ih,
56863 + ih40_get_offset(from_ih) -
56864 + old_offset +
56865 + sizeof(node40_header) +
56866 + shift->part_bytes);
56867 + /* copy item bodies */
56868 + coord_add_item_pos(&from, -(int)(shift->entire - 1));
56869 + memcpy(zdata(to.node) + sizeof(node40_header) +
56870 + shift->part_bytes, item_by_coord_node40(&from),
56871 + shift->entire_bytes);
56872 + coord_dec_item_pos(&from);
56873 + }
56874 +
56875 + if (shift->part_units) {
56876 + coord_set_item_pos(&to, 0);
56877 + to.unit_pos = 0;
56878 + to.between = AT_UNIT;
56879 + /* copy heading part (@part units) of @source item as
56880 + a new item into @target->node */
56881 +
56882 + /* copy item header of partially copied item */
56883 + memcpy(to_ih, from_ih, sizeof(item_header40));
56884 + ih40_set_offset(to_ih, sizeof(node40_header));
56885 + if (item_plugin_by_coord(&to)->b.init)
56886 + item_plugin_by_coord(&to)->b.init(&to, &from,
56887 + NULL);
56888 + copy_units(&to, &from,
56889 + coord_last_unit_pos(&from) -
56890 + shift->part_units + 1, shift->part_units,
56891 + SHIFT_RIGHT, shift->part_bytes);
56892 + }
56893 + }
56894 +}
56895 +
56896 +/* remove everything either before or after @shift->real_stop. The number
56897 +   of items removed completely is returned */
56898 +static int delete_copied(struct shift_params *shift)
56899 +{
56900 + coord_t from;
56901 + coord_t to;
56902 + struct carry_cut_data cdata;
56903 +
56904 + if (shift->pend == SHIFT_LEFT) {
56905 +		/* we were shifting to left, remove everything from the
56906 +		   beginning of @shift->real_stop.node up to
56907 +		   @shift->real_stop */
56908 + coord_init_first_unit(&from, shift->real_stop.node);
56909 + to = shift->real_stop;
56910 +
56911 + /* store old coordinate of unit which will be first after
56912 + shift to left */
56913 + shift->u.future_first = to;
56914 + coord_next_unit(&shift->u.future_first);
56915 + } else {
56916 +		/* we were shifting to right, remove everything from
56917 +		   @shift->real_stop up to the end of
56918 +		   @shift->real_stop.node */
56919 + from = shift->real_stop;
56920 + coord_init_last_unit(&to, from.node);
56921 +
56922 + /* store old coordinate of unit which will be last after
56923 + shift to right */
56924 + shift->u.future_last = from;
56925 + coord_prev_unit(&shift->u.future_last);
56926 + }
56927 +
56928 + cdata.params.from = &from;
56929 + cdata.params.to = &to;
56930 + cdata.params.from_key = NULL;
56931 + cdata.params.to_key = NULL;
56932 + cdata.params.smallest_removed = NULL;
56933 + return cut_node40(&cdata, NULL);
56934 +}
56935 +
56936 +/* something was moved between @left and @right. Add a carry operation to
56937 +   the @info list so that carry updates the delimiting key between them */
56938 +static int
56939 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
56940 +{
56941 + carry_op *op;
56942 + carry_node *cn;
56943 +
56944 + if (info == NULL)
56945 + /* nowhere to send operation to. */
56946 + return 0;
56947 +
56948 + if (!should_notify_parent(right))
56949 + return 0;
56950 +
56951 + op = node_post_carry(info, COP_UPDATE, right, 1);
56952 + if (IS_ERR(op) || op == NULL)
56953 + return op ? PTR_ERR(op) : -EIO;
56954 +
56955 + if (left != NULL) {
56956 + carry_node *reference;
56957 +
56958 + if (info->doing)
56959 + reference = insert_carry_node(info->doing,
56960 + info->todo, left);
56961 + else
56962 + reference = op->node;
56963 + assert("nikita-2992", reference != NULL);
56964 + cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
56965 + if (IS_ERR(cn))
56966 + return PTR_ERR(cn);
56967 + cn->parent = 1;
56968 + cn->node = left;
56969 + if (ZF_ISSET(left, JNODE_ORPHAN))
56970 + cn->left_before = 1;
56971 + op->u.update.left = cn;
56972 + } else
56973 + op->u.update.left = NULL;
56974 + return 0;
56975 +}
56976 +
56977 +/* plugin->u.node.prepare_removal
56978 +   to delete a pointer to @empty from the tree, add the corresponding carry
56979 +   operation (delete) to the @info list */
56980 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
56981 +{
56982 + carry_op *op;
56983 + reiser4_tree *tree;
56984 +
56985 + if (!should_notify_parent(empty))
56986 + return 0;
56987 + /* already on a road to Styx */
56988 + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
56989 + return 0;
56990 + op = node_post_carry(info, COP_DELETE, empty, 1);
56991 + if (IS_ERR(op) || op == NULL)
56992 + return RETERR(op ? PTR_ERR(op) : -EIO);
56993 +
56994 + op->u.delete.child = NULL;
56995 + op->u.delete.flags = 0;
56996 +
56997 + /* fare thee well */
56998 + tree = znode_get_tree(empty);
56999 + read_lock_tree(tree);
57000 + write_lock_dk(tree);
57001 + znode_set_ld_key(empty, znode_get_rd_key(empty));
57002 + if (znode_is_left_connected(empty) && empty->left)
57003 + znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57004 + write_unlock_dk(tree);
57005 + read_unlock_tree(tree);
57006 +
57007 + ZF_SET(empty, JNODE_HEARD_BANSHEE);
57008 + return 0;
57009 +}
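+/*
+ * After the key update above, the empty node's delimiting keys satisfy
+ * ld_key == rd_key, i.e. it delimits an empty key range, and the left
+ * neighbor's rd_key is extended over the range the empty node used to
+ * own, so lookups can pass over the node that is about to be deleted.
+ */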
57010 +
57011 +/* something was shifted from @insert_coord->node to @shift->target, update
57012 +   @insert_coord accordingly */
57013 +static void
57014 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
57015 + int including_insert_coord)
57016 +{
57017 + /* item plugin was invalidated by shifting */
57018 + coord_clear_iplug(insert_coord);
57019 +
57020 + if (node_is_empty(shift->wish_stop.node)) {
57021 + assert("vs-242", shift->everything);
57022 + if (including_insert_coord) {
57023 + if (shift->pend == SHIFT_RIGHT) {
57024 + /* set @insert_coord before first unit of
57025 + @shift->target node */
57026 + coord_init_before_first_item(insert_coord,
57027 + shift->target);
57028 + } else {
57029 + /* set @insert_coord after last in target node */
57030 + coord_init_after_last_item(insert_coord,
57031 + shift->target);
57032 + }
57033 + } else {
57034 + /* set @insert_coord inside of empty node. There is
57035 + only one possible coord within an empty
57036 + node. init_first_unit will set that coord */
57037 + coord_init_first_unit(insert_coord,
57038 + shift->wish_stop.node);
57039 + }
57040 + return;
57041 + }
57042 +
57043 + if (shift->pend == SHIFT_RIGHT) {
57044 + /* there was shifting to right */
57045 + if (shift->everything) {
57046 + /* everything wanted was shifted */
57047 + if (including_insert_coord) {
57048 + /* @insert_coord is set before first unit of
57049 + @to node */
57050 + coord_init_before_first_item(insert_coord,
57051 + shift->target);
57052 + insert_coord->between = BEFORE_UNIT;
57053 + } else {
57054 + /* @insert_coord is set after last unit of
57055 + @insert->node */
57056 + coord_init_last_unit(insert_coord,
57057 + shift->wish_stop.node);
57058 + insert_coord->between = AFTER_UNIT;
57059 + }
57060 + }
57061 + return;
57062 + }
57063 +
57064 + /* there was shifting to left */
57065 + if (shift->everything) {
57066 + /* everything wanted was shifted */
57067 + if (including_insert_coord) {
57068 + /* @insert_coord is set after last unit in @to node */
57069 + coord_init_after_last_item(insert_coord, shift->target);
57070 + } else {
57071 + /* @insert_coord is set before first unit in the same
57072 + node */
57073 + coord_init_before_first_item(insert_coord,
57074 + shift->wish_stop.node);
57075 + }
57076 + return;
57077 + }
57078 +
57079 + /* FIXME-VS: the code below is complicated because with between ==
57080 + AFTER_ITEM unit_pos is set to 0 */
57081 +
57082 + if (!removed) {
57083 + /* no items were shifted entirely */
57084 + assert("vs-195", shift->merging_units == 0
57085 + || shift->part_units == 0);
57086 +
57087 + if (shift->real_stop.item_pos == insert_coord->item_pos) {
57088 + if (shift->merging_units) {
57089 + if (insert_coord->between == AFTER_UNIT) {
57090 + assert("nikita-1441",
57091 + insert_coord->unit_pos >=
57092 + shift->merging_units);
57093 + insert_coord->unit_pos -=
57094 + shift->merging_units;
57095 + } else if (insert_coord->between == BEFORE_UNIT) {
57096 + assert("nikita-2090",
57097 + insert_coord->unit_pos >
57098 + shift->merging_units);
57099 + insert_coord->unit_pos -=
57100 + shift->merging_units;
57101 + }
57102 +
57103 + assert("nikita-2083",
57104 + insert_coord->unit_pos + 1);
57105 + } else {
57106 + if (insert_coord->between == AFTER_UNIT) {
57107 + assert("nikita-1442",
57108 + insert_coord->unit_pos >=
57109 + shift->part_units);
57110 + insert_coord->unit_pos -=
57111 + shift->part_units;
57112 + } else if (insert_coord->between == BEFORE_UNIT) {
57113 + assert("nikita-2089",
57114 + insert_coord->unit_pos >
57115 + shift->part_units);
57116 + insert_coord->unit_pos -=
57117 + shift->part_units;
57118 + }
57119 +
57120 + assert("nikita-2084",
57121 + insert_coord->unit_pos + 1);
57122 + }
57123 + }
57124 + return;
57125 + }
57126 +
57127 +	/* we shifted to left and there was not enough space for everything */
57128 + switch (insert_coord->between) {
57129 + case AFTER_UNIT:
57130 + case BEFORE_UNIT:
57131 + if (shift->real_stop.item_pos == insert_coord->item_pos)
57132 + insert_coord->unit_pos -= shift->part_units;
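+		/* fall through */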
57133 + case AFTER_ITEM:
57134 + coord_add_item_pos(insert_coord, -removed);
57135 + break;
57136 + default:
57137 + impossible("nikita-2087", "not ready");
57138 + }
57139 + assert("nikita-2085", insert_coord->unit_pos + 1);
57140 +}
57141 +
57142 +static int call_shift_hooks(struct shift_params *shift)
57143 +{
57144 + unsigned i, shifted;
57145 + coord_t coord;
57146 + item_plugin *iplug;
57147 +
57148 + assert("vs-275", !node_is_empty(shift->target));
57149 +
57150 + /* number of items shift touches */
57151 + shifted =
57152 + shift->entire + (shift->merging_units ? 1 : 0) +
57153 + (shift->part_units ? 1 : 0);
57154 +
57155 + if (shift->pend == SHIFT_LEFT) {
57156 + /* moved items are at the end */
57157 + coord_init_last_unit(&coord, shift->target);
57158 + coord.unit_pos = 0;
57159 +
57160 + assert("vs-279", shift->pend == 1);
57161 + for (i = 0; i < shifted; i++) {
57162 + unsigned from, count;
57163 +
57164 + iplug = item_plugin_by_coord(&coord);
57165 + if (i == 0 && shift->part_units) {
57166 + assert("vs-277",
57167 + coord_num_units(&coord) ==
57168 + shift->part_units);
57169 + count = shift->part_units;
57170 + from = 0;
57171 + } else if (i == shifted - 1 && shift->merging_units) {
57172 + count = shift->merging_units;
57173 + from = coord_num_units(&coord) - count;
57174 + } else {
57175 + count = coord_num_units(&coord);
57176 + from = 0;
57177 + }
57178 +
57179 + if (iplug->b.shift_hook) {
57180 + iplug->b.shift_hook(&coord, from, count,
57181 + shift->wish_stop.node);
57182 + }
57183 + coord_add_item_pos(&coord, -shift->pend);
57184 + }
57185 + } else {
57186 + /* moved items are at the beginning */
57187 + coord_init_first_unit(&coord, shift->target);
57188 +
57189 + assert("vs-278", shift->pend == -1);
57190 + for (i = 0; i < shifted; i++) {
57191 + unsigned from, count;
57192 +
57193 + iplug = item_plugin_by_coord(&coord);
57194 + if (i == 0 && shift->part_units) {
57195 + assert("vs-277",
57196 + coord_num_units(&coord) ==
57197 + shift->part_units);
57198 + count = coord_num_units(&coord);
57199 + from = 0;
57200 + } else if (i == shifted - 1 && shift->merging_units) {
57201 + count = shift->merging_units;
57202 + from = 0;
57203 + } else {
57204 + count = coord_num_units(&coord);
57205 + from = 0;
57206 + }
57207 +
57208 + if (iplug->b.shift_hook) {
57209 + iplug->b.shift_hook(&coord, from, count,
57210 + shift->wish_stop.node);
57211 + }
57212 + coord_add_item_pos(&coord, -shift->pend);
57213 + }
57214 + }
57215 +
57216 + return 0;
57217 +}
57218 +
57219 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
57220 +static int
57221 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
57222 +{
57223 + assert("vs-944", shift->real_stop.node == old->node);
57224 +
57225 + if (shift->real_stop.item_pos < old->item_pos)
57226 + return 0;
57227 + if (shift->real_stop.item_pos == old->item_pos) {
57228 + if (shift->real_stop.unit_pos < old->unit_pos)
57229 + return 0;
57230 + }
57231 + return 1;
57232 +}
57233 +
57234 +/* the shift to the right is complete. Return 1 if unit @old was moved to the
57235 +   right neighbor */
57236 +static int
57237 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
57238 +{
57239 + assert("vs-944", shift->real_stop.node == old->node);
57240 +
57241 + if (shift->real_stop.item_pos > old->item_pos)
57242 + return 0;
57243 + if (shift->real_stop.item_pos == old->item_pos) {
57244 + if (shift->real_stop.unit_pos > old->unit_pos)
57245 + return 0;
57246 + }
57247 + return 1;
57248 +}
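
A note on the two predicates above: both are the same lexicographic comparison of
(item_pos, unit_pos) against shift->real_stop, the last unit that actually moved.
A standalone sketch of that comparison, with simplified stand-in types rather than
the reiser4 coord_t (hypothetical names, for illustration only):

	#include <stdio.h>

	struct pos { int item_pos; int unit_pos; };

	/* lexicographic compare of (item_pos, unit_pos):
	   negative if a < b, zero if equal, positive if a > b */
	static int pos_cmp(struct pos a, struct pos b)
	{
		if (a.item_pos != b.item_pos)
			return a.item_pos < b.item_pos ? -1 : 1;
		if (a.unit_pos != b.unit_pos)
			return a.unit_pos < b.unit_pos ? -1 : 1;
		return 0;
	}

	int main(void)
	{
		struct pos real_stop = { 3, 2 };	/* last unit that moved */
		struct pos old = { 3, 1 };

		/* unit_moved_left():  @old moved iff old <= real_stop */
		printf("moved left: %d\n", pos_cmp(old, real_stop) <= 0);
		/* unit_moved_right(): @old moved iff old >= real_stop */
		printf("moved right: %d\n", pos_cmp(old, real_stop) >= 0);
		return 0;
	}
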
57249 +
57250 +/* coord @old was set in the node from which the shift was performed. What was
57251 +   shifted is stored in @shift. Update @old to match the performed shift */
57252 +static coord_t *adjust_coord2(const struct shift_params *shift,
57253 + const coord_t * old, coord_t * new)
57254 +{
57255 + coord_clear_iplug(new);
57256 + new->between = old->between;
57257 +
57259 + if (old->node == shift->target) {
57260 + if (shift->pend == SHIFT_LEFT) {
57261 + /* coord which is set inside of left neighbor does not
57262 + change during shift to left */
57263 + coord_dup(new, old);
57264 + return new;
57265 + }
57266 + new->node = old->node;
57267 + coord_set_item_pos(new,
57268 + old->item_pos + shift->entire +
57269 + (shift->part_units ? 1 : 0));
57270 + new->unit_pos = old->unit_pos;
57271 + if (old->item_pos == 0 && shift->merging_units)
57272 + new->unit_pos += shift->merging_units;
57273 + return new;
57274 + }
57275 +
57276 + assert("vs-977", old->node == shift->wish_stop.node);
57277 + if (shift->pend == SHIFT_LEFT) {
57278 + if (unit_moved_left(shift, old)) {
57279 + /* unit @old moved to left neighbor. Calculate its
57280 + coordinate there */
57281 + new->node = shift->target;
57282 + coord_set_item_pos(new,
57283 + node_num_items(shift->target) -
57284 + shift->entire -
57285 + (shift->part_units ? 1 : 0) +
57286 + old->item_pos);
57287 +
57288 + new->unit_pos = old->unit_pos;
57289 + if (shift->merging_units) {
57290 + coord_dec_item_pos(new);
57291 + if (old->item_pos == 0) {
57292 + /* unit_pos only changes if item got
57293 + merged */
57294 + new->unit_pos =
57295 + coord_num_units(new) -
57296 + (shift->merging_units -
57297 + old->unit_pos);
57298 + }
57299 + }
57300 + } else {
57301 + /* unit @old did not move to left neighbor.
57302 +
57303 + Use _nocheck, because @old is outside of its node.
57304 + */
57305 + coord_dup_nocheck(new, old);
57306 + coord_add_item_pos(new,
57307 + -shift->u.future_first.item_pos);
57308 + if (new->item_pos == 0)
57309 + new->unit_pos -= shift->u.future_first.unit_pos;
57310 + }
57311 + } else {
57312 + if (unit_moved_right(shift, old)) {
57313 + /* unit @old moved to right neighbor */
57314 + new->node = shift->target;
57315 + coord_set_item_pos(new,
57316 + old->item_pos -
57317 + shift->real_stop.item_pos);
57318 + if (new->item_pos == 0) {
57319 + /* unit @old might change unit pos */
57320 +				new->unit_pos =
57321 +					old->unit_pos -
57322 +					shift->real_stop.unit_pos;
57323 + }
57324 + } else {
57325 + /* unit @old did not move to right neighbor, therefore
57326 + it did not change */
57327 + coord_dup(new, old);
57328 + }
57329 + }
57330 + coord_set_iplug(new, item_plugin_by_coord(new));
57331 + return new;
57332 +}
57333 +
57334 +/* this is called when shift is completed (something of source node is copied
57335 + to target and deleted in source) to update all taps set in current
57336 + context */
57337 +static void update_taps(const struct shift_params *shift)
57338 +{
57339 + tap_t *tap;
57340 + coord_t new;
57341 +
57342 + for_all_taps(tap) {
57343 + /* update only taps set to nodes participating in shift */
57344 + if (tap->coord->node == shift->wish_stop.node
57345 + || tap->coord->node == shift->target)
57346 + tap_to_coord(tap,
57347 + adjust_coord2(shift, tap->coord, &new));
57348 + }
57349 +}
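
update_taps() above is an instance of a general pattern: after a container
mutation, every outstanding cursor registered with the current context is
remapped through adjust_coord2(). A toy rendering of the same remapping for a
left shift, with hypothetical names standing in for the reiser4 tap machinery:

	#include <stdio.h>

	/* a "tap" reduced to a cursor into one of two arrays:
	   node 0 is the shift source, node 1 is the shift target */
	struct cursor { int node; int pos; };

	/* the first @moved elements of the source were appended to a target
	   that already held @old_target_len elements; remap one cursor */
	static void remap(struct cursor *c, int moved, int old_target_len)
	{
		if (c->node != 0)
			return;	/* cursors in the target survive a left shift */
		if (c->pos < moved) {
			c->node = 1;			/* element went to target */
			c->pos += old_target_len;
		} else {
			c->pos -= moved;		/* element stayed, slid down */
		}
	}

	int main(void)
	{
		struct cursor taps[2] = { { 0, 1 }, { 0, 5 } };
		int i;

		for (i = 0; i < 2; i++) {
			remap(&taps[i], 3, 4);
			printf("tap %d -> node %d pos %d\n",
			       i, taps[i].node, taps[i].pos);
		}
		return 0;
	}
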
57350 +
57351 +#if REISER4_DEBUG
57352 +
57353 +struct shift_check {
57354 + reiser4_key key;
57355 + __u16 plugin_id;
57356 + union {
57357 + __u64 bytes;
57358 + __u64 entries;
57359 + void *unused;
57360 + } u;
57361 +};
57362 +
57363 +void *shift_check_prepare(const znode * left, const znode * right)
57364 +{
57365 + pos_in_node_t i, nr_items;
57366 + int mergeable;
57367 + struct shift_check *data;
57368 + item_header40 *ih;
57369 +
57370 + if (node_is_empty(left) || node_is_empty(right))
57371 + mergeable = 0;
57372 + else {
57373 + coord_t l, r;
57374 +
57375 + coord_init_last_unit(&l, left);
57376 + coord_init_first_unit(&r, right);
57377 + mergeable = are_items_mergeable(&l, &r);
57378 + }
57379 + nr_items =
57380 + node40_num_of_items_internal(left) +
57381 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57382 + data =
57383 + kmalloc(sizeof(struct shift_check) * nr_items,
57384 + reiser4_ctx_gfp_mask_get());
57385 + if (data != NULL) {
57386 + coord_t coord;
57387 + pos_in_node_t item_pos;
57388 +
57389 + coord_init_first_unit(&coord, left);
57390 + i = 0;
57391 +
57392 + for (item_pos = 0;
57393 + item_pos < node40_num_of_items_internal(left);
57394 + item_pos++) {
57395 +
57396 + coord_set_item_pos(&coord, item_pos);
57397 + ih = node40_ih_at_coord(&coord);
57398 +
57399 + data[i].key = ih->key;
57400 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57401 + switch (data[i].plugin_id) {
57402 + case CTAIL_ID:
57403 + case FORMATTING_ID:
57404 + data[i].u.bytes = coord_num_units(&coord);
57405 + break;
57406 + case EXTENT_POINTER_ID:
57407 + data[i].u.bytes =
57408 + reiser4_extent_size(&coord,
57409 + coord_num_units(&coord));
57410 + break;
57411 + case COMPOUND_DIR_ID:
57412 + data[i].u.entries = coord_num_units(&coord);
57413 + break;
57414 + default:
57415 + data[i].u.unused = NULL;
57416 + break;
57417 + }
57418 + i++;
57419 + }
57420 +
57421 + coord_init_first_unit(&coord, right);
57422 +
57423 + if (mergeable) {
57424 + assert("vs-1609", i != 0);
57425 +
57426 + ih = node40_ih_at_coord(&coord);
57427 +
57428 + assert("vs-1589",
57429 + data[i - 1].plugin_id ==
57430 + le16_to_cpu(get_unaligned(&ih->plugin_id)));
57431 + switch (data[i - 1].plugin_id) {
57432 + case CTAIL_ID:
57433 + case FORMATTING_ID:
57434 + data[i - 1].u.bytes += coord_num_units(&coord);
57435 + break;
57436 + case EXTENT_POINTER_ID:
57437 + data[i - 1].u.bytes +=
57438 + reiser4_extent_size(&coord,
57439 + coord_num_units(&coord));
57440 + break;
57441 + case COMPOUND_DIR_ID:
57442 + data[i - 1].u.entries +=
57443 + coord_num_units(&coord);
57444 + break;
57445 + default:
57446 + impossible("vs-1605", "wrong mergeable item");
57447 + break;
57448 + }
57449 + item_pos = 1;
57450 + } else
57451 + item_pos = 0;
57452 + for (; item_pos < node40_num_of_items_internal(right);
57453 + item_pos++) {
57454 +
57455 + assert("vs-1604", i < nr_items);
57456 + coord_set_item_pos(&coord, item_pos);
57457 + ih = node40_ih_at_coord(&coord);
57458 +
57459 + data[i].key = ih->key;
57460 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57461 + switch (data[i].plugin_id) {
57462 + case CTAIL_ID:
57463 + case FORMATTING_ID:
57464 + data[i].u.bytes = coord_num_units(&coord);
57465 + break;
57466 + case EXTENT_POINTER_ID:
57467 + data[i].u.bytes =
57468 + reiser4_extent_size(&coord,
57469 + coord_num_units(&coord));
57470 + break;
57471 + case COMPOUND_DIR_ID:
57472 + data[i].u.entries = coord_num_units(&coord);
57473 + break;
57474 + default:
57475 + data[i].u.unused = NULL;
57476 + break;
57477 + }
57478 + i++;
57479 + }
57480 + assert("vs-1606", i == nr_items);
57481 + }
57482 + return data;
57483 +}
57484 +
57485 +void shift_check(void *vp, const znode * left, const znode * right)
57486 +{
57487 + pos_in_node_t i, nr_items;
57488 + coord_t coord;
57489 + __u64 last_bytes;
57490 + int mergeable;
57491 + item_header40 *ih;
57492 + pos_in_node_t item_pos;
57493 + struct shift_check *data;
57494 +
57495 + data = (struct shift_check *)vp;
57496 +
57497 + if (data == NULL)
57498 + return;
57499 +
57500 + if (node_is_empty(left) || node_is_empty(right))
57501 + mergeable = 0;
57502 + else {
57503 + coord_t l, r;
57504 +
57505 + coord_init_last_unit(&l, left);
57506 + coord_init_first_unit(&r, right);
57507 + mergeable = are_items_mergeable(&l, &r);
57508 + }
57509 +
57510 + nr_items =
57511 + node40_num_of_items_internal(left) +
57512 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57513 +
57514 + i = 0;
57515 + last_bytes = 0;
57516 +
57517 + coord_init_first_unit(&coord, left);
57518 +
57519 + for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
57520 + item_pos++) {
57521 +
57522 + coord_set_item_pos(&coord, item_pos);
57523 + ih = node40_ih_at_coord(&coord);
57524 +
57525 + assert("vs-1611", i == item_pos);
57526 + assert("vs-1590", keyeq(&ih->key, &data[i].key));
57527 + assert("vs-1591",
57528 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57529 + if ((i < (node40_num_of_items_internal(left) - 1))
57530 + || !mergeable) {
57531 + switch (data[i].plugin_id) {
57532 + case CTAIL_ID:
57533 + case FORMATTING_ID:
57534 + assert("vs-1592",
57535 + data[i].u.bytes ==
57536 + coord_num_units(&coord));
57537 + break;
57538 + case EXTENT_POINTER_ID:
57539 + assert("vs-1593",
57540 + data[i].u.bytes ==
57541 + reiser4_extent_size(&coord,
57542 + coord_num_units
57543 + (&coord)));
57544 + break;
57545 + case COMPOUND_DIR_ID:
57546 + assert("vs-1594",
57547 + data[i].u.entries ==
57548 + coord_num_units(&coord));
57549 + break;
57550 + default:
57551 + break;
57552 + }
57553 + }
57554 + if (item_pos == (node40_num_of_items_internal(left) - 1)
57555 + && mergeable) {
57556 + switch (data[i].plugin_id) {
57557 + case CTAIL_ID:
57558 + case FORMATTING_ID:
57559 + last_bytes = coord_num_units(&coord);
57560 + break;
57561 + case EXTENT_POINTER_ID:
57562 + last_bytes =
57563 + reiser4_extent_size(&coord,
57564 + coord_num_units(&coord));
57565 + break;
57566 + case COMPOUND_DIR_ID:
57567 + last_bytes = coord_num_units(&coord);
57568 + break;
57569 + default:
57570 + impossible("vs-1595", "wrong mergeable item");
57571 + break;
57572 + }
57573 + }
57574 + i++;
57575 + }
57576 +
57577 + coord_init_first_unit(&coord, right);
57578 + if (mergeable) {
57579 + ih = node40_ih_at_coord(&coord);
57580 +
57581 + assert("vs-1589",
57582 + data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
57583 + assert("vs-1608", last_bytes != 0);
57584 + switch (data[i - 1].plugin_id) {
57585 + case CTAIL_ID:
57586 + case FORMATTING_ID:
57587 + assert("vs-1596",
57588 + data[i - 1].u.bytes ==
57589 + last_bytes + coord_num_units(&coord));
57590 + break;
57591 +
57592 + case EXTENT_POINTER_ID:
57593 + assert("vs-1597",
57594 + data[i - 1].u.bytes ==
57595 + last_bytes + reiser4_extent_size(&coord,
57596 + coord_num_units
57597 + (&coord)));
57598 + break;
57599 +
57600 + case COMPOUND_DIR_ID:
57601 + assert("vs-1598",
57602 + data[i - 1].u.bytes ==
57603 + last_bytes + coord_num_units(&coord));
57604 + break;
57605 + default:
57606 + impossible("vs-1599", "wrong mergeable item");
57607 + break;
57608 + }
57609 + item_pos = 1;
57610 + } else
57611 + item_pos = 0;
57612 +
57613 + for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
57614 +
57615 + coord_set_item_pos(&coord, item_pos);
57616 + ih = node40_ih_at_coord(&coord);
57617 +
57618 + assert("vs-1612", keyeq(&ih->key, &data[i].key));
57619 + assert("vs-1613",
57620 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57621 + switch (data[i].plugin_id) {
57622 + case CTAIL_ID:
57623 + case FORMATTING_ID:
57624 + assert("vs-1600",
57625 + data[i].u.bytes == coord_num_units(&coord));
57626 + break;
57627 + case EXTENT_POINTER_ID:
57628 + assert("vs-1601",
57629 + data[i].u.bytes ==
57630 + reiser4_extent_size(&coord,
57631 + coord_num_units
57632 + (&coord)));
57633 + break;
57634 + case COMPOUND_DIR_ID:
57635 + assert("vs-1602",
57636 + data[i].u.entries == coord_num_units(&coord));
57637 + break;
57638 + default:
57639 + break;
57640 + }
57641 + i++;
57642 + }
57643 +
57644 + assert("vs-1603", i == nr_items);
57645 + kfree(data);
57646 +}
57647 +
57648 +#endif
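
shift_check_prepare() and shift_check() above follow a classic debug pattern:
snapshot an invariant-relevant summary (here, per-item byte and entry totals
across both nodes) before a destructive operation, then assert that the summary
still holds afterwards. A self-contained sketch of the same snapshot-and-verify
pattern over toy data, with hypothetical names:

	#include <assert.h>
	#include <stdlib.h>

	/* snapshot: record the combined total of two buffers */
	static long *sum_prepare(const int *a, int na, const int *b, int nb)
	{
		long *snap = malloc(sizeof(*snap));
		int i;

		if (snap == NULL)
			return NULL;	/* like shift_check_prepare: degrade quietly */
		*snap = 0;
		for (i = 0; i < na; i++)
			*snap += a[i];
		for (i = 0; i < nb; i++)
			*snap += b[i];
		return snap;
	}

	/* verify: the operation may rearrange elements but must conserve the total */
	static void sum_check(long *snap, const int *a, int na, const int *b, int nb)
	{
		long now = 0;
		int i;

		if (snap == NULL)
			return;		/* nothing was recorded */
		for (i = 0; i < na; i++)
			now += a[i];
		for (i = 0; i < nb; i++)
			now += b[i];
		assert(now == *snap);
		free(snap);
	}

	int main(void)
	{
		int left[3] = { 1, 2, 3 }, right[2] = { 4, 5 };
		long *snap = sum_prepare(left, 3, right, 2);
		int t;

		/* a "shift" that must conserve content: swap the boundary elements */
		t = left[2]; left[2] = right[0]; right[0] = t;

		sum_check(snap, left, 3, right, 2);
		return 0;
	}
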
57649 +
57650 +/* plugin->u.node.shift
57651 + look for description of this method in plugin/node/node.h */
57652 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
57653 + deleted from the tree if this is set to 1 */
57654 + int including_stop_coord, carry_plugin_info * info)
57655 +{
57656 + struct shift_params shift;
57657 + int result;
57658 + znode *left, *right;
57659 + znode *source;
57660 + int target_empty;
57661 +
57662 + assert("nikita-2161", coord_check(from));
57663 +
57664 + memset(&shift, 0, sizeof(shift));
57665 + shift.pend = pend;
57666 + shift.wish_stop = *from;
57667 + shift.target = to;
57668 +
57669 + assert("nikita-1473", znode_is_write_locked(from->node));
57670 + assert("nikita-1474", znode_is_write_locked(to));
57671 +
57672 + source = from->node;
57673 +
57674 + /* set @shift.wish_stop to rightmost/leftmost unit among units we want
57675 + shifted */
57676 + if (pend == SHIFT_LEFT) {
57677 + result = coord_set_to_left(&shift.wish_stop);
57678 + left = to;
57679 + right = from->node;
57680 + } else {
57681 + result = coord_set_to_right(&shift.wish_stop);
57682 + left = from->node;
57683 + right = to;
57684 + }
57685 +
57686 + if (result) {
57687 + /* move insertion coord even if there is nothing to move */
57688 + if (including_stop_coord) {
57689 + /* move insertion coord (@from) */
57690 + if (pend == SHIFT_LEFT) {
57691 + /* after last item in target node */
57692 + coord_init_after_last_item(from, to);
57693 + } else {
57694 + /* before first item in target node */
57695 + coord_init_before_first_item(from, to);
57696 + }
57697 + }
57698 +
57699 + if (delete_child && node_is_empty(shift.wish_stop.node))
57700 + result =
57701 + prepare_removal_node40(shift.wish_stop.node, info);
57702 + else
57703 + result = 0;
57704 + /* there is nothing to shift */
57705 + assert("nikita-2078", coord_check(from));
57706 + return result;
57707 + }
57708 +
57709 + target_empty = node_is_empty(to);
57710 +
57711 +	/* when the first node plugin with item body compression is implemented,
57712 +	   this must be changed to call the node-specific plugin */
57713 +
57714 +	/* shift->stop_coord is updated to the last unit which will really be
57715 +	   shifted */
57716 + estimate_shift(&shift, get_current_context());
57717 + if (!shift.shift_bytes) {
57718 + /* we could not shift anything */
57719 + assert("nikita-2079", coord_check(from));
57720 + return 0;
57721 + }
57722 +
57723 + copy(&shift);
57724 +
57725 +	/* the result value of this is important: it is used by adjust_coord below */
57726 + result = delete_copied(&shift);
57727 +
57728 + assert("vs-1610", result >= 0);
57729 + assert("vs-1471",
57730 + ((reiser4_context *) current->journal_info)->magic ==
57731 + context_magic);
57732 +
57733 +	/* an item which has been moved from one node to another might want to do
57734 +	   something on that event. This can be done by the item's shift_hook
57735 +	   method, which is now called for every moved item */
57736 + call_shift_hooks(&shift);
57737 +
57738 + assert("vs-1472",
57739 + ((reiser4_context *) current->journal_info)->magic ==
57740 + context_magic);
57741 +
57742 + update_taps(&shift);
57743 +
57744 + assert("vs-1473",
57745 + ((reiser4_context *) current->journal_info)->magic ==
57746 + context_magic);
57747 +
57748 + /* adjust @from pointer in accordance with @including_stop_coord flag
57749 + and amount of data which was really shifted */
57750 + adjust_coord(from, &shift, result, including_stop_coord);
57751 +
57752 + if (target_empty)
57753 + /*
57754 + * items were shifted into empty node. Update delimiting key.
57755 + */
57756 + result = prepare_for_update(NULL, left, info);
57757 +
57758 + /* add update operation to @info, which is the list of operations to
57759 + be performed on a higher level */
57760 + result = prepare_for_update(left, right, info);
57761 + if (!result && node_is_empty(source) && delete_child) {
57762 +		/* the entire contents of @from->node were moved to @to and
57763 +		   @from->node has to be removed from the tree, so, on a higher
57764 +		   level, we will be removing the pointer to node @from->node */
57765 + result = prepare_removal_node40(source, info);
57766 + }
57767 + assert("nikita-2080", coord_check(from));
57768 + return result ? result : (int)shift.shift_bytes;
57769 +}
57770 +
57771 +/* plugin->u.node.fast_insert()
57772 + look for description of this method in plugin/node/node.h */
57773 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57774 +{
57775 + return 1;
57776 +}
57777 +
57778 +/* plugin->u.node.fast_paste()
57779 + look for description of this method in plugin/node/node.h */
57780 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57781 +{
57782 + return 1;
57783 +}
57784 +
57785 +/* plugin->u.node.fast_cut()
57786 + look for description of this method in plugin/node/node.h */
57787 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57788 +{
57789 + return 1;
57790 +}
57791 +
57792 +/* plugin->u.node.modify - not defined */
57793 +
57794 +/* plugin->u.node.max_item_size */
57795 +int max_item_size_node40(void)
57796 +{
57797 + return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
57798 + sizeof(item_header40);
57799 +}
57800 +
57801 +/* plugin->u.node.set_item_plugin */
57802 +int set_item_plugin_node40(coord_t *coord, item_id id)
57803 +{
57804 + item_header40 *ih;
57805 +
57806 + ih = node40_ih_at_coord(coord);
57807 + put_unaligned(cpu_to_le16(id), &ih->plugin_id);
57808 + coord->iplugid = id;
57809 + return 0;
57810 +}
57811 +
57812 +/*
57813 + Local variables:
57814 + c-indentation-style: "K&R"
57815 + mode-name: "LC"
57816 + c-basic-offset: 8
57817 + tab-width: 8
57818 + fill-column: 120
57819 + scroll-step: 1
57820 + End:
57821 +*/
57822 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node40.h linux-2.6.22/fs/reiser4/plugin/node/node40.h
57823 --- linux-2.6.22.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
57824 +++ linux-2.6.22/fs/reiser4/plugin/node/node40.h 2007-07-29 00:25:34.988725466 +0400
57825 @@ -0,0 +1,125 @@
57826 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57827 +
57828 +#if !defined( __REISER4_NODE40_H__ )
57829 +#define __REISER4_NODE40_H__
57830 +
57831 +#include "../../forward.h"
57832 +#include "../../dformat.h"
57833 +#include "node.h"
57834 +
57835 +#include <linux/types.h>
57836 +
57837 +/* format of node header for 40 node layouts. Keep bloat out of this struct. */
57838 +typedef struct node40_header {
57839 + /* identifier of node plugin. Must be located at the very beginning
57840 + of a node. */
57841 + common_node_header common_header; /* this is 16 bits */
57842 + /* number of items. Should be first element in the node header,
57843 + because we haven't yet finally decided whether it shouldn't go into
57844 + common_header.
57845 + */
57846 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
57847 + * node format at compile time, and it is this one, accesses to these fields do
57848 + * not go through a function-pointer dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
57849 + d16 nr_items;
57850 + /* free space in node measured in bytes */
57851 + d16 free_space;
57852 + /* offset to start of free space in node */
57853 + d16 free_space_start;
57854 + /* for reiser4_fsck. When information about what is a free
57855 + block is corrupted, and we try to recover everything even
57856 + if marked as freed, then old versions of data may
57857 + duplicate newer versions, and this field allows us to
57858 + restore the newer version. Also useful for when users
57859 + who don't have the new trashcan installed on their linux distro
57860 + delete the wrong files and send us desperate emails
57861 + offering $25 for them back. */
57862 +
57863 +	/* magic field we use to recognize formatted nodes. NIKITA-FIXME-HANS: improve this comment */
57864 + d32 magic;
57865 +	/* flushstamp is made of mk_id and write_counter. mk_id is an
57866 +	   id generated randomly at mkreiserfs time, so we can just
57867 +	   skip all nodes with a different mk_id. write_counter is a d64
57868 +	   incrementing counter of writes to disk. It is used for
57869 +	   choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was the field name changed but not the comment? */
57870 +
57871 + d32 mkfs_id;
57872 + d64 flush_id;
57873 + /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
57874 + and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
57875 + d16 flags;
57876 +
57877 + /* 1 is leaf level, 2 is twig level, root is the numerically
57878 + largest level */
57879 + d8 level;
57880 +
57881 + d8 pad;
57882 +} PACKED node40_header;
57883 +
57884 +/* item headers are not standard across all node layouts, pass
57885 + pos_in_node to functions instead */
57886 +typedef struct item_header40 {
57887 + /* key of item */
57888 + /* 0 */ reiser4_key key;
57889 + /* offset from start of a node measured in 8-byte chunks */
57890 + /* 24 */ d16 offset;
57891 + /* 26 */ d16 flags;
57892 + /* 28 */ d16 plugin_id;
57893 +} PACKED item_header40;
57894 +
57895 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
57896 +size_t free_space_node40(znode * node);
57897 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
57898 + lookup_bias bias, coord_t * coord);
57899 +int num_of_items_node40(const znode * node);
57900 +char *item_by_coord_node40(const coord_t * coord);
57901 +int length_by_coord_node40(const coord_t * coord);
57902 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
57903 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
57904 +size_t estimate_node40(znode * node);
57905 +int check_node40(const znode * node, __u32 flags, const char **error);
57906 +int parse_node40(znode * node);
57907 +int init_node40(znode * node);
57908 +#ifdef GUESS_EXISTS
57909 +int guess_node40(const znode * node);
57910 +#endif
57911 +void change_item_size_node40(coord_t * coord, int by);
57912 +int create_item_node40(coord_t * target, const reiser4_key * key,
57913 + reiser4_item_data * data, carry_plugin_info * info);
57914 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
57915 + carry_plugin_info * info);
57916 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
57917 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
57918 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
57919 + /* if @from->node becomes
57920 + empty - it will be deleted from
57921 + the tree if this is set to 1
57922 + */
57923 + int delete_child, int including_stop_coord,
57924 + carry_plugin_info * info);
57925 +
57926 +int fast_insert_node40(const coord_t * coord);
57927 +int fast_paste_node40(const coord_t * coord);
57928 +int fast_cut_node40(const coord_t * coord);
57929 +int max_item_size_node40(void);
57930 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
57931 +int set_item_plugin_node40(coord_t * coord, item_id id);
57932 +int shrink_item_node40(coord_t * coord, int delta);
57933 +
57934 +#if REISER4_DEBUG
57935 +void *shift_check_prepare(const znode *left, const znode *right);
57936 +void shift_check(void *vp, const znode *left, const znode *right);
57937 +#endif
57938 +
57939 +/* __REISER4_NODE40_H__ */
57940 +#endif
57941 +/*
57942 + Local variables:
57943 + c-indentation-style: "K&R"
57944 + mode-name: "LC"
57945 + c-basic-offset: 8
57946 + tab-width: 8
57947 + fill-column: 120
57948 + scroll-step: 1
57949 + End:
57950 +*/
57951 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node.c linux-2.6.22/fs/reiser4/plugin/node/node.c
57952 --- linux-2.6.22.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
57953 +++ linux-2.6.22/fs/reiser4/plugin/node/node.c 2007-07-29 00:25:34.988725466 +0400
57954 @@ -0,0 +1,131 @@
57955 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57956 +
57957 +/* Node plugin interface.
57958 +
57959 + Description: The tree provides the abstraction of flows, which it
57960 + internally fragments into items which it stores in nodes.
57961 +
57962 + A key_atom is a piece of data bound to a single key.
57963 +
57964 + For reasonable space efficiency to be achieved it is often
57965 + necessary to store key_atoms in the nodes in the form of items, where
57966 + an item is a sequence of key_atoms of the same or similar type. It is
57967 + more space-efficient, because the item can implement (very)
57968 + efficient compression of key_atom's bodies using internal knowledge
57969 + about their semantics, and it can often avoid having a key for each
57970 + key_atom. Each type of item has specific operations implemented by its
57971 + item handler (see balance.c).
57972 +
57973 + Rationale: the rest of the code (specifically balancing routines)
57974 + accesses leaf level nodes through this interface. This way we can
57975 + implement various block layouts and even combine various layouts
57976 + within the same tree. Balancing/allocating algorithms should not
57977 + care about peculiarities of splitting/merging specific item types,
57978 + but rather should leave that to the item's item handler.
57979 +
57980 + Items, including those that provide the abstraction of flows, have
57981 + the property that if you move them in part or in whole to another
57982 + node, the balancing code invokes their is_left_mergeable()
57983 + item_operation to determine if they are mergeable with their new
57984 + neighbor in the node you have moved them to. For some items the
57985 +   is_left_mergeable() function always returns zero.
57986 +
57987 + When moving the bodies of items from one node to another:
57988 +
57989 + if a partial item is shifted to another node the balancing code invokes
57990 + an item handler method to handle the item splitting.
57991 +
57992 + if the balancing code needs to merge with an item in the node it
57993 + is shifting to, it will invoke an item handler method to handle
57994 + the item merging.
57995 +
57996 +   if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy(),
57997 +   adjusting the item headers after the move via the node handler.
57998 +*/
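
The interface described in the comment above is realized as a table of function
pointers (the node_plugins[] array defined below); balancing code only ever calls
through the table, never into a layout directly. A minimal, purely illustrative
sketch of that dispatch idiom with hypothetical toy types:

	#include <stdio.h>

	struct toy_node { int nr_items; };

	/* cut-down analogue of node_plugin: behaviour hidden behind pointers */
	struct toy_node_plugin {
		const char *label;
		int (*num_of_items)(const struct toy_node *node);
	};

	static int toy_num_of_items(const struct toy_node *node)
	{
		return node->nr_items;
	}

	static struct toy_node_plugin toy_plugins[] = {
		{ .label = "toy40", .num_of_items = toy_num_of_items },
	};

	int main(void)
	{
		struct toy_node n = { .nr_items = 7 };
		struct toy_node_plugin *nplug = &toy_plugins[0];

		/* callers go through the plugin, so layouts can vary per node */
		printf("%s: %d items\n", nplug->label, nplug->num_of_items(&n));
		return 0;
	}
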
57999 +
58000 +#include "../../forward.h"
58001 +#include "../../debug.h"
58002 +#include "../../key.h"
58003 +#include "../../coord.h"
58004 +#include "../plugin_header.h"
58005 +#include "../item/item.h"
58006 +#include "node.h"
58007 +#include "../plugin.h"
58008 +#include "../../znode.h"
58009 +#include "../../tree.h"
58010 +#include "../../super.h"
58011 +#include "../../reiser4.h"
58012 +
58013 +/**
58014 + * leftmost_key_in_node - get the smallest key in node
58015 + * @node:
58016 + * @key: store result here
58017 + *
58018 + * Stores the leftmost key of @node in @key.
58019 + */
58020 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
58021 +{
58022 + assert("nikita-1634", node != NULL);
58023 + assert("nikita-1635", key != NULL);
58024 +
58025 + if (!node_is_empty(node)) {
58026 + coord_t first_item;
58027 +
58028 + coord_init_first_unit(&first_item, (znode *) node);
58029 + item_key_by_coord(&first_item, key);
58030 + } else
58031 + *key = *reiser4_max_key();
58032 + return key;
58033 +}
58034 +
58035 +node_plugin node_plugins[LAST_NODE_ID] = {
58036 + [NODE40_ID] = {
58037 + .h = {
58038 + .type_id = REISER4_NODE_PLUGIN_TYPE,
58039 + .id = NODE40_ID,
58040 + .pops = NULL,
58041 + .label = "unified",
58042 + .desc = "unified node layout",
58043 + .linkage = {NULL, NULL}
58044 + },
58045 + .item_overhead = item_overhead_node40,
58046 + .free_space = free_space_node40,
58047 + .lookup = lookup_node40,
58048 + .num_of_items = num_of_items_node40,
58049 + .item_by_coord = item_by_coord_node40,
58050 + .length_by_coord = length_by_coord_node40,
58051 + .plugin_by_coord = plugin_by_coord_node40,
58052 + .key_at = key_at_node40,
58053 + .estimate = estimate_node40,
58054 + .check = check_node40,
58055 + .parse = parse_node40,
58056 + .init = init_node40,
58057 +#ifdef GUESS_EXISTS
58058 + .guess = guess_node40,
58059 +#endif
58060 + .change_item_size = change_item_size_node40,
58061 + .create_item = create_item_node40,
58062 + .update_item_key = update_item_key_node40,
58063 + .cut_and_kill = kill_node40,
58064 + .cut = cut_node40,
58065 + .shift = shift_node40,
58066 + .shrink_item = shrink_item_node40,
58067 + .fast_insert = fast_insert_node40,
58068 + .fast_paste = fast_paste_node40,
58069 + .fast_cut = fast_cut_node40,
58070 + .max_item_size = max_item_size_node40,
58071 + .prepare_removal = prepare_removal_node40,
58072 + .set_item_plugin = set_item_plugin_node40
58073 + }
58074 +};
58075 +
58076 +/*
58077 + Local variables:
58078 + c-indentation-style: "K&R"
58079 + mode-name: "LC"
58080 + c-basic-offset: 8
58081 + tab-width: 8
58082 + fill-column: 120
58083 + scroll-step: 1
58084 + End:
58085 +*/
58086 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node.h linux-2.6.22/fs/reiser4/plugin/node/node.h
58087 --- linux-2.6.22.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
58088 +++ linux-2.6.22/fs/reiser4/plugin/node/node.h 2007-07-29 00:25:34.988725466 +0400
58089 @@ -0,0 +1,272 @@
58090 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58091 +
58092 +/* We need a definition of the default node layout here. */
58093 +
58094 +/* Generally speaking, it is best to have free space in the middle of the
58095 + node so that two sets of things can grow towards it, and to have the
58096 + item bodies on the left so that the last one of them grows into free
58097 + space. We optimize for the case where we append new items to the end
58098 +   of the node, or grow the last item, because it hurts nothing to so
58099 +   optimize, it is a common special case to do massive insertions in
58100 +   increasing key order, and it is one of the cases where a real user is
58101 +   most likely to notice the delay.
58102 +
58103 + formatted leaf default layout: (leaf1)
58104 +
58105 + |node header:item bodies:free space:key + pluginid + item offset|
58106 +
58107 + We grow towards the middle, optimizing layout for the case where we
58108 + append new items to the end of the node. The node header is fixed
58109 + length. Keys, and item offsets plus pluginids for the items
58110 + corresponding to them are in increasing key order, and are fixed
58111 + length. Item offsets are relative to start of node (16 bits creating
58112 + a node size limit of 64k, 12 bits might be a better choice....). Item
58113 + bodies are in decreasing key order. Item bodies have a variable size.
58114 + There is a one to one to one mapping of keys to item offsets to item
58115 + bodies. Item offsets consist of pointers to the zeroth byte of the
58116 + item body. Item length equals the start of the next item minus the
58117 + start of this item, except the zeroth item whose length equals the end
58118 + of the node minus the start of that item (plus a byte). In other
58119 + words, the item length is not recorded anywhere, and it does not need
58120 + to be since it is computable.
58121 +
58122 + Leaf variable length items and keys layout : (lvar)
58123 +
58124 + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
58125 +
58126 + We grow towards the middle, optimizing layout for the case where we
58127 + append new items to the end of the node. The node header is fixed
58128 + length. Keys and item offsets for the items corresponding to them are
58129 + in increasing key order, and keys are variable length. Item offsets
58130 + are relative to start of node (16 bits). Item bodies are in
58131 + decreasing key order. Item bodies have a variable size. There is a
58132 + one to one to one mapping of keys to item offsets to item bodies.
58133 + Item offsets consist of pointers to the zeroth byte of the item body.
58134 + Item length equals the start of the next item's key minus the start of
58135 + this item, except the zeroth item whose length equals the end of the
58136 + node minus the start of that item (plus a byte).
58137 +
58138 + leaf compressed keys layout: (lcomp)
58139 +
58140 + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
58141 +
58142 + We grow towards the middle, optimizing layout for the case where we
58143 + append new items to the end of the node. The node header is fixed
58144 + length. Keys and item offsets for the items corresponding to them are
58145 + in increasing key order, and keys are variable length. The "key
58146 + inherit" field indicates how much of the key prefix is identical to
58147 + the previous key (stem compression as described in "Managing
58148 + Gigabytes" is used). key_inherit is a one byte integer. The
58149 + intra-node searches performed through this layout are linear searches,
58150 + and this is theorized to not hurt performance much due to the high
58151 + cost of processor stalls on modern CPUs, and the small number of keys
58152 + in a single node. Item offsets are relative to start of node (16
58153 + bits). Item bodies are in decreasing key order. Item bodies have a
58154 + variable size. There is a one to one to one mapping of keys to item
58155 + offsets to item bodies. Item offsets consist of pointers to the
58156 + zeroth byte of the item body. Item length equals the start of the
58157 + next item minus the start of this item, except the zeroth item whose
58158 + length equals the end of the node minus the start of that item (plus a
58159 +   byte). In other words, item length and key length are not recorded
58160 +   anywhere, and they do not need to be since they are computable.
58161 +
58162 + internal node default layout: (idef1)
58163 +
58164 + just like ldef1 except that item bodies are either blocknrs of
58165 + children or extents, and moving them may require updating parent
58166 + pointers in the nodes that they point to.
58167 +*/
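
The "length is computable" property used throughout the layouts above can be made
concrete: with offsets sorted and bodies packed, an item's length is the distance
to the next item's offset, with the last item bounded by the start of free space.
A toy sketch of a simplified ascending-offset variant of the idea (not the actual
on-disk node40 format):

	#include <stdio.h>

	/* toy node: bodies packed from offset 0, offsets[] ascending,
	   free_space_start marking the end of the last body */
	struct toy_node {
		int offsets[4];
		int nr_items;
		int free_space_start;
	};

	/* item length is stored nowhere; it is the gap to the next offset,
	   or to the start of free space for the last item */
	static int item_length(const struct toy_node *n, int i)
	{
		int next = (i + 1 < n->nr_items) ? n->offsets[i + 1]
						 : n->free_space_start;
		return next - n->offsets[i];
	}

	int main(void)
	{
		struct toy_node n = {
			.offsets = { 0, 10, 14 },
			.nr_items = 3,
			.free_space_start = 30,
		};
		int i;

		for (i = 0; i < n.nr_items; i++)
			printf("item %d: length %d\n", i, item_length(&n, i));
		return 0;
	}
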
58168 +
58169 +/* There is an inherent 3-way tradeoff between optimization, the ability to
58170 +   exchange disks between different architectures, and code
58171 +   complexity. This layout is optimal, simple, and not exchangeable.
58172 +   Someone else can do the code for exchanging disks and make it
58173 +   complex. It would not be that hard. Using a node size other than PAGE_SIZE
58174 +   might be suboptimal.
58175 +*/
58176 +
58177 +#if !defined( __REISER4_NODE_H__ )
58178 +#define __REISER4_NODE_H__
58179 +
58180 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
58181 +
58182 +#include "../../dformat.h"
58183 +#include "../plugin_header.h"
58184 +
58185 +#include <linux/types.h>
58186 +
58187 +typedef enum {
58188 + NS_FOUND = 0,
58189 + NS_NOT_FOUND = -ENOENT
58190 +} node_search_result;
58191 +
58192 +/* Maximal possible space overhead for creation of new item in a node */
58193 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
58194 +
58195 +typedef enum {
58196 + REISER4_NODE_DKEYS = (1 << 0),
58197 + REISER4_NODE_TREE_STABLE = (1 << 1)
58198 +} reiser4_node_check_flag;
58199 +
58200 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
58201 +struct cut_list {
58202 + coord_t *from;
58203 + coord_t *to;
58204 + const reiser4_key *from_key;
58205 + const reiser4_key *to_key;
58206 + reiser4_key *smallest_removed;
58207 + carry_plugin_info *info;
58208 + __u32 flags;
58209 + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
58210 + lock_handle *left;
58211 + lock_handle *right;
58212 +};
58213 +
58214 +struct carry_cut_data;
58215 +struct carry_kill_data;
58216 +
58217 +/* The responsibility of the node plugin is to store and give access
58218 + to the sequence of items within the node. */
58219 +typedef struct node_plugin {
58220 + /* generic plugin fields */
58221 + plugin_header h;
58222 +
58223 + /* calculates the amount of space that will be required to store an
58224 + item which is in addition to the space consumed by the item body.
58225 +	   (the space consumed by the item body can be obtained by calling
58226 +	   item->estimate) */
58227 + size_t(*item_overhead) (const znode * node, flow_t * f);
58228 +
58229 + /* returns free space by looking into node (i.e., without using
58230 + znode->free_space). */
58231 + size_t(*free_space) (znode * node);
58232 + /* search within the node for the one item which might
58233 + contain the key, invoking item->search_within to search within
58234 + that item to see if it is in there */
58235 + node_search_result(*lookup) (znode * node, const reiser4_key * key,
58236 + lookup_bias bias, coord_t * coord);
58237 + /* number of items in node */
58238 + int (*num_of_items) (const znode * node);
58239 +
58240 + /* store information about item in @coord in @data */
58241 + /* break into several node ops, don't add any more uses of this before doing so */
58242 + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
58243 + char *(*item_by_coord) (const coord_t * coord);
58244 + int (*length_by_coord) (const coord_t * coord);
58245 + item_plugin *(*plugin_by_coord) (const coord_t * coord);
58246 +
58247 + /* store item key in @key */
58248 + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
58249 +	/* conservatively estimate what size of unit can fit
58250 +	   into the node. This estimation should be performed without
58251 + actually looking into the node's content (free space is saved in
58252 + znode). */
58253 + size_t(*estimate) (znode * node);
58254 +
58255 + /* performs every consistency check the node plugin author could
58256 + imagine. Optional. */
58257 + int (*check) (const znode * node, __u32 flags, const char **error);
58258 +
58259 + /* Called when node is read into memory and node plugin is
58260 + already detected. This should read some data into znode (like free
58261 + space counter) and, optionally, check data consistency.
58262 + */
58263 + int (*parse) (znode * node);
58264 + /* This method is called on a new node to initialise plugin specific
58265 + data (header, etc.) */
58266 + int (*init) (znode * node);
58267 + /* Check whether @node content conforms to this plugin format.
58268 + Probably only useful after support for old V3.x formats is added.
58269 + Uncomment after 4.0 only.
58270 + */
58271 + /* int ( *guess )( const znode *node ); */
58272 +#if REISER4_DEBUG
58273 + void (*print) (const char *prefix, const znode * node, __u32 flags);
58274 +#endif
58275 + /* change size of @item by @by bytes. @item->node has enough free
58276 + space. When @by > 0 - free space is appended to end of item. When
58277 +	   @by < 0 - item is truncated - it is assumed that the last @by bytes of
58278 +	   the item are already freed */
58279 + void (*change_item_size) (coord_t * item, int by);
58280 +
58281 + /* create new item @length bytes long in coord @target */
58282 + int (*create_item) (coord_t * target, const reiser4_key * key,
58283 + reiser4_item_data * data, carry_plugin_info * info);
58284 +
58285 + /* update key of item. */
58286 + void (*update_item_key) (coord_t * target, const reiser4_key * key,
58287 + carry_plugin_info * info);
58288 +
58289 + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
58290 + int (*cut) (struct carry_cut_data *, carry_plugin_info *);
58291 +
58292 + /*
58293 + * shrink item pointed to by @coord by @delta bytes.
58294 + */
58295 + int (*shrink_item) (coord_t * coord, int delta);
58296 +
58297 +	/* copy as much as possible, but not past @stop, from
58298 +	   @stop->node to @target. If (pend == append) then data from the beginning of
58299 +	   @stop->node are copied to the end of @target. If (pend == prepend) then
58300 +	   data from the end of @stop->node are copied to the beginning of
58301 +	   @target. Copied data are removed from @stop->node. Information
58302 +	   about what to do on the upper level is stored in @info */
58303 + int (*shift) (coord_t * stop, znode * target, shift_direction pend,
58304 + int delete_node, int including_insert_coord,
58305 + carry_plugin_info * info);
58306 +	/* return true if this node allows skipping carry() in some situations
58307 +	   (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
58308 +	   emulation doesn't.
58309 +
58310 +	   This speeds up insertions that don't require updates to the
58311 +	   parent, by bypassing initialisation of carry() structures. It's
58312 +	   believed that the majority of insertions will fit there.
58313 +
58314 + */
58315 + int (*fast_insert) (const coord_t * coord);
58316 + int (*fast_paste) (const coord_t * coord);
58317 + int (*fast_cut) (const coord_t * coord);
58318 + /* this limits max size of item which can be inserted into a node and
58319 + number of bytes item in a node may be appended with */
58320 + int (*max_item_size) (void);
58321 + int (*prepare_removal) (znode * empty, carry_plugin_info * info);
58322 +	/* change plugin id of items which are already in a node. Currently it is used in tail conversion for regular
58323 + * files */
58324 + int (*set_item_plugin) (coord_t * coord, item_id);
58325 +} node_plugin;
58326 +
58327 +typedef enum {
58328 + /* standard unified node layout used for both leaf and internal
58329 + nodes */
58330 + NODE40_ID,
58331 + LAST_NODE_ID
58332 +} reiser4_node_id;
58333 +
58334 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
58335 +#if REISER4_DEBUG
58336 +extern void print_node_content(const char *prefix, const znode * node,
58337 + __u32 flags);
58338 +#endif
58339 +
58340 +extern void indent_znode(const znode * node);
58341 +
58342 +typedef struct common_node_header {
58343 + /*
58344 + * identifier of node plugin. Must be located at the very beginning of
58345 + * a node.
58346 + */
58347 + __le16 plugin_id;
58348 +} common_node_header;
58349 +
58350 +/* __REISER4_NODE_H__ */
58351 +#endif
58352 +/*
58353 + * Local variables:
58354 + * c-indentation-style: "K&R"
58355 + * mode-name: "LC"
58356 + * c-basic-offset: 8
58357 + * tab-width: 8
58358 + * fill-column: 79
58359 + * scroll-step: 1
58360 + * End:
58361 + */
58362 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/object.c linux-2.6.22/fs/reiser4/plugin/object.c
58363 --- linux-2.6.22.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
58364 +++ linux-2.6.22/fs/reiser4/plugin/object.c 2007-07-29 00:25:34.992726502 +0400
58365 @@ -0,0 +1,516 @@
58366 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58367 + * reiser4/README */
58368 +
58369 +/*
58370 + * Examples of object plugins: file, directory, symlink, special file.
58371 + *
58372 + * Plugins associated with inode:
58373 + *
58374 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
58375 + * stat-data. How we store this plugin in in-core inode is not
58376 + * important. Currently pointers are used, another variant is to store offsets
58377 + * and do array lookup on each access.
58378 + *
58379 + * Now, each inode has one selected plugin: object plugin that
58380 + * determines what type of file this object is: directory, regular etc.
58381 + *
58382 + * This main plugin can use other plugins that are thus subordinated to
58383 + * it. Directory instance of object plugin uses hash; regular file
58384 + * instance uses tail policy plugin.
58385 + *
58386 + * Object plugin is either taken from id in stat-data or guessed from
58387 + * i_mode bits. Once it is established we ask it to install its
58388 + * subordinate plugins, by looking again in stat-data or inheriting them
58389 + * from parent.
58390 + *
58391 + * How new inode is initialized during ->read_inode():
58392 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
58393 + * i_generation, capabilities etc.
58394 + * 2 read plugin id from stat data or try to guess plugin id
58395 + * from inode->i_mode bits if plugin id is missing.
58396 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58397 + *
58398 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
58399 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
58400 + *
58401 + * 4 Call ->activate() method of the object's plugin. The plugin is either read
58402 + *   from stat-data or guessed from mode bits.
58403 + * 5 Call ->inherit() method of object plugin to inherit as-yet-uninitialized
58404 + * plugins from parent.
58405 + *
58406 + * A simple induction shows that after the last step all plugins of the inode
58407 + * are initialized.
58408 + *
58409 + * When creating new object:
58410 + * 1 obtain object plugin id (see next period)
58411 + * NIKITA-FIXME-HANS: period?
58412 + * 2 ->install() this plugin
58413 + * 3 ->inherit() the rest from the parent
58414 + *
58415 + * We need some examples of creating an object with default and non-default
58416 + * plugin ids. Nikita, please create them.
58417 + */
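
The numbered steps above compress to: select the object plugin (explicit id from
stat-data, else a guess from the mode bits), then inherit any still-unset plugin
slots from the parent. A toy sketch of that select-then-inherit sequence, with
hypothetical names and toy mode values rather than the real stat-data code:

	#include <stdio.h>

	enum toy_plug { PLUG_NONE, PLUG_REG, PLUG_DIR, PLUG_SYM };

	struct toy_inode {
		unsigned mode;			/* toy stand-in for i_mode type bits */
		int has_explicit_id;		/* plugin id present in stat-data? */
		enum toy_plug explicit_id;
		enum toy_plug pset[2];		/* [0] object plugin, [1] subordinate */
	};

	/* explicit plugin id from stat-data, or a guess from mode bits */
	static enum toy_plug pick_object_plugin(const struct toy_inode *ino)
	{
		if (ino->has_explicit_id)
			return ino->explicit_id;
		switch (ino->mode & 3) {	/* toy encoding of the type bits */
		case 1:  return PLUG_DIR;
		case 2:  return PLUG_SYM;
		default: return PLUG_REG;
		}
	}

	/* inherit still-unset plugins from the parent, as in the last step above */
	static void inherit(struct toy_inode *ino, const struct toy_inode *parent)
	{
		int i;

		for (i = 0; i < 2; i++)
			if (ino->pset[i] == PLUG_NONE)
				ino->pset[i] = parent->pset[i];
	}

	int main(void)
	{
		struct toy_inode parent = { .pset = { PLUG_DIR, PLUG_REG } };
		struct toy_inode child = { .mode = 0 };	/* no explicit id */

		child.pset[0] = pick_object_plugin(&child);
		inherit(&child, &parent);
		printf("object=%d subordinate=%d\n", child.pset[0], child.pset[1]);
		return 0;
	}
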
58418 +
58419 +#include "../inode.h"
58420 +
58421 +static int _bugop(void)
58422 +{
58423 + BUG_ON(1);
58424 + return 0;
58425 +}
58426 +
58427 +#define bugop ((void *)_bugop)
58428 +
58429 +static int _dummyop(void)
58430 +{
58431 + return 0;
58432 +}
58433 +
58434 +#define dummyop ((void *)_dummyop)
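
_bugop and _dummyop above are deliberately type-erased fillers: the (void *) cast
lets a single trap or no-op stub populate many unrelated function-pointer slots in
the ops tables that follow, trading type safety for brevity. A small standalone
rendering of the same trick; note that calling through the cast pointer is
formally undefined in ISO C and relies on the platform calling convention,
exactly as the patch does:

	#include <stdio.h>
	#include <stdlib.h>

	static int _trap(void)
	{
		fprintf(stderr, "BUG: this op must never be called\n");
		abort();
	}
	#define trap ((void *)_trap)	/* same GCC-tolerated cast as bugop */

	static int _nop(void)
	{
		return 0;
	}
	#define nop ((void *)_nop)

	struct toy_ops {
		int (*read)(int fd);
		int (*sync)(int fd);
	};

	/* a directory-like object: reading it is a bug, syncing is a no-op */
	static struct toy_ops dir_ops = {
		.read = trap,
		.sync = nop,
	};

	int main(void)
	{
		printf("sync: %d\n", dir_ops.sync(0));	/* harmless no-op */
		/* dir_ops.read(0) would abort, like bugop in the tables below */
		return 0;
	}
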
58435 +
58436 +static int change_file(struct inode *inode,
58437 + reiser4_plugin * plugin,
58438 + pset_member memb)
58439 +{
58440 + /* cannot change object plugin of already existing object */
58441 + if (memb == PSET_FILE)
58442 + return RETERR(-EINVAL);
58443 +
58444 + /* Change PSET_CREATE */
58445 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
58446 +}
58447 +
58448 +static reiser4_plugin_ops file_plugin_ops = {
58449 + .change = change_file
58450 +};
58451 +
58452 +/*
58453 + * Definitions of object plugins.
58454 + */
58455 +
58456 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
58457 + [UNIX_FILE_PLUGIN_ID] = {
58458 + .h = {
58459 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58460 + .id = UNIX_FILE_PLUGIN_ID,
58461 + .groups = (1 << REISER4_REGULAR_FILE),
58462 + .pops = &file_plugin_ops,
58463 + .label = "reg",
58464 + .desc = "regular file",
58465 + .linkage = {NULL, NULL},
58466 + },
58467 + .inode_ops = {
58468 + .permission = reiser4_permission_common,
58469 + .setattr = setattr_unix_file,
58470 + .getattr = reiser4_getattr_common
58471 + },
58472 + .file_ops = {
58473 + .llseek = generic_file_llseek,
58474 + .read = read_unix_file,
58475 + .write = write_unix_file,
58476 + .aio_read = generic_file_aio_read,
58477 + .ioctl = ioctl_unix_file,
58478 + .mmap = mmap_unix_file,
58479 + .open = open_unix_file,
58480 + .release = release_unix_file,
58481 + .fsync = sync_unix_file,
58482 + .sendfile = sendfile_unix_file
58483 + },
58484 + .as_ops = {
58485 + .writepage = reiser4_writepage,
58486 + .readpage = readpage_unix_file,
58487 + .sync_page = block_sync_page,
58488 + .writepages = writepages_unix_file,
58489 + .set_page_dirty = reiser4_set_page_dirty,
58490 + .readpages = readpages_unix_file,
58491 + .prepare_write = prepare_write_unix_file,
58492 + .commit_write = commit_write_unix_file,
58493 + .bmap = bmap_unix_file,
58494 + .invalidatepage = reiser4_invalidatepage,
58495 + .releasepage = reiser4_releasepage
58496 + },
58497 + .write_sd_by_inode = write_sd_by_inode_common,
58498 + .flow_by_inode = flow_by_inode_unix_file,
58499 + .key_by_inode = key_by_inode_and_offset_common,
58500 + .set_plug_in_inode = set_plug_in_inode_common,
58501 + .adjust_to_parent = adjust_to_parent_common,
58502 + .create_object = reiser4_create_object_common,
58503 + .delete_object = delete_object_unix_file,
58504 + .add_link = reiser4_add_link_common,
58505 + .rem_link = reiser4_rem_link_common,
58506 + .owns_item = owns_item_unix_file,
58507 + .can_add_link = can_add_link_common,
58508 + .detach = dummyop,
58509 + .bind = dummyop,
58510 + .safelink = safelink_common,
58511 + .estimate = {
58512 + .create = estimate_create_common,
58513 + .update = estimate_update_common,
58514 + .unlink = estimate_unlink_common
58515 + },
58516 + .init_inode_data = init_inode_data_unix_file,
58517 + .cut_tree_worker = cut_tree_worker_common,
58518 + .wire = {
58519 + .write = wire_write_common,
58520 + .read = wire_read_common,
58521 + .get = wire_get_common,
58522 + .size = wire_size_common,
58523 + .done = wire_done_common
58524 + }
58525 + },
58526 + [DIRECTORY_FILE_PLUGIN_ID] = {
58527 + .h = {
58528 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58529 + .id = DIRECTORY_FILE_PLUGIN_ID,
58530 + .groups = (1 << REISER4_DIRECTORY_FILE),
58531 + .pops = &file_plugin_ops,
58532 + .label = "dir",
58533 + .desc = "directory",
58534 + .linkage = {NULL, NULL}
58535 + },
58536 + .inode_ops = {.create = NULL},
58537 + .file_ops = {.owner = NULL},
58538 + .as_ops = {.writepage = NULL},
58539 +
58540 + .write_sd_by_inode = write_sd_by_inode_common,
58541 + .flow_by_inode = bugop,
58542 + .key_by_inode = bugop,
58543 + .set_plug_in_inode = set_plug_in_inode_common,
58544 + .adjust_to_parent = adjust_to_parent_common_dir,
58545 + .create_object = reiser4_create_object_common,
58546 + .delete_object = reiser4_delete_dir_common,
58547 + .add_link = reiser4_add_link_common,
58548 + .rem_link = rem_link_common_dir,
58549 + .owns_item = owns_item_common_dir,
58550 + .can_add_link = can_add_link_common,
58551 + .can_rem_link = can_rem_link_common_dir,
58552 + .detach = reiser4_detach_common_dir,
58553 + .bind = reiser4_bind_common_dir,
58554 + .safelink = safelink_common,
58555 + .estimate = {
58556 + .create = estimate_create_common_dir,
58557 + .update = estimate_update_common,
58558 + .unlink = estimate_unlink_common_dir
58559 + },
58560 + .wire = {
58561 + .write = wire_write_common,
58562 + .read = wire_read_common,
58563 + .get = wire_get_common,
58564 + .size = wire_size_common,
58565 + .done = wire_done_common
58566 + },
58567 + .init_inode_data = init_inode_ordering,
58568 + .cut_tree_worker = cut_tree_worker_common,
58569 + },
58570 + [SYMLINK_FILE_PLUGIN_ID] = {
58571 + .h = {
58572 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58573 + .id = SYMLINK_FILE_PLUGIN_ID,
58574 + .groups = (1 << REISER4_SYMLINK_FILE),
58575 + .pops = &file_plugin_ops,
58576 + .label = "symlink",
58577 + .desc = "symbolic link",
58578 + .linkage = {NULL,NULL}
58579 + },
58580 + .inode_ops = {
58581 + .readlink = generic_readlink,
58582 + .follow_link = reiser4_follow_link_common,
58583 + .permission = reiser4_permission_common,
58584 + .setattr = reiser4_setattr_common,
58585 + .getattr = reiser4_getattr_common
58586 + },
58587 +		/* inode->i_fop of a symlink is initialized to NULL in setup_inode_ops */
58588 + .file_ops = {.owner = NULL},
58589 + .as_ops = {.writepage = NULL},
58590 +
58591 + .write_sd_by_inode = write_sd_by_inode_common,
58592 + .set_plug_in_inode = set_plug_in_inode_common,
58593 + .adjust_to_parent = adjust_to_parent_common,
58594 + .create_object = reiser4_create_symlink,
58595 + .delete_object = reiser4_delete_object_common,
58596 + .add_link = reiser4_add_link_common,
58597 + .rem_link = reiser4_rem_link_common,
58598 + .can_add_link = can_add_link_common,
58599 + .detach = dummyop,
58600 + .bind = dummyop,
58601 + .safelink = safelink_common,
58602 + .estimate = {
58603 + .create = estimate_create_common,
58604 + .update = estimate_update_common,
58605 + .unlink = estimate_unlink_common
58606 + },
58607 + .init_inode_data = init_inode_ordering,
58608 + .cut_tree_worker = cut_tree_worker_common,
58609 + .destroy_inode = destroy_inode_symlink,
58610 + .wire = {
58611 + .write = wire_write_common,
58612 + .read = wire_read_common,
58613 + .get = wire_get_common,
58614 + .size = wire_size_common,
58615 + .done = wire_done_common
58616 + }
58617 + },
58618 + [SPECIAL_FILE_PLUGIN_ID] = {
58619 + .h = {
58620 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58621 + .id = SPECIAL_FILE_PLUGIN_ID,
58622 + .groups = (1 << REISER4_SPECIAL_FILE),
58623 + .pops = &file_plugin_ops,
58624 + .label = "special",
58625 + .desc =
58626 + "special: fifo, device or socket",
58627 + .linkage = {NULL, NULL}
58628 + },
58629 + .inode_ops = {
58630 + .permission = reiser4_permission_common,
58631 + .setattr = reiser4_setattr_common,
58632 + .getattr = reiser4_getattr_common
58633 + },
58634 + /* file_ops of special files (sockets, block, char, fifo) are
58635 + initialized by init_special_inode. */
58636 + .file_ops = {.owner = NULL},
58637 + .as_ops = {.writepage = NULL},
58638 +
58639 + .write_sd_by_inode = write_sd_by_inode_common,
58640 + .set_plug_in_inode = set_plug_in_inode_common,
58641 + .adjust_to_parent = adjust_to_parent_common,
58642 + .create_object = reiser4_create_object_common,
58643 + .delete_object = reiser4_delete_object_common,
58644 + .add_link = reiser4_add_link_common,
58645 + .rem_link = reiser4_rem_link_common,
58646 + .owns_item = owns_item_common,
58647 + .can_add_link = can_add_link_common,
58648 + .detach = dummyop,
58649 + .bind = dummyop,
58650 + .safelink = safelink_common,
58651 + .estimate = {
58652 + .create = estimate_create_common,
58653 + .update = estimate_update_common,
58654 + .unlink = estimate_unlink_common
58655 + },
58656 + .init_inode_data = init_inode_ordering,
58657 + .cut_tree_worker = cut_tree_worker_common,
58658 + .wire = {
58659 + .write = wire_write_common,
58660 + .read = wire_read_common,
58661 + .get = wire_get_common,
58662 + .size = wire_size_common,
58663 + .done = wire_done_common
58664 + }
58665 + },
58666 + [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
58667 + .h = {
58668 + .type_id = REISER4_FILE_PLUGIN_TYPE,
58669 + .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
58670 + .groups = (1 << REISER4_REGULAR_FILE),
58671 + .pops = &file_plugin_ops,
58672 + .label = "cryptcompress",
58673 + .desc = "cryptcompress file",
58674 + .linkage = {NULL, NULL}
58675 + },
58676 + .inode_ops = {
58677 + .permission = reiser4_permission_common,
58678 + .setattr = prot_setattr_cryptcompress,
58679 + .getattr = reiser4_getattr_common
58680 + },
58681 + .file_ops = {
58682 + .llseek = generic_file_llseek,
58683 + .read = prot_read_cryptcompress,
58684 + .write = prot_write_cryptcompress,
58685 + .aio_read = generic_file_aio_read,
58686 + .mmap = prot_mmap_cryptcompress,
58687 + .release = prot_release_cryptcompress,
58688 + .fsync = reiser4_sync_common,
58689 + .sendfile = prot_sendfile_cryptcompress
58690 + },
58691 + .as_ops = {
58692 + .writepage = reiser4_writepage,
58693 + .readpage = readpage_cryptcompress,
58694 + .sync_page = block_sync_page,
58695 + .writepages = writepages_cryptcompress,
58696 + .set_page_dirty = reiser4_set_page_dirty,
58697 + .readpages = readpages_cryptcompress,
58698 + .prepare_write = prepare_write_common,
58699 + .invalidatepage = reiser4_invalidatepage,
58700 + .releasepage = reiser4_releasepage
58701 + },
58702 + .write_sd_by_inode = write_sd_by_inode_common,
58703 + .flow_by_inode = flow_by_inode_cryptcompress,
58704 + .key_by_inode = key_by_inode_cryptcompress,
58705 + .set_plug_in_inode = set_plug_in_inode_common,
58706 + .adjust_to_parent = adjust_to_parent_cryptcompress,
58707 + .create_object = create_cryptcompress,
58708 + .open_object = open_object_cryptcompress,
58709 + .delete_object = delete_object_cryptcompress,
58710 + .add_link = reiser4_add_link_common,
58711 + .rem_link = reiser4_rem_link_common,
58712 + .owns_item = owns_item_common,
58713 + .can_add_link = can_add_link_common,
58714 + .detach = dummyop,
58715 + .bind = dummyop,
58716 + .safelink = safelink_common,
58717 + .estimate = {
58718 + .create = estimate_create_common,
58719 + .update = estimate_update_common,
58720 + .unlink = estimate_unlink_common
58721 + },
58722 + .init_inode_data = init_inode_data_cryptcompress,
58723 + .cut_tree_worker = cut_tree_worker_cryptcompress,
58724 + .destroy_inode = destroy_inode_cryptcompress,
58725 + .wire = {
58726 + .write = wire_write_common,
58727 + .read = wire_read_common,
58728 + .get = wire_get_common,
58729 + .size = wire_size_common,
58730 + .done = wire_done_common
58731 + }
58732 + }
58733 +};
58734 +
58735 +static int change_dir(struct inode *inode,
58736 + reiser4_plugin * plugin,
58737 + pset_member memb)
58738 +{
58739 + /* cannot change dir plugin of already existing object */
58740 + return RETERR(-EINVAL);
58741 +}
58742 +
58743 +static reiser4_plugin_ops dir_plugin_ops = {
58744 + .change = change_dir
58745 +};
58746 +
58747 +/*
58748 + * definition of directory plugins
58749 + */
58750 +
58751 +dir_plugin dir_plugins[LAST_DIR_ID] = {
58752 + /* standard hashed directory plugin */
58753 + [HASHED_DIR_PLUGIN_ID] = {
58754 + .h = {
58755 + .type_id = REISER4_DIR_PLUGIN_TYPE,
58756 + .id = HASHED_DIR_PLUGIN_ID,
58757 + .pops = &dir_plugin_ops,
58758 + .label = "dir",
58759 + .desc = "hashed directory",
58760 + .linkage = {NULL, NULL}
58761 + },
58762 + .inode_ops = {
58763 + .create = reiser4_create_common,
58764 + .lookup = reiser4_lookup_common,
58765 + .link = reiser4_link_common,
58766 + .unlink = reiser4_unlink_common,
58767 + .symlink = reiser4_symlink_common,
58768 + .mkdir = reiser4_mkdir_common,
58769 + .rmdir = reiser4_unlink_common,
58770 + .mknod = reiser4_mknod_common,
58771 + .rename = reiser4_rename_common,
58772 + .permission = reiser4_permission_common,
58773 + .setattr = reiser4_setattr_common,
58774 + .getattr = reiser4_getattr_common
58775 + },
58776 + .file_ops = {
58777 + .llseek = reiser4_llseek_dir_common,
58778 + .read = generic_read_dir,
58779 + .readdir = reiser4_readdir_common,
58780 + .release = reiser4_release_dir_common,
58781 + .fsync = reiser4_sync_common
58782 + },
58783 + .as_ops = {
58784 + .writepage = bugop,
58785 + .sync_page = bugop,
58786 + .writepages = dummyop,
58787 + .set_page_dirty = bugop,
58788 + .readpages = bugop,
58789 + .prepare_write = bugop,
58790 + .commit_write = bugop,
58791 + .bmap = bugop,
58792 + .invalidatepage = bugop,
58793 + .releasepage = bugop
58794 + },
58795 + .get_parent = get_parent_common,
58796 + .is_name_acceptable = is_name_acceptable_common,
58797 + .build_entry_key = build_entry_key_hashed,
58798 + .build_readdir_key = build_readdir_key_common,
58799 + .add_entry = reiser4_add_entry_common,
58800 + .rem_entry = reiser4_rem_entry_common,
58801 + .init = reiser4_dir_init_common,
58802 + .done = reiser4_dir_done_common,
58803 + .attach = reiser4_attach_common,
58804 + .detach = reiser4_detach_common,
58805 + .estimate = {
58806 + .add_entry = estimate_add_entry_common,
58807 + .rem_entry = estimate_rem_entry_common,
58808 + .unlink = dir_estimate_unlink_common
58809 + }
58810 + },
58811 + /* hashed directory for which seekdir/telldir are guaranteed to
58812 + * work. Brain-damage. */
58813 + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
58814 + .h = {
58815 + .type_id = REISER4_DIR_PLUGIN_TYPE,
58816 + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
58817 + .pops = &dir_plugin_ops,
58818 + .label = "dir32",
58819 + .desc = "directory hashed with 31 bit hash",
58820 + .linkage = {NULL, NULL}
58821 + },
58822 + .inode_ops = {
58823 + .create = reiser4_create_common,
58824 + .lookup = reiser4_lookup_common,
58825 + .link = reiser4_link_common,
58826 + .unlink = reiser4_unlink_common,
58827 + .symlink = reiser4_symlink_common,
58828 + .mkdir = reiser4_mkdir_common,
58829 + .rmdir = reiser4_unlink_common,
58830 + .mknod = reiser4_mknod_common,
58831 + .rename = reiser4_rename_common,
58832 + .permission = reiser4_permission_common,
58833 + .setattr = reiser4_setattr_common,
58834 + .getattr = reiser4_getattr_common
58835 + },
58836 + .file_ops = {
58837 + .llseek = reiser4_llseek_dir_common,
58838 + .read = generic_read_dir,
58839 + .readdir = reiser4_readdir_common,
58840 + .release = reiser4_release_dir_common,
58841 + .fsync = reiser4_sync_common
58842 + },
58843 + .as_ops = {
58844 + .writepage = bugop,
58845 + .sync_page = bugop,
58846 + .writepages = dummyop,
58847 + .set_page_dirty = bugop,
58848 + .readpages = bugop,
58849 + .prepare_write = bugop,
58850 + .commit_write = bugop,
58851 + .bmap = bugop,
58852 + .invalidatepage = bugop,
58853 + .releasepage = bugop
58854 + },
58855 + .get_parent = get_parent_common,
58856 + .is_name_acceptable = is_name_acceptable_common,
58857 + .build_entry_key = build_entry_key_seekable,
58858 + .build_readdir_key = build_readdir_key_common,
58859 + .add_entry = reiser4_add_entry_common,
58860 + .rem_entry = reiser4_rem_entry_common,
58861 + .init = reiser4_dir_init_common,
58862 + .done = reiser4_dir_done_common,
58863 + .attach = reiser4_attach_common,
58864 + .detach = reiser4_detach_common,
58865 + .estimate = {
58866 + .add_entry = estimate_add_entry_common,
58867 + .rem_entry = estimate_rem_entry_common,
58868 + .unlink = dir_estimate_unlink_common
58869 + }
58870 + }
58871 +};
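+
+/*
+ * The bugop/dummyop placeholders in the as_ops tables above mark two
+ * different situations: directories have no data pages, so an address
+ * space operation that reaches one indicates a kernel bug, while a
+ * dummyop slot is a legitimate no-op. A minimal sketch of what such
+ * helpers could look like (hypothetical names; the actual bugop/dummyop
+ * definitions live elsewhere in the reiser4 sources):
+ */
+static int sketch_bugop(void)
+{
+	BUG();		/* must never be reached for directory pages */
+	return 0;
+}
+
+static int sketch_dummyop(void)
+{
+	return 0;	/* intentionally does nothing */
+}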
58872 +
58873 +/* Make Linus happy.
58874 + Local variables:
58875 + c-indentation-style: "K&R"
58876 + mode-name: "LC"
58877 + c-basic-offset: 8
58878 + tab-width: 8
58879 + fill-column: 120
58880 + End:
58881 +*/
58882 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/object.h linux-2.6.22/fs/reiser4/plugin/object.h
58883 --- linux-2.6.22.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
58884 +++ linux-2.6.22/fs/reiser4/plugin/object.h 2007-07-29 00:25:34.992726502 +0400
58885 @@ -0,0 +1,121 @@
58886 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
58887 + * reiser4/README */
58888 +
58889 +/* Declaration of object plugin functions. */
58890 +
58891 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
58892 +#define __FS_REISER4_PLUGIN_OBJECT_H__
58893 +
58894 +#include "../type_safe_hash.h"
58895 +
58896 +/* common implementations of inode operations */
58897 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
58898 + int mode, struct nameidata *);
58899 +struct dentry * reiser4_lookup_common(struct inode *parent,
58900 + struct dentry *dentry,
58901 + struct nameidata *nameidata);
58902 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
58903 + struct dentry *newname);
58904 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
58905 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
58906 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
58907 + const char *linkname);
58908 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
58909 + int mode, dev_t rdev);
58910 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
58911 + struct inode *new_dir, struct dentry *new_name);
58912 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
58913 +int reiser4_permission_common(struct inode *, int mask,
58914 + struct nameidata *nameidata);
58915 +int reiser4_setattr_common(struct dentry *, struct iattr *);
58916 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
58917 + struct kstat *);
58918 +
58919 +/* common implementations of file operations */
58920 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
58921 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
58922 +int reiser4_release_dir_common(struct inode *, struct file *);
58923 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
58924 +
58925 +/* common implementations of address space operations */
58926 +int prepare_write_common(struct file *, struct page *, unsigned from,
58927 + unsigned to);
58928 +
58929 +/* file plugin operations: common implementations */
58930 +int write_sd_by_inode_common(struct inode *);
58931 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
58932 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
58933 + reiser4_object_create_data *);
58934 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
58935 + struct inode *root);
58936 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
58937 + struct inode *root);
58938 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
58939 + struct inode *root);
58940 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
58941 + reiser4_object_create_data *);
58942 +int reiser4_delete_object_common(struct inode *);
58943 +int reiser4_delete_dir_common(struct inode *);
58944 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
58945 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
58946 +int rem_link_common_dir(struct inode *object, struct inode *parent);
58947 +int owns_item_common(const struct inode *, const coord_t *);
58948 +int owns_item_common_dir(const struct inode *, const coord_t *);
58949 +int can_add_link_common(const struct inode *);
58950 +int can_rem_link_common_dir(const struct inode *);
58951 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
58952 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
58953 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
58954 +reiser4_block_nr estimate_create_common(const struct inode *);
58955 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
58956 +reiser4_block_nr estimate_update_common(const struct inode *);
58957 +reiser4_block_nr estimate_unlink_common(const struct inode *,
58958 + const struct inode *);
58959 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
58960 + const struct inode *);
58961 +char *wire_write_common(struct inode *, char *start);
58962 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
58963 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
58964 +int wire_size_common(struct inode *);
58965 +void wire_done_common(reiser4_object_on_wire *);
58966 +
58967 +/* dir plugin operations: common implementations */
58968 +struct dentry *get_parent_common(struct inode *child);
58969 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
58970 +void build_entry_key_common(const struct inode *,
58971 + const struct qstr *qname, reiser4_key *);
58972 +int build_readdir_key_common(struct file *dir, reiser4_key *);
58973 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
58974 + reiser4_object_create_data *, reiser4_dir_entry_desc *);
58975 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
58976 + reiser4_dir_entry_desc *);
58977 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
58978 + reiser4_object_create_data *);
58979 +int reiser4_dir_done_common(struct inode *);
58980 +int reiser4_attach_common(struct inode *child, struct inode *parent);
58981 +int reiser4_detach_common(struct inode *object, struct inode *parent);
58982 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
58983 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
58984 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
58985 + const struct inode *);
58986 +
58987 +/* these are essential parts of common implementations; they exist to
58988 + make customized implementations easier */
58989 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
58990 +
58991 +/* merely useful functions */
58992 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
58993 + const reiser4_key *, int silent);
58994 +
58995 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
58996 +#endif
58997 +
58998 +/* Make Linus happy.
58999 + Local variables:
59000 + c-indentation-style: "K&R"
59001 + mode-name: "LC"
59002 + c-basic-offset: 8
59003 + tab-width: 8
59004 + fill-column: 120
59005 + End:
59006 +*/
59007 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin.c linux-2.6.22/fs/reiser4/plugin/plugin.c
59008 --- linux-2.6.22.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
59009 +++ linux-2.6.22/fs/reiser4/plugin/plugin.c 2007-07-29 00:25:34.992726502 +0400
59010 @@ -0,0 +1,559 @@
59011 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59012 + * reiser4/README */
59013 +
59014 +/* Basic plugin infrastructure, lookup etc. */
59015 +
59016 +/* PLUGINS:
59017 +
59018 + Plugins are internal Reiser4 "modules" or "objects" used to increase
59019 + extensibility and allow external users to easily adapt reiser4 to
59020 + their needs.
59021 +
59022 + Plugins are classified into several disjoint "types". Plugins
59023 + belonging to a particular plugin type are termed "instances" of
59024 + this type. Existing types are listed by enum reiser4_plugin_type
59025 + (see plugin/plugin_header.h).
59026 +
59027 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59028 +
59029 + Object (file) plugin determines how given file-system object serves
59030 + standard VFS requests for read, write, seek, mmap etc. Instances of
59031 + file plugins are: regular file, directory, symlink. Another example
59032 + of a file plugin is the audit plugin, which optionally records
59033 + accesses to the underlying object and forwards requests to it.
59034 +
59035 + Hash plugins compute hashes used by reiser4 to store and locate
59036 + files within directories. Instances of hash plugin type are: r5,
59037 + tea, rupasov.
59038 +
59039 + Tail plugins (or, more precisely, tail policy plugins) determine
59040 + when last part of the file should be stored in a formatted item.
59041 +
59042 + Scope and lookup:
59043 +
59044 + Each plugin type and plugin has a label such that the pair
59045 + ( type_label, plugin_label ) is unique. This pair is a globally
59046 + persistent and user-visible plugin identifier. Internally the kernel
59047 + maintains plugins and plugin types in arrays, using an index into
59048 + those arrays as plugin and plugin type identifiers. The file-system,
59049 + in turn, also maintains a persistent "dictionary" mapping plugin
59050 + labels to the numerical identifiers stored in file-system objects.
59051 + That is, we store the offset into the plugin array for that plugin
59052 + type as the plugin id in the stat data of the filesystem object.
59053 +
59054 + Internal kernel plugin type identifier (index in plugins[] array) is
59055 + of type reiser4_plugin_type. Set of available plugin types is
59056 + currently static, but dynamic loading doesn't seem to pose
59057 + insurmountable problems.
59058 +
59059 + Within each type plugins are addressed by the identifiers of type
59060 + reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
59061 + Such identifiers are only required to be unique within one type,
59062 + not globally.
59063 +
59064 + Thus, plugin in memory is uniquely identified by the pair (type_id,
59065 + id).
59066 +
59067 + Usage:
59068 +
59069 + There exists only one instance of each plugin, but this single
59070 + instance can be associated with many entities (file-system objects,
59071 + items, nodes, transactions, file-descriptors etc.). The entity to
59072 + which a plugin of a given type is attached is termed (due to the
59073 + lack of imagination) the "subject" of this plugin type and, by
59074 + abuse of terminology, the subject of the particular instance now
59075 + attached to it. For example, an inode is a subject of the object
59076 + plugin type. An inode representing a directory is a subject of the
59077 + directory plugin, the hash plugin type and some particular instance
59078 + of the hash plugin type. An inode representing a regular file is a
59079 + subject of the "regular file" plugin, the tail-policy plugin type etc.
59080 +
59081 + With each subject the plugin possibly stores some state. For example,
59082 + the state of a directory plugin (an instance of the object plugin type)
59083 + is a pointer to a hash plugin (if directories always use hashing, that is).
59084 +
59085 + Interface:
59086 +
59087 + In addition to a scalar identifier, each plugin type and plugin
59088 + proper has a "label" (a short string) and a "description" (a longer
59089 + descriptive string). Labels and descriptions of plugin types are
59090 + hard-coded into the plugins[] array, declared and defined in
59091 + plugin.c. The label and description of a plugin are stored in the
59092 + .label and .desc fields of reiser4_plugin_header respectively. It's
59093 + possible to locate a plugin by the pair of labels.
59094 +
59095 + Features (not implemented):
59096 +
59097 + . user-level plugin manipulations:
59098 + + reiser4("filename/..file_plugin<='audit'");
59099 + + write(open("filename/..file_plugin"), "audit", 8);
59100 +
59101 + . user level utilities lsplug and chplug to manipulate plugins.
59102 + Utilities are not of primary priority. Possibly they will not be
59103 + working in v4.0.
59104 +
59105 + NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
59106 + option, do you agree? I don't think that specifying it at mount time,
59107 + and then changing it with each mount, is a good model for usage.
59108 +
59109 + . mount option "plug" to set-up plugins of root-directory.
59110 + "plug=foo:bar" will set "bar" as default plugin of type "foo".
59111 +
59112 + Limitations:
59113 +
59114 + . each plugin type has to provide at least one builtin
59115 + plugin. This is a technical limitation and it can be lifted in the
59116 + future.
59117 +
59118 + TODO:
59119 +
59120 + New plugin types/plugins:
59121 + Things we should be able to separately choose to inherit:
59122 +
59123 + security plugins
59124 +
59125 + stat data
59126 +
59127 + file bodies
59128 +
59129 + file plugins
59130 +
59131 + dir plugins
59132 +
59133 + . perm:acl
59134 +
59135 + . audi---audit plugin intercepting and possibly logging all
59136 + accesses to an object. Requires putting stub functions in
59137 + file_operations instead of generic_file_*.
59138 +
59139 +NIKITA-FIXME-HANS: why make overflows a plugin?
59140 + . over---handle hash overflows
59141 +
59142 + . sqnt---handle different access patterns and instrument read-ahead
59143 +
59144 +NIKITA-FIXME-HANS: describe the line below in more detail.
59145 +
59146 + . hier---handle inheritance of plugins along file-system hierarchy
59147 +
59148 + Different kinds of inheritance: on creation vs. on access.
59149 + Compatible/incompatible plugins.
59150 + Inheritance for multi-linked files.
59151 + Layered plugins.
59152 + Notion of plugin context is abandoned.
59153 +
59154 +Each file is associated
59155 + with one plugin, and dependent plugins (hash, etc.) are stored as
59156 + main plugin state. Now, if we have plugins used for regular files
59157 + but not for directories, how would such plugins be inherited?
59158 + . always store them with directories also
59159 +
59160 +NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing
59161 +the line below which is also useful.
59162 +
59163 + . use inheritance hierarchy, independent of file-system namespace
59164 +*/
59165 +
59166 +#include "../debug.h"
59167 +#include "../dformat.h"
59168 +#include "plugin_header.h"
59169 +#include "item/static_stat.h"
59170 +#include "node/node.h"
59171 +#include "security/perm.h"
59172 +#include "space/space_allocator.h"
59173 +#include "disk_format/disk_format.h"
59174 +#include "plugin.h"
59175 +#include "../reiser4.h"
59176 +#include "../jnode.h"
59177 +#include "../inode.h"
59178 +
59179 +#include <linux/fs.h> /* for struct super_block */
59180 +
59181 +/*
59182 + * init_plugins - initialize plugin sub-system.
59183 + * Just call this once on reiser4 startup.
59184 + *
59185 + * Initializes the plugin sub-system as part of reiser4 module
59186 + * initialization. For each plugin of each type the init method is
59187 + * called and each plugin is put into the list of plugins of its type.
59188 + */
59189 +int init_plugins(void)
59190 +{
59191 + reiser4_plugin_type type_id;
59192 +
59193 + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59194 + struct reiser4_plugin_type_data *ptype;
59195 + int i;
59196 +
59197 + ptype = &plugins[type_id];
59198 + assert("nikita-3508", ptype->label != NULL);
59199 + assert("nikita-3509", ptype->type_id == type_id);
59200 +
59201 + INIT_LIST_HEAD(&ptype->plugins_list);
59202 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59203 + for (i = 0; i < ptype->builtin_num; ++i) {
59204 + reiser4_plugin *plugin;
59205 +
59206 + plugin = plugin_at(ptype, i);
59207 +
59208 + if (plugin->h.label == NULL)
59209 + /* uninitialized slot encountered */
59210 + continue;
59211 + assert("nikita-3445", plugin->h.type_id == type_id);
59212 + plugin->h.id = i;
59213 + if (plugin->h.pops != NULL &&
59214 + plugin->h.pops->init != NULL) {
59215 + int result;
59216 +
59217 + result = plugin->h.pops->init(plugin);
59218 + if (result != 0)
59219 + return result;
59220 + }
59221 + INIT_LIST_HEAD(&plugin->h.linkage);
59222 + list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59223 + }
59224 + }
59225 + return 0;
59226 +}
59227 +
59228 +/* true if plugin type id is valid */
59229 +int is_plugin_type_valid(reiser4_plugin_type type)
59230 +{
59231 + /* "type" is unsigned, so no comparison with 0 is
59232 + necessary */
59233 + return (type < REISER4_PLUGIN_TYPES);
59234 +}
59235 +
59236 +/* true if plugin id is valid */
59237 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
59238 +{
59239 + assert("nikita-1653", is_plugin_type_valid(type));
59240 + return id < plugins[type].builtin_num;
59241 +}
59242 +
59243 +/* return plugin by its @type and @id.
59244 +
59245 + Both arguments are checked for validity: this is supposed to be called
59246 + from user-level.
59247 +
59248 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59249 +user space, and passed to the filesystem by use of method files? Your
59250 +comment really confused me on the first reading....
59251 +
59252 +*/
59253 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
59254 + * unchecked */,
59255 + reiser4_plugin_id id /* plugin id,
59256 + * unchecked */)
59257 +{
59258 + if (is_plugin_type_valid(type)) {
59259 + if (is_plugin_id_valid(type, id))
59260 + return plugin_at(&plugins[type], id);
59261 + else
59262 + /* id out of bounds */
59263 + warning("nikita-2913",
59264 + "Invalid plugin id: [%i:%i]", type, id);
59265 + } else
59266 + /* type_id out of bounds */
59267 + warning("nikita-2914", "Invalid type_id: %i", type);
59268 + return NULL;
59269 +}
59270 +
59271 +/**
59272 + * save_plugin_id - store plugin id in disk format
59273 + * @plugin: plugin to convert
59274 + * @area: where to store result
59275 + *
59276 + * Puts id of @plugin in little endian format to address @area.
59277 + */
59278 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59279 + d16 *area /* where to store result */ )
59280 +{
59281 + assert("nikita-1261", plugin != NULL);
59282 + assert("nikita-1262", area != NULL);
59283 +
59284 + put_unaligned(cpu_to_le16(plugin->h.id), area);
59285 + return 0;
59286 +}
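+
+/*
+ * For illustration, the matching load-side of save_plugin_id() is
+ * sketched below (hypothetical helper; name and error handling are
+ * assumptions): read the little-endian id back from the on-disk area
+ * and resolve it through plugin_by_unsafe_id(), which validates both
+ * the type and the id.
+ */
+static reiser4_plugin *sketch_load_plugin_id(reiser4_plugin_type type,
+					     const d16 *area)
+{
+	reiser4_plugin_id id;
+
+	id = le16_to_cpu(get_unaligned(area));
+	/* returns NULL (with a warning) if type or id is out of bounds */
+	return plugin_by_unsafe_id(type, id);
+}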
59287 +
59288 +/* list of all plugins of given type */
59289 +struct list_head *get_plugin_list(reiser4_plugin_type type)
59290 +{
59291 + assert("nikita-1056", is_plugin_type_valid(type));
59292 + return &plugins[type].plugins_list;
59293 +}
59294 +
59295 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
59296 +{
59297 + struct dentry *rootdir;
59298 + reiser4_inode *root;
59299 +
59300 + assert("edward-1443", memb != PSET_FILE);
59301 +
59302 + rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59303 + if (rootdir != NULL) {
59304 + root = reiser4_inode_data(rootdir->d_inode);
59305 + /*
59306 + * if this member differs from the root's default one, or we are
59307 + * changing a plugin of the root directory itself, update plugin_mask
59308 + */
59309 + if (aset_get(info->pset, memb) !=
59310 + aset_get(root->pset, memb) ||
59311 + info == root)
59312 + info->plugin_mask |= (1 << memb);
59313 + else
59314 + info->plugin_mask &= ~(1 << memb);
59315 + }
59316 +}
59317 +
59318 +/* Get specified plugin set member from parent,
59319 + or from fs-defaults (if no parent is given) and
59320 + install the result to pset of @self */
59321 +int grab_plugin_pset(struct inode *self,
59322 + struct inode *ancestor,
59323 + pset_member memb)
59324 +{
59325 + reiser4_plugin *plug;
59326 + reiser4_inode *info;
59327 + int result = 0;
59328 +
59329 + /* Do not grab if initialised already. */
59330 + info = reiser4_inode_data(self);
59331 + if (aset_get(info->pset, memb) != NULL)
59332 + return 0;
59333 + if (ancestor) {
59334 + reiser4_inode *parent;
59335 +
59336 + parent = reiser4_inode_data(ancestor);
59337 + plug = aset_get(parent->hset, memb) ? :
59338 + aset_get(parent->pset, memb);
59339 + }
59340 + else
59341 + plug = get_default_plugin(memb);
59342 +
59343 + result = set_plugin(&info->pset, memb, plug);
59344 + if (result == 0) {
59345 + if (!ancestor || self->i_sb->s_root->d_inode != self)
59346 + update_pset_mask(info, memb);
59347 + }
59348 + return result;
59349 +}
59350 +
59351 +/* Take missing pset members from root inode */
59352 +int finish_pset(struct inode *inode)
59353 +{
59354 + reiser4_plugin *plug;
59355 + reiser4_inode *root;
59356 + reiser4_inode *info;
59357 + pset_member memb;
59358 + int result = 0;
59359 +
59360 + root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
59361 + info = reiser4_inode_data(inode);
59362 +
59363 + assert("edward-1455", root != NULL);
59364 + assert("edward-1456", info != NULL);
59365 +
59366 + /* file and directory plugins are already initialized. */
59367 + for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
59368 +
59369 + /* Do not grab if initialised already. */
59370 + if (aset_get(info->pset, memb) != NULL)
59371 + continue;
59372 +
59373 + plug = aset_get(root->pset, memb);
59374 + result = set_plugin(&info->pset, memb, plug);
59375 + if (result != 0)
59376 + break;
59377 + }
59378 + if (result != 0) {
59379 + warning("nikita-3447",
59380 + "Cannot set up plugins for %lli",
59381 + (unsigned long long)
59382 + get_inode_oid(inode));
59383 + }
59384 + return result;
59385 +}
59386 +
59387 +int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
59388 +{
59389 + reiser4_inode *info;
59390 + int result = 0;
59391 +
59392 + if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
59393 + /* Changing pset in the root object. */
59394 + return RETERR(-EINVAL);
59395 + }
59396 +
59397 + info = reiser4_inode_data(self);
59398 + if (plug->h.pops != NULL && plug->h.pops->change != NULL)
59399 + result = plug->h.pops->change(self, plug, memb);
59400 + else
59401 + result = aset_set_unsafe(&info->pset, memb, plug);
59402 + if (result == 0) {
59403 + __u16 oldmask = info->plugin_mask;
59404 +
59405 + update_pset_mask(info, memb);
59406 + if (oldmask != info->plugin_mask)
59407 + reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
59408 + }
59409 + return result;
59410 +}
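+
+/*
+ * A sketch of how the pset helpers above are assumed to compose when a
+ * new object is created (simplified; the real call sites live in the
+ * object creation and read_inode paths): members named explicitly are
+ * grabbed from the parent, everything still missing is then taken from
+ * the root directory's defaults.
+ */
+static int sketch_init_pset(struct inode *object, struct inode *parent)
+{
+	int result;
+
+	/* inherit the hash plugin from @parent (fs defaults if no parent) */
+	result = grab_plugin_pset(object, parent, PSET_HASH);
+	if (result != 0)
+		return result;
+	/* fill every remaining member from the root directory */
+	return finish_pset(object);
+}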
59411 +
59412 +struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59413 + /* C99 designated initializers */
59414 + [REISER4_FILE_PLUGIN_TYPE] = {
59415 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59416 + .label = "file",
59417 + .desc = "Object plugins",
59418 + .builtin_num = sizeof_array(file_plugins),
59419 + .builtin = file_plugins,
59420 + .plugins_list = {NULL, NULL},
59421 + .size = sizeof(file_plugin)
59422 + },
59423 + [REISER4_DIR_PLUGIN_TYPE] = {
59424 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59425 + .label = "dir",
59426 + .desc = "Directory plugins",
59427 + .builtin_num = sizeof_array(dir_plugins),
59428 + .builtin = dir_plugins,
59429 + .plugins_list = {NULL, NULL},
59430 + .size = sizeof(dir_plugin)
59431 + },
59432 + [REISER4_HASH_PLUGIN_TYPE] = {
59433 + .type_id = REISER4_HASH_PLUGIN_TYPE,
59434 + .label = "hash",
59435 + .desc = "Directory hashes",
59436 + .builtin_num = sizeof_array(hash_plugins),
59437 + .builtin = hash_plugins,
59438 + .plugins_list = {NULL, NULL},
59439 + .size = sizeof(hash_plugin)
59440 + },
59441 + [REISER4_FIBRATION_PLUGIN_TYPE] = {
59442 + .type_id =
59443 + REISER4_FIBRATION_PLUGIN_TYPE,
59444 + .label = "fibration",
59445 + .desc = "Directory fibrations",
59446 + .builtin_num = sizeof_array(fibration_plugins),
59447 + .builtin = fibration_plugins,
59448 + .plugins_list = {NULL, NULL},
59449 + .size = sizeof(fibration_plugin)
59450 + },
59451 + [REISER4_CIPHER_PLUGIN_TYPE] = {
59452 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59453 + .label = "cipher",
59454 + .desc = "Cipher plugins",
59455 + .builtin_num = sizeof_array(cipher_plugins),
59456 + .builtin = cipher_plugins,
59457 + .plugins_list = {NULL, NULL},
59458 + .size = sizeof(cipher_plugin)
59459 + },
59460 + [REISER4_DIGEST_PLUGIN_TYPE] = {
59461 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
59462 + .label = "digest",
59463 + .desc = "Digest plugins",
59464 + .builtin_num = sizeof_array(digest_plugins),
59465 + .builtin = digest_plugins,
59466 + .plugins_list = {NULL, NULL},
59467 + .size = sizeof(digest_plugin)
59468 + },
59469 + [REISER4_COMPRESSION_PLUGIN_TYPE] = {
59470 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
59471 + .label = "compression",
59472 + .desc = "Compression plugins",
59473 + .builtin_num = sizeof_array(compression_plugins),
59474 + .builtin = compression_plugins,
59475 + .plugins_list = {NULL, NULL},
59476 + .size = sizeof(compression_plugin)
59477 + },
59478 + [REISER4_FORMATTING_PLUGIN_TYPE] = {
59479 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
59480 + .label = "formatting",
59481 + .desc = "Tail inlining policies",
59482 + .builtin_num = sizeof_array(formatting_plugins),
59483 + .builtin = formatting_plugins,
59484 + .plugins_list = {NULL, NULL},
59485 + .size = sizeof(formatting_plugin)
59486 + },
59487 + [REISER4_PERM_PLUGIN_TYPE] = {
59488 + .type_id = REISER4_PERM_PLUGIN_TYPE,
59489 + .label = "perm",
59490 + .desc = "Permission checks",
59491 + .builtin_num = sizeof_array(perm_plugins),
59492 + .builtin = perm_plugins,
59493 + .plugins_list = {NULL, NULL},
59494 + .size = sizeof(perm_plugin)
59495 + },
59496 + [REISER4_ITEM_PLUGIN_TYPE] = {
59497 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
59498 + .label = "item",
59499 + .desc = "Item handlers",
59500 + .builtin_num = sizeof_array(item_plugins),
59501 + .builtin = item_plugins,
59502 + .plugins_list = {NULL, NULL},
59503 + .size = sizeof(item_plugin)
59504 + },
59505 + [REISER4_NODE_PLUGIN_TYPE] = {
59506 + .type_id = REISER4_NODE_PLUGIN_TYPE,
59507 + .label = "node",
59508 + .desc = "node layout handlers",
59509 + .builtin_num = sizeof_array(node_plugins),
59510 + .builtin = node_plugins,
59511 + .plugins_list = {NULL, NULL},
59512 + .size = sizeof(node_plugin)
59513 + },
59514 + [REISER4_SD_EXT_PLUGIN_TYPE] = {
59515 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
59516 + .label = "sd_ext",
59517 + .desc = "Parts of stat-data",
59518 + .builtin_num = sizeof_array(sd_ext_plugins),
59519 + .builtin = sd_ext_plugins,
59520 + .plugins_list = {NULL, NULL},
59521 + .size = sizeof(sd_ext_plugin)
59522 + },
59523 + [REISER4_FORMAT_PLUGIN_TYPE] = {
59524 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
59525 + .label = "disk_layout",
59526 + .desc = "defines filesystem on disk layout",
59527 + .builtin_num = sizeof_array(format_plugins),
59528 + .builtin = format_plugins,
59529 + .plugins_list = {NULL, NULL},
59530 + .size = sizeof(disk_format_plugin)
59531 + },
59532 + [REISER4_JNODE_PLUGIN_TYPE] = {
59533 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
59534 + .label = "jnode",
59535 + .desc = "defines kind of jnode",
59536 + .builtin_num = sizeof_array(jnode_plugins),
59537 + .builtin = jnode_plugins,
59538 + .plugins_list = {NULL, NULL},
59539 + .size = sizeof(jnode_plugin)
59540 + },
59541 + [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
59542 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59543 + .label = "compression_mode",
59544 + .desc = "Defines compression mode",
59545 + .builtin_num = sizeof_array(compression_mode_plugins),
59546 + .builtin = compression_mode_plugins,
59547 + .plugins_list = {NULL, NULL},
59548 + .size = sizeof(compression_mode_plugin)
59549 + },
59550 + [REISER4_CLUSTER_PLUGIN_TYPE] = {
59551 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
59552 + .label = "cluster",
59553 + .desc = "Defines cluster size",
59554 + .builtin_num = sizeof_array(cluster_plugins),
59555 + .builtin = cluster_plugins,
59556 + .plugins_list = {NULL, NULL},
59557 + .size = sizeof(cluster_plugin)
59558 + }
59559 +};
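+
+/*
+ * A sketch of walking the registry defined above: after init_plugins()
+ * every builtin plugin is linked into its type's plugins_list through
+ * its header, so the registered labels can be enumerated like this
+ * (illustration only; the debug printing is an assumption):
+ */
+static void sketch_print_plugins(reiser4_plugin_type type)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, get_plugin_list(type)) {
+		plugin_header *h;
+
+		h = list_entry(pos, plugin_header, linkage);
+		printk(KERN_DEBUG "plugin [%s:%s]: %s\n",
+		       plugins[type].label, h->label, h->desc);
+	}
+}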
59560 +
59561 +/*
59562 + * Local variables:
59563 + * c-indentation-style: "K&R"
59564 + * mode-name: "LC"
59565 + * c-basic-offset: 8
59566 + * tab-width: 8
59567 + * fill-column: 120
59568 + * End:
59569 + */
59570 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin.h linux-2.6.22/fs/reiser4/plugin/plugin.h
59571 --- linux-2.6.22.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
59572 +++ linux-2.6.22/fs/reiser4/plugin/plugin.h 2007-07-29 00:25:34.996727537 +0400
59573 @@ -0,0 +1,899 @@
59574 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59575 +
59576 +/* Basic plugin data-types.
59577 + see fs/reiser4/plugin/plugin.c for details */
59578 +
59579 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
59580 +#define __FS_REISER4_PLUGIN_TYPES_H__
59581 +
59582 +#include "../forward.h"
59583 +#include "../debug.h"
59584 +#include "../dformat.h"
59585 +#include "../key.h"
59586 +#include "compress/compress.h"
59587 +#include "crypto/cipher.h"
59588 +#include "plugin_header.h"
59589 +#include "item/static_stat.h"
59590 +#include "item/internal.h"
59591 +#include "item/sde.h"
59592 +#include "item/cde.h"
59593 +#include "item/item.h"
59594 +#include "node/node.h"
59595 +#include "node/node40.h"
59596 +#include "security/perm.h"
59597 +#include "fibration.h"
59598 +
59599 +#include "space/bitmap.h"
59600 +#include "space/space_allocator.h"
59601 +
59602 +#include "disk_format/disk_format40.h"
59603 +#include "disk_format/disk_format.h"
59604 +
59605 +#include <linux/fs.h> /* for struct super_block, address_space */
59606 +#include <linux/mm.h> /* for struct page */
59607 +#include <linux/buffer_head.h> /* for struct buffer_head */
59608 +#include <linux/dcache.h> /* for struct dentry */
59609 +#include <linux/types.h>
59610 +#include <linux/crypto.h>
59611 +
59612 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
59613 +
59614 +/*
59615 + * File plugin. Defines the set of methods that file plugins implement, some
59616 + * of which are optional.
59617 + *
59618 + * A file plugin offers to the caller an interface for IO (writing to and/or
59619 + * reading from) to what the caller sees as one sequence of bytes. An IO to it
59620 + * may affect more than one physical sequence of bytes, or no physical sequence
59621 + * of bytes; it may affect sequences of bytes offered by other file plugins to
59622 + * the semantic layer, and the file plugin may invoke other plugins and
59623 + * delegate work to them, but its interface is structured for offering the
59624 + * caller the ability to read and/or write what the caller sees as being a
59625 + * single sequence of bytes.
59626 + *
59627 + * The file plugin must present a sequence of bytes to the caller, but it does
59628 + * not necessarily have to store a sequence of bytes, it does not necessarily
59629 + * have to support efficient tree traversal to any offset in the sequence of
59630 + * bytes (tail and extent items, whose keys contain offsets, do however provide
59631 + * efficient non-sequential lookup of any offset in the sequence of bytes).
59632 + *
59633 + * Directory plugins provide methods for selecting file plugins by resolving a
59634 + * name for them.
59635 + *
59636 + * The functionality other filesystems call an attribute, and rigidly tie
59637 + * together, we decompose into orthogonal selectable features of files. Using
59638 + * the terminology we will define next, an attribute is a perhaps constrained,
59639 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
59640 + * which might be grandparent-major-packed, and whose parent has a deletion
59641 + * method that deletes it.
59642 + *
59643 + * File plugins can implement constraints.
59644 + *
59645 + * Files can be of variable length (e.g. regular unix files), or of static
59646 + * length (e.g. static sized attributes).
59647 + *
59648 + * An object may have many sequences of bytes, and many file plugins, but it
59649 + * has exactly one objectid. It is usually desirable that an object has a
59650 + * deletion method which deletes every item with that objectid. Items cannot
59651 + * in general be found by just their objectids. This means that an object must
59652 + * have either a method built into its deletion plugin method for knowing what
59653 + * items need to be deleted, or links stored with the object that provide the
59654 + * plugin with a method for finding those items. Deleting a file within an
59655 + * object may or may not have the effect of deleting the entire object,
59656 + * depending on the file plugin's deletion method.
59657 + *
59658 + * LINK TAXONOMY:
59659 + *
59660 + * Many objects have a reference count, and when the reference count reaches 0
59661 + * the object's deletion method is invoked. Some links embody a reference
59662 + * count increase ("countlinks"), and others do not ("nocountlinks").
59663 + *
59664 + * Some links are bi-directional links ("bilinks"), and some are
59665 + * uni-directional("unilinks").
59666 + *
59667 + * Some links are between parts of the same object ("intralinks"), and some are
59668 + * between different objects ("interlinks").
59669 + *
59670 + * PACKING TAXONOMY:
59671 + *
59672 + * Some items of an object are stored with a major packing locality based on
59673 + * their object's objectid (e.g. unix directory items in plan A), and these are
59674 + * called "self-major-packed".
59675 + *
59676 + * Some items of an object are stored with a major packing locality based on
59677 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
59678 + * and these are called "parent-major-packed".
59679 + *
59680 + * Some items of an object are stored with a major packing locality based on
59681 + * their semantic grandparent, and these are called "grandparent-major-packed".
59682 + * Now carefully notice that we run into trouble with key length if we have to
59683 + * store an 8 byte major+minor grandparent based packing locality, an 8 byte
59684 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
59685 + * a 24 byte key. One of these fields must be sacrificed if an item is to be
59686 + * grandparent-major-packed, and which to sacrifice is left to the item author
59687 + * choosing to make the item grandparent-major-packed. You cannot make tail
59688 + * items and extent items grandparent-major-packed, though you could make them
59689 + * self-major-packed (usually they are parent-major-packed).
59690 + *
59691 + * In the case of ACLs (which are composed of fixed length ACEs which consist
59692 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
59693 + * to not have an offset field in the ACE item key, and to allow duplicate keys
59694 + * for ACEs. Thus, the set of ACEs for a given file is found by looking for a
59695 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
59696 + * a directory together), the minor packing locality of ACE, the objectid of
59697 + * the file, and 0.
59698 + *
59699 + * IO involves moving data from one location to another, which means that two
59700 + * locations must be specified, source and destination.
59701 + *
59702 + * This source and destination can be in the filesystem, or they can be a
59703 + * pointer in the user process address space plus a byte count.
59704 + *
59705 + * If both source and destination are in the filesystem, then at least one of
59706 + * them must be representable as a pure stream of bytes (which we call a flow,
59707 + * and define as a struct containing a key, a data pointer, and a length).
59708 + * This may mean converting one of them into a flow. We provide a generic
59709 + * cast_into_flow() method, which will work for any plugin supporting
59710 + * read_flow(), though it is inefficiently implemented in that it temporarily
59711 + * stores the flow in a buffer (Question: what to do with huge flows that
59712 + * cannot fit into memory? Answer: we must not convert them all at once. )
59713 + *
59714 + * Performing a write requires resolving the write request into a flow defining
59715 + * the source, and a method that performs the write, and a key that defines
59716 + * where in the tree the write is to go.
59717 + *
59718 + * Performing a read requires resolving the read request into a flow defining
59719 + * the target, and a method that performs the read, and a key that defines
59720 + * where in the tree the read is to come from.
59721 + *
59722 + * There will exist file plugins which have no pluginid stored on the disk for
59723 + * them, and which are only invoked by other plugins.
59724 + */
59725 +
59726 +/* This should be incremented with each new contributed
59727 + pair (plugin type, plugin id).
59728 + NOTE: Make sure there is a release of reiser4progs
59729 + with the corresponding version number */
59730 +#define PLUGIN_LIBRARY_VERSION 0
59731 +
59732 + /* enumeration of fields within plugin_set */
59733 +typedef enum {
59734 + PSET_FILE,
59735 + PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
59736 + * inode.c:read_inode() depends on this. */
59737 + PSET_PERM,
59738 + PSET_FORMATTING,
59739 + PSET_HASH,
59740 + PSET_FIBRATION,
59741 + PSET_SD,
59742 + PSET_DIR_ITEM,
59743 + PSET_CIPHER,
59744 + PSET_DIGEST,
59745 + PSET_COMPRESSION,
59746 + PSET_COMPRESSION_MODE,
59747 + PSET_CLUSTER,
59748 + PSET_CREATE,
59749 + PSET_LAST
59750 +} pset_member;
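+
+/*
+ * Tying this enum back to update_pset_mask() in plugin.c: each member
+ * is assumed to own one bit of the per-inode plugin_mask, set when the
+ * member differs from the root's default and therefore has to be
+ * stored in stat-data. A sketch of the check (hypothetical helper):
+ */
+static inline int sketch_pset_member_is_stored(__u16 plugin_mask,
+					       pset_member memb)
+{
+	return (plugin_mask & (1 << memb)) != 0;
+}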
59751 +
59752 +/* builtin file-plugins */
59753 +typedef enum {
59754 + /* regular file */
59755 + UNIX_FILE_PLUGIN_ID,
59756 + /* directory */
59757 + DIRECTORY_FILE_PLUGIN_ID,
59758 + /* symlink */
59759 + SYMLINK_FILE_PLUGIN_ID,
59760 + /* for objects completely handled by the VFS: fifos, devices,
59761 + sockets */
59762 + SPECIAL_FILE_PLUGIN_ID,
59763 + /* regular cryptcompress file */
59764 + CRYPTCOMPRESS_FILE_PLUGIN_ID,
59765 + /* number of file plugins. Used as size of arrays to hold
59766 + file plugins. */
59767 + LAST_FILE_PLUGIN_ID
59768 +} reiser4_file_id;
59769 +
59770 +typedef struct file_plugin {
59771 +
59772 + /* generic fields */
59773 + plugin_header h;
59774 +
59775 + struct inode_operations inode_ops;
59776 + struct file_operations file_ops;
59777 + struct address_space_operations as_ops;
59778 +
59779 + /* save inode cached stat-data onto disk. It was called
59780 + reiserfs_update_sd() in 3.x */
59781 + int (*write_sd_by_inode) (struct inode *);
59782 +
59783 + /*
59784 + * private methods: These are optional. If used they will allow you to
59785 + * minimize the amount of code needed to implement a deviation from
59786 + * some other method that also uses them.
59787 + */
59788 +
59789 + /*
59790 + * Construct flow into @flow according to user-supplied data.
59791 + *
59792 + * This is used by read/write methods to construct a flow to
59793 + * write/read. ->flow_by_inode() is plugin method, rather than single
59794 + * global implementation, because key in a flow used by plugin may
59795 + * depend on data in a @buf.
59796 + *
59797 + * NIKITA-FIXME-HANS: please create statistics on what functions are
59798 + * dereferenced how often for the mongo benchmark. You can supervise
59799 + * Elena doing this for you if that helps. Email me the list of the
59800 + * top 10, with their counts, and an estimate of the total number of
59801 + * CPU cycles spent dereferencing as a percentage of CPU cycles spent
59802 + * processing (non-idle processing). If the total percent is, say,
59803 + * less than 1%, it will make our coding discussions much easier, and
59804 + * keep me from questioning whether functions like the below are too
59805 + * frequently called to be dereferenced. If the total percent is more
59806 + * than 1%, perhaps private methods should be listed in a "required"
59807 + * comment at the top of each plugin (with stern language about how if
59808 + * the comment is missing it will not be accepted by the maintainer),
59809 + * and implemented using macros not dereferenced functions. How about
59810 + * replacing this whole private methods part of the struct with a
59811 + * thorough documentation of what the standard helper functions are for
59812 + * use in constructing plugins? I think users have been asking for
59813 + * that, though not in so many words.
59814 + */
59815 + int (*flow_by_inode) (struct inode *, const char __user *buf,
59816 + int user, loff_t size,
59817 + loff_t off, rw_op op, flow_t *);
59818 +
59819 + /*
59820 + * Return the key used to retrieve an offset of a file. It is used by
59821 + * default implementation of ->flow_by_inode() method
59822 + * (common_build_flow()) and, among other things, to get to the extent
59823 + * from jnode of unformatted node.
59824 + */
59825 + int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
59826 +
59827 + /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
59828 + /*
59829 + * set the plugin for a file. Called during file creation in creat()
59830 + * but not reiser4() unless an inode already exists for the file.
59831 + */
59832 + int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
59833 + reiser4_object_create_data *);
59834 +
59835 + /* NIKITA-FIXME-HANS: comment and name seem to say different things,
59836 + * are you setting up the object itself also or just adjusting the
59837 + * parent?.... */
59838 + /* set up plugins for new @object created in @parent. @root is root
59839 + directory. */
59840 + int (*adjust_to_parent) (struct inode *object, struct inode *parent,
59841 + struct inode *root);
59842 + /*
59843 + * this does whatever is necessary to do when object is created. For
59844 + * instance, for unix files stat data is inserted. It is supposed to be
59845 + * called by create of struct inode_operations.
59846 + */
59847 + int (*create_object) (struct inode *object, struct inode *parent,
59848 + reiser4_object_create_data *);
59849 +
59850 + /* this does whatever is necessary to do when object is opened */
59851 + int (*open_object) (struct inode * inode, struct file * file);
59852 + /*
59853 + * this method should check REISER4_NO_SD and set REISER4_NO_SD on
59854 + * success. Deletion of an object usually includes removal of items
59855 + * building file body (for directories this is removal of "." and "..")
59856 + * and removal of stat-data item.
59857 + */
59858 + int (*delete_object) (struct inode *);
59859 +
59860 + /* add link from @parent to @object */
59861 + int (*add_link) (struct inode *object, struct inode *parent);
59862 +
59863 + /* remove link from @parent to @object */
59864 + int (*rem_link) (struct inode *object, struct inode *parent);
59865 +
59866 + /*
59867 + * return true if item addressed by @coord belongs to @inode. This is
59868 + * used by read/write to properly slice flow into items in presence of
59869 + * multiple key assignment policies, because items of a file are not
59870 + * necessarily contiguous in a key space, for example, in a plan-b.
59871 + */
59872 + int (*owns_item) (const struct inode *, const coord_t *);
59873 +
59874 + /* checks whether yet another hard link to this object can be
59875 + added */
59876 + int (*can_add_link) (const struct inode *);
59877 +
59878 + /* checks whether hard links to this object can be removed */
59879 + int (*can_rem_link) (const struct inode *);
59880 +
59881 + /* currently non-empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
59882 + detach of the directory plugin to remove ".." */
59883 + int (*detach) (struct inode * child, struct inode * parent);
59884 +
59885 + /* called when @child was just looked up in the @parent. It is
59886 + non-empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls attach
59887 + of the directory plugin */
59888 + int (*bind) (struct inode * child, struct inode * parent);
59889 +
59890 + /* process safe-link during mount */
59891 + int (*safelink) (struct inode * object, reiser4_safe_link_t link,
59892 + __u64 value);
59893 +
59894 + /* The couple of estimate methods for all file operations */
59895 + struct {
59896 + reiser4_block_nr(*create) (const struct inode *);
59897 + reiser4_block_nr(*update) (const struct inode *);
59898 + reiser4_block_nr(*unlink) (const struct inode *,
59899 + const struct inode *);
59900 + } estimate;
59901 +
59902 + /*
59903 + * reiser4 specific part of inode has a union of structures which are
59904 + * specific to a plugin. This method is called when inode is read
59905 + * (read_inode) and when a file is created (common_create_child) so
59906 + * that the file plugin can initialize its inode data
59907 + */
59908 + void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
59909 + int);
59910 +
59911 + /*
59912 + * This method performs progressive deletion of items and whole nodes
59913 + * from right to left.
59914 + *
59915 + * @tap: the point deletion process begins from,
59916 + * @from_key: the beginning of the deleted key range,
59917 + * @to_key: the end of the deleted key range,
59918 + * @smallest_removed: the smallest removed key,
59919 + *
59920 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
59921 + * operation was interrupted for allowing atom commit .
59922 + */
59923 + int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
59924 + const reiser4_key * to_key,
59925 + reiser4_key * smallest_removed, struct inode *,
59926 + int, int *);
59927 +
59928 + /* called from ->destroy_inode() */
59929 + void (*destroy_inode) (struct inode *);
59930 +
59931 + /*
59932 + * methods to serialize object identity. This is used, for example, by
59933 + * reiser4_{en,de}code_fh().
59934 + */
59935 + struct {
59936 + /* store object's identity at @area */
59937 + char *(*write) (struct inode * inode, char *area);
59938 + /* parse object from wire to the @obj */
59939 + char *(*read) (char *area, reiser4_object_on_wire * obj);
59940 + /* given object identity in @obj, find or create its dentry */
59941 + struct dentry *(*get) (struct super_block * s,
59942 + reiser4_object_on_wire * obj);
59943 + /* how many bytes ->wire.write() consumes */
59944 + int (*size) (struct inode * inode);
59945 + /* finish with object identity */
59946 + void (*done) (reiser4_object_on_wire * obj);
59947 + } wire;
59948 +} file_plugin;
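+
+/*
+ * To make the flow discussion in the header comment above concrete, a
+ * write is resolved roughly as sketched below: the file plugin builds
+ * a flow (key + data pointer + length) describing the source, and the
+ * key determines where in the tree the data goes. The
+ * inode_file_plugin() accessor and the WRITE_OP constant are
+ * assumptions based on the surrounding code, not a definitive call
+ * sequence.
+ */
+static inline int sketch_build_write_flow(struct inode *inode,
+					  const char __user *buf,
+					  loff_t count, loff_t pos,
+					  flow_t *flow)
+{
+	file_plugin *fplug = inode_file_plugin(inode);
+
+	/* 1 == @buf lives in user space; WRITE_OP selects write semantics */
+	return fplug->flow_by_inode(inode, buf, 1, count, pos,
+				    WRITE_OP, flow);
+}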
59949 +
59950 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
59951 +
59952 +struct reiser4_object_on_wire {
59953 + file_plugin *plugin;
59954 + union {
59955 + struct {
59956 + obj_key_id key_id;
59957 + } std;
59958 + void *generic;
59959 + } u;
59960 +};
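+
+/*
+ * A sketch of how the ->wire methods above are assumed to cooperate
+ * when an object identity is serialized (e.g. for NFS file handles):
+ * size the buffer first, then let the plugin append its identity.
+ * Buffer management is deliberately simplified for illustration:
+ */
+static inline char *sketch_wire_encode(struct inode *inode, char *area,
+				       int avail)
+{
+	file_plugin *fplug = inode_file_plugin(inode);
+
+	if (fplug->wire.size(inode) > avail)
+		return NULL;	/* caller's buffer is too small */
+	/* returns the position right past the written identity */
+	return fplug->wire.write(inode, area);
+}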
59961 +
59962 +/* builtin dir-plugins */
59963 +typedef enum {
59964 + HASHED_DIR_PLUGIN_ID,
59965 + SEEKABLE_HASHED_DIR_PLUGIN_ID,
59966 + LAST_DIR_ID
59967 +} reiser4_dir_id;
59968 +
59969 +typedef struct dir_plugin {
59970 + /* generic fields */
59971 + plugin_header h;
59972 +
59973 + struct inode_operations inode_ops;
59974 + struct file_operations file_ops;
59975 + struct address_space_operations as_ops;
59976 +
59977 + /*
59978 + * private methods: These are optional. If used they will allow you to
59979 + * minimize the amount of code needed to implement a deviation from
59980 + * some other method that uses them. You could logically argue that
59981 + * they should be a separate type of plugin.
59982 + */
59983 +
59984 + struct dentry *(*get_parent) (struct inode * childdir);
59985 +
59986 + /*
59987 + * check whether "name" is acceptable name to be inserted into this
59988 + * object. Optionally implemented by directory-like objects. Can check
59989 + * for maximal length, reserved symbols etc.
59990 + */
59991 + int (*is_name_acceptable) (const struct inode * inode, const char *name,
59992 + int len);
59993 +
59994 + void (*build_entry_key) (const struct inode * dir /* directory where
59995 + * entry is (or will
59996 + * be) in.*/ ,
59997 + const struct qstr * name /* name of file
59998 + * referenced by this
59999 + * entry */ ,
60000 + reiser4_key * result /* resulting key of
60001 + * directory entry */ );
60002 + int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60003 + int (*add_entry) (struct inode * object, struct dentry * where,
60004 + reiser4_object_create_data * data,
60005 + reiser4_dir_entry_desc * entry);
60006 + int (*rem_entry) (struct inode * object, struct dentry * where,
60007 + reiser4_dir_entry_desc * entry);
60008 +
60009 + /*
60010 + * initialize directory structure for newly created object. For normal
60011 + * unix directories, insert dot and dotdot.
60012 + */
60013 + int (*init) (struct inode * object, struct inode * parent,
60014 + reiser4_object_create_data * data);
60015 +
60016 + /* destroy directory */
60017 + int (*done) (struct inode * child);
60018 +
60019 + /* called when @subdir was just looked up in the @dir */
60020 + int (*attach) (struct inode * subdir, struct inode * dir);
60021 + int (*detach) (struct inode * subdir, struct inode * dir);
60022 +
60023 + struct {
60024 + reiser4_block_nr(*add_entry) (const struct inode *);
60025 + reiser4_block_nr(*rem_entry) (const struct inode *);
60026 + reiser4_block_nr(*unlink) (const struct inode *,
60027 + const struct inode *);
60028 + } estimate;
60029 +} dir_plugin;
60030 +
60031 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60032 +
60033 +typedef struct formatting_plugin {
60034 + /* generic fields */
60035 + plugin_header h;
60036 + /* returns non-zero iff file's tail has to be stored
60037 + in a direct item. */
60038 + int (*have_tail) (const struct inode * inode, loff_t size);
60039 +} formatting_plugin;
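+
+/*
+ * A minimal formatting (tail) policy sketch: keep the tail in a direct
+ * item only while the file is smaller than one block. This merely
+ * illustrates the ->have_tail contract; it is not one of the builtin
+ * policies.
+ */
+static inline int sketch_small_file_have_tail(const struct inode *inode,
+					      loff_t size)
+{
+	/* non-zero means "store the tail in a formatted (direct) item" */
+	return size < inode->i_sb->s_blocksize;
+}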
60040 +
60041 +typedef struct hash_plugin {
60042 + /* generic fields */
60043 + plugin_header h;
60044 + /* computes hash of the given name */
60045 + __u64(*hash) (const unsigned char *name, int len);
60046 +} hash_plugin;
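+
+/*
+ * A sketch of a hash method satisfying the contract above, modelled on
+ * the well-known r5 hash (shown for illustration only; the builtin
+ * instances live in plugin/hash.c):
+ */
+static inline __u64 sketch_hash_r5(const unsigned char *name, int len)
+{
+	__u64 a = 0;
+	int i;
+
+	for (i = 0; i < len; i++) {
+		a += name[i] << 4;
+		a += name[i] >> 4;
+		a *= 11;
+	}
+	return a;
+}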
60047 +
60048 +typedef struct cipher_plugin {
60049 + /* generic fields */
60050 + plugin_header h;
60051 + struct crypto_blkcipher * (*alloc) (void);
60052 + void (*free) (struct crypto_blkcipher * tfm);
60053 + /* Offset translator. For each offset this returns (k * offset), where
60054 + k (k >= 1) is an expansion factor of the cipher algorithm.
60055 + For all symmetric algorithms k == 1. For asymmetric algorithms (which
60056 + inflate data) offset translation guarantees that all of a disk
60057 + cluster's units will have keys smaller than the next cluster's ones.
60058 + */
60059 + loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60060 + /* Cipher algorithms can accept data only in chunks of the cipher
60061 + block size. This method aligns a flow up to the cipher block size
60062 + before we pass it to the cipher algorithm. To align means to append
60063 + padding of a special format specific to the cipher algorithm */
60064 + int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60065 + /* low-level key manager (check, install, etc..) */
60066 + int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60067 + unsigned int keylen);
60068 + /* main text processing procedures */
60069 + void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60070 + void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60071 +} cipher_plugin;
60072 +
60073 +typedef struct digest_plugin {
60074 + /* generic fields */
60075 + plugin_header h;
60076 + /* fingerprint size in bytes */
60077 + int fipsize;
60078 + struct crypto_hash * (*alloc) (void);
60079 + void (*free) (struct crypto_hash * tfm);
60080 +} digest_plugin;
60081 +
60082 +typedef struct compression_plugin {
60083 + /* generic fields */
60084 + plugin_header h;
60085 + int (*init) (void);
60086 + /* the maximum number of bytes by which the size of the "compressed"
60087 + * data can exceed that of the uncompressed data */
60088 + int (*overrun) (unsigned src_len);
60089 + coa_t(*alloc) (tfm_action act);
60090 + void (*free) (coa_t coa, tfm_action act);
60091 + /* minimal size of the flow we still try to compress */
60092 + int (*min_size_deflate) (void);
60093 + __u32(*checksum) (char *data, __u32 length);
60094 + /* main transform procedures */
60095 + void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60096 + __u8 * dst_first, unsigned *dst_len);
60097 + void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60098 + __u8 * dst_first, unsigned *dst_len);
60099 +} compression_plugin;
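+
+/*
+ * The ->overrun method above exists because "compressed" data can come
+ * out larger than its input. A sketch of the implied destination
+ * buffer sizing for a deflate step (an assumption about the calling
+ * convention, not the actual cryptcompress code):
+ */
+static inline unsigned sketch_deflate_buf_size(compression_plugin *cplug,
+					       unsigned src_len)
+{
+	/* room for the input plus the worst-case expansion */
+	return src_len + cplug->overrun(src_len);
+}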
60100 +
60101 +typedef struct compression_mode_plugin {
60102 + /* generic fields */
60103 + plugin_header h;
60104 + /* this is called when estimating compressibility
60105 + of a logical cluster by its content */
60106 + int (*should_deflate) (struct inode * inode, cloff_t index);
60107 + /* this is called when results of compression should be saved */
60108 + int (*accept_hook) (struct inode * inode, cloff_t index);
60109 + /* this is called when results of compression should be discarded */
60110 + int (*discard_hook) (struct inode * inode, cloff_t index);
60111 +} compression_mode_plugin;
60112 +
60113 +typedef struct cluster_plugin {
60114 + /* generic fields */
60115 + plugin_header h;
60116 + int shift;
60117 +} cluster_plugin;
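+
+/*
+ * The single ->shift field above is assumed to encode the logical
+ * cluster size as a power of two in bytes (e.g. shift 12 for 4K and
+ * shift 16 for 64K clusters); a sketch of the derived size:
+ */
+static inline loff_t sketch_cluster_size(const cluster_plugin *cplug)
+{
+	return 1LL << cplug->shift;
+}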
60118 +
60119 +typedef struct sd_ext_plugin {
60120 + /* generic fields */
60121 + plugin_header h;
60122 + int (*present) (struct inode * inode, char **area, int *len);
60123 + int (*absent) (struct inode * inode);
60124 + int (*save_len) (struct inode * inode);
60125 + int (*save) (struct inode * inode, char **area);
60126 + /* alignment requirement for this stat-data part */
60127 + int alignment;
60128 +} sd_ext_plugin;
60129 +
60130 +/* this plugin contains methods to allocate an objectid for a newly created
60131 +   file, to deallocate the objectid when the file gets removed, and to report
60132 +   the number of used and free objectids */
60133 +typedef struct oid_allocator_plugin {
60134 + /* generic fields */
60135 + plugin_header h;
60136 + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60137 + __u64 oids);
60138 + /* used to report statfs->f_files */
60139 + __u64(*oids_used) (reiser4_oid_allocator * map);
60140 + /* get next oid to use */
60141 + __u64(*next_oid) (reiser4_oid_allocator * map);
60142 + /* used to report statfs->f_ffree */
60143 + __u64(*oids_free) (reiser4_oid_allocator * map);
60144 + /* allocate new objectid */
60145 + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60146 + /* release objectid */
60147 + int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60148 + /* how many pages to reserve in transaction for allocation of new
60149 + objectid */
60150 + int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60151 + /* how many pages to reserve in transaction for freeing of an
60152 + objectid */
60153 + int (*oid_reserve_release) (reiser4_oid_allocator * map);
60154 + void (*print_info) (const char *, reiser4_oid_allocator *);
60155 +} oid_allocator_plugin;
60156 +
60157 +/* disk layout plugin: this specifies the locations of the super block,
60158 +   journal, bitmaps (if there are any), etc */
60159 +typedef struct disk_format_plugin {
60160 + /* generic fields */
60161 + plugin_header h;
60162 + /* replay journal, initialize super_info_data, etc */
60163 + int (*init_format) (struct super_block *, void *data);
60164 +
60165 + /* key of root directory stat data */
60166 + const reiser4_key *(*root_dir_key) (const struct super_block *);
60167 +
60168 + int (*release) (struct super_block *);
60169 + jnode *(*log_super) (struct super_block *);
60170 + int (*check_open) (const struct inode * object);
60171 + int (*version_update) (struct super_block *);
60172 +} disk_format_plugin;
60173 +
60174 +struct jnode_plugin {
60175 + /* generic fields */
60176 + plugin_header h;
60177 + int (*init) (jnode * node);
60178 + int (*parse) (jnode * node);
60179 + struct address_space *(*mapping) (const jnode * node);
60180 + unsigned long (*index) (const jnode * node);
60181 + jnode *(*clone) (jnode * node);
60182 +};
60183 +
60184 +/* plugin instance.                                                        */
60185 +/*                                                                         */
60186 +/* This is a "wrapper" union for all types of plugins. Most of the code   */
60187 +/* uses plugins of a particular type (file_plugin, dir_plugin, etc.)      */
60188 +/* rather than operating on pointers to reiser4_plugin. This union is     */
60189 +/* only used in some generic code in plugin/plugin.c that operates on all */
60190 +/* plugins. Technically speaking, the purpose of this union is to add     */
60191 +/* type safety to said generic code: each plugin type (file_plugin, for   */
60192 +/* example) contains plugin_header as its first member. This first member */
60193 +/* is located at the same place in memory as the .h member of             */
60194 +/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and   */
60195 +/* looks at .h, which is the header of the plugin type stored in the      */
60196 +/* union. This makes it possible to avoid type-casts.                     */
60197 +union reiser4_plugin {
60198 + /* generic fields */
60199 + plugin_header h;
60200 + /* file plugin */
60201 + file_plugin file;
60202 + /* directory plugin */
60203 + dir_plugin dir;
60204 + /* hash plugin, used by directory plugin */
60205 + hash_plugin hash;
60206 + /* fibration plugin used by directory plugin */
60207 + fibration_plugin fibration;
60208 + /* cipher transform plugin, used by file plugin */
60209 + cipher_plugin cipher;
60210 + /* digest transform plugin, used by file plugin */
60211 + digest_plugin digest;
60212 + /* compression transform plugin, used by file plugin */
60213 + compression_plugin compression;
60214 + /* tail plugin, used by file plugin */
60215 + formatting_plugin formatting;
60216 + /* permission plugin */
60217 + perm_plugin perm;
60218 + /* node plugin */
60219 + node_plugin node;
60220 + /* item plugin */
60221 + item_plugin item;
60222 + /* stat-data extension plugin */
60223 + sd_ext_plugin sd_ext;
60224 + /* disk layout plugin */
60225 + disk_format_plugin format;
60226 + /* object id allocator plugin */
60227 + oid_allocator_plugin oid_allocator;
60228 + /* plugin for different jnode types */
60229 + jnode_plugin jnode;
60230 + /* compression mode plugin, used by object plugin */
60231 + compression_mode_plugin compression_mode;
60232 + /* cluster plugin, used by object plugin */
60233 + cluster_plugin clust;
60234 + /* place-holder for new plugin types that can be registered
60235 + dynamically, and used by other dynamically loaded plugins. */
60236 + void *generic;
60237 +};
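A minimal sketch of what this layout buys (the helper below is hypothetical; it assumes only the declarations above): generic code can inspect any plugin through the union without knowing its concrete type.

/* works for any plugin type, because plugin_header is the first member
   of every concrete plugin structure and thus overlays .h exactly */
static inline const char *plugin_label(reiser4_plugin *plugin)
{
	return plugin->h.label;
}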
60238 +
60239 +struct reiser4_plugin_ops {
60240 + /* called when plugin is initialized */
60241 + int (*init) (reiser4_plugin * plugin);
60242 + /* called when plugin is unloaded */
60243 + int (*done) (reiser4_plugin * plugin);
60244 + /* load given plugin from disk */
60245 + int (*load) (struct inode * inode,
60246 + reiser4_plugin * plugin, char **area, int *len);
60247 +	/* how much space is required to store this plugin's state
60248 +	   in stat-data */
60249 + int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60250 + /* save persistent plugin-data to disk */
60251 + int (*save) (struct inode * inode, reiser4_plugin * plugin,
60252 + char **area);
60253 + /* alignment requirement for on-disk state of this plugin
60254 + in number of bytes */
60255 + int alignment;
60256 + /* install itself into given inode. This can return error
60257 + (e.g., you cannot change hash of non-empty directory). */
60258 + int (*change) (struct inode * inode, reiser4_plugin * plugin,
60259 + pset_member memb);
60260 +	/* inherit plugin settings from the parent inode into the given
60261 +	   inode */
60262 + int (*inherit) (struct inode * inode, struct inode * parent,
60263 + reiser4_plugin * plugin);
60264 +};
60265 +
60266 +/* functions implemented in fs/reiser4/plugin/plugin.c */
60267 +
60268 +/* stores plugin reference in reiser4-specific part of inode */
60269 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60270 +extern int init_plugins(void);
60271 +
60272 +/* builtin plugins */
60273 +
60274 +/* builtin hash-plugins */
60275 +
60276 +typedef enum {
60277 + RUPASOV_HASH_ID,
60278 + R5_HASH_ID,
60279 + TEA_HASH_ID,
60280 + FNV1_HASH_ID,
60281 + DEGENERATE_HASH_ID,
60282 + LAST_HASH_ID
60283 +} reiser4_hash_id;
60284 +
60285 +/* builtin cipher plugins */
60286 +
60287 +typedef enum {
60288 + NONE_CIPHER_ID,
60289 + LAST_CIPHER_ID
60290 +} reiser4_cipher_id;
60291 +
60292 +/* builtin digest plugins */
60293 +
60294 +typedef enum {
60295 + SHA256_32_DIGEST_ID,
60296 + LAST_DIGEST_ID
60297 +} reiser4_digest_id;
60298 +
60299 +/* builtin compression mode plugins */
60300 +typedef enum {
60301 + NONE_COMPRESSION_MODE_ID,
60302 + LATTD_COMPRESSION_MODE_ID,
60303 + ULTIM_COMPRESSION_MODE_ID,
60304 + FORCE_COMPRESSION_MODE_ID,
60305 + CONVX_COMPRESSION_MODE_ID,
60306 + LAST_COMPRESSION_MODE_ID
60307 +} reiser4_compression_mode_id;
60308 +
60309 +/* builtin cluster plugins */
60310 +typedef enum {
60311 + CLUSTER_64K_ID,
60312 + CLUSTER_32K_ID,
60313 + CLUSTER_16K_ID,
60314 + CLUSTER_8K_ID,
60315 + CLUSTER_4K_ID,
60316 + LAST_CLUSTER_ID
60317 +} reiser4_cluster_id;
60318 +
60319 +/* builtin tail-plugins */
60320 +
60321 +typedef enum {
60322 + NEVER_TAILS_FORMATTING_ID,
60323 + ALWAYS_TAILS_FORMATTING_ID,
60324 + SMALL_FILE_FORMATTING_ID,
60325 + LAST_TAIL_FORMATTING_ID
60326 +} reiser4_formatting_id;
60327 +
60328 +/* data type used to pack parameters that we pass to vfs object creation
60329 + function create_object() */
60330 +struct reiser4_object_create_data {
60331 + /* plugin to control created object */
60332 + reiser4_file_id id;
60333 + /* mode of regular file, directory or special file */
60334 +/* what happens if some other sort of perm plugin is in use? */
60335 + int mode;
60336 + /* rdev of special file */
60337 + dev_t rdev;
60338 + /* symlink target */
60339 + const char *name;
60340 +	/* add here something for non-standard objects you invent, like a
60341 +	   query for an interpolation file, etc. */
60342 +
60343 + struct reiser4_crypto_info * crypto;
60344 +
60345 + struct inode *parent;
60346 + struct dentry *dentry;
60347 +};
60348 +
60349 +/* description of directory entry being created/destroyed/sought for
60350 +
60351 +   It is passed down to the directory plugin and further to the
60352 +   directory item plugin methods. Creation of a new directory entry is done
60353 +   in several stages: first we search for an entry with the same name, then
60354 +   create the new one. reiser4_dir_entry_desc is used to store information
60355 +   collected at one stage of this process and required later: the key of the
60356 +   item that we want to insert/delete and a pointer to the object that will
60357 +   be bound by the new directory entry. Probably some more fields will
60358 +   be added here.
60359 +
60360 +*/
60361 +struct reiser4_dir_entry_desc {
60362 + /* key of directory entry */
60363 + reiser4_key key;
60364 + /* object bound by this entry. */
60365 + struct inode *obj;
60366 +};
60367 +
60368 +#define MAX_PLUGIN_TYPE_LABEL_LEN 32
60369 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32
60370 +
60371 +#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
60372 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
60373 +{ \
60374 + reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
60375 + return plugin ? & plugin -> FIELD : NULL; \
60376 +} \
60377 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60378 +{ \
60379 + reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
60380 + return plugin ? & plugin -> FIELD : NULL; \
60381 +} \
60382 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
60383 +{ \
60384 + reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
60385 + return plugin ? & plugin -> FIELD : NULL; \
60386 +} \
60387 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
60388 +{ \
60389 + return ( reiser4_plugin * ) plugin; \
60390 +} \
60391 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
60392 +{ \
60393 + return TYPE ## _to_plugin (plugin) -> h.id; \
60394 +} \
60395 +typedef struct { int foo; } TYPE ## _plugin_dummy
60396 +
60397 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60398 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60399 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60400 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60401 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60402 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60403 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60404 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60405 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60406 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60407 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60408 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60409 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60410 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60411 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60412 + compression_mode);
60413 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
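Each PLUGIN_BY_ID line above stamps out typed accessors; the hash_plugin instantiation, for example, generates hash_plugin_by_id(). A hedged usage sketch (the wrapper function is invented for illustration; R5_HASH_ID comes from the reiser4_hash_id enum above):

/* look up a built-in hash plugin by id, getting a typed pointer back */
static inline hash_plugin *demo_r5_hash(void)
{
	/* generated by PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash) */
	return hash_plugin_by_id(R5_HASH_ID);
}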
60414 +
60415 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60416 +
60417 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60418 +
60419 +#define for_all_plugins(ptype, plugin) \
60420 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
60421 + get_plugin_list(ptype) != &plugin->h.linkage; \
60422 + plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
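For instance, the following sketch (hypothetical function; it assumes a kernel context where printk is available) walks every registered hash plugin via the per-type list linked through h.linkage:

static void demo_list_hash_plugins(void)
{
	reiser4_plugin *plugin;

	for_all_plugins(REISER4_HASH_PLUGIN_TYPE, plugin)
		printk(KERN_INFO "hash plugin: %s (%s)\n",
		       plugin->h.label, plugin->h.desc);
}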
60423 +
60424 +
60425 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
60426 +extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
60427 +extern int finish_pset(struct inode *inode);
60428 +
60429 +/* defined in fs/reiser4/plugin/object.c */
60430 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60431 +/* defined in fs/reiser4/plugin/object.c */
60432 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60433 +/* defined in fs/reiser4/plugin/item/static_stat.c */
60434 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
60435 +/* defined in fs/reiser4/plugin/hash.c */
60436 +extern hash_plugin hash_plugins[LAST_HASH_ID];
60437 +/* defined in fs/reiser4/plugin/fibration.c */
60438 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
60439 +/* defined in fs/reiser4/plugin/crypt.c */
60440 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
60441 +/* defined in fs/reiser4/plugin/digest.c */
60442 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
60443 +/* defined in fs/reiser4/plugin/compress/compress.c */
60444 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
60445 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
60446 +extern compression_mode_plugin
60447 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
60448 +/* defined in fs/reiser4/plugin/cluster.c */
60449 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
60450 +/* defined in fs/reiser4/plugin/tail.c */
60451 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
60452 +/* defined in fs/reiser4/plugin/security/security.c */
60453 +extern perm_plugin perm_plugins[LAST_PERM_ID];
60454 +/* defined in fs/reiser4/plugin/item/item.c */
60455 +extern item_plugin item_plugins[LAST_ITEM_ID];
60456 +/* defined in fs/reiser4/plugin/node/node.c */
60457 +extern node_plugin node_plugins[LAST_NODE_ID];
60458 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
60459 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
60460 +
60461 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
60462 +#endif
60463 +
60464 +/* Make Linus happy.
60465 + Local variables:
60466 + c-indentation-style: "K&R"
60467 + mode-name: "LC"
60468 + c-basic-offset: 8
60469 + tab-width: 8
60470 + fill-column: 120
60471 + End:
60472 +*/
60473 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.22/fs/reiser4/plugin/plugin_header.h
60474 --- linux-2.6.22.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
60475 +++ linux-2.6.22/fs/reiser4/plugin/plugin_header.h 2007-07-29 00:25:34.996727537 +0400
60476 @@ -0,0 +1,155 @@
60477 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60478 +
60479 +/* plugin header. Data structures required by all plugin types. */
60480 +
60481 +#if !defined( __PLUGIN_HEADER_H__ )
60482 +#define __PLUGIN_HEADER_H__
60483 +
60484 +/* plugin data-types and constants */
60485 +
60486 +#include "../debug.h"
60487 +#include "../dformat.h"
60488 +
60489 +/* Every plugin type can be considered as a class of virtual objects
60490 +   {(type, i) | i = 0, 1, ...}, which has one of the following categories
60491 + of virtualization:
60492 + A - no virtualization;
60493 + F - per-file virtualization;
60494 + S - per-superblock virtualization;
60495 + FIXME-EDWARD: Define every such category */
60496 +
60497 +/* Supported plugin types: (id, (virtualization category), short description) */
60498 +typedef enum {
60499 +	REISER4_FILE_PLUGIN_TYPE,             /* (F) service VFS entry-points */
60500 +	REISER4_DIR_PLUGIN_TYPE,              /* (F) service VFS entry-points */
60501 + REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */
60502 + REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */
60503 + REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */
60504 + REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */
60505 + REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */
60506 + REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
60507 + REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */
60508 + REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */
60509 + REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */
60510 + REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */
60511 + REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */
60512 + REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */
60513 + REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
60514 + REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */
60515 + REISER4_PLUGIN_TYPES
60516 +} reiser4_plugin_type;
60517 +
60518 +/* Supported plugin groups */
60519 +typedef enum {
60520 + REISER4_DIRECTORY_FILE,
60521 + REISER4_REGULAR_FILE,
60522 + REISER4_SYMLINK_FILE,
60523 + REISER4_SPECIAL_FILE,
60524 +} file_plugin_group;
60525 +
60526 +struct reiser4_plugin_ops;
60527 +/* generic plugin operations, supported by each
60528 + plugin type. */
60529 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
60530 +
60531 +/* the common part of all plugin instances. */
60532 +typedef struct plugin_header {
60533 + /* plugin type */
60534 + reiser4_plugin_type type_id;
60535 + /* id of this plugin */
60536 + reiser4_plugin_id id;
60537 + /* bitmask of groups the plugin belongs to. */
60538 + reiser4_plugin_groups groups;
60539 + /* plugin operations */
60540 + reiser4_plugin_ops *pops;
60541 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
60542 + /* short label of this plugin */
60543 + const char *label;
60544 + /* descriptive string.. */
60545 + const char *desc;
60546 + /* list linkage */
60547 + struct list_head linkage;
60548 +} plugin_header;
60549 +
60550 +#define plugin_of_group(plug, group) ((plug)->h.groups & (1 << (group)))
60551 +
60552 +/* PRIVATE INTERFACES */
60553 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
60554 +/* plugin type representation. */
60555 +struct reiser4_plugin_type_data {
60556 + /* internal plugin type identifier. Should coincide with
60557 + index of this item in plugins[] array. */
60558 + reiser4_plugin_type type_id;
60559 + /* short symbolic label of this plugin type. Should be no longer
60560 + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
60561 + const char *label;
60562 + /* plugin type description longer than .label */
60563 + const char *desc;
60564 +
60565 +/* NIKITA-FIXME-HANS: define built-in */
60566 + /* number of built-in plugin instances of this type */
60567 + int builtin_num;
60568 + /* array of built-in plugins */
60569 + void *builtin;
60570 + struct list_head plugins_list;
60571 + size_t size;
60572 +};
60573 +
60574 +extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
60575 +
60576 +int is_plugin_type_valid(reiser4_plugin_type type);
60577 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
60578 +
60579 +static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype,
60580 + int i)
60581 +{
60582 + char *builtin;
60583 +
60584 + builtin = ptype->builtin;
60585 + return (reiser4_plugin *) (builtin + i * ptype->size);
60586 +}
60587 +
60588 +/* return plugin by its @type_id and @id */
60589 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
60590 + reiser4_plugin_id id)
60591 +{
60592 + assert("nikita-1651", is_plugin_type_valid(type));
60593 + assert("nikita-1652", is_plugin_id_valid(type, id));
60594 + return plugin_at(&plugins[type], id);
60595 +}
60596 +
60597 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
60598 + reiser4_plugin_id id);
60599 +
60600 +/**
60601 + * plugin_by_disk_id - get reiser4_plugin
60602 + * @type_id: plugin type id
60603 + * @plugin_id: plugin id in disk format
60604 + *
60605 + * Returns reiser4_plugin by plugin type id and on-disk plugin id.
60606 + */
60607 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
60608 + reiser4_plugin_type type_id,
60609 + __le16 *plugin_id)
60610 +{
60611 + /*
60612 + * what we should do properly is to maintain within each file-system a
60613 + * dictionary that maps on-disk plugin ids to "universal" ids. This
60614 + * dictionary will be resolved on mount time, so that this function
60615 + * will perform just one additional array lookup.
60616 + */
60617 + return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
60618 +}
60619 +
60620 +/* __PLUGIN_HEADER_H__ */
60621 +#endif
60622 +
60623 +/*
60624 + * Local variables:
60625 + * c-indentation-style: "K&R"
60626 + * mode-name: "LC"
60627 + * c-basic-offset: 8
60628 + * tab-width: 8
60629 + * fill-column: 79
60630 + * End:
60631 + */
60632 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.22/fs/reiser4/plugin/plugin_set.c
60633 --- linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
60634 +++ linux-2.6.22/fs/reiser4/plugin/plugin_set.c 2007-07-29 00:25:35.000728572 +0400
60635 @@ -0,0 +1,379 @@
60636 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60637 + * reiser4/README */
60638 +/* This file contains Reiser4 plugin set operations */
60639 +
60640 +/* plugin sets
60641 + *
60642 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
60643 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
60644 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
60645 + * set of plugins (so called pset) is described by structure plugin_set (see
60646 + * plugin/plugin_set.h), which contains pointers to all required plugins.
60647 + *
60648 + * Children can inherit some pset members from their parent, however sometimes
60649 + * it is useful to specify members different from parent ones. Since object's
60650 + * pset can not be easily changed without fatal consequences, we use for this
60651 + * purpose another special plugin table (so called hset, or heir set) described
60652 + * by the same structure.
60653 + *
60654 + * An inode only stores pointers to its pset and hset. Different inodes with
60655 + * the same set of pset (hset) members point to the same pset (hset). This is
60656 + * achieved by storing psets and hsets in a global hash table. Races are
60657 + * avoided by the simple (and so far efficient) solution of never recycling
60658 + * psets, even when the last inode pointing to one is destroyed.
60659 + */
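A hedged illustration of that sharing guarantee, using set_plugin() defined later in this file (the demo function and its assert label are invented; @plug is assumed to be a hash plugin):

static void demo_pset_sharing(reiser4_plugin *plug)
{
	plugin_set *a = plugin_set_get_empty();
	plugin_set *b = plugin_set_get_empty();

	set_plugin(&a, PSET_HASH, plug);
	set_plugin(&b, PSET_HASH, plug);
	/* member-by-member identical psets hash to the same table entry,
	   so the second call finds and reuses the set built by the first */
	assert("demo-pset-shared", a == b);
}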
60660 +
60661 +#include "../debug.h"
60662 +#include "../super.h"
60663 +#include "plugin_set.h"
60664 +
60665 +#include <linux/slab.h>
60666 +#include <linux/stddef.h>
60667 +
60668 +/* slab for plugin sets */
60669 +static struct kmem_cache *plugin_set_slab;
60670 +
60671 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
60672 + [0 ... 7] = SPIN_LOCK_UNLOCKED
60673 +};
60674 +
60675 +/* hash table support */
60676 +
60677 +#define PS_TABLE_SIZE (32)
60678 +
60679 +static inline plugin_set *cast_to(const unsigned long *a)
60680 +{
60681 + return container_of(a, plugin_set, hashval);
60682 +}
60683 +
60684 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
60685 +{
60686 + plugin_set *set1;
60687 + plugin_set *set2;
60688 +
60689 + /* make sure fields are not missed in the code below */
60690 + cassert(sizeof *set1 ==
60691 + sizeof set1->hashval +
60692 + sizeof set1->link +
60693 + sizeof set1->file +
60694 + sizeof set1->dir +
60695 + sizeof set1->perm +
60696 + sizeof set1->formatting +
60697 + sizeof set1->hash +
60698 + sizeof set1->fibration +
60699 + sizeof set1->sd +
60700 + sizeof set1->dir_item +
60701 + sizeof set1->cipher +
60702 + sizeof set1->digest +
60703 + sizeof set1->compression +
60704 + sizeof set1->compression_mode +
60705 + sizeof set1->cluster +
60706 + sizeof set1->create);
60707 +
60708 + set1 = cast_to(a1);
60709 + set2 = cast_to(a2);
60710 + return
60711 + set1->hashval == set2->hashval &&
60712 + set1->file == set2->file &&
60713 + set1->dir == set2->dir &&
60714 + set1->perm == set2->perm &&
60715 + set1->formatting == set2->formatting &&
60716 + set1->hash == set2->hash &&
60717 + set1->fibration == set2->fibration &&
60718 + set1->sd == set2->sd &&
60719 + set1->dir_item == set2->dir_item &&
60720 + set1->cipher == set2->cipher &&
60721 + set1->digest == set2->digest &&
60722 + set1->compression == set2->compression &&
60723 + set1->compression_mode == set2->compression_mode &&
60724 + set1->cluster == set2->cluster &&
60725 + set1->create == set2->create;
60726 +}
60727 +
60728 +#define HASH_FIELD(hash, set, field) \
60729 +({ \
60730 + (hash) += (unsigned long)(set)->field >> 2; \
60731 +})
60732 +
60733 +static inline unsigned long calculate_hash(const plugin_set * set)
60734 +{
60735 + unsigned long result;
60736 +
60737 + result = 0;
60738 + HASH_FIELD(result, set, file);
60739 + HASH_FIELD(result, set, dir);
60740 + HASH_FIELD(result, set, perm);
60741 + HASH_FIELD(result, set, formatting);
60742 + HASH_FIELD(result, set, hash);
60743 + HASH_FIELD(result, set, fibration);
60744 + HASH_FIELD(result, set, sd);
60745 + HASH_FIELD(result, set, dir_item);
60746 + HASH_FIELD(result, set, cipher);
60747 + HASH_FIELD(result, set, digest);
60748 + HASH_FIELD(result, set, compression);
60749 + HASH_FIELD(result, set, compression_mode);
60750 + HASH_FIELD(result, set, cluster);
60751 + HASH_FIELD(result, set, create);
60752 + return result & (PS_TABLE_SIZE - 1);
60753 +}
60754 +
60755 +static inline unsigned long
60756 +pshash(ps_hash_table * table, const unsigned long *a)
60757 +{
60758 + return *a;
60759 +}
60760 +
60761 +/* The hash table definition */
60762 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
60763 +#define KFREE(ptr, size) kfree(ptr)
60764 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
60765 + pseq);
60766 +#undef KFREE
60767 +#undef KMALLOC
60768 +
60769 +static ps_hash_table ps_table;
60770 +static plugin_set empty_set = {
60771 + .hashval = 0,
60772 + .file = NULL,
60773 + .dir = NULL,
60774 + .perm = NULL,
60775 + .formatting = NULL,
60776 + .hash = NULL,
60777 + .fibration = NULL,
60778 + .sd = NULL,
60779 + .dir_item = NULL,
60780 + .cipher = NULL,
60781 + .digest = NULL,
60782 + .compression = NULL,
60783 + .compression_mode = NULL,
60784 + .cluster = NULL,
60785 + .create = NULL,
60786 + .link = {NULL}
60787 +};
60788 +
60789 +plugin_set *plugin_set_get_empty(void)
60790 +{
60791 + return &empty_set;
60792 +}
60793 +
60794 +void plugin_set_put(plugin_set * set)
60795 +{
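	/* nothing to do: psets are shared and intentionally never recycled,
	 * see the comment at the top of this file */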
60796 +}
60797 +
60798 +static inline unsigned long *pset_field(plugin_set * set, int offset)
60799 +{
60800 + return (unsigned long *)(((char *)set) + offset);
60801 +}
60802 +
60803 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
60804 + const int offset)
60805 +{
60806 + unsigned long *spot;
60807 + spinlock_t *lock;
60808 + plugin_set replica;
60809 + plugin_set *twin;
60810 + plugin_set *psal;
60811 + plugin_set *orig;
60812 +
60813 + assert("nikita-2902", set != NULL);
60814 + assert("nikita-2904", *set != NULL);
60815 +
60816 + spot = pset_field(*set, offset);
60817 + if (unlikely(*spot == val))
60818 + return 0;
60819 +
60820 + replica = *(orig = *set);
60821 + *pset_field(&replica, offset) = val;
60822 + replica.hashval = calculate_hash(&replica);
60823 + rcu_read_lock();
60824 + twin = ps_hash_find(&ps_table, &replica.hashval);
60825 + if (unlikely(twin == NULL)) {
60826 + rcu_read_unlock();
60827 + psal = kmem_cache_alloc(plugin_set_slab,
60828 + reiser4_ctx_gfp_mask_get());
60829 + if (psal == NULL)
60830 + return RETERR(-ENOMEM);
60831 + *psal = replica;
60832 + lock = &plugin_set_lock[replica.hashval & 7];
60833 + spin_lock(lock);
60834 + twin = ps_hash_find(&ps_table, &replica.hashval);
60835 + if (likely(twin == NULL)) {
60836 + *set = psal;
60837 + ps_hash_insert_rcu(&ps_table, psal);
60838 + } else {
60839 + *set = twin;
60840 + kmem_cache_free(plugin_set_slab, psal);
60841 + }
60842 + spin_unlock(lock);
60843 + } else {
60844 + rcu_read_unlock();
60845 + *set = twin;
60846 + }
60847 + return 0;
60848 +}
60849 +
60850 +static struct {
60851 + int offset;
60852 + reiser4_plugin_groups groups;
60853 + reiser4_plugin_type type;
60854 +} pset_descr[PSET_LAST] = {
60855 + [PSET_FILE] = {
60856 + .offset = offsetof(plugin_set, file),
60857 + .type = REISER4_FILE_PLUGIN_TYPE,
60858 + .groups = 0
60859 + },
60860 + [PSET_DIR] = {
60861 + .offset = offsetof(plugin_set, dir),
60862 + .type = REISER4_DIR_PLUGIN_TYPE,
60863 + .groups = 0
60864 + },
60865 + [PSET_PERM] = {
60866 + .offset = offsetof(plugin_set, perm),
60867 + .type = REISER4_PERM_PLUGIN_TYPE,
60868 + .groups = 0
60869 + },
60870 + [PSET_FORMATTING] = {
60871 + .offset = offsetof(plugin_set, formatting),
60872 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
60873 + .groups = 0
60874 + },
60875 + [PSET_HASH] = {
60876 + .offset = offsetof(plugin_set, hash),
60877 + .type = REISER4_HASH_PLUGIN_TYPE,
60878 + .groups = 0
60879 + },
60880 + [PSET_FIBRATION] = {
60881 + .offset = offsetof(plugin_set, fibration),
60882 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
60883 + .groups = 0
60884 + },
60885 + [PSET_SD] = {
60886 + .offset = offsetof(plugin_set, sd),
60887 + .type = REISER4_ITEM_PLUGIN_TYPE,
60888 + .groups = (1 << STAT_DATA_ITEM_TYPE)
60889 + },
60890 + [PSET_DIR_ITEM] = {
60891 + .offset = offsetof(plugin_set, dir_item),
60892 + .type = REISER4_ITEM_PLUGIN_TYPE,
60893 + .groups = (1 << DIR_ENTRY_ITEM_TYPE)
60894 + },
60895 + [PSET_CIPHER] = {
60896 + .offset = offsetof(plugin_set, cipher),
60897 + .type = REISER4_CIPHER_PLUGIN_TYPE,
60898 + .groups = 0
60899 + },
60900 + [PSET_DIGEST] = {
60901 + .offset = offsetof(plugin_set, digest),
60902 + .type = REISER4_DIGEST_PLUGIN_TYPE,
60903 + .groups = 0
60904 + },
60905 + [PSET_COMPRESSION] = {
60906 + .offset = offsetof(plugin_set, compression),
60907 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
60908 + .groups = 0
60909 + },
60910 + [PSET_COMPRESSION_MODE] = {
60911 + .offset = offsetof(plugin_set, compression_mode),
60912 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60913 + .groups = 0
60914 + },
60915 + [PSET_CLUSTER] = {
60916 + .offset = offsetof(plugin_set, cluster),
60917 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
60918 + .groups = 0
60919 + },
60920 + [PSET_CREATE] = {
60921 + .offset = offsetof(plugin_set, create),
60922 + .type = REISER4_FILE_PLUGIN_TYPE,
60923 + .groups = (1 << REISER4_REGULAR_FILE)
60924 + }
60925 +};
60926 +
60927 +#define DEFINE_PSET_OPS(PREFIX) \
60928 + reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
60929 +{ \
60930 +	if (memb >= PSET_LAST) \
60931 + return REISER4_PLUGIN_TYPES; \
60932 + return pset_descr[memb].type; \
60933 +} \
60934 + \
60935 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
60936 + reiser4_plugin * plugin) \
60937 +{ \
60938 + assert("nikita-3492", set != NULL); \
60939 + assert("nikita-3493", *set != NULL); \
60940 + assert("nikita-3494", plugin != NULL); \
60941 + assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
60942 + assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
60943 + \
60944 + if (pset_descr[memb].groups) \
60945 + if (!(pset_descr[memb].groups & plugin->h.groups)) \
60946 + return -EINVAL; \
60947 + \
60948 + return plugin_set_field(set, \
60949 + (unsigned long)plugin, pset_descr[memb].offset); \
60950 +} \
60951 + \
60952 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
60953 +{ \
60954 + assert("nikita-3497", set != NULL); \
60955 + assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
60956 + \
60957 + return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
60958 +}
60959 +
60960 +DEFINE_PSET_OPS(aset);
60961 +
60962 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
60963 + return plugin_set_field(set,
60964 + (unsigned long)plugin, pset_descr[memb].offset);
60965 +}
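A typical caller, sketched as a hypothetical helper (plugin_by_id() comes from plugin_header.h; PSET_HASH and R5_HASH_ID come from the headers above):

/* record the r5 hash plugin in an inode's pset; *pset is updated in
   place to point at the resulting, possibly shared, plugin_set */
static int demo_set_hash(plugin_set **pset)
{
	reiser4_plugin *plug = plugin_by_id(REISER4_HASH_PLUGIN_TYPE,
					    R5_HASH_ID);

	return set_plugin(pset, PSET_HASH, plug);
}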
60966 +
60967 +/**
60968 + * init_plugin_set - create plugin set cache and hash table
60969 + *
60970 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
60971 + * reiser4 module initialization.
60972 + */
60973 +int init_plugin_set(void)
60974 +{
60975 + int result;
60976 +
60977 + result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
60978 + if (result == 0) {
60979 + plugin_set_slab = kmem_cache_create("plugin_set",
60980 + sizeof(plugin_set), 0,
60981 + SLAB_HWCACHE_ALIGN,
60982 + NULL, NULL);
60983 + if (plugin_set_slab == NULL)
60984 + result = RETERR(-ENOMEM);
60985 + }
60986 + return result;
60987 +}
60988 +
60989 +/**
60990 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
60991 + *
60992 + * This is called on reiser4 module unloading or system shutdown.
60993 + */
60994 +void done_plugin_set(void)
60995 +{
60996 + plugin_set *cur, *next;
60997 +
60998 + for_all_in_htable(&ps_table, ps, cur, next) {
60999 + ps_hash_remove(&ps_table, cur);
61000 + kmem_cache_free(plugin_set_slab, cur);
61001 + }
61002 + destroy_reiser4_cache(&plugin_set_slab);
61003 + ps_hash_done(&ps_table);
61004 +}
61005 +
61006 +/*
61007 + * Local variables:
61008 + * c-indentation-style: "K&R"
61009 + * mode-name: "LC"
61010 + * c-basic-offset: 8
61011 + * tab-width: 8
61012 + * fill-column: 120
61013 + * End:
61014 + */
61015 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.22/fs/reiser4/plugin/plugin_set.h
61016 --- linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
61017 +++ linux-2.6.22/fs/reiser4/plugin/plugin_set.h 2007-07-29 00:25:35.000728572 +0400
61018 @@ -0,0 +1,77 @@
61019 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61020 +
61021 +/* Reiser4 plugin set definition.
61022 + See fs/reiser4/plugin/plugin_set.c for details */
61023 +
61024 +#if !defined( __PLUGIN_SET_H__ )
61025 +#define __PLUGIN_SET_H__
61026 +
61027 +#include "../type_safe_hash.h"
61028 +#include "plugin.h"
61029 +
61030 +#include <linux/rcupdate.h>
61031 +
61032 +struct plugin_set;
61033 +typedef struct plugin_set plugin_set;
61034 +
61035 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61036 +
61037 +struct plugin_set {
61038 + unsigned long hashval;
61039 + /* plugin of file */
61040 + file_plugin *file;
61041 + /* plugin of dir */
61042 + dir_plugin *dir;
61043 + /* perm plugin for this file */
61044 + perm_plugin *perm;
61045 + /* tail policy plugin. Only meaningful for regular files */
61046 + formatting_plugin *formatting;
61047 + /* hash plugin. Only meaningful for directories. */
61048 + hash_plugin *hash;
61049 + /* fibration plugin. Only meaningful for directories. */
61050 + fibration_plugin *fibration;
61051 + /* plugin of stat-data */
61052 + item_plugin *sd;
61053 + /* plugin of items a directory is built of */
61054 + item_plugin *dir_item;
61055 + /* cipher plugin */
61056 + cipher_plugin *cipher;
61057 + /* digest plugin */
61058 + digest_plugin *digest;
61059 + /* compression plugin */
61060 + compression_plugin *compression;
61061 + /* compression mode plugin */
61062 + compression_mode_plugin *compression_mode;
61063 + /* cluster plugin */
61064 + cluster_plugin *cluster;
61065 + /* this specifies file plugin of regular children.
61066 + only meaningful for directories */
61067 + file_plugin *create;
61068 + ps_hash_link link;
61069 +};
61070 +
61071 +extern plugin_set *plugin_set_get_empty(void);
61072 +extern void plugin_set_put(plugin_set * set);
61073 +
61074 +extern int init_plugin_set(void);
61075 +extern void done_plugin_set(void);
61076 +
61077 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
61078 +extern int set_plugin(plugin_set ** set, pset_member memb,
61079 + reiser4_plugin * plugin);
61080 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
61081 + reiser4_plugin * plugin);
61082 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
61083 +
61084 +/* __PLUGIN_SET_H__ */
61085 +#endif
61086 +
61087 +/* Make Linus happy.
61088 + Local variables:
61089 + c-indentation-style: "K&R"
61090 + mode-name: "LC"
61091 + c-basic-offset: 8
61092 + tab-width: 8
61093 + fill-column: 120
61094 + End:
61095 +*/
61096 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/security/Makefile linux-2.6.22/fs/reiser4/plugin/security/Makefile
61097 --- linux-2.6.22.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
61098 +++ linux-2.6.22/fs/reiser4/plugin/security/Makefile 2007-07-29 00:25:35.000728572 +0400
61099 @@ -0,0 +1,4 @@
61100 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
61101 +
61102 +security_plugins-objs := \
61103 + perm.o
61104 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/security/perm.c linux-2.6.22/fs/reiser4/plugin/security/perm.c
61105 --- linux-2.6.22.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
61106 +++ linux-2.6.22/fs/reiser4/plugin/security/perm.c 2007-07-29 00:25:35.000728572 +0400
61107 @@ -0,0 +1,33 @@
61108 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61109 +
61110 +/*
61111 + * This file contains implementation of permission plugins.
61112 + * See the comments in perm.h
61113 + */
61114 +
61115 +#include "../plugin.h"
61116 +#include "../plugin_header.h"
61117 +#include "../../debug.h"
61118 +
61119 +perm_plugin perm_plugins[LAST_PERM_ID] = {
61120 + [NULL_PERM_ID] = {
61121 + .h = {
61122 + .type_id = REISER4_PERM_PLUGIN_TYPE,
61123 + .id = NULL_PERM_ID,
61124 + .pops = NULL,
61125 + .label = "null",
61126 + .desc = "stub permission plugin",
61127 + .linkage = {NULL, NULL}
61128 + }
61129 + }
61130 +};
61131 +
61132 +/*
61133 + * Local variables:
61134 + * c-indentation-style: "K&R"
61135 + * mode-name: "LC"
61136 + * c-basic-offset: 8
61137 + * tab-width: 8
61138 + * fill-column: 79
61139 + * End:
61140 + */
61141 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/security/perm.h linux-2.6.22/fs/reiser4/plugin/security/perm.h
61142 --- linux-2.6.22.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
61143 +++ linux-2.6.22/fs/reiser4/plugin/security/perm.h 2007-07-29 00:25:35.000728572 +0400
61144 @@ -0,0 +1,38 @@
61145 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61146 +
61147 +/* Perm (short for "permissions") plugins common stuff. */
61148 +
61149 +#if !defined( __REISER4_PERM_H__ )
61150 +#define __REISER4_PERM_H__
61151 +
61152 +#include "../../forward.h"
61153 +#include "../plugin_header.h"
61154 +
61155 +#include <linux/types.h>
61156 +
61157 +/* Definition of permission plugin */
61158 +/* NIKITA-FIXME-HANS: define what this is targeted for.
61159 + It does not seem to be intended for use with sys_reiser4. Explain. */
61160 +
61161 +/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
61162 +   Consider it a temporary "seam" and a reserved pset member.
61163 +   If you have something useful to add, then rename this plugin and add it here */
61164 +typedef struct perm_plugin {
61165 + /* generic plugin fields */
61166 + plugin_header h;
61167 +} perm_plugin;
61168 +
61169 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61170 +
61171 +/* __REISER4_PERM_H__ */
61172 +#endif
61173 +
61174 +/* Make Linus happy.
61175 + Local variables:
61176 + c-indentation-style: "K&R"
61177 + mode-name: "LC"
61178 + c-basic-offset: 8
61179 + tab-width: 8
61180 + fill-column: 120
61181 + End:
61182 +*/
61183 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.22/fs/reiser4/plugin/space/bitmap.c
61184 --- linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
61185 +++ linux-2.6.22/fs/reiser4/plugin/space/bitmap.c 2007-07-29 00:25:35.004729608 +0400
61186 @@ -0,0 +1,1585 @@
61187 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61188 +
61189 +#include "../../debug.h"
61190 +#include "../../dformat.h"
61191 +#include "../../txnmgr.h"
61192 +#include "../../jnode.h"
61193 +#include "../../block_alloc.h"
61194 +#include "../../tree.h"
61195 +#include "../../super.h"
61196 +#include "../plugin.h"
61197 +#include "space_allocator.h"
61198 +#include "bitmap.h"
61199 +
61200 +#include <linux/types.h>
61201 +#include <linux/fs.h> /* for struct super_block */
61202 +#include <linux/mutex.h>
61203 +#include <asm/div64.h>
61204 +
61205 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61206 + * blocks
61207 +
61208 +   A useful optimization of reiser4 bitmap handling would be dynamic loading
61209 +   and unloading of bitmap blocks, in contrast to v3.x where all bitmap
61210 +   blocks are loaded at mount time.
61211 +
61212 +   To implement bitmap block unloading we need to count bitmap block usage
61213 +   and detect currently unused blocks, allowing them to be unloaded. It is not
61214 + a simple task since we allow several threads to modify one bitmap block
61215 + simultaneously.
61216 +
61217 +   Briefly speaking, the following scheme is proposed: we keep a counter in a
61218 +   special variable associated with each bitmap block, counting block
61219 +   alloc/dealloc operations on that bitmap block. With the deferred block
61220 +   deallocation feature of reiser4, all those operations will be represented in
61221 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
61222 +   nodes.
61223 +
61224 +   So, we increment the usage counter for each new node allocated or deleted,
61225 +   and decrement it at atom commit, once for each node on the atom's
61226 +   dirty/deleted lists. Of course, deletion of a freshly allocated node and
61227 +   node reuse from the atom's deleted list (if we do so) should decrement the
61228 +   bitmap usage counter as well.
61229 +
61230 +   This scheme seems workable, but such reference counting is
61231 +   not easy to debug. I think we should agree with Hans and not implement
61232 +   it in v4.0. Current code implements "on-demand" bitmap block loading only.
61233 +
61234 +   For simplicity, all bitmap nodes (both commit and working bitmap blocks) are
61235 +   loaded into memory at fs mount time, or each bitmap node is loaded at the
61236 +   first access to it; the "dont_load_bitmap" mount option controls whether
61237 +   bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
61238 +   nodes is currently not supported. */
61239 +
61240 +#define CHECKSUM_SIZE 4
61241 +
61242 +#define BYTES_PER_LONG (sizeof(long))
61243 +
61244 +#if BITS_PER_LONG == 64
61245 +# define LONG_INT_SHIFT (6)
61246 +#else
61247 +# define LONG_INT_SHIFT (5)
61248 +#endif
61249 +
61250 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61251 +
61252 +typedef unsigned long ulong_t;
61253 +
61254 +#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
61255 +#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
61256 +
61257 +/* Block allocation/deallocation are done through special bitmap objects which
61258 + are allocated in an array at fs mount. */
61259 +struct bitmap_node {
61260 + struct mutex mutex; /* long term lock object */
61261 +
61262 + jnode *wjnode; /* j-nodes for WORKING ... */
61263 + jnode *cjnode; /* ... and COMMIT bitmap blocks */
61264 +
61265 + bmap_off_t first_zero_bit; /* for skip_busy option implementation */
61266 +
61267 + atomic_t loaded; /* a flag which shows that bnode is loaded
61268 + * already */
61269 +};
61270 +
61271 +static inline char *bnode_working_data(struct bitmap_node *bnode)
61272 +{
61273 + char *data;
61274 +
61275 + data = jdata(bnode->wjnode);
61276 + assert("zam-429", data != NULL);
61277 +
61278 + return data + CHECKSUM_SIZE;
61279 +}
61280 +
61281 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61282 +{
61283 + char *data;
61284 +
61285 + data = jdata(bnode->cjnode);
61286 + assert("zam-430", data != NULL);
61287 +
61288 + return data + CHECKSUM_SIZE;
61289 +}
61290 +
61291 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61292 +{
61293 + char *data;
61294 +
61295 + data = jdata(bnode->cjnode);
61296 + assert("vpf-261", data != NULL);
61297 +
61298 + return le32_to_cpu(get_unaligned((d32 *)data));
61299 +}
61300 +
61301 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
61302 +{
61303 + char *data;
61304 +
61305 + data = jdata(bnode->cjnode);
61306 + assert("vpf-261", data != NULL);
61307 +
61308 + put_unaligned(cpu_to_le32(crc), (d32 *)data);
61309 +}
61310 +
61311 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
61312 + * written the code, does this added abstraction still have */
61313 +/* ANSWER(Zam): No, the abstraction is at the level above (the exact place is
61314 + * the reiser4_space_allocator structure) */
61315 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
61316 +/* FIXME-HANS(Zam): I don't understand questions like "might be a union
61317 + * someday?". What are they about? If there is a reason to have a union, it
61318 + * should be a union; if not, it should not be a union. "..might be someday"
61319 + * means no reason. */
61320 +struct bitmap_allocator_data {
61321 + /* an array for bitmap blocks direct access */
61322 + struct bitmap_node *bitmap;
61323 +};
61324 +
61325 +#define get_barray(super) \
61326 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
61327 +
61328 +#define get_bnode(super, i) (get_barray(super) + i)
61329 +
61330 +/* allocate and initialize jnode with JNODE_BITMAP type */
61331 +static jnode *bnew(void)
61332 +{
61333 + jnode *jal = jalloc();
61334 +
61335 + if (jal)
61336 + jnode_init(jal, current_tree, JNODE_BITMAP);
61337 +
61338 + return jal;
61339 +}
61340 +
61341 +/* this file contains:
61342 + - bitmap based implementation of space allocation plugin
61343 + - all the helper functions like set bit, find_first_zero_bit, etc */
61344 +
61345 +/* Audited by: green(2002.06.12) */
61346 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
61347 +{
61348 + ulong_t mask = 1UL << start_bit;
61349 + int i = start_bit;
61350 +
61351 + while ((word & mask) != 0) {
61352 + mask <<= 1;
61353 + if (++i >= BITS_PER_LONG)
61354 + break;
61355 + }
61356 +
61357 + return i;
61358 +}
61359 +
61360 +#include <asm/bitops.h>
61361 +
61362 +#if BITS_PER_LONG == 64
61363 +
61364 +#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
61365 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
61366 +
61367 +static inline void reiser4_set_bit(int nr, void *addr)
61368 +{
61369 + ext2_set_bit(nr + OFF(addr), BASE(addr));
61370 +}
61371 +
61372 +static inline void reiser4_clear_bit(int nr, void *addr)
61373 +{
61374 + ext2_clear_bit(nr + OFF(addr), BASE(addr));
61375 +}
61376 +
61377 +static inline int reiser4_test_bit(int nr, void *addr)
61378 +{
61379 + return ext2_test_bit(nr + OFF(addr), BASE(addr));
61380 +}
61381 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
61382 + int offset)
61383 +{
61384 + int off = OFF(addr);
61385 +
61386 + return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
61387 + offset + off) - off;
61388 +}
61389 +
61390 +#else
61391 +
61392 +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
61393 +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
61394 +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
61395 +
61396 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
61397 +ext2_find_next_zero_bit(addr, maxoffset, offset)
61398 +#endif
61399 +
61400 +/* Search for a set bit in the bit range [@start_offset, @max_offset), where
61401 + * offsets are counted from @addr; return the offset of the first set bit if
61402 + * one is found, @max_offset otherwise. */
61403 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61404 + bmap_off_t start_offset)
61405 +{
61406 + ulong_t *base = addr;
61407 +	/* start_offset is in bits; convert it to a word index within the bitmap. */
61408 +	int word_nr = start_offset >> LONG_INT_SHIFT;
61409 +	/* bit number within the word. */
61410 + int bit_nr = start_offset & LONG_INT_MASK;
61411 + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
61412 +
61413 + assert("zam-387", max_offset != 0);
61414 +
61415 + /* Unaligned @start_offset case. */
61416 + if (bit_nr != 0) {
61417 + bmap_nr_t nr;
61418 +
61419 + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
61420 +
61421 + if (nr < BITS_PER_LONG)
61422 + return (word_nr << LONG_INT_SHIFT) + nr;
61423 +
61424 + ++word_nr;
61425 + }
61426 +
61427 +	/* Fast scan through aligned words. */
61428 + while (word_nr <= max_word_nr) {
61429 + if (base[word_nr] != 0) {
61430 + return (word_nr << LONG_INT_SHIFT)
61431 + + find_next_zero_bit_in_word(~(base[word_nr]), 0);
61432 + }
61433 +
61434 + ++word_nr;
61435 + }
61436 +
61437 + return max_offset;
61438 +}
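Two boundary cases, as a sanity check of the half-open interval (buffer contents assumed for the example):

/* with a word-aligned buffer whose only set bit is bit 70:
 *   __reiser4_find_next_set_bit(buf, 128, 0) == 70   (found)
 *   __reiser4_find_next_set_bit(buf, 64, 0)  == 64   (nothing set in
 *                                  [0, 64), so max_offset is returned) */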
61439 +
61440 +#if BITS_PER_LONG == 64
61441 +
61442 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61443 + bmap_off_t start_offset)
61444 +{
61445 + bmap_off_t off = OFF(addr);
61446 +
61447 + return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
61448 + start_offset + off) - off;
61449 +}
61450 +
61451 +#else
61452 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
61453 + __reiser4_find_next_set_bit(addr, max_offset, start_offset)
61454 +#endif
61455 +
61456 +/* search backward in a single word for the highest set bit at or below @start_bit */
61457 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
61458 +{
61459 + ulong_t bit_mask;
61460 + int nr = start_bit;
61461 +
61462 + assert("zam-965", start_bit < BITS_PER_LONG);
61463 + assert("zam-966", start_bit >= 0);
61464 +
61465 + bit_mask = (1UL << nr);
61466 +
61467 + while (bit_mask != 0) {
61468 + if (bit_mask & word)
61469 + return nr;
61470 + bit_mask >>= 1;
61471 + nr--;
61472 + }
61473 + return BITS_PER_LONG;
61474 +}
61475 +
61476 +/* Search bitmap for a set bit in backward direction from the end to the
61477 + * beginning of given region
61478 + *
61479 + * @result: result offset of the last set bit
61480 + * @addr: base memory address,
61481 + * @low_off: low end of the search region; the edge bit is included in the region,
61482 + * @high_off: high end of the search region; the edge bit is included in the region,
61483 + *
61484 + * @return: 0 - set bit was found, -1 otherwise.
61485 + */
61486 +static int
61487 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61488 + bmap_off_t high_off)
61489 +{
61490 + ulong_t *base = addr;
61491 + int last_word;
61492 + int first_word;
61493 + int last_bit;
61494 + int nr;
61495 +
61496 + assert("zam-962", high_off >= low_off);
61497 +
61498 + last_word = high_off >> LONG_INT_SHIFT;
61499 + last_bit = high_off & LONG_INT_MASK;
61500 + first_word = low_off >> LONG_INT_SHIFT;
61501 +
61502 + if (last_bit < BITS_PER_LONG) {
61503 + nr = find_last_set_bit_in_word(base[last_word], last_bit);
61504 + if (nr < BITS_PER_LONG) {
61505 + *result = (last_word << LONG_INT_SHIFT) + nr;
61506 + return 0;
61507 + }
61508 + --last_word;
61509 + }
61510 + while (last_word >= first_word) {
61511 + if (base[last_word] != 0x0) {
61512 + last_bit =
61513 + find_last_set_bit_in_word(base[last_word],
61514 + BITS_PER_LONG - 1);
61515 + assert("zam-972", last_bit < BITS_PER_LONG);
61516 + *result = (last_word << LONG_INT_SHIFT) + last_bit;
61517 + return 0;
61518 + }
61519 + --last_word;
61520 + }
61521 +
61522 + return -1; /* set bit not found */
61523 +}
61524 +
61525 +/* Search bitmap for a clear bit in backward direction from the end to the
61526 + * beginning of given region */
61527 +static int
61528 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61529 + bmap_off_t high_off)
61530 +{
61531 + ulong_t *base = addr;
61532 + int last_word;
61533 + int first_word;
61534 + int last_bit;
61535 + int nr;
61536 +
61537 + last_word = high_off >> LONG_INT_SHIFT;
61538 + last_bit = high_off & LONG_INT_MASK;
61539 + first_word = low_off >> LONG_INT_SHIFT;
61540 +
61541 + if (last_bit < BITS_PER_LONG) {
61542 + nr = find_last_set_bit_in_word(~base[last_word], last_bit);
61543 + if (nr < BITS_PER_LONG) {
61544 + *result = (last_word << LONG_INT_SHIFT) + nr;
61545 + return 0;
61546 + }
61547 + --last_word;
61548 + }
61549 + while (last_word >= first_word) {
61550 + if (base[last_word] != (ulong_t) (-1)) {
61551 + *result = (last_word << LONG_INT_SHIFT) +
61552 + find_last_set_bit_in_word(~base[last_word],
61553 + BITS_PER_LONG - 1);
61554 + return 0;
61555 + }
61556 + --last_word;
61557 + }
61558 +
61559 + return -1; /* zero bit not found */
61560 +}
61561 +
61562 +/* Audited by: green(2002.06.12) */
61563 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
61564 +{
61565 + int first_byte;
61566 + int last_byte;
61567 +
61568 + unsigned char first_byte_mask = 0xFF;
61569 + unsigned char last_byte_mask = 0xFF;
61570 +
61571 + assert("zam-410", start < end);
61572 +
61573 + first_byte = start >> 3;
61574 + last_byte = (end - 1) >> 3;
61575 +
61576 + if (last_byte > first_byte + 1)
61577 + memset(addr + first_byte + 1, 0,
61578 + (size_t) (last_byte - first_byte - 1));
61579 +
61580 + first_byte_mask >>= 8 - (start & 0x7);
61581 + last_byte_mask <<= ((end - 1) & 0x7) + 1;
61582 +
61583 + if (first_byte == last_byte) {
61584 + addr[first_byte] &= (first_byte_mask | last_byte_mask);
61585 + } else {
61586 + addr[first_byte] &= first_byte_mask;
61587 + addr[last_byte] &= last_byte_mask;
61588 + }
61589 +}
61590 +
61591 +/* Audited by: green(2002.06.12) */
61592 +/* ZAM-FIXME-HANS: comment this */
61593 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
61594 +{
61595 + int first_byte;
61596 + int last_byte;
61597 +
61598 + unsigned char first_byte_mask = 0xFF;
61599 + unsigned char last_byte_mask = 0xFF;
61600 +
61601 + assert("zam-386", start < end);
61602 +
61603 + first_byte = start >> 3;
61604 + last_byte = (end - 1) >> 3;
61605 +
61606 + if (last_byte > first_byte + 1)
61607 + memset(addr + first_byte + 1, 0xFF,
61608 + (size_t) (last_byte - first_byte - 1));
61609 +
61610 + first_byte_mask <<= start & 0x7;
61611 + last_byte_mask >>= 7 - ((end - 1) & 0x7);
61612 +
61613 + if (first_byte == last_byte) {
61614 + addr[first_byte] |= (first_byte_mask & last_byte_mask);
61615 + } else {
61616 + addr[first_byte] |= first_byte_mask;
61617 + addr[last_byte] |= last_byte_mask;
61618 + }
61619 +}
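To make the mask arithmetic concrete, a worked example for reiser4_set_bits (the values can be verified by hand):

/* reiser4_set_bits(addr, 3, 13) sets bits 3..12, since @end is exclusive:
 *   first_byte = 3 >> 3 = 0,  last_byte = (13 - 1) >> 3 = 1
 *   no middle memset: last_byte == first_byte + 1
 *   first_byte_mask = 0xFF << (3 & 7)        = 0xF8   (bits 3..7)
 *   last_byte_mask  = 0xFF >> (7 - (12 & 7)) = 0x1F   (bits 0..4)
 *   so addr[0] |= 0xF8 and addr[1] |= 0x1F */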
61620 +
61621 +#define ADLER_BASE 65521
61622 +#define ADLER_NMAX 5552
61623 +
61624 +/* Calculates the adler32 checksum for the data pointed to by `data` of
61625 +   length `len`. This function was originally taken from zlib, version 1.1.3,
61626 +   July 9th, 1998.
61627 +
61628 + Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
61629 +
61630 + This software is provided 'as-is', without any express or implied
61631 + warranty. In no event will the authors be held liable for any damages
61632 + arising from the use of this software.
61633 +
61634 + Permission is granted to anyone to use this software for any purpose,
61635 + including commercial applications, and to alter it and redistribute it
61636 + freely, subject to the following restrictions:
61637 +
61638 + 1. The origin of this software must not be misrepresented; you must not
61639 + claim that you wrote the original software. If you use this software
61640 + in a product, an acknowledgment in the product documentation would be
61641 + appreciated but is not required.
61642 + 2. Altered source versions must be plainly marked as such, and must not be
61643 + misrepresented as being the original software.
61644 + 3. This notice may not be removed or altered from any source distribution.
61645 +
61646 + Jean-loup Gailly Mark Adler
61647 + jloup@gzip.org madler@alumni.caltech.edu
61648 +
61649 + The above comment applies only to the reiser4_adler32 function.
61650 +*/
61651 +
61652 +__u32 reiser4_adler32(char *data, __u32 len)
61653 +{
61654 + unsigned char *t = data;
61655 + __u32 s1 = 1;
61656 + __u32 s2 = 0;
61657 + int k;
61658 +
61659 + while (len > 0) {
61660 + k = len < ADLER_NMAX ? len : ADLER_NMAX;
61661 + len -= k;
61662 +
61663 + while (k--) {
61664 + s1 += *t++;
61665 + s2 += s1;
61666 + }
61667 +
61668 + s1 %= ADLER_BASE;
61669 + s2 %= ADLER_BASE;
61670 + }
61671 + return (s2 << 16) | s1;
61672 +}
61673 +
61674 +#define sb_by_bnode(bnode) \
61675 + ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
61676 +
61677 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
61678 +{
61679 + return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
61680 +}
61681 +
61682 +static int
61683 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
61684 +{
61685 + if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
61686 + bmap_nr_t bmap;
61687 +
61688 + bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
61689 +
61690 + warning("vpf-263",
61691 + "Checksum for the bitmap block %llu is incorrect",
61692 + bmap);
61693 +
61694 + return RETERR(-EIO);
61695 + }
61696 +
61697 + return 0;
61698 +}
61699 +
61700 +#define REISER4_CHECK_BMAP_CRC (0)
61701 +
61702 +#if REISER4_CHECK_BMAP_CRC
61703 +static int bnode_check_crc(const struct bitmap_node *bnode)
61704 +{
61705 + return bnode_check_adler32(bnode,
61706 + bmap_size(sb_by_bnode(bnode)->s_blocksize));
61707 +}
61708 +
61709 +/* REISER4_CHECK_BMAP_CRC */
61710 +#else
61711 +
61712 +#define bnode_check_crc(bnode) (0)
61713 +
61714 +/* REISER4_CHECK_BMAP_CRC */
61715 +#endif
61716 +
61717 +/* Recalculates the adler32 checksum after a single byte change.
61718 + adler - previous adler checksum
61719 + old_data, data - old and new byte values.
61720 + tail == (chunk - offset), where chunk is the length the checksum was
61721 + calculated over and offset is the offset of the changed byte in it.
61722 + This function can be used to optimise checksum calculation.
61723 +*/
61724 +
61725 +static __u32
61726 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
61727 + __u32 tail)
61728 +{
61729 + __u32 delta = data - old_data + 2 * ADLER_BASE;
61730 + __u32 s1 = adler & 0xffff;
61731 + __u32 s2 = (adler >> 16) & 0xffff;
61732 +
61733 + s1 = (delta + s1) % ADLER_BASE;
61734 + s2 = (delta * tail + s2) % ADLER_BASE;
61735 +
61736 + return (s2 << 16) | s1;
61737 +}
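+/* Worked example, continuing the "abc" case above (adler == 0x024d0127,
+ * i.e. s1 = 295, s2 = 589): replacing 'b' (98) with 'd' (100) at offset 1
+ * of the 3-byte chunk gives delta = 2 and tail = 3 - 1 = 2, hence
+ * s1 = 297 and s2 = 589 + 2 * 2 = 593, i.e. 0x02510129 -- identical to
+ * adler32("adc") computed from scratch. */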
61738 +
61739 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
61740 +
61741 +/**
61742 + * get_nr_bmap - calculate number of bitmap blocks
61743 + * @super: super block with initialized blocksize and block count
61744 + *
61745 + * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
61746 + * to maintain free disk space. It assumes that each bitmap addresses the same
61747 + * number of blocks, which is calculated by the bmap_bit_count macro defined
61748 + * above. The number of blocks in the filesystem has to be initialized in the
61749 + * reiser4 private data of the super block already so that it can be obtained
61750 + * via reiser4_block_count(). Unfortunately, the number of blocks addressed by
61751 + * a bitmap is not a power of 2, because 4 bytes are used for the checksum.
61752 + * Therefore, we have to use a special function to divide and take the modulo
61753 + * of 64-bit filesystem block counters.
61754 + *
61755 + * Example: suppose the filesystem has 32768 blocks and the blocksize is 4096.
61756 + * Each bitmap block addresses (4096 - 4) * 8 = 32736 blocks, so the number of
61757 + * bitmaps needed is ceil(32768 / 32736) = (32768 - 1) / 32736 + 1 = 2.
61758 + */
61759 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
61760 +{
61761 + u64 quotient;
61762 +
61763 + assert("zam-393", reiser4_block_count(super) != 0);
61764 +
61765 + quotient = reiser4_block_count(super) - 1;
61766 + do_div(quotient, bmap_bit_count(super->s_blocksize));
61767 + return quotient + 1;
61768 +}
61769 +
61770 +/**
61771 + * parse_blocknr - calculate bitmap number and offset in it by block number
61772 + * @block: pointer to block number to calculate location in bitmap of
61773 + * @bmap: pointer where to store bitmap block number
61774 + * @offset: pointer where to store offset within bitmap block
61775 + *
61776 + * Calculates location of bit which is responsible for allocation/freeing of
61777 + * block @*block. That location is represented by bitmap block number and offset
61778 + * within that bitmap block.
61779 + */
61780 +static void
61781 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
61782 + bmap_off_t *offset)
61783 +{
61784 + struct super_block *super = get_current_context()->super;
61785 + u64 quotient = *block;
61786 +
61787 + *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
61788 + *bmap = quotient;
61789 +
61790 + assert("zam-433", *bmap < get_nr_bmap(super));
61791 + assert("", *offset < bmap_bit_count(super->s_blocksize));
61792 +}
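+/* Worked example with 4096-byte blocks (32736 bits per bitmap block): for
+ * *block == 40000, do_div() leaves quotient == 1 and returns 7264, so block
+ * 40000 is governed by bit 7264 of bitmap block 1. */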
61793 +
61794 +#if REISER4_DEBUG
61795 +/* Audited by: green(2002.06.12) */
61796 +static void
61797 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
61798 +{
61799 + struct super_block *sb = reiser4_get_current_sb();
61800 +
61801 + assert("zam-436", sb != NULL);
61802 +
61803 + assert("zam-455", start != NULL);
61804 + assert("zam-437", *start != 0);
61805 + assert("zam-541", !reiser4_blocknr_is_fake(start));
61806 + assert("zam-441", *start < reiser4_block_count(sb));
61807 +
61808 + if (len != NULL) {
61809 + assert("zam-438", *len != 0);
61810 + assert("zam-442", *start + *len <= reiser4_block_count(sb));
61811 + }
61812 +}
61813 +
61814 +static void check_bnode_loaded(const struct bitmap_node *bnode)
61815 +{
61816 + assert("zam-485", bnode != NULL);
61817 + assert("zam-483", jnode_page(bnode->wjnode) != NULL);
61818 + assert("zam-484", jnode_page(bnode->cjnode) != NULL);
61819 + assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
61820 + assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
61821 +}
61822 +
61823 +#else
61824 +
61825 +# define check_block_range(start, len) do { /* nothing */} while(0)
61826 +# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
61827 +
61828 +#endif
61829 +
61830 +/* modify bnode->first_zero_bit (if we free bits before it); bnode should be
61831 + spin-locked */
61832 +static inline void
61833 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
61834 +{
61835 + if (offset < bnode->first_zero_bit)
61836 + bnode->first_zero_bit = offset;
61837 +}
61838 +
61839 +/* return a physical disk address for logical bitmap number @bmap */
61840 +/* FIXME-VS: this is somehow related to disk layout? */
61841 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
61842 + * per block allocation so that performance is not affected. Probably this
61843 + * whole file should be considered part of the disk layout plugin, and other
61844 + * disk layouts can use other defines and efficiency will not be significantly
61845 + * affected. */
61846 +
61847 +#define REISER4_FIRST_BITMAP_BLOCK \
61848 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
61849 +
61850 +/* Audited by: green(2002.06.12) */
61851 +static void
61852 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
61853 + reiser4_block_nr * bnr)
61854 +{
61855 +
61856 + assert("zam-390", bmap < get_nr_bmap(super));
61857 +
61858 +#ifdef CONFIG_REISER4_BADBLOCKS
61859 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
61860 + /* Check if the diskmap has this already, first. */
61861 + if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
61862 + return; /* Found it in diskmap */
61863 +#endif
61864 + /* FIXME_ZAM: until disk layouts and disk format plugins have been
61865 + discussed, I implement a bitmap location scheme which is close to
61866 + the scheme used in reiser 3.6 */
61867 + if (bmap == 0) {
61868 + *bnr = REISER4_FIRST_BITMAP_BLOCK;
61869 + } else {
61870 + *bnr = bmap * bmap_bit_count(super->s_blocksize);
61871 + }
61872 +}
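+/* Worked example (assuming the customary 65536-byte master offset and
+ * 4096-byte pages and blocks): bitmap 0 is pinned at block
+ * 65536 / 4096 + 2 = 18, while bitmap N (N > 0) lives at block N * 32736,
+ * the first block of the zone of blocks it governs. */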
61873 +
61874 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
61875 +/* Audited by: green(2002.06.12) */
61876 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
61877 +{
61878 + *bnr =
61879 + (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
61880 + REISER4_BITMAP_BLOCKS_STATUS_VALUE);
61881 +}
61882 +
61883 +/* bnode structure initialization */
61884 +static void
61885 +init_bnode(struct bitmap_node *bnode,
61886 + struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
61887 +{
61888 + memset(bnode, 0, sizeof(struct bitmap_node));
61889 +
61890 + mutex_init(&bnode->mutex);
61891 + atomic_set(&bnode->loaded, 0);
61892 +}
61893 +
61894 +static void release(jnode * node)
61895 +{
61896 + jrelse(node);
61897 + JF_SET(node, JNODE_HEARD_BANSHEE);
61898 + jput(node);
61899 +}
61900 +
61901 +/* This function is for internal bitmap.c use because it assumes that the
61902 + jnode is under the full control of this thread */
61903 +static void done_bnode(struct bitmap_node *bnode)
61904 +{
61905 + if (bnode) {
61906 + atomic_set(&bnode->loaded, 0);
61907 + if (bnode->wjnode != NULL)
61908 + release(bnode->wjnode);
61909 + if (bnode->cjnode != NULL)
61910 + release(bnode->cjnode);
61911 + bnode->wjnode = bnode->cjnode = NULL;
61912 + }
61913 +}
61914 +
61915 +/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
61916 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
61917 + jnode **wjnode_ret)
61918 +{
61919 + struct super_block *super;
61920 + jnode *cjnode;
61921 + jnode *wjnode;
61922 + bmap_nr_t bmap;
61923 + int ret;
61924 +
61925 + super = reiser4_get_current_sb();
61926 +
61927 + *wjnode_ret = wjnode = bnew();
61928 + if (wjnode == NULL) {
61929 + *cjnode_ret = NULL;
61930 + return RETERR(-ENOMEM);
61931 + }
61932 +
61933 + *cjnode_ret = cjnode = bnew();
61934 + if (cjnode == NULL)
61935 + return RETERR(-ENOMEM);
61936 +
61937 + bmap = bnode - get_bnode(super, 0);
61938 +
61939 + get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
61940 + get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
61941 +
61942 + jref(cjnode);
61943 + jref(wjnode);
61944 +
61945 + /* load commit bitmap */
61946 + ret = jload_gfp(cjnode, GFP_NOFS, 1);
61947 +
61948 + if (ret)
61949 + goto error;
61950 +
61951 + /* allocate memory for the working bitmap block. Note that for
61952 + * bitmaps jinit_new() doesn't actually modify the node content,
61953 + * so parallel calls to this are ok. */
61954 + ret = jinit_new(wjnode, GFP_NOFS);
61955 +
61956 + if (ret != 0) {
61957 + jrelse(cjnode);
61958 + goto error;
61959 + }
61960 +
61961 + return 0;
61962 +
61963 + error:
61964 + jput(cjnode);
61965 + jput(wjnode);
61966 + *wjnode_ret = *cjnode_ret = NULL;
61967 + return ret;
61968 +
61969 +}
61970 +
61971 +/* Check the bnode data on read. */
61972 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
61973 +{
61974 + void *data;
61975 + int ret;
61976 +
61977 + /* Check CRC */
61978 + ret = bnode_check_adler32(bnode, blksize);
61979 +
61980 + if (ret) {
61981 + return ret;
61982 + }
61983 +
61984 + data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
61985 +
61986 + /* Check the very first bit -- it must be busy. */
61987 + if (!reiser4_test_bit(0, data)) {
61988 + warning("vpf-1362", "The allocator block %llu is not marked "
61989 + "as used.", (unsigned long long)bnode->cjnode->blocknr);
61990 +
61991 + return -EINVAL;
61992 + }
61993 +
61994 + return 0;
61995 +}
61996 +
61997 +/* load bitmap blocks "on-demand" */
61998 +static int load_and_lock_bnode(struct bitmap_node *bnode)
61999 +{
62000 + int ret;
62001 +
62002 + jnode *cjnode;
62003 + jnode *wjnode;
62004 +
62005 + assert("nikita-3040", reiser4_schedulable());
62006 +
62007 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62008 + * need to be atomic, right? Just leave a comment that if bitmaps were
62009 + * unloadable, this would need to be atomic. */
62010 + if (atomic_read(&bnode->loaded)) {
62011 + /* bitmap is already loaded, nothing to do */
62012 + check_bnode_loaded(bnode);
62013 + mutex_lock(&bnode->mutex);
62014 + assert("nikita-2827", atomic_read(&bnode->loaded));
62015 + return 0;
62016 + }
62017 +
62018 + ret = prepare_bnode(bnode, &cjnode, &wjnode);
62019 + if (ret == 0) {
62020 + mutex_lock(&bnode->mutex);
62021 +
62022 + if (!atomic_read(&bnode->loaded)) {
62023 + assert("nikita-2822", cjnode != NULL);
62024 + assert("nikita-2823", wjnode != NULL);
62025 + assert("nikita-2824", jnode_is_loaded(cjnode));
62026 + assert("nikita-2825", jnode_is_loaded(wjnode));
62027 +
62028 + bnode->wjnode = wjnode;
62029 + bnode->cjnode = cjnode;
62030 +
62031 + ret = check_struct_bnode(bnode, current_blocksize);
62032 + if (!ret) {
62033 + cjnode = wjnode = NULL;
62034 + atomic_set(&bnode->loaded, 1);
62035 + /* working bitmap is initialized by on-disk
62036 + * commit bitmap. This should be performed
62037 + * under mutex. */
62038 + memcpy(bnode_working_data(bnode),
62039 + bnode_commit_data(bnode),
62040 + bmap_size(current_blocksize));
62041 + } else
62042 + mutex_unlock(&bnode->mutex);
62043 + } else
62044 + /* race: someone already loaded bitmap while we were
62045 + * busy initializing data. */
62046 + check_bnode_loaded(bnode);
62047 + }
62048 +
62049 + if (wjnode != NULL) {
62050 + release(wjnode);
62051 + bnode->wjnode = NULL;
62052 + }
62053 + if (cjnode != NULL) {
62054 + release(cjnode);
62055 + bnode->cjnode = NULL;
62056 + }
62057 +
62058 + return ret;
62059 +}
62060 +
62061 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
62062 +{
62063 + check_bnode_loaded(bnode);
62064 + mutex_unlock(&bnode->mutex);
62065 +}
62066 +
62067 +/* This function does all block allocation work but only for one bitmap
62068 + block. */
62069 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62070 + block responsibility zone boundaries. This made no sense in v3.6 but may
62071 + make sense in v4.x */
62072 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
62073 +static int
62074 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62075 + bmap_off_t max_offset, int min_len, int max_len)
62076 +{
62077 + struct super_block *super = get_current_context()->super;
62078 + struct bitmap_node *bnode = get_bnode(super, bmap);
62079 +
62080 + char *data;
62081 +
62082 + bmap_off_t search_end;
62083 + bmap_off_t start;
62084 + bmap_off_t end;
62085 +
62086 + int set_first_zero_bit = 0;
62087 +
62088 + int ret;
62089 +
62090 + assert("zam-364", min_len > 0);
62091 + assert("zam-365", max_len >= min_len);
62092 + assert("zam-366", *offset <= max_offset);
62093 +
62094 + ret = load_and_lock_bnode(bnode);
62095 +
62096 + if (ret)
62097 + return ret;
62098 +
62099 + data = bnode_working_data(bnode);
62100 +
62101 + start = *offset;
62102 +
62103 + if (bnode->first_zero_bit >= start) {
62104 + start = bnode->first_zero_bit;
62105 + set_first_zero_bit = 1;
62106 + }
62107 +
62108 + while (start + min_len < max_offset) {
62109 +
62110 + start =
62111 + reiser4_find_next_zero_bit((long *)data, max_offset, start);
62112 + if (set_first_zero_bit) {
62113 + bnode->first_zero_bit = start;
62114 + set_first_zero_bit = 0;
62115 + }
62116 + if (start >= max_offset)
62117 + break;
62118 +
62119 + search_end = LIMIT(start + max_len, max_offset);
62120 + end =
62121 + reiser4_find_next_set_bit((long *)data, search_end, start);
62122 + if (end >= start + min_len) {
62123 + /* we can't trust the find_next_set_bit result if no set bit
62124 + was found; the result may be bigger than
62125 + max_offset */
62126 + if (end > search_end)
62127 + end = search_end;
62128 +
62129 + ret = end - start;
62130 + *offset = start;
62131 +
62132 + reiser4_set_bits(data, start, end);
62133 +
62134 + /* FIXME: we may advance first_zero_bit if [start,
62135 + end] region overlaps the first_zero_bit point */
62136 +
62137 + break;
62138 + }
62139 +
62140 + start = end + 1;
62141 + }
62142 +
62143 + release_and_unlock_bnode(bnode);
62144 +
62145 + return ret;
62146 +}
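+/* A minimal userspace model of the forward scan above (hypothetical helper,
+ * guarded by #if 0 so it is never compiled): it operates on a plain
+ * little-endian bit array rather than a bnode, finds the first run of at
+ * least min_len zero bits in [*offset, max_offset), grabs at most max_len
+ * of them, and returns the number of bits grabbed (0 if no run fits). */
+#if 0
+static int model_search_forward(unsigned char *map, unsigned int *offset,
+				unsigned int max_offset, unsigned int min_len,
+				unsigned int max_len)
+{
+	unsigned int start = *offset, end;
+
+	while (start + min_len <= max_offset) {
+		/* skip set bits up to the next zero bit */
+		while (start < max_offset &&
+		       (map[start >> 3] & (1 << (start & 7))))
+			start++;
+		if (start >= max_offset)
+			break;
+		/* measure the zero run, clipped to max_len */
+		end = start;
+		while (end < max_offset && end - start < max_len &&
+		       !(map[end >> 3] & (1 << (end & 7))))
+			end++;
+		if (end - start >= min_len) {
+			unsigned int i;
+
+			for (i = start; i < end; i++)	/* grab the run */
+				map[i >> 3] |= 1 << (i & 7);
+			*offset = start;
+			return end - start;
+		}
+		start = end + 1;
+	}
+	return 0;
+}
+#endif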
62147 +
62148 +static int
62149 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62150 + bmap_off_t end_offset, int min_len, int max_len)
62151 +{
62152 + struct super_block *super = get_current_context()->super;
62153 + struct bitmap_node *bnode = get_bnode(super, bmap);
62154 + char *data;
62155 + bmap_off_t start;
62156 + int ret;
62157 +
62158 + assert("zam-958", min_len > 0);
62159 + assert("zam-959", max_len >= min_len);
62160 + assert("zam-960", *start_offset >= end_offset);
62161 +
62162 + ret = load_and_lock_bnode(bnode);
62163 + if (ret)
62164 + return ret;
62165 +
62166 + data = bnode_working_data(bnode);
62167 + start = *start_offset;
62168 +
62169 + while (1) {
62170 + bmap_off_t end, search_end;
62171 +
62172 + /* Find the beginning of the zero filled region */
62173 + if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62174 + break;
62175 + /* Are there at least `min_len' bits from `start' down to
62176 + * `end_offset'? */
62177 + if (start < end_offset + min_len - 1)
62178 + break;
62179 +
62180 + /* Do not search to `end_offset' if we need to find less than
62181 + * `max_len' zero bits. */
62182 + if (end_offset + max_len - 1 < start)
62183 + search_end = start - max_len + 1;
62184 + else
62185 + search_end = end_offset;
62186 +
62187 + if (reiser4_find_last_set_bit(&end, data, search_end, start))
62188 + end = search_end;
62189 + else
62190 + end++;
62191 +
62192 + if (end + min_len <= start + 1) {
62193 + if (end < search_end)
62194 + end = search_end;
62195 + ret = start - end + 1;
62196 + *start_offset = end; /* `end' is lowest offset */
62197 + assert("zam-987",
62198 + reiser4_find_next_set_bit(data, start + 1,
62199 + end) >= start + 1);
62200 + reiser4_set_bits(data, end, start + 1);
62201 + break;
62202 + }
62203 +
62204 + if (end <= end_offset)
62205 + /* left search boundary reached. */
62206 + break;
62207 + start = end - 1;
62208 + }
62209 +
62210 + release_and_unlock_bnode(bnode);
62211 + return ret;
62212 +}
62213 +
62214 +/* allocate contiguous range of blocks in bitmap */
62215 +static int bitmap_alloc_forward(reiser4_block_nr * start,
62216 + const reiser4_block_nr * end, int min_len,
62217 + int max_len)
62218 +{
62219 + bmap_nr_t bmap, end_bmap;
62220 + bmap_off_t offset, end_offset;
62221 + int len;
62222 +
62223 + reiser4_block_nr tmp;
62224 +
62225 + struct super_block *super = get_current_context()->super;
62226 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62227 +
62228 + parse_blocknr(start, &bmap, &offset);
62229 +
62230 + tmp = *end - 1;
62231 + parse_blocknr(&tmp, &end_bmap, &end_offset);
62232 + ++end_offset;
62233 +
62234 + assert("zam-358", end_bmap >= bmap);
62235 + assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62236 +
62237 + for (; bmap < end_bmap; bmap++, offset = 0) {
62238 + len =
62239 + search_one_bitmap_forward(bmap, &offset, max_offset,
62240 + min_len, max_len);
62241 + if (len != 0)
62242 + goto out;
62243 + }
62244 +
62245 + len =
62246 + search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62247 + max_len);
62248 + out:
62249 + *start = bmap * max_offset + offset;
62250 + return len;
62251 +}
62252 +
62253 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
62254 + * backward direction) */
62255 +static int bitmap_alloc_backward(reiser4_block_nr * start,
62256 + const reiser4_block_nr * end, int min_len,
62257 + int max_len)
62258 +{
62259 + bmap_nr_t bmap, end_bmap;
62260 + bmap_off_t offset, end_offset;
62261 + int len;
62262 + struct super_block *super = get_current_context()->super;
62263 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62264 +
62265 + parse_blocknr(start, &bmap, &offset);
62266 + parse_blocknr(end, &end_bmap, &end_offset);
62267 +
62268 + assert("zam-961", end_bmap <= bmap);
62269 + assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62270 +
62271 + for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62272 + len =
62273 + search_one_bitmap_backward(bmap, &offset, 0, min_len,
62274 + max_len);
62275 + if (len != 0)
62276 + goto out;
62277 + }
62278 +
62279 + len =
62280 + search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62281 + max_len);
62282 + out:
62283 + *start = bmap * max_offset + offset;
62284 + return len;
62285 +}
62286 +
62287 +/* plugin->u.space_allocator.alloc_blocks() */
62288 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62289 + reiser4_block_nr *start, reiser4_block_nr *len)
62290 +{
62291 + struct super_block *super = get_current_context()->super;
62292 + int actual_len;
62293 +
62294 + reiser4_block_nr search_start;
62295 + reiser4_block_nr search_end;
62296 +
62297 + assert("zam-398", super != NULL);
62298 + assert("zam-412", hint != NULL);
62299 + assert("zam-397", hint->blk <= reiser4_block_count(super));
62300 +
62301 + if (hint->max_dist == 0)
62302 + search_end = reiser4_block_count(super);
62303 + else
62304 + search_end =
62305 + LIMIT(hint->blk + hint->max_dist,
62306 + reiser4_block_count(super));
62307 +
62308 + /* We use @hint->blk as the search start and search from it to the end
62309 + of the disk, or within the given region if @hint->max_dist is not zero */
62310 + search_start = hint->blk;
62311 +
62312 + actual_len =
62313 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62314 +
62315 + /* Only one forward pass is done if max_dist was specified or if the
62316 + first pass already started from the beginning of the bitmap. The
62317 + backward scan likewise does only a single pass. */
62318 + if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
62319 + /* next step is a scanning from 0 to search_start */
62320 + search_end = search_start;
62321 + search_start = 0;
62322 + actual_len =
62323 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62324 + }
62325 + if (actual_len == 0)
62326 + return RETERR(-ENOSPC);
62327 + if (actual_len < 0)
62328 + return RETERR(actual_len);
62329 + *len = actual_len;
62330 + *start = search_start;
62331 + return 0;
62332 +}
62333 +
62334 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
62335 + reiser4_block_nr * start,
62336 + reiser4_block_nr * len)
62337 +{
62338 + reiser4_block_nr search_start;
62339 + reiser4_block_nr search_end;
62340 + int actual_len;
62341 +
62342 + ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
62343 +
62344 + assert("zam-969", super != NULL);
62345 + assert("zam-970", hint != NULL);
62346 + assert("zam-971", hint->blk <= reiser4_block_count(super));
62347 +
62348 + search_start = hint->blk;
62349 + if (hint->max_dist == 0 || search_start <= hint->max_dist)
62350 + search_end = 0;
62351 + else
62352 + search_end = search_start - hint->max_dist;
62353 +
62354 + actual_len =
62355 + bitmap_alloc_backward(&search_start, &search_end, 1, needed);
62356 + if (actual_len == 0)
62357 + return RETERR(-ENOSPC);
62358 + if (actual_len < 0)
62359 + return RETERR(actual_len);
62360 + *len = actual_len;
62361 + *start = search_start;
62362 + return 0;
62363 +}
62364 +
62365 +/* plugin->u.space_allocator.alloc_blocks() */
62366 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
62367 + reiser4_blocknr_hint * hint, int needed,
62368 + reiser4_block_nr * start, reiser4_block_nr * len)
62369 +{
62370 + if (hint->backward)
62371 + return alloc_blocks_backward(hint, needed, start, len);
62372 + return alloc_blocks_forward(hint, needed, start, len);
62373 +}
62374 +
62375 +/* plugin->u.space_allocator.dealloc_blocks(). */
62376 +/* It just frees blocks in the WORKING BITMAP. Usually formatted and
62377 + unformatted node deletion is deferred until transaction commit. However,
62378 + deallocation of temporary objects like wandered blocks and transaction
62379 + commit records requires immediate node deletion from the WORKING BITMAP. */
62380 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
62381 + reiser4_block_nr start, reiser4_block_nr len)
62382 +{
62383 + struct super_block *super = reiser4_get_current_sb();
62384 +
62385 + bmap_nr_t bmap;
62386 + bmap_off_t offset;
62387 +
62388 + struct bitmap_node *bnode;
62389 + int ret;
62390 +
62391 + assert("zam-468", len != 0);
62392 + check_block_range(&start, &len);
62393 +
62394 + parse_blocknr(&start, &bmap, &offset);
62395 +
62396 + assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
62397 +
62398 + bnode = get_bnode(super, bmap);
62399 +
62400 + assert("zam-470", bnode != NULL);
62401 +
62402 + ret = load_and_lock_bnode(bnode);
62403 + assert("zam-481", ret == 0);
62404 +
62405 + reiser4_clear_bits(bnode_working_data(bnode), offset,
62406 + (bmap_off_t) (offset + len));
62407 +
62408 + adjust_first_zero_bit(bnode, offset);
62409 +
62410 + release_and_unlock_bnode(bnode);
62411 +}
62412 +
62413 +/* plugin->u.space_allocator.check_blocks(). */
62414 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
62415 + const reiser4_block_nr * len, int desired)
62416 +{
62417 +#if REISER4_DEBUG
62418 + struct super_block *super = reiser4_get_current_sb();
62419 +
62420 + bmap_nr_t bmap;
62421 + bmap_off_t start_offset;
62422 + bmap_off_t end_offset;
62423 +
62424 + struct bitmap_node *bnode;
62425 + int ret;
62426 +
62427 + assert("zam-622", len != NULL);
62428 + check_block_range(start, len);
62429 + parse_blocknr(start, &bmap, &start_offset);
62430 +
62431 + end_offset = start_offset + *len;
62432 + assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
62433 +
62434 + bnode = get_bnode(super, bmap);
62435 +
62436 + assert("nikita-2215", bnode != NULL);
62437 +
62438 + ret = load_and_lock_bnode(bnode);
62439 + assert("zam-626", ret == 0);
62440 +
62441 + assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
62442 +
62443 + if (desired) {
62444 + assert("zam-623",
62445 + reiser4_find_next_zero_bit(bnode_working_data(bnode),
62446 + end_offset, start_offset)
62447 + >= end_offset);
62448 + } else {
62449 + assert("zam-624",
62450 + reiser4_find_next_set_bit(bnode_working_data(bnode),
62451 + end_offset, start_offset)
62452 + >= end_offset);
62453 + }
62454 +
62455 + release_and_unlock_bnode(bnode);
62456 +#endif
62457 +}
62458 +
62459 +/* conditional insertion of @node into atom's overwrite set if it was not there */
62460 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
62461 +{
62462 + assert("zam-546", atom != NULL);
62463 + assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
62464 + assert("zam-548", node != NULL);
62465 +
62466 + spin_lock_atom(atom);
62467 + spin_lock_jnode(node);
62468 +
62469 + if (node->atom == NULL) {
62470 + JF_SET(node, JNODE_OVRWR);
62471 + insert_into_atom_ovrwr_list(atom, node);
62472 + } else {
62473 + assert("zam-549", node->atom == atom);
62474 + }
62475 +
62476 + spin_unlock_jnode(node);
62477 + spin_unlock_atom(atom);
62478 +}
62479 +
62480 +/* an actor which applies the delete set to COMMIT bitmap pages and links
62481 + modified pages in a singly linked list */
62482 +static int
62483 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
62484 + const reiser4_block_nr * len, void *data)
62485 +{
62486 +
62487 + bmap_nr_t bmap;
62488 + bmap_off_t offset;
62489 + int ret;
62490 +
62491 + long long *blocks_freed_p = data;
62492 +
62493 + struct bitmap_node *bnode;
62494 +
62495 + struct super_block *sb = reiser4_get_current_sb();
62496 +
62497 + check_block_range(start, len);
62498 +
62499 + parse_blocknr(start, &bmap, &offset);
62500 +
62501 + /* FIXME-ZAM: we assume that all block ranges are allocated by this
62502 + bitmap-based allocator and that a block range can't cross a zone of
62503 + responsibility of one bitmap block; the same assumption is used in
62504 + other journal hooks in bitmap code. */
62505 + bnode = get_bnode(sb, bmap);
62506 + assert("zam-448", bnode != NULL);
62507 +
62508 + /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
62509 + assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
62510 + ret = load_and_lock_bnode(bnode);
62511 + if (ret)
62512 + return ret;
62513 +
62514 + /* put bnode into atom's overwrite set */
62515 + cond_add_to_overwrite_set(atom, bnode->cjnode);
62516 +
62517 + data = bnode_commit_data(bnode);
62518 +
62519 + ret = bnode_check_crc(bnode);
62520 + if (ret != 0)
62521 + return ret;
62522 +
62523 + if (len != NULL) {
62524 + /* FIXME-ZAM: a check that all bits are set should be there */
62525 + assert("zam-443",
62526 + offset + *len <= bmap_bit_count(sb->s_blocksize));
62527 + reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
62528 +
62529 + (*blocks_freed_p) += *len;
62530 + } else {
62531 + reiser4_clear_bit(offset, data);
62532 + (*blocks_freed_p)++;
62533 + }
62534 +
62535 + bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
62536 +
62537 + release_and_unlock_bnode(bnode);
62538 +
62539 + return 0;
62540 +}
62541 +
62542 +/* plugin->u.space_allocator.pre_commit_hook(). */
62543 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
62544 + rest is done by transaction manager (allocate wandered locations for COMMIT
62545 + BITMAP blocks, copy COMMIT BITMAP blocks data). */
62546 +/* Only one instance of this function can be running at any given time, because
62547 + only one transaction can be committed at a time, therefore it is safe to access
62548 + some global variables without any locking */
62549 +
62550 +int reiser4_pre_commit_hook_bitmap(void)
62551 +{
62552 + struct super_block *super = reiser4_get_current_sb();
62553 + txn_atom *atom;
62554 +
62555 + long long blocks_freed = 0;
62556 +
62557 + atom = get_current_atom_locked();
62558 + assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
62559 + spin_unlock_atom(atom);
62560 +
62561 + { /* scan atom's captured list and find all freshly allocated nodes,
62562 + * mark the corresponding bits in the COMMIT BITMAP as used */
62563 + struct list_head *head = ATOM_CLEAN_LIST(atom);
62564 + jnode *node = list_entry(head->next, jnode, capture_link);
62565 +
62566 + while (head != &node->capture_link) {
62567 + /* we detect freshly allocated jnodes */
62568 + if (JF_ISSET(node, JNODE_RELOC)) {
62569 + int ret;
62570 + bmap_nr_t bmap;
62571 +
62572 + bmap_off_t offset;
62573 + bmap_off_t index;
62574 + struct bitmap_node *bn;
62575 + __u32 size = bmap_size(super->s_blocksize);
62576 + __u32 crc;
62577 + char byte;
62578 +
62579 + assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
62580 + assert("zam-460",
62581 + !reiser4_blocknr_is_fake(&node->blocknr));
62582 +
62583 + parse_blocknr(&node->blocknr, &bmap, &offset);
62584 + bn = get_bnode(super, bmap);
62585 +
62586 + index = offset >> 3;
62587 + assert("vpf-276", index < size);
62588 +
62589 + ret = bnode_check_crc(bn);
62590 + if (ret != 0)
62591 + return ret;
62592 +
62593 + check_bnode_loaded(bn);
62594 + load_and_lock_bnode(bn);
62595 +
62596 + byte = *(bnode_commit_data(bn) + index);
62597 + reiser4_set_bit(offset, bnode_commit_data(bn));
62598 +
62599 + crc = adler32_recalc(bnode_commit_crc(bn), byte,
62600 + *(bnode_commit_data(bn) +
62601 + index),
62602 + size - index);
62603 + bnode_set_commit_crc(bn, crc);
62604 +
62605 + release_and_unlock_bnode(bn);
62606 +
62607 + ret = bnode_check_crc(bn);
62608 + if (ret != 0)
62609 + return ret;
62610 +
62611 + /* correctness of this depends on how the new
62612 + jnode is inserted into the clean list, because
62613 + we are scanning the same list now. It is OK if
62614 + insertion is done at the list front */
62615 + cond_add_to_overwrite_set(atom, bn->cjnode);
62616 + }
62617 +
62618 + node = list_entry(node->capture_link.next, jnode, capture_link);
62619 + }
62620 + }
62621 +
62622 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
62623 + &blocks_freed, 0);
62624 +
62625 + blocks_freed -= atom->nr_blocks_allocated;
62626 +
62627 + {
62628 + reiser4_super_info_data *sbinfo;
62629 +
62630 + sbinfo = get_super_private(super);
62631 +
62632 + spin_lock_reiser4_super(sbinfo);
62633 + sbinfo->blocks_free_committed += blocks_freed;
62634 + spin_unlock_reiser4_super(sbinfo);
62635 + }
62636 +
62637 + return 0;
62638 +}
62639 +
62640 +/* plugin->u.space_allocator.init_allocator
62641 + constructor of reiser4_space_allocator object. It is called on fs mount */
62642 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
62643 + struct super_block *super, void *arg)
62644 +{
62645 + struct bitmap_allocator_data *data = NULL;
62646 + bmap_nr_t bitmap_blocks_nr;
62647 + bmap_nr_t i;
62648 +
62649 + assert("nikita-3039", reiser4_schedulable());
62650 +
62651 + /* getting memory for bitmap allocator private data holder */
62652 + data =
62653 + kmalloc(sizeof(struct bitmap_allocator_data),
62654 + reiser4_ctx_gfp_mask_get());
62655 +
62656 + if (data == NULL)
62657 + return RETERR(-ENOMEM);
62658 +
62659 + /* allocation and initialization for the array of bnodes */
62660 + bitmap_blocks_nr = get_nr_bmap(super);
62661 +
62662 + /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
62663 + which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
62664 + may I never meet someone who still uses the ia32 architecture when
62665 + storage devices of that size enter the market, and wants to use ia32
62666 + with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
62667 + probably, another dynamic data structure should replace a static
62668 + array of bnodes. */
62669 + /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
62670 + data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
62671 + if (data->bitmap == NULL) {
62672 + kfree(data);
62673 + return RETERR(-ENOMEM);
62674 + }
62675 +
62676 + for (i = 0; i < bitmap_blocks_nr; i++)
62677 + init_bnode(data->bitmap + i, super, i);
62678 +
62679 + allocator->u.generic = data;
62680 +
62681 +#if REISER4_DEBUG
62682 + get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
62683 +#endif
62684 +
62685 + /* Load all bitmap blocks at mount time. */
62686 + if (!test_bit
62687 + (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
62688 + __u64 start_time, elapsed_time;
62689 + struct bitmap_node *bnode;
62690 + int ret;
62691 +
62692 + if (REISER4_DEBUG)
62693 + printk(KERN_INFO "loading reiser4 bitmap...");
62694 + start_time = jiffies;
62695 +
62696 + for (i = 0; i < bitmap_blocks_nr; i++) {
62697 + bnode = data->bitmap + i;
62698 + ret = load_and_lock_bnode(bnode);
62699 + if (ret) {
62700 + reiser4_destroy_allocator_bitmap(allocator,
62701 + super);
62702 + return ret;
62703 + }
62704 + release_and_unlock_bnode(bnode);
62705 + }
62706 +
62707 + elapsed_time = jiffies - start_time;
62708 + if (REISER4_DEBUG)
62709 + printk("...done (%llu jiffies)\n",
62710 + (unsigned long long)elapsed_time);
62711 + }
62712 +
62713 + return 0;
62714 +}
62715 +
62716 +/* plugin->u.space_allocator.destroy_allocator
62717 + destructor. It is called on fs unmount */
62718 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
62719 + struct super_block *super)
62720 +{
62721 + bmap_nr_t bitmap_blocks_nr;
62722 + bmap_nr_t i;
62723 +
62724 + struct bitmap_allocator_data *data = allocator->u.generic;
62725 +
62726 + assert("zam-414", data != NULL);
62727 + assert("zam-376", data->bitmap != NULL);
62728 +
62729 + bitmap_blocks_nr = get_nr_bmap(super);
62730 +
62731 + for (i = 0; i < bitmap_blocks_nr; i++) {
62732 + struct bitmap_node *bnode = data->bitmap + i;
62733 +
62734 + mutex_lock(&bnode->mutex);
62735 +
62736 +#if REISER4_DEBUG
62737 + if (atomic_read(&bnode->loaded)) {
62738 + jnode *wj = bnode->wjnode;
62739 + jnode *cj = bnode->cjnode;
62740 +
62741 + assert("zam-480", jnode_page(cj) != NULL);
62742 + assert("zam-633", jnode_page(wj) != NULL);
62743 +
62744 + assert("zam-634",
62745 + memcmp(jdata(wj), jdata(cj),
62746 + bmap_size(super->s_blocksize)) == 0);
62747 +
62748 + }
62749 +#endif
62750 + done_bnode(bnode);
62751 + mutex_unlock(&bnode->mutex);
62752 + }
62753 +
62754 + vfree(data->bitmap);
62755 + kfree(data);
62756 +
62757 + allocator->u.generic = NULL;
62758 +
62759 + return 0;
62760 +}
62761 +
62762 +/*
62763 + * Local variables:
62764 + * c-indentation-style: "K&R"
62765 + * mode-name: "LC"
62766 + * c-basic-offset: 8
62767 + * tab-width: 8
62768 + * fill-column: 79
62769 + * scroll-step: 1
62770 + * End:
62771 + */
62772 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.22/fs/reiser4/plugin/space/bitmap.h
62773 --- linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
62774 +++ linux-2.6.22/fs/reiser4/plugin/space/bitmap.h 2007-07-29 00:25:35.004729608 +0400
62775 @@ -0,0 +1,47 @@
62776 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62777 +
62778 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
62779 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
62780 +
62781 +#include "../../dformat.h"
62782 +#include "../../block_alloc.h"
62783 +
62784 +#include <linux/types.h> /* for __u?? */
62785 +#include <linux/fs.h> /* for struct super_block */
62786 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
62787 +/* declarations of functions implementing methods of space allocator plugin for
62788 + bitmap based allocator. The functions themselves are in bitmap.c */
62789 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
62790 + struct super_block *, void *);
62791 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
62792 + struct super_block *);
62793 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
62794 + reiser4_blocknr_hint *, int needed,
62795 + reiser4_block_nr * start,
62796 + reiser4_block_nr * len);
62797 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
62798 + const reiser4_block_nr *, int);
62799 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
62800 + reiser4_block_nr,
62801 + reiser4_block_nr);
62802 +extern int reiser4_pre_commit_hook_bitmap(void);
62803 +
62804 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
62805 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
62806 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
62807 +
62808 +typedef __u64 bmap_nr_t;
62809 +typedef __u32 bmap_off_t;
62810 +
62811 +#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
62812 +
62813 +/* Make Linus happy.
62814 + Local variables:
62815 + c-indentation-style: "K&R"
62816 + mode-name: "LC"
62817 + c-basic-offset: 8
62818 + tab-width: 8
62819 + fill-column: 120
62820 + scroll-step: 1
62821 + End:
62822 +*/
62823 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/Makefile linux-2.6.22/fs/reiser4/plugin/space/Makefile
62824 --- linux-2.6.22.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
62825 +++ linux-2.6.22/fs/reiser4/plugin/space/Makefile 2007-07-29 00:25:35.004729608 +0400
62826 @@ -0,0 +1,4 @@
62827 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
62828 +
62829 +space_plugins-objs := \
62830 + bitmap.o
62831 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.22/fs/reiser4/plugin/space/space_allocator.h
62832 --- linux-2.6.22.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
62833 +++ linux-2.6.22/fs/reiser4/plugin/space/space_allocator.h 2007-07-29 00:25:35.004729608 +0400
62834 @@ -0,0 +1,80 @@
62835 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62836 +
62837 +#ifndef __SPACE_ALLOCATOR_H__
62838 +#define __SPACE_ALLOCATOR_H__
62839 +
62840 +#include "../../forward.h"
62841 +#include "bitmap.h"
62842 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
62843 + * but... */
62844 +#define DEF_SPACE_ALLOCATOR(allocator) \
62845 + \
62846 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
62847 +{ \
62848 + return reiser4_init_allocator_##allocator (al, s, opaque); \
62849 +} \
62850 + \
62851 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
62852 +{ \
62853 + reiser4_destroy_allocator_##allocator (al, s); \
62854 +} \
62855 + \
62856 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
62857 + int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
62858 +{ \
62859 + return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
62860 +} \
62861 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
62862 +{ \
62863 + reiser4_dealloc_blocks_##allocator (al, start, len); \
62864 +} \
62865 + \
62866 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
62867 +{ \
62868 + reiser4_check_blocks_##allocator (start, end, desired); \
62869 +} \
62870 + \
62871 +static inline void sa_pre_commit_hook (void) \
62872 +{ \
62873 + reiser4_pre_commit_hook_##allocator (); \
62874 +} \
62875 + \
62876 +static inline void sa_post_commit_hook (void) \
62877 +{ \
62878 + reiser4_post_commit_hook_##allocator (); \
62879 +} \
62880 + \
62881 +static inline void sa_post_write_back_hook (void) \
62882 +{ \
62883 + reiser4_post_write_back_hook_##allocator(); \
62884 +} \
62885 + \
62886 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
62887 +{ \
62888 + reiser4_print_info_##allocator (prefix, al); \
62889 +}
62890 +
62891 +DEF_SPACE_ALLOCATOR(bitmap)
62892 +
62893 +/* this object is part of reiser4 private in-core super block */
62894 +struct reiser4_space_allocator {
62895 + union {
62896 + /* space allocators might use this pointer to reference their
62897 + * data. */
62898 + void *generic;
62899 + } u;
62900 +};
62901 +
62902 +/* __SPACE_ALLOCATOR_H__ */
62903 +#endif
62904 +
62905 +/* Make Linus happy.
62906 + Local variables:
62907 + c-indentation-style: "K&R"
62908 + mode-name: "LC"
62909 + c-basic-offset: 8
62910 + tab-width: 8
62911 + fill-column: 120
62912 + scroll-step: 1
62913 + End:
62914 +*/
62915 diff -urN linux-2.6.22.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.22/fs/reiser4/plugin/tail_policy.c
62916 --- linux-2.6.22.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
62917 +++ linux-2.6.22/fs/reiser4/plugin/tail_policy.c 2007-07-29 00:25:35.008730643 +0400
62918 @@ -0,0 +1,113 @@
62919 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62920 + * reiser4/README */
62921 +
62922 +/* Formatting policy plugins */
62923 +
62924 +/*
62925 + * Formatting policy plugin is used by object plugin (of regular file) to
62926 + * convert file between two representations.
62927 + *
62928 + * Currently the following policies are implemented:
62929 + * never store file in formatted nodes
62930 + * always store file in formatted nodes
62931 + * store file in formatted nodes if file is not larger than 4 blocks (default)
62932 + */
62933 +
62934 +#include "../tree.h"
62935 +#include "../inode.h"
62936 +#include "../super.h"
62937 +#include "object.h"
62938 +#include "plugin.h"
62939 +#include "node/node.h"
62940 +#include "plugin_header.h"
62941 +
62942 +#include <linux/pagemap.h>
62943 +#include <linux/fs.h> /* For struct inode */
62944 +
62945 +/**
62946 + * have_formatting_never - formatting policy that never uses tail items
62947 + * @inode: inode to operate on
62948 + * @size: new object size
62949 + *
62950 + * Always returns 0, so the file body is never stored in formatted nodes.
62951 + */
62952 +/* Never store file's tail as direct item */
62953 +/* Audited by: green(2002.06.12) */
62954 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
62955 + /* inode to operate on */ ,
62956 + loff_t size UNUSED_ARG /* new object size */ )
62957 +{
62958 + return 0;
62959 +}
62960 +
62961 +/* Always store file's tail as direct item */
62962 +/* Audited by: green(2002.06.12) */
62963 +static int
62964 +have_formatting_always(const struct inode *inode UNUSED_ARG
62965 + /* inode to operate on */ ,
62966 + loff_t size UNUSED_ARG /* new object size */ )
62967 +{
62968 + return 1;
62969 +}
62970 +
62971 +/* This function tests whether the file denoted by @inode should be stored
62972 + as tail items only or as extents only. */
62973 +static int
62974 +have_formatting_default(const struct inode *inode UNUSED_ARG
62975 + /* inode to operate on */ ,
62976 + loff_t size /* new object size */ )
62977 +{
62978 + assert("umka-1253", inode != NULL);
62979 +
62980 + if (size > inode->i_sb->s_blocksize * 4)
62981 + return 0;
62982 +
62983 + return 1;
62984 +}
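+/* Worked example: with a 4096-byte blocksize the cut-off is 16384 bytes; a
+ * file of exactly 16384 bytes still returns 1 (tail items), while a
+ * 16385-byte file returns 0 (extents). */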
62985 +
62986 +/* tail plugins */
62987 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
62988 + [NEVER_TAILS_FORMATTING_ID] = {
62989 + .h = {
62990 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
62991 + .id = NEVER_TAILS_FORMATTING_ID,
62992 + .pops = NULL,
62993 + .label = "never",
62994 + .desc = "Never store file's tail",
62995 + .linkage = {NULL, NULL}
62996 + },
62997 + .have_tail = have_formatting_never
62998 + },
62999 + [ALWAYS_TAILS_FORMATTING_ID] = {
63000 + .h = {
63001 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63002 + .id = ALWAYS_TAILS_FORMATTING_ID,
63003 + .pops = NULL,
63004 + .label = "always",
63005 + .desc = "Always store file's tail",
63006 + .linkage = {NULL, NULL}
63007 + },
63008 + .have_tail = have_formatting_always
63009 + },
63010 + [SMALL_FILE_FORMATTING_ID] = {
63011 + .h = {
63012 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63013 + .id = SMALL_FILE_FORMATTING_ID,
63014 + .pops = NULL,
63015 + .label = "4blocks",
63016 + .desc = "store files shorter than 4 blocks in tail items",
63017 + .linkage = {NULL, NULL}
63018 + },
63019 + .have_tail = have_formatting_default
63020 + }
63021 +};
63022 +
63023 +/*
63024 + * Local variables:
63025 + * c-indentation-style: "K&R"
63026 + * mode-name: "LC"
63027 + * c-basic-offset: 8
63028 + * tab-width: 8
63029 + * fill-column: 79
63030 + * End:
63031 + */
63032 diff -urN linux-2.6.22.orig/fs/reiser4/pool.c linux-2.6.22/fs/reiser4/pool.c
63033 --- linux-2.6.22.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
63034 +++ linux-2.6.22/fs/reiser4/pool.c 2007-07-29 00:25:35.008730643 +0400
63035 @@ -0,0 +1,231 @@
63036 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63037 + * reiser4/README */
63038 +
63039 +/* Fast pool allocation.
63040 +
63041 + There are situations when some sub-system normally asks the memory
63042 + allocator for only a few objects, but under some circumstances could
63043 + require many more. The typical and actually motivating example is tree
63044 + balancing. It needs to keep track of nodes that were involved in it, and
63045 + it is well-known that in a reasonably packed balanced tree most
63046 + (92.938121%) of all balancings finish after working with only a few nodes
63047 + (3.141592 on average). But in rare cases balancing can involve many more
63048 + nodes (3*tree_height+1 in the extreme situation).
63049 +
63050 + On the one hand, we don't want to resort to dynamic allocation (slab,
63051 + malloc(), etc.) to allocate data structures required to keep track of
63052 + nodes during balancing. On the other hand, we cannot statically allocate
63053 + the required amount of space on the stack, because first: it is a useless
63054 + waste of a precious resource, and second: this amount is unknown in
63055 + advance (tree height can change).
63056 +
63057 + Pools, implemented in this file, are a solution to this problem:
63058 +
63059 + - some configurable amount of objects is statically preallocated on the
63060 + stack
63061 +
63062 + - if this preallocated pool is exhausted and more objects are requested,
63063 + they are allocated dynamically.
63064 +
63065 + Pools encapsulate distinction between statically and dynamically allocated
63066 + objects. Both allocation and recycling look exactly the same.
63067 +
63068 + To keep track of dynamically allocated objects, pool adds its own linkage
63069 + to each object.
63070 +
63071 + NOTE-NIKITA This linkage also contains some balancing-specific data. This
63072 + is not perfect. On the other hand, balancing is currently the only client
63073 + of pool code.
63074 +
63075 + NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63076 + functions in the style of tslist/tshash, i.e., make them unreadable, but
63077 + type-safe.
63078 +
63079 +*/
63080 +
63081 +#include "debug.h"
63082 +#include "pool.h"
63083 +#include "super.h"
63084 +
63085 +#include <linux/types.h>
63086 +#include <linux/err.h>
63087 +
63088 +/* initialize new pool object @h */
63089 +static void reiser4_init_pool_obj(struct reiser4_pool_header * h)
63090 +{
63091 + INIT_LIST_HEAD(&h->usage_linkage);
63092 + INIT_LIST_HEAD(&h->level_linkage);
63093 + INIT_LIST_HEAD(&h->extra_linkage);
63094 +}
63095 +
63096 +/* initialize new pool */
63097 +void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ ,
63098 + size_t obj_size /* size of objects in @pool */ ,
63099 + int num_of_objs /* number of preallocated objects */ ,
63100 + char *data /* area for preallocated objects */ )
63101 +{
63102 + struct reiser4_pool_header *h;
63103 + int i;
63104 +
63105 + assert("nikita-955", pool != NULL);
63106 + assert("nikita-1044", obj_size > 0);
63107 + assert("nikita-956", num_of_objs >= 0);
63108 + assert("nikita-957", data != NULL);
63109 +
63110 + memset(pool, 0, sizeof *pool);
63111 + pool->obj_size = obj_size;
63112 + pool->data = data;
63113 + INIT_LIST_HEAD(&pool->free);
63114 + INIT_LIST_HEAD(&pool->used);
63115 + INIT_LIST_HEAD(&pool->extra);
63116 + memset(data, 0, obj_size * num_of_objs);
63117 + for (i = 0; i < num_of_objs; ++i) {
63118 + h = (struct reiser4_pool_header *) (data + i * obj_size);
63119 + reiser4_init_pool_obj(h);
63120 + /* add pool header to the end of pool's free list */
63121 + list_add_tail(&h->usage_linkage, &pool->free);
63122 + }
63123 +}
63124 +
63125 +/* release pool resources
63126 +
63127 + Release all resources acquired by this pool, specifically, dynamically
63128 + allocated objects.
63129 +
63130 +*/
63131 +void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG)
63132 +{
63133 +}
63134 +
63135 +/* allocate carry object from @pool
63136 +
63137 + First, try to get preallocated object. If this fails, resort to dynamic
63138 + allocation.
63139 +
63140 +*/
63141 +static void *reiser4_pool_alloc(struct reiser4_pool * pool)
63142 +{
63143 + struct reiser4_pool_header *result;
63144 +
63145 + assert("nikita-959", pool != NULL);
63146 +
63147 + if (!list_empty(&pool->free)) {
63148 + struct list_head *linkage;
63149 +
63150 + linkage = pool->free.next;
63151 + list_del(linkage);
63152 + INIT_LIST_HEAD(linkage);
63153 + result = list_entry(linkage, struct reiser4_pool_header,
63154 + usage_linkage);
63155 + BUG_ON(!list_empty(&result->level_linkage) ||
63156 + !list_empty(&result->extra_linkage));
63157 + } else {
63158 + /* pool is empty. Extra allocations don't deserve a dedicated
63159 + slab to be served from, as they are expected to be rare. */
63160 + result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
63161 + if (result != NULL) {
63162 + reiser4_init_pool_obj(result);
63163 + list_add(&result->extra_linkage, &pool->extra);
63164 + } else
63165 + return ERR_PTR(RETERR(-ENOMEM));
63166 + BUG_ON(!list_empty(&result->usage_linkage) ||
63167 + !list_empty(&result->level_linkage));
63168 + }
63169 + ++pool->objs;
63170 + list_add(&result->usage_linkage, &pool->used);
63171 + memset(result + 1, 0, pool->obj_size - sizeof *result);
63172 + return result;
63173 +}
63174 +
63175 +/* return object back to the pool */
63176 +void reiser4_pool_free(struct reiser4_pool * pool,
63177 + struct reiser4_pool_header * h)
63178 +{
63179 + assert("nikita-961", h != NULL);
63180 + assert("nikita-962", pool != NULL);
63181 +
63182 + --pool->objs;
63183 + assert("nikita-963", pool->objs >= 0);
63184 +
63185 + list_del_init(&h->usage_linkage);
63186 + list_del_init(&h->level_linkage);
63187 +
63188 + if (list_empty(&h->extra_linkage))
63189 + /*
63190 + * pool header is not an extra one. Push it onto free list
63191 + * using usage_linkage
63192 + */
63193 + list_add(&h->usage_linkage, &pool->free);
63194 + else {
63195 + /* remove pool header from pool's extra list and kfree it */
63196 + list_del(&h->extra_linkage);
63197 + kfree(h);
63198 + }
63199 +}
63200 +
63201 +/* add new object to the carry level list
63202 +
63203 + The carry level is FIFO most of the time, but not always. Complications
63204 + arise when the make_space() function tries to go to the left neighbor and
63205 + thus adds a carry node before existing nodes; also, when updating
63206 + delimiting keys after moving data between two nodes, we want the left node
63207 + to be locked before the right node.
63208 +
63209 + The latter case is confusing at first glance. The problem is that the
63210 + COP_UPDATE operation that updates delimiting keys is sometimes called with
63211 + two nodes (when data are moved between two nodes) and sometimes with only
63212 + one node (when the leftmost item is deleted in a node). In any case the
63213 + operation is supplied with at least the node whose left delimiting key is
63214 + to be updated (that is, the "right" node).
63215 +
63216 + @pool - from which to allocate new object;
63217 + @list - where to add object;
63218 + @reference - after (or before) which existing object to add
63219 +*/
63220 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
63221 + struct list_head *list,
63222 + pool_ordering order,
63223 + struct reiser4_pool_header * reference)
63224 +{
63225 + struct reiser4_pool_header *result;
63226 +
63227 + assert("nikita-972", pool != NULL);
63228 +
63229 + result = reiser4_pool_alloc(pool);
63230 + if (IS_ERR(result))
63231 + return result;
63232 +
63233 + assert("nikita-973", result != NULL);
63234 +
63235 + switch (order) {
63236 + case POOLO_BEFORE:
63237 + __list_add(&result->level_linkage,
63238 + reference->level_linkage.prev,
63239 + &reference->level_linkage);
63240 + break;
63241 + case POOLO_AFTER:
63242 + __list_add(&result->level_linkage,
63243 + &reference->level_linkage,
63244 + reference->level_linkage.next);
63245 + break;
63246 + case POOLO_LAST:
63247 + list_add_tail(&result->level_linkage, list);
63248 + break;
63249 + case POOLO_FIRST:
63250 + list_add(&result->level_linkage, list);
63251 + break;
63252 + default:
63253 + wrong_return_value("nikita-927", "order");
63254 + }
63255 + return result;
63256 +}
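+/* A minimal usage sketch (hypothetical caller, guarded by #if 0 so it is
+ * never compiled; struct my_obj and NR_OBJS are made up for illustration):
+ * embed the pool header as the first member, preallocate a few objects on
+ * the stack, and let the pool fall back to kmalloc transparently when the
+ * preallocated objects run out. */
+#if 0
+struct my_obj {
+	struct reiser4_pool_header header;	/* must be the first member */
+	int payload;
+};
+
+#define NR_OBJS 8
+
+static void pool_usage_sketch(void)
+{
+	struct reiser4_pool pool;
+	struct my_obj prealloc[NR_OBJS];
+	struct reiser4_pool_header *h;
+	LIST_HEAD(level);
+
+	reiser4_init_pool(&pool, sizeof(struct my_obj), NR_OBJS,
+			  (char *)prealloc);
+	/* served from prealloc[]; kmalloc happens only past NR_OBJS objects */
+	h = reiser4_add_obj(&pool, &level, POOLO_LAST, NULL);
+	if (!IS_ERR(h)) {
+		((struct my_obj *)h)->payload = 42;
+		reiser4_pool_free(&pool, h);
+	}
+	reiser4_done_pool(&pool);
+}
+#endif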
63257 +
63258 +/* Make Linus happy.
63259 + Local variables:
63260 + c-indentation-style: "K&R"
63261 + mode-name: "LC"
63262 + c-basic-offset: 8
63263 + tab-width: 8
63264 + fill-column: 120
63265 + End:
63266 +*/
63267 diff -urN linux-2.6.22.orig/fs/reiser4/pool.h linux-2.6.22/fs/reiser4/pool.h
63268 --- linux-2.6.22.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
63269 +++ linux-2.6.22/fs/reiser4/pool.h 2007-07-29 00:25:35.008730643 +0400
63270 @@ -0,0 +1,56 @@
63271 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63272 +
63273 +/* Fast pool allocation */
63274 +
63275 +#ifndef __REISER4_POOL_H__
63276 +#define __REISER4_POOL_H__
63277 +
63278 +#include <linux/types.h>
63279 +
63280 +struct reiser4_pool {
63281 + size_t obj_size;
63282 + int objs;
63283 + char *data;
63284 + struct list_head free;
63285 + struct list_head used;
63286 + struct list_head extra;
63287 +};
63288 +
63289 +struct reiser4_pool_header {
63290 + /* object is either on free or "used" lists */
63291 + struct list_head usage_linkage;
63292 + struct list_head level_linkage;
63293 + struct list_head extra_linkage;
63294 +};
63295 +
63296 +typedef enum {
63297 + POOLO_BEFORE,
63298 + POOLO_AFTER,
63299 + POOLO_LAST,
63300 + POOLO_FIRST
63301 +} pool_ordering;
63302 +
63303 +/* pool manipulation functions */
63304 +
63305 +extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size,
63306 + int num_of_objs, char *data);
63307 +extern void reiser4_done_pool(struct reiser4_pool * pool);
63308 +extern void reiser4_pool_free(struct reiser4_pool * pool,
63309 + struct reiser4_pool_header * h);
63310 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
63311 + struct list_head * list,
63312 + pool_ordering order,
63313 + struct reiser4_pool_header *reference);
63314 +
63315 +/* __REISER4_POOL_H__ */
63316 +#endif
63317 +
63318 +/* Make Linus happy.
63319 + Local variables:
63320 + c-indentation-style: "K&R"
63321 + mode-name: "LC"
63322 + c-basic-offset: 8
63323 + tab-width: 8
63324 + fill-column: 120
63325 + End:
63326 +*/
63327 diff -urN linux-2.6.22.orig/fs/reiser4/readahead.c linux-2.6.22/fs/reiser4/readahead.c
63328 --- linux-2.6.22.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
63329 +++ linux-2.6.22/fs/reiser4/readahead.c 2007-07-29 00:25:35.008730643 +0400
63330 @@ -0,0 +1,138 @@
63331 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63332 + * reiser4/README */
63333 +
63334 +#include "forward.h"
63335 +#include "tree.h"
63336 +#include "tree_walk.h"
63337 +#include "super.h"
63338 +#include "inode.h"
63339 +#include "key.h"
63340 +#include "znode.h"
63341 +
63342 +#include <linux/swap.h> /* for totalram_pages */
63343 +
63344 +void reiser4_init_ra_info(ra_info_t * rai)
63345 +{
63346 + rai->key_to_stop = *reiser4_min_key();
63347 +}
63348 +
63349 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
63350 +static inline int ra_adjacent_only(int flags)
63351 +{
63352 + return flags & RA_ADJACENT_ONLY;
63353 +}
63354 +
63355 +/* This is used by formatted_readahead() to decide whether a read for the right neighbor of a node should be issued. It
63356 +   returns 1 if the right neighbor's first key is less than or equal to the readahead stop key */
63357 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
63358 +{
63359 + int result;
63360 +
63361 + read_lock_dk(znode_get_tree(node));
63362 + result = keyle(znode_get_rd_key(node), &info->key_to_stop);
63363 + read_unlock_dk(znode_get_tree(node));
63364 + return result;
63365 +}
63366 +
63367 +#define LOW_MEM_PERCENTAGE (5)
63368 +
63369 +static int low_on_memory(void)
63370 +{
63371 + unsigned int freepages;
63372 +
63373 + freepages = nr_free_pages();
63374 + return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
63375 +}
63376 +
63377 +/* start read for @node and for a few of its right neighbors */
63378 +void formatted_readahead(znode * node, ra_info_t * info)
63379 +{
63380 + struct formatted_ra_params *ra_params;
63381 + znode *cur;
63382 + int i;
63383 + int grn_flags;
63384 + lock_handle next_lh;
63385 +
63386 +	/* do nothing if a block number has not yet been assigned to the node (it exists only in memory). */
63387 + if (reiser4_blocknr_is_fake(znode_get_block(node)))
63388 + return;
63389 +
63390 + ra_params = get_current_super_ra_params();
63391 +
63392 + if (znode_page(node) == NULL)
63393 + jstartio(ZJNODE(node));
63394 +
63395 + if (znode_get_level(node) != LEAF_LEVEL)
63396 + return;
63397 +
63398 + /* don't waste memory for read-ahead when low on memory */
63399 + if (low_on_memory())
63400 + return;
63401 +
63402 +	/* We can have locked nodes on upper tree levels; in this situation lock
63403 +	   priorities do not help to resolve deadlocks, so we have to use TRY_LOCK
63404 +	   here. */
63405 + grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
63406 +
63407 + i = 0;
63408 + cur = zref(node);
63409 + init_lh(&next_lh);
63410 + while (i < ra_params->max) {
63411 + const reiser4_block_nr *nextblk;
63412 +
63413 + if (!should_readahead_neighbor(cur, info))
63414 + break;
63415 +
63416 + if (reiser4_get_right_neighbor
63417 + (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
63418 + break;
63419 +
63420 + nextblk = znode_get_block(next_lh.node);
63421 + if (reiser4_blocknr_is_fake(nextblk) ||
63422 + (ra_adjacent_only(ra_params->flags)
63423 + && *nextblk != *znode_get_block(cur) + 1)) {
63424 + break;
63425 + }
63426 +
63427 + zput(cur);
63428 + cur = zref(next_lh.node);
63429 + done_lh(&next_lh);
63430 + if (znode_page(cur) == NULL)
63431 + jstartio(ZJNODE(cur));
63432 + else
63433 + /* Do not scan read-ahead window if pages already
63434 + * allocated (and i/o already started). */
63435 + break;
63436 +
63437 + i++;
63438 + }
63439 + zput(cur);
63440 + done_lh(&next_lh);
63441 +}
63442 +
63443 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
63444 +{
63445 + reiser4_key *stop_key;
63446 +
63447 + assert("nikita-3542", dir != NULL);
63448 + assert("nikita-3543", tap != NULL);
63449 +
63450 + stop_key = &tap->ra_info.key_to_stop;
63451 +	/* initialize readdir readahead information: include the stat data of
63452 +	 * all files of the directory into readahead */
63453 + set_key_locality(stop_key, get_inode_oid(dir));
63454 + set_key_type(stop_key, KEY_SD_MINOR);
63455 + set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
63456 + set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
63457 + set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
63458 +}
63459 +
63460 +/*
63461 + Local variables:
63462 + c-indentation-style: "K&R"
63463 + mode-name: "LC"
63464 + c-basic-offset: 8
63465 + tab-width: 8
63466 + fill-column: 80
63467 + End:
63468 +*/
63469 diff -urN linux-2.6.22.orig/fs/reiser4/readahead.h linux-2.6.22/fs/reiser4/readahead.h
63470 --- linux-2.6.22.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
63471 +++ linux-2.6.22/fs/reiser4/readahead.h 2007-07-29 00:25:35.008730643 +0400
63472 @@ -0,0 +1,51 @@
63473 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63474 +
63475 +#ifndef __READAHEAD_H__
63476 +#define __READAHEAD_H__
63477 +
63478 +#include "key.h"
63479 +
63480 +typedef enum {
63481 + RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
63482 + Default is NO (not only adjacent) */
63483 +} ra_global_flags;
63484 +
63485 +/* reiser4 super block has a field of this type.
63486 + It controls readahead during tree traversals */
63487 +struct formatted_ra_params {
63488 + unsigned long max; /* request not more than this amount of nodes.
63489 + Default is totalram_pages / 4 */
63490 + int flags;
63491 +};
63492 +
63493 +typedef struct {
63494 + reiser4_key key_to_stop;
63495 +} ra_info_t;
63496 +
63497 +void formatted_readahead(znode *, ra_info_t *);
63498 +void reiser4_init_ra_info(ra_info_t * rai);
63499 +
63500 +struct reiser4_file_ra_state {
63501 + loff_t start; /* Current window */
63502 + loff_t size;
63503 + loff_t next_size; /* Next window size */
63504 + loff_t ahead_start; /* Ahead window */
63505 + loff_t ahead_size;
63506 + loff_t max_window_size; /* Maximum readahead window */
63507 + loff_t slow_start; /* enlarging r/a size algorithm. */
63508 +};
63509 +
63510 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
63511 +
63512 +/* __READAHEAD_H__ */
63513 +#endif
63514 +
63515 +/*
63516 + Local variables:
63517 + c-indentation-style: "K&R"
63518 + mode-name: "LC"
63519 + c-basic-offset: 8
63520 + tab-width: 8
63521 + fill-column: 120
63522 + End:
63523 +*/
63524 diff -urN linux-2.6.22.orig/fs/reiser4/README linux-2.6.22/fs/reiser4/README
63525 --- linux-2.6.22.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
63526 +++ linux-2.6.22/fs/reiser4/README 2007-07-29 00:25:35.008730643 +0400
63527 @@ -0,0 +1,128 @@
63528 +[LICENSING]
63529 +
63530 +Reiser4 is hereby licensed under the GNU General
63531 +Public License version 2.
63532 +
63533 +Source code files that contain the phrase "licensing governed by
63534 +reiser4/README" are "governed files" throughout this file. Governed
63535 +files are licensed under the GPL. The portions of them owned by Hans
63536 +Reiser, or authorized to be licensed by him, have been in the past,
63537 +and likely will be in the future, licensed to other parties under
63538 +other licenses. If you add your code to governed files, and don't
63539 +want it to be owned by Hans Reiser, put your copyright label on that
63540 +code so the poor blight and his customers can keep things straight.
63541 +All portions of governed files not labeled otherwise are owned by Hans
63542 +Reiser, and by adding your code to it, widely distributing it to
63543 +others or sending us a patch, and leaving the sentence in stating that
63544 +licensing is governed by the statement in this file, you accept this.
63545 +It will be a kindness if you identify whether Hans Reiser is allowed
63546 +to license code labeled as owned by you on your behalf other than
63547 +under the GPL, because he wants to know if it is okay to do so and put
63548 +a check in the mail to you (for non-trivial improvements) when he
63549 +makes his next sale. He makes no guarantees as to the amount if any,
63550 +though he feels motivated to motivate contributors, and you can surely
63551 +discuss this with him before or after contributing. You have the
63552 +right to decline to allow him to license your code contribution other
63553 +than under the GPL.
63554 +
63555 +Further licensing options are available for commercial and/or other
63556 +interests directly from Hans Reiser: reiser@namesys.com. If you interpret
63557 +the GPL as not allowing those additional licensing options, you read
63558 +it wrongly, and Richard Stallman agrees with me, when carefully read
63559 +you can see that those restrictions on additional terms do not apply
63560 +to the owner of the copyright, and my interpretation of this shall
63561 +govern for this license.
63562 +
63563 +[END LICENSING]
63564 +
63565 +Reiser4 is a file system based on dancing tree algorithms, and is
63566 +described at http://www.namesys.com
63567 +
63568 +mkfs.reiser4 and other utilities are on our webpage or wherever your
63569 +Linux provider put them. You really want to be running the latest
63570 +version off the website if you use fsck.
63571 +
63572 +Yes, if you update your reiser4 kernel module you do have to
63573 +recompile your kernel, most of the time. The errors you get will be
63574 +quite cryptic if you forget to do so.
63575 +
63576 +Hideous Commercial Pitch: Spread your development costs across other OS
63577 +vendors. Select from the best in the world, not the best in your
63578 +building, by buying from third party OS component suppliers. Leverage
63579 +the software component development power of the internet. Be the most
63580 +aggressive in taking advantage of the commercial possibilities of
63581 +decentralized internet development, and add value through your branded
63582 +integration that you sell as an operating system. Let your competitors
63583 +be the ones to compete against the entire internet by themselves. Be
63584 +hip, get with the new economic trend, before your competitors do. Send
63585 +email to reiser@namesys.com
63586 +
63587 +Hans Reiser was the primary architect of Reiser4, but a whole team
63588 +chipped their ideas in. He invested everything he had into Namesys
63589 +for 5.5 dark years of no money before Reiser3 finally started to work well
63590 +enough to bring in money. He owns the copyright.
63591 +
63592 +DARPA was the primary sponsor of Reiser4. DARPA does not endorse
63593 +Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
63594 +opinion, unique in its willingness to invest into things more
63595 +theoretical than the VC community can readily understand, and more
63596 +longterm than allows them to be sure that they will be the ones to
63597 +extract the economic benefits from. DARPA also integrated us into a
63598 +security community that transformed our security worldview.
63599 +
63600 +Vladimir Saveliev is our lead programmer, with us from the beginning,
63601 +and he worked long hours writing the cleanest code. This is why he is
63602 +now the lead programmer after years of commitment to our work. He
63603 +always made the effort to be the best he could be, and to make his
63604 +code the best that it could be. What resulted was quite remarkable. I
63605 +don't think that money can ever motivate someone to work the way he
63606 +did, he is one of the most selfless men I know.
63607 +
63608 +Alexander Lyamin was our sysadmin, and helped to educate us in
63609 +security issues. Moscow State University and IMT were very generous
63610 +in the internet access they provided us, and in lots of other little
63611 +ways that a generous institution can be.
63612 +
63613 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
63614 +locking code, the block allocator, and finished the flushing code.
63615 +His code is always crystal clean and well structured.
63616 +
63617 +Nikita Danilov wrote the core of the balancing code, the core of the
63618 +plugins code, and the directory code. He worked at a steady pace of long
63619 +hours that produced a whole lot of well abstracted code. He is our
63620 +senior computer scientist.
63621 +
63622 +Vladimir Demidov wrote the parser. Writing an in-kernel parser is
63623 +something very few persons have the skills for, and it is thanks to
63624 +him that we can say that the parser is really not so big compared to
63625 +various bits of our other code, and making a parser work in the kernel
63626 +was not as complicated as everyone would imagine, mainly because it was
63627 +him doing it...
63628 +
63629 +Joshua McDonald wrote the transaction manager, and the flush code.
63630 +The flush code unexpectedly turned out to be extremely hairy for reasons
63631 +you can read about on our web page, and he did a great job on an
63632 +extremely difficult task.
63633 +
63634 +Nina Reiser handled our accounting, government relations, and much
63635 +more.
63636 +
63637 +Ramon Reiser developed our website.
63638 +
63639 +Beverly Palmer drew our graphics.
63640 +
63641 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
63642 +and worked with Umka on developing libreiser4 and userspace plugins.
63643 +
63644 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
63645 +userspace tools (reiser4progs).
63646 +
63647 +Oleg Drokin (aka Green) is the release manager who fixes everything.
63648 +It is so nice to have someone like that on the team. He (plus Chris
63649 +and Jeff) make it possible for the entire rest of the Namesys team to
63650 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
63651 +is just amazing to watch his talent for spotting bugs in action.
63652 +
63653 +Edward Shishkin wrote the cryptcompress file plugin (which manages files
63654 +built of encrypted and/or compressed bodies) and other plugins related
63655 +to transparent encryption and compression support.
63656 diff -urN linux-2.6.22.orig/fs/reiser4/reiser4.h linux-2.6.22/fs/reiser4/reiser4.h
63657 --- linux-2.6.22.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
63658 +++ linux-2.6.22/fs/reiser4/reiser4.h 2007-07-29 00:25:35.012731678 +0400
63659 @@ -0,0 +1,269 @@
63660 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63661 + * reiser4/README */
63662 +
63663 +/* definitions of common constants used by reiser4 */
63664 +
63665 +#if !defined( __REISER4_H__ )
63666 +#define __REISER4_H__
63667 +
63668 +#include <asm/param.h> /* for HZ */
63669 +#include <linux/errno.h>
63670 +#include <linux/types.h>
63671 +#include <linux/fs.h>
63672 +#include <linux/hardirq.h>
63673 +#include <linux/sched.h>
63674 +
63675 +/*
63676 + * reiser4 compilation options.
63677 + */
63678 +
63679 +#if defined(CONFIG_REISER4_DEBUG)
63680 +/* turn on assertion checks */
63681 +#define REISER4_DEBUG (1)
63682 +#else
63683 +#define REISER4_DEBUG (0)
63684 +#endif
63685 +
63686 +#if defined(CONFIG_ZLIB_INFLATE)
63687 +/* turn on zlib */
63688 +#define REISER4_ZLIB (1)
63689 +#else
63690 +#define REISER4_ZLIB (0)
63691 +#endif
63692 +
63693 +#if defined(CONFIG_CRYPTO_SHA256)
63694 +#define REISER4_SHA256 (1)
63695 +#else
63696 +#define REISER4_SHA256 (0)
63697 +#endif
63698 +
63699 +/*
63700 + * Turn on large keys mode. In this mode (which is the default), a reiser4 key
63701 + * has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
63702 + * components. The additional component, referred to as "ordering", is used to
63703 + * order the items of which a given object is composed. As such, ordering is
63704 + * placed between locality and objectid. For a directory item, ordering contains
63705 + * an initial prefix of the file name this item is for. This sorts all directory
63706 + * items within a given directory lexicographically (but see
63707 + * fibration.[ch]). For file bodies and stat-data, ordering contains an initial
63708 + * prefix of the name the file was initially created with. In the common case
63709 + * (files with a single name) this orders file bodies and stat-data items in
63710 + * the same order as their respective directory entries, thus speeding up
63711 + * readdir.
63712 + *
63713 + * Note that the kernel can only mount a file system with the same key size as
63714 + * the one it was compiled for, so flipping this option may render your data
63715 + * inaccessible.
63716 + */
63717 +#define REISER4_LARGE_KEY (1)
63718 +/*#define REISER4_LARGE_KEY (0)*/
63719 +
63720 +/*#define GUESS_EXISTS 1*/
63721 +
63722 +/*
63723 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
63724 + * option
63725 + */
63726 +
63727 +extern const char *REISER4_SUPER_MAGIC_STRING;
63728 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
63729 + * beginning of device */
63730 +
63731 +/* here go tunable parameters that are not worth special entry in kernel
63732 + configuration */
63733 +
63734 +/* default number of slots in coord-by-key caches */
63735 +#define CBK_CACHE_SLOTS (16)
63736 +/* how many elementary tree operations to carry on to the next level */
63737 +#define CARRIES_POOL_SIZE (5)
63738 +/* size of pool of preallocated nodes for carry process. */
63739 +#define NODES_LOCKED_POOL_SIZE (5)
63740 +
63741 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63742 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63743 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
63744 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
63745 +
63746 +/* we are supporting reservation of disk space on uid basis */
63747 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
63748 +/* we are supporting reservation of disk space for groups */
63749 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
63750 +/* we are supporting reservation of disk space for root */
63751 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
63752 +/* we use rapid flush mode, see flush.c for comments. */
63753 +#define REISER4_USE_RAPID_FLUSH (1)
63754 +
63755 +/*
63756 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
63757 + */
63758 +#define REISER4_USE_ENTD (1)
63759 +
63760 +/* key allocation is Plan-A */
63761 +#define REISER4_PLANA_KEY_ALLOCATION (1)
63762 +/* key allocation follows good old 3.x scheme */
63763 +#define REISER4_3_5_KEY_ALLOCATION (0)
63764 +
63765 +/* size of hash-table for znodes */
63766 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
63767 +
63768 +/* number of buckets in lnode hash-table */
63769 +#define LNODE_HTABLE_BUCKETS (1024)
63770 +
63771 +/* some ridiculously high maximal limit on the height of the znode tree. This
63772 +   is used in the declaration of various per-level arrays and
63773 +   to allocate a statistics gathering array for per-level stats. */
63774 +#define REISER4_MAX_ZTREE_HEIGHT (8)
63775 +
63776 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
63777 +
63778 +/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
63779 +   sequential search is on average faster than binary search. This is because
63780 +   of better optimization and because sequential search is more CPU
63781 +   cache friendly. This number (25) was found by experiments on a dual AMD
63782 +   Athlon(tm), 1400MHz.
63783 +
63784 +   NOTE: testing in the kernel has shown that binary search is more effective
63785 +   than implied by the results of the user-level benchmarking, probably because
63786 +   in a node keys are separated by other data. So the value was adjusted after
63787 +   a few tests. More thorough tuning is needed.
63788 +*/
63789 +#define REISER4_SEQ_SEARCH_BREAK (3)
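+
+/* Editorial sketch of how such a break-even constant is typically applied;
+   the item array and helper names below are hypothetical, not reiser4
+   functions:
+
+	if (num_items < REISER4_SEQ_SEARCH_BREAK)
+		pos = sequential_search(items, num_items, key);
+	else
+		pos = binary_search(items, num_items, key);
+*/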
63790 +
63791 +/* don't allow tree to be lower than this */
63792 +#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
63793 +
63794 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
63795 + * available memory. */
63796 +/* Default value of maximal atom size. Can be overwritten by
63797 + tmgr.atom_max_size mount option. By default infinity. */
63798 +#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
63799 +
63800 +/* Default value of maximal atom age (in jiffies). After reaching this age
63801 + atom will be forced to commit, either synchronously or asynchronously. Can
63802 + be overwritten by tmgr.atom_max_age mount option. */
63803 +#define REISER4_ATOM_MAX_AGE (600 * HZ)
63804 +
63805 +/* sleeping period for ktxnmrgd */
63806 +#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
63807 +
63808 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
63809 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
63810 +
63811 +/* start complaining after that many restarts in coord_by_key().
63812 +
63813 + This either means incredibly heavy contention for this part of a tree, or
63814 + some corruption or bug.
63815 +*/
63816 +#define REISER4_CBK_ITERATIONS_LIMIT (100)
63817 +
63818 +/* return -EIO after that many iterations in coord_by_key().
63819 +
63820 + I have witnessed more than 800 iterations (in 30 thread test) before cbk
63821 + finished. --nikita
63822 +*/
63823 +#define REISER4_MAX_CBK_ITERATIONS 500000
63824 +
63825 +/* put a per-inode limit on maximal number of directory entries with identical
63826 + keys in hashed directory.
63827 +
63828 + Disable this until inheritance interfaces stabilize: we need some way to
63829 + set per directory limit.
63830 +*/
63831 +#define REISER4_USE_COLLISION_LIMIT (0)
63832 +
63833 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
63834 + will force them to be relocated. */
63835 +#define FLUSH_RELOCATE_THRESHOLD 64
63836 +/* If flush can find a block allocation within FLUSH_RELOCATE_DISTANCE
63837 +   of the preceder, it will relocate to that position. */
63838 +#define FLUSH_RELOCATE_DISTANCE 64
63839 +
63840 +/* If we have written this many blocks or more before encountering a busy
63841 +   jnode in the flush list, abort flushing in the hope that by the next time we
63842 +   are called this jnode will already be clean, and we will save some seeks. */
63843 +#define FLUSH_WRITTEN_THRESHOLD 50
63844 +
63845 +/* The maximum number of nodes to scan left on a level during flush. */
63846 +#define FLUSH_SCAN_MAXNODES 10000
63847 +
63848 +/* per-atom limit of flushers */
63849 +#define ATOM_MAX_FLUSHERS (1)
63850 +
63851 +/* default tracing buffer size */
63852 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
63853 +
63854 +/* what size units of IO we would like cp, etc., to use, in writing to
63855 + reiser4. In bytes.
63856 +
63857 + Can be overwritten by optimal_io_size mount option.
63858 +*/
63859 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
63860 +
63861 +/* see comments in inode.c:oid_to_uino() */
63862 +#define REISER4_UINO_SHIFT (1 << 30)
63863 +
63864 +/* Mark function argument as unused to avoid compiler warnings. */
63865 +#define UNUSED_ARG __attribute__((unused))
63866 +
63867 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
63868 +#define NONNULL __attribute__((nonnull))
63869 +#else
63870 +#define NONNULL
63871 +#endif
63872 +
63873 +/* master super block offset in bytes.*/
63874 +#define REISER4_MASTER_OFFSET 65536
63875 +
63876 +/* size of VFS block */
63877 +#define VFS_BLKSIZE 512
63878 +/* number of bits in size of VFS block (512==2^9) */
63879 +#define VFS_BLKSIZE_BITS 9
63880 +
63881 +#define REISER4_I reiser4_inode_data
63882 +
63883 +/* implication */
63884 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
63885 +/* logical equivalence */
63886 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
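+
+/* Example (editorial): ergo() reads as "if p then q" and is convenient in
+   assertions, e.g. the check used by coord_by_key() in search.c:
+
+	assert("nikita-3247", ergo(!IS_CBKERR(result), coord->node == lh->node));
+
+   equi(p1, p2) holds when p1 and p2 are both true or both false. */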
63887 +
63888 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
63889 +
63890 +#define NOT_YET (0)
63891 +
63892 +/** Reiser4 specific error codes **/
63893 +
63894 +#define REISER4_ERROR_CODE_BASE 500
63895 +
63896 +/* Neighbor is not available (side neighbor or parent) */
63897 +#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
63898 +
63899 +/* Node was not found in cache */
63900 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
63901 +
63902 +/* node does not have enough free space for the balancing operation to complete */
63903 +#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
63904 +
63905 +/* repeat operation */
63906 +#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
63907 +
63908 +/* deadlock happens */
63909 +#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
63910 +
63911 +/* operation cannot be performed, because it would block and non-blocking mode
63912 + * was requested. */
63913 +#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
63914 +
63915 +/* wait some event (depends on context), then repeat */
63916 +#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
63917 +
63918 +#endif /* __REISER4_H__ */
63919 +
63920 +/* Make Linus happy.
63921 + Local variables:
63922 + c-indentation-style: "K&R"
63923 + mode-name: "LC"
63924 + c-basic-offset: 8
63925 + tab-width: 8
63926 + fill-column: 120
63927 + End:
63928 +*/
63929 diff -urN linux-2.6.22.orig/fs/reiser4/safe_link.c linux-2.6.22/fs/reiser4/safe_link.c
63930 --- linux-2.6.22.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
63931 +++ linux-2.6.22/fs/reiser4/safe_link.c 2007-07-29 00:25:35.012731678 +0400
63932 @@ -0,0 +1,352 @@
63933 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
63934 + * reiser4/README */
63935 +
63936 +/* Safe-links. */
63937 +
63938 +/*
63939 + * Safe-links are used to maintain file system consistency during operations
63940 + * that spawn multiple transactions. For example:
63941 + *
63942 + * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
63943 + * without user-visible names in the file system, but still opened by some
63944 + * active process. What happens here is that unlink proper (i.e., removal
63945 + * of the last file name) and file deletion (truncate of file body to zero
63946 + * and deletion of stat-data, that happens when last file descriptor is
63947 + * closed), may belong to different transactions T1 and T2. If a crash
63948 + * happens after T1 commit, but before T2 commit, on-disk file system has
63949 + * a file without name, that is, disk space leak.
63950 + *
63951 + * 2. Truncate. Truncating a large file may spawn multiple transactions. If
63952 + * the system crashes while the truncate was in progress, the file is left
63953 + * partially truncated, which violates the "atomicity guarantees" of reiser4,
63954 + * viz. that every system call is atomic.
63955 + *
63956 + * Safe-links address both cases above. Basically, a safe-link is a way to
63957 + * post some operation to be executed during the commit of some transaction
63958 + * other than the current one. (Another way to look at a safe-link is to
63959 + * interpret it as logical logging.)
63960 + *
63961 + * Specifically, at the beginning of unlink a safe-link is inserted into the
63962 + * tree. This safe-link is normally removed by the file deletion code (during
63963 + * transaction T2 in the above terms). Truncate also inserts a safe-link that
63964 + * is normally removed when the truncate operation finishes.
63965 + *
63966 + * This means, that in the case of "clean umount" there are no safe-links in
63967 + * the tree. If safe-links are observed during mount, it means that (a) the
63968 + * system was terminated abnormally, and (b) the safe-links correspond to
63969 + * "pending" (i.e., not finished) operations that were in progress during
63970 + * system termination. Each safe-link records enough information to complete
63971 + * the corresponding operation, and mount simply "replays" them (hence the
63972 + * analogy with logical logging).
63973 + *
63974 + * Safe-links are implemented as blackbox items (see
63975 + * plugin/item/blackbox.[ch]).
63976 + *
63977 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
63978 + * list" there.
63979 + */
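+
+/*
+ * Editorial sketch of the intended call pattern for a truncate, using only
+ * the API declared in safe_link.h; error handling and the surrounding
+ * truncate code itself are omitted:
+ *
+ *	tree = reiser4_tree_by_inode(inode);
+ *	if (safe_link_grab(tree, BA_CAN_COMMIT) == 0) {
+ *		safe_link_add(inode, SAFE_TRUNCATE);	(in transaction T1)
+ *		safe_link_release(tree);
+ *	}
+ *	(... truncate proceeds, possibly across several transactions ...)
+ *	safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);  (in T2)
+ */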
63980 +
63981 +#include "safe_link.h"
63982 +#include "debug.h"
63983 +#include "inode.h"
63984 +
63985 +#include "plugin/item/blackbox.h"
63986 +
63987 +#include <linux/fs.h>
63988 +
63989 +/*
63990 + * On-disk format of safe-link.
63991 + */
63992 +typedef struct safelink {
63993 + reiser4_key sdkey; /* key of stat-data for the file safe-link is
63994 + * for */
63995 + d64 size; /* size to which file should be truncated */
63996 +} safelink_t;
63997 +
63998 +/*
63999 + * locality where safe-link items are stored. Next to the objectid of root
64000 + * directory.
64001 + */
64002 +static oid_t safe_link_locality(reiser4_tree * tree)
64003 +{
64004 + return get_key_objectid(get_super_private(tree->super)->df_plug->
64005 + root_dir_key(tree->super)) + 1;
64006 +}
64007 +
64008 +/*
64009 + Construct a key for the safe-link. Key has the following format:
64010 +
64011 +| 60 | 4 | 64 | 4 | 60 | 64 |
64012 ++---------------+---+------------------+---+---------------+------------------+
64013 +| locality | 0 | 0 | 0 | objectid | link type |
64014 ++---------------+---+------------------+---+---------------+------------------+
64015 +| | | | |
64016 +| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64017 +
64018 + This is in large keys format. In small keys format second 8 byte chunk is
64019 + out. Locality is a constant returned by safe_link_locality(). objectid is
64020 + an oid of a file on which operation protected by this safe-link is
64021 + performed. link-type is used to distinguish safe-links for different
64022 + operations.
64023 +
64024 + */
64025 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64026 + reiser4_safe_link_t link, reiser4_key * key)
64027 +{
64028 + reiser4_key_init(key);
64029 + set_key_locality(key, safe_link_locality(tree));
64030 + set_key_objectid(key, oid);
64031 + set_key_offset(key, link);
64032 + return key;
64033 +}
64034 +
64035 +/*
64036 + * how much disk space is necessary to insert and remove (in the
64037 + * error-handling path) safe-link.
64038 + */
64039 +static __u64 safe_link_tograb(reiser4_tree * tree)
64040 +{
64041 + return
64042 + /* insert safe link */
64043 + estimate_one_insert_item(tree) +
64044 + /* remove safe link */
64045 + estimate_one_item_removal(tree) +
64046 + /* drill to the leaf level during insertion */
64047 + 1 + estimate_one_insert_item(tree) +
64048 + /*
64049 + * possible update of existing safe-link. Actually, if
64050 + * safe-link existed already (we failed to remove it), then no
64051 + * insertion is necessary, so this term is already "covered",
64052 +	 * but for simplicity let's leave it.
64053 + */
64054 + 1;
64055 +}
64056 +
64057 +/*
64058 + * grab enough disk space to insert and remove (in the error-handling path)
64059 + * safe-link.
64060 + */
64061 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64062 +{
64063 + int result;
64064 +
64065 + grab_space_enable();
64066 + /* The sbinfo->delete_mutex can be taken here.
64067 + * safe_link_release() should be called before leaving reiser4
64068 + * context. */
64069 + result =
64070 + reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64071 + grab_space_enable();
64072 + return result;
64073 +}
64074 +
64075 +/*
64076 + * release unused disk space reserved by safe_link_grab().
64077 + */
64078 +void safe_link_release(reiser4_tree * tree)
64079 +{
64080 + reiser4_release_reserved(tree->super);
64081 +}
64082 +
64083 +/*
64084 + * insert into tree safe-link for operation @link on inode @inode.
64085 + */
64086 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64087 +{
64088 + reiser4_key key;
64089 + safelink_t sl;
64090 + int length;
64091 + int result;
64092 + reiser4_tree *tree;
64093 +
64094 + build_sd_key(inode, &sl.sdkey);
64095 + length = sizeof sl.sdkey;
64096 +
64097 + if (link == SAFE_TRUNCATE) {
64098 + /*
64099 + * for truncate we have to store final file length also,
64100 + * expand item.
64101 + */
64102 + length += sizeof(sl.size);
64103 + put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64104 + }
64105 + tree = reiser4_tree_by_inode(inode);
64106 + build_link_key(tree, get_inode_oid(inode), link, &key);
64107 +
64108 + result = store_black_box(tree, &key, &sl, length);
64109 + if (result == -EEXIST)
64110 + result = update_black_box(tree, &key, &sl, length);
64111 + return result;
64112 +}
64113 +
64114 +/*
64115 + * remove safe-link corresponding to the operation @link on inode @inode from
64116 + * the tree.
64117 + */
64118 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64119 +{
64120 + reiser4_key key;
64121 +
64122 + return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64123 +}
64124 +
64125 +/*
64126 + * in-memory structure to keep information extracted from safe-link. This is
64127 + * used to iterate over all safe-links.
64128 + */
64129 +struct safe_link_context {
64130 + reiser4_tree *tree; /* internal tree */
64131 + reiser4_key key; /* safe-link key */
64132 + reiser4_key sdkey; /* key of object stat-data */
64133 + reiser4_safe_link_t link; /* safe-link type */
64134 + oid_t oid; /* object oid */
64135 + __u64 size; /* final size for truncate */
64136 +};
64137 +
64138 +/*
64139 + * start iterating over all safe-links.
64140 + */
64141 +static void safe_link_iter_begin(reiser4_tree * tree,
64142 + struct safe_link_context * ctx)
64143 +{
64144 + ctx->tree = tree;
64145 + reiser4_key_init(&ctx->key);
64146 + set_key_locality(&ctx->key, safe_link_locality(tree));
64147 + set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
64148 + set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
64149 +}
64150 +
64151 +/*
64152 + * return next safe-link.
64153 + */
64154 +static int safe_link_iter_next(struct safe_link_context * ctx)
64155 +{
64156 + int result;
64157 + safelink_t sl;
64158 +
64159 + result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64160 + if (result == 0) {
64161 + ctx->oid = get_key_objectid(&ctx->key);
64162 + ctx->link = get_key_offset(&ctx->key);
64163 + ctx->sdkey = sl.sdkey;
64164 + if (ctx->link == SAFE_TRUNCATE)
64165 + ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64166 + }
64167 + return result;
64168 +}
64169 +
64170 +/*
64171 + * check whether there are any more safe-links left in the tree.
64172 + */
64173 +static int safe_link_iter_finished(struct safe_link_context * ctx)
64174 +{
64175 + return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64176 +}
64177 +
64178 +/*
64179 + * finish safe-link iteration.
64180 + */
64181 +static void safe_link_iter_end(struct safe_link_context * ctx)
64182 +{
64183 + /* nothing special */
64184 +}
64185 +
64186 +/*
64187 + * process single safe-link.
64188 + */
64189 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64190 + reiser4_key * sdkey, oid_t oid, __u64 size)
64191 +{
64192 + struct inode *inode;
64193 + int result;
64194 +
64195 + /*
64196 + * obtain object inode by reiser4_iget(), then call object plugin
64197 + * ->safelink() method to do actual work, then delete safe-link on
64198 + * success.
64199 + */
64200 + inode = reiser4_iget(super, sdkey, 1);
64201 + if (!IS_ERR(inode)) {
64202 + file_plugin *fplug;
64203 +
64204 + fplug = inode_file_plugin(inode);
64205 + assert("nikita-3428", fplug != NULL);
64206 + assert("", oid == get_inode_oid(inode));
64207 + if (fplug->safelink != NULL) {
64208 + /* reiser4_txn_restart_current is not necessary because
64209 +			 * mounting is single-threaded. However, without it
64210 + * deadlock detection code will complain (see
64211 + * nikita-3361). */
64212 + reiser4_txn_restart_current();
64213 + result = fplug->safelink(inode, link, size);
64214 + } else {
64215 + warning("nikita-3430",
64216 + "Cannot handle safelink for %lli",
64217 + (unsigned long long)oid);
64218 + reiser4_print_key("key", sdkey);
64219 + result = 0;
64220 + }
64221 + if (result != 0) {
64222 + warning("nikita-3431",
64223 + "Error processing safelink for %lli: %i",
64224 + (unsigned long long)oid, result);
64225 + }
64226 + reiser4_iget_complete(inode);
64227 + iput(inode);
64228 + if (result == 0) {
64229 + result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
64230 + if (result == 0)
64231 + result =
64232 + safe_link_del(reiser4_get_tree(super), oid, link);
64233 + safe_link_release(reiser4_get_tree(super));
64234 + /*
64235 + * restart transaction: if there was large number of
64236 + * safe-links, their processing may fail to fit into
64237 + * single transaction.
64238 + */
64239 + if (result == 0)
64240 + reiser4_txn_restart_current();
64241 + }
64242 + } else
64243 + result = PTR_ERR(inode);
64244 + return result;
64245 +}
64246 +
64247 +/*
64248 + * iterate over all safe-links in the file-system processing them one by one.
64249 + */
64250 +int process_safelinks(struct super_block *super)
64251 +{
64252 + struct safe_link_context ctx;
64253 + int result;
64254 +
64255 + if (rofs_super(super))
64256 + /* do nothing on the read-only file system */
64257 + return 0;
64258 + safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64259 + result = 0;
64260 + do {
64261 + result = safe_link_iter_next(&ctx);
64262 + if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64263 + result = 0;
64264 + break;
64265 + }
64266 + if (result == 0)
64267 + result = process_safelink(super, ctx.link,
64268 + &ctx.sdkey, ctx.oid,
64269 + ctx.size);
64270 + } while (result == 0);
64271 + safe_link_iter_end(&ctx);
64272 + return result;
64273 +}
64274 +
64275 +/* Make Linus happy.
64276 + Local variables:
64277 + c-indentation-style: "K&R"
64278 + mode-name: "LC"
64279 + c-basic-offset: 8
64280 + tab-width: 8
64281 + fill-column: 120
64282 + scroll-step: 1
64283 + End:
64284 +*/
64285 diff -urN linux-2.6.22.orig/fs/reiser4/safe_link.h linux-2.6.22/fs/reiser4/safe_link.h
64286 --- linux-2.6.22.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
64287 +++ linux-2.6.22/fs/reiser4/safe_link.h 2007-07-29 00:25:35.012731678 +0400
64288 @@ -0,0 +1,29 @@
64289 +/* Copyright 2003 by Hans Reiser, licensing governed by
64290 + * reiser4/README */
64291 +
64292 +/* Safe-links. See safe_link.c for details. */
64293 +
64294 +#if !defined( __FS_SAFE_LINK_H__ )
64295 +#define __FS_SAFE_LINK_H__
64296 +
64297 +#include "tree.h"
64298 +
64299 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64300 +void safe_link_release(reiser4_tree * tree);
64301 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64302 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64303 +
64304 +int process_safelinks(struct super_block *super);
64305 +
64306 +/* __FS_SAFE_LINK_H__ */
64307 +#endif
64308 +
64309 +/* Make Linus happy.
64310 + Local variables:
64311 + c-indentation-style: "K&R"
64312 + mode-name: "LC"
64313 + c-basic-offset: 8
64314 + tab-width: 8
64315 + fill-column: 120
64316 + End:
64317 +*/
64318 diff -urN linux-2.6.22.orig/fs/reiser4/seal.c linux-2.6.22/fs/reiser4/seal.c
64319 --- linux-2.6.22.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
64320 +++ linux-2.6.22/fs/reiser4/seal.c 2007-07-29 00:25:35.012731678 +0400
64321 @@ -0,0 +1,218 @@
64322 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64323 +/* Seals implementation. */
64324 +/* Seals are "weak" tree pointers. They are analogous to tree coords in that
64325 +   they allow bypassing tree traversal. But normal usage of coords implies that
64326 +   the node pointed to by a coord is locked, whereas seals keep neither a lock
64327 +   on, nor even a reference to, the znode. Instead, each znode contains a
64328 +   version number, increased on each znode modification. This version number
64329 +   is copied into a seal when the seal is created. Later, one can "validate" a seal by calling
64330 + reiser4_seal_validate(). If znode is in cache and its version number is
64331 + still the same, seal is "pristine" and coord associated with it can be
64332 + re-used immediately.
64333 +
64334 + If, on the other hand, znode is out of cache, or it is obviously different
64335 + one from the znode seal was initially attached to (for example, it is on
64336 + the different level, or is being removed from the tree), seal is
64337 + irreparably invalid ("burned") and tree traversal has to be repeated.
64338 +
64339 + Otherwise, there is some hope, that while znode was modified (and seal was
64340 + "broken" as a result), key attached to the seal is still in the node. This
64341 + is checked by first comparing this key with delimiting keys of node and, if
64342 + key is ok, doing intra-node lookup.
64343 +
64344 + Znode version is maintained in the following way:
64345 +
64346 + there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
64347 + znode_epoch is incremented and its new value is stored in ->version field
64348 + of new znode. Whenever znode is dirtied (which means it was probably
64349 + modified), znode_epoch is also incremented and its new value is stored in
64350 +   znode->version. This is done because just incrementing znode->version
64351 +   on each update is not enough: it may happen that a znode gets deleted, a new
64352 +   znode is allocated for the same disk block and gets the same version
64353 +   counter, tricking the seal code into a false positive.
64354 +*/
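+
+/* Editorial sketch of typical seal usage, based on the declarations in
+   seal.h; the prior lookup, the variable names and the lock request value
+   are assumptions:
+
+	seal_t seal;
+	(coord and key obtained from an earlier tree lookup)
+	reiser4_seal_init(&seal, &coord, &key);
+	(later, instead of a full traversal)
+	if (reiser4_seal_validate(&seal, &coord, &key, &lh,
+				  ZNODE_READ_LOCK, request) == 0) {
+		(coord is valid again and the node is locked via lh)
+	}
+	(-E_REPEAT means: fall back to coord_by_key())
+*/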
64355 +
64356 +#include "forward.h"
64357 +#include "debug.h"
64358 +#include "key.h"
64359 +#include "coord.h"
64360 +#include "seal.h"
64361 +#include "plugin/item/item.h"
64362 +#include "plugin/node/node.h"
64363 +#include "jnode.h"
64364 +#include "znode.h"
64365 +#include "super.h"
64366 +
64367 +static znode *seal_node(const seal_t * seal);
64368 +static int seal_matches(const seal_t * seal, znode * node);
64369 +
64370 +/* initialise seal. This can be called several times on the same seal. @coord
64371 + and @key can be NULL. */
64372 +void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
64373 + const coord_t * coord /* coord @seal will be
64374 + * attached to */ ,
64375 + const reiser4_key * key UNUSED_ARG /* key @seal will be
64376 + * attached to */ )
64377 +{
64378 + assert("nikita-1886", seal != NULL);
64379 + memset(seal, 0, sizeof *seal);
64380 + if (coord != NULL) {
64381 + znode *node;
64382 +
64383 + node = coord->node;
64384 + assert("nikita-1987", node != NULL);
64385 + spin_lock_znode(node);
64386 + seal->version = node->version;
64387 + assert("nikita-1988", seal->version != 0);
64388 + seal->block = *znode_get_block(node);
64389 +#if REISER4_DEBUG
64390 + seal->coord1 = *coord;
64391 + if (key != NULL)
64392 + seal->key = *key;
64393 +#endif
64394 + spin_unlock_znode(node);
64395 + }
64396 +}
64397 +
64398 +/* finish with seal */
64399 +void reiser4_seal_done(seal_t * seal /* seal to clear */ )
64400 +{
64401 + assert("nikita-1887", seal != NULL);
64402 + seal->version = 0;
64403 +}
64404 +
64405 +/* true if seal was initialised */
64406 +int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
64407 +{
64408 + assert("nikita-1890", seal != NULL);
64409 + return seal->version != 0;
64410 +}
64411 +
64412 +#if REISER4_DEBUG
64413 +/* helper function for reiser4_seal_validate(). It checks that item at @coord
64414 + * has expected key. This is to detect cases where node was modified but wasn't
64415 + * marked dirty. */
64416 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
64417 + const reiser4_key * k /* expected key */ )
64418 +{
64419 + reiser4_key ukey;
64420 +
64421 + return (coord->between != AT_UNIT) ||
64422 + /* FIXME-VS: we only can compare keys for items whose units
64423 + represent exactly one key */
64424 + ((coord_is_existing_unit(coord))
64425 + && (item_is_extent(coord)
64426 + || keyeq(k, unit_key_by_coord(coord, &ukey))))
64427 + || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
64428 + && keyge(k, unit_key_by_coord(coord, &ukey)));
64429 +}
64430 +#endif
64431 +
64432 +/* this is used by reiser4_seal_validate. It accepts return value of
64433 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
64434 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
64435 + * reiser4_seal_validate returns -E_REPEAT and the caller will retry the tree search.
64436 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
64437 + * distinguish between -EINVAL and -E_REPEAT. */
64438 +static int should_repeat(int return_code)
64439 +{
64440 + return return_code == -EINVAL;
64441 +}
64442 +
64443 +/* (re-)validate seal.
64444 +
64445 +   Checks whether the seal is pristine, and tries to revalidate it if possible.
64446 +
64447 + If seal was burned, or broken irreparably, return -E_REPEAT.
64448 +
64449 +   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key we
64450 +   are looking for is in the range of keys covered by the sealed node, but the
64451 +   item wasn't found by the node's ->lookup() method. An alternative is to
64452 +   return -ENOENT in this case, but this would complicate the caller's logic.
64453 +
64454 +*/
64455 +int reiser4_seal_validate(seal_t * seal /* seal to validate */,
64456 + coord_t * coord /* coord to validate against */,
64457 + const reiser4_key * key /* key to validate against */,
64458 + lock_handle * lh /* resulting lock handle */,
64459 + znode_lock_mode mode /* lock node */,
64460 + znode_lock_request request /* locking priority */)
64461 +{
64462 + znode *node;
64463 + int result;
64464 +
64465 + assert("nikita-1889", seal != NULL);
64466 + assert("nikita-1881", reiser4_seal_is_set(seal));
64467 + assert("nikita-1882", key != NULL);
64468 + assert("nikita-1883", coord != NULL);
64469 + assert("nikita-1884", lh != NULL);
64470 + assert("nikita-1885", keyeq(&seal->key, key));
64471 + assert("nikita-1989", coords_equal(&seal->coord1, coord));
64472 +
64473 + /* obtain znode by block number */
64474 + node = seal_node(seal);
64475 + if (node != NULL) {
64476 + /* znode was in cache, lock it */
64477 + result = longterm_lock_znode(lh, node, mode, request);
64478 + zput(node);
64479 + if (result == 0) {
64480 + if (seal_matches(seal, node)) {
64481 + /* if seal version and znode version
64482 + coincide */
64483 + ON_DEBUG(coord_update_v(coord));
64484 + assert("nikita-1990",
64485 + node == seal->coord1.node);
64486 + assert("nikita-1898",
64487 + WITH_DATA_RET(coord->node, 1,
64488 + check_seal_match(coord,
64489 + key)));
64490 + } else
64491 + result = RETERR(-E_REPEAT);
64492 + }
64493 + if (result != 0) {
64494 + if (should_repeat(result))
64495 + result = RETERR(-E_REPEAT);
64496 + /* unlock node on failure */
64497 + done_lh(lh);
64498 + }
64499 + } else {
64500 + /* znode wasn't in cache */
64501 + result = RETERR(-E_REPEAT);
64502 + }
64503 + return result;
64504 +}
64505 +
64506 +/* helpers functions */
64507 +
64508 +/* obtain reference to znode seal points to, if in cache */
64509 +static znode *seal_node(const seal_t * seal /* seal to query */ )
64510 +{
64511 + assert("nikita-1891", seal != NULL);
64512 + return zlook(current_tree, &seal->block);
64513 +}
64514 +
64515 +/* true if @seal version and @node version coincide */
64516 +static int seal_matches(const seal_t * seal /* seal to check */ ,
64517 + znode * node /* node to check */ )
64518 +{
64519 + int result;
64520 +
64521 + assert("nikita-1991", seal != NULL);
64522 + assert("nikita-1993", node != NULL);
64523 +
64524 + spin_lock_znode(node);
64525 + result = (seal->version == node->version);
64526 + spin_unlock_znode(node);
64527 + return result;
64528 +}
64529 +
64530 +/* Make Linus happy.
64531 + Local variables:
64532 + c-indentation-style: "K&R"
64533 + mode-name: "LC"
64534 + c-basic-offset: 8
64535 + tab-width: 8
64536 + fill-column: 120
64537 + scroll-step: 1
64538 + End:
64539 +*/
64540 diff -urN linux-2.6.22.orig/fs/reiser4/seal.h linux-2.6.22/fs/reiser4/seal.h
64541 --- linux-2.6.22.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
64542 +++ linux-2.6.22/fs/reiser4/seal.h 2007-07-29 00:25:35.012731678 +0400
64543 @@ -0,0 +1,49 @@
64544 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64545 +
64546 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
64547 +
64548 +#ifndef __SEAL_H__
64549 +#define __SEAL_H__
64550 +
64551 +#include "forward.h"
64552 +#include "debug.h"
64553 +#include "dformat.h"
64554 +#include "key.h"
64555 +#include "coord.h"
64556 +
64557 +/* for __u?? types */
64558 +/*#include <linux/types.h>*/
64559 +
64560 +/* seal. See comment at the top of seal.c */
64561 +typedef struct seal_s {
64562 + /* version of znode recorder at the time of seal creation */
64563 + __u64 version;
64564 + /* block number of znode attached to this seal */
64565 + reiser4_block_nr block;
64566 +#if REISER4_DEBUG
64567 + /* coord this seal is attached to. For debugging. */
64568 + coord_t coord1;
64569 + /* key this seal is attached to. For debugging. */
64570 + reiser4_key key;
64571 +#endif
64572 +} seal_t;
64573 +
64574 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
64575 +extern void reiser4_seal_done(seal_t *);
64576 +extern int reiser4_seal_is_set(const seal_t *);
64577 +extern int reiser4_seal_validate(seal_t *, coord_t *,
64578 + const reiser4_key *, lock_handle *,
64579 + znode_lock_mode mode, znode_lock_request request);
64580 +
64581 +/* __SEAL_H__ */
64582 +#endif
64583 +
64584 +/* Make Linus happy.
64585 + Local variables:
64586 + c-indentation-style: "K&R"
64587 + mode-name: "LC"
64588 + c-basic-offset: 8
64589 + tab-width: 8
64590 + fill-column: 120
64591 + End:
64592 +*/
64593 diff -urN linux-2.6.22.orig/fs/reiser4/search.c linux-2.6.22/fs/reiser4/search.c
64594 --- linux-2.6.22.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
64595 +++ linux-2.6.22/fs/reiser4/search.c 2007-07-29 00:25:35.016732714 +0400
64596 @@ -0,0 +1,1611 @@
64597 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64598 + * reiser4/README */
64599 +
64600 +#include "forward.h"
64601 +#include "debug.h"
64602 +#include "dformat.h"
64603 +#include "key.h"
64604 +#include "coord.h"
64605 +#include "seal.h"
64606 +#include "plugin/item/item.h"
64607 +#include "plugin/node/node.h"
64608 +#include "plugin/plugin.h"
64609 +#include "jnode.h"
64610 +#include "znode.h"
64611 +#include "block_alloc.h"
64612 +#include "tree_walk.h"
64613 +#include "tree.h"
64614 +#include "reiser4.h"
64615 +#include "super.h"
64616 +#include "inode.h"
64617 +
64618 +#include <linux/slab.h>
64619 +
64620 +static const char *bias_name(lookup_bias bias);
64621 +
64622 +/* tree searching algorithm, intranode searching algorithms are in
64623 + plugin/node/ */
64624 +
64625 +/* tree lookup cache
64626 + *
64627 + * The coord-by-key cache consists of a small list of recently accessed nodes
64628 + * maintained according to the LRU discipline. Before doing a real top-to-down
64629 + * tree traversal this cache is scanned for nodes that can contain the
64630 + * requested key.
64631 + *
64632 + * The efficiency of the coord cache depends heavily on locality of reference
64633 + * for tree accesses. Our user-level simulations show reasonably good hit
64634 + * ratios for the coord cache under most loads so far.
64635 + */
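+
+/* Editorial sketch of the strategy described above (simplified; see
+   coord_by_handle() and traverse_tree() below for the real control flow):
+
+	if (cbk_cache_search(h) resolved the key from a cached node)
+		return that result;
+	return traverse_tree(h);	(full top-to-down descent)
+*/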
64636 +
64637 +/* Initialise coord cache slot */
64638 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
64639 +{
64640 + assert("nikita-345", slot != NULL);
64641 +
64642 + INIT_LIST_HEAD(&slot->lru);
64643 + slot->node = NULL;
64644 +}
64645 +
64646 +/* Initialize coord cache */
64647 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
64648 +{
64649 + int i;
64650 +
64651 + assert("nikita-346", cache != NULL);
64652 +
64653 + cache->slot =
64654 + kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
64655 + reiser4_ctx_gfp_mask_get());
64656 + if (cache->slot == NULL)
64657 + return RETERR(-ENOMEM);
64658 +
64659 + INIT_LIST_HEAD(&cache->lru);
64660 + for (i = 0; i < cache->nr_slots; ++i) {
64661 + cbk_cache_init_slot(cache->slot + i);
64662 + list_add_tail(&((cache->slot + i)->lru), &cache->lru);
64663 + }
64664 + rwlock_init(&cache->guard);
64665 + return 0;
64666 +}
64667 +
64668 +/* free cbk cache data */
64669 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
64670 +{
64671 + assert("nikita-2493", cache != NULL);
64672 + if (cache->slot != NULL) {
64673 + kfree(cache->slot);
64674 + cache->slot = NULL;
64675 + }
64676 +}
64677 +
64678 +/* macro to iterate over all cbk cache slots */
64679 +#define for_all_slots(cache, slot) \
64680 + for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
64681 + &(cache)->lru != &(slot)->lru; \
64682 + (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
64683 +
64684 +#if REISER4_DEBUG
64685 +/* this function assures that [cbk-cache-invariant] invariant holds */
64686 +static int cbk_cache_invariant(const cbk_cache *cache)
64687 +{
64688 + cbk_cache_slot *slot;
64689 + int result;
64690 + int unused;
64691 +
64692 + if (cache->nr_slots == 0)
64693 + return 1;
64694 +
64695 + assert("nikita-2469", cache != NULL);
64696 + unused = 0;
64697 + result = 1;
64698 + read_lock(&((cbk_cache *)cache)->guard);
64699 + for_all_slots(cache, slot) {
64700 + /* in LRU first go all `used' slots followed by `unused' */
64701 + if (unused && (slot->node != NULL))
64702 + result = 0;
64703 + if (slot->node == NULL)
64704 + unused = 1;
64705 + else {
64706 + cbk_cache_slot *scan;
64707 +
64708 + /* all cached nodes are different */
64709 + scan = slot;
64710 + while (result) {
64711 + scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
64712 + if (&cache->lru == &scan->lru)
64713 + break;
64714 + if (slot->node == scan->node)
64715 + result = 0;
64716 + }
64717 + }
64718 + if (!result)
64719 + break;
64720 + }
64721 + read_unlock(&((cbk_cache *)cache)->guard);
64722 + return result;
64723 +}
64724 +
64725 +#endif
64726 +
64727 +/* Remove references, if any, to @node from coord cache */
64728 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
64729 + reiser4_tree * tree /* tree to remove node from */ )
64730 +{
64731 + cbk_cache_slot *slot;
64732 + cbk_cache *cache;
64733 + int i;
64734 +
64735 + assert("nikita-350", node != NULL);
64736 + assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
64737 +
64738 + cache = &tree->cbk_cache;
64739 + assert("nikita-2470", cbk_cache_invariant(cache));
64740 +
64741 + write_lock(&(cache->guard));
64742 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
64743 + if (slot->node == node) {
64744 + list_move_tail(&slot->lru, &cache->lru);
64745 + slot->node = NULL;
64746 + break;
64747 + }
64748 + }
64749 + write_unlock(&(cache->guard));
64750 + assert("nikita-2471", cbk_cache_invariant(cache));
64751 +}
64752 +
64753 +/* add information about "node" to the cbk-cache of its tree. This
64754 +   can actually be an update of an existing slot in the cache. */
64755 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
64756 +{
64757 + cbk_cache *cache;
64758 + cbk_cache_slot *slot;
64759 + int i;
64760 +
64761 + assert("nikita-352", node != NULL);
64762 +
64763 + cache = &znode_get_tree(node)->cbk_cache;
64764 + assert("nikita-2472", cbk_cache_invariant(cache));
64765 +
64766 + if (cache->nr_slots == 0)
64767 + return;
64768 +
64769 + write_lock(&(cache->guard));
64770 + /* find slot to update/add */
64771 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
64772 + /* oops, this node is already in a cache */
64773 + if (slot->node == node)
64774 + break;
64775 + }
64776 + /* if all slots are used, reuse least recently used one */
64777 + if (i == cache->nr_slots) {
64778 + slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
64779 + slot->node = (znode *) node;
64780 + }
64781 + list_move(&slot->lru, &cache->lru);
64782 + write_unlock(&(cache->guard));
64783 + assert("nikita-2473", cbk_cache_invariant(cache));
64784 +}
64785 +
64786 +static int setup_delimiting_keys(cbk_handle * h);
64787 +static lookup_result coord_by_handle(cbk_handle * handle);
64788 +static lookup_result traverse_tree(cbk_handle * h);
64789 +static int cbk_cache_search(cbk_handle * h);
64790 +
64791 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
64792 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
64793 +
64794 +/* helper functions */
64795 +
64796 +static void update_stale_dk(reiser4_tree * tree, znode * node);
64797 +
64798 +/* release parent node during traversal */
64799 +static void put_parent(cbk_handle * h);
64800 +/* check consistency of fields */
64801 +static int sanity_check(cbk_handle * h);
64802 +/* release resources in handle */
64803 +static void hput(cbk_handle * h);
64804 +
64805 +static level_lookup_result search_to_left(cbk_handle * h);
64806 +
64807 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
64808 + * cbk_handle */
64809 +static cbk_handle *cbk_pack(cbk_handle * handle,
64810 + reiser4_tree * tree,
64811 + const reiser4_key * key,
64812 + coord_t * coord,
64813 + lock_handle * active_lh,
64814 + lock_handle * parent_lh,
64815 + znode_lock_mode lock_mode,
64816 + lookup_bias bias,
64817 + tree_level lock_level,
64818 + tree_level stop_level,
64819 + __u32 flags, ra_info_t * info)
64820 +{
64821 + memset(handle, 0, sizeof *handle);
64822 +
64823 + handle->tree = tree;
64824 + handle->key = key;
64825 + handle->lock_mode = lock_mode;
64826 + handle->bias = bias;
64827 + handle->lock_level = lock_level;
64828 + handle->stop_level = stop_level;
64829 + handle->coord = coord;
64830 + /* set flags. See comment in tree.h:cbk_flags */
64831 + handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
64832 +
64833 + handle->active_lh = active_lh;
64834 + handle->parent_lh = parent_lh;
64835 + handle->ra_info = info;
64836 + return handle;
64837 +}
64838 +
64839 +/* main tree lookup procedure
64840 +
64841 +   Check the coord cache. If the key we are looking for is not found there,
64842 +   fall back to a real tree traversal.
64843 +
64844 + As we have extents on the twig level, @lock_level and @stop_level can
64845 + be different from LEAF_LEVEL and each other.
64846 +
64847 + Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
64848 + long term locks) while calling this.
64849 +*/
64850 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
64851 + * in. Usually this tree is
64852 + * part of file-system
64853 + * super-block */ ,
64854 + const reiser4_key * key /* key to look for */ ,
64855 + coord_t * coord /* where to store found
64856 + * position in a tree. Fields
64857 + * in "coord" are only valid if
64858 + * coord_by_key() returned
64859 + * "CBK_COORD_FOUND" */ ,
64860 + lock_handle * lh, /* resulting lock handle */
64861 + znode_lock_mode lock_mode /* type of lookup we
64862 + * want on node. Pass
64863 + * ZNODE_READ_LOCK here
64864 + * if you only want to
64865 + * read item found and
64866 + * ZNODE_WRITE_LOCK if
64867 + * you want to modify
64868 + * it */ ,
64869 + lookup_bias bias /* what to return if coord
64870 + * with exactly the @key is
64871 + * not in the tree */ ,
64872 + tree_level lock_level /* tree level where to start
64873 + * taking @lock type of
64874 + * locks */ ,
64875 + tree_level stop_level /* tree level to stop. Pass
64876 + * LEAF_LEVEL or TWIG_LEVEL
64877 +					  * here. Item being looked
64878 + * for has to be between
64879 + * @lock_level and
64880 + * @stop_level, inclusive */ ,
64881 + __u32 flags /* search flags */ ,
64882 + ra_info_t *
64883 + info
64884 + /* information about desired tree traversal readahead */
64885 + )
64886 +{
64887 + cbk_handle handle;
64888 + lock_handle parent_lh;
64889 + lookup_result result;
64890 +
64891 + init_lh(lh);
64892 + init_lh(&parent_lh);
64893 +
64894 + assert("nikita-3023", reiser4_schedulable());
64895 +
64896 + assert("nikita-353", tree != NULL);
64897 + assert("nikita-354", key != NULL);
64898 + assert("nikita-355", coord != NULL);
64899 + assert("nikita-356", (bias == FIND_EXACT)
64900 + || (bias == FIND_MAX_NOT_MORE_THAN));
64901 + assert("nikita-357", stop_level >= LEAF_LEVEL);
64902 + /* no locks can be held during tree traversal */
64903 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
64904 +
64905 + cbk_pack(&handle,
64906 + tree,
64907 + key,
64908 + coord,
64909 + lh,
64910 + &parent_lh,
64911 + lock_mode, bias, lock_level, stop_level, flags, info);
64912 +
64913 + result = coord_by_handle(&handle);
64914 + assert("nikita-3247",
64915 + ergo(!IS_CBKERR(result), coord->node == lh->node));
64916 + return result;
64917 +}
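
For reference, a typical call might be shaped as follows. This is only a sketch against the declarations above (example_lookup() is hypothetical); coord_by_key() runs init_lh() on the handle itself, and the handle should be released with done_lh() on every path:

/* hypothetical caller: read-lock lookup of an exact key at the leaf level */
static int example_lookup(reiser4_tree * tree, const reiser4_key * key)
{
	coord_t coord;
	lock_handle lh;		/* initialized inside coord_by_key() */
	lookup_result ret;

	ret = coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK,
			   FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
			   CBK_UNIQUE, NULL /* no readahead info */);
	if (ret == CBK_COORD_FOUND) {
		/* coord.node == lh.node here; access the item between
		   zload(coord.node) and zrelse(coord.node) */
	}
	done_lh(&lh);		/* assumed safe whether or not it succeeded */
	return ret;
}
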
64918 +
64919 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
64920 + * from tree root. */
64921 +lookup_result reiser4_object_lookup(struct inode * object,
64922 + const reiser4_key * key,
64923 + coord_t * coord,
64924 + lock_handle * lh,
64925 + znode_lock_mode lock_mode,
64926 + lookup_bias bias,
64927 + tree_level lock_level,
64928 + tree_level stop_level, __u32 flags,
64929 + ra_info_t * info)
64930 +{
64931 + cbk_handle handle;
64932 + lock_handle parent_lh;
64933 + lookup_result result;
64934 +
64935 + init_lh(lh);
64936 + init_lh(&parent_lh);
64937 +
64938 + assert("nikita-3023", reiser4_schedulable());
64939 +
64940 + assert("nikita-354", key != NULL);
64941 + assert("nikita-355", coord != NULL);
64942 + assert("nikita-356", (bias == FIND_EXACT)
64943 + || (bias == FIND_MAX_NOT_MORE_THAN));
64944 + assert("nikita-357", stop_level >= LEAF_LEVEL);
64945 + /* no locks can be held during tree search by key */
64946 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
64947 +
64948 + cbk_pack(&handle,
64949 + object != NULL ? reiser4_tree_by_inode(object) : current_tree,
64950 + key,
64951 + coord,
64952 + lh,
64953 + &parent_lh,
64954 + lock_mode, bias, lock_level, stop_level, flags, info);
64955 + handle.object = object;
64956 +
64957 + result = coord_by_handle(&handle);
64958 + assert("nikita-3247",
64959 + ergo(!IS_CBKERR(result), coord->node == lh->node));
64960 + return result;
64961 +}
64962 +
64963 +/* lookup by cbk_handle. Common part of coord_by_key() and
64964 + reiser4_object_lookup(). */
64965 +static lookup_result coord_by_handle(cbk_handle * handle)
64966 +{
64967 + /*
64968 +	 * first check the cbk_cache (which is a look-aside cache for our tree) and,
64969 +	 * if that fails, start a traversal.
64970 + */
64971 + /* first check whether "key" is in cache of recent lookups. */
64972 + if (cbk_cache_search(handle) == 0)
64973 + return handle->result;
64974 + else
64975 + return traverse_tree(handle);
64976 +}
64977 +
64978 +/* Execute actor for each item (or unit, depending on @through_units_p),
64979 + starting from @coord, right-ward, until either:
64980 +
64981 + - end of the tree is reached
64982 + - unformatted node is met
64983 + - error occurred
64984 + - @actor returns 0 or less
64985 +
64986 + Error code, or last actor return value is returned.
64987 +
64988 +   This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move through
64989 +   a sequence of entries with identical keys, and the like.
64990 +*/
64991 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
64992 + coord_t * coord /* coord to start from */ ,
64993 + lock_handle * lh /* lock handle to start with and to
64994 + * update along the way */ ,
64995 + tree_iterate_actor_t actor /* function to call on each
64996 + * item/unit */ ,
64997 + void *arg /* argument to pass to @actor */ ,
64998 + znode_lock_mode mode /* lock mode on scanned nodes */ ,
64999 + int through_units_p /* call @actor on each item or on
65000 + * each unit */ )
65001 +{
65002 + int result;
65003 +
65004 + assert("nikita-1143", tree != NULL);
65005 + assert("nikita-1145", coord != NULL);
65006 + assert("nikita-1146", lh != NULL);
65007 + assert("nikita-1147", actor != NULL);
65008 +
65009 + result = zload(coord->node);
65010 + coord_clear_iplug(coord);
65011 + if (result != 0)
65012 + return result;
65013 + if (!coord_is_existing_unit(coord)) {
65014 + zrelse(coord->node);
65015 + return -ENOENT;
65016 + }
65017 + while ((result = actor(tree, coord, lh, arg)) > 0) {
65018 + /* move further */
65019 + if ((through_units_p && coord_next_unit(coord)) ||
65020 + (!through_units_p && coord_next_item(coord))) {
65021 + do {
65022 + lock_handle couple;
65023 +
65024 + /* move to the next node */
65025 + init_lh(&couple);
65026 + result =
65027 + reiser4_get_right_neighbor(&couple,
65028 + coord->node,
65029 + (int)mode,
65030 + GN_CAN_USE_UPPER_LEVELS);
65031 + zrelse(coord->node);
65032 + if (result == 0) {
65033 +
65034 + result = zload(couple.node);
65035 + if (result != 0) {
65036 + done_lh(&couple);
65037 + return result;
65038 + }
65039 +
65040 + coord_init_first_unit(coord,
65041 + couple.node);
65042 + done_lh(lh);
65043 + move_lh(lh, &couple);
65044 + } else
65045 + return result;
65046 + } while (node_is_empty(coord->node));
65047 + }
65048 +
65049 + assert("nikita-1149", coord_is_existing_unit(coord));
65050 + }
65051 + zrelse(coord->node);
65052 + return result;
65053 +}
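
To restate the actor contract from the comment above: a positive return value continues the scan, zero or a negative value stops it, and the last actor return value (or a tree-walk error) becomes the return value of reiser4_iterate_tree(). A hypothetical actor that counts units below a key bound, using only helpers visible in this file:

/* hypothetical actor and its state; not part of the patch */
struct count_state {
	reiser4_key bound;
	long count;
};

static int count_below_actor(reiser4_tree * tree, coord_t * coord,
			     lock_handle * lh, void *arg)
{
	struct count_state *state = arg;
	reiser4_key key;

	unit_key_by_coord(coord, &key);
	if (keyge(&key, &state->bound))
		return 0;	/* stop; zero is propagated to the caller */
	state->count++;
	return 1;		/* positive: continue with the next unit */
}
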
65054 +
65055 +/* return locked uber znode for @tree */
65056 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65057 + znode_lock_request pri, lock_handle * lh)
65058 +{
65059 + int result;
65060 +
65061 + result = longterm_lock_znode(lh, tree->uber, mode, pri);
65062 + return result;
65063 +}
65064 +
65065 +/* true if @key is strictly within @node
65066 +
65067 +   we are looking for a possibly non-unique key, and the item is at the edge
65068 +   of @node. Maybe it is in the neighbor.
65069 +*/
65070 +static int znode_contains_key_strict(znode * node /* node to check key
65071 + * against */ ,
65072 + const reiser4_key *
65073 + key /* key to check */ ,
65074 + int isunique)
65075 +{
65076 + int answer;
65077 +
65078 + assert("nikita-1760", node != NULL);
65079 + assert("nikita-1722", key != NULL);
65080 +
65081 + if (keyge(key, &node->rd_key))
65082 + return 0;
65083 +
65084 + answer = keycmp(&node->ld_key, key);
65085 +
65086 + if (isunique)
65087 + return answer != GREATER_THAN;
65088 + else
65089 + return answer == LESS_THAN;
65090 +}
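
Put differently: for a node spanning delimiting keys [ld, rd), the strict test accepts ld <= key < rd when keys are unique, but insists on ld < key < rd otherwise, because an item whose key equals ld may continue a run of duplicates that starts in the left neighbor. The same logic over plain integers, as a self-contained illustration:

/* integer model of znode_contains_key_strict(); node spans [ld, rd) */
static int contains_strict(int ld, int rd, int key, int isunique)
{
	if (key >= rd)			/* cf. keyge(key, &node->rd_key) */
		return 0;
	return isunique ? key >= ld	/* answer != GREATER_THAN */
			: key > ld;	/* answer == LESS_THAN */
}

/* contains_strict(100, 200, 100, 1) == 1   left edge ok for unique keys
 * contains_strict(100, 200, 100, 0) == 0   duplicates may extend left
 * contains_strict(100, 200, 150, 0) == 1   strictly inside
 * contains_strict(100, 200, 200, 1) == 0   right edge always excluded */
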
65091 +
65092 +/*
65093 + * Virtual Root (vroot) code.
65094 + *
65095 + * For a given file system object (e.g., regular file or directory) let's
65096 + * define its "virtual root" as the lowest node in the tree (that is, the
65097 + * furthest from the tree root) such that all body items of said object are
65098 + * located in a tree rooted at this node.
65099 + *
65100 + * Once the vroot of an object is found, all tree lookups for items within
65101 + * the body of this object ("object lookups") can be started from its vroot
65102 + * rather than from the real root. This has the following advantages:
65103 + *
65104 + * 1. amount of nodes traversed during lookup (and, hence, amount of
65105 + * key comparisons made) decreases, and
65106 + *
65107 + *     2. contention on the tree root is decreased. The latter was actually
65108 + *     the motivating reason behind vroot: the spin lock of the root node,
65109 + *     which is taken when acquiring a long-term lock on the root, is the
65110 + *     hottest lock in reiser4.
65111 + *
65112 + * How to find vroot.
65113 + *
65114 + * When vroot of object F is not yet determined, all object lookups start
65115 + * from the root of the tree. At each tree level during traversal we have
65116 + * a node N such that a key we are looking for (which is the key inside
65117 + * object's body) is located within N. In function handle_vroot() called
65118 + * from cbk_level_lookup() we check whether N is possible vroot for
65119 + * F. Check is trivial---if neither leftmost nor rightmost item of N
65120 + * belongs to F (and we already have helpful ->owns_item() method of
65121 + * object plugin for this), then N is possible vroot of F. This, of
65122 + * course, relies on the assumption that each object occupies contiguous
65123 + * range of keys in the tree.
65124 + *
65125 + * Thus, traversing tree downward and checking each node as we go, we can
65126 + * find lowest such node, which, by definition, is vroot.
65127 + *
65128 + * How to track vroot.
65129 + *
65130 + * Nohow. If actual vroot changes, next object lookup will just restart
65131 + * from the actual tree root, refreshing object's vroot along the way.
65132 + *
65133 + */
65134 +
65135 +/*
65136 + * Check whether @node is possible vroot of @object.
65137 + */
65138 +static void handle_vroot(struct inode *object, znode * node)
65139 +{
65140 + file_plugin *fplug;
65141 + coord_t coord;
65142 +
65143 + fplug = inode_file_plugin(object);
65144 + assert("nikita-3353", fplug != NULL);
65145 + assert("nikita-3354", fplug->owns_item != NULL);
65146 +
65147 + if (unlikely(node_is_empty(node)))
65148 + return;
65149 +
65150 + coord_init_first_unit(&coord, node);
65151 + /*
65152 + * if leftmost item of @node belongs to @object, we cannot be sure
65153 +	 * that @node is the vroot of @object, because some items of @object are
65154 + * probably in the sub-tree rooted at the left neighbor of @node.
65155 + */
65156 + if (fplug->owns_item(object, &coord))
65157 + return;
65158 + coord_init_last_unit(&coord, node);
65159 + /* mutatis mutandis for the rightmost item */
65160 + if (fplug->owns_item(object, &coord))
65161 + return;
65162 + /* otherwise, @node is possible vroot of @object */
65163 + inode_set_vroot(object, node);
65164 +}
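
In miniature, the test is: a node can be recorded as vroot only if neither of its edge items belongs to the object. A tiny model over an array of (owner, key) items, purely illustrative:

/* miniature model of the vroot test above; assumes each object's keys
 * occupy a contiguous range, as handle_vroot() also does */
struct model_item {
	int owner;	/* object the item belongs to */
	int key;
};

static int possible_vroot(const struct model_item *node, int nr, int owner)
{
	if (nr == 0)
		return 0;			/* empty node: cannot tell */
	return node[0].owner != owner &&	/* leftmost item is foreign */
	       node[nr - 1].owner != owner;	/* rightmost item is foreign */
}
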
65165 +
65166 +/*
65167 + * helper function used by traverse_tree() to start tree traversal not from
65168 + * the tree root, but from @h->object's vroot, if possible.
65169 + */
65170 +static int prepare_object_lookup(cbk_handle * h)
65171 +{
65172 + znode *vroot;
65173 + int result;
65174 +
65175 + vroot = inode_get_vroot(h->object);
65176 + if (vroot == NULL) {
65177 + /*
65178 + * object doesn't have known vroot, start from real tree root.
65179 + */
65180 + return LOOKUP_CONT;
65181 + }
65182 +
65183 + h->level = znode_get_level(vroot);
65184 + /* take a long-term lock on vroot */
65185 + h->result = longterm_lock_znode(h->active_lh, vroot,
65186 + cbk_lock_mode(h->level, h),
65187 + ZNODE_LOCK_LOPRI);
65188 + result = LOOKUP_REST;
65189 + if (h->result == 0) {
65190 + int isunique;
65191 + int inside;
65192 +
65193 + isunique = h->flags & CBK_UNIQUE;
65194 + /* check that key is inside vroot */
65195 + read_lock_dk(h->tree);
65196 + inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65197 + !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65198 + read_unlock_dk(h->tree);
65199 + if (inside) {
65200 + h->result = zload(vroot);
65201 + if (h->result == 0) {
65202 + /* search for key in vroot. */
65203 + result = cbk_node_lookup(h);
65204 + zrelse(vroot); /*h->active_lh->node); */
65205 + if (h->active_lh->node != vroot) {
65206 + result = LOOKUP_REST;
65207 + } else if (result == LOOKUP_CONT) {
65208 + move_lh(h->parent_lh, h->active_lh);
65209 + h->flags &= ~CBK_DKSET;
65210 + }
65211 + }
65212 + }
65213 + }
65214 +
65215 + zput(vroot);
65216 +
65217 + if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65218 + hput(h);
65219 + return result;
65220 +}
65221 +
65222 +/* main function that handles common parts of tree traversal: starting
65223 + (fake znode handling), restarts, error handling, completion */
65224 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65225 +{
65226 + int done;
65227 + int iterations;
65228 + int vroot_used;
65229 +
65230 + assert("nikita-365", h != NULL);
65231 + assert("nikita-366", h->tree != NULL);
65232 + assert("nikita-367", h->key != NULL);
65233 + assert("nikita-368", h->coord != NULL);
65234 + assert("nikita-369", (h->bias == FIND_EXACT)
65235 + || (h->bias == FIND_MAX_NOT_MORE_THAN));
65236 + assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65237 + assert("nikita-2949", !(h->flags & CBK_DKSET));
65238 + assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65239 +
65240 + done = 0;
65241 + iterations = 0;
65242 + vroot_used = 0;
65243 +
65244 + /* loop for restarts */
65245 + restart:
65246 +
65247 + assert("nikita-3024", reiser4_schedulable());
65248 +
65249 + h->result = CBK_COORD_FOUND;
65250 + /* connect_znode() needs it */
65251 + h->ld_key = *reiser4_min_key();
65252 + h->rd_key = *reiser4_max_key();
65253 + h->flags |= CBK_DKSET;
65254 + h->error = NULL;
65255 +
65256 + if (!vroot_used && h->object != NULL) {
65257 + vroot_used = 1;
65258 + done = prepare_object_lookup(h);
65259 + if (done == LOOKUP_REST) {
65260 + goto restart;
65261 + } else if (done == LOOKUP_DONE)
65262 + return h->result;
65263 + }
65264 + if (h->parent_lh->node == NULL) {
65265 + done =
65266 + get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65267 + h->parent_lh);
65268 +
65269 + assert("nikita-1637", done != -E_DEADLOCK);
65270 +
65271 + h->block = h->tree->root_block;
65272 + h->level = h->tree->height;
65273 + h->coord->node = h->parent_lh->node;
65274 +
65275 + if (done != 0)
65276 + return done;
65277 + }
65278 +
65279 + /* loop descending a tree */
65280 + while (!done) {
65281 +
65282 + if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65283 + IS_POW(iterations))) {
65284 + warning("nikita-1481", "Too many iterations: %i",
65285 + iterations);
65286 + reiser4_print_key("key", h->key);
65287 + ++iterations;
65288 + } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65289 + h->error =
65290 + "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65291 + h->result = RETERR(-EIO);
65292 + break;
65293 + }
65294 + switch (cbk_level_lookup(h)) {
65295 + case LOOKUP_CONT:
65296 + move_lh(h->parent_lh, h->active_lh);
65297 + continue;
65298 + default:
65299 + wrong_return_value("nikita-372", "cbk_level");
65300 + case LOOKUP_DONE:
65301 + done = 1;
65302 + break;
65303 + case LOOKUP_REST:
65304 + hput(h);
65305 + /* deadlock avoidance is normal case. */
65306 + if (h->result != -E_DEADLOCK)
65307 + ++iterations;
65308 + reiser4_preempt_point();
65309 + goto restart;
65310 + }
65311 + }
65312 + /* that's all. The rest is error handling */
65313 + if (unlikely(h->error != NULL)) {
65314 + warning("nikita-373", "%s: level: %i, "
65315 + "lock_level: %i, stop_level: %i "
65316 + "lock_mode: %s, bias: %s",
65317 + h->error, h->level, h->lock_level, h->stop_level,
65318 + lock_mode_name(h->lock_mode), bias_name(h->bias));
65319 + reiser4_print_address("block", &h->block);
65320 + reiser4_print_key("key", h->key);
65321 + print_coord_content("coord", h->coord);
65322 + }
65323 + /* `unlikely' error case */
65324 + if (unlikely(IS_CBKERR(h->result))) {
65325 + /* failure. do cleanup */
65326 + hput(h);
65327 + } else {
65328 + assert("nikita-1605", WITH_DATA_RET
65329 + (h->coord->node, 1,
65330 + ergo((h->result == CBK_COORD_FOUND) &&
65331 + (h->bias == FIND_EXACT) &&
65332 + (!node_is_empty(h->coord->node)),
65333 + coord_is_existing_item(h->coord))));
65334 + }
65335 + return h->result;
65336 +}
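
The restart policy in the loop above deserves a restatement: real restarts are counted, deadlock-avoidance restarts are not, a warning fires only at power-of-two counts past a soft limit (so the log is not flooded), and passing a hard limit is treated as tree corruption. A user-space sketch of just that throttling, with made-up limits:

#include <stdio.h>

#define SOFT_LIMIT 16		/* stand-in for REISER4_CBK_ITERATIONS_LIMIT */
#define HARD_LIMIT 1024		/* stand-in for REISER4_MAX_CBK_ITERATIONS */

static int is_pow2(unsigned int n)	/* cf. IS_POW() */
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* returns 0 to keep retrying, -1 to give up (cf. RETERR(-EIO) above) */
static int retry_policy(unsigned int iterations)
{
	if (iterations > HARD_LIMIT)
		return -1;
	if (iterations > SOFT_LIMIT && is_pow2(iterations))
		fprintf(stderr, "warning: %u restarts\n", iterations);
	return 0;
}
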
65337 +
65338 +/* find delimiting keys of child
65339 +
65340 + Determine left and right delimiting keys for child pointed to by
65341 + @parent_coord.
65342 +
65343 +*/
65344 +static void find_child_delimiting_keys(znode * parent /* parent znode, passed
65345 + * locked */ ,
65346 + const coord_t * parent_coord /* coord where
65347 + * pointer to
65348 + * child is
65349 + * stored */ ,
65350 + reiser4_key * ld /* where to store left
65351 + * delimiting key */ ,
65352 + reiser4_key * rd /* where to store right
65353 + * delimiting key */ )
65354 +{
65355 + coord_t neighbor;
65356 +
65357 + assert("nikita-1484", parent != NULL);
65358 + assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65359 +
65360 + coord_dup(&neighbor, parent_coord);
65361 +
65362 + if (neighbor.between == AT_UNIT)
65363 + /* imitate item ->lookup() behavior. */
65364 + neighbor.between = AFTER_UNIT;
65365 +
65366 + if (coord_set_to_left(&neighbor) == 0)
65367 + unit_key_by_coord(&neighbor, ld);
65368 + else {
65369 + assert("nikita-14851", 0);
65370 + *ld = *znode_get_ld_key(parent);
65371 + }
65372 +
65373 + coord_dup(&neighbor, parent_coord);
65374 + if (neighbor.between == AT_UNIT)
65375 + neighbor.between = AFTER_UNIT;
65376 + if (coord_set_to_right(&neighbor) == 0)
65377 + unit_key_by_coord(&neighbor, rd);
65378 + else
65379 + *rd = *znode_get_rd_key(parent);
65380 +}
65381 +
65382 +/*
65383 + * setup delimiting keys for a child
65384 + *
65385 + * @parent parent node
65386 + *
65387 + * @coord location in @parent where pointer to @child is
65388 + *
65389 + * @child child node
65390 + */
65391 +int
65392 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65393 +{
65394 + reiser4_tree *tree;
65395 +
65396 + assert("nikita-2952",
65397 + znode_get_level(parent) == znode_get_level(coord->node));
65398 +
65399 + /* fast check without taking dk lock. This is safe, because
65400 + * JNODE_DKSET is never cleared once set. */
65401 + if (!ZF_ISSET(child, JNODE_DKSET)) {
65402 + tree = znode_get_tree(parent);
65403 + write_lock_dk(tree);
65404 + if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
65405 + find_child_delimiting_keys(parent, coord,
65406 + &child->ld_key,
65407 + &child->rd_key);
65408 + ON_DEBUG(child->ld_key_version =
65409 + atomic_inc_return(&delim_key_version);
65410 + child->rd_key_version =
65411 + atomic_inc_return(&delim_key_version););
65412 + ZF_SET(child, JNODE_DKSET);
65413 + }
65414 + write_unlock_dk(tree);
65415 + return 1;
65416 + }
65417 + return 0;
65418 +}
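
This function and setup_delimiting_keys() further down share one idiom: a lock-free test of JNODE_DKSET, sound only because the flag makes a single clear-to-set transition and is never cleared again. Stripped of the tree specifics it is double-checked initialization; a sketch with pthreads standing in for the dk lock (illustrative only: the kernel code relies on its own bit-op and locking guarantees, and a portable user-space version would need atomics for the flag):

#include <pthread.h>

static pthread_mutex_t dk_lock = PTHREAD_MUTEX_INITIALIZER;

struct model_child {
	int dkset;		/* written once, never cleared */
	int ld_key, rd_key;	/* stable once dkset is observed */
};

/* returns 1 if the slow path ran, 0 if the keys were already set,
 * mirroring set_child_delimiting_keys() above */
static int set_keys_once(struct model_child *c, int ld, int rd)
{
	if (!c->dkset) {			/* fast, lock-free check */
		pthread_mutex_lock(&dk_lock);
		if (!c->dkset) {		/* re-check under the lock */
			c->ld_key = ld;
			c->rd_key = rd;
			c->dkset = 1;		/* publish last */
		}
		pthread_mutex_unlock(&dk_lock);
		return 1;
	}
	return 0;
}
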
65419 +
65420 +/* Perform tree lookup at one level. This is called from traverse_tree(),
65421 +   which drives the lookup through the tree; this function in turn calls
65422 +   cbk_node_lookup() to perform the lookup within one node.
65423 +
65424 +   See comments in the code.
65425 +*/
65426 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
65427 +{
65428 + int ret;
65429 + int setdk;
65430 + int ldkeyset = 0;
65431 + reiser4_key ldkey;
65432 + reiser4_key key;
65433 + znode *active;
65434 +
65435 + assert("nikita-3025", reiser4_schedulable());
65436 +
65437 + /* acquire reference to @active node */
65438 + active =
65439 + zget(h->tree, &h->block, h->parent_lh->node, h->level,
65440 + reiser4_ctx_gfp_mask_get());
65441 +
65442 + if (IS_ERR(active)) {
65443 + h->result = PTR_ERR(active);
65444 + return LOOKUP_DONE;
65445 + }
65446 +
65447 + /* lock @active */
65448 + h->result = longterm_lock_znode(h->active_lh,
65449 + active,
65450 + cbk_lock_mode(h->level, h),
65451 + ZNODE_LOCK_LOPRI);
65452 + /* longterm_lock_znode() acquires additional reference to znode (which
65453 + will be later released by longterm_unlock_znode()). Release
65454 + reference acquired by zget().
65455 + */
65456 + zput(active);
65457 + if (unlikely(h->result != 0))
65458 + goto fail_or_restart;
65459 +
65460 + setdk = 0;
65461 + /* if @active is accessed for the first time, setup delimiting keys on
65462 + it. Delimiting keys are taken from the parent node. See
65463 + setup_delimiting_keys() for details.
65464 + */
65465 + if (h->flags & CBK_DKSET) {
65466 + setdk = setup_delimiting_keys(h);
65467 + h->flags &= ~CBK_DKSET;
65468 + } else {
65469 + znode *parent;
65470 +
65471 + parent = h->parent_lh->node;
65472 + h->result = zload(parent);
65473 + if (unlikely(h->result != 0))
65474 + goto fail_or_restart;
65475 +
65476 + if (!ZF_ISSET(active, JNODE_DKSET))
65477 + setdk = set_child_delimiting_keys(parent,
65478 + h->coord, active);
65479 + else {
65480 + read_lock_dk(h->tree);
65481 + find_child_delimiting_keys(parent, h->coord, &ldkey,
65482 + &key);
65483 + read_unlock_dk(h->tree);
65484 + ldkeyset = 1;
65485 + }
65486 + zrelse(parent);
65487 + }
65488 +
65489 +	/* this is an ugly kludge. Reminder: it is necessary because the
65490 +	   ->lookup() method returns a coord with its ->between field possibly
65491 +	   set to something different from AT_UNIT.
65492 + */
65493 + h->coord->between = AT_UNIT;
65494 +
65495 + if (znode_just_created(active) && (h->coord->node != NULL)) {
65496 + write_lock_tree(h->tree);
65497 + /* if we are going to load znode right now, setup
65498 + ->in_parent: coord where pointer to this node is stored in
65499 + parent.
65500 + */
65501 + coord_to_parent_coord(h->coord, &active->in_parent);
65502 + write_unlock_tree(h->tree);
65503 + }
65504 +
65505 + /* check connectedness without holding tree lock---false negatives
65506 + * will be re-checked by connect_znode(), and false positives are
65507 + * impossible---@active cannot suddenly turn into unconnected
65508 + * state. */
65509 + if (!znode_is_connected(active)) {
65510 + h->result = connect_znode(h->coord, active);
65511 + if (unlikely(h->result != 0)) {
65512 + put_parent(h);
65513 + goto fail_or_restart;
65514 + }
65515 + }
65516 +
65517 + jload_prefetch(ZJNODE(active));
65518 +
65519 + if (setdk)
65520 + update_stale_dk(h->tree, active);
65521 +
65522 + /* put_parent() cannot be called earlier, because connect_znode()
65523 + assumes parent node is referenced; */
65524 + put_parent(h);
65525 +
65526 + if ((!znode_contains_key_lock(active, h->key) &&
65527 + (h->flags & CBK_TRUST_DK))
65528 + || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
65529 + /* 1. key was moved out of this node while this thread was
65530 + waiting for the lock. Restart. More elaborate solution is
65531 + to determine where key moved (to the left, or to the right)
65532 + and try to follow it through sibling pointers.
65533 +
65534 + 2. or, node itself is going to be removed from the
65535 + tree. Release lock and restart.
65536 + */
65537 + h->result = -E_REPEAT;
65538 + }
65539 + if (h->result == -E_REPEAT)
65540 + return LOOKUP_REST;
65541 +
65542 + h->result = zload_ra(active, h->ra_info);
65543 + if (h->result) {
65544 + return LOOKUP_DONE;
65545 + }
65546 +
65547 + /* sanity checks */
65548 + if (sanity_check(h)) {
65549 + zrelse(active);
65550 + return LOOKUP_DONE;
65551 + }
65552 +
65553 + /* check that key of leftmost item in the @active is the same as in
65554 + * its parent */
65555 + if (ldkeyset && !node_is_empty(active) &&
65556 + !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
65557 + warning("vs-3533", "Keys are inconsistent. Fsck?");
65558 + reiser4_print_key("inparent", &ldkey);
65559 + reiser4_print_key("inchild", &key);
65560 + h->result = RETERR(-EIO);
65561 + zrelse(active);
65562 + return LOOKUP_DONE;
65563 + }
65564 +
65565 + if (h->object != NULL)
65566 + handle_vroot(h->object, active);
65567 +
65568 + ret = cbk_node_lookup(h);
65569 +
65570 + /* h->active_lh->node might change, but active is yet to be zrelsed */
65571 + zrelse(active);
65572 +
65573 + return ret;
65574 +
65575 + fail_or_restart:
65576 + if (h->result == -E_DEADLOCK)
65577 + return LOOKUP_REST;
65578 + return LOOKUP_DONE;
65579 +}
65580 +
65581 +#if REISER4_DEBUG
65582 +/* check left and right delimiting keys of a znode */
65583 +void check_dkeys(znode * node)
65584 +{
65585 + znode *left;
65586 + znode *right;
65587 +
65588 + read_lock_tree(current_tree);
65589 + read_lock_dk(current_tree);
65590 +
65591 + assert("vs-1710", znode_is_any_locked(node));
65592 + assert("vs-1197",
65593 + !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
65594 +
65595 + left = node->left;
65596 + right = node->right;
65597 +
65598 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65599 + && left != NULL && ZF_ISSET(left, JNODE_DKSET))
65600 + /* check left neighbor. Note that left neighbor is not locked,
65601 +		   so its delimiting keys might therefore be wrong */
65602 + assert("vs-1198",
65603 + (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
65604 + || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
65605 +
65606 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65607 + && right != NULL && ZF_ISSET(right, JNODE_DKSET))
65608 + /* check right neighbor. Note that right neighbor is not
65609 +		   locked, so its delimiting keys might therefore be wrong */
65610 + assert("vs-1199",
65611 + (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
65612 + || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
65613 +
65614 + read_unlock_dk(current_tree);
65615 + read_unlock_tree(current_tree);
65616 +}
65617 +#endif
65618 +
65619 +/* true if @key is left delimiting key of @node */
65620 +static int key_is_ld(znode * node, const reiser4_key * key)
65621 +{
65622 + int ld;
65623 +
65624 + assert("nikita-1716", node != NULL);
65625 + assert("nikita-1758", key != NULL);
65626 +
65627 + read_lock_dk(znode_get_tree(node));
65628 + assert("nikita-1759", znode_contains_key(node, key));
65629 + ld = keyeq(znode_get_ld_key(node), key);
65630 + read_unlock_dk(znode_get_tree(node));
65631 + return ld;
65632 +}
65633 +
65634 +/* Process one node during tree traversal.
65635 +
65636 + This is called by cbk_level_lookup(). */
65637 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
65638 +{
65639 + /* node plugin of @active */
65640 + node_plugin *nplug;
65641 + /* item plugin of item that was found */
65642 + item_plugin *iplug;
65643 + /* search bias */
65644 + lookup_bias node_bias;
65645 + /* node we are operating upon */
65646 + znode *active;
65647 + /* tree we are searching in */
65648 + reiser4_tree *tree;
65649 + /* result */
65650 + int result;
65651 +
65652 + assert("nikita-379", h != NULL);
65653 +
65654 + active = h->active_lh->node;
65655 + tree = h->tree;
65656 +
65657 + nplug = active->nplug;
65658 + assert("nikita-380", nplug != NULL);
65659 +
65660 + ON_DEBUG(check_dkeys(active));
65661 +
65662 + /* return item from "active" node with maximal key not greater than
65663 + "key" */
65664 + node_bias = h->bias;
65665 + result = nplug->lookup(active, h->key, node_bias, h->coord);
65666 + if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
65667 + /* error occurred */
65668 + h->result = result;
65669 + return LOOKUP_DONE;
65670 + }
65671 + if (h->level == h->stop_level) {
65672 + /* welcome to the stop level */
65673 + assert("nikita-381", h->coord->node == active);
65674 + if (result == NS_FOUND) {
65675 + /* success of tree lookup */
65676 + if (!(h->flags & CBK_UNIQUE)
65677 + && key_is_ld(active, h->key)) {
65678 + return search_to_left(h);
65679 + } else
65680 + h->result = CBK_COORD_FOUND;
65681 + } else {
65682 + h->result = CBK_COORD_NOTFOUND;
65683 + }
65684 + if (!(h->flags & CBK_IN_CACHE))
65685 + cbk_cache_add(active);
65686 + return LOOKUP_DONE;
65687 + }
65688 +
65689 + if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
65690 + h->error = "not found on internal node";
65691 + h->result = result;
65692 + return LOOKUP_DONE;
65693 + }
65694 +
65695 + assert("vs-361", h->level > h->stop_level);
65696 +
65697 + if (handle_eottl(h, &result)) {
65698 + assert("vs-1674", (result == LOOKUP_DONE ||
65699 + result == LOOKUP_REST));
65700 + return result;
65701 + }
65702 +
65703 + /* go down to next level */
65704 + check_me("vs-12", zload(h->coord->node) == 0);
65705 + assert("nikita-2116", item_is_internal(h->coord));
65706 + iplug = item_plugin_by_coord(h->coord);
65707 + iplug->s.internal.down_link(h->coord, h->key, &h->block);
65708 + zrelse(h->coord->node);
65709 + --h->level;
65710 + return LOOKUP_CONT; /* continue */
65711 +}
65712 +
65713 +/* scan cbk_cache slots looking for a match for @h */
65714 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
65715 +{
65716 + level_lookup_result llr;
65717 + znode *node;
65718 + reiser4_tree *tree;
65719 + cbk_cache_slot *slot;
65720 + cbk_cache *cache;
65721 + tree_level level;
65722 + int isunique;
65723 + const reiser4_key *key;
65724 + int result;
65725 +
65726 + assert("nikita-1317", h != NULL);
65727 + assert("nikita-1315", h->tree != NULL);
65728 + assert("nikita-1316", h->key != NULL);
65729 +
65730 + tree = h->tree;
65731 + cache = &tree->cbk_cache;
65732 + if (cache->nr_slots == 0)
65733 + /* size of cbk cache was set to 0 by mount time option. */
65734 + return RETERR(-ENOENT);
65735 +
65736 + assert("nikita-2474", cbk_cache_invariant(cache));
65737 + node = NULL; /* to keep gcc happy */
65738 + level = h->level;
65739 + key = h->key;
65740 + isunique = h->flags & CBK_UNIQUE;
65741 + result = RETERR(-ENOENT);
65742 +
65743 + /*
65744 +	 * this is a time-critical function and dragons have, hence, settled
65745 + * here.
65746 + *
65747 + * Loop below scans cbk cache slots trying to find matching node with
65748 + * suitable range of delimiting keys and located at the h->level.
65749 + *
65750 + * Scan is done under cbk cache spin lock that protects slot->node
65751 + * pointers. If suitable node is found we want to pin it in
65752 + * memory. But slot->node can point to the node with x_count 0
65753 + * (unreferenced). Such node can be recycled at any moment, or can
65754 + * already be in the process of being recycled (within jput()).
65755 + *
65756 + * As we found node in the cbk cache, it means that jput() hasn't yet
65757 + * called cbk_cache_invalidate().
65758 + *
65759 + * We acquire reference to the node without holding tree lock, and
65760 + * later, check node's RIP bit. This avoids races with jput().
65761 + */
65762 +
65763 + rcu_read_lock();
65764 + read_lock(&((cbk_cache *)cache)->guard);
65765 +
65766 + slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
65767 + slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
65768 + BUG_ON(&slot->lru != &cache->lru);/*????*/
65769 + while (1) {
65770 +
65771 + slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
65772 +
65773 + if (&cache->lru != &slot->lru)
65774 + node = slot->node;
65775 + else
65776 + node = NULL;
65777 +
65778 + if (unlikely(node == NULL))
65779 + break;
65780 +
65781 + /*
65782 + * this is (hopefully) the only place in the code where we are
65783 + * working with delimiting keys without holding dk lock. This
65784 + * is fine here, because this is only "guess" anyway---keys
65785 + * are rechecked under dk lock below.
65786 + */
65787 + if (znode_get_level(node) == level &&
65788 + /* reiser4_min_key < key < reiser4_max_key */
65789 + znode_contains_key_strict(node, key, isunique)) {
65790 + zref(node);
65791 + result = 0;
65792 + spin_lock_prefetch(&tree->tree_lock);
65793 + break;
65794 + }
65795 + }
65796 + read_unlock(&((cbk_cache *)cache)->guard);
65797 +
65798 + assert("nikita-2475", cbk_cache_invariant(cache));
65799 +
65800 + if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
65801 + result = -ENOENT;
65802 +
65803 + rcu_read_unlock();
65804 +
65805 + if (result != 0) {
65806 + h->result = CBK_COORD_NOTFOUND;
65807 + return RETERR(-ENOENT);
65808 + }
65809 +
65810 + result =
65811 + longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
65812 + ZNODE_LOCK_LOPRI);
65813 + zput(node);
65814 + if (result != 0)
65815 + return result;
65816 + result = zload(node);
65817 + if (result != 0)
65818 + return result;
65819 +
65820 + /* recheck keys */
65821 + read_lock_dk(tree);
65822 + result = (znode_contains_key_strict(node, key, isunique) &&
65823 + !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
65824 + read_unlock_dk(tree);
65825 + if (result) {
65826 + /* do lookup inside node */
65827 + llr = cbk_node_lookup(h);
65828 + /* if cbk_node_lookup() wandered to another node (due to eottl
65829 + or non-unique keys), adjust @node */
65830 + /*node = h->active_lh->node; */
65831 +
65832 + if (llr != LOOKUP_DONE) {
65833 + /* restart or continue on the next level */
65834 + result = RETERR(-ENOENT);
65835 + } else if (IS_CBKERR(h->result))
65836 + /* io or oom */
65837 + result = RETERR(-ENOENT);
65838 + else {
65839 + /* good. Either item found or definitely not found. */
65840 + result = 0;
65841 +
65842 + write_lock(&(cache->guard));
65843 + if (slot->node == h->active_lh->node /*node */ ) {
65844 + /* if this node is still in cbk cache---move
65845 + its slot to the head of the LRU list. */
65846 + list_move(&slot->lru, &cache->lru);
65847 + }
65848 + write_unlock(&(cache->guard));
65849 + }
65850 + } else {
65851 + /* race. While this thread was waiting for the lock, node was
65852 +		   rebalanced and the item we are looking for was shifted out of it
65853 + (if it ever was here).
65854 +
65855 +		   Continuing the scan is almost hopeless: the node the key
65856 +		   range was moved to is almost certainly at the beginning of
65857 +		   the LRU list by now, because it is hot; and restarting the
65858 +		   scan from the very beginning is complex. Just return,
65859 + so that cbk() will be performed. This is not that
65860 + important, because such races should be rare. Are they?
65861 + */
65862 + result = RETERR(-ENOENT); /* -ERAUGHT */
65863 + }
65864 + zrelse(node);
65865 + assert("nikita-2476", cbk_cache_invariant(cache));
65866 + return result;
65867 +}
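
Condensing the race-avoidance dance from the long comment above: the slot scan runs under the cache guard, the candidate is pinned without the tree lock, and only then is the JNODE_RIP bit consulted, with the RCU read section keeping the memory valid until that test. A skeleton of just that sequence, not compilable on its own; every name except the hypothetical find_matching_slot() comes from the surrounding code:

	rcu_read_lock();
	read_lock(&cache->guard);
	node = find_matching_slot(cache, key, level);	/* x_count may be 0 */
	if (node != NULL)
		zref(node);	/* pin; might race with a running jput() */
	read_unlock(&cache->guard);

	if (node != NULL && ZF_ISSET(node, JNODE_RIP))
		node = NULL;	/* destructor won the race: back off */

	rcu_read_unlock();	/* memory guaranteed valid up to here */
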
65868 +
65869 +/* look for item with given key in the coord cache
65870 +
65871 + This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
65872 +   which is a small LRU list of znodes accessed lately. For each znode in
65873 +   this list, it checks whether the key we are looking for fits into the key
65874 +   range covered by that node. If so, and if in addition the node lies at an
65875 +   allowed level (this is to handle extents on a twig level), the node is locked, and
65876 + lookup inside it is performed.
65877 +
65878 +   a measurement of the cost of this cache search, compared to the cost of a
65879 +   full coord_by_key() traversal, is still needed.
65880 +
65881 +*/
65882 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
65883 +{
65884 + int result = 0;
65885 + tree_level level;
65886 +
65887 +	/* add CBK_IN_CACHE to the handle flags. This tells
65888 +	 * cbk_node_lookup() that the cbk_cache is being scanned, so that it
65889 +	 * does not re-add the found node to the cache. */
65890 + h->flags |= CBK_IN_CACHE;
65891 + for (level = h->stop_level; level <= h->lock_level; ++level) {
65892 + h->level = level;
65893 + result = cbk_cache_scan_slots(h);
65894 + if (result != 0) {
65895 + done_lh(h->active_lh);
65896 + done_lh(h->parent_lh);
65897 + } else {
65898 + assert("nikita-1319", !IS_CBKERR(h->result));
65899 + break;
65900 + }
65901 + }
65902 + h->flags &= ~CBK_IN_CACHE;
65903 + return result;
65904 +}
65905 +
65906 +/* type of lock we want to obtain during tree traversal. On the stop level
65907 +   we want the type of lock the user asked for; on upper levels, a read lock. */
65908 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
65909 +{
65910 + assert("nikita-382", h != NULL);
65911 +
65912 + return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
65913 +}
65914 +
65915 +/* update outdated delimiting keys */
65916 +static void stale_dk(reiser4_tree * tree, znode * node)
65917 +{
65918 + znode *right;
65919 +
65920 + read_lock_tree(tree);
65921 + write_lock_dk(tree);
65922 + right = node->right;
65923 +
65924 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
65925 + right && ZF_ISSET(right, JNODE_DKSET) &&
65926 + !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
65927 + znode_set_rd_key(node, znode_get_ld_key(right));
65928 +
65929 + write_unlock_dk(tree);
65930 + read_unlock_tree(tree);
65931 +}
65932 +
65933 +/* check for possibly outdated delimiting keys, and update them if
65934 + * necessary. */
65935 +static void update_stale_dk(reiser4_tree * tree, znode * node)
65936 +{
65937 + znode *right;
65938 + reiser4_key rd;
65939 +
65940 + read_lock_tree(tree);
65941 + read_lock_dk(tree);
65942 + rd = *znode_get_rd_key(node);
65943 + right = node->right;
65944 + if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
65945 + right && ZF_ISSET(right, JNODE_DKSET) &&
65946 + !keyeq(&rd, znode_get_ld_key(right)))) {
65947 + assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
65948 + read_unlock_dk(tree);
65949 + read_unlock_tree(tree);
65950 + stale_dk(tree, node);
65951 + return;
65952 + }
65953 + read_unlock_dk(tree);
65954 + read_unlock_tree(tree);
65955 +}
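
The split between update_stale_dk() and stale_dk() exists because a read lock cannot be upgraded in place: the cheap read-locked test detects the stale key, all locks are dropped, and the test is redone from scratch under the write lock, since the state may have changed in between. The same shape in a self-contained sketch:

#include <pthread.h>

static pthread_rwlock_t dk = PTHREAD_RWLOCK_INITIALIZER;
static int rd_key, neighbor_ld_key;	/* both protected by @dk */

static void fix_stale(void)		/* cf. stale_dk() */
{
	pthread_rwlock_wrlock(&dk);
	if (rd_key != neighbor_ld_key)	/* must re-check under the write
					   lock: the world moved on */
		rd_key = neighbor_ld_key;
	pthread_rwlock_unlock(&dk);
}

static void maybe_fix_stale(void)	/* cf. update_stale_dk() */
{
	int stale;

	pthread_rwlock_rdlock(&dk);
	stale = (rd_key != neighbor_ld_key);
	pthread_rwlock_unlock(&dk);	/* no rd->wr upgrade: drop first */
	if (stale)
		fix_stale();
}
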
65956 +
65957 +/*
65958 + * handling of searches for a non-unique key.
65959 + *
65960 + * Suppose that we are looking for an item with possibly non-unique key 100.
65961 + *
65962 + * Root node contains two pointers: one to a node with left delimiting key 0,
65963 + * and another to a node with left delimiting key 100. The item we are
65964 + * interested in may well be in the sub-tree rooted at the first pointer.
65965 + *
65966 + * To handle this, search_to_left() is called when the search reaches the stop
65967 + * level. This function checks whether it is _possible_ that the item we are
65968 + * looking for is in the left neighbor (by comparing delimiting keys) and,
65969 + * if so, tries to lock left neighbor (this is low priority lock, so it can
65970 + * deadlock, tree traversal is just restarted if it did) and then checks
65971 + * whether left neighbor actually contains items with our key.
65972 + *
65973 + * Note that this is done on the stop level only. It is possible to try such
65974 + * left-check on each level, but as duplicate keys are supposed to be rare
65975 + * (very unlikely that more than one node is completely filled with items with
65976 + * duplicate keys), it is cheaper to scan to the left on the stop level once.
65977 + *
65978 + */
65979 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
65980 +{
65981 + level_lookup_result result;
65982 + coord_t *coord;
65983 + znode *node;
65984 + znode *neighbor;
65985 +
65986 + lock_handle lh;
65987 +
65988 + assert("nikita-1761", h != NULL);
65989 + assert("nikita-1762", h->level == h->stop_level);
65990 +
65991 + init_lh(&lh);
65992 + coord = h->coord;
65993 + node = h->active_lh->node;
65994 + assert("nikita-1763", coord_is_leftmost_unit(coord));
65995 +
65996 + h->result =
65997 + reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
65998 + GN_CAN_USE_UPPER_LEVELS);
65999 + neighbor = NULL;
66000 + switch (h->result) {
66001 + case -E_DEADLOCK:
66002 + result = LOOKUP_REST;
66003 + break;
66004 + case 0:{
66005 + node_plugin *nplug;
66006 + coord_t crd;
66007 + lookup_bias bias;
66008 +
66009 + neighbor = lh.node;
66010 + h->result = zload(neighbor);
66011 + if (h->result != 0) {
66012 + result = LOOKUP_DONE;
66013 + break;
66014 + }
66015 +
66016 + nplug = neighbor->nplug;
66017 +
66018 + coord_init_zero(&crd);
66019 + bias = h->bias;
66020 + h->bias = FIND_EXACT;
66021 + h->result =
66022 + nplug->lookup(neighbor, h->key, h->bias, &crd);
66023 + h->bias = bias;
66024 +
66025 + if (h->result == NS_NOT_FOUND) {
66026 + case -E_NO_NEIGHBOR:
66027 + h->result = CBK_COORD_FOUND;
66028 + if (!(h->flags & CBK_IN_CACHE))
66029 + cbk_cache_add(node);
66030 + default: /* some other error */
66031 + result = LOOKUP_DONE;
66032 + } else if (h->result == NS_FOUND) {
66033 + read_lock_dk(znode_get_tree(neighbor));
66034 + h->rd_key = *znode_get_ld_key(node);
66035 + leftmost_key_in_node(neighbor, &h->ld_key);
66036 + read_unlock_dk(znode_get_tree(neighbor));
66037 + h->flags |= CBK_DKSET;
66038 +
66039 + h->block = *znode_get_block(neighbor);
66040 + /* clear coord -> node so that cbk_level_lookup()
66041 + wouldn't overwrite parent hint in neighbor.
66042 +
66043 + Parent hint was set up by
66044 + reiser4_get_left_neighbor()
66045 + */
66046 + /* FIXME: why do we have to spinlock here? */
66047 + write_lock_tree(znode_get_tree(neighbor));
66048 + h->coord->node = NULL;
66049 + write_unlock_tree(znode_get_tree(neighbor));
66050 + result = LOOKUP_CONT;
66051 + } else {
66052 + result = LOOKUP_DONE;
66053 + }
66054 + if (neighbor != NULL)
66055 + zrelse(neighbor);
66056 + }
66057 + }
66058 + done_lh(&lh);
66059 + return result;
66060 +}
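
At bottom this is the classic duplicates problem in any sorted structure: landing on some match is not the same as landing on the leftmost match. The array-level analogue is the difference between an arbitrary binary-search hit and a lower bound; a self-contained example:

#include <stdio.h>

/* leftmost index holding @key in sorted a[0..n), or n if absent;
 * search_to_left() plays the same role at node granularity */
static int lower_bound(const int *a, int n, int key)
{
	int lo = 0, hi = n;

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (a[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

int main(void)
{
	int a[] = { 0, 100, 100, 100, 200 };

	/* a plain binary search could land on index 2;
	   the leftmost occurrence is index 1 */
	printf("%d\n", lower_bound(a, 5, 100));		/* prints 1 */
	return 0;
}
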
66061 +
66062 +/* debugging aid: return symbolic name of search bias */
66063 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66064 +{
66065 + if (bias == FIND_EXACT)
66066 + return "exact";
66067 + else if (bias == FIND_MAX_NOT_MORE_THAN)
66068 + return "left-slant";
66069 +/* else if( bias == RIGHT_SLANT_BIAS ) */
66070 +/* return "right-bias"; */
66071 + else {
66072 + static char buf[30];
66073 +
66074 + sprintf(buf, "unknown: %i", bias);
66075 + return buf;
66076 + }
66077 +}
66078 +
66079 +#if REISER4_DEBUG
66080 +/* debugging aid: print human readable information about @p */
66081 +void print_coord_content(const char *prefix /* prefix to print */ ,
66082 + coord_t * p /* coord to print */ )
66083 +{
66084 + reiser4_key key;
66085 +
66086 + if (p == NULL) {
66087 + printk("%s: null\n", prefix);
66088 + return;
66089 + }
66090 + if ((p->node != NULL) && znode_is_loaded(p->node)
66091 + && coord_is_existing_item(p))
66092 + printk("%s: data: %p, length: %i\n", prefix,
66093 + item_body_by_coord(p), item_length_by_coord(p));
66094 + if (znode_is_loaded(p->node)) {
66095 + item_key_by_coord(p, &key);
66096 + reiser4_print_key(prefix, &key);
66097 + }
66098 +}
66099 +
66100 +/* debugging aid: print human readable information about @block */
66101 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
66102 + const reiser4_block_nr * block /* block number to print */ )
66103 +{
66104 + printk("%s: %s\n", prefix, sprint_address(block));
66105 +}
66106 +#endif
66107 +
66108 +/* return string containing human readable representation of @block */
66109 +char *sprint_address(const reiser4_block_nr *
66110 + block /* block number to print */ )
66111 +{
66112 + static char address[30];
66113 +
66114 + if (block == NULL)
66115 + sprintf(address, "null");
66116 + else if (reiser4_blocknr_is_fake(block))
66117 + sprintf(address, "%llx", (unsigned long long)(*block));
66118 + else
66119 + sprintf(address, "%llu", (unsigned long long)(*block));
66120 + return address;
66121 +}
66122 +
66123 +/* release parent node during traversal */
66124 +static void put_parent(cbk_handle * h /* search handle */ )
66125 +{
66126 + assert("nikita-383", h != NULL);
66127 + if (h->parent_lh->node != NULL) {
66128 + longterm_unlock_znode(h->parent_lh);
66129 + }
66130 +}
66131 +
66132 +/* helper function used by coord_by_key(): release the lock handles (parent
66133 +   and active) stored in the handle, e.g. on restart or failure. */
66134 +static void hput(cbk_handle * h /* search handle */ )
66135 +{
66136 + assert("nikita-385", h != NULL);
66137 + done_lh(h->parent_lh);
66138 + done_lh(h->active_lh);
66139 +}
66140 +
66141 +/* Helper function used by cbk(): update delimiting keys of child node (stored
66142 + in h->active_lh->node) using key taken from parent on the parent level. */
66143 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66144 +{
66145 + znode *active;
66146 + reiser4_tree *tree;
66147 +
66148 + assert("nikita-1088", h != NULL);
66149 +
66150 + active = h->active_lh->node;
66151 +
66152 + /* fast check without taking dk lock. This is safe, because
66153 + * JNODE_DKSET is never cleared once set. */
66154 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66155 + tree = znode_get_tree(active);
66156 + write_lock_dk(tree);
66157 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66158 + znode_set_ld_key(active, &h->ld_key);
66159 + znode_set_rd_key(active, &h->rd_key);
66160 + ZF_SET(active, JNODE_DKSET);
66161 + }
66162 + write_unlock_dk(tree);
66163 + return 1;
66164 + }
66165 + return 0;
66166 +}
66167 +
66168 +/* true if @block makes sense for the @tree. Used to detect corrupted node
66169 + * pointers */
66170 +static int
66171 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66172 + reiser4_tree * tree /* tree to check against */ )
66173 +{
66174 + assert("nikita-757", block != NULL);
66175 + assert("nikita-758", tree != NULL);
66176 +
66177 + /* check to see if it exceeds the size of the device. */
66178 + return reiser4_blocknr_is_sane_for(tree->super, block);
66179 +}
66180 +
66181 +/* check consistency of fields */
66182 +static int sanity_check(cbk_handle * h /* search handle */ )
66183 +{
66184 + assert("nikita-384", h != NULL);
66185 +
66186 + if (h->level < h->stop_level) {
66187 + h->error = "Buried under leaves";
66188 + h->result = RETERR(-EIO);
66189 + return LOOKUP_DONE;
66190 + } else if (!block_nr_is_correct(&h->block, h->tree)) {
66191 + h->error = "bad block number";
66192 + h->result = RETERR(-EIO);
66193 + return LOOKUP_DONE;
66194 + } else
66195 + return 0;
66196 +}
66197 +
66198 +/* Make Linus happy.
66199 + Local variables:
66200 + c-indentation-style: "K&R"
66201 + mode-name: "LC"
66202 + c-basic-offset: 8
66203 + tab-width: 8
66204 + fill-column: 120
66205 + scroll-step: 1
66206 + End:
66207 +*/
66208 diff -urN linux-2.6.22.orig/fs/reiser4/status_flags.c linux-2.6.22/fs/reiser4/status_flags.c
66209 --- linux-2.6.22.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
66210 +++ linux-2.6.22/fs/reiser4/status_flags.c 2007-07-29 00:25:35.016732714 +0400
66211 @@ -0,0 +1,175 @@
66212 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66213 + * reiser4/README */
66214 +
66215 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
66216 +
66217 +#include <linux/bio.h>
66218 +#include <linux/highmem.h>
66219 +#include <linux/fs.h>
66220 +#include <linux/blkdev.h>
66221 +#include "debug.h"
66222 +#include "dformat.h"
66223 +#include "status_flags.h"
66224 +#include "super.h"
66225 +
66226 +/* This is our end I/O handler; it marks the page uptodate if I/O was successful. It
66227 +   also unconditionally unlocks the page, so we can see that the I/O is done.
66228 +   We do not free the bio, because we hope to reuse it. */
66229 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66230 + int err)
66231 +{
66232 + if (bio->bi_size)
66233 + return 1;
66234 +
66235 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66236 + SetPageUptodate(bio->bi_io_vec->bv_page);
66237 + } else {
66238 + ClearPageUptodate(bio->bi_io_vec->bv_page);
66239 + SetPageError(bio->bi_io_vec->bv_page);
66240 + }
66241 + unlock_page(bio->bi_io_vec->bv_page);
66242 + return 0;
66243 +}
66244 +
66245 +/* Initialise the status block code. This is expected to be called from the disk
66246 +   format code. The block parameter is where the status block lives. */
66247 +int reiser4_status_init(reiser4_block_nr block)
66248 +{
66249 + struct super_block *sb = reiser4_get_current_sb();
66250 + struct reiser4_status *statuspage;
66251 + struct bio *bio;
66252 + struct page *page;
66253 +
66254 + get_super_private(sb)->status_page = NULL;
66255 + get_super_private(sb)->status_bio = NULL;
66256 +
66257 + page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
66258 + if (!page)
66259 + return -ENOMEM;
66260 +
66261 + bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
66262 + if (bio != NULL) {
66263 + bio->bi_sector = block * (sb->s_blocksize >> 9);
66264 + bio->bi_bdev = sb->s_bdev;
66265 + bio->bi_io_vec[0].bv_page = page;
66266 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66267 + bio->bi_io_vec[0].bv_offset = 0;
66268 + bio->bi_vcnt = 1;
66269 + bio->bi_size = sb->s_blocksize;
66270 + bio->bi_end_io = reiser4_status_endio;
66271 + } else {
66272 + __free_pages(page, 0);
66273 + return -ENOMEM;
66274 + }
66275 + lock_page(page);
66276 + submit_bio(READ, bio);
66277 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66278 + wait_on_page_locked(page);
66279 +	if (!PageUptodate(page)) {
66280 +		warning("green-2007",
66281 +			"I/O error while trying to read status page\n");
+		__free_pages(page, 0);	/* this error path must not leak */
+		bio_put(bio);		/* the page or the bio */
66282 +		return -EIO;
66283 +	}
66284 +
66285 + statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66286 + if (memcmp
66287 + (statuspage->magic, REISER4_STATUS_MAGIC,
66288 + sizeof(REISER4_STATUS_MAGIC))) {
66289 + /* Magic does not match. */
66290 + kunmap_atomic((char *)statuspage, KM_USER0);
66291 + warning("green-2008", "Wrong magic in status block\n");
66292 + __free_pages(page, 0);
66293 + bio_put(bio);
66294 + return -EINVAL;
66295 + }
66296 + kunmap_atomic((char *)statuspage, KM_USER0);
66297 +
66298 + get_super_private(sb)->status_page = page;
66299 + get_super_private(sb)->status_bio = bio;
66300 + return 0;
66301 +}
66302 +
66303 +/* Query the status of the FS. Returns whether the FS can be safely mounted.
66304 +   Also, if the "status" and "extended" parameters are given, the corresponding
66305 +   parts of the on-disk status are stored through them. */
66306 +int reiser4_status_query(u64 * status, u64 * extended)
66307 +{
66308 + struct super_block *sb = reiser4_get_current_sb();
66309 + struct reiser4_status *statuspage;
66310 + int retval;
66311 +
66312 + if (!get_super_private(sb)->status_page) { // No status page?
66313 + return REISER4_STATUS_MOUNT_UNKNOWN;
66314 + }
66315 + statuspage = (struct reiser4_status *)
66316 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66317 + switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
66318 + case REISER4_STATUS_OK:
66319 + retval = REISER4_STATUS_MOUNT_OK;
66320 + break;
66321 + case REISER4_STATUS_CORRUPTED:
66322 + retval = REISER4_STATUS_MOUNT_WARN;
66323 + break;
66324 + case REISER4_STATUS_DAMAGED:
66325 + case REISER4_STATUS_DESTROYED:
66326 + case REISER4_STATUS_IOERROR:
66327 + retval = REISER4_STATUS_MOUNT_RO;
66328 + break;
66329 + default:
66330 + retval = REISER4_STATUS_MOUNT_UNKNOWN;
66331 + break;
66332 + }
66333 +
66334 + if (status)
66335 + *status = le64_to_cpu(get_unaligned(&statuspage->status));
66336 + if (extended)
66337 + *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66338 +
66339 + kunmap_atomic((char *)statuspage, KM_USER0);
66340 + return retval;
66341 +}
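
A disk-format caller might translate the query result into a mount decision roughly as below. This is a hypothetical sketch; the policy shown is an assumption, not something this file dictates:

/* hypothetical consumer of reiser4_status_query() */
static int example_check_status(void)
{
	u64 status, extended;

	switch (reiser4_status_query(&status, &extended)) {
	case REISER4_STATUS_MOUNT_OK:
		return 0;		/* mount normally */
	case REISER4_STATUS_MOUNT_WARN:
		/* e.g. log @status and @extended, then proceed */
		return 0;
	case REISER4_STATUS_MOUNT_RO:
		return -EROFS;		/* damaged: read-only at most */
	default:			/* REISER4_STATUS_MOUNT_UNKNOWN */
		return 0;		/* no status page; nothing to check */
	}
}
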
66342 +
66343 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
66344 + It fills the status structure and tries to push it to disk. */
66345 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66346 +{
66347 + struct super_block *sb = reiser4_get_current_sb();
66348 + struct reiser4_status *statuspage;
66349 + struct bio *bio = get_super_private(sb)->status_bio;
66350 +
66351 + if (!get_super_private(sb)->status_page) { // No status page?
66352 + return -1;
66353 + }
66354 + statuspage = (struct reiser4_status *)
66355 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66356 +
66357 + put_unaligned(cpu_to_le64(status), &statuspage->status);
66358 + put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66359 + strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66360 +
66361 + kunmap_atomic((char *)statuspage, KM_USER0);
66362 + bio->bi_bdev = sb->s_bdev;
66363 + bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66364 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66365 + bio->bi_io_vec[0].bv_offset = 0;
66366 + bio->bi_vcnt = 1;
66367 + bio->bi_size = sb->s_blocksize;
66368 + bio->bi_end_io = reiser4_status_endio;
66369 + lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
66370 + /* We can block now, but we have no other choice anyway */
66371 + submit_bio(WRITE, bio);
66372 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66373 + return 0; // We do not wait for io to finish.
66374 +}
66375 +
66376 +/* Frees the status page and bio structure. Should be called by the disk format at umount time */
66377 +int reiser4_status_finish(void)
66378 +{
66379 + struct super_block *sb = reiser4_get_current_sb();
66380 +
66381 + __free_pages(get_super_private(sb)->status_page, 0);
66382 + get_super_private(sb)->status_page = NULL;
66383 + bio_put(get_super_private(sb)->status_bio);
66384 + get_super_private(sb)->status_bio = NULL;
66385 + return 0;
66386 +}
66387 diff -urN linux-2.6.22.orig/fs/reiser4/status_flags.h linux-2.6.22/fs/reiser4/status_flags.h
66388 --- linux-2.6.22.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
66389 +++ linux-2.6.22/fs/reiser4/status_flags.h 2007-07-29 00:25:35.016732714 +0400
66390 @@ -0,0 +1,43 @@
66391 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66392 + * reiser4/README */
66393 +
66394 +/* Here we declare structures and flags that store reiser4 status on disk.
66395 +   The status helps us to find out whether the filesystem is valid or whether it
66396 +   contains some critical, or not so critical, errors */
66397 +
66398 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
66399 +#define __REISER4_STATUS_FLAGS_H__
66400 +
66401 +#include "dformat.h"
66402 +/* These are major status flags */
66403 +#define REISER4_STATUS_OK 0
66404 +#define REISER4_STATUS_CORRUPTED 0x1
66405 +#define REISER4_STATUS_DAMAGED 0x2
66406 +#define REISER4_STATUS_DESTROYED 0x4
66407 +#define REISER4_STATUS_IOERROR 0x8
66408 +
66409 +/* Return values for reiser4_status_query() */
66410 +#define REISER4_STATUS_MOUNT_OK 0
66411 +#define REISER4_STATUS_MOUNT_WARN 1
66412 +#define REISER4_STATUS_MOUNT_RO 2
66413 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
66414 +
66415 +#define REISER4_TEXTERROR_LEN 256
66416 +
66417 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
66418 +/* We probably need to keep its size under the sector size, which is 512 bytes */
66419 +struct reiser4_status {
66420 + char magic[16];
66421 + d64 status; /* Current FS state */
66422 +	d64 extended_status;	/* Any additional info that might make sense in addition to "status". E.g.
66423 +				   the last sector where an I/O error happened, if status is "io error encountered" */
66424 +	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
66425 + char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
66426 +};
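
The 512-byte concern above checks out with room to spare: 16 + 8 + 8 + 10*8 + 256 = 368 bytes. If one wanted the build to enforce it, a guard along these lines (hypothetical, not part of the patch) could sit in reiser4_status_init():

	/* hypothetical compile-time guard for the on-disk layout */
	BUILD_BUG_ON(sizeof(struct reiser4_status) > 512);
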
66427 +
66428 +int reiser4_status_init(reiser4_block_nr block);
66429 +int reiser4_status_query(u64 * status, u64 * extended);
66430 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
66431 +int reiser4_status_finish(void);
66432 +
66433 +#endif
66434 diff -urN linux-2.6.22.orig/fs/reiser4/super.c linux-2.6.22/fs/reiser4/super.c
66435 --- linux-2.6.22.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
66436 +++ linux-2.6.22/fs/reiser4/super.c 2007-07-29 00:25:35.020733749 +0400
66437 @@ -0,0 +1,316 @@
66438 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66439 + * reiser4/README */
66440 +
66441 +/* Super-block manipulations. */
66442 +
66443 +#include "debug.h"
66444 +#include "dformat.h"
66445 +#include "key.h"
66446 +#include "plugin/security/perm.h"
66447 +#include "plugin/space/space_allocator.h"
66448 +#include "plugin/plugin.h"
66449 +#include "tree.h"
66450 +#include "vfs_ops.h"
66451 +#include "super.h"
66452 +#include "reiser4.h"
66453 +
66454 +#include <linux/types.h> /* for __u?? */
66455 +#include <linux/fs.h> /* for struct super_block */
66456 +
66457 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
66458 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
66459 +static __u64 reserved_for_root(const struct super_block *super);
66460 +
66461 +/* Return reiser4-specific part of super block */
66462 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
66463 + * queried */ )
66464 +{
66465 + return (reiser4_super_info_data *) super->s_fs_info;
66466 +}
66467 +
66468 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
66469 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
66470 +{
66471 + assert("nikita-448", super != NULL);
66472 + assert("nikita-449", is_reiser4_super(super));
66473 + return (long)REISER4_SUPER_MAGIC;
66474 +}
66475 +
66476 +/* functions to read/modify fields of reiser4_super_info_data */
66477 +
66478 +/* get number of blocks in file system */
66479 +__u64 reiser4_block_count(const struct super_block *super /* super block
66480 + queried */ )
66481 +{
66482 + assert("vs-494", super != NULL);
66483 + assert("vs-495", is_reiser4_super(super));
66484 + return get_super_private(super)->block_count;
66485 +}
66486 +
66487 +#if REISER4_DEBUG
66488 +/*
66489 + * number of blocks in the current file system
66490 + */
66491 +__u64 reiser4_current_block_count(void)
66492 +{
66493 + return get_current_super_private()->block_count;
66494 +}
66495 +#endif /* REISER4_DEBUG */
66496 +
66497 +/* set number of blocks in filesystem */
66498 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
66499 +{
66500 + assert("vs-501", super != NULL);
66501 + assert("vs-502", is_reiser4_super(super));
66502 + get_super_private(super)->block_count = nr;
66503 + /*
66504 + * For the proper calculation of the reserved space counter (5% of the
66505 + * device block count) we would need a 64 bit division, which is missing
66506 + * in Linux on the i386 platform. Because we do not need a precise
66507 + * calculation here, we can replace the div64 operation with this
66508 + * combination of multiplication and shift: 51. / (2^10) == .0498 .
66509 + * FIXME: this is a bug. It comes up only for very small filesystems,
66510 + * which probably are never used. Nevertheless, it is a bug. The number
66511 + * of reserved blocks must be no less than the maximal number of blocks
66512 + * which get grabbed with BA_RESERVED.
66513 + */
66514 + get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
66515 +}
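The 51/1024 trick above is easy to sanity-check: 51/1024 = 0.0498046875,
within about 0.4% of a true 5%. A minimal user-space sketch of the same
computation (illustrative only):

	#include <stdio.h>

	/* approximate 5% of nr without 64-bit division, exactly as
	 * reiser4_set_block_count() does with (nr * 51) >> 10 */
	static unsigned long long reserved_approx(unsigned long long nr)
	{
		return (nr * 51) >> 10;
	}

	int main(void)
	{
		/* a sample 1,000,000-block device: prints 49804, ~4.98% */
		printf("%llu\n", reserved_approx(1000000ULL));
		return 0;
	}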
66516 +
66517 +/* amount of blocks used (allocated for data) in file system */
66518 +__u64 reiser4_data_blocks(const struct super_block *super /* super block
66519 + queried */ )
66520 +{
66521 + assert("nikita-452", super != NULL);
66522 + assert("nikita-453", is_reiser4_super(super));
66523 + return get_super_private(super)->blocks_used;
66524 +}
66525 +
66526 +/* set number of blocks used in filesystem */
66527 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
66528 +{
66529 + assert("vs-503", super != NULL);
66530 + assert("vs-504", is_reiser4_super(super));
66531 + get_super_private(super)->blocks_used = nr;
66532 +}
66533 +
66534 +/* amount of free blocks in file system */
66535 +__u64 reiser4_free_blocks(const struct super_block *super /* super block
66536 + queried */ )
66537 +{
66538 + assert("nikita-454", super != NULL);
66539 + assert("nikita-455", is_reiser4_super(super));
66540 + return get_super_private(super)->blocks_free;
66541 +}
66542 +
66543 +/* set number of blocks free in filesystem */
66544 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
66545 +{
66546 + assert("vs-505", super != NULL);
66547 + assert("vs-506", is_reiser4_super(super));
66548 + get_super_private(super)->blocks_free = nr;
66549 +}
66550 +
66551 +/* get mkfs unique identifier */
66552 +__u32 reiser4_mkfs_id(const struct super_block *super /* super block
66553 + queried */ )
66554 +{
66555 + assert("vpf-221", super != NULL);
66556 + assert("vpf-222", is_reiser4_super(super));
66557 + return get_super_private(super)->mkfs_id;
66558 +}
66559 +
66560 +/* amount of free blocks in file system committed state */
66561 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
66562 +{
66563 + assert("vs-497", super != NULL);
66564 + assert("vs-498", is_reiser4_super(super));
66565 + return get_super_private(super)->blocks_free_committed;
66566 +}
66567 +
66568 +/* amount of blocks in the file system reserved for @uid and @gid */
66569 +long reiser4_reserved_blocks(const struct super_block *super /* super block
66570 + queried */ ,
66571 + uid_t uid /* user id */ ,
66572 + gid_t gid /* group id */ )
66573 +{
66574 + long reserved;
66575 +
66576 + assert("nikita-456", super != NULL);
66577 + assert("nikita-457", is_reiser4_super(super));
66578 +
66579 + reserved = 0;
66580 + if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
66581 + reserved += reserved_for_gid(super, gid);
66582 + if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
66583 + reserved += reserved_for_uid(super, uid);
66584 + if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
66585 + reserved += reserved_for_root(super);
66586 + return reserved;
66587 +}
66588 +
66589 +/* get/set value of/to grabbed blocks counter */
66590 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
66591 +{
66592 + assert("zam-512", super != NULL);
66593 + assert("zam-513", is_reiser4_super(super));
66594 +
66595 + return get_super_private(super)->blocks_grabbed;
66596 +}
66597 +
66598 +__u64 reiser4_flush_reserved(const struct super_block * super)
66599 +{
66600 + assert("vpf-285", super != NULL);
66601 + assert("vpf-286", is_reiser4_super(super));
66602 +
66603 + return get_super_private(super)->blocks_flush_reserved;
66604 +}
66605 +
66606 +/* get/set value of/to counter of fake allocated formatted blocks */
66607 +__u64 reiser4_fake_allocated(const struct super_block * super)
66608 +{
66609 + assert("zam-516", super != NULL);
66610 + assert("zam-517", is_reiser4_super(super));
66611 +
66612 + return get_super_private(super)->blocks_fake_allocated;
66613 +}
66614 +
66615 +/* get/set value of/to counter of fake allocated unformatted blocks */
66616 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
66617 +{
66618 + assert("zam-516", super != NULL);
66619 + assert("zam-517", is_reiser4_super(super));
66620 +
66621 + return get_super_private(super)->blocks_fake_allocated_unformatted;
66622 +}
66623 +
66624 +/* get/set value of/to counter of clustered blocks */
66625 +__u64 reiser4_clustered_blocks(const struct super_block * super)
66626 +{
66627 + assert("edward-601", super != NULL);
66628 + assert("edward-602", is_reiser4_super(super));
66629 +
66630 + return get_super_private(super)->blocks_clustered;
66631 +}
66632 +
66633 +/* space allocator used by this file system */
66634 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
66635 + *super)
66636 +{
66637 + assert("nikita-1965", super != NULL);
66638 + assert("nikita-1966", is_reiser4_super(super));
66639 + return &get_super_private(super)->space_allocator;
66640 +}
66641 +
66642 +/* return fake inode used to bind formatted nodes in the page cache */
66643 +struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
66644 + queried */ )
66645 +{
66646 + assert("nikita-1757", super != NULL);
66647 + return get_super_private(super)->fake;
66648 +}
66649 +
66650 +/* return fake inode used to bind copied on capture nodes in the page cache */
66651 +struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
66652 + queried */ )
66653 +{
66654 + assert("nikita-1757", super != NULL);
66655 + return get_super_private(super)->cc;
66656 +}
66657 +
66658 +/* return fake inode used to bind bitmaps and journal heads */
66659 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
66660 +{
66661 + assert("nikita-17571", super != NULL);
66662 + return get_super_private(super)->bitmap;
66663 +}
66664 +
66665 +/* tree used by this file system */
66666 +reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
66667 + * queried */ )
66668 +{
66669 + assert("nikita-460", super != NULL);
66670 + assert("nikita-461", is_reiser4_super(super));
66671 + return &get_super_private(super)->tree;
66672 +}
66673 +
66674 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
66675 + use in assertions. */
66676 +int is_reiser4_super(const struct super_block *super /* super block
66677 + * queried */ )
66678 +{
66679 + return
66680 + super != NULL &&
66681 + get_super_private(super) != NULL &&
66682 + super->s_op == &(get_super_private(super)->ops.super);
66683 +}
66684 +
66685 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
66686 +{
66687 + return test_bit((int)f, &get_super_private(super)->fs_flags);
66688 +}
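A usage sketch for the flag test above; reiser4_is_set() and
REISER4_NO_WRITE_BARRIER are names from this patch, while the wrapper
function itself is hypothetical:

	/* sketch: the log writer skips write barriers when the
	 * REISER4_NO_WRITE_BARRIER flag was set at mount time */
	static int example_use_write_barrier(struct super_block *super)
	{
		return !reiser4_is_set(super, REISER4_NO_WRITE_BARRIER);
	}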
66689 +
66690 +/* amount of blocks reserved for given group in file system */
66691 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
66692 + * block
66693 + * queried */ ,
66694 + gid_t gid UNUSED_ARG /* group id */ )
66695 +{
66696 + return 0;
66697 +}
66698 +
66699 +/* amount of blocks reserved for given user in file system */
66700 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
66701 + block
66702 + queried */ ,
66703 + uid_t uid UNUSED_ARG /* user id */ )
66704 +{
66705 + return 0;
66706 +}
66707 +
66708 +/* amount of blocks reserved for super user in file system */
66709 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
66710 + block
66711 + queried */ )
66712 +{
66713 + return 0;
66714 +}
66715 +
66716 +/*
66717 + * true if block number @blk makes sense for the file system at @super.
66718 + */
66719 +int
66720 +reiser4_blocknr_is_sane_for(const struct super_block *super,
66721 + const reiser4_block_nr * blk)
66722 +{
66723 + reiser4_super_info_data *sbinfo;
66724 +
66725 + assert("nikita-2957", super != NULL);
66726 + assert("nikita-2958", blk != NULL);
66727 +
66728 + if (reiser4_blocknr_is_fake(blk))
66729 + return 1;
66730 +
66731 + sbinfo = get_super_private(super);
66732 + return *blk < sbinfo->block_count;
66733 +}
66734 +
66735 +#if REISER4_DEBUG
66736 +/*
66737 + * true, if block number @blk makes sense for the current file system
66738 + */
66739 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
66740 +{
66741 + return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
66742 +}
66743 +#endif /* REISER4_DEBUG */
66744 +
66745 +/* Make Linus happy.
66746 + Local variables:
66747 + c-indentation-style: "K&R"
66748 + mode-name: "LC"
66749 + c-basic-offset: 8
66750 + tab-width: 8
66751 + fill-column: 120
66752 + End:
66753 +*/
66754 diff -urN linux-2.6.22.orig/fs/reiser4/super.h linux-2.6.22/fs/reiser4/super.h
66755 --- linux-2.6.22.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
66756 +++ linux-2.6.22/fs/reiser4/super.h 2007-07-29 00:25:35.020733749 +0400
66757 @@ -0,0 +1,464 @@
66758 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66759 + * reiser4/README */
66760 +
66761 +/* Super-block functions. See super.c for details. */
66762 +
66763 +#if !defined( __REISER4_SUPER_H__ )
66764 +#define __REISER4_SUPER_H__
66765 +
66766 +#include "tree.h"
66767 +#include "entd.h"
66768 +#include "wander.h"
66769 +#include "fsdata.h"
66770 +#include "plugin/object.h"
66771 +#include "plugin/space/space_allocator.h"
66772 +
66773 +/*
66774 + * Flush algorithm parameters.
66775 + */
66776 +struct flush_params {
66777 + unsigned relocate_threshold;
66778 + unsigned relocate_distance;
66779 + unsigned written_threshold;
66780 + unsigned scan_maxnodes;
66781 +};
66782 +
66783 +typedef enum {
66784 + /*
66785 + * True if this file system doesn't support hard-links (multiple names)
66786 + * for directories: this is default UNIX behavior.
66787 + *
66788 + * If hard-links on directories are not allowed, the file system is an
66789 + * Acyclic Directed Graph (modulo dot and dotdot, of course).
66790 + *
66791 + * This is used by reiser4_link().
66792 + */
66793 + REISER4_ADG = 0,
66794 + /*
66795 + * set if all nodes in internal tree have the same node layout plugin.
66796 + * If so, znode_guess_plugin() will return tree->node_plugin instead
66797 + * of guessing the plugin by the plugin id stored in the node.
66798 + */
66799 + REISER4_ONE_NODE_PLUGIN = 1,
66800 + /* if set, bsd gid assignment is supported. */
66801 + REISER4_BSD_GID = 2,
66802 + /* [mac]_time are 32 bit in inode */
66803 + REISER4_32_BIT_TIMES = 3,
66804 + /* don't load all bitmap blocks at mount time */
66805 + REISER4_DONT_LOAD_BITMAP = 5,
66806 + /* enforce atomicity during write(2) */
66807 + REISER4_ATOMIC_WRITE = 6,
66808 + /* don't use write barriers in the log writer code. */
66809 + REISER4_NO_WRITE_BARRIER = 7
66810 +} reiser4_fs_flag;
66811 +
66812 +/*
66813 + * VFS related operation vectors.
66814 + */
66815 +struct object_ops {
66816 + struct super_operations super;
66817 + struct dentry_operations dentry;
66818 + struct export_operations export;
66819 +};
66820 +
66821 +/* reiser4-specific part of super block
66822 +
66823 + Locking
66824 +
66825 + Fields immutable after mount:
66826 +
66827 + ->oid*
66828 + ->space*
66829 + ->default_[ug]id
66830 + ->mkfs_id
66831 + ->trace_flags
66832 + ->debug_flags
66833 + ->fs_flags
66834 + ->df_plug
66835 + ->optimal_io_size
66836 + ->plug
66837 + ->flush
66838 + ->u (bad name)
66839 + ->txnmgr
66840 + ->ra_params
66841 + ->fsuid
66842 + ->journal_header
66843 + ->journal_footer
66844 +
66845 + Fields protected by ->lnode_guard
66846 +
66847 + ->lnode_htable
66848 +
66849 + Fields protected by per-super block spin lock
66850 +
66851 + ->block_count
66852 + ->blocks_used
66853 + ->blocks_free
66854 + ->blocks_free_committed
66855 + ->blocks_grabbed
66856 + ->blocks_fake_allocated_unformatted
66857 + ->blocks_fake_allocated
66858 + ->blocks_flush_reserved
66859 + ->eflushed
66860 + ->blocknr_hint_default
66861 +
66862 + After journal replay during mount,
66863 +
66864 + ->last_committed_tx
66865 +
66866 + is protected by ->tmgr.commit_mutex
66867 +
66868 + Invariants involving this data-type:
66869 +
66870 + [sb-block-counts]
66871 + [sb-grabbed]
66872 + [sb-fake-allocated]
66873 +*/
66874 +struct reiser4_super_info_data {
66875 + /*
66876 + * guard spinlock which protects reiser4 super block fields (currently
66877 + * blocks_free, blocks_free_committed)
66878 + */
66879 + spinlock_t guard;
66880 +
66881 + /* next oid that will be returned by oid_allocate() */
66882 + oid_t next_to_use;
66883 + /* total number of used oids */
66884 + oid_t oids_in_use;
66885 +
66886 + /* space manager plugin */
66887 + reiser4_space_allocator space_allocator;
66888 +
66889 + /* reiser4 internal tree */
66890 + reiser4_tree tree;
66891 +
66892 + /*
66893 + * default user id used for light-weight files without their own
66894 + * stat-data.
66895 + */
66896 + uid_t default_uid;
66897 +
66898 + /*
66899 + * default group id used for light-weight files without their own
66900 + * stat-data.
66901 + */
66902 + gid_t default_gid;
66903 +
66904 + /* mkfs identifier generated at mkfs time. */
66905 + __u32 mkfs_id;
66906 + /* amount of blocks in a file system */
66907 + __u64 block_count;
66908 +
66909 + /* inviolable reserve */
66910 + __u64 blocks_reserved;
66911 +
66912 + /* amount of blocks used by file system data and meta-data. */
66913 + __u64 blocks_used;
66914 +
66915 + /*
66916 + * amount of free blocks. This is the "working" free blocks counter. It
66917 + * is like the "working" bitmap; see block_alloc.c for a description.
66918 + */
66919 + __u64 blocks_free;
66920 +
66921 + /*
66922 + * free block count for fs committed state. This is "commit" version of
66923 + * free block counter.
66924 + */
66925 + __u64 blocks_free_committed;
66926 +
66927 + /*
66928 + * number of blocks reserved for further allocation, for all
66929 + * threads.
66930 + */
66931 + __u64 blocks_grabbed;
66932 +
66933 + /* number of fake allocated unformatted blocks in tree. */
66934 + __u64 blocks_fake_allocated_unformatted;
66935 +
66936 + /* number of fake allocated formatted blocks in tree. */
66937 + __u64 blocks_fake_allocated;
66938 +
66939 + /* number of blocks reserved for flush operations. */
66940 + __u64 blocks_flush_reserved;
66941 +
66942 + /* number of blocks reserved for cluster operations. */
66943 + __u64 blocks_clustered;
66944 +
66945 + /* unique file-system identifier */
66946 + __u32 fsuid;
66947 +
66948 + /* On-disk format version. If it does not equal the disk_format
66949 + plugin version, some format updates (e.g. enlarging the plugin
66950 + set, etc.) may take place on mount. */
66951 + int version;
66952 +
66953 + /* file-system wide flags. See reiser4_fs_flag enum */
66954 + unsigned long fs_flags;
66955 +
66956 + /* transaction manager */
66957 + txn_mgr tmgr;
66958 +
66959 + /* ent thread */
66960 + entd_context entd;
66961 +
66962 + /* fake inode used to bind formatted nodes */
66963 + struct inode *fake;
66964 + /* inode used to bind bitmaps (and journal heads) */
66965 + struct inode *bitmap;
66966 + /* inode used to bind copied on capture nodes */
66967 + struct inode *cc;
66968 +
66969 + /* disk layout plugin */
66970 + disk_format_plugin *df_plug;
66971 +
66972 + /* disk layout specific part of reiser4 super info data */
66973 + union {
66974 + format40_super_info format40;
66975 + } u;
66976 +
66977 + /* value we return in st_blksize on stat(2) */
66978 + unsigned long optimal_io_size;
66979 +
66980 + /* parameters for the flush algorithm */
66981 + struct flush_params flush;
66982 +
66983 + /* pointers to jnodes for journal header and footer */
66984 + jnode *journal_header;
66985 + jnode *journal_footer;
66986 +
66987 + journal_location jloc;
66988 +
66989 + /* head block number of last committed transaction */
66990 + __u64 last_committed_tx;
66991 +
66992 + /*
66993 + * we remember the last written location to use as a hint for new
66994 + * block allocation
66995 + */
66996 + __u64 blocknr_hint_default;
66997 +
66998 + /* committed number of files (oid allocator state variable) */
66999 + __u64 nr_files_committed;
67000 +
67001 + struct formatted_ra_params ra_params;
67002 +
67003 + /*
67004 + * A mutex for serializing the cut-tree operation when out of free
67005 + * space: only one cut_tree thread is allowed to grab space from the
67006 + * reserved area (5% of disk space)
67007 + */
67008 + struct mutex delete_mutex;
67009 + /* task owning ->delete_mutex */
67010 + struct task_struct *delete_mutex_owner;
67011 +
67012 + /* Diskmap's block number */
67013 + __u64 diskmap_block;
67014 +
67015 + /* What to do in case of error */
67016 + int onerror;
67017 +
67018 + /* operations for objects on this file system */
67019 + struct object_ops ops;
67020 +
67021 + /*
67022 + * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67023 + * more details
67024 + */
67025 + struct d_cursor_info d_info;
67026 +
67027 +#ifdef CONFIG_REISER4_BADBLOCKS
67028 + /* Alternative master superblock offset (in bytes) */
67029 + unsigned long altsuper;
67030 +#endif
67031 + struct repacker *repacker;
67032 + struct page *status_page;
67033 + struct bio *status_bio;
67034 +
67035 +#if REISER4_DEBUG
67036 + /*
67037 + * minimum used blocks value (includes super blocks, bitmap blocks and
67038 + * other fs reserved areas), depends on fs format and fs size.
67039 + */
67040 + __u64 min_blocks_used;
67041 +
67042 + /*
67043 + * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67044 + * are kept on a list anchored at sbinfo->all_jnodes. This list is
67045 + * protected by sbinfo->all_guard spin lock. This lock should be taken
67046 + * with _irq modifier, because it is also modified from interrupt
67047 + * contexts (by RCU).
67048 + */
67049 + spinlock_t all_guard;
67050 + /* list of all jnodes */
67051 + struct list_head all_jnodes;
67052 +#endif
67053 + struct dentry *debugfs_root;
67054 +};
67055 +
67056 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
67057 + super_block *super);
67058 +
67059 +/* Return reiser4-specific part of super block */
67060 +static inline reiser4_super_info_data *get_super_private(const struct
67061 + super_block *super)
67062 +{
67063 + assert("nikita-447", super != NULL);
67064 +
67065 + return (reiser4_super_info_data *) super->s_fs_info;
67066 +}
67067 +
67068 +/* get ent context for the @super */
67069 +static inline entd_context *get_entd_context(struct super_block *super)
67070 +{
67071 + return &get_super_private(super)->entd;
67072 +}
67073 +
67074 +/* "Current" super-block: main super block used during current system
67075 + call. Reference to this super block is stored in reiser4_context. */
67076 +static inline struct super_block *reiser4_get_current_sb(void)
67077 +{
67078 + return get_current_context()->super;
67079 +}
67080 +
67081 +/* Reiser4-specific part of "current" super-block: main super block used
67082 + during current system call. Reference to this super block is stored in
67083 + reiser4_context. */
67084 +static inline reiser4_super_info_data *get_current_super_private(void)
67085 +{
67086 + return get_super_private(reiser4_get_current_sb());
67087 +}
67088 +
67089 +static inline struct formatted_ra_params *get_current_super_ra_params(void)
67090 +{
67091 + return &(get_current_super_private()->ra_params);
67092 +}
67093 +
67094 +/*
67095 + * true, if file system on @super is read-only
67096 + */
67097 +static inline int rofs_super(struct super_block *super)
67098 +{
67099 + return super->s_flags & MS_RDONLY;
67100 +}
67101 +
67102 +/*
67103 + * true, if @tree represents read-only file system
67104 + */
67105 +static inline int rofs_tree(reiser4_tree * tree)
67106 +{
67107 + return rofs_super(tree->super);
67108 +}
67109 +
67110 +/*
67111 + * true, if file system where @inode lives on, is read-only
67112 + */
67113 +static inline int rofs_inode(struct inode *inode)
67114 +{
67115 + return rofs_super(inode->i_sb);
67116 +}
67117 +
67118 +/*
67119 + * true, if file system where @node lives on, is read-only
67120 + */
67121 +static inline int rofs_jnode(jnode * node)
67122 +{
67123 + return rofs_tree(jnode_get_tree(node));
67124 +}
67125 +
67126 +extern __u64 reiser4_current_block_count(void);
67127 +
67128 +extern void build_object_ops(struct super_block *super, struct object_ops * ops);
67129 +
67130 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67131 +
67132 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67133 +{
67134 + spin_lock(&(sbinfo->guard));
67135 +}
67136 +
67137 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67138 +{
67139 + assert_spin_locked(&(sbinfo->guard));
67140 + spin_unlock(&(sbinfo->guard));
67141 +}
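The locking comment earlier in this header lists the counters guarded by the
per-super-block spinlock, and the two helpers above are the intended way to
take it. A sketch of the pattern, using only names defined in this header
(the function itself is hypothetical):

	/* sketch: move @nr blocks from the free counter to the used
	 * counter under the per-super spinlock */
	static inline void example_consume_blocks(struct super_block *super,
						  __u64 nr)
	{
		reiser4_super_info_data *sbinfo = get_super_private(super);

		spin_lock_reiser4_super(sbinfo);
		sbinfo->blocks_free -= nr;
		sbinfo->blocks_used += nr;
		spin_unlock_reiser4_super(sbinfo);
	}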
67142 +
67143 +extern __u64 reiser4_flush_reserved(const struct super_block *);
67144 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67145 +extern long reiser4_statfs_type(const struct super_block *super);
67146 +extern __u64 reiser4_block_count(const struct super_block *super);
67147 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67148 +extern __u64 reiser4_data_blocks(const struct super_block *super);
67149 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67150 +extern __u64 reiser4_free_blocks(const struct super_block *super);
67151 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67152 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
67153 +
67154 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67155 +
67156 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67157 +extern __u64 reiser4_fake_allocated(const struct super_block *);
67158 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67159 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
67160 +
67161 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67162 + gid_t gid);
67163 +
67164 +extern reiser4_space_allocator *
67165 +reiser4_get_space_allocator(const struct super_block *super);
67166 +extern reiser4_oid_allocator *
67167 +reiser4_get_oid_allocator(const struct super_block *super);
67168 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
67169 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
67170 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
67171 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
67172 +extern int is_reiser4_super(const struct super_block *super);
67173 +
67174 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67175 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67176 + const reiser4_block_nr * blk);
67177 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67178 +extern int reiser4_done_super(struct super_block *s);
67179 +
67180 +/* steps of fill_super */
67181 +extern int reiser4_init_fs_info(struct super_block *);
67182 +extern void reiser4_done_fs_info(struct super_block *);
67183 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
67184 +extern int reiser4_init_read_super(struct super_block *, int silent);
67185 +extern int reiser4_init_root_inode(struct super_block *);
67186 +extern reiser4_plugin *get_default_plugin(pset_member memb);
67187 +
67188 +/* Maximal possible object id. */
67189 +#define ABSOLUTE_MAX_OID ((oid_t)~0)
67190 +
67191 +#define OIDS_RESERVED ( 1 << 16 )
67192 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67193 +oid_t oid_allocate(struct super_block *);
67194 +int oid_release(struct super_block *, oid_t);
67195 +oid_t oid_next(const struct super_block *);
67196 +void oid_count_allocated(void);
67197 +void oid_count_released(void);
67198 +long oids_used(const struct super_block *);
67199 +
67200 +#if REISER4_DEBUG
67201 +void print_fs_info(const char *prefix, const struct super_block *);
67202 +#endif
67203 +
67204 +extern void destroy_reiser4_cache(struct kmem_cache **);
67205 +
67206 +extern struct super_operations reiser4_super_operations;
67207 +extern struct export_operations reiser4_export_operations;
67208 +extern struct dentry_operations reiser4_dentry_operations;
67209 +
67210 +/* __REISER4_SUPER_H__ */
67211 +#endif
67212 +
67213 +/*
67214 + * Local variables:
67215 + * c-indentation-style: "K&R"
67216 + * mode-name: "LC"
67217 + * c-basic-offset: 8
67218 + * tab-width: 8
67219 + * fill-column: 120
67220 + * End:
67221 + */
67222 diff -urN linux-2.6.22.orig/fs/reiser4/super_ops.c linux-2.6.22/fs/reiser4/super_ops.c
67223 --- linux-2.6.22.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
67224 +++ linux-2.6.22/fs/reiser4/super_ops.c 2007-07-29 00:25:35.020733749 +0400
67225 @@ -0,0 +1,725 @@
67226 +/* Copyright 2005 by Hans Reiser, licensing governed by
67227 + * reiser4/README */
67228 +
67229 +#include "inode.h"
67230 +#include "page_cache.h"
67231 +#include "ktxnmgrd.h"
67232 +#include "flush.h"
67233 +#include "safe_link.h"
67234 +
67235 +#include <linux/vfs.h>
67236 +#include <linux/writeback.h>
67237 +#include <linux/mount.h>
67238 +#include <linux/seq_file.h>
67239 +#include <linux/debugfs.h>
67240 +
67241 +/* slab cache for inodes */
67242 +static struct kmem_cache *inode_cache;
67243 +
67244 +static struct dentry *reiser4_debugfs_root = NULL;
67245 +
67246 +/**
67247 + * init_once - constructor for reiser4 inodes
67248 + * @obj: inode to be initialized
67249 + * @cache: cache @obj belongs to
67250 + * @flags: SLAB flags
67251 + *
67252 + * Initialization function to be called when a new page is allocated by the
67253 + * reiser4 inode cache. It is set on inode cache creation.
67254 + */
67255 +static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
67256 +{
67257 + struct reiser4_inode_object *info;
67258 +
67259 + info = obj;
67260 +
67261 + /* initialize vfs inode */
67262 + inode_init_once(&info->vfs_inode);
67263 +
67264 + /*
67265 + * initialize reiser4 specific part of inode.
67266 + * NOTE-NIKITA add here initializations for locks, list heads,
67267 + * etc. that will be added to our private inode part.
67268 + */
67269 + INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67270 + init_rwsem(&info->p.conv_sem);
67271 + /* init semaphore which is used during inode loading */
67272 + loading_init_once(&info->p);
67273 + INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67274 + GFP_ATOMIC);
67275 +#if REISER4_DEBUG
67276 + info->p.nr_jnodes = 0;
67277 +#endif
67278 +}
67279 +
67280 +/**
67281 + * init_inodes - create inode cache
67282 + *
67283 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
67284 + */
67285 +static int init_inodes(void)
67286 +{
67287 + inode_cache = kmem_cache_create("reiser4_inode",
67288 + sizeof(struct reiser4_inode_object),
67289 + 0,
67290 + SLAB_HWCACHE_ALIGN |
67291 + SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67292 + if (inode_cache == NULL)
67293 + return RETERR(-ENOMEM);
67294 + return 0;
67295 +}
67296 +
67297 +/**
67298 + * done_inodes - delete inode cache
67299 + *
67300 + * This is called on reiser4 module unloading or system shutdown.
67301 + */
67302 +static void done_inodes(void)
67303 +{
67304 + destroy_reiser4_cache(&inode_cache);
67305 +}
67306 +
67307 +/**
67308 + * reiser4_alloc_inode - alloc_inode of super operations
67309 + * @super: super block new inode is allocated for
67310 + *
67311 + * Allocates new inode, initializes reiser4 specific part of it.
67312 + */
67313 +static struct inode *reiser4_alloc_inode(struct super_block *super)
67314 +{
67315 + struct reiser4_inode_object *obj;
67316 +
67317 + assert("nikita-1696", super != NULL);
67318 + obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
67319 + if (obj != NULL) {
67320 + reiser4_inode *info;
67321 +
67322 + info = &obj->p;
67323 +
67324 + info->pset = plugin_set_get_empty();
67325 + info->hset = plugin_set_get_empty();
67326 + info->extmask = 0;
67327 + info->locality_id = 0ull;
67328 + info->plugin_mask = 0;
67329 + info->heir_mask = 0;
67330 +#if !REISER4_INO_IS_OID
67331 + info->oid_hi = 0;
67332 +#endif
67333 + reiser4_seal_init(&info->sd_seal, NULL, NULL);
67334 + coord_init_invalid(&info->sd_coord, NULL);
67335 + info->flags = 0;
67336 + spin_lock_init(&info->guard);
67337 + /* this deals with info's loading semaphore */
67338 + loading_alloc(info);
67339 + info->vroot = UBER_TREE_ADDR;
67340 + return &obj->vfs_inode;
67341 + } else
67342 + return NULL;
67343 +}
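reiser4_alloc_inode() hands the VFS a pointer to the embedded vfs_inode, and
reiser4_destroy_inode() below recovers the containing object with
container_of(). A minimal sketch of that embedding idiom (the helper name is
hypothetical; struct reiser4_inode_object is defined in inode.h):

	/* sketch: recover the reiser4 object from the VFS inode pointer */
	static struct reiser4_inode_object *to_reiser4_object(struct inode *inode)
	{
		return container_of(inode, struct reiser4_inode_object,
				    vfs_inode);
	}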
67344 +
67345 +/**
67346 + * reiser4_destroy_inode - destroy_inode of super operations
67347 + * @inode: inode being destroyed
67348 + *
67349 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
67350 + */
67351 +static void reiser4_destroy_inode(struct inode *inode)
67352 +{
67353 + reiser4_inode *info;
67354 +
67355 + info = reiser4_inode_data(inode);
67356 +
67357 + assert("vs-1220", inode_has_no_jnodes(info));
67358 +
67359 + if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67360 + file_plugin *fplug = inode_file_plugin(inode);
67361 + if (fplug->destroy_inode != NULL)
67362 + fplug->destroy_inode(inode);
67363 + }
67364 + reiser4_dispose_cursors(inode);
67365 + if (info->pset)
67366 + plugin_set_put(info->pset);
67367 + if (info->hset)
67368 + plugin_set_put(info->hset);
67369 +
67370 + /*
67371 + * cannot add a similar assertion about ->i_list, as prune_icache
67372 + * returns inodes into the slab with dangling ->list.{next,prev}. This
67373 + * is safe, because they are re-initialized in new_inode().
67374 + */
67375 + assert("nikita-2895", list_empty(&inode->i_dentry));
67376 + assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67377 + assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67378 +
67379 + /* this deals with info's loading semaphore */
67380 + loading_destroy(info);
67381 +
67382 + kmem_cache_free(inode_cache,
67383 + container_of(info, struct reiser4_inode_object, p));
67384 +}
67385 +
67386 +/**
67387 + * reiser4_dirty_inode - dirty_inode of super operations
67388 + * @inode: inode being dirtied
67389 + *
67390 + * Updates stat data.
67391 + */
67392 +static void reiser4_dirty_inode(struct inode *inode)
67393 +{
67394 + int result;
67395 +
67396 + if (!is_in_reiser4_context())
67397 + return;
67398 + assert("", !IS_RDONLY(inode));
67399 + assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
67400 + get_current_context()->grabbed_blocks));
67401 +
67402 + result = reiser4_update_sd(inode);
67403 + if (result)
67404 + warning("", "failed to dirty inode for %llu: %d",
67405 + get_inode_oid(inode), result);
67406 +}
67407 +
67408 +/**
67409 + * reiser4_delete_inode - delete_inode of super operations
67410 + * @inode: inode to delete
67411 + *
67412 + * Calls file plugin's delete_object method to delete object items from
67413 + * filesystem tree and calls clear_inode.
67414 + */
67415 +static void reiser4_delete_inode(struct inode *inode)
67416 +{
67417 + reiser4_context *ctx;
67418 + file_plugin *fplug;
67419 +
67420 + ctx = reiser4_init_context(inode->i_sb);
67421 + if (IS_ERR(ctx)) {
67422 + warning("vs-15", "failed to init context");
67423 + return;
67424 + }
67425 +
67426 + if (is_inode_loaded(inode)) {
67427 + fplug = inode_file_plugin(inode);
67428 + if (fplug != NULL && fplug->delete_object != NULL)
67429 + fplug->delete_object(inode);
67430 + }
67431 +
67432 + truncate_inode_pages(&inode->i_data, 0);
67433 + inode->i_blocks = 0;
67434 + clear_inode(inode);
67435 + reiser4_exit_context(ctx);
67436 +}
67437 +
67438 +/**
67439 + * reiser4_put_super - put_super of super operations
67440 + * @super: super block to free
67441 + *
67442 + * Stops daemons and releases resources; in short, unmounts.
67443 + */
67444 +static void reiser4_put_super(struct super_block *super)
67445 +{
67446 + reiser4_super_info_data *sbinfo;
67447 + reiser4_context *ctx;
67448 +
67449 + sbinfo = get_super_private(super);
67450 + assert("vs-1699", sbinfo);
67451 +
67452 + debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
67453 + debugfs_remove(sbinfo->tmgr.debugfs_id_count);
67454 + debugfs_remove(sbinfo->debugfs_root);
67455 +
67456 + ctx = reiser4_init_context(super);
67457 + if (IS_ERR(ctx)) {
67458 + warning("vs-17", "failed to init context");
67459 + return;
67460 + }
67461 +
67462 + /* have disk format plugin to free its resources */
67463 + if (get_super_private(super)->df_plug->release)
67464 + get_super_private(super)->df_plug->release(super);
67465 +
67466 + reiser4_done_formatted_fake(super);
67467 +
67468 + /* stop daemons: ktxnmgr and entd */
67469 + reiser4_done_entd(super);
67470 + reiser4_done_ktxnmgrd(super);
67471 + reiser4_done_txnmgr(&sbinfo->tmgr);
67472 +
67473 + reiser4_done_fs_info(super);
67474 + reiser4_exit_context(ctx);
67475 +}
67476 +
67477 +/**
67478 + * reiser4_write_super - write_super of super operations
67479 + * @super: super block to write
67480 + *
67481 + * Captures znode associated with super block, commits all transactions.
67482 + */
67483 +static void reiser4_write_super(struct super_block *super)
67484 +{
67485 + int ret;
67486 + reiser4_context *ctx;
67487 +
67488 + assert("vs-1700", !rofs_super(super));
67489 +
67490 + ctx = reiser4_init_context(super);
67491 + if (IS_ERR(ctx)) {
67492 + warning("vs-16", "failed to init context");
67493 + return;
67494 + }
67495 +
67496 + ret = reiser4_capture_super_block(super);
67497 + if (ret != 0)
67498 + warning("vs-1701",
67499 + "reiser4_capture_super_block failed in write_super: %d",
67500 + ret);
67501 + ret = txnmgr_force_commit_all(super, 0);
67502 + if (ret != 0)
67503 + warning("jmacd-77113",
67504 + "txn_force failed in write_super: %d", ret);
67505 +
67506 + super->s_dirt = 0;
67507 +
67508 + reiser4_exit_context(ctx);
67509 +}
67510 +
67511 +/**
67512 + * reiser4_statfs - statfs of super operations
67513 + * @super: super block of queried file system
67514 + * @statfs: buffer to fill with statistics
67515 + *
67516 + * Returns information about filesystem.
67517 + */
67518 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
67519 +{
67520 + sector_t total;
67521 + sector_t reserved;
67522 + sector_t free;
67523 + sector_t forroot;
67524 + sector_t deleted;
67525 + reiser4_context *ctx;
67526 + struct super_block *super = dentry->d_sb;
67527 +
67528 + assert("nikita-408", super != NULL);
67529 + assert("nikita-409", statfs != NULL);
67530 +
67531 + ctx = reiser4_init_context(super);
67532 + if (IS_ERR(ctx))
67533 + return PTR_ERR(ctx);
67534 +
67535 + statfs->f_type = reiser4_statfs_type(super);
67536 + statfs->f_bsize = super->s_blocksize;
67537 +
67538 + /*
67539 + * 5% of total block space is reserved. This is needed for flush and
67540 + * for truncates (so that we are able to perform truncate/unlink even
67541 + * on the otherwise completely full file system). If this reservation
67542 + * is hidden from statfs(2), users will mistakenly guess that they
67543 + * have enough free space to complete some operation, which is
67544 + * frustrating.
67545 + *
67546 + * Another possible solution is to subtract ->blocks_reserved from
67547 + * ->f_bfree, but changing available space seems less intrusive than
67548 + * letting the user see 5% of disk space used directly after
67549 + * mkfs.
67550 + */
67551 + total = reiser4_block_count(super);
67552 + reserved = get_super_private(super)->blocks_reserved;
67553 + deleted = txnmgr_count_deleted_blocks();
67554 + free = reiser4_free_blocks(super) + deleted;
67555 + forroot = reiser4_reserved_blocks(super, 0, 0);
67556 +
67557 + /*
67558 + * These counters may be in inconsistent state because we take the
67559 + * values without keeping any global spinlock. Here we do a sanity
67560 + * check that free block counter does not exceed the number of all
67561 + * blocks.
67562 + */
67563 + if (free > total)
67564 + free = total;
67565 + statfs->f_blocks = total - reserved;
67566 + /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
67567 + if (free > reserved)
67568 + free -= reserved;
67569 + else
67570 + free = 0;
67571 + statfs->f_bfree = free;
67572 +
67573 + if (free > forroot)
67574 + free -= forroot;
67575 + else
67576 + free = 0;
67577 + statfs->f_bavail = free;
67578 +
67579 + statfs->f_files = 0;
67580 + statfs->f_ffree = 0;
67581 +
67582 + /* maximal acceptable name length depends on directory plugin. */
67583 + assert("nikita-3351", super->s_root->d_inode != NULL);
67584 + statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
67585 + reiser4_exit_context(ctx);
67586 + return 0;
67587 +}
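A worked example of the accounting above, with made-up numbers: on a
1,000,000-block filesystem with 50,000 reserved blocks, 300,000 free blocks,
10,000 deleted-but-not-yet-committed blocks and 100 root-reserved blocks,
free starts at 310,000 and statfs reports f_blocks = 950,000,
f_bfree = 260,000 and f_bavail = 259,900.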
67588 +
67589 +/**
67590 + * reiser4_clear_inode - clear_inode of super operation
67591 + * @inode: inode about to destroy
67592 + *
67593 + * Does sanity checks: an inode being destroyed should have all jnodes detached.
67594 + */
67595 +static void reiser4_clear_inode(struct inode *inode)
67596 +{
67597 +#if REISER4_DEBUG
67598 + reiser4_inode *r4_inode;
67599 +
67600 + r4_inode = reiser4_inode_data(inode);
67601 + if (!inode_has_no_jnodes(r4_inode))
67602 + warning("vs-1732", "reiser4 inode has %ld jnodes\n",
67603 + r4_inode->nr_jnodes);
67604 +#endif
67605 +}
67606 +
67607 +/**
67608 + * reiser4_sync_inodes - sync_inodes of super operations
67609 + * @super:
67610 + * @wbc:
67611 + *
67612 + * This method is called by background and non-background writeback. Reiser4's
67613 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
67614 + * each dirty inode. Reiser4_writepages handles pages dirtied via shared
67615 + * mapping - dirty pages get into atoms. Writeout is called to flush some
67616 + * atoms.
67617 + */
67618 +static void reiser4_sync_inodes(struct super_block *super,
67619 + struct writeback_control *wbc)
67620 +{
67621 + reiser4_context *ctx;
67622 + long to_write;
67623 +
67624 + if (wbc->for_kupdate)
67625 + /* reiser4 has its own means of periodic write-out */
67626 + return;
67627 +
67628 + to_write = wbc->nr_to_write;
67629 + assert("vs-49", wbc->older_than_this == NULL);
67630 +
67631 + ctx = reiser4_init_context(super);
67632 + if (IS_ERR(ctx)) {
67633 + warning("vs-13", "failed to init context");
67634 + return;
67635 + }
67636 +
67637 + /*
67638 + * call reiser4_writepages for each dirty inode to attach dirty pages
67639 + * to transactions if they are not attached yet.
67640 + */
67641 + generic_sync_sb_inodes(super, wbc);
67642 +
67643 + /* flush goes here */
67644 + wbc->nr_to_write = to_write;
67645 + reiser4_writeout(super, wbc);
67646 +
67647 + /* avoid recursive calls to ->sync_inodes */
67648 + context_set_commit_async(ctx);
67649 + reiser4_exit_context(ctx);
67650 +}
67651 +
67652 +/**
67653 + * reiser4_show_options - show_options of super operations
67654 + * @m: file where to write information
67655 + * @mnt: mount structure
67656 + *
67657 + * Makes reiser4 mount options visible in /proc/mounts.
67658 + */
67659 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
67660 +{
67661 + struct super_block *super;
67662 + reiser4_super_info_data *sbinfo;
67663 +
67664 + super = mnt->mnt_sb;
67665 + sbinfo = get_super_private(super);
67666 +
67667 + seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
67668 + seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
67669 + seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
67670 + seq_printf(m, ",atom_max_flushers=0x%x",
67671 + sbinfo->tmgr.atom_max_flushers);
67672 + seq_printf(m, ",cbk_cache_slots=0x%x",
67673 + sbinfo->tree.cbk_cache.nr_slots);
67674 +
67675 + return 0;
67676 +}
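Given the seq_printf calls above, the reiser4-specific tail of a /proc/mounts
entry has this shape (the hexadecimal values are illustrative only):

	,atom_max_size=0xf1f6,atom_max_age=0x258,atom_min_size=0x100,atom_max_flushers=0x0,cbk_cache_slots=0x10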
67677 +
67678 +struct super_operations reiser4_super_operations = {
67679 + .alloc_inode = reiser4_alloc_inode,
67680 + .destroy_inode = reiser4_destroy_inode,
67681 + .dirty_inode = reiser4_dirty_inode,
67682 + .delete_inode = reiser4_delete_inode,
67683 + .put_super = reiser4_put_super,
67684 + .write_super = reiser4_write_super,
67685 + .statfs = reiser4_statfs,
67686 + .clear_inode = reiser4_clear_inode,
67687 + .sync_inodes = reiser4_sync_inodes,
67688 + .show_options = reiser4_show_options
67689 +};
67690 +
67691 +/**
67692 + * fill_super - initialize super block on mount
67693 + * @super: super block to fill
67694 + * @data: reiser4 specific mount option
67695 + * @silent:
67696 + *
67697 + * This is to be called by reiser4_get_sb. Mounts filesystem.
67698 + */
67699 +static int fill_super(struct super_block *super, void *data, int silent)
67700 +{
67701 + reiser4_context ctx;
67702 + int result;
67703 + reiser4_super_info_data *sbinfo;
67704 +
67705 + assert("zam-989", super != NULL);
67706 +
67707 + super->s_op = NULL;
67708 + init_stack_context(&ctx, super);
67709 +
67710 + /* allocate reiser4 specific super block */
67711 + if ((result = reiser4_init_fs_info(super)) != 0)
67712 + goto failed_init_sinfo;
67713 +
67714 + sbinfo = get_super_private(super);
67715 + /* initialize various reiser4 parameters, parse mount options */
67716 + if ((result = reiser4_init_super_data(super, data)) != 0)
67717 + goto failed_init_super_data;
67718 +
67719 + /* read reiser4 master super block, initialize disk format plugin */
67720 + if ((result = reiser4_init_read_super(super, silent)) != 0)
67721 + goto failed_init_read_super;
67722 +
67723 + /* initialize transaction manager */
67724 + reiser4_init_txnmgr(&sbinfo->tmgr);
67725 +
67726 + /* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
67727 + if ((result = reiser4_init_ktxnmgrd(super)) != 0)
67728 + goto failed_init_ktxnmgrd;
67729 +
67730 + /* initialize entd context and start kernel thread entd */
67731 + if ((result = reiser4_init_entd(super)) != 0)
67732 + goto failed_init_entd;
67733 +
67734 + /* initialize address spaces for formatted nodes and bitmaps */
67735 + if ((result = reiser4_init_formatted_fake(super)) != 0)
67736 + goto failed_init_formatted_fake;
67737 +
67738 + /* initialize disk format plugin */
67739 + if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
67740 + goto failed_init_disk_format;
67741 +
67742 + /*
67743 + * There are some 'committed' versions of reiser4 super block counters,
67744 + * which correspond to reiser4 on-disk state. These counters are
67745 + * initialized here
67746 + */
67747 + sbinfo->blocks_free_committed = sbinfo->blocks_free;
67748 + sbinfo->nr_files_committed = oids_used(super);
67749 +
67750 + /* get inode of root directory */
67751 + if ((result = reiser4_init_root_inode(super)) != 0)
67752 + goto failed_init_root_inode;
67753 +
67754 + if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
67755 + goto failed_update_format_version;
67756 +
67757 + process_safelinks(super);
67758 + reiser4_exit_context(&ctx);
67759 +
67760 + sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
67761 + reiser4_debugfs_root);
67762 + if (sbinfo->debugfs_root) {
67763 + sbinfo->tmgr.debugfs_atom_count =
67764 + debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
67765 + sbinfo->debugfs_root,
67766 + &sbinfo->tmgr.atom_count);
67767 + sbinfo->tmgr.debugfs_id_count =
67768 + debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
67769 + sbinfo->debugfs_root,
67770 + &sbinfo->tmgr.id_count);
67771 + }
67772 + return 0;
67773 +
67774 + failed_update_format_version:
67775 + failed_init_root_inode:
67776 + if (sbinfo->df_plug->release)
67777 + sbinfo->df_plug->release(super);
67778 + failed_init_disk_format:
67779 + reiser4_done_formatted_fake(super);
67780 + failed_init_formatted_fake:
67781 + reiser4_done_entd(super);
67782 + failed_init_entd:
67783 + reiser4_done_ktxnmgrd(super);
67784 + failed_init_ktxnmgrd:
67785 + reiser4_done_txnmgr(&sbinfo->tmgr);
67786 + failed_init_read_super:
67787 + failed_init_super_data:
67788 + reiser4_done_fs_info(super);
67789 + failed_init_sinfo:
67790 + reiser4_exit_context(&ctx);
67791 + return result;
67792 +}
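fill_super() above follows the kernel's goto-unwind idiom: each
initialization step, on failure, jumps to a label that tears down exactly
the steps that already succeeded, in reverse order. A stripped-down,
self-contained sketch of the shape (all names hypothetical):

	/* stand-ins for real init/teardown steps, so the sketch compiles */
	static int step_a(void) { return 0; }
	static int step_b(void) { return 0; }
	static void undo_a(void) { }

	/* sketch: on failure, undo completed steps in reverse order */
	static int example_fill(void)
	{
		int result;

		if ((result = step_a()) != 0)
			goto failed_a;
		if ((result = step_b()) != 0)
			goto failed_b;
		return 0;

	 failed_b:
		undo_a();
	 failed_a:
		return result;
	}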
67793 +
67794 +/**
67795 + * reiser4_get_sb - get_sb of file_system_type operations
67796 + * @fs_type:
67797 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
67798 + * @dev_name: block device file name
67799 + * @data: specific mount options
67800 + *
67801 + * Reiser4 mount entry.
67802 + */
67803 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
67804 + const char *dev_name, void *data, struct vfsmount *mnt)
67805 +{
67806 + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
67807 +}
67808 +
67809 +/* structure describing the reiser4 filesystem implementation */
67810 +static struct file_system_type reiser4_fs_type = {
67811 + .owner = THIS_MODULE,
67812 + .name = "reiser4",
67813 + .fs_flags = FS_REQUIRES_DEV,
67814 + .get_sb = reiser4_get_sb,
67815 + .kill_sb = kill_block_super,
67816 + .next = NULL
67817 +};
67818 +
67819 +void destroy_reiser4_cache(struct kmem_cache **cachep)
67820 +{
67821 + BUG_ON(*cachep == NULL);
67822 + kmem_cache_destroy(*cachep);
67823 + *cachep = NULL;
67824 +}
67825 +
67826 +/**
67827 + * init_reiser4 - reiser4 initialization entry point
67828 + *
67829 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
67830 + * on kernel initialization or during reiser4 module load.
67831 + */
67832 +static int __init init_reiser4(void)
67833 +{
67834 + int result;
67835 +
67836 + printk(KERN_INFO
67837 + "Loading Reiser4. "
67838 + "See www.namesys.com for a description of Reiser4.\n");
67839 +
67840 + /* initialize slab cache of inodes */
67841 + if ((result = init_inodes()) != 0)
67842 + goto failed_inode_cache;
67843 +
67844 + /* initialize cache of znodes */
67845 + if ((result = init_znodes()) != 0)
67846 + goto failed_init_znodes;
67847 +
67848 + /* initialize all plugins */
67849 + if ((result = init_plugins()) != 0)
67850 + goto failed_init_plugins;
67851 +
67852 + /* initialize cache of plugin_set-s and plugin_set's hash table */
67853 + if ((result = init_plugin_set()) != 0)
67854 + goto failed_init_plugin_set;
67855 +
67856 + /* initialize caches of txn_atom-s and txn_handle-s */
67857 + if ((result = init_txnmgr_static()) != 0)
67858 + goto failed_init_txnmgr_static;
67859 +
67860 + /* initialize cache of jnodes */
67861 + if ((result = init_jnodes()) != 0)
67862 + goto failed_init_jnodes;
67863 +
67864 + /* initialize cache of flush queues */
67865 + if ((result = reiser4_init_fqs()) != 0)
67866 + goto failed_init_fqs;
67867 +
67868 + /* initialize cache of structures attached to dentry->d_fsdata */
67869 + if ((result = reiser4_init_dentry_fsdata()) != 0)
67870 + goto failed_init_dentry_fsdata;
67871 +
67872 + /* initialize cache of structures attached to file->private_data */
67873 + if ((result = reiser4_init_file_fsdata()) != 0)
67874 + goto failed_init_file_fsdata;
67875 +
67876 + /*
67877 + * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
67878 + * more details
67879 + */
67880 + if ((result = reiser4_init_d_cursor()) != 0)
67881 + goto failed_init_d_cursor;
67882 +
67883 + if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
67884 + reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
67885 + return 0;
67886 + }
67887 +
67888 + reiser4_done_d_cursor();
67889 + failed_init_d_cursor:
67890 + reiser4_done_file_fsdata();
67891 + failed_init_file_fsdata:
67892 + reiser4_done_dentry_fsdata();
67893 + failed_init_dentry_fsdata:
67894 + reiser4_done_fqs();
67895 + failed_init_fqs:
67896 + done_jnodes();
67897 + failed_init_jnodes:
67898 + done_txnmgr_static();
67899 + failed_init_txnmgr_static:
67900 + done_plugin_set();
67901 + failed_init_plugin_set:
67902 + failed_init_plugins:
67903 + done_znodes();
67904 + failed_init_znodes:
67905 + done_inodes();
67906 + failed_inode_cache:
67907 + return result;
67908 +}
67909 +
67910 +/**
67911 + * done_reiser4 - reiser4 exit entry point
67912 + *
67913 + * Unregisters reiser4 filesystem type, deletes caches. It is called on shutdown
67914 + * or at module unload.
67915 + */
67916 +static void __exit done_reiser4(void)
67917 +{
67918 + int result;
67919 +
67920 + debugfs_remove(reiser4_debugfs_root);
67921 + result = unregister_filesystem(&reiser4_fs_type);
67922 + BUG_ON(result != 0);
67923 + reiser4_done_d_cursor();
67924 + reiser4_done_file_fsdata();
67925 + reiser4_done_dentry_fsdata();
67926 + reiser4_done_fqs();
67927 + done_jnodes();
67928 + done_txnmgr_static();
67929 + done_plugin_set();
67930 + done_znodes();
67931 + destroy_reiser4_cache(&inode_cache);
67932 +}
67933 +
67934 +module_init(init_reiser4);
67935 +module_exit(done_reiser4);
67936 +
67937 +MODULE_DESCRIPTION("Reiser4 filesystem");
67938 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
67939 +
67940 +MODULE_LICENSE("GPL");
67941 +
67942 +/*
67943 + * Local variables:
67944 + * c-indentation-style: "K&R"
67945 + * mode-name: "LC"
67946 + * c-basic-offset: 8
67947 + * tab-width: 8
67948 + * fill-column: 79
67949 + * End:
67950 + */
67951 diff -urN linux-2.6.22.orig/fs/reiser4/tap.c linux-2.6.22/fs/reiser4/tap.c
67952 --- linux-2.6.22.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
67953 +++ linux-2.6.22/fs/reiser4/tap.c 2007-07-29 00:25:35.024734784 +0400
67954 @@ -0,0 +1,377 @@
67955 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67956 + * reiser4/README */
67957 +
67958 +/*
67959 + Tree Access Pointer (tap).
67960 +
67961 + tap is a data structure combining a coord and a lock handle (mostly). It is
67962 + useful when one has to scan tree nodes (for example, in readdir or flush),
67963 + as the tap functions allow one to move a tap in either direction,
67964 + transparently crossing unit/item/node borders.
67965 +
67966 + A tap doesn't provide automatic synchronization of its fields, as it is
67967 + supposed to be a per-thread object.
67968 +*/
67969 +
67970 +#include "forward.h"
67971 +#include "debug.h"
67972 +#include "coord.h"
67973 +#include "tree.h"
67974 +#include "context.h"
67975 +#include "tap.h"
67976 +#include "znode.h"
67977 +#include "tree_walk.h"
67978 +
67979 +#if REISER4_DEBUG
67980 +static int tap_invariant(const tap_t * tap);
67981 +static void tap_check(const tap_t * tap);
67982 +#else
67983 +#define tap_check(tap) noop
67984 +#endif
67985 +
67986 +/** load node tap is pointing to, if not loaded already */
67987 +int reiser4_tap_load(tap_t * tap)
67988 +{
67989 + tap_check(tap);
67990 + if (tap->loaded == 0) {
67991 + int result;
67992 +
67993 + result = zload_ra(tap->coord->node, &tap->ra_info);
67994 + if (result != 0)
67995 + return result;
67996 + coord_clear_iplug(tap->coord);
67997 + }
67998 + ++tap->loaded;
67999 + tap_check(tap);
68000 + return 0;
68001 +}
68002 +
68003 +/** release node tap is pointing to. Dual to tap_load() */
68004 +void reiser4_tap_relse(tap_t * tap)
68005 +{
68006 + tap_check(tap);
68007 + if (tap->loaded > 0) {
68008 + --tap->loaded;
68009 + if (tap->loaded == 0) {
68010 + zrelse(tap->coord->node);
68011 + }
68012 + }
68013 + tap_check(tap);
68014 +}
68015 +
68016 +/**
68017 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68018 + * @mode
68019 + */
68020 +void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68021 + znode_lock_mode mode)
68022 +{
68023 + tap->coord = coord;
68024 + tap->lh = lh;
68025 + tap->mode = mode;
68026 + tap->loaded = 0;
68027 + INIT_LIST_HEAD(&tap->linkage);
68028 + reiser4_init_ra_info(&tap->ra_info);
68029 +}
68030 +
68031 +/** add @tap to the per-thread list of all taps */
68032 +void reiser4_tap_monitor(tap_t * tap)
68033 +{
68034 + assert("nikita-2623", tap != NULL);
68035 + tap_check(tap);
68036 + list_add(&tap->linkage, reiser4_taps_list());
68037 + tap_check(tap);
68038 +}
68039 +
68040 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68041 + * loaded. */
68042 +void reiser4_tap_copy(tap_t * dst, tap_t * src)
68043 +{
68044 + assert("nikita-3193", src != NULL);
68045 + assert("nikita-3194", dst != NULL);
68046 +
68047 + *dst->coord = *src->coord;
68048 + if (src->lh->node)
68049 + copy_lh(dst->lh, src->lh);
68050 + dst->mode = src->mode;
68051 + dst->loaded = 0;
68052 + INIT_LIST_HEAD(&dst->linkage);
68053 + dst->ra_info = src->ra_info;
68054 +}
68055 +
68056 +/** finish with @tap */
68057 +void reiser4_tap_done(tap_t * tap)
68058 +{
68059 + assert("nikita-2565", tap != NULL);
68060 + tap_check(tap);
68061 + if (tap->loaded > 0)
68062 + zrelse(tap->coord->node);
68063 + done_lh(tap->lh);
68064 + tap->loaded = 0;
68065 + list_del_init(&tap->linkage);
68066 + tap->coord->node = NULL;
68067 +}
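Putting the primitives above together, a typical rightward scan follows an
init/load/.../relse/done lifecycle. A hedged sketch (the item-handling body
and the stop condition are elided; ZNODE_READ_LOCK is assumed to come from
znode.h, and go_next_unit() is defined later in this file):

	/* sketch: walk rightward from @coord with the tap API */
	static int example_scan(coord_t *coord, lock_handle *lh)
	{
		tap_t tap;
		int result;

		reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
		result = reiser4_tap_load(&tap);
		if (result == 0) {
			do {
				/* ... inspect the unit at tap.coord ... */
				result = go_next_unit(&tap);
			} while (result == 0);
			reiser4_tap_relse(&tap);
		}
		reiser4_tap_done(&tap);
		return result;
	}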
68068 +
68069 +/**
68070 + * move @tap to the new node, locked with @target. Load @target, if @tap was
68071 + * already loaded.
68072 + */
68073 +int reiser4_tap_move(tap_t * tap, lock_handle * target)
68074 +{
68075 + int result = 0;
68076 +
68077 + assert("nikita-2567", tap != NULL);
68078 + assert("nikita-2568", target != NULL);
68079 + assert("nikita-2570", target->node != NULL);
68080 + assert("nikita-2569", tap->coord->node == tap->lh->node);
68081 +
68082 + tap_check(tap);
68083 + if (tap->loaded > 0)
68084 + result = zload_ra(target->node, &tap->ra_info);
68085 +
68086 + if (result == 0) {
68087 + if (tap->loaded > 0)
68088 + zrelse(tap->coord->node);
68089 + done_lh(tap->lh);
68090 + copy_lh(tap->lh, target);
68091 + tap->coord->node = target->node;
68092 + coord_clear_iplug(tap->coord);
68093 + }
68094 + tap_check(tap);
68095 + return result;
68096 +}
68097 +
68098 +/**
68099 + * move @tap to @target. Acquire lock on @target, if @tap was already
68100 + * loaded.
68101 + */
68102 +static int tap_to(tap_t * tap, znode * target)
68103 +{
68104 + int result;
68105 +
68106 + assert("nikita-2624", tap != NULL);
68107 + assert("nikita-2625", target != NULL);
68108 +
68109 + tap_check(tap);
68110 + result = 0;
68111 + if (tap->coord->node != target) {
68112 + lock_handle here;
68113 +
68114 + init_lh(&here);
68115 + result = longterm_lock_znode(&here, target,
68116 + tap->mode, ZNODE_LOCK_HIPRI);
68117 + if (result == 0) {
68118 + result = reiser4_tap_move(tap, &here);
68119 + done_lh(&here);
68120 + }
68121 + }
68122 + tap_check(tap);
68123 + return result;
68124 +}
68125 +
68126 +/**
68127 + * move @tap to given @target, loading and locking @target->node if
68128 + * necessary
68129 + */
68130 +int tap_to_coord(tap_t * tap, coord_t * target)
68131 +{
68132 + int result;
68133 +
68134 + tap_check(tap);
68135 + result = tap_to(tap, target->node);
68136 + if (result == 0)
68137 + coord_dup(tap->coord, target);
68138 + tap_check(tap);
68139 + return result;
68140 +}
68141 +
68142 +/** return list of all taps */
68143 +struct list_head *reiser4_taps_list(void)
68144 +{
68145 + return &get_current_context()->taps;
68146 +}
68147 +
68148 +/** helper function for go_{next,prev}_{item,unit,node}() */
68149 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
68150 +{
68151 + coord_t dup;
68152 + coord_t *coord;
68153 + int result;
68154 +
68155 + int (*coord_dir) (coord_t *);
68156 + int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68157 + void (*coord_init) (coord_t *, const znode *);
68158 + ON_DEBUG(int (*coord_check) (const coord_t *));
68159 +
68160 + assert("nikita-2556", tap != NULL);
68161 + assert("nikita-2557", tap->coord != NULL);
68162 + assert("nikita-2558", tap->lh != NULL);
68163 + assert("nikita-2559", tap->coord->node != NULL);
68164 +
68165 + tap_check(tap);
68166 + if (dir == LEFT_SIDE) {
68167 + coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68168 + get_dir_neighbor = reiser4_get_left_neighbor;
68169 + coord_init = coord_init_last_unit;
68170 + } else {
68171 + coord_dir = units_p ? coord_next_unit : coord_next_item;
68172 + get_dir_neighbor = reiser4_get_right_neighbor;
68173 + coord_init = coord_init_first_unit;
68174 + }
68175 + ON_DEBUG(coord_check =
68176 + units_p ? coord_is_existing_unit : coord_is_existing_item);
68177 + assert("nikita-2560", coord_check(tap->coord));
68178 +
68179 + coord = tap->coord;
68180 + coord_dup(&dup, coord);
68181 + if (coord_dir(&dup) != 0) {
68182 + do {
68183 +			/* move to the neighboring node in direction @dir */
68184 + lock_handle dup;
68185 +
68186 + init_lh(&dup);
68187 + result =
68188 + get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68189 + GN_CAN_USE_UPPER_LEVELS);
68190 + if (result == 0) {
68191 + result = reiser4_tap_move(tap, &dup);
68192 + if (result == 0)
68193 + coord_init(tap->coord, dup.node);
68194 + done_lh(&dup);
68195 + }
68196 + /* skip empty nodes */
68197 + } while ((result == 0) && node_is_empty(coord->node));
68198 + } else {
68199 + result = 0;
68200 + coord_dup(coord, &dup);
68201 + }
68202 + assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68203 + tap_check(tap);
68204 + return result;
68205 +}
68206 +
68207 +/**
68208 + * move @tap to the next unit, transparently crossing item and node
68209 + * boundaries
68210 + */
68211 +int go_next_unit(tap_t * tap)
68212 +{
68213 + return go_dir_el(tap, RIGHT_SIDE, 1);
68214 +}
68215 +
68216 +/**
68217 + * move @tap to the previous unit, transparently crossing item and node
68218 + * boundaries
68219 + */
68220 +int go_prev_unit(tap_t * tap)
68221 +{
68222 + return go_dir_el(tap, LEFT_SIDE, 1);
68223 +}
68224 +
68225 +/**
68226 + * apply @actor to @tap @shift times. This is used to move @tap by
68227 + * @shift units (or items, or nodes) in either direction.
68228 + */
68229 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68230 +{
68231 + int result;
68232 +
68233 + assert("nikita-2555", shift >= 0);
68234 + assert("nikita-2562", tap->coord->node == tap->lh->node);
68235 +
68236 + tap_check(tap);
68237 + result = reiser4_tap_load(tap);
68238 + if (result != 0)
68239 + return result;
68240 +
68241 + for (; shift > 0; --shift) {
68242 + result = actor(tap);
68243 + assert("nikita-2563", tap->coord->node == tap->lh->node);
68244 + if (result != 0)
68245 + break;
68246 + }
68247 + reiser4_tap_relse(tap);
68248 + tap_check(tap);
68249 + return result;
68250 +}
68251 +
68252 +/** move @tap @shift units rightward */
68253 +int rewind_right(tap_t * tap, int shift)
68254 +{
68255 + return rewind_to(tap, go_next_unit, shift);
68256 +}
68257 +
68258 +/** move @tap @shift units leftward */
68259 +int rewind_left(tap_t * tap, int shift)
68260 +{
68261 + return rewind_to(tap, go_prev_unit, shift);
68262 +}
68263 +
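+/*
+ * Usage sketch (editor's illustration, not part of the original patch):
+ * typical lifecycle of the tap API above. @coord and @lh are assumed to
+ * have been set up by a prior tree lookup; rewind_right() loads and
+ * releases the node internally via reiser4_tap_load()/reiser4_tap_relse().
+ *
+ *	tap_t tap;
+ *
+ *	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
+ *	if (rewind_right(&tap, 3) == 0) {
+ *		... inspect tap.coord, now 3 units to the right ...
+ *	}
+ *	reiser4_tap_done(&tap);
+ */
+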
68264 +#if REISER4_DEBUG
68265 +/** debugging function: print @tap content in human readable form */
68266 +static void print_tap(const char *prefix, const tap_t * tap)
68267 +{
68268 + if (tap == NULL) {
68269 + printk("%s: null tap\n", prefix);
68270 + return;
68271 + }
68272 + printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
68273 +	       tap->loaded, !(&tap->linkage == tap->linkage.next &&
68274 +			      &tap->linkage == tap->linkage.prev),
68275 + tap->lh->node,
68276 + lock_mode_name(tap->mode));
68277 + print_coord("\tcoord", tap->coord, 0);
68278 +}
68279 +
68280 +/** check [tap-sane] invariant */
68281 +static int tap_invariant(const tap_t * tap)
68282 +{
68283 + /* [tap-sane] invariant */
68284 +
68285 + if (tap == NULL)
68286 + return 1;
68287 + /* tap->mode is one of
68288 + *
68289 + * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68290 + */
68291 + if (tap->mode != ZNODE_NO_LOCK &&
68292 + tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68293 + return 2;
68294 + /* tap->coord != NULL, and */
68295 + if (tap->coord == NULL)
68296 + return 3;
68297 + /* tap->lh != NULL, and */
68298 + if (tap->lh == NULL)
68299 + return 4;
68300 + /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68301 + if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68302 + return 5;
68303 + /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68304 + if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68305 + return 6;
68306 + return 0;
68307 +}
68308 +
68309 +/** debugging function: check internal @tap consistency */
68310 +static void tap_check(const tap_t * tap)
68311 +{
68312 + int result;
68313 +
68314 + result = tap_invariant(tap);
68315 + if (result != 0) {
68316 + print_tap("broken", tap);
68317 + reiser4_panic("nikita-2831", "tap broken: %i\n", result);
68318 + }
68319 +}
68320 +#endif
68321 +
68322 +/* Make Linus happy.
68323 + Local variables:
68324 + c-indentation-style: "K&R"
68325 + mode-name: "LC"
68326 + c-basic-offset: 8
68327 + tab-width: 8
68328 + fill-column: 120
68329 + scroll-step: 1
68330 + End:
68331 +*/
68332 diff -urN linux-2.6.22.orig/fs/reiser4/tap.h linux-2.6.22/fs/reiser4/tap.h
68333 --- linux-2.6.22.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
68334 +++ linux-2.6.22/fs/reiser4/tap.h 2007-07-29 00:25:35.024734784 +0400
68335 @@ -0,0 +1,70 @@
68336 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68337 +
68338 +/* Tree Access Pointers. See tap.c for more details. */
68339 +
68340 +#if !defined( __REISER4_TAP_H__ )
68341 +#define __REISER4_TAP_H__
68342 +
68343 +#include "forward.h"
68344 +#include "readahead.h"
68345 +
68346 +/**
68347 + tree_access_pointer aka tap. Data structure combining coord_t and lock
68348 + handle.
68349 +   For invariants involving this data type, see doc/lock-ordering; in particular:
68350 +
68351 + [tap-sane]
68352 + */
68353 +struct tree_access_pointer {
68354 + /* coord tap is at */
68355 + coord_t *coord;
68356 + /* lock handle on ->coord->node */
68357 + lock_handle *lh;
68358 + /* mode of lock acquired by this tap */
68359 + znode_lock_mode mode;
68360 + /* incremented by reiser4_tap_load().
68361 + Decremented by reiser4_tap_relse(). */
68362 + int loaded;
68363 + /* list of taps */
68364 + struct list_head linkage;
68365 + /* read-ahead hint */
68366 + ra_info_t ra_info;
68367 +};
68368 +
68369 +typedef int (*go_actor_t) (tap_t * tap);
68370 +
68371 +extern int reiser4_tap_load(tap_t * tap);
68372 +extern void reiser4_tap_relse(tap_t * tap);
68373 +extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68374 + znode_lock_mode mode);
68375 +extern void reiser4_tap_monitor(tap_t * tap);
68376 +extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
68377 +extern void reiser4_tap_done(tap_t * tap);
68378 +extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
68379 +extern int tap_to_coord(tap_t * tap, coord_t * target);
68380 +
68381 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
68382 +extern int go_next_unit(tap_t * tap);
68383 +extern int go_prev_unit(tap_t * tap);
68384 +extern int rewind_right(tap_t * tap, int shift);
68385 +extern int rewind_left(tap_t * tap, int shift);
68386 +
68387 +extern struct list_head *reiser4_taps_list(void);
68388 +
68389 +#define for_all_taps(tap) \
68390 + for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
68391 + reiser4_taps_list() != &tap->linkage; \
68392 + tap = list_entry(tap->linkage.next, tap_t, linkage))
68393 +
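+/*
+ * Usage sketch (editor's illustration, not part of the original patch):
+ * for_all_taps() walks every tap registered with reiser4_tap_monitor()
+ * in the current reiser4 context, e.g.:
+ *
+ *	tap_t *tap;
+ *
+ *	for_all_taps(tap)
+ *		printk("tap on node %p\n", tap->lh->node);
+ */
+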
68394 +/* __REISER4_TAP_H__ */
68395 +#endif
68396 +/* Make Linus happy.
68397 + Local variables:
68398 + c-indentation-style: "K&R"
68399 + mode-name: "LC"
68400 + c-basic-offset: 8
68401 + tab-width: 8
68402 + fill-column: 120
68403 + scroll-step: 1
68404 + End:
68405 +*/
68406 diff -urN linux-2.6.22.orig/fs/reiser4/tree.c linux-2.6.22/fs/reiser4/tree.c
68407 --- linux-2.6.22.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
68408 +++ linux-2.6.22/fs/reiser4/tree.c 2007-07-29 00:25:35.028735820 +0400
68409 @@ -0,0 +1,1876 @@
68410 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68411 + * reiser4/README */
68412 +
68413 +/*
68414 + * KEYS IN A TREE.
68415 + *
68416 + * The tree consists of nodes located on the disk. A node in the tree is
68417 + * either formatted or unformatted. A formatted node has a structure
68418 + * understood by the tree balancing and traversal code. Formatted nodes are
68419 + * further classified into leaf and internal nodes. The latter distinction
68420 + * is (almost) of only historical importance: the general structure of
68421 + * leaves and internal nodes is the same in Reiser4. Unformatted nodes
68422 + * contain raw data that form the bodies of ordinary files and attributes.
68423 + *
68424 + * Each node in the tree spans some interval in the key space. Key ranges for
68425 + * all nodes in the tree are disjoint. Actually, this only holds in a weak
68426 + * sense, because of non-unique keys: the intersection of key ranges for
68427 + * different nodes is either empty, or consists of exactly one key.
68428 + *
68429 + * A formatted node consists of a sequence of items. Each item spans some
68430 + * interval in key space. Key ranges for all items in a tree are disjoint,
68431 + * modulo non-unique keys again. Items within a node are ordered by the
68432 + * smallest key in each item.
68433 + *
68434 + * A particular type of item can be further split into units. A unit is a
68435 + * piece of an item that can be cut from it and moved into another item of
68436 + * the same type. Units are used by the balancing code to repack data.
68437 + *
68438 + * A unit can be further split into smaller entities (for example, an extent
68439 + * unit represents several pages, and it is natural for the extent code to
68440 + * operate on particular pages and even bytes within one unit), but this is
68441 + * of no relevance to the generic balancing and lookup code.
68442 + *
68443 + * Although an item is said to "span" a range or interval of keys, it is not
68444 + * necessary that the item contain a piece of data addressable by each and
68445 + * every key in this range. For example, a compound directory item, consisting
68446 + * of units corresponding to directory entries and keyed by hashes of file
68447 + * names, has more of a "discrete spectrum": only some disjoint keys inside
68448 + * the range occupied by this item really address data.
68449 + *
68450 + * Nonetheless, each item always has a well-defined least (minimal) key, which
68451 + * is recorded in the item header stored in the node this item is in. Also, an
68452 + * item plugin can optionally define a method ->max_key_inside() returning the
68453 + * maximal key that can _possibly_ be located within this item. This method is
68454 + * used (mainly) to determine when a given piece of data should be merged into
68455 + * an existing item, instead of creating a new one. Because of this, even
68456 + * though ->max_key_inside() can be larger than any key actually in the item,
68457 + * intervals
68458 + *
68459 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
68460 + *
68461 + * are still disjoint for all items within the _same_ node.
68462 + *
68463 + * In memory, a node is represented by a znode. It plays several roles:
68464 + *
68465 + * . something locks are taken on
68466 + *
68467 + * . something tracked by transaction manager (this is going to change)
68468 + *
68469 + * . something used to access node data
68470 + *
68471 + * . something used to maintain tree structure in memory: sibling and
68472 + * parental linkage.
68473 + *
68474 + * . something used to organize nodes into "slums"
68475 + *
68476 + * For more on znodes, see znode.[ch]
68477 + *
68478 + * DELIMITING KEYS
68479 + *
68480 + * To simplify balancing, allow some flexibility in locking, and speed up an
68481 + * important coord cache optimization, we keep delimiting keys of nodes in
68482 + * memory. Depending on the disk format (implemented by the appropriate node
68483 + * plugin) a node on disk can record both left and right delimiting keys,
68484 + * only one of them, or none. Still, our balancing and tree traversal code
68485 + * keep both delimiting keys for every node in memory, stored in the znode.
68486 + * When a node is first brought into memory during tree traversal, its left
68487 + * delimiting key is taken from its parent, and its right delimiting key is
68488 + * either the next key in its parent, or the right delimiting key of the
68489 + * parent if the node is the rightmost child of the parent.
68490 + *
68491 + * Physical consistency of delimiting keys is protected by a special dk
68492 + * read-write lock. That is, delimiting keys can only be inspected or
68493 + * modified under this lock. But the dk lock is only sufficient for a fast
68494 + * "pessimistic" check, because to simplify code and to decrease lock
68495 + * contention, balancing (carry) only updates delimiting keys right before
68496 + * unlocking all locked nodes on the given tree level. For example, the
68497 + * coord-by-key cache scans the LRU list of recently accessed znodes. For
68498 + * each node it first does a fast check under the dk spin lock. If the key
68499 + * looked for is not between the delimiting keys of this node, the next node
68500 + * is inspected and so on. If the key is inside the key range, a long term
68501 + * lock is taken on the node and the key range is rechecked.
68502 + *
68503 + * COORDINATES
68504 + *
68505 + * To find something in the tree, you supply a key, and the key is resolved
68506 + * by coord_by_key() into a coord (coordinate) that is valid as long as the
68507 + * node the coord points to remains locked. As mentioned above, trees
68508 + * consist of nodes that consist of items that consist of units. A unit is
68509 + * the smallest, indivisible piece of the tree as far as balancing and tree
68510 + * search are concerned. Each node, item, and unit can be addressed by
68511 + * giving its level in the tree and the key occupied by this entity. A node
68512 + * knows what the key ranges are of the items within it, and how to find its
68513 + * items and invoke their item handlers, but it does not know how to access
68514 + * individual units within its items except through the item handlers.
68515 + * coord is a structure containing a pointer to the node, the ordinal number
68516 + * of the item within this node (a sort of item offset), and the ordinal
68517 + * number of the unit within this item.
68518 + *
68519 + * TREE LOOKUP
68520 + *
68521 + * There are two types of access to the tree: lookup and modification.
68522 + *
68523 + * Lookup is a search for the key in the tree. Search can look for either
68524 + * exactly the key given to it, or for the largest key that is not greater
68525 + * than the key given to it. This distinction is determined by the "bias"
68526 + * parameter of the search routine (coord_by_key()). coord_by_key() either
68527 + * returns an error (the key is not in the tree, or some kind of external
68528 + * error occurred), or successfully resolves the key into a coord.
68529 + *
68530 + * This resolution is done by traversing the tree top-to-bottom from the root
68531 + * level to the desired level. On levels above the twig level (the level one
68532 + * above the leaf level) nodes consist exclusively of internal items. An
68533 + * internal item is nothing more than a pointer to the tree node on the child
68534 + * level. On the twig level nodes consist of internal items intermixed with
68535 + * extent items. Internal items form the normal search tree structure used by
68536 + * traversal to descend through the tree.
68537 + *
68538 + * TREE LOOKUP OPTIMIZATIONS
68539 + *
68540 + * The tree lookup described above is expensive even if all nodes traversed
68541 + * are already in memory: a binary search has to be performed within each
68542 + * node, and binary searches are CPU consuming and tend to destroy CPU
68543 + * caches.
68544 + *
68545 + * Several optimizations are used to work around this:
68546 + *
68547 + * . cbk_cache (look-aside cache for tree traversals, see search.c for
68548 + * details)
68549 + *
68550 + * . seals (see seal.[ch])
68551 + *
68552 + * . vroot (see search.c)
68553 + *
68554 + * General search-by-key is layered as follows:
68555 + *
68556 + * [check seal, if any] --ok--> done
68557 + * |
68558 + * failed
68559 + * |
68560 + * V
68561 + * [vroot defined] --no--> node = tree_root
68562 + * | |
68563 + * yes |
68564 + * | |
68565 + * V |
68566 + * node = vroot |
68567 + * | |
68568 + * | |
68569 + * | |
68570 + * V V
68571 + * [check cbk_cache for key] --ok--> done
68572 + * |
68573 + * failed
68574 + * |
68575 + * V
68576 + * [start tree traversal from node]
68577 + *
68578 + */
68579 +
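+/*
+ * Editor's sketch of the lookup interface described above (illustrative,
+ * not part of the original patch). A key is resolved into a coord that
+ * stays valid while @lh keeps the node locked. The 0 flags value and the
+ * LEAF_LEVEL bounds are assumptions made for the example.
+ *
+ *	coord_t coord;
+ *	lock_handle lh;
+ *	int ret;
+ *
+ *	init_lh(&lh);
+ *	ret = coord_by_key(tree, &key, &coord, &lh, ZNODE_READ_LOCK,
+ *			   FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, 0, NULL);
+ *	if (ret == CBK_COORD_FOUND) {
+ *		... use @coord under the lock ...
+ *	}
+ *	done_lh(&lh);
+ */
+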
68580 +#include "forward.h"
68581 +#include "debug.h"
68582 +#include "dformat.h"
68583 +#include "key.h"
68584 +#include "coord.h"
68585 +#include "plugin/item/static_stat.h"
68586 +#include "plugin/item/item.h"
68587 +#include "plugin/node/node.h"
68588 +#include "plugin/plugin.h"
68589 +#include "txnmgr.h"
68590 +#include "jnode.h"
68591 +#include "znode.h"
68592 +#include "block_alloc.h"
68593 +#include "tree_walk.h"
68594 +#include "carry.h"
68595 +#include "carry_ops.h"
68596 +#include "tap.h"
68597 +#include "tree.h"
68598 +#include "vfs_ops.h"
68599 +#include "page_cache.h"
68600 +#include "super.h"
68601 +#include "reiser4.h"
68602 +#include "inode.h"
68603 +
68604 +#include <linux/fs.h> /* for struct super_block */
68605 +#include <linux/spinlock.h>
68606 +
68607 +/* Disk address (block number) never used for any real tree node. This is
68608 +   used as the block number of the "uber" znode.
68609 +
68610 +   Invalid block addresses are 0 by tradition.
68611 +
68612 +*/
68613 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
68614 +
68615 +#define CUT_TREE_MIN_ITERATIONS 64
68616 +
68617 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
68618 +
68619 +/* return node plugin of coord->node */
68620 +node_plugin *node_plugin_by_coord(const coord_t * coord)
68621 +{
68622 + assert("vs-1", coord != NULL);
68623 + assert("vs-2", coord->node != NULL);
68624 +
68625 + return coord->node->nplug;
68626 +}
68627 +
68628 +/* insert item into tree. Fields of @coord are updated so that they can be
68629 + * used by a subsequent insert operation. */
68630 +insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
68631 + * into */ ,
68632 + const reiser4_key * key /* key of new item */ ,
68633 + reiser4_item_data * data /* parameters for item
68634 + * creation */ ,
68635 + coord_t * coord /* resulting insertion coord */ ,
68636 + lock_handle * lh /* resulting lock
68637 + * handle */ ,
68638 +			    tree_level stop_level /* level where to insert */ ,
68639 + __u32 flags /* insertion flags */ )
68640 +{
68641 + int result;
68642 +
68643 + assert("nikita-358", tree != NULL);
68644 + assert("nikita-360", coord != NULL);
68645 +
68646 + result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
68647 + FIND_EXACT, stop_level, stop_level,
68648 + flags | CBK_FOR_INSERT, NULL /*ra_info */ );
68649 + switch (result) {
68650 + default:
68651 + break;
68652 + case CBK_COORD_FOUND:
68653 + result = IBK_ALREADY_EXISTS;
68654 + break;
68655 + case CBK_COORD_NOTFOUND:
68656 + assert("nikita-2017", coord->node != NULL);
68657 + result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
68658 + break;
68659 + }
68660 + return result;
68661 +}
68662 +
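+/*
+ * Usage sketch (editor's illustration, not part of the original patch):
+ * inserting a prepared item at the leaf level with insert_by_key() above.
+ * @tree, @key and @data are assumed to be filled in by the caller.
+ *
+ *	coord_t coord;
+ *	lock_handle lh;
+ *	insert_result ret;
+ *
+ *	init_lh(&lh);
+ *	ret = insert_by_key(tree, &key, &data, &coord, &lh, LEAF_LEVEL, 0);
+ *	if (ret == IBK_ALREADY_EXISTS)
+ *		... an item with @key is already in the tree ...
+ *	done_lh(&lh);
+ */
+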
68663 +/* insert item by calling carry. Helper function called if short-cut
68664 +   insertion fails */
68665 +static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
68666 + lock_handle * lh, /* lock handle of insertion
68667 + * node */
68668 + reiser4_item_data * data, /* parameters of new
68669 + * item */
68670 + const reiser4_key * key, /* key of new item */
68671 + carry_opcode cop, /* carry operation to perform */
68672 + cop_insert_flag flags
68673 + /* carry flags */ )
68674 +{
68675 + int result;
68676 + carry_pool *pool;
68677 + carry_level *lowest_level;
68678 + carry_insert_data *cdata;
68679 + carry_op *op;
68680 +
68681 + assert("umka-314", coord != NULL);
68682 +
68683 + /* allocate carry_pool and 3 carry_level-s */
68684 + pool =
68685 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68686 + sizeof(*cdata));
68687 + if (IS_ERR(pool))
68688 + return PTR_ERR(pool);
68689 + lowest_level = (carry_level *) (pool + 1);
68690 + init_carry_level(lowest_level, pool);
68691 +
68692 + op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
68693 + if (IS_ERR(op) || (op == NULL)) {
68694 + done_carry_pool(pool);
68695 + return RETERR(op ? PTR_ERR(op) : -EIO);
68696 + }
68697 + cdata = (carry_insert_data *) (lowest_level + 3);
68698 + cdata->coord = coord;
68699 + cdata->data = data;
68700 + cdata->key = key;
68701 + op->u.insert.d = cdata;
68702 + if (flags == 0)
68703 + flags = znode_get_tree(coord->node)->carry.insert_flags;
68704 + op->u.insert.flags = flags;
68705 + op->u.insert.type = COPT_ITEM_DATA;
68706 + op->u.insert.child = NULL;
68707 + if (lh != NULL) {
68708 + assert("nikita-3245", lh->node == coord->node);
68709 + lowest_level->track_type = CARRY_TRACK_CHANGE;
68710 + lowest_level->tracked = lh;
68711 + }
68712 +
68713 + result = reiser4_carry(lowest_level, NULL);
68714 + done_carry_pool(pool);
68715 +
68716 + return result;
68717 +}
68718 +
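+/*
+ * Editor's note: the function above shows the carry pattern repeated
+ * throughout this file: allocate one pool holding the carry levels and
+ * the operation payload, post an operation on the lowest level, fill in
+ * the operation, run carry, and free the pool.
+ *
+ *	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ *			       sizeof(<payload>));
+ *	op = reiser4_post_carry(lowest_level, <opcode>, node, 0);
+ *	... fill op->u.<operation> ...
+ *	result = reiser4_carry(lowest_level, NULL);
+ *	done_carry_pool(pool);
+ */
+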
68719 +/* form carry queue to perform paste of @data with @key at @coord, and launch
68720 + its execution by calling carry().
68721 +
68722 +   Instruct carry to update @lh if, after balancing, the insertion coord
68723 +   moves into a different block.
68724 +
68725 +*/
68726 +static int paste_with_carry(coord_t * coord, /* coord of paste */
68727 + lock_handle * lh, /* lock handle of node
68728 + * where item is
68729 + * pasted */
68730 + reiser4_item_data * data, /* parameters of new
68731 + * item */
68732 + const reiser4_key * key, /* key of new item */
68733 + unsigned flags /* paste flags */ )
68734 +{
68735 + int result;
68736 + carry_pool *pool;
68737 + carry_level *lowest_level;
68738 + carry_insert_data *cdata;
68739 + carry_op *op;
68740 +
68741 + assert("umka-315", coord != NULL);
68742 + assert("umka-316", key != NULL);
68743 +
68744 + pool =
68745 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68746 + sizeof(*cdata));
68747 + if (IS_ERR(pool))
68748 + return PTR_ERR(pool);
68749 + lowest_level = (carry_level *) (pool + 1);
68750 + init_carry_level(lowest_level, pool);
68751 +
68752 + op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
68753 + if (IS_ERR(op) || (op == NULL)) {
68754 + done_carry_pool(pool);
68755 + return RETERR(op ? PTR_ERR(op) : -EIO);
68756 + }
68757 + cdata = (carry_insert_data *) (lowest_level + 3);
68758 + cdata->coord = coord;
68759 + cdata->data = data;
68760 + cdata->key = key;
68761 + op->u.paste.d = cdata;
68762 + if (flags == 0)
68763 + flags = znode_get_tree(coord->node)->carry.paste_flags;
68764 + op->u.paste.flags = flags;
68765 + op->u.paste.type = COPT_ITEM_DATA;
68766 + if (lh != NULL) {
68767 + lowest_level->track_type = CARRY_TRACK_CHANGE;
68768 + lowest_level->tracked = lh;
68769 + }
68770 +
68771 + result = reiser4_carry(lowest_level, NULL);
68772 + done_carry_pool(pool);
68773 +
68774 + return result;
68775 +}
68776 +
68777 +/* insert item at the given coord.
68778 +
68779 +   First try to skip carry by directly calling the ->create_item() method of
68780 +   the node plugin. If this is impossible (there is not enough free space in
68781 +   the node, or the item being created would become the leftmost in the node),
68782 +   call insert_with_carry_by_coord(), which does a full carry().
68783 +
68784 +*/
68785 +insert_result insert_by_coord(coord_t * coord /* coord where to
68786 + * insert. coord->node has
68787 + * to be write locked by
68788 + * caller */ ,
68789 + reiser4_item_data * data /* data to be
68790 + * inserted */ ,
68791 + const reiser4_key * key /* key of new item */ ,
68792 + lock_handle * lh /* lock handle of write
68793 + * lock on node */ ,
68794 + __u32 flags /* insertion flags */ )
68795 +{
68796 + unsigned item_size;
68797 + int result;
68798 + znode *node;
68799 +
68800 + assert("vs-247", coord != NULL);
68801 + assert("vs-248", data != NULL);
68802 + assert("vs-249", data->length >= 0);
68803 + assert("nikita-1191", znode_is_write_locked(coord->node));
68804 +
68805 + node = coord->node;
68806 + coord_clear_iplug(coord);
68807 + result = zload(node);
68808 + if (result != 0)
68809 + return result;
68810 +
68811 + item_size = space_needed(node, NULL, data, 1);
68812 + if (item_size > znode_free_space(node) &&
68813 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
68814 + && (flags & COPI_DONT_ALLOCATE)) {
68815 + /* we are forced to use free space of coord->node and new item
68816 + does not fit into it.
68817 +
68818 + Currently we get here only when we allocate and copy units
68819 + of extent item from a node to its left neighbor during
68820 + "squalloc"-ing. If @node (this is left neighbor) does not
68821 + have enough free space - we do not want to attempt any
68822 + shifting and allocations because we are in squeezing and
68823 + everything to the left of @node is tightly packed.
68824 + */
68825 + result = -E_NODE_FULL;
68826 + } else if ((item_size <= znode_free_space(node)) &&
68827 + !coord_is_before_leftmost(coord) &&
68828 + (node_plugin_by_node(node)->fast_insert != NULL)
68829 + && node_plugin_by_node(node)->fast_insert(coord)) {
68830 + /* shortcut insertion without carry() overhead.
68831 +
68832 + Only possible if:
68833 +
68834 + - there is enough free space
68835 +
68836 + - insertion is not into the leftmost position in a node
68837 + (otherwise it would require updating of delimiting key in a
68838 + parent)
68839 +
68840 + - node plugin agrees with this
68841 +
68842 + */
68843 + result =
68844 + node_plugin_by_node(node)->create_item(coord, key, data,
68845 + NULL);
68846 + znode_make_dirty(node);
68847 + } else {
68848 + /* otherwise do full-fledged carry(). */
68849 + result =
68850 + insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
68851 + flags);
68852 + }
68853 + zrelse(node);
68854 + return result;
68855 +}
68856 +
68857 +/* @coord is set to leaf level and @data is to be inserted to twig level */
68858 +insert_result
68859 +insert_extent_by_coord(coord_t * coord /* coord where to insert.
68860 +					 * coord->node has to be
68861 +					 * write locked by the
68862 +					 * caller */ ,
68863 +		       reiser4_item_data * data /* data to be inserted */ ,
68864 +		       const reiser4_key * key /* key of new item */ ,
68865 +		       lock_handle * lh /* lock handle of write lock
68866 +					 * on node */ )
68867 +{
68868 + assert("vs-405", coord != NULL);
68869 + assert("vs-406", data != NULL);
68870 + assert("vs-407", data->length > 0);
68871 + assert("vs-408", znode_is_write_locked(coord->node));
68872 + assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
68873 +
68874 + return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
68875 + 0 /*flags */ );
68876 +}
68877 +
68878 +/* Insert into the item at the given coord.
68879 +
68880 +   First try to skip carry by directly calling the ->paste() method of the
68881 +   item plugin. If this is impossible (there is not enough free space in the
68882 +   node, or we are pasting into the leftmost position in the node), call
68883 +   paste_with_carry(), which does a full carry().
68884 +
68885 +*/
68886 +/* paste_into_item */
68887 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
68888 + lock_handle * lh /* lock handle on node involved */ ,
68889 + const reiser4_key * key /* key of unit being pasted */ ,
68890 + reiser4_item_data * data /* parameters for new unit */ ,
68891 + unsigned flags /* insert/paste flags */ )
68892 +{
68893 + int result;
68894 + int size_change;
68895 + node_plugin *nplug;
68896 + item_plugin *iplug;
68897 +
68898 + assert("umka-317", coord != NULL);
68899 + assert("umka-318", key != NULL);
68900 +
68901 + iplug = item_plugin_by_coord(coord);
68902 + nplug = node_plugin_by_coord(coord);
68903 +
68904 + assert("nikita-1480", iplug == data->iplug);
68905 +
68906 + size_change = space_needed(coord->node, coord, data, 0);
68907 + if (size_change > (int)znode_free_space(coord->node) &&
68908 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
68909 + && (flags & COPI_DONT_ALLOCATE)) {
68910 + /* we are forced to use free space of coord->node and new data
68911 + does not fit into it. */
68912 + return -E_NODE_FULL;
68913 + }
68914 +
68915 + /* shortcut paste without carry() overhead.
68916 +
68917 + Only possible if:
68918 +
68919 + - there is enough free space
68920 +
68921 + - paste is not into the leftmost unit in a node (otherwise
68922 + it would require updating of delimiting key in a parent)
68923 +
68924 + - node plugin agrees with this
68925 +
68926 + - item plugin agrees with us
68927 + */
68928 + if (size_change <= (int)znode_free_space(coord->node) &&
68929 + (coord->item_pos != 0 ||
68930 + coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
68931 + coord->unit_pos != 0 && nplug->fast_paste != NULL &&
68932 + nplug->fast_paste(coord) &&
68933 + iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
68934 + if (size_change > 0)
68935 + nplug->change_item_size(coord, size_change);
68936 + /* NOTE-NIKITA: huh? where @key is used? */
68937 + result = iplug->b.paste(coord, data, NULL);
68938 + if (size_change < 0)
68939 + nplug->change_item_size(coord, size_change);
68940 + znode_make_dirty(coord->node);
68941 + } else
68942 + /* otherwise do full-fledged carry(). */
68943 + result = paste_with_carry(coord, lh, data, key, flags);
68944 + return result;
68945 +}
68946 +
68947 +/* this either appends or truncates item @coord */
68948 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
68949 + reiser4_item_data * data /* parameters of resize */ ,
68950 + reiser4_key * key /* key of new unit */ ,
68951 + lock_handle * lh /* lock handle of node
68952 + * being modified */ ,
68953 + cop_insert_flag flags /* carry flags */ )
68954 +{
68955 + int result;
68956 + znode *node;
68957 +
68958 + assert("nikita-362", coord != NULL);
68959 + assert("nikita-363", data != NULL);
68960 + assert("vs-245", data->length != 0);
68961 +
68962 + node = coord->node;
68963 + coord_clear_iplug(coord);
68964 + result = zload(node);
68965 + if (result != 0)
68966 + return result;
68967 +
68968 + if (data->length < 0)
68969 + result = node_plugin_by_coord(coord)->shrink_item(coord,
68970 + -data->length);
68971 + else
68972 + result = insert_into_item(coord, lh, key, data, flags);
68973 +
68974 + zrelse(node);
68975 + return result;
68976 +}
68977 +
68978 +/* insert flow @f */
68979 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
68980 +{
68981 + int result;
68982 + carry_pool *pool;
68983 + carry_level *lowest_level;
68984 + reiser4_item_data *data;
68985 + carry_op *op;
68986 +
68987 + pool =
68988 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68989 + sizeof(*data));
68990 + if (IS_ERR(pool))
68991 + return PTR_ERR(pool);
68992 + lowest_level = (carry_level *) (pool + 1);
68993 + init_carry_level(lowest_level, pool);
68994 +
68995 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
68996 + 0 /* operate directly on coord -> node */ );
68997 + if (IS_ERR(op) || (op == NULL)) {
68998 + done_carry_pool(pool);
68999 + return RETERR(op ? PTR_ERR(op) : -EIO);
69000 + }
69001 +
69002 + /* these are permanent during insert_flow */
69003 + data = (reiser4_item_data *) (lowest_level + 3);
69004 + data->user = 1;
69005 + data->iplug = item_plugin_by_id(FORMATTING_ID);
69006 + data->arg = NULL;
69007 + /* data.length and data.data will be set before calling paste or
69008 + insert */
69009 + data->length = 0;
69010 + data->data = NULL;
69011 +
69012 + op->u.insert_flow.flags = 0;
69013 + op->u.insert_flow.insert_point = coord;
69014 + op->u.insert_flow.flow = f;
69015 + op->u.insert_flow.data = data;
69016 + op->u.insert_flow.new_nodes = 0;
69017 +
69018 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69019 + lowest_level->tracked = lh;
69020 +
69021 + result = reiser4_carry(lowest_level, NULL);
69022 + done_carry_pool(pool);
69023 +
69024 + return result;
69025 +}
69026 +
69027 +/* Given a coord in parent node, obtain a znode for the corresponding child */
69028 +znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69029 + * child */ ,
69030 + znode * parent /* parent of child */ ,
69031 + int incore_p /* if !0 only return child if already in
69032 + * memory */ ,
69033 + int setup_dkeys_p /* if !0 update delimiting keys of
69034 + * child */ )
69035 +{
69036 + znode *child;
69037 +
69038 + assert("nikita-1374", parent_coord != NULL);
69039 + assert("nikita-1482", parent != NULL);
69040 +#if REISER4_DEBUG
69041 + if (setup_dkeys_p)
69042 + assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69043 +#endif
69044 + assert("nikita-2947", znode_is_any_locked(parent));
69045 +
69046 + if (znode_get_level(parent) <= LEAF_LEVEL) {
69047 + /* trying to get child of leaf node */
69048 + warning("nikita-1217", "Child of maize?");
69049 + return ERR_PTR(RETERR(-EIO));
69050 + }
69051 + if (item_is_internal(parent_coord)) {
69052 + reiser4_block_nr addr;
69053 + item_plugin *iplug;
69054 + reiser4_tree *tree;
69055 +
69056 + iplug = item_plugin_by_coord(parent_coord);
69057 + assert("vs-512", iplug->s.internal.down_link);
69058 + iplug->s.internal.down_link(parent_coord, NULL, &addr);
69059 +
69060 + tree = znode_get_tree(parent);
69061 + if (incore_p)
69062 + child = zlook(tree, &addr);
69063 + else
69064 + child =
69065 + zget(tree, &addr, parent,
69066 + znode_get_level(parent) - 1,
69067 + reiser4_ctx_gfp_mask_get());
69068 + if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69069 + set_child_delimiting_keys(parent, parent_coord, child);
69070 + } else {
69071 + warning("nikita-1483", "Internal item expected");
69072 + child = ERR_PTR(RETERR(-EIO));
69073 + }
69074 + return child;
69075 +}
69076 +
69077 +/* remove znode from transaction */
69078 +static void uncapture_znode(znode * node)
69079 +{
69080 + struct page *page;
69081 +
69082 + assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69083 +
69084 + if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
69085 + int ret;
69086 +
69087 + /* An already allocated block goes right to the atom's delete set. */
69088 + ret =
69089 + reiser4_dealloc_block(znode_get_block(node), 0,
69090 + BA_DEFER | BA_FORMATTED);
69091 + if (ret)
69092 + warning("zam-942",
69093 +				"can't add block number (%llu) to atom's delete set\n",
69094 + (unsigned long long)(*znode_get_block(node)));
69095 +
69096 + spin_lock_znode(node);
69097 + /* Here we return flush reserved block which was reserved at the
69098 + * moment when this allocated node was marked dirty and still
69099 + * not used by flush in node relocation procedure. */
69100 + if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69101 + txn_atom *atom;
69102 +
69103 + atom = jnode_get_atom(ZJNODE(node));
69104 + assert("zam-939", atom != NULL);
69105 + spin_unlock_znode(node);
69106 + flush_reserved2grabbed(atom, (__u64) 1);
69107 + spin_unlock_atom(atom);
69108 + } else
69109 + spin_unlock_znode(node);
69110 + } else {
69111 +		/* znode has an assigned block which is counted as "fake
69112 +		   allocated". Return it back to "free blocks". */
69113 + fake_allocated2free((__u64) 1, BA_FORMATTED);
69114 + }
69115 +
69116 + /*
69117 + * uncapture page from transaction. There is a possibility of a race
69118 + * with ->releasepage(): reiser4_releasepage() detaches page from this
69119 + * jnode and we have nothing to uncapture. To avoid this, get
69120 + * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
69121 + * will deal with released page itself.
69122 + */
69123 + spin_lock_znode(node);
69124 + page = znode_page(node);
69125 + if (likely(page != NULL)) {
69126 + /*
69127 + * reiser4_uncapture_page() can only be called when we are sure
69128 + * that znode is pinned in memory, which we are, because
69129 + * forget_znode() is only called from longterm_unlock_znode().
69130 + */
69131 + page_cache_get(page);
69132 + spin_unlock_znode(node);
69133 + lock_page(page);
69134 + reiser4_uncapture_page(page);
69135 + unlock_page(page);
69136 + page_cache_release(page);
69137 + } else {
69138 + txn_atom *atom;
69139 +
69140 + /* handle "flush queued" znodes */
69141 + while (1) {
69142 + atom = jnode_get_atom(ZJNODE(node));
69143 + assert("zam-943", atom != NULL);
69144 +
69145 + if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69146 + || !atom->nr_running_queues)
69147 + break;
69148 +
69149 + spin_unlock_znode(node);
69150 + reiser4_atom_wait_event(atom);
69151 + spin_lock_znode(node);
69152 + }
69153 +
69154 + reiser4_uncapture_block(ZJNODE(node));
69155 + spin_unlock_atom(atom);
69156 + zput(node);
69157 + }
69158 +}
69159 +
69160 +/* This is called from longterm_unlock_znode() when last lock is released from
69161 + the node that has been removed from the tree. At this point node is removed
69162 + from sibling list and its lock is invalidated. */
69163 +void forget_znode(lock_handle * handle)
69164 +{
69165 + znode *node;
69166 + reiser4_tree *tree;
69167 +
69168 + assert("umka-319", handle != NULL);
69169 +
69170 + node = handle->node;
69171 + tree = znode_get_tree(node);
69172 +
69173 + assert("vs-164", znode_is_write_locked(node));
69174 + assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69175 + assert_rw_locked(&(node->lock.guard));
69176 +
69177 + /* We assume that this node was detached from its parent before
69178 + * unlocking, it gives no way to reach this node from parent through a
69179 + * down link. The node should have no children and, thereby, can't be
69180 + * reached from them by their parent pointers. The only way to obtain a
69181 + * reference to the node is to use sibling pointers from its left and
69182 + * right neighbors. In the next several lines we remove the node from
69183 + * the sibling list. */
69184 +
69185 + write_lock_tree(tree);
69186 + sibling_list_remove(node);
69187 + znode_remove(node, tree);
69188 + write_unlock_tree(tree);
69189 +
69190 + /* Here we set JNODE_DYING and cancel all pending lock requests. It
69191 + * forces all lock requestor threads to repeat iterations of getting
69192 + * lock on a child, neighbor or parent node. But, those threads can't
69193 + * come to this node again, because this node is no longer a child,
69194 + * neighbor or parent of any other node. This order of znode
69195 +	 * invalidation does not allow other threads to waste cpu time in a busy
69196 +	 * loop, trying to lock a dying object. The exception is in the flush
69197 +	 * code when we take a node directly from the atom's capture list. */
69198 + reiser4_invalidate_lock(handle);
69199 + uncapture_znode(node);
69200 +}
69201 +
69202 +/* Check that internal item at @pointer really contains pointer to @child. */
69203 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69204 + * @child */ ,
69205 + const znode * child /* child znode */ )
69206 +{
69207 + assert("nikita-1016", pointer != NULL);
69208 + assert("nikita-1017", child != NULL);
69209 + assert("nikita-1018", pointer->node != NULL);
69210 +
69211 + assert("nikita-1325", znode_is_any_locked(pointer->node));
69212 +
69213 + assert("nikita-2985",
69214 + znode_get_level(pointer->node) == znode_get_level(child) + 1);
69215 +
69216 + coord_clear_iplug((coord_t *) pointer);
69217 +
69218 + if (coord_is_existing_unit(pointer)) {
69219 + item_plugin *iplug;
69220 + reiser4_block_nr addr;
69221 +
69222 + if (item_is_internal(pointer)) {
69223 + iplug = item_plugin_by_coord(pointer);
69224 + assert("vs-513", iplug->s.internal.down_link);
69225 + iplug->s.internal.down_link(pointer, NULL, &addr);
69226 + /* check that cached value is correct */
69227 + if (disk_addr_eq(&addr, znode_get_block(child))) {
69228 + return NS_FOUND;
69229 + }
69230 + }
69231 + }
69232 + /* warning ("jmacd-1002", "tree pointer incorrect"); */
69233 + return NS_NOT_FOUND;
69234 +}
69235 +
69236 +/* find coord of pointer to new @child in @parent.
69237 +
69238 +   Find the &coord_t in the @parent where the pointer to a given @child
69239 +   will be.
69240 +
69241 +*/
69242 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69243 + znode *
69244 + child UNUSED_ARG /* child znode, passed locked */ ,
69245 + znode * left /* left brother of new node */ ,
69246 + coord_t * result /* where result is stored in */ )
69247 +{
69248 + int ret;
69249 +
69250 + assert("nikita-1486", parent != NULL);
69251 + assert("nikita-1487", child != NULL);
69252 + assert("nikita-1488", result != NULL);
69253 +
69254 + ret = find_child_ptr(parent, left, result);
69255 + if (ret != NS_FOUND) {
69256 + warning("nikita-1489", "Cannot find brother position: %i", ret);
69257 + return RETERR(-EIO);
69258 + } else {
69259 + result->between = AFTER_UNIT;
69260 + return RETERR(NS_NOT_FOUND);
69261 + }
69262 +}
69263 +
69264 +/* find coord of pointer to @child in @parent.
69265 +
69266 +   Find the &coord_t in the @parent where the pointer to a given @child is.
69267 +
69268 +*/
69269 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69270 + znode * child /* child znode, passed locked */ ,
69271 + coord_t * result /* where result is stored in */ )
69272 +{
69273 + int lookup_res;
69274 + node_plugin *nplug;
69275 + /* left delimiting key of a child */
69276 + reiser4_key ld;
69277 + reiser4_tree *tree;
69278 +
69279 + assert("nikita-934", parent != NULL);
69280 + assert("nikita-935", child != NULL);
69281 + assert("nikita-936", result != NULL);
69282 + assert("zam-356", znode_is_loaded(parent));
69283 +
69284 + coord_init_zero(result);
69285 + result->node = parent;
69286 +
69287 + nplug = parent->nplug;
69288 + assert("nikita-939", nplug != NULL);
69289 +
69290 + tree = znode_get_tree(parent);
69291 + /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69292 + * not aliased to ->in_parent of some znode. Otherwise,
69293 + * parent_coord_to_coord() below would modify data protected by tree
69294 + * lock. */
69295 + read_lock_tree(tree);
69296 + /* fast path. Try to use cached value. Lock tree to keep
69297 + node->pos_in_parent and pos->*_blocknr consistent. */
69298 + if (child->in_parent.item_pos + 1 != 0) {
69299 + parent_coord_to_coord(&child->in_parent, result);
69300 + if (check_tree_pointer(result, child) == NS_FOUND) {
69301 + read_unlock_tree(tree);
69302 + return NS_FOUND;
69303 + }
69304 +
69305 + child->in_parent.item_pos = (unsigned short)~0;
69306 + }
69307 + read_unlock_tree(tree);
69308 +
69309 +	/* if the above failed, find some key from @child. We are looking for
69310 +	   the least key in the child. */
69311 + read_lock_dk(tree);
69312 + ld = *znode_get_ld_key(child);
69313 + read_unlock_dk(tree);
69314 + /*
69315 + * now, lookup parent with key just found. Note, that left delimiting
69316 +	 * now, look up the parent with the key just found. Note that the left
69317 +	 * delimiting key doesn't identify a node uniquely, because (in an
69318 +	 * extremely rare case) two nodes can have equal left delimiting keys,
69319 +	 * if one of them is completely filled with directory entries that all
69320 +	 * happen to be hash collisions. But we check the block number in
69321 +	 * check_tree_pointer() and so are safe.
69322 + lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69323 + /* update cached pos_in_node */
69324 + if (lookup_res == NS_FOUND) {
69325 + write_lock_tree(tree);
69326 + coord_to_parent_coord(result, &child->in_parent);
69327 + write_unlock_tree(tree);
69328 + lookup_res = check_tree_pointer(result, child);
69329 + }
69330 + if (lookup_res == NS_NOT_FOUND)
69331 + lookup_res = find_child_by_addr(parent, child, result);
69332 + return lookup_res;
69333 +}
69334 +
69335 +/* find coord of pointer to @child in @parent by scanning
69336 +
69337 + Find the &coord_t in the @parent where pointer to a given @child
69338 + is in by scanning all internal items in @parent and comparing block
69339 + numbers in them with that of @child.
69340 +
69341 +*/
69342 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69343 + znode * child /* child znode, passed locked */ ,
69344 + coord_t * result /* where result is stored in */ )
69345 +{
69346 + int ret;
69347 +
69348 + assert("nikita-1320", parent != NULL);
69349 + assert("nikita-1321", child != NULL);
69350 + assert("nikita-1322", result != NULL);
69351 +
69352 + ret = NS_NOT_FOUND;
69353 +
69354 + for_all_units(result, parent) {
69355 + if (check_tree_pointer(result, child) == NS_FOUND) {
69356 + write_lock_tree(znode_get_tree(parent));
69357 + coord_to_parent_coord(result, &child->in_parent);
69358 + write_unlock_tree(znode_get_tree(parent));
69359 + ret = NS_FOUND;
69360 + break;
69361 + }
69362 + }
69363 + return ret;
69364 +}
69365 +
69366 +/* true if @addr is an "unallocated block number", which is just an address
69367 +   with the highest bit set. */
69368 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
69369 + * check */ )
69370 +{
69371 + assert("nikita-1766", addr != NULL);
69372 + cassert(sizeof(reiser4_block_nr) == 8);
69373 + return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69374 + REISER4_UNALLOCATED_STATUS_VALUE;
69375 +}
69376 +
69377 +/* returns true if removing bytes in the given key range [from_key, to_key]
69378 +   causes removal of the whole item @from */
69379 +static int
69380 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
69381 + const reiser4_key * to_key)
69382 +{
69383 + item_plugin *iplug;
69384 + reiser4_key key_in_item;
69385 +
69386 + assert("umka-325", from != NULL);
69387 + assert("", item_is_extent(from));
69388 +
69389 +	/* check the first key just in case */
69390 + item_key_by_coord(from, &key_in_item);
69391 + if (keygt(from_key, &key_in_item))
69392 + return 0;
69393 +
69394 + /* check last key */
69395 + iplug = item_plugin_by_coord(from);
69396 + assert("vs-611", iplug && iplug->s.file.append_key);
69397 +
69398 + iplug->s.file.append_key(from, &key_in_item);
69399 + set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
69400 +
69401 + if (keylt(to_key, &key_in_item))
69402 + /* last byte is not removed */
69403 + return 0;
69404 + return 1;
69405 +}
69406 +
69407 +/* helper function for prepare_twig_kill(): @left and @right are formatted
69408 + * neighbors of the extent item being completely removed. Load and lock the
69409 + * neighbors and store lock handles into @kdata for later use by kill_hook_extent() */
69410 +static int
69411 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
69412 +{
69413 + int result;
69414 + int left_loaded;
69415 + int right_loaded;
69416 +
69417 + result = 0;
69418 + left_loaded = right_loaded = 0;
69419 +
69420 + if (left != NULL) {
69421 + result = zload(left);
69422 + if (result == 0) {
69423 + left_loaded = 1;
69424 + result = longterm_lock_znode(kdata->left, left,
69425 + ZNODE_READ_LOCK,
69426 + ZNODE_LOCK_LOPRI);
69427 + }
69428 + }
69429 + if (result == 0 && right != NULL) {
69430 + result = zload(right);
69431 + if (result == 0) {
69432 + right_loaded = 1;
69433 + result = longterm_lock_znode(kdata->right, right,
69434 + ZNODE_READ_LOCK,
69435 + ZNODE_LOCK_HIPRI |
69436 + ZNODE_LOCK_NONBLOCK);
69437 + }
69438 + }
69439 + if (result != 0) {
69440 + done_lh(kdata->left);
69441 + done_lh(kdata->right);
69442 + if (left_loaded != 0)
69443 + zrelse(left);
69444 + if (right_loaded != 0)
69445 + zrelse(right);
69446 + }
69447 + return result;
69448 +}
69449 +
69450 +static void done_children(carry_kill_data * kdata)
69451 +{
69452 + if (kdata->left != NULL && kdata->left->node != NULL) {
69453 + zrelse(kdata->left->node);
69454 + done_lh(kdata->left);
69455 + }
69456 + if (kdata->right != NULL && kdata->right->node != NULL) {
69457 + zrelse(kdata->right->node);
69458 + done_lh(kdata->right);
69459 + }
69460 +}
69461 +
69462 +/* part of cut_node. It is called when cut_node is called to remove or cut
69463 +   part of an extent item. When the head of that item is removed, we have to
69464 +   update the right delimiting key of the left neighbor of the extent. When
69465 +   the item is removed completely, we have to set a sibling link between the
69466 +   left and right neighbors of the removed extent. This may return
69467 +   -E_DEADLOCK when trying to lock the left neighbor; the caller should then
69468 +   repeat the attempt. */
69469 +/* Audited by: umka (2002.06.16) */
69470 +static int
69471 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
69472 +{
69473 + int result;
69474 + reiser4_key key;
69475 + lock_handle left_lh;
69476 + lock_handle right_lh;
69477 + coord_t left_coord;
69478 + coord_t *from;
69479 + znode *left_child;
69480 + znode *right_child;
69481 + reiser4_tree *tree;
69482 + int left_zloaded_here, right_zloaded_here;
69483 +
69484 + from = kdata->params.from;
69485 + assert("umka-326", from != NULL);
69486 + assert("umka-327", kdata->params.to != NULL);
69487 +
69488 + /* for one extent item only yet */
69489 + assert("vs-591", item_is_extent(from));
69490 + assert("vs-592", from->item_pos == kdata->params.to->item_pos);
69491 +
69492 + if ((kdata->params.from_key
69493 + && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
69494 + || from->unit_pos != 0) {
69495 + /* head of item @from is not removed, there is nothing to
69496 + worry about */
69497 + return 0;
69498 + }
69499 +
69500 + result = 0;
69501 + left_zloaded_here = 0;
69502 + right_zloaded_here = 0;
69503 +
69504 + left_child = right_child = NULL;
69505 +
69506 + coord_dup(&left_coord, from);
69507 + init_lh(&left_lh);
69508 + init_lh(&right_lh);
69509 + if (coord_prev_unit(&left_coord)) {
69510 + /* @from is leftmost item in its node */
69511 + if (!locked_left_neighbor) {
69512 + result =
69513 + reiser4_get_left_neighbor(&left_lh, from->node,
69514 + ZNODE_READ_LOCK,
69515 + GN_CAN_USE_UPPER_LEVELS);
69516 + switch (result) {
69517 + case 0:
69518 + break;
69519 + case -E_NO_NEIGHBOR:
69520 + /* there is no formatted node to the left of
69521 + from->node */
69522 + warning("vs-605",
69523 + "extent item has smallest key in "
69524 + "the tree and it is about to be removed");
69525 + return 0;
69526 + case -E_DEADLOCK:
69527 + /* need to restart */
69528 + default:
69529 + return result;
69530 + }
69531 +
69532 + /* we have acquired left neighbor of from->node */
69533 + result = zload(left_lh.node);
69534 + if (result)
69535 + goto done;
69536 +
69537 + locked_left_neighbor = left_lh.node;
69538 + } else {
69539 + /* squalloc_right_twig_cut should have supplied locked
69540 + * left neighbor */
69541 + assert("vs-834",
69542 + znode_is_write_locked(locked_left_neighbor));
69543 + result = zload(locked_left_neighbor);
69544 + if (result)
69545 + return result;
69546 + }
69547 +
69548 + left_zloaded_here = 1;
69549 + coord_init_last_unit(&left_coord, locked_left_neighbor);
69550 + }
69551 +
69552 + if (!item_is_internal(&left_coord)) {
69553 + /* what else but extent can be on twig level */
69554 + assert("vs-606", item_is_extent(&left_coord));
69555 +
69556 + /* there is no left formatted child */
69557 + if (left_zloaded_here)
69558 + zrelse(locked_left_neighbor);
69559 + done_lh(&left_lh);
69560 + return 0;
69561 + }
69562 +
69563 + tree = znode_get_tree(left_coord.node);
69564 + left_child = child_znode(&left_coord, left_coord.node, 1, 0);
69565 +
69566 + if (IS_ERR(left_child)) {
69567 + result = PTR_ERR(left_child);
69568 + goto done;
69569 + }
69570 +
69571 + /* left child is acquired, calculate new right delimiting key for it
69572 + and get right child if it is necessary */
69573 + if (item_removed_completely
69574 + (from, kdata->params.from_key, kdata->params.to_key)) {
69575 + /* try to get right child of removed item */
69576 + coord_t right_coord;
69577 +
69578 + assert("vs-607",
69579 + kdata->params.to->unit_pos ==
69580 + coord_last_unit_pos(kdata->params.to));
69581 + coord_dup(&right_coord, kdata->params.to);
69582 + if (coord_next_unit(&right_coord)) {
69583 + /* @to is rightmost unit in the node */
69584 + result =
69585 + reiser4_get_right_neighbor(&right_lh, from->node,
69586 + ZNODE_READ_LOCK,
69587 + GN_CAN_USE_UPPER_LEVELS);
69588 + switch (result) {
69589 + case 0:
69590 + result = zload(right_lh.node);
69591 + if (result)
69592 + goto done;
69593 +
69594 + right_zloaded_here = 1;
69595 + coord_init_first_unit(&right_coord,
69596 + right_lh.node);
69597 + item_key_by_coord(&right_coord, &key);
69598 + break;
69599 +
69600 + case -E_NO_NEIGHBOR:
69601 + /* there is no formatted node to the right of
69602 + from->node */
69603 + read_lock_dk(tree);
69604 + key = *znode_get_rd_key(from->node);
69605 + read_unlock_dk(tree);
69606 + right_coord.node = NULL;
69607 + result = 0;
69608 + break;
69609 + default:
69610 + /* real error */
69611 + goto done;
69612 + }
69613 + } else {
69614 + /* there is an item to the right of @from - take its key */
69615 + item_key_by_coord(&right_coord, &key);
69616 + }
69617 +
69618 + /* try to get right child of @from */
69619 + if (right_coord.node && /* there is right neighbor of @from */
69620 + item_is_internal(&right_coord)) { /* it is internal item */
69621 + right_child = child_znode(&right_coord,
69622 + right_coord.node, 1, 0);
69623 +
69624 + if (IS_ERR(right_child)) {
69625 + result = PTR_ERR(right_child);
69626 + goto done;
69627 + }
69628 +
69629 + }
69630 + /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
69631 + update of right delimiting key of left_child */
69632 + result = prepare_children(left_child, right_child, kdata);
69633 + } else {
69634 +		/* head of item @to is removed. left_child has to get a right delimiting key update. Prepare it for that */
69635 + result = prepare_children(left_child, NULL, kdata);
69636 + }
69637 +
69638 + done:
69639 + if (right_child)
69640 + zput(right_child);
69641 + if (right_zloaded_here)
69642 + zrelse(right_lh.node);
69643 + done_lh(&right_lh);
69644 +
69645 + if (left_child)
69646 + zput(left_child);
69647 + if (left_zloaded_here)
69648 + zrelse(locked_left_neighbor);
69649 + done_lh(&left_lh);
69650 + return result;
69651 +}
69652 +
69653 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
69654 + are to be cut completely */
69655 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
69656 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
69657 + const reiser4_key * to_key, /* last key to be removed */
69658 + reiser4_key *
69659 + smallest_removed /* smallest key actually removed */ )
69660 +{
69661 + int result;
69662 + carry_pool *pool;
69663 + carry_level *lowest_level;
69664 + carry_cut_data *cut_data;
69665 + carry_op *op;
69666 +
69667 + assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
69668 +
69669 + pool =
69670 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69671 + sizeof(*cut_data));
69672 + if (IS_ERR(pool))
69673 + return PTR_ERR(pool);
69674 + lowest_level = (carry_level *) (pool + 1);
69675 + init_carry_level(lowest_level, pool);
69676 +
69677 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
69678 + assert("vs-1509", op != 0);
69679 + if (IS_ERR(op)) {
69680 + done_carry_pool(pool);
69681 + return PTR_ERR(op);
69682 + }
69683 +
69684 + cut_data = (carry_cut_data *) (lowest_level + 3);
69685 + cut_data->params.from = from;
69686 + cut_data->params.to = to;
69687 + cut_data->params.from_key = from_key;
69688 + cut_data->params.to_key = to_key;
69689 + cut_data->params.smallest_removed = smallest_removed;
69690 +
69691 + op->u.cut_or_kill.is_cut = 1;
69692 + op->u.cut_or_kill.u.cut = cut_data;
69693 +
69694 + result = reiser4_carry(lowest_level, NULL);
69695 + done_carry_pool(pool);
69696 +
69697 + return result;
69698 +}
69699 +
69700 +/* cut part of the node
69701 +
69702 + Cut part or whole content of node.
69703 +
69704 + cut data between @from and @to of @from->node and call carry() to make
69705 + corresponding changes in the tree. @from->node may become empty. If so -
69706 + pointer to it will be removed. Neighboring nodes are not changed. Smallest
69707 + removed key is stored in @smallest_removed
69708 +
69709 +*/
69710 +int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
69711 + coord_t * to, /* coord of the last unit/item that will be eliminated */
69712 + const reiser4_key * from_key, /* first key to be removed */
69713 + const reiser4_key * to_key, /* last key to be removed */
69714 + reiser4_key * smallest_removed, /* smallest key actually removed */
69715 + znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
69716 + * locked (in squalloc_right_twig_cut, namely) */
69717 + struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
69718 + invalidate pages together with item pointing to them */
69719 + int truncate)
69720 +{	/* @truncate is set when this call is made for file truncate */
69721 + int result;
69722 + carry_pool *pool;
69723 + carry_level *lowest_level;
69724 + carry_kill_data *kdata;
69725 + lock_handle *left_child;
69726 + lock_handle *right_child;
69727 + carry_op *op;
69728 +
69729 + assert("umka-328", from != NULL);
69730 + assert("vs-316", !node_is_empty(from->node));
69731 + assert("nikita-1812", coord_is_existing_unit(from)
69732 + && coord_is_existing_unit(to));
69733 +
69734 + /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
69735 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69736 + sizeof(carry_kill_data) +
69737 + 2 * sizeof(lock_handle) +
69738 + 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
69739 + if (IS_ERR(pool))
69740 + return PTR_ERR(pool);
69741 +
69742 + lowest_level = (carry_level *) (pool + 1);
69743 + init_carry_level(lowest_level, pool);
69744 +
69745 + kdata = (carry_kill_data *) (lowest_level + 3);
69746 + left_child = (lock_handle *) (kdata + 1);
69747 + right_child = left_child + 1;
69748 +
69749 + init_lh(left_child);
69750 + init_lh(right_child);
69751 +
69752 + kdata->params.from = from;
69753 + kdata->params.to = to;
69754 + kdata->params.from_key = from_key;
69755 + kdata->params.to_key = to_key;
69756 + kdata->params.smallest_removed = smallest_removed;
69757 + kdata->params.truncate = truncate;
69758 + kdata->flags = 0;
69759 + kdata->inode = inode;
69760 + kdata->left = left_child;
69761 + kdata->right = right_child;
69762 + /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
69763 + kdata->buf = (char *)(right_child + 1);
69764 +
69765 + if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
69766 +		/* the left child of the extent item may have to get its right
69767 +		   delimiting key updated and to get linked with the right child of
69768 +		   extent @from if the extent is removed completely */
69769 + result = prepare_twig_kill(kdata, locked_left_neighbor);
69770 + if (result) {
69771 + done_children(kdata);
69772 + done_carry_pool(pool);
69773 + return result;
69774 + }
69775 + }
69776 +
69777 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
69778 + if (IS_ERR(op) || (op == NULL)) {
69779 + done_children(kdata);
69780 + done_carry_pool(pool);
69781 + return RETERR(op ? PTR_ERR(op) : -EIO);
69782 + }
69783 +
69784 + op->u.cut_or_kill.is_cut = 0;
69785 + op->u.cut_or_kill.u.kill = kdata;
69786 +
69787 + result = reiser4_carry(lowest_level, NULL);
69788 +
69789 + done_children(kdata);
69790 + done_carry_pool(pool);
69791 + return result;
69792 +}
69793 +
69794 +void
69795 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
69796 +{
69797 + if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
69798 + pgoff_t start_pg, end_pg;
69799 +
69800 + start_pg = start >> PAGE_CACHE_SHIFT;
69801 + end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
69802 +
69803 + if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
69804 + /*
69805 + * kill up to the page boundary.
69806 + */
69807 + assert("vs-123456", start_pg == end_pg);
69808 + reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
69809 + truncate);
69810 + } else if (start_pg != end_pg) {
69811 + /*
69812 + * page boundary is within killed portion of node.
69813 + */
69814 + assert("vs-654321", end_pg - start_pg == 1);
69815 + reiser4_invalidate_pages(inode->i_mapping, end_pg,
69816 + end_pg - start_pg, 1);
69817 + }
69818 + }
69819 + inode_sub_bytes(inode, end - start);
69820 +}
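+
+/*
+ * Worked example for the page arithmetic above, assuming 4 KiB pages
+ * (PAGE_CACHE_SHIFT == 12). Killing bytes [4096, 6144) gives start_pg == 1
+ * and end_pg == (6143 >> 12) == 1; @start is page-aligned, so exactly page 1
+ * is invalidated. Killing [2048, 6144) gives start_pg == 0 and end_pg == 1;
+ * @start is not aligned, so only the fully covered page 1 is invalidated
+ * (the end_pg - start_pg == 1 branch) and page 0 keeps its live head
+ * [0, 2048).
+ */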
69821 +
69822 +/**
69823 + * Delete whole @node from the reiser4 tree without loading it.
69824 + *
69825 + * @left: locked left neighbor,
69826 + * @node: node to be deleted,
69827 + * @smallest_removed: leftmost key of deleted node,
69828 + * @object: inode pointer, if we truncate a file body.
69829 + * @truncate: true if called for file truncate.
69830 + *
69831 + * @return: 0 if success, error code otherwise.
69832 + *
69833 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
69834 + * contains the right value of the smallest removed key from the previous
69835 + * cut_worker() iteration. This is needed for proper accounting of
69836 + * "i_blocks" and "i_bytes" fields of the @object.
69837 + */
69838 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
69839 + struct inode *object, int truncate)
69840 +{
69841 + lock_handle parent_lock;
69842 + coord_t cut_from;
69843 + coord_t cut_to;
69844 + reiser4_tree *tree;
69845 + int ret;
69846 +
69847 + assert("zam-937", node != NULL);
69848 + assert("zam-933", znode_is_write_locked(node));
69849 + assert("zam-999", smallest_removed != NULL);
69850 +
69851 + init_lh(&parent_lock);
69852 +
69853 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
69854 + if (ret)
69855 + return ret;
69856 +
69857 + assert("zam-934", !znode_above_root(parent_lock.node));
69858 +
69859 + ret = zload(parent_lock.node);
69860 + if (ret)
69861 + goto failed_nozrelse;
69862 +
69863 + ret = find_child_ptr(parent_lock.node, node, &cut_from);
69864 + if (ret)
69865 + goto failed;
69866 +
69867 +	/* decrement the child counter and set the parent pointer to NULL before
69868 +	   deleting the item from the parent node because of checks in
69869 +	   internal_kill_item_hook (we can delete the last item from the parent
69870 +	   node, the parent node is then going to be deleted and its c_count should
69871 +	   be zero). */
69872 +
69873 + tree = znode_get_tree(node);
69874 + write_lock_tree(tree);
69875 + init_parent_coord(&node->in_parent, NULL);
69876 + --parent_lock.node->c_count;
69877 + write_unlock_tree(tree);
69878 +
69879 + assert("zam-989", item_is_internal(&cut_from));
69880 +
69881 + /* @node should be deleted after unlocking. */
69882 + ZF_SET(node, JNODE_HEARD_BANSHEE);
69883 +
69884 + /* remove a pointer from the parent node to the node being deleted. */
69885 + coord_dup(&cut_to, &cut_from);
69886 + /* FIXME: shouldn't this be kill_node_content */
69887 + ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
69888 + if (ret)
69889 + /* FIXME(Zam): Should we re-connect the node to its parent if
69890 + * cut_node fails? */
69891 + goto failed;
69892 +
69893 + {
69894 + reiser4_tree *tree = current_tree;
69895 + __u64 start_offset = 0, end_offset = 0;
69896 +
69897 + read_lock_tree(tree);
69898 + write_lock_dk(tree);
69899 + if (object) {
69900 +			/* We use @smallest_removed and the left delimiting key of
69901 +			 * the current node for the @object->i_blocks and i_bytes
69902 +			 * calculation. We assume that the items after the
69903 +			 * *@smallest_removed key have been deleted from the
69904 +			 * file body. */
69905 + start_offset = get_key_offset(znode_get_ld_key(node));
69906 + end_offset = get_key_offset(smallest_removed);
69907 + }
69908 +
69909 + assert("zam-1021", znode_is_connected(node));
69910 + if (node->left)
69911 + znode_set_rd_key(node->left, znode_get_rd_key(node));
69912 +
69913 + *smallest_removed = *znode_get_ld_key(node);
69914 +
69915 + write_unlock_dk(tree);
69916 + read_unlock_tree(tree);
69917 +
69918 + if (object) {
69919 +			/* actions which are to be performed on items when they are removed from the tree are normally
69920 +			   done by a special item method - kill_hook. Here, for optimization reasons, we avoid reading
69921 +			   the node containing the item we remove and thus cannot call the item's kill hook. Instead we
69922 +			   call a function which does exactly what the tail kill hook does, under the assumption that the
69923 +			   node we avoid reading contains only one item and that item is a tail one. */
69924 + fake_kill_hook_tail(object, start_offset, end_offset,
69925 + truncate);
69926 + }
69927 + }
69928 + failed:
69929 + zrelse(parent_lock.node);
69930 + failed_nozrelse:
69931 + done_lh(&parent_lock);
69932 +
69933 + return ret;
69934 +}
69935 +
69936 +static int can_delete(const reiser4_key *key, znode *node)
69937 +{
69938 + int result;
69939 +
69940 + read_lock_dk(current_tree);
69941 + result = keyle(key, znode_get_ld_key(node));
69942 + read_unlock_dk(current_tree);
69943 + return result;
69944 +}
69945 +
69946 +/**
69947 + * This subroutine is not optimal, but its implementation seems to
69948 + * be simpler.
69949 + *
69950 + * @tap: the point deletion process begins from,
69951 + * @from_key: the beginning of the deleted key range,
69952 + * @to_key: the end of the deleted key range,
69953 + * @smallest_removed: the smallest removed key,
69954 + * @truncate: true if called for file truncate.
69955 + * @progress: set to true if progress in deleting file items was made;
69956 + *            the @smallest_removed value is valid in that case.
69957 + *
69958 + * @return: 0 if success, error code otherwise; -E_REPEAT means that a long
69959 + * reiser4_cut_tree operation was interrupted to allow an atom commit.
69960 + */
69961 +int
69962 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
69963 + const reiser4_key * to_key,
69964 + reiser4_key * smallest_removed, struct inode *object,
69965 + int truncate, int *progress)
69966 +{
69967 + lock_handle next_node_lock;
69968 + coord_t left_coord;
69969 + int result;
69970 +
69971 + assert("zam-931", tap->coord->node != NULL);
69972 + assert("zam-932", znode_is_write_locked(tap->coord->node));
69973 +
69974 + *progress = 0;
69975 + init_lh(&next_node_lock);
69976 +
69977 + while (1) {
69978 + znode *node; /* node from which items are cut */
69979 + node_plugin *nplug; /* node plugin for @node */
69980 +
69981 + node = tap->coord->node;
69982 +
69983 + /* Move next_node_lock to the next node on the left. */
69984 + result =
69985 + reiser4_get_left_neighbor(&next_node_lock, node,
69986 + ZNODE_WRITE_LOCK,
69987 + GN_CAN_USE_UPPER_LEVELS);
69988 + if (result != 0 && result != -E_NO_NEIGHBOR)
69989 + break;
69990 +		/* Check whether we can delete the node as a whole. */
69991 + if (*progress && znode_get_level(node) == LEAF_LEVEL &&
69992 + can_delete(from_key, node)) {
69993 + result = reiser4_delete_node(node, smallest_removed,
69994 + object, truncate);
69995 + } else {
69996 + result = reiser4_tap_load(tap);
69997 + if (result)
69998 + return result;
69999 +
70000 + /* Prepare the second (right) point for cut_node() */
70001 + if (*progress)
70002 + coord_init_last_unit(tap->coord, node);
70003 +
70004 + else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70005 + NULL)
70006 + /* set rightmost unit for the items without lookup method */
70007 + tap->coord->unit_pos =
70008 + coord_last_unit_pos(tap->coord);
70009 +
70010 + nplug = node->nplug;
70011 +
70012 + assert("vs-686", nplug);
70013 + assert("vs-687", nplug->lookup);
70014 +
70015 + /* left_coord is leftmost unit cut from @node */
70016 + result = nplug->lookup(node, from_key,
70017 + FIND_MAX_NOT_MORE_THAN,
70018 + &left_coord);
70019 +
70020 + if (IS_CBKERR(result))
70021 + break;
70022 +
70023 + /* adjust coordinates so that they are set to existing units */
70024 + if (coord_set_to_right(&left_coord)
70025 + || coord_set_to_left(tap->coord)) {
70026 + result = 0;
70027 + break;
70028 + }
70029 +
70030 + if (coord_compare(&left_coord, tap->coord) ==
70031 + COORD_CMP_ON_RIGHT) {
70032 + /* keys from @from_key to @to_key are not in the tree */
70033 + result = 0;
70034 + break;
70035 + }
70036 +
70037 + if (left_coord.item_pos != tap->coord->item_pos) {
70038 +				/* do not allow cutting more than one item. This was added to solve the problem of
70039 +				   truncating partially converted files. If a file is partially converted, there may exist
70040 +				   a twig node containing both internal items pointing to leaf nodes with formatting items
70041 +				   and an extent item. We do not want to kill internal items sitting in a twig node here,
70042 +				   because cut_tree_worker assumes killing them from the leaf level */
70043 + coord_dup(&left_coord, tap->coord);
70044 + assert("vs-1652",
70045 + coord_is_existing_unit(&left_coord));
70046 + left_coord.unit_pos = 0;
70047 + }
70048 +
70049 + /* cut data from one node */
70050 + // *smallest_removed = *reiser4_min_key();
70051 + result =
70052 + kill_node_content(&left_coord, tap->coord, from_key,
70053 + to_key, smallest_removed,
70054 + next_node_lock.node, object,
70055 + truncate);
70056 + reiser4_tap_relse(tap);
70057 + }
70058 + if (result)
70059 + break;
70060 +
70061 + ++(*progress);
70062 +
70063 + /* Check whether all items with keys >= from_key were removed
70064 + * from the tree. */
70065 + if (keyle(smallest_removed, from_key))
70066 + /* result = 0; */
70067 + break;
70068 +
70069 + if (next_node_lock.node == NULL)
70070 + break;
70071 +
70072 + result = reiser4_tap_move(tap, &next_node_lock);
70073 + done_lh(&next_node_lock);
70074 + if (result)
70075 + break;
70076 +
70077 + /* Break long reiser4_cut_tree operation (deletion of a large
70078 + file) if atom requires commit. */
70079 + if (*progress > CUT_TREE_MIN_ITERATIONS
70080 + && current_atom_should_commit()) {
70081 + result = -E_REPEAT;
70082 + break;
70083 + }
70084 + }
70085 + done_lh(&next_node_lock);
70086 + // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
70087 + return result;
70088 +}
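+
+/*
+ * Summary of the loop above: the worker sweeps the key range right-to-left.
+ * On each pass it (1) write-locks the left neighbor to keep a foothold,
+ * (2) deletes the current leaf wholesale via reiser4_delete_node() when the
+ * whole node lies inside the killed range (can_delete()), and otherwise
+ * (3) looks up the leftmost unit to cut and kills [left_coord, tap->coord]
+ * through kill_node_content(). The sweep stops once keys below @from_key are
+ * reached, when there is no left neighbor, or (with -E_REPEAT) when the atom
+ * has grown enough that it should commit before deletion continues.
+ */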
70089 +
70090 +/* there is a fundamental problem with optimizing deletes: VFS does it
70091 +   one file at a time. Another problem is that if an item can be
70092 +   anything, then deleting items must be done one at a time. It just
70093 +   seems cleaner, though, to write this so as to take a from key and a to
70094 +   key and cut everything between them. */
70095 +
70096 +/* use this function with care if deleting more than what is part of a single file. */
70097 +/* do not use this when cutting a single item; it is suboptimal for that */
70098 +
70099 +/* You are encouraged to write plugin specific versions of this. It
70100 + cannot be optimal for all plugins because it works item at a time,
70101 + and some plugins could sometimes work node at a time. Regular files
70102 + however are not optimizable to work node at a time because of
70103 + extents needing to free the blocks they point to.
70104 +
70105 + Optimizations compared to v3 code:
70106 +
70107 + It does not balance (that task is left to memory pressure code).
70108 +
70109 + Nodes are deleted only if empty.
70110 +
70111 + Uses extents.
70112 +
70113 + Performs read-ahead of formatted nodes whose contents are part of
70114 + the deletion.
70115 +*/
70116 +
70117 +/**
70118 + * Delete everything from the reiser4 tree between two keys: @from_key and
70119 + * @to_key.
70120 + *
70121 + * @from_key: the beginning of the deleted key range,
70122 + * @to_key: the end of the deleted key range,
70123 + * @smallest_removed: the smallest removed key,
70124 + * @object: owner of cutting items.
70125 + * @truncate: true if called for file truncate.
70126 + * @progress: set to true if progress in deleting file items was made;
70127 + *            the @smallest_removed value is valid in that case.
70128 + *
70129 + * @return: 0 if success, error code otherwise; -E_REPEAT means that a long
70130 + * cut_tree operation was interrupted to allow an atom commit.
70131 + */
70132 +
70133 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70134 + const reiser4_key * to_key,
70135 + reiser4_key * smallest_removed_p,
70136 + struct inode *object, int truncate, int *progress)
70137 +{
70138 + lock_handle lock;
70139 + int result;
70140 + tap_t tap;
70141 + coord_t right_coord;
70142 + reiser4_key smallest_removed;
70143 + int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70144 + const reiser4_key *, reiser4_key *,
70145 + struct inode *, int, int *);
70146 + STORE_COUNTERS;
70147 +
70148 + assert("umka-329", tree != NULL);
70149 + assert("umka-330", from_key != NULL);
70150 + assert("umka-331", to_key != NULL);
70151 + assert("zam-936", keyle(from_key, to_key));
70152 +
70153 + if (smallest_removed_p == NULL)
70154 + smallest_removed_p = &smallest_removed;
70155 +
70156 + init_lh(&lock);
70157 +
70158 + do {
70159 + /* Find rightmost item to cut away from the tree. */
70160 + result = reiser4_object_lookup(object, to_key, &right_coord,
70161 + &lock, ZNODE_WRITE_LOCK,
70162 + FIND_MAX_NOT_MORE_THAN,
70163 + TWIG_LEVEL, LEAF_LEVEL,
70164 + CBK_UNIQUE, NULL /*ra_info */);
70165 + if (result != CBK_COORD_FOUND)
70166 + break;
70167 + if (object == NULL
70168 + || inode_file_plugin(object)->cut_tree_worker == NULL)
70169 + cut_tree_worker = cut_tree_worker_common;
70170 + else
70171 + cut_tree_worker =
70172 + inode_file_plugin(object)->cut_tree_worker;
70173 + reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70174 + result =
70175 + cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70176 + object, truncate, progress);
70177 + reiser4_tap_done(&tap);
70178 +
70179 + reiser4_preempt_point();
70180 +
70181 + } while (0);
70182 +
70183 + done_lh(&lock);
70184 +
70185 + if (result) {
70186 + switch (result) {
70187 + case -E_NO_NEIGHBOR:
70188 + result = 0;
70189 + break;
70190 + case -E_DEADLOCK:
70191 + result = -E_REPEAT;
70192 + case -E_REPEAT:
70193 + case -ENOMEM:
70194 + case -ENOENT:
70195 + break;
70196 + default:
70197 + warning("nikita-2861", "failure: %i", result);
70198 + }
70199 + }
70200 +
70201 + CHECK_COUNTERS;
70202 + return result;
70203 +}
70204 +
70205 +/* repeat reiser4_cut_tree_object until everything is deleted.
70206 + * unlike cut_file_items, it does not end the current transaction if -E_REPEAT
70207 + * is returned by reiser4_cut_tree_object. */
70208 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70209 + const reiser4_key * to, struct inode *inode, int truncate)
70210 +{
70211 + int result;
70212 + int progress;
70213 +
70214 + do {
70215 + result = reiser4_cut_tree_object(tree, from, to, NULL,
70216 + inode, truncate, &progress);
70217 + } while (result == -E_REPEAT);
70218 +
70219 + return result;
70220 +}
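+
+/*
+ * Illustrative sketch only: how file truncate could be expressed in terms of
+ * reiser4_cut_tree(). The key-building helper build_body_key() below is
+ * hypothetical (in the real code body keys are built by the file plugin);
+ * the point is merely that truncate reduces to cutting the key range
+ * [new_size, max_key] of the file body, with -E_REPEAT restarts handled
+ * inside reiser4_cut_tree().
+ */
+static inline int truncate_body_example(struct inode *inode, loff_t new_size)
+{
+	reiser4_key from, to;
+
+	build_body_key(inode, new_size, &from);	/* hypothetical helper */
+	to = from;
+	set_key_offset(&to, get_key_offset(reiser4_max_key()));
+
+	return reiser4_cut_tree(current_tree, &from, &to, inode,
+				1 /* truncate */);
+}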
70221 +
70222 +/* finishing reiser4 initialization */
70223 +int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
70224 + * initialized */ ,
70225 + const reiser4_block_nr * root_block /* address of a root block
70226 + * on a disk */ ,
70227 + tree_level height /* height of a tree */ ,
70228 + node_plugin * nplug /* default node plugin */ )
70229 +{
70230 + int result;
70231 +
70232 + assert("nikita-306", tree != NULL);
70233 + assert("nikita-307", root_block != NULL);
70234 + assert("nikita-308", height > 0);
70235 + assert("nikita-309", nplug != NULL);
70236 + assert("zam-587", tree->super != NULL);
70237 +
70238 + tree->root_block = *root_block;
70239 + tree->height = height;
70240 + tree->estimate_one_insert = calc_estimate_one_insert(height);
70241 + tree->nplug = nplug;
70242 +
70243 + tree->znode_epoch = 1ull;
70244 +
70245 + cbk_cache_init(&tree->cbk_cache);
70246 +
70247 + result = znodes_tree_init(tree);
70248 + if (result == 0)
70249 + result = jnodes_tree_init(tree);
70250 + if (result == 0) {
70251 + tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
70252 + reiser4_ctx_gfp_mask_get());
70253 + if (IS_ERR(tree->uber)) {
70254 + result = PTR_ERR(tree->uber);
70255 + tree->uber = NULL;
70256 + }
70257 + }
70258 + return result;
70259 +}
70260 +
70261 +/* release resources associated with @tree */
70262 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
70263 +{
70264 + if (tree == NULL)
70265 + return;
70266 +
70267 + if (tree->uber != NULL) {
70268 + zput(tree->uber);
70269 + tree->uber = NULL;
70270 + }
70271 + znodes_tree_done(tree);
70272 + jnodes_tree_done(tree);
70273 + cbk_cache_done(&tree->cbk_cache);
70274 +}
70275 +
70276 +/* Make Linus happy.
70277 + Local variables:
70278 + c-indentation-style: "K&R"
70279 + mode-name: "LC"
70280 + c-basic-offset: 8
70281 + tab-width: 8
70282 + fill-column: 120
70283 + scroll-step: 1
70284 + End:
70285 +*/
70286 diff -urN linux-2.6.22.orig/fs/reiser4/tree.h linux-2.6.22/fs/reiser4/tree.h
70287 --- linux-2.6.22.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
70288 +++ linux-2.6.22/fs/reiser4/tree.h 2007-07-29 00:25:35.028735820 +0400
70289 @@ -0,0 +1,577 @@
70290 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70291 + * reiser4/README */
70292 +
70293 +/* Tree operations. See fs/reiser4/tree.c for comments */
70294 +
70295 +#if !defined( __REISER4_TREE_H__ )
70296 +#define __REISER4_TREE_H__
70297 +
70298 +#include "forward.h"
70299 +#include "debug.h"
70300 +#include "dformat.h"
70301 +#include "plugin/node/node.h"
70302 +#include "plugin/plugin.h"
70303 +#include "znode.h"
70304 +#include "tap.h"
70305 +
70306 +#include <linux/types.h> /* for __u?? */
70307 +#include <linux/fs.h> /* for struct super_block */
70308 +#include <linux/spinlock.h>
70309 +#include <linux/sched.h> /* for struct task_struct */
70310 +
70311 +/* fictive block number never actually used */
70312 +extern const reiser4_block_nr UBER_TREE_ADDR;
70313 +
70314 +/* &cbk_cache_slot - entry in a coord cache.
70315 +
70316 + This is entry in a coord_by_key (cbk) cache, represented by
70317 + &cbk_cache.
70318 +
70319 +*/
70320 +typedef struct cbk_cache_slot {
70321 + /* cached node */
70322 + znode *node;
70323 + /* linkage to the next cbk cache slot in a LRU order */
70324 + struct list_head lru;
70325 +} cbk_cache_slot;
70326 +
70327 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
70328 +
70329 +   cbk_cache is supposed to speed up tree lookups by caching results of recent
70330 +   successful lookups (we don't cache negative results as the dentry cache
70331 +   does). The cache consists of a relatively small number of entries kept in
70332 +   LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
70333 +   which we can obtain the range of keys covered by this znode. Before
70334 +   embarking on a real tree traversal we scan the cbk_cache slot by slot and
70335 +   for each slot check whether the key we are looking for is between the
70336 +   minimal and maximal keys of the node pointed to by this slot. If no match
70337 +   is found, a real tree traversal is performed, and if its result is
70338 +   successful, an appropriate entry is inserted into the cache, possibly
70339 +   pulling the least recently used entry out of it.
70340 +
70341 +   The tree spin lock is used to protect the coord cache. If contention for
70342 +   this lock proves to be too high, finer-grained locking can be added.
70343 +
70344 + Invariants involving parts of this data-type:
70345 +
70346 + [cbk-cache-invariant]
70347 +*/
70348 +typedef struct cbk_cache {
70349 + /* serializator */
70350 + rwlock_t guard;
70351 + int nr_slots;
70352 + /* head of LRU list of cache slots */
70353 + struct list_head lru;
70354 + /* actual array of slots */
70355 + cbk_cache_slot *slot;
70356 +} cbk_cache;
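+
+/* Illustrative sketch only (the real scan is implemented elsewhere in this
+   patch): conceptually, a cbk_cache lookup walks the LRU list and returns the
+   first cached znode whose delimiting keys bracket @key. The keyle()/keylt()
+   comparators are assumed to come from key.h; dk locking and znode validity
+   checks are omitted for brevity. */
+static inline znode *cbk_cache_scan_example(cbk_cache * cache,
+					    const reiser4_key * key)
+{
+	cbk_cache_slot *slot;
+
+	list_for_each_entry(slot, &cache->lru, lru) {
+		znode *node = slot->node;
+
+		if (node != NULL &&
+		    keyle(znode_get_ld_key(node), key) &&
+		    keylt(key, znode_get_rd_key(node)))
+			return node;	/* cache hit */
+	}
+	return NULL;	/* miss: fall back to a real tree traversal */
+}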
70357 +
70358 +/* level_lookup_result - possible outcome of looking up key at some level.
70359 + This is used by coord_by_key when traversing tree downward. */
70360 +typedef enum {
70361 + /* continue to the next level */
70362 + LOOKUP_CONT,
70363 + /* done. Either required item was found, or we can prove it
70364 + doesn't exist, or some error occurred. */
70365 + LOOKUP_DONE,
70366 + /* restart traversal from the root. Infamous "repetition". */
70367 + LOOKUP_REST
70368 +} level_lookup_result;
70369 +
70370 +/* This is the representation of the internal reiser4 tree where all
70371 +   file-system data and meta-data are stored. This structure is passed to all
70372 +   tree manipulation functions. It's different from the super block because
70373 +   we don't want to limit ourselves to a strictly one-to-one mapping
70374 +   between super blocks and trees, and because they are logically
70375 + different: there are things in a super block that have no relation to
70376 + the tree (bitmaps, journalling area, mount options, etc.) and there
70377 + are things in a tree that bear no relation to the super block, like
70378 + tree of znodes.
70379 +
70380 + At this time, there is only one tree
70381 + per filesystem, and this struct is part of the super block. We only
70382 + call the super block the super block for historical reasons (most
70383 + other filesystems call the per filesystem metadata the super block).
70384 +*/
70385 +
70386 +struct reiser4_tree {
70387 +	/* block_nr == 0 is the fake znode. Write lock it while changing
70388 +	   the tree height. */
70389 + /* disk address of root node of a tree */
70390 + reiser4_block_nr root_block;
70391 +
70392 + /* level of the root node. If this is 1, tree consists of root
70393 + node only */
70394 + tree_level height;
70395 +
70396 + /*
70397 +	 * this is cached here to avoid calling plugins through a function
70398 +	 * dereference all the time.
70399 + */
70400 + __u64 estimate_one_insert;
70401 +
70402 + /* cache of recent tree lookup results */
70403 + cbk_cache cbk_cache;
70404 +
70405 + /* hash table to look up znodes by block number. */
70406 + z_hash_table zhash_table;
70407 + z_hash_table zfake_table;
70408 + /* hash table to look up jnodes by inode and offset. */
70409 + j_hash_table jhash_table;
70410 +
70411 + /* lock protecting:
70412 + - parent pointers,
70413 + - sibling pointers,
70414 + - znode hash table
70415 + - coord cache
70416 + */
70417 +	/* NOTE: The "giant" tree lock can be replaced by more spin locks,
70418 +	   hoping they will be less contended. We can use one spin lock per
70419 +	   znode hash bucket. With the addition of some code complexity, sibling
70420 +	   pointers can be protected by both znode spin locks. However, to see
70421 +	   whether this is more SMP scalable, we should test this locking change
70422 +	   on n-way (n > 4) SMP machines. The current 4-way machine test does not
70423 +	   show that the tree lock is contended and a bottleneck (2003.07.25). */
70424 +
70425 + rwlock_t tree_lock;
70426 +
70427 + /* lock protecting delimiting keys */
70428 + rwlock_t dk_lock;
70429 +
70430 + /* spin lock protecting znode_epoch */
70431 + spinlock_t epoch_lock;
70432 + /* version stamp used to mark znode updates. See seal.[ch] for more
70433 + * information. */
70434 + __u64 znode_epoch;
70435 +
70436 + znode *uber;
70437 + node_plugin *nplug;
70438 + struct super_block *super;
70439 + struct {
70440 + /* carry flags used for insertion of new nodes */
70441 + __u32 new_node_flags;
70442 + /* carry flags used for insertion of new extents */
70443 + __u32 new_extent_flags;
70444 + /* carry flags used for paste operations */
70445 + __u32 paste_flags;
70446 + /* carry flags used for insert operations */
70447 + __u32 insert_flags;
70448 + } carry;
70449 +};
70450 +
70451 +extern int reiser4_init_tree(reiser4_tree * tree,
70452 + const reiser4_block_nr * root_block,
70453 + tree_level height, node_plugin * default_plugin);
70454 +extern void reiser4_done_tree(reiser4_tree * tree);
70455 +
70456 +/* cbk flags: options for coord_by_key() */
70457 +typedef enum {
70458 + /* coord_by_key() is called for insertion. This is necessary because
70459 + of extents being located at the twig level. For explanation, see
70460 + comment just above is_next_item_internal().
70461 + */
70462 + CBK_FOR_INSERT = (1 << 0),
70463 + /* coord_by_key() is called with key that is known to be unique */
70464 + CBK_UNIQUE = (1 << 1),
70465 +	/* coord_by_key() can trust delimiting keys. This option is not user
70466 +	   accessible. coord_by_key() will set it automatically. It will only
70467 +	   be cleared by a special case in extents-on-the-twig-level handling
70468 + where it is necessary to insert item with a key smaller than
70469 + leftmost key in a node. This is necessary because of extents being
70470 + located at the twig level. For explanation, see comment just above
70471 + is_next_item_internal().
70472 + */
70473 + CBK_TRUST_DK = (1 << 2),
70474 + CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
70475 + CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
70476 + CBK_DKSET = (1 << 5),
70477 + CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
70478 + CBK_IN_CACHE = (1 << 7), /* node is already in cache */
70479 +	CBK_USE_CRABLOCK = (1 << 8)	/* use crab_lock instead of a long-term
70480 +					 * lock */
70481 +} cbk_flags;
70482 +
70483 +/* insertion outcome. IBK = insert by key */
70484 +typedef enum {
70485 + IBK_INSERT_OK = 0,
70486 + IBK_ALREADY_EXISTS = -EEXIST,
70487 + IBK_IO_ERROR = -EIO,
70488 + IBK_NO_SPACE = -E_NODE_FULL,
70489 + IBK_OOM = -ENOMEM
70490 +} insert_result;
70491 +
70492 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
70493 +
70494 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
70495 + lock_handle * lh, void *arg);
70496 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
70497 + lock_handle * lh,
70498 + tree_iterate_actor_t actor, void *arg,
70499 + znode_lock_mode mode, int through_units_p);
70500 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
70501 + znode_lock_request pri, lock_handle * lh);
70502 +
70503 +/* return node plugin of @node */
70504 +static inline node_plugin *node_plugin_by_node(const znode *
70505 + node /* node to query */ )
70506 +{
70507 + assert("vs-213", node != NULL);
70508 + assert("vs-214", znode_is_loaded(node));
70509 +
70510 + return node->nplug;
70511 +}
70512 +
70513 +/* number of items in @node */
70514 +static inline pos_in_node_t node_num_items(const znode * node)
70515 +{
70516 + assert("nikita-2754", znode_is_loaded(node));
70517 + assert("nikita-2468",
70518 + node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
70519 +
70520 + return node->nr_items;
70521 +}
70522 +
70523 +/* Return the number of items at the present node. Asserts coord->node !=
70524 + NULL. */
70525 +static inline unsigned coord_num_items(const coord_t * coord)
70526 +{
70527 + assert("jmacd-9805", coord->node != NULL);
70528 +
70529 + return node_num_items(coord->node);
70530 +}
70531 +
70532 +/* true if @node is empty */
70533 +static inline int node_is_empty(const znode * node)
70534 +{
70535 + return node_num_items(node) == 0;
70536 +}
70537 +
70538 +typedef enum {
70539 + SHIFTED_SOMETHING = 0,
70540 + SHIFT_NO_SPACE = -E_NODE_FULL,
70541 + SHIFT_IO_ERROR = -EIO,
70542 + SHIFT_OOM = -ENOMEM,
70543 +} shift_result;
70544 +
70545 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
70546 +extern int is_coord_in_node(const coord_t * coord);
70547 +extern int key_in_node(const reiser4_key *, const coord_t *);
70548 +extern void coord_item_move_to(coord_t * coord, int items);
70549 +extern void coord_unit_move_to(coord_t * coord, int units);
70550 +
70551 +/* there are two types of repetitive accesses (ra): intra-syscall
70552 +   (local) and inter-syscall (global). Local ra is used when,
70553 +   during a single syscall, we add/delete several items and units in the
70554 +   same place in the tree. Note that plan-A fragments local ra by
70555 +   separating stat-data and file body in key-space. Global ra is
70556 +   used when the user makes repetitive modifications in the same place in
70557 +   the tree.
70558 +
70559 +   Our ra implementation serves the following purposes:
70560 +   1 it affects balancing decisions so that the next operation in a row
70561 +   can be performed faster;
70562 +   2 it affects lower-level read-ahead in the page-cache;
70563 +   3 it allows avoiding unnecessary lookups by maintaining some state
70564 +   across several operations (this is only for local ra);
70565 +   4 it leaves room for lazy-micro-balancing: when we start a sequence of
70566 +   operations, they are performed without actually doing any intra-node
70567 +   shifts until we finish the sequence or its scope leaves the
70568 +   current node; only then do we really pack the node (local ra only).
70569 +*/
70570 +
70571 +/* another thing that can be useful is to keep a per-tree and/or
70572 +   per-process cache of recent lookups. This cache can be organised as a
70573 +   list of block numbers of formatted nodes, sorted by the starting key in
70574 +   each node. Balancings should invalidate the appropriate parts of this
70575 +   cache.
70576 +*/
70577 +
70578 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
70579 + coord_t * coord, lock_handle * handle,
70580 + znode_lock_mode lock, lookup_bias bias,
70581 + tree_level lock_level, tree_level stop_level,
70582 + __u32 flags, ra_info_t *);
70583 +
70584 +lookup_result reiser4_object_lookup(struct inode *object,
70585 + const reiser4_key * key,
70586 + coord_t * coord,
70587 + lock_handle * lh,
70588 + znode_lock_mode lock_mode,
70589 + lookup_bias bias,
70590 + tree_level lock_level,
70591 + tree_level stop_level,
70592 + __u32 flags, ra_info_t * info);
70593 +
70594 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
70595 + reiser4_item_data * data, coord_t * coord,
70596 + lock_handle * lh,
70597 + tree_level stop_level, __u32 flags);
70598 +insert_result insert_by_coord(coord_t * coord,
70599 + reiser4_item_data * data, const reiser4_key * key,
70600 + lock_handle * lh, __u32);
70601 +insert_result insert_extent_by_coord(coord_t * coord,
70602 + reiser4_item_data * data,
70603 + const reiser4_key * key, lock_handle * lh);
70604 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
70605 + const reiser4_key * to_key,
70606 + reiser4_key * smallest_removed);
70607 +int kill_node_content(coord_t * from, coord_t * to,
70608 + const reiser4_key * from_key, const reiser4_key * to_key,
70609 + reiser4_key * smallest_removed,
70610 + znode * locked_left_neighbor, struct inode *inode,
70611 + int truncate);
70612 +
70613 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
70614 + reiser4_key * key, lock_handle * lh, cop_insert_flag);
70615 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
70616 + reiser4_item_data * data, unsigned);
70617 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
70618 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
70619 + coord_t * result);
70620 +
70621 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
70622 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
70623 +
70624 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
70625 +
70626 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
70627 + const reiser4_key *, reiser4_key *,
70628 + struct inode *, int, int *);
70629 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
70630 + const reiser4_key *, reiser4_key *,
70631 + struct inode *, int, int *);
70632 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70633 + const reiser4_key * to, struct inode *, int);
70634 +
70635 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
70636 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
70637 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
70638 + znode * left, coord_t * result);
70639 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
70640 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
70641 + znode * child);
70642 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
70643 + int incore_p, int setup_dkeys_p);
70644 +
70645 +extern int cbk_cache_init(cbk_cache * cache);
70646 +extern void cbk_cache_done(cbk_cache * cache);
70647 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
70648 +
70649 +extern char *sprint_address(const reiser4_block_nr * block);
70650 +
70651 +#if REISER4_DEBUG
70652 +extern void print_coord_content(const char *prefix, coord_t * p);
70653 +extern void reiser4_print_address(const char *prefix,
70654 + const reiser4_block_nr * block);
70655 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
70656 + __u32 flags);
70657 +extern void check_dkeys(znode *node);
70658 +#else
70659 +#define print_coord_content(p, c) noop
70660 +#define reiser4_print_address(p, b) noop
70661 +#endif
70662 +
70663 +extern void forget_znode(lock_handle * handle);
70664 +extern int deallocate_znode(znode * node);
70665 +
70666 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
70667 +
70668 +/* struct used internally to pack all numerous arguments of tree lookup.
70669 + Used to avoid passing a lot of arguments to helper functions. */
70670 +typedef struct cbk_handle {
70671 + /* tree we are in */
70672 + reiser4_tree *tree;
70673 + /* key we are going after */
70674 + const reiser4_key *key;
70675 + /* coord we will store result in */
70676 + coord_t *coord;
70677 + /* type of lock to take on target node */
70678 + znode_lock_mode lock_mode;
70679 + /* lookup bias. See comments at the declaration of lookup_bias */
70680 + lookup_bias bias;
70681 + /* lock level: level starting from which tree traversal starts taking
70682 + * write locks. */
70683 + tree_level lock_level;
70684 + /* level where search will stop. Either item will be found between
70685 + lock_level and stop_level, or CBK_COORD_NOTFOUND will be
70686 + returned.
70687 + */
70688 + tree_level stop_level;
70689 + /* level we are currently at */
70690 + tree_level level;
70691 + /* block number of @active node. Tree traversal operates on two
70692 + nodes: active and parent. */
70693 + reiser4_block_nr block;
70694 + /* put here error message to be printed by caller */
70695 + const char *error;
70696 + /* result passed back to caller */
70697 + lookup_result result;
70698 + /* lock handles for active and parent */
70699 + lock_handle *parent_lh;
70700 + lock_handle *active_lh;
70701 + reiser4_key ld_key;
70702 + reiser4_key rd_key;
70703 + /* flags, passed to the cbk routine. Bits of this bitmask are defined
70704 + in tree.h:cbk_flags enum. */
70705 + __u32 flags;
70706 + ra_info_t *ra_info;
70707 + struct inode *object;
70708 +} cbk_handle;
70709 +
70710 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
70711 +
70712 +/* eottl.c */
70713 +extern int handle_eottl(cbk_handle *h, int *outcome);
70714 +
70715 +int lookup_multikey(cbk_handle * handle, int nr_keys);
70716 +int lookup_couple(reiser4_tree * tree,
70717 + const reiser4_key * key1, const reiser4_key * key2,
70718 + coord_t * coord1, coord_t * coord2,
70719 + lock_handle * lh1, lock_handle * lh2,
70720 + znode_lock_mode lock_mode, lookup_bias bias,
70721 + tree_level lock_level, tree_level stop_level, __u32 flags,
70722 + int *result1, int *result2);
70723 +
70724 +static inline void read_lock_tree(reiser4_tree *tree)
70725 +{
70726 + /* check that tree is not locked */
70727 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
70728 + LOCK_CNT_NIL(read_locked_tree) &&
70729 + LOCK_CNT_NIL(write_locked_tree)));
70730 + /* check that spinlocks of lower priorities are not held */
70731 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
70732 + LOCK_CNT_NIL(rw_locked_dk) &&
70733 + LOCK_CNT_NIL(spin_locked_stack)));
70734 +
70735 + read_lock(&(tree->tree_lock));
70736 +
70737 + LOCK_CNT_INC(read_locked_tree);
70738 + LOCK_CNT_INC(rw_locked_tree);
70739 + LOCK_CNT_INC(spin_locked);
70740 +}
70741 +
70742 +static inline void read_unlock_tree(reiser4_tree *tree)
70743 +{
70744 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
70745 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
70746 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70747 +
70748 + LOCK_CNT_DEC(read_locked_tree);
70749 + LOCK_CNT_DEC(rw_locked_tree);
70750 + LOCK_CNT_DEC(spin_locked);
70751 +
70752 + read_unlock(&(tree->tree_lock));
70753 +}
70754 +
70755 +static inline void write_lock_tree(reiser4_tree *tree)
70756 +{
70757 + /* check that tree is not locked */
70758 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
70759 + LOCK_CNT_NIL(read_locked_tree) &&
70760 + LOCK_CNT_NIL(write_locked_tree)));
70761 + /* check that spinlocks of lower priorities are not held */
70762 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
70763 + LOCK_CNT_NIL(rw_locked_dk) &&
70764 + LOCK_CNT_NIL(spin_locked_stack)));
70765 +
70766 + write_lock(&(tree->tree_lock));
70767 +
70768 + LOCK_CNT_INC(write_locked_tree);
70769 + LOCK_CNT_INC(rw_locked_tree);
70770 + LOCK_CNT_INC(spin_locked);
70771 +}
70772 +
70773 +static inline void write_unlock_tree(reiser4_tree *tree)
70774 +{
70775 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
70776 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
70777 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70778 +
70779 + LOCK_CNT_DEC(write_locked_tree);
70780 + LOCK_CNT_DEC(rw_locked_tree);
70781 + LOCK_CNT_DEC(spin_locked);
70782 +
70783 + write_unlock(&(tree->tree_lock));
70784 +}
70785 +
70786 +static inline void read_lock_dk(reiser4_tree *tree)
70787 +{
70788 + /* check that dk is not locked */
70789 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
70790 + LOCK_CNT_NIL(read_locked_dk) &&
70791 + LOCK_CNT_NIL(write_locked_dk)));
70792 + /* check that spinlocks of lower priorities are not held */
70793 + assert("", LOCK_CNT_NIL(spin_locked_stack));
70794 +
70795 + read_lock(&((tree)->dk_lock));
70796 +
70797 + LOCK_CNT_INC(read_locked_dk);
70798 + LOCK_CNT_INC(rw_locked_dk);
70799 + LOCK_CNT_INC(spin_locked);
70800 +}
70801 +
70802 +static inline void read_unlock_dk(reiser4_tree *tree)
70803 +{
70804 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
70805 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
70806 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70807 +
70808 + LOCK_CNT_DEC(read_locked_dk);
70809 + LOCK_CNT_DEC(rw_locked_dk);
70810 + LOCK_CNT_DEC(spin_locked);
70811 +
70812 + read_unlock(&(tree->dk_lock));
70813 +}
70814 +
70815 +static inline void write_lock_dk(reiser4_tree *tree)
70816 +{
70817 + /* check that dk is not locked */
70818 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
70819 + LOCK_CNT_NIL(read_locked_dk) &&
70820 + LOCK_CNT_NIL(write_locked_dk)));
70821 + /* check that spinlocks of lower priorities are not held */
70822 + assert("", LOCK_CNT_NIL(spin_locked_stack));
70823 +
70824 + write_lock(&((tree)->dk_lock));
70825 +
70826 + LOCK_CNT_INC(write_locked_dk);
70827 + LOCK_CNT_INC(rw_locked_dk);
70828 + LOCK_CNT_INC(spin_locked);
70829 +}
70830 +
70831 +static inline void write_unlock_dk(reiser4_tree *tree)
70832 +{
70833 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
70834 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
70835 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70836 +
70837 + LOCK_CNT_DEC(write_locked_dk);
70838 + LOCK_CNT_DEC(rw_locked_dk);
70839 + LOCK_CNT_DEC(spin_locked);
70840 +
70841 + write_unlock(&(tree->dk_lock));
70842 +}
70843 +
70844 +/* estimate api. Implementation is in estimate.c */
70845 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
70846 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
70847 +reiser4_block_nr estimate_insert_flow(tree_level);
70848 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
70849 +reiser4_block_nr calc_estimate_one_insert(tree_level);
70850 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
70851 +reiser4_block_nr estimate_insert_cluster(struct inode *);
70852 +reiser4_block_nr estimate_update_cluster(struct inode *);
70853 +
70854 +/* __REISER4_TREE_H__ */
70855 +#endif
70856 +
70857 +/* Make Linus happy.
70858 + Local variables:
70859 + c-indentation-style: "K&R"
70860 + mode-name: "LC"
70861 + c-basic-offset: 8
70862 + tab-width: 8
70863 + fill-column: 120
70864 + scroll-step: 1
70865 + End:
70866 +*/
70867 diff -urN linux-2.6.22.orig/fs/reiser4/tree_mod.c linux-2.6.22/fs/reiser4/tree_mod.c
70868 --- linux-2.6.22.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
70869 +++ linux-2.6.22/fs/reiser4/tree_mod.c 2007-07-29 00:25:35.032736855 +0400
70870 @@ -0,0 +1,386 @@
70871 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70872 + * reiser4/README */
70873 +
70874 +/*
70875 + * Functions to add/delete new nodes to/from the tree.
70876 + *
70877 + * Functions from this file are used by carry (see carry*) to handle:
70878 + *
70879 + * . insertion of new formatted node into tree
70880 + *
70881 + * . addition of new tree root, increasing tree height
70882 + *
70883 + * . removing tree root, decreasing tree height
70884 + *
70885 + */
70886 +
70887 +#include "forward.h"
70888 +#include "debug.h"
70889 +#include "dformat.h"
70890 +#include "key.h"
70891 +#include "coord.h"
70892 +#include "plugin/plugin.h"
70893 +#include "jnode.h"
70894 +#include "znode.h"
70895 +#include "tree_mod.h"
70896 +#include "block_alloc.h"
70897 +#include "tree_walk.h"
70898 +#include "tree.h"
70899 +#include "super.h"
70900 +
70901 +#include <linux/err.h>
70902 +
70903 +static int add_child_ptr(znode * parent, znode * child);
70904 +/* warning only issued if error is not -E_REPEAT */
70905 +#define ewarning( error, ... ) \
70906 + if( ( error ) != -E_REPEAT ) \
70907 + warning( __VA_ARGS__ )
70908 +
70909 +/* allocate a new node at @level, immediately to the right of @brother. */
70910 +znode * reiser4_new_node(znode * brother /* existing left neighbor
70911 + * of new node */,
70912 + tree_level level /* tree level at which new node is to
70913 + * be allocated */)
70914 +{
70915 + znode *result;
70916 + int retcode;
70917 + reiser4_block_nr blocknr;
70918 +
70919 + assert("nikita-930", brother != NULL);
70920 + assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
70921 +
70922 + retcode = assign_fake_blocknr_formatted(&blocknr);
70923 + if (retcode == 0) {
70924 + result =
70925 + zget(znode_get_tree(brother), &blocknr, NULL, level,
70926 + reiser4_ctx_gfp_mask_get());
70927 + if (IS_ERR(result)) {
70928 + ewarning(PTR_ERR(result), "nikita-929",
70929 + "Cannot allocate znode for carry: %li",
70930 + PTR_ERR(result));
70931 + return result;
70932 + }
70933 + /* cheap test, can be executed even when debugging is off */
70934 + if (!znode_just_created(result)) {
70935 + warning("nikita-2213",
70936 + "Allocated already existing block: %llu",
70937 + (unsigned long long)blocknr);
70938 + zput(result);
70939 + return ERR_PTR(RETERR(-EIO));
70940 + }
70941 +
70942 + assert("nikita-931", result != NULL);
70943 + result->nplug = znode_get_tree(brother)->nplug;
70944 + assert("nikita-933", result->nplug != NULL);
70945 +
70946 + retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
70947 + if (retcode == 0) {
70948 + ZF_SET(result, JNODE_CREATED);
70949 + zrelse(result);
70950 + } else {
70951 + zput(result);
70952 + result = ERR_PTR(retcode);
70953 + }
70954 + } else {
70955 +		/* failure to allocate a new node during balancing.
70956 +		   This should never happen. Ever. Returning -E_REPEAT
70957 +		   is not a viable solution, because "out of disk space"
70958 +		   is not a transient error that will go away by itself.
70959 + */
70960 + ewarning(retcode, "nikita-928",
70961 + "Cannot allocate block for carry: %i", retcode);
70962 + result = ERR_PTR(retcode);
70963 + }
70964 + assert("nikita-1071", result != NULL);
70965 + return result;
70966 +}
70967 +
70968 +/* allocate new root and add it to the tree
70969 +
70970 + This helper function is called by add_new_root().
70971 +
70972 +*/
70973 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
70974 + znode * fake /* "fake" znode */ )
70975 +{
70976 + reiser4_tree *tree = znode_get_tree(old_root);
70977 + znode *new_root = NULL; /* to shut gcc up */
70978 + int result;
70979 +
70980 + assert("nikita-1069", old_root != NULL);
70981 + assert("umka-262", fake != NULL);
70982 + assert("umka-263", tree != NULL);
70983 +
70984 + /* "fake" znode---one always hanging just above current root. This
70985 + node is locked when new root is created or existing root is
70986 + deleted. Downward tree traversal takes lock on it before taking
70987 + lock on a root node. This avoids race conditions with root
70988 + manipulations.
70989 +
70990 + */
70991 + assert("nikita-1348", znode_above_root(fake));
70992 + assert("nikita-1211", znode_is_root(old_root));
70993 +
70994 + result = 0;
70995 + if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
70996 + warning("nikita-1344", "Tree is too tall: %i", tree->height);
70997 + /* ext2 returns -ENOSPC when it runs out of free inodes with a
70998 + following comment (fs/ext2/ialloc.c:441): Is it really
70999 + ENOSPC?
71000 +
71001 + -EXFULL? -EINVAL?
71002 + */
71003 + result = RETERR(-ENOSPC);
71004 + } else {
71005 +		/* Allocate a block for the new root. It's not that
71006 +		   important where it will be allocated, as the root is
71007 +		   almost always in memory. Moreover, allocate-on-flush
71008 +		   may be going on here.
71009 + */
71010 + assert("nikita-1448", znode_is_root(old_root));
71011 + new_root = reiser4_new_node(fake, tree->height + 1);
71012 + if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71013 + lock_handle rlh;
71014 +
71015 + init_lh(&rlh);
71016 + result =
71017 + longterm_lock_znode(&rlh, new_root,
71018 + ZNODE_WRITE_LOCK,
71019 + ZNODE_LOCK_LOPRI);
71020 + if (result == 0) {
71021 + parent_coord_t *in_parent;
71022 +
71023 + znode_make_dirty(fake);
71024 +
71025 + /* new root is a child of "fake" node */
71026 + write_lock_tree(tree);
71027 +
71028 + ++tree->height;
71029 +
71030 + /* recalculate max balance overhead */
71031 + tree->estimate_one_insert =
71032 + estimate_one_insert_item(tree);
71033 +
71034 + tree->root_block = *znode_get_block(new_root);
71035 + in_parent = &new_root->in_parent;
71036 + init_parent_coord(in_parent, fake);
71037 +				/* manually insert the new root into the sibling
71038 +				 * list. With this, all nodes involved in
71039 +				 * balancing are connected after balancing is
71040 +				 * done---a useful invariant to check. */
71041 + sibling_list_insert_nolock(new_root, NULL);
71042 + write_unlock_tree(tree);
71043 +
71044 + /* insert into new root pointer to the
71045 + @old_root. */
71046 + assert("nikita-1110",
71047 + WITH_DATA(new_root,
71048 + node_is_empty(new_root)));
71049 + write_lock_dk(tree);
71050 + znode_set_ld_key(new_root, reiser4_min_key());
71051 + znode_set_rd_key(new_root, reiser4_max_key());
71052 + write_unlock_dk(tree);
71053 + if (REISER4_DEBUG) {
71054 + ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71055 + ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71056 + ZF_SET(old_root, JNODE_ORPHAN);
71057 + }
71058 + result = add_child_ptr(new_root, old_root);
71059 + done_lh(&rlh);
71060 + }
71061 + zrelse(new_root);
71062 + }
71063 + }
71064 + if (result != 0)
71065 + new_root = ERR_PTR(result);
71066 + return new_root;
71067 +}
71068 +
71069 +/* build &reiser4_item_data for inserting child pointer
71070 +
71071 + Build &reiser4_item_data that can be later used to insert pointer to @child
71072 + in its parent.
71073 +
71074 +*/
71075 +void build_child_ptr_data(znode * child /* node pointer to which will be
71076 + * inserted */ ,
71077 + reiser4_item_data * data /* where to store result */ )
71078 +{
71079 + assert("nikita-1116", child != NULL);
71080 + assert("nikita-1117", data != NULL);
71081 +
71082 + /*
71083 +	 * NOTE: use the address of the child's blocknr as the address of the
71084 +	 * data to be inserted. As a result, the data gets into the on-disk
71085 +	 * structure in cpu byte order; the internal item's create_hook converts
71086 +	 * it to little-endian byte order.
71087 + */
71088 + data->data = (char *)znode_get_block(child);
71089 + /* data -> data is kernel space */
71090 + data->user = 0;
71091 + data->length = sizeof(reiser4_block_nr);
71092 + /* FIXME-VS: hardcoded internal item? */
71093 +
71094 + /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71095 + data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71096 +}
71097 +
71098 +/* add pointer to @child into empty @parent.
71099 +
71100 + This is used when pointer to old root is inserted into new root which is
71101 + empty.
71102 +*/
71103 +static int add_child_ptr(znode * parent, znode * child)
71104 +{
71105 + coord_t coord;
71106 + reiser4_item_data data;
71107 + int result;
71108 + reiser4_key key;
71109 +
71110 + assert("nikita-1111", parent != NULL);
71111 + assert("nikita-1112", child != NULL);
71112 + assert("nikita-1115",
71113 + znode_get_level(parent) == znode_get_level(child) + 1);
71114 +
71115 + result = zload(parent);
71116 + if (result != 0)
71117 + return result;
71118 + assert("nikita-1113", node_is_empty(parent));
71119 + coord_init_first_unit(&coord, parent);
71120 +
71121 + build_child_ptr_data(child, &data);
71122 + data.arg = NULL;
71123 +
71124 + read_lock_dk(znode_get_tree(parent));
71125 + key = *znode_get_ld_key(child);
71126 + read_unlock_dk(znode_get_tree(parent));
71127 +
71128 + result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71129 + NULL);
71130 + znode_make_dirty(parent);
71131 + zrelse(parent);
71132 + return result;
71133 +}
71134 +
71135 +/* actually remove tree root */
71136 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
71137 + * being removed */,
71138 + znode * old_root /* root node that is being
71139 + * removed */ ,
71140 + znode * new_root /* new root---sole child of
71141 + * @old_root */,
71142 + const reiser4_block_nr * new_root_blk /* disk address of
71143 + * @new_root */)
71144 +{
71145 + znode *uber;
71146 + int result;
71147 + lock_handle handle_for_uber;
71148 +
71149 + assert("umka-265", tree != NULL);
71150 + assert("nikita-1198", new_root != NULL);
71151 + assert("nikita-1199",
71152 + znode_get_level(new_root) + 1 == znode_get_level(old_root));
71153 +
71154 + assert("nikita-1201", znode_is_write_locked(old_root));
71155 +
71156 + assert("nikita-1203",
71157 + disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71158 +
71159 + init_lh(&handle_for_uber);
71160 + /* obtain and lock "fake" znode protecting changes in tree height. */
71161 + result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71162 + &handle_for_uber);
71163 + if (result == 0) {
71164 + uber = handle_for_uber.node;
71165 +
71166 + znode_make_dirty(uber);
71167 +
71168 +		/* don't take a long-term lock on @new_root; take the spin lock. */
71169 +
71170 + write_lock_tree(tree);
71171 +
71172 + tree->root_block = *new_root_blk;
71173 + --tree->height;
71174 +
71175 + /* recalculate max balance overhead */
71176 + tree->estimate_one_insert = estimate_one_insert_item(tree);
71177 +
71178 + assert("nikita-1202",
71179 + tree->height == znode_get_level(new_root));
71180 +
71181 +		/* new root is a child of the "fake" node */
71182 + init_parent_coord(&new_root->in_parent, uber);
71183 + ++uber->c_count;
71184 +
71185 + /* sibling_list_insert_nolock(new_root, NULL); */
71186 + write_unlock_tree(tree);
71187 +
71188 + /* reinitialise old root. */
71189 + result = node_plugin_by_node(old_root)->init(old_root);
71190 + znode_make_dirty(old_root);
71191 + if (result == 0) {
71192 + assert("nikita-1279", node_is_empty(old_root));
71193 + ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71194 + old_root->c_count = 0;
71195 + }
71196 + }
71197 + done_lh(&handle_for_uber);
71198 +
71199 + return result;
71200 +}
71201 +
71202 +/* remove tree root
71203 +
71204 + This function removes tree root, decreasing tree height by one. Tree root
71205 + and its only child (that is going to become new tree root) are write locked
71206 + at the entry.
71207 +
71208 + To remove tree root we need to take lock on special "fake" znode that
71209 + protects changes of tree height. See comments in reiser4_add_tree_root() for
71210 + more on this.
71211 +
71212 +   Also, parent pointers have to be updated in the
71213 +   old and new root. To simplify the code, the function is split into two parts: the outer
71214 + reiser4_kill_tree_root() collects all necessary arguments and calls
71215 + reiser4_kill_root() to do the actual job.
71216 +
71217 +*/
71218 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
71219 + removing*/)
71220 +{
71221 + int result;
71222 + coord_t down_link;
71223 + znode *new_root;
71224 + reiser4_tree *tree;
71225 +
71226 + assert("umka-266", current_tree != NULL);
71227 + assert("nikita-1194", old_root != NULL);
71228 + assert("nikita-1196", znode_is_root(old_root));
71229 + assert("nikita-1200", node_num_items(old_root) == 1);
71230 + assert("nikita-1401", znode_is_write_locked(old_root));
71231 +
71232 + coord_init_first_unit(&down_link, old_root);
71233 +
71234 + tree = znode_get_tree(old_root);
71235 + new_root = child_znode(&down_link, old_root, 0, 1);
71236 + if (!IS_ERR(new_root)) {
71237 + result =
71238 + reiser4_kill_root(tree, old_root, new_root,
71239 + znode_get_block(new_root));
71240 + zput(new_root);
71241 + } else
71242 + result = PTR_ERR(new_root);
71243 +
71244 + return result;
71245 +}
71246 +
71247 +/* Make Linus happy.
71248 + Local variables:
71249 + c-indentation-style: "K&R"
71250 + mode-name: "LC"
71251 + c-basic-offset: 8
71252 + tab-width: 8
71253 + fill-column: 120
71254 + scroll-step: 1
71255 + End:
71256 +*/
71257 diff -urN linux-2.6.22.orig/fs/reiser4/tree_mod.h linux-2.6.22/fs/reiser4/tree_mod.h
71258 --- linux-2.6.22.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
71259 +++ linux-2.6.22/fs/reiser4/tree_mod.h 2007-07-29 00:25:35.032736855 +0400
71260 @@ -0,0 +1,29 @@
71261 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71262 + * reiser4/README */
71263 +
71264 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71265 + * comments. */
71266 +
71267 +#if !defined( __REISER4_TREE_MOD_H__ )
71268 +#define __REISER4_TREE_MOD_H__
71269 +
71270 +#include "forward.h"
71271 +
71272 +znode *reiser4_new_node(znode * brother, tree_level level);
71273 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
71274 +int reiser4_kill_tree_root(znode * old_root);
71275 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
71276 +
71277 +/* __REISER4_TREE_MOD_H__ */
71278 +#endif
71279 +
71280 +/* Make Linus happy.
71281 + Local variables:
71282 + c-indentation-style: "K&R"
71283 + mode-name: "LC"
71284 + c-basic-offset: 8
71285 + tab-width: 8
71286 + fill-column: 120
71287 + scroll-step: 1
71288 + End:
71289 +*/
71290 diff -urN linux-2.6.22.orig/fs/reiser4/tree_walk.c linux-2.6.22/fs/reiser4/tree_walk.c
71291 --- linux-2.6.22.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
71292 +++ linux-2.6.22/fs/reiser4/tree_walk.c 2007-07-29 00:25:35.032736855 +0400
71293 @@ -0,0 +1,927 @@
71294 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71295 + * reiser4/README */
71296 +
71297 +/* Routines and macros to:
71298 +
71299 + get_left_neighbor()
71300 +
71301 + get_right_neighbor()
71302 +
71303 + get_parent()
71304 +
71305 + get_first_child()
71306 +
71307 + get_last_child()
71308 +
71309 + various routines to walk the whole tree and do things to it like
71310 + repack it, or move it to tertiary storage. Please make them as
71311 + generic as is reasonable.
71312 +
71313 +*/
71314 +
71315 +#include "forward.h"
71316 +#include "debug.h"
71317 +#include "dformat.h"
71318 +#include "coord.h"
71319 +#include "plugin/item/item.h"
71320 +#include "jnode.h"
71321 +#include "znode.h"
71322 +#include "tree_walk.h"
71323 +#include "tree.h"
71324 +#include "super.h"
71325 +
71326 +/* These macros are used internally in tree_walk.c in an attempt to make the
71327 +   lock_neighbor() code reusable for building lock_parent(), lock_right_neighbor
71328 +   and lock_left_neighbor */
71329 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71330 +#define FIELD_OFFSET(name) offsetof(znode, name)
71331 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71332 +#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
71333 +#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
71334 +
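A minimal sketch of what these macros compute, assuming only the znode fields
named above: since LEFT_PTR_OFFSET is offsetof(znode, left),

	GET_NODE_BY_PTR_OFFSET(node, LEFT_PTR_OFFSET)   /* == node->left           */
	GET_NODE_BY_PTR_OFFSET(node, RIGHT_PTR_OFFSET)  /* == node->right          */
	GET_NODE_BY_PTR_OFFSET(node, PARENT_PTR_OFFSET) /* == node->in_parent.node */

so lock_neighbor() below can follow any of the three links through a single
code path, selected by its ptr_offset argument.
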
71335 +/* This is the generic procedure to get and lock a `generic' neighbor (left or
71336 +    right neighbor or parent). It implements the common algorithm for all cases
71337 +    of getting a lock on a neighbor node; only the znode structure field differs
71338 +    in each case. This is parameterized by the ptr_offset argument, which is the
71339 +    byte offset of the pointer to the desired neighbor within the current node's
71340 +    znode structure. This function should be called with the tree lock held. */
71341 +static int lock_neighbor(
71342 + /* resulting lock handle */
71343 + lock_handle * result,
71344 + /* znode to lock */
71345 + znode * node,
71346 + /* pointer to neighbor (or parent) znode field offset, in bytes from
71347 + the base address of znode structure */
71348 + int ptr_offset,
71349 + /* lock mode for longterm_lock_znode call */
71350 + znode_lock_mode mode,
71351 + /* lock request for longterm_lock_znode call */
71352 + znode_lock_request req,
71353 + /* GN_* flags */
71354 + int flags, int rlocked)
71355 +{
71356 + reiser4_tree *tree = znode_get_tree(node);
71357 + znode *neighbor;
71358 + int ret;
71359 +
71360 + assert("umka-236", node != NULL);
71361 + assert("umka-237", tree != NULL);
71362 + assert_rw_locked(&(tree->tree_lock));
71363 +
71364 + if (flags & GN_TRY_LOCK)
71365 + req |= ZNODE_LOCK_NONBLOCK;
71366 + if (flags & GN_SAME_ATOM)
71367 + req |= ZNODE_LOCK_DONT_FUSE;
71368 +
71369 +	/* get the neighbor's address using the sibling link; quit the while
71370 +	   loop (and return) if the link is not available. */
71371 + while (1) {
71372 + neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71373 +
71374 +		/* return -E_NO_NEIGHBOR if the parent or side pointer is NULL, or
71375 +		 * if the node pointed to by it is not connected.
71376 +		 *
71377 +		 * However, the GN_ALLOW_NOT_CONNECTED option masks the "connected"
71378 +		 * check and allows passing a reference to a not-yet-connected znode
71379 +		 * to the subsequent longterm_lock_znode() call. This kills a possible
71380 +		 * busy loop if we are trying to get a long-term lock on a locked but
71381 +		 * not yet connected parent node. */
71382 + if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71383 + || znode_is_connected(neighbor))) {
71384 + return RETERR(-E_NO_NEIGHBOR);
71385 + }
71386 +
71387 + /* protect it from deletion. */
71388 + zref(neighbor);
71389 +
71390 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71391 +
71392 + ret = longterm_lock_znode(result, neighbor, mode, req);
71393 +
71394 + /* The lock handle obtains its own reference, release the one from above. */
71395 + zput(neighbor);
71396 +
71397 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71398 +
71399 +		/* restart if the node we got a reference to is being
71400 +		   invalidated. We should not take a reference to this node
71401 +		   again. */
71402 + if (ret == -EINVAL)
71403 + continue;
71404 + if (ret)
71405 + return ret;
71406 +
71407 +		/* check that the neighbor link still points to the just-locked
71408 +		   znode; the link could have changed while the process slept. */
71409 + if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
71410 + return 0;
71411 +
71412 +		/* znode was locked by mistake; unlock it and restart the
71413 +		   locking process from the beginning. */
71414 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71415 + longterm_unlock_znode(result);
71416 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71417 + }
71418 +}
71419 +
71420 +/* get the parent node with a long-term lock; accepts GN_* flags. */
71421 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
71422 + znode * node /* child node */ ,
71423 + znode_lock_mode mode
71424 + /* type of lock: read or write */ ,
71425 + int flags /* GN_* flags */ )
71426 +{
71427 + int result;
71428 +
71429 + read_lock_tree(znode_get_tree(node));
71430 + result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
71431 + ZNODE_LOCK_HIPRI, flags, 1);
71432 + read_unlock_tree(znode_get_tree(node));
71433 + return result;
71434 +}
71435 +
71436 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
71437 + bit in @flags parameter */
71438 +/* Audited by: umka (2002.06.14) */
71439 +static inline int
71440 +lock_side_neighbor(lock_handle * result,
71441 + znode * node, znode_lock_mode mode, int flags, int rlocked)
71442 +{
71443 + int ret;
71444 + int ptr_offset;
71445 + znode_lock_request req;
71446 +
71447 + if (flags & GN_GO_LEFT) {
71448 + ptr_offset = LEFT_PTR_OFFSET;
71449 + req = ZNODE_LOCK_LOPRI;
71450 + } else {
71451 + ptr_offset = RIGHT_PTR_OFFSET;
71452 + req = ZNODE_LOCK_HIPRI;
71453 + }
71454 +
71455 + ret =
71456 + lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
71457 +
71458 +	if (ret == -E_NO_NEIGHBOR)	/* when walking left or right, -E_NO_NEIGHBOR
71459 +					 * does not guarantee that the neighbor is
71460 +					 * absent from the tree; in this case we
71461 +					 * return -ENOENT, meaning the neighbor was at
71462 +					 * least not found in the cache */
71463 + return RETERR(-ENOENT);
71464 +
71465 + return ret;
71466 +}
71467 +
71468 +#if REISER4_DEBUG
71469 +
71470 +int check_sibling_list(znode * node)
71471 +{
71472 + znode *scan;
71473 + znode *next;
71474 +
71475 + assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
71476 +
71477 + if (node == NULL)
71478 + return 1;
71479 +
71480 + if (ZF_ISSET(node, JNODE_RIP))
71481 + return 1;
71482 +
71483 + assert("nikita-3270", node != NULL);
71484 + assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
71485 +
71486 + for (scan = node; znode_is_left_connected(scan); scan = next) {
71487 + next = scan->left;
71488 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71489 + assert("nikita-3271", znode_is_right_connected(next));
71490 + assert("nikita-3272", next->right == scan);
71491 + } else
71492 + break;
71493 + }
71494 + for (scan = node; znode_is_right_connected(scan); scan = next) {
71495 + next = scan->right;
71496 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71497 + assert("nikita-3273", znode_is_left_connected(next));
71498 + assert("nikita-3274", next->left == scan);
71499 + } else
71500 + break;
71501 + }
71502 + return 1;
71503 +}
71504 +
71505 +#endif
71506 +
71507 +/* Znode sibling pointers maintenance. */
71508 +
71509 +/* Znode sibling pointers are established between any neighboring nodes which
71510 +   are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
71511 +   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
71512 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
71513 +
71514 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
71515 +   take care of searching for znode neighbors (a hash table lookup may be
71516 +   required), establishing sibling pointers between them and setting the
71517 +   JNODE_*_CONNECTED state bits. */
71518 +
71519 +/* adjusts sibling pointers and `connected' states for two
71520 +   neighbors; works even if one neighbor is NULL (was not found). */
71521 +
71522 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
71523 +void link_left_and_right(znode * left, znode * right)
71524 +{
71525 + assert("nikita-3275", check_sibling_list(left));
71526 + assert("nikita-3275", check_sibling_list(right));
71527 +
71528 + if (left != NULL) {
71529 + if (left->right == NULL) {
71530 + left->right = right;
71531 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
71532 +
71533 + ON_DEBUG(left->right_version =
71534 + atomic_inc_return(&delim_key_version);
71535 + );
71536 +
71537 + } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
71538 + && left->right != right) {
71539 +
71540 + ON_DEBUG(left->right->left_version =
71541 + atomic_inc_return(&delim_key_version);
71542 + left->right_version =
71543 + atomic_inc_return(&delim_key_version););
71544 +
71545 + left->right->left = NULL;
71546 + left->right = right;
71547 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
71548 + } else
71549 + /*
71550 +			 * there is a race condition in renew_sibling_link(),
71551 +			 * and the assertions below check that it is the only
71552 +			 * one there. Thread T1 calls renew_sibling_link()
71553 +			 * without the GN_NO_ALLOC flag. zlook() doesn't find the
71554 +			 * neighbor node, but before T1 gets to
71555 +			 * link_left_and_right(), another thread T2 creates the
71556 +			 * neighbor node and connects it. The check for
71557 +			 * left->right == NULL above protects T1 from
71558 +			 * overwriting the correct left->right pointer installed
71559 +			 * by T2.
71560 + */
71561 + assert("nikita-3302",
71562 + right == NULL || left->right == right);
71563 + }
71564 + if (right != NULL) {
71565 + if (right->left == NULL) {
71566 + right->left = left;
71567 + ZF_SET(right, JNODE_LEFT_CONNECTED);
71568 +
71569 + ON_DEBUG(right->left_version =
71570 + atomic_inc_return(&delim_key_version);
71571 + );
71572 +
71573 + } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
71574 + && right->left != left) {
71575 +
71576 + ON_DEBUG(right->left->right_version =
71577 + atomic_inc_return(&delim_key_version);
71578 + right->left_version =
71579 + atomic_inc_return(&delim_key_version););
71580 +
71581 + right->left->right = NULL;
71582 + right->left = left;
71583 + ZF_SET(right, JNODE_LEFT_CONNECTED);
71584 +
71585 + } else
71586 + assert("nikita-3303",
71587 + left == NULL || right->left == left);
71588 + }
71589 + assert("nikita-3275", check_sibling_list(left));
71590 + assert("nikita-3275", check_sibling_list(right));
71591 +}
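
In other words, after link_left_and_right(left, right) returns for two live
(non-HEARD_BANSHEE) znodes that really are neighbors, the invariant that
check_sibling_list() verifies holds between them; as a sketch:

	/* both state bits are set and the pointers agree */
	ZF_ISSET(left, JNODE_RIGHT_CONNECTED) && left->right == right;
	ZF_ISSET(right, JNODE_LEFT_CONNECTED) && right->left == left;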
71592 +
71593 +/* Audited by: umka (2002.06.14) */
71594 +static void link_znodes(znode * first, znode * second, int to_left)
71595 +{
71596 + if (to_left)
71597 + link_left_and_right(second, first);
71598 + else
71599 + link_left_and_right(first, second);
71600 +}
71601 +
71602 +/* gets the next (to the left or to the right, depending on the GN_GO_LEFT bit
71603 +   in @flags) unit position of a coord in the horizontal direction, even across
71604 +   a node boundary. Should be called under the tree lock; it protects the
71605 +   nonexistence of a sibling link on the parent level if lock_side_neighbor()
71606 +   fails with -ENOENT. */
71607 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
71608 +{
71609 + int ret;
71610 + znode *node;
71611 + reiser4_tree *tree;
71612 +
71613 + assert("umka-243", coord != NULL);
71614 + assert("umka-244", handle != NULL);
71615 + assert("zam-1069", handle->node == NULL);
71616 +
71617 + ret =
71618 + (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
71619 + coord_next_unit(coord);
71620 + if (!ret)
71621 + return 0;
71622 +
71623 + ret =
71624 + lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
71625 + if (ret)
71626 + return ret;
71627 +
71628 + node = handle->node;
71629 + tree = znode_get_tree(node);
71630 + write_unlock_tree(tree);
71631 +
71632 + coord_init_zero(coord);
71633 +
71634 +	/* We avoid a synchronous read here if the GN_ASYNC flag is set. */
71635 + if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
71636 + ret = jstartio(ZJNODE(handle->node));
71637 + if (!ret)
71638 + ret = -E_REPEAT;
71639 + goto error_locked;
71640 + }
71641 +
71642 +	/* the corresponding zrelse() should be called by the clients of
71643 +	   far_next_coord(), at the place where this node gets unlocked. */
71644 + ret = zload(handle->node);
71645 + if (ret)
71646 + goto error_locked;
71647 +
71648 + if (flags & GN_GO_LEFT)
71649 + coord_init_last_unit(coord, node);
71650 + else
71651 + coord_init_first_unit(coord, node);
71652 +
71653 + if (0) {
71654 + error_locked:
71655 + longterm_unlock_znode(handle);
71656 + }
71657 + write_lock_tree(tree);
71658 + return ret;
71659 +}
71660 +
71661 +/* Very significant function which performs a step in the horizontal direction
71662 +   when a sibling pointer is not available. Actually, it is the only function
71663 +   which does this.
71664 +   Note: this function does not restore the locking status at exit; the
71665 +   caller should take care of proper unlocking and zrelse-ing. */
71666 +static int
71667 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
71668 + tree_level level, int flags, int *nr_locked)
71669 +{
71670 + int ret;
71671 + int to_left = flags & GN_GO_LEFT;
71672 + reiser4_block_nr da;
71673 +	/* parent of the neighbor node; we set it to the child's parent until
71674 +	   we detect that child and neighbor do not share one parent */
71675 + znode *side_parent = coord->node;
71676 + reiser4_tree *tree = znode_get_tree(child);
71677 + znode *neighbor = NULL;
71678 +
71679 + assert("umka-245", coord != NULL);
71680 + assert("umka-246", handle != NULL);
71681 + assert("umka-247", child != NULL);
71682 + assert("umka-303", tree != NULL);
71683 +
71684 + init_lh(handle);
71685 + write_lock_tree(tree);
71686 + ret = far_next_coord(coord, handle, flags);
71687 +
71688 + if (ret) {
71689 + if (ret != -ENOENT) {
71690 + write_unlock_tree(tree);
71691 + return ret;
71692 + }
71693 + } else {
71694 + item_plugin *iplug;
71695 +
71696 + if (handle->node != NULL) {
71697 + (*nr_locked)++;
71698 + side_parent = handle->node;
71699 + }
71700 +
71701 +		/* does the coord object point to an internal item? We do not
71702 +		   support sibling pointers between znodes for formatted and
71703 +		   unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
71704 + iplug = item_plugin_by_coord(coord);
71705 + if (!item_is_internal(coord)) {
71706 + link_znodes(child, NULL, to_left);
71707 + write_unlock_tree(tree);
71708 + /* we know there can't be formatted neighbor */
71709 + return RETERR(-E_NO_NEIGHBOR);
71710 + }
71711 + write_unlock_tree(tree);
71712 +
71713 + iplug->s.internal.down_link(coord, NULL, &da);
71714 +
71715 + if (flags & GN_NO_ALLOC) {
71716 + neighbor = zlook(tree, &da);
71717 + } else {
71718 + neighbor =
71719 + zget(tree, &da, side_parent, level,
71720 + reiser4_ctx_gfp_mask_get());
71721 + }
71722 +
71723 + if (IS_ERR(neighbor)) {
71724 + ret = PTR_ERR(neighbor);
71725 + return ret;
71726 + }
71727 +
71728 + if (neighbor)
71729 + /* update delimiting keys */
71730 + set_child_delimiting_keys(coord->node, coord, neighbor);
71731 +
71732 + write_lock_tree(tree);
71733 + }
71734 +
71735 + if (likely(neighbor == NULL ||
71736 + (znode_get_level(child) == znode_get_level(neighbor)
71737 + && child != neighbor)))
71738 + link_znodes(child, neighbor, to_left);
71739 + else {
71740 + warning("nikita-3532",
71741 +			"Sibling nodes on different levels: %i != %i\n",
71742 + znode_get_level(child), znode_get_level(neighbor));
71743 + ret = RETERR(-EIO);
71744 + }
71745 +
71746 + write_unlock_tree(tree);
71747 +
71748 + /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
71749 + if (neighbor != NULL && (flags & GN_NO_ALLOC))
71750 + /* atomic_dec(&ZJNODE(neighbor)->x_count); */
71751 + zput(neighbor);
71752 +
71753 + return ret;
71754 +}
71755 +
71756 +/* This function establishes a one-side relation. */
71757 +/* Audited by: umka (2002.06.14) */
71758 +static int connect_one_side(coord_t * coord, znode * node, int flags)
71759 +{
71760 + coord_t local;
71761 + lock_handle handle;
71762 + int nr_locked;
71763 + int ret;
71764 +
71765 + assert("umka-248", coord != NULL);
71766 + assert("umka-249", node != NULL);
71767 +
71768 + coord_dup_nocheck(&local, coord);
71769 +
71770 + init_lh(&handle);
71771 +
71772 + ret =
71773 + renew_sibling_link(&local, &handle, node, znode_get_level(node),
71774 + flags | GN_NO_ALLOC, &nr_locked);
71775 +
71776 + if (handle.node != NULL) {
71777 + /* complementary operations for zload() and lock() in far_next_coord() */
71778 + zrelse(handle.node);
71779 + longterm_unlock_znode(&handle);
71780 + }
71781 +
71782 +	/* we swallow error codes which are of no interest to us, because we
71783 +	   run renew_sibling_link() only for znode connection. */
71784 + if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
71785 + return 0;
71786 +
71787 + return ret;
71788 +}
71789 +
71790 +/* if @child is not in `connected' state, performs hash searches for left and
71791 + right neighbor nodes and establishes horizontal sibling links */
71792 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71793 +int connect_znode(coord_t * parent_coord, znode * child)
71794 +{
71795 + reiser4_tree *tree = znode_get_tree(child);
71796 + int ret = 0;
71797 +
71798 + assert("zam-330", parent_coord != NULL);
71799 + assert("zam-331", child != NULL);
71800 + assert("zam-332", parent_coord->node != NULL);
71801 + assert("umka-305", tree != NULL);
71802 +
71803 + /* it is trivial to `connect' root znode because it can't have
71804 + neighbors */
71805 + if (znode_above_root(parent_coord->node)) {
71806 + child->left = NULL;
71807 + child->right = NULL;
71808 + ZF_SET(child, JNODE_LEFT_CONNECTED);
71809 + ZF_SET(child, JNODE_RIGHT_CONNECTED);
71810 +
71811 + ON_DEBUG(child->left_version =
71812 + atomic_inc_return(&delim_key_version);
71813 + child->right_version =
71814 + atomic_inc_return(&delim_key_version););
71815 +
71816 + return 0;
71817 + }
71818 +
71819 + /* load parent node */
71820 + coord_clear_iplug(parent_coord);
71821 + ret = zload(parent_coord->node);
71822 +
71823 + if (ret != 0)
71824 + return ret;
71825 +
71826 + /* protect `connected' state check by tree_lock */
71827 + read_lock_tree(tree);
71828 +
71829 + if (!znode_is_right_connected(child)) {
71830 + read_unlock_tree(tree);
71831 + /* connect right (default is right) */
71832 + ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
71833 + if (ret)
71834 + goto zrelse_and_ret;
71835 +
71836 + read_lock_tree(tree);
71837 + }
71838 +
71839 + ret = znode_is_left_connected(child);
71840 +
71841 + read_unlock_tree(tree);
71842 +
71843 + if (!ret) {
71844 + ret =
71845 + connect_one_side(parent_coord, child,
71846 + GN_NO_ALLOC | GN_GO_LEFT);
71847 + } else
71848 + ret = 0;
71849 +
71850 + zrelse_and_ret:
71851 + zrelse(parent_coord->node);
71852 +
71853 + return ret;
71854 +}
71855 +
71856 +/* this function is like renew_sibling_link() but allocates the neighbor node
71857 +   if it doesn't exist and `connects' it. It may require making two steps in
71858 +   the horizontal direction: the first one finds/allocates the neighbor node,
71859 +   the second one finds the neighbor of that neighbor in order to connect the
71860 +   freshly allocated znode. */
71861 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71862 +static int
71863 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
71864 +{
71865 + coord_t local;
71866 + lock_handle empty[2];
71867 + reiser4_tree *tree = znode_get_tree(node);
71868 + znode *neighbor = NULL;
71869 + int nr_locked = 0;
71870 + int ret;
71871 +
71872 + assert("umka-250", coord != NULL);
71873 + assert("umka-251", node != NULL);
71874 + assert("umka-307", tree != NULL);
71875 + assert("umka-308", level <= tree->height);
71876 +
71877 + /* umka (2002.06.14)
71878 +	   There should probably be a check here for the validity of the given "level".
71879 + Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
71880 + */
71881 +
71882 + coord_dup(&local, coord);
71883 +
71884 + ret =
71885 + renew_sibling_link(&local, &empty[0], node, level,
71886 + flags & ~GN_NO_ALLOC, &nr_locked);
71887 + if (ret)
71888 + goto out;
71889 +
71890 + /* tree lock is not needed here because we keep parent node(s) locked
71891 + and reference to neighbor znode incremented */
71892 + neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
71893 +
71894 + read_lock_tree(tree);
71895 + ret = znode_is_connected(neighbor);
71896 + read_unlock_tree(tree);
71897 + if (ret) {
71898 + ret = 0;
71899 + goto out;
71900 + }
71901 +
71902 + ret =
71903 + renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
71904 + flags | GN_NO_ALLOC, &nr_locked);
71905 + /* second renew_sibling_link() call is used for znode connection only,
71906 + so we can live with these errors */
71907 + if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
71908 + ret = 0;
71909 +
71910 + out:
71911 +
71912 + for (--nr_locked; nr_locked >= 0; --nr_locked) {
71913 + zrelse(empty[nr_locked].node);
71914 + longterm_unlock_znode(&empty[nr_locked]);
71915 + }
71916 +
71917 + if (neighbor != NULL)
71918 + /* decrement znode reference counter without actually
71919 + releasing it. */
71920 + atomic_dec(&ZJNODE(neighbor)->x_count);
71921 +
71922 + return ret;
71923 +}
71924 +
71925 +/*
71926 + reiser4_get_neighbor() -- lock node's neighbor.
71927 +
71928 +  reiser4_get_neighbor() locks the node's neighbor (left or right one,
71929 +  depending on the given parameter) using the sibling link to it. If the
71930 +  sibling link is not available (i.e. the neighbor znode is not in cache) and
71931 +  the flags allow reading blocks, we go one level up for information about the
71932 +  neighbor's disk address. We lock the node's parent; if it is the common
71933 +  parent of both 'node' and its neighbor, the neighbor's disk address is in
71934 +  the next (to the left or to the right) down link from the link that points
71935 +  to the original node. If not, we lock the parent's neighbor, read its
71936 +  content and take the first (last) downlink with the neighbor's disk address.
71937 +  That locking could be done using the sibling link and lock_neighbor(), if
71938 +  the sibling link exists. Otherwise we go a level up again until we find a
71939 +  common parent or a valid sibling link, then go down
71940 +  allocating/connecting/locking/reading nodes until the neighbor of the original node is locked.
71941 +
71942 + @neighbor: result lock handle,
71943 + @node: a node which we lock neighbor of,
71944 +  @lock_mode: lock mode {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK},
71945 +  @flags: logical OR of {GN_*} (see description above) subset.
71946 +
71947 +  @return: 0 on success; a negative value if the lock was impossible due to an
71948 +  error or the lack of a neighbor node.
71949 +*/
71950 +
71951 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71952 +int
71953 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
71954 + znode_lock_mode lock_mode, int flags)
71955 +{
71956 + reiser4_tree *tree = znode_get_tree(node);
71957 + lock_handle path[REAL_MAX_ZTREE_HEIGHT];
71958 +
71959 + coord_t coord;
71960 +
71961 + tree_level base_level;
71962 + tree_level h = 0;
71963 + int ret;
71964 +
71965 + assert("umka-252", tree != NULL);
71966 + assert("umka-253", neighbor != NULL);
71967 + assert("umka-254", node != NULL);
71968 +
71969 + base_level = znode_get_level(node);
71970 +
71971 + assert("umka-310", base_level <= tree->height);
71972 +
71973 + coord_init_zero(&coord);
71974 +
71975 + again:
71976 + /* first, we try to use simple lock_neighbor() which requires sibling
71977 + link existence */
71978 + read_lock_tree(tree);
71979 + ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
71980 + read_unlock_tree(tree);
71981 + if (!ret) {
71982 + /* load znode content if it was specified */
71983 + if (flags & GN_LOAD_NEIGHBOR) {
71984 + ret = zload(node);
71985 + if (ret)
71986 + longterm_unlock_znode(neighbor);
71987 + }
71988 + return ret;
71989 + }
71990 +
71991 + /* only -ENOENT means we may look upward and try to connect
71992 + @node with its neighbor (if @flags allow us to do it) */
71993 + if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
71994 + return ret;
71995 +
71996 +	/* before establishing the sibling link we lock the parent node;
71997 +	   renew_neighbor() requires this in order to work. */
71998 + init_lh(&path[0]);
71999 + ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72000 + if (ret)
72001 + return ret;
72002 + if (znode_above_root(path[0].node)) {
72003 + longterm_unlock_znode(&path[0]);
72004 + return RETERR(-E_NO_NEIGHBOR);
72005 + }
72006 +
72007 + while (1) {
72008 + znode *child = (h == 0) ? node : path[h - 1].node;
72009 + znode *parent = path[h].node;
72010 +
72011 + ret = zload(parent);
72012 + if (ret)
72013 + break;
72014 +
72015 + ret = find_child_ptr(parent, child, &coord);
72016 +
72017 + if (ret) {
72018 + zrelse(parent);
72019 + break;
72020 + }
72021 +
72022 + /* try to establish missing sibling link */
72023 + ret = renew_neighbor(&coord, child, h + base_level, flags);
72024 +
72025 + zrelse(parent);
72026 +
72027 + switch (ret) {
72028 + case 0:
72029 +			/* unlocking the parent znode prevents a simple
72030 +			   deadlock situation */
72031 + done_lh(&path[h]);
72032 +
72033 +			/* depending on the tree level we are at, we repeat the
72034 +			   first locking attempt ... */
72035 + if (h == 0)
72036 + goto again;
72037 +
72038 +			/* ... or retry establishing the sibling link
72039 +			   one level below. */
72040 + --h;
72041 + break;
72042 +
72043 + case -ENOENT:
72044 + /* sibling link is not available -- we go
72045 + upward. */
72046 + init_lh(&path[h + 1]);
72047 + ret =
72048 + reiser4_get_parent(&path[h + 1], parent,
72049 + ZNODE_READ_LOCK);
72050 + if (ret)
72051 + goto fail;
72052 + ++h;
72053 + if (znode_above_root(path[h].node)) {
72054 + ret = RETERR(-E_NO_NEIGHBOR);
72055 + goto fail;
72056 + }
72057 + break;
72058 +
72059 + case -E_DEADLOCK:
72060 +			/* there was a lock request from a hi-pri locker. If
72061 +			   possible, we unlock the last parent node and
72062 +			   re-lock it again. */
72063 + for (; reiser4_check_deadlock(); h--) {
72064 + done_lh(&path[h]);
72065 + if (h == 0)
72066 + goto fail;
72067 + }
72068 +
72069 + break;
72070 +
72071 + default: /* other errors. */
72072 + goto fail;
72073 + }
72074 + }
72075 + fail:
72076 + ON_DEBUG(check_lock_node_data(node));
72077 + ON_DEBUG(check_lock_data());
72078 +
72079 + /* unlock path */
72080 + do {
72081 + /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72082 + fail; path[0] is already done_lh-ed, therefore
72083 + longterm_unlock_znode(&path[h]); is not applicable */
72084 + done_lh(&path[h]);
72085 + --h;
72086 + } while (h + 1 != 0);
72087 +
72088 + return ret;
72089 +}
72090 +
72091 +/* remove node from sibling list */
72092 +/* Audited by: umka (2002.06.14) */
72093 +void sibling_list_remove(znode * node)
72094 +{
72095 + reiser4_tree *tree;
72096 +
72097 + tree = znode_get_tree(node);
72098 +	assert("umka-255", node != NULL);
72099 +	tree = znode_get_tree(node);
72100 + assert("nikita-3275", check_sibling_list(node));
72101 +
72102 + write_lock_dk(tree);
72103 + if (znode_is_right_connected(node) && node->right != NULL &&
72104 + znode_is_left_connected(node) && node->left != NULL) {
72105 + assert("zam-32245",
72106 + keyeq(znode_get_rd_key(node),
72107 + znode_get_ld_key(node->right)));
72108 + znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72109 + }
72110 + write_unlock_dk(tree);
72111 +
72112 + if (znode_is_right_connected(node) && node->right != NULL) {
72113 + assert("zam-322", znode_is_left_connected(node->right));
72114 + node->right->left = node->left;
72115 + ON_DEBUG(node->right->left_version =
72116 + atomic_inc_return(&delim_key_version);
72117 + );
72118 + }
72119 + if (znode_is_left_connected(node) && node->left != NULL) {
72120 + assert("zam-323", znode_is_right_connected(node->left));
72121 + node->left->right = node->right;
72122 + ON_DEBUG(node->left->right_version =
72123 + atomic_inc_return(&delim_key_version);
72124 + );
72125 + }
72126 +
72127 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72128 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72129 + ON_DEBUG(node->left = node->right = NULL;
72130 + node->left_version = atomic_inc_return(&delim_key_version);
72131 + node->right_version = atomic_inc_return(&delim_key_version););
72132 + assert("nikita-3276", check_sibling_list(node));
72133 +}
72134 +
72135 +/* disconnect node from sibling list */
72136 +void sibling_list_drop(znode * node)
72137 +{
72138 + znode *right;
72139 + znode *left;
72140 +
72141 + assert("nikita-2464", node != NULL);
72142 + assert("nikita-3277", check_sibling_list(node));
72143 +
72144 + right = node->right;
72145 + if (right != NULL) {
72146 + assert("nikita-2465", znode_is_left_connected(right));
72147 + right->left = NULL;
72148 + ON_DEBUG(right->left_version =
72149 + atomic_inc_return(&delim_key_version);
72150 + );
72151 + }
72152 + left = node->left;
72153 + if (left != NULL) {
72154 + assert("zam-323", znode_is_right_connected(left));
72155 + left->right = NULL;
72156 + ON_DEBUG(left->right_version =
72157 + atomic_inc_return(&delim_key_version);
72158 + );
72159 + }
72160 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72161 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72162 + ON_DEBUG(node->left = node->right = NULL;
72163 + node->left_version = atomic_inc_return(&delim_key_version);
72164 + node->right_version = atomic_inc_return(&delim_key_version););
72165 +}
72166 +
72167 +/* Insert a new node into the sibling list. Regular balancing inserts the new
72168 +   node after (at the right side of) an existing, locked node (@before), except
72169 +   in one case: adding a new tree root node. @before should be NULL in that case. */
72170 +void sibling_list_insert_nolock(znode * new, znode * before)
72171 +{
72172 + assert("zam-334", new != NULL);
72173 + assert("nikita-3298", !znode_is_left_connected(new));
72174 + assert("nikita-3299", !znode_is_right_connected(new));
72175 + assert("nikita-3300", new->left == NULL);
72176 + assert("nikita-3301", new->right == NULL);
72177 + assert("nikita-3278", check_sibling_list(new));
72178 + assert("nikita-3279", check_sibling_list(before));
72179 +
72180 + if (before != NULL) {
72181 + assert("zam-333", znode_is_connected(before));
72182 + new->right = before->right;
72183 + new->left = before;
72184 + ON_DEBUG(new->right_version =
72185 + atomic_inc_return(&delim_key_version);
72186 + new->left_version =
72187 + atomic_inc_return(&delim_key_version););
72188 + if (before->right != NULL) {
72189 + before->right->left = new;
72190 + ON_DEBUG(before->right->left_version =
72191 + atomic_inc_return(&delim_key_version);
72192 + );
72193 + }
72194 + before->right = new;
72195 + ON_DEBUG(before->right_version =
72196 + atomic_inc_return(&delim_key_version);
72197 + );
72198 + } else {
72199 + new->right = NULL;
72200 + new->left = NULL;
72201 + ON_DEBUG(new->right_version =
72202 + atomic_inc_return(&delim_key_version);
72203 + new->left_version =
72204 + atomic_inc_return(&delim_key_version););
72205 + }
72206 + ZF_SET(new, JNODE_LEFT_CONNECTED);
72207 + ZF_SET(new, JNODE_RIGHT_CONNECTED);
72208 + assert("nikita-3280", check_sibling_list(new));
72209 + assert("nikita-3281", check_sibling_list(before));
72210 +}
72211 +
72212 +/*
72213 + Local variables:
72214 + c-indentation-style: "K&R"
72215 + mode-name: "LC"
72216 + c-basic-offset: 8
72217 + tab-width: 8
72218 + fill-column: 80
72219 + End:
72220 +*/
72221 diff -urN linux-2.6.22.orig/fs/reiser4/tree_walk.h linux-2.6.22/fs/reiser4/tree_walk.h
72222 --- linux-2.6.22.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
72223 +++ linux-2.6.22/fs/reiser4/tree_walk.h 2007-07-29 00:25:35.032736855 +0400
72224 @@ -0,0 +1,125 @@
72225 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72226 +
72227 +/* definitions of reiser4 tree walk functions */
72228 +
72229 +#ifndef __FS_REISER4_TREE_WALK_H__
72230 +#define __FS_REISER4_TREE_WALK_H__
72231 +
72232 +#include "debug.h"
72233 +#include "forward.h"
72234 +
72235 +/* establishes horizontal links between cached znodes */
72236 +int connect_znode(coord_t * coord, znode * node);
72237 +
72238 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72239 + have the following common arguments:
72240 +
72241 + return codes:
72242 +
72243 + @return : 0 - OK,
72244 +
72245 +ZAM-FIXME-HANS: wrong return code name. Change them all.
72246 + -ENOENT - neighbor is not in cache, what is detected by sibling
72247 + link absence.
72248 +
72249 + -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
72250 + found (because we are left-/right- most node of the
72251 + tree, for example). Also, this return code is for
72252 + reiser4_get_parent() when we see no parent link -- it
72253 + means that our node is root node.
72254 +
72255 + -E_DEADLOCK - deadlock detected (request from high-priority process
72256 + received), other error codes are conformed to
72257 + /usr/include/asm/errno.h .
72258 +*/
72259 +
72260 +int
72261 +reiser4_get_parent_flags(lock_handle * result, znode * node,
72262 + znode_lock_mode mode, int flags);
72263 +
72264 +/* bits definition for reiser4_get_neighbor function `flags' arg. */
72265 +typedef enum {
72266 + /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
72267 + * find not allocated not connected neigbor by going though upper
72268 + * levels */
72269 + GN_CAN_USE_UPPER_LEVELS = 0x1,
72270 + /* locking left neighbor instead of right one */
72271 + GN_GO_LEFT = 0x2,
72272 + /* automatically load neighbor node content */
72273 + GN_LOAD_NEIGHBOR = 0x4,
72274 + /* return -E_REPEAT if can't lock */
72275 + GN_TRY_LOCK = 0x8,
72276 + /* used internally in tree_walk.c, causes renew_sibling to not
72277 + allocate neighbor znode, but only search for it in znode cache */
72278 + GN_NO_ALLOC = 0x10,
72279 + /* do not go across atom boundaries */
72280 + GN_SAME_ATOM = 0x20,
72281 + /* allow to lock not connected nodes */
72282 + GN_ALLOW_NOT_CONNECTED = 0x40,
72283 + /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
72284 + GN_ASYNC = 0x80
72285 +} znode_get_neigbor_flags;
72286 +
72287 +/* A commonly used wrapper for reiser4_get_parent_flags(). */
72288 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
72289 + znode_lock_mode mode)
72290 +{
72291 + return reiser4_get_parent_flags(result, node, mode,
72292 + GN_ALLOW_NOT_CONNECTED);
72293 +}
72294 +
72295 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72296 + znode_lock_mode lock_mode, int flags);
72297 +
72298 +/* there are wrappers for most common usages of reiser4_get_neighbor() */
72299 +static inline int
72300 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
72301 + int flags)
72302 +{
72303 + return reiser4_get_neighbor(result, node, lock_mode,
72304 + flags | GN_GO_LEFT);
72305 +}
72306 +
72307 +static inline int
72308 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
72309 + int flags)
72310 +{
72311 + ON_DEBUG(check_lock_node_data(node));
72312 + ON_DEBUG(check_lock_data());
72313 + return reiser4_get_neighbor(result, node, lock_mode,
72314 + flags & (~GN_GO_LEFT));
72315 +}
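
A hedged usage sketch of these wrappers, built only from the signatures and
GN_* flags declared in this header; the function name is invented, and the
caller's locking context is assumed to permit a long-term lock here:

	/* Sketch: read-lock the right neighbor of @node, consulting the
	 * parent level when the sibling link is not cached. */
	static int example_lock_right(znode * node, lock_handle * lh)
	{
		int ret;

		init_lh(lh);
		ret = reiser4_get_right_neighbor(lh, node, ZNODE_READ_LOCK,
						 GN_CAN_USE_UPPER_LEVELS);
		if (ret == -E_NO_NEIGHBOR)
			return 0;	/* @node is rightmost on its level */
		if (ret == 0) {
			/* ... lh->node is the locked right neighbor ... */
			done_lh(lh);
		}
		return ret;
	}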
72316 +
72317 +extern void sibling_list_remove(znode * node);
72318 +extern void sibling_list_drop(znode * node);
72319 +extern void sibling_list_insert_nolock(znode * new, znode * before);
72320 +extern void link_left_and_right(znode * left, znode * right);
72321 +
72322 +/* Functions called by tree_walk() when tree_walk() ... */
72323 +struct tree_walk_actor {
72324 + /* ... meets a formatted node, */
72325 + int (*process_znode) (tap_t *, void *);
72326 + /* ... meets an extent, */
72327 + int (*process_extent) (tap_t *, void *);
72328 + /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
72329 + * node or extent processing functions. */
72330 + int (*before) (void *);
72331 +};
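
As an illustration, a hypothetical actor that counts formatted nodes and
extents; the callback names and the counting scheme are invented for this
sketch, and only the struct layout comes from the declaration above:

	static int count_znode(tap_t * tap, void *arg)
	{
		++((long *)arg)[0];
		return 0;	/* 0 lets the traversal continue */
	}

	static int count_extent(tap_t * tap, void *arg)
	{
		++((long *)arg)[1];
		return 0;
	}

	static int count_before(void *arg)
	{
		return 0;	/* called when the traversal (re)starts */
	}

	static struct tree_walk_actor count_actor = {
		.process_znode = count_znode,
		.process_extent = count_extent,
		.before = count_before,
	};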
72332 +
72333 +#if REISER4_DEBUG
72334 +int check_sibling_list(znode * node);
72335 +#else
72336 +#define check_sibling_list(n) (1)
72337 +#endif
72338 +
72339 +#endif /* __FS_REISER4_TREE_WALK_H__ */
72340 +
72341 +/*
72342 + Local variables:
72343 + c-indentation-style: "K&R"
72344 + mode-name: "LC"
72345 + c-basic-offset: 8
72346 + tab-width: 8
72347 + fill-column: 120
72348 + End:
72349 +*/
72350 diff -urN linux-2.6.22.orig/fs/reiser4/txnmgr.c linux-2.6.22/fs/reiser4/txnmgr.c
72351 --- linux-2.6.22.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
72352 +++ linux-2.6.22/fs/reiser4/txnmgr.c 2007-07-29 00:25:35.040738926 +0400
72353 @@ -0,0 +1,3164 @@
72354 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72355 + * reiser4/README */
72356 +
72357 +/* Joshua MacDonald wrote the first draft of this code. */
72358 +
72359 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72360 +filesystem scales only as well as its worst locking design. You need to
72361 +substantially restructure this code. Josh was not as experienced a programmer
72362 +as you. Particularly review how the locking style differs from what you did
72363 +for znodes using hi-lo priority locking, and present to me an opinion on
72364 +whether the differences are well founded. */
72365 +
72366 +/* I cannot help but to disagree with the sentiment above. Locking of
72367 + * transaction manager is _not_ badly designed, and, at the very least, is not
72368 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
72369 + * locking on znodes, especially on the root node of the tree. --nikita,
72370 + * 2003.10.13 */
72371 +
72372 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
72373 + txnmgr processes capture_block requests and manages the relationship between jnodes and
72374 + atoms through the various stages of a transcrash, and it also oversees the fusion and
72375 + capture-on-copy processes. The main difficulty with this task is maintaining a
72376 + deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
72377 + difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
72378 + must be broken. The main requirement is that atom-fusion be deadlock free, so once you
72379 + hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
72380 + that any time you check the atom-pointer of a jnode or handle and then try to lock that
72381 + atom, you must use trylock() and possibly reverse the order.
72382 +
72383 + This code implements the design documented at:
72384 +
72385 + http://namesys.com/txn-doc.html
72386 +
72387 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
72388 +above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
72389 +topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
72390 +year old --- define all technical terms used.
72391 +
72392 +*/
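
The trylock-and-reverse discipline mentioned above, as a minimal sketch; the
field and helper names here follow the spin-lock wrapper style used by this
code but should be treated as illustrative, and the atom reference counting
needed across the unlock window is deliberately omitted for brevity:

	/* Sketch: we hold node's spinlock and want its atom's lock, but the
	 * safe ordering is "atom first, then jnode". */
	atom = node->atom;
	if (atom != NULL && !spin_trylock_atom(atom)) {
		/* reverse the order: drop the jnode lock, take the atom
		 * lock blockingly, then re-take the jnode lock ... */
		spin_unlock_jnode(node);
		spin_lock_atom(atom);
		spin_lock_jnode(node);
		/* ... and re-check, since the jnode may have been fused
		 * into another atom while it was unlocked. */
		if (node->atom != atom) {
			spin_unlock_atom(atom);
			/* retry from the top */
		}
	}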
72393 +
72394 +/* Thoughts on the external transaction interface:
72395 +
72396 + In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
72397 + creates state that lasts for the duration of a system call and is called at the start
72398 + of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
72399 + occupying the scope of a single system call. We wish to give certain applications an
72400 + interface to begin and close (commit) transactions. Since our implementation of
72401 + transactions does not yet support isolation, allowing an application to open a
72402 + transaction implies trusting it to later close the transaction. Part of the
72403 + transaction interface will be aimed at enabling that trust, but the interface for
72404 + actually using transactions is fairly narrow.
72405 +
72406 + BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
72407 + this identifier into a string that a shell-script could use, allowing you to start a
72408 + transaction by issuing a command. Once open, the transcrash should be set in the task
72409 + structure, and there should be options (I suppose) to allow it to be carried across
72410 + fork/exec. A transcrash has several options:
72411 +
72412 + - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
72413 + on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
72414 + capture on reads as well, it should set READ_FUSING.
72415 +
72416 + - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
72417 + eventually close (or else the machine must crash). If the application dies an
72418 + unexpected death with an open transcrash, for example, or if it hangs for a long
72419 + duration, one solution (to avoid crashing the machine) is to simply close it anyway.
72420 + This is a dangerous option, but it is one way to solve the problem until isolated
72421 + transcrashes are available for untrusted applications.
72422 +
72423 + It seems to be what databases do, though it is unclear how one avoids a DoS attack
72424 + creating a vulnerability based on resource starvation. Guaranteeing that some
72425 + minimum amount of computational resources are made available would seem more correct
72426 + than guaranteeing some amount of time. When we again have someone to code the work,
72427 + this issue should be considered carefully. -Hans
72428 +
72429 + RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
72430 + many dirty blocks it expects. The reserve_blocks interface should be called at a point
72431 + where it is safe for the application to fail, because the system may not be able to
72432 + grant the allocation and the application must be able to back-out. For this reason,
72433 + the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
72434 + the application may also wish to extend the allocation after beginning its transcrash.
72435 +
72436 + CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
72437 + modifications that require transaction protection. When isolated transactions are
72438 + supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
72439 + RESERVE_BLOCKS call fails for the application, it should "abort" by calling
72440 + CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
72441 + why, for safety, the application should call RESERVE_BLOCKS before making any changes).
72442 +
72443 +   For actually implementing these out-of-system-call-scoped transcrashes, the
72444 + reiser4_context has a "txn_handle *trans" pointer that may be set to an open
72445 + transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
72446 + "struct kmem_cache *_txnh_slab" created for that purpose in this file.
72447 +*/
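
To make the proposal above concrete, a purely hypothetical rendering of that
interface; none of these functions exist in this patch, and the signatures are
invented (only txn_mode and its TXN_READ_FUSING/TXN_WRITE_FUSING values are
real):

	typedef unsigned long transcrash_id;

	/* BEGIN_TRANSCRASH: the options discussed above are the fusing mode,
	 * a timeout, and an initial block reservation. */
	transcrash_id begin_transcrash(txn_mode mode, long timeout_secs,
				       long reserve_blocks);

	/* RESERVE_BLOCKS: called where the application can still back out,
	 * because the grant may be refused. */
	int reserve_blocks(transcrash_id id, long nr_blocks);

	/* CLOSE_TRANSCRASH: without isolation this always commits; with
	 * isolation it would split into commit and abort. */
	int close_transcrash(transcrash_id id);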
72448 +
72449 +/* Extending the other system call interfaces for future transaction features:
72450 +
72451 + Specialized applications may benefit from passing flags to the ordinary system call
72452 + interface such as read(), write(), or stat(). For example, the application specifies
72453 + WRITE_FUSING by default but wishes to add that a certain read() command should be
72454 + treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
72455 + read, or the file-data read? These issues are straight-forward, but there are a lot of
72456 + them and adding the necessary flags-passing code will be tedious.
72457 +
72458 + When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
72459 + flag, which specifies that although it is a read operation being requested, a
72460 + write-lock should be taken. The reason is that read-locks are shared while write-locks
72461 +   are exclusive, so taking a read-lock when a later write is known in advance will often
72462 +   lead to deadlock. If a reader knows it will write later, it should issue read
72463 + requests with the RMW flag set.
72464 +*/
72465 +
72466 +/*
72467 + The znode/atom deadlock avoidance.
72468 +
72469 + FIXME(Zam): writing of this comment is in progress.
72470 +
72471 +  The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
72472 +  locking of the atom, which makes the reiser4 locking scheme more complex. It
72473 +  had deadlocks until we implemented deadlock avoidance algorithms. Those
72474 +  deadlocks looked like the following: one stopped thread waits for a long-term
72475 +  lock on a znode, while the thread that owns that lock waits until fusion with
72476 +  another atom is allowed.
72477 +
72478 +  The source of the deadlocks is an optimization of not capturing index nodes
72479 +  for read. Let's prove it. Suppose we had a dumb node capturing scheme which
72480 +  unconditionally captured each block before locking it.
72481 +
72482 +  That scheme has no deadlocks. Let's begin with the thread whose stage is
72483 +  ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
72484 +  for a capture because its stage allows fusion with any atom except those
72485 +  which are being committed currently. A process of atom commit can't deadlock
72486 +  because the atom commit procedure does not acquire locks and does not fuse
72487 +  with other atoms. Reiser4 does capturing right before going to sleep inside
72488 +  the longterm_lock_znode() function, which means the znode we want to lock is
72489 +  already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we
72490 +  continue the analysis we see that no process in the sequence may wait for
72491 +  atom fusion. Thereby there are no deadlocks of the described kind.
72492 +
72493 +  The capturing optimization makes the deadlocks possible. A thread can wait
72494 +  for a lock whose owner did not capture that node. The lock owner's current
72495 +  atom is not fused with the first atom and does not get the
72496 +  ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
72497 +  another one which is in ASTAGE_CAPTURE_WAIT already.
72498 +
72499 + The deadlock avoidance scheme includes two algorithms:
72500 +
72501 +  The first algorithm is used when a thread captures a node which is locked
72502 +  but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
72503 +  at the moment we skip their capturing. If such a node is being captured by
72504 +  a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the routine which
72505 +  forces all lock owners to join the current atom is executed.
72506 +
72507 +  The second algorithm does not allow skipping the capturing of already
72508 +  captured nodes.
72509 +
72510 +  Both algorithms together prevent waiting for a long-term lock without fusing
72511 +  with the atoms of all lock owners, which is the key ingredient of atom/znode locking deadlocks.
72512 +*/
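
A minimal two-thread trace of the kind of deadlock being avoided, as a sketch
of the scenario described above (T1 and T2 are threads; A and B their atoms):

	T1 (atom A):	read-locks znode Z without capturing it -- the
			optimization; Z is marked MISSED_IN_CAPTURE.
	T2 (atom B, in
	CAPTURE_WAIT):	tries to capture and lock Z. Without avoidance, T2
			would sleep on Z's long-term lock while atom A never
			reaches the fusion that B is waiting for -- a cycle.

With the first algorithm, capturing the MISSED_IN_CAPTURE node Z from a
thread whose atom is in ASTAGE_CAPTURE_WAIT instead runs the routine that
forces Z's lock owners (here T1) to join that atom, breaking the cycle.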
72513 +
72514 +/*
72515 + * Transactions and mmap(2).
72516 + *
72517 + * 1. Transactions are not supported for accesses through mmap(2), because
72518 + * this would effectively amount to user-level transactions whose duration
72519 + * is beyond control of the kernel.
72520 + *
72521 + * 2. That said, we still want to preserve some decency with regard to
72522 + * mmap(2). During normal write(2) call, following sequence of events
72523 + * happens:
72524 + *
72525 + * 1. page is created;
72526 + *
72527 + * 2. jnode is created, dirtied and captured into current atom.
72528 + *
72529 + * 3. extent is inserted and modified.
72530 + *
72531 + * Steps (2) and (3) take place under long term lock on the twig node.
72532 + *
72533 + * When file is accessed through mmap(2) page is always created during
72534 + * page fault.
72535 + * After this (in reiser4_readpage()->reiser4_readpage_extent()):
72536 + *
72537 + *        1. if access is made to a non-hole page, a new jnode is created (if
72538 + *           necessary)
72539 + *
72540 + *        2. if access is made to a hole page, a jnode is not created (XXX
72541 + *           not clear why).
72542 + *
72543 + * Also, even if page is created by write page fault it is not marked
72544 + * dirty immediately by handle_mm_fault(). Probably this is to avoid races
72545 + * with page write-out.
72546 + *
72547 + * Dirty bit installed by hardware is only transferred to the struct page
72548 + * later, when page is unmapped (in zap_pte_range(), or
72549 + * try_to_unmap_one()).
72550 + *
72551 + *    So, with mmap(2) we have to handle the following irksome situations:
72552 + *
72553 + * 1. there exists modified page (clean or dirty) without jnode
72554 + *
72555 + * 2. there exists modified page (clean or dirty) with clean jnode
72556 + *
72557 + * 3. clean page which is a part of atom can be transparently modified
72558 + * at any moment through mapping without becoming dirty.
72559 + *
72560 + * (1) and (2) can lead to the out-of-memory situation: ->writepage()
72561 + * doesn't know what to do with such pages and ->sync_sb()/->writepages()
72562 + * don't see them, because these methods operate on atoms.
72563 + *
72564 + * (3) can lead to the loss of data: suppose we have dirty page with dirty
72565 + * captured jnode captured by some atom. As part of early flush (for
72566 + * example) page was written out. Dirty bit was cleared on both page and
72567 + * jnode. After this page is modified through mapping, but kernel doesn't
72568 + * notice and just discards page and jnode as part of commit. (XXX
72569 + * actually it doesn't, because to reclaim page ->releasepage() has to be
72570 + * called and before this dirty bit will be transferred to the struct
72571 + * page).
72572 + *
72573 + */
72574 +
72575 +#include "debug.h"
72576 +#include "txnmgr.h"
72577 +#include "jnode.h"
72578 +#include "znode.h"
72579 +#include "block_alloc.h"
72580 +#include "tree.h"
72581 +#include "wander.h"
72582 +#include "ktxnmgrd.h"
72583 +#include "super.h"
72584 +#include "page_cache.h"
72585 +#include "reiser4.h"
72586 +#include "vfs_ops.h"
72587 +#include "inode.h"
72588 +#include "flush.h"
72589 +
72590 +#include <asm/atomic.h>
72591 +#include <linux/types.h>
72592 +#include <linux/fs.h>
72593 +#include <linux/mm.h>
72594 +#include <linux/slab.h>
72595 +#include <linux/pagemap.h>
72596 +#include <linux/writeback.h>
72597 +#include <linux/swap.h> /* for totalram_pages */
72598 +
72599 +static void atom_free(txn_atom * atom);
72600 +
72601 +static int commit_txnh(txn_handle * txnh);
72602 +
72603 +static void wakeup_atom_waitfor_list(txn_atom * atom);
72604 +static void wakeup_atom_waiting_list(txn_atom * atom);
72605 +
72606 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
72607 +
72608 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
72609 +
72610 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
72611 +
72612 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
72613 + txn_capture mode);
72614 +
72615 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
72616 +
72617 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
72618 +
72619 +void reiser4_invalidate_list(struct list_head *);
72620 +
72621 +/* GENERIC STRUCTURES */
72622 +
72623 +typedef struct _txn_wait_links txn_wait_links;
72624 +
72625 +struct _txn_wait_links {
72626 + lock_stack *_lock_stack;
72627 + struct list_head _fwaitfor_link;
72628 + struct list_head _fwaiting_link;
72629 + int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72630 + int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72631 +};
72632 +
72633 +/* FIXME: In theory, we should be using the slab cache init & destructor
72634 + methods instead of, e.g., jnode_init, etc. */
72635 +static struct kmem_cache *_atom_slab = NULL;
72636 +/* this is for user-visible, cross system-call transactions. */
72637 +static struct kmem_cache *_txnh_slab = NULL;
72638 +
72639 +/**
72640 + * init_txnmgr_static - create transaction manager slab caches
72641 + *
72642 + * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
72643 + * initialization.
72644 + */
72645 +int init_txnmgr_static(void)
72646 +{
72647 + assert("jmacd-600", _atom_slab == NULL);
72648 + assert("jmacd-601", _txnh_slab == NULL);
72649 +
72650 + ON_DEBUG(atomic_set(&flush_cnt, 0));
72651 +
72652 + _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
72653 + SLAB_HWCACHE_ALIGN |
72654 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
72655 + if (_atom_slab == NULL)
72656 + return RETERR(-ENOMEM);
72657 +
72658 + _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
72659 + SLAB_HWCACHE_ALIGN, NULL, NULL);
72660 + if (_txnh_slab == NULL) {
72661 + kmem_cache_destroy(_atom_slab);
72662 + _atom_slab = NULL;
72663 + return RETERR(-ENOMEM);
72664 + }
72665 +
72666 + return 0;
72667 +}
72668 +
72669 +/**
72670 + * done_txnmgr_static - delete txn_atom and txn_handle caches
72671 + *
72672 + * This is called on reiser4 module unloading or system shutdown.
72673 + */
72674 +void done_txnmgr_static(void)
72675 +{
72676 + destroy_reiser4_cache(&_atom_slab);
72677 + destroy_reiser4_cache(&_txnh_slab);
72678 +}
72679 +
72680 +/**
72681 + * reiser4_init_txnmgr - initialize a new transaction manager
72682 + * @mgr: pointer to transaction manager embedded in reiser4 super block
72683 + *
72684 + * This is called on mount. Makes necessary initializations.
72685 + */
72686 +void reiser4_init_txnmgr(txn_mgr *mgr)
72687 +{
72688 + assert("umka-169", mgr != NULL);
72689 +
72690 + mgr->atom_count = 0;
72691 + mgr->id_count = 1;
72692 + INIT_LIST_HEAD(&mgr->atoms_list);
72693 + spin_lock_init(&mgr->tmgr_lock);
72694 + mutex_init(&mgr->commit_mutex);
72695 +}
72696 +
72697 +/**
72698 + * reiser4_done_txnmgr - stop transaction manager
72699 + * @mgr: pointer to transaction manager embedded in reiser4 super block
72700 + *
72701 + * This is called on umount. Does sanity checks.
72702 + */
72703 +void reiser4_done_txnmgr(txn_mgr *mgr)
72704 +{
72705 + assert("umka-170", mgr != NULL);
72706 + assert("umka-1701", list_empty_careful(&mgr->atoms_list));
72707 + assert("umka-1702", mgr->atom_count == 0);
72708 +}
72709 +
72710 +/* Initialize a transaction handle. */
72711 +/* Audited by: umka (2002.06.13) */
72712 +static void txnh_init(txn_handle * txnh, txn_mode mode)
72713 +{
72714 + assert("umka-171", txnh != NULL);
72715 +
72716 + txnh->mode = mode;
72717 + txnh->atom = NULL;
72718 + reiser4_ctx_gfp_mask_set();
72719 + txnh->flags = 0;
72720 + spin_lock_init(&txnh->hlock);
72721 + INIT_LIST_HEAD(&txnh->txnh_link);
72722 +}
72723 +
72724 +#if REISER4_DEBUG
72725 +/* Check if a transaction handle is clean. */
72726 +static int txnh_isclean(txn_handle * txnh)
72727 +{
72728 + assert("umka-172", txnh != NULL);
72729 + return txnh->atom == NULL &&
72730 + LOCK_CNT_NIL(spin_locked_txnh);
72731 +}
72732 +#endif
72733 +
72734 +/* Initialize an atom. */
72735 +static void atom_init(txn_atom * atom)
72736 +{
72737 + int level;
72738 +
72739 + assert("umka-173", atom != NULL);
72740 +
72741 + memset(atom, 0, sizeof(txn_atom));
72742 +
72743 + atom->stage = ASTAGE_FREE;
72744 + atom->start_time = jiffies;
72745 +
72746 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
72747 + INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
72748 +
72749 + INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
72750 + INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
72751 + INIT_LIST_HEAD(ATOM_WB_LIST(atom));
72752 + INIT_LIST_HEAD(&atom->inodes);
72753 + spin_lock_init(&(atom->alock));
72754 + /* list of transaction handles */
72755 + INIT_LIST_HEAD(&atom->txnh_list);
72756 + /* link to transaction manager's list of atoms */
72757 + INIT_LIST_HEAD(&atom->atom_link);
72758 + INIT_LIST_HEAD(&atom->fwaitfor_list);
72759 + INIT_LIST_HEAD(&atom->fwaiting_list);
72760 + blocknr_set_init(&atom->delete_set);
72761 + blocknr_set_init(&atom->wandered_map);
72762 +
72763 + init_atom_fq_parts(atom);
72764 +}
72765 +
72766 +#if REISER4_DEBUG
72767 +/* Check if an atom is clean. */
72768 +static int atom_isclean(txn_atom * atom)
72769 +{
72770 + int level;
72771 +
72772 + assert("umka-174", atom != NULL);
72773 +
72774 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72775 + if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
72776 + return 0;
72777 + }
72778 + }
72779 +
72780 + return atom->stage == ASTAGE_FREE &&
72781 + atom->txnh_count == 0 &&
72782 + atom->capture_count == 0 &&
72783 + atomic_read(&atom->refcount) == 0 &&
72784 + (&atom->atom_link == atom->atom_link.next &&
72785 + &atom->atom_link == atom->atom_link.prev) &&
72786 + list_empty_careful(&atom->txnh_list) &&
72787 + list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
72788 + list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
72789 + list_empty_careful(ATOM_WB_LIST(atom)) &&
72790 + list_empty_careful(&atom->fwaitfor_list) &&
72791 + list_empty_careful(&atom->fwaiting_list) &&
72792 + atom_fq_parts_are_clean(atom);
72793 +}
72794 +#endif
72795 +
72796 +/* Begin a transaction in this context. Currently this uses the reiser4_context's
72797 + trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
72798 + this will be extended to allow transaction handles to span several contexts. */
72799 +/* Audited by: umka (2002.06.13) */
72800 +void reiser4_txn_begin(reiser4_context * context)
72801 +{
72802 + assert("jmacd-544", context->trans == NULL);
72803 +
72804 + context->trans = &context->trans_in_ctx;
72805 +
72806 + /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
72807 + transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
72808 + stack allocated right now, but we would like to allow for dynamically allocated
72809 + transcrashes that span multiple system calls.
72810 + */
72811 + txnh_init(context->trans, TXN_WRITE_FUSING);
72812 +}
72813 +
72814 +/* Finish a transaction handle context. */
72815 +int reiser4_txn_end(reiser4_context * context)
72816 +{
72817 + long ret = 0;
72818 + txn_handle *txnh;
72819 +
72820 + assert("umka-283", context != NULL);
72821 + assert("nikita-3012", reiser4_schedulable());
72822 + assert("vs-24", context == get_current_context());
72823 + assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
72824 +
72825 + txnh = context->trans;
72826 + if (txnh != NULL) {
72827 + if (txnh->atom != NULL)
72828 + ret = commit_txnh(txnh);
72829 + assert("jmacd-633", txnh_isclean(txnh));
72830 + context->trans = NULL;
72831 + }
72832 + return ret;
72833 +}
72834 +
72835 +void reiser4_txn_restart(reiser4_context * context)
72836 +{
72837 + reiser4_txn_end(context);
72838 + reiser4_preempt_point();
72839 + reiser4_txn_begin(context);
72840 +}
72841 +
72842 +void reiser4_txn_restart_current(void)
72843 +{
72844 + reiser4_txn_restart(get_current_context());
72845 +}
72846 +
72847 +/* TXN_ATOM */
72848 +
72849 +/* Get the atom belonging to a txnh; the txnh must be unlocked on entry. Returns with the
72850 + txnh locked, and with the atom locked as well if it is not NULL. This performs the
72851 + necessary spin_trylock to break the lock-ordering cycle. May return NULL. */
72852 +static txn_atom *txnh_get_atom(txn_handle * txnh)
72853 +{
72854 + txn_atom *atom;
72855 +
72856 + assert("umka-180", txnh != NULL);
72857 + assert_spin_not_locked(&(txnh->hlock));
72858 +
72859 + while (1) {
72860 + spin_lock_txnh(txnh);
72861 + atom = txnh->atom;
72862 +
72863 + if (atom == NULL)
72864 + break;
72865 +
72866 + if (spin_trylock_atom(atom))
72867 + break;
72868 +
72869 + atomic_inc(&atom->refcount);
72870 +
72871 + spin_unlock_txnh(txnh);
72872 + spin_lock_atom(atom);
72873 + spin_lock_txnh(txnh);
72874 +
72875 + if (txnh->atom == atom) {
72876 + atomic_dec(&atom->refcount);
72877 + break;
72878 + }
72879 +
72880 + spin_unlock_txnh(txnh);
72881 + atom_dec_and_unlock(atom);
72882 + }
72883 +
72884 + return atom;
72885 +}
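
txnh_get_atom() above and jnode_get_atom() below implement the same lock-ordering workaround: the atom lock ranks before the txnh/jnode lock, yet the atom pointer can only be read under the inner lock. The following self-contained userspace sketch shows the dance with pthreads and C11 atomics; all names are illustrative, and unlike the kernel code it never frees the outer object when the refcount drops (the kernel routes that through atom_dec_and_unlock()).

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct outer {                         /* plays the role of txn_atom */
        pthread_mutex_t lock;
        atomic_int refcount;
};

struct inner {                         /* plays the role of txn_handle */
        pthread_mutex_t lock;
        struct outer *owner;           /* protected by inner->lock */
};

/* Returns with inner->lock held; additionally holds owner->lock when
 * the return value is non-NULL. */
static struct outer *get_owner_locked(struct inner *in)
{
        struct outer *o;

        for (;;) {
                pthread_mutex_lock(&in->lock);
                o = in->owner;
                if (o == NULL)
                        return NULL;            /* no owner at all */
                if (pthread_mutex_trylock(&o->lock) == 0)
                        return o;               /* fast path, no inversion */

                /* Slow path: pin the owner so it stays valid, drop the
                 * inner lock, and retake both in the canonical order. */
                atomic_fetch_add(&o->refcount, 1);
                pthread_mutex_unlock(&in->lock);
                pthread_mutex_lock(&o->lock);
                pthread_mutex_lock(&in->lock);

                if (in->owner == o) {           /* still the same owner */
                        atomic_fetch_sub(&o->refcount, 1);
                        return o;
                }
                /* The owner changed while we slept: undo and retry. */
                pthread_mutex_unlock(&in->lock);
                atomic_fetch_sub(&o->refcount, 1);
                pthread_mutex_unlock(&o->lock);
        }
}
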
72886 +
72887 +/* Get the current atom and spin-lock it if the current atom is present. May return NULL. */
72888 +txn_atom *get_current_atom_locked_nocheck(void)
72889 +{
72890 + reiser4_context *cx;
72891 + txn_atom *atom;
72892 + txn_handle *txnh;
72893 +
72894 + cx = get_current_context();
72895 + assert("zam-437", cx != NULL);
72896 +
72897 + txnh = cx->trans;
72898 + assert("zam-435", txnh != NULL);
72899 +
72900 + atom = txnh_get_atom(txnh);
72901 +
72902 + spin_unlock_txnh(txnh);
72903 + return atom;
72904 +}
72905 +
72906 +/* Get the atom belonging to a jnode; the jnode must already be locked. Return
72907 + with both jnode and atom locked. This performs the necessary spin_trylock to
72908 + break the lock-ordering cycle. Returns NULL if the jnode is not attached to
72909 + any atom. */
72910 +txn_atom *jnode_get_atom(jnode * node)
72911 +{
72912 + txn_atom *atom;
72913 +
72914 + assert("umka-181", node != NULL);
72915 +
72916 + while (1) {
72917 + assert_spin_locked(&(node->guard));
72918 +
72919 + atom = node->atom;
72920 + /* node is not in any atom */
72921 + if (atom == NULL)
72922 + break;
72923 +
72924 + /* If atom is not locked, grab the lock and return */
72925 + if (spin_trylock_atom(atom))
72926 + break;
72927 +
72928 +		/* At least one jnode belongs to this atom; this guarantees that
72929 +		 * atom->refcount > 0, so we can safely increment the refcount. */
72930 + atomic_inc(&atom->refcount);
72931 + spin_unlock_jnode(node);
72932 +
72933 + /* re-acquire spin locks in the right order */
72934 + spin_lock_atom(atom);
72935 + spin_lock_jnode(node);
72936 +
72937 + /* check if node still points to the same atom. */
72938 + if (node->atom == atom) {
72939 + atomic_dec(&atom->refcount);
72940 + break;
72941 + }
72942 +
72943 +		/* releasing the atom lock and our reference requires not
72944 +		 * holding any jnode locks. */
72945 + spin_unlock_jnode(node);
72946 +
72947 +		/* We are not sure that this atom has any references other than
72948 +		 * our own, so we call the proper function, which may free the
72949 +		 * atom if the last reference is released. */
72950 + atom_dec_and_unlock(atom);
72951 +
72952 +		/* lock the jnode again to get a valid node->atom pointer
72953 +		 * value. */
72954 + spin_lock_jnode(node);
72955 + }
72956 +
72957 + return atom;
72958 +}
72959 +
72960 +/* Returns true if the neighbor @check is dirty and belongs to the same atom as @node. Used
72961 + by flush code to decide whether the next node (in some direction) is suitable for
72962 + flushing. */
72963 +int
72964 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
72965 +{
72966 + int compat;
72967 + txn_atom *atom;
72968 +
72969 + assert("umka-182", node != NULL);
72970 + assert("umka-183", check != NULL);
72971 +
72972 + /* Not sure what this function is supposed to do if supplied with @check that is
72973 + neither formatted nor unformatted (bitmap or so). */
72974 + assert("nikita-2373", jnode_is_znode(check)
72975 + || jnode_is_unformatted(check));
72976 +
72977 + /* Need a lock on CHECK to get its atom and to check various state bits.
72978 + Don't need a lock on NODE once we get the atom lock. */
72979 +	/* It is not enough to lock two nodes and check (node->atom ==
72980 +	   check->atom): the atom could be locked and in the middle of being
72981 +	   fused, and jnodes of an atom in that state can point to different
72982 +	   atom objects even though the atom is logically the same. */
72983 + spin_lock_jnode(check);
72984 +
72985 + atom = jnode_get_atom(check);
72986 +
72987 + if (atom == NULL) {
72988 + compat = 0;
72989 + } else {
72990 + compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
72991 +
72992 + if (compat && jnode_is_znode(check)) {
72993 + compat &= znode_is_connected(JZNODE(check));
72994 + }
72995 +
72996 + if (compat && alloc_check) {
72997 + compat &= (alloc_value == jnode_is_flushprepped(check));
72998 + }
72999 +
73000 + spin_unlock_atom(atom);
73001 + }
73002 +
73003 + spin_unlock_jnode(check);
73004 +
73005 + return compat;
73006 +}
73007 +
73008 +/* Decrement the atom's reference count and if it falls to zero, free it. */
73009 +void atom_dec_and_unlock(txn_atom * atom)
73010 +{
73011 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73012 +
73013 + assert("umka-186", atom != NULL);
73014 + assert_spin_locked(&(atom->alock));
73015 + assert("zam-1039", atomic_read(&atom->refcount) > 0);
73016 +
73017 + if (atomic_dec_and_test(&atom->refcount)) {
73018 + /* take txnmgr lock and atom lock in proper order. */
73019 + if (!spin_trylock_txnmgr(mgr)) {
73020 + /* This atom should exist after we re-acquire its
73021 + * spinlock, so we increment its reference counter. */
73022 + atomic_inc(&atom->refcount);
73023 + spin_unlock_atom(atom);
73024 + spin_lock_txnmgr(mgr);
73025 + spin_lock_atom(atom);
73026 +
73027 + if (!atomic_dec_and_test(&atom->refcount)) {
73028 + spin_unlock_atom(atom);
73029 + spin_unlock_txnmgr(mgr);
73030 + return;
73031 + }
73032 + }
73033 + assert_spin_locked(&(mgr->tmgr_lock));
73034 + atom_free(atom);
73035 + spin_unlock_txnmgr(mgr);
73036 + } else
73037 + spin_unlock_atom(atom);
73038 +}
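
The trylock fallback in atom_dec_and_unlock() is the mirror image of the pattern above: when the reference count hits zero the object must be unlinked under the manager lock, but the manager lock ranks before the object lock. A hedged userspace sketch, reusing struct outer from the earlier sketch; free_outer() is a hypothetical helper that unlinks the object from the manager's list, frees it and releases its lock, much as atom_free() does above.

/* Drop a reference to @o; called with o->lock held. */
static void put_outer(struct outer *o, pthread_mutex_t *mgr_lock)
{
        if (atomic_fetch_sub(&o->refcount, 1) != 1) {
                pthread_mutex_unlock(&o->lock);    /* not the last ref */
                return;
        }
        if (pthread_mutex_trylock(mgr_lock) != 0) {
                /* Wrong order: resurrect the reference, then take both
                 * locks in the canonical manager-first order. */
                atomic_fetch_add(&o->refcount, 1);
                pthread_mutex_unlock(&o->lock);
                pthread_mutex_lock(mgr_lock);
                pthread_mutex_lock(&o->lock);
                if (atomic_fetch_sub(&o->refcount, 1) != 1) {
                        /* someone took a new reference meanwhile */
                        pthread_mutex_unlock(&o->lock);
                        pthread_mutex_unlock(mgr_lock);
                        return;
                }
        }
        free_outer(o);                  /* unlinks @o and unlocks it */
        pthread_mutex_unlock(mgr_lock);
}
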
73039 +
73040 +/* Create new atom and connect it to given transaction handle. This adds the
73041 + atom to the transaction manager's list and sets its reference count to 1, an
73042 + artificial reference which is kept until it commits. We play strange games
73043 + to avoid allocation under jnode & txnh spinlocks.*/
73044 +
73045 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73046 +{
73047 + txn_atom *atom;
73048 + txn_mgr *mgr;
73049 +
73050 + if (REISER4_DEBUG && rofs_tree(current_tree)) {
73051 + warning("nikita-3366", "Creating atom on rofs");
73052 + dump_stack();
73053 + }
73054 +
73055 + if (*atom_alloc == NULL) {
73056 + (*atom_alloc) = kmem_cache_alloc(_atom_slab,
73057 + reiser4_ctx_gfp_mask_get());
73058 +
73059 + if (*atom_alloc == NULL)
73060 + return RETERR(-ENOMEM);
73061 + }
73062 +
73063 +	/* also, the txnmgr spin lock must be taken before the jnode and txnh
73064 +	   locks. */
73065 + mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73066 + spin_lock_txnmgr(mgr);
73067 + spin_lock_txnh(txnh);
73068 +
73069 +	/* Check whether a new atom is still needed */
73070 + if (txnh->atom != NULL) {
73071 +		/* NOTE-NIKITA it is probably better to free atom_alloc
73072 +		 * here than to thread it up to reiser4_try_capture() */
73073 +
73074 + spin_unlock_txnh(txnh);
73075 + spin_unlock_txnmgr(mgr);
73076 +
73077 + return -E_REPEAT;
73078 + }
73079 +
73080 + atom = *atom_alloc;
73081 + *atom_alloc = NULL;
73082 +
73083 + atom_init(atom);
73084 +
73085 + assert("jmacd-17", atom_isclean(atom));
73086 +
73087 + /*
73088 +	 * lock ordering is broken here. That is ok, as long as @atom is new
73089 +	 * and inaccessible to others. We can't use spin_lock_atom() or
73090 +	 * spin_lock(&atom->alock) because they check locking
73091 +	 * dependencies; spin_trylock_atom() doesn't.
73092 + */
73093 + check_me("", spin_trylock_atom(atom));
73094 +
73095 + /* add atom to the end of transaction manager's list of atoms */
73096 + list_add_tail(&atom->atom_link, &mgr->atoms_list);
73097 + atom->atom_id = mgr->id_count++;
73098 + mgr->atom_count += 1;
73099 +
73100 + /* Release txnmgr lock */
73101 + spin_unlock_txnmgr(mgr);
73102 +
73103 + /* One reference until it commits. */
73104 + atomic_inc(&atom->refcount);
73105 + atom->stage = ASTAGE_CAPTURE_FUSE;
73106 + atom->super = reiser4_get_current_sb();
73107 + capture_assign_txnh_nolock(atom, txnh);
73108 +
73109 + spin_unlock_atom(atom);
73110 + spin_unlock_txnh(txnh);
73111 +
73112 + return -E_REPEAT;
73113 +}
73114 +
73115 +/* Return true if an atom is currently "open". */
73116 +static int atom_isopen(const txn_atom * atom)
73117 +{
73118 + assert("umka-185", atom != NULL);
73119 +
73120 + return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73121 +}
73122 +
73123 +/* Return the number of pointers to this atom that must be updated during fusion. This
73124 + approximates the amount of work to be done. Fusion chooses the atom with fewer
73125 + pointers to fuse into the atom with more pointers. */
73126 +static int atom_pointer_count(const txn_atom * atom)
73127 +{
73128 + assert("umka-187", atom != NULL);
73129 +
73130 + /* This is a measure of the amount of work needed to fuse this atom
73131 + * into another. */
73132 + return atom->txnh_count + atom->capture_count;
73133 +}
73134 +
73135 +/* Called holding the atom lock, this removes the atom from the transaction manager list
73136 + and frees it. */
73137 +static void atom_free(txn_atom * atom)
73138 +{
73139 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73140 +
73141 + assert("umka-188", atom != NULL);
73142 + assert_spin_locked(&(atom->alock));
73143 +
73144 + /* Remove from the txn_mgr's atom list */
73145 + assert_spin_locked(&(mgr->tmgr_lock));
73146 + mgr->atom_count -= 1;
73147 + list_del_init(&atom->atom_link);
73148 +
73149 + /* Clean the atom */
73150 + assert("jmacd-16",
73151 + (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73152 + atom->stage = ASTAGE_FREE;
73153 +
73154 + blocknr_set_destroy(&atom->delete_set);
73155 + blocknr_set_destroy(&atom->wandered_map);
73156 +
73157 + assert("jmacd-16", atom_isclean(atom));
73158 +
73159 + spin_unlock_atom(atom);
73160 +
73161 + kmem_cache_free(_atom_slab, atom);
73162 +}
73163 +
73164 +static int atom_is_dotard(const txn_atom * atom)
73165 +{
73166 + return time_after(jiffies, atom->start_time +
73167 + get_current_super_private()->tmgr.atom_max_age);
73168 +}
73169 +
73170 +static int atom_can_be_committed(txn_atom * atom)
73171 +{
73172 + assert_spin_locked(&(atom->alock));
73173 + assert("zam-885", atom->txnh_count > atom->nr_waiters);
73174 + return atom->txnh_count == atom->nr_waiters + 1;
73175 +}
73176 +
73177 +/* Return true if an atom should commit now. This is determined by aging, atom
73178 + size or atom flags. */
73179 +static int atom_should_commit(const txn_atom * atom)
73180 +{
73181 + assert("umka-189", atom != NULL);
73182 + return
73183 + (atom->flags & ATOM_FORCE_COMMIT) ||
73184 + ((unsigned)atom_pointer_count(atom) >
73185 + get_current_super_private()->tmgr.atom_max_size)
73186 + || atom_is_dotard(atom);
73187 +}
73188 +
73189 +/* return 1 if current atom exists and requires commit. */
73190 +int current_atom_should_commit(void)
73191 +{
73192 + txn_atom *atom;
73193 + int result = 0;
73194 +
73195 + atom = get_current_atom_locked_nocheck();
73196 + if (atom) {
73197 + result = atom_should_commit(atom);
73198 + spin_unlock_atom(atom);
73199 + }
73200 + return result;
73201 +}
73202 +
73203 +static int atom_should_commit_asap(const txn_atom * atom)
73204 +{
73205 + unsigned int captured;
73206 + unsigned int pinnedpages;
73207 +
73208 + assert("nikita-3309", atom != NULL);
73209 +
73210 + captured = (unsigned)atom->capture_count;
73211 + pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73212 +
73213 + return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73214 +}
73215 +
73216 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73217 +{
73218 + jnode *first_dirty;
73219 +
73220 + list_for_each_entry(first_dirty, head, capture_link) {
73221 + if (!(flags & JNODE_FLUSH_COMMIT)) {
73222 + /*
73223 +			 * skip jnodes which "heard banshee" or have active
73224 +			 * I/O
73225 + */
73226 + if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73227 + JF_ISSET(first_dirty, JNODE_WRITEBACK))
73228 + continue;
73229 + }
73230 + return first_dirty;
73231 + }
73232 + return NULL;
73233 +}
73234 +
73235 +/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL if the atom has
73236 + no dirty nodes on its lists */
73237 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73238 +{
73239 + jnode *first_dirty;
73240 + tree_level level;
73241 +
73242 + assert_spin_locked(&(atom->alock));
73243 +
73244 + /* The flush starts from LEAF_LEVEL (=1). */
73245 + for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73246 + if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73247 + continue;
73248 +
73249 + first_dirty =
73250 + find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73251 + flags);
73252 + if (first_dirty)
73253 + return first_dirty;
73254 + }
73255 +
73256 + /* znode-above-root is on the list #0. */
73257 + return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73258 +}
73259 +
73260 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73261 +{
73262 + jnode *cur;
73263 +
73264 + assert("zam-905", atom_is_protected(atom));
73265 +
73266 + cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73267 + while (ATOM_WB_LIST(atom) != &cur->capture_link) {
73268 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
73269 +
73270 + spin_lock_jnode(cur);
73271 + if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73272 + if (JF_ISSET(cur, JNODE_DIRTY)) {
73273 + queue_jnode(fq, cur);
73274 + } else {
73275 + /* move jnode to atom's clean list */
73276 + list_move_tail(&cur->capture_link,
73277 + ATOM_CLEAN_LIST(atom));
73278 + }
73279 + }
73280 + spin_unlock_jnode(cur);
73281 +
73282 + cur = next;
73283 + }
73284 +}
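
dispatch_wb_list() open-codes a deletion-safe traversal: the next pointer is sampled before the body runs, because both queue_jnode() and list_move_tail() unlink the current entry. Assuming ATOM_WB_LIST() yields a struct list_head * (as its other uses in this patch suggest), the same loop can be written with the kernel's list_for_each_entry_safe() idiom; this is a sketch of the equivalent traversal, not a proposed change:

jnode *cur, *next;

list_for_each_entry_safe(cur, next, ATOM_WB_LIST(atom), capture_link) {
        spin_lock_jnode(cur);
        if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
                if (JF_ISSET(cur, JNODE_DIRTY))
                        queue_jnode(fq, cur);           /* unlinks cur */
                else
                        list_move_tail(&cur->capture_link,
                                       ATOM_CLEAN_LIST(atom));
        }
        spin_unlock_jnode(cur);
}
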
73285 +
73286 +/* Scan the current atom's writeback list and re-submit jnodes which are
73287 + * dirty and not under writeback to disk. */
73288 +static int submit_wb_list(void)
73289 +{
73290 + int ret;
73291 + flush_queue_t *fq;
73292 +
73293 + fq = get_fq_for_current_atom();
73294 + if (IS_ERR(fq))
73295 + return PTR_ERR(fq);
73296 +
73297 + dispatch_wb_list(fq->atom, fq);
73298 + spin_unlock_atom(fq->atom);
73299 +
73300 + ret = reiser4_write_fq(fq, NULL, 1);
73301 + reiser4_fq_put(fq);
73302 +
73303 + return ret;
73304 +}
73305 +
73306 +/* Wait for completion of all writes; re-submit the atom's writeback list if needed. */
73307 +static int current_atom_complete_writes(void)
73308 +{
73309 + int ret;
73310 +
73311 +	/* Each jnode on that list was modified and re-dirtied while it already
73312 +	 * had an i/o request running. After i/o completion we have to resubmit
73313 +	 * them to disk. */
73314 + ret = submit_wb_list();
73315 + if (ret < 0)
73316 + return ret;
73317 +
73318 +	/* Wait for all i/o to complete */
73319 + ret = current_atom_finish_all_fq();
73320 + if (ret)
73321 + return ret;
73322 +
73323 +	/* Scan the wb list again; all i/o should be complete by now, and we
73324 +	 * re-submit dirty nodes to disk */
73325 + ret = submit_wb_list();
73326 + if (ret < 0)
73327 + return ret;
73328 +
73329 +	/* Wait for all nodes we just submitted */
73330 + return current_atom_finish_all_fq();
73331 +}
73332 +
73333 +#if REISER4_DEBUG
73334 +
73335 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
73336 +{
73337 + if (atom == NULL) {
73338 + printk("%s: no atom\n", prefix);
73339 + return;
73340 + }
73341 +
73342 + printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
73343 + " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
73344 + atomic_read(&atom->refcount), atom->atom_id, atom->flags,
73345 + atom->txnh_count, atom->capture_count, atom->stage,
73346 + atom->start_time, atom->flushed);
73347 +}
73348 +
73349 +#else /* REISER4_DEBUG */
73350 +
73351 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
73352 +
73353 +#endif /* REISER4_DEBUG */
73354 +
73355 +#define TOOMANYFLUSHES (1 << 13)
73356 +
73357 +/* Called with the atom locked and no open "active" transaction handles except
73358 +   ours, this function calls flush_current_atom() until all dirty nodes are
73359 +   processed. Then it initiates commit processing.
73360 +
73361 +   Called by the single remaining open "active" txnh, which is closing. Other
73362 +   open txnhs belong to processes which wait for atom commit in the commit_txnh()
73363 + routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
73364 + long as we hold the atom lock none of the jnodes can be captured and/or
73365 + locked.
73366 +
73367 + Return value is an error code if commit fails.
73368 +*/
73369 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73370 +{
73371 + reiser4_super_info_data *sbinfo = get_current_super_private();
73372 + long ret = 0;
73373 + /* how many times jnode_flush() was called as a part of attempt to
73374 + * commit this atom. */
73375 + int flushiters;
73376 +
73377 + assert("zam-888", atom != NULL && *atom != NULL);
73378 + assert_spin_locked(&((*atom)->alock));
73379 + assert("zam-887", get_current_context()->trans->atom == *atom);
73380 + assert("jmacd-151", atom_isopen(*atom));
73381 +
73382 + assert("nikita-3184",
73383 + get_current_super_private()->delete_mutex_owner != current);
73384 +
73385 + for (flushiters = 0;; ++flushiters) {
73386 + ret =
73387 + flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73388 + JNODE_FLUSH_COMMIT,
73389 + LONG_MAX /* nr_to_write */ ,
73390 + nr_submitted, atom, NULL);
73391 + if (ret != -E_REPEAT)
73392 + break;
73393 +
73394 + /* if atom's dirty list contains one znode which is
73395 + HEARD_BANSHEE and is locked we have to allow lock owner to
73396 + continue and uncapture that znode */
73397 + reiser4_preempt_point();
73398 +
73399 + *atom = get_current_atom_locked();
73400 + if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73401 + warning("nikita-3176",
73402 + "Flushing like mad: %i", flushiters);
73403 + reiser4_info_atom("atom", *atom);
73404 + DEBUGON(flushiters > (1 << 20));
73405 + }
73406 + }
73407 +
73408 + if (ret)
73409 + return ret;
73410 +
73411 + assert_spin_locked(&((*atom)->alock));
73412 +
73413 + if (!atom_can_be_committed(*atom)) {
73414 + spin_unlock_atom(*atom);
73415 + return RETERR(-E_REPEAT);
73416 + }
73417 +
73418 + if ((*atom)->capture_count == 0)
73419 + goto done;
73420 +
73421 +	/* Up to this point we have been flushing, and while flushing we could
73422 +	   return -E_REPEAT. Now we can commit. We may not return -E_REPEAT
73423 +	   from this point on; the commit must succeed. */
73424 + reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
73425 + ON_DEBUG(((*atom)->committer = current));
73426 + spin_unlock_atom(*atom);
73427 +
73428 + ret = current_atom_complete_writes();
73429 + if (ret)
73430 + return ret;
73431 +
73432 + assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
73433 +
73434 + /* isolate critical code path which should be executed by only one
73435 + * thread using tmgr mutex */
73436 + mutex_lock(&sbinfo->tmgr.commit_mutex);
73437 +
73438 + ret = reiser4_write_logs(nr_submitted);
73439 + if (ret < 0)
73440 + reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
73441 +
73442 +	/* The atom->ovrwr_nodes list is processed with the commit mutex held
73443 +	   because of bitmap nodes, which are captured in a special way in
73444 +	   reiser4_pre_commit_hook_bitmap(); that way does not include
73445 +	   capture_fuse_wait() as the capturing of other nodes does -- the commit
73446 +	   mutex is used for transaction isolation instead. */
73447 + reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
73448 + mutex_unlock(&sbinfo->tmgr.commit_mutex);
73449 +
73450 + reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
73451 + reiser4_invalidate_list(ATOM_WB_LIST(*atom));
73452 + assert("zam-927", list_empty(&(*atom)->inodes));
73453 +
73454 + spin_lock_atom(*atom);
73455 + done:
73456 + reiser4_atom_set_stage(*atom, ASTAGE_DONE);
73457 + ON_DEBUG((*atom)->committer = NULL);
73458 +
73459 + /* Atom's state changes, so wake up everybody waiting for this
73460 + event. */
73461 + wakeup_atom_waiting_list(*atom);
73462 +
73463 + /* Decrement the "until commit" reference, at least one txnh (the caller) is
73464 + still open. */
73465 + atomic_dec(&(*atom)->refcount);
73466 +
73467 + assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
73468 + assert("jmacd-1062", (*atom)->capture_count == 0);
73469 + BUG_ON((*atom)->capture_count != 0);
73470 + assert_spin_locked(&((*atom)->alock));
73471 +
73472 + return ret;
73473 +}
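
To keep the control flow of commit_current_atom() in view, its successful path runs these steps in order:

1. call flush_current_atom() in a loop until it stops returning -E_REPEAT;
2. check atom_can_be_committed() and move the atom to ASTAGE_PRE_COMMIT;
3. current_atom_complete_writes(): wait out in-flight i/o and re-submit the writeback list;
4. reiser4_write_logs() under the transaction manager's commit mutex;
5. uncapture the overwrite, clean and writeback lists via reiser4_invalidate_list();
6. set ASTAGE_DONE, wake all waiters and drop the artificial "until commit" reference.
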
73474 +
73475 +/* TXN_TXNH */
73476 +
73477 +/**
73478 + * force_commit_atom - commit current atom and wait commit completion
73479 + * @txnh: transaction handle attached to the atom being committed
73480 + *
73481 + * Commits the current atom and waits for commit completion; the current atom and
73482 + * @txnh have to be spin-locked before the call. This function unlocks them on exit.
73483 + */
73484 +int force_commit_atom(txn_handle *txnh)
73485 +{
73486 + txn_atom *atom;
73487 +
73488 + assert("zam-837", txnh != NULL);
73489 + assert_spin_locked(&(txnh->hlock));
73490 + assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
73491 +
73492 + atom = txnh->atom;
73493 +
73494 + assert("zam-834", atom != NULL);
73495 + assert_spin_locked(&(atom->alock));
73496 +
73497 + /*
73498 + * Set flags for atom and txnh: forcing atom commit and waiting for
73499 + * commit completion
73500 + */
73501 + txnh->flags |= TXNH_WAIT_COMMIT;
73502 + atom->flags |= ATOM_FORCE_COMMIT;
73503 +
73504 + spin_unlock_txnh(txnh);
73505 + spin_unlock_atom(atom);
73506 +
73507 + /* commit is here */
73508 + reiser4_txn_restart_current();
73509 + return 0;
73510 +}
73511 +
73512 +/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
73513 + * whether we commit all atoms, including new ones created after this
73514 + * function is called. */
73515 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
73516 +{
73517 + int ret;
73518 + txn_atom *atom;
73519 + txn_mgr *mgr;
73520 + txn_handle *txnh;
73521 + unsigned long start_time = jiffies;
73522 + reiser4_context *ctx = get_current_context();
73523 +
73524 + assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
73525 + assert("nikita-3058", reiser4_commit_check_locks());
73526 +
73527 + reiser4_txn_restart_current();
73528 +
73529 + mgr = &get_super_private(super)->tmgr;
73530 +
73531 + txnh = ctx->trans;
73532 +
73533 + again:
73534 +
73535 + spin_lock_txnmgr(mgr);
73536 +
73537 + list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
73538 + spin_lock_atom(atom);
73539 +
73540 +		/* Commit any atom which can be committed. If @commit_all_atoms
73541 +		 * is not set we commit only atoms which were created before
73542 +		 * this call was started. */
73543 + if (commit_all_atoms
73544 + || time_before_eq(atom->start_time, start_time)) {
73545 + if (atom->stage <= ASTAGE_POST_COMMIT) {
73546 + spin_unlock_txnmgr(mgr);
73547 +
73548 + if (atom->stage < ASTAGE_PRE_COMMIT) {
73549 + spin_lock_txnh(txnh);
73550 + /* Add force-context txnh */
73551 + capture_assign_txnh_nolock(atom, txnh);
73552 + ret = force_commit_atom(txnh);
73553 + if (ret)
73554 + return ret;
73555 + } else
73556 +					/* wait for atom commit */
73557 + reiser4_atom_wait_event(atom);
73558 +
73559 + goto again;
73560 + }
73561 + }
73562 +
73563 + spin_unlock_atom(atom);
73564 + }
73565 +
73566 +#if REISER4_DEBUG
73567 + if (commit_all_atoms) {
73568 + reiser4_super_info_data *sbinfo = get_super_private(super);
73569 + spin_lock_reiser4_super(sbinfo);
73570 + assert("zam-813",
73571 + sbinfo->blocks_fake_allocated_unformatted == 0);
73572 + assert("zam-812", sbinfo->blocks_fake_allocated == 0);
73573 + spin_unlock_reiser4_super(sbinfo);
73574 + }
73575 +#endif
73576 +
73577 + spin_unlock_txnmgr(mgr);
73578 +
73579 + return 0;
73580 +}
73581 +
73582 +/* check whether commit_some_atoms() can commit @atom. Locking is up to the
73583 + * caller */
73584 +static int atom_is_committable(txn_atom * atom)
73585 +{
73586 + return
73587 + atom->stage < ASTAGE_PRE_COMMIT &&
73588 + atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
73589 +}
73590 +
73591 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
73592 + * lock at exit */
73593 +int commit_some_atoms(txn_mgr * mgr)
73594 +{
73595 + int ret = 0;
73596 + txn_atom *atom;
73597 + txn_handle *txnh;
73598 + reiser4_context *ctx;
73599 + struct list_head *pos, *tmp;
73600 +
73601 + ctx = get_current_context();
73602 + assert("nikita-2444", ctx != NULL);
73603 +
73604 + txnh = ctx->trans;
73605 + spin_lock_txnmgr(mgr);
73606 +
73607 + /*
73608 +	 * this is to avoid a gcc complaint that atom might be used
73609 +	 * uninitialized
73610 + */
73611 + atom = NULL;
73612 +
73613 + /* look for atom to commit */
73614 + list_for_each_safe(pos, tmp, &mgr->atoms_list) {
73615 + atom = list_entry(pos, txn_atom, atom_link);
73616 + /*
73617 + * first test without taking atom spin lock, whether it is
73618 + * eligible for committing at all
73619 + */
73620 + if (atom_is_committable(atom)) {
73621 + /* now, take spin lock and re-check */
73622 + spin_lock_atom(atom);
73623 + if (atom_is_committable(atom))
73624 + break;
73625 + spin_unlock_atom(atom);
73626 + }
73627 + }
73628 +
73629 + ret = (&mgr->atoms_list == pos);
73630 + spin_unlock_txnmgr(mgr);
73631 +
73632 + if (ret) {
73633 + /* nothing found */
73634 + spin_unlock(&mgr->daemon->guard);
73635 + return 0;
73636 + }
73637 +
73638 + spin_lock_txnh(txnh);
73639 +
73640 + BUG_ON(atom == NULL);
73641 + /* Set the atom to force committing */
73642 + atom->flags |= ATOM_FORCE_COMMIT;
73643 +
73644 + /* Add force-context txnh */
73645 + capture_assign_txnh_nolock(atom, txnh);
73646 +
73647 + spin_unlock_txnh(txnh);
73648 + spin_unlock_atom(atom);
73649 +
73650 + /* we are about to release daemon spin lock, notify daemon it
73651 + has to rescan atoms */
73652 + mgr->daemon->rescan = 1;
73653 + spin_unlock(&mgr->daemon->guard);
73654 + reiser4_txn_restart_current();
73655 + return 0;
73656 +}
73657 +
73658 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
73659 +{
73660 + int atom_stage;
73661 + txn_atom *atom_2;
73662 + int repeat;
73663 +
73664 + assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
73665 +
73666 + atom_stage = atom->stage;
73667 + repeat = 0;
73668 +
73669 + if (!spin_trylock_txnmgr(tmgr)) {
73670 + atomic_inc(&atom->refcount);
73671 + spin_unlock_atom(atom);
73672 + spin_lock_txnmgr(tmgr);
73673 + spin_lock_atom(atom);
73674 + repeat = 1;
73675 + if (atom->stage != atom_stage) {
73676 + spin_unlock_txnmgr(tmgr);
73677 + atom_dec_and_unlock(atom);
73678 + return -E_REPEAT;
73679 + }
73680 + atomic_dec(&atom->refcount);
73681 + }
73682 +
73683 + list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
73684 + if (atom == atom_2)
73685 + continue;
73686 + /*
73687 + * if trylock does not succeed we just do not fuse with that
73688 + * atom.
73689 + */
73690 + if (spin_trylock_atom(atom_2)) {
73691 + if (atom_2->stage < ASTAGE_PRE_COMMIT) {
73692 + spin_unlock_txnmgr(tmgr);
73693 + capture_fuse_into(atom_2, atom);
73694 +				/* all locks are lost; we can only repeat here */
73695 + return -E_REPEAT;
73696 + }
73697 + spin_unlock_atom(atom_2);
73698 + }
73699 + }
73700 + atom->flags |= ATOM_CANCEL_FUSION;
73701 + spin_unlock_txnmgr(tmgr);
73702 + if (repeat) {
73703 + spin_unlock_atom(atom);
73704 + return -E_REPEAT;
73705 + }
73706 + return 0;
73707 +}
73708 +
73709 +/* Calls jnode_flush() for the current atom if it exists; if not, just takes
73710 +   another atom and calls jnode_flush() for it. If the current transaction
73711 +   handle already has an assigned atom (the current atom) we have to close the
73712 +   current transaction prior to switching to another atom, or do something with
73713 +   the current atom. This code tries to flush the current atom.
73714 +
73715 + flush_some_atom() is called as part of memory clearing process. It is
73716 + invoked from balance_dirty_pages(), pdflushd, and entd.
73717 +
73718 +   If we can flush no nodes, the atom is committed, because this frees memory.
73719 +
73720 +   If the atom is too large or too old, it is committed as well.
73721 +*/
73722 +int
73723 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
73724 + int flags)
73725 +{
73726 + reiser4_context *ctx = get_current_context();
73727 + txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
73728 + txn_handle *txnh = ctx->trans;
73729 + txn_atom *atom;
73730 + int ret;
73731 +
73732 + BUG_ON(wbc->nr_to_write == 0);
73733 + BUG_ON(*nr_submitted != 0);
73734 + assert("zam-1042", txnh != NULL);
73735 + repeat:
73736 + if (txnh->atom == NULL) {
73737 + /* current atom is not available, take first from txnmgr */
73738 + spin_lock_txnmgr(tmgr);
73739 +
73740 + /* traverse the list of all atoms */
73741 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73742 + /* lock atom before checking its state */
73743 + spin_lock_atom(atom);
73744 +
73745 + /*
73746 + * we need an atom which is not being committed and
73747 +			 * which has no flushers (jnode_flush() adds one flusher
73748 +			 * at the beginning and subtracts one at the end).
73749 + */
73750 + if (atom->stage < ASTAGE_PRE_COMMIT &&
73751 + atom->nr_flushers == 0) {
73752 + spin_lock_txnh(txnh);
73753 + capture_assign_txnh_nolock(atom, txnh);
73754 + spin_unlock_txnh(txnh);
73755 +
73756 + goto found;
73757 + }
73758 +
73759 + spin_unlock_atom(atom);
73760 + }
73761 +
73762 + /*
73763 +		 * Write throttling handles the case when no atom can be
73764 +		 * flushed or committed.
73765 + */
73766 + if (!current_is_pdflush() && !wbc->nonblocking) {
73767 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73768 + spin_lock_atom(atom);
73769 + /* Repeat the check from the above. */
73770 + if (atom->stage < ASTAGE_PRE_COMMIT
73771 + && atom->nr_flushers == 0) {
73772 + spin_lock_txnh(txnh);
73773 + capture_assign_txnh_nolock(atom, txnh);
73774 + spin_unlock_txnh(txnh);
73775 +
73776 + goto found;
73777 + }
73778 + if (atom->stage <= ASTAGE_POST_COMMIT) {
73779 + spin_unlock_txnmgr(tmgr);
73780 + /*
73781 + * we just wait until atom's flusher
73782 +					 * makes progress in flushing or
73783 + * committing the atom
73784 + */
73785 + reiser4_atom_wait_event(atom);
73786 + goto repeat;
73787 + }
73788 + spin_unlock_atom(atom);
73789 + }
73790 + }
73791 + spin_unlock_txnmgr(tmgr);
73792 + return 0;
73793 + found:
73794 + spin_unlock_txnmgr(tmgr);
73795 + } else
73796 + atom = get_current_atom_locked();
73797 +
73798 + BUG_ON(atom->super != ctx->super);
73799 + assert("vs-35", atom->super == ctx->super);
73800 + if (start) {
73801 + spin_lock_jnode(start);
73802 + ret = (atom == start->atom) ? 1 : 0;
73803 + spin_unlock_jnode(start);
73804 + if (ret == 0)
73805 + start = NULL;
73806 + }
73807 + ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
73808 + if (ret == 0) {
73809 +		/* flush_current_atom returns 0 only if it submitted nothing
73810 +		   for write */
73811 + BUG_ON(*nr_submitted != 0);
73812 + if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
73813 + if (atom->capture_count < tmgr->atom_min_size &&
73814 + !(atom->flags & ATOM_CANCEL_FUSION)) {
73815 + ret = txn_try_to_fuse_small_atom(tmgr, atom);
73816 + if (ret == -E_REPEAT) {
73817 + reiser4_preempt_point();
73818 + goto repeat;
73819 + }
73820 + }
73821 +			/* if early flushing could not make more nodes clean,
73822 +			 * or the atom is too old or too large,
73823 +			 * we force the current atom to commit */
73824 +			/* wait for commit completion, but only if this
73825 +			 * wouldn't stall pdflush and the entd thread. */
73826 + if (!wbc->nonblocking && !ctx->entd)
73827 + txnh->flags |= TXNH_WAIT_COMMIT;
73828 + atom->flags |= ATOM_FORCE_COMMIT;
73829 + }
73830 + spin_unlock_atom(atom);
73831 + } else if (ret == -E_REPEAT) {
73832 + if (*nr_submitted == 0) {
73833 +			/* let others who hamper flushing (by holding long-term
73834 +			   locks, for instance) free the way for flush */
73835 + reiser4_preempt_point();
73836 + goto repeat;
73837 + }
73838 + ret = 0;
73839 + }
73840 +/*
73841 + if (*nr_submitted > wbc->nr_to_write)
73842 + warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
73843 +*/
73844 + reiser4_txn_restart(ctx);
73845 +
73846 + return ret;
73847 +}
73848 +
73849 +/* Remove processed nodes from the given atom list (thereby removing them from the transaction). */
73850 +void reiser4_invalidate_list(struct list_head *head)
73851 +{
73852 + while (!list_empty(head)) {
73853 + jnode *node;
73854 +
73855 + node = list_entry(head->next, jnode, capture_link);
73856 + spin_lock_jnode(node);
73857 + reiser4_uncapture_block(node);
73858 + jput(node);
73859 + }
73860 +}
73861 +
73862 +static void init_wlinks(txn_wait_links * wlinks)
73863 +{
73864 + wlinks->_lock_stack = get_current_lock_stack();
73865 + INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
73866 + INIT_LIST_HEAD(&wlinks->_fwaiting_link);
73867 + wlinks->waitfor_cb = NULL;
73868 + wlinks->waiting_cb = NULL;
73869 +}
73870 +
73871 +/* Add ourselves to the atom's fwaitfor list and wait for somebody to wake us up. */
73872 +void reiser4_atom_wait_event(txn_atom * atom)
73873 +{
73874 + txn_wait_links _wlinks;
73875 +
73876 + assert_spin_locked(&(atom->alock));
73877 + assert("nikita-3156",
73878 + lock_stack_isclean(get_current_lock_stack()) ||
73879 + atom->nr_running_queues > 0);
73880 +
73881 + init_wlinks(&_wlinks);
73882 + list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
73883 + atomic_inc(&atom->refcount);
73884 + spin_unlock_atom(atom);
73885 +
73886 + reiser4_prepare_to_sleep(_wlinks._lock_stack);
73887 + reiser4_go_to_sleep(_wlinks._lock_stack);
73888 +
73889 + spin_lock_atom(atom);
73890 + list_del(&_wlinks._fwaitfor_link);
73891 + atom_dec_and_unlock(atom);
73892 +}
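
The wait-links machinery above is a hand-rolled condition variable: enqueue on the atom's fwaitfor list, pin the atom, drop its lock, sleep, then relock and dequeue. A compact userspace equivalent using a pthread condition variable, where the enqueue/dequeue bookkeeping is implicit; the names are illustrative:

struct waitable {                      /* plays the role of txn_atom */
        pthread_mutex_t lock;
        pthread_cond_t event;          /* stands in for fwaitfor_list */
        unsigned generation;           /* guards against spurious wakeups */
};

/* reiser4_atom_wait_event() analogue: called with w->lock held,
 * returns with it held again. */
static void wait_event_once(struct waitable *w)
{
        unsigned gen = w->generation;

        while (w->generation == gen)
                pthread_cond_wait(&w->event, &w->lock);
}

/* reiser4_atom_send_event() analogue: called with w->lock held. */
static void send_event(struct waitable *w)
{
        w->generation++;
        pthread_cond_broadcast(&w->event);
}
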
73893 +
73894 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
73895 +{
73896 + assert("nikita-3535", atom != NULL);
73897 + assert_spin_locked(&(atom->alock));
73898 + assert("nikita-3536", stage <= ASTAGE_INVALID);
73899 + /* Excelsior! */
73900 + assert("nikita-3537", stage >= atom->stage);
73901 + if (atom->stage != stage) {
73902 + atom->stage = stage;
73903 + reiser4_atom_send_event(atom);
73904 + }
73905 +}
73906 +
73907 +/* wake all threads which wait for an event */
73908 +void reiser4_atom_send_event(txn_atom * atom)
73909 +{
73910 + assert_spin_locked(&(atom->alock));
73911 + wakeup_atom_waitfor_list(atom);
73912 +}
73913 +
73914 +/* Tells the txn manager code whether the owner of this txn_handle should wait for atom commit
73915 + completion (for example, because it is doing fsync(2)) */
73916 +static int should_wait_commit(txn_handle * h)
73917 +{
73918 + return h->flags & TXNH_WAIT_COMMIT;
73919 +}
73920 +
73921 +typedef struct commit_data {
73922 + txn_atom *atom;
73923 + txn_handle *txnh;
73924 + long nr_written;
73925 +	/* as an optimization we start committing the atom by first trying to
73926 +	 * flush it a few times without switching it into ASTAGE_CAPTURE_WAIT.
73927 +	 * This reduces stalls due to other threads waiting for the atom in the
73928 +	 * ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these
73929 +	 * preliminary flushes. */
73930 + int preflush;
73931 +	/* have we waited on the atom? */
73932 + int wait;
73933 + int failed;
73934 + int wake_ktxnmgrd_up;
73935 +} commit_data;
73936 +
73937 +/*
73938 + * Called from commit_txnh() repeatedly, until either an error happens or the
73939 + * atom commits successfully.
73940 + */
73941 +static int try_commit_txnh(commit_data * cd)
73942 +{
73943 + int result;
73944 +
73945 + assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
73946 +
73947 + /* Get the atom and txnh locked. */
73948 + cd->atom = txnh_get_atom(cd->txnh);
73949 + assert("jmacd-309", cd->atom != NULL);
73950 + spin_unlock_txnh(cd->txnh);
73951 +
73952 + if (cd->wait) {
73953 + cd->atom->nr_waiters--;
73954 + cd->wait = 0;
73955 + }
73956 +
73957 + if (cd->atom->stage == ASTAGE_DONE)
73958 + return 0;
73959 +
73960 + if (cd->failed)
73961 + return 0;
73962 +
73963 + if (atom_should_commit(cd->atom)) {
73964 + /* if atom is _very_ large schedule it for commit as soon as
73965 + * possible. */
73966 + if (atom_should_commit_asap(cd->atom)) {
73967 + /*
73968 + * When atom is in PRE_COMMIT or later stage following
73969 + * invariant (encoded in atom_can_be_committed())
73970 + * holds: there is exactly one non-waiter transaction
73971 + * handle opened on this atom. When thread wants to
73972 + * wait until atom commits (for example sync()) it
73973 + * waits on atom event after increasing
73974 +			 * atom->nr_waiters (see below in this function). It
73975 +			 * cannot be guaranteed that the atom has already committed
73976 +			 * when the event is received, so the loop has to be
73977 +			 * restarted. But if the atom switched into the PRE_COMMIT
73978 +			 * stage and became too large, we cannot change its
73979 + * state back to CAPTURE_WAIT (atom stage can only
73980 + * increase monotonically), hence this check.
73981 + */
73982 + if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
73983 + reiser4_atom_set_stage(cd->atom,
73984 + ASTAGE_CAPTURE_WAIT);
73985 + cd->atom->flags |= ATOM_FORCE_COMMIT;
73986 + }
73987 + if (cd->txnh->flags & TXNH_DONT_COMMIT) {
73988 + /*
73989 + * this thread (transaction handle that is) doesn't
73990 + * want to commit atom. Notify waiters that handle is
73991 + * closed. This can happen, for example, when we are
73992 + * under VFS directory lock and don't want to commit
73993 + * atom right now to avoid stalling other threads
73994 + * working in the same directory.
73995 + */
73996 +
73997 + /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
73998 + * commit this atom: no atom waiters and only one
73999 + * (our) open transaction handle. */
74000 + cd->wake_ktxnmgrd_up =
74001 + cd->atom->txnh_count == 1 &&
74002 + cd->atom->nr_waiters == 0;
74003 + reiser4_atom_send_event(cd->atom);
74004 + result = 0;
74005 + } else if (!atom_can_be_committed(cd->atom)) {
74006 + if (should_wait_commit(cd->txnh)) {
74007 + /* sync(): wait for commit */
74008 + cd->atom->nr_waiters++;
74009 + cd->wait = 1;
74010 + reiser4_atom_wait_event(cd->atom);
74011 + result = RETERR(-E_REPEAT);
74012 + } else {
74013 + result = 0;
74014 + }
74015 + } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74016 + /*
74017 + * optimization: flush atom without switching it into
74018 + * ASTAGE_CAPTURE_WAIT.
74019 + *
74020 + * But don't do this for ktxnmgrd, because ktxnmgrd
74021 + * should never block on atom fusion.
74022 + */
74023 + result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74024 + LONG_MAX, &cd->nr_written,
74025 + &cd->atom, NULL);
74026 + if (result == 0) {
74027 + spin_unlock_atom(cd->atom);
74028 + cd->preflush = 0;
74029 + result = RETERR(-E_REPEAT);
74030 +			} else	/* Atom wasn't flushed
74031 + * completely. Rinse. Repeat. */
74032 + --cd->preflush;
74033 + } else {
74034 + /* We change atom state to ASTAGE_CAPTURE_WAIT to
74035 +			   prevent atom fusion and count ourselves as an active
74036 + flusher */
74037 + reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74038 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74039 +
74040 + result =
74041 + commit_current_atom(&cd->nr_written, &cd->atom);
74042 + if (result != 0 && result != -E_REPEAT)
74043 + cd->failed = 1;
74044 + }
74045 + } else
74046 + result = 0;
74047 +
74048 +#if REISER4_DEBUG
74049 + if (result == 0)
74050 + assert_spin_locked(&(cd->atom->alock));
74051 +#endif
74052 +
74053 + /* perfectly valid assertion, except that when atom/txnh is not locked
74054 + * fusion can take place, and cd->atom points nowhere. */
74055 + /*
74056 + assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74057 + */
74058 + return result;
74059 +}
74060 +
74061 +/* Called to commit a transaction handle. This decrements the atom's number of open
74062 +   handles and, if it is the last handle to commit and the atom should commit, initiates
74063 +   atom commit; the number of written blocks is tracked in commit_data. Returns 0. */
74064 +static int commit_txnh(txn_handle * txnh)
74065 +{
74066 + commit_data cd;
74067 + assert("umka-192", txnh != NULL);
74068 +
74069 + memset(&cd, 0, sizeof cd);
74070 + cd.txnh = txnh;
74071 + cd.preflush = 10;
74072 +
74073 + /* calls try_commit_txnh() until either atom commits, or error
74074 + * happens */
74075 + while (try_commit_txnh(&cd) != 0)
74076 + reiser4_preempt_point();
74077 +
74078 + spin_lock_txnh(txnh);
74079 +
74080 + cd.atom->txnh_count -= 1;
74081 + txnh->atom = NULL;
74082 + /* remove transaction handle from atom's list of transaction handles */
74083 + list_del_init(&txnh->txnh_link);
74084 +
74085 + spin_unlock_txnh(txnh);
74086 + atom_dec_and_unlock(cd.atom);
74087 +	/* if the current thread doesn't want to do the commit itself
74088 +	 * (TXNH_DONT_COMMIT is set, probably because it takes time), we do
74089 +	 * that work asynchronously via the ktxnmgrd daemon. */
74090 + if (cd.wake_ktxnmgrd_up)
74091 + ktxnmgrd_kick(&get_current_super_private()->tmgr);
74092 +
74093 + return 0;
74094 +}
74095 +
74096 +/* TRY_CAPTURE */
74097 +
74098 +/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74099 + condition indicates that the request should be retried, and it may block if the
74100 + txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74101 +
74102 + This routine encodes the basic logic of block capturing described by:
74103 +
74104 + http://namesys.com/v4/v4.html
74105 +
74106 + Our goal here is to ensure that any two blocks that contain dependent modifications
74107 + should commit at the same time. This function enforces this discipline by initiating
74108 + fusion whenever a transaction handle belonging to one atom requests to read or write a
74109 + block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74110 +
74111 + In addition, this routine handles the initial assignment of atoms to blocks and
74112 + transaction handles. These are possible outcomes of this function:
74113 +
74114 + 1. The block and handle are already part of the same atom: return immediate success
74115 +
74116 + 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74117 + the handle to the block's atom.
74118 +
74119 + 3. The handle is assigned but the block is not: call capture_assign_block to assign
74120 + the block to the handle's atom.
74121 +
74122 + 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74123 + to fuse atoms.
74124 +
74125 +   5. Neither block nor handle is assigned: create a new atom and assign them both.
74126 +
74127 + 6. A read request for a non-captured block: return immediate success.
74128 +
74129 + This function acquires and releases the handle's spinlock. This function is called
74130 + under the jnode lock and if the return value is 0, it returns with the jnode lock still
74131 + held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74132 +   released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
74133 + lock in the failure case.
74134 +*/
74135 +static int try_capture_block(
74136 + txn_handle * txnh, jnode * node, txn_capture mode,
74137 + txn_atom ** atom_alloc)
74138 +{
74139 + txn_atom *block_atom;
74140 + txn_atom *txnh_atom;
74141 +
74142 + /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
74143 + assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74144 +
74145 + /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74146 + * node->tree somewhere. */
74147 + assert("umka-194", txnh != NULL);
74148 + assert("umka-195", node != NULL);
74149 +
74150 + /* The jnode is already locked! Being called from reiser4_try_capture(). */
74151 + assert_spin_locked(&(node->guard));
74152 + block_atom = node->atom;
74153 +
74154 +	/* Get the txnh spinlock; this allows us to compare txn_atom pointers but it doesn't
74155 +	   let us touch the atoms themselves. */
74156 + spin_lock_txnh(txnh);
74157 + txnh_atom = txnh->atom;
74158 +	/* The capture process continues into one of four branches depending on
74159 +	   which of the two atoms (the block atom, node->atom, and the current
74160 +	   atom, txnh->atom) exist. */
74161 + if (txnh_atom == NULL) {
74162 + if (block_atom == NULL) {
74163 + spin_unlock_txnh(txnh);
74164 + spin_unlock_jnode(node);
74165 + /* assign empty atom to the txnh and repeat */
74166 + return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74167 + } else {
74168 + atomic_inc(&block_atom->refcount);
74169 + /* node spin-lock isn't needed anymore */
74170 + spin_unlock_jnode(node);
74171 + if (!spin_trylock_atom(block_atom)) {
74172 + spin_unlock_txnh(txnh);
74173 + spin_lock_atom(block_atom);
74174 + spin_lock_txnh(txnh);
74175 + }
74176 + /* re-check state after getting txnh and the node
74177 + * atom spin-locked */
74178 + if (node->atom != block_atom || txnh->atom != NULL) {
74179 + spin_unlock_txnh(txnh);
74180 + atom_dec_and_unlock(block_atom);
74181 + return RETERR(-E_REPEAT);
74182 + }
74183 + atomic_dec(&block_atom->refcount);
74184 + if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74185 + (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74186 + block_atom->txnh_count != 0))
74187 + return capture_fuse_wait(txnh, block_atom, NULL, mode);
74188 + capture_assign_txnh_nolock(block_atom, txnh);
74189 + spin_unlock_txnh(txnh);
74190 + spin_unlock_atom(block_atom);
74191 + return RETERR(-E_REPEAT);
74192 + }
74193 + } else {
74194 +		/* It is time to perform a deadlock prevention check over the
74195 +		   node we want to capture. It is possible this node was locked
74196 +		   for read without capturing it. The optimization which allows
74197 +		   this helps us keep atoms independent as long as
74198 +		   possible, but it may cause lock/fuse deadlock problems.
74199 +
74200 +		   A number of similar deadlock situations with locked but not
74201 +		   captured nodes were found. In each situation there are two
74202 +		   or more threads: one of them does flushing while another one
74203 +		   does routine balancing or tree lookup. The flushing thread
74204 +		   (F) sleeps in a long term locking request for node (N); another
74205 +		   thread (A) sleeps trying to capture some node already
74206 +		   belonging to F's atom, while F's atom is in a state which
74207 +		   prevents immediate fusion.
74208 +
74209 +		   Deadlocks of this kind cannot happen if node N was properly
74210 +		   captured by thread A. The F thread fuses atoms before locking;
74211 +		   therefore the current atoms of threads F and A become the
74212 +		   same atom and thread A may proceed. This does not work if
74213 +		   node N was not captured, because the atom fusion does not
74214 +		   happen.
74215 +
74216 + The following scheme solves the deadlock: If
74217 + longterm_lock_znode locks and does not capture a znode, that
74218 + znode is marked as MISSED_IN_CAPTURE. A node marked this way
74219 + is processed by the code below which restores the missed
74220 + capture and fuses current atoms of all the node lock owners
74221 + by calling the fuse_not_fused_lock_owners() function. */
74222 + if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74223 + JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74224 + if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74225 + spin_unlock_txnh(txnh);
74226 + spin_unlock_jnode(node);
74227 + fuse_not_fused_lock_owners(txnh, JZNODE(node));
74228 + return RETERR(-E_REPEAT);
74229 + }
74230 + }
74231 + if (block_atom == NULL) {
74232 + atomic_inc(&txnh_atom->refcount);
74233 + spin_unlock_txnh(txnh);
74234 + if (!spin_trylock_atom(txnh_atom)) {
74235 + spin_unlock_jnode(node);
74236 + spin_lock_atom(txnh_atom);
74237 + spin_lock_jnode(node);
74238 + }
74239 + if (txnh->atom != txnh_atom || node->atom != NULL
74240 + || JF_ISSET(node, JNODE_IS_DYING)) {
74241 + spin_unlock_jnode(node);
74242 + atom_dec_and_unlock(txnh_atom);
74243 + return RETERR(-E_REPEAT);
74244 + }
74245 + atomic_dec(&txnh_atom->refcount);
74246 + capture_assign_block_nolock(txnh_atom, node);
74247 + spin_unlock_atom(txnh_atom);
74248 + } else {
74249 + if (txnh_atom != block_atom) {
74250 + if (mode & TXN_CAPTURE_DONT_FUSE) {
74251 + spin_unlock_txnh(txnh);
74252 + spin_unlock_jnode(node);
74253 + /* we are in a "no-fusion" mode and @node is
74254 + * already part of transaction. */
74255 + return RETERR(-E_NO_NEIGHBOR);
74256 + }
74257 + return capture_init_fusion(node, txnh, mode);
74258 + }
74259 + spin_unlock_txnh(txnh);
74260 + }
74261 + }
74262 + return 0;
74263 +}
74264 +
74265 +static txn_capture
74266 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74267 +{
74268 + txn_capture cap_mode;
74269 +
74270 + assert_spin_locked(&(node->guard));
74271 +
74272 + /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74273 +
74274 + if (lock_mode == ZNODE_WRITE_LOCK) {
74275 + cap_mode = TXN_CAPTURE_WRITE;
74276 + } else if (node->atom != NULL) {
74277 + cap_mode = TXN_CAPTURE_WRITE;
74278 + } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
74279 + jnode_get_level(node) == LEAF_LEVEL) {
74280 + /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
74281 + /* We only need a READ_FUSING capture at the leaf level. This
74282 + is because the internal levels of the tree (twigs included)
74283 + are redundant from the point of the user that asked for a
74284 + read-fusing transcrash. The user only wants to read-fuse
74285 + atoms due to reading uncommitted data that another user has
74286 + written. It is the file system that reads/writes the
74287 + internal tree levels, the user only reads/writes leaves. */
74288 + cap_mode = TXN_CAPTURE_READ_ATOMIC;
74289 + } else {
74290 + /* In this case (read lock at a non-leaf) there's no reason to
74291 + * capture. */
74292 + /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74293 + return 0;
74294 + }
74295 +
74296 + cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74297 + assert("nikita-3186", cap_mode != 0);
74298 + return cap_mode;
74299 +}
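
In summary, build_capture_mode() yields TXN_CAPTURE_WRITE for any write lock and for read locks on nodes that already belong to an atom; for read locks on uncaptured nodes it returns 0, i.e. no capture at all (the READ_FUSING leaf-level case is compiled out by the "0 &&" above). Any non-zero result then has the TXN_CAPTURE_NONBLOCKING and TXN_CAPTURE_DONT_FUSE bits from @flags OR-ed in.
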
74300 +
74301 +/* This is an external interface to try_capture_block(); it calls
74302 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
74303 +
74304 +   @node: node to capture,
74305 +   @lock_mode: read or write lock, used in the capture mode calculation,
74306 +   @flags: see the txn_capture flags enumeration.
74308 +
74309 + @return: 0 - node was successfully captured, -E_REPEAT - capture request
74310 + cannot be processed immediately as it was requested in flags,
74311 + < 0 - other errors.
74312 +*/
74313 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
74314 + txn_capture flags)
74315 +{
74316 + txn_atom *atom_alloc = NULL;
74317 + txn_capture cap_mode;
74318 + txn_handle *txnh = get_current_context()->trans;
74319 + int ret;
74320 +
74321 + assert_spin_locked(&(node->guard));
74322 +
74323 + repeat:
74324 + if (JF_ISSET(node, JNODE_IS_DYING))
74325 + return RETERR(-EINVAL);
74326 + if (node->atom != NULL && txnh->atom == node->atom)
74327 + return 0;
74328 + cap_mode = build_capture_mode(node, lock_mode, flags);
74329 + if (cap_mode == 0 ||
74330 + (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74331 + /* Mark this node as "MISSED". It helps in further deadlock
74332 + * analysis */
74333 + if (jnode_is_znode(node))
74334 + JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74335 + return 0;
74336 + }
74337 + /* Repeat try_capture as long as -E_REPEAT is returned. */
74338 + ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74339 + /* Regardless of non_blocking:
74340 +
74341 + If ret == 0 then jnode is still locked.
74342 + If ret != 0 then jnode is unlocked.
74343 + */
74344 +#if REISER4_DEBUG
74345 + if (ret == 0)
74346 + assert_spin_locked(&(node->guard));
74347 + else
74348 + assert_spin_not_locked(&(node->guard));
74349 +#endif
74350 + assert_spin_not_locked(&(txnh->guard));
74351 +
74352 + if (ret == -E_REPEAT) {
74353 + /* E_REPEAT implies all locks were released, therefore we need
74354 + to take the jnode's lock again. */
74355 + spin_lock_jnode(node);
74356 +
74357 + /* Although this may appear to be a busy loop, it is not.
74358 + There are several conditions that cause E_REPEAT to be
74359 + returned by the call to try_capture_block, all of them
74360 + indicating some kind of state change, which means the request
74361 + should be retried and will then get a different result. In some
74362 + cases this could be avoided with some extra code, but
74363 + generally it is done because the necessary locks were
74364 + released as a result of the operation and repeating is the
74365 + simplest thing to do (less bug potential). The cases are:
74366 + atom fusion returns E_REPEAT after it completes (jnode and
74367 + txnh were unlocked); race conditions in assign_block,
74368 + assign_txnh, and init_fusion return E_REPEAT (trylock
74369 + failure); after going to sleep in capture_fuse_wait
74370 + (request was blocked but may now succeed). I'm not quite
74371 + sure how capture_copy works yet, but it may also return
74372 + E_REPEAT. When the request is legitimately blocked, the
74373 + requestor goes to sleep in fuse_wait, so this is not a busy
74374 + loop. */
74375 + /* NOTE-NIKITA: still don't understand:
74376 +
74377 + try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74378 +
74379 + looks like busy loop?
74380 + */
74381 + goto repeat;
74382 + }
74383 +
74384 + /* free extra atom object that was possibly allocated by
74385 + try_capture_block().
74386 +
74387 + Do this before acquiring jnode spin lock to
74388 + minimize time spent under lock. --nikita */
74389 + if (atom_alloc != NULL) {
74390 + kmem_cache_free(_atom_slab, atom_alloc);
74391 + }
74392 +
74393 + if (ret != 0) {
74394 + if (ret == -E_BLOCK) {
74395 + assert("nikita-3360",
74396 + cap_mode & TXN_CAPTURE_NONBLOCKING);
74397 + ret = -E_REPEAT;
74398 + }
74399 +
74400 + /* Failure means jnode is not locked. FIXME_LATER_JMACD May
74401 + want to fix the above code to avoid releasing the lock and
74402 + re-acquiring it, but there are cases where failure occurs
74403 + when the lock is not held, and those cases would need to be
74404 + modified to re-take the lock. */
74405 + spin_lock_jnode(node);
74406 + }
74407 +
74408 + /* Jnode is still locked. */
74409 + assert_spin_locked(&(node->guard));
74410 + return ret;
74411 +}
74412 +
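+/* A minimal usage sketch for reiser4_try_capture(); capture_for_write() is a
+ * hypothetical helper, not part of reiser4, and assumes the caller holds a
+ * reference to @node (compare try_capture_page_to_invalidate() below):
+ *
+ *	static int capture_for_write(jnode * node)
+ *	{
+ *		int ret;
+ *
+ *		spin_lock_jnode(node);
+ *		ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ *		(the jnode spin lock is held again on return)
+ *		spin_unlock_jnode(node);
+ *		return ret;
+ *	}
+ */
+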
74413 +static void release_two_atoms(txn_atom *one, txn_atom *two)
74414 +{
74415 + spin_unlock_atom(one);
74416 + atom_dec_and_unlock(two);
74417 + spin_lock_atom(one);
74418 + atom_dec_and_unlock(one);
74419 +}
74420 +
74421 +/* Note on reiser4_try_capture() above: it sets up a call to try_capture_block and
74422 + repeats as long as -E_REPEAT is returned by that routine. The txn_capture request
74423 + mode is computed depending on the transaction handle's type and the lock request.
74424 + It is called from the depths of the lock manager with the jnode lock held and it
74425 + always returns with the jnode lock held.
74426 +*/
74427 +
74428 +/* fuse all 'active' atoms of lock owners of given node. */
74429 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
74430 +{
74431 + lock_handle *lh;
74432 + int repeat;
74433 + txn_atom *atomh, *atomf;
74434 + reiser4_context *me = get_current_context();
74435 + reiser4_context *ctx = NULL;
74436 +
74437 + assert_spin_not_locked(&(ZJNODE(node)->guard));
74438 + assert_spin_not_locked(&(txnh->hlock));
74439 +
74440 + repeat:
74441 + repeat = 0;
74442 + atomh = txnh_get_atom(txnh);
74443 + spin_unlock_txnh(txnh);
74444 + assert("zam-692", atomh != NULL);
74445 +
74446 + spin_lock_zlock(&node->lock);
74447 + /* inspect list of lock owners */
74448 + list_for_each_entry(lh, &node->lock.owners, owners_link) {
74449 + ctx = get_context_by_lock_stack(lh->owner);
74450 + if (ctx == me)
74451 + continue;
74452 + /* below we use two assumptions to avoid additional spin-locks
74453 + while checking the condition:
74454 +
74455 + 1) if the lock stack holds a lock, the transaction should be
74456 + open, i.e. ctx->trans != NULL;
74457 +
74458 + 2) reading a well-aligned ctx->trans->atom is atomic; if it
74459 + equals the address of the spin-locked atomh, we conclude that
74460 + the atoms are the same and nothing has to be captured. */
74461 + if (atomh != ctx->trans->atom) {
74462 + reiser4_wake_up(lh->owner);
74463 + repeat = 1;
74464 + break;
74465 + }
74466 + }
74467 + if (repeat) {
74468 + if (!spin_trylock_txnh(ctx->trans)) {
74469 + spin_unlock_zlock(&node->lock);
74470 + spin_unlock_atom(atomh);
74471 + goto repeat;
74472 + }
74473 + atomf = ctx->trans->atom;
74474 + if (atomf == NULL) {
74475 + capture_assign_txnh_nolock(atomh, ctx->trans);
74476 + /* release the zlock _after_ assigning the atom to the
74477 + * transaction handle, otherwise the lock owner thread
74478 + * may unlock all znodes, exit kernel context and here
74479 + * we would access an invalid transaction handle. */
74480 + spin_unlock_zlock(&node->lock);
74481 + spin_unlock_atom(atomh);
74482 + spin_unlock_txnh(ctx->trans);
74483 + goto repeat;
74484 + }
74485 + assert("zam-1059", atomf != atomh);
74486 + spin_unlock_zlock(&node->lock);
74487 + atomic_inc(&atomh->refcount);
74488 + atomic_inc(&atomf->refcount);
74489 + spin_unlock_txnh(ctx->trans);
74490 + if (atomf > atomh) {
74491 + spin_lock_atom_nested(atomf);
74492 + } else {
74493 + spin_unlock_atom(atomh);
74494 + spin_lock_atom(atomf);
74495 + spin_lock_atom_nested(atomh);
74496 + }
74497 + if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
74498 + release_two_atoms(atomf, atomh);
74499 + goto repeat;
74500 + }
74501 + atomic_dec(&atomh->refcount);
74502 + atomic_dec(&atomf->refcount);
74503 + capture_fuse_into(atomf, atomh);
74504 + goto repeat;
74505 + }
74506 + spin_unlock_zlock(&node->lock);
74507 + spin_unlock_atom(atomh);
74508 +}
74509 +
74510 +/* This is the interface to capture unformatted nodes via their struct page
74511 + reference. Currently it is only used in reiser4_invalidatepage */
74512 +int try_capture_page_to_invalidate(struct page *pg)
74513 +{
74514 + int ret;
74515 + jnode *node;
74516 +
74517 + assert("umka-292", pg != NULL);
74518 + assert("nikita-2597", PageLocked(pg));
74519 +
74520 + if (IS_ERR(node = jnode_of_page(pg))) {
74521 + return PTR_ERR(node);
74522 + }
74523 +
74524 + spin_lock_jnode(node);
74525 + unlock_page(pg);
74526 +
74527 + ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
74528 + spin_unlock_jnode(node);
74529 + jput(node);
74530 + lock_page(pg);
74531 + return ret;
74532 +}
74533 +
74534 +/* This informs the transaction manager when a node is deleted. It adds the
74535 + block to the atom's delete set and uncaptures the block.
74536 +
74537 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
74538 +explanations. Find all the functions that use it, and unless there is some very
74539 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
74540 +move the loop to inside the function.
74541 +
74542 +VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
74543 + */
74544 +void reiser4_uncapture_page(struct page *pg)
74545 +{
74546 + jnode *node;
74547 + txn_atom *atom;
74548 +
74549 + assert("umka-199", pg != NULL);
74550 + assert("nikita-3155", PageLocked(pg));
74551 +
74552 + clear_page_dirty_for_io(pg);
74553 +
74554 + reiser4_wait_page_writeback(pg);
74555 +
74556 + node = jprivate(pg);
74557 + BUG_ON(node == NULL);
74558 +
74559 + spin_lock_jnode(node);
74560 +
74561 + atom = jnode_get_atom(node);
74562 + if (atom == NULL) {
74563 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74564 + spin_unlock_jnode(node);
74565 + return;
74566 + }
74567 +
74568 + /* We can remove a jnode from the transaction even if it is on a flush
74569 + * queue prepped list; we only need to be sure that the flush queue is
74570 + * not being written by reiser4_write_fq(). reiser4_write_fq() does not
74571 + * use the atom spin lock to protect the prepped nodes list; instead it
74572 + * increments the atom's nr_running_queues counter for the time during
74573 + * which the prepped list is not protected by the spin lock. Here we
74574 + * check this counter if we want to remove the jnode from a flush queue
74575 + * and, if the counter is not zero, wait for all reiser4_write_fq()
74576 + * calls for this atom to complete. This is not significant overhead. */
74577 + while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
74578 + spin_unlock_jnode(node);
74579 + /*
74580 + * at this moment we want to wait for "atom event", viz. wait
74581 + * until @node can be removed from flush queue. But
74582 + * reiser4_atom_wait_event() cannot be called with page locked,
74583 + * because it deadlocks with jnode_extent_write(). Unlock page,
74584 + * after making sure (through page_cache_get()) that it cannot
74585 + * be released from memory.
74586 + */
74587 + page_cache_get(pg);
74588 + unlock_page(pg);
74589 + reiser4_atom_wait_event(atom);
74590 + lock_page(pg);
74591 + /*
74592 + * page may have been detached by ->writepage()->releasepage().
74593 + */
74594 + reiser4_wait_page_writeback(pg);
74595 + spin_lock_jnode(node);
74596 + page_cache_release(pg);
74597 + atom = jnode_get_atom(node);
74598 +/* VS-FIXME-HANS: improve the commenting in this function */
74599 + if (atom == NULL) {
74600 + spin_unlock_jnode(node);
74601 + return;
74602 + }
74603 + }
74604 + reiser4_uncapture_block(node);
74605 + spin_unlock_atom(atom);
74606 + jput(node);
74607 +}
74608 +
74609 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
74610 + * inode's tree of jnodes */
74611 +void reiser4_uncapture_jnode(jnode * node)
74612 +{
74613 + txn_atom *atom;
74614 +
74615 + assert_spin_locked(&(node->guard));
74616 + assert("", node->pg == 0);
74617 +
74618 + atom = jnode_get_atom(node);
74619 + if (atom == NULL) {
74620 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74621 + spin_unlock_jnode(node);
74622 + return;
74623 + }
74624 +
74625 + reiser4_uncapture_block(node);
74626 + spin_unlock_atom(atom);
74627 + jput(node);
74628 +}
74629 +
74630 +/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
74631 + increases atom refcount and txnh_count, adds to txnh_list. */
74632 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
74633 +{
74634 + assert("umka-200", atom != NULL);
74635 + assert("umka-201", txnh != NULL);
74636 +
74637 + assert_spin_locked(&(txnh->hlock));
74638 + assert_spin_locked(&(atom->alock));
74639 + assert("jmacd-824", txnh->atom == NULL);
74640 + assert("nikita-3540", atom_isopen(atom));
74641 + BUG_ON(txnh->atom != NULL);
74642 +
74643 + atomic_inc(&atom->refcount);
74644 + txnh->atom = atom;
74645 + reiser4_ctx_gfp_mask_set();
74646 + list_add_tail(&txnh->txnh_link, &atom->txnh_list);
74647 + atom->txnh_count += 1;
74648 +}
74649 +
74650 +/* No-locking version of assign_block. Sets the block's atom pointer, references the
74651 + block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
74652 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
74653 +{
74654 + assert("umka-202", atom != NULL);
74655 + assert("umka-203", node != NULL);
74656 + assert_spin_locked(&(node->guard));
74657 + assert_spin_locked(&(atom->alock));
74658 + assert("jmacd-323", node->atom == NULL);
74659 + BUG_ON(!list_empty_careful(&node->capture_link));
74660 + assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
74661 +
74662 + /* Pointer from jnode to atom is not counted in atom->refcount. */
74663 + node->atom = atom;
74664 +
74665 + list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
74666 + atom->capture_count += 1;
74667 + /* reference to jnode is acquired by atom. */
74668 + jref(node);
74669 +
74670 + ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
74671 +
74672 + LOCK_CNT_INC(t_refs);
74673 +}
74674 +
74675 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
74676 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
74677 +{
74678 + assert_spin_locked(&(node->guard));
74679 + assert_spin_locked(&(atom->alock));
74680 + assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
74681 +
74682 + JF_SET(node, JNODE_DIRTY);
74683 +
74684 + get_current_context()->nr_marked_dirty++;
74685 +
74686 + /* We grab2flush_reserve one additional block only if the node was
74687 + not CREATED and jnode_flush sorted it into neither the
74688 + relocate set nor the overwrite one. If the node is in the
74689 + overwrite or relocate set we assume that the atom's flush
74690 + reserved counter was already adjusted. */
74691 + if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
74692 + && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
74693 + && !jnode_is_cluster_page(node)) {
74694 + assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
74695 + assert("vs-1506", *jnode_get_block(node) != 0);
74696 + grabbed2flush_reserved_nolock(atom, (__u64) 1);
74697 + JF_SET(node, JNODE_FLUSH_RESERVED);
74698 + }
74699 +
74700 + if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74701 + /* Sometimes a node is set dirty before being captured -- the case
74702 + for new jnodes. In that case the jnode will be added to the
74703 + appropriate list in capture_assign_block_nolock. Another reason
74704 + not to re-link the jnode is that it is on a flush queue (see
74705 + flush.c for details). */
74707 +
74708 + int level = jnode_get_level(node);
74709 +
74710 + assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
74711 + assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
74712 + assert("nikita-2607", 0 <= level);
74713 + assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
74714 +
74715 + /* move node to atom's dirty list */
74716 + list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
74717 + ON_DEBUG(count_jnode
74718 + (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
74719 + }
74720 +}
74721 +
74722 +/* Set the dirty status for this (spin locked) jnode. */
74723 +void jnode_make_dirty_locked(jnode * node)
74724 +{
74725 + assert("umka-204", node != NULL);
74726 + assert_spin_locked(&(node->guard));
74727 +
74728 + if (REISER4_DEBUG && rofs_jnode(node)) {
74729 + warning("nikita-3365", "Dirtying jnode on rofs");
74730 + dump_stack();
74731 + }
74732 +
74733 + /* Fast check for already dirty node */
74734 + if (!JF_ISSET(node, JNODE_DIRTY)) {
74735 + txn_atom *atom;
74736 +
74737 + atom = jnode_get_atom(node);
74738 + assert("vs-1094", atom);
74739 + /* Check jnode dirty status again because node spin lock might
74740 + * be released inside jnode_get_atom(). */
74741 + if (likely(!JF_ISSET(node, JNODE_DIRTY)))
74742 + do_jnode_make_dirty(node, atom);
74743 + spin_unlock_atom(atom);
74744 + }
74745 +}
74746 +
74747 +/* Set the dirty status for this znode. */
74748 +void znode_make_dirty(znode * z)
74749 +{
74750 + jnode *node;
74751 + struct page *page;
74752 +
74753 + assert("umka-204", z != NULL);
74754 + assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
74755 + assert("nikita-3560", znode_is_write_locked(z));
74756 +
74757 + node = ZJNODE(z);
74758 + /* znode is longterm locked, we can check dirty bit without spinlock */
74759 + if (JF_ISSET(node, JNODE_DIRTY)) {
74760 + /* znode is dirty already. All we have to do is to change znode version */
74761 + z->version = znode_build_version(jnode_get_tree(node));
74762 + return;
74763 + }
74764 +
74765 + spin_lock_jnode(node);
74766 + jnode_make_dirty_locked(node);
74767 + page = jnode_page(node);
74768 + if (page != NULL) {
74769 + /* this is a useful assertion (allows one to check that no
74770 + * modifications are lost due to update of in-flight page),
74771 + * but it requires locking on page to check PG_writeback
74772 + * bit. */
74773 + /* assert("nikita-3292",
74774 + !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
74775 + page_cache_get(page);
74776 +
74777 + /* jnode lock is not needed for the rest of
74778 + * znode_set_dirty(). */
74779 + spin_unlock_jnode(node);
74780 + /* reiser4 file write code calls set_page_dirty for
74781 + * unformatted nodes, for formatted nodes we do it here. */
74782 + reiser4_set_page_dirty_internal(page);
74783 + page_cache_release(page);
74784 + /* bump version counter in znode */
74785 + z->version = znode_build_version(jnode_get_tree(node));
74786 + } else {
74787 + assert("zam-596", znode_above_root(JZNODE(node)));
74788 + spin_unlock_jnode(node);
74789 + }
74790 +
74791 + assert("nikita-1900", znode_is_write_locked(z));
74792 + assert("jmacd-9777", node->atom != NULL);
74793 +}
74794 +
74795 +int reiser4_sync_atom(txn_atom * atom)
74796 +{
74797 + int result;
74798 + txn_handle *txnh;
74799 +
74800 + txnh = get_current_context()->trans;
74801 +
74802 + result = 0;
74803 + if (atom != NULL) {
74804 + if (atom->stage < ASTAGE_PRE_COMMIT) {
74805 + spin_lock_txnh(txnh);
74806 + capture_assign_txnh_nolock(atom, txnh);
74807 + result = force_commit_atom(txnh);
74808 + } else if (atom->stage < ASTAGE_POST_COMMIT) {
74809 + /* wait atom commit */
74810 + reiser4_atom_wait_event(atom);
74811 + /* try once more */
74812 + result = RETERR(-E_REPEAT);
74813 + } else
74814 + spin_unlock_atom(atom);
74815 + }
74816 + return result;
74817 +}
74818 +
74819 +#if REISER4_DEBUG
74820 +
74821 +/* move jnode from one list to another;
74822 + call this after atom->capture_count is updated */
74823 +void
74824 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
74825 + atom_list new_list, int check_lists)
74826 +{
74827 + struct list_head *pos;
74828 +
74829 + assert("zam-1018", atom_is_protected(atom));
74830 + assert_spin_locked(&(node->guard));
74831 + assert("", NODE_LIST(node) == old_list);
74832 +
74833 + switch (NODE_LIST(node)) {
74834 + case NOT_CAPTURED:
74835 + break;
74836 + case DIRTY_LIST:
74837 + assert("", atom->dirty > 0);
74838 + atom->dirty--;
74839 + break;
74840 + case CLEAN_LIST:
74841 + assert("", atom->clean > 0);
74842 + atom->clean--;
74843 + break;
74844 + case FQ_LIST:
74845 + assert("", atom->fq > 0);
74846 + atom->fq--;
74847 + break;
74848 + case WB_LIST:
74849 + assert("", atom->wb > 0);
74850 + atom->wb--;
74851 + break;
74852 + case OVRWR_LIST:
74853 + assert("", atom->ovrwr > 0);
74854 + atom->ovrwr--;
74855 + break;
74856 + default:
74857 + impossible("", "");
74858 + }
74859 +
74860 + switch (new_list) {
74861 + case NOT_CAPTURED:
74862 + break;
74863 + case DIRTY_LIST:
74864 + atom->dirty++;
74865 + break;
74866 + case CLEAN_LIST:
74867 + atom->clean++;
74868 + break;
74869 + case FQ_LIST:
74870 + atom->fq++;
74871 + break;
74872 + case WB_LIST:
74873 + atom->wb++;
74874 + break;
74875 + case OVRWR_LIST:
74876 + atom->ovrwr++;
74877 + break;
74878 + default:
74879 + impossible("", "");
74880 + }
74881 + ASSIGN_NODE_LIST(node, new_list);
74882 + if (0 && check_lists) {
74883 + int count;
74884 + tree_level level;
74885 +
74886 + count = 0;
74887 +
74888 + /* flush queue list */
74889 + /* reiser4_check_fq(atom); */
74890 +
74891 + /* dirty list */
74892 + count = 0;
74893 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
74894 + list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
74895 + count++;
74896 + }
74897 + if (count != atom->dirty)
74898 + warning("", "dirty counter %d, real %d\n", atom->dirty,
74899 + count);
74900 +
74901 + /* clean list */
74902 + count = 0;
74903 + list_for_each(pos, ATOM_CLEAN_LIST(atom))
74904 + count++;
74905 + if (count != atom->clean)
74906 + warning("", "clean counter %d, real %d\n", atom->clean,
74907 + count);
74908 +
74909 + /* wb list */
74910 + count = 0;
74911 + list_for_each(pos, ATOM_WB_LIST(atom))
74912 + count++;
74913 + if (count != atom->wb)
74914 + warning("", "wb counter %d, real %d\n", atom->wb,
74915 + count);
74916 +
74917 + /* overwrite list */
74918 + count = 0;
74919 + list_for_each(pos, ATOM_OVRWR_LIST(atom))
74920 + count++;
74921 +
74922 + if (count != atom->ovrwr)
74923 + warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
74924 + count);
74925 + }
74926 + assert("vs-1624", atom->num_queued == atom->fq);
74927 + if (atom->capture_count !=
74928 + atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
74929 + printk
74930 + ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
74931 + atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
74932 + atom->wb, atom->fq);
74933 + assert("vs-1622",
74934 + atom->capture_count ==
74935 + atom->dirty + atom->clean + atom->ovrwr + atom->wb +
74936 + atom->fq);
74937 + }
74938 +}
74939 +
74940 +#endif
74941 +
74942 +/* Make node OVRWR and put it on the atom->overwrite_nodes list; the atom lock
74943 + * and jnode lock should be taken before calling this function. */
74944 +void jnode_make_wander_nolock(jnode * node)
74945 +{
74946 + txn_atom *atom;
74947 +
74948 + assert("nikita-2431", node != NULL);
74949 + assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
74950 + assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
74951 + assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
74952 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
74953 +
74954 + atom = node->atom;
74955 +
74956 + assert("zam-895", atom != NULL);
74957 + assert("zam-894", atom_is_protected(atom));
74958 +
74959 + JF_SET(node, JNODE_OVRWR);
74960 + /* move node to atom's overwrite list */
74961 + list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
74962 + ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
74963 +}
74964 +
74965 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
74966 + * this function. */
74967 +void jnode_make_wander(jnode * node)
74968 +{
74969 + txn_atom *atom;
74970 +
74971 + spin_lock_jnode(node);
74972 + atom = jnode_get_atom(node);
74973 + assert("zam-913", atom != NULL);
74974 + assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
74975 +
74976 + jnode_make_wander_nolock(node);
74977 + spin_unlock_atom(atom);
74978 + spin_unlock_jnode(node);
74979 +}
74980 +
74981 +/* this just sets RELOC bit */
74982 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
74983 +{
74984 + assert_spin_locked(&(node->guard));
74985 + assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
74986 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
74987 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
74988 + assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
74989 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
74990 + jnode_set_reloc(node);
74991 +}
74992 +
74993 +/* Make znode RELOC and put it on flush queue */
74994 +void znode_make_reloc(znode * z, flush_queue_t * fq)
74995 +{
74996 + jnode *node;
74997 + txn_atom *atom;
74998 +
74999 + node = ZJNODE(z);
75000 + spin_lock_jnode(node);
75001 +
75002 + atom = jnode_get_atom(node);
75003 + assert("zam-919", atom != NULL);
75004 +
75005 + jnode_make_reloc_nolock(fq, node);
75006 + queue_jnode(fq, node);
75007 +
75008 + spin_unlock_atom(atom);
75009 + spin_unlock_jnode(node);
75011 +}
75012 +
75013 +/* Make unformatted node RELOC and put it on flush queue */
75014 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75015 +{
75016 + assert("vs-1479", jnode_is_unformatted(node));
75017 +
75018 + jnode_make_reloc_nolock(fq, node);
75019 + queue_jnode(fq, node);
75020 +}
75021 +
75022 +int reiser4_capture_super_block(struct super_block *s)
75023 +{
75024 + int result;
75025 + znode *uber;
75026 + lock_handle lh;
75027 +
75028 + init_lh(&lh);
75029 + result = get_uber_znode(reiser4_get_tree(s),
75030 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75031 + if (result)
75032 + return result;
75033 +
75034 + uber = lh.node;
75035 + /* Grabbing one block for superblock */
75036 + result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75037 + if (result != 0) {
+ done_lh(&lh);
+ return result;
+ }
75039 +
75040 + znode_make_dirty(uber);
75041 +
75042 + done_lh(&lh);
75043 + return 0;
75044 +}
75045 +
75046 +/* Wake up every handle on the atom's WAITFOR list */
75047 +static void wakeup_atom_waitfor_list(txn_atom * atom)
75048 +{
75049 + txn_wait_links *wlinks;
75050 +
75051 + assert("umka-210", atom != NULL);
75052 +
75053 + /* atom is locked */
75054 + list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75055 + if (wlinks->waitfor_cb == NULL ||
75056 + wlinks->waitfor_cb(atom, wlinks))
75057 + /* Wake up. */
75058 + reiser4_wake_up(wlinks->_lock_stack);
75059 + }
75060 +}
75061 +
75062 +/* Wake up every handle on the atom's WAITING list */
75063 +static void wakeup_atom_waiting_list(txn_atom * atom)
75064 +{
75065 + txn_wait_links *wlinks;
75066 +
75067 + assert("umka-211", atom != NULL);
75068 +
75069 + /* atom is locked */
75070 + list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75071 + if (wlinks->waiting_cb == NULL ||
75072 + wlinks->waiting_cb(atom, wlinks))
75073 + /* Wake up. */
75074 + reiser4_wake_up(wlinks->_lock_stack);
75075 + }
75076 +}
75077 +
75078 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75079 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75080 +{
75081 + assert("nikita-3330", atom != NULL);
75082 + assert_spin_locked(&(atom->alock));
75083 +
75084 + /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75085 + * the last transaction handle. */
75086 + return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75087 +}
75088 +
75089 +/* The general purpose of this function is to wait on the first of two possible events.
75090 + The situation is that a handle (and its atom atomh) is blocked trying to capture a
75091 + block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75092 + handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75093 + another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75094 + needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75095 + proceed and fuse the two atoms in the CAPTURE_WAIT state.
75096 +
75097 + In other words, if either atomh or atomf change state, the handle will be awakened,
75098 + thus there are two lists per atom: WAITING and WAITFOR.
75099 +
75100 + This is also called by capture_assign_txnh with (atomh == NULL) when the handle
75101 + waits for atomf to close but is not assigned to an atom of its own.
75102 +
75103 + Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75104 + BOTH_ATOM_LOCKS. Result: all four locks are released.
75105 +*/
75106 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75107 + txn_atom * atomh, txn_capture mode)
75108 +{
75109 + int ret;
75110 + txn_wait_links wlinks;
75111 +
75112 + assert("umka-213", txnh != NULL);
75113 + assert("umka-214", atomf != NULL);
75114 +
75115 + if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75116 + spin_unlock_txnh(txnh);
75117 + spin_unlock_atom(atomf);
75118 +
75119 + if (atomh) {
75120 + spin_unlock_atom(atomh);
75121 + }
75122 +
75123 + return RETERR(-E_BLOCK);
75124 + }
75125 +
75126 + /* Initialize the waiting list links. */
75127 + init_wlinks(&wlinks);
75128 +
75129 + /* Add txnh to atomf's waitfor list, unlock atomf. */
75130 + list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75131 + wlinks.waitfor_cb = wait_for_fusion;
75132 + atomic_inc(&atomf->refcount);
75133 + spin_unlock_atom(atomf);
75134 +
75135 + if (atomh) {
75136 + /* Add txnh to atomh's waiting list, unlock atomh. */
75137 + list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75138 + atomic_inc(&atomh->refcount);
75139 + spin_unlock_atom(atomh);
75140 + }
75141 +
75142 + /* Go to sleep. */
75143 + spin_unlock_txnh(txnh);
75144 +
75145 + ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
75146 + if (ret == 0) {
75147 + reiser4_go_to_sleep(wlinks._lock_stack);
75148 + ret = RETERR(-E_REPEAT);
75149 + }
75150 +
75151 + /* Remove from the waitfor list. */
75152 + spin_lock_atom(atomf);
75153 +
75154 + list_del(&wlinks._fwaitfor_link);
75155 + atom_dec_and_unlock(atomf);
75156 +
75157 + if (atomh) {
75158 + /* Remove from the waiting list. */
75159 + spin_lock_atom(atomh);
75160 + list_del(&wlinks._fwaiting_link);
75161 + atom_dec_and_unlock(atomh);
75162 + }
75163 + return ret;
75164 +}
75165 +
75166 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
75167 +{
75168 + assert("zam-1067", one != two);
75169 +
75170 + /* lock the atom with the lower address first */
75171 + if (one < two) {
75172 + spin_lock_atom(one);
75173 + spin_lock_atom_nested(two);
75174 + } else {
75175 + spin_lock_atom(two);
75176 + spin_lock_atom_nested(one);
75177 + }
75178 +}
75179 +
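+/* An illustrative note on lock_two_atoms() (a sketch, not reiser4 code):
+ * taking both locks in a globally consistent order is what rules out the
+ * classic ABBA deadlock. Even if two threads pass the same atoms in
+ * opposite argument order,
+ *
+ *	void thread1(txn_atom * a, txn_atom * b) { lock_two_atoms(a, b); }
+ *	void thread2(txn_atom * a, txn_atom * b) { lock_two_atoms(b, a); }
+ *
+ * both end up locking the lower-addressed atom first, so one thread simply
+ * waits for the other instead of deadlocking.
+ */
+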
75180 +/* Perform the necessary work to prepare for fusing two atoms, which involves
75181 + * acquiring two atom locks in the proper order. If the node's atom is
75182 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75183 + * atom is not, then the handle's request is put to sleep. If the node's atom
75184 + * is committing, then the node can be copy-on-captured. Otherwise, pick the
75185 + * atom with fewer pointers to be fused into the atom with more pointers and
75186 + * call capture_fuse_into.
75187 + */
75188 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75189 +{
75190 + txn_atom * txnh_atom = txnh->atom;
75191 + txn_atom * block_atom = node->atom;
75192 +
75193 + atomic_inc(&txnh_atom->refcount);
75194 + atomic_inc(&block_atom->refcount);
75195 +
75196 + spin_unlock_txnh(txnh);
75197 + spin_unlock_jnode(node);
75198 +
75199 + lock_two_atoms(txnh_atom, block_atom);
75200 +
75201 + if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75202 + release_two_atoms(txnh_atom, block_atom);
75203 + return RETERR(-E_REPEAT);
75204 + }
75205 +
75206 + atomic_dec(&txnh_atom->refcount);
75207 + atomic_dec(&block_atom->refcount);
75208 +
75209 + assert ("zam-1066", atom_isopen(txnh_atom));
75210 +
75211 + if (txnh_atom->stage >= block_atom->stage ||
75212 + (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75213 + capture_fuse_into(txnh_atom, block_atom);
75214 + return RETERR(-E_REPEAT);
75215 + }
75216 + spin_lock_txnh(txnh);
75217 + return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75218 +}
75219 +
75220 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
75221 + the small list to point to the large atom. Returns the length of the small list. */
75222 +static int
75223 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75224 + struct list_head *small_head)
75225 +{
75226 + int count = 0;
75227 + jnode *node;
75228 +
75229 + assert("umka-218", large != NULL);
75230 + assert("umka-219", large_head != NULL);
75231 + assert("umka-220", small_head != NULL);
75232 + /* small atom should be locked also. */
75233 + assert_spin_locked(&(large->alock));
75234 +
75235 + /* For every jnode on small's capture list... */
75236 + list_for_each_entry(node, small_head, capture_link) {
75237 + count += 1;
75238 +
75239 + /* With the jnode lock held, update atom pointer. */
75240 + spin_lock_jnode(node);
75241 + node->atom = large;
75242 + spin_unlock_jnode(node);
75243 + }
75244 +
75245 + /* Splice the lists. */
75246 + list_splice_init(small_head, large_head->prev);
75247 +
75248 + return count;
75249 +}
75250 +
75251 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
75252 + the small list to point to the large atom. Returns the length of the small list. */
75253 +static int
75254 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75255 + struct list_head *small_head)
75256 +{
75257 + int count = 0;
75258 + txn_handle *txnh;
75259 +
75260 + assert("umka-221", large != NULL);
75261 + assert("umka-222", large_head != NULL);
75262 + assert("umka-223", small_head != NULL);
75263 +
75264 + /* Adjust every txnh to the new atom. */
75265 + list_for_each_entry(txnh, small_head, txnh_link) {
75266 + count += 1;
75267 +
75268 + /* With the txnh lock held, update atom pointer. */
75269 + spin_lock_txnh(txnh);
75270 + txnh->atom = large;
75271 + spin_unlock_txnh(txnh);
75272 + }
75273 +
75274 + /* Splice the txn_handle list. */
75275 + list_splice_init(small_head, large_head->prev);
75276 +
75277 + return count;
75278 +}
75279 +
75280 +/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
75281 + added to LARGE and their ->atom pointers are all updated. The associated counts are
75282 + updated as well, and any waiting handles belonging to either are awakened. Finally the
75283 + smaller atom's refcount is decremented.
75284 +*/
75285 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
75286 +{
75287 + int level;
75288 + unsigned zcount = 0;
75289 + unsigned tcount = 0;
75290 +
75291 + assert("umka-224", small != NULL);
75292 + assert("umka-225", small != NULL);
75293 +
75294 + assert_spin_locked(&(large->alock));
75295 + assert_spin_locked(&(small->alock));
75296 +
75297 + assert("jmacd-201", atom_isopen(small));
75298 + assert("jmacd-202", atom_isopen(large));
75299 +
75300 + /* Splice and update the per-level dirty jnode lists */
75301 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75302 + zcount +=
75303 + capture_fuse_jnode_lists(large,
75304 + ATOM_DIRTY_LIST(large, level),
75305 + ATOM_DIRTY_LIST(small, level));
75306 + }
75307 +
75308 + /* Splice and update the clean, overwrite, writeback, inode and txnh lists */
75309 + zcount +=
75310 + capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75311 + ATOM_CLEAN_LIST(small));
75312 + zcount +=
75313 + capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75314 + ATOM_OVRWR_LIST(small));
75315 + zcount +=
75316 + capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75317 + ATOM_WB_LIST(small));
75318 + zcount +=
75319 + capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75320 + tcount +=
75321 + capture_fuse_txnh_lists(large, &large->txnh_list,
75322 + &small->txnh_list);
75323 +
75324 + /* Check our accounting. */
75325 + assert("jmacd-1063",
75326 + zcount + small->num_queued == small->capture_count);
75327 + assert("jmacd-1065", tcount == small->txnh_count);
75328 +
75329 + /* sum the numbers of waiting threads */
75330 + large->nr_waiters += small->nr_waiters;
75331 + small->nr_waiters = 0;
75332 +
75333 + /* splice flush queues */
75334 + reiser4_fuse_fq(large, small);
75335 +
75336 + /* update counters of jnodes on every atom list */
75337 + ON_DEBUG(large->dirty += small->dirty;
75338 + small->dirty = 0;
75339 + large->clean += small->clean;
75340 + small->clean = 0;
75341 + large->ovrwr += small->ovrwr;
75342 + small->ovrwr = 0;
75343 + large->wb += small->wb;
75344 + small->wb = 0;
75345 + large->fq += small->fq;
75346 + small->fq = 0;);
75347 +
75348 + /* count flushers in result atom */
75349 + large->nr_flushers += small->nr_flushers;
75350 + small->nr_flushers = 0;
75351 +
75352 + /* update counts of flushed nodes */
75353 + large->flushed += small->flushed;
75354 + small->flushed = 0;
75355 +
75356 + /* Transfer list counts to large. */
75357 + large->txnh_count += small->txnh_count;
75358 + large->capture_count += small->capture_count;
75359 +
75360 + /* Add all txnh references to large. */
75361 + atomic_add(small->txnh_count, &large->refcount);
75362 + atomic_sub(small->txnh_count, &small->refcount);
75363 +
75364 + /* Reset small counts */
75365 + small->txnh_count = 0;
75366 + small->capture_count = 0;
75367 +
75368 + /* Assign the oldest start_time, merge flags. */
75369 + large->start_time = min(large->start_time, small->start_time);
75370 + large->flags |= small->flags;
75371 +
75372 + /* Merge blocknr sets. */
75373 + blocknr_set_merge(&small->delete_set, &large->delete_set);
75374 + blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75375 +
75376 + /* Merge allocated/deleted file counts */
75377 + large->nr_objects_deleted += small->nr_objects_deleted;
75378 + large->nr_objects_created += small->nr_objects_created;
75379 +
75380 + small->nr_objects_deleted = 0;
75381 + small->nr_objects_created = 0;
75382 +
75383 + /* Merge allocated blocks counts */
75384 + large->nr_blocks_allocated += small->nr_blocks_allocated;
75385 +
75386 + large->nr_running_queues += small->nr_running_queues;
75387 + small->nr_running_queues = 0;
75388 +
75389 + /* Merge blocks reserved for overwrite set. */
75390 + large->flush_reserved += small->flush_reserved;
75391 + small->flush_reserved = 0;
75392 +
75393 + if (large->stage < small->stage) {
75394 + /* Large only needs to notify if it has changed state. */
75395 + reiser4_atom_set_stage(large, small->stage);
75396 + wakeup_atom_waiting_list(large);
75397 + }
75398 +
75399 + reiser4_atom_set_stage(small, ASTAGE_INVALID);
75400 +
75401 + /* Notify any waiters--small needs to unload its wait lists. Waiters
75402 + actually remove themselves from the list before returning from the
75403 + fuse_wait function. */
75404 + wakeup_atom_waiting_list(small);
75405 +
75406 + /* Unlock atoms */
75407 + spin_unlock_atom(large);
75408 + atom_dec_and_unlock(small);
75409 +}
75410 +
75411 +/* TXNMGR STUFF */
75412 +
75413 +/* Release a block from the atom, reversing the effects of being captured;
75414 + this does not release the atom's reference to the jnode, because spin-locks are held.
75415 + Currently this is only called when the atom commits.
75416 +
75417 + NOTE: this function does not release a (journal) reference to jnode
75418 + due to locking optimizations, you should call jput() somewhere after
75419 + calling reiser4_uncapture_block(). */
75420 +void reiser4_uncapture_block(jnode * node)
75421 +{
75422 + txn_atom *atom;
75423 +
75424 + assert("umka-226", node != NULL);
75425 + atom = node->atom;
75426 + assert("umka-228", atom != NULL);
75427 +
75428 + assert("jmacd-1021", node->atom == atom);
75429 + assert_spin_locked(&(node->guard));
75430 + assert("jmacd-1023", atom_is_protected(atom));
75431 +
75432 + JF_CLR(node, JNODE_DIRTY);
75433 + JF_CLR(node, JNODE_RELOC);
75434 + JF_CLR(node, JNODE_OVRWR);
75435 + JF_CLR(node, JNODE_CREATED);
75436 + JF_CLR(node, JNODE_WRITEBACK);
75437 + JF_CLR(node, JNODE_REPACK);
75438 +
75439 + list_del_init(&node->capture_link);
75440 + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75441 + assert("zam-925", atom_isopen(atom));
75442 + assert("vs-1623", NODE_LIST(node) == FQ_LIST);
75443 + ON_DEBUG(atom->num_queued--);
75444 + JF_CLR(node, JNODE_FLUSH_QUEUED);
75445 + }
75446 + atom->capture_count -= 1;
75447 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
75448 + node->atom = NULL;
75449 +
75450 + spin_unlock_jnode(node);
75451 + LOCK_CNT_DEC(t_refs);
75452 +}
75453 +
75454 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
75455 + bitmap-based allocator code for adding modified bitmap blocks to the
75456 + transaction. @atom and @node are spin locked */
75457 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
75458 +{
75459 + assert("zam-538", atom_is_protected(atom));
75460 + assert_spin_locked(&(node->guard));
75461 + assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
75462 + assert("zam-543", node->atom == NULL);
75463 + assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
75464 +
75465 + list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
75466 + jref(node);
75467 + node->atom = atom;
75468 + atom->capture_count++;
75469 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
75470 +}
75471 +
75472 +static int count_deleted_blocks_actor(txn_atom * atom,
75473 + const reiser4_block_nr * a,
75474 + const reiser4_block_nr * b, void *data)
75475 +{
75476 + reiser4_block_nr *counter = data;
75477 +
75478 + assert("zam-995", data != NULL);
75479 + assert("zam-996", a != NULL);
75480 + if (b == NULL)
75481 + *counter += 1;
75482 + else
75483 + *counter += *b;
75484 + return 0;
75485 +}
75486 +
75487 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
75488 +{
75489 + reiser4_block_nr result;
75490 + txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
75491 + txn_atom *atom;
75492 +
75493 + result = 0;
75494 +
75495 + spin_lock_txnmgr(tmgr);
75496 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
75497 + spin_lock_atom(atom);
75498 + if (atom_isopen(atom))
75499 + blocknr_set_iterator(
75500 + atom, &atom->delete_set,
75501 + count_deleted_blocks_actor, &result, 0);
75502 + spin_unlock_atom(atom);
75503 + }
75504 + spin_unlock_txnmgr(tmgr);
75505 +
75506 + return result;
75507 +}
75508 +
75509 +/*
75510 + * Local variables:
75511 + * c-indentation-style: "K&R"
75512 + * mode-name: "LC"
75513 + * c-basic-offset: 8
75514 + * tab-width: 8
75515 + * fill-column: 79
75516 + * End:
75517 + */
75518 diff -urN linux-2.6.22.orig/fs/reiser4/txnmgr.h linux-2.6.22/fs/reiser4/txnmgr.h
75519 --- linux-2.6.22.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
75520 +++ linux-2.6.22/fs/reiser4/txnmgr.h 2007-07-29 00:25:35.044739961 +0400
75521 @@ -0,0 +1,701 @@
75522 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75523 + * reiser4/README */
75524 +
75525 +/* data-types and function declarations for transaction manager. See txnmgr.c
75526 + * for details. */
75527 +
75528 +#ifndef __REISER4_TXNMGR_H__
75529 +#define __REISER4_TXNMGR_H__
75530 +
75531 +#include "forward.h"
75532 +#include "dformat.h"
75533 +
75534 +#include <linux/fs.h>
75535 +#include <linux/mm.h>
75536 +#include <linux/types.h>
75537 +#include <linux/spinlock.h>
75538 +#include <asm/atomic.h>
75539 +#include <linux/wait.h>
75540 +
75541 +/* TYPE DECLARATIONS */
75542 +
75543 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
75544 + A capture request dynamically assigns a block to the calling thread's transaction
75545 + handle. */
75546 +typedef enum {
75547 + /* A READ_ATOMIC request indicates that a block will be read and that the caller's
75548 + atom should fuse in order to ensure that the block commits atomically with the
75549 + caller. */
75550 + TXN_CAPTURE_READ_ATOMIC = (1 << 0),
75551 +
75552 + /* A READ_NONCOM request indicates that a block will be read and that the caller is
75553 + willing to read a non-committed block without causing atoms to fuse. */
75554 + TXN_CAPTURE_READ_NONCOM = (1 << 1),
75555 +
75556 + /* A READ_MODIFY request indicates that a block will be read but that the caller
75557 + wishes for the block to be captured as it will be written. This capture request
75558 + mode is not currently used, but eventually it will be useful for preventing
75559 + deadlock in read-modify-write cycles. */
75560 + TXN_CAPTURE_READ_MODIFY = (1 << 2),
75561 +
75562 + /* A WRITE capture request indicates that a block will be modified and that atoms
75563 + should fuse to make the commit atomic. */
75564 + TXN_CAPTURE_WRITE = (1 << 3),
75565 +
75566 + /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
75567 + exclusive type designation from extra bits that may be supplied -- see
75568 + below. */
75569 + TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
75570 + TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
75571 + TXN_CAPTURE_WRITE),
75572 +
75573 + /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
75574 + indicate modification will occur. */
75575 + TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
75576 +
75577 + /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
75578 + prefer not to sleep waiting for an aging atom to commit. */
75579 + TXN_CAPTURE_NONBLOCKING = (1 << 4),
75580 +
75581 + /* An option to reiser4_try_capture to prevent atom fusion, just simple
75582 + capturing is allowed */
75583 + TXN_CAPTURE_DONT_FUSE = (1 << 5)
75584 +
75585 + /* This macro selects only the exclusive capture request types, stripping out any
75586 + options that were supplied (e.g., NONBLOCKING). */
75587 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
75588 +} txn_capture;
75589 +
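+/* A brief illustrative sketch of composing a capture request from the bits
+ * above:
+ *
+ *	txn_capture req = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;
+ *
+ * CAPTURE_TYPE(req) is TXN_CAPTURE_WRITE, and (req & TXN_CAPTURE_WTYPES)
+ * is non-zero, i.e. the request indicates that modification will occur.
+ */
+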
75590 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only
75591 + difference is in the handling of read requests. A WRITE_FUSING transaction handle
75592 + defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
75593 + transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
75594 +typedef enum {
75595 + TXN_WRITE_FUSING = (1 << 0),
75596 + TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
75597 +} txn_mode;
75598 +
75599 +/* Every atom has a stage, which is one of these exclusive values: */
75600 +typedef enum {
75601 + /* Initially an atom is free. */
75602 + ASTAGE_FREE = 0,
75603 +
75604 + /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
75605 + blocks and fuse with other atoms. */
75606 + ASTAGE_CAPTURE_FUSE = 1,
75607 +
75608 + /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
75609 +
75610 + /* When an atom reaches a certain age it must do all it can to commit. An atom in
75611 + the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
75612 + atoms in the CAPTURE_FUSE stage. */
75613 + ASTAGE_CAPTURE_WAIT = 2,
75614 +
75615 + /* Waiting for I/O before commit. Copy-on-capture (see
75616 + http://namesys.com/v4/v4.html). */
75617 + ASTAGE_PRE_COMMIT = 3,
75618 +
75619 + /* Post-commit overwrite I/O. Steal-on-capture. */
75620 + ASTAGE_POST_COMMIT = 4,
75621 +
75622 + /* An atom that waits for the removal of the last reference to it
75623 + * before being deleted from memory */
75624 + ASTAGE_DONE = 5,
75625 +
75626 + /* invalid atom. */
75627 + ASTAGE_INVALID = 6,
75628 +
75629 +} txn_stage;
75630 +
75631 +/* Certain flags may be set in the txn_atom->flags field. */
75632 +typedef enum {
75633 + /* Indicates that the atom should commit as soon as possible. */
75634 + ATOM_FORCE_COMMIT = (1 << 0),
75635 + /* to avoid endless loop, mark the atom (which was considered as too
75636 + * small) after failed attempt to fuse it. */
75637 + ATOM_CANCEL_FUSION = (1 << 1)
75638 +} txn_flags;
75639 +
75640 +/* Flags for controlling commit_txnh */
75641 +typedef enum {
75642 + /* Wait commit atom completion in commit_txnh */
75643 + TXNH_WAIT_COMMIT = 0x2,
75644 + /* Don't commit atom when this handle is closed */
75645 + TXNH_DONT_COMMIT = 0x4
75646 +} txn_handle_flags_t;
75647 +
75648 +/* TYPE DEFINITIONS */
75649 +
75650 +/* A note on lock ordering: the handle & jnode spinlocks protect reading of their ->atom
75651 + fields, so typically an operation on the atom through either of these objects must (1)
75652 + lock the object, (2) read the atom pointer, (3) lock the atom.
75653 +
75654 + During atom fusion, the process holds locks on both atoms at once. Then, it iterates
75655 + through the list of handles and pages held by the smaller of the two atoms. For each
75656 + handle and page referencing the smaller atom, the fusing process must: (1) lock the
75657 + object, and (2) update the atom pointer.
75658 +
75659 + You can see that there is a conflict of lock ordering here, so the more-complex
75660 + procedure should have priority, i.e., the fusing process has priority so that it is
75661 + guaranteed to make progress and to avoid restarts.
75662 +
75663 + This decision, however, means additional complexity for acquiring the atom lock in the
75664 + first place.
75665 +
75666 + The general original procedure followed in the code was:
75667 +
75668 + TXN_OBJECT *obj = ...;
75669 + TXN_ATOM *atom;
75670 +
75671 + spin_lock (& obj->_lock);
75672 +
75673 + atom = obj->_atom;
75674 +
75675 + if (! spin_trylock_atom (atom))
75676 + {
75677 + spin_unlock (& obj->_lock);
75678 + RESTART OPERATION, THERE WAS A RACE;
75679 + }
75680 +
75681 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75682 +
75683 + It has however been found that this wastes a lot of CPU in a manner that is
75684 + hard to profile. So, proper refcounting was added to atoms, and the new
75685 + standard locking sequence is as follows:
75686 +
75687 + TXN_OBJECT *obj = ...;
75688 + TXN_ATOM *atom;
75689 +
75690 + spin_lock (& obj->_lock);
75691 +
75692 + atom = obj->_atom;
75693 +
75694 + if (! spin_trylock_atom (atom))
75695 + {
75696 + atomic_inc (& atom->refcount);
75697 + spin_unlock (& obj->_lock);
75698 + spin_lock (&atom->_lock);
75699 + atomic_dec (& atom->refcount);
75700 + // HERE atom is locked
75701 + spin_unlock (&atom->_lock);
75702 + RESTART OPERATION, THERE WAS A RACE;
75703 + }
75704 +
75705 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75706 +
75707 + (core of this is implemented in trylock_throttle() function)
75708 +
75709 + See the jnode_get_atom() function for a common case.
75710 +
75711 + As an additional (and important) optimization that avoids restarts,
75712 + it is possible to re-check required pre-conditions at the HERE point in
75713 + code above and proceed without restarting if they are still satisfied.
75714 +*/
75715 +
75716 +/* An atomic transaction: this is the underlying system representation
75717 + of a transaction, not the one seen by clients.
75718 +
75719 + Invariants involving this data-type:
75720 +
75721 + [sb-fake-allocated]
75722 +*/
75723 +struct txn_atom {
75724 + /* The spinlock protecting the atom, held during fusion and various other state
75725 + changes. */
75726 + spinlock_t alock;
75727 +
75728 + /* The atom's reference counter. Incrementing it (in case of a
75729 + duplication of an existing reference or when we are sure that
75730 + some other reference exists) may be done without taking the
75731 + spinlock; decrementing it requires the spinlock to be held.
75732 +
75733 + Each transaction handle counts in ->refcount. All jnodes count as
75734 + one reference acquired in atom_begin_andlock(), released in
75735 + commit_current_atom().
75736 + */
75737 + atomic_t refcount;
75738 +
75739 + /* The atom_id identifies the atom in persistent records such as the log. */
75740 + __u32 atom_id;
75741 +
75742 + /* Flags holding any of the txn_flags enumerated values (e.g.,
75743 + ATOM_FORCE_COMMIT). */
75744 + __u32 flags;
75745 +
75746 + /* Number of open handles. */
75747 + __u32 txnh_count;
75748 +
75749 + /* The number of nodes captured by this atom. Equal to the sum of lengths of the
75750 + dirty_nodes[level] and clean_nodes lists. */
75751 + __u32 capture_count;
75752 +
75753 +#if REISER4_DEBUG
75754 + int clean;
75755 + int dirty;
75756 + int ovrwr;
75757 + int wb;
75758 + int fq;
75759 +#endif
75760 +
75761 + __u32 flushed;
75762 +
75763 + /* Current transaction stage. */
75764 + txn_stage stage;
75765 +
75766 + /* Start time. */
75767 + unsigned long start_time;
75768 +
75769 + /* The atom's delete set. It collects block numbers of the nodes
75770 + which were deleted during the transaction. */
75771 + struct list_head delete_set;
75772 +
75773 + /* The atom's wandered_block mapping. */
75774 + struct list_head wandered_map;
75775 +
75776 + /* The transaction's list of dirty captured nodes--per level. Indexed
75777 + by level. dirty_nodes[0] is for the znode-above-root */
75778 + struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
75779 +
75780 + /* The transaction's list of clean captured nodes. */
75781 + struct list_head clean_nodes;
75782 +
75783 + /* The atom's overwrite set */
75784 + struct list_head ovrwr_nodes;
75785 +
75786 + /* nodes which are being written to disk */
75787 + struct list_head writeback_nodes;
75788 +
75789 + /* list of inodes */
75790 + struct list_head inodes;
75791 +
75792 + /* List of handles associated with this atom. */
75793 + struct list_head txnh_list;
75794 +
75795 + /* Transaction list link: list of atoms in the transaction manager. */
75796 + struct list_head atom_link;
75797 +
75798 + /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
75799 + struct list_head fwaitfor_list;
75800 +
75801 + /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
75802 + struct list_head fwaiting_list;
75803 +
75804 + /* Numbers of objects which were deleted/created in this transaction,
75805 + and thereby the numbers of object IDs which were released/allocated. */
75806 + int nr_objects_deleted;
75807 + int nr_objects_created;
75808 + /* number of blocks allocated during the transaction */
75809 + __u64 nr_blocks_allocated;
75810 + /* All atom's flush queue objects are on this list */
75811 + struct list_head flush_queues;
75812 +#if REISER4_DEBUG
75813 + /* number of flush queues for this atom. */
75814 + int nr_flush_queues;
75815 + /* Number of jnodes which were removed from atom's lists and put
75816 + on flush_queue */
75817 + int num_queued;
75818 +#endif
75819 + /* number of threads who wait for this atom to complete commit */
75820 + int nr_waiters;
75821 + /* number of threads which do jnode_flush() over this atom */
75822 + int nr_flushers;
75823 + /* number of flush queues which are IN_USE and whose jnodes from
75824 + fq->prepped are being submitted to disk by reiser4_write_fq(). */
75825 + int nr_running_queues;
75826 + /* A counter of grabbed unformatted nodes, see a description of the
75827 + * reiser4 space reservation scheme at block_alloc.c */
75828 + reiser4_block_nr flush_reserved;
75829 +#if REISER4_DEBUG
75830 + void *committer;
75831 +#endif
75832 + struct super_block *super;
75833 +};
75834 +
75835 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
75836 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
75837 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
75838 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
75839 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
75840 +
75841 +#define NODE_LIST(node) (node)->list
75842 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
75843 +ON_DEBUG(void
75844 + count_jnode(txn_atom *, jnode *, atom_list old_list,
75845 + atom_list new_list, int check_lists));
75846 +
75847 +/* A transaction handle: the client obtains and commits this handle which is assigned by
75848 + the system to a txn_atom. */
75849 +struct txn_handle {
75850 + /* Spinlock protecting ->atom pointer */
75851 + spinlock_t hlock;
75852 +
75853 + /* Flags (from txn_handle_flags_t) controlling commit_txnh() behavior */
75855 + txn_handle_flags_t flags;
75856 +
75857 + /* Whether it is READ_FUSING or WRITE_FUSING. */
75858 + txn_mode mode;
75859 +
75860 + /* If assigned, the atom it is part of. */
75861 + txn_atom *atom;
75862 +
75863 + /* Transaction list link. Head is in txn_atom. */
75864 + struct list_head txnh_link;
75865 +};
75866 +
75867 +/* The transaction manager: one is contained in the reiser4_super_info_data */
75868 +struct txn_mgr {
75869 + /* A spinlock protecting the atom list, id_count, flush_control */
75870 + spinlock_t tmgr_lock;
75871 +
75872 + /* List of atoms. */
75873 + struct list_head atoms_list;
75874 +
75875 + /* Number of atoms. */
75876 + int atom_count;
75877 +
75878 + /* A counter used to assign atom->atom_id values. */
75879 + __u32 id_count;
75880 +
75881 + /* a mutex object for commit serialization */
75882 + struct mutex commit_mutex;
75883 +
75884 + /* a list of all txnmgrs served by a particular daemon. */
75885 + struct list_head linkage;
75886 +
75887 + /* description of daemon for this txnmgr */
75888 + ktxnmgrd_context *daemon;
75889 +
75890 + /* parameters. Adjustable through mount options. */
75891 + unsigned int atom_max_size;
75892 + unsigned int atom_max_age;
75893 + unsigned int atom_min_size;
75894 + /* max number of concurrent flushers for one atom, 0 - unlimited. */
75895 + unsigned int atom_max_flushers;
75896 + struct dentry *debugfs_atom_count;
75897 + struct dentry *debugfs_id_count;
75898 +};
75899 +
75900 +/* FUNCTION DECLARATIONS */
75901 +
75902 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
75903 + are prefixed with "txn_". For comments, see txnmgr.c. */
75904 +
75905 +extern int init_txnmgr_static(void);
75906 +extern void done_txnmgr_static(void);
75907 +
75908 +extern void reiser4_init_txnmgr(txn_mgr *);
75909 +extern void reiser4_done_txnmgr(txn_mgr *);
75910 +
75911 +extern int reiser4_txn_reserve(int reserved);
75912 +
75913 +extern void reiser4_txn_begin(reiser4_context * context);
75914 +extern int reiser4_txn_end(reiser4_context * context);
75915 +
75916 +extern void reiser4_txn_restart(reiser4_context * context);
75917 +extern void reiser4_txn_restart_current(void);
75918 +
75919 +extern int txnmgr_force_commit_all(struct super_block *, int);
75920 +extern int current_atom_should_commit(void);
75921 +
75922 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
75923 +
75924 +extern int commit_some_atoms(txn_mgr *);
75925 +extern int force_commit_atom(txn_handle *);
75926 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
75927 +
75928 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
75929 +
75930 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
75931 +
75932 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
75933 + int alloc_value);
75934 +extern void atom_dec_and_unlock(txn_atom * atom);
75935 +
75936 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
75937 +extern int try_capture_page_to_invalidate(struct page *pg);
75938 +
75939 +extern void reiser4_uncapture_page(struct page *pg);
75940 +extern void reiser4_uncapture_block(jnode *);
75941 +extern void reiser4_uncapture_jnode(jnode *);
75942 +
75943 +extern int reiser4_capture_inode(struct inode *);
75944 +extern int reiser4_uncapture_inode(struct inode *);
75945 +
75946 +extern txn_atom *get_current_atom_locked_nocheck(void);
75947 +
75948 +#if REISER4_DEBUG
75949 +
75950 +/**
75951 + * atom_is_protected - make sure that nobody but us can do anything with atom
75952 + * @atom: atom to be checked
75953 + *
75954 + * This is used to assert that atom either entered commit stages or is spin
75955 + * locked.
75956 + */
75957 +static inline int atom_is_protected(txn_atom *atom)
75958 +{
75959 + if (atom->stage >= ASTAGE_PRE_COMMIT)
75960 + return 1;
75961 + assert_spin_locked(&(atom->alock));
75962 + return 1;
75963 +}
75964 +
75965 +#endif
75966 +
75967 +/* Get the current atom and spin-lock it. The current atom must be present; this never returns NULL */
75968 +static inline txn_atom *get_current_atom_locked(void)
75969 +{
75970 + txn_atom *atom;
75971 +
75972 + atom = get_current_atom_locked_nocheck();
75973 + assert("zam-761", atom != NULL);
75974 +
75975 + return atom;
75976 +}
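A minimal usage sketch of this helper pair (illustration only, not part of the patch; the ellipsis stands in for real work done under the lock):

	static inline void example_with_current_atom(void)
	{
		txn_atom *atom;

		/* the returned atom comes back spin-locked and, per the
		 * assertion above, is never NULL */
		atom = get_current_atom_locked();
		/* ... inspect or modify atom fields under atom->alock ... */
		spin_unlock_atom(atom);
	}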
75977 +
75978 +extern txn_atom *jnode_get_atom(jnode *);
75979 +
75980 +extern void reiser4_atom_wait_event(txn_atom *);
75981 +extern void reiser4_atom_send_event(txn_atom *);
75982 +
75983 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
75984 +extern int reiser4_capture_super_block(struct super_block *s);
75985 +int capture_bulk(jnode **, int count);
75986 +
75987 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
75988 + calling convention of these three routines. */
75989 +extern void blocknr_set_init(struct list_head * bset);
75990 +extern void blocknr_set_destroy(struct list_head * bset);
75991 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
75992 +extern int blocknr_set_add_extent(txn_atom * atom,
75993 + struct list_head * bset,
75994 + blocknr_set_entry ** new_bsep,
75995 + const reiser4_block_nr * start,
75996 + const reiser4_block_nr * len);
75997 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
75998 + blocknr_set_entry ** new_bsep,
75999 + const reiser4_block_nr * a,
76000 + const reiser4_block_nr * b);
76001 +
76002 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76003 + const reiser4_block_nr *, void *);
76004 +
76005 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
76006 + blocknr_set_actor_f actor, void *data,
76007 + int delete);
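As a sketch of the actor convention (assuming, as the actors in wander.c suggest, that returning 0 lets the iteration continue), an actor that merely counts the pairs in a set could look like this; count_pairs_actor is a hypothetical name:

	static int count_pairs_actor(txn_atom * atom UNUSED_ARG,
				     const reiser4_block_nr * a UNUSED_ARG,
				     const reiser4_block_nr * b UNUSED_ARG,
				     void *data)
	{
		(*(int *)data)++;	/* "data" carries the caller's counter */
		return 0;		/* keep iterating */
	}

It would be driven by something like blocknr_set_iterator(atom, &atom->wandered_map, count_pairs_actor, &n, 0), with 0 as the "delete" flag so the set is presumably left intact.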
76008 +
76009 +/* flush code takes care of how to fuse flush queues */
76010 +extern void flush_init_atom(txn_atom * atom);
76011 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76012 +
76013 +static inline void spin_lock_atom(txn_atom *atom)
76014 +{
76015 + /* check that spinlocks of lower priorities are not held */
76016 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76017 + LOCK_CNT_NIL(spin_locked_atom) &&
76018 + LOCK_CNT_NIL(spin_locked_jnode) &&
76019 + LOCK_CNT_NIL(spin_locked_zlock) &&
76020 + LOCK_CNT_NIL(rw_locked_dk) &&
76021 + LOCK_CNT_NIL(rw_locked_tree)));
76022 +
76023 + spin_lock(&(atom->alock));
76024 +
76025 + LOCK_CNT_INC(spin_locked_atom);
76026 + LOCK_CNT_INC(spin_locked);
76027 +}
76028 +
76029 +static inline void spin_lock_atom_nested(txn_atom *atom)
76030 +{
76031 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76032 + LOCK_CNT_NIL(spin_locked_jnode) &&
76033 + LOCK_CNT_NIL(spin_locked_zlock) &&
76034 + LOCK_CNT_NIL(rw_locked_dk) &&
76035 + LOCK_CNT_NIL(rw_locked_tree)));
76036 +
76037 + spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
76038 +
76039 + LOCK_CNT_INC(spin_locked_atom);
76040 + LOCK_CNT_INC(spin_locked);
76041 +}
76042 +
76043 +static inline int spin_trylock_atom(txn_atom *atom)
76044 +{
76045 + if (spin_trylock(&(atom->alock))) {
76046 + LOCK_CNT_INC(spin_locked_atom);
76047 + LOCK_CNT_INC(spin_locked);
76048 + return 1;
76049 + }
76050 + return 0;
76051 +}
76052 +
76053 +static inline void spin_unlock_atom(txn_atom *atom)
76054 +{
76055 + assert_spin_locked(&(atom->alock));
76056 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76057 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76058 +
76059 + LOCK_CNT_DEC(spin_locked_atom);
76060 + LOCK_CNT_DEC(spin_locked);
76061 +
76062 + spin_unlock(&(atom->alock));
76063 +}
76064 +
76065 +static inline void spin_lock_txnh(txn_handle *txnh)
76066 +{
76067 + /* check that spinlocks of lower priorities are not held */
76068 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76069 + LOCK_CNT_NIL(spin_locked_zlock) &&
76070 + LOCK_CNT_NIL(rw_locked_tree)));
76071 +
76072 + spin_lock(&(txnh->hlock));
76073 +
76074 + LOCK_CNT_INC(spin_locked_txnh);
76075 + LOCK_CNT_INC(spin_locked);
76076 +}
76077 +
76078 +static inline int spin_trylock_txnh(txn_handle *txnh)
76079 +{
76080 + if (spin_trylock(&(txnh->hlock))) {
76081 + LOCK_CNT_INC(spin_locked_txnh);
76082 + LOCK_CNT_INC(spin_locked);
76083 + return 1;
76084 + }
76085 + return 0;
76086 +}
76087 +
76088 +static inline void spin_unlock_txnh(txn_handle *txnh)
76089 +{
76090 + assert_spin_locked(&(txnh->hlock));
76091 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76092 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76093 +
76094 + LOCK_CNT_DEC(spin_locked_txnh);
76095 + LOCK_CNT_DEC(spin_locked);
76096 +
76097 + spin_unlock(&(txnh->hlock));
76098 +}
76099 +
76100 +#define spin_ordering_pred_txnmgr(tmgr) \
76101 + ( LOCK_CNT_NIL(spin_locked_atom) && \
76102 + LOCK_CNT_NIL(spin_locked_txnh) && \
76103 + LOCK_CNT_NIL(spin_locked_jnode) && \
76104 + LOCK_CNT_NIL(rw_locked_zlock) && \
76105 + LOCK_CNT_NIL(rw_locked_dk) && \
76106 + LOCK_CNT_NIL(rw_locked_tree) )
76107 +
76108 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
76109 +{
76110 + /* check that spinlocks of lower priorities are not held */
76111 + assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76112 + LOCK_CNT_NIL(spin_locked_txnh) &&
76113 + LOCK_CNT_NIL(spin_locked_jnode) &&
76114 + LOCK_CNT_NIL(spin_locked_zlock) &&
76115 + LOCK_CNT_NIL(rw_locked_dk) &&
76116 + LOCK_CNT_NIL(rw_locked_tree)));
76117 +
76118 + spin_lock(&(mgr->tmgr_lock));
76119 +
76120 + LOCK_CNT_INC(spin_locked_txnmgr);
76121 + LOCK_CNT_INC(spin_locked);
76122 +}
76123 +
76124 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76125 +{
76126 + if (spin_trylock(&(mgr->tmgr_lock))) {
76127 + LOCK_CNT_INC(spin_locked_txnmgr);
76128 + LOCK_CNT_INC(spin_locked);
76129 + return 1;
76130 + }
76131 + return 0;
76132 +}
76133 +
76134 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76135 +{
76136 + assert_spin_locked(&(mgr->tmgr_lock));
76137 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76138 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76139 +
76140 + LOCK_CNT_DEC(spin_locked_txnmgr);
76141 + LOCK_CNT_DEC(spin_locked);
76142 +
76143 + spin_unlock(&(mgr->tmgr_lock));
76144 +}
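Read together, the LOCK_CNT_NIL() assertions above encode a lock ordering: the txnmgr lock is outermost (no atom, txnh or jnode spinlocks may be held when taking it), an atom lock may be nested inside it, and a txnh lock inside that. A sketch of a legal acquisition sequence (illustration only):

	static void example_lock_ordering(txn_mgr *mgr, txn_atom *atom,
					  txn_handle *txnh)
	{
		spin_lock_txnmgr(mgr);	/* outermost of the three */
		spin_lock_atom(atom);	/* allowed while tmgr_lock is held */
		spin_lock_txnh(txnh);	/* innermost of the three */

		/* ... */

		spin_unlock_txnh(txnh);
		spin_unlock_atom(atom);
		spin_unlock_txnmgr(mgr);
	}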
76145 +
76146 +typedef enum {
76147 + FQ_IN_USE = 0x1
76148 +} flush_queue_state_t;
76149 +
76150 +typedef struct flush_queue flush_queue_t;
76151 +
76152 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76153 + is filled by the jnode_flush() routine, and written to disk under memory
76154 + pressure or at atom commit time. */
76155 +/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
76156 + fq->nr_queued field and the fq->prepped list can be modified if the atom is
76157 + spin-locked and the fq object is in the "in-use" state. For read-only
76158 + traversal of the fq->prepped list and reading of the fq->nr_queued field it
76159 + is enough to keep fq "in-use" or only have the atom spin-locked. */
76160 +struct flush_queue {
76161 + /* linkage element is the first in this structure to make debugging
76162 + easier. See field in atom struct for description of list. */
76163 + struct list_head alink;
76164 + /* A spinlock to protect changes of fq state and fq->atom pointer */
76165 + spinlock_t guard;
76166 + /* flush_queue state: [in_use | ready] */
76167 + flush_queue_state_t state;
76168 +	/* A list of queued nodes; a queued node is removed from its
76169 +	 * atom's list and put on this ->prepped one. */
76170 + struct list_head prepped;
76171 + /* number of submitted i/o requests */
76172 + atomic_t nr_submitted;
76173 + /* number of i/o errors */
76174 + atomic_t nr_errors;
76175 + /* An atom this flush queue is attached to */
76176 + txn_atom *atom;
76177 + /* A wait queue head to wait on i/o completion */
76178 + wait_queue_head_t wait;
76179 +#if REISER4_DEBUG
76180 + /* A thread which took this fq in exclusive use, NULL if fq is free,
76181 + * used for debugging. */
76182 + struct task_struct *owner;
76183 +#endif
76184 +};
76185 +
76186 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
76187 +extern void reiser4_fq_put_nolock(flush_queue_t *);
76188 +extern void reiser4_fq_put(flush_queue_t *);
76189 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
76190 +extern void queue_jnode(flush_queue_t *, jnode *);
76191 +
76192 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
76193 +extern int current_atom_finish_all_fq(void);
76194 +extern void init_atom_fq_parts(txn_atom *);
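The declarations above outline the flush queue life cycle: obtain a queue attached to an atom, move prepared jnodes onto it, submit the queue and release it. A sketch, under the assumption that 0 is an acceptable flags value for reiser4_write_fq():

	static int example_flush_one_jnode(txn_atom *atom, jnode *node)
	{
		flush_queue_t *fq;
		long nr_submitted = 0;
		int ret;

		ret = reiser4_fq_by_atom(atom, &fq);	/* fq now "in-use" */
		if (ret)
			return ret;
		queue_jnode(fq, node);		/* move node onto fq->prepped */
		ret = reiser4_write_fq(fq, &nr_submitted, 0);
		reiser4_fq_put(fq);		/* release exclusive use */
		return ret;
	}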
76195 +
76196 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76197 +
76198 +extern void znode_make_dirty(znode * node);
76199 +extern void jnode_make_dirty_locked(jnode * node);
76200 +
76201 +extern int reiser4_sync_atom(txn_atom * atom);
76202 +
76203 +#if REISER4_DEBUG
76204 +extern int atom_fq_parts_are_clean(txn_atom *);
76205 +#endif
76206 +
76207 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76208 +extern flush_queue_t *get_fq_for_current_atom(void);
76209 +
76210 +void reiser4_invalidate_list(struct list_head * head);
76211 +
76212 +# endif /* __REISER4_TXNMGR_H__ */
76213 +
76214 +/* Make Linus happy.
76215 + Local variables:
76216 + c-indentation-style: "K&R"
76217 + mode-name: "LC"
76218 + c-basic-offset: 8
76219 + tab-width: 8
76220 + fill-column: 120
76221 + End:
76222 +*/
76223 diff -urN linux-2.6.22.orig/fs/reiser4/type_safe_hash.h linux-2.6.22/fs/reiser4/type_safe_hash.h
76224 --- linux-2.6.22.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
76225 +++ linux-2.6.22/fs/reiser4/type_safe_hash.h 2007-07-29 00:25:35.044739961 +0400
76226 @@ -0,0 +1,320 @@
76227 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76228 + * reiser4/README */
76229 +
76230 +/* A hash table class that uses hash chains (singly-linked) and is
76231 + parametrized to provide type safety. */
76232 +
76233 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
76234 +#define __REISER4_TYPE_SAFE_HASH_H__
76235 +
76236 +#include "debug.h"
76237 +
76238 +#include <asm/errno.h>
76239 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76240 + based on the object type. You need to declare the item type before
76241 + this definition, define it after this definition. */
76242 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
76243 + \
76244 +typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
76245 +typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
76246 + \
76247 +struct PREFIX##_hash_table_ \
76248 +{ \
76249 + ITEM_TYPE **_table; \
76250 + __u32 _buckets; \
76251 +}; \
76252 + \
76253 +struct PREFIX##_hash_link_ \
76254 +{ \
76255 + ITEM_TYPE *_next; \
76256 +}
76257 +
76258 +/* Step 2: Define the object type of the hash: give it field of type
76259 + PREFIX_hash_link. */
76260 +
76261 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76262 + the type and field name used in step 2. The arguments are:
76263 +
76264 + ITEM_TYPE The item type being hashed
76265 + KEY_TYPE The type of key being hashed
76266 + KEY_NAME The name of the key field within the item
76267 + LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link
76268 + HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
76269 + EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
76270 +
76271 + It implements these functions:
76272 +
76273 + prefix_hash_init Initialize the table given its size.
76274 + prefix_hash_insert Insert an item
76275 + prefix_hash_insert_index Insert an item w/ precomputed hash_index
76276 + prefix_hash_find Find an item by key
76277 + prefix_hash_find_index Find an item w/ precomputed hash_index
76278 + prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
76279 + prefix_hash_remove_index Remove an item w/ precomputed hash_index
76280 +
76281 + If you'd like something to be done differently, feel free to ask me
76282 + for modifications. Additional features that could be added but
76283 + have not been:
76284 +
76285 + prefix_hash_remove_key Find and remove an item by key
76286 + prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
76287 +
76288 + The hash_function currently receives only the key as an argument,
76289 + meaning it must somehow know the number of buckets. If this is a
76290 + problem let me know.
76291 +
76292 + This hash table uses a singly-linked hash chain. This means
76293 + insertion is fast but deletion requires searching the chain.
76294 +
76295 + There is also the doubly-linked hash chain approach, under which
76296 + deletion requires no search but the code is longer and it takes two
76297 + pointers per item.
76298 +
76299 + The circularly-linked approach has the shortest code but requires
76300 + two pointers per bucket, doubling the size of the bucket array (in
76301 + addition to two pointers per item).
76302 +*/
76303 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
76304 + \
76305 +static __inline__ void \
76306 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
76307 + __u32 hash UNUSED_ARG) \
76308 +{ \
76309 + assert("nikita-2780", hash < table->_buckets); \
76310 +} \
76311 + \
76312 +static __inline__ int \
76313 +PREFIX##_hash_init (PREFIX##_hash_table *hash, \
76314 + __u32 buckets) \
76315 +{ \
76316 + hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
76317 + hash->_buckets = buckets; \
76318 + if (hash->_table == NULL) \
76319 + { \
76320 + return RETERR(-ENOMEM); \
76321 + } \
76322 + memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
76323 + ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
76324 + return 0; \
76325 +} \
76326 + \
76327 +static __inline__ void \
76328 +PREFIX##_hash_done (PREFIX##_hash_table *hash) \
76329 +{ \
76330 + if (REISER4_DEBUG && hash->_table != NULL) { \
76331 + __u32 i; \
76332 + for (i = 0 ; i < hash->_buckets ; ++ i) \
76333 + assert("nikita-2905", hash->_table[i] == NULL); \
76334 + } \
76335 + if (hash->_table != NULL) \
76336 + KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
76337 + hash->_table = NULL; \
76338 +} \
76339 + \
76340 +static __inline__ void \
76341 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
76342 +{ \
76343 + prefetch(item->LINK_NAME._next); \
76344 +} \
76345 + \
76346 +static __inline__ void \
76347 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
76348 + __u32 index) \
76349 +{ \
76350 + prefetch(hash->_table[index]); \
76351 +} \
76352 + \
76353 +static __inline__ ITEM_TYPE* \
76354 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
76355 + __u32 hash_index, \
76356 + KEY_TYPE const *find_key) \
76357 +{ \
76358 + ITEM_TYPE *item; \
76359 + \
76360 + PREFIX##_check_hash(hash, hash_index); \
76361 + \
76362 + for (item = hash->_table[hash_index]; \
76363 + item != NULL; \
76364 + item = item->LINK_NAME._next) \
76365 + { \
76366 + prefetch(item->LINK_NAME._next); \
76367 + prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
76368 + if (EQ_FUNC (& item->KEY_NAME, find_key)) \
76369 + { \
76370 + return item; \
76371 + } \
76372 + } \
76373 + \
76374 + return NULL; \
76375 +} \
76376 + \
76377 +static __inline__ ITEM_TYPE* \
76378 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
76379 + __u32 hash_index, \
76380 + KEY_TYPE const *find_key) \
76381 +{ \
76382 + ITEM_TYPE ** item = &hash->_table[hash_index]; \
76383 + \
76384 + PREFIX##_check_hash(hash, hash_index); \
76385 + \
76386 + while (*item != NULL) { \
76387 + prefetch(&(*item)->LINK_NAME._next); \
76388 + if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
76389 + ITEM_TYPE *found; \
76390 + \
76391 + found = *item; \
76392 + *item = found->LINK_NAME._next; \
76393 + found->LINK_NAME._next = hash->_table[hash_index]; \
76394 + hash->_table[hash_index] = found; \
76395 + return found; \
76396 + } \
76397 + item = &(*item)->LINK_NAME._next; \
76398 + } \
76399 + return NULL; \
76400 +} \
76401 + \
76402 +static __inline__ int \
76403 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
76404 + __u32 hash_index, \
76405 + ITEM_TYPE *del_item) \
76406 +{ \
76407 + ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
76408 + \
76409 + PREFIX##_check_hash(hash, hash_index); \
76410 + \
76411 + while (*hash_item_p != NULL) { \
76412 + prefetch(&(*hash_item_p)->LINK_NAME._next); \
76413 + if (*hash_item_p == del_item) { \
76414 + *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
76415 + return 1; \
76416 + } \
76417 + hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
76418 + } \
76419 + return 0; \
76420 +} \
76421 + \
76422 +static __inline__ void \
76423 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
76424 + __u32 hash_index, \
76425 + ITEM_TYPE *ins_item) \
76426 +{ \
76427 + PREFIX##_check_hash(hash, hash_index); \
76428 + \
76429 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76430 + hash->_table[hash_index] = ins_item; \
76431 +} \
76432 + \
76433 +static __inline__ void \
76434 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
76435 + __u32 hash_index, \
76436 + ITEM_TYPE *ins_item) \
76437 +{ \
76438 + PREFIX##_check_hash(hash, hash_index); \
76439 + \
76440 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76441 + smp_wmb(); \
76442 + hash->_table[hash_index] = ins_item; \
76443 +} \
76444 + \
76445 +static __inline__ ITEM_TYPE* \
76446 +PREFIX##_hash_find (PREFIX##_hash_table *hash, \
76447 + KEY_TYPE const *find_key) \
76448 +{ \
76449 + return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
76450 +} \
76451 + \
76452 +static __inline__ ITEM_TYPE* \
76453 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
76454 + KEY_TYPE const *find_key) \
76455 +{ \
76456 + return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
76457 +} \
76458 + \
76459 +static __inline__ int \
76460 +PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
76461 + ITEM_TYPE *del_item) \
76462 +{ \
76463 + return PREFIX##_hash_remove_index (hash, \
76464 + HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
76465 +} \
76466 + \
76467 +static __inline__ int \
76468 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
76469 + ITEM_TYPE *del_item) \
76470 +{ \
76471 + return PREFIX##_hash_remove (hash, del_item); \
76472 +} \
76473 + \
76474 +static __inline__ void \
76475 +PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
76476 + ITEM_TYPE *ins_item) \
76477 +{ \
76478 + return PREFIX##_hash_insert_index (hash, \
76479 + HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
76480 +} \
76481 + \
76482 +static __inline__ void \
76483 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
76484 + ITEM_TYPE *ins_item) \
76485 +{ \
76486 + return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
76487 + ins_item); \
76488 +} \
76489 + \
76490 +static __inline__ ITEM_TYPE * \
76491 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
76492 +{ \
76493 + ITEM_TYPE *first; \
76494 + \
76495 + for (first = NULL; ind < hash->_buckets; ++ ind) { \
76496 + first = hash->_table[ind]; \
76497 + if (first != NULL) \
76498 + break; \
76499 + } \
76500 + return first; \
76501 +} \
76502 + \
76503 +static __inline__ ITEM_TYPE * \
76504 +PREFIX##_hash_next (PREFIX##_hash_table *hash, \
76505 + ITEM_TYPE *item) \
76506 +{ \
76507 + ITEM_TYPE *next; \
76508 + \
76509 + if (item == NULL) \
76510 + return NULL; \
76511 + next = item->LINK_NAME._next; \
76512 + if (next == NULL) \
76513 + next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
76514 + return next; \
76515 +} \
76516 + \
76517 +typedef struct {} PREFIX##_hash_dummy
76518 +
76519 +#define for_all_ht_buckets(table, head) \
76520 +for ((head) = &(table) -> _table[ 0 ] ; \
76521 + (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
76522 +
76523 +#define for_all_in_bucket(bucket, item, next, field) \
76524 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
76525 + (item) != NULL ; \
76526 + (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
76527 +
76528 +#define for_all_in_htable(table, prefix, item, next) \
76529 +for ((item) = prefix ## _hash_first ((table), 0), \
76530 + (next) = prefix ## _hash_next ((table), (item)) ; \
76531 + (item) != NULL ; \
76532 + (item) = (next), \
76533 + (next) = prefix ## _hash_next ((table), (item)))
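Putting steps 1-3 together, a hypothetical item type keyed by an int could instantiate the interface as follows (thing, thing_hashfn and thing_eqfn are illustrative names, not part of reiser4):

	typedef struct thing thing_t;

	/* Step 1: declare the table and link types. */
	TYPE_SAFE_HASH_DECLARE(thing, thing_t);

	/* Step 2: give the item type a key and a link field. */
	struct thing {
		int key;
		thing_hash_link link;
	};

	static inline __u32 thing_hashfn(thing_hash_table *table,
					 const int *key)
	{
		return (__u32)*key % table->_buckets;
	}

	static inline int thing_eqfn(const int *k1, const int *k2)
	{
		return *k1 == *k2;
	}

	/* Step 3: generate thing_hash_init/_find/_insert/_remove etc. */
	TYPE_SAFE_HASH_DEFINE(thing, thing_t, int, key, link,
			      thing_hashfn, thing_eqfn);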
76534 +
76535 +/* __REISER4_TYPE_SAFE_HASH_H__ */
76536 +#endif
76537 +
76538 +/* Make Linus happy.
76539 + Local variables:
76540 + c-indentation-style: "K&R"
76541 + mode-name: "LC"
76542 + c-basic-offset: 8
76543 + tab-width: 8
76544 + fill-column: 120
76545 + End:
76546 +*/
76547 diff -urN linux-2.6.22.orig/fs/reiser4/vfs_ops.c linux-2.6.22/fs/reiser4/vfs_ops.c
76548 --- linux-2.6.22.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
76549 +++ linux-2.6.22/fs/reiser4/vfs_ops.c 2007-07-29 00:25:35.044739961 +0400
76550 @@ -0,0 +1,259 @@
76551 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76552 + * reiser4/README */
76553 +
76554 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
76555 + here. */
76556 +
76557 +#include "forward.h"
76558 +#include "debug.h"
76559 +#include "dformat.h"
76560 +#include "coord.h"
76561 +#include "plugin/item/item.h"
76562 +#include "plugin/file/file.h"
76563 +#include "plugin/security/perm.h"
76564 +#include "plugin/disk_format/disk_format.h"
76565 +#include "plugin/plugin.h"
76566 +#include "plugin/plugin_set.h"
76567 +#include "plugin/object.h"
76568 +#include "txnmgr.h"
76569 +#include "jnode.h"
76570 +#include "znode.h"
76571 +#include "block_alloc.h"
76572 +#include "tree.h"
76573 +#include "vfs_ops.h"
76574 +#include "inode.h"
76575 +#include "page_cache.h"
76576 +#include "ktxnmgrd.h"
76577 +#include "super.h"
76578 +#include "reiser4.h"
76579 +#include "entd.h"
76580 +#include "status_flags.h"
76581 +#include "flush.h"
76582 +#include "dscale.h"
76583 +
76584 +#include <linux/profile.h>
76585 +#include <linux/types.h>
76586 +#include <linux/mount.h>
76587 +#include <linux/vfs.h>
76588 +#include <linux/mm.h>
76589 +#include <linux/buffer_head.h>
76590 +#include <linux/dcache.h>
76591 +#include <linux/list.h>
76592 +#include <linux/pagemap.h>
76593 +#include <linux/slab.h>
76594 +#include <linux/seq_file.h>
76595 +#include <linux/init.h>
76596 +#include <linux/module.h>
76597 +#include <linux/writeback.h>
76598 +#include <linux/blkdev.h>
76599 +#include <linux/quotaops.h>
76600 +#include <linux/security.h>
76601 +#include <linux/reboot.h>
76602 +#include <linux/rcupdate.h>
76603 +
76604 +/* update inode stat-data by calling plugin */
76605 +int reiser4_update_sd(struct inode *object)
76606 +{
76607 + file_plugin *fplug;
76608 +
76609 + assert("nikita-2338", object != NULL);
76610 + /* check for read-only file system. */
76611 + if (IS_RDONLY(object))
76612 + return 0;
76613 +
76614 + fplug = inode_file_plugin(object);
76615 + assert("nikita-2339", fplug != NULL);
76616 + return fplug->write_sd_by_inode(object);
76617 +}
76618 +
76619 +/* helper function: increase inode nlink count and call plugin method to save
76620 + updated stat-data.
76621 +
76622 + Used by link/create and during creation of dot and dotdot in mkdir
76623 +*/
76624 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
76625 + struct inode *parent /* parent where new entry will be */
76626 + ,
76627 + int write_sd_p /* true if stat-data has to be
76628 + * updated */ )
76629 +{
76630 + file_plugin *fplug;
76631 + int result;
76632 +
76633 + assert("nikita-1351", object != NULL);
76634 +
76635 + fplug = inode_file_plugin(object);
76636 + assert("nikita-1445", fplug != NULL);
76637 +
76638 + /* ask plugin whether it can add yet another link to this
76639 + object */
76640 + if (!fplug->can_add_link(object))
76641 + return RETERR(-EMLINK);
76642 +
76643 + assert("nikita-2211", fplug->add_link != NULL);
76644 + /* call plugin to do actual addition of link */
76645 + result = fplug->add_link(object, parent);
76646 +
76647 + /* optionally update stat data */
76648 + if (result == 0 && write_sd_p)
76649 + result = fplug->write_sd_by_inode(object);
76650 + return result;
76651 +}
76652 +
76653 +/* helper function: decrease inode nlink count and call plugin method to save
76654 + updated stat-data.
76655 +
76656 + Used by unlink/create
76657 +*/
76658 +int reiser4_del_nlink(struct inode *object /* object from which link is
76659 + * removed */ ,
76660 + struct inode *parent /* parent where entry was */ ,
76661 +		      int write_sd_p /* true if stat-data has to be
76662 + * updated */ )
76663 +{
76664 + file_plugin *fplug;
76665 + int result;
76666 +
76667 + assert("nikita-1349", object != NULL);
76668 +
76669 + fplug = inode_file_plugin(object);
76670 + assert("nikita-1350", fplug != NULL);
76671 + assert("nikita-1446", object->i_nlink > 0);
76672 + assert("nikita-2210", fplug->rem_link != NULL);
76673 +
76674 + /* call plugin to do actual deletion of link */
76675 + result = fplug->rem_link(object, parent);
76676 +
76677 + /* optionally update stat data */
76678 + if (result == 0 && write_sd_p)
76679 + result = fplug->write_sd_by_inode(object);
76680 + return result;
76681 +}
76682 +
76683 +/* Release reiser4 dentry. This is d_op->d_release() method. */
76684 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
76685 +{
76686 + reiser4_free_dentry_fsdata(dentry);
76687 +}
76688 +
76689 +/*
76690 + * Called by reiser4_sync_inodes(), during speculative write-back (through
76691 + * pdflush, or balance_dirty_pages()).
76692 + */
76693 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
76694 +{
76695 + long written = 0;
76696 + int repeats = 0;
76697 + int result;
76698 + struct address_space *mapping;
76699 +
76700 + /*
76701 + * Performs early flushing, trying to free some memory. If there is
76702 + * nothing to flush, commits some atoms.
76703 + */
76704 +
76705 + /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
76706 + sys_fsync(). */
76707 + if (wbc->sync_mode != WB_SYNC_NONE) {
76708 + txnmgr_force_commit_all(sb, 0);
76709 + return;
76710 + }
76711 +
76712 + BUG_ON(reiser4_get_super_fake(sb) == NULL);
76713 + mapping = reiser4_get_super_fake(sb)->i_mapping;
76714 + do {
76715 + long nr_submitted = 0;
76716 + jnode *node = NULL;
76717 +
76718 +		/* do not submit more requests, to avoid overloading the write queue */
76719 + if (wbc->nonblocking &&
76720 + bdi_write_congested(mapping->backing_dev_info)) {
76721 + blk_run_address_space(mapping);
76722 + wbc->encountered_congestion = 1;
76723 + break;
76724 + }
76725 + repeats++;
76726 + BUG_ON(wbc->nr_to_write <= 0);
76727 +
76728 + if (get_current_context()->entd) {
76729 + entd_context *ent = get_entd_context(sb);
76730 +
76731 + if (ent->cur_request->node)
76732 + /*
76733 + * this is ent thread and it managed to capture
76734 + * requested page itself - start flush from
76735 + * that page
76736 + */
76737 + node = jref(ent->cur_request->node);
76738 + }
76739 +
76740 + result = flush_some_atom(node, &nr_submitted, wbc,
76741 + JNODE_FLUSH_WRITE_BLOCKS);
76742 + if (result != 0)
76743 + warning("nikita-31001", "Flush failed: %i", result);
76744 + if (node)
76745 + jput(node);
76746 + if (!nr_submitted)
76747 + break;
76748 +
76749 + wbc->nr_to_write -= nr_submitted;
76750 + written += nr_submitted;
76751 + } while (wbc->nr_to_write > 0);
76752 +}
76753 +
76754 +void reiser4_throttle_write(struct inode *inode)
76755 +{
76756 + reiser4_txn_restart_current();
76757 + balance_dirty_pages_ratelimited(inode->i_mapping);
76758 +}
76759 +
76760 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
76761 +const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
76762 + * beginning of device */
76763 +
76764 +/*
76765 + * Reiser4 initialization/shutdown.
76766 + *
76767 + * Code below performs global reiser4 initialization that is done either as
76768 + * part of kernel initialization (when reiser4 is statically built-in), or
76769 + * during reiser4 module load (when compiled as module).
76770 + */
76771 +
76772 +void reiser4_handle_error(void)
76773 +{
76774 + struct super_block *sb = reiser4_get_current_sb();
76775 +
76776 + if (!sb)
76777 + return;
76778 + reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
76779 +			     "Filesystem error occurred");
76780 + switch (get_super_private(sb)->onerror) {
76781 + case 0:
76782 +		reiser4_panic("foobar-42", "Filesystem error occurred\n");
76783 + case 1:
76784 + default:
76785 + if (sb->s_flags & MS_RDONLY)
76786 + return;
76787 + sb->s_flags |= MS_RDONLY;
76788 + break;
76789 + }
76790 +}
76791 +
76792 +struct dentry_operations reiser4_dentry_operations = {
76793 + .d_revalidate = NULL,
76794 + .d_hash = NULL,
76795 + .d_compare = NULL,
76796 + .d_delete = NULL,
76797 + .d_release = reiser4_d_release,
76798 + .d_iput = NULL,
76799 +};
76800 +
76801 +/* Make Linus happy.
76802 + Local variables:
76803 + c-indentation-style: "K&R"
76804 + mode-name: "LC"
76805 + c-basic-offset: 8
76806 + tab-width: 8
76807 + fill-column: 120
76808 + End:
76809 +*/
76810 diff -urN linux-2.6.22.orig/fs/reiser4/vfs_ops.h linux-2.6.22/fs/reiser4/vfs_ops.h
76811 --- linux-2.6.22.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
76812 +++ linux-2.6.22/fs/reiser4/vfs_ops.h 2007-07-29 00:25:35.044739961 +0400
76813 @@ -0,0 +1,53 @@
76814 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76815 + * reiser4/README */
76816 +
76817 +/* vfs_ops.c's exported symbols */
76818 +
76819 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
76820 +#define __FS_REISER4_VFS_OPS_H__
76821 +
76822 +#include "forward.h"
76823 +#include "coord.h"
76824 +#include "seal.h"
76825 +#include "plugin/file/file.h"
76826 +#include "super.h"
76827 +#include "readahead.h"
76828 +
76829 +#include <linux/types.h> /* for loff_t */
76830 +#include <linux/fs.h> /* for struct address_space */
76831 +#include <linux/dcache.h> /* for struct dentry */
76832 +#include <linux/mm.h>
76833 +#include <linux/backing-dev.h>
76834 +
76835 +/* address space operations */
76836 +int reiser4_writepage(struct page *, struct writeback_control *);
76837 +int reiser4_set_page_dirty(struct page *);
76838 +void reiser4_invalidatepage(struct page *, unsigned long offset);
76839 +int reiser4_releasepage(struct page *, gfp_t);
76840 +
76841 +extern int reiser4_update_sd(struct inode *);
76842 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
76843 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
76844 +
76845 +extern int reiser4_start_up_io(struct page *page);
76846 +extern void reiser4_throttle_write(struct inode *);
76847 +extern int jnode_is_releasable(jnode *);
76848 +
76849 +#define CAPTURE_APAGE_BURST (1024l)
76850 +void reiser4_writeout(struct super_block *, struct writeback_control *);
76851 +
76852 +extern void reiser4_handle_error(void);
76853 +
76854 +/* __FS_REISER4_VFS_OPS_H__ */
76855 +#endif
76856 +
76857 +/* Make Linus happy.
76858 + Local variables:
76859 + c-indentation-style: "K&R"
76860 + mode-name: "LC"
76861 + c-basic-offset: 8
76862 + tab-width: 8
76863 + fill-column: 120
76864 + scroll-step: 1
76865 + End:
76866 +*/
76867 diff -urN linux-2.6.22.orig/fs/reiser4/wander.c linux-2.6.22/fs/reiser4/wander.c
76868 --- linux-2.6.22.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
76869 +++ linux-2.6.22/fs/reiser4/wander.c 2007-07-29 00:25:35.048740996 +0400
76870 @@ -0,0 +1,1797 @@
76871 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76872 + * reiser4/README */
76873 +
76874 +/* Reiser4 Wandering Log */
76875 +
76876 +/* You should read http://www.namesys.com/txn-doc.html
76877 +
76878 + That describes how filesystem operations are performed as atomic
76879 + transactions, and how we try to arrange it so that we can write most of the
76880 + data only once while performing the operation atomically.
76881 +
76882 + For the purposes of this code, it is enough to understand that it is
76883 + told whether a given block should be written once or twice (if twice,
76884 + then once to the wandered location and once to the real location).
76885 +
76886 + This code guarantees that those blocks that are defined to be part of an
76887 + atom either all take effect or none of them take effect.
76888 +
76889 + Relocate set nodes are submitted to write by the jnode_flush() routine, and
76890 + the overwrite set is submitted by reiser4_write_log(). This is because with
76891 + the overwrite set we seek to optimize writes, and with the relocate set we
76892 + seek to cause disk order to correlate with the parent first pre-order.
76893 +
76894 + reiser4_write_log() allocates and writes wandered blocks and maintains
76895 + additional on-disk structures of the atom as wander records (each wander
76896 + record occupies one block) for storing of the "wandered map" (a table which
76897 + contains a relation between wandered and real block numbers) and other
76898 + information which might be needed at transaction recovery time.
76899 +
76900 + The wander records are unidirectionally linked into a circle: each wander
76901 + record contains a block number of the next wander record, the last wander
76902 + record points to the first one.
76903 +
76904 + One wander record (named "tx head" in this file) has a format which is
76905 + different from the other wander records. The "tx head" has a reference to
76906 + the "tx head" block of the previously committed atom. Also, the "tx head"
76907 + contains fs information (the free blocks counter and the oid allocator
76908 + state) which is logged in a special way.
76909 +
76910 + There are two journal control blocks, named journal header and journal
76911 + footer which have fixed on-disk locations. The journal header has a
76912 + reference to the "tx head" block of the last committed atom. The journal
76913 + footer points to the "tx head" of the last flushed atom. The atom is
76914 + "played" when all blocks from its overwrite set are written to disk the
76915 + second time (i.e. written to their real locations).
76916 +
76917 + NOTE: People who know reiserfs internals and its journal structure might be
76918 + confused by the terms journal footer and journal header. The table below
76919 + maps terms of similar semantics in reiserfs (reiser3) and reiser4:
76920 +
76921 + REISER3 TERM | REISER4 TERM | DESCRIPTION
76922 + --------------------+-----------------------+----------------------------
76923 + commit record | journal header | atomic write of this record
76924 + | | ends transaction commit
76925 + --------------------+-----------------------+----------------------------
76926 + journal header | journal footer | atomic write of this record
76927 + | | ends post-commit writes.
76928 + | | After this record is
76929 + | | successfully written, the
76930 + | | journal blocks (in reiser3)
76931 + | | or wandered blocks/records
76932 + | | are free for re-use.
76933 + --------------------+-----------------------+----------------------------
76934 +
76935 + The atom commit process is the following:
76936 +
76937 + 1. The overwrite set is taken from atom's clean list, and its size is
76938 + counted.
76939 +
76940 + 2. The number of necessary wander records (including tx head) is calculated,
76941 + and the wander record blocks are allocated.
76942 +
76943 + 3. Allocate wandered blocks and populate the wander records with the wandered map.
76944 +
76945 + 4. Submit write requests for the wander records and wandered blocks.
76946 +
76947 + 5. Wait until the submitted write requests complete.
76948 +
76949 + 6. Update the journal header: change the pointer to the block number of the
76950 + just written tx head, submit an i/o for the modified journal header block
76951 + and wait for i/o completion.
76952 +
76953 + NOTE: The special logging for bitmap blocks and some reiser4 super block
76954 + fields makes processes of atom commit, flush and recovering a bit more
76955 + complex (see comments in the source code for details).
76956 +
76957 + The atom playing process is the following:
76958 +
76959 + 1. Write atom's overwrite set in-place.
76960 +
76961 + 2. Wait on i/o.
76962 +
76963 + 3. Update the journal footer: change the pointer to the block number of
76964 + the tx head block of the atom we are currently flushing, submit an i/o,
76965 + wait on i/o completion.
76966 +
76967 + 4. Free disk space which was used for wandered blocks and wander records.
76968 +
76969 + After the freeing of wandered blocks and wander records, the journal
76970 + footer points to an on-disk structure which might be overwritten soon.
76971 + Neither the log writer nor the journal recovery procedure uses that pointer
76972 + for accessing the data. When the journal recovery procedure looks for the
76973 + oldest transaction, it compares the journal footer pointer value with the
76974 + "prev_tx" pointer value in a tx head; if the values are equal, the oldest
76975 + not-yet-flushed transaction has been found.
76976 +
76977 + NOTE on disk space leakage: the information about which blocks, and how
76978 + many, are allocated for wandered blocks and wander records is not written
76979 + to the disk, because of the special logging for bitmaps and some super
76980 + block counters. After a system crash reiser4 does not remember those
76981 + allocations, thus there is no disk space leakage of this kind.
76982 +*/
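The prev_tx walk described above can be pictured with toy types; read_tx below is a hypothetical accessor standing in for loading a tx head block from disk:

	struct toy_tx_head {
		__u64 prev_tx;	/* stands in for tx_header's prev_tx field */
	};

	static __u64 find_oldest_unflushed(
		struct toy_tx_head *(*read_tx)(__u64 block),
		__u64 last_committed_tx,	/* from the journal header */
		__u64 last_flushed_tx)		/* from the journal footer */
	{
		__u64 cur = last_committed_tx;

		/* follow the chain backwards until the footer pointer matches */
		while (read_tx(cur)->prev_tx != last_flushed_tx)
			cur = read_tx(cur)->prev_tx;
		return cur;	/* oldest transaction not yet written in place */
	}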
76983 +
76984 +/* Special logging of reiser4 super block fields. */
76985 +
76986 +/* Some reiser4 super block fields (the free block count and the OID allocator
76987 + state: number of files and next free OID) are logged separately from the
76988 + super block to avoid unnecessary atom fusion.
76989 +
76990 + So, the reiser4 super block need not be captured by a transaction that
76991 + allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
76992 + the reiser4 on-disk super block is not touched when such a transaction is
76993 + committed and flushed. Those "counters logged specially" are logged in "tx
76994 + head" blocks and in the journal footer block.
76995 +
76996 + A step-by-step description of special logging:
76997 +
76998 + 0. The per-atom information about deleted or created files and allocated or
76999 + freed blocks is collected during the transaction. The atom's
77000 + ->nr_objects_created and ->nr_objects_deleted are for object
77001 + deletion/creation tracking; the numbers of allocated and freed blocks are
77002 + calculated using the atom's delete set and capture list -- all new and
77003 + relocated nodes should be on atom's clean list and should have JNODE_RELOC
77004 + bit set.
77005 +
77006 + 1. The "logged specially" reiser4 super block fields have their "committed"
77007 + versions in the reiser4 in-memory super block. They get modified only at
77008 + atom commit time. The atom's commit thread has exclusive access to those
77009 + "committed" fields because the log writer implementation supports only one
77010 + atom commit at a time (there is a per-fs "commit" mutex). At
77011 + that time the "committed" counters are modified using per-atom information
77012 + collected during the transaction. These counters are stored on disk as a
77013 + part of tx head block when atom is committed.
77014 +
77015 + 2. When the atom is flushed the value of the free block counter and the OID
77016 + allocator state get written to the journal footer block. A special journal
77017 + procedure (journal_recover_sb_data()) takes those values from the journal
77018 + footer and updates the reiser4 in-memory super block.
77019 +
77020 + NOTE: That means free block count and OID allocator state are logged
77021 + separately from the reiser4 super block regardless of the fact that the
77022 + reiser4 super block has fields to store both the free block counter and the
77023 + OID allocator.
77024 +
77025 + Writing the whole super block at commit time requires knowing true values of
77026 + all its fields without the changes made by not-yet-committed transactions. It
77027 + would be possible by keeping a "committed" version of the super block, like
77028 + the reiser4 bitmap blocks have "committed" and "working" versions. However,
77029 + another scheme was implemented, which stores the specially logged values in
77030 + the unused free space inside the transaction head block. In my opinion it has
77031 + the advantage of not writing the whole super block when only part of it was
77032 + modified. */
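Step 2 above can be sketched with toy structures that mirror the counters format_journal_footer() stores below (the real journal_recover_sb_data() lives elsewhere in reiser4; this only illustrates its effect):

	struct toy_footer { __u64 free_blocks; __u64 nr_files; __u64 next_oid; };
	struct toy_sb { __u64 free_blocks; __u64 nr_files; __u64 next_oid; };

	static void toy_recover_sb_data(const struct toy_footer *f,
					struct toy_sb *sb)
	{
		sb->free_blocks = f->free_blocks; /* committed free block count */
		sb->nr_files = f->nr_files;	  /* committed file count */
		sb->next_oid = f->next_oid;	  /* committed OID allocator state */
	}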
77033 +
77034 +#include "debug.h"
77035 +#include "dformat.h"
77036 +#include "txnmgr.h"
77037 +#include "jnode.h"
77038 +#include "znode.h"
77039 +#include "block_alloc.h"
77040 +#include "page_cache.h"
77041 +#include "wander.h"
77042 +#include "reiser4.h"
77043 +#include "super.h"
77044 +#include "vfs_ops.h"
77045 +#include "writeout.h"
77046 +#include "inode.h"
77047 +#include "entd.h"
77048 +
77049 +#include <linux/types.h>
77050 +#include <linux/fs.h> /* for struct super_block */
77051 +#include <linux/mm.h> /* for struct page */
77052 +#include <linux/pagemap.h>
77053 +#include <linux/bio.h> /* for struct bio */
77054 +#include <linux/blkdev.h>
77055 +
77056 +static int write_jnodes_to_disk_extent(
77057 + jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77058 +
77059 +/* The commit_handle is a container for objects needed at atom commit time */
77060 +struct commit_handle {
77061 + /* A pointer to atom's list of OVRWR nodes */
77062 + struct list_head *overwrite_set;
77063 + /* atom's overwrite set size */
77064 + int overwrite_set_size;
77065 + /* jnodes for wander record blocks */
77066 + struct list_head tx_list;
77067 + /* number of wander records */
77068 + __u32 tx_size;
77069 + /* 'committed' sb counters are saved here until atom is completely
77070 + flushed */
77071 + __u64 free_blocks;
77072 + __u64 nr_files;
77073 + __u64 next_oid;
77074 + /* A pointer to the atom which is being committed */
77075 + txn_atom *atom;
77076 + /* A pointer to current super block */
77077 + struct super_block *super;
77078 + /* The counter of modified bitmaps */
77079 + reiser4_block_nr nr_bitmap;
77080 +};
77081 +
77082 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77083 +{
77084 + memset(ch, 0, sizeof(struct commit_handle));
77085 + INIT_LIST_HEAD(&ch->tx_list);
77086 +
77087 + ch->atom = atom;
77088 + ch->super = reiser4_get_current_sb();
77089 +}
77090 +
77091 +static void done_commit_handle(struct commit_handle *ch)
77092 +{
77093 + assert("zam-690", list_empty(&ch->tx_list));
77094 +}
77095 +
77096 +static inline int reiser4_use_write_barrier(struct super_block * s)
77097 +{
77098 + return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77099 +}
77100 +
77101 +static void disable_write_barrier(struct super_block * s)
77102 +{
77103 + notice("zam-1055", "%s does not support write barriers,"
77104 + " using synchronous write instead.", s->s_id);
77105 + set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77106 +}
77107 +
77108 +/* fill journal header block data */
77109 +static void format_journal_header(struct commit_handle *ch)
77110 +{
77111 + struct reiser4_super_info_data *sbinfo;
77112 + struct journal_header *header;
77113 + jnode *txhead;
77114 +
77115 + sbinfo = get_super_private(ch->super);
77116 + assert("zam-479", sbinfo != NULL);
77117 + assert("zam-480", sbinfo->journal_header != NULL);
77118 +
77119 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77120 +
77121 + jload(sbinfo->journal_header);
77122 +
77123 + header = (struct journal_header *)jdata(sbinfo->journal_header);
77124 + assert("zam-484", header != NULL);
77125 +
77126 + put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77127 + &header->last_committed_tx);
77128 +
77129 + jrelse(sbinfo->journal_header);
77130 +}
77131 +
77132 +/* fill journal footer block data */
77133 +static void format_journal_footer(struct commit_handle *ch)
77134 +{
77135 + struct reiser4_super_info_data *sbinfo;
77136 + struct journal_footer *footer;
77137 + jnode *tx_head;
77138 +
77139 + sbinfo = get_super_private(ch->super);
77140 +
77141 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77142 +
77143 + assert("zam-493", sbinfo != NULL);
77144 + assert("zam-494", sbinfo->journal_header != NULL);
77145 +
77146 + check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77147 +
77148 + footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77149 + assert("zam-495", footer != NULL);
77150 +
77151 + put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77152 + &footer->last_flushed_tx);
77153 + put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77154 +
77155 + put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77156 + put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77157 +
77158 + jrelse(sbinfo->journal_footer);
77159 +}
77160 +
77161 +/* wander record capacity depends on current block size */
77162 +static int wander_record_capacity(const struct super_block *super)
77163 +{
77164 + return (super->s_blocksize -
77165 + sizeof(struct wander_record_header)) /
77166 + sizeof(struct wander_entry);
77167 +}
77168 +
77169 +/* Fill the first wander record (tx head) in accordance with the supplied data */
77170 +static void format_tx_head(struct commit_handle *ch)
77171 +{
77172 + jnode *tx_head;
77173 + jnode *next;
77174 + struct tx_header *header;
77175 +
77176 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77177 + assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77178 +
77179 + next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77180 + if (&ch->tx_list == &next->capture_link)
77181 + next = tx_head;
77182 +
77183 + header = (struct tx_header *)jdata(tx_head);
77184 +
77185 + assert("zam-460", header != NULL);
77186 + assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77187 +
77188 + memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77189 + memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77190 +
77191 + put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77192 + put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77193 + &header->prev_tx);
77194 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77195 + put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77196 + put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77197 + put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77198 +}
77199 +
77200 +/* prepare ordinary wander record block (fill all service fields) */
77201 +static void
77202 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77203 +{
77204 + struct wander_record_header *LRH;
77205 + jnode *next;
77206 +
77207 + assert("zam-464", node != NULL);
77208 +
77209 + LRH = (struct wander_record_header *)jdata(node);
77210 + next = list_entry(node->capture_link.next, jnode, capture_link);
77211 +
77212 + if (&ch->tx_list == &next->capture_link)
77213 + next = list_entry(ch->tx_list.next, jnode, capture_link);
77214 +
77215 + assert("zam-465", LRH != NULL);
77216 + assert("zam-463",
77217 + ch->super->s_blocksize > sizeof(struct wander_record_header));
77218 +
77219 + memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77220 + memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77221 +
77222 + put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77223 + put_unaligned(cpu_to_le32(serial), &LRH->serial);
77224 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77225 +}
77226 +
77227 +/* add one wandered map entry to formatted wander record */
77228 +static void
77229 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
77230 + const reiser4_block_nr * b)
77231 +{
77232 + char *data;
77233 + struct wander_entry *pairs;
77234 +
77235 + data = jdata(node);
77236 + assert("zam-451", data != NULL);
77237 +
77238 + pairs =
77239 + (struct wander_entry *)(data + sizeof(struct wander_record_header));
77240 +
77241 + put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77242 + put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77243 +}
77244 +
77245 +/* Currently, wander records contain only the wandered map, whose size depends
77246 + on the overwrite set size */
77247 +static void get_tx_size(struct commit_handle *ch)
77248 +{
77249 + assert("zam-440", ch->overwrite_set_size != 0);
77250 + assert("zam-695", ch->tx_size == 0);
77251 +
77252 + /* count all ordinary wander records
77253 + (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77254 + for tx head block */
77255 + ch->tx_size =
77256 + (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77257 + 2;
77258 +}
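A worked example of this formula, assuming a 4096-byte block and, purely for illustration, a 32-byte wander_record_header and 16-byte wander_entry (the real sizes are defined in wander.h):

	capacity = (4096 - 32) / 16       = 254 entries per wander record
	ordinary = (1000 - 1) / 254 + 1   = 4 records for a 1000-block overwrite set
	tx_size  = ordinary + 1 (tx head) = 5, which is the "+ 2" in get_tx_size()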
77259 +
77260 +/* A special structure for using in store_wmap_actor() for saving its state
77261 + between calls */
77262 +struct store_wmap_params {
77263 + jnode *cur; /* jnode of current wander record to fill */
77264 + int idx; /* free element index in wander record */
77265 + int capacity; /* capacity */
77266 +
77267 +#if REISER4_DEBUG
77268 + struct list_head *tx_list;
77269 +#endif
77270 +};
77271 +
77272 +/* an actor for use in blocknr_set_iterator routine which populates the list
77273 + of pre-formatted wander records by wandered map info */
77274 +static int
77275 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77276 + const reiser4_block_nr * b, void *data)
77277 +{
77278 + struct store_wmap_params *params = data;
77279 +
77280 + if (params->idx >= params->capacity) {
77281 + /* a new wander record should be taken from the tx_list */
77282 + params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77283 + assert("zam-454",
77284 + params->tx_list != &params->cur->capture_link);
77285 +
77286 + params->idx = 0;
77287 + }
77288 +
77289 + store_entry(params->cur, params->idx, a, b);
77290 + params->idx++;
77291 +
77292 + return 0;
77293 +}
77294 +
77295 +/* This function is called after the relocate set is written to disk, the
77296 + overwrite set is written to the wandered locations and all wander records
77297 + are written as well. The updated journal header block contains a pointer
77298 + (block number) to the first wander record of the just-written transaction */
77299 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
77300 +{
77301 + struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77302 + jnode *jh = sbinfo->journal_header;
77303 + jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77304 + int ret;
77305 +
77306 + format_journal_header(ch);
77307 +
77308 + ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77309 + use_barrier ? WRITEOUT_BARRIER : 0);
77310 + if (ret)
77311 + return ret;
77312 +
77313 + // blk_run_address_space(sbinfo->fake->i_mapping);
77314 + /*blk_run_queues(); */
77315 +
77316 + ret = jwait_io(jh, WRITE);
77317 +
77318 + if (ret)
77319 + return ret;
77320 +
77321 + sbinfo->last_committed_tx = *jnode_get_block(head);
77322 +
77323 + return 0;
77324 +}
77325 +
77326 +/* This function is called after write-back is finished. We update the journal
77327 + footer block and free the blocks which were occupied by wandered blocks and
77328 + the transaction's wander records */
77329 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77330 +{
77331 + reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77332 +
77333 + jnode *jf = sbinfo->journal_footer;
77334 +
77335 + int ret;
77336 +
77337 + format_journal_footer(ch);
77338 +
77339 + ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77340 + use_barrier ? WRITEOUT_BARRIER : 0);
77341 + if (ret)
77342 + return ret;
77343 +
77344 + // blk_run_address_space(sbinfo->fake->i_mapping);
77345 + /*blk_run_queue(); */
77346 +
77347 + ret = jwait_io(jf, WRITE);
77348 + if (ret)
77349 + return ret;
77350 +
77351 + return 0;
77352 +}
77353 +
77354 +/* free the block numbers of the wander records of a transaction already written in place */
77355 +static void dealloc_tx_list(struct commit_handle *ch)
77356 +{
77357 + while (!list_empty(&ch->tx_list)) {
77358 + jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77359 + list_del(&cur->capture_link);
77360 + ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77361 + reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77362 + BA_FORMATTED);
77363 +
77364 + unpin_jnode_data(cur);
77365 + reiser4_drop_io_head(cur);
77366 + }
77367 +}
77368 +
77369 +/* An actor for use in the blocknr_set_iterator() routine which frees wandered
77370 + blocks from the atom's overwrite set. */
77371 +static int
77372 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
77373 + const reiser4_block_nr * a UNUSED_ARG,
77374 + const reiser4_block_nr * b, void *data UNUSED_ARG)
77375 +{
77376 +
77377 + assert("zam-499", b != NULL);
77378 + assert("zam-500", *b != 0);
77379 + assert("zam-501", !reiser4_blocknr_is_fake(b));
77380 +
77381 + reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
77382 + return 0;
77383 +}
77384 +
77385 +/* free the wandered block locations of a transaction already written in place */
77386 +static void dealloc_wmap(struct commit_handle *ch)
77387 +{
77388 + assert("zam-696", ch->atom != NULL);
77389 +
77390 + blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
77391 + dealloc_wmap_actor, NULL, 1);
77392 +}
77393 +
77394 +/* helper function for wandered block allocation: refills the set of block
77395 + numbers needed for wandered blocks */
77396 +static int
77397 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
77398 +{
77399 + reiser4_blocknr_hint hint;
77400 + int ret;
77401 +
77402 + reiser4_block_nr wide_len = count;
77403 +
77404 + /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
77405 + ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
77406 + reserved allocation area so as to get the best qualities of fixed
77407 + journals? */
77408 + reiser4_blocknr_hint_init(&hint);
77409 + hint.block_stage = BLOCK_GRABBED;
77410 +
77411 + ret = reiser4_alloc_blocks(&hint, start, &wide_len,
77412 + BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
77413 + *len = (int)wide_len;
77414 +
77415 + return ret;
77416 +}
77417 +
77418 +/*
77419 + * roll back changes made before issuing BIO in the case of IO error.
77420 + */
77421 +static void undo_bio(struct bio *bio)
77422 +{
77423 + int i;
77424 +
77425 + for (i = 0; i < bio->bi_vcnt; ++i) {
77426 + struct page *pg;
77427 + jnode *node;
77428 +
77429 + pg = bio->bi_io_vec[i].bv_page;
77430 + end_page_writeback(pg);
77431 + node = jprivate(pg);
77432 + spin_lock_jnode(node);
77433 + JF_CLR(node, JNODE_WRITEBACK);
77434 + JF_SET(node, JNODE_DIRTY);
77435 + spin_unlock_jnode(node);
77436 + }
77437 + bio_put(bio);
77438 +}
77439 +
77440 +/* put overwrite set back to atom's clean list */
77441 +static void put_overwrite_set(struct commit_handle *ch)
77442 +{
77443 + jnode *cur;
77444 +
77445 + list_for_each_entry(cur, ch->overwrite_set, capture_link)
77446 + jrelse_tail(cur);
77447 +}
77448 +
77449 +/* Count the overwrite set size and grab disk space for wandered block
77450 + allocation. Since we have a separate list for the atom's overwrite set we
77451 + just scan that list, counting bitmap and other non-leaf nodes whose wandered
77452 + block allocation we have to grab space for. */
77453 +static int get_overwrite_set(struct commit_handle *ch)
77454 +{
77455 + int ret;
77456 + jnode *cur;
77457 + __u64 nr_not_leaves = 0;
77458 +#if REISER4_DEBUG
77459 + __u64 nr_formatted_leaves = 0;
77460 + __u64 nr_unformatted_leaves = 0;
77461 +#endif
77462 +
77463 + assert("zam-697", ch->overwrite_set_size == 0);
77464 +
77465 + ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
77466 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77467 +
77468 + while (ch->overwrite_set != &cur->capture_link) {
77469 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
77470 +
77471 + /* Count bitmap nodes to get correct statistics on the number
77472 + * of blocks cleared by the transaction commit. */
77473 + if (jnode_get_type(cur) == JNODE_BITMAP)
77474 + ch->nr_bitmap++;
77475 +
77476 + assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
77477 + || jnode_get_type(cur) == JNODE_BITMAP);
77478 +
77479 + if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
77480 + /* we replace fake znode by another (real)
77481 + znode which is suggested by disk_layout
77482 + plugin */
77483 +
77484 + /* FIXME: it looks like fake znode should be
77485 + replaced by jnode supplied by
77486 + disk_layout. */
77487 +
77488 + struct super_block *s = reiser4_get_current_sb();
77489 + reiser4_super_info_data *sbinfo =
77490 + get_current_super_private();
77491 +
77492 + if (sbinfo->df_plug->log_super) {
77493 + jnode *sj = sbinfo->df_plug->log_super(s);
77494 +
77495 + assert("zam-593", sj != NULL);
77496 +
77497 + if (IS_ERR(sj))
77498 + return PTR_ERR(sj);
77499 +
77500 + spin_lock_jnode(sj);
77501 + JF_SET(sj, JNODE_OVRWR);
77502 + insert_into_atom_ovrwr_list(ch->atom, sj);
77503 + spin_unlock_jnode(sj);
77504 +
77505 + /* jload it like the rest of the overwrite set */
77506 + jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
77507 +
77508 + ch->overwrite_set_size++;
77509 + }
77510 + spin_lock_jnode(cur);
77511 + reiser4_uncapture_block(cur);
77512 + jput(cur);
77513 +
77514 + } else {
77515 + int ret;
77516 + ch->overwrite_set_size++;
77517 + ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
77518 + if (ret)
77519 + reiser4_panic("zam-783",
77520 + "cannot load e-flushed jnode back (ret = %d)\n",
77521 + ret);
77522 + }
77523 +
77524 + /* Count non-leaves here because we have to grab disk space
77525 + * for wandered blocks. They were not counted as "flush
77526 + * reserved". Counting should be done _after_ nodes are pinned
77527 + * into memory by jload(). */
77528 + if (!jnode_is_leaf(cur))
77529 + nr_not_leaves++;
77530 + else {
77531 +#if REISER4_DEBUG
77532 + /* at this point @cur either has JNODE_FLUSH_RESERVED
77533 + * or is eflushed. Locking is not strong enough to
77534 + * write an assertion checking for this. */
77535 + if (jnode_is_znode(cur))
77536 + nr_formatted_leaves++;
77537 + else
77538 + nr_unformatted_leaves++;
77539 +#endif
77540 + JF_CLR(cur, JNODE_FLUSH_RESERVED);
77541 + }
77542 +
77543 + cur = next;
77544 + }
77545 +
77546 + /* Grab space for writing (wandered blocks of) the non-leaves found
77547 + * in the overwrite set. */
77548 + ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
77549 + if (ret)
77550 + return ret;
77551 +
77552 + /* Disk space for allocation of wandered blocks of leaf nodes is
77553 + * already reserved as "flush reserved"; move it to the grabbed space counter. */
77554 + spin_lock_atom(ch->atom);
77555 + assert("zam-940",
77556 + nr_formatted_leaves + nr_unformatted_leaves <=
77557 + ch->atom->flush_reserved);
77558 + flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
77559 + spin_unlock_atom(ch->atom);
77560 +
77561 + return ch->overwrite_set_size;
77562 +}
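+
+/* Editorial note (summary of the function above, not part of the original
+   patch): two reservation sources meet here -- space for wandered copies of
+   non-leaves is grabbed via reiser4_grab_space_force(), while leaves were
+   already covered by the atom's "flush reserved" counter, which
+   flush_reserved2grabbed() converts into grabbed space. */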
77563 +
77564 +/**
77565 + * write_jnodes_to_disk_extent - submit write request
77566 + * @head:
77567 + * @first: first jnode of the list
77568 + * @nr: number of jnodes on the list
77569 + * @block_p:
77570 + * @fq:
77571 + * @flags: used to decide whether page is to get PG_reclaim flag
77572 + *
77573 + * Submits a write request for @nr jnodes beginning from @first; the other
77574 + * jnodes follow @first on the doubly linked "capture" list. All jnodes
77575 + * will be written to the disk region of @nr blocks starting at block number
77576 + * @block_p. If @fq is not NULL it means that waiting for i/o completion
77577 + * will be done more efficiently by using flush_queue_t objects.
77578 + * This function is the one which writes a list of jnodes in batch mode. It
77579 + * does all the low-level work such as bio construction and page state manipulation.
77580 + *
77581 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
77582 + * aggregated in this function instead of being left to the layers below
77583 + *
77584 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
77585 + * Why that layer needed? Why BIOs cannot be constructed here?
77586 + */
77587 +static int write_jnodes_to_disk_extent(
77588 + jnode *first, int nr, const reiser4_block_nr *block_p,
77589 + flush_queue_t *fq, int flags)
77590 +{
77591 + struct super_block *super = reiser4_get_current_sb();
77592 + int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
77593 + int max_blocks;
77594 + jnode *cur = first;
77595 + reiser4_block_nr block;
77596 +
77597 + assert("zam-571", first != NULL);
77598 + assert("zam-572", block_p != NULL);
77599 + assert("zam-570", nr > 0);
77600 +
77601 + block = *block_p;
77602 + max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
77603 +
77604 + while (nr > 0) {
77605 + struct bio *bio;
77606 + int nr_blocks = min(nr, max_blocks);
77607 + int i;
77608 + int nr_used;
77609 +
77610 + bio = bio_alloc(GFP_NOIO, nr_blocks);
77611 + if (!bio)
77612 + return RETERR(-ENOMEM);
77613 +
77614 + bio->bi_bdev = super->s_bdev;
77615 + bio->bi_sector = block * (super->s_blocksize >> 9);
77616 + for (nr_used = 0, i = 0; i < nr_blocks; i++) {
77617 + struct page *pg;
77618 +
77619 + pg = jnode_page(cur);
77620 + assert("zam-573", pg != NULL);
77621 +
77622 + page_cache_get(pg);
77623 +
77624 + lock_and_wait_page_writeback(pg);
77625 +
77626 + if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
77627 + /*
77628 + * underlying device is satiated. Stop adding
77629 + * pages to the bio.
77630 + */
77631 + unlock_page(pg);
77632 + page_cache_release(pg);
77633 + break;
77634 + }
77635 +
77636 + spin_lock_jnode(cur);
77637 + assert("nikita-3166",
77638 + pg->mapping == jnode_get_mapping(cur));
77639 + assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
77640 +#if REISER4_DEBUG
77641 + spin_lock(&cur->load);
77642 + assert("nikita-3165", !jnode_is_releasable(cur));
77643 + spin_unlock(&cur->load);
77644 +#endif
77645 + JF_SET(cur, JNODE_WRITEBACK);
77646 + JF_CLR(cur, JNODE_DIRTY);
77647 + ON_DEBUG(cur->written++);
77648 + spin_unlock_jnode(cur);
77649 +
77650 + ClearPageError(pg);
77651 + set_page_writeback(pg);
77652 +
77653 + if (get_current_context()->entd) {
77654 + /* this is ent thread */
77655 + entd_context *ent = get_entd_context(super);
77656 + struct wbq *rq, *next;
77657 +
77658 + spin_lock(&ent->guard);
77659 +
77660 + if (pg == ent->cur_request->page) {
77661 + /*
77662 + * entd is called for this page. This
77663 + * request is not in the todo list
77664 + */
77665 + ent->cur_request->written = 1;
77666 + } else {
77667 + /*
77668 + * if we have written a page for which writepage
77669 + * was called - move the request to another list.
77670 + */
77671 + list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
77672 + assert("", rq->magic == WBQ_MAGIC);
77673 + if (pg == rq->page) {
77674 + /*
77675 + * remove request from
77676 + * entd's queue, but do
77677 + * not wake up a thread
77678 + * which put this
77679 + * request
77680 + */
77681 + list_del_init(&rq->link);
77682 + ent->nr_todo_reqs --;
77683 + list_add_tail(&rq->link, &ent->done_list);
77684 + ent->nr_done_reqs ++;
77685 + rq->written = 1;
77686 + break;
77687 + }
77688 + }
77689 + }
77690 + spin_unlock(&ent->guard);
77691 + }
77692 +
77693 + clear_page_dirty_for_io(pg);
77694 +
77695 + unlock_page(pg);
77696 +
77697 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77698 + nr_used++;
77699 + }
77700 + if (nr_used > 0) {
77701 + assert("nikita-3453",
77702 + bio->bi_size == super->s_blocksize * nr_used);
77703 + assert("nikita-3454", bio->bi_vcnt == nr_used);
77704 +
77705 + /* Check if we are allowed to write at all */
77706 + if (super->s_flags & MS_RDONLY)
77707 + undo_bio(bio);
77708 + else {
77709 + int not_supported;
77710 +
77711 + add_fq_to_bio(fq, bio);
77712 + bio_get(bio);
77713 + reiser4_submit_bio(write_op, bio);
77714 + not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
77715 + bio_put(bio);
77716 + if (not_supported)
77717 + return -EOPNOTSUPP;
77718 + }
77719 +
77720 + block += nr_used - 1;
77721 + update_blocknr_hint_default(super, &block);
77722 + block += 1;
77723 + } else {
77724 + bio_put(bio);
77725 + }
77726 + nr -= nr_used;
77727 + }
77728 +
77729 + return 0;
77730 +}
77731 +
77732 +/* This is a procedure which recovers contiguous sequences of disk block
77733 + numbers in the given list of j-nodes and submits write requests on a
77734 + per-sequence basis */
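+/* Illustrative example (editorial, not part of the original patch): if the
+   jnodes on the capture list map to blocks {100, 101, 102, 200, 201}, the
+   loop below issues two calls to write_jnodes_to_disk_extent(): one for the
+   3-block extent at block 100 and one for the 2-block extent at 200. */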
77735 +int
77736 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
77737 + long *nr_submitted, int flags)
77738 +{
77739 + int ret;
77740 + jnode *beg = list_entry(head->next, jnode, capture_link);
77741 +
77742 + while (head != &beg->capture_link) {
77743 + int nr = 1;
77744 + jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
77745 +
77746 + while (head != &cur->capture_link) {
77747 + if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
77748 + break;
77749 + ++nr;
77750 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77751 + }
77752 +
77753 + ret = write_jnodes_to_disk_extent(
77754 + beg, nr, jnode_get_block(beg), fq, flags);
77755 + if (ret)
77756 + return ret;
77757 +
77758 + if (nr_submitted)
77759 + *nr_submitted += nr;
77760 +
77761 + beg = cur;
77762 + }
77763 +
77764 + return 0;
77765 +}
77766 +
77767 +/* add given wandered mapping to atom's wandered map */
77768 +static int
77769 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
77770 +{
77771 + int ret;
77772 + blocknr_set_entry *new_bsep = NULL;
77773 + reiser4_block_nr block;
77774 +
77775 + txn_atom *atom;
77776 +
77777 + assert("zam-568", block_p != NULL);
77778 + block = *block_p;
77779 + assert("zam-569", len > 0);
77780 +
77781 + while ((len--) > 0) {
77782 + do {
77783 + atom = get_current_atom_locked();
77784 + assert("zam-536",
77785 + !reiser4_blocknr_is_fake(jnode_get_block(cur)));
77786 + ret =
77787 + blocknr_set_add_pair(atom, &atom->wandered_map,
77788 + &new_bsep,
77789 + jnode_get_block(cur), &block);
77790 + } while (ret == -E_REPEAT);
77791 +
77792 + if (ret) {
77793 + /* deallocate blocks which were not added to wandered
77794 + map */
77795 + reiser4_block_nr wide_len = len;
77796 +
77797 + reiser4_dealloc_blocks(&block, &wide_len,
77798 + BLOCK_NOT_COUNTED,
77799 + BA_FORMATTED
77800 + /* formatted, without defer */ );
77801 +
77802 + return ret;
77803 + }
77804 +
77805 + spin_unlock_atom(atom);
77806 +
77807 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77808 + ++block;
77809 + }
77810 +
77811 + return 0;
77812 +}
77813 +
77814 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and
77815 + immediately submit IO for them. We assume the current atom is in a stage
77816 + when atom fusion is impossible, so it is safe to leave the atom unlocked. */
77817 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
77818 +{
77819 + reiser4_block_nr block;
77820 +
77821 + int rest;
77822 + int len;
77823 + int ret;
77824 +
77825 + jnode *cur;
77826 +
77827 + assert("zam-534", ch->overwrite_set_size > 0);
77828 +
77829 + rest = ch->overwrite_set_size;
77830 +
77831 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77832 + while (ch->overwrite_set != &cur->capture_link) {
77833 + assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
77834 +
77835 + ret = get_more_wandered_blocks(rest, &block, &len);
77836 + if (ret)
77837 + return ret;
77838 +
77839 + rest -= len;
77840 +
77841 + ret = add_region_to_wmap(cur, len, &block);
77842 + if (ret)
77843 + return ret;
77844 +
77845 + ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
77846 + if (ret)
77847 + return ret;
77848 +
77849 + while ((len--) > 0) {
77850 + assert("zam-604",
77851 + ch->overwrite_set != &cur->capture_link);
77852 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77853 + }
77854 + }
77855 +
77856 + return 0;
77857 +}
77858 +
77859 +/* allocate the given number of nodes over the journal area, link them into
77860 + ch->tx_list, format them as wander records and submit them for write */
77861 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
77862 +{
77863 + reiser4_blocknr_hint hint;
77864 + reiser4_block_nr allocated = 0;
77865 + reiser4_block_nr first, len;
77866 + jnode *cur;
77867 + jnode *txhead;
77868 + int ret;
77869 + reiser4_context *ctx;
77870 + reiser4_super_info_data *sbinfo;
77871 +
77872 + assert("zam-698", ch->tx_size > 0);
77873 + assert("zam-699", list_empty_careful(&ch->tx_list));
77874 +
77875 + ctx = get_current_context();
77876 + sbinfo = get_super_private(ctx->super);
77877 +
77878 + while (allocated < (unsigned)ch->tx_size) {
77879 + len = (ch->tx_size - allocated);
77880 +
77881 + reiser4_blocknr_hint_init(&hint);
77882 +
77883 + hint.block_stage = BLOCK_GRABBED;
77884 +
77885 + /* FIXME: there should be some block allocation policy for
77886 + nodes which contain wander records */
77887 +
77888 + /* We assume that disk space for wander record blocks can be
77889 + * taken from the reserved area. */
77890 + ret = reiser4_alloc_blocks(&hint, &first, &len,
77891 + BA_FORMATTED | BA_RESERVED |
77892 + BA_USE_DEFAULT_SEARCH_START);
77893 + reiser4_blocknr_hint_done(&hint);
77894 +
77895 + if (ret)
77896 + return ret;
77897 +
77898 + allocated += len;
77899 +
77900 + /* create jnodes for all wander records */
77901 + while (len--) {
77902 + cur = reiser4_alloc_io_head(&first);
77903 +
77904 + if (cur == NULL) {
77905 + ret = RETERR(-ENOMEM);
77906 + goto free_not_assigned;
77907 + }
77908 +
77909 + ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
77910 +
77911 + if (ret != 0) {
77912 + jfree(cur);
77913 + goto free_not_assigned;
77914 + }
77915 +
77916 + pin_jnode_data(cur);
77917 +
77918 + list_add_tail(&cur->capture_link, &ch->tx_list);
77919 +
77920 + first++;
77921 + }
77922 + }
77923 +
77924 + { /* format an on-disk linked list of wander records */
77925 + int serial = 1;
77926 +
77927 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77928 + format_tx_head(ch);
77929 +
77930 + cur = list_entry(txhead->capture_link.next, jnode, capture_link);
77931 + while (&ch->tx_list != &cur->capture_link) {
77932 + format_wander_record(ch, cur, serial++);
77933 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77934 + }
77935 + }
77936 +
77937 + { /* Fill wander records with Wandered Set */
77938 + struct store_wmap_params params;
77939 + txn_atom *atom;
77940 +
77941 + params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
77942 +
77943 + params.idx = 0;
77944 + params.capacity =
77945 + wander_record_capacity(reiser4_get_current_sb());
77946 +
77947 + atom = get_current_atom_locked();
77948 + blocknr_set_iterator(atom, &atom->wandered_map,
77949 + &store_wmap_actor, &params, 0);
77950 + spin_unlock_atom(atom);
77951 + }
77952 +
77953 + { /* jrelse all jnodes from tx_list */
77954 + cur = list_entry(ch->tx_list.next, jnode, capture_link);
77955 + while (&ch->tx_list != &cur->capture_link) {
77956 + jrelse(cur);
77957 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
77958 + }
77959 + }
77960 +
77961 + ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
77962 +
77963 + return ret;
77964 +
77965 + free_not_assigned:
77966 + /* We deallocate blocks not yet assigned to jnodes on tx_list. The
77967 + caller takes care of invalidating the tx list */
77968 + reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
77969 +
77970 + return ret;
77971 +}
77972 +
77973 +static int commit_tx(struct commit_handle *ch)
77974 +{
77975 + flush_queue_t *fq;
77976 + int barrier;
77977 + int ret;
77978 +
77979 + /* Grab more space for wandered records. */
77980 + ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
77981 + if (ret)
77982 + return ret;
77983 +
77984 + fq = get_fq_for_current_atom();
77985 + if (IS_ERR(fq))
77986 + return PTR_ERR(fq);
77987 +
77988 + spin_unlock_atom(fq->atom);
77989 + do {
77990 + ret = alloc_wandered_blocks(ch, fq);
77991 + if (ret)
77992 + break;
77993 + ret = alloc_tx(ch, fq);
77994 + if (ret)
77995 + break;
77996 + } while (0);
77997 +
77998 + reiser4_fq_put(fq);
77999 + if (ret)
78000 + return ret;
78001 + repeat_wo_barrier:
78002 + barrier = reiser4_use_write_barrier(ch->super);
78003 + if (!barrier) {
78004 + ret = current_atom_finish_all_fq();
78005 + if (ret)
78006 + return ret;
78007 + }
78008 + ret = update_journal_header(ch, barrier);
78009 + if (barrier) {
78010 + if (ret) {
78011 + if (ret == -EOPNOTSUPP) {
78012 + disable_write_barrier(ch->super);
78013 + goto repeat_wo_barrier;
78014 + }
78015 + return ret;
78016 + }
78017 + ret = current_atom_finish_all_fq();
78018 + }
78019 + return ret;
78020 +}
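+
+/* Editorial note on the barrier fallback above (not part of the original
+   patch): update_journal_header() is first issued as a barrier write when
+   reiser4_use_write_barrier() says so; if the device rejects it with
+   -EOPNOTSUPP, barriers are disabled for this super block and the write is
+   retried in the ordinary wait-for-all-fq mode. write_tx_back() below
+   follows the same pattern for the journal footer. */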
78021 +
78022 +static int write_tx_back(struct commit_handle * ch)
78023 +{
78024 + flush_queue_t *fq;
78025 + int ret;
78026 + int barrier;
78027 +
78028 + reiser4_post_commit_hook();
78029 + fq = get_fq_for_current_atom();
78030 + if (IS_ERR(fq))
78031 + return PTR_ERR(fq);
78032 + spin_unlock_atom(fq->atom);
78033 + ret = write_jnode_list(
78034 + ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78035 + reiser4_fq_put(fq);
78036 + if (ret)
78037 + return ret;
78038 + repeat_wo_barrier:
78039 + barrier = reiser4_use_write_barrier(ch->super);
78040 + if (!barrier) {
78041 + ret = current_atom_finish_all_fq();
78042 + if (ret)
78043 + return ret;
78044 + }
78045 + ret = update_journal_footer(ch, barrier);
78046 + if (barrier) {
78047 + if (ret) {
78048 + if (ret == -EOPNOTSUPP) {
78049 + disable_write_barrier(ch->super);
78050 + goto repeat_wo_barrier;
78051 + }
78052 + return ret;
78053 + }
78054 + ret = current_atom_finish_all_fq();
78055 + }
78056 + if (ret)
78057 + return ret;
78058 + reiser4_post_write_back_hook();
78059 + return 0;
78060 +}
78061 +
78062 +/* We assume that at this moment all captured blocks are marked as RELOC or
78063 + WANDER (belong to the Relocate or Overwrite set), and all nodes from the
78064 + Relocate set have been submitted for writing.
78065 +*/
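+
+/* A bird's-eye sketch of the commit path below (editorial summary, derived
+   from the code in this file, not part of the original patch):
+
+       reiser4_write_logs()
+           reiser4_pre_commit_hook()
+           get_overwrite_set()     -- count and pin the overwrite set
+           get_tx_size()           -- number of wander records needed
+           commit_tx()
+               alloc_wandered_blocks() -- overwrite set written to wandered
+                                          locations
+               alloc_tx()              -- wander records formatted and written
+               update_journal_header()
+           write_tx_back()
+               write_jnode_list()      -- overwrite set written in place
+               update_journal_footer()
+*/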
78066 +
78067 +int reiser4_write_logs(long *nr_submitted)
78068 +{
78069 + txn_atom *atom;
78070 + struct super_block *super = reiser4_get_current_sb();
78071 + reiser4_super_info_data *sbinfo = get_super_private(super);
78072 + struct commit_handle ch;
78073 + int ret;
78074 +
78075 + writeout_mode_enable();
78076 +
78077 + /* block allocator may add j-nodes to the clean_list */
78078 + ret = reiser4_pre_commit_hook();
78079 + if (ret)
78080 + return ret;
78081 +
78082 + /* No locks are required if we take an atom whose stage is >=
78083 + * ASTAGE_PRE_COMMIT */
78084 + atom = get_current_context()->trans->atom;
78085 + assert("zam-965", atom != NULL);
78086 +
78087 + /* relocate set is on the atom->clean_nodes list after
78088 + * current_atom_complete_writes() finishes. It can be safely
78089 + * uncaptured after commit_mutex is locked, because any atom that
78090 + * captures these nodes is guaranteed to commit after current one.
78091 + *
78092 + * This can only be done after reiser4_pre_commit_hook(), because it is where
78093 + * early flushed jnodes with CREATED bit are transferred to the
78094 + * overwrite list. */
78095 + reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
78096 + spin_lock_atom(atom);
78097 + /* There might be waiters for the relocate nodes which we have
78098 + * released, wake them up. */
78099 + reiser4_atom_send_event(atom);
78100 + spin_unlock_atom(atom);
78101 +
78102 + if (REISER4_DEBUG) {
78103 + int level;
78104 +
78105 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78106 + assert("nikita-3352",
78107 + list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78108 + }
78109 +
78110 + sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78111 + sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78112 +
78113 + init_commit_handle(&ch, atom);
78114 +
78115 + ch.free_blocks = sbinfo->blocks_free_committed;
78116 + ch.nr_files = sbinfo->nr_files_committed;
78117 + /* ZAM-FIXME-HANS: email me what the contention level is for the super
78118 + * lock. */
78119 + ch.next_oid = oid_next(super);
78120 +
78121 + /* count overwrite set and place it in a separate list */
78122 + ret = get_overwrite_set(&ch);
78123 +
78124 + if (ret <= 0) {
78125 + /* It is possible that the overwrite set is empty here, which
78126 + means all captured nodes are clean */
78127 + goto up_and_ret;
78128 + }
78129 +
78130 + /* Inform the caller about the number of dirty pages that will
78131 + * be submitted to disk. */
78132 + *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78133 +
78134 + /* count all records needed for storing the wandered set */
78135 + get_tx_size(&ch);
78136 +
78137 + ret = commit_tx(&ch);
78138 + if (ret)
78139 + goto up_and_ret;
78140 +
78141 + spin_lock_atom(atom);
78142 + reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
78143 + spin_unlock_atom(atom);
78144 +
78145 + ret = write_tx_back(&ch);
78146 + reiser4_post_write_back_hook();
78147 +
78148 + up_and_ret:
78149 + if (ret) {
78150 + /* there could be fqs attached to the current atom; the only
78151 + way to remove them is: */
78152 + current_atom_finish_all_fq();
78153 + }
78154 +
78155 + /* free blocks of flushed transaction */
78156 + dealloc_tx_list(&ch);
78157 + dealloc_wmap(&ch);
78158 +
78159 + put_overwrite_set(&ch);
78160 +
78161 + done_commit_handle(&ch);
78162 +
78163 + writeout_mode_disable();
78164 +
78165 + return ret;
78166 +}
78167 +
78168 +/* consistency checks for journal data/control blocks: header, footer, log
78169 + records, transaction head blocks. All functions return zero on success. */
78170 +
78171 +static int check_journal_header(const jnode * node UNUSED_ARG)
78172 +{
78173 + /* FIXME: journal header has no magic field yet. */
78174 + return 0;
78175 +}
78176 +
78177 +/* wait for write completion for all jnodes from given list */
78178 +static int wait_on_jnode_list(struct list_head *head)
78179 +{
78180 + jnode *scan;
78181 + int ret = 0;
78182 +
78183 + list_for_each_entry(scan, head, capture_link) {
78184 + struct page *pg = jnode_page(scan);
78185 +
78186 + if (pg) {
78187 + if (PageWriteback(pg))
78188 + wait_on_page_writeback(pg);
78189 +
78190 + if (PageError(pg))
78191 + ret++;
78192 + }
78193 + }
78194 +
78195 + return ret;
78196 +}
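+
+/* Editorial note: the nonzero value returned above counts pages that came
+   back with PageError set; replay_transaction() treats any nonzero count
+   as -EIO. */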
78197 +
78198 +static int check_journal_footer(const jnode * node UNUSED_ARG)
78199 +{
78200 + /* FIXME: journal footer has no magic field yet. */
78201 + return 0;
78202 +}
78203 +
78204 +static int check_tx_head(const jnode * node)
78205 +{
78206 + struct tx_header *header = (struct tx_header *)jdata(node);
78207 +
78208 + if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78209 + warning("zam-627", "tx head at block %s corrupted\n",
78210 + sprint_address(jnode_get_block(node)));
78211 + return RETERR(-EIO);
78212 + }
78213 +
78214 + return 0;
78215 +}
78216 +
78217 +static int check_wander_record(const jnode * node)
78218 +{
78219 + struct wander_record_header *RH =
78220 + (struct wander_record_header *)jdata(node);
78221 +
78222 + if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78223 + 0) {
78224 + warning("zam-628", "wander record at block %s corrupted\n",
78225 + sprint_address(jnode_get_block(node)));
78226 + return RETERR(-EIO);
78227 + }
78228 +
78229 + return 0;
78230 +}
78231 +
78232 +/* fill the commit_handle structure with everything needed for update_journal_footer() */
78233 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78234 +{
78235 + struct tx_header *TXH;
78236 + int ret;
78237 +
78238 + ret = jload(tx_head);
78239 + if (ret)
78240 + return ret;
78241 +
78242 + TXH = (struct tx_header *)jdata(tx_head);
78243 +
78244 + ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
78245 + ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78246 + ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78247 +
78248 + jrelse(tx_head);
78249 +
78250 + list_add(&tx_head->capture_link, &ch->tx_list);
78251 +
78252 + return 0;
78253 +}
78254 +
78255 +/* replay one transaction: restore and write overwrite set in place */
78256 +static int replay_transaction(const struct super_block *s,
78257 + jnode * tx_head,
78258 + const reiser4_block_nr * log_rec_block_p,
78259 + const reiser4_block_nr * end_block,
78260 + unsigned int nr_wander_records)
78261 +{
78262 + reiser4_block_nr log_rec_block = *log_rec_block_p;
78263 + struct commit_handle ch;
78264 + LIST_HEAD(overwrite_set);
78265 + jnode *log;
78266 + int ret;
78267 +
78268 + init_commit_handle(&ch, NULL);
78269 + ch.overwrite_set = &overwrite_set;
78270 +
78271 + restore_commit_handle(&ch, tx_head);
78272 +
78273 + while (log_rec_block != *end_block) {
78274 + struct wander_record_header *header;
78275 + struct wander_entry *entry;
78276 +
78277 + int i;
78278 +
78279 + if (nr_wander_records == 0) {
78280 + warning("zam-631",
78281 + "number of wander records in the linked list"
78282 + " greater than number stored in tx head.\n");
78283 + ret = RETERR(-EIO);
78284 + goto free_ow_set;
78285 + }
78286 +
78287 + log = reiser4_alloc_io_head(&log_rec_block);
78288 + if (log == NULL)
78289 + return RETERR(-ENOMEM);
78290 +
78291 + ret = jload(log);
78292 + if (ret < 0) {
78293 + reiser4_drop_io_head(log);
78294 + return ret;
78295 + }
78296 +
78297 + ret = check_wander_record(log);
78298 + if (ret) {
78299 + jrelse(log);
78300 + reiser4_drop_io_head(log);
78301 + return ret;
78302 + }
78303 +
78304 + header = (struct wander_record_header *)jdata(log);
78305 + log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78306 +
78307 + entry = (struct wander_entry *)(header + 1);
78308 +
78309 + /* restore overwrite set from wander record content */
78310 + for (i = 0; i < wander_record_capacity(s); i++) {
78311 + reiser4_block_nr block;
78312 + jnode *node;
78313 +
78314 + block = le64_to_cpu(get_unaligned(&entry->wandered));
78315 + if (block == 0)
78316 + break;
78317 +
78318 + node = reiser4_alloc_io_head(&block);
78319 + if (node == NULL) {
78320 + ret = RETERR(-ENOMEM);
78321 + /*
78322 + * FIXME-VS:???
78323 + */
78324 + jrelse(log);
78325 + reiser4_drop_io_head(log);
78326 + goto free_ow_set;
78327 + }
78328 +
78329 + ret = jload(node);
78330 +
78331 + if (ret < 0) {
78332 + reiser4_drop_io_head(node);
78333 + /*
78334 + * FIXME-VS:???
78335 + */
78336 + jrelse(log);
78337 + reiser4_drop_io_head(log);
78338 + goto free_ow_set;
78339 + }
78340 +
78341 + block = le64_to_cpu(get_unaligned(&entry->original));
78342 +
78343 + assert("zam-603", block != 0);
78344 +
78345 + jnode_set_block(node, &block);
78346 +
78347 + list_add_tail(&node->capture_link, ch.overwrite_set);
78348 +
78349 + ++entry;
78350 + }
78351 +
78352 + jrelse(log);
78353 + reiser4_drop_io_head(log);
78354 +
78355 + --nr_wander_records;
78356 + }
78357 +
78358 + if (nr_wander_records != 0) {
78359 + warning("zam-632", "number of wander records in the linked list"
78360 + " less than number stored in tx head.\n");
78361 + ret = RETERR(-EIO);
78362 + goto free_ow_set;
78363 + }
78364 +
78365 + { /* write wandered set in place */
78366 + write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
78367 + ret = wait_on_jnode_list(ch.overwrite_set);
78368 +
78369 + if (ret) {
78370 + ret = RETERR(-EIO);
78371 + goto free_ow_set;
78372 + }
78373 + }
78374 +
78375 + ret = update_journal_footer(&ch, 0);
78376 +
78377 + free_ow_set:
78378 +
78379 + while (!list_empty(ch.overwrite_set)) {
78380 + jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
78381 + list_del_init(&cur->capture_link);
78382 + jrelse(cur);
78383 + reiser4_drop_io_head(cur);
78384 + }
78385 +
78386 + list_del_init(&tx_head->capture_link);
78387 +
78388 + done_commit_handle(&ch);
78389 +
78390 + return ret;
78391 +}
78392 +
78393 +/* Find the oldest committed but not yet replayed transaction and replay it.
78394 + * Such a transaction was committed and the journal header block was updated,
78395 + * but writing the atom's overwrite set in place and updating the journal
78396 + * footer block did not complete. This function completes the process by
78397 + * recovering the atom's overwrite set from its wandered locations, writing
78398 + * it in place and updating the journal footer. */
78399 +static int replay_oldest_transaction(struct super_block *s)
78400 +{
78401 + reiser4_super_info_data *sbinfo = get_super_private(s);
78402 + jnode *jf = sbinfo->journal_footer;
78403 + unsigned int total;
78404 + struct journal_footer *F;
78405 + struct tx_header *T;
78406 +
78407 + reiser4_block_nr prev_tx;
78408 + reiser4_block_nr last_flushed_tx;
78409 + reiser4_block_nr log_rec_block = 0;
78410 +
78411 + jnode *tx_head;
78412 +
78413 + int ret;
78414 +
78415 + if ((ret = jload(jf)) < 0)
78416 + return ret;
78417 +
78418 + F = (struct journal_footer *)jdata(jf);
78419 +
78420 + last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
78421 +
78422 + jrelse(jf);
78423 +
78424 + if (sbinfo->last_committed_tx == last_flushed_tx) {
78425 + /* all transactions are replayed */
78426 + return 0;
78427 + }
78428 +
78429 + prev_tx = sbinfo->last_committed_tx;
78430 +
78431 + /* search for the oldest not yet flushed transaction */
78432 + while (1) {
78433 + tx_head = reiser4_alloc_io_head(&prev_tx);
78434 + if (!tx_head)
78435 + return RETERR(-ENOMEM);
78436 +
78437 + ret = jload(tx_head);
78438 + if (ret < 0) {
78439 + reiser4_drop_io_head(tx_head);
78440 + return ret;
78441 + }
78442 +
78443 + ret = check_tx_head(tx_head);
78444 + if (ret) {
78445 + jrelse(tx_head);
78446 + reiser4_drop_io_head(tx_head);
78447 + return ret;
78448 + }
78449 +
78450 + T = (struct tx_header *)jdata(tx_head);
78451 +
78452 + prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
78453 +
78454 + if (prev_tx == last_flushed_tx)
78455 + break;
78456 +
78457 + jrelse(tx_head);
78458 + reiser4_drop_io_head(tx_head);
78459 + }
78460 +
78461 + total = le32_to_cpu(get_unaligned(&T->total));
78462 + log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
78463 +
78464 + pin_jnode_data(tx_head);
78465 + jrelse(tx_head);
78466 +
78467 + ret =
78468 + replay_transaction(s, tx_head, &log_rec_block,
78469 + jnode_get_block(tx_head), total - 1);
78470 +
78471 + unpin_jnode_data(tx_head);
78472 + reiser4_drop_io_head(tx_head);
78473 +
78474 + if (ret)
78475 + return ret;
78476 + return -E_REPEAT;
78477 +}
78478 +
78479 +/* The current implementation of the reiser4 journal is optimized not to
78480 + capture the super block when only certain super block fields are modified.
78481 + Currently, that set is (<free block count>, <OID allocator>). These fields
78482 + are logged in a special way which includes storing them in each transaction
78483 + head block at atom commit time and writing that information to the journal
78484 + footer block at atom flush time. To get this info from the journal footer
78485 + block into the in-memory super block there is a special function,
78486 + reiser4_journal_recover_sb_data(), which should be called after the disk
78487 + format plugin re-reads the super block after journal replay.
78488 +*/
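+
+/* Illustrative replay flow (editorial sketch derived from the functions in
+   this file, not part of the original patch):
+
+       reiser4_journal_replay()
+           while (replay_oldest_transaction() == -E_REPEAT)
+               ;   -- one committed-but-unflushed tx replayed per pass
+       replay_oldest_transaction()
+           walk tx heads backwards from the journal header until the
+           transaction right after last_flushed_tx is found, then call
+           replay_transaction() -- read the wander records, write the
+           overwrite set back in place, update_journal_footer()
+*/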
78489 +
78490 +/* get the information from the journal footer into the in-memory super block */
78491 +int reiser4_journal_recover_sb_data(struct super_block *s)
78492 +{
78493 + reiser4_super_info_data *sbinfo = get_super_private(s);
78494 + struct journal_footer *jf;
78495 + int ret;
78496 +
78497 + assert("zam-673", sbinfo->journal_footer != NULL);
78498 +
78499 + ret = jload(sbinfo->journal_footer);
78500 + if (ret != 0)
78501 + return ret;
78502 +
78503 + ret = check_journal_footer(sbinfo->journal_footer);
78504 + if (ret != 0)
78505 + goto out;
78506 +
78507 + jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
78508 +
78509 + /* was there at least one flushed transaction? */
78510 + if (jf->last_flushed_tx) {
78511 +
78512 + /* restore free block counter logged in this transaction */
78513 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
78514 +
78515 + /* restore oid allocator state */
78516 + oid_init_allocator(s,
78517 + le64_to_cpu(get_unaligned(&jf->nr_files)),
78518 + le64_to_cpu(get_unaligned(&jf->next_oid)));
78519 + }
78520 + out:
78521 + jrelse(sbinfo->journal_footer);
78522 + return ret;
78523 +}
78524 +
78525 +/* reiser4 journal replay procedure */
78526 +int reiser4_journal_replay(struct super_block *s)
78527 +{
78528 + reiser4_super_info_data *sbinfo = get_super_private(s);
78529 + jnode *jh, *jf;
78530 + struct journal_header *header;
78531 + int nr_tx_replayed = 0;
78532 + int ret;
78533 +
78534 + assert("zam-582", sbinfo != NULL);
78535 +
78536 + jh = sbinfo->journal_header;
78537 + jf = sbinfo->journal_footer;
78538 +
78539 + if (!jh || !jf) {
78540 + /* it is possible that the disk layout does not support journal
78541 + structures; we just warn about this */
78542 + warning("zam-583",
78543 + "journal control blocks were not loaded by disk layout plugin. "
78544 + "journal replaying is not possible.\n");
78545 + return 0;
78546 + }
78547 +
78548 + /* Take the free block count from the journal footer block. The free
78549 + block counter value corresponds to the last flushed transaction state */
78550 + ret = jload(jf);
78551 + if (ret < 0)
78552 + return ret;
78553 +
78554 + ret = check_journal_footer(jf);
78555 + if (ret) {
78556 + jrelse(jf);
78557 + return ret;
78558 + }
78559 +
78560 + jrelse(jf);
78561 +
78562 + /* store last committed transaction info in reiser4 in-memory super
78563 + block */
78564 + ret = jload(jh);
78565 + if (ret < 0)
78566 + return ret;
78567 +
78568 + ret = check_journal_header(jh);
78569 + if (ret) {
78570 + jrelse(jh);
78571 + return ret;
78572 + }
78573 +
78574 + header = (struct journal_header *)jdata(jh);
78575 + sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
78576 +
78577 + jrelse(jh);
78578 +
78579 + /* replay committed transactions */
78580 + while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
78581 + nr_tx_replayed++;
78582 +
78583 + return ret;
78584 +}
78585 +
78586 +/* load journal control block (either journal header or journal footer block) */
78587 +static int
78588 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
78589 +{
78590 + int ret;
78591 +
78592 + *node = reiser4_alloc_io_head(block);
78593 + if (!(*node))
78594 + return RETERR(-ENOMEM);
78595 +
78596 + ret = jload(*node);
78597 +
78598 + if (ret) {
78599 + reiser4_drop_io_head(*node);
78600 + *node = NULL;
78601 + return ret;
78602 + }
78603 +
78604 + pin_jnode_data(*node);
78605 + jrelse(*node);
78606 +
78607 + return 0;
78608 +}
78609 +
78610 +/* unload journal header or footer and free jnode */
78611 +static void unload_journal_control_block(jnode ** node)
78612 +{
78613 + if (*node) {
78614 + unpin_jnode_data(*node);
78615 + reiser4_drop_io_head(*node);
78616 + *node = NULL;
78617 + }
78618 +}
78619 +
78620 +/* release journal control blocks */
78621 +void reiser4_done_journal_info(struct super_block *s)
78622 +{
78623 + reiser4_super_info_data *sbinfo = get_super_private(s);
78624 +
78625 + assert("zam-476", sbinfo != NULL);
78626 +
78627 + unload_journal_control_block(&sbinfo->journal_header);
78628 + unload_journal_control_block(&sbinfo->journal_footer);
78629 + rcu_barrier();
78630 +}
78631 +
78632 +/* load journal control blocks */
78633 +int reiser4_init_journal_info(struct super_block *s)
78634 +{
78635 + reiser4_super_info_data *sbinfo = get_super_private(s);
78636 + journal_location *loc;
78637 + int ret;
78638 +
78639 + loc = &sbinfo->jloc;
78640 +
78641 + assert("zam-651", loc != NULL);
78642 + assert("zam-652", loc->header != 0);
78643 + assert("zam-653", loc->footer != 0);
78644 +
78645 + ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
78646 +
78647 + if (ret)
78648 + return ret;
78649 +
78650 + ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
78651 +
78652 + if (ret) {
78653 + unload_journal_control_block(&sbinfo->journal_header);
78654 + }
78655 +
78656 + return ret;
78657 +}
78658 +
78659 +/* Make Linus happy.
78660 + Local variables:
78661 + c-indentation-style: "K&R"
78662 + mode-name: "LC"
78663 + c-basic-offset: 8
78664 + tab-width: 8
78665 + fill-column: 80
78666 + End:
78667 +*/
78668 diff -urN linux-2.6.22.orig/fs/reiser4/wander.h linux-2.6.22/fs/reiser4/wander.h
78669 --- linux-2.6.22.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
78670 +++ linux-2.6.22/fs/reiser4/wander.h 2007-07-29 00:25:35.048740996 +0400
78671 @@ -0,0 +1,135 @@
78672 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
78673 +
78674 +#if !defined (__FS_REISER4_WANDER_H__)
78675 +#define __FS_REISER4_WANDER_H__
78676 +
78677 +#include "dformat.h"
78678 +
78679 +#include <linux/fs.h> /* for struct super_block */
78680 +
78681 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
78682 +
78683 +#define TX_HEADER_MAGIC "TxMagic4"
78684 +#define WANDER_RECORD_MAGIC "LogMagc4"
78685 +
78686 +#define TX_HEADER_MAGIC_SIZE (8)
78687 +#define WANDER_RECORD_MAGIC_SIZE (8)
78688 +
78689 +/* journal header block format */
78690 +struct journal_header {
78691 + /* last written transaction head location */
78692 + d64 last_committed_tx;
78693 +};
78694 +
78695 +typedef struct journal_location {
78696 + reiser4_block_nr footer;
78697 + reiser4_block_nr header;
78698 +} journal_location;
78699 +
78700 +/* The wander.c head comment describes the usage and semantics of all these structures */
78701 +/* journal footer block format */
78702 +struct journal_footer {
78703 + /* last flushed transaction location. */
78704 + /* This block number is no longer valid after the transaction it points
78705 + to gets flushed; it is used only at journal replay time to detect
78706 + the end of the on-disk list of committed transactions
78707 + which were not completely flushed */
78708 + d64 last_flushed_tx;
78709 +
78710 + /* the free block counter is written to the journal footer at
78711 + transaction flush time, not to the super block, because the free
78712 + block counter is logged differently than super block fields (the
78713 + root pointer, for example). */
78714 + d64 free_blocks;
78715 +
78716 + /* number of used OIDs and maximal used OID are logged separately from
78717 + super block */
78718 + d64 nr_files;
78719 + d64 next_oid;
78720 +};
78721 +
78722 +/* Each wander record (except the first one) has a unified format: a wander
78723 + record header followed by an array of log entries */
78724 +struct wander_record_header {
78725 + /* when there is no predefined location for wander records, this magic
78726 + string should help reiser4fsck. */
78727 + char magic[WANDER_RECORD_MAGIC_SIZE];
78728 +
78729 + /* transaction id */
78730 + d64 id;
78731 +
78732 + /* total number of wander records in current transaction */
78733 + d32 total;
78734 +
78735 + /* serial number of this block within the transaction */
78736 + d32 serial;
78737 +
78738 + /* block number of the next wander record in the transaction */
78739 + d64 next_block;
78740 +};
78741 +
78742 +/* The first wander record (transaction head) of a written transaction has a
78743 + special format */
78744 +struct tx_header {
78745 + /* the magic string makes the first block in a transaction different
78746 + from other logged blocks; it should help fsck. */
78747 + char magic[TX_HEADER_MAGIC_SIZE];
78748 +
78749 + /* transaction id */
78750 + d64 id;
78751 +
78752 + /* total number of records (including this first tx head) in the
78753 + transaction */
78754 + d32 total;
78755 +
78756 + /* align the next field to an 8-byte boundary; this field is always zero */
78757 + d32 padding;
78758 +
78759 + /* block number of previous transaction head */
78760 + d64 prev_tx;
78761 +
78762 + /* next wander record location */
78763 + d64 next_block;
78764 +
78765 + /* committed versions of free blocks counter */
78766 + d64 free_blocks;
78767 +
78768 + /* number of used OIDs (nr_files) and maximal used OID are logged
78769 + separately from super block */
78770 + d64 nr_files;
78771 + d64 next_oid;
78772 +};
78773 +
78774 +/* A transaction gets written to disk as a set of wander records (each wander
78775 + record is one fs block in size) */
78776 +
78777 +/* As noted above, the rest of a wander record is filled with these log
78778 + entries; unused space is filled with zeroes */
78779 +struct wander_entry {
78780 + d64 original; /* block original location */
78781 + d64 wandered; /* block wandered location */
78782 +};
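+
+/* Editorial example (not part of the original patch): assuming a 4096-byte
+   block and no structure padding, sizeof(struct wander_record_header) is 32
+   bytes and sizeof(struct wander_entry) is 16 bytes, so one wander record
+   holds (4096 - 32) / 16 = 254 wander entries; a zero "wandered" field marks
+   the end of the used entries (see replay_transaction() in wander.c). */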
78783 +
78784 +/* REISER4 JOURNAL WRITER FUNCTIONS */
78785 +
78786 +extern int reiser4_write_logs(long *);
78787 +extern int reiser4_journal_replay(struct super_block *);
78788 +extern int reiser4_journal_recover_sb_data(struct super_block *);
78789 +
78790 +extern int reiser4_init_journal_info(struct super_block *);
78791 +extern void reiser4_done_journal_info(struct super_block *);
78792 +
78793 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
78794 +
78795 +#endif /* __FS_REISER4_WANDER_H__ */
78796 +
78797 +/* Make Linus happy.
78798 + Local variables:
78799 + c-indentation-style: "K&R"
78800 + mode-name: "LC"
78801 + c-basic-offset: 8
78802 + tab-width: 8
78803 + fill-column: 80
78804 + scroll-step: 1
78805 + End:
78806 +*/
78807 diff -urN linux-2.6.22.orig/fs/reiser4/writeout.h linux-2.6.22/fs/reiser4/writeout.h
78808 --- linux-2.6.22.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
78809 +++ linux-2.6.22/fs/reiser4/writeout.h 2007-07-29 00:25:35.052742032 +0400
78810 @@ -0,0 +1,21 @@
78811 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
78812 +
78813 +#if !defined (__FS_REISER4_WRITEOUT_H__)
+#define __FS_REISER4_WRITEOUT_H__
78814 +
78815 +#define WRITEOUT_SINGLE_STREAM (0x1)
78816 +#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
78817 +#define WRITEOUT_BARRIER (0x4)
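+
+/* Usage example (editorial note, see wander.c): write_tx_back() writes the
+   overwrite set with write_jnode_list(ch->overwrite_set, fq, NULL,
+   WRITEOUT_FOR_PAGE_RECLAIM), and WRITEOUT_BARRIER makes
+   write_jnodes_to_disk_extent() submit the bio with WRITE_BARRIER. */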
78818 +
78819 +extern int reiser4_get_writeout_flags(void);
78820 +
78821 +#endif /* __FS_REISER4_WRITEOUT_H__ */
78822 +
78823 +/* Make Linus happy.
78824 + Local variables:
78825 + c-indentation-style: "K&R"
78826 + mode-name: "LC"
78827 + c-basic-offset: 8
78828 + tab-width: 8
78829 + fill-column: 80
78830 + End:
78831 +*/
78832 diff -urN linux-2.6.22.orig/fs/reiser4/znode.c linux-2.6.22/fs/reiser4/znode.c
78833 --- linux-2.6.22.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
78834 +++ linux-2.6.22/fs/reiser4/znode.c 2007-07-29 00:25:35.052742032 +0400
78835 @@ -0,0 +1,1029 @@
78836 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
78837 + * reiser4/README */
78838 +/* Znode manipulation functions. */
78839 +/* Znode is the in-memory header for a tree node. It is stored
78840 + separately from the node itself so that it does not get written to
78841 + disk. In this respect znode is like buffer head or page head. We
78842 + also use znodes for additional reiser4 specific purposes:
78843 +
78844 + . they are organized into tree structure which is a part of whole
78845 + reiser4 tree.
78846 + . they are used to implement node grained locking
78847 + . they are used to keep additional state associated with a
78848 + node
78849 + . they contain links to lists used by the transaction manager
78850 +
78851 + Znode is attached to some variable "block number" which is an instance of
78852 + fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
78853 + appropriate node being actually loaded in memory. Existence of znode itself
78854 + is regulated by reference count (->x_count) in it. Each time thread
78855 + acquires reference to znode through call to zget(), ->x_count is
78856 + incremented and decremented on call to zput(). Data (content of node) are
78857 + brought in memory through call to zload(), which also increments ->d_count
78858 + reference counter. zload can block waiting on IO. Call to zrelse()
78859 + decreases this counter. Also, ->c_count keeps track of number of child
78860 + znodes and prevents parent znode from being recycled until all of its
78861 + children are. ->c_count is decremented whenever child goes out of existence
78862 + (being actually recycled in zdestroy()) which can be some time after last
78863 + reference to this child dies if we support some form of LRU cache for
78864 + znodes.
78865 +
78866 +*/
78867 +/* EVERY ZNODE'S STORY
78868 +
78869 + 1. His infancy.
78870 +
78871 + Once upon a time, the znode was born deep inside of zget() by call to
78872 + zalloc(). At the return from zget() znode had:
78873 +
78874 + . reference counter (x_count) of 1
78875 + . assigned block number, marked as used in bitmap
78876 + . pointer to parent znode. Root znode parent pointer points
78877 + to its father: "fake" znode. This, in turn, has NULL parent pointer.
78878 + . hash table linkage
78879 + . no data loaded from disk
78880 + . no node plugin
78881 + . no sibling linkage
78882 +
78883 + 2. His childhood
78884 +
78885 + Each node is either brought into memory as a result of tree traversal, or
78886 + created afresh, creation of the root being a special case of the latter. In
78887 + either case it's inserted into sibling list. This will typically require
78888 + some ancillary tree traversing, but ultimately both sibling pointers will
78889 + exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
78890 + zjnode.state.
78891 +
78892 + 3. His youth.
78893 +
78894 + If znode is bound to already existing node in a tree, its content is read
78895 + from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
78896 + in zjnode.state and zdata() function starts to return non null for this
78897 + znode. zload() further calls zparse() that determines which node layout
78898 + this node is rendered in, and sets ->nplug on success.
78899 +
78900 + If znode is for new node just created, memory for it is allocated and
78901 + zinit_new() function is called to initialise data, according to selected
78902 + node layout.
78903 +
78904 + 4. His maturity.
78905 +
78906 + After this point, znode lingers in memory for some time. Threads can
78907 + acquire references to znode either by blocknr through call to zget(), or by
78908 + following a pointer to unallocated znode from internal item. Each time
78909 + reference to znode is obtained, x_count is increased. Thread can read/write
78910 + lock znode. Znode data can be loaded through calls to zload(), d_count will
78911 + be increased appropriately. If all references to znode are released
78912 + (x_count drops to 0), znode is not recycled immediately. Rather, it is
78913 + still cached in the hash table in the hope that it will be accessed
78914 + shortly.
78915 +
78916 + There are two ways in which znode existence can be terminated:
78917 +
78918 + . sudden death: node bound to this znode is removed from the tree
78919 + . overpopulation: znode is purged out of memory due to memory pressure
78920 +
78921 + 5. His death.
78922 +
78923 + Death is a complex process.
78924 +
78925 + When we irrevocably commit ourselves to decision to remove node from the
78926 + tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
78927 + znode. This is done either in ->kill_hook() of internal item or in
78928 + reiser4_kill_root() function when tree root is removed.
78929 +
78930 + At this moment znode still has:
78931 +
78932 + . locks held on it, necessarily write ones
78933 + . references to it
78934 + . disk block assigned to it
78935 + . data loaded from the disk
78936 + . pending requests for lock
78937 +
78938 + But once the JNODE_HEARD_BANSHEE bit is set, the last call to unlock_znode()
78939 + does node deletion. Node deletion includes two phases. First, all ways to get
78940 + references to that znode (sibling and parent links and hash lookup using
78941 + block number stored in the parent node) should be deleted -- this is done
78942 + through sibling_list_remove(); we also assume that nobody uses the down
78943 + link from the parent node (due to its nonexistence or proper parent node
78944 + locking) and that nobody uses parent pointers from children (since they
78945 + are absent). Second, we invalidate all pending lock requests which are
78946 + still on the znode's lock request queue; this is done by
78947 + reiser4_invalidate_lock(). The JNODE_IS_DYING znode status bit is used to
78948 + invalidate pending lock requests: once it is set, all requesters are
78949 + forced to return -EINVAL from longterm_lock_znode(). Future locking
78950 + attempts are impossible because all ways to get references to that znode
78951 + have already been removed. Last, the node is uncaptured from the transaction.
78952 +
78953 + When last reference to the dying znode is just about to be released,
78954 + block number for this lock is released and znode is removed from the
78955 + hash table.
78956 +
78957 + Now znode can be recycled.
78958 +
78959 + [it's possible to free bitmap block and remove znode from the hash
78960 + table when last lock is released. This will result in having
78961 + referenced but completely orphaned znode]
78962 +
78963 + 6. Limbo
78964 +
78965 + As has been mentioned above, znodes with reference counter 0 are
78966 + still cached in a hash table. Once memory pressure increases they are
78967 + purged out of there [this requires something like LRU list for
78968 + efficient implementation. LRU list would also greatly simplify
78969 + implementation of coord cache that would in this case morph to just
78970 + scanning some initial segment of LRU list]. Data loaded into
78971 + unreferenced znode are flushed back to the durable storage if
78972 + necessary and memory is freed. Znodes themselves can be recycled at
78973 + this point too.
78974 +
78975 +*/
78976 +
78977 +#include "debug.h"
78978 +#include "dformat.h"
78979 +#include "key.h"
78980 +#include "coord.h"
78981 +#include "plugin/plugin_header.h"
78982 +#include "plugin/node/node.h"
78983 +#include "plugin/plugin.h"
78984 +#include "txnmgr.h"
78985 +#include "jnode.h"
78986 +#include "znode.h"
78987 +#include "block_alloc.h"
78988 +#include "tree.h"
78989 +#include "tree_walk.h"
78990 +#include "super.h"
78991 +#include "reiser4.h"
78992 +
78993 +#include <linux/pagemap.h>
78994 +#include <linux/spinlock.h>
78995 +#include <linux/slab.h>
78996 +#include <linux/err.h>
78997 +
78998 +static z_hash_table *get_htable(reiser4_tree *,
78999 + const reiser4_block_nr * const blocknr);
79000 +static z_hash_table *znode_get_htable(const znode *);
79001 +static void zdrop(znode *);
79002 +
79003 +/* hash table support */
79004 +
79005 +/* compare two block numbers for equality. Used by hash-table macros */
79006 +static inline int
79007 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79008 +{
79009 + assert("nikita-534", b1 != NULL);
79010 + assert("nikita-535", b2 != NULL);
79011 +
79012 + return *b1 == *b2;
79013 +}
79014 +
79015 +/* Hash znode by block number. Used by hash-table macros */
79016 +/* Audited by: umka (2002.06.11) */
79017 +static inline __u32
79018 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79019 +{
79020 + assert("nikita-536", b != NULL);
79021 +
79022 + return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79023 +}
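+
+/* Editorial note (not part of the original patch): the mask above requires
+   REISER4_ZNODE_HASH_TABLE_SIZE to be a power of two; e.g. with a table
+   size of 1024, block number 0x12345 would hash to bucket 0x345. */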
79024 +
79025 +/* The hash table definition */
79026 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
79027 +#define KFREE(ptr, size) kfree(ptr)
79028 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79029 + blknrhashfn, blknreq);
79030 +#undef KFREE
79031 +#undef KMALLOC
79032 +
79033 +/* slab for znodes */
79034 +static struct kmem_cache *znode_cache;
79035 +
79036 +int znode_shift_order;
79037 +
79038 +/**
79039 + * init_znodes - create znode cache
79040 + *
79041 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79042 + */
79043 +int init_znodes(void)
79044 +{
79045 + znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79046 + SLAB_HWCACHE_ALIGN |
79047 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79048 + if (znode_cache == NULL)
79049 + return RETERR(-ENOMEM);
79050 +
79051 + for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79052 + ++znode_shift_order);
79053 + --znode_shift_order;
79054 + return 0;
79055 +}
79056 +
79057 +/**
79058 + * done_znodes - delete znode cache
79059 + *
79060 + * This is called on reiser4 module unloading or system shutdown.
79061 + */
79062 +void done_znodes(void)
79063 +{
79064 + destroy_reiser4_cache(&znode_cache);
79065 +}
79066 +
79067 +/* call this to initialise tree of znodes */
79068 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79069 +{
79070 + int result;
79071 + assert("umka-050", tree != NULL);
79072 +
79073 + rwlock_init(&tree->dk_lock);
79074 +
79075 + result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79076 + if (result != 0)
79077 + return result;
79078 + result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79079 + return result;
79080 +}
79081 +
79082 +/* free this znode */
79083 +void zfree(znode * node /* znode to free */ )
79084 +{
79085 + assert("nikita-465", node != NULL);
79086 + assert("nikita-2120", znode_page(node) == NULL);
79087 + assert("nikita-2301", list_empty_careful(&node->lock.owners));
79088 + assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79089 + assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79090 + NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79091 + assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79092 + assert("nikita-3293", !znode_is_right_connected(node));
79093 + assert("nikita-3294", !znode_is_left_connected(node));
79094 + assert("nikita-3295", node->left == NULL);
79095 + assert("nikita-3296", node->right == NULL);
79096 +
79097 + /* not yet phash_jnode_destroy(ZJNODE(node)); */
79098 +
79099 + kmem_cache_free(znode_cache, node);
79100 +}
79101 +
79102 +/* call this to free tree of znodes */
79103 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79104 +{
79105 + znode *node;
79106 + znode *next;
79107 + z_hash_table *ztable;
79108 +
79109 + /* scan znode hash-tables and kill all znodes, then free hash tables
79110 + * themselves. */
79111 +
79112 + assert("nikita-795", tree != NULL);
79113 +
79114 + ztable = &tree->zhash_table;
79115 +
79116 + if (ztable->_table != NULL) {
79117 + for_all_in_htable(ztable, z, node, next) {
79118 + node->c_count = 0;
79119 + node->in_parent.node = NULL;
79120 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79121 + zdrop(node);
79122 + }
79123 +
79124 + z_hash_done(&tree->zhash_table);
79125 + }
79126 +
79127 + ztable = &tree->zfake_table;
79128 +
79129 + if (ztable->_table != NULL) {
79130 + for_all_in_htable(ztable, z, node, next) {
79131 + node->c_count = 0;
79132 + node->in_parent.node = NULL;
79133 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79134 + zdrop(node);
79135 + }
79136 +
79137 + z_hash_done(&tree->zfake_table);
79138 + }
79139 +}
79140 +
79141 +/* ZNODE STRUCTURES */
79142 +
79143 +/* allocate fresh znode */
79144 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79145 +{
79146 + znode *node;
79147 +
79148 + node = kmem_cache_alloc(znode_cache, gfp_flag);
79149 + return node;
79150 +}
79151 +
79152 +/* Initialize fields of znode
79153 + @node: znode to initialize;
79154 + @parent: parent znode;
79155 + @tree: tree we are in. */
79156 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79157 +{
79158 + assert("nikita-466", node != NULL);
79159 + assert("umka-268", current_tree != NULL);
79160 +
79161 + memset(node, 0, sizeof *node);
79162 +
79163 + assert("umka-051", tree != NULL);
79164 +
79165 + jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79166 + reiser4_init_lock(&node->lock);
79167 + init_parent_coord(&node->in_parent, parent);
79168 +}
79169 +
79170 +/*
79171 + * remove znode from indices. This is called by jput() when the last
79172 + * reference on the znode is released.
79173 + */
79174 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79175 +{
79176 + assert("nikita-2108", node != NULL);
79177 + assert("nikita-470", node->c_count == 0);
79178 + assert_rw_write_locked(&(tree->tree_lock));
79179 +
79180 + /* remove reference to this znode from cbk cache */
79181 + cbk_cache_invalidate(node, tree);
79182 +
79183 + /* update c_count of parent */
79184 + if (znode_parent(node) != NULL) {
79185 + assert("nikita-472", znode_parent(node)->c_count > 0);
79186 + /* father, onto your hands I forward my spirit... */
79187 + znode_parent(node)->c_count--;
79188 + node->in_parent.node = NULL;
79189 + } else {
79190 + /* orphaned znode?! Root? */
79191 + }
79192 +
79193 + /* remove znode from hash-table */
79194 + z_hash_remove_rcu(znode_get_htable(node), node);
79195 +}
79196 +
79197 +/* zdrop() -- Remove znode from the tree.
79198 +
79199 + This is called when znode is removed from the memory. */
79200 +static void zdrop(znode * node /* znode to finish with */ )
79201 +{
79202 + jdrop(ZJNODE(node));
79203 +}
79204 +
79205 +/*
79206 + * put znode into right place in the hash table. This is called by relocate
79207 + * code.
79208 + */
79209 +int znode_rehash(znode * node /* node to rehash */ ,
79210 + const reiser4_block_nr * new_block_nr /* new block number */ )
79211 +{
79212 + z_hash_table *oldtable;
79213 + z_hash_table *newtable;
79214 + reiser4_tree *tree;
79215 +
79216 + assert("nikita-2018", node != NULL);
79217 +
79218 + tree = znode_get_tree(node);
79219 + oldtable = znode_get_htable(node);
79220 + newtable = get_htable(tree, new_block_nr);
79221 +
79222 + write_lock_tree(tree);
79223 + /* remove znode from hash-table */
79224 + z_hash_remove_rcu(oldtable, node);
79225 +
79226 + /* assertion no longer valid due to RCU */
79227 + /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79228 +
79229 + /* update blocknr */
79230 + znode_set_block(node, new_block_nr);
79231 + node->zjnode.key.z = *new_block_nr;
79232 +
79233 + /* insert it into hash */
79234 + z_hash_insert_rcu(newtable, node);
79235 + write_unlock_tree(tree);
79236 + return 0;
79237 +}
79238 +
79239 +/* ZNODE LOOKUP, GET, PUT */
79240 +
79241 +/* zlook() - get znode with given block_nr in a hash table or return NULL
79242 +
79243 + If the result is non-NULL, the znode's x_count is incremented. An internal
79244 + version accepts a pre-computed hash index. The hash table is accessed
79245 + under rcu_read_lock().
79246 +*/
79247 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79248 +{
79249 + znode *result;
79250 + __u32 hash;
79251 + z_hash_table *htable;
79252 +
79253 + assert("jmacd-506", tree != NULL);
79254 + assert("jmacd-507", blocknr != NULL);
79255 +
79256 + htable = get_htable(tree, blocknr);
79257 + hash = blknrhashfn(htable, blocknr);
79258 +
79259 + rcu_read_lock();
79260 + result = z_hash_find_index(htable, hash, blocknr);
79261 +
79262 + if (result != NULL) {
79263 + add_x_ref(ZJNODE(result));
79264 + result = znode_rip_check(tree, result);
79265 + }
79266 + rcu_read_unlock();
79267 +
79268 + return result;
79269 +}
79270 +
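For illustration only, not part of the patch: zlook() never allocates, so a
NULL result simply means the block has no cached znode. A minimal caller
sketch, assuming a valid tree and block number are at hand:

	/* Hypothetical sketch: probe the znode cache for a block and drop
	 * the reference immediately; zlook() bumps x_count on success. */
	static int example_block_is_cached(reiser4_tree *tree,
					   const reiser4_block_nr *blocknr)
	{
		znode *node;

		node = zlook(tree, blocknr); /* NULL when not hashed */
		if (node == NULL)
			return 0;
		zput(node); /* release the x_count taken by zlook() */
		return 1;
	}
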
79271 +/* return hash table where znode with block @blocknr is (or should be)
79272 + * stored */
79273 +static z_hash_table *get_htable(reiser4_tree * tree,
79274 + const reiser4_block_nr * const blocknr)
79275 +{
79276 + z_hash_table *table;
79277 + if (is_disk_addr_unallocated(blocknr))
79278 + table = &tree->zfake_table;
79279 + else
79280 + table = &tree->zhash_table;
79281 + return table;
79282 +}
79283 +
79284 +/* return hash table where znode @node is (or should be) stored */
79285 +static z_hash_table *znode_get_htable(const znode * node)
79286 +{
79287 + return get_htable(znode_get_tree(node), znode_get_block(node));
79288 +}
79289 +
79290 +/* zget() - get znode from hash table, allocating it if necessary.
79291 +
79292 + First a call to zlook, locating an x-referenced znode if one
79293 + exists. If the znode is not found, allocate a new one and return it.
79294 + The result is returned with its x_count reference increased.
79295 +
79296 + LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
79297 + LOCK ORDERING: NONE
79298 +*/
79299 +znode *zget(reiser4_tree * tree,
79300 + const reiser4_block_nr * const blocknr,
79301 + znode * parent, tree_level level, gfp_t gfp_flag)
79302 +{
79303 + znode *result;
79304 + __u32 hashi;
79305 +
79306 + z_hash_table *zth;
79307 +
79308 + assert("jmacd-512", tree != NULL);
79309 + assert("jmacd-513", blocknr != NULL);
79310 + assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79311 +
79312 + zth = get_htable(tree, blocknr);
79313 + hashi = blknrhashfn(zth, blocknr);
79314 +
79315 + /* NOTE-NIKITA address-as-unallocated-blocknr is still not
79316 + implemented. */
79317 +
79318 + z_hash_prefetch_bucket(zth, hashi);
79319 +
79320 + rcu_read_lock();
79321 + /* Find a matching BLOCKNR in the hash table. If the znode is found,
79322 + we obtain a reference (x_count) but the znode remains unlocked.
79323 + Have to worry about race conditions later. */
79324 + result = z_hash_find_index(zth, hashi, blocknr);
79325 + /* According to the current design, the hash table lock protects new
79326 + znode references. */
79327 + if (result != NULL) {
79328 + add_x_ref(ZJNODE(result));
79329 + /* NOTE-NIKITA it should be so, but special case during
79330 + creation of new root makes such assertion highly
79331 + complicated. */
79332 + assert("nikita-2131", 1 || znode_parent(result) == parent ||
79333 + (ZF_ISSET(result, JNODE_ORPHAN)
79334 + && (znode_parent(result) == NULL)));
79335 + result = znode_rip_check(tree, result);
79336 + }
79337 +
79338 + rcu_read_unlock();
79339 +
79340 + if (!result) {
79341 + znode *shadow;
79342 +
79343 + result = zalloc(gfp_flag);
79344 + if (!result) {
79345 + return ERR_PTR(RETERR(-ENOMEM));
79346 + }
79347 +
79348 + zinit(result, parent, tree);
79349 + ZJNODE(result)->blocknr = *blocknr;
79350 + ZJNODE(result)->key.z = *blocknr;
79351 + result->level = level;
79352 +
79353 + write_lock_tree(tree);
79354 +
79355 + shadow = z_hash_find_index(zth, hashi, blocknr);
79356 + if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
79357 + jnode_list_remove(ZJNODE(result));
79358 + zfree(result);
79359 + result = shadow;
79360 + } else {
79361 + result->version = znode_build_version(tree);
79362 + z_hash_insert_index_rcu(zth, hashi, result);
79363 +
79364 + if (parent != NULL)
79365 + ++parent->c_count;
79366 + }
79367 +
79368 + add_x_ref(ZJNODE(result));
79369 +
79370 + write_unlock_tree(tree);
79371 + }
79372 +#if REISER4_DEBUG
79373 + if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
79374 + reiser4_check_block(blocknr, 1);
79375 +#endif
79376 + /* Check for invalid tree level, return -EIO */
79377 + if (unlikely(znode_get_level(result) != level)) {
79378 + warning("jmacd-504",
79379 + "Wrong level for cached block %llu: %i expecting %i",
79380 + (unsigned long long)(*blocknr), znode_get_level(result),
79381 + level);
79382 + zput(result);
79383 + return ERR_PTR(RETERR(-EIO));
79384 + }
79385 +
79386 + assert("nikita-1227", znode_invariant(result));
79387 +
79388 + return result;
79389 +}
79390 +
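A usage sketch, for illustration only and not part of the patch: the
hypothetical function below obtains a znode with zget(), loads its data, and
releases both the data reference and the x_count reference on every path.

	/* Hypothetical caller sketch: get a znode, load it, use it, drop it. */
	static int example_use_znode(reiser4_tree *tree,
				     const reiser4_block_nr *blocknr,
				     znode *parent, tree_level level)
	{
		znode *node;
		int ret;

		/* zget() returns the znode with x_count incremented, or an
		 * ERR_PTR() value on failure. */
		node = zget(tree, blocknr, parent, level, GFP_KERNEL);
		if (IS_ERR(node))
			return PTR_ERR(node);

		ret = zload(node); /* bring node data into memory */
		if (ret == 0) {
			/* ... inspect node contents here ... */
			zrelse(node); /* drop the data reference */
		}
		zput(node); /* drop the x_count reference */
		return ret;
	}
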
79391 +/* ZNODE PLUGINS/DATA */
79392 +
79393 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
79394 + stored at the fixed offset from the beginning of the node. */
79395 +static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
79396 + * plugin of */ )
79397 +{
79398 + reiser4_tree *tree;
79399 +
79400 + assert("nikita-1053", node != NULL);
79401 + assert("nikita-1055", zdata(node) != NULL);
79402 +
79403 + tree = znode_get_tree(node);
79404 + assert("umka-053", tree != NULL);
79405 +
79406 + if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
79407 + return tree->nplug;
79408 + } else {
79409 + return node_plugin_by_disk_id
79410 + (tree, &((common_node_header *) zdata(node))->plugin_id);
79411 +#ifdef GUESS_EXISTS
79412 + reiser4_plugin *plugin;
79413 +
79414 + /* NOTE-NIKITA add locking here when dynamic plugins will be
79415 + * implemented */
79416 + for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
79417 + if ((plugin->u.node.guess != NULL)
79418 + && plugin->u.node.guess(node))
79419 + return plugin;
79420 + }
79421 + warning("nikita-1057", "Cannot guess node plugin");
79422 + print_znode("node", node);
79423 + return NULL;
79424 +#endif
79425 + }
79426 +}
79427 +
79428 +/* parse node header and install ->node_plugin */
79429 +int zparse(znode * node /* znode to parse */ )
79430 +{
79431 + int result;
79432 +
79433 + assert("nikita-1233", node != NULL);
79434 + assert("nikita-2370", zdata(node) != NULL);
79435 +
79436 + if (node->nplug == NULL) {
79437 + node_plugin *nplug;
79438 +
79439 + nplug = znode_guess_plugin(node);
79440 + if (likely(nplug != NULL)) {
79441 + result = nplug->parse(node);
79442 + if (likely(result == 0))
79443 + node->nplug = nplug;
79444 + } else {
79445 + result = RETERR(-EIO);
79446 + }
79447 + } else
79448 + result = 0;
79449 + return result;
79450 +}
79451 +
79452 +/* zload with readahead */
79453 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
79454 +{
79455 + int result;
79456 +
79457 + assert("nikita-484", node != NULL);
79458 + assert("nikita-1377", znode_invariant(node));
79459 + assert("jmacd-7771", !znode_above_root(node));
79460 + assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
79461 + assert("nikita-3016", reiser4_schedulable());
79462 +
79463 + if (info)
79464 + formatted_readahead(node, info);
79465 +
79466 + result = jload(ZJNODE(node));
79467 + assert("nikita-1378", znode_invariant(node));
79468 + return result;
79469 +}
79470 +
79471 +/* load content of node into memory */
79472 +int zload(znode * node)
79473 +{
79474 + return zload_ra(node, NULL);
79475 +}
79476 +
79477 +/* call node plugin to initialise newly allocated node. */
79478 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
79479 +{
79480 + return jinit_new(ZJNODE(node), gfp_flags);
79481 +}
79482 +
79483 +/* drop reference to node data. When last reference is dropped, data are
79484 + unloaded. */
79485 +void zrelse(znode * node /* znode to release references to */ )
79486 +{
79487 + assert("nikita-1381", znode_invariant(node));
79488 +
79489 + jrelse(ZJNODE(node));
79490 +}
79491 +
79492 +/* returns free space in node */
79493 +unsigned znode_free_space(znode * node /* znode to query */ )
79494 +{
79495 + assert("nikita-852", node != NULL);
79496 + return node_plugin_by_node(node)->free_space(node);
79497 +}
79498 +
79499 +/* right delimiting key of znode */
79500 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
79501 +{
79502 + assert("nikita-958", node != NULL);
79503 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79504 + assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
79505 + assert("nikita-30671", node->rd_key_version != 0);
79506 + return &node->rd_key;
79507 +}
79508 +
79509 +/* left delimiting key of znode */
79510 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
79511 +{
79512 + assert("nikita-974", node != NULL);
79513 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79514 + assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
79515 + assert("nikita-30681", node->ld_key_version != 0);
79516 + return &node->ld_key;
79517 +}
79518 +
79519 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
79520 + )
79521 +
79522 +/* update right-delimiting key of @node */
79523 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
79524 +{
79525 + assert("nikita-2937", node != NULL);
79526 + assert("nikita-2939", key != NULL);
79527 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79528 + assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
79529 + assert("nikita-2944",
79530 + znode_is_any_locked(node) ||
79531 + znode_get_level(node) != LEAF_LEVEL ||
79532 + keyge(key, &node->rd_key) ||
79533 + keyeq(&node->rd_key, reiser4_min_key()) ||
79534 + ZF_ISSET(node, JNODE_HEARD_BANSHEE));
79535 +
79536 + node->rd_key = *key;
79537 + ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
79538 + return &node->rd_key;
79539 +}
79540 +
79541 +/* update left-delimiting key of @node */
79542 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
79543 +{
79544 + assert("nikita-2940", node != NULL);
79545 + assert("nikita-2941", key != NULL);
79546 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79547 + assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
79548 + assert("nikita-2943",
79549 + znode_is_any_locked(node) || keyeq(&node->ld_key,
79550 + reiser4_min_key()));
79551 +
79552 + node->ld_key = *key;
79553 + ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
79554 + return &node->ld_key;
79555 +}
79556 +
79557 +/* true if @key is inside key range for @node */
79558 +int znode_contains_key(znode * node /* znode to look in */ ,
79559 + const reiser4_key * key /* key to look for */ )
79560 +{
79561 + assert("nikita-1237", node != NULL);
79562 + assert("nikita-1238", key != NULL);
79563 +
79564 + /* left_delimiting_key <= key <= right_delimiting_key */
79565 + return keyle(znode_get_ld_key(node), key)
79566 + && keyle(key, znode_get_rd_key(node));
79567 +}
79568 +
79569 +/* same as znode_contains_key(), but takes the dk lock */
79570 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
79571 + const reiser4_key * key /* key to look for */ )
79572 +{
79573 + int result;
79574 +
79575 + assert("umka-056", node != NULL);
79576 + assert("umka-057", key != NULL);
79577 +
79578 + read_lock_dk(znode_get_tree(node));
79579 + result = znode_contains_key(node, key);
79580 + read_unlock_dk(znode_get_tree(node));
79581 + return result;
79582 +}
79583 +
79584 +/* get parent pointer, assuming tree is not locked */
79585 +znode *znode_parent_nolock(const znode * node /* child znode */ )
79586 +{
79587 + assert("nikita-1444", node != NULL);
79588 + return node->in_parent.node;
79589 +}
79590 +
79591 +/* get parent pointer of znode */
79592 +znode *znode_parent(const znode * node /* child znode */ )
79593 +{
79594 + assert("nikita-1226", node != NULL);
79595 + assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
79596 + return znode_parent_nolock(node);
79597 +}
79598 +
79599 +/* detect uber znode used to protect in-superblock tree root pointer */
79600 +int znode_above_root(const znode * node /* znode to query */ )
79601 +{
79602 + assert("umka-059", node != NULL);
79603 +
79604 + return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
79605 +}
79606 +
79607 +/* check that @node is root---that its block number is recorded in the tree
79608 + as that of the root node */
79609 +#if REISER4_DEBUG
79610 +static int znode_is_true_root(const znode * node /* znode to query */ )
79611 +{
79612 + assert("umka-060", node != NULL);
79613 + assert("umka-061", current_tree != NULL);
79614 +
79615 + return disk_addr_eq(znode_get_block(node),
79616 + &znode_get_tree(node)->root_block);
79617 +}
79618 +#endif
79619 +
79620 +/* check that @node is root */
79621 +int znode_is_root(const znode * node /* znode to query */ )
79622 +{
79623 + assert("nikita-1206", node != NULL);
79624 +
79625 + return znode_get_level(node) == znode_get_tree(node)->height;
79626 +}
79627 +
79628 +/* Returns true if @node was just created by zget() and wasn't ever loaded
79629 + into memory. */
79630 +/* NIKITA-HANS: yes */
79631 +int znode_just_created(const znode * node)
79632 +{
79633 + assert("nikita-2188", node != NULL);
79634 + return (znode_page(node) == NULL);
79635 +}
79636 +
79637 +/* obtain updated ->znode_epoch. See seal.c for description. */
79638 +__u64 znode_build_version(reiser4_tree * tree)
79639 +{
79640 + __u64 result;
79641 +
79642 + spin_lock(&tree->epoch_lock);
79643 + result = ++tree->znode_epoch;
79644 + spin_unlock(&tree->epoch_lock);
79645 + return result;
79646 +}
79647 +
79648 +void init_load_count(load_count * dh)
79649 +{
79650 + assert("nikita-2105", dh != NULL);
79651 + memset(dh, 0, sizeof *dh);
79652 +}
79653 +
79654 +void done_load_count(load_count * dh)
79655 +{
79656 + assert("nikita-2106", dh != NULL);
79657 + if (dh->node != NULL) {
79658 + for (; dh->d_ref > 0; --dh->d_ref)
79659 + zrelse(dh->node);
79660 + dh->node = NULL;
79661 + }
79662 +}
79663 +
79664 +static int incr_load_count(load_count * dh)
79665 +{
79666 + int result;
79667 +
79668 + assert("nikita-2110", dh != NULL);
79669 + assert("nikita-2111", dh->node != NULL);
79670 +
79671 + result = zload(dh->node);
79672 + if (result == 0)
79673 + ++dh->d_ref;
79674 + return result;
79675 +}
79676 +
79677 +int incr_load_count_znode(load_count * dh, znode * node)
79678 +{
79679 + assert("nikita-2107", dh != NULL);
79680 + assert("nikita-2158", node != NULL);
79681 + assert("nikita-2109",
79682 + ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
79683 +
79684 + dh->node = node;
79685 + return incr_load_count(dh);
79686 +}
79687 +
79688 +int incr_load_count_jnode(load_count * dh, jnode * node)
79689 +{
79690 + if (jnode_is_znode(node)) {
79691 + return incr_load_count_znode(dh, JZNODE(node));
79692 + }
79693 + return 0;
79694 +}
79695 +
79696 +void copy_load_count(load_count * new, load_count * old)
79697 +{
79698 + int ret = 0;
79699 + done_load_count(new);
79700 + new->node = old->node;
79701 + new->d_ref = 0;
79702 +
79703 + while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
79704 + }
79705 +
79706 + assert("jmacd-87589", ret == 0);
79707 +}
79708 +
79709 +void move_load_count(load_count * new, load_count * old)
79710 +{
79711 + done_load_count(new);
79712 + new->node = old->node;
79713 + new->d_ref = old->d_ref;
79714 + old->node = NULL;
79715 + old->d_ref = 0;
79716 +}
79717 +
79718 +/* convert parent pointer into coord */
79719 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
79720 +{
79721 + assert("nikita-3204", pcoord != NULL);
79722 + assert("nikita-3205", coord != NULL);
79723 +
79724 + coord_init_first_unit_nocheck(coord, pcoord->node);
79725 + coord_set_item_pos(coord, pcoord->item_pos);
79726 + coord->between = AT_UNIT;
79727 +}
79728 +
79729 +/* pack coord into parent_coord_t */
79730 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
79731 +{
79732 + assert("nikita-3206", pcoord != NULL);
79733 + assert("nikita-3207", coord != NULL);
79734 +
79735 + pcoord->node = coord->node;
79736 + pcoord->item_pos = coord->item_pos;
79737 +}
79738 +
79739 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
79740 + look for comments there) */
79741 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
79742 +{
79743 + pcoord->node = (znode *) node;
79744 + pcoord->item_pos = (unsigned short)~0;
79745 +}
79746 +
79747 +#if REISER4_DEBUG
79748 +
79749 +/* debugging aid: znode invariant */
79750 +static int znode_invariant_f(const znode * node /* znode to check */ ,
79751 + char const **msg /* where to store error
79752 + * message, if any */ )
79753 +{
79754 +#define _ergo(ant, con) \
79755 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
79756 +
79757 +#define _equi(e1, e2) \
79758 + ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
79759 +
79760 +#define _check(exp) ((*msg) = #exp, (exp))
79761 +
79762 + return jnode_invariant_f(ZJNODE(node), msg) &&
79763 + /* [znode-fake] invariant */
79764 + /* fake znode doesn't have a parent, and */
79765 + _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
79766 + /* there is another way to express this very check, and */
79767 + _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
79768 + /* it has special block number, and */
79769 + _ergo(znode_get_level(node) == 0,
79770 + disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
79771 + /* it is the only znode with such block number, and */
79772 + _ergo(!znode_above_root(node) && znode_is_loaded(node),
79773 + !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
79774 + /* it is parent of the tree root node */
79775 + _ergo(znode_is_true_root(node),
79776 + znode_above_root(znode_parent(node))) &&
79777 + /* [znode-level] invariant */
79778 + /* level of parent znode is one larger than that of child,
79779 + except for the fake znode, and */
79780 + _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
79781 + znode_get_level(znode_parent(node)) ==
79782 + znode_get_level(node) + 1) &&
79783 + /* left neighbor is at the same level, and */
79784 + _ergo(znode_is_left_connected(node) && node->left != NULL,
79785 + znode_get_level(node) == znode_get_level(node->left)) &&
79786 + /* right neighbor is at the same level */
79787 + _ergo(znode_is_right_connected(node) && node->right != NULL,
79788 + znode_get_level(node) == znode_get_level(node->right)) &&
79789 + /* [znode-connected] invariant */
79790 + _ergo(node->left != NULL, znode_is_left_connected(node)) &&
79791 + _ergo(node->right != NULL, znode_is_right_connected(node)) &&
79792 + _ergo(!znode_is_root(node) && node->left != NULL,
79793 + znode_is_right_connected(node->left) &&
79794 + node->left->right == node) &&
79795 + _ergo(!znode_is_root(node) && node->right != NULL,
79796 + znode_is_left_connected(node->right) &&
79797 + node->right->left == node) &&
79798 + /* [znode-c_count] invariant */
79799 + /* for any znode, c_count of its parent is greater than 0 */
79800 + _ergo(znode_parent(node) != NULL &&
79801 + !znode_above_root(znode_parent(node)),
79802 + znode_parent(node)->c_count > 0) &&
79803 + /* leaves don't have children */
79804 + _ergo(znode_get_level(node) == LEAF_LEVEL,
79805 + node->c_count == 0) &&
79806 + _check(node->zjnode.jnodes.prev != NULL) &&
79807 + _check(node->zjnode.jnodes.next != NULL) &&
79808 + /* orphan doesn't have a parent */
79809 + _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
79810 + /* [znode-modify] invariant */
79811 + /* if znode is not write-locked, its checksum remains
79812 + * invariant */
79813 + /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
79814 + * cannot check this. */
79815 + /* [znode-refs] invariant */
79816 + /* only referenced znode can be long-term locked */
79817 + _ergo(znode_is_locked(node),
79818 + atomic_read(&ZJNODE(node)->x_count) != 0);
79819 +}
79820 +
79821 +/* debugging aid: check znode invariant and panic if it doesn't hold */
79822 +int znode_invariant(znode * node /* znode to check */ )
79823 +{
79824 + char const *failed_msg;
79825 + int result;
79826 +
79827 + assert("umka-063", node != NULL);
79828 + assert("umka-064", current_tree != NULL);
79829 +
79830 + spin_lock_znode(node);
79831 + read_lock_tree(znode_get_tree(node));
79832 + result = znode_invariant_f(node, &failed_msg);
79833 + if (!result) {
79834 + /* print_znode("corrupted node", node); */
79835 + warning("jmacd-555", "Condition %s failed", failed_msg);
79836 + }
79837 + read_unlock_tree(znode_get_tree(node));
79838 + spin_unlock_znode(node);
79839 + return result;
79840 +}
79841 +
79842 +/* return non-0 iff data are loaded into znode */
79843 +int znode_is_loaded(const znode * node /* znode to query */ )
79844 +{
79845 + assert("nikita-497", node != NULL);
79846 + return jnode_is_loaded(ZJNODE(node));
79847 +}
79848 +
79849 +unsigned long znode_times_locked(const znode * z)
79850 +{
79851 + return z->times_locked;
79852 +}
79853 +
79854 +#endif /* REISER4_DEBUG */
79855 +
79856 +/* Make Linus happy.
79857 + Local variables:
79858 + c-indentation-style: "K&R"
79859 + mode-name: "LC"
79860 + c-basic-offset: 8
79861 + tab-width: 8
79862 + fill-column: 120
79863 + End:
79864 +*/
79865 diff -urN linux-2.6.22.orig/fs/reiser4/znode.h linux-2.6.22/fs/reiser4/znode.h
79866 --- linux-2.6.22.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
79867 +++ linux-2.6.22/fs/reiser4/znode.h 2007-07-29 00:25:35.052742032 +0400
79868 @@ -0,0 +1,434 @@
79869 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
79870 + * reiser4/README */
79871 +
79872 +/* Declaration of znode (Zam's node). See znode.c for more details. */
79873 +
79874 +#ifndef __ZNODE_H__
79875 +#define __ZNODE_H__
79876 +
79877 +#include "forward.h"
79878 +#include "debug.h"
79879 +#include "dformat.h"
79880 +#include "key.h"
79881 +#include "coord.h"
79882 +#include "plugin/node/node.h"
79883 +#include "jnode.h"
79884 +#include "lock.h"
79885 +#include "readahead.h"
79886 +
79887 +#include <linux/types.h>
79888 +#include <linux/spinlock.h>
79889 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
79890 +#include <asm/atomic.h>
79891 +#include <asm/semaphore.h>
79892 +
79893 +/* znode tracks its position within its parent (the internal item in the
79894 + * parent node that contains the znode's block number). */
79895 +typedef struct parent_coord {
79896 + znode *node;
79897 + pos_in_node_t item_pos;
79898 +} parent_coord_t;
79899 +
79900 +/* &znode - node in a reiser4 tree.
79901 +
79902 + NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
79903 + cacheline pressure.
79904 +
79905 + Locking:
79906 +
79907 + Long term: data in a disk node attached to this znode are protected
79908 + by long term, deadlock aware lock ->lock;
79909 +
79910 + Spin lock: the following fields are protected by the spin lock:
79911 +
79912 + ->lock
79913 +
79914 + Following fields are protected by the global tree lock:
79915 +
79916 + ->left
79917 + ->right
79918 + ->in_parent
79919 + ->c_count
79920 +
79921 + Following fields are protected by the global delimiting key lock (dk_lock):
79922 +
79923 + ->ld_key (to update ->ld_key long-term lock on the node is also required)
79924 + ->rd_key
79925 +
79926 + Following fields are protected by the long term lock:
79927 +
79928 + ->nr_items
79929 +
79930 + ->node_plugin is never changed once set. This means that after code made
79931 + itself sure that field is valid it can be accessed without any additional
79932 + locking.
79933 +
79934 + ->level is immutable.
79935 +
79936 + Invariants involving this data-type:
79937 +
79938 + [znode-fake]
79939 + [znode-level]
79940 + [znode-connected]
79941 + [znode-c_count]
79942 + [znode-refs]
79943 + [jnode-refs]
79944 + [jnode-queued]
79945 + [znode-modify]
79946 +
79947 + For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
79948 + Suggestions for how to do that are desired.*/
79949 +struct znode {
79950 + /* Embedded jnode. */
79951 + jnode zjnode;
79952 +
79953 + /* contains two subfields, node and item_pos.
79954 +
79955 + item_pos is only a hint that is cached to
79956 + speed up lookups during balancing. It is not required to be up to
79957 + date. Synched in find_child_ptr().
79958 +
79959 + This value allows us to avoid expensive binary searches.
79960 +
79961 + in_parent->node points to the parent of this node, and is NOT a
79962 + hint.
79963 + */
79964 + parent_coord_t in_parent;
79965 +
79966 + /*
79967 + * sibling list pointers
79968 + */
79969 +
79970 + /* left-neighbor */
79971 + znode *left;
79972 + /* right-neighbor */
79973 + znode *right;
79974 +
79975 + /* long term lock on node content. This lock supports deadlock
79976 + detection. See lock.c
79977 + */
79978 + zlock lock;
79979 +
79980 + /* You cannot remove from memory a node that has children in
79981 + memory. This is because we rely on the fact that parent of given
79982 + node can always be reached without blocking for io. When reading a
79983 + node into memory you must increase the c_count of its parent, when
79984 + removing it from memory you must decrease the c_count. This makes
79985 + the code simpler, and the cases where it is suboptimal are truly
79986 + obscure.
79987 + */
79988 + int c_count;
79989 +
79990 + /* plugin of node attached to this znode. NULL if znode is not
79991 + loaded. */
79992 + node_plugin *nplug;
79993 +
79994 + /* version of znode data. This is increased on each modification. This
79995 + * is necessary to implement seals (see seal.[ch]) efficiently. */
79996 + __u64 version;
79997 +
79998 + /* left delimiting key. Necessary to efficiently perform
79999 + balancing with node-level locking. Kept in memory only. */
80000 + reiser4_key ld_key;
80001 + /* right delimiting key. */
80002 + reiser4_key rd_key;
80003 +
80004 + /* znode's tree level */
80005 + __u16 level;
80006 + /* number of items in this node. This field is modified by node
80007 + * plugin. */
80008 + __u16 nr_items;
80009 +
80010 +#if REISER4_DEBUG
80011 + void *creator;
80012 + reiser4_key first_key;
80013 + unsigned long times_locked;
80014 + int left_version; /* when node->left was updated */
80015 + int right_version; /* when node->right was updated */
80016 + int ld_key_version; /* when node->ld_key was updated */
80017 + int rd_key_version; /* when node->rd_key was updated */
80018 +#endif
80019 +
80020 +} __attribute__ ((aligned(16)));
80021 +
80022 +ON_DEBUG(extern atomic_t delim_key_version;
80023 + )
80024 +
80025 +/* In general I think these macros should not be exposed. */
80026 +#define znode_is_locked(node) (lock_is_locked(&node->lock))
80027 +#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80028 +#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80029 +#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80030 +#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80031 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80032 +/* Macros for accessing the znode state. */
80033 +#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80034 +#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80035 +#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80036 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80037 + znode * parent, tree_level level, gfp_t gfp_flag);
80038 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80039 +extern int zload(znode * node);
80040 +extern int zload_ra(znode * node, ra_info_t * info);
80041 +extern int zinit_new(znode * node, gfp_t gfp_flags);
80042 +extern void zrelse(znode * node);
80043 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80044 +
80045 +/* size of data in znode */
80046 +static inline unsigned
80047 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80048 +{
80049 + assert("nikita-1416", node != NULL);
80050 + return PAGE_CACHE_SIZE;
80051 +}
80052 +
80053 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80054 + coord_t * coord);
80055 +extern void coord_to_parent_coord(const coord_t * coord,
80056 + parent_coord_t * pcoord);
80057 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80058 +
80059 +extern unsigned znode_free_space(znode * node);
80060 +
80061 +extern reiser4_key *znode_get_rd_key(znode * node);
80062 +extern reiser4_key *znode_get_ld_key(znode * node);
80063 +
80064 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80065 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80066 +
80067 +/* `connected' state checks */
80068 +static inline int znode_is_right_connected(const znode * node)
80069 +{
80070 + return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80071 +}
80072 +
80073 +static inline int znode_is_left_connected(const znode * node)
80074 +{
80075 + return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80076 +}
80077 +
80078 +static inline int znode_is_connected(const znode * node)
80079 +{
80080 + return znode_is_right_connected(node) && znode_is_left_connected(node);
80081 +}
80082 +
80083 +extern int znode_shift_order;
80084 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80085 +extern void znode_remove(znode *, reiser4_tree *);
80086 +extern znode *znode_parent(const znode * node);
80087 +extern znode *znode_parent_nolock(const znode * node);
80088 +extern int znode_above_root(const znode * node);
80089 +extern int init_znodes(void);
80090 +extern void done_znodes(void);
80091 +extern int znodes_tree_init(reiser4_tree * ztree);
80092 +extern void znodes_tree_done(reiser4_tree * ztree);
80093 +extern int znode_contains_key(znode * node, const reiser4_key * key);
80094 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80095 +extern unsigned znode_save_free_space(znode * node);
80096 +extern unsigned znode_recover_free_space(znode * node);
80097 +extern znode *zalloc(gfp_t gfp_flag);
80098 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
80099 +extern int zparse(znode * node);
80100 +
80101 +extern int znode_just_created(const znode * node);
80102 +
80103 +extern void zfree(znode * node);
80104 +
80105 +#if REISER4_DEBUG
80106 +extern void print_znode(const char *prefix, const znode * node);
80107 +#else
80108 +#define print_znode( p, n ) noop
80109 +#endif
80110 +
80111 +/* Make it look like various znode functions exist instead of treating znodes as
80112 + jnodes in znode-specific code. */
80113 +#define znode_page(x) jnode_page ( ZJNODE(x) )
80114 +#define zdata(x) jdata ( ZJNODE(x) )
80115 +#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80116 +#define znode_created(x) jnode_created ( ZJNODE(x) )
80117 +#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80118 +#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80119 +#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80120 +
80121 +#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80122 +#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80123 +#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80124 +#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80125 +
80126 +#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80127 +#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80128 +#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80129 +#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80130 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80131 +
80132 +#if REISER4_DEBUG
80133 +extern int znode_x_count_is_protected(const znode * node);
80134 +extern int znode_invariant(znode * node);
80135 +#endif
80136 +
80137 +/* acquire reference to @node */
80138 +static inline znode *zref(znode * node)
80139 +{
80140 + /* change of x_count from 0 to 1 is protected by tree spin-lock */
80141 + return JZNODE(jref(ZJNODE(node)));
80142 +}
80143 +
80144 +/* release reference to @node */
80145 +static inline void zput(znode * node)
80146 +{
80147 + assert("nikita-3564", znode_invariant(node));
80148 + jput(ZJNODE(node));
80149 +}
80150 +
80151 +/* get the level field for a znode */
80152 +static inline tree_level znode_get_level(const znode * node)
80153 +{
80154 + return node->level;
80155 +}
80156 +
80157 +/* get the level field for a jnode */
80158 +static inline tree_level jnode_get_level(const jnode * node)
80159 +{
80160 + if (jnode_is_znode(node))
80161 + return znode_get_level(JZNODE(node));
80162 + else
80163 + /* unformatted nodes are all at the LEAF_LEVEL and for
80164 + "semi-formatted" nodes like bitmaps, level doesn't matter. */
80165 + return LEAF_LEVEL;
80166 +}
80167 +
80168 +/* true if jnode is on leaf level */
80169 +static inline int jnode_is_leaf(const jnode * node)
80170 +{
80171 + if (jnode_is_znode(node))
80172 + return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80173 + if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80174 + return 1;
80175 + return 0;
80176 +}
80177 +
80178 +/* return znode's tree */
80179 +static inline reiser4_tree *znode_get_tree(const znode * node)
80180 +{
80181 + assert("nikita-2692", node != NULL);
80182 + return jnode_get_tree(ZJNODE(node));
80183 +}
80184 +
80185 +/* resolve race with zput */
80186 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80187 +{
80188 + jnode *j;
80189 +
80190 + j = jnode_rip_sync(tree, ZJNODE(node));
80191 + if (likely(j != NULL))
80192 + node = JZNODE(j);
80193 + else
80194 + node = NULL;
80195 + return node;
80196 +}
80197 +
80198 +#if defined(REISER4_DEBUG)
80199 +int znode_is_loaded(const znode * node /* znode to query */ );
80200 +#endif
80201 +
80202 +extern __u64 znode_build_version(reiser4_tree * tree);
80203 +
80204 +/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
80205 + must load the data for a node in many places. We could do this by simply calling
80206 + zload() everywhere; the difficulty arises when we must release the loaded data by
80207 + calling zrelse. In a function with many possible error/return paths, it takes extra
80208 + work to figure out which exit paths must call zrelse and which do not. The data
80209 + handle automatically calls zrelse for every zload that it is responsible for. In that
80210 + sense, it acts much like a lock_handle.
80211 +*/
80212 +typedef struct load_count {
80213 + znode *node;
80214 + int d_ref;
80215 +} load_count;
80216 +
80217 +extern void init_load_count(load_count * lc); /* Initialize a load_count; set the current node to NULL. */
80218 +extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
80219 +extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
80220 +extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
80221 + * incr_load_count_znode, otherwise do nothing (unformatted nodes
80222 + * don't require zload/zrelse treatment). */
80223 +extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
80224 +extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
80225 +
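For illustration only (hypothetical function, not part of the patch): with a
data handle, a routine with several exit paths releases whatever it loaded in
exactly one place.

	/* Hypothetical sketch: done_load_count() undoes every zload() taken
	 * through the handle, on all exit paths. */
	static int example_with_load_count(znode *node)
	{
		load_count lc;
		int ret;

		init_load_count(&lc);
		ret = incr_load_count_znode(&lc, node); /* zload() inside */
		if (ret == 0) {
			/* ... work with the loaded node data ... */
		}
		done_load_count(&lc); /* zrelse() as many times as needed */
		return ret;
	}
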
80226 +/* Variable initializers for load_count. */
80227 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
80228 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
80229 +/* A convenience macro for use in assertions or debug-only code, where loaded
80230 + data is only required to perform the debugging check. This macro
80231 + encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80232 +#define WITH_DATA( node, exp ) \
80233 +({ \
80234 + long __with_dh_result; \
80235 + znode *__with_dh_node; \
80236 + \
80237 + __with_dh_node = ( node ); \
80238 + __with_dh_result = zload( __with_dh_node ); \
80239 + if( __with_dh_result == 0 ) { \
80240 + __with_dh_result = ( long )( exp ); \
80241 + zrelse( __with_dh_node ); \
80242 + } \
80243 + __with_dh_result; \
80244 +})
80245 +
80246 +/* Same as above, but accepts a return value in case zload fails. */
80247 +#define WITH_DATA_RET( node, ret, exp ) \
80248 +({ \
80249 + int __with_dh_result; \
80250 + znode *__with_dh_node; \
80251 + \
80252 + __with_dh_node = ( node ); \
80253 + __with_dh_result = zload( __with_dh_node ); \
80254 + if( __with_dh_result == 0 ) { \
80255 + __with_dh_result = ( int )( exp ); \
80256 + zrelse( __with_dh_node ); \
80257 + } else \
80258 + __with_dh_result = ( ret ); \
80259 + __with_dh_result; \
80260 +})
80261 +
80262 +#define WITH_COORD(coord, exp) \
80263 +({ \
80264 + coord_t *__coord; \
80265 + \
80266 + __coord = (coord); \
80267 + coord_clear_iplug(__coord); \
80268 + WITH_DATA(__coord->node, exp); \
80269 +})
80270 +
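A hypothetical use of WITH_DATA, for illustration only: the expression is
evaluated only when zload() succeeds, and zrelse() is issued automatically.
node_is_empty() stands in here for any predicate that needs loaded data.

	assert("example-1", WITH_DATA(node, node_is_empty(node)));
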
80271 +#if REISER4_DEBUG
80272 +#define STORE_COUNTERS \
80273 + reiser4_lock_cnt_info __entry_counters = \
80274 + *reiser4_lock_counters()
80275 +#define CHECK_COUNTERS \
80276 +ON_DEBUG_CONTEXT( \
80277 +({ \
80278 + __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
80279 + __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
80280 + __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
80281 + assert("nikita-2159", \
80282 + !memcmp(&__entry_counters, reiser4_lock_counters(), \
80283 + sizeof __entry_counters)); \
80284 +}) )
80285 +
80286 +#else
80287 +#define STORE_COUNTERS
80288 +#define CHECK_COUNTERS noop
80289 +#endif
80290 +
80291 +/* __ZNODE_H__ */
80292 +#endif
80293 +
80294 +/* Make Linus happy.
80295 + Local variables:
80296 + c-indentation-style: "K&R"
80297 + mode-name: "LC"
80298 + c-basic-offset: 8
80299 + tab-width: 8
80300 + fill-column: 120
80301 + End:
80302 +*/
80303 diff -urN linux-2.6.22.orig/include/linux/fs.h linux-2.6.22/include/linux/fs.h
80304 --- linux-2.6.22.orig/include/linux/fs.h 2007-07-21 00:33:00.673389540 +0400
80305 +++ linux-2.6.22/include/linux/fs.h 2007-07-29 00:25:35.056743067 +0400
80306 @@ -1179,6 +1179,8 @@
80307 void (*clear_inode) (struct inode *);
80308 void (*umount_begin) (struct vfsmount *, int);
80309
80310 + void (*sync_inodes) (struct super_block *sb,
80311 + struct writeback_control *wbc);
80312 int (*show_options)(struct seq_file *, struct vfsmount *);
80313 int (*show_stats)(struct seq_file *, struct vfsmount *);
80314 #ifdef CONFIG_QUOTA
80315 @@ -1630,6 +1632,7 @@
80316 extern int invalidate_inode_pages2_range(struct address_space *mapping,
80317 pgoff_t start, pgoff_t end);
80318 extern int write_inode_now(struct inode *, int);
80319 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80320 extern int filemap_fdatawrite(struct address_space *);
80321 extern int filemap_flush(struct address_space *);
80322 extern int filemap_fdatawait(struct address_space *);
80323 diff -urN linux-2.6.22.orig/lib/radix-tree.c linux-2.6.22/lib/radix-tree.c
80324 --- linux-2.6.22.orig/lib/radix-tree.c 2007-07-21 00:33:01.265543326 +0400
80325 +++ linux-2.6.22/lib/radix-tree.c 2007-07-29 00:25:35.060744102 +0400
80326 @@ -151,6 +151,7 @@
80327 out:
80328 return ret;
80329 }
80330 +EXPORT_SYMBOL(radix_tree_preload);
80331
80332 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
80333 int offset)
80334 diff -urN linux-2.6.22.orig/mm/filemap.c linux-2.6.22/mm/filemap.c
80335 --- linux-2.6.22.orig/mm/filemap.c 2007-07-21 00:33:01.277546443 +0400
80336 +++ linux-2.6.22/mm/filemap.c 2007-07-29 00:25:35.064745138 +0400
80337 @@ -121,6 +121,7 @@
80338 mapping->nrpages--;
80339 __dec_zone_page_state(page, NR_FILE_PAGES);
80340 }
80341 +EXPORT_SYMBOL(__remove_from_page_cache);
80342
80343 void remove_from_page_cache(struct page *page)
80344 {
80345 @@ -132,6 +133,7 @@
80346 __remove_from_page_cache(page);
80347 write_unlock_irq(&mapping->tree_lock);
80348 }
80349 +EXPORT_SYMBOL(remove_from_page_cache);
80350
80351 static int sync_page(void *word)
80352 {
80353 @@ -719,6 +721,7 @@
80354 read_unlock_irq(&mapping->tree_lock);
80355 return ret;
80356 }
80357 +EXPORT_SYMBOL(add_to_page_cache_lru);
80358
80359 /**
80360 * find_get_pages_contig - gang contiguous pagecache lookup
80361 @@ -838,6 +841,7 @@
80362
80363 ra->ra_pages /= 4;
80364 }
80365 +EXPORT_SYMBOL(find_get_pages);
80366
80367 /**
80368 * do_generic_mapping_read - generic file read routine
80369 diff -urN linux-2.6.22.orig/mm/readahead.c linux-2.6.22/mm/readahead.c
80370 --- linux-2.6.22.orig/mm/readahead.c 2007-07-21 00:33:01.305553717 +0400
80371 +++ linux-2.6.22/mm/readahead.c 2007-07-29 00:25:35.064745138 +0400
80372 @@ -571,6 +571,7 @@
80373 ra->flags &= ~RA_FLAG_INCACHE;
80374 ra->cache_hit = 0;
80375 }
80376 +EXPORT_SYMBOL_GPL(handle_ra_miss);
80377
80378 /*
80379 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a