]> git.ipfire.org Git - people/pmueller/ipfire-2.x.git/blob - src/patches/reiser4-for-2.6.23.patch
Load libata prior udev at installer because some SATA doesnt autoload it
[people/pmueller/ipfire-2.x.git] / src / patches / reiser4-for-2.6.23.patch
1 diff -urN linux-2.6.23.orig/arch/i386/lib/usercopy.c linux-2.6.23/arch/i386/lib/usercopy.c
2 --- linux-2.6.23.orig/arch/i386/lib/usercopy.c 2007-10-10 00:31:38.000000000 +0400
3 +++ linux-2.6.23/arch/i386/lib/usercopy.c 2007-12-04 20:02:08.041841326 +0300
4 @@ -817,6 +817,7 @@
5 #endif
6 return n;
7 }
8 +EXPORT_SYMBOL(__copy_from_user_ll_nocache);
9
10 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
11 unsigned long n)
12 @@ -831,6 +832,7 @@
13 #endif
14 return n;
15 }
16 +EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
17
18 /**
19 * copy_to_user: - Copy a block of data into user space.
20 diff -urN linux-2.6.23.orig/Documentation/Changes linux-2.6.23/Documentation/Changes
21 --- linux-2.6.23.orig/Documentation/Changes 2007-10-10 00:31:38.000000000 +0400
22 +++ linux-2.6.23/Documentation/Changes 2007-12-04 20:02:08.041841326 +0300
23 @@ -36,6 +36,7 @@
24 o e2fsprogs 1.29 # tune2fs
25 o jfsutils 1.1.3 # fsck.jfs -V
26 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
27 +o reiser4progs 1.0.0 # fsck.reiser4 -V
28 o xfsprogs 2.6.0 # xfs_db -V
29 o pcmciautils 004 # pccardctl -V
30 o quota-tools 3.09 # quota -V
31 @@ -145,6 +146,13 @@
32 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
33 reiserfsck. These utils work on both i386 and alpha platforms.
34
35 +Reiser4progs
36 +------------
37 +
38 +The reiser4progs package contains utilities for the reiser4 file system.
39 +Detailed instructions are provided in the README file located at:
40 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
41 +
42 Xfsprogs
43 --------
44
45 @@ -323,6 +331,10 @@
46 -------------
47 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
48
49 +Reiser4progs
50 +------------
51 +o <ftp://ftp.namesys.com/pub/reiser4progs/>
52 +
53 Xfsprogs
54 --------
55 o <ftp://oss.sgi.com/projects/xfs/download/>
56 diff -urN linux-2.6.23.orig/Documentation/filesystems/reiser4.txt linux-2.6.23/Documentation/filesystems/reiser4.txt
57 --- linux-2.6.23.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300
58 +++ linux-2.6.23/Documentation/filesystems/reiser4.txt 2007-12-04 20:02:08.041841326 +0300
59 @@ -0,0 +1,75 @@
60 +Reiser4 filesystem
61 +==================
62 +Reiser4 is a file system based on dancing tree algorithms, and is
63 +described at http://www.namesys.com
64 +
65 +
66 +References
67 +==========
68 +web page http://namesys.com/v4/v4.html
69 +source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
70 +userland tools ftp://ftp.namesys.com/pub/reiser4progs/
71 +install page http://www.namesys.com/install_v4.html
72 +
73 +Compile options
74 +===============
75 +Enable reiser4 debug mode
76 + This checks everything imaginable while reiser4
77 + runs
78 +
79 +Mount options
80 +=============
81 +tmgr.atom_max_size=N
82 + Atoms containing more than N blocks will be forced to commit.
83 + N is decimal.
84 + Default is nr_free_pagecache_pages() / 2 at mount time.
85 +
86 +tmgr.atom_max_age=N
87 + Atoms older than N seconds will be forced to commit. N is decimal.
88 + Default is 600.
89 +
90 +tmgr.atom_max_flushers=N
91 + Limit of concurrent flushers for one atom. 0 means no limit.
92 + Default is 0.
93 +
94 +tree.cbk_cache.nr_slots=N
95 + Number of slots in the cbk cache.
96 +
97 +flush.relocate_threshold=N
98 + If flush finds more than N adjacent dirty leaf-level blocks it
99 + will force them to be relocated.
100 + Default is 64.
101 +
102 +flush.relocate_distance=N
103 +	If flush can find a block allocation closer than at most
104 + N from the preceder it will relocate to that position.
105 + Default is 64.
106 +
107 +flush.scan_maxnodes=N
108 + The maximum number of nodes to scan left on a level during
109 + flush.
110 + Default is 10000.
111 +
112 +optimal_io_size=N
113 + Preferred IO size. This value is used to set st_blksize of
114 + struct stat.
115 + Default is 65536.
116 +
117 +bsdgroups
118 + Turn on BSD-style gid assignment.
119 +
120 +32bittimes
121 + By default file in reiser4 have 64 bit timestamps. Files
122 + created when filesystem is mounted with 32bittimes mount
123 + option will get 32 bit timestamps.
124 +
125 +mtflush
126 + Turn off concurrent flushing.
127 +
128 +nopseudo
129 + Disable pseudo files support. See
130 + http://namesys.com/v4/pseudo.html for more about pseudo files.
131 +
132 +dont_load_bitmap
133 + Don't load all bitmap blocks at mount time, it is useful for
134 + machines with tiny RAM and large disks.
135 diff -urN linux-2.6.23.orig/fs/fs-writeback.c linux-2.6.23/fs/fs-writeback.c
136 --- linux-2.6.23.orig/fs/fs-writeback.c 2007-10-10 00:31:38.000000000 +0400
137 +++ linux-2.6.23/fs/fs-writeback.c 2007-12-04 20:02:08.045842355 +0300
138 @@ -296,8 +296,6 @@
139 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
140 * that it can be located for waiting on in __writeback_single_inode().
141 *
142 - * Called under inode_lock.
143 - *
144 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
145 * This function assumes that the blockdev superblock's inodes are backed by
146 * a variety of queues, so all inodes are searched. For other superblocks,
147 @@ -313,11 +311,13 @@
148 * on the writer throttling path, and we get decent balancing between many
149 * throttled threads: we don't want them all piling up on __wait_on_inode.
150 */
151 -static void
152 -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
153 +void
154 +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
155 {
156 const unsigned long start = jiffies; /* livelock avoidance */
157
158 + spin_lock(&inode_lock);
159 +
160 if (!wbc->for_kupdate || list_empty(&sb->s_io))
161 list_splice_init(&sb->s_dirty, &sb->s_io);
162
163 @@ -397,8 +397,19 @@
164 if (wbc->nr_to_write <= 0)
165 break;
166 }
167 + spin_unlock(&inode_lock);
168 return; /* Leave any unwritten inodes on s_io */
169 }
170 +EXPORT_SYMBOL(generic_sync_sb_inodes);
171 +
172 +static void
173 +sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
174 +{
175 + if (sb->s_op->sync_inodes)
176 + sb->s_op->sync_inodes(sb, wbc);
177 + else
178 + generic_sync_sb_inodes(sb, wbc);
179 +}
180
181 /*
182 * Start writeback of dirty pagecache data against all unlocked inodes.
183 @@ -439,11 +450,8 @@
184 * be unmounted by the time it is released.
185 */
186 if (down_read_trylock(&sb->s_umount)) {
187 - if (sb->s_root) {
188 - spin_lock(&inode_lock);
189 + if (sb->s_root)
190 sync_sb_inodes(sb, wbc);
191 - spin_unlock(&inode_lock);
192 - }
193 up_read(&sb->s_umount);
194 }
195 spin_lock(&sb_lock);
196 @@ -481,9 +489,7 @@
197 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
198 nr_dirty + nr_unstable;
199 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
200 - spin_lock(&inode_lock);
201 sync_sb_inodes(sb, &wbc);
202 - spin_unlock(&inode_lock);
203 }
204
205 /*
206 diff -urN linux-2.6.23.orig/fs/Kconfig linux-2.6.23/fs/Kconfig
207 --- linux-2.6.23.orig/fs/Kconfig 2007-10-10 00:31:38.000000000 +0400
208 +++ linux-2.6.23/fs/Kconfig 2007-12-04 20:02:08.045842355 +0300
209 @@ -272,6 +272,8 @@
210 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
211 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
212
213 +source "fs/reiser4/Kconfig"
214 +
215 config REISERFS_FS
216 tristate "Reiserfs support"
217 help
218 diff -urN linux-2.6.23.orig/fs/Makefile linux-2.6.23/fs/Makefile
219 --- linux-2.6.23.orig/fs/Makefile 2007-10-10 00:31:38.000000000 +0400
220 +++ linux-2.6.23/fs/Makefile 2007-12-04 20:02:08.049843385 +0300
221 @@ -66,6 +66,7 @@
222
223 # Do not add any filesystems before this line
224 obj-$(CONFIG_REISERFS_FS) += reiserfs/
225 +obj-$(CONFIG_REISER4_FS) += reiser4/
226 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
227 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
228 obj-$(CONFIG_JBD) += jbd/
229 diff -urN linux-2.6.23.orig/fs/reiser4/as_ops.c linux-2.6.23/fs/reiser4/as_ops.c
230 --- linux-2.6.23.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
231 +++ linux-2.6.23/fs/reiser4/as_ops.c 2007-12-04 16:49:30.000000000 +0300
232 @@ -0,0 +1,377 @@
233 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
234 +
235 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
236 +
237 +#include "forward.h"
238 +#include "debug.h"
239 +#include "dformat.h"
240 +#include "coord.h"
241 +#include "plugin/item/item.h"
242 +#include "plugin/file/file.h"
243 +#include "plugin/security/perm.h"
244 +#include "plugin/disk_format/disk_format.h"
245 +#include "plugin/plugin.h"
246 +#include "plugin/plugin_set.h"
247 +#include "plugin/object.h"
248 +#include "txnmgr.h"
249 +#include "jnode.h"
250 +#include "znode.h"
251 +#include "block_alloc.h"
252 +#include "tree.h"
253 +#include "vfs_ops.h"
254 +#include "inode.h"
255 +#include "page_cache.h"
256 +#include "ktxnmgrd.h"
257 +#include "super.h"
258 +#include "reiser4.h"
259 +#include "entd.h"
260 +
261 +#include <linux/profile.h>
262 +#include <linux/types.h>
263 +#include <linux/mount.h>
264 +#include <linux/vfs.h>
265 +#include <linux/mm.h>
266 +#include <linux/buffer_head.h>
267 +#include <linux/dcache.h>
268 +#include <linux/list.h>
269 +#include <linux/pagemap.h>
270 +#include <linux/slab.h>
271 +#include <linux/seq_file.h>
272 +#include <linux/init.h>
273 +#include <linux/module.h>
274 +#include <linux/writeback.h>
275 +#include <linux/backing-dev.h>
276 +#include <linux/quotaops.h>
277 +#include <linux/security.h>
278 +
279 +/* address space operations */
280 +
281 +/**
282 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
283 + * @page: page to be dirtied
284 + *
285 + * Operation of struct address_space_operations. This implementation is used by
286 + * unix and cryptcompress file plugins.
287 + *
288 + * This is called when reiser4 page gets dirtied outside of reiser4, for
289 + * example, when dirty bit is moved from pte to physical page.
290 + *
291 + * Tags page in the mapping's page tree with special tag so that it is possible
292 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
293 + * capturing by an atom) later because it can not be done in the contexts where
294 + * set_page_dirty is called.
295 + */
296 +int reiser4_set_page_dirty(struct page *page)
297 +{
298 + /* this page can be unformatted only */
299 + assert("vs-1734", (page->mapping &&
300 + page->mapping->host &&
301 + reiser4_get_super_fake(page->mapping->host->i_sb) !=
302 + page->mapping->host
303 + && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
304 + page->mapping->host
305 + && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
306 + page->mapping->host));
307 +
308 + if (!TestSetPageDirty(page)) {
309 + struct address_space *mapping = page->mapping;
310 +
311 + if (mapping) {
312 + write_lock_irq(&mapping->tree_lock);
313 +
314 + /* check for race with truncate */
315 + if (page->mapping) {
316 + assert("vs-1652", page->mapping == mapping);
317 + if (mapping_cap_account_dirty(mapping))
318 + inc_zone_page_state(page,
319 + NR_FILE_DIRTY);
320 + radix_tree_tag_set(&mapping->page_tree,
321 + page->index,
322 + PAGECACHE_TAG_REISER4_MOVED);
323 + }
324 + write_unlock_irq(&mapping->tree_lock);
325 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
326 + }
327 + }
328 + return 0;
329 +}
330 +
331 +/* ->invalidatepage method for reiser4 */
332 +
333 +/*
334 + * this is called for each truncated page from
335 + * truncate_inode_pages()->truncate_{complete,partial}_page().
336 + *
337 + * At the moment of call, page is under lock, and outstanding io (if any) has
338 + * completed.
339 + */
340 +
341 +/**
342 + * reiser4_invalidatepage
343 + * @page: page to invalidate
344 + * @offset: starting offset for partial invalidation
345 + *
346 + */
347 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
348 +{
349 + int ret = 0;
350 + reiser4_context *ctx;
351 + struct inode *inode;
352 + jnode *node;
353 +
354 + /*
355 + * This is called to truncate file's page.
356 + *
357 + * Originally, reiser4 implemented truncate in a standard way
358 + * (vmtruncate() calls ->invalidatepage() on all truncated pages
359 + * first, then file system ->truncate() call-back is invoked).
360 + *
361 + * This lead to the problem when ->invalidatepage() was called on a
362 + * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
363 + * process. That is, truncate was bypassing transactions. To avoid
364 + * this, try_capture_page_to_invalidate() call was added here.
365 + *
366 + * After many troubles with vmtruncate() based truncate (including
367 + * races with flush, tail conversion, etc.) it was re-written in the
368 + * top-to-bottom style: items are killed in reiser4_cut_tree_object()
369 + * and pages belonging to extent are invalidated in kill_hook_extent().
370 + * So probably now additional call to capture is not needed here.
371 + */
372 +
373 + assert("nikita-3137", PageLocked(page));
374 + assert("nikita-3138", !PageWriteback(page));
375 + inode = page->mapping->host;
376 +
377 + /*
378 + * ->invalidatepage() should only be called for the unformatted
379 + * jnodes. Destruction of all other types of jnodes is performed
380 + * separately. But, during some corner cases (like handling errors
381 + * during mount) it is simpler to let ->invalidatepage to be called on
382 + * them. Check for this, and do nothing.
383 + */
384 + if (reiser4_get_super_fake(inode->i_sb) == inode)
385 + return;
386 + if (reiser4_get_cc_fake(inode->i_sb) == inode)
387 + return;
388 + if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
389 + return;
390 + assert("vs-1426", PagePrivate(page));
391 + assert("vs-1427",
392 + page->mapping == jnode_get_mapping(jnode_by_page(page)));
393 + assert("", jprivate(page) != NULL);
394 + assert("", ergo(inode_file_plugin(inode) !=
395 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
396 + offset == 0));
397 +
398 + ctx = reiser4_init_context(inode->i_sb);
399 + if (IS_ERR(ctx))
400 + return;
401 +
402 + node = jprivate(page);
403 + spin_lock_jnode(node);
404 + if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
405 + (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
406 +		/* there is no need to capture */
407 + jref(node);
408 + JF_SET(node, JNODE_HEARD_BANSHEE);
409 + page_clear_jnode(page, node);
410 + reiser4_uncapture_jnode(node);
411 + unhash_unformatted_jnode(node);
412 + jput(node);
413 + reiser4_exit_context(ctx);
414 + return;
415 + }
416 + spin_unlock_jnode(node);
417 +
418 + /* capture page being truncated. */
419 + ret = try_capture_page_to_invalidate(page);
420 + if (ret != 0)
421 + warning("nikita-3141", "Cannot capture: %i", ret);
422 +
423 + if (offset == 0) {
424 + /* remove jnode from transaction and detach it from page. */
425 + jref(node);
426 + JF_SET(node, JNODE_HEARD_BANSHEE);
427 + /* page cannot be detached from jnode concurrently, because it
428 + * is locked */
429 + reiser4_uncapture_page(page);
430 +
431 + /* this detaches page from jnode, so that jdelete will not try
432 + * to lock page which is already locked */
433 + spin_lock_jnode(node);
434 + page_clear_jnode(page, node);
435 + spin_unlock_jnode(node);
436 + unhash_unformatted_jnode(node);
437 +
438 + jput(node);
439 + }
440 +
441 + reiser4_exit_context(ctx);
442 +}
443 +
444 +/* help function called from reiser4_releasepage(). It returns true if jnode
445 + * can be detached from its page and page released. */
446 +int jnode_is_releasable(jnode * node /* node to check */ )
447 +{
448 + assert("nikita-2781", node != NULL);
449 + assert_spin_locked(&(node->guard));
450 + assert_spin_locked(&(node->load));
451 +
452 +	/* if some thread is currently using jnode page, the latter cannot be
453 + * detached */
454 + if (atomic_read(&node->d_count) != 0) {
455 + return 0;
456 + }
457 +
458 + assert("vs-1214", !jnode_is_loaded(node));
459 +
460 + /*
461 + * can only release page if real block number is assigned to it. Simple
462 + * check for ->atom wouldn't do, because it is possible for node to be
463 +	 * clean, not in atom yet, and still having fake block number. For
464 + * example, node just created in jinit_new().
465 + */
466 + if (reiser4_blocknr_is_fake(jnode_get_block(node)))
467 + return 0;
468 +
469 + /*
470 + * pages prepared for write can not be released anyway, so avoid
471 + * detaching jnode from the page
472 + */
473 + if (JF_ISSET(node, JNODE_WRITE_PREPARED))
474 + return 0;
475 +
476 + /*
477 + * dirty jnode cannot be released. It can however be submitted to disk
478 + * as part of early flushing, but only after getting flush-prepped.
479 + */
480 + if (JF_ISSET(node, JNODE_DIRTY))
481 + return 0;
482 +
483 + /* overwrite set is only written by log writer. */
484 + if (JF_ISSET(node, JNODE_OVRWR))
485 + return 0;
486 +
487 + /* jnode is already under writeback */
488 + if (JF_ISSET(node, JNODE_WRITEBACK))
489 + return 0;
490 +
491 + /* don't flush bitmaps or journal records */
492 + if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
493 + return 0;
494 +
495 + return 1;
496 +}
497 +
498 +/*
499 + * ->releasepage method for reiser4
500 + *
501 + * This is called by VM scanner when it comes across clean page. What we have
502 + * to do here is to check whether page can really be released (freed that is)
503 + * and if so, detach jnode from it and remove page from the page cache.
504 + *
505 + * Check for releasability is done by releasable() function.
506 + */
507 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
508 +{
509 + jnode *node;
510 +
511 + assert("nikita-2257", PagePrivate(page));
512 + assert("nikita-2259", PageLocked(page));
513 + assert("nikita-2892", !PageWriteback(page));
514 + assert("nikita-3019", reiser4_schedulable());
515 +
516 + /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
517 + is not clear what to do in this case. A lot of deadlocks seems be
518 + possible. */
519 +
520 + node = jnode_by_page(page);
521 + assert("nikita-2258", node != NULL);
522 + assert("reiser4-4", page->mapping != NULL);
523 + assert("reiser4-5", page->mapping->host != NULL);
524 +
525 + if (PageDirty(page))
526 + return 0;
527 +
528 + /* extra page reference is used by reiser4 to protect
529 + * jnode<->page link from this ->releasepage(). */
530 + if (page_count(page) > 3)
531 + return 0;
532 +
533 + /* releasable() needs jnode lock, because it looks at the jnode fields
534 + * and we need jload_lock here to avoid races with jload(). */
535 + spin_lock_jnode(node);
536 + spin_lock(&(node->load));
537 + if (jnode_is_releasable(node)) {
538 + struct address_space *mapping;
539 +
540 + mapping = page->mapping;
541 + jref(node);
542 + /* there is no need to synchronize against
543 + * jnode_extent_write() here, because pages seen by
544 + * jnode_extent_write() are !releasable(). */
545 + page_clear_jnode(page, node);
546 + spin_unlock(&(node->load));
547 + spin_unlock_jnode(node);
548 +
549 + /* we are under memory pressure so release jnode also. */
550 + jput(node);
551 +
552 + return 1;
553 + } else {
554 + spin_unlock(&(node->load));
555 + spin_unlock_jnode(node);
556 + assert("nikita-3020", reiser4_schedulable());
557 + return 0;
558 + }
559 +}
560 +
561 +int reiser4_readpage(struct file *file, struct page *page)
562 +{
563 + assert("edward-1533", PageLocked(page));
564 + assert("edward-1534", !PageUptodate(page));
565 + assert("edward-1535", page->mapping && page->mapping->host);
566 +
567 + return inode_file_plugin(page->mapping->host)->readpage(file, page);
568 +}
569 +
570 +int reiser4_readpages(struct file *file, struct address_space *mapping,
571 + struct list_head *pages, unsigned nr_pages)
572 +{
573 + return inode_file_plugin(mapping->host)->readpages(file, mapping,
574 + pages, nr_pages);
575 +}
576 +
577 +int reiser4_writepages(struct address_space *mapping,
578 + struct writeback_control *wbc)
579 +{
580 + return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
581 +}
582 +
583 +int reiser4_prepare_write(struct file *file, struct page *page,
584 + unsigned from, unsigned to)
585 +{
586 + return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file,
587 + page,
588 + from,
589 + to);
590 +}
591 +
592 +int reiser4_commit_write(struct file *file, struct page *page,
593 + unsigned from, unsigned to)
594 +{
595 + return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file,
596 + page,
597 + from,
598 + to);
599 +}
600 +
601 +/* Make Linus happy.
602 + Local variables:
603 + c-indentation-style: "K&R"
604 + mode-name: "LC"
605 + c-basic-offset: 8
606 + tab-width: 8
607 + fill-column: 120
608 + End:
609 +*/
610 diff -urN linux-2.6.23.orig/fs/reiser4/block_alloc.c linux-2.6.23/fs/reiser4/block_alloc.c
611 --- linux-2.6.23.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300
612 +++ linux-2.6.23/fs/reiser4/block_alloc.c 2007-12-04 16:49:30.000000000 +0300
613 @@ -0,0 +1,1137 @@
614 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
615 +
616 +#include "debug.h"
617 +#include "dformat.h"
618 +#include "plugin/plugin.h"
619 +#include "txnmgr.h"
620 +#include "znode.h"
621 +#include "block_alloc.h"
622 +#include "tree.h"
623 +#include "super.h"
624 +
625 +#include <linux/types.h> /* for __u?? */
626 +#include <linux/fs.h> /* for struct super_block */
627 +#include <linux/spinlock.h>
628 +
629 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
630 +
631 +/* We need to be able to reserve enough disk space to ensure that an atomic
632 + operation will have enough disk space to flush (see flush.c and
633 + http://namesys.com/v4/v4.html) and commit it once it is started.
634 +
635 + In our design a call for reserving disk space may fail but not an actual
636 + block allocation.
637 +
638 + All free blocks, already allocated blocks, and all kinds of reserved blocks
639 + are counted in different per-fs block counters.
640 +
641 + A reiser4 super block's set of block counters currently is:
642 +
643 + free -- free blocks,
644 + used -- already allocated blocks,
645 +
646 + grabbed -- initially reserved for performing an fs operation, those blocks
647 + are taken from free blocks, then grabbed disk space leaks from grabbed
648 + blocks counter to other counters like "fake allocated", "flush
649 + reserved", "used", the rest of not used grabbed space is returned to
650 + free space at the end of fs operation;
651 +
652 + fake allocated -- counts all nodes without real disk block numbers assigned,
653 + we have separate accounting for formatted and unformatted
654 + nodes (for easier debugging);
655 +
656 + flush reserved -- disk space needed for flushing and committing an atom.
657 + Each dirty already allocated block could be written as a
658 + part of atom's overwrite set or as a part of atom's
659 + relocate set. In both case one additional block is needed,
660 + it is used as a wandered block if we do overwrite or as a
661 + new location for a relocated block.
662 +
663 + In addition, blocks in some states are counted on per-thread and per-atom
664 + basis. A reiser4 context has a counter of blocks grabbed by this transaction
665 + and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
666 + of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
667 + blocks, which are reserved for flush processing and atom commit. */
668 +
669 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
670 + number of blocks to grab for most expensive case of balancing when the leaf
671 + node we insert new item to gets split and new leaf node is allocated.
672 +
673 + So, we need to grab blocks for
674 +
675 + 1) one block for possible dirtying the node we insert an item to. That block
676 + would be used for node relocation at flush time or for allocating of a
677 + wandered one, it depends what will be a result (what set, relocate or
678 + overwrite the node gets assigned to) of the node processing by the flush
679 + algorithm.
680 +
681 + 2) one block for either allocating a new node, or dirtying of right or left
682 + clean neighbor, only one case may happen.
683 +
684 + VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
685 + node, and creation of new node. have I forgotten something? email me.
686 +
687 + These grabbed blocks are counted in both reiser4 context "grabbed blocks"
688 + counter and in the fs-wide one (both ctx->grabbed_blocks and
689 + sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
690 + decremented by 2.
691 +
692 + Suppose both two blocks were spent for dirtying of an already allocated clean
693 + node (one block went from "grabbed" to "flush reserved") and for new block
694 + allocating (one block went from "grabbed" to "fake allocated formatted").
695 +
696 + Inserting of a child pointer to the parent node caused parent node to be
697 + split, the balancing code takes care about this grabbing necessary space
698 + immediately by calling reiser4_grab with BA_RESERVED flag set which means
699 + "can use the 5% reserved disk space".
700 +
701 + At this moment insertion completes and grabbed blocks (if they were not used)
702 + should be returned to the free space counter.
703 +
704 + However the atom life-cycle is not completed. The atom had one "flush
705 + reserved" block added by our insertion and the new fake allocated node is
706 + counted as a "fake allocated formatted" one. The atom has to be fully
707 + processed by flush before commit. Suppose that the flush moved the first,
708 + already allocated node to the atom's overwrite list, the new fake allocated
709 + node, obviously, went into the atom relocate set. The reiser4 flush
710 + allocates the new node using one unit from "fake allocated formatted"
711 + counter, the log writer uses one from "flush reserved" for wandered block
712 + allocation.
713 +
714 + And, it is not the end. When the wandered block is deallocated after the
715 + atom gets fully played (see wander.c for term description), the disk space
716 + occupied for it is returned to free blocks. */
717 +
718 +/* BLOCK NUMBERS */
719 +
720 +/* Any reiser4 node has a block number assigned to it. We use these numbers for
721 + indexing in hash tables, so if a block has not yet been assigned a location
722 + on disk we need to give it a temporary fake block number.
723 +
724 + Current implementation of reiser4 uses 64-bit integers for block numbers. We
725 + use highest bit in 64-bit block number to distinguish fake and real block
726 + numbers. So, only 63 bits may be used to addressing of real device
727 + blocks. That "fake" block numbers space is divided into subspaces of fake
728 + block numbers for data blocks and for shadow (working) bitmap blocks.
729 +
730 + Fake block numbers for data blocks are generated by a cyclic counter, which
731 + gets incremented after each real block allocation. We assume that it is
732 + impossible to overload this counter during one transaction life. */
733 +
734 +/* Initialize a blocknr hint. */
735 +void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
736 +{
737 + memset(hint, 0, sizeof(reiser4_blocknr_hint));
738 +}
739 +
740 +/* Release any resources of a blocknr hint. */
741 +void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
742 +{
743 + /* No resources should be freed in current blocknr_hint implementation. */
744 +}
745 +
746 +/* see above for explanation of fake block number. */
747 +/* Audited by: green(2002.06.11) */
748 +int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
749 +{
750 + /* The reason for not simply returning result of '&' operation is that
751 + while return value is (possibly 32bit) int, the reiser4_block_nr is
752 + at least 64 bits long, and high bit (which is the only possible
753 + non zero bit after the masking) would be stripped off */
754 + return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
755 +}
756 +
757 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
758 + arithmetic. Mostly, they are isolated to not to code same assertions in
759 + several places. */
760 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
761 +{
762 + BUG_ON(ctx->grabbed_blocks < count);
763 + assert("zam-527", ctx->grabbed_blocks >= count);
764 + ctx->grabbed_blocks -= count;
765 +}
766 +
767 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
768 +{
769 + ctx->grabbed_blocks += count;
770 +}
771 +
772 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
773 +{
774 + assert("zam-525", sbinfo->blocks_grabbed >= count);
775 + sbinfo->blocks_grabbed -= count;
776 +}
777 +
778 +/* Decrease the counter of block reserved for flush in super block. */
779 +static void
780 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
781 +{
782 + assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
783 + sbinfo->blocks_flush_reserved -= count;
784 +}
785 +
786 +static void
787 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
788 + reiser4_ba_flags_t flags)
789 +{
790 + if (flags & BA_FORMATTED) {
791 + assert("zam-806", sbinfo->blocks_fake_allocated >= count);
792 + sbinfo->blocks_fake_allocated -= count;
793 + } else {
794 + assert("zam-528",
795 + sbinfo->blocks_fake_allocated_unformatted >= count);
796 + sbinfo->blocks_fake_allocated_unformatted -= count;
797 + }
798 +}
799 +
800 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
801 +{
802 + assert("zam-530",
803 + sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
804 + sbinfo->blocks_used -= count;
805 +}
806 +
807 +static void
808 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
809 +{
810 + assert("edward-501", sbinfo->blocks_clustered >= count);
811 + sbinfo->blocks_clustered -= count;
812 +}
813 +
814 +/* Increase the counter of block reserved for flush in atom. */
815 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
816 +{
817 + assert("zam-772", atom != NULL);
818 + assert_spin_locked(&(atom->alock));
819 + atom->flush_reserved += count;
820 +}
821 +
822 +/* Decrease the counter of block reserved for flush in atom. */
823 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
824 +{
825 + assert("zam-774", atom != NULL);
826 + assert_spin_locked(&(atom->alock));
827 + assert("nikita-2790", atom->flush_reserved >= count);
828 + atom->flush_reserved -= count;
829 +}
830 +
831 +/* super block has 6 counters: free, used, grabbed, fake allocated
832 + (formatted and unformatted) and flush reserved. Their sum must be
833 + number of blocks on a device. This function checks this */
834 +int reiser4_check_block_counters(const struct super_block *super)
835 +{
836 + __u64 sum;
837 +
838 + sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
839 + reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
840 + reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
841 + reiser4_clustered_blocks(super);
842 + if (reiser4_block_count(super) != sum) {
843 + printk("super block counters: "
844 + "used %llu, free %llu, "
845 + "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
846 + "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
847 + (unsigned long long)reiser4_data_blocks(super),
848 + (unsigned long long)reiser4_free_blocks(super),
849 + (unsigned long long)reiser4_grabbed_blocks(super),
850 + (unsigned long long)reiser4_fake_allocated(super),
851 + (unsigned long long)
852 + reiser4_fake_allocated_unformatted(super),
853 + (unsigned long long)reiser4_flush_reserved(super),
854 + (unsigned long long)reiser4_clustered_blocks(super),
855 + (unsigned long long)sum,
856 + (unsigned long long)reiser4_block_count(super));
857 + return 0;
858 + }
859 + return 1;
860 +}
861 +
862 +/* Adjust "working" free blocks counter for number of blocks we are going to
863 + allocate. Record number of grabbed blocks in fs-wide and per-thread
864 + counters. This function should be called before bitmap scanning or
865 + allocating fake block numbers
866 +
867 + @super -- pointer to reiser4 super block;
868 + @count -- number of blocks we reserve;
869 +
870 + @return -- 0 if success, -ENOSPC, if all
871 + free blocks are preserved or already allocated.
872 +*/
873 +
874 +static int
875 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
876 +{
877 + __u64 free_blocks;
878 + int ret = 0, use_reserved = flags & BA_RESERVED;
879 + reiser4_super_info_data *sbinfo;
880 +
881 + assert("vs-1276", ctx == get_current_context());
882 +
883 + /* Do not grab anything on ro-mounted fs. */
884 + if (rofs_super(ctx->super)) {
885 + ctx->grab_enabled = 0;
886 + return 0;
887 + }
888 +
889 + sbinfo = get_super_private(ctx->super);
890 +
891 + spin_lock_reiser4_super(sbinfo);
892 +
893 + free_blocks = sbinfo->blocks_free;
894 +
895 + if ((use_reserved && free_blocks < count) ||
896 + (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
897 + ret = RETERR(-ENOSPC);
898 + goto unlock_and_ret;
899 + }
900 +
901 + add_to_ctx_grabbed(ctx, count);
902 +
903 + sbinfo->blocks_grabbed += count;
904 + sbinfo->blocks_free -= count;
905 +
906 +#if REISER4_DEBUG
907 + if (ctx->grabbed_initially == 0)
908 + ctx->grabbed_initially = count;
909 +#endif
910 +
911 + assert("nikita-2986", reiser4_check_block_counters(ctx->super));
912 +
913 + /* disable grab space in current context */
914 + ctx->grab_enabled = 0;
915 +
916 + unlock_and_ret:
917 + spin_unlock_reiser4_super(sbinfo);
918 +
919 + return ret;
920 +}
921 +
922 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
923 +{
924 + int ret;
925 + reiser4_context *ctx;
926 +
927 + assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
928 + lock_stack_isclean(get_current_lock_stack
929 + ())));
930 + ctx = get_current_context();
931 + if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
932 + return 0;
933 + }
934 +
935 + ret = reiser4_grab(ctx, count, flags);
936 + if (ret == -ENOSPC) {
937 +
938 + /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */
939 + if (flags & BA_CAN_COMMIT) {
940 + txnmgr_force_commit_all(ctx->super, 0);
941 + ctx->grab_enabled = 1;
942 + ret = reiser4_grab(ctx, count, flags);
943 + }
944 + }
945 + /*
946 + * allocation from reserved pool cannot fail. This is severe error.
947 + */
948 + assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
949 + return ret;
950 +}
951 +
952 +/*
953 + * SPACE RESERVED FOR UNLINK/TRUNCATE
954 + *
955 + * Unlink and truncate require space in transaction (to update stat data, at
956 + * least). But we don't want rm(1) to fail with "No space on device" error.
957 + *
958 + * Solution is to reserve 5% of disk space for truncates and
959 + * unlinks. Specifically, normal space grabbing requests don't grab space from
960 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
961 + * drain it. Per super block delete mutex is used to allow only one
962 + * thread at a time to grab from reserved area.
963 + *
964 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
965 + * flag.
966 + *
967 + */
968 +
969 +int reiser4_grab_reserved(struct super_block *super,
970 + __u64 count, reiser4_ba_flags_t flags)
971 +{
972 + reiser4_super_info_data *sbinfo = get_super_private(super);
973 +
974 + assert("nikita-3175", flags & BA_CAN_COMMIT);
975 +
976 + /* Check the delete mutex already taken by us, we assume that
977 + * reading of machine word is atomic. */
978 + if (sbinfo->delete_mutex_owner == current) {
979 + if (reiser4_grab_space
980 + (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
981 + warning("zam-1003",
982 + "nested call of grab_reserved fails count=(%llu)",
983 + (unsigned long long)count);
984 + reiser4_release_reserved(super);
985 + return RETERR(-ENOSPC);
986 + }
987 + return 0;
988 + }
989 +
990 + if (reiser4_grab_space(count, flags)) {
991 + mutex_lock(&sbinfo->delete_mutex);
992 + assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
993 + sbinfo->delete_mutex_owner = current;
994 +
995 + if (reiser4_grab_space(count, flags | BA_RESERVED)) {
996 + warning("zam-833",
997 + "reserved space is not enough (%llu)",
998 + (unsigned long long)count);
999 + reiser4_release_reserved(super);
1000 + return RETERR(-ENOSPC);
1001 + }
1002 + }
1003 + return 0;
1004 +}
1005 +
1006 +void reiser4_release_reserved(struct super_block *super)
1007 +{
1008 + reiser4_super_info_data *info;
1009 +
1010 + info = get_super_private(super);
1011 + if (info->delete_mutex_owner == current) {
1012 + info->delete_mutex_owner = NULL;
1013 + mutex_unlock(&info->delete_mutex);
1014 + }
1015 +}
1016 +
1017 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1018 +{
1019 + reiser4_context *ctx;
1020 + reiser4_super_info_data *sbinfo;
1021 +
1022 + ctx = get_current_context();
1023 + sub_from_ctx_grabbed(ctx, count);
1024 +
1025 + sbinfo = get_super_private(ctx->super);
1026 + spin_lock_reiser4_super(sbinfo);
1027 +
1028 + sub_from_sb_grabbed(sbinfo, count);
1029 + /* return sbinfo locked */
1030 + return sbinfo;
1031 +}
1032 +
1033 +/* is called after @count fake block numbers are allocated and pointer to
1034 + those blocks are inserted into tree. */
1035 +static void grabbed2fake_allocated_formatted(void)
1036 +{
1037 + reiser4_super_info_data *sbinfo;
1038 +
1039 + sbinfo = grabbed2fake_allocated_head(1);
1040 + sbinfo->blocks_fake_allocated++;
1041 +
1042 + assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1043 +
1044 + spin_unlock_reiser4_super(sbinfo);
1045 +}
1046 +
1047 +/**
1048 + * grabbed2fake_allocated_unformatted
1049 + * @count:
1050 + *
1051 + */
1052 +static void grabbed2fake_allocated_unformatted(int count)
1053 +{
1054 + reiser4_super_info_data *sbinfo;
1055 +
1056 + sbinfo = grabbed2fake_allocated_head(count);
1057 + sbinfo->blocks_fake_allocated_unformatted += count;
1058 +
1059 + assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1060 +
1061 + spin_unlock_reiser4_super(sbinfo);
1062 +}
1063 +
1064 +void grabbed2cluster_reserved(int count)
1065 +{
1066 + reiser4_context *ctx;
1067 + reiser4_super_info_data *sbinfo;
1068 +
1069 + ctx = get_current_context();
1070 + sub_from_ctx_grabbed(ctx, count);
1071 +
1072 + sbinfo = get_super_private(ctx->super);
1073 + spin_lock_reiser4_super(sbinfo);
1074 +
1075 + sub_from_sb_grabbed(sbinfo, count);
1076 + sbinfo->blocks_clustered += count;
1077 +
1078 + assert("edward-504", reiser4_check_block_counters(ctx->super));
1079 +
1080 + spin_unlock_reiser4_super(sbinfo);
1081 +}
1082 +
1083 +void cluster_reserved2grabbed(int count)
1084 +{
1085 + reiser4_context *ctx;
1086 + reiser4_super_info_data *sbinfo;
1087 +
1088 + ctx = get_current_context();
1089 +
1090 + sbinfo = get_super_private(ctx->super);
1091 + spin_lock_reiser4_super(sbinfo);
1092 +
1093 + sub_from_cluster_reserved(sbinfo, count);
1094 + sbinfo->blocks_grabbed += count;
1095 +
1096 + assert("edward-505", reiser4_check_block_counters(ctx->super));
1097 +
1098 + spin_unlock_reiser4_super(sbinfo);
1099 + add_to_ctx_grabbed(ctx, count);
1100 +}
1101 +
1102 +void cluster_reserved2free(int count)
1103 +{
1104 + reiser4_context *ctx;
1105 + reiser4_super_info_data *sbinfo;
1106 +
1107 + ctx = get_current_context();
1108 + sbinfo = get_super_private(ctx->super);
1109 +
1110 + cluster_reserved2grabbed(count);
1111 + grabbed2free(ctx, sbinfo, count);
1112 +}
1113 +
1114 +static DEFINE_SPINLOCK(fake_lock);
1115 +static reiser4_block_nr fake_gen = 0;
1116 +
1117 +/**
1118 + * assign_fake_blocknr
1119 + * @blocknr:
1120 + * @count:
1121 + *
1122 + * Obtain a fake block number for new node which will be used to refer to
1123 + * this newly allocated node until real allocation is done.
1124 + */
1125 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1126 +{
1127 + spin_lock(&fake_lock);
1128 + *blocknr = fake_gen;
1129 + fake_gen += count;
1130 + spin_unlock(&fake_lock);
1131 +
1132 + BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1133 + /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1134 + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1135 + assert("zam-394", zlook(current_tree, blocknr) == NULL);
1136 +}
1137 +
1138 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1139 +{
1140 + assign_fake_blocknr(blocknr, 1);
1141 + grabbed2fake_allocated_formatted();
1142 + return 0;
1143 +}
1144 +
1145 +/**
1146 + * fake_blocknrs_unformatted
1147 + * @count: number of fake numbers to get
1148 + *
1149 + * Allocates @count fake block numbers which will be assigned to jnodes
1150 + */
1151 +reiser4_block_nr fake_blocknr_unformatted(int count)
1152 +{
1153 + reiser4_block_nr blocknr;
1154 +
1155 + assign_fake_blocknr(&blocknr, count);
1156 + grabbed2fake_allocated_unformatted(count);
1157 +
1158 + return blocknr;
1159 +}
1160 +
1161 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1162 + follows grabbing of free disk space. */
1163 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1164 + __u64 count)
1165 +{
1166 + sub_from_ctx_grabbed(ctx, count);
1167 +
1168 + spin_lock_reiser4_super(sbinfo);
1169 +
1170 + sub_from_sb_grabbed(sbinfo, count);
1171 + sbinfo->blocks_used += count;
1172 +
1173 + assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1174 +
1175 + spin_unlock_reiser4_super(sbinfo);
1176 +}
1177 +
1178 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1179 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1180 + reiser4_ba_flags_t flags)
1181 +{
1182 + spin_lock_reiser4_super(sbinfo);
1183 +
1184 + sub_from_sb_fake_allocated(sbinfo, count, flags);
1185 + sbinfo->blocks_used += count;
1186 +
1187 + assert("nikita-2680",
1188 + reiser4_check_block_counters(reiser4_get_current_sb()));
1189 +
1190 + spin_unlock_reiser4_super(sbinfo);
1191 +}
1192 +
1193 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1194 +{
1195 + reiser4_super_info_data *sbinfo;
1196 +
1197 + assert("zam-787", atom != NULL);
1198 + assert_spin_locked(&(atom->alock));
1199 +
1200 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1201 +
1202 + sbinfo = get_current_super_private();
1203 + spin_lock_reiser4_super(sbinfo);
1204 +
1205 + sub_from_sb_flush_reserved(sbinfo, count);
1206 + sbinfo->blocks_used += count;
1207 +
1208 + assert("zam-789",
1209 + reiser4_check_block_counters(reiser4_get_current_sb()));
1210 +
1211 + spin_unlock_reiser4_super(sbinfo);
1212 +}
1213 +
1214 +/* update the per fs blocknr hint default value. */
1215 +void
1216 +update_blocknr_hint_default(const struct super_block *s,
1217 + const reiser4_block_nr * block)
1218 +{
1219 + reiser4_super_info_data *sbinfo = get_super_private(s);
1220 +
1221 + assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1222 +
1223 + spin_lock_reiser4_super(sbinfo);
1224 + if (*block < sbinfo->block_count) {
1225 + sbinfo->blocknr_hint_default = *block;
1226 + } else {
1227 + warning("zam-676",
1228 + "block number %llu is too large to be used in a blocknr hint\n",
1229 + (unsigned long long)*block);
1230 + dump_stack();
1231 + DEBUGON(1);
1232 + }
1233 + spin_unlock_reiser4_super(sbinfo);
1234 +}
1235 +
1236 +/* get current value of the default blocknr hint. */
1237 +void get_blocknr_hint_default(reiser4_block_nr * result)
1238 +{
1239 + reiser4_super_info_data *sbinfo = get_current_super_private();
1240 +
1241 + spin_lock_reiser4_super(sbinfo);
1242 + *result = sbinfo->blocknr_hint_default;
1243 + assert("zam-677", *result < sbinfo->block_count);
1244 + spin_unlock_reiser4_super(sbinfo);
1245 +}
1246 +
1247 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1248 + * method. Blocks are allocated in one contiguous disk region. The plugin
1249 + * independent part accounts blocks by subtracting allocated amount from grabbed
1250 + * or fake block counter and add the same amount to the counter of allocated
1251 + * blocks.
1252 + *
1253 + * @hint -- a reiser4 blocknr hint object which contains further block
1254 + * allocation hints and parameters (search start, a stage of block
1255 + * which will be mapped to disk, etc.),
1256 + * @blk -- an out parameter for the beginning of the allocated region,
1257 + * @len -- in/out parameter, it should contain the maximum number of allocated
1258 + * blocks, after block allocation completes, it contains the length of
1259 + * allocated disk region.
1260 + * @flags -- see reiser4_ba_flags_t description.
1261 + *
1262 + * @return -- 0 if success, error code otherwise.
1263 + */
1264 +int
1265 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1266 + reiser4_block_nr * len, reiser4_ba_flags_t flags)
1267 +{
1268 + __u64 needed = *len;
1269 + reiser4_context *ctx;
1270 + reiser4_super_info_data *sbinfo;
1271 + int ret;
1272 +
1273 + assert("zam-986", hint != NULL);
1274 +
1275 + ctx = get_current_context();
1276 + sbinfo = get_super_private(ctx->super);
1277 +
1278 + /* For write-optimized data we use default search start value, which is
1279 + * close to last write location. */
1280 + if (flags & BA_USE_DEFAULT_SEARCH_START) {
1281 + get_blocknr_hint_default(&hint->blk);
1282 + }
1283 +
1284 + /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1285 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1286 + if (hint->block_stage == BLOCK_NOT_COUNTED) {
1287 + ret = reiser4_grab_space_force(*len, flags);
1288 + if (ret != 0)
1289 + return ret;
1290 + }
1291 +
1292 + ret =
1293 + sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1294 + hint, (int)needed, blk, len);
1295 +
1296 + if (!ret) {
1297 + assert("zam-680", *blk < reiser4_block_count(ctx->super));
1298 + assert("zam-681",
1299 + *blk + *len <= reiser4_block_count(ctx->super));
1300 +
1301 + if (flags & BA_PERMANENT) {
1302 + /* we assume that current atom exists at this moment */
1303 + txn_atom *atom = get_current_atom_locked();
1304 + atom->nr_blocks_allocated += *len;
1305 + spin_unlock_atom(atom);
1306 + }
1307 +
1308 + switch (hint->block_stage) {
1309 + case BLOCK_NOT_COUNTED:
1310 + case BLOCK_GRABBED:
1311 + grabbed2used(ctx, sbinfo, *len);
1312 + break;
1313 + case BLOCK_UNALLOCATED:
1314 + fake_allocated2used(sbinfo, *len, flags);
1315 + break;
1316 + case BLOCK_FLUSH_RESERVED:
1317 + {
1318 + txn_atom *atom = get_current_atom_locked();
1319 + flush_reserved2used(atom, *len);
1320 + spin_unlock_atom(atom);
1321 + }
1322 + break;
1323 + default:
1324 + impossible("zam-531", "wrong block stage");
1325 + }
1326 + } else {
1327 + assert("zam-821",
1328 + ergo(hint->max_dist == 0
1329 + && !hint->backward, ret != -ENOSPC));
1330 + if (hint->block_stage == BLOCK_NOT_COUNTED)
1331 + grabbed2free(ctx, sbinfo, needed);
1332 + }
1333 +
1334 + return ret;
1335 +}
1336 +
1337 +/* used -> fake_allocated -> grabbed -> free */
1338 +
1339 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1340 + disk */
1341 +static void
1342 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1343 + int formatted)
1344 +{
1345 + spin_lock_reiser4_super(sbinfo);
1346 +
1347 + if (formatted)
1348 + sbinfo->blocks_fake_allocated += count;
1349 + else
1350 + sbinfo->blocks_fake_allocated_unformatted += count;
1351 +
1352 + sub_from_sb_used(sbinfo, count);
1353 +
1354 + assert("nikita-2681",
1355 + reiser4_check_block_counters(reiser4_get_current_sb()));
1356 +
1357 + spin_unlock_reiser4_super(sbinfo);
1358 +}
1359 +
1360 +static void
1361 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1362 + __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1363 +{
1364 + assert("nikita-2791", atom != NULL);
1365 + assert_spin_locked(&(atom->alock));
1366 +
1367 + add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1368 +
1369 + spin_lock_reiser4_super(sbinfo);
1370 +
1371 + sbinfo->blocks_flush_reserved += count;
1372 + /*add_to_sb_flush_reserved(sbinfo, count); */
1373 + sub_from_sb_used(sbinfo, count);
1374 +
1375 + assert("nikita-2681",
1376 + reiser4_check_block_counters(reiser4_get_current_sb()));
1377 +
1378 + spin_unlock_reiser4_super(sbinfo);
1379 +}
1380 +
1381 +/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1382 +static void
1383 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1384 + __u64 count, reiser4_ba_flags_t flags)
1385 +{
1386 + add_to_ctx_grabbed(ctx, count);
1387 +
1388 + spin_lock_reiser4_super(sbinfo);
1389 +
1390 + assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1391 +
1392 + sbinfo->blocks_grabbed += count;
1393 + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1394 +
1395 + assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1396 +
1397 + spin_unlock_reiser4_super(sbinfo);
1398 +}
1399 +
1400 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1401 +{
1402 + reiser4_context *ctx;
1403 + reiser4_super_info_data *sbinfo;
1404 +
1405 + ctx = get_current_context();
1406 + sbinfo = get_super_private(ctx->super);
1407 +
1408 + fake_allocated2grabbed(ctx, sbinfo, count, flags);
1409 + grabbed2free(ctx, sbinfo, count);
1410 +}
1411 +
1412 +void grabbed2free_mark(__u64 mark)
1413 +{
1414 + reiser4_context *ctx;
1415 + reiser4_super_info_data *sbinfo;
1416 +
1417 + ctx = get_current_context();
1418 + sbinfo = get_super_private(ctx->super);
1419 +
1420 + assert("nikita-3007", (__s64) mark >= 0);
1421 + assert("nikita-3006", ctx->grabbed_blocks >= mark);
1422 + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1423 +}
1424 +
1425 +/**
1426 + * grabbed2free - adjust grabbed and free block counters
1427 + * @ctx: context to update grabbed block counter of
1428 + * @sbinfo: super block to update grabbed and free block counters of
1429 + * @count: number of blocks to adjust counters by
1430 + *
1431 + * Decreases context's and per filesystem's counters of grabbed
1432 + * blocks. Increases per filesystem's counter of free blocks.
1433 + */
1434 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1435 + __u64 count)
1436 +{
1437 + sub_from_ctx_grabbed(ctx, count);
1438 +
1439 + spin_lock_reiser4_super(sbinfo);
1440 +
1441 + sub_from_sb_grabbed(sbinfo, count);
1442 + sbinfo->blocks_free += count;
1443 + assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1444 +
1445 + spin_unlock_reiser4_super(sbinfo);
1446 +}
1447 +
1448 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1449 +{
1450 + reiser4_context *ctx;
1451 + reiser4_super_info_data *sbinfo;
1452 +
1453 + assert("vs-1095", atom);
1454 +
1455 + ctx = get_current_context();
1456 + sbinfo = get_super_private(ctx->super);
1457 +
1458 + sub_from_ctx_grabbed(ctx, count);
1459 +
1460 + add_to_atom_flush_reserved_nolock(atom, count);
1461 +
1462 + spin_lock_reiser4_super(sbinfo);
1463 +
1464 + sbinfo->blocks_flush_reserved += count;
1465 + sub_from_sb_grabbed(sbinfo, count);
1466 +
1467 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1468 +
1469 + spin_unlock_reiser4_super(sbinfo);
1470 +}
1471 +
1472 +void grabbed2flush_reserved(__u64 count)
1473 +{
1474 + txn_atom *atom = get_current_atom_locked();
1475 +
1476 + grabbed2flush_reserved_nolock(atom, count);
1477 +
1478 + spin_unlock_atom(atom);
1479 +}
1480 +
1481 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1482 +{
1483 + reiser4_context *ctx;
1484 + reiser4_super_info_data *sbinfo;
1485 +
1486 + assert("nikita-2788", atom != NULL);
1487 + assert_spin_locked(&(atom->alock));
1488 +
1489 + ctx = get_current_context();
1490 + sbinfo = get_super_private(ctx->super);
1491 +
1492 + add_to_ctx_grabbed(ctx, count);
1493 +
1494 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1495 +
1496 + spin_lock_reiser4_super(sbinfo);
1497 +
1498 + sbinfo->blocks_grabbed += count;
1499 + sub_from_sb_flush_reserved(sbinfo, count);
1500 +
1501 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1502 +
1503 + spin_unlock_reiser4_super(sbinfo);
1504 +}
1505 +
1506 +/**
1507 + * all_grabbed2free - releases all blocks grabbed in context
1508 + *
1509 + * Decreases context's and super block's grabbed block counters by number of
1510 + * blocks grabbed by current context and increases super block's free block
1511 + * counter correspondingly.
1512 + */
1513 +void all_grabbed2free(void)
1514 +{
1515 + reiser4_context *ctx = get_current_context();
1516 +
1517 + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1518 +}
1519 +
1520 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1521 + after freeing, @count blocks become "grabbed". */
1522 +static void
1523 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1524 + __u64 count)
1525 +{
1526 + add_to_ctx_grabbed(ctx, count);
1527 +
1528 + spin_lock_reiser4_super(sbinfo);
1529 +
1530 + sbinfo->blocks_grabbed += count;
1531 + sub_from_sb_used(sbinfo, count);
1532 +
1533 + assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1534 +
1535 + spin_unlock_reiser4_super(sbinfo);
1536 +}
1537 +
1538 +/* this used to be done through used2grabbed and grabbed2free*/
1539 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1540 +{
1541 + spin_lock_reiser4_super(sbinfo);
1542 +
1543 + sbinfo->blocks_free += count;
1544 + sub_from_sb_used(sbinfo, count);
1545 +
1546 + assert("nikita-2685",
1547 + reiser4_check_block_counters(reiser4_get_current_sb()));
1548 +
1549 + spin_unlock_reiser4_super(sbinfo);
1550 +}
1551 +
1552 +#if REISER4_DEBUG
1553 +
1554 +/* check "allocated" state of given block range */
1555 +static void
1556 +reiser4_check_blocks(const reiser4_block_nr * start,
1557 + const reiser4_block_nr * len, int desired)
1558 +{
1559 + sa_check_blocks(start, len, desired);
1560 +}
1561 +
1562 +/* check "allocated" state of given block */
1563 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1564 +{
1565 + const reiser4_block_nr one = 1;
1566 +
1567 + reiser4_check_blocks(block, &one, desired);
1568 +}
1569 +
1570 +#endif
1571 +
1572 +/* Blocks deallocation function may do an actual deallocation through space
1573 + plugin allocation or store deleted block numbers in atom's delete_set data
1574 + structure depend on @defer parameter. */
1575 +
1576 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
1577 + will be deleted from WORKING bitmap. They might be just unmapped from disk, or
1578 + freed but disk space is still grabbed by current thread, or these blocks must
1579 + not be counted in any reiser4 sb block counters, see block_stage_t comment */
1580 +
1581 +/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to
1582 + distinguish blocks allocated for unformatted and formatted nodes */
1583 +
1584 +int
1585 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1586 + const reiser4_block_nr * len,
1587 + block_stage_t target_stage, reiser4_ba_flags_t flags)
1588 +{
1589 + txn_atom *atom = NULL;
1590 + int ret;
1591 + reiser4_context *ctx;
1592 + reiser4_super_info_data *sbinfo;
1593 +
1594 + ctx = get_current_context();
1595 + sbinfo = get_super_private(ctx->super);
1596 +
1597 + if (REISER4_DEBUG) {
1598 + assert("zam-431", *len != 0);
1599 + assert("zam-432", *start != 0);
1600 + assert("zam-558", !reiser4_blocknr_is_fake(start));
1601 +
1602 + spin_lock_reiser4_super(sbinfo);
1603 + assert("zam-562", *start < sbinfo->block_count);
1604 + spin_unlock_reiser4_super(sbinfo);
1605 + }
1606 +
1607 + if (flags & BA_DEFER) {
1608 + blocknr_set_entry *bsep = NULL;
1609 +
1610 + /* storing deleted block numbers in a blocknr set
1611 + datastructure for further actual deletion */
1612 + do {
1613 + atom = get_current_atom_locked();
1614 + assert("zam-430", atom != NULL);
1615 +
1616 + ret =
1617 + blocknr_set_add_extent(atom, &atom->delete_set,
1618 + &bsep, start, len);
1619 +
1620 + if (ret == -ENOMEM)
1621 + return ret;
1622 +
1623 + /* This loop might spin at most two times */
1624 + } while (ret == -E_REPEAT);
1625 +
1626 + assert("zam-477", ret == 0);
1627 + assert("zam-433", atom != NULL);
1628 +
1629 + spin_unlock_atom(atom);
1630 +
1631 + } else {
1632 + assert("zam-425", get_current_super_private() != NULL);
1633 + sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1634 + *start, *len);
1635 +
1636 + if (flags & BA_PERMANENT) {
1637 + /* These blocks were counted as allocated, we have to revert it
1638 + * back if allocation is discarded. */
1639 + txn_atom *atom = get_current_atom_locked();
1640 + atom->nr_blocks_allocated -= *len;
1641 + spin_unlock_atom(atom);
1642 + }
1643 +
1644 + switch (target_stage) {
1645 + case BLOCK_NOT_COUNTED:
1646 + assert("vs-960", flags & BA_FORMATTED);
1647 + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1648 + used2free(sbinfo, *len);
1649 + break;
1650 +
1651 + case BLOCK_GRABBED:
1652 + used2grabbed(ctx, sbinfo, *len);
1653 + break;
1654 +
1655 + case BLOCK_UNALLOCATED:
1656 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1657 + break;
1658 +
1659 + case BLOCK_FLUSH_RESERVED:{
1660 + txn_atom *atom;
1661 +
1662 + atom = get_current_atom_locked();
1663 + used2flush_reserved(sbinfo, atom, *len,
1664 + flags & BA_FORMATTED);
1665 + spin_unlock_atom(atom);
1666 + break;
1667 + }
1668 + default:
1669 + impossible("zam-532", "wrong block stage");
1670 + }
1671 + }
1672 +
1673 + return 0;
1674 +}
1675 +
1676 +/* wrappers for block allocator plugin methods */
1677 +int reiser4_pre_commit_hook(void)
1678 +{
1679 + assert("zam-502", get_current_super_private() != NULL);
1680 + sa_pre_commit_hook();
1681 + return 0;
1682 +}
1683 +
1684 +/* an actor which applies delete set to block allocator data */
1685 +static int
1686 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1687 + const reiser4_block_nr * b, void *data UNUSED_ARG)
1688 +{
1689 + reiser4_context *ctx;
1690 + reiser4_super_info_data *sbinfo;
1691 +
1692 + __u64 len = 1;
1693 +
1694 + ctx = get_current_context();
1695 + sbinfo = get_super_private(ctx->super);
1696 +
1697 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1698 + assert("zam-552", sbinfo != NULL);
1699 +
1700 + if (b != NULL)
1701 + len = *b;
1702 +
1703 + if (REISER4_DEBUG) {
1704 + spin_lock_reiser4_super(sbinfo);
1705 +
1706 + assert("zam-554", *a < reiser4_block_count(ctx->super));
1707 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1708 +
1709 + spin_unlock_reiser4_super(sbinfo);
1710 + }
1711 +
1712 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1713 + /* adjust sb block counters */
1714 + used2free(sbinfo, len);
1715 + return 0;
1716 +}
1717 +
1718 +void reiser4_post_commit_hook(void)
1719 +{
1720 + txn_atom *atom;
1721 +
1722 + atom = get_current_atom_locked();
1723 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1724 + spin_unlock_atom(atom);
1725 +
1726 + /* do the block deallocation which was deferred
1727 + until commit is done */
1728 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1729 +
1730 + assert("zam-504", get_current_super_private() != NULL);
1731 + sa_post_commit_hook();
1732 +}
1733 +
1734 +void reiser4_post_write_back_hook(void)
1735 +{
1736 + assert("zam-504", get_current_super_private() != NULL);
1737 +
1738 + sa_post_commit_hook();
1739 +}
1740 +
1741 +/*
1742 + Local variables:
1743 + c-indentation-style: "K&R"
1744 + mode-name: "LC"
1745 + c-basic-offset: 8
1746 + tab-width: 8
1747 + fill-column: 120
1748 + scroll-step: 1
1749 + End:
1750 +*/
1751 diff -urN linux-2.6.23.orig/fs/reiser4/block_alloc.h linux-2.6.23/fs/reiser4/block_alloc.h
1752 --- linux-2.6.23.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1753 +++ linux-2.6.23/fs/reiser4/block_alloc.h 2007-12-04 16:49:30.000000000 +0300
1754 @@ -0,0 +1,175 @@
1755 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1756 +
1757 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1758 +#define __FS_REISER4_BLOCK_ALLOC_H__
1759 +
1760 +#include "dformat.h"
1761 +#include "forward.h"
1762 +
1763 +#include <linux/types.h> /* for __u?? */
1764 +#include <linux/fs.h>
1765 +
1766 +/* Mask when is applied to given block number shows is that block number is a fake one */
1767 +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1768 +/* Mask which isolates a type of object this fake block number was assigned to */
1769 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1770 +
1771 +/*result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1772 + against these two values to understand is the object unallocated or bitmap
1773 + shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */
1774 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1775 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
1776 +
1777 +/* specification how block allocation was counted in sb block counters */
1778 +typedef enum {
1779 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1780 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1781 + of this block */
1782 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1783 + BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1784 + ( unallocated formatted or unformatted
1785 + node) */
1786 + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1787 + number assigned */
1788 +} block_stage_t;
1789 +
1790 +/* a hint for block allocator */
1791 +struct reiser4_blocknr_hint {
1792 + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1793 + is to prevent jnode_flush() calls from interleaving allocations on the same
1794 + bitmap, once a hint is established. */
1795 +
1796 + /* search start hint */
1797 + reiser4_block_nr blk;
1798 + /* if not zero, it is a region size we search for free blocks in */
1799 + reiser4_block_nr max_dist;
1800 + /* level for allocation, may be useful have branch-level and higher
1801 + write-optimized. */
1802 + tree_level level;
1803 + /* block allocator assumes that blocks, which will be mapped to disk,
1804 + are in this specified block_stage */
1805 + block_stage_t block_stage;
1806 + /* If direction = 1 allocate blocks in backward direction from the end
1807 + * of disk to the beginning of disk. */
1808 + unsigned int backward:1;
1809 +
1810 +};
1811 +
1812 +/* These flags control block allocation/deallocation behavior */
1813 +enum reiser4_ba_flags {
1814 + /* do allocatations from reserved (5%) area */
1815 + BA_RESERVED = (1 << 0),
1816 +
1817 + /* block allocator can do commit trying to recover free space */
1818 + BA_CAN_COMMIT = (1 << 1),
1819 +
1820 + /* if operation will be applied to formatted block */
1821 + BA_FORMATTED = (1 << 2),
1822 +
1823 + /* defer actual block freeing until transaction commit */
1824 + BA_DEFER = (1 << 3),
1825 +
1826 + /* allocate blocks for permanent fs objects (formatted or unformatted), not
1827 + wandered of log blocks */
1828 + BA_PERMANENT = (1 << 4),
1829 +
1830 + /* grab space even it was disabled */
1831 + BA_FORCE = (1 << 5),
1832 +
1833 + /* use default start value for free blocks search. */
1834 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1835 +};
1836 +
1837 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1838 +
1839 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1840 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1841 +extern void update_blocknr_hint_default(const struct super_block *,
1842 + const reiser4_block_nr *);
1843 +extern void get_blocknr_hint_default(reiser4_block_nr *);
1844 +
1845 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1846 +
1847 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
1848 +reiser4_block_nr fake_blocknr_unformatted(int);
1849 +
1850 +/* free -> grabbed -> fake_allocated -> used */
1851 +
1852 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1853 +void all_grabbed2free(void);
1854 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1855 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1856 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1857 +void grabbed2flush_reserved(__u64 count);
1858 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1859 + reiser4_block_nr * start,
1860 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
1861 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
1862 + const reiser4_block_nr *,
1863 + block_stage_t, reiser4_ba_flags_t flags);
1864 +
1865 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1866 + reiser4_block_nr * start,
1867 + reiser4_ba_flags_t flags)
1868 +{
1869 + reiser4_block_nr one = 1;
1870 + return reiser4_alloc_blocks(hint, start, &one, flags);
1871 +}
1872 +
1873 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1874 + block_stage_t stage,
1875 + reiser4_ba_flags_t flags)
1876 +{
1877 + const reiser4_block_nr one = 1;
1878 + return reiser4_dealloc_blocks(block, &one, stage, flags);
1879 +}
1880 +
1881 +#define reiser4_grab_space_force(count, flags) \
1882 + reiser4_grab_space(count, flags | BA_FORCE)
1883 +
1884 +extern void grabbed2free_mark(__u64 mark);
1885 +extern int reiser4_grab_reserved(struct super_block *,
1886 + __u64, reiser4_ba_flags_t);
1887 +extern void reiser4_release_reserved(struct super_block *super);
1888 +
1889 +/* grabbed -> fake_allocated */
1890 +
1891 +/* fake_allocated -> used */
1892 +
1893 +/* used -> fake_allocated -> grabbed -> free */
1894 +
1895 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1896 +
1897 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1898 +
1899 +extern void grabbed2cluster_reserved(int count);
1900 +extern void cluster_reserved2grabbed(int count);
1901 +extern void cluster_reserved2free(int count);
1902 +
1903 +extern int reiser4_check_block_counters(const struct super_block *);
1904 +
1905 +#if REISER4_DEBUG
1906 +
1907 +extern void reiser4_check_block(const reiser4_block_nr *, int);
1908 +
1909 +#else
1910 +
1911 +# define reiser4_check_block(beg, val) noop
1912 +
1913 +#endif
1914 +
1915 +extern int reiser4_pre_commit_hook(void);
1916 +extern void reiser4_post_commit_hook(void);
1917 +extern void reiser4_post_write_back_hook(void);
1918 +
1919 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1920 +
1921 +/* Make Linus happy.
1922 + Local variables:
1923 + c-indentation-style: "K&R"
1924 + mode-name: "LC"
1925 + c-basic-offset: 8
1926 + tab-width: 8
1927 + fill-column: 120
1928 + End:
1929 +*/
1930 diff -urN linux-2.6.23.orig/fs/reiser4/blocknrset.c linux-2.6.23/fs/reiser4/blocknrset.c
1931 --- linux-2.6.23.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1932 +++ linux-2.6.23/fs/reiser4/blocknrset.c 2007-12-04 16:49:30.000000000 +0300
1933 @@ -0,0 +1,368 @@
1934 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1935 +
1936 +/* This file contains code for various block number sets used by the atom to
1937 + track the deleted set and wandered block mappings. */
1938 +
1939 +#include "debug.h"
1940 +#include "dformat.h"
1941 +#include "txnmgr.h"
1942 +#include "context.h"
1943 +
1944 +#include <linux/slab.h>
1945 +
1946 +/* The proposed data structure for storing unordered block number sets is a
1947 + list of elements, each of which contains an array of block number or/and
1948 + array of block number pairs. That element called blocknr_set_entry is used
1949 + to store block numbers from the beginning and for extents from the end of
1950 + the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields
1951 + count numbers of blocks and extents.
1952 +
1953 + +------------------- blocknr_set_entry->data ------------------+
1954 + |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1955 + +------------------------------------------------------------+
1956 +
1957 + When current blocknr_set_entry is full, allocate a new one. */
1958 +
1959 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1960 + * set (single blocks and block extents), in that case blocknr pair represent an
1961 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1962 + * there represent a (real block) -> (wandered block) mapping. */
1963 +
1964 +/* Protection: blocknr sets belong to reiser4 atom, and
1965 + * their modifications are performed with the atom lock held */
1966 +
1967 +/* The total size of a blocknr_set_entry. */
1968 +#define BLOCKNR_SET_ENTRY_SIZE 128
1969 +
1970 +/* The number of blocks that can fit the blocknr data area. */
1971 +#define BLOCKNR_SET_ENTRIES_NUMBER \
1972 + ((BLOCKNR_SET_ENTRY_SIZE - \
1973 + 2 * sizeof (unsigned) - \
1974 + sizeof(struct list_head)) / \
1975 + sizeof(reiser4_block_nr))
1976 +
1977 +/* An entry of the blocknr_set */
1978 +struct blocknr_set_entry {
1979 + unsigned nr_singles;
1980 + unsigned nr_pairs;
1981 + struct list_head link;
1982 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1983 +};
1984 +
1985 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
1986 +struct blocknr_pair {
1987 + reiser4_block_nr a;
1988 + reiser4_block_nr b;
1989 +};
1990 +
1991 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
1992 +/* Audited by: green(2002.06.11) */
1993 +static unsigned bse_avail(blocknr_set_entry * bse)
1994 +{
1995 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1996 +
1997 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1998 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1999 +
2000 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
2001 +}
2002 +
2003 +/* Initialize a blocknr_set_entry. */
2004 +static void bse_init(blocknr_set_entry *bse)
2005 +{
2006 + bse->nr_singles = 0;
2007 + bse->nr_pairs = 0;
2008 + INIT_LIST_HEAD(&bse->link);
2009 +}
2010 +
2011 +/* Allocate and initialize a blocknr_set_entry. */
2012 +/* Audited by: green(2002.06.11) */
2013 +static blocknr_set_entry *bse_alloc(void)
2014 +{
2015 + blocknr_set_entry *e;
2016 +
2017 + if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2018 + reiser4_ctx_gfp_mask_get())) == NULL)
2019 + return NULL;
2020 +
2021 + bse_init(e);
2022 +
2023 + return e;
2024 +}
2025 +
2026 +/* Free a blocknr_set_entry. */
2027 +/* Audited by: green(2002.06.11) */
2028 +static void bse_free(blocknr_set_entry * bse)
2029 +{
2030 + kfree(bse);
2031 +}
2032 +
2033 +/* Add a block number to a blocknr_set_entry */
2034 +/* Audited by: green(2002.06.11) */
2035 +static void
2036 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2037 +{
2038 + assert("jmacd-5099", bse_avail(bse) >= 1);
2039 +
2040 + bse->entries[bse->nr_singles++] = *block;
2041 +}
2042 +
2043 +/* Get a pair of block numbers */
2044 +/* Audited by: green(2002.06.11) */
2045 +static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2046 + unsigned pno)
2047 +{
2048 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2049 +
2050 + return (struct blocknr_pair *) (bse->entries +
2051 + BLOCKNR_SET_ENTRIES_NUMBER -
2052 + 2 * (pno + 1));
2053 +}
2054 +
2055 +/* Add a pair of block numbers to a blocknr_set_entry */
2056 +/* Audited by: green(2002.06.11) */
2057 +static void
2058 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2059 + const reiser4_block_nr * b)
2060 +{
2061 + struct blocknr_pair *pair;
2062 +
2063 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2064 +
2065 + pair = bse_get_pair(bse, bse->nr_pairs++);
2066 +
2067 + pair->a = *a;
2068 + pair->b = *b;
2069 +}
2070 +
2071 +/* Add either a block or pair of blocks to the block number set. The first
2072 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2073 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2074 + the call is made with the atom lock held. There may not be enough space in
2075 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2076 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2077 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2078 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2079 + returned with the atom unlocked for the operation to be tried again. If
2080 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2081 + used during the call, it will be freed automatically. */
2082 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2083 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2084 + const reiser4_block_nr *b)
2085 +{
2086 + blocknr_set_entry *bse;
2087 + unsigned entries_needed;
2088 +
2089 + assert("jmacd-5101", a != NULL);
2090 +
2091 + entries_needed = (b == NULL) ? 1 : 2;
2092 + if (list_empty(bset) ||
2093 + bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2094 + /* See if a bse was previously allocated. */
2095 + if (*new_bsep == NULL) {
2096 + spin_unlock_atom(atom);
2097 + *new_bsep = bse_alloc();
2098 + return (*new_bsep != NULL) ? -E_REPEAT :
2099 + RETERR(-ENOMEM);
2100 + }
2101 +
2102 + /* Put it on the head of the list. */
2103 + list_add(&((*new_bsep)->link), bset);
2104 +
2105 + *new_bsep = NULL;
2106 + }
2107 +
2108 + /* Add the single or pair. */
2109 + bse = list_entry(bset->next, blocknr_set_entry, link);
2110 + if (b == NULL) {
2111 + bse_put_single(bse, a);
2112 + } else {
2113 + bse_put_pair(bse, a, b);
2114 + }
2115 +
2116 + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2117 + if (*new_bsep != NULL) {
2118 + bse_free(*new_bsep);
2119 + *new_bsep = NULL;
2120 + }
2121 +
2122 + return 0;
2123 +}
2124 +
2125 +/* Add an extent to the block set. If the length is 1, it is treated as a
2126 + single block (e.g., reiser4_set_add_block). */
2127 +/* Audited by: green(2002.06.11) */
2128 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2129 + kmalloc might schedule. The only exception is atom spinlock, which is
2130 + properly freed. */
2131 +int
2132 +blocknr_set_add_extent(txn_atom * atom,
2133 + struct list_head * bset,
2134 + blocknr_set_entry ** new_bsep,
2135 + const reiser4_block_nr * start,
2136 + const reiser4_block_nr * len)
2137 +{
2138 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2139 + return blocknr_set_add(atom, bset, new_bsep, start,
2140 + *len == 1 ? NULL : len);
2141 +}
2142 +
2143 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2144 + * by an assertion that both arguments are not null.*/
2145 +/* Audited by: green(2002.06.11) */
2146 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2147 + kmalloc might schedule. The only exception is atom spinlock, which is
2148 + properly freed. */
2149 +int
2150 +blocknr_set_add_pair(txn_atom * atom,
2151 + struct list_head * bset,
2152 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2153 + const reiser4_block_nr * b)
2154 +{
2155 + assert("jmacd-5103", a != NULL && b != NULL);
2156 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2157 +}
2158 +
2159 +/* Initialize a blocknr_set. */
2160 +void blocknr_set_init(struct list_head *bset)
2161 +{
2162 + INIT_LIST_HEAD(bset);
2163 +}
2164 +
2165 +/* Release the entries of a blocknr_set. */
2166 +void blocknr_set_destroy(struct list_head *bset)
2167 +{
2168 + blocknr_set_entry *bse;
2169 +
2170 + while (!list_empty(bset)) {
2171 + bse = list_entry(bset->next, blocknr_set_entry, link);
2172 + list_del_init(&bse->link);
2173 + bse_free(bse);
2174 + }
2175 +}
2176 +
2177 +/* Merge blocknr_set entries out of @from into @into. */
2178 +/* Audited by: green(2002.06.11) */
2179 +/* Auditor comments: This merge does not know if merged sets contain
2180 + blocks pairs (As for wandered sets) or extents, so it cannot really merge
2181 + overlapping ranges if there is some. So I believe it may lead to
2182 + some blocks being presented several times in one blocknr_set. To help
2183 + debugging such problems it might help to check for duplicate entries on
2184 + actual processing of this set. Testing this kind of stuff right here is
2185 + also complicated by the fact that these sets are not sorted and going
2186 + through whole set on each element addition is going to be CPU-heavy task */
2187 +void blocknr_set_merge(struct list_head * from, struct list_head * into)
2188 +{
2189 + blocknr_set_entry *bse_into = NULL;
2190 +
2191 + /* If @from is empty, no work to perform. */
2192 + if (list_empty(from))
2193 + return;
2194 + /* If @into is not empty, try merging partial-entries. */
2195 + if (!list_empty(into)) {
2196 +
2197 + /* Neither set is empty, pop the front to members and try to combine them. */
2198 + blocknr_set_entry *bse_from;
2199 + unsigned into_avail;
2200 +
2201 + bse_into = list_entry(into->next, blocknr_set_entry, link);
2202 + list_del_init(&bse_into->link);
2203 + bse_from = list_entry(from->next, blocknr_set_entry, link);
2204 + list_del_init(&bse_from->link);
2205 +
2206 + /* Combine singles. */
2207 + for (into_avail = bse_avail(bse_into);
2208 + into_avail != 0 && bse_from->nr_singles != 0;
2209 + into_avail -= 1) {
2210 + bse_put_single(bse_into,
2211 + &bse_from->entries[--bse_from->
2212 + nr_singles]);
2213 + }
2214 +
2215 + /* Combine pairs. */
2216 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2217 + into_avail -= 2) {
2218 + struct blocknr_pair *pair =
2219 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2220 + bse_put_pair(bse_into, &pair->a, &pair->b);
2221 + }
2222 +
2223 + /* If bse_from is empty, delete it now. */
2224 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2225 + bse_free(bse_from);
2226 + } else {
2227 + /* Otherwise, bse_into is full or nearly full (e.g.,
2228 + it could have one slot avail and bse_from has one
2229 + pair left). Push it back onto the list. bse_from
2230 + becomes bse_into, which will be the new partial. */
2231 + list_add(&bse_into->link, into);
2232 + bse_into = bse_from;
2233 + }
2234 + }
2235 +
2236 + /* Splice lists together. */
2237 + list_splice_init(from, into->prev);
2238 +
2239 + /* Add the partial entry back to the head of the list. */
2240 + if (bse_into != NULL)
2241 + list_add(&bse_into->link, into);
2242 +}
2243 +
2244 +/* Iterate over all blocknr set elements. */
2245 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2246 + blocknr_set_actor_f actor, void *data, int delete)
2247 +{
2248 +
2249 + blocknr_set_entry *entry;
2250 +
2251 + assert("zam-429", atom != NULL);
2252 + assert("zam-430", atom_is_protected(atom));
2253 + assert("zam-431", bset != 0);
2254 + assert("zam-432", actor != NULL);
2255 +
2256 + entry = list_entry(bset->next, blocknr_set_entry, link);
2257 + while (bset != &entry->link) {
2258 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2259 + unsigned int i;
2260 + int ret;
2261 +
2262 + for (i = 0; i < entry->nr_singles; i++) {
2263 + ret = actor(atom, &entry->entries[i], NULL, data);
2264 +
2265 + /* We can't break a loop if delete flag is set. */
2266 + if (ret != 0 && !delete)
2267 + return ret;
2268 + }
2269 +
2270 + for (i = 0; i < entry->nr_pairs; i++) {
2271 + struct blocknr_pair *ab;
2272 +
2273 + ab = bse_get_pair(entry, i);
2274 +
2275 + ret = actor(atom, &ab->a, &ab->b, data);
2276 +
2277 + if (ret != 0 && !delete)
2278 + return ret;
2279 + }
2280 +
2281 + if (delete) {
2282 + list_del(&entry->link);
2283 + bse_free(entry);
2284 + }
2285 +
2286 + entry = tmp;
2287 + }
2288 +
2289 + return 0;
2290 +}
2291 +
2292 +/*
2293 + * Local variables:
2294 + * c-indentation-style: "K&R"
2295 + * mode-name: "LC"
2296 + * c-basic-offset: 8
2297 + * tab-width: 8
2298 + * fill-column: 79
2299 + * scroll-step: 1
2300 + * End:
2301 + */
2302 diff -urN linux-2.6.23.orig/fs/reiser4/carry.c linux-2.6.23/fs/reiser4/carry.c
2303 --- linux-2.6.23.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2304 +++ linux-2.6.23/fs/reiser4/carry.c 2007-12-04 16:49:30.000000000 +0300
2305 @@ -0,0 +1,1391 @@
2306 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2307 +/* Functions to "carry" tree modification(s) upward. */
2308 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2309 + set of changes that need to be propagated to the next level. We manage
2310 + node locking such that any searches that collide with carrying are
2311 + restarted, from the root if necessary.
2312 +
2313 + Insertion of a new item may result in items being moved among nodes and
2314 + this requires the delimiting key to be updated at the least common parent
2315 + of the nodes modified to preserve search tree invariants. Also, insertion
2316 + may require allocation of a new node. A pointer to the new node has to be
2317 + inserted into some node on the parent level, etc.
2318 +
2319 + Tree carrying is meant to be analogous to arithmetic carrying.
2320 +
2321 + A carry operation is always associated with some node (&carry_node).
2322 +
2323 + Carry process starts with some initial set of operations to be performed
2324 + and an initial set of already locked nodes. Operations are performed one
2325 + by one. Performing each single operation has following possible effects:
2326 +
2327 + - content of carry node associated with operation is modified
2328 + - new carry nodes are locked and involved into carry process on this level
2329 + - new carry operations are posted to the next level
2330 +
2331 + After all carry operations on this level are done, process is repeated for
2332 + the accumulated sequence on carry operations for the next level. This
2333 + starts by trying to lock (in left to right order) all carry nodes
2334 + associated with carry operations on the parent level. After this, we decide
2335 + whether more nodes are required on the left of already locked set. If so,
2336 + all locks taken on the parent level are released, new carry nodes are
2337 + added, and locking process repeats.
2338 +
2339 + It may happen that balancing process fails owing to unrecoverable error on
2340 + some of upper levels of a tree (possible causes are io error, failure to
2341 + allocate new node, etc.). In this case we should unmount the filesystem,
2342 + rebooting if it is the root, and possibly advise the use of fsck.
2343 +
2344 + USAGE:
2345 +
2346 + int some_tree_operation( znode *node, ... )
2347 + {
2348 + // Allocate on a stack pool of carry objects: operations and nodes.
2349 + // Most carry processes will only take objects from here, without
2350 + // dynamic allocation.
2351 +
2352 +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2353 +
2354 + carry_pool pool;
2355 + carry_level lowest_level;
2356 + carry_op *op;
2357 +
2358 + init_carry_pool( &pool );
2359 + init_carry_level( &lowest_level, &pool );
2360 +
2361 + // operation may be one of:
2362 + // COP_INSERT --- insert new item into node
2363 + // COP_CUT --- remove part of or whole node
2364 + // COP_PASTE --- increase size of item
2365 + // COP_DELETE --- delete pointer from parent node
2366 + // COP_UPDATE --- update delimiting key in least
2367 + // common ancestor of two
2368 +
2369 + op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2370 + if( IS_ERR( op ) || ( op == NULL ) ) {
2371 + handle error
2372 + } else {
2373 + // fill in remaining fields in @op, according to carry.h:carry_op
2374 + result = carry( &lowest_level, NULL );
2375 + }
2376 + done_carry_pool( &pool );
2377 + }
2378 +
2379 + When you are implementing node plugin method that participates in carry
2380 + (shifting, insertion, deletion, etc.), do the following:
2381 +
2382 + int foo_node_method( znode *node, ..., carry_level *todo )
2383 + {
2384 + carry_op *op;
2385 +
2386 + ....
2387 +
2388 + // note, that last argument to reiser4_post_carry() is non-null
2389 + // here, because @op is to be applied to the parent of @node, rather
2390 + // than to the @node itself as in the previous case.
2391 +
2392 + op = node_post_carry( todo, operation, node, 1 );
2393 + // fill in remaining fields in @op, according to carry.h:carry_op
2394 +
2395 + ....
2396 +
2397 + }
2398 +
2399 + BATCHING:
2400 +
2401 + One of the main advantages of level-by-level balancing implemented here is
2402 + ability to batch updates on a parent level and to perform them more
2403 + efficiently as a result.
2404 +
2405 + Description To Be Done (TBD).
2406 +
2407 + DIFFICULTIES AND SUBTLE POINTS:
2408 +
2409 + 1. complex plumbing is required, because:
2410 +
2411 + a. effective allocation through pools is needed
2412 +
2413 + b. target of operation is not exactly known when operation is
2414 + posted. This is worked around through bitfields in &carry_node and
2415 + logic in lock_carry_node()
2416 +
2417 + c. of interaction with locking code: node should be added into sibling
2418 + list when pointer to it is inserted into its parent, which is some time
2419 + after node was created. Between these moments, node is somewhat in
2420 + suspended state and is only registered in the carry lists
2421 +
2422 + 2. whole balancing logic is implemented here, in particular, insertion
2423 + logic is coded in make_space().
2424 +
2425 + 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2426 + (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2427 + (insert_paste()) have to be handled.
2428 +
2429 + 4. there is non-trivial interdependency between allocation of new nodes
2430 + and almost everything else. This is mainly due to the (1.c) above. I shall
2431 + write about this later.
2432 +
2433 +*/
2434 +
2435 +#include "forward.h"
2436 +#include "debug.h"
2437 +#include "key.h"
2438 +#include "coord.h"
2439 +#include "plugin/item/item.h"
2440 +#include "plugin/item/extent.h"
2441 +#include "plugin/node/node.h"
2442 +#include "jnode.h"
2443 +#include "znode.h"
2444 +#include "tree_mod.h"
2445 +#include "tree_walk.h"
2446 +#include "block_alloc.h"
2447 +#include "pool.h"
2448 +#include "tree.h"
2449 +#include "carry.h"
2450 +#include "carry_ops.h"
2451 +#include "super.h"
2452 +#include "reiser4.h"
2453 +
2454 +#include <linux/types.h>
2455 +
2456 +/* level locking/unlocking */
2457 +static int lock_carry_level(carry_level * level);
2458 +static void unlock_carry_level(carry_level * level, int failure);
2459 +static void done_carry_level(carry_level * level);
2460 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2461 +
2462 +int lock_carry_node(carry_level * level, carry_node * node);
2463 +int lock_carry_node_tail(carry_node * node);
2464 +
2465 +/* carry processing proper */
2466 +static int carry_on_level(carry_level * doing, carry_level * todo);
2467 +
2468 +static carry_op *add_op(carry_level * level, pool_ordering order,
2469 + carry_op * reference);
2470 +
2471 +/* handlers for carry operations. */
2472 +
2473 +static void fatal_carry_error(carry_level * doing, int ecode);
2474 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2475 +
2476 +static void print_level(const char *prefix, carry_level * level);
2477 +
2478 +#if REISER4_DEBUG
2479 +typedef enum {
2480 + CARRY_TODO,
2481 + CARRY_DOING
2482 +} carry_queue_state;
2483 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2484 +#endif
2485 +
2486 +/* main entry point for tree balancing.
2487 +
2488 + Tree carry performs operations from @doing and while doing so accumulates
2489 + information about operations to be performed on the next level ("carried"
2490 + to the parent level). Carried operations are performed, causing possibly
2491 + more operations to be carried upward etc. carry() takes care about
2492 + locking and pinning znodes while operating on them.
2493 +
2494 + For usage, see comment at the top of fs/reiser4/carry.c
2495 +
2496 +*/
2497 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2498 + * performed */ ,
2499 + carry_level * done /* set of nodes, already performed
2500 + * at the previous level.
2501 + * NULL in most cases */)
2502 +{
2503 + int result = 0;
2504 + /* queue of new requests */
2505 + carry_level *todo;
2506 + ON_DEBUG(STORE_COUNTERS);
2507 +
2508 + assert("nikita-888", doing != NULL);
2509 + BUG_ON(done != NULL);
2510 +
2511 + todo = doing + 1;
2512 + init_carry_level(todo, doing->pool);
2513 +
2514 + /* queue of requests performed on the previous level */
2515 + done = todo + 1;
2516 + init_carry_level(done, doing->pool);
2517 +
2518 + /* iterate until there is nothing more to do */
2519 + while (result == 0 && doing->ops_num > 0) {
2520 + carry_level *tmp;
2521 +
2522 + /* at this point @done is locked. */
2523 + /* repeat lock/do/unlock while
2524 +
2525 + (1) lock_carry_level() fails due to deadlock avoidance, or
2526 +
2527 + (2) carry_on_level() decides that more nodes have to
2528 + be involved.
2529 +
2530 + (3) some unexpected error occurred while balancing on the
2531 + upper levels. In this case all changes are rolled back.
2532 +
2533 + */
2534 + while (1) {
2535 + result = lock_carry_level(doing);
2536 + if (result == 0) {
2537 + /* perform operations from @doing and
2538 + accumulate new requests in @todo */
2539 + result = carry_on_level(doing, todo);
2540 + if (result == 0)
2541 + break;
2542 + else if (result != -E_REPEAT ||
2543 + !doing->restartable) {
2544 + warning("nikita-1043",
2545 + "Fatal error during carry: %i",
2546 + result);
2547 + print_level("done", done);
2548 + print_level("doing", doing);
2549 + print_level("todo", todo);
2550 + /* do some rough stuff like aborting
2551 + all pending transcrashes and thus
2552 + pushing tree back to the consistent
2553 + state. Alternatively, just panic.
2554 + */
2555 + fatal_carry_error(doing, result);
2556 + return result;
2557 + }
2558 + } else if (result != -E_REPEAT) {
2559 + fatal_carry_error(doing, result);
2560 + return result;
2561 + }
2562 + unlock_carry_level(doing, 1);
2563 + }
2564 + /* at this point @done can be safely unlocked */
2565 + done_carry_level(done);
2566 +
2567 + /* cyclically shift queues */
2568 + tmp = done;
2569 + done = doing;
2570 + doing = todo;
2571 + todo = tmp;
2572 + init_carry_level(todo, doing->pool);
2573 +
2574 + /* give other threads chance to run */
2575 + reiser4_preempt_point();
2576 + }
2577 + done_carry_level(done);
2578 +
2579 + /* all counters, but x_refs should remain the same. x_refs can change
2580 + owing to transaction manager */
2581 + ON_DEBUG(CHECK_COUNTERS);
2582 + return result;
2583 +}
2584 +
2585 +/* perform carry operations on given level.
2586 +
2587 + Optimizations proposed by pooh:
2588 +
2589 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2590 + required;
2591 +
2592 + (2) unlock node if there are no more operations to be performed upon it and
2593 + node didn't add any operation to @todo. This can be implemented by
2594 + attaching to each node two counters: counter of operations working on this
2595 + node and counter of operations carried upward from this node.
2596 +
2597 +*/
2598 +static int carry_on_level(carry_level * doing /* queue of carry operations to
2599 + * do on this level */ ,
2600 + carry_level * todo /* queue where new carry
2601 + * operations to be performed on
2602 + * the * parent level are
2603 + * accumulated during @doing
2604 + * processing. */ )
2605 +{
2606 + int result;
2607 + int (*f) (carry_op *, carry_level *, carry_level *);
2608 + carry_op *op;
2609 + carry_op *tmp_op;
2610 +
2611 + assert("nikita-1034", doing != NULL);
2612 + assert("nikita-1035", todo != NULL);
2613 +
2614 + /* @doing->nodes are locked. */
2615 +
2616 + /* This function can be split into two phases: analysis and modification.
2617 +
2618 + Analysis calculates precisely what items should be moved between
2619 + nodes. This information is gathered in some structures attached to
2620 + each carry_node in a @doing queue. Analysis also determines whether
2621 + new nodes are to be allocated etc.
2622 +
2623 + After analysis is completed, actual modification is performed. Here
2624 + we can take advantage of "batch modification": if there are several
2625 + operations acting on the same node, modifications can be performed
2626 + more efficiently when batched together.
2627 +
2628 + Above is an optimization left for the future.
2629 + */
2630 + /* Important, but delayed optimization: it's possible to batch
2631 + operations together and perform them more efficiently as a
2632 + result. For example, deletion of several neighboring items from a
2633 + node can be converted to a single ->cut() operation.
2634 +
2635 + Before processing queue, it should be scanned and "mergeable"
2636 + operations merged.
2637 + */
2638 + result = 0;
2639 + for_all_ops(doing, op, tmp_op) {
2640 + carry_opcode opcode;
2641 +
2642 + assert("nikita-1041", op != NULL);
2643 + opcode = op->op;
2644 + assert("nikita-1042", op->op < COP_LAST_OP);
2645 + f = op_dispatch_table[op->op].handler;
2646 + result = f(op, doing, todo);
2647 + /* locking can fail with -E_REPEAT. Any different error is fatal
2648 + and will be handled by fatal_carry_error() sledgehammer.
2649 + */
2650 + if (result != 0)
2651 + break;
2652 + }
2653 + if (result == 0) {
2654 + carry_plugin_info info;
2655 + carry_node *scan;
2656 + carry_node *tmp_scan;
2657 +
2658 + info.doing = doing;
2659 + info.todo = todo;
2660 +
2661 + assert("nikita-3002",
2662 + carry_level_invariant(doing, CARRY_DOING));
2663 + for_all_nodes(doing, scan, tmp_scan) {
2664 + znode *node;
2665 +
2666 + node = reiser4_carry_real(scan);
2667 + assert("nikita-2547", node != NULL);
2668 + if (node_is_empty(node)) {
2669 + result =
2670 + node_plugin_by_node(node)->
2671 + prepare_removal(node, &info);
2672 + if (result != 0)
2673 + break;
2674 + }
2675 + }
2676 + }
2677 + return result;
2678 +}
2679 +
2680 +/* post carry operation
2681 +
2682 + This is main function used by external carry clients: node layout plugins
2683 + and tree operations to create new carry operation to be performed on some
2684 + level.
2685 +
2686 + New operation will be included in the @level queue. To actually perform it,
2687 + call carry( level, ... ). This function takes write lock on @node. Carry
2688 + manages all its locks by itself, don't worry about this.
2689 +
2690 + This function adds operation and node at the end of the queue. It is up to
2691 + caller to guarantee proper ordering of node queue.
2692 +
2693 +*/
2694 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2695 + * is to be posted at */ ,
2696 + carry_opcode op /* opcode of operation */ ,
2697 + znode * node /* node on which this operation
2698 + * will operate */ ,
2699 + int apply_to_parent_p /* whether operation will
2700 + * operate directly on @node
2701 + * or on its parent. */)
2702 +{
2703 + carry_op *result;
2704 + carry_node *child;
2705 +
2706 + assert("nikita-1046", level != NULL);
2707 + assert("nikita-1788", znode_is_write_locked(node));
2708 +
2709 + result = add_op(level, POOLO_LAST, NULL);
2710 + if (IS_ERR(result))
2711 + return result;
2712 + child = reiser4_add_carry(level, POOLO_LAST, NULL);
2713 + if (IS_ERR(child)) {
2714 + reiser4_pool_free(&level->pool->op_pool, &result->header);
2715 + return (carry_op *) child;
2716 + }
2717 + result->node = child;
2718 + result->op = op;
2719 + child->parent = apply_to_parent_p;
2720 + if (ZF_ISSET(node, JNODE_ORPHAN))
2721 + child->left_before = 1;
2722 + child->node = node;
2723 + return result;
2724 +}
2725 +
2726 +/* initialize carry queue */
2727 +void init_carry_level(carry_level * level /* level to initialize */ ,
2728 + carry_pool * pool /* pool @level will allocate objects
2729 + * from */ )
2730 +{
2731 + assert("nikita-1045", level != NULL);
2732 + assert("nikita-967", pool != NULL);
2733 +
2734 + memset(level, 0, sizeof *level);
2735 + level->pool = pool;
2736 +
2737 + INIT_LIST_HEAD(&level->nodes);
2738 + INIT_LIST_HEAD(&level->ops);
2739 +}
2740 +
2741 +/* allocate carry pool and initialize pools within queue */
2742 +carry_pool *init_carry_pool(int size)
2743 +{
2744 + carry_pool *pool;
2745 +
2746 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2747 + pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2748 + if (pool == NULL)
2749 + return ERR_PTR(RETERR(-ENOMEM));
2750 +
2751 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2752 + (char *)pool->op);
2753 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2754 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2755 + return pool;
2756 +}
2757 +
2758 +/* finish with queue pools */
2759 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2760 +{
2761 + reiser4_done_pool(&pool->op_pool);
2762 + reiser4_done_pool(&pool->node_pool);
2763 + kfree(pool);
2764 +}
2765 +
2766 +/* add new carry node to the @level.
2767 +
2768 + Returns pointer to the new carry node allocated from pool. It's up to
2769 + callers to maintain proper order in the @level. Assumption is that if carry
2770 + nodes on one level are already sorted and modifications are performed from
2771 + left to right, carry nodes added on the parent level will be ordered
2772 + automatically. To control ordering use @order and @reference parameters.
2773 +
2774 +*/
2775 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2776 + * node to */ ,
2777 + pool_ordering order /* where to insert:
2778 + * at the beginning of
2779 + * @level,
2780 + * before @reference,
2781 + * after @reference,
2782 + * at the end of @level
2783 + */ ,
2784 + carry_node * reference/* reference node for
2785 + * insertion */)
2786 +{
2787 + ON_DEBUG(carry_node * orig_ref = reference);
2788 +
2789 + if (order == POOLO_BEFORE) {
2790 + reference = find_left_carry(reference, level);
2791 + if (reference == NULL)
2792 + reference = list_entry(level->nodes.next, carry_node,
2793 + header.level_linkage);
2794 + else
2795 + reference = list_entry(reference->header.level_linkage.next,
2796 + carry_node, header.level_linkage);
2797 + } else if (order == POOLO_AFTER) {
2798 + reference = find_right_carry(reference, level);
2799 + if (reference == NULL)
2800 + reference = list_entry(level->nodes.prev, carry_node,
2801 + header.level_linkage);
2802 + else
2803 + reference = list_entry(reference->header.level_linkage.prev,
2804 + carry_node, header.level_linkage);
2805 + }
2806 + assert("nikita-2209",
2807 + ergo(orig_ref != NULL,
2808 + reiser4_carry_real(reference) ==
2809 + reiser4_carry_real(orig_ref)));
2810 + return reiser4_add_carry(level, order, reference);
2811 +}
2812 +
2813 +carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2814 + * to */ ,
2815 + pool_ordering order /* where to insert: at the
2816 + * beginning of @level, before
2817 + * @reference, after @reference,
2818 + * at the end of @level */ ,
2819 + carry_node * reference /* reference node for
2820 + * insertion */ )
2821 +{
2822 + carry_node *result;
2823 +
2824 + result =
2825 + (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2826 + &level->nodes,
2827 + order, &reference->header);
2828 + if (!IS_ERR(result) && (result != NULL))
2829 + ++level->nodes_num;
2830 + return result;
2831 +}
2832 +
2833 +/* add new carry operation to the @level.
2834 +
2835 + Returns pointer to the new carry operations allocated from pool. It's up to
2836 + callers to maintain proper order in the @level. To control ordering use
2837 + @order and @reference parameters.
2838 +
2839 +*/
2840 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2841 + pool_ordering order /* where to insert: at the beginning of
2842 + * @level, before @reference, after
2843 + * @reference, at the end of @level */ ,
2844 + carry_op *
2845 + reference /* reference node for insertion */ )
2846 +{
2847 + carry_op *result;
2848 +
2849 + result =
2850 + (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2851 + order, &reference->header);
2852 + if (!IS_ERR(result) && (result != NULL))
2853 + ++level->ops_num;
2854 + return result;
2855 +}
2856 +
2857 +/* Return node on the right of which @node was created.
2858 +
2859 + Each node is created on the right of some existing node (or it is new root,
2860 + which is special case not handled here).
2861 +
2862 + @node is new node created on some level, but not yet inserted into its
2863 + parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2864 +
2865 +*/
2866 +static carry_node *find_begetting_brother(carry_node * node /* node to start search
2867 + * from */ ,
2868 + carry_level * kin UNUSED_ARG /* level to
2869 + * scan */ )
2870 +{
2871 + carry_node *scan;
2872 +
2873 + assert("nikita-1614", node != NULL);
2874 + assert("nikita-1615", kin != NULL);
2875 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2876 + assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2877 + ZF_ISSET(reiser4_carry_real(node),
2878 + JNODE_ORPHAN)));
2879 + for (scan = node;;
2880 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
2881 + header.level_linkage)) {
2882 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2883 + if ((scan->node != node->node) &&
2884 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2885 + assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2886 + break;
2887 + }
2888 + }
2889 + return scan;
2890 +}
2891 +
2892 +static cmp_t
2893 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2894 +{
2895 + assert("nikita-2199", n1 != NULL);
2896 + assert("nikita-2200", n2 != NULL);
2897 +
2898 + if (n1 == n2)
2899 + return EQUAL_TO;
2900 + while (1) {
2901 + n1 = carry_node_next(n1);
2902 + if (carry_node_end(level, n1))
2903 + return GREATER_THAN;
2904 + if (n1 == n2)
2905 + return LESS_THAN;
2906 + }
2907 + impossible("nikita-2201", "End of level reached");
2908 +}
2909 +
2910 +carry_node *find_carry_node(carry_level * level, const znode * node)
2911 +{
2912 + carry_node *scan;
2913 + carry_node *tmp_scan;
2914 +
2915 + assert("nikita-2202", level != NULL);
2916 + assert("nikita-2203", node != NULL);
2917 +
2918 + for_all_nodes(level, scan, tmp_scan) {
2919 + if (reiser4_carry_real(scan) == node)
2920 + return scan;
2921 + }
2922 + return NULL;
2923 +}
2924 +
2925 +znode *reiser4_carry_real(const carry_node * node)
2926 +{
2927 + assert("nikita-3061", node != NULL);
2928 +
2929 + return node->lock_handle.node;
2930 +}
2931 +
2932 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2933 + const znode * node)
2934 +{
2935 + carry_node *base;
2936 + carry_node *scan;
2937 + carry_node *tmp_scan;
2938 + carry_node *proj;
2939 +
2940 + base = find_carry_node(doing, node);
2941 + assert("nikita-2204", base != NULL);
2942 +
2943 + for_all_nodes(todo, scan, tmp_scan) {
2944 + proj = find_carry_node(doing, scan->node);
2945 + assert("nikita-2205", proj != NULL);
2946 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2947 + break;
2948 + }
2949 + return scan;
2950 +}
2951 +
2952 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2953 + znode * node)
2954 +{
2955 + carry_node *reference;
2956 +
2957 + assert("nikita-2994", doing != NULL);
2958 + assert("nikita-2995", todo != NULL);
2959 + assert("nikita-2996", node != NULL);
2960 +
2961 + reference = insert_carry_node(doing, todo, node);
2962 + assert("nikita-2997", reference != NULL);
2963 +
2964 + return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2965 +}
2966 +
2967 +/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2968 + This function is different from reiser4_post_carry() in that it finds proper
2969 + place to insert node in the queue. */
2970 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2971 + * passed down to node
2972 + * plugin */ ,
2973 + carry_opcode op /* opcode of operation */ ,
2974 + znode * node /* node on which this
2975 + * operation will operate */ ,
2976 + int apply_to_parent_p /* whether operation will
2977 + * operate directly on @node
2978 + * or on it parent. */ )
2979 +{
2980 + carry_op *result;
2981 + carry_node *child;
2982 +
2983 + assert("nikita-2207", info != NULL);
2984 + assert("nikita-2208", info->todo != NULL);
2985 +
2986 + if (info->doing == NULL)
2987 + return reiser4_post_carry(info->todo, op, node,
2988 + apply_to_parent_p);
2989 +
2990 + result = add_op(info->todo, POOLO_LAST, NULL);
2991 + if (IS_ERR(result))
2992 + return result;
2993 + child = add_carry_atplace(info->doing, info->todo, node);
2994 + if (IS_ERR(child)) {
2995 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2996 + return (carry_op *) child;
2997 + }
2998 + result->node = child;
2999 + result->op = op;
3000 + child->parent = apply_to_parent_p;
3001 + if (ZF_ISSET(node, JNODE_ORPHAN))
3002 + child->left_before = 1;
3003 + child->node = node;
3004 + return result;
3005 +}
3006 +
3007 +/* lock all carry nodes in @level */
3008 +static int lock_carry_level(carry_level * level /* level to lock */ )
3009 +{
3010 + int result;
3011 + carry_node *node;
3012 + carry_node *tmp_node;
3013 +
3014 + assert("nikita-881", level != NULL);
3015 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3016 +
3017 + /* lock nodes from left to right */
3018 + result = 0;
3019 + for_all_nodes(level, node, tmp_node) {
3020 + result = lock_carry_node(level, node);
3021 + if (result != 0)
3022 + break;
3023 + }
3024 + return result;
3025 +}
3026 +
3027 +/* Synchronize delimiting keys between @node and its left neighbor.
3028 +
3029 + To reduce contention on dk key and simplify carry code, we synchronize
3030 + delimiting keys only when carry ultimately leaves tree level (carrying
3031 + changes upward) and unlocks nodes at this level.
3032 +
3033 + This function first finds left neighbor of @node and then updates left
3034 + neighbor's right delimiting key to coincide with least key in @node.
3035 +
3036 +*/
3037 +
3038 +ON_DEBUG(extern atomic_t delim_key_version;
3039 + )
3040 +
3041 +static void sync_dkeys(znode * spot /* node to update */ )
3042 +{
3043 + reiser4_key pivot;
3044 + reiser4_tree *tree;
3045 +
3046 + assert("nikita-1610", spot != NULL);
3047 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3048 +
3049 + tree = znode_get_tree(spot);
3050 + read_lock_tree(tree);
3051 + write_lock_dk(tree);
3052 +
3053 + assert("nikita-2192", znode_is_loaded(spot));
3054 +
3055 + /* sync left delimiting key of @spot with key in its leftmost item */
3056 + if (node_is_empty(spot))
3057 + pivot = *znode_get_rd_key(spot);
3058 + else
3059 + leftmost_key_in_node(spot, &pivot);
3060 +
3061 + znode_set_ld_key(spot, &pivot);
3062 +
3063 + /* there can be sequence of empty nodes pending removal on the left of
3064 + @spot. Scan them and update their left and right delimiting keys to
3065 + match left delimiting key of @spot. Also, update right delimiting
3066 + key of first non-empty left neighbor.
3067 + */
3068 + while (1) {
3069 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3070 + break;
3071 +
3072 + spot = spot->left;
3073 + if (spot == NULL)
3074 + break;
3075 +
3076 + znode_set_rd_key(spot, &pivot);
3077 + /* don't sink into the domain of another balancing */
3078 + if (!znode_is_write_locked(spot))
3079 + break;
3080 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3081 + znode_set_ld_key(spot, &pivot);
3082 + else
3083 + break;
3084 + }
3085 +
3086 + write_unlock_dk(tree);
3087 + read_unlock_tree(tree);
3088 +}
3089 +
3090 +/* unlock all carry nodes in @level */
3091 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3092 + int failure /* true if unlocking owing to
3093 + * failure */ )
3094 +{
3095 + carry_node *node;
3096 + carry_node *tmp_node;
3097 +
3098 + assert("nikita-889", level != NULL);
3099 +
3100 + if (!failure) {
3101 + znode *spot;
3102 +
3103 + spot = NULL;
3104 + /* update delimiting keys */
3105 + for_all_nodes(level, node, tmp_node) {
3106 + if (reiser4_carry_real(node) != spot) {
3107 + spot = reiser4_carry_real(node);
3108 + sync_dkeys(spot);
3109 + }
3110 + }
3111 + }
3112 +
3113 + /* nodes can be unlocked in arbitrary order. In preemptible
3114 + environment it's better to unlock in reverse order of locking,
3115 + though.
3116 + */
3117 + for_all_nodes_back(level, node, tmp_node) {
3118 + /* all allocated nodes should be already linked to their
3119 + parents at this moment. */
3120 + assert("nikita-1631",
3121 + ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3122 + JNODE_ORPHAN)));
3123 + ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3124 + unlock_carry_node(level, node, failure);
3125 + }
3126 + level->new_root = NULL;
3127 +}
3128 +
3129 +/* finish with @level
3130 +
3131 + Unlock nodes and release all allocated resources */
3132 +static void done_carry_level(carry_level * level /* level to finish */ )
3133 +{
3134 + carry_node *node;
3135 + carry_node *tmp_node;
3136 + carry_op *op;
3137 + carry_op *tmp_op;
3138 +
3139 + assert("nikita-1076", level != NULL);
3140 +
3141 + unlock_carry_level(level, 0);
3142 + for_all_nodes(level, node, tmp_node) {
3143 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3144 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3145 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3146 + }
3147 + for_all_ops(level, op, tmp_op)
3148 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3149 +}
3150 +
3151 +/* helper function to complete locking of carry node
3152 +
3153 + Finish locking of carry node. There are several ways in which new carry
3154 + node can be added into carry level and locked. Normal is through
3155 + lock_carry_node(), but also from find_{left|right}_neighbor(). This
3156 + function factors out common final part of all locking scenarios. It
3157 + supposes that @node -> lock_handle is lock handle for lock just taken and
3158 + fills ->real_node from this lock handle.
3159 +
3160 +*/
3161 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3162 +{
3163 + assert("nikita-1052", node != NULL);
3164 + assert("nikita-1187", reiser4_carry_real(node) != NULL);
3165 + assert("nikita-1188", !node->unlock);
3166 +
3167 + node->unlock = 1;
3168 + /* Load node content into memory and install node plugin by
3169 + looking at the node header.
3170 +
3171 + Most of the time this call is cheap because the node is
3172 + already in memory.
3173 +
3174 + Corresponding zrelse() is in unlock_carry_node()
3175 + */
3176 + return zload(reiser4_carry_real(node));
3177 +}
3178 +
3179 +/* lock carry node
3180 +
3181 + "Resolve" node to real znode, lock it and mark as locked.
3182 + This requires recursive locking of znodes.
3183 +
3184 + When operation is posted to the parent level, node it will be applied to is
3185 + not yet known. For example, when shifting data between two nodes,
3186 + delimiting keys have to be updated in parent or parents of nodes involved. But
3187 + their parents are not yet locked and, moreover, said nodes can be reparented
3188 + by concurrent balancing.
3189 +
3190 + To work around this, carry operation is applied to special "carry node"
3191 + rather than to the znode itself. Carry node consists of some "base" or
3192 + "reference" znode and flags indicating how to get to the target of carry
3193 + operation (->real_node field of carry_node) from base.
3194 +
3195 +*/
3196 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3197 + carry_node * node /* node to lock */ )
3198 +{
3199 + int result;
3200 + znode *reference_point;
3201 + lock_handle lh;
3202 + lock_handle tmp_lh;
3203 + reiser4_tree *tree;
3204 +
3205 + assert("nikita-887", level != NULL);
3206 + assert("nikita-882", node != NULL);
3207 +
3208 + result = 0;
3209 + reference_point = node->node;
3210 + init_lh(&lh);
3211 + init_lh(&tmp_lh);
3212 + if (node->left_before) {
3213 + /* handling of new nodes, allocated on the previous level:
3214 +
3215 + some carry ops were probably posted from the new node, but
3216 + this node neither has parent pointer set, nor is
3217 + connected. This will be done in ->create_hook() for
3218 + internal item.
3219 +
3220 + Nonetheless, parent of new node has to be locked. To do
3221 + this, first go to the "left" in the carry order. This
3222 + depends on the decision to always allocate new node on the
3223 + right of existing one.
3224 +
3225 + Loop handles case when multiple nodes, all orphans, were
3226 + inserted.
3227 +
3228 + Strictly speaking, taking tree lock is not necessary here,
3229 + because all nodes scanned by loop in
3230 + find_begetting_brother() are write-locked by this thread,
3231 + and thus, their sibling linkage cannot change.
3232 +
3233 + */
3234 + tree = znode_get_tree(reference_point);
3235 + read_lock_tree(tree);
3236 + reference_point = find_begetting_brother(node, level)->node;
3237 + read_unlock_tree(tree);
3238 + assert("nikita-1186", reference_point != NULL);
3239 + }
3240 + if (node->parent && (result == 0)) {
3241 + result =
3242 + reiser4_get_parent(&tmp_lh, reference_point,
3243 + ZNODE_WRITE_LOCK);
3244 + if (result != 0) {
3245 + ; /* nothing */
3246 + } else if (znode_get_level(tmp_lh.node) == 0) {
3247 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3248 + result = add_new_root(level, node, tmp_lh.node);
3249 + if (result == 0) {
3250 + reference_point = level->new_root;
3251 + move_lh(&lh, &node->lock_handle);
3252 + }
3253 + } else if ((level->new_root != NULL)
3254 + && (level->new_root !=
3255 + znode_parent_nolock(reference_point))) {
3256 + /* parent of node exists, but this level already
3257 + created different new root, so */
3258 + warning("nikita-1109",
3259 + /* it should be "radicis", but tradition is
3260 + tradition. do banshees read latin? */
3261 + "hodie natus est radici frater");
3262 + result = -EIO;
3263 + } else {
3264 + move_lh(&lh, &tmp_lh);
3265 + reference_point = lh.node;
3266 + }
3267 + }
3268 + if (node->left && (result == 0)) {
3269 + assert("nikita-1183", node->parent);
3270 + assert("nikita-883", reference_point != NULL);
3271 + result =
3272 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3273 + ZNODE_WRITE_LOCK,
3274 + GN_CAN_USE_UPPER_LEVELS);
3275 + if (result == 0) {
3276 + done_lh(&lh);
3277 + move_lh(&lh, &tmp_lh);
3278 + reference_point = lh.node;
3279 + }
3280 + }
3281 + if (!node->parent && !node->left && !node->left_before) {
3282 + result =
3283 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3284 + ZNODE_LOCK_HIPRI);
3285 + }
3286 + if (result == 0) {
3287 + move_lh(&node->lock_handle, &lh);
3288 + result = lock_carry_node_tail(node);
3289 + }
3290 + done_lh(&tmp_lh);
3291 + done_lh(&lh);
3292 + return result;
3293 +}
3294 +
3295 +/* release a lock on &carry_node.
3296 +
3297 + Release if necessary lock on @node. This operation is the pair of
3298 + lock_carry_node() and is idempotent: you can call it more than once on the
3299 + same node.
3300 +
3301 +*/
3302 +static void
3303 +unlock_carry_node(carry_level * level,
3304 + carry_node * node /* node to be released */ ,
3305 + int failure /* 0 if node is unlocked due
3306 + * to some error */ )
3307 +{
3308 + znode *real_node;
3309 +
3310 + assert("nikita-884", node != NULL);
3311 +
3312 + real_node = reiser4_carry_real(node);
3313 + /* pair to zload() in lock_carry_node_tail() */
3314 + zrelse(real_node);
3315 + if (node->unlock && (real_node != NULL)) {
3316 + assert("nikita-899", real_node == node->lock_handle.node);
3317 + longterm_unlock_znode(&node->lock_handle);
3318 + }
3319 + if (failure) {
3320 + if (node->deallocate && (real_node != NULL)) {
3321 + /* free node in bitmap
3322 +
3323 + Prepare node for removal. Last zput() will finish
3324 + with it.
3325 + */
3326 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3327 + }
3328 + if (node->free) {
3329 + assert("nikita-2177",
3330 + list_empty_careful(&node->lock_handle.locks_link));
3331 + assert("nikita-2112",
3332 + list_empty_careful(&node->lock_handle.owners_link));
3333 + reiser4_pool_free(&level->pool->node_pool,
3334 + &node->header);
3335 + }
3336 + }
3337 +}
3338 +
3339 +/* fatal_carry_error() - all-catching error handling function
3340 +
3341 + It is possible that carry faces unrecoverable error, like inability to
3342 + insert pointer at the internal level. Our simple solution is just panic in
3343 + this situation. More sophisticated things like attempt to remount
3344 + file-system as read-only can be implemented without much difficulties.
3345 +
3346 + It is believed, that:
3347 +
3348 + 1. instead of panicking, all current transactions can be aborted rolling
3349 + system back to the consistent state.
3350 +
3351 +Umm, if you simply panic without doing anything more at all, then all current
3352 +transactions are aborted and the system is rolled back to a consistent state,
3353 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3354 +precise. If an internal node is corrupted on disk due to hardware failure,
3355 +then there may be no consistent state that can be rolled back to, so instead
3356 +we should say that it will rollback the transactions, which barring other
3357 +factors means rolling back to a consistent state.
3358 +
3359 +# Nikita: there is a subtle difference between panic and aborting
3360 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3361 +# don't using reiser4 (not that we care about such processes), or using other
3362 +# reiser4 mounts (about them we do care) will simply continue to run. With
3363 +# some luck, even application using aborted file system can survive: it will
3364 +# get some error, like EBADF, from each file descriptor on failed file system,
3365 +# but applications that do care about tolerance will cope with this (squid
3366 +# will).
3367 +
3368 +It would be a nice feature though to support rollback without rebooting
3369 +followed by remount, but this can wait for later versions.
3370 +
3371 + 2. once isolated transactions will be implemented it will be possible to
3372 + roll back offending transaction.
3373 +
3374 +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
3375 +it more before deciding if it should be done. -Hans
3376 +
3377 +*/
3378 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3379 + * where
3380 + * unrecoverable
3381 + * error
3382 + * occurred */ ,
3383 + int ecode /* error code */ )
3384 +{
3385 + assert("nikita-1230", doing != NULL);
3386 + assert("nikita-1231", ecode < 0);
3387 +
3388 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3389 +}
3390 +
3391 +/* add new root to the tree
3392 +
3393 + This function itself only manages changes in carry structures and delegates
3394 + all hard work (allocation of znode for new root, changes of parent and
3395 + sibling pointers) to the reiser4_add_tree_root().
3396 +
3397 + Locking: old tree root is locked by carry at this point. Fake znode is also
3398 + locked.
3399 +
3400 +*/
3401 +static int add_new_root(carry_level * level /* carry level in context of which
3402 + * operation is performed */ ,
3403 + carry_node * node /* carry node for existing root */ ,
3404 + znode * fake /* "fake" znode already locked by
3405 + * us */ )
3406 +{
3407 + int result;
3408 +
3409 + assert("nikita-1104", level != NULL);
3410 + assert("nikita-1105", node != NULL);
3411 +
3412 + assert("nikita-1403", znode_is_write_locked(node->node));
3413 + assert("nikita-1404", znode_is_write_locked(fake));
3414 +
3415 + /* trying to create new root. */
3416 + /* @node is root and it's already locked by us. This
3417 + means that nobody else can be trying to add/remove
3418 + tree root right now.
3419 + */
3420 + if (level->new_root == NULL)
3421 + level->new_root = reiser4_add_tree_root(node->node, fake);
3422 + if (!IS_ERR(level->new_root)) {
3423 + assert("nikita-1210", znode_is_root(level->new_root));
3424 + node->deallocate = 1;
3425 + result =
3426 + longterm_lock_znode(&node->lock_handle, level->new_root,
3427 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3428 + if (result == 0)
3429 + zput(level->new_root);
3430 + } else {
3431 + result = PTR_ERR(level->new_root);
3432 + level->new_root = NULL;
3433 + }
3434 + return result;
3435 +}
3436 +
3437 +/* allocate new znode and add the operation that inserts the
3438 + pointer to it into the parent node into the todo level
3439 +
3440 + Allocate new znode, add it into carry queue and post into @todo queue
3441 + request to add pointer to new node into its parent.
3442 +
3443 + This is carry related routing that calls reiser4_new_node() to allocate new
3444 + node.
3445 +*/
3446 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3447 + * node */ ,
3448 + carry_node * ref /* carry node after which new
3449 + * carry node is to be inserted
3450 + * into queue. This affects
3451 + * locking. */ ,
3452 + carry_level * doing /* carry queue where new node is
3453 + * to be added */ ,
3454 + carry_level * todo /* carry queue where COP_INSERT
3455 + * operation to add pointer to
3456 + * new node will be added */ )
3457 +{
3458 + carry_node *fresh;
3459 + znode *new_znode;
3460 + carry_op *add_pointer;
3461 + carry_plugin_info info;
3462 +
3463 + assert("nikita-1048", brother != NULL);
3464 + assert("nikita-1049", todo != NULL);
3465 +
3466 + /* There is a lot of possible variations here: to what parent
3467 + new node will be attached and where. For simplicity, always
3468 + do the following:
3469 +
3470 + (1) new node and @brother will have the same parent.
3471 +
3472 + (2) new node is added on the right of @brother
3473 +
3474 + */
3475 +
3476 + fresh = reiser4_add_carry_skip(doing,
3477 + ref ? POOLO_AFTER : POOLO_LAST, ref);
3478 + if (IS_ERR(fresh))
3479 + return fresh;
3480 +
3481 + fresh->deallocate = 1;
3482 + fresh->free = 1;
3483 +
3484 + new_znode = reiser4_new_node(brother, znode_get_level(brother));
3485 + if (IS_ERR(new_znode))
3486 + /* @fresh will be deallocated automatically by error
3487 + handling code in the caller. */
3488 + return (carry_node *) new_znode;
3489 +
3490 + /* new_znode returned znode with x_count 1. Caller has to decrease
3491 + it. make_space() does. */
3492 +
3493 + ZF_SET(new_znode, JNODE_ORPHAN);
3494 + fresh->node = new_znode;
3495 +
3496 + while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3497 + ref = carry_node_prev(ref);
3498 + assert("nikita-1606", !carry_node_end(doing, ref));
3499 + }
3500 +
3501 + info.todo = todo;
3502 + info.doing = doing;
3503 + add_pointer = node_post_carry(&info, COP_INSERT,
3504 + reiser4_carry_real(ref), 1);
3505 + if (IS_ERR(add_pointer)) {
3506 + /* no need to deallocate @new_znode here: it will be
3507 + deallocated during carry error handling. */
3508 + return (carry_node *) add_pointer;
3509 + }
3510 +
3511 + add_pointer->u.insert.type = COPT_CHILD;
3512 + add_pointer->u.insert.child = fresh;
3513 + add_pointer->u.insert.brother = brother;
3514 + /* initially new node spawns empty key range */
3515 + write_lock_dk(znode_get_tree(brother));
3516 + znode_set_ld_key(new_znode,
3517 + znode_set_rd_key(new_znode,
3518 + znode_get_rd_key(brother)));
3519 + write_unlock_dk(znode_get_tree(brother));
3520 + return fresh;
3521 +}
3522 +
3523 +/* DEBUGGING FUNCTIONS.
3524 +
3525 + Probably we also should leave them on even when
3526 + debugging is turned off to print dumps at errors.
3527 +*/
3528 +#if REISER4_DEBUG
3529 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3530 +{
3531 + carry_node *node;
3532 + carry_node *tmp_node;
3533 +
3534 + if (level == NULL)
3535 + return 0;
3536 +
3537 + if (level->track_type != 0 &&
3538 + level->track_type != CARRY_TRACK_NODE &&
3539 + level->track_type != CARRY_TRACK_CHANGE)
3540 + return 0;
3541 +
3542 + /* check that nodes are in ascending order */
3543 + for_all_nodes(level, node, tmp_node) {
3544 + znode *left;
3545 + znode *right;
3546 +
3547 + reiser4_key lkey;
3548 + reiser4_key rkey;
3549 +
3550 + if (node != carry_node_front(level)) {
3551 + if (state == CARRY_TODO) {
3552 + right = node->node;
3553 + left = carry_node_prev(node)->node;
3554 + } else {
3555 + right = reiser4_carry_real(node);
3556 + left = reiser4_carry_real(carry_node_prev(node));
3557 + }
3558 + if (right == NULL || left == NULL)
3559 + continue;
3560 + if (node_is_empty(right) || node_is_empty(left))
3561 + continue;
3562 + if (!keyle(leftmost_key_in_node(left, &lkey),
3563 + leftmost_key_in_node(right, &rkey))) {
3564 + warning("", "wrong key order");
3565 + return 0;
3566 + }
3567 + }
3568 + }
3569 + return 1;
3570 +}
3571 +#endif
3572 +
3573 +/* get symbolic name for boolean */
3574 +static const char *tf(int boolean /* truth value */ )
3575 +{
3576 + return boolean ? "t" : "f";
3577 +}
3578 +
3579 +/* symbolic name for carry operation */
3580 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3581 +{
3582 + switch (op) {
3583 + case COP_INSERT:
3584 + return "COP_INSERT";
3585 + case COP_DELETE:
3586 + return "COP_DELETE";
3587 + case COP_CUT:
3588 + return "COP_CUT";
3589 + case COP_PASTE:
3590 + return "COP_PASTE";
3591 + case COP_UPDATE:
3592 + return "COP_UPDATE";
3593 + case COP_EXTENT:
3594 + return "COP_EXTENT";
3595 + case COP_INSERT_FLOW:
3596 + return "COP_INSERT_FLOW";
3597 + default:{
3598 + /* not mt safe, but who cares? */
3599 + static char buf[20];
3600 +
3601 + sprintf(buf, "unknown op: %x", op);
3602 + return buf;
3603 + }
3604 + }
3605 +}
3606 +
3607 +/* dump information about carry node */
3608 +static void print_carry(const char *prefix /* prefix to print */ ,
3609 + carry_node * node /* node to print */ )
3610 +{
3611 + if (node == NULL) {
3612 + printk("%s: null\n", prefix);
3613 + return;
3614 + }
3615 + printk
3616 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3617 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3618 + tf(node->free), tf(node->deallocate));
3619 +}
3620 +
3621 +/* dump information about carry operation */
3622 +static void print_op(const char *prefix /* prefix to print */ ,
3623 + carry_op * op /* operation to print */ )
3624 +{
3625 + if (op == NULL) {
3626 + printk("%s: null\n", prefix);
3627 + return;
3628 + }
3629 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3630 + print_carry("\tnode", op->node);
3631 + switch (op->op) {
3632 + case COP_INSERT:
3633 + case COP_PASTE:
3634 + print_coord("\tcoord",
3635 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3636 + reiser4_print_key("\tkey",
3637 + op->u.insert.d ? op->u.insert.d->key : NULL);
3638 + print_carry("\tchild", op->u.insert.child);
3639 + break;
3640 + case COP_DELETE:
3641 + print_carry("\tchild", op->u.delete.child);
3642 + break;
3643 + case COP_CUT:
3644 + if (op->u.cut_or_kill.is_cut) {
3645 + print_coord("\tfrom",
3646 + op->u.cut_or_kill.u.kill->params.from, 0);
3647 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3648 + 0);
3649 + } else {
3650 + print_coord("\tfrom",
3651 + op->u.cut_or_kill.u.cut->params.from, 0);
3652 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3653 + 0);
3654 + }
3655 + break;
3656 + case COP_UPDATE:
3657 + print_carry("\tleft", op->u.update.left);
3658 + break;
3659 + default:
3660 + /* do nothing */
3661 + break;
3662 + }
3663 +}
3664 +
3665 +/* dump information about all nodes and operations in a @level */
3666 +static void print_level(const char *prefix /* prefix to print */ ,
3667 + carry_level * level /* level to print */ )
3668 +{
3669 + carry_node *node;
3670 + carry_node *tmp_node;
3671 + carry_op *op;
3672 + carry_op *tmp_op;
3673 +
3674 + if (level == NULL) {
3675 + printk("%s: null\n", prefix);
3676 + return;
3677 + }
3678 + printk("%s: %p, restartable: %s\n",
3679 + prefix, level, tf(level->restartable));
3680 +
3681 + for_all_nodes(level, node, tmp_node)
3682 + print_carry("\tcarry node", node);
3683 + for_all_ops(level, op, tmp_op)
3684 + print_op("\tcarry op", op);
3685 +}
3686 +
3687 +/* Make Linus happy.
3688 + Local variables:
3689 + c-indentation-style: "K&R"
3690 + mode-name: "LC"
3691 + c-basic-offset: 8
3692 + tab-width: 8
3693 + fill-column: 120
3694 + scroll-step: 1
3695 + End:
3696 +*/
3697 diff -urN linux-2.6.23.orig/fs/reiser4/carry.h linux-2.6.23/fs/reiser4/carry.h
3698 --- linux-2.6.23.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3699 +++ linux-2.6.23/fs/reiser4/carry.h 2007-12-04 16:49:30.000000000 +0300
3700 @@ -0,0 +1,442 @@
3701 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3702 +
3703 +/* Functions and data types to "carry" tree modification(s) upward.
3704 + See fs/reiser4/carry.c for details. */
3705 +
3706 +#if !defined( __FS_REISER4_CARRY_H__ )
3707 +#define __FS_REISER4_CARRY_H__
3708 +
3709 +#include "forward.h"
3710 +#include "debug.h"
3711 +#include "pool.h"
3712 +#include "znode.h"
3713 +
3714 +#include <linux/types.h>
3715 +
3716 +/* &carry_node - "location" of carry node.
3717 +
3718 + "location" of node that is involved or going to be involved into
3719 + carry process. Node where operation will be carried to on the
3720 + parent level cannot be recorded explicitly. Operation will be carried
3721 + usually to the parent of some node (where changes are performed at
3722 + the current level) or, to the left neighbor of its parent. But while
3723 + modifications are performed at the current level, parent may
3728 3724 + change. So, we have to allow some indirection (or, positively,
3725 + flexibility) in locating carry nodes.
3726 +
3727 +*/
3728 +typedef struct carry_node {
3729 + /* pool linkage */
3730 + struct reiser4_pool_header header;
3731 +
3732 + /* base node from which real_node is calculated. See
3733 + fs/reiser4/carry.c:lock_carry_node(). */
3734 + znode *node;
3735 +
3736 + /* how to get ->real_node */
3737 + /* to get ->real_node obtain parent of ->node */
3738 + __u32 parent:1;
3739 + /* to get ->real_node obtain left neighbor of parent of
3740 + ->node */
3741 + __u32 left:1;
3742 + __u32 left_before:1;
3743 +
3744 + /* locking */
3745 +
3746 + /* this node was locked by carry process and should be
3747 + unlocked when carry leaves a level */
3748 + __u32 unlock:1;
3749 +
3750 + /* disk block for this node was allocated by carry process and
3751 + should be deallocated when carry leaves a level */
3752 + __u32 deallocate:1;
3753 + /* this carry node was allocated by carry process and should be
3754 + freed when carry leaves a level */
3755 + __u32 free:1;
3756 +
3757 + /* type of lock we want to take on this node */
3758 + lock_handle lock_handle;
3759 +} carry_node;
3760 +
3761 +/* &carry_opcode - elementary operations that can be carried upward
3762 +
3763 + Operations that carry() can handle. This list is supposed to be
3764 + expanded.
3765 +
3766 + Each carry operation (cop) is handled by appropriate function defined
3767 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
3768 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3769 + call plugins of nodes affected by operation to modify nodes' content
3770 + and to gather operations to be performed on the next level.
3771 +
3772 +*/
3773 +typedef enum {
3774 + /* insert new item into node. */
3775 + COP_INSERT,
3776 + /* delete pointer from parent node */
3777 + COP_DELETE,
3778 + /* remove part of or whole node. */
3779 + COP_CUT,
3780 + /* increase size of item. */
3781 + COP_PASTE,
3782 + /* insert extent (that is sequence of unformatted nodes). */
3783 + COP_EXTENT,
3784 + /* update delimiting key in least common ancestor of two
3785 + nodes. This is performed when items are moved between two
3786 + nodes.
3787 + */
3788 + COP_UPDATE,
3789 + /* insert flow */
3790 + COP_INSERT_FLOW,
3791 + COP_LAST_OP,
3792 +} carry_opcode;
3793 +
3794 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
3795 +
3796 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3797 + item is determined. */
3798 +typedef enum {
3799 + /* target item is one containing pointer to the ->child node */
3800 + COPT_CHILD,
3801 + /* target item is given explicitly by @coord */
3802 + COPT_ITEM_DATA,
3803 + /* target item is given by key */
3804 + COPT_KEY,
3805 + /* see insert_paste_common() for more comments on this. */
3806 + COPT_PASTE_RESTARTED,
3807 +} cop_insert_pos_type;
3808 +
3809 +/* flags to cut and delete */
3810 +typedef enum {
3811 + /* don't kill node even if it became completely empty as results of
3812 + * cut. This is needed for eottl handling. See carry_extent() for
3813 + * details. */
3814 + DELETE_RETAIN_EMPTY = (1 << 0)
3815 +} cop_delete_flag;
3816 +
3817 +/*
3818 + * carry() implements "lock handle tracking" feature.
3819 + *
3820 + * Callers supply carry with node where to perform initial operation and lock
3821 + * handle on this node. Trying to optimize node utilization carry may actually
3822 + * move insertion point to different node. Callers expect that lock handle
3827 3823 + * will be transferred to the new node also.
3824 + *
3825 + */
3826 +typedef enum {
3827 + /* transfer lock handle along with insertion point */
3828 + CARRY_TRACK_CHANGE = 1,
3829 + /* acquire new lock handle to the node where insertion point is. This
3830 + * is used when carry() client doesn't initially possess lock handle
3831 + * on the insertion point node, for example, by extent insertion
3832 + * code. See carry_extent(). */
3833 + CARRY_TRACK_NODE = 2
3834 +} carry_track_type;
3835 +
3836 +/* data supplied to COP_{INSERT|PASTE} by callers */
3837 +typedef struct carry_insert_data {
3838 + /* position where new item is to be inserted */
3839 + coord_t *coord;
3840 + /* new item description */
3841 + reiser4_item_data *data;
3842 + /* key of new item */
3843 + const reiser4_key *key;
3844 +} carry_insert_data;
3845 +
3846 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3847 +struct cut_kill_params {
3848 + /* coord where cut starts (inclusive) */
3849 + coord_t *from;
3850 + /* coord where cut stops (inclusive, this item/unit will also be
3851 + * cut) */
3852 + coord_t *to;
3853 + /* starting key. This is necessary when item and unit pos don't
3854 + * uniquely identify what portion or tree to remove. For example, this
3855 + * indicates what portion of extent unit will be affected. */
3856 + const reiser4_key *from_key;
3857 + /* exclusive stop key */
3858 + const reiser4_key *to_key;
3859 + /* if this is not NULL, smallest actually removed key is stored
3860 + * here. */
3861 + reiser4_key *smallest_removed;
3862 + /* kill_node_content() is called for file truncate */
3863 + int truncate;
3864 +};
3865 +
3866 +struct carry_cut_data {
3867 + struct cut_kill_params params;
3868 +};
3869 +
3870 +struct carry_kill_data {
3871 + struct cut_kill_params params;
3872 + /* parameter to be passed to the ->kill_hook() method of item
3873 + * plugin */
3874 + /*void *iplug_params; *//* FIXME: unused currently */
3875 + /* if not NULL---inode whose items are being removed. This is needed
3876 + * for ->kill_hook() of extent item to update VM structures when
3877 + * removing pages. */
3878 + struct inode *inode;
3879 + /* sibling list maintenance is complicated by existence of eottl. When
3880 + * eottl whose left and right neighbors are formatted leaves is
3881 + * removed, one has to connect said leaves in the sibling list. This
3882 + * cannot be done when extent removal is just started as locking rules
3883 + * require sibling list update to happen atomically with removal of
3884 + * extent item. Therefore: 1. pointers to left and right neighbors
3885 + * have to be passed down to the ->kill_hook() of extent item, and
3886 + * 2. said neighbors have to be locked. */
3887 + lock_handle *left;
3888 + lock_handle *right;
3889 + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3890 + unsigned flags;
3891 + char *buf;
3892 +};
3893 +
3894 +/* &carry_tree_op - operation to "carry" upward.
3895 +
3896 + Description of an operation we want to "carry" to the upper level of
3897 + a tree: e.g, when we insert something and there is not enough space
3898 + we allocate a new node and "carry" the operation of inserting a
3899 + pointer to the new node to the upper level, on removal of empty node,
3900 + we carry up operation of removing appropriate entry from parent.
3901 +
3906 3902 + There are two types of carry ops: when adding or deleting a node, the
3903 + node at the parent level where appropriate modification has to be
3904 + performed is known in advance. When shifting items between nodes
3905 + (split, merge), delimiting key should be changed in the least common
3906 + parent of the nodes involved that is not known in advance.
3907 +
3908 + For the operations of the first type we store in &carry_op pointer to
3909 + the &carry_node at the parent level. For the operation of the second
3910 + type we store &carry_node or parents of the left and right nodes
3911 + modified and keep track of them upward until they coincide.
3912 +
3913 +*/
3914 +typedef struct carry_op {
3915 + /* pool linkage */
3916 + struct reiser4_pool_header header;
3917 + carry_opcode op;
3918 + /* node on which operation is to be performed:
3919 +
3920 + for insert, paste: node where new item is to be inserted
3921 +
3922 + for delete: node where pointer is to be deleted
3923 +
3924 + for cut: node to cut from
3925 +
3926 + for update: node where delimiting key is to be modified
3927 +
3928 + for modify: parent of modified node
3929 +
3930 + */
3931 + carry_node *node;
3932 + union {
3933 + struct {
3934 + /* (sub-)type of insertion/paste. Taken from
3935 + cop_insert_pos_type. */
3936 + __u8 type;
3937 + /* various operation flags. Taken from
3938 + cop_insert_flag. */
3939 + __u8 flags;
3940 + carry_insert_data *d;
3941 + carry_node *child;
3942 + znode *brother;
3943 + } insert, paste, extent;
3944 +
3945 + struct {
3946 + int is_cut;
3947 + union {
3948 + carry_kill_data *kill;
3949 + carry_cut_data *cut;
3950 + } u;
3951 + } cut_or_kill;
3952 +
3953 + struct {
3954 + carry_node *left;
3955 + } update;
3956 + struct {
3957 + /* changed child */
3958 + carry_node *child;
3959 + /* bitmask of changes. See &cop_modify_flag */
3960 + __u32 flag;
3961 + } modify;
3962 + struct {
3963 + /* flags to deletion operation. Are taken from
3964 + cop_delete_flag */
3965 + __u32 flags;
3966 + /* child to delete from parent. If this is
3967 + NULL, delete op->node. */
3968 + carry_node *child;
3969 + } delete;
3970 + struct {
3971 + /* various operation flags. Taken from
3972 + cop_insert_flag. */
3973 + __u32 flags;
3974 + flow_t *flow;
3975 + coord_t *insert_point;
3976 + reiser4_item_data *data;
3977 + /* flow insertion is limited by number of new blocks
3978 + added in that operation which do not get any data
3979 + but part of flow. This limit is set by macro
3980 + CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
3981 + of nodes added already during one carry_flow */
3982 + int new_nodes;
3983 + } insert_flow;
3984 + } u;
3985 +} carry_op;
3986 +
3987 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3988 +typedef struct carry_pool {
3989 + carry_op op[CARRIES_POOL_SIZE];
3990 + struct reiser4_pool op_pool;
3991 + carry_node node[NODES_LOCKED_POOL_SIZE];
3992 + struct reiser4_pool node_pool;
3993 +} carry_pool;
3994 +
3995 +/* &carry_tree_level - carry process on given level
3996 +
3997 + Description of balancing process on the given level.
3998 +
3999 + No need for locking here, as carry_tree_level is essentially per
4000 + thread thing (for now).
4001 +
4002 +*/
4003 +struct carry_level {
4004 + /* this level may be restarted */
4005 + __u32 restartable:1;
4006 + /* list of carry nodes on this level, ordered by key order */
4007 + struct list_head nodes;
4008 + struct list_head ops;
4009 + /* pool where new objects are allocated from */
4010 + carry_pool *pool;
4011 + int ops_num;
4012 + int nodes_num;
4013 + /* new root created on this level, if any */
4014 + znode *new_root;
4019 4015 + /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
4016 + when they want ->tracked to automagically wander to the node where
4017 + insertion point moved after insert or paste.
4018 + */
4019 + carry_track_type track_type;
4020 + /* lock handle supplied by user that we are tracking. See
4021 + above. */
4022 + lock_handle *tracked;
4023 +};
4024 +
4025 +/* information carry passes to plugin methods that may add new operations to
4026 + the @todo queue */
4027 +struct carry_plugin_info {
4028 + carry_level *doing;
4029 + carry_level *todo;
4030 +};
4031 +
4032 +int reiser4_carry(carry_level * doing, carry_level * done);
4033 +
4034 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4035 + carry_node * reference);
4036 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4037 + carry_node * reference);
4038 +
4039 +extern carry_node *insert_carry_node(carry_level * doing,
4040 + carry_level * todo, const znode * node);
4041 +
4042 +extern carry_pool *init_carry_pool(int);
4043 +extern void done_carry_pool(carry_pool * pool);
4044 +
4045 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4046 +
4047 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4048 + znode * node, int apply_to_parent);
4049 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4050 + znode * node, int apply_to_parent_p);
4051 +
4052 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4053 + carry_level * doing, carry_level * todo);
4054 +
4055 +carry_node *find_carry_node(carry_level * level, const znode * node);
4056 +
4057 +extern znode *reiser4_carry_real(const carry_node * node);
4058 +
4059 +/* helper macros to iterate over carry queues */
4060 +
4061 +#define carry_node_next( node ) \
4062 + list_entry((node)->header.level_linkage.next, carry_node, \
4063 + header.level_linkage)
4064 +
4065 +#define carry_node_prev( node ) \
4066 + list_entry((node)->header.level_linkage.prev, carry_node, \
4067 + header.level_linkage)
4068 +
4069 +#define carry_node_front( level ) \
4070 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4071 +
4072 +#define carry_node_back( level ) \
4073 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4074 +
4075 +#define carry_node_end( level, node ) \
4076 + (&(level)->nodes == &(node)->header.level_linkage)
4077 +
4078 +/* macro to iterate over all operations in a @level */
4079 +#define for_all_ops( level /* carry level (of type carry_level *) */, \
4080 + op /* pointer to carry operation, modified by loop (of \
4081 + * type carry_op *) */, \
4082 + tmp /* pointer to carry operation (of type carry_op *), \
4083 + * used to make iterator stable in the face of \
4084 + * deletions from the level */ ) \
4085 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4086 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4087 + &op->header.level_linkage != &level->ops; \
4088 + op = tmp, \
4089 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4090 +
4091 +#if 0
4092 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4093 + tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4094 + ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4095 + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4096 +#endif
4097 +
4102 4098 +/* macro to iterate over all nodes in a @level */
4099 +#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4100 + node /* pointer to carry node, modified by loop (of \
4101 + * type carry_node *) */, \
4102 + tmp /* pointer to carry node (of type carry_node *), \
4103 + * used to make iterator stable in the face of * \
4104 + * deletions from the level */ ) \
4105 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4106 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4107 + &node->header.level_linkage != &level->nodes; \
4108 + node = tmp, \
4109 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4110 +
4111 +#if 0
4112 +for( node = carry_node_front( level ), \
4113 + tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4114 + node = tmp, tmp = carry_node_next( node ) )
4115 +#endif
4116 +
4117 +/* macro to iterate over all nodes in a @level in reverse order
4118 +
4119 + This is used, because nodes are unlocked in reversed order of locking */
4120 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4121 + node /* pointer to carry node, modified by loop \
4122 + * (of type carry_node *) */, \
4123 + tmp /* pointer to carry node (of type carry_node \
4124 + * *), used to make iterator stable in the \
4125 + * face of deletions from the level */ ) \
4126 +for( node = carry_node_back( level ), \
4127 + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4128 + node = tmp, tmp = carry_node_prev( node ) )
4129 +
4130 +/* __FS_REISER4_CARRY_H__ */
4131 +#endif
4132 +
4133 +/* Make Linus happy.
4134 + Local variables:
4135 + c-indentation-style: "K&R"
4136 + mode-name: "LC"
4137 + c-basic-offset: 8
4138 + tab-width: 8
4139 + fill-column: 120
4140 + scroll-step: 1
4141 + End:
4142 +*/
4143 diff -urN linux-2.6.23.orig/fs/reiser4/carry_ops.c linux-2.6.23/fs/reiser4/carry_ops.c
4144 --- linux-2.6.23.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4145 +++ linux-2.6.23/fs/reiser4/carry_ops.c 2007-12-04 16:49:30.000000000 +0300
4146 @@ -0,0 +1,2131 @@
4147 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4148 +
4149 +/* implementation of carry operations */
4150 +
4151 +#include "forward.h"
4152 +#include "debug.h"
4153 +#include "key.h"
4154 +#include "coord.h"
4155 +#include "plugin/item/item.h"
4156 +#include "plugin/node/node.h"
4157 +#include "jnode.h"
4158 +#include "znode.h"
4159 +#include "block_alloc.h"
4160 +#include "tree_walk.h"
4161 +#include "pool.h"
4162 +#include "tree_mod.h"
4163 +#include "carry.h"
4164 +#include "carry_ops.h"
4165 +#include "tree.h"
4166 +#include "super.h"
4167 +#include "reiser4.h"
4168 +
4169 +#include <linux/types.h>
4170 +#include <linux/err.h>
4171 +
4172 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4173 + carry_level * doing, carry_level * todo,
4174 + unsigned int including_insert_coord_p);
4175 +
4176 +extern int lock_carry_node(carry_level * level, carry_node * node);
4177 +extern int lock_carry_node_tail(carry_node * node);
4178 +
4179 +/* find left neighbor of a carry node
4180 +
4181 + Look for left neighbor of @node and add it to the @doing queue. See
4182 + comments in the body.
4183 +
4184 +*/
4185 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4186 + * neighbor of */ ,
4187 + carry_level * doing /* level to scan */ )
4188 +{
4189 + int result;
4190 + carry_node *node;
4191 + carry_node *left;
4192 + int flags;
4193 + reiser4_tree *tree;
4194 +
4195 + node = op->node;
4196 +
4197 + tree = current_tree;
4198 + read_lock_tree(tree);
4199 + /* first, check whether left neighbor is already in a @doing queue */
4200 + if (reiser4_carry_real(node)->left != NULL) {
4201 + /* NOTE: there is locking subtlety here. Look into
4202 + * find_right_neighbor() for more info */
4203 + if (find_carry_node(doing,
4204 + reiser4_carry_real(node)->left) != NULL) {
4205 + read_unlock_tree(tree);
4206 + left = node;
4207 + do {
4208 + left = list_entry(left->header.level_linkage.prev,
4209 + carry_node, header.level_linkage);
4210 + assert("nikita-3408", !carry_node_end(doing,
4211 + left));
4212 + } while (reiser4_carry_real(left) ==
4213 + reiser4_carry_real(node));
4214 + return left;
4215 + }
4216 + }
4217 + read_unlock_tree(tree);
4218 +
4219 + left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4220 + if (IS_ERR(left))
4221 + return left;
4222 +
4223 + left->node = node->node;
4224 + left->free = 1;
4225 +
4226 + flags = GN_TRY_LOCK;
4231 4227 + if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4228 + flags |= GN_NO_ALLOC;
4229 +
4230 + /* then, feeling lucky, peek left neighbor in the cache. */
4231 + result = reiser4_get_left_neighbor(&left->lock_handle,
4232 + reiser4_carry_real(node),
4233 + ZNODE_WRITE_LOCK, flags);
4234 + if (result == 0) {
4235 + /* ok, node found and locked. */
4236 + result = lock_carry_node_tail(left);
4237 + if (result != 0)
4238 + left = ERR_PTR(result);
4239 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4240 + /* node is leftmost node in a tree, or neighbor wasn't in
4241 + cache, or there is an extent on the left. */
4242 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4243 + left = NULL;
4244 + } else if (doing->restartable) {
4245 + /* if left neighbor is locked, and level is restartable, add
4246 + new node to @doing and restart. */
4247 + assert("nikita-913", node->parent != 0);
4248 + assert("nikita-914", node->node != NULL);
4249 + left->left = 1;
4250 + left->free = 0;
4251 + left = ERR_PTR(-E_REPEAT);
4252 + } else {
4253 + /* left neighbor is locked, level cannot be restarted. Just
4254 + ignore left neighbor. */
4255 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4256 + left = NULL;
4257 + }
4258 + return left;
4259 +}
4260 +
4261 +/* find right neighbor of a carry node
4262 +
4263 + Look for right neighbor of @node and add it to the @doing queue. See
4264 + comments in the body.
4265 +
4266 +*/
4267 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4268 + * neighbor of */ ,
4269 + carry_level * doing /* level to scan */ )
4270 +{
4271 + int result;
4272 + carry_node *node;
4273 + carry_node *right;
4274 + lock_handle lh;
4275 + int flags;
4276 + reiser4_tree *tree;
4277 +
4278 + init_lh(&lh);
4279 +
4280 + node = op->node;
4281 +
4282 + tree = current_tree;
4283 + read_lock_tree(tree);
4284 + /* first, check whether right neighbor is already in a @doing queue */
4285 + if (reiser4_carry_real(node)->right != NULL) {
4286 + /*
4287 + * Tree lock is taken here anyway, because, even if _outcome_
4288 + * of (find_carry_node() != NULL) doesn't depends on
4289 + * concurrent updates to ->right, find_carry_node() cannot
4290 + * work with second argument NULL. Hence, following comment is
4291 + * of historic importance only.
4292 + *
4293 + * Subtle:
4294 + *
4295 + * Q: why don't we need tree lock here, looking for the right
4296 + * neighbor?
4297 + *
4298 + * A: even if value of node->real_node->right were changed
4299 + * during find_carry_node() execution, outcome of execution
4300 + * wouldn't change, because (in short) other thread cannot add
4301 + * elements to the @doing, and if node->real_node->right
4302 + * already was in @doing, value of node->real_node->right
4303 + * couldn't change, because node cannot be inserted between
4304 + * locked neighbors.
4305 + */
4306 + if (find_carry_node(doing,
4307 + reiser4_carry_real(node)->right) != NULL) {
4308 + read_unlock_tree(tree);
4309 + /*
4310 + * What we are doing here (this is also applicable to
4311 + * the find_left_neighbor()).
4312 + *
4313 + * tree_walk.c code requires that insertion of a
4314 + * pointer to a child, modification of parent pointer
4315 + * in the child, and insertion of the child into
4316 + * sibling list are atomic (see
4317 + * plugin/item/internal.c:create_hook_internal()).
4318 + *
4319 + * carry allocates new node long before pointer to it
4320 + * is inserted into parent and, actually, long before
4321 + * parent is even known. Such allocated-but-orphaned
4322 + * nodes are only trackable through carry level lists.
4323 + *
4324 + * Situation that is handled here is following: @node
4325 + * has valid ->right pointer, but there is
4326 + * allocated-but-orphaned node in the carry queue that
4327 + * is logically between @node and @node->right. Here
4328 + * we are searching for it. Critical point is that
4329 + * this is only possible if @node->right is also in
4330 + * the carry queue (this is checked above), because
4331 + * this is the only way new orphaned node could be
4332 + * inserted between them (before inserting new node,
4333 + * make_space() first tries to shift to the right, so,
4334 + * right neighbor will be locked and queued).
4335 + *
4336 + */
4337 + right = node;
4338 + do {
4339 + right = list_entry(right->header.level_linkage.next,
4340 + carry_node, header.level_linkage);
4341 + assert("nikita-3408", !carry_node_end(doing,
4342 + right));
4343 + } while (reiser4_carry_real(right) ==
4344 + reiser4_carry_real(node));
4345 + return right;
4346 + }
4347 + }
4348 + read_unlock_tree(tree);
4349 +
4350 + flags = GN_CAN_USE_UPPER_LEVELS;
4355 4351 + if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4352 + flags = GN_NO_ALLOC;
4353 +
4354 + /* then, try to lock right neighbor */
4355 + init_lh(&lh);
4356 + result = reiser4_get_right_neighbor(&lh,
4357 + reiser4_carry_real(node),
4358 + ZNODE_WRITE_LOCK, flags);
4359 + if (result == 0) {
4360 + /* ok, node found and locked. */
4361 + right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4362 + if (!IS_ERR(right)) {
4363 + right->node = lh.node;
4364 + move_lh(&right->lock_handle, &lh);
4365 + right->free = 1;
4366 + result = lock_carry_node_tail(right);
4367 + if (result != 0)
4368 + right = ERR_PTR(result);
4369 + }
4370 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4371 + /* node is rightmost node in a tree, or neighbor wasn't in
4372 + cache, or there is an extent on the right. */
4373 + right = NULL;
4374 + } else
4375 + right = ERR_PTR(result);
4376 + done_lh(&lh);
4377 + return right;
4378 +}
4379 +
4380 +/* how much free space in a @node is needed for @op
4381 +
4382 + How much space in @node is required for completion of @op, where @op is
4383 + insert or paste operation.
4384 +*/
4385 +static unsigned int space_needed_for_op(znode * node /* znode data are
4386 + * inserted or
4387 + * pasted in */ ,
4388 + carry_op * op /* carry
4389 + operation */ )
4390 +{
4391 + assert("nikita-919", op != NULL);
4392 +
4393 + switch (op->op) {
4394 + default:
4395 + impossible("nikita-1701", "Wrong opcode");
4396 + case COP_INSERT:
4397 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4398 + case COP_PASTE:
4399 + return space_needed(node, op->u.insert.d->coord,
4400 + op->u.insert.d->data, 0);
4401 + }
4402 +}
4403 +
4404 +/* how much space in @node is required to insert or paste @data at
4405 + @coord. */
4406 +unsigned int space_needed(const znode * node /* node data are inserted or
4407 + * pasted in */ ,
4408 + const coord_t * coord /* coord where data are
4409 + * inserted or pasted
4410 + * at */ ,
4411 + const reiser4_item_data * data /* data to insert or
4412 + * paste */ ,
4413 + int insertion /* non-0 is inserting, 0---paste */ )
4414 +{
4415 + int result;
4416 + item_plugin *iplug;
4417 +
4418 + assert("nikita-917", node != NULL);
4419 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4420 + assert("vs-230", !insertion || (coord == NULL));
4421 +
4422 + result = 0;
4423 + iplug = data->iplug;
4424 + if (iplug->b.estimate != NULL) {
4425 + /* ask item plugin how much space is needed to insert this
4426 + item */
4427 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4428 + } else {
4429 + /* reasonable default */
4430 + result += data->length;
4431 + }
4432 + if (insertion) {
4433 + node_plugin *nplug;
4434 +
4435 + nplug = node->nplug;
4436 + /* and add node overhead */
4437 + if (nplug->item_overhead != NULL) {
4438 + result += nplug->item_overhead(node, NULL);
4439 + }
4440 + }
4441 + return result;
4442 +}
4443 +
4444 +/* find &coord in parent where pointer to new child is to be stored. */
4445 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4446 + * insert pointer to new
4447 + * child */ )
4448 +{
4449 + int result;
4450 + znode *node;
4451 + znode *child;
4452 +
4453 + assert("nikita-941", op != NULL);
4454 + assert("nikita-942", op->op == COP_INSERT);
4455 +
4456 + node = reiser4_carry_real(op->node);
4457 + assert("nikita-943", node != NULL);
4458 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4459 +
4460 + child = reiser4_carry_real(op->u.insert.child);
4461 + result =
4462 + find_new_child_ptr(node, child, op->u.insert.brother,
4463 + op->u.insert.d->coord);
4464 +
4465 + build_child_ptr_data(child, op->u.insert.d->data);
4466 + return result;
4467 +}
4468 +
4469 +/* additional amount of free space in @node required to complete @op */
4470 +static int free_space_shortage(znode * node /* node to check */ ,
4471 + carry_op * op /* operation being performed */ )
4472 +{
4473 + assert("nikita-1061", node != NULL);
4474 + assert("nikita-1062", op != NULL);
4475 +
4476 + switch (op->op) {
4477 + default:
4478 + impossible("nikita-1702", "Wrong opcode");
4479 + case COP_INSERT:
4480 + case COP_PASTE:
4481 + return space_needed_for_op(node, op) - znode_free_space(node);
4482 + case COP_EXTENT:
4483 + /* when inserting extent shift data around until insertion
4484 + point is utmost in the node. */
4485 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4486 + return +1;
4487 + else
4488 + return -1;
4489 + }
4490 +}
4491 +
4492 +/* helper function: update node pointer in operation after insertion
4493 + point was probably shifted into @target. */
4494 +static znode *sync_op(carry_op * op, carry_node * target)
4495 +{
4496 + znode *insertion_node;
4497 +
4498 + /* reget node from coord: shift might move insertion coord to
4499 + the neighbor */
4500 + insertion_node = op->u.insert.d->coord->node;
4501 + /* if insertion point was actually moved into new node,
4502 + update carry node pointer in operation. */
4503 + if (insertion_node != reiser4_carry_real(op->node)) {
4504 + op->node = target;
4505 + assert("nikita-2540",
4506 + reiser4_carry_real(target) == insertion_node);
4507 + }
4508 + assert("nikita-2541",
4509 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4510 + return insertion_node;
4511 +}
4512 +
4513 +/*
4514 + * complete make_space() call: update tracked lock handle if necessary. See
4515 + * comments for fs/reiser4/carry.h:carry_track_type
4516 + */
4517 +static int
4518 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4519 +{
4520 + int result;
4521 + carry_track_type tracking;
4522 + znode *node;
4523 +
4524 + tracking = doing->track_type;
4525 + node = op->u.insert.d->coord->node;
4526 +
4527 + if (tracking == CARRY_TRACK_NODE ||
4528 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4529 + /* inserting or pasting into node different from
4530 + original. Update lock handle supplied by caller. */
4531 + assert("nikita-1417", doing->tracked != NULL);
4532 + done_lh(doing->tracked);
4533 + init_lh(doing->tracked);
4534 + result = longterm_lock_znode(doing->tracked, node,
4535 + ZNODE_WRITE_LOCK,
4536 + ZNODE_LOCK_HIPRI);
4537 + } else
4538 + result = 0;
4539 + return result;
4540 +}
4541 +
4542 +/* This is insertion policy function. It shifts data to the left and right
4543 + neighbors of insertion coord and allocates new nodes until there is enough
4544 + free space to complete @op.
4545 +
4546 + See comments in the body.
4547 +
4548 + Assumes that the node format favors insertions at the right end of the node
4549 + as node40 does.
4550 +
4551 + See carry_flow() on detail about flow insertion
4552 +*/
4553 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4554 + carry_level * doing /* current carry queue */ ,
4555 + carry_level * todo /* carry queue on the parent level */ )
4556 +{
4557 + znode *node;
4558 + int result;
4559 + int not_enough_space;
4560 + int blk_alloc;
4561 + znode *orig_node;
4562 + __u32 flags;
4563 +
4564 + coord_t *coord;
4565 +
4566 + assert("nikita-890", op != NULL);
4567 + assert("nikita-891", todo != NULL);
4568 + assert("nikita-892",
4569 + op->op == COP_INSERT ||
4570 + op->op == COP_PASTE || op->op == COP_EXTENT);
4571 + assert("nikita-1607",
4572 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4573 +
4574 + flags = op->u.insert.flags;
4575 +
4576 + /* NOTE check that new node can only be allocated after checking left
4577 + * and right neighbors. This is necessary for proper work of
4578 + * find_{left,right}_neighbor(). */
4579 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4580 + flags & COPI_DONT_SHIFT_LEFT));
4581 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4582 + flags & COPI_DONT_SHIFT_RIGHT));
4583 +
4584 + coord = op->u.insert.d->coord;
4585 + orig_node = node = coord->node;
4586 +
4587 + assert("nikita-908", node != NULL);
4588 + assert("nikita-909", node_plugin_by_node(node) != NULL);
4589 +
4590 + result = 0;
4591 + /* If there is not enough space in a node, try to shift something to
4592 + the left neighbor. This is a bit tricky, as locking to the left is
4593 + low priority. This is handled by restart logic in carry().
4594 + */
4595 + not_enough_space = free_space_shortage(node, op);
4596 + if (not_enough_space <= 0)
4597 + /* it is possible that carry was called when there actually
4598 + was enough space in the node. For example, when inserting
4599 + leftmost item so that delimiting keys have to be updated.
4600 + */
4601 + return make_space_tail(op, doing, orig_node);
4602 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4603 + carry_node *left;
4604 + /* make note in statistics of an attempt to move
4605 + something into the left neighbor */
4606 + left = find_left_neighbor(op, doing);
4607 + if (unlikely(IS_ERR(left))) {
4608 + if (PTR_ERR(left) == -E_REPEAT)
4609 + return -E_REPEAT;
4610 + else {
4611 + /* some error other than restart request
4612 + occurred. This shouldn't happen. Issue a
4613 + warning and continue as if left neighbor
4614 + weren't existing.
4615 + */
4616 + warning("nikita-924",
4617 + "Error accessing left neighbor: %li",
4618 + PTR_ERR(left));
4619 + }
4620 + } else if (left != NULL) {
4621 +
4622 + /* shift everything possible on the left of and
4623 + including insertion coord into the left neighbor */
4624 + result = carry_shift_data(LEFT_SIDE, coord,
4625 + reiser4_carry_real(left),
4626 + doing, todo,
4627 + flags & COPI_GO_LEFT);
4628 +
4629 + /* reget node from coord: shift_left() might move
4630 + insertion coord to the left neighbor */
4631 + node = sync_op(op, left);
4632 +
4633 + not_enough_space = free_space_shortage(node, op);
4634 + /* There is not enough free space in @node, but
4635 + may be, there is enough free space in
4636 + @left. Various balancing decisions are valid here.
4637 + The same for the shifiting to the right.
4638 + */
4639 + }
4640 + }
4641 + /* If there still is not enough space, shift to the right */
4642 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4643 + carry_node *right;
4644 +
4645 + right = find_right_neighbor(op, doing);
4646 + if (IS_ERR(right)) {
4647 + warning("nikita-1065",
4648 + "Error accessing right neighbor: %li",
4649 + PTR_ERR(right));
4650 + } else if (right != NULL) {
4651 + /* node containing insertion point, and its right
4652 + neighbor node are write locked by now.
4653 +
4654 + shift everything possible on the right of but
4655 + excluding insertion coord into the right neighbor
4656 + */
4657 + result = carry_shift_data(RIGHT_SIDE, coord,
4658 + reiser4_carry_real(right),
4659 + doing, todo,
4660 + flags & COPI_GO_RIGHT);
4661 + /* reget node from coord: shift_right() might move
4662 + insertion coord to the right neighbor */
4663 + node = sync_op(op, right);
4664 + not_enough_space = free_space_shortage(node, op);
4665 + }
4666 + }
4667 + /* If there is still not enough space, allocate new node(s).
4668 +
4669 + We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4670 + the carry operation flags (currently this is needed during flush
4671 + only).
4672 + */
4673 + for (blk_alloc = 0;
4674 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4675 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4676 + carry_node *fresh; /* new node we are allocating */
4677 + coord_t coord_shadow; /* remembered insertion point before
4678 + * shifting data into new node */
4679 + carry_node *node_shadow; /* remembered insertion node before
4680 + * shifting */
4681 + unsigned int gointo; /* whether insertion point should move
4682 + * into newly allocated node */
4683 +
4684 + /* allocate new node on the right of @node. Znode and disk
4685 + fake block number for new node are allocated.
4686 +
4687 + add_new_znode() posts carry operation COP_INSERT with
4688 + COPT_CHILD option to the parent level to add
4689 + pointer to newly created node to its parent.
4690 +
4691 + Subtle point: if several new nodes are required to complete
4692 + insertion operation at this level, they will be inserted
4693 + into their parents in the order of creation, which means
4694 + that @node will be valid "cookie" at the time of insertion.
4695 +
4696 + */
4697 + fresh = add_new_znode(node, op->node, doing, todo);
4698 + if (IS_ERR(fresh))
4699 + return PTR_ERR(fresh);
4700 +
4701 + /* Try to shift into new node. */
4702 + result = lock_carry_node(doing, fresh);
4703 + zput(reiser4_carry_real(fresh));
4704 + if (result != 0) {
4705 + warning("nikita-947",
4706 + "Cannot lock new node: %i", result);
4707 + return result;
4708 + }
4709 +
4710 + /* both nodes are write locked by now.
4711 +
4712 + shift everything possible on the right of and
4713 + including insertion coord into the right neighbor.
4714 + */
4715 + coord_dup(&coord_shadow, op->u.insert.d->coord);
4716 + node_shadow = op->node;
4717 + /* move insertion point into newly created node if:
4718 +
4719 + . insertion point is rightmost in the source node, or
4720 + . this is not the first node we are allocating in a row.
4721 + */
4722 + gointo =
4723 + (blk_alloc > 0) ||
4724 + coord_is_after_rightmost(op->u.insert.d->coord);
4725 +
4726 + if (gointo &&
4727 + op->op == COP_PASTE &&
4728 + coord_is_existing_item(op->u.insert.d->coord) &&
4729 + is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4730 + /* paste into solid (atomic) item, which can contain
4731 + only one unit, so we need to shift it right, where
4732 + insertion point supposed to be */
4733 +
4734 + assert("edward-1444", op->u.insert.d->data->iplug ==
4735 + item_plugin_by_id(STATIC_STAT_DATA_ID));
4736 + assert("edward-1445",
4737 + op->u.insert.d->data->length >
4738 + node_plugin_by_node(coord->node)->free_space
4739 + (coord->node));
4740 +
4741 + op->u.insert.d->coord->between = BEFORE_UNIT;
4742 + }
4743 +
4744 + result = carry_shift_data(RIGHT_SIDE, coord,
4745 + reiser4_carry_real(fresh),
4746 + doing, todo, gointo);
4747 + /* if insertion point was actually moved into new node,
4748 + update carry node pointer in operation. */
4749 + node = sync_op(op, fresh);
4750 + not_enough_space = free_space_shortage(node, op);
4751 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4752 + /* there is not enough free in new node. Shift
4753 + insertion point back to the @shadow_node so that
4754 + next new node would be inserted between
4755 + @shadow_node and @fresh.
4756 + */
4757 + coord_normalize(&coord_shadow);
4758 + coord_dup(coord, &coord_shadow);
4759 + node = coord->node;
4760 + op->node = node_shadow;
4761 + if (1 || (flags & COPI_STEP_BACK)) {
4762 + /* still not enough space?! Maybe there is
4763 + enough space in the source node (i.e., node
4764 + data are moved from) now.
4765 + */
4766 + not_enough_space =
4767 + free_space_shortage(node, op);
4768 + }
4769 + }
4770 + }
4771 + if (not_enough_space > 0) {
4772 + if (!(flags & COPI_DONT_ALLOCATE))
4773 + warning("nikita-948", "Cannot insert new item");
4774 + result = -E_NODE_FULL;
4775 + }
4776 + assert("nikita-1622", ergo(result == 0,
4777 + reiser4_carry_real(op->node) == coord->node));
4778 + assert("nikita-2616", coord == op->u.insert.d->coord);
4779 + if (result == 0)
4780 + result = make_space_tail(op, doing, orig_node);
4781 + return result;
4782 +}
4783 +
4784 +/* insert_paste_common() - common part of insert and paste operations
4785 +
4786 + This function performs common part of COP_INSERT and COP_PASTE.
4787 +
4788 + There are two ways in which insertion/paste can be requested:
4789 +
4790 + . by directly supplying reiser4_item_data. In this case, op ->
4791 + u.insert.type is set to COPT_ITEM_DATA.
4792 +
4793 + . by supplying child pointer to which is to inserted into parent. In this
4794 + case op -> u.insert.type == COPT_CHILD.
4795 +
4796 + . by supplying key of new item/unit. This is currently only used during
4797 + extent insertion
4798 +
4799 + This is required, because when new node is allocated we don't know at what
4800 + position pointer to it is to be stored in the parent. Actually, we don't
4801 + even know what its parent will be, because parent can be re-balanced
4802 + concurrently and new node re-parented, and because parent can be full and
4803 + pointer to the new node will go into some other node.
4804 +
4805 + insert_paste_common() resolves pointer to child node into position in the
4806 + parent by calling find_new_child_coord(), that fills
4807 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
4808 +
4809 + Another complication is with finding free space during pasting. It may
4810 + happen that while shifting items to the neighbors and newly allocated
4811 + nodes, insertion coord can no longer be in the item we wanted to paste
4812 + into. At this point, paste becomes (morphs) into insert. Moreover free
4813 + space analysis has to be repeated, because amount of space required for
4814 + insertion is different from that of paste (item header overhead, etc).
4815 +
4816 + This function "unifies" different insertion modes (by resolving child
4817 + pointer or key into insertion coord), and then calls make_space() to free
4818 + enough space in the node by shifting data to the left and right and by
4819 + allocating new nodes if necessary. Carry operation knows amount of space
4820 + required for its completion. After enough free space is obtained, caller of
4821 + this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4822 + by calling item plugin method.
4823 +
4824 +*/
4825 +static int insert_paste_common(carry_op * op /* carry operation being
4826 + * performed */ ,
4827 + carry_level * doing /* current carry level */ ,
4828 + carry_level * todo /* next carry level */ ,
4829 + carry_insert_data * cdata /* pointer to
4830 + * cdata */ ,
4831 + coord_t * coord /* insertion/paste coord */ ,
4832 + reiser4_item_data * data /* data to be
4833 + * inserted/pasted */ )
4834 +{
4835 + assert("nikita-981", op != NULL);
4836 + assert("nikita-980", todo != NULL);
4837 + assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4838 + || (op->op == COP_EXTENT));
4839 +
4840 + if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4841 + /* nothing to do. Fall through to make_space(). */
4842 + ;
4843 + } else if (op->u.insert.type == COPT_KEY) {
4844 + node_search_result intra_node;
4845 + znode *node;
4846 + /* Problem with doing batching at the lowest level, is that
4847 + operations here are given by coords where modification is
4848 + to be performed, and one modification can invalidate coords
4849 + of all following operations.
4850 +
4851 + So, we are implementing yet another type for operation that
4852 + will use (the only) "locator" stable across shifting of
4853 + data between nodes, etc.: key (COPT_KEY).
4854 +
4855 + This clause resolves key to the coord in the node.
4856 +
4857 + But node can change also. Probably some pieces have to be
4858 + added to the lock_carry_node(), to lock node by its key.
4859 +
4860 + */
4861 + /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4862 + if you need something else. */
4863 + op->u.insert.d->coord = coord;
4864 + node = reiser4_carry_real(op->node);
4865 + intra_node = node_plugin_by_node(node)->lookup
4866 + (node, op->u.insert.d->key, FIND_EXACT,
4867 + op->u.insert.d->coord);
4868 + if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4869 + warning("nikita-1715", "Intra node lookup failure: %i",
4870 + intra_node);
4871 + return intra_node;
4872 + }
4873 + } else if (op->u.insert.type == COPT_CHILD) {
4874 + /* if we are asked to insert pointer to the child into
4875 + internal node, first convert pointer to the child into
4876 + coord within parent node.
4877 + */
4878 + znode *child;
4879 + int result;
4880 +
4881 + op->u.insert.d = cdata;
4882 + op->u.insert.d->coord = coord;
4883 + op->u.insert.d->data = data;
4884 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4885 + result = find_new_child_coord(op);
4886 + child = reiser4_carry_real(op->u.insert.child);
4887 + if (result != NS_NOT_FOUND) {
4888 + warning("nikita-993",
4889 + "Cannot find a place for child pointer: %i",
4890 + result);
4891 + return result;
4892 + }
4893 + /* This only happens when we did multiple insertions at
4894 + the previous level, trying to insert single item and
4895 + it so happened, that insertion of pointers to all new
4896 + nodes before this one already caused parent node to
4897 + split (may be several times).
4898 +
4899 + I am going to come up with better solution.
4900 +
4901 + You are not expected to understand this.
4902 + -- v6root/usr/sys/ken/slp.c
4903 +
4904 + Basically, what happens here is the following: carry came
4905 + to the parent level and is about to insert internal item
4906 + pointing to the child node that it just inserted in the
4907 + level below. Position where internal item is to be inserted
4908 + was found by find_new_child_coord() above, but node of the
4909 + current carry operation (that is, parent node of child
4910 + inserted on the previous level), was determined earlier in
4911 + the lock_carry_level/lock_carry_node. It could so happen
4912 + that other carry operations already performed on the parent
4913 + level already split parent node, so that insertion point
4914 + moved into another node. Handle this by creating new carry
4915 + node for insertion point if necessary.
4916 + */
4917 + if (reiser4_carry_real(op->node) !=
4918 + op->u.insert.d->coord->node) {
4919 + pool_ordering direction;
4920 + znode *z1;
4921 + znode *z2;
4922 + reiser4_key k1;
4923 + reiser4_key k2;
4924 +
4925 + /*
4926 + * determine in what direction insertion point
4927 + * moved. Do this by comparing delimiting keys.
4928 + */
4929 + z1 = op->u.insert.d->coord->node;
4930 + z2 = reiser4_carry_real(op->node);
4931 + if (keyle(leftmost_key_in_node(z1, &k1),
4932 + leftmost_key_in_node(z2, &k2)))
4933 + /* insertion point moved to the left */
4934 + direction = POOLO_BEFORE;
4935 + else
4936 + /* insertion point moved to the right */
4937 + direction = POOLO_AFTER;
4938 +
4939 + op->node = reiser4_add_carry_skip(doing,
4940 + direction, op->node);
4941 + if (IS_ERR(op->node))
4942 + return PTR_ERR(op->node);
4943 + op->node->node = op->u.insert.d->coord->node;
4944 + op->node->free = 1;
4945 + result = lock_carry_node(doing, op->node);
4946 + if (result != 0)
4947 + return result;
4948 + }
4949 +
4950 + /*
4951 + * set up key of an item being inserted: we are inserting
4952 + * internal item and its key is (by the very definition of
4953 + * search tree) is leftmost key in the child node.
4954 + */
4955 + write_lock_dk(znode_get_tree(child));
4956 + op->u.insert.d->key = leftmost_key_in_node(child,
4957 + znode_get_ld_key(child));
4958 + write_unlock_dk(znode_get_tree(child));
4959 + op->u.insert.d->data->arg = op->u.insert.brother;
4960 + } else {
4961 + assert("vs-243", op->u.insert.d->coord != NULL);
4962 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4963 + }
4964 +
4965 + /* find free space. */
4966 + return make_space(op, doing, todo);
4967 +}
4968 +
4969 +/* handle carry COP_INSERT operation.
4970 +
4971 + Insert new item into node. New item can be given in one of two ways:
4972 +
4973 + - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4974 + only applicable at the leaf/twig level.
4975 +
4976 + - by passing a child node pointer to which is to be inserted by this
4977 + operation.
4978 +
4979 +*/
4980 +static int carry_insert(carry_op * op /* operation to perform */ ,
4981 + carry_level * doing /* queue of operations @op
4982 + * is part of */ ,
4983 + carry_level * todo /* queue where new operations
4984 + * are accumulated */ )
4985 +{
4986 + znode *node;
4987 + carry_insert_data cdata;
4988 + coord_t coord;
4989 + reiser4_item_data data;
4990 + carry_plugin_info info;
4991 + int result;
4992 +
4993 + assert("nikita-1036", op != NULL);
4994 + assert("nikita-1037", todo != NULL);
4995 + assert("nikita-1038", op->op == COP_INSERT);
4996 +
4997 + coord_init_zero(&coord);
4998 +
4999 + /* perform common functionality of insert and paste. */
5000 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5001 + if (result != 0)
5002 + return result;
5003 +
5004 + node = op->u.insert.d->coord->node;
5005 + assert("nikita-1039", node != NULL);
5006 + assert("nikita-1040", node_plugin_by_node(node) != NULL);
5007 +
5008 + assert("nikita-949",
5009 + space_needed_for_op(node, op) <= znode_free_space(node));
5010 +
5011 + /* ask node layout to create new item. */
5012 + info.doing = doing;
5013 + info.todo = todo;
5014 + result = node_plugin_by_node(node)->create_item
5015 + (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5016 + &info);
5017 + doing->restartable = 0;
5018 + znode_make_dirty(node);
5019 +
5020 + return result;
5021 +}
5022 +
5023 +/*
5024 + * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
5025 + * supplied with a "flow" (that is, a stream of data) and inserts it into tree
5026 + * by slicing into multiple items.
5027 + */
5028 +
5029 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5030 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5031 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5032 +
5033 +static size_t item_data_overhead(carry_op * op)
5034 +{
5035 + if (flow_insert_data(op)->iplug->b.estimate == NULL)
5036 + return 0;
5037 + return (flow_insert_data(op)->iplug->b.
5038 + estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5039 + flow_insert_data(op)->length);
5040 +}
5041 +
5042 +/* FIXME-VS: this is called several times during one make_flow_for_insertion
5043 + and it will always return the same result. Some optimization could be made
5044 + by calculating this value once at the beginning and passing it around. That
5045 + would reduce some flexibility in future changes
5046 +*/
5047 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5048 +static size_t flow_insertion_overhead(carry_op * op)
5049 +{
5050 + znode *node;
5051 + size_t insertion_overhead;
5052 +
5053 + node = flow_insert_point(op)->node;
5054 + insertion_overhead = 0;
5055 + if (node->nplug->item_overhead &&
5056 + !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5057 + flow_insert_data(op)))
5058 + insertion_overhead =
5059 + node->nplug->item_overhead(node, NULL) +
5060 + item_data_overhead(op);
5061 + return insertion_overhead;
5062 +}
5063 +
5064 +/* how many bytes of flow does fit to the node */
5065 +static int what_can_fit_into_node(carry_op * op)
5066 +{
5067 + size_t free, overhead;
5068 +
5069 + overhead = flow_insertion_overhead(op);
5070 + free = znode_free_space(flow_insert_point(op)->node);
5071 + if (free <= overhead)
5072 + return 0;
5073 + free -= overhead;
5074 + /* FIXME: flow->length is loff_t only to not get overflowed in case of expandign truncate */
5075 + if (free < op->u.insert_flow.flow->length)
5076 + return free;
5077 + return (int)op->u.insert_flow.flow->length;
5078 +}
5079 +
5080 +/* in make_space_for_flow_insertion we need to check either whether whole flow
5081 + fits into a node or whether minimal fraction of flow fits into a node */
5082 +static int enough_space_for_whole_flow(carry_op * op)
5083 +{
5084 + return (unsigned)what_can_fit_into_node(op) ==
5085 + op->u.insert_flow.flow->length;
5086 +}
5087 +
5088 +#define MIN_FLOW_FRACTION 1
5089 +static int enough_space_for_min_flow_fraction(carry_op * op)
5090 +{
5091 + assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5092 +
5093 + return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5094 +}
5095 +
5096 +/* this returns 0 if left neighbor was obtained successfully and everything
5097 + upto insertion point including it were shifted and left neighbor still has
5098 + some free space to put minimal fraction of flow into it */
5099 +static int
5100 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5101 +{
5102 + carry_node *left;
5103 + znode *orig;
5104 +
5105 + left = find_left_neighbor(op, doing);
5106 + if (unlikely(IS_ERR(left))) {
5107 + warning("vs-899",
5108 + "make_space_by_shift_left: "
5109 + "error accessing left neighbor: %li", PTR_ERR(left));
5110 + return 1;
5111 + }
5112 + if (left == NULL)
5113 + /* left neighbor either does not exist or is unformatted
5114 + node */
5115 + return 1;
5116 +
5117 + orig = flow_insert_point(op)->node;
5118 + /* try to shift content of node @orig from its head upto insert point
5119 + including insertion point into the left neighbor */
5120 + carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5121 + reiser4_carry_real(left), doing, todo,
5122 + 1 /* including insert point */);
5123 + if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5124 + /* insertion point did not move */
5125 + return 1;
5126 + }
5127 +
5128 + /* insertion point is set after last item in the node */
5129 + assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5130 +
5131 + if (!enough_space_for_min_flow_fraction(op)) {
5132 + /* insertion point node does not have enough free space to put
5133 + even minimal portion of flow into it, therefore, move
5134 + insertion point back to orig node (before first item) */
5135 + coord_init_before_first_item(flow_insert_point(op), orig);
5136 + return 1;
5137 + }
5138 +
5139 + /* part of flow is to be written to the end of node */
5140 + op->node = left;
5141 + return 0;
5142 +}
5143 +
5144 +/* this returns 0 if right neighbor was obtained successfully and everything to
5145 + the right of insertion point was shifted to it and node got enough free
5146 + space to put minimal fraction of flow into it */
5147 +static int
5148 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5149 + carry_level * todo)
5150 +{
5151 + carry_node *right;
5152 +
5153 + right = find_right_neighbor(op, doing);
5154 + if (unlikely(IS_ERR(right))) {
5155 + warning("nikita-1065", "shift_right_excluding_insert_point: "
5156 + "error accessing right neighbor: %li", PTR_ERR(right));
5157 + return 1;
5158 + }
5159 + if (right) {
5160 + /* shift everything possible on the right of but excluding
5161 + insertion coord into the right neighbor */
5162 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5163 + reiser4_carry_real(right), doing, todo,
5164 + 0 /* not including insert point */);
5165 + } else {
5166 + /* right neighbor either does not exist or is unformatted
5167 + node */
5168 + ;
5169 + }
5170 + if (coord_is_after_rightmost(flow_insert_point(op))) {
5171 + if (enough_space_for_min_flow_fraction(op)) {
5172 + /* part of flow is to be written to the end of node */
5173 + return 0;
5174 + }
5175 + }
5176 +
5177 + /* new node is to be added if insert point node did not get enough
5178 + space for whole flow */
5179 + return 1;
5180 +}
5181 +
5182 +/* this returns 0 when insert coord is set at the node end and fraction of flow
5183 + fits into that node */
5184 +static int
5185 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5186 +{
5187 + int result;
5188 + znode *node;
5189 + carry_node *new;
5190 +
5191 + node = flow_insert_point(op)->node;
5192 +
5193 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5194 + return RETERR(-E_NODE_FULL);
5195 + /* add new node after insert point node */
5196 + new = add_new_znode(node, op->node, doing, todo);
5197 + if (unlikely(IS_ERR(new))) {
5198 + return PTR_ERR(new);
5199 + }
5200 + result = lock_carry_node(doing, new);
5201 + zput(reiser4_carry_real(new));
5202 + if (unlikely(result)) {
5203 + return result;
5204 + }
5205 + op->u.insert_flow.new_nodes++;
5206 + if (!coord_is_after_rightmost(flow_insert_point(op))) {
5207 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5208 + reiser4_carry_real(new), doing, todo,
5209 + 0 /* not including insert point */);
5210 + assert("vs-901",
5211 + coord_is_after_rightmost(flow_insert_point(op)));
5212 +
5213 + if (enough_space_for_min_flow_fraction(op)) {
5214 + return 0;
5215 + }
5216 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5217 + return RETERR(-E_NODE_FULL);
5218 +
5219 + /* add one more new node */
5220 + new = add_new_znode(node, op->node, doing, todo);
5221 + if (unlikely(IS_ERR(new))) {
5222 + return PTR_ERR(new);
5223 + }
5224 + result = lock_carry_node(doing, new);
5225 + zput(reiser4_carry_real(new));
5226 + if (unlikely(result)) {
5227 + return result;
5228 + }
5229 + op->u.insert_flow.new_nodes++;
5230 + }
5231 +
5232 + /* move insertion point to new node */
5233 + coord_init_before_first_item(flow_insert_point(op),
5234 + reiser4_carry_real(new));
5235 + op->node = new;
5236 + return 0;
5237 +}
5238 +
5239 +static int
5240 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5241 + carry_level * todo)
5242 +{
5243 + __u32 flags = op->u.insert_flow.flags;
5244 +
5245 + if (enough_space_for_whole_flow(op)) {
5246 + /* whole flow fits into insert point node */
5247 + return 0;
5248 + }
5249 +
5250 + if (!(flags & COPI_DONT_SHIFT_LEFT)
5251 + && (make_space_by_shift_left(op, doing, todo) == 0)) {
5252 + /* insert point is shifted to left neighbor of original insert
5253 + point node and is set after last unit in that node. It has
5254 + enough space to fit at least minimal fraction of flow. */
5255 + return 0;
5256 + }
5257 +
5258 + if (enough_space_for_whole_flow(op)) {
5259 + /* whole flow fits into insert point node */
5260 + return 0;
5261 + }
5262 +
5263 + if (!(flags & COPI_DONT_SHIFT_RIGHT)
5264 + && (make_space_by_shift_right(op, doing, todo) == 0)) {
5265 + /* insert point is still set to the same node, but there is
5266 + nothing to the right of insert point. */
5267 + return 0;
5268 + }
5269 +
5270 + if (enough_space_for_whole_flow(op)) {
5271 + /* whole flow fits into insert point node */
5272 + return 0;
5273 + }
5274 +
5275 + return make_space_by_new_nodes(op, doing, todo);
5276 +}
5277 +
5278 +/* implements COP_INSERT_FLOW operation */
5279 +static int
5280 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5281 +{
5282 + int result;
5283 + flow_t *f;
5284 + coord_t *insert_point;
5285 + node_plugin *nplug;
5286 + carry_plugin_info info;
5287 + znode *orig_node;
5288 + lock_handle *orig_lh;
5289 +
5290 + f = op->u.insert_flow.flow;
5291 + result = 0;
5292 +
5293 + /* carry system needs this to work */
5294 + info.doing = doing;
5295 + info.todo = todo;
5296 +
5297 + orig_node = flow_insert_point(op)->node;
5298 + orig_lh = doing->tracked;
5299 +
5300 + while (f->length) {
5301 + result = make_space_for_flow_insertion(op, doing, todo);
5302 + if (result)
5303 + break;
5304 +
5305 + insert_point = flow_insert_point(op);
5306 + nplug = node_plugin_by_node(insert_point->node);
5307 +
5308 + /* compose item data for insertion/pasting */
5309 + flow_insert_data(op)->data = f->data;
5310 + flow_insert_data(op)->length = what_can_fit_into_node(op);
5311 +
5312 + if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5313 + /* insert point is set to item of file we are writing to and we have to append to it */
5314 + assert("vs-903", insert_point->between == AFTER_UNIT);
5315 + nplug->change_item_size(insert_point,
5316 + flow_insert_data(op)->length);
5317 + flow_insert_data(op)->iplug->b.paste(insert_point,
5318 + flow_insert_data
5319 + (op), &info);
5320 + } else {
5321 + /* new item must be inserted */
5322 + pos_in_node_t new_pos;
5323 + flow_insert_data(op)->length += item_data_overhead(op);
5324 +
5325 + /* FIXME-VS: this is because node40_create_item changes
5326 + insert_point for obscure reasons */
5327 + switch (insert_point->between) {
5328 + case AFTER_ITEM:
5329 + new_pos = insert_point->item_pos + 1;
5330 + break;
5331 + case EMPTY_NODE:
5332 + new_pos = 0;
5333 + break;
5334 + case BEFORE_ITEM:
5335 + assert("vs-905", insert_point->item_pos == 0);
5336 + new_pos = 0;
5337 + break;
5338 + default:
5339 + impossible("vs-906",
5340 + "carry_insert_flow: invalid coord");
5341 + new_pos = 0;
5342 + break;
5343 + }
5344 +
5345 + nplug->create_item(insert_point, &f->key,
5346 + flow_insert_data(op), &info);
5347 + coord_set_item_pos(insert_point, new_pos);
5348 + }
5349 + coord_init_after_item_end(insert_point);
5350 + doing->restartable = 0;
5351 + znode_make_dirty(insert_point->node);
5352 +
5353 + move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5354 + }
5355 +
5356 + if (orig_node != flow_insert_point(op)->node) {
5357 + /* move lock to new insert point */
5358 + done_lh(orig_lh);
5359 + init_lh(orig_lh);
5360 + result =
5361 + longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5362 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5363 + }
5364 +
5365 + return result;
5366 +}
5367 +
5368 +/* implements COP_DELETE operation
5369 +
5370 + Remove pointer to @op -> u.delete.child from it's parent.
5371 +
5372 + This function also handles killing of a tree root is last pointer from it
5373 + was removed. This is complicated by our handling of "twig" level: root on
5374 + twig level is never killed.
5375 +
5376 +*/
5377 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5378 + carry_level * doing UNUSED_ARG /* current carry
5379 + * level */ ,
5380 + carry_level * todo /* next carry level */ )
5381 +{
5382 + int result;
5383 + coord_t coord;
5384 + coord_t coord2;
5385 + znode *parent;
5386 + znode *child;
5387 + carry_plugin_info info;
5388 + reiser4_tree *tree;
5389 +
5390 + /*
5391 + * This operation is called to delete internal item pointing to the
5392 + * child node that was removed by carry from the tree on the previous
5393 + * tree level.
5394 + */
5395 +
5396 + assert("nikita-893", op != NULL);
5397 + assert("nikita-894", todo != NULL);
5398 + assert("nikita-895", op->op == COP_DELETE);
5399 +
5400 + coord_init_zero(&coord);
5401 + coord_init_zero(&coord2);
5402 +
5403 + parent = reiser4_carry_real(op->node);
5404 + child = op->u.delete.child ?
5405 + reiser4_carry_real(op->u.delete.child) : op->node->node;
5406 + tree = znode_get_tree(child);
5407 + read_lock_tree(tree);
5408 +
5409 + /*
5410 + * @parent was determined when carry entered parent level
5411 + * (lock_carry_level/lock_carry_node). Since then, actual parent of
5412 + * @child node could change due to other carry operations performed on
5413 + * the parent level. Check for this.
5414 + */
5415 +
5416 + if (znode_parent(child) != parent) {
5417 + /* NOTE-NIKITA add stat counter for this. */
5418 + parent = znode_parent(child);
5419 + assert("nikita-2581", find_carry_node(doing, parent));
5420 + }
5421 + read_unlock_tree(tree);
5422 +
5423 + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5424 +
5425 + /* Twig level horrors: tree should be of height at least 2. So, last
5426 + pointer from the root at twig level is preserved even if child is
5427 + empty. This is ugly, but so it was architectured.
5428 + */
5429 +
5430 + if (znode_is_root(parent) &&
5431 + znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5432 + node_num_items(parent) == 1) {
5433 + /* Delimiting key manipulations. */
5434 + write_lock_dk(tree);
5435 + znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5436 + znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5437 + ZF_SET(child, JNODE_DKSET);
5438 + write_unlock_dk(tree);
5439 +
5440 + /* @child escaped imminent death! */
5441 + ZF_CLR(child, JNODE_HEARD_BANSHEE);
5442 + return 0;
5443 + }
5444 +
5445 + /* convert child pointer to the coord_t */
5446 + result = find_child_ptr(parent, child, &coord);
5447 + if (result != NS_FOUND) {
5448 + warning("nikita-994", "Cannot find child pointer: %i", result);
5449 + print_coord_content("coord", &coord);
5450 + return result;
5451 + }
5452 +
5453 + coord_dup(&coord2, &coord);
5454 + info.doing = doing;
5455 + info.todo = todo;
5456 + {
5457 + /*
5458 + * Actually kill internal item: prepare structure with
5459 + * arguments for ->cut_and_kill() method...
5460 + */
5461 +
5462 + struct carry_kill_data kdata;
5463 + kdata.params.from = &coord;
5464 + kdata.params.to = &coord2;
5465 + kdata.params.from_key = NULL;
5466 + kdata.params.to_key = NULL;
5467 + kdata.params.smallest_removed = NULL;
5468 + kdata.params.truncate = 1;
5469 + kdata.flags = op->u.delete.flags;
5470 + kdata.inode = NULL;
5471 + kdata.left = NULL;
5472 + kdata.right = NULL;
5473 + kdata.buf = NULL;
5474 + /* ... and call it. */
5475 + result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5476 + &info);
5477 + }
5478 + doing->restartable = 0;
5479 +
5480 + /* check whether root should be killed violently */
5481 + if (znode_is_root(parent) &&
5482 + /* don't kill roots at and lower than twig level */
5483 + znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5484 + node_num_items(parent) == 1) {
5485 + result = reiser4_kill_tree_root(coord.node);
5486 + }
5487 +
5488 + return result < 0 ? : 0;
5489 +}
5490 +
5491 +/* implements COP_CUT opration
5492 +
5493 + Cuts part or whole content of node.
5494 +
5495 +*/
5496 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5497 + carry_level * doing /* current carry level */ ,
5498 + carry_level * todo /* next carry level */ )
5499 +{
5500 + int result;
5501 + carry_plugin_info info;
5502 + node_plugin *nplug;
5503 +
5504 + assert("nikita-896", op != NULL);
5505 + assert("nikita-897", todo != NULL);
5506 + assert("nikita-898", op->op == COP_CUT);
5507 +
5508 + info.doing = doing;
5509 + info.todo = todo;
5510 +
5511 + nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5512 + if (op->u.cut_or_kill.is_cut)
5513 + result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5514 + else
5515 + result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5516 +
5517 + doing->restartable = 0;
5518 + return result < 0 ? : 0;
5519 +}
5520 +
5521 +/* helper function for carry_paste(): returns true if @op can be continued as
5522 + paste */
5523 +static int
5524 +can_paste(coord_t * icoord, const reiser4_key * key,
5525 + const reiser4_item_data * data)
5526 +{
5527 + coord_t circa;
5528 + item_plugin *new_iplug;
5529 + item_plugin *old_iplug;
5530 + int result = 0; /* to keep gcc shut */
5531 +
5532 + assert("", icoord->between != AT_UNIT);
5533 +
5534 + /* obviously, one cannot paste when node is empty---there is nothing
5535 + to paste into. */
5536 + if (node_is_empty(icoord->node))
5537 + return 0;
5538 + /* if insertion point is at the middle of the item, then paste */
5539 + if (!coord_is_between_items(icoord))
5540 + return 1;
5541 + coord_dup(&circa, icoord);
5542 + circa.between = AT_UNIT;
5543 +
5544 + old_iplug = item_plugin_by_coord(&circa);
5545 + new_iplug = data->iplug;
5546 +
5547 + /* check whether we can paste to the item @icoord is "at" when we
5548 + ignore ->between field */
5549 + if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5550 + result = 1;
5551 + } else if (icoord->between == BEFORE_UNIT
5552 + || icoord->between == BEFORE_ITEM) {
5553 + /* otherwise, try to glue to the item at the left, if any */
5554 + coord_dup(&circa, icoord);
5555 + if (coord_set_to_left(&circa)) {
5556 + result = 0;
5557 + coord_init_before_item(icoord);
5558 + } else {
5559 + old_iplug = item_plugin_by_coord(&circa);
5560 + result = (old_iplug == new_iplug)
5561 + && item_can_contain_key(icoord, key, data);
5562 + if (result) {
5563 + coord_dup(icoord, &circa);
5564 + icoord->between = AFTER_UNIT;
5565 + }
5566 + }
5567 + } else if (icoord->between == AFTER_UNIT
5568 + || icoord->between == AFTER_ITEM) {
5569 + coord_dup(&circa, icoord);
5570 + /* otherwise, try to glue to the item at the right, if any */
5571 + if (coord_set_to_right(&circa)) {
5572 + result = 0;
5573 + coord_init_after_item(icoord);
5574 + } else {
5575 + int (*cck) (const coord_t *, const reiser4_key *,
5576 + const reiser4_item_data *);
5577 +
5578 + old_iplug = item_plugin_by_coord(&circa);
5579 +
5580 + cck = old_iplug->b.can_contain_key;
5581 + if (cck == NULL)
5582 + /* item doesn't define ->can_contain_key
5583 + method? So it is not expandable. */
5584 + result = 0;
5585 + else {
5586 + result = (old_iplug == new_iplug)
5587 + && cck(&circa /*icoord */ , key, data);
5588 + if (result) {
5589 + coord_dup(icoord, &circa);
5590 + icoord->between = BEFORE_UNIT;
5591 + }
5592 + }
5593 + }
5594 + } else
5595 + impossible("nikita-2513", "Nothing works");
5596 + if (result) {
5597 + if (icoord->between == BEFORE_ITEM) {
5598 + assert("vs-912", icoord->unit_pos == 0);
5599 + icoord->between = BEFORE_UNIT;
5600 + } else if (icoord->between == AFTER_ITEM) {
5601 + coord_init_after_item_end(icoord);
5602 + }
5603 + }
5604 + return result;
5605 +}
5606 +
5607 +/* implements COP_PASTE operation
5608 +
5609 + Paste data into existing item. This is complicated by the fact that after
5610 + we shifted something to the left or right neighbors trying to free some
5611 + space, item we were supposed to paste into can be in different node than
5612 + insertion coord. If so, we are no longer doing paste, but insert. See
5613 + comments in insert_paste_common().
5614 +
5615 +*/
5616 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5617 + carry_level * doing UNUSED_ARG /* current carry
5618 + * level */ ,
5619 + carry_level * todo /* next carry level */ )
5620 +{
5621 + znode *node;
5622 + carry_insert_data cdata;
5623 + coord_t dcoord;
5624 + reiser4_item_data data;
5625 + int result;
5626 + int real_size;
5627 + item_plugin *iplug;
5628 + carry_plugin_info info;
5629 + coord_t *coord;
5630 +
5631 + assert("nikita-982", op != NULL);
5632 + assert("nikita-983", todo != NULL);
5633 + assert("nikita-984", op->op == COP_PASTE);
5634 +
5635 + coord_init_zero(&dcoord);
5636 +
5637 + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5638 + if (result != 0)
5639 + return result;
5640 +
5641 + coord = op->u.insert.d->coord;
5642 +
5643 + /* handle case when op -> u.insert.coord doesn't point to the item
5644 + of required type. restart as insert. */
5645 + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5646 + op->op = COP_INSERT;
5647 + op->u.insert.type = COPT_PASTE_RESTARTED;
5648 + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5649 +
5650 + return result;
5651 + }
5652 +
5653 + node = coord->node;
5654 + iplug = item_plugin_by_coord(coord);
5655 + assert("nikita-992", iplug != NULL);
5656 +
5657 + assert("nikita-985", node != NULL);
5658 + assert("nikita-986", node_plugin_by_node(node) != NULL);
5659 +
5660 + assert("nikita-987",
5661 + space_needed_for_op(node, op) <= znode_free_space(node));
5662 +
5663 + assert("nikita-1286", coord_is_existing_item(coord));
5664 +
5665 + /*
5666 + * if item is expanded as a result of this operation, we should first
5667 + * change item size, than call ->b.paste item method. If item is
5668 + * shrunk, it should be done other way around: first call ->b.paste
5669 + * method, then reduce item size.
5670 + */
5671 +
5672 + real_size = space_needed_for_op(node, op);
5673 + if (real_size > 0)
5674 + node->nplug->change_item_size(coord, real_size);
5675 +
5676 + doing->restartable = 0;
5677 + info.doing = doing;
5678 + info.todo = todo;
5679 +
5680 + result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5681 +
5682 + if (real_size < 0)
5683 + node->nplug->change_item_size(coord, real_size);
5684 +
5685 + /* if we pasted at the beginning of the item, update item's key. */
5686 + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5687 + node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5688 +
5689 + znode_make_dirty(node);
5690 + return result;
5691 +}
5692 +
5693 +/* handle carry COP_EXTENT operation. */
5694 +static int carry_extent(carry_op * op /* operation to perform */ ,
5695 + carry_level * doing /* queue of operations @op
5696 + * is part of */ ,
5697 + carry_level * todo /* queue where new operations
5698 + * are accumulated */ )
5699 +{
5700 + znode *node;
5701 + carry_insert_data cdata;
5702 + coord_t coord;
5703 + reiser4_item_data data;
5704 + carry_op *delete_dummy;
5705 + carry_op *insert_extent;
5706 + int result;
5707 + carry_plugin_info info;
5708 +
5709 + assert("nikita-1751", op != NULL);
5710 + assert("nikita-1752", todo != NULL);
5711 + assert("nikita-1753", op->op == COP_EXTENT);
5712 +
5713 + /* extent insertion overview:
5714 +
5715 + extents live on the TWIG LEVEL, which is level one above the leaf
5716 + one. This complicates extent insertion logic somewhat: it may
5717 + happen (and going to happen all the time) that in logical key
5718 + ordering extent has to be placed between items I1 and I2, located
5719 + at the leaf level, but I1 and I2 are in the same formatted leaf
5720 + node N1. To insert extent one has to
5721 +
5722 + (1) reach node N1 and shift data between N1, its neighbors and
5723 + possibly newly allocated nodes until I1 and I2 fall into different
5724 + nodes. Since I1 and I2 are still neighboring items in logical key
5725 + order, they will be necessary utmost items in their respective
5726 + nodes.
5727 +
5728 + (2) After this new extent item is inserted into node on the twig
5729 + level.
5730 +
5731 + Fortunately this process can reuse almost all code from standard
5732 + insertion procedure (viz. make_space() and insert_paste_common()),
5733 + due to the following observation: make_space() only shifts data up
5734 + to and excluding or including insertion point. It never
5735 + "over-moves" through insertion point. Thus, one can use
5736 + make_space() to perform step (1). All required for this is just to
5737 + instruct free_space_shortage() to keep make_space() shifting data
5738 + until insertion point is at the node border.
5739 +
5740 + */
5741 +
5742 + /* perform common functionality of insert and paste. */
5743 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5744 + if (result != 0)
5745 + return result;
5746 +
5747 + node = op->u.extent.d->coord->node;
5748 + assert("nikita-1754", node != NULL);
5749 + assert("nikita-1755", node_plugin_by_node(node) != NULL);
5750 + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5751 +
5752 + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5753 + extent fits between items. */
5754 +
5755 + info.doing = doing;
5756 + info.todo = todo;
5757 +
5758 + /* there is another complication due to placement of extents on the
5759 + twig level: extents are "rigid" in the sense that key-range
5760 + occupied by extent cannot grow indefinitely to the right as it is
5761 + for the formatted leaf nodes. Because of this when search finds two
5762 + adjacent extents on the twig level, it has to "drill" to the leaf
5763 + level, creating new node. Here we are removing this node.
5764 + */
5765 + if (node_is_empty(node)) {
5766 + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5767 + if (IS_ERR(delete_dummy))
5768 + return PTR_ERR(delete_dummy);
5769 + delete_dummy->u.delete.child = NULL;
5770 + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5771 + ZF_SET(node, JNODE_HEARD_BANSHEE);
5772 + }
5773 +
5774 + /* proceed with inserting extent item into parent. We are definitely
5775 + inserting rather than pasting if we get that far. */
5776 + insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5777 + if (IS_ERR(insert_extent))
5778 + /* @delete_dummy will be automatically destroyed on the level
5779 + exiting */
5780 + return PTR_ERR(insert_extent);
5781 + /* NOTE-NIKITA insertion by key is simplest option here. Another
5782 + possibility is to insert on the left or right of already existing
5783 + item.
5784 + */
5785 + insert_extent->u.insert.type = COPT_KEY;
5786 + insert_extent->u.insert.d = op->u.extent.d;
5787 + assert("nikita-1719", op->u.extent.d->key != NULL);
5788 + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5789 + insert_extent->u.insert.flags =
5790 + znode_get_tree(node)->carry.new_extent_flags;
5791 +
5792 + /*
5793 + * if carry was asked to track lock handle we should actually track
5794 + * lock handle on the twig node rather than on the leaf where
5795 + * operation was started from. Transfer tracked lock handle.
5796 + */
5797 + if (doing->track_type) {
5798 + assert("nikita-3242", doing->tracked != NULL);
5799 + assert("nikita-3244", todo->tracked == NULL);
5800 + todo->tracked = doing->tracked;
5801 + todo->track_type = CARRY_TRACK_NODE;
5802 + doing->tracked = NULL;
5803 + doing->track_type = 0;
5804 + }
5805 +
5806 + return 0;
5807 +}
5808 +
5809 +/* update key in @parent between pointers to @left and @right.
5810 +
5811 + Find coords of @left and @right and update delimiting key between them.
5812 + This is helper function called by carry_update(). Finds position of
5813 + internal item involved. Updates item key. Updates delimiting keys of child
5814 + nodes involved.
5815 +*/
5816 +static int update_delimiting_key(znode * parent /* node key is updated
5817 + * in */ ,
5818 + znode * left /* child of @parent */ ,
5819 + znode * right /* child of @parent */ ,
5820 + carry_level * doing /* current carry
5821 + * level */ ,
5822 + carry_level * todo /* parent carry
5823 + * level */ ,
5824 + const char **error_msg /* place to
5825 + * store error
5826 + * message */ )
5827 +{
5828 + coord_t left_pos;
5829 + coord_t right_pos;
5830 + int result;
5831 + reiser4_key ldkey;
5832 + carry_plugin_info info;
5833 +
5834 + assert("nikita-1177", right != NULL);
5835 + /* find position of right left child in a parent */
5836 + result = find_child_ptr(parent, right, &right_pos);
5837 + if (result != NS_FOUND) {
5838 + *error_msg = "Cannot find position of right child";
5839 + return result;
5840 + }
5841 +
5842 + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5843 + /* find position of the left child in a parent */
5844 + result = find_child_ptr(parent, left, &left_pos);
5845 + if (result != NS_FOUND) {
5846 + *error_msg = "Cannot find position of left child";
5847 + return result;
5848 + }
5849 + assert("nikita-1355", left_pos.node != NULL);
5850 + } else
5851 + left_pos.node = NULL;
5852 +
5853 + /* check that they are separated by exactly one key and are basically
5854 + sane */
5855 + if (REISER4_DEBUG) {
5856 + if ((left_pos.node != NULL)
5857 + && !coord_is_existing_unit(&left_pos)) {
5858 + *error_msg = "Left child is bastard";
5859 + return RETERR(-EIO);
5860 + }
5861 + if (!coord_is_existing_unit(&right_pos)) {
5862 + *error_msg = "Right child is bastard";
5863 + return RETERR(-EIO);
5864 + }
5865 + if (left_pos.node != NULL &&
5866 + !coord_are_neighbors(&left_pos, &right_pos)) {
5867 + *error_msg = "Children are not direct siblings";
5868 + return RETERR(-EIO);
5869 + }
5870 + }
5871 + *error_msg = NULL;
5872 +
5873 + info.doing = doing;
5874 + info.todo = todo;
5875 +
5876 + /*
5877 + * If child node is not empty, new key of internal item is a key of
5878 + * leftmost item in the child node. If the child is empty, take its
5879 + * right delimiting key as a new key of the internal item. Precise key
5880 + * in the latter case is not important per se, because the child (and
5881 + * the internal item) are going to be killed shortly anyway, but we
5882 + * have to preserve correct order of keys in the parent node.
5883 + */
5884 +
5885 + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5886 + leftmost_key_in_node(right, &ldkey);
5887 + else {
5888 + read_lock_dk(znode_get_tree(parent));
5889 + ldkey = *znode_get_rd_key(right);
5890 + read_unlock_dk(znode_get_tree(parent));
5891 + }
5892 + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5893 + doing->restartable = 0;
5894 + znode_make_dirty(parent);
5895 + return 0;
5896 +}
5897 +
5898 +/* implements COP_UPDATE opration
5899 +
5900 + Update delimiting keys.
5901 +
5902 +*/
5903 +static int carry_update(carry_op * op /* operation to be performed */ ,
5904 + carry_level * doing /* current carry level */ ,
5905 + carry_level * todo /* next carry level */ )
5906 +{
5907 + int result;
5908 + carry_node *missing UNUSED_ARG;
5909 + znode *left;
5910 + znode *right;
5911 + carry_node *lchild;
5912 + carry_node *rchild;
5913 + const char *error_msg;
5914 + reiser4_tree *tree;
5915 +
5916 + /*
5917 + * This operation is called to update key of internal item. This is
5918 + * necessary when carry shifted of cut data on the child
5919 + * level. Arguments of this operation are:
5920 + *
5921 + * @right --- child node. Operation should update key of internal
5922 + * item pointing to @right.
5923 + *
5924 + * @left --- left neighbor of @right. This parameter is optional.
5925 + */
5926 +
5927 + assert("nikita-902", op != NULL);
5928 + assert("nikita-903", todo != NULL);
5929 + assert("nikita-904", op->op == COP_UPDATE);
5930 +
5931 + lchild = op->u.update.left;
5932 + rchild = op->node;
5933 +
5934 + if (lchild != NULL) {
5935 + assert("nikita-1001", lchild->parent);
5936 + assert("nikita-1003", !lchild->left);
5937 + left = reiser4_carry_real(lchild);
5938 + } else
5939 + left = NULL;
5940 +
5941 + tree = znode_get_tree(rchild->node);
5942 + read_lock_tree(tree);
5943 + right = znode_parent(rchild->node);
5944 + read_unlock_tree(tree);
5945 +
5946 + if (right != NULL) {
5947 + result = update_delimiting_key(right,
5948 + lchild ? lchild->node : NULL,
5949 + rchild->node,
5950 + doing, todo, &error_msg);
5951 + } else {
5952 + error_msg = "Cannot find node to update key in";
5953 + result = RETERR(-EIO);
5954 + }
5955 + /* operation will be reposted to the next level by the
5956 + ->update_item_key() method of node plugin, if necessary. */
5957 +
5958 + if (result != 0) {
5959 + warning("nikita-999", "Error updating delimiting key: %s (%i)",
5960 + error_msg ? : "", result);
5961 + }
5962 + return result;
5963 +}
5964 +
5965 +/* move items from @node during carry */
5966 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
5967 + coord_t * insert_coord /* coord where new item
5968 + * is to be inserted */ ,
5969 + znode * node /* node which data are moved from */ ,
5970 + carry_level * doing /* active carry queue */ ,
5971 + carry_level * todo /* carry queue where new
5972 + * operations are to be put
5973 + * in */ ,
5974 + unsigned int including_insert_coord_p /* true if
5975 + * @insertion_coord
5976 + * can be moved */ )
5977 +{
5978 + int result;
5979 + znode *source;
5980 + carry_plugin_info info;
5981 + node_plugin *nplug;
5982 +
5983 + source = insert_coord->node;
5984 +
5985 + info.doing = doing;
5986 + info.todo = todo;
5987 +
5988 + nplug = node_plugin_by_node(node);
5989 + result = nplug->shift(insert_coord, node,
5990 + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5991 + (int)including_insert_coord_p, &info);
5992 + /* the only error ->shift() method of node plugin can return is
5993 + -ENOMEM due to carry node/operation allocation. */
5994 + assert("nikita-915", result >= 0 || result == -ENOMEM);
5995 + if (result > 0) {
5996 + /*
5997 + * if some number of bytes was actually shifted, mark nodes
5998 + * dirty, and carry level as non-restartable.
5999 + */
6000 + doing->restartable = 0;
6001 + znode_make_dirty(source);
6002 + znode_make_dirty(node);
6003 + }
6004 +
6005 + assert("nikita-2077", coord_check(insert_coord));
6006 + return 0;
6007 +}
6008 +
6009 +typedef carry_node *(*carry_iterator) (carry_node * node);
6010 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6011 + carry_iterator iterator);
6012 +
6013 +static carry_node *pool_level_list_prev(carry_node *node)
6014 +{
6015 + return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6016 +}
6017 +
6018 +/* look for the left neighbor of given carry node in a carry queue.
6019 +
6020 + This is used by find_left_neighbor(), but I am not sure that this
6021 + really gives any advantage. More statistics required.
6022 +
6023 +*/
6024 +carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6025 + * of */ ,
6026 + carry_level * level /* level to scan */ )
6027 +{
6028 + return find_dir_carry(node, level,
6029 + (carry_iterator) pool_level_list_prev);
6030 +}
6031 +
6032 +static carry_node *pool_level_list_next(carry_node *node)
6033 +{
6034 + return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6035 +}
6036 +
6037 +/* look for the right neighbor of given carry node in a
6038 + carry queue.
6039 +
6040 + This is used by find_right_neighbor(), but I am not sure that this
6041 + really gives any advantage. More statistics required.
6042 +
6043 +*/
6044 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6045 + * of */ ,
6046 + carry_level * level /* level to scan */ )
6047 +{
6048 + return find_dir_carry(node, level,
6049 + (carry_iterator) pool_level_list_next);
6050 +}
6051 +
6052 +/* look for the left or right neighbor of given carry node in a carry
6053 + queue.
6054 +
6055 + Helper function used by find_{left|right}_carry().
6056 +*/
6057 +static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6058 + * from */ ,
6059 + carry_level * level /* level to scan */ ,
6060 + carry_iterator iterator /* operation to
6061 + * move to the next
6062 + * node */ )
6063 +{
6064 + carry_node *neighbor;
6065 +
6066 + assert("nikita-1059", node != NULL);
6067 + assert("nikita-1060", level != NULL);
6068 +
6069 + /* scan list of carry nodes on this list dir-ward, skipping all
6070 + carry nodes referencing the same znode. */
6071 + neighbor = node;
6072 + while (1) {
6073 + neighbor = iterator(neighbor);
6074 + if (carry_node_end(level, neighbor))
6075 + /* list head is reached */
6076 + return NULL;
6077 + if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6078 + return neighbor;
6079 + }
6080 +}
6081 +
6082 +/*
6083 + * Memory reservation estimation.
6084 + *
6085 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6086 + * takes tree in consistent state (e.g., that search tree invariants hold),
6087 + * and leaves tree consistent after it finishes. This means that when some
6088 + * error occurs carry cannot simply return if there are pending carry
6089 + * operations. Generic solution for this problem is carry-undo either as
6090 + * transaction manager feature (requiring checkpoints and isolation), or
6091 + * through some carry specific mechanism.
6092 + *
6093 + * Our current approach is to panic if carry hits an error while tree is
6094 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6095 + * this "memory reservation" mechanism was added.
6096 + *
6097 + * Memory reservation is implemented by perthread-pages.diff patch from
6098 + * core-patches. Its API is defined in <linux/gfp.h>
6099 + *
6100 + * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6101 + * void perthread_pages_release(int nrpages);
6102 + * int perthread_pages_count(void);
6103 + *
6104 + * carry estimates its worst case memory requirements at the entry, reserved
6105 + * enough memory, and released unused pages before returning.
6106 + *
6107 + * Code below estimates worst case memory requirements for a given carry
6108 + * queue. This is dome by summing worst case memory requirements for each
6109 + * operation in the queue.
6110 + *
6111 + */
6112 +
6113 +/*
6114 + * Memory memory requirements of many operations depends on the tree
6115 + * height. For example, item insertion requires new node to be inserted at
6116 + * each tree level in the worst case. What tree height should be used for
6117 + * estimation? Current tree height is wrong, because tree height can change
6118 + * between the time when estimation was done and the time when operation is
6119 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6120 + * is also not desirable, because it would lead to the huge over-estimation
6121 + * all the time. Plausible solution is "capped tree height": if current tree
6122 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6123 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6124 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6125 + * to be increased even more during short interval of time.
6126 + */
6127 +#define TREE_HEIGHT_CAP (5)
6128 +
6129 +/* return capped tree height for the @tree. See comment above. */
6130 +static int cap_tree_height(reiser4_tree * tree)
6131 +{
6132 + return max_t(int, tree->height, TREE_HEIGHT_CAP);
6133 +}
6134 +
6135 +/* return capped tree height for the current tree. */
6136 +static int capped_height(void)
6137 +{
6138 + return cap_tree_height(current_tree);
6139 +}
6140 +
6141 +/* return number of pages required to store given number of bytes */
6142 +static int bytes_to_pages(int bytes)
6143 +{
6144 + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6145 +}
6146 +
6147 +/* how many pages are required to allocate znodes during item insertion. */
6148 +static int carry_estimate_znodes(void)
6149 +{
6150 + /*
6151 + * Note, that there we have some problem here: there is no way to
6152 + * reserve pages specifically for the given slab. This means that
6153 + * these pages can be hijacked for some other end.
6154 + */
6155 +
6156 + /* in the worst case we need 3 new znode on each tree level */
6157 + return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6158 +}
6159 +
6160 +/*
6161 + * how many pages are required to load bitmaps. One bitmap per level.
6162 + */
6163 +static int carry_estimate_bitmaps(void)
6164 +{
6165 + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6166 + int bytes;
6167 +
6168 + bytes = capped_height() * (0 + /* bnode should be added, but its is private to
6169 + * bitmap.c, skip for now. */
6170 + 2 * sizeof(jnode)); /* working and commit jnodes */
6171 + return bytes_to_pages(bytes) + 2; /* and their contents */
6172 + } else
6173 + /* bitmaps were pre-loaded during mount */
6174 + return 0;
6175 +}
6176 +
6177 +/* worst case item insertion memory requirements */
6178 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6179 +{
6180 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6181 + capped_height() + /* new block on each level */
6182 + 1 + /* and possibly extra new block at the leaf level */
6183 + 3; /* loading of leaves into memory */
6184 +}
6185 +
6186 +/* worst case item deletion memory requirements */
6187 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6188 +{
6189 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6190 + 3; /* loading of leaves into memory */
6191 +}
6192 +
6193 +/* worst case tree cut memory requirements */
6194 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6195 +{
6196 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6197 + 3; /* loading of leaves into memory */
6198 +}
6199 +
6200 +/* worst case memory requirements of pasting into item */
6201 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6202 +{
6203 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6204 + capped_height() + /* new block on each level */
6205 + 1 + /* and possibly extra new block at the leaf level */
6206 + 3; /* loading of leaves into memory */
6207 +}
6208 +
6209 +/* worst case memory requirements of extent insertion */
6210 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6211 +{
6212 + return carry_estimate_insert(op, level) + /* insert extent */
6213 + carry_estimate_delete(op, level); /* kill leaf */
6214 +}
6215 +
6216 +/* worst case memory requirements of key update */
6217 +static int carry_estimate_update(carry_op * op, carry_level * level)
6218 +{
6219 + return 0;
6220 +}
6221 +
6222 +/* worst case memory requirements of flow insertion */
6223 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6224 +{
6225 + int newnodes;
6226 +
6227 + newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6228 + CARRY_FLOW_NEW_NODES_LIMIT);
6229 + /*
6230 + * roughly estimate insert_flow as a sequence of insertions.
6231 + */
6232 + return newnodes * carry_estimate_insert(op, level);
6233 +}
6234 +
6235 +/* This is dispatch table for carry operations. It can be trivially
6236 + abstracted into useful plugin: tunable balancing policy is a good
6237 + thing. */
6238 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6239 + [COP_INSERT] = {
6240 + .handler = carry_insert,
6241 + .estimate = carry_estimate_insert}
6242 + ,
6243 + [COP_DELETE] = {
6244 + .handler = carry_delete,
6245 + .estimate = carry_estimate_delete}
6246 + ,
6247 + [COP_CUT] = {
6248 + .handler = carry_cut,
6249 + .estimate = carry_estimate_cut}
6250 + ,
6251 + [COP_PASTE] = {
6252 + .handler = carry_paste,
6253 + .estimate = carry_estimate_paste}
6254 + ,
6255 + [COP_EXTENT] = {
6256 + .handler = carry_extent,
6257 + .estimate = carry_estimate_extent}
6258 + ,
6259 + [COP_UPDATE] = {
6260 + .handler = carry_update,
6261 + .estimate = carry_estimate_update}
6262 + ,
6263 + [COP_INSERT_FLOW] = {
6264 + .handler = carry_insert_flow,
6265 + .estimate = carry_estimate_insert_flow}
6266 +};
6267 +
6268 +/* Make Linus happy.
6269 + Local variables:
6270 + c-indentation-style: "K&R"
6271 + mode-name: "LC"
6272 + c-basic-offset: 8
6273 + tab-width: 8
6274 + fill-column: 120
6275 + scroll-step: 1
6276 + End:
6277 +*/
6278 diff -urN linux-2.6.23.orig/fs/reiser4/carry_ops.h linux-2.6.23/fs/reiser4/carry_ops.h
6279 --- linux-2.6.23.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6280 +++ linux-2.6.23/fs/reiser4/carry_ops.h 2007-12-04 16:49:30.000000000 +0300
6281 @@ -0,0 +1,42 @@
6282 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6283 +
6284 +/* implementation of carry operations. See carry_ops.c for details. */
6285 +
6286 +#if !defined( __CARRY_OPS_H__ )
6287 +#define __CARRY_OPS_H__
6288 +
6289 +#include "forward.h"
6290 +#include "znode.h"
6291 +#include "carry.h"
6292 +
6293 +/* carry operation handlers */
6294 +typedef struct carry_op_handler {
6295 + /* perform operation */
6296 + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6297 + /* estimate memory requirements for @op */
6298 + int (*estimate) (carry_op * op, carry_level * level);
6299 +} carry_op_handler;
6300 +
6301 +/* This is dispatch table for carry operations. It can be trivially
6302 + abstracted into useful plugin: tunable balancing policy is a good
6303 + thing. */
6304 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6305 +
6306 +unsigned int space_needed(const znode * node, const coord_t * coord,
6307 + const reiser4_item_data * data, int inserting);
6308 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6309 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6310 +
6311 +/* __CARRY_OPS_H__ */
6312 +#endif
6313 +
6314 +/* Make Linus happy.
6315 + Local variables:
6316 + c-indentation-style: "K&R"
6317 + mode-name: "LC"
6318 + c-basic-offset: 8
6319 + tab-width: 8
6320 + fill-column: 120
6321 + scroll-step: 1
6322 + End:
6323 +*/
6324 diff -urN linux-2.6.23.orig/fs/reiser4/context.c linux-2.6.23/fs/reiser4/context.c
6325 --- linux-2.6.23.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6326 +++ linux-2.6.23/fs/reiser4/context.c 2007-12-04 16:49:30.000000000 +0300
6327 @@ -0,0 +1,288 @@
6328 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6329 +
6330 +/* Manipulation of reiser4_context */
6331 +
6332 +/*
6333 + * global context used during system call. Variable of this type is allocated
6334 + * on the stack at the beginning of the reiser4 part of the system call and
6335 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6336 + * passing pointer to current transaction and current lockstack (both in
6337 + * one-to-one mapping with threads) all over the call chain.
6338 + *
6339 + * It's kind of like those global variables the prof used to tell you not to
6340 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6341 + *
6342 + * In some situations it is desirable to have ability to enter reiser4_context
6343 + * more than once for the same thread (nested contexts). For example, there
6344 + * are some functions that can be called either directly from VFS/VM or from
6345 + * already active reiser4 context (->writepage, for example).
6346 + *
6347 + * In such situations "child" context acts like dummy: all activity is
6348 + * actually performed in the top level context, and get_current_context()
6349 + * always returns top level context.
6350 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6351 + * nested any way.
6352 + *
6353 + * Note that there is an important difference between reiser4 uses
6354 + * ->fs_context and the way other file systems use it. Other file systems
6355 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6356 + * (this is why ->fs_context was initially called ->journal_info). This means,
6357 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6358 + * to the file system, they assume that some transaction is already underway,
6359 + * and usually bail out, because starting nested transaction would most likely
6360 + * lead to the deadlock. This gives false positives with reiser4, because we
6361 + * set ->fs_context before starting transaction.
6362 + */
6363 +
6364 +#include "debug.h"
6365 +#include "super.h"
6366 +#include "context.h"
6367 +
6368 +#include <linux/writeback.h> /* balance_dirty_pages() */
6369 +#include <linux/hardirq.h>
6370 +
6371 +static void _reiser4_init_context(reiser4_context * context,
6372 + struct super_block *super)
6373 +{
6374 + memset(context, 0, sizeof(*context));
6375 +
6376 + context->super = super;
6377 + context->magic = context_magic;
6378 + context->outer = current->journal_info;
6379 + current->journal_info = (void *)context;
6380 + context->nr_children = 0;
6381 + context->gfp_mask = GFP_KERNEL;
6382 +
6383 + init_lock_stack(&context->stack);
6384 +
6385 + reiser4_txn_begin(context);
6386 +
6387 + /* initialize head of tap list */
6388 + INIT_LIST_HEAD(&context->taps);
6389 +#if REISER4_DEBUG
6390 + context->task = current;
6391 +#endif
6392 + grab_space_enable();
6393 +}
6394 +
6395 +/* initialize context and bind it to the current thread
6396 +
6397 + This function should be called at the beginning of reiser4 part of
6398 + syscall.
6399 +*/
6400 +reiser4_context * reiser4_init_context(struct super_block * super)
6401 +{
6402 + reiser4_context *context;
6403 +
6404 + assert("nikita-2662", !in_interrupt() && !in_irq());
6405 + assert("nikita-3357", super != NULL);
6406 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6407 +
6408 + context = get_current_context_check();
6409 + if (context && context->super == super) {
6410 + context = (reiser4_context *) current->journal_info;
6411 + context->nr_children++;
6412 + return context;
6413 + }
6414 +
6415 + context = kmalloc(sizeof(*context), GFP_KERNEL);
6416 + if (context == NULL)
6417 + return ERR_PTR(RETERR(-ENOMEM));
6418 +
6419 + _reiser4_init_context(context, super);
6420 + return context;
6421 +}
6422 +
6423 +/* this is used in scan_mgr which is called with spinlock held and in
6424 + reiser4_fill_super magic */
6425 +void init_stack_context(reiser4_context *context, struct super_block *super)
6426 +{
6427 + assert("nikita-2662", !in_interrupt() && !in_irq());
6428 + assert("nikita-3357", super != NULL);
6429 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6430 + assert("vs-12", !is_in_reiser4_context());
6431 +
6432 + _reiser4_init_context(context, super);
6433 + context->on_stack = 1;
6434 + return;
6435 +}
6436 +
6437 +/* cast lock stack embedded into reiser4 context up to its container */
6438 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6439 +{
6440 + return container_of(owner, reiser4_context, stack);
6441 +}
6442 +
6443 +/* true if there is already _any_ reiser4 context for the current thread */
6444 +int is_in_reiser4_context(void)
6445 +{
6446 + reiser4_context *ctx;
6447 +
6448 + ctx = current->journal_info;
6449 + return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6450 +}
6451 +
6452 +/*
6453 + * call balance dirty pages for the current context.
6454 + *
6455 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6456 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6457 + * write---this covers vast majority of all dirty traffic), but we cannot do
6458 + * this immediately when formatted node is dirtied, because long term lock is
6459 + * usually held at that time. To work around this, dirtying of formatted node
6460 + * simply increases ->nr_marked_dirty counter in the current reiser4
6461 + * context. When we are about to leave this context,
6462 + * balance_dirty_pages_ratelimited() is called, if necessary.
6463 + *
6464 + * This introduces another problem: sometimes we do not want to run
6465 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6466 + * because some important lock (like ->i_mutex on the parent directory) is
6467 + * held. To achieve this, ->nobalance flag can be set in the current context.
6468 + */
6469 +static void balance_dirty_pages_at(reiser4_context *context)
6470 +{
6471 + reiser4_super_info_data *sbinfo = get_super_private(context->super);
6472 +
6473 + /*
6474 + * call balance_dirty_pages_ratelimited() to process formatted nodes
6475 + * dirtied during this system call. Do that only if we are not in mount
6476 + * and there were nodes dirtied in this context and we are not in
6477 + * writepage (to avoid deadlock) and not in pdflush
6478 + */
6479 + if (sbinfo != NULL && sbinfo->fake != NULL &&
6480 + context->nr_marked_dirty != 0 &&
6481 + !(current->flags & PF_MEMALLOC) &&
6482 + !current_is_pdflush())
6483 + balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6484 +}
6485 +
6486 +/* release resources associated with context.
6487 +
6488 + This function should be called at the end of "session" with reiser4,
6489 + typically just before leaving reiser4 driver back to VFS.
6490 +
6491 + This is good place to put some degugging consistency checks, like that
6492 + thread released all locks and closed transcrash etc.
6493 +
6494 +*/
6495 +static void reiser4_done_context(reiser4_context * context /* context being released */ )
6496 +{
6497 + assert("nikita-860", context != NULL);
6498 + assert("nikita-859", context->magic == context_magic);
6499 + assert("vs-646", (reiser4_context *) current->journal_info == context);
6500 + assert("zam-686", !in_interrupt() && !in_irq());
6501 +
6502 + /* only do anything when leaving top-level reiser4 context. All nested
6503 + * contexts are just dummies. */
6504 + if (context->nr_children == 0) {
6505 + assert("jmacd-673", context->trans == NULL);
6506 + assert("jmacd-1002", lock_stack_isclean(&context->stack));
6507 + assert("nikita-1936", reiser4_no_counters_are_held());
6508 + assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6509 + assert("zam-1004", ergo(get_super_private(context->super),
6510 + get_super_private(context->super)->delete_mutex_owner !=
6511 + current));
6512 +
6513 + /* release all grabbed but as yet unused blocks */
6514 + if (context->grabbed_blocks != 0)
6515 + all_grabbed2free();
6516 +
6517 + /*
6518 + * synchronize against longterm_unlock_znode():
6519 + * wake_up_requestor() wakes up requestors without holding
6520 + * zlock (otherwise they will immediately bump into that lock
6521 + * after wake up on another CPU). To work around (rare)
6522 + * situation where requestor has been woken up asynchronously
6523 + * and managed to run until completion (and destroy its
6524 + * context and lock stack) before wake_up_requestor() called
6525 + * wake_up() on it, wake_up_requestor() synchronize on lock
6526 + * stack spin lock. It has actually been observed that spin
6527 + * lock _was_ locked at this point, because
6528 + * wake_up_requestor() took interrupt.
6529 + */
6530 + spin_lock_stack(&context->stack);
6531 + spin_unlock_stack(&context->stack);
6532 +
6533 + assert("zam-684", context->nr_children == 0);
6534 + /* restore original ->fs_context value */
6535 + current->journal_info = context->outer;
6536 + if (context->on_stack == 0)
6537 + kfree(context);
6538 + } else {
6539 + context->nr_children--;
6540 +#if REISER4_DEBUG
6541 + assert("zam-685", context->nr_children >= 0);
6542 +#endif
6543 + }
6544 +}
6545 +
6546 +/*
6547 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6548 + * transaction. Call done_context() to do context related book-keeping.
6549 + */
6550 +void reiser4_exit_context(reiser4_context * context)
6551 +{
6552 + assert("nikita-3021", reiser4_schedulable());
6553 +
6554 + if (context->nr_children == 0) {
6555 + if (!context->nobalance) {
6556 + reiser4_txn_restart(context);
6557 + balance_dirty_pages_at(context);
6558 + }
6559 +
6560 + /* if filesystem is mounted with -o sync or -o dirsync - commit
6561 + transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6562 + commiting on exit_context when inode semaphore is held and
6563 + to have ktxnmgrd to do commit instead to get better
6564 + concurrent filesystem accesses. But, when one mounts with -o
6565 + sync, he cares more about reliability than about
6566 + performance. So, for now we have this simple mount -o sync
6567 + support. */
6568 + if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6569 + txn_atom *atom;
6570 +
6571 + atom = get_current_atom_locked_nocheck();
6572 + if (atom) {
6573 + atom->flags |= ATOM_FORCE_COMMIT;
6574 + context->trans->flags &= ~TXNH_DONT_COMMIT;
6575 + spin_unlock_atom(atom);
6576 + }
6577 + }
6578 + reiser4_txn_end(context);
6579 + }
6580 + reiser4_done_context(context);
6581 +}
6582 +
6583 +void reiser4_ctx_gfp_mask_set(void)
6584 +{
6585 + reiser4_context *ctx;
6586 +
6587 + ctx = get_current_context();
6588 + if (ctx->entd == 0 &&
6589 + list_empty(&ctx->stack.locks) &&
6590 + ctx->trans->atom == NULL)
6591 + ctx->gfp_mask = GFP_KERNEL;
6592 + else
6593 + ctx->gfp_mask = GFP_NOFS;
6594 +}
6595 +
6596 +void reiser4_ctx_gfp_mask_force (gfp_t mask)
6597 +{
6598 + reiser4_context *ctx;
6599 + ctx = get_current_context();
6600 +
6601 + assert("edward-1454", ctx != NULL);
6602 +
6603 + ctx->gfp_mask = mask;
6604 +}
6605 +
6606 +/*
6607 + * Local variables:
6608 + * c-indentation-style: "K&R"
6609 + * mode-name: "LC"
6610 + * c-basic-offset: 8
6611 + * tab-width: 8
6612 + * fill-column: 120
6613 + * scroll-step: 1
6614 + * End:
6615 + */
6616 diff -urN linux-2.6.23.orig/fs/reiser4/context.h linux-2.6.23/fs/reiser4/context.h
6617 --- linux-2.6.23.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6618 +++ linux-2.6.23/fs/reiser4/context.h 2007-12-04 16:49:30.000000000 +0300
6619 @@ -0,0 +1,228 @@
6620 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6621 + * reiser4/README */
6622 +
6623 +/* Reiser4 context. See context.c for details. */
6624 +
6625 +#if !defined( __REISER4_CONTEXT_H__ )
6626 +#define __REISER4_CONTEXT_H__
6627 +
6628 +#include "forward.h"
6629 +#include "debug.h"
6630 +#include "dformat.h"
6631 +#include "tap.h"
6632 +#include "lock.h"
6633 +
6634 +#include <linux/types.h> /* for __u?? */
6635 +#include <linux/fs.h> /* for struct super_block */
6636 +#include <linux/spinlock.h>
6637 +#include <linux/sched.h> /* for struct task_struct */
6638 +
6639 +/* reiser4 per-thread context */
6640 +struct reiser4_context {
6641 + /* magic constant. For identification of reiser4 contexts. */
6642 + __u32 magic;
6643 +
6644 + /* current lock stack. See lock.[ch]. This is where list of all
6645 + locks taken by current thread is kept. This is also used in
6646 + deadlock detection. */
6647 + lock_stack stack;
6648 +
6649 + /* current transcrash. */
6650 + txn_handle *trans;
6651 + /* transaction handle embedded into reiser4_context. ->trans points
6652 + * here by default. */
6653 + txn_handle trans_in_ctx;
6654 +
6655 + /* super block we are working with. To get the current tree
6656 + use &get_super_private (reiser4_get_current_sb ())->tree. */
6657 + struct super_block *super;
6658 +
6659 + /* parent fs activation */
6660 + struct fs_activation *outer;
6661 +
6662 + /* per-thread grabbed (for further allocation) blocks counter */
6663 + reiser4_block_nr grabbed_blocks;
6664 +
6665 + /* list of taps currently monitored. See tap.c */
6666 + struct list_head taps;
6667 +
6668 + /* grabbing space is enabled */
6669 + unsigned int grab_enabled:1;
6670 + /* should be set when we are write dirty nodes to disk in jnode_flush or
6671 + * reiser4_write_logs() */
6672 + unsigned int writeout_mode:1;
6673 + /* true, if current thread is an ent thread */
6674 + unsigned int entd:1;
6675 + /* true, if balance_dirty_pages() should not be run when leaving this
6676 + * context. This is used to avoid lengthly balance_dirty_pages()
6677 + * operation when holding some important resource, like directory
6678 + * ->i_mutex */
6679 + unsigned int nobalance:1;
6680 +
6681 + /* this bit is used on reiser4_done_context to decide whether context is
6682 + kmalloc-ed and has to be kfree-ed */
6683 + unsigned int on_stack:1;
6684 +
6685 + /* count non-trivial jnode_set_dirty() calls */
6686 + unsigned long nr_marked_dirty;
6687 +
6688 + /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6689 + * reiser4_writepages for each of dirty inodes. Reiser4_writepages
6690 + * captures pages. When number of pages captured in one
6691 + * reiser4_sync_inodes reaches some threshold - some atoms get
6692 + * flushed */
6693 + int nr_captured;
6694 + int nr_children; /* number of child contexts */
6695 +#if REISER4_DEBUG
6696 + /* debugging information about reiser4 locks held by the current
6697 + * thread */
6698 + reiser4_lock_cnt_info locks;
6699 + struct task_struct *task; /* so we can easily find owner of the stack */
6700 +
6701 + /*
6702 + * disk space grabbing debugging support
6703 + */
6704 + /* how many disk blocks were grabbed by the first call to
6705 + * reiser4_grab_space() in this context */
6706 + reiser4_block_nr grabbed_initially;
6707 +
6708 + /* list of all threads doing flush currently */
6709 + struct list_head flushers_link;
6710 + /* information about last error encountered by reiser4 */
6711 + err_site err;
6712 +#endif
6713 + void *vp;
6714 + gfp_t gfp_mask;
6715 +};
6716 +
6717 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6718 +
6719 +/* Debugging helps. */
6720 +#if REISER4_DEBUG
6721 +extern void print_contexts(void);
6722 +#endif
6723 +
6724 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6725 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
6726 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6727 +
6728 +extern reiser4_context *reiser4_init_context(struct super_block *);
6729 +extern void init_stack_context(reiser4_context *, struct super_block *);
6730 +extern void reiser4_exit_context(reiser4_context *);
6731 +
6732 +/* magic constant we store in reiser4_context allocated at the stack. Used to
6733 + catch accesses to staled or uninitialized contexts. */
6734 +#define context_magic ((__u32) 0x4b1b5d0b)
6735 +
6736 +extern int is_in_reiser4_context(void);
6737 +
6738 +/*
6739 + * return reiser4_context for the thread @tsk
6740 + */
6741 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6742 +{
6743 + assert("vs-1682",
6744 + ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6745 + return (reiser4_context *) tsk->journal_info;
6746 +}
6747 +
6748 +/*
6749 + * return reiser4 context of the current thread, or NULL if there is none.
6750 + */
6751 +static inline reiser4_context *get_current_context_check(void)
6752 +{
6753 + if (is_in_reiser4_context())
6754 + return get_context(current);
6755 + else
6756 + return NULL;
6757 +}
6758 +
6759 +static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6760 +
6761 +/* return context associated with current thread */
6762 +static inline reiser4_context *get_current_context(void)
6763 +{
6764 + return get_context(current);
6765 +}
6766 +
6767 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6768 +{
6769 + reiser4_context *ctx;
6770 +
6771 + ctx = get_current_context_check();
6772 + return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6773 +}
6774 +
6775 +void reiser4_ctx_gfp_mask_set(void);
6776 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
6777 +
6778 +/*
6779 + * true if current thread is in the write-out mode. Thread enters write-out
6780 + * mode during jnode_flush and reiser4_write_logs().
6781 + */
6782 +static inline int is_writeout_mode(void)
6783 +{
6784 + return get_current_context()->writeout_mode;
6785 +}
6786 +
6787 +/*
6788 + * enter write-out mode
6789 + */
6790 +static inline void writeout_mode_enable(void)
6791 +{
6792 + assert("zam-941", !get_current_context()->writeout_mode);
6793 + get_current_context()->writeout_mode = 1;
6794 +}
6795 +
6796 +/*
6797 + * leave write-out mode
6798 + */
6799 +static inline void writeout_mode_disable(void)
6800 +{
6801 + assert("zam-942", get_current_context()->writeout_mode);
6802 + get_current_context()->writeout_mode = 0;
6803 +}
6804 +
6805 +static inline void grab_space_enable(void)
6806 +{
6807 + get_current_context()->grab_enabled = 1;
6808 +}
6809 +
6810 +static inline void grab_space_disable(void)
6811 +{
6812 + get_current_context()->grab_enabled = 0;
6813 +}
6814 +
6815 +static inline void grab_space_set_enabled(int enabled)
6816 +{
6817 + get_current_context()->grab_enabled = enabled;
6818 +}
6819 +
6820 +static inline int is_grab_enabled(reiser4_context * ctx)
6821 +{
6822 + return ctx->grab_enabled;
6823 +}
6824 +
6825 +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6826 + * flush would be performed when it is closed. This is necessary when handle
6827 + * has to be closed under some coarse semaphore, like i_mutex of
6828 + * directory. Commit will be performed by ktxnmgrd. */
6829 +static inline void context_set_commit_async(reiser4_context * context)
6830 +{
6831 + context->nobalance = 1;
6832 + context->trans->flags |= TXNH_DONT_COMMIT;
6833 +}
6834 +
6835 +/* __REISER4_CONTEXT_H__ */
6836 +#endif
6837 +
6838 +/* Make Linus happy.
6839 + Local variables:
6840 + c-indentation-style: "K&R"
6841 + mode-name: "LC"
6842 + c-basic-offset: 8
6843 + tab-width: 8
6844 + fill-column: 120
6845 + scroll-step: 1
6846 + End:
6847 +*/
6848 diff -urN linux-2.6.23.orig/fs/reiser4/coord.c linux-2.6.23/fs/reiser4/coord.c
6849 --- linux-2.6.23.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6850 +++ linux-2.6.23/fs/reiser4/coord.c 2007-12-04 16:49:30.000000000 +0300
6851 @@ -0,0 +1,935 @@
6852 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6853 +
6854 +#include "forward.h"
6855 +#include "debug.h"
6856 +#include "dformat.h"
6857 +#include "tree.h"
6858 +#include "plugin/item/item.h"
6859 +#include "znode.h"
6860 +#include "coord.h"
6861 +
6862 +/* Internal constructor. */
6863 +static inline void
6864 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6865 + pos_in_node_t unit_pos, between_enum between)
6866 +{
6867 + coord->node = (znode *) node;
6868 + coord_set_item_pos(coord, item_pos);
6869 + coord->unit_pos = unit_pos;
6870 + coord->between = between;
6871 + ON_DEBUG(coord->plug_v = 0);
6872 + ON_DEBUG(coord->body_v = 0);
6873 +
6874 + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6875 +}
6876 +
6877 +/* after shifting of node content, coord previously set properly may become
6878 + invalid, try to "normalize" it. */
6879 +void coord_normalize(coord_t * coord)
6880 +{
6881 + znode *node;
6882 +
6883 + node = coord->node;
6884 + assert("vs-683", node);
6885 +
6886 + coord_clear_iplug(coord);
6887 +
6888 + if (node_is_empty(node)) {
6889 + coord_init_first_unit(coord, node);
6890 + } else if ((coord->between == AFTER_ITEM)
6891 + || (coord->between == AFTER_UNIT)) {
6892 + return;
6893 + } else if (coord->item_pos == coord_num_items(coord)
6894 + && coord->between == BEFORE_ITEM) {
6895 + coord_dec_item_pos(coord);
6896 + coord->between = AFTER_ITEM;
6897 + } else if (coord->unit_pos == coord_num_units(coord)
6898 + && coord->between == BEFORE_UNIT) {
6899 + coord->unit_pos--;
6900 + coord->between = AFTER_UNIT;
6901 + } else if (coord->item_pos == coord_num_items(coord)
6902 + && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6903 + coord_dec_item_pos(coord);
6904 + coord->unit_pos = 0;
6905 + coord->between = AFTER_ITEM;
6906 + }
6907 +}
6908 +
6909 +/* Copy a coordinate. */
6910 +void coord_dup(coord_t * coord, const coord_t * old_coord)
6911 +{
6912 + assert("jmacd-9800", coord_check(old_coord));
6913 + coord_dup_nocheck(coord, old_coord);
6914 +}
6915 +
6916 +/* Copy a coordinate without check. Useful when old_coord->node is not
6917 + loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6918 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6919 +{
6920 + coord->node = old_coord->node;
6921 + coord_set_item_pos(coord, old_coord->item_pos);
6922 + coord->unit_pos = old_coord->unit_pos;
6923 + coord->between = old_coord->between;
6924 + coord->iplugid = old_coord->iplugid;
6925 + ON_DEBUG(coord->plug_v = old_coord->plug_v);
6926 + ON_DEBUG(coord->body_v = old_coord->body_v);
6927 +}
6928 +
6929 +/* Initialize an invalid coordinate. */
6930 +void coord_init_invalid(coord_t * coord, const znode * node)
6931 +{
6932 + coord_init_values(coord, node, 0, 0, INVALID_COORD);
6933 +}
6934 +
6935 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6936 +{
6937 + coord_init_values(coord, node, 0, 0, AT_UNIT);
6938 +}
6939 +
6940 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
6941 + empty, it is positioned at the EMPTY_NODE. */
6942 +void coord_init_first_unit(coord_t * coord, const znode * node)
6943 +{
6944 + int is_empty = node_is_empty(node);
6945 +
6946 + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6947 +
6948 + assert("jmacd-9801", coord_check(coord));
6949 +}
6950 +
6951 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
6952 + empty, it is positioned at the EMPTY_NODE. */
6953 +void coord_init_last_unit(coord_t * coord, const znode * node)
6954 +{
6955 + int is_empty = node_is_empty(node);
6956 +
6957 + coord_init_values(coord, node,
6958 + (is_empty ? 0 : node_num_items(node) - 1), 0,
6959 + (is_empty ? EMPTY_NODE : AT_UNIT));
6960 + if (!is_empty)
6961 + coord->unit_pos = coord_last_unit_pos(coord);
6962 + assert("jmacd-9802", coord_check(coord));
6963 +}
6964 +
6965 +/* Initialize a coordinate to before the first item. If the node is empty, it is
6966 + positioned at the EMPTY_NODE. */
6967 +void coord_init_before_first_item(coord_t * coord, const znode * node)
6968 +{
6969 + int is_empty = node_is_empty(node);
6970 +
6971 + coord_init_values(coord, node, 0, 0,
6972 + (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6973 +
6974 + assert("jmacd-9803", coord_check(coord));
6975 +}
6976 +
6977 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6978 + at the EMPTY_NODE. */
6979 +void coord_init_after_last_item(coord_t * coord, const znode * node)
6980 +{
6981 + int is_empty = node_is_empty(node);
6982 +
6983 + coord_init_values(coord, node,
6984 + (is_empty ? 0 : node_num_items(node) - 1), 0,
6985 + (is_empty ? EMPTY_NODE : AFTER_ITEM));
6986 +
6987 + assert("jmacd-9804", coord_check(coord));
6988 +}
6989 +
6990 +/* Initialize a coordinate to after last unit in the item. Coord must be set
6991 + already to existing item */
6992 +void coord_init_after_item_end(coord_t * coord)
6993 +{
6994 + coord->between = AFTER_UNIT;
6995 + coord->unit_pos = coord_last_unit_pos(coord);
6996 +}
6997 +
6998 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
6999 +void coord_init_before_item(coord_t * coord)
7000 +{
7001 + coord->unit_pos = 0;
7002 + coord->between = BEFORE_ITEM;
7003 +}
7004 +
7005 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7006 +void coord_init_after_item(coord_t * coord)
7007 +{
7008 + coord->unit_pos = 0;
7009 + coord->between = AFTER_ITEM;
7010 +}
7011 +
7012 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
7013 + it was not clear how actually */
7014 +void coord_init_zero(coord_t * coord)
7015 +{
7016 + memset(coord, 0, sizeof(*coord));
7017 +}
7018 +
7019 +/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
7020 +unsigned coord_num_units(const coord_t * coord)
7021 +{
7022 + assert("jmacd-9806", coord_is_existing_item(coord));
7023 +
7024 + return item_plugin_by_coord(coord)->b.nr_units(coord);
7025 +}
7026 +
7027 +/* Returns true if the coord was initializewd by coord_init_invalid (). */
7028 +/* Audited by: green(2002.06.15) */
7029 +int coord_is_invalid(const coord_t * coord)
7030 +{
7031 + return coord->between == INVALID_COORD;
7032 +}
7033 +
7034 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7035 + an item. It may be placed at, before, or after any unit within the item, whether
7036 + existing or not. */
7037 +int coord_is_existing_item(const coord_t * coord)
7038 +{
7039 + switch (coord->between) {
7040 + case EMPTY_NODE:
7041 + case BEFORE_ITEM:
7042 + case AFTER_ITEM:
7043 + case INVALID_COORD:
7044 + return 0;
7045 +
7046 + case BEFORE_UNIT:
7047 + case AT_UNIT:
7048 + case AFTER_UNIT:
7049 + return coord->item_pos < coord_num_items(coord);
7050 + }
7051 +
7052 + impossible("jmacd-9900", "unreachable coord: %p", coord);
7053 + return 0;
7054 +}
7055 +
7056 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7057 + unit. */
7058 +/* Audited by: green(2002.06.15) */
7059 +int coord_is_existing_unit(const coord_t * coord)
7060 +{
7061 + switch (coord->between) {
7062 + case EMPTY_NODE:
7063 + case BEFORE_UNIT:
7064 + case AFTER_UNIT:
7065 + case BEFORE_ITEM:
7066 + case AFTER_ITEM:
7067 + case INVALID_COORD:
7068 + return 0;
7069 +
7070 + case AT_UNIT:
7071 + return (coord->item_pos < coord_num_items(coord)
7072 + && coord->unit_pos < coord_num_units(coord));
7073 + }
7074 +
7075 + impossible("jmacd-9902", "unreachable");
7076 + return 0;
7077 +}
7078 +
7079 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7080 + true for empty nodes nor coordinates positioned before the first item. */
7081 +/* Audited by: green(2002.06.15) */
7082 +int coord_is_leftmost_unit(const coord_t * coord)
7083 +{
7084 + return (coord->between == AT_UNIT && coord->item_pos == 0
7085 + && coord->unit_pos == 0);
7086 +}
7087 +
7088 +#if REISER4_DEBUG
7089 +/* For assertions only, checks for a valid coordinate. */
7090 +int coord_check(const coord_t * coord)
7091 +{
7092 + if (coord->node == NULL) {
7093 + return 0;
7094 + }
7095 + if (znode_above_root(coord->node))
7096 + return 1;
7097 +
7098 + switch (coord->between) {
7099 + default:
7100 + case INVALID_COORD:
7101 + return 0;
7102 + case EMPTY_NODE:
7103 + if (!node_is_empty(coord->node)) {
7104 + return 0;
7105 + }
7106 + return coord->item_pos == 0 && coord->unit_pos == 0;
7107 +
7108 + case BEFORE_UNIT:
7109 + case AFTER_UNIT:
7110 + if (node_is_empty(coord->node) && (coord->item_pos == 0)
7111 + && (coord->unit_pos == 0))
7112 + return 1;
7113 + case AT_UNIT:
7114 + break;
7115 + case AFTER_ITEM:
7116 + case BEFORE_ITEM:
7117 + /* before/after item should not set unit_pos. */
7118 + if (coord->unit_pos != 0) {
7119 + return 0;
7120 + }
7121 + break;
7122 + }
7123 +
7124 + if (coord->item_pos >= node_num_items(coord->node)) {
7125 + return 0;
7126 + }
7127 +
7128 + /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7129 + between is set either AFTER_ITEM or BEFORE_ITEM */
7130 + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7131 + return 1;
7132 +
7133 + if (coord_is_iplug_set(coord) &&
7134 + coord->unit_pos >
7135 + item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7136 + return 0;
7137 + }
7138 + return 1;
7139 +}
7140 +#endif
7141 +
7142 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7143 + Returns 1 if the new position is does not exist. */
7144 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7145 +{
7146 + /* If the node is invalid, leave it. */
7147 + if (coord->between == INVALID_COORD) {
7148 + return 1;
7149 + }
7150 +
7151 + /* If the node is empty, set it appropriately. */
7152 + if (items == 0) {
7153 + coord->between = EMPTY_NODE;
7154 + coord_set_item_pos(coord, 0);
7155 + coord->unit_pos = 0;
7156 + return 1;
7157 + }
7158 +
7159 + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7160 + if (coord->between == EMPTY_NODE) {
7161 + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7162 + coord_set_item_pos(coord, 0);
7163 + coord->unit_pos = 0;
7164 + return 0;
7165 + }
7166 +
7167 + /* If the item_pos is out-of-range, set it appropriatly. */
7168 + if (coord->item_pos >= items) {
7169 + coord->between = AFTER_ITEM;
7170 + coord_set_item_pos(coord, items - 1);
7171 + coord->unit_pos = 0;
7172 + /* If is_next, return 1 (can't go any further). */
7173 + return is_next;
7174 + }
7175 +
7176 + return 0;
7177 +}
7178 +
7179 +/* Advances the coordinate by one unit to the right. If empty, no change. If
7180 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7181 + existing unit. */
7182 +int coord_next_unit(coord_t * coord)
7183 +{
7184 + unsigned items = coord_num_items(coord);
7185 +
7186 + if (coord_adjust_items(coord, items, 1) == 1) {
7187 + return 1;
7188 + }
7189 +
7190 + switch (coord->between) {
7191 + case BEFORE_UNIT:
7192 + /* Now it is positioned at the same unit. */
7193 + coord->between = AT_UNIT;
7194 + return 0;
7195 +
7196 + case AFTER_UNIT:
7197 + case AT_UNIT:
7198 + /* If it was at or after a unit and there are more units in this item,
7199 + advance to the next one. */
7200 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7201 + coord->unit_pos += 1;
7202 + coord->between = AT_UNIT;
7203 + return 0;
7204 + }
7205 +
7206 + /* Otherwise, it is crossing an item boundary and treated as if it was
7207 + after the current item. */
7208 + coord->between = AFTER_ITEM;
7209 + coord->unit_pos = 0;
7210 + /* FALLTHROUGH */
7211 +
7212 + case AFTER_ITEM:
7213 + /* Check for end-of-node. */
7214 + if (coord->item_pos == items - 1) {
7215 + return 1;
7216 + }
7217 +
7218 + coord_inc_item_pos(coord);
7219 + coord->unit_pos = 0;
7220 + coord->between = AT_UNIT;
7221 + return 0;
7222 +
7223 + case BEFORE_ITEM:
7224 + /* The adjust_items checks ensure that we are valid here. */
7225 + coord->unit_pos = 0;
7226 + coord->between = AT_UNIT;
7227 + return 0;
7228 +
7229 + case INVALID_COORD:
7230 + case EMPTY_NODE:
7231 + /* Handled in coord_adjust_items(). */
7232 + break;
7233 + }
7234 +
7235 + impossible("jmacd-9902", "unreachable");
7236 + return 0;
7237 +}
7238 +
7239 +/* Advances the coordinate by one item to the right. If empty, no change. If
7240 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7241 + an existing item. */
7242 +int coord_next_item(coord_t * coord)
7243 +{
7244 + unsigned items = coord_num_items(coord);
7245 +
7246 + if (coord_adjust_items(coord, items, 1) == 1) {
7247 + return 1;
7248 + }
7249 +
7250 + switch (coord->between) {
7251 + case AFTER_UNIT:
7252 + case AT_UNIT:
7253 + case BEFORE_UNIT:
7254 + case AFTER_ITEM:
7255 + /* Check for end-of-node. */
7256 + if (coord->item_pos == items - 1) {
7257 + coord->between = AFTER_ITEM;
7258 + coord->unit_pos = 0;
7259 + coord_clear_iplug(coord);
7260 + return 1;
7261 + }
7262 +
7263 + /* Anywhere in an item, go to the next one. */
7264 + coord->between = AT_UNIT;
7265 + coord_inc_item_pos(coord);
7266 + coord->unit_pos = 0;
7267 + return 0;
7268 +
7269 + case BEFORE_ITEM:
7270 + /* The out-of-range check ensures that we are valid here. */
7271 + coord->unit_pos = 0;
7272 + coord->between = AT_UNIT;
7273 + return 0;
7274 + case INVALID_COORD:
7275 + case EMPTY_NODE:
7276 + /* Handled in coord_adjust_items(). */
7277 + break;
7278 + }
7279 +
7280 + impossible("jmacd-9903", "unreachable");
7281 + return 0;
7282 +}
7283 +
7284 +/* Advances the coordinate by one unit to the left. If empty, no change. If
7285 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7286 + is an existing unit. */
7287 +int coord_prev_unit(coord_t * coord)
7288 +{
7289 + unsigned items = coord_num_items(coord);
7290 +
7291 + if (coord_adjust_items(coord, items, 0) == 1) {
7292 + return 1;
7293 + }
7294 +
7295 + switch (coord->between) {
7296 + case AT_UNIT:
7297 + case BEFORE_UNIT:
7298 + if (coord->unit_pos > 0) {
7299 + coord->unit_pos -= 1;
7300 + coord->between = AT_UNIT;
7301 + return 0;
7302 + }
7303 +
7304 + if (coord->item_pos == 0) {
7305 + coord->between = BEFORE_ITEM;
7306 + return 1;
7307 + }
7308 +
7309 + coord_dec_item_pos(coord);
7310 + coord->unit_pos = coord_last_unit_pos(coord);
7311 + coord->between = AT_UNIT;
7312 + return 0;
7313 +
7314 + case AFTER_UNIT:
7315 + /* What if unit_pos is out-of-range? */
7316 + assert("jmacd-5442",
7317 + coord->unit_pos <= coord_last_unit_pos(coord));
7318 + coord->between = AT_UNIT;
7319 + return 0;
7320 +
7321 + case BEFORE_ITEM:
7322 + if (coord->item_pos == 0) {
7323 + return 1;
7324 + }
7325 +
7326 + coord_dec_item_pos(coord);
7327 + /* FALLTHROUGH */
7328 +
7329 + case AFTER_ITEM:
7330 + coord->between = AT_UNIT;
7331 + coord->unit_pos = coord_last_unit_pos(coord);
7332 + return 0;
7333 +
7334 + case INVALID_COORD:
7335 + case EMPTY_NODE:
7336 + break;
7337 + }
7338 +
7339 + impossible("jmacd-9904", "unreachable");
7340 + return 0;
7341 +}
7342 +
7343 +/* Advances the coordinate by one item to the left. If empty, no change. If
7344 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7345 + is an existing item. */
7346 +int coord_prev_item(coord_t * coord)
7347 +{
7348 + unsigned items = coord_num_items(coord);
7349 +
7350 + if (coord_adjust_items(coord, items, 0) == 1) {
7351 + return 1;
7352 + }
7353 +
7354 + switch (coord->between) {
7355 + case AT_UNIT:
7356 + case AFTER_UNIT:
7357 + case BEFORE_UNIT:
7358 + case BEFORE_ITEM:
7359 +
7360 + if (coord->item_pos == 0) {
7361 + coord->between = BEFORE_ITEM;
7362 + coord->unit_pos = 0;
7363 + return 1;
7364 + }
7365 +
7366 + coord_dec_item_pos(coord);
7367 + coord->unit_pos = 0;
7368 + coord->between = AT_UNIT;
7369 + return 0;
7370 +
7371 + case AFTER_ITEM:
7372 + coord->between = AT_UNIT;
7373 + coord->unit_pos = 0;
7374 + return 0;
7375 +
7376 + case INVALID_COORD:
7377 + case EMPTY_NODE:
7378 + break;
7379 + }
7380 +
7381 + impossible("jmacd-9905", "unreachable");
7382 + return 0;
7383 +}
7384 +
7385 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7386 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7387 +{
7388 + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7389 + if (dir == LEFT_SIDE) {
7390 + coord_init_first_unit(coord, node);
7391 + } else {
7392 + coord_init_last_unit(coord, node);
7393 + }
7394 +}
7395 +
7396 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7397 + argument. */
7398 +/* Audited by: green(2002.06.15) */
7399 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7400 +{
7401 + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7402 + if (dir == LEFT_SIDE) {
7403 + return coord_is_before_leftmost(coord);
7404 + } else {
7405 + return coord_is_after_rightmost(coord);
7406 + }
7407 +}
7408 +
7409 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7410 +/* Audited by: green(2002.06.15) */
7411 +int coord_sideof_unit(coord_t * coord, sideof dir)
7412 +{
7413 + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7414 + if (dir == LEFT_SIDE) {
7415 + return coord_prev_unit(coord);
7416 + } else {
7417 + return coord_next_unit(coord);
7418 + }
7419 +}
7420 +
7421 +#if REISER4_DEBUG
7422 +int coords_equal(const coord_t * c1, const coord_t * c2)
7423 +{
7424 + assert("nikita-2840", c1 != NULL);
7425 + assert("nikita-2841", c2 != NULL);
7426 +
7427 + return
7428 + c1->node == c2->node &&
7429 + c1->item_pos == c2->item_pos &&
7430 + c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7431 +}
7432 +#endif /* REISER4_DEBUG */
7433 +
7434 +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
7435 + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
7436 +/* Audited by: green(2002.06.15) */
7437 +coord_wrt_node coord_wrt(const coord_t * coord)
7438 +{
7439 + if (coord_is_before_leftmost(coord)) {
7440 + return COORD_ON_THE_LEFT;
7441 + }
7442 +
7443 + if (coord_is_after_rightmost(coord)) {
7444 + return COORD_ON_THE_RIGHT;
7445 + }
7446 +
7447 + return COORD_INSIDE;
7448 +}
7449 +
7450 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7451 + of the last item or it is an empty node. */
7452 +/* Audited by: green(2002.06.15) */
7453 +int coord_is_after_rightmost(const coord_t * coord)
7454 +{
7455 + assert("jmacd-7313", coord_check(coord));
7456 +
7457 + switch (coord->between) {
7458 + case INVALID_COORD:
7459 + case AT_UNIT:
7460 + case BEFORE_UNIT:
7461 + case BEFORE_ITEM:
7462 + return 0;
7463 +
7464 + case EMPTY_NODE:
7465 + return 1;
7466 +
7467 + case AFTER_ITEM:
7468 + return (coord->item_pos == node_num_items(coord->node) - 1);
7469 +
7470 + case AFTER_UNIT:
7471 + return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7472 + coord->unit_pos == coord_last_unit_pos(coord));
7473 + }
7474 +
7475 + impossible("jmacd-9908", "unreachable");
7476 + return 0;
7477 +}
7478 +
7479 +/* Returns true if the coordinate is positioned before the first item or it is an empty
7480 + node. */
7481 +int coord_is_before_leftmost(const coord_t * coord)
7482 +{
7483 + /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7484 + necessary to check if coord is set before leftmost
7485 + assert ("jmacd-7313", coord_check (coord)); */
7486 + switch (coord->between) {
7487 + case INVALID_COORD:
7488 + case AT_UNIT:
7489 + case AFTER_ITEM:
7490 + case AFTER_UNIT:
7491 + return 0;
7492 +
7493 + case EMPTY_NODE:
7494 + return 1;
7495 +
7496 + case BEFORE_ITEM:
7497 + case BEFORE_UNIT:
7498 + return (coord->item_pos == 0) && (coord->unit_pos == 0);
7499 + }
7500 +
7501 + impossible("jmacd-9908", "unreachable");
7502 + return 0;
7503 +}
7504 +
7505 +/* Returns true if the coordinate is positioned after an item, before an item, after the
7506 + last unit of an item, before the first unit of an item, or at an empty node. */
7507 +/* Audited by: green(2002.06.15) */
7508 +int coord_is_between_items(const coord_t * coord)
7509 +{
7510 + assert("jmacd-7313", coord_check(coord));
7511 +
7512 + switch (coord->between) {
7513 + case INVALID_COORD:
7514 + case AT_UNIT:
7515 + return 0;
7516 +
7517 + case AFTER_ITEM:
7518 + case BEFORE_ITEM:
7519 + case EMPTY_NODE:
7520 + return 1;
7521 +
7522 + case BEFORE_UNIT:
7523 + return coord->unit_pos == 0;
7524 +
7525 + case AFTER_UNIT:
7526 + return coord->unit_pos == coord_last_unit_pos(coord);
7527 + }
7528 +
7529 + impossible("jmacd-9908", "unreachable");
7530 + return 0;
7531 +}
7532 +
7533 +#if REISER4_DEBUG
7534 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7535 + before-after or item boundaries. */
7536 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
7537 +{
7538 + coord_t *left;
7539 + coord_t *right;
7540 +
7541 + assert("nikita-1241", c1 != NULL);
7542 + assert("nikita-1242", c2 != NULL);
7543 + assert("nikita-1243", c1->node == c2->node);
7544 + assert("nikita-1244", coord_is_existing_unit(c1));
7545 + assert("nikita-1245", coord_is_existing_unit(c2));
7546 +
7547 + left = right = NULL;
7548 + switch (coord_compare(c1, c2)) {
7549 + case COORD_CMP_ON_LEFT:
7550 + left = c1;
7551 + right = c2;
7552 + break;
7553 + case COORD_CMP_ON_RIGHT:
7554 + left = c2;
7555 + right = c1;
7556 + break;
7557 + case COORD_CMP_SAME:
7558 + return 0;
7559 + default:
7560 + wrong_return_value("nikita-1246", "compare_coords()");
7561 + }
7562 + assert("vs-731", left && right);
7563 + if (left->item_pos == right->item_pos) {
7564 + return left->unit_pos + 1 == right->unit_pos;
7565 + } else if (left->item_pos + 1 == right->item_pos) {
7566 + return (left->unit_pos == coord_last_unit_pos(left))
7567 + && (right->unit_pos == 0);
7568 + } else {
7569 + return 0;
7570 + }
7571 +}
7572 +#endif /* REISER4_DEBUG */
7573 +
7574 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7575 + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7576 +/* Audited by: green(2002.06.15) */
7577 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7578 +{
7579 + assert("vs-209", c1->node == c2->node);
7580 + assert("vs-194", coord_is_existing_unit(c1)
7581 + && coord_is_existing_unit(c2));
7582 +
7583 + if (c1->item_pos > c2->item_pos)
7584 + return COORD_CMP_ON_RIGHT;
7585 + if (c1->item_pos < c2->item_pos)
7586 + return COORD_CMP_ON_LEFT;
7587 + if (c1->unit_pos > c2->unit_pos)
7588 + return COORD_CMP_ON_RIGHT;
7589 + if (c1->unit_pos < c2->unit_pos)
7590 + return COORD_CMP_ON_LEFT;
7591 + return COORD_CMP_SAME;
7592 +}
7593 +
7594 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7595 + non-zero if there is no position to the right. */
7596 +int coord_set_to_right(coord_t * coord)
7597 +{
7598 + unsigned items = coord_num_items(coord);
7599 +
7600 + if (coord_adjust_items(coord, items, 1) == 1) {
7601 + return 1;
7602 + }
7603 +
7604 + switch (coord->between) {
7605 + case AT_UNIT:
7606 + return 0;
7607 +
7608 + case BEFORE_ITEM:
7609 + case BEFORE_UNIT:
7610 + coord->between = AT_UNIT;
7611 + return 0;
7612 +
7613 + case AFTER_UNIT:
7614 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7615 + coord->unit_pos += 1;
7616 + coord->between = AT_UNIT;
7617 + return 0;
7618 + } else {
7619 +
7620 + coord->unit_pos = 0;
7621 +
7622 + if (coord->item_pos == items - 1) {
7623 + coord->between = AFTER_ITEM;
7624 + return 1;
7625 + }
7626 +
7627 + coord_inc_item_pos(coord);
7628 + coord->between = AT_UNIT;
7629 + return 0;
7630 + }
7631 +
7632 + case AFTER_ITEM:
7633 + if (coord->item_pos == items - 1) {
7634 + return 1;
7635 + }
7636 +
7637 + coord_inc_item_pos(coord);
7638 + coord->unit_pos = 0;
7639 + coord->between = AT_UNIT;
7640 + return 0;
7641 +
7642 + case EMPTY_NODE:
7643 + return 1;
7644 +
7645 + case INVALID_COORD:
7646 + break;
7647 + }
7648 +
7649 + impossible("jmacd-9920", "unreachable");
7650 + return 0;
7651 +}
7652 +
7653 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7654 + non-zero if there is no position to the left. */
7655 +int coord_set_to_left(coord_t * coord)
7656 +{
7657 + unsigned items = coord_num_items(coord);
7658 +
7659 + if (coord_adjust_items(coord, items, 0) == 1) {
7660 + return 1;
7661 + }
7662 +
7663 + switch (coord->between) {
7664 + case AT_UNIT:
7665 + return 0;
7666 +
7667 + case AFTER_UNIT:
7668 + coord->between = AT_UNIT;
7669 + return 0;
7670 +
7671 + case AFTER_ITEM:
7672 + coord->between = AT_UNIT;
7673 + coord->unit_pos = coord_last_unit_pos(coord);
7674 + return 0;
7675 +
7676 + case BEFORE_UNIT:
7677 + if (coord->unit_pos > 0) {
7678 + coord->unit_pos -= 1;
7679 + coord->between = AT_UNIT;
7680 + return 0;
7681 + } else {
7682 +
7683 + if (coord->item_pos == 0) {
7684 + coord->between = BEFORE_ITEM;
7685 + return 1;
7686 + }
7687 +
7688 + coord->unit_pos = coord_last_unit_pos(coord);
7689 + coord_dec_item_pos(coord);
7690 + coord->between = AT_UNIT;
7691 + return 0;
7692 + }
7693 +
7694 + case BEFORE_ITEM:
7695 + if (coord->item_pos == 0) {
7696 + return 1;
7697 + }
7698 +
7699 + coord_dec_item_pos(coord);
7700 + coord->unit_pos = coord_last_unit_pos(coord);
7701 + coord->between = AT_UNIT;
7702 + return 0;
7703 +
7704 + case EMPTY_NODE:
7705 + return 1;
7706 +
7707 + case INVALID_COORD:
7708 + break;
7709 + }
7710 +
7711 + impossible("jmacd-9920", "unreachable");
7712 + return 0;
7713 +}
7714 +
7715 +static const char *coord_tween_tostring(between_enum n)
7716 +{
7717 + switch (n) {
7718 + case BEFORE_UNIT:
7719 + return "before unit";
7720 + case BEFORE_ITEM:
7721 + return "before item";
7722 + case AT_UNIT:
7723 + return "at unit";
7724 + case AFTER_UNIT:
7725 + return "after unit";
7726 + case AFTER_ITEM:
7727 + return "after item";
7728 + case EMPTY_NODE:
7729 + return "empty node";
7730 + case INVALID_COORD:
7731 + return "invalid";
7732 + default:
7733 + {
7734 + static char buf[30];
7735 +
7736 + sprintf(buf, "unknown: %i", n);
7737 + return buf;
7738 + }
7739 + }
7740 +}
7741 +
7742 +void print_coord(const char *mes, const coord_t * coord, int node)
7743 +{
7744 + if (coord == NULL) {
7745 + printk("%s: null\n", mes);
7746 + return;
7747 + }
7748 + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7749 + mes, coord->item_pos, coord->unit_pos,
7750 + coord_tween_tostring(coord->between), coord->iplugid);
7751 +}
7752 +
7753 +int
7754 +item_utmost_child_real_block(const coord_t * coord, sideof side,
7755 + reiser4_block_nr * blk)
7756 +{
7757 + return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7758 + side,
7759 + blk);
7760 +}
7761 +
7762 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7763 +{
7764 + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7765 +}
7766 +
7767 +/* @count bytes of flow @f got written, update correspondingly f->length,
7768 + f->data and f->key */
7769 +void move_flow_forward(flow_t * f, unsigned count)
7770 +{
7771 + if (f->data)
7772 + f->data += count;
7773 + f->length -= count;
7774 + set_key_offset(&f->key, get_key_offset(&f->key) + count);
7775 +}
7776 +
7777 +/*
7778 + Local variables:
7779 + c-indentation-style: "K&R"
7780 + mode-name: "LC"
7781 + c-basic-offset: 8
7782 + tab-width: 8
7783 + fill-column: 120
7784 + scroll-step: 1
7785 + End:
7786 +*/
7787 diff -urN linux-2.6.23.orig/fs/reiser4/coord.h linux-2.6.23/fs/reiser4/coord.h
7788 --- linux-2.6.23.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7789 +++ linux-2.6.23/fs/reiser4/coord.h 2007-12-04 16:49:30.000000000 +0300
7790 @@ -0,0 +1,389 @@
7791 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7792 +
7793 +/* Coords */
7794 +
7795 +#if !defined( __REISER4_COORD_H__ )
7796 +#define __REISER4_COORD_H__
7797 +
7798 +#include "forward.h"
7799 +#include "debug.h"
7800 +#include "dformat.h"
7801 +#include "key.h"
7802 +
7803 +/* insertions happen between coords in the tree, so we need some means
7804 + of specifying the sense of betweenness. */
7805 +typedef enum {
7806 + BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
7807 + AT_UNIT,
7808 + AFTER_UNIT,
7809 + BEFORE_ITEM,
7810 + AFTER_ITEM,
7811 + INVALID_COORD,
7812 + EMPTY_NODE,
7813 +} between_enum;
7814 +
7815 +/* location of coord w.r.t. its node */
7816 +typedef enum {
7817 + COORD_ON_THE_LEFT = -1,
7818 + COORD_ON_THE_RIGHT = +1,
7819 + COORD_INSIDE = 0
7820 +} coord_wrt_node;
7821 +
7822 +typedef enum {
7823 + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7824 +} coord_cmp;
7825 +
7826 +struct coord {
7827 + /* node in a tree */
7828 + /* 0 */ znode *node;
7829 +
7830 + /* position of item within node */
7831 + /* 4 */ pos_in_node_t item_pos;
7832 + /* position of unit within item */
7833 + /* 6 */ pos_in_node_t unit_pos;
7834 + /* optimization: plugin of item is stored in coord_t. Until this was
7835 + implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
7836 + is invalidated (set to 0xff) on each modification of ->item_pos,
7837 + and all such modifications are funneled through coord_*_item_pos()
7838 + functions below.
7839 + */
7840 + /* 8 */ char iplugid;
7841 + /* position of coord w.r.t. to neighboring items and/or units.
7842 + Values are taken from &between_enum above.
7843 + */
7844 + /* 9 */ char between;
7845 + /* padding. It will be added by the compiler anyway to conform to the
7846 + * C language alignment requirements. We keep it here to be on the
7847 + * safe side and to have a clear picture of the memory layout of this
7848 + * structure. */
7849 + /* 10 */ __u16 pad;
7850 + /* 12 */ int offset;
7851 +#if REISER4_DEBUG
7852 + unsigned long plug_v;
7853 + unsigned long body_v;
7854 +#endif
7855 +};
7856 +
7857 +#define INVALID_PLUGID ((char)((1 << 8) - 1))
7858 +#define INVALID_OFFSET -1
7859 +
7860 +static inline void coord_clear_iplug(coord_t * coord)
7861 +{
7862 + assert("nikita-2835", coord != NULL);
7863 + coord->iplugid = INVALID_PLUGID;
7864 + coord->offset = INVALID_OFFSET;
7865 +}
7866 +
7867 +static inline int coord_is_iplug_set(const coord_t * coord)
7868 +{
7869 + assert("nikita-2836", coord != NULL);
7870 + return coord->iplugid != INVALID_PLUGID;
7871 +}
7872 +
7873 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7874 +{
7875 + assert("nikita-2478", coord != NULL);
7876 + coord->item_pos = pos;
7877 + coord_clear_iplug(coord);
7878 +}
7879 +
7880 +static inline void coord_dec_item_pos(coord_t * coord)
7881 +{
7882 + assert("nikita-2480", coord != NULL);
7883 + --coord->item_pos;
7884 + coord_clear_iplug(coord);
7885 +}
7886 +
7887 +static inline void coord_inc_item_pos(coord_t * coord)
7888 +{
7889 + assert("nikita-2481", coord != NULL);
7890 + ++coord->item_pos;
7891 + coord_clear_iplug(coord);
7892 +}
7893 +
7894 +static inline void coord_add_item_pos(coord_t * coord, int delta)
7895 +{
7896 + assert("nikita-2482", coord != NULL);
7897 + coord->item_pos += delta;
7898 + coord_clear_iplug(coord);
7899 +}
7900 +
7901 +static inline void coord_invalid_item_pos(coord_t * coord)
7902 +{
7903 + assert("nikita-2832", coord != NULL);
7904 + coord->item_pos = (unsigned short)~0;
7905 + coord_clear_iplug(coord);
7906 +}
7907 +
7908 +/* Reverse a direction. */
7909 +static inline sideof sideof_reverse(sideof side)
7910 +{
7911 + return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7912 +}
7913 +
7914 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7915 +
7916 + "first" and "last"
7917 + "next" and "prev"
7918 + "before" and "after"
7919 + "leftmost" and "rightmost"
7920 +
7921 + But I think the chosen names are decent the way they are.
7922 +*/
7923 +
7924 +/* COORD INITIALIZERS */
7925 +
7926 +/* Initialize an invalid coordinate. */
7927 +extern void coord_init_invalid(coord_t * coord, const znode * node);
7928 +
7929 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7930 +
7931 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
7932 + empty, it is positioned at the EMPTY_NODE. */
7933 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
7934 +
7935 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
7936 + empty, it is positioned at the EMPTY_NODE. */
7937 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
7938 +
7939 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7940 + positioned at the EMPTY_NODE. */
7941 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7942 +
7943 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7944 + at the EMPTY_NODE. */
7945 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7946 +
7947 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7948 + already to existing item */
7949 +void coord_init_after_item_end(coord_t * coord);
7950 +
7951 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7952 +void coord_init_before_item(coord_t *);
7953 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7954 +void coord_init_after_item(coord_t *);
7955 +
7956 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7957 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7958 + sideof dir);
7959 +
7960 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
7961 + it was not clear how actually
7962 + FIXME-VS: added by vs (2002, june, 8) */
7963 +extern void coord_init_zero(coord_t * coord);
7964 +
7965 +/* COORD METHODS */
7966 +
7967 +/* after shifting of node content, coord previously set properly may become
7968 + invalid, try to "normalize" it. */
7969 +void coord_normalize(coord_t * coord);
7970 +
7971 +/* Copy a coordinate. */
7972 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7973 +
7974 +/* Copy a coordinate without check. */
7975 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7976 +
7977 +unsigned coord_num_units(const coord_t * coord);
7978 +
7979 +/* Return the last valid unit number at the present item (i.e.,
7980 + coord_num_units() - 1). */
7981 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
7982 +{
7983 + return coord_num_units(coord) - 1;
7984 +}
7985 +
7986 +#if REISER4_DEBUG
7987 +/* For assertions only, checks for a valid coordinate. */
7988 +extern int coord_check(const coord_t * coord);
7989 +
7990 +extern unsigned long znode_times_locked(const znode * z);
7991 +
7992 +static inline void coord_update_v(coord_t * coord)
7993 +{
7994 + coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7995 +}
7996 +#endif
7997 +
7998 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
7999 +
8000 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8001 +
8002 +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
8003 + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
8004 +extern coord_wrt_node coord_wrt(const coord_t * coord);
8005 +
8006 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
8007 + before-after or item boundaries. */
8008 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8009 +
8010 +/* Assuming two coordinates are positioned in the same node, return NCOORD_CMP_ON_RIGHT,
8011 + NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's position relative to c2. */
8012 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8013 +
8014 +/* COORD PREDICATES */
8015 +
8016 +/* Returns true if the coord was initialized by coord_init_invalid (). */
8017 +extern int coord_is_invalid(const coord_t * coord);
8018 +
8019 +/* Returns true if the coordinate is positioned at an existing item, not before or after
8020 + an item. It may be placed at, before, or after any unit within the item, whether
8021 + existing or not. If this is true you can call methods of the item plugin. */
8022 +extern int coord_is_existing_item(const coord_t * coord);
8023 +
8024 +/* Returns true if the coordinate is positioned after an item, before an item, after the
8025 + last unit of an item, before the first unit of an item, or at an empty node. */
8026 +extern int coord_is_between_items(const coord_t * coord);
8027 +
8028 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8029 + unit. */
8030 +extern int coord_is_existing_unit(const coord_t * coord);
8031 +
8032 +/* Returns true if the coordinate is positioned at an empty node. */
8033 +extern int coord_is_empty(const coord_t * coord);
8034 +
8035 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8036 + true for empty nodes nor coordinates positioned before the first item. */
8037 +extern int coord_is_leftmost_unit(const coord_t * coord);
8038 +
8039 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8040 + of the last item or it is an empty node. */
8041 +extern int coord_is_after_rightmost(const coord_t * coord);
8042 +
8043 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8044 + node. */
8045 +extern int coord_is_before_leftmost(const coord_t * coord);
8046 +
8047 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8048 + argument. */
8049 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8050 +
8051 +/* COORD MODIFIERS */
8052 +
8053 +/* Advances the coordinate by one unit to the right. If empty, no change. If
8054 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8055 + an existing unit. */
8056 +extern int coord_next_unit(coord_t * coord);
8057 +
8058 +/* Advances the coordinate by one item to the right. If empty, no change. If
8059 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8060 + an existing item. */
8061 +extern int coord_next_item(coord_t * coord);
8062 +
8063 +/* Advances the coordinate by one unit to the left. If empty, no change. If
8064 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8065 + is an existing unit. */
8066 +extern int coord_prev_unit(coord_t * coord);
8067 +
8068 +/* Advances the coordinate by one item to the left. If empty, no change. If
8069 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8070 + is an existing item. */
8071 +extern int coord_prev_item(coord_t * coord);
8072 +
8073 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8074 + non-zero if there is no position to the right. */
8075 +extern int coord_set_to_right(coord_t * coord);
8076 +
8077 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8078 + non-zero if there is no position to the left. */
8079 +extern int coord_set_to_left(coord_t * coord);
8080 +
8081 +/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8082 + and non-zero if the unit did not exist. */
8083 +extern int coord_set_after_unit(coord_t * coord);
8084 +
8085 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8086 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
8087 +
8088 +/* iterate over all units in @node */
8089 +#define for_all_units( coord, node ) \
8090 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8091 + coord_next_unit( coord ) == 0 ; )
8092 +
8093 +/* iterate over all items in @node */
8094 +#define for_all_items( coord, node ) \
8095 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8096 + coord_next_item( coord ) == 0 ; )
8097 +
8098 +/* COORD/ITEM METHODS */
8099 +
8100 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8101 + reiser4_block_nr * blk);
8102 +extern int item_utmost_child(const coord_t * coord, sideof side,
8103 + jnode ** child);
8104 +
8105 +/* a flow is a sequence of bytes being written to or read from the tree. The
8106 + tree will slice the flow into items while storing it into nodes, but all of
8107 + that is hidden from anything outside the tree. */
8108 +
8109 +struct flow {
8110 + reiser4_key key; /* key of start of flow's sequence of bytes */
8111 + loff_t length; /* length of flow's sequence of bytes */
8112 + char *data; /* start of flow's sequence of bytes */
8113 + int user; /* if 1 data is user space, 0 - kernel space */
8114 + rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8115 +};
8116 +
8117 +void move_flow_forward(flow_t * f, unsigned count);
8118 +
8119 +/* &reiser4_item_data - description of data to be inserted or pasted
8120 +
8121 + Q: articulate the reasons for the difference between this and flow.
8122 +
8123 + A: Besides flow we insert into tree other things: stat data, directory
8124 + entry, etc. To insert them into tree one has to provide this structure. If
8125 + one is going to insert flow - he can use insert_flow, where this structure
8126 + does not have to be created
8127 +*/
8128 +struct reiser4_item_data {
8129 + /* actual data to be inserted. If NULL, ->create_item() will not
8130 + do xmemcpy itself, leaving this up to the caller. This can
8131 + save some amount of unnecessary memory copying, for example,
8132 + during insertion of stat data.
8133 +
8134 + */
8135 + char *data;
8136 + /* 1 if 'char * data' contains pointer to user space and 0 if it is
8137 + kernel space */
8138 + int user;
8139 + /* amount of data we are going to insert or paste */
8140 + int length;
8141 + /* "Arg" is opaque data that is passed down to the
8142 + ->create_item() method of node layout, which in turn
8143 + hands it to the ->create_hook() of item being created. This
8144 + arg is currently used by:
8145 +
8146 + . ->create_hook() of internal item
8147 + (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8148 + . ->paste() method of directory item.
8149 + . ->create_hook() of extent item
8150 +
8151 + For internal item, this is left "brother" of new node being
8152 + inserted and it is used to add new node into sibling list
8153 + after parent to it was just inserted into parent.
8154 +
8155 + While ->arg does look somewhat of unnecessary compication,
8156 + it actually saves a lot of headache in many places, because
8157 + all data necessary to insert or paste new data into tree are
8158 + collected in one place, and this eliminates a lot of extra
8159 + argument passing and storing everywhere.
8160 +
8161 + */
8162 + void *arg;
8163 + /* plugin of item we are inserting */
8164 + item_plugin *iplug;
8165 +};
8166 +
8167 +/* __REISER4_COORD_H__ */
8168 +#endif
8169 +
8170 +/* Make Linus happy.
8171 + Local variables:
8172 + c-indentation-style: "K&R"
8173 + mode-name: "LC"
8174 + c-basic-offset: 8
8175 + tab-width: 8
8176 + fill-column: 120
8177 + scroll-step: 1
8178 + End:
8179 +*/
8180 diff -urN linux-2.6.23.orig/fs/reiser4/debug.c linux-2.6.23/fs/reiser4/debug.c
8181 --- linux-2.6.23.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8182 +++ linux-2.6.23/fs/reiser4/debug.c 2007-12-04 16:49:30.000000000 +0300
8183 @@ -0,0 +1,308 @@
8184 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8185 + * reiser4/README */
8186 +
8187 +/* Debugging facilities. */
8188 +
8189 +/*
8190 + * This file contains generic debugging functions used by reiser4. Roughly
8191 + * following:
8192 + *
8193 + * panicking: reiser4_do_panic(), reiser4_print_prefix().
8194 + *
8195 + * locking:
8196 + * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8197 + * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8198 + *
8199 + * error code monitoring (see comment before RETERR macro):
8200 + * reiser4_return_err(), reiser4_report_err().
8201 + *
8202 + * stack back-tracing: fill_backtrace()
8203 + *
8204 + * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8205 + * reiser4_debugtrap().
8206 + *
8207 + */
8208 +
8209 +#include "reiser4.h"
8210 +#include "context.h"
8211 +#include "super.h"
8212 +#include "txnmgr.h"
8213 +#include "znode.h"
8214 +
8215 +#include <linux/sysfs.h>
8216 +#include <linux/slab.h>
8217 +#include <linux/types.h>
8218 +#include <linux/fs.h>
8219 +#include <linux/spinlock.h>
8220 +#include <linux/kallsyms.h>
8221 +#include <linux/vmalloc.h>
8222 +#include <linux/ctype.h>
8223 +#include <linux/sysctl.h>
8224 +#include <linux/hardirq.h>
8225 +
8226 +#if 0
8227 +#if REISER4_DEBUG
8228 +static void reiser4_report_err(void);
8229 +#else
8230 +#define reiser4_report_err() noop
8231 +#endif
8232 +#endif /* 0 */
8233 +
8234 +/*
8235 + * global buffer where message given to reiser4_panic is formatted.
8236 + */
8237 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8238 +
8239 +/*
8240 + * lock protecting consistency of panic_buf under concurrent panics
8241 + */
8242 +static DEFINE_SPINLOCK(panic_guard);
8243 +
8244 +/* Your best friend. Call it on each occasion. This is called by
8245 + fs/reiser4/debug.h:reiser4_panic(). */
8246 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8247 +{
8248 + static int in_panic = 0;
8249 + va_list args;
8250 +
8251 + /*
8252 + * check for recursive panic.
8253 + */
8254 + if (in_panic == 0) {
8255 + in_panic = 1;
8256 +
8257 + spin_lock(&panic_guard);
8258 + va_start(args, format);
8259 + vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8260 + va_end(args);
8261 + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8262 + spin_unlock(&panic_guard);
8263 +
8264 + /*
8265 + * if kernel debugger is configured---drop in. Early dropping
8266 + * into kgdb is not always convenient, because panic message
8267 + * is not yet printed most of the times. But:
8268 + *
8269 + * (1) message can be extracted from printk_buf[]
8270 + * (declared static inside of printk()), and
8271 + *
8272 + * (2) sometimes serial/kgdb combo dies while printing
8273 + * long panic message, so it's more prudent to break into
8274 + * debugger earlier.
8275 + *
8276 + */
8277 + DEBUGON(1);
8278 + }
8279 + /* to make gcc happy about noreturn attribute */
8280 + panic("%s", panic_buf);
8281 +}
8282 +
8283 +#if 0
8284 +void
8285 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8286 + const char *function, const char *file, int lineno)
8287 +{
8288 + const char *comm;
8289 + int pid;
8290 +
8291 + if (unlikely(in_interrupt() || in_irq())) {
8292 + comm = "interrupt";
8293 + pid = 0;
8294 + } else {
8295 + comm = current->comm;
8296 + pid = current->pid;
8297 + }
8298 + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8299 + level, comm, pid, function, file, lineno, mid);
8300 + if (reperr)
8301 + reiser4_report_err();
8302 +}
8303 +#endif /* 0 */
8304 +
8305 +/* Preemption point: this should be called periodically during long running
8306 + operations (carry, allocate, and squeeze are best examples) */
8307 +int reiser4_preempt_point(void)
8308 +{
8309 + assert("nikita-3008", reiser4_schedulable());
8310 + cond_resched();
8311 + return signal_pending(current);
8312 +}
8313 +
8314 +#if REISER4_DEBUG
8315 +/* Debugging aid: return struct where information about locks taken by current
8316 + thread is accumulated. This can be used to formulate lock ordering
8317 + constraints and various assertions.
8318 +
8319 +*/
8320 +reiser4_lock_cnt_info *reiser4_lock_counters(void)
8321 +{
8322 + reiser4_context *ctx = get_current_context();
8323 + assert("jmacd-1123", ctx != NULL);
8324 + return &ctx->locks;
8325 +}
8326 +
8327 +/*
8328 + * print human readable information about locks held by the reiser4 context.
8329 + */
8330 +static void print_lock_counters(const char *prefix,
8331 + const reiser4_lock_cnt_info * info)
8332 +{
8333 + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8334 + "jload: %i, "
8335 + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8336 + "ktxnmgrd: %i, fq: %i\n"
8337 + "inode: %i, "
8338 + "cbk_cache: %i (r:%i,w%i), "
8339 + "eflush: %i, "
8340 + "zlock: %i,\n"
8341 + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8342 + "d: %i, x: %i, t: %i\n", prefix,
8343 + info->spin_locked_jnode,
8344 + info->rw_locked_tree, info->read_locked_tree,
8345 + info->write_locked_tree,
8346 + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8347 + info->spin_locked_jload,
8348 + info->spin_locked_txnh,
8349 + info->spin_locked_atom, info->spin_locked_stack,
8350 + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8351 + info->spin_locked_fq,
8352 + info->spin_locked_inode,
8353 + info->rw_locked_cbk_cache,
8354 + info->read_locked_cbk_cache,
8355 + info->write_locked_cbk_cache,
8356 + info->spin_locked_super_eflush,
8357 + info->spin_locked_zlock,
8358 + info->spin_locked,
8359 + info->long_term_locked_znode,
8360 + info->inode_sem_r, info->inode_sem_w,
8361 + info->d_refs, info->x_refs, info->t_refs);
8362 +}
8363 +
8364 +/* check that no spinlocks are held */
8365 +int reiser4_schedulable(void)
8366 +{
8367 + if (get_current_context_check() != NULL) {
8368 + if (!LOCK_CNT_NIL(spin_locked)) {
8369 + print_lock_counters("in atomic", reiser4_lock_counters());
8370 + return 0;
8371 + }
8372 + }
8373 + might_sleep();
8374 + return 1;
8375 +}
8376 +/*
8377 + * return true, iff no locks are held.
8378 + */
8379 +int reiser4_no_counters_are_held(void)
8380 +{
8381 + reiser4_lock_cnt_info *counters;
8382 +
8383 + counters = reiser4_lock_counters();
8384 + return
8385 + (counters->spin_locked_zlock == 0) &&
8386 + (counters->spin_locked_jnode == 0) &&
8387 + (counters->rw_locked_tree == 0) &&
8388 + (counters->read_locked_tree == 0) &&
8389 + (counters->write_locked_tree == 0) &&
8390 + (counters->rw_locked_dk == 0) &&
8391 + (counters->read_locked_dk == 0) &&
8392 + (counters->write_locked_dk == 0) &&
8393 + (counters->spin_locked_txnh == 0) &&
8394 + (counters->spin_locked_atom == 0) &&
8395 + (counters->spin_locked_stack == 0) &&
8396 + (counters->spin_locked_txnmgr == 0) &&
8397 + (counters->spin_locked_inode == 0) &&
8398 + (counters->spin_locked == 0) &&
8399 + (counters->long_term_locked_znode == 0) &&
8400 + (counters->inode_sem_r == 0) &&
8401 + (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8402 +}
8403 +
8404 +/*
8405 + * return true, iff transaction commit can be done under locks held by the
8406 + * current thread.
8407 + */
8408 +int reiser4_commit_check_locks(void)
8409 +{
8410 + reiser4_lock_cnt_info *counters;
8411 + int inode_sem_r;
8412 + int inode_sem_w;
8413 + int result;
8414 +
8415 + /*
8416 + * inode's read/write semaphore is the only reiser4 lock that can be
8417 + * held during commit.
8418 + */
8419 +
8420 + counters = reiser4_lock_counters();
8421 + inode_sem_r = counters->inode_sem_r;
8422 + inode_sem_w = counters->inode_sem_w;
8423 +
8424 + counters->inode_sem_r = counters->inode_sem_w = 0;
8425 + result = reiser4_no_counters_are_held();
8426 + counters->inode_sem_r = inode_sem_r;
8427 + counters->inode_sem_w = inode_sem_w;
8428 + return result;
8429 +}
8430 +
8431 +/*
8432 + * fill "error site" in the current reiser4 context. See comment before RETERR
8433 + * macro for more details.
8434 + */
8435 +void reiser4_return_err(int code, const char *file, int line)
8436 +{
8437 + if (code < 0 && is_in_reiser4_context()) {
8438 + reiser4_context *ctx = get_current_context();
8439 +
8440 + if (ctx != NULL) {
8441 + ctx->err.code = code;
8442 + ctx->err.file = file;
8443 + ctx->err.line = line;
8444 + }
8445 + }
8446 +}
8447 +
8448 +#if 0
8449 +/*
8450 + * report error information recorder by reiser4_return_err().
8451 + */
8452 +static void reiser4_report_err(void)
8453 +{
8454 + reiser4_context *ctx = get_current_context_check();
8455 +
8456 + if (ctx != NULL) {
8457 + if (ctx->err.code != 0) {
8458 + printk("code: %i at %s:%i\n",
8459 + ctx->err.code, ctx->err.file, ctx->err.line);
8460 + }
8461 + }
8462 +}
8463 +#endif /* 0 */
8464 +
8465 +#endif /* REISER4_DEBUG */
8466 +
8467 +#if KERNEL_DEBUGGER
8468 +
8469 +/*
8470 + * this functions just drops into kernel debugger. It is a convenient place to
8471 + * put breakpoint in.
8472 + */
8473 +void reiser4_debugtrap(void)
8474 +{
8475 + /* do nothing. Put break point here. */
8476 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8477 + extern void breakpoint(void);
8478 + breakpoint();
8479 +#endif
8480 +}
8481 +#endif
8482 +
8483 +/* Make Linus happy.
8484 + Local variables:
8485 + c-indentation-style: "K&R"
8486 + mode-name: "LC"
8487 + c-basic-offset: 8
8488 + tab-width: 8
8489 + fill-column: 120
8490 + End:
8491 +*/
8492 diff -urN linux-2.6.23.orig/fs/reiser4/debug.h linux-2.6.23/fs/reiser4/debug.h
8493 --- linux-2.6.23.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8494 +++ linux-2.6.23/fs/reiser4/debug.h 2007-12-04 20:42:06.138861845 +0300
8495 @@ -0,0 +1,350 @@
8496 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8497 +
8498 +/* Declarations of debug macros. */
8499 +
8500 +#if !defined( __FS_REISER4_DEBUG_H__ )
8501 +#define __FS_REISER4_DEBUG_H__
8502 +
8503 +#include "forward.h"
8504 +#include "reiser4.h"
8505 +
8506 +/* generic function to produce formatted output, decorating it with
8507 + whatever standard prefixes/postfixes we want. "Fun" is a function
8508 + that will be actually called, can be printk, panic etc.
8509 + This is for use by other debugging macros, not by users. */
8510 +#define DCALL(lev, fun, reperr, label, format, ...) \
8511 +({ \
8512 + fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8513 + current->comm, current->pid, __FUNCTION__, \
8514 + __FILE__, __LINE__, label, ## __VA_ARGS__); \
8515 +})
8516 +
8517 +/*
8518 + * cause kernel to crash
8519 + */
8520 +#define reiser4_panic(mid, format, ...) \
8521 + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8522 +
8523 +/* print message with indication of current process, file, line and
8524 + function */
8525 +#define reiser4_log(label, format, ...) \
8526 + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8527 +
8528 +/* Assertion checked during compilation.
8529 + If "cond" is false (0) we get duplicate case label in switch.
8530 + Use this to check something like famous
8531 + cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8532 + in 3.x journal.c. If cassertion fails you get compiler error,
8533 + so no "maintainer-id".
8534 +*/
8535 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
8536 +
8537 +#define noop do {;} while(0)
8538 +
8539 +#if REISER4_DEBUG
8540 +/* version of info that only actually prints anything when _d_ebugging
8541 + is on */
8542 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8543 +/* macro to catch logical errors. Put it into `default' clause of
8544 + switch() statement. */
8545 +#define impossible(label, format, ...) \
8546 + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8547 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8548 + called. Use this for checking logical consistency and _never_ call
8549 + this to check correctness of external data: disk blocks and user-input . */
8550 +#define assert(label, cond) \
8551 +({ \
8552 + /* call_on_each_assert(); */ \
8553 + if (cond) { \
8554 + /* put negated check to avoid using !(cond) that would lose \
8555 + * warnings for things like assert(a = b); */ \
8556 + ; \
8557 + } else { \
8558 + DEBUGON(1); \
8559 + reiser4_panic(label, "assertion failed: %s", #cond); \
8560 + } \
8561 +})
8562 +
8563 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8564 +#define check_me( label, expr ) assert( label, ( expr ) )
8565 +
8566 +#define ON_DEBUG( exp ) exp
8567 +
8568 +extern int reiser4_schedulable(void);
8569 +extern void call_on_each_assert(void);
8570 +
8571 +#else
8572 +
8573 +#define dinfo( format, args... ) noop
8574 +#define impossible( label, format, args... ) noop
8575 +#define assert( label, cond ) noop
8576 +#define check_me( label, expr ) ( ( void ) ( expr ) )
8577 +#define ON_DEBUG( exp )
8578 +#define reiser4_schedulable() might_sleep()
8579 +
8580 +/* REISER4_DEBUG */
8581 +#endif
8582 +
8583 +#if REISER4_DEBUG
8584 +/* per-thread information about lock acquired by this thread. Used by lock
8585 + * ordering checking in spin_macros.h */
8586 +typedef struct reiser4_lock_cnt_info {
8587 + int rw_locked_tree;
8588 + int read_locked_tree;
8589 + int write_locked_tree;
8590 +
8591 + int rw_locked_dk;
8592 + int read_locked_dk;
8593 + int write_locked_dk;
8594 +
8595 + int rw_locked_cbk_cache;
8596 + int read_locked_cbk_cache;
8597 + int write_locked_cbk_cache;
8598 +
8599 + int spin_locked_zlock;
8600 + int spin_locked_jnode;
8601 + int spin_locked_jload;
8602 + int spin_locked_txnh;
8603 + int spin_locked_atom;
8604 + int spin_locked_stack;
8605 + int spin_locked_txnmgr;
8606 + int spin_locked_ktxnmgrd;
8607 + int spin_locked_fq;
8608 + int spin_locked_inode;
8609 + int spin_locked_super_eflush;
8610 + int spin_locked;
8611 + int long_term_locked_znode;
8612 +
8613 + int inode_sem_r;
8614 + int inode_sem_w;
8615 +
8616 + int d_refs;
8617 + int x_refs;
8618 + int t_refs;
8619 +} reiser4_lock_cnt_info;
8620 +
8621 +extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8622 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8623 +
8624 +/* increment lock-counter @counter, if present */
8625 +#define LOCK_CNT_INC(counter) \
8626 + IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8627 +
8628 +/* decrement lock-counter @counter, if present */
8629 +#define LOCK_CNT_DEC(counter) \
8630 + IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8631 +
8632 +/* check that lock-counter is zero. This is for use in assertions */
8633 +#define LOCK_CNT_NIL(counter) \
8634 + IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8635 +
8636 +/* check that lock-counter is greater than zero. This is for use in
8637 + * assertions */
8638 +#define LOCK_CNT_GTZ(counter) \
8639 + IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8640 +#define LOCK_CNT_LT(counter,n) \
8641 + IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
8642 +
8643 +#else /* REISER4_DEBUG */
8644 +
8645 +/* no-op versions on the above */
8646 +
8647 +typedef struct reiser4_lock_cnt_info {
8648 +} reiser4_lock_cnt_info;
8649 +
8650 +#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8651 +#define LOCK_CNT_INC(counter) noop
8652 +#define LOCK_CNT_DEC(counter) noop
8653 +#define LOCK_CNT_NIL(counter) (1)
8654 +#define LOCK_CNT_GTZ(counter) (1)
8655 +#define LOCK_CNT_LT(counter,n) (1)
8656 +
8657 +#endif /* REISER4_DEBUG */
8658 +
8659 +#define assert_spin_not_locked(lock) BUG_ON(0)
8660 +#define assert_rw_write_locked(lock) BUG_ON(0)
8661 +#define assert_rw_read_locked(lock) BUG_ON(0)
8662 +#define assert_rw_locked(lock) BUG_ON(0)
8663 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8664 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8665 +#define assert_rw_not_locked(lock) BUG_ON(0)
8666 +
8667 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
8668 + option. */
8669 +typedef enum {
8670 + /* print a lot of information during panic. When this is on all jnodes
8671 + * are listed. This can be *very* large output. Usually you don't want
8672 + * this. Especially over serial line. */
8673 + REISER4_VERBOSE_PANIC = 0x00000001,
8674 + /* print a lot of information during umount */
8675 + REISER4_VERBOSE_UMOUNT = 0x00000002,
8676 + /* print gathered statistics on umount */
8677 + REISER4_STATS_ON_UMOUNT = 0x00000004,
8678 + /* check node consistency */
8679 + REISER4_CHECK_NODE = 0x00000008
8680 +} reiser4_debug_flags;
8681 +
8682 +extern int is_in_reiser4_context(void);
8683 +
8684 +/*
8685 + * evaluate expression @e only if with reiser4 context
8686 + */
8687 +#define ON_CONTEXT(e) do { \
8688 + if(is_in_reiser4_context()) { \
8689 + e; \
8690 + } } while(0)
8691 +
8692 +/*
8693 + * evaluate expression @e only when within reiser4_context and debugging is
8694 + * on.
8695 + */
8696 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8697 +
8698 +/*
8699 + * complain about unexpected function result and crash. Used in "default"
8700 + * branches of switch statements and alike to assert that invalid results are
8701 + * not silently ignored.
8702 + */
8703 +#define wrong_return_value( label, function ) \
8704 + impossible( label, "wrong return value from " function )
8705 +
8706 +/* Issue different types of reiser4 messages to the console */
8707 +#define warning( label, format, ... ) \
8708 + DCALL( KERN_WARNING, \
8709 + printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8710 +#define notice( label, format, ... ) \
8711 + DCALL( KERN_NOTICE, \
8712 + printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8713 +
8714 +/* mark not yet implemented functionality */
8715 +#define not_yet( label, format, ... ) \
8716 + reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8717 +
8718 +extern void reiser4_do_panic(const char *format, ...)
8719 + __attribute__ ((noreturn, format(printf, 1, 2)));
8720 +
8721 +extern int reiser4_preempt_point(void);
8722 +extern void reiser4_print_stats(void);
8723 +
8724 +#if REISER4_DEBUG
8725 +extern int reiser4_no_counters_are_held(void);
8726 +extern int reiser4_commit_check_locks(void);
8727 +#else
8728 +#define reiser4_no_counters_are_held() (1)
8729 +#define reiser4_commit_check_locks() (1)
8730 +#endif
8731 +
8732 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8733 +#define IS_POW(i) \
8734 +({ \
8735 + typeof(i) __i; \
8736 + \
8737 + __i = (i); \
8738 + !(__i & (__i - 1)); \
8739 +})
8740 +
8741 +#define KERNEL_DEBUGGER (1)
8742 +
8743 +#if KERNEL_DEBUGGER
8744 +
8745 +extern void reiser4_debugtrap(void);
8746 +
8747 +/*
8748 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8749 + * kgdb is not compiled in, do nothing.
8750 + */
8751 +#define DEBUGON(cond) \
8752 +({ \
8753 + if (unlikely(cond)) \
8754 + reiser4_debugtrap(); \
8755 +})
8756 +#else
8757 +#define DEBUGON(cond) noop
8758 +#endif
8759 +
8760 +/*
8761 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8762 + *
8763 + * Suppose some strange and/or unexpected code is returned from some function
8764 + * (for example, write(2) returns -EEXIST). It is possible to place a
8765 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
8766 + * in what particular place -EEXIST was generated first?
8767 + *
8768 + * In reiser4 all places where actual error codes are produced (that is,
8769 + * statements of the form
8770 + *
8771 + * return -EFOO; // (1), or
8772 + *
8773 + * result = -EFOO; // (2)
8774 + *
8775 + * are replaced with
8776 + *
8777 + * return RETERR(-EFOO); // (1a), and
8778 + *
8779 + * result = RETERR(-EFOO); // (2a) respectively
8780 + *
8781 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
8782 + * printed in error and warning messages. Moreover, it's possible to put a
8783 + * conditional breakpoint in reiser4_return_err (low-level function called
8784 + * by RETERR() to do the actual work) to break into debugger immediately
8785 + * when particular error happens.
8786 + *
8787 + */
8788 +
8789 +#if REISER4_DEBUG
8790 +
8791 +/*
8792 + * data-type to store information about where error happened ("error site").
8793 + */
8794 +typedef struct err_site {
8795 + int code; /* error code */
8796 + const char *file; /* source file, filled by __FILE__ */
8797 + int line; /* source file line, filled by __LINE__ */
8798 +} err_site;
8799 +
8800 +extern void reiser4_return_err(int code, const char *file, int line);
8801 +
8802 +/*
8803 + * fill &get_current_context()->err_site with error information.
8804 + */
8805 +#define RETERR(code) \
8806 +({ \
8807 + typeof(code) __code; \
8808 + \
8809 + __code = (code); \
8810 + reiser4_return_err(__code, __FILE__, __LINE__); \
8811 + __code; \
8812 +})
8813 +
8814 +#else
8815 +
8816 +/*
8817 + * no-op versions of the above
8818 + */
8819 +
8820 +typedef struct err_site {
8821 +} err_site;
8822 +#define RETERR(code) code
8823 +#endif
8824 +
8825 +#if REISER4_LARGE_KEY
8826 +/*
8827 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8828 + */
8829 +#define ON_LARGE_KEY(...) __VA_ARGS__
8830 +#else
8831 +#define ON_LARGE_KEY(...)
8832 +#endif
8833 +
8834 +/* __FS_REISER4_DEBUG_H__ */
8835 +#endif
8836 +
8837 +/* Make Linus happy.
8838 + Local variables:
8839 + c-indentation-style: "K&R"
8840 + mode-name: "LC"
8841 + c-basic-offset: 8
8842 + tab-width: 8
8843 + fill-column: 120
8844 + End:
8845 +*/
8846 diff -urN linux-2.6.23.orig/fs/reiser4/dformat.h linux-2.6.23/fs/reiser4/dformat.h
8847 --- linux-2.6.23.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8848 +++ linux-2.6.23/fs/reiser4/dformat.h 2007-12-04 16:49:30.000000000 +0300
8849 @@ -0,0 +1,70 @@
8850 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8851 +
8852 +/* Formats of on-disk data and conversion functions. */
8853 +
8854 +/* put all item formats in the files describing the particular items,
8855 + our model is, everything you need to do to add an item to reiser4,
8856 + (excepting the changes to the plugin that uses the item which go
8857 + into the file defining that plugin), you put into one file. */
8858 +/* Data on disk are stored in little-endian format.
8859 + To declare fields of on-disk structures, use d8, d16, d32 and d64.
8860 + d??tocpu() and cputod??() to convert. */
8861 +
8862 +#if !defined( __FS_REISER4_DFORMAT_H__ )
8863 +#define __FS_REISER4_DFORMAT_H__
8864 +
8865 +#include <asm/byteorder.h>
8866 +#include <asm/unaligned.h>
8867 +#include <linux/types.h>
8868 +
8869 +typedef __u8 d8;
8870 +typedef __le16 d16;
8871 +typedef __le32 d32;
8872 +typedef __le64 d64;
8873 +
8874 +#define PACKED __attribute__((packed))
8875 +
8876 +/* data-type for block number */
8877 +typedef __u64 reiser4_block_nr;
8878 +
8879 +/* data-type for block number on disk, disk format */
8880 +typedef __le64 reiser4_dblock_nr;
8881 +
8882 +/**
8883 + * disk_addr_eq - compare disk addresses
8884 + * @b1: pointer to block number ot compare
8885 + * @b2: pointer to block number ot compare
8886 + *
8887 + * Returns true if if disk addresses are the same
8888 + */
8889 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
8890 + const reiser4_block_nr * b2)
8891 +{
8892 + assert("nikita-1033", b1 != NULL);
8893 + assert("nikita-1266", b2 != NULL);
8894 +
8895 + return !memcmp(b1, b2, sizeof *b1);
8896 +}
8897 +
8898 +/* structure of master reiser4 super block */
8899 +typedef struct reiser4_master_sb {
8900 + char magic[16]; /* "ReIsEr4" */
8901 + __le16 disk_plugin_id; /* id of disk layout plugin */
8902 + __le16 blocksize;
8903 + char uuid[16]; /* unique id */
8904 + char label[16]; /* filesystem label */
8905 + __le64 diskmap; /* location of the diskmap. 0 if not present */
8906 +} reiser4_master_sb;
8907 +
8908 +/* __FS_REISER4_DFORMAT_H__ */
8909 +#endif
8910 +
8911 +/*
8912 + * Local variables:
8913 + * c-indentation-style: "K&R"
8914 + * mode-name: "LC"
8915 + * c-basic-offset: 8
8916 + * tab-width: 8
8917 + * fill-column: 79
8918 + * End:
8919 + */
8920 diff -urN linux-2.6.23.orig/fs/reiser4/dscale.c linux-2.6.23/fs/reiser4/dscale.c
8921 --- linux-2.6.23.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8922 +++ linux-2.6.23/fs/reiser4/dscale.c 2007-12-04 22:59:05.786366833 +0300
8923 @@ -0,0 +1,174 @@
8924 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8925 + * reiser4/README */
8926 +
8927 +/* Scalable on-disk integers */
8928 +
8929 +/*
8930 + * Various on-disk structures contain integer-like structures. Stat-data
8931 + * contain [yes, "data" is plural, check the dictionary] file size, link
8932 + * count; extent unit contains extent width etc. To accommodate for general
8933 + * case enough space is reserved to keep largest possible value. 64 bits in
8934 + * all cases above. But in overwhelming majority of cases numbers actually
8935 + * stored in these fields will be comparatively small and reserving 8 bytes is
8936 + * a waste of precious disk bandwidth.
8937 + *
8938 + * Scalable integers are one way to solve this problem. dscale_write()
8939 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8940 + * depending on the magnitude of the value supplied. dscale_read() reads value
8941 + * previously stored by dscale_write().
8942 + *
8943 + * dscale_write() produces format not completely unlike of UTF: two highest
8944 + * bits of the first byte are used to store "tag". One of 4 possible tag
8945 + * values is chosen depending on the number being encoded:
8946 + *
8947 + * 0 ... 0x3f => 0 [table 1]
8948 + * 0x40 ... 0x3fff => 1
8949 + * 0x4000 ... 0x3fffffff => 2
8950 + * 0x40000000 ... 0xffffffffffffffff => 3
8951 + *
8952 + * (see dscale_range() function)
8953 + *
8954 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8955 + * to be stored, so in this case there is no place in the first byte to store
8956 + * tag. For such values tag is stored in an extra 9th byte.
8957 + *
8958 + * As _highest_ bits are used for the test (which is natural) scaled integers
8959 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8960 + * uses LITTLE-ENDIAN.
8961 + *
8962 + */
8963 +
8964 +#include "debug.h"
8965 +#include "dscale.h"
8966 +
8967 +/* return tag of scaled integer stored at @address */
8968 +static int gettag(const unsigned char *address)
8969 +{
8970 + /* tag is stored in two highest bits */
8971 + return (*address) >> 6;
8972 +}
8973 +
8974 +/* clear tag from value. Clear tag embedded into @value. */
8975 +static void cleartag(__u64 * value, int tag)
8976 +{
8977 + /*
8978 + * W-w-what ?!
8979 + *
8980 + * Actually, this is rather simple: @value passed here was read by
8981 + * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8982 + * zeroes. Tag is still stored in the highest (arithmetically)
8983 + * non-zero bits of @value, but relative position of tag within __u64
8984 + * depends on @tag.
8985 + *
8986 + * For example if @tag is 0, it's stored 2 highest bits of lowest
8987 + * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8988 + *
8989 + * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
8990 + * and it's offset if (2 * 8) - 2 == 14 bits.
8991 + *
8992 + * See table 1 above for details.
8993 + *
8994 + * All these cases are captured by the formula:
8995 + */
8996 + *value &= ~(3 << (((1 << tag) << 3) - 2));
8997 + /*
8998 + * That is, clear two (3 == 0t11) bits at the offset
8999 + *
9000 + * 8 * (2 ^ tag) - 2,
9001 + *
9002 + * that is, two highest bits of (2 ^ tag)-th byte of @value.
9003 + */
9004 +}
9005 +
9006 +/* return tag for @value. See table 1 above for details. */
9007 +static int dscale_range(__u64 value)
9008 +{
9009 + if (value > 0x3fffffff)
9010 + return 3;
9011 + if (value > 0x3fff)
9012 + return 2;
9013 + if (value > 0x3f)
9014 + return 1;
9015 + return 0;
9016 +}
9017 +
9018 +/* restore value stored at @adderss by dscale_write() and return number of
9019 + * bytes consumed */
9020 +int dscale_read(unsigned char *address, __u64 * value)
9021 +{
9022 + int tag;
9023 +
9024 + /* read tag */
9025 + tag = gettag(address);
9026 + switch (tag) {
9027 + case 3:
9028 + /* In this case tag is stored in an extra byte, skip this byte
9029 + * and decode value stored in the next 8 bytes.*/
9030 + *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9031 + /* worst case: 8 bytes for value itself plus one byte for
9032 + * tag. */
9033 + return 9;
9034 + case 0:
9035 + *value = get_unaligned(address);
9036 + break;
9037 + case 1:
9038 + *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9039 + break;
9040 + case 2:
9041 + *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9042 + break;
9043 + default:
9044 + return RETERR(-EIO);
9045 + }
9046 + /* clear tag embedded into @value */
9047 + cleartag(value, tag);
9048 + /* number of bytes consumed is (2 ^ tag)---see table 1. */
9049 + return 1 << tag;
9050 +}
9051 +
9052 +/* store @value at @address and return number of bytes consumed */
9053 +int dscale_write(unsigned char *address, __u64 value)
9054 +{
9055 + int tag;
9056 + int shift;
9057 + __be64 v;
9058 + unsigned char *valarr;
9059 +
9060 + tag = dscale_range(value);
9061 + v = __cpu_to_be64(value);
9062 + valarr = (unsigned char *)&v;
9063 + shift = (tag == 3) ? 1 : 0;
9064 + memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9065 + *address |= (tag << 6);
9066 + return shift + (1 << tag);
9067 +}
9068 +
9069 +/* number of bytes required to store @value */
9070 +int dscale_bytes(__u64 value)
9071 +{
9072 + int bytes;
9073 +
9074 + bytes = 1 << dscale_range(value);
9075 + if (bytes == 8)
9076 + ++bytes;
9077 + return bytes;
9078 +}
9079 +
9080 +/* returns true if @value and @other require the same number of bytes to be
9081 + * stored. Used by detect when data structure (like stat-data) has to be
9082 + * expanded or contracted. */
9083 +int dscale_fit(__u64 value, __u64 other)
9084 +{
9085 + return dscale_range(value) == dscale_range(other);
9086 +}
9087 +
9088 +/* Make Linus happy.
9089 + Local variables:
9090 + c-indentation-style: "K&R"
9091 + mode-name: "LC"
9092 + c-basic-offset: 8
9093 + tab-width: 8
9094 + fill-column: 120
9095 + scroll-step: 1
9096 + End:
9097 +*/
9098 diff -urN linux-2.6.23.orig/fs/reiser4/dscale.h linux-2.6.23/fs/reiser4/dscale.h
9099 --- linux-2.6.23.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9100 +++ linux-2.6.23/fs/reiser4/dscale.h 2007-12-04 22:59:05.790367863 +0300
9101 @@ -0,0 +1,27 @@
9102 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9103 + * reiser4/README */
9104 +
9105 +/* Scalable on-disk integers. See dscale.h for details. */
9106 +
9107 +#if !defined( __FS_REISER4_DSCALE_H__ )
9108 +#define __FS_REISER4_DSCALE_H__
9109 +
9110 +#include "dformat.h"
9111 +
9112 +extern int dscale_read(unsigned char *address, __u64 * value);
9113 +extern int dscale_write(unsigned char *address, __u64 value);
9114 +extern int dscale_bytes(__u64 value);
9115 +extern int dscale_fit(__u64 value, __u64 other);
9116 +
9117 +/* __FS_REISER4_DSCALE_H__ */
9118 +#endif
9119 +
9120 +/* Make Linus happy.
9121 + Local variables:
9122 + c-indentation-style: "K&R"
9123 + mode-name: "LC"
9124 + c-basic-offset: 8
9125 + tab-width: 8
9126 + fill-column: 120
9127 + End:
9128 +*/
9129 diff -urN linux-2.6.23.orig/fs/reiser4/entd.c linux-2.6.23/fs/reiser4/entd.c
9130 --- linux-2.6.23.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9131 +++ linux-2.6.23/fs/reiser4/entd.c 2007-12-04 16:49:30.000000000 +0300
9132 @@ -0,0 +1,335 @@
9133 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9134 + * reiser4/README */
9135 +
9136 +/* Ent daemon. */
9137 +
9138 +#include "debug.h"
9139 +#include "txnmgr.h"
9140 +#include "tree.h"
9141 +#include "entd.h"
9142 +#include "super.h"
9143 +#include "context.h"
9144 +#include "reiser4.h"
9145 +#include "vfs_ops.h"
9146 +#include "page_cache.h"
9147 +#include "inode.h"
9148 +
9149 +#include <linux/sched.h> /* struct task_struct */
9150 +#include <linux/suspend.h>
9151 +#include <linux/kernel.h>
9152 +#include <linux/writeback.h>
9153 +#include <linux/time.h> /* INITIAL_JIFFIES */
9154 +#include <linux/backing-dev.h> /* bdi_write_congested */
9155 +#include <linux/wait.h>
9156 +#include <linux/kthread.h>
9157 +#include <linux/freezer.h>
9158 +
9159 +#define DEF_PRIORITY 12
9160 +#define MAX_ENTD_ITERS 10
9161 +
9162 +static void entd_flush(struct super_block *, struct wbq *);
9163 +static int entd(void *arg);
9164 +
9165 +/*
9166 + * set ->comm field of ent thread to make its state visible to the user level
9167 + */
9168 +#define entd_set_comm(state) \
9169 + snprintf(current->comm, sizeof(current->comm), \
9170 + "ent:%s%s", super->s_id, (state))
9171 +
9172 +/**
9173 + * reiser4_init_entd - initialize entd context and start kernel daemon
9174 + * @super: super block to start ent thread for
9175 + *
9176 + * Creates entd contexts, starts kernel thread and waits until it
9177 + * initializes.
9178 + */
9179 +int reiser4_init_entd(struct super_block *super)
9180 +{
9181 + entd_context *ctx;
9182 +
9183 + assert("nikita-3104", super != NULL);
9184 +
9185 + ctx = get_entd_context(super);
9186 +
9187 + memset(ctx, 0, sizeof *ctx);
9188 + spin_lock_init(&ctx->guard);
9189 + init_waitqueue_head(&ctx->wait);
9190 +#if REISER4_DEBUG
9191 + INIT_LIST_HEAD(&ctx->flushers_list);
9192 +#endif
9193 + /* lists of writepage requests */
9194 + INIT_LIST_HEAD(&ctx->todo_list);
9195 + INIT_LIST_HEAD(&ctx->done_list);
9196 + /* start entd */
9197 + ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9198 + if (IS_ERR(ctx->tsk))
9199 + return PTR_ERR(ctx->tsk);
9200 + return 0;
9201 +}
9202 +
9203 +static void put_wbq(struct wbq *rq)
9204 +{
9205 + iput(rq->mapping->host);
9206 + complete(&rq->completion);
9207 +}
9208 +
9209 +/* ent should be locked */
9210 +static struct wbq *__get_wbq(entd_context * ent)
9211 +{
9212 + struct wbq *wbq;
9213 +
9214 + if (list_empty(&ent->todo_list))
9215 + return NULL;
9216 +
9217 + ent->nr_todo_reqs --;
9218 + wbq = list_entry(ent->todo_list.next, struct wbq, link);
9219 + list_del_init(&wbq->link);
9220 + return wbq;
9221 +}
9222 +
9223 +/* ent thread function */
9224 +static int entd(void *arg)
9225 +{
9226 + struct super_block *super;
9227 + entd_context *ent;
9228 + int done = 0;
9229 +
9230 + super = arg;
9231 + /* do_fork() just copies task_struct into the new
9232 + thread. ->fs_context shouldn't be copied of course. This shouldn't
9233 + be a problem for the rest of the code though.
9234 + */
9235 + current->journal_info = NULL;
9236 +
9237 + ent = get_entd_context(super);
9238 +
9239 + while (!done) {
9240 + try_to_freeze();
9241 +
9242 + spin_lock(&ent->guard);
9243 + while (ent->nr_todo_reqs != 0) {
9244 + struct wbq *rq;
9245 +
9246 + assert("", list_empty(&ent->done_list));
9247 +
9248 + /* take request from the queue head */
9249 + rq = __get_wbq(ent);
9250 + assert("", rq != NULL);
9251 + ent->cur_request = rq;
9252 + spin_unlock(&ent->guard);
9253 +
9254 + entd_set_comm("!");
9255 + entd_flush(super, rq);
9256 +
9257 + put_wbq(rq);
9258 +
9259 + /*
9260 + * wakeup all requestors and iput their inodes
9261 + */
9262 + spin_lock(&ent->guard);
9263 + while (!list_empty(&ent->done_list)) {
9264 + rq = list_entry(ent->done_list.next, struct wbq, link);
9265 + list_del_init(&rq->link);
9266 + ent->nr_done_reqs --;
9267 + spin_unlock(&ent->guard);
9268 + assert("", rq->written == 1);
9269 + put_wbq(rq);
9270 + spin_lock(&ent->guard);
9271 + }
9272 + }
9273 + spin_unlock(&ent->guard);
9274 +
9275 + entd_set_comm(".");
9276 +
9277 + {
9278 + DEFINE_WAIT(__wait);
9279 +
9280 + do {
9281 + prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9282 + if (kthread_should_stop()) {
9283 + done = 1;
9284 + break;
9285 + }
9286 + if (ent->nr_todo_reqs != 0)
9287 + break;
9288 + schedule();
9289 + } while (0);
9290 + finish_wait(&ent->wait, &__wait);
9291 + }
9292 + }
9293 + BUG_ON(ent->nr_todo_reqs != 0);
9294 + return 0;
9295 +}
9296 +
9297 +/**
9298 + * reiser4_done_entd - stop entd kernel thread
9299 + * @super: super block to stop ent thread for
9300 + *
9301 + * It is called on umount. Sends stop signal to entd and wait until it handles
9302 + * it.
9303 + */
9304 +void reiser4_done_entd(struct super_block *super)
9305 +{
9306 + entd_context *ent;
9307 +
9308 + assert("nikita-3103", super != NULL);
9309 +
9310 + ent = get_entd_context(super);
9311 + assert("zam-1055", ent->tsk != NULL);
9312 + kthread_stop(ent->tsk);
9313 +}
9314 +
9315 +/* called at the beginning of jnode_flush to register flusher thread with ent
9316 + * daemon */
9317 +void reiser4_enter_flush(struct super_block *super)
9318 +{
9319 + entd_context *ent;
9320 +
9321 + assert("zam-1029", super != NULL);
9322 + ent = get_entd_context(super);
9323 +
9324 + assert("zam-1030", ent != NULL);
9325 +
9326 + spin_lock(&ent->guard);
9327 + ent->flushers++;
9328 +#if REISER4_DEBUG
9329 + list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9330 +#endif
9331 + spin_unlock(&ent->guard);
9332 +}
9333 +
9334 +/* called at the end of jnode_flush */
9335 +void reiser4_leave_flush(struct super_block *super)
9336 +{
9337 + entd_context *ent;
9338 + int wake_up_ent;
9339 +
9340 + assert("zam-1027", super != NULL);
9341 + ent = get_entd_context(super);
9342 +
9343 + assert("zam-1028", ent != NULL);
9344 +
9345 + spin_lock(&ent->guard);
9346 + ent->flushers--;
9347 + wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9348 +#if REISER4_DEBUG
9349 + list_del_init(&get_current_context()->flushers_link);
9350 +#endif
9351 + spin_unlock(&ent->guard);
9352 + if (wake_up_ent)
9353 + wake_up(&ent->wait);
9354 +}
9355 +
9356 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9357 +
9358 +static void entd_flush(struct super_block *super, struct wbq *rq)
9359 +{
9360 + reiser4_context ctx;
9361 + int tmp;
9362 +
9363 + init_stack_context(&ctx, super);
9364 + ctx.entd = 1;
9365 + ctx.gfp_mask = GFP_NOFS;
9366 +
9367 + rq->wbc->range_start = page_offset(rq->page);
9368 + rq->wbc->range_end = rq->wbc->range_start +
9369 + (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9370 + tmp = rq->wbc->nr_to_write;
9371 + rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9372 +
9373 + if (rq->wbc->nr_to_write > 0) {
9374 + rq->wbc->range_start = 0;
9375 + rq->wbc->range_end = LLONG_MAX;
9376 + generic_sync_sb_inodes(super, rq->wbc);
9377 + }
9378 + rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9379 + reiser4_writeout(super, rq->wbc);
9380 +
9381 + context_set_commit_async(&ctx);
9382 + reiser4_exit_context(&ctx);
9383 +}
9384 +
9385 +/**
9386 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9387 + * @page: page to be written
9388 + * @wbc: writeback control passed to reiser4_writepage
9389 + *
9390 + * Creates a request, puts it on entd list of requests, wakeups entd if
9391 + * necessary, waits until entd completes with the request.
9392 + */
9393 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9394 +{
9395 + struct super_block *sb;
9396 + struct inode *inode;
9397 + entd_context *ent;
9398 + struct wbq rq;
9399 +
9400 + assert("", PageLocked(page));
9401 + assert("", page->mapping != NULL);
9402 +
9403 + sb = page->mapping->host->i_sb;
9404 + ent = get_entd_context(sb);
9405 + assert("", ent && ent->done == 0);
9406 +
9407 + /*
9408 + * we are going to unlock page and ask ent thread to write the
9409 + * page. Re-dirty page before unlocking so that if ent thread fails to
9410 + * write it - it will remain dirty
9411 + */
9412 + reiser4_set_page_dirty_internal(page);
9413 +
9414 + /*
9415 + * pin inode in memory, unlock page, entd_flush will iput. We can not
9416 + * iput here because we can not allow delete_inode to be called here
9417 + */
9418 + inode = igrab(page->mapping->host);
9419 + unlock_page(page);
9420 + if (inode == NULL)
9421 + /* inode is getting freed */
9422 + return 0;
9423 +
9424 + /* init wbq */
9425 + INIT_LIST_HEAD(&rq.link);
9426 + rq.magic = WBQ_MAGIC;
9427 + rq.wbc = wbc;
9428 + rq.page = page;
9429 + rq.mapping = inode->i_mapping;
9430 + rq.node = NULL;
9431 + rq.written = 0;
9432 + init_completion(&rq.completion);
9433 +
9434 + /* add request to entd's list of writepage requests */
9435 + spin_lock(&ent->guard);
9436 + ent->nr_todo_reqs++;
9437 + list_add_tail(&rq.link, &ent->todo_list);
9438 + if (ent->nr_todo_reqs == 1)
9439 + wake_up(&ent->wait);
9440 +
9441 + spin_unlock(&ent->guard);
9442 +
9443 + /* wait until entd finishes */
9444 + wait_for_completion(&rq.completion);
9445 +
9446 + if (rq.written)
9447 + /* Eventually ENTD has written the page to disk. */
9448 + return 0;
9449 + return 0;
9450 +}
9451 +
9452 +int wbq_available(void)
9453 +{
9454 + struct super_block *sb = reiser4_get_current_sb();
9455 + entd_context *ent = get_entd_context(sb);
9456 + return ent->nr_todo_reqs;
9457 +}
9458 +
9459 +/*
9460 + * Local variables:
9461 + * c-indentation-style: "K&R"
9462 + * mode-name: "LC"
9463 + * c-basic-offset: 8
9464 + * tab-width: 8
9465 + * fill-column: 79
9466 + * End:
9467 + */
9468 diff -urN linux-2.6.23.orig/fs/reiser4/entd.h linux-2.6.23/fs/reiser4/entd.h
9469 --- linux-2.6.23.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9470 +++ linux-2.6.23/fs/reiser4/entd.h 2007-12-04 16:49:30.000000000 +0300
9471 @@ -0,0 +1,90 @@
9472 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9473 +
9474 +/* Ent daemon. */
9475 +
9476 +#ifndef __ENTD_H__
9477 +#define __ENTD_H__
9478 +
9479 +#include "context.h"
9480 +
9481 +#include <linux/fs.h>
9482 +#include <linux/completion.h>
9483 +#include <linux/wait.h>
9484 +#include <linux/spinlock.h>
9485 +#include <linux/sched.h> /* for struct task_struct */
9486 +
9487 +#define WBQ_MAGIC 0x7876dc76
9488 +
9489 +/* write-back request. */
9490 +struct wbq {
9491 + int magic;
9492 + struct list_head link; /* list head of this list is in entd context */
9493 + struct writeback_control *wbc;
9494 + struct page *page;
9495 + struct address_space *mapping;
9496 + struct completion completion;
9497 + jnode *node; /* set if ent thread captured requested page */
9498 + int written; /* set if ent thread wrote requested page */
9499 +};
9500 +
9501 +/* ent-thread context. This is used to synchronize starting/stopping ent
9502 + * threads. */
9503 +typedef struct entd_context {
9504 + /* wait queue that ent thread waits on for more work. It's
9505 + * signaled by write_page_by_ent(). */
9506 + wait_queue_head_t wait;
9507 + /* spinlock protecting other fields */
9508 + spinlock_t guard;
9509 + /* ent thread */
9510 + struct task_struct *tsk;
9511 + /* set to indicate that ent thread should leave. */
9512 + int done;
9513 + /* counter of active flushers */
9514 + int flushers;
9515 + /*
9516 + * when reiser4_writepage asks entd to write a page - it adds struct
9517 + * wbq to this list
9518 + */
9519 + struct list_head todo_list;
9520 + /* number of elements on the above list */
9521 + int nr_todo_reqs;
9522 +
9523 + struct wbq *cur_request;
9524 + /*
9525 + * when entd writes a page it moves write-back request from todo_list
9526 + * to done_list. This list is used at the end of entd iteration to
9527 + * wakeup requestors and iput inodes.
9528 + */
9529 + struct list_head done_list;
9530 + /* number of elements on the above list */
9531 + int nr_done_reqs;
9532 +
9533 +#if REISER4_DEBUG
9534 + /* list of all active flushers */
9535 + struct list_head flushers_list;
9536 +#endif
9537 +} entd_context;
9538 +
9539 +extern int reiser4_init_entd(struct super_block *);
9540 +extern void reiser4_done_entd(struct super_block *);
9541 +
9542 +extern void reiser4_enter_flush(struct super_block *);
9543 +extern void reiser4_leave_flush(struct super_block *);
9544 +
9545 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9546 +extern int wbq_available(void);
9547 +extern void ent_writes_page(struct super_block *, struct page *);
9548 +
9549 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9550 +/* __ENTD_H__ */
9551 +#endif
9552 +
9553 +/* Make Linus happy.
9554 + Local variables:
9555 + c-indentation-style: "K&R"
9556 + mode-name: "LC"
9557 + c-basic-offset: 8
9558 + tab-width: 8
9559 + fill-column: 120
9560 + End:
9561 +*/
9562 diff -urN linux-2.6.23.orig/fs/reiser4/eottl.c linux-2.6.23/fs/reiser4/eottl.c
9563 --- linux-2.6.23.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9564 +++ linux-2.6.23/fs/reiser4/eottl.c 2007-12-04 16:49:30.000000000 +0300
9565 @@ -0,0 +1,509 @@
9566 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9567 +
9568 +#include "forward.h"
9569 +#include "debug.h"
9570 +#include "key.h"
9571 +#include "coord.h"
9572 +#include "plugin/item/item.h"
9573 +#include "plugin/node/node.h"
9574 +#include "znode.h"
9575 +#include "block_alloc.h"
9576 +#include "tree_walk.h"
9577 +#include "tree_mod.h"
9578 +#include "carry.h"
9579 +#include "tree.h"
9580 +#include "super.h"
9581 +
9582 +#include <linux/types.h> /* for __u?? */
9583 +
9584 +/*
9585 + * Extents on the twig level (EOTTL) handling.
9586 + *
9587 + * EOTTL poses some problems to the tree traversal, that are better explained
9588 + * by example.
9589 + *
9590 + * Suppose we have block B1 on the twig level with the following items:
9591 + *
9592 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9593 + * offset)
9594 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9595 + * 2. internal item I2 with key (10:0:0:0)
9596 + *
9597 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9598 + * then intra-node lookup is done. This lookup finished on the E1, because the
9599 + * key we are looking for is larger than the key of E1 and is smaller than key
9600 + * the of I2.
9601 + *
9602 + * Here search is stuck.
9603 + *
9604 + * After some thought it is clear what is wrong here: extents on the twig level
9605 + * break some basic property of the *search* tree (on the pretext, that they
9606 + * restore property of balanced tree).
9607 + *
9608 + * Said property is the following: if in the internal node of the search tree
9609 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9610 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9611 + * through the Pointer.
9612 + *
9613 + * This is not true, when Pointer is Extent-Pointer, simply because extent
9614 + * cannot expand indefinitely to the right to include any item with
9615 + *
9616 + * Key1 <= Key <= Key2.
9617 + *
9618 + * For example, our E1 extent is only responsible for the data with keys
9619 + *
9620 + * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9621 + *
9622 + * so, key range
9623 + *
9624 + * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9625 + *
9626 + * is orphaned: there is no way to get there from the tree root.
9627 + *
9628 + * In other words, extent pointers are different than normal child pointers as
9629 + * far as search tree is concerned, and this creates such problems.
9630 + *
9631 + * Possible solution for this problem is to insert our item into node pointed
9632 + * to by I2. There are some problems through:
9633 + *
9634 + * (1) I2 can be in a different node.
9635 + * (2) E1 can be immediately followed by another extent E2.
9636 + *
9637 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9638 + * for locks/coords as necessary.
9639 + *
9640 + * (2) is more complex. Solution here is to insert new empty leaf node and
9641 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9642 + * further complicated by possibility that E2 is in a different node, etc.
9643 + *
9644 + * Problems:
9645 + *
9646 + * (1) if there was internal item I2 immediately on the right of an extent E1
9647 + * and we decided to insert new item S1 into node N2 pointed to by I2, then
9648 + * key of S1 will be less than smallest key in the N2. Normally, search key
9649 + * checks that key we are looking for is in the range of keys covered by the
9650 + * node key is being looked in. To work around this situation, while
9651 + * preserving useful consistency check new flag CBK_TRUST_DK was added to the
9652 + * cbk flags bitmask. This flag is automatically set on entrance to the
9653 + * coord_by_key() and is only cleared when we are about to enter situation
9654 + * described above.
9655 + *
9656 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9657 + * searching for the key that is between E1 and E2 we only have to insert new
9658 + * empty leaf node when coord_by_key was called for insertion, rather than just
9659 + * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
9660 + * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9661 + * performed by insert_by_key() and friends.
9662 + *
9663 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9664 + * case it requires modification of node content which is only possible under
9665 + * write lock. It may well happen that we only have read lock on the node where
9666 + * new internal pointer is to be inserted (common case: lookup of non-existent
9667 + * stat-data that falls between two extents). If only read lock is held, tree
9668 + * traversal is restarted with lock_level modified so that next time we hit
9669 + * this problem, write lock will be held. Once we have write lock, balancing
9670 + * will be performed.
9671 + */
9672 +
9673 +/**
9674 + * is_next_item_internal - check whether next item is internal
9675 + * @coord: coordinate of extent item in twig node
9676 + * @key: search key
9677 + * @lh: twig node lock handle
9678 + *
9679 + * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9680 + * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9681 + * to that node, @coord is set to its first unit. If next item is not internal
9682 + * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9683 + * is returned if search restart has to be done.
9684 + */
9685 +static int
9686 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
9687 + lock_handle *lh)
9688 +{
9689 + coord_t next;
9690 + lock_handle rn;
9691 + int result;
9692 +
9693 + coord_dup(&next, coord);
9694 + if (coord_next_unit(&next) == 0) {
9695 + /* next unit is in this node */
9696 + if (item_is_internal(&next)) {
9697 + coord_dup(coord, &next);
9698 + return 1;
9699 + }
9700 + assert("vs-3", item_is_extent(&next));
9701 + return 0;
9702 + }
9703 +
9704 + /*
9705 + * next unit either does not exist or is in right neighbor. If it is in
9706 + * right neighbor we have to check right delimiting key because
9707 + * concurrent thread could get there first and insert item with a key
9708 + * smaller than @key
9709 + */
9710 + read_lock_dk(current_tree);
9711 + result = keycmp(key, znode_get_rd_key(coord->node));
9712 + read_unlock_dk(current_tree);
9713 + assert("vs-6", result != EQUAL_TO);
9714 + if (result == GREATER_THAN)
9715 + return 2;
9716 +
9717 + /* lock right neighbor */
9718 + init_lh(&rn);
9719 + result = reiser4_get_right_neighbor(&rn, coord->node,
9720 + znode_is_wlocked(coord->node) ?
9721 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9722 + GN_CAN_USE_UPPER_LEVELS);
9723 + if (result == -E_NO_NEIGHBOR) {
9724 + /* we are on the rightmost edge of the tree */
9725 + done_lh(&rn);
9726 + return 0;
9727 + }
9728 +
9729 + if (result) {
9730 + assert("vs-4", result < 0);
9731 + done_lh(&rn);
9732 + return result;
9733 + }
9734 +
9735 + /*
9736 + * check whether concurrent thread managed to insert item with a key
9737 + * smaller than @key
9738 + */
9739 + read_lock_dk(current_tree);
9740 + result = keycmp(key, znode_get_ld_key(rn.node));
9741 + read_unlock_dk(current_tree);
9742 + assert("vs-6", result != EQUAL_TO);
9743 + if (result == GREATER_THAN) {
9744 + done_lh(&rn);
9745 + return 2;
9746 + }
9747 +
9748 + result = zload(rn.node);
9749 + if (result) {
9750 + assert("vs-5", result < 0);
9751 + done_lh(&rn);
9752 + return result;
9753 + }
9754 +
9755 + coord_init_first_unit(&next, rn.node);
9756 + if (item_is_internal(&next)) {
9757 + /*
9758 + * next unit is in right neighbor and it is an unit of internal
9759 + * item. Unlock coord->node. Move @lh to right neighbor. @coord
9760 + * is set to the first unit of right neighbor.
9761 + */
9762 + coord_dup(coord, &next);
9763 + zrelse(rn.node);
9764 + done_lh(lh);
9765 + move_lh(lh, &rn);
9766 + return 1;
9767 + }
9768 +
9769 + /*
9770 + * next unit is unit of extent item. Return without changing @lh and
9771 + * @coord.
9772 + */
9773 + assert("vs-6", item_is_extent(&next));
9774 + zrelse(rn.node);
9775 + done_lh(&rn);
9776 + return 0;
9777 +}
9778 +
9779 +/**
9780 + * rd_key - calculate key of an item next to the given one
9781 + * @coord: position in a node
9782 + * @key: storage for result key
9783 + *
9784 + * @coord is set between items or after the last item in a node. Calculate key
9785 + * of item to the right of @coord.
9786 + */
9787 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9788 +{
9789 + coord_t dup;
9790 +
9791 + assert("nikita-2281", coord_is_between_items(coord));
9792 + coord_dup(&dup, coord);
9793 +
9794 + if (coord_set_to_right(&dup) == 0)
9795 + /* next item is in this node. Return its key. */
9796 + unit_key_by_coord(&dup, key);
9797 + else {
9798 + /*
9799 + * next item either does not exist or is in right
9800 + * neighbor. Return znode's right delimiting key.
9801 + */
9802 + read_lock_dk(current_tree);
9803 + *key = *znode_get_rd_key(coord->node);
9804 + read_unlock_dk(current_tree);
9805 + }
9806 + return key;
9807 +}
9808 +
9809 +/**
9810 + * add_empty_leaf - insert empty leaf between two extents
9811 + * @insert_coord: position in twig node between two extents
9812 + * @lh: twig node lock handle
9813 + * @key: left delimiting key of new node
9814 + * @rdkey: right delimiting key of new node
9815 + *
9816 + * Inserts empty leaf node between two extent items. It is necessary when we
9817 + * have to insert an item on leaf level between two extents (items on the twig
9818 + * level).
9819 + */
9820 +static int
9821 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9822 + const reiser4_key *key, const reiser4_key *rdkey)
9823 +{
9824 + int result;
9825 + carry_pool *pool;
9826 + carry_level *todo;
9827 + reiser4_item_data *item;
9828 + carry_insert_data *cdata;
9829 + carry_op *op;
9830 + znode *node;
9831 + reiser4_tree *tree;
9832 +
9833 + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9834 + tree = znode_get_tree(insert_coord->node);
9835 + node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9836 + if (IS_ERR(node))
9837 + return PTR_ERR(node);
9838 +
9839 + /* setup delimiting keys for node being inserted */
9840 + write_lock_dk(tree);
9841 + znode_set_ld_key(node, key);
9842 + znode_set_rd_key(node, rdkey);
9843 + ON_DEBUG(node->creator = current);
9844 + ON_DEBUG(node->first_key = *key);
9845 + write_unlock_dk(tree);
9846 +
9847 + ZF_SET(node, JNODE_ORPHAN);
9848 +
9849 + /*
9850 + * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9851 + * carry_insert_data
9852 + */
9853 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9854 + sizeof(*item) + sizeof(*cdata));
9855 + if (IS_ERR(pool))
9856 + return PTR_ERR(pool);
9857 + todo = (carry_level *) (pool + 1);
9858 + init_carry_level(todo, pool);
9859 +
9860 + item = (reiser4_item_data *) (todo + 3);
9861 + cdata = (carry_insert_data *) (item + 1);
9862 +
9863 + op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9864 + if (!IS_ERR(op)) {
9865 + cdata->coord = insert_coord;
9866 + cdata->key = key;
9867 + cdata->data = item;
9868 + op->u.insert.d = cdata;
9869 + op->u.insert.type = COPT_ITEM_DATA;
9870 + build_child_ptr_data(node, item);
9871 + item->arg = NULL;
9872 + /* have @insert_coord to be set at inserted item after
9873 + insertion is done */
9874 + todo->track_type = CARRY_TRACK_CHANGE;
9875 + todo->tracked = lh;
9876 +
9877 + result = reiser4_carry(todo, NULL);
9878 + if (result == 0) {
9879 + /*
9880 + * pin node in memory. This is necessary for
9881 + * znode_make_dirty() below.
9882 + */
9883 + result = zload(node);
9884 + if (result == 0) {
9885 + lock_handle local_lh;
9886 +
9887 + /*
9888 + * if we inserted new child into tree we have
9889 + * to mark it dirty so that flush will be able
9890 + * to process it.
9891 + */
9892 + init_lh(&local_lh);
9893 + result = longterm_lock_znode(&local_lh, node,
9894 + ZNODE_WRITE_LOCK,
9895 + ZNODE_LOCK_LOPRI);
9896 + if (result == 0) {
9897 + znode_make_dirty(node);
9898 +
9899 + /*
9900 + * when internal item pointing to @node
9901 + * was inserted into twig node
9902 + * create_hook_internal did not connect
9903 + * it properly because its right
9904 + * neighbor was not known. Do it
9905 + * here
9906 + */
9907 + write_lock_tree(tree);
9908 + assert("nikita-3312",
9909 + znode_is_right_connected(node));
9910 + assert("nikita-2984",
9911 + node->right == NULL);
9912 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9913 + write_unlock_tree(tree);
9914 + result =
9915 + connect_znode(insert_coord, node);
9916 + ON_DEBUG(if (result == 0) check_dkeys(node););
9917 +
9918 + done_lh(lh);
9919 + move_lh(lh, &local_lh);
9920 + assert("vs-1676", node_is_empty(node));
9921 + coord_init_first_unit(insert_coord,
9922 + node);
9923 + } else {
9924 + warning("nikita-3136",
9925 + "Cannot lock child");
9926 + }
9927 + done_lh(&local_lh);
9928 + zrelse(node);
9929 + }
9930 + }
9931 + } else
9932 + result = PTR_ERR(op);
9933 + zput(node);
9934 + done_carry_pool(pool);
9935 + return result;
9936 +}
9937 +
9938 +/**
9939 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9940 + * @h: search handle
9941 + * @outcome: flag saying whether search has to restart or is done
9942 + *
9943 + * Handles search on twig level. If this function completes search itself then
9944 + * it returns 1. If search has to go one level down then 0 is returned. If
9945 + * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
9946 + * in @h->result.
9947 + */
9948 +int handle_eottl(cbk_handle *h, int *outcome)
9949 +{
9950 + int result;
9951 + reiser4_key key;
9952 + coord_t *coord;
9953 +
9954 + coord = h->coord;
9955 +
9956 + if (h->level != TWIG_LEVEL ||
9957 + (coord_is_existing_item(coord) && item_is_internal(coord))) {
9958 + /* Continue to traverse tree downward. */
9959 + return 0;
9960 + }
9961 +
9962 + /*
9963 + * make sure that @h->coord is set to twig node and that it is either
9964 + * set to extent item or after extent item
9965 + */
9966 + assert("vs-356", h->level == TWIG_LEVEL);
9967 + assert("vs-357", ( {
9968 + coord_t lcoord;
9969 + coord_dup(&lcoord, coord);
9970 + check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9971 + item_is_extent(&lcoord);
9972 + }
9973 + ));
9974 +
9975 + if (*outcome == NS_FOUND) {
9976 + /* we have found desired key on twig level in extent item */
9977 + h->result = CBK_COORD_FOUND;
9978 + *outcome = LOOKUP_DONE;
9979 + return 1;
9980 + }
9981 +
9982 + if (!(h->flags & CBK_FOR_INSERT)) {
9983 + /* tree traversal is not for insertion. Just return
9984 + CBK_COORD_NOTFOUND. */
9985 + h->result = CBK_COORD_NOTFOUND;
9986 + *outcome = LOOKUP_DONE;
9987 + return 1;
9988 + }
9989 +
9990 + /* take a look at the item to the right of h -> coord */
9991 + result = is_next_item_internal(coord, h->key, h->active_lh);
9992 + if (unlikely(result < 0)) {
9993 + h->error = "get_right_neighbor failed";
9994 + h->result = result;
9995 + *outcome = LOOKUP_DONE;
9996 + return 1;
9997 + }
9998 + if (result == 0) {
9999 + /*
10000 + * item to the right is also an extent one. Allocate a new node
10001 + * and insert pointer to it after item h -> coord.
10002 + *
10003 + * This is a result of extents being located at the twig
10004 + * level. For explanation, see comment just above
10005 + * is_next_item_internal().
10006 + */
10007 + znode *loaded;
10008 +
10009 + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10010 + /*
10011 + * we got node read locked, restart coord_by_key to
10012 + * have write lock on twig level
10013 + */
10014 + h->lock_level = TWIG_LEVEL;
10015 + h->lock_mode = ZNODE_WRITE_LOCK;
10016 + *outcome = LOOKUP_REST;
10017 + return 1;
10018 + }
10019 +
10020 + loaded = coord->node;
10021 + result =
10022 + add_empty_leaf(coord, h->active_lh, h->key,
10023 + rd_key(coord, &key));
10024 + if (result) {
10025 + h->error = "could not add empty leaf";
10026 + h->result = result;
10027 + *outcome = LOOKUP_DONE;
10028 + return 1;
10029 + }
10030 + /* added empty leaf is locked (h->active_lh), its parent node
10031 + is unlocked, h->coord is set as EMPTY */
10032 + assert("vs-13", coord->between == EMPTY_NODE);
10033 + assert("vs-14", znode_is_write_locked(coord->node));
10034 + assert("vs-15",
10035 + WITH_DATA(coord->node, node_is_empty(coord->node)));
10036 + assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10037 + assert("vs-17", coord->node == h->active_lh->node);
10038 + *outcome = LOOKUP_DONE;
10039 + h->result = CBK_COORD_NOTFOUND;
10040 + return 1;
10041 + } else if (result == 1) {
10042 + /*
10043 + * this is special case mentioned in the comment on
10044 + * tree.h:cbk_flags. We have found internal item immediately on
10045 + * the right of extent, and we are going to insert new item
10046 + * there. Key of item we are going to insert is smaller than
10047 + * leftmost key in the node pointed to by said internal item
10048 + * (otherwise search wouldn't come to the extent in the first
10049 + * place).
10050 + *
10051 + * This is a result of extents being located at the twig
10052 + * level. For explanation, see comment just above
10053 + * is_next_item_internal().
10054 + */
10055 + h->flags &= ~CBK_TRUST_DK;
10056 + } else {
10057 + assert("vs-8", result == 2);
10058 + *outcome = LOOKUP_REST;
10059 + return 1;
10060 + }
10061 + assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10062 + return 0;
10063 +}
10064 +
10065 +/*
10066 + * Local variables:
10067 + * c-indentation-style: "K&R"
10068 + * mode-name: "LC"
10069 + * c-basic-offset: 8
10070 + * tab-width: 8
10071 + * fill-column: 120
10072 + * scroll-step: 1
10073 + * End:
10074 + */
10075 diff -urN linux-2.6.23.orig/fs/reiser4/estimate.c linux-2.6.23/fs/reiser4/estimate.c
10076 --- linux-2.6.23.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10077 +++ linux-2.6.23/fs/reiser4/estimate.c 2007-12-04 16:49:30.000000000 +0300
10078 @@ -0,0 +1,120 @@
10079 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10080 +
10081 +#include "debug.h"
10082 +#include "dformat.h"
10083 +#include "tree.h"
10084 +#include "carry.h"
10085 +#include "inode.h"
10086 +#include "plugin/cluster.h"
10087 +#include "plugin/item/ctail.h"
10088 +
10089 +/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied
10090 +
10091 + Amount of internals which will get dirty or get allocated we estimate as 5% of the childs + 1 balancing. 1 balancing
10092 + is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1
10093 + neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for
10094 + leaf level, 3 for twig level, 2 on upper + 1 for root.
10095 +
10096 + Do not calculate the current node of the lowest level here - this is overhead only.
10097 +
10098 + children is almost always 1 here. Exception is flow insertion
10099 +*/
10100 +static reiser4_block_nr
10101 +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
10102 +{
10103 + reiser4_block_nr ten_percent;
10104 +
10105 + ten_percent = ((103 * childen) >> 10);
10106 +
10107 + /* If we have too many balancings at the time, tree height can raise on more
10108 + then 1. Assume that if tree_height is 5, it can raise on 1 only. */
10109 + return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10110 +}
10111 +
10112 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10113 + perform insertion of one item into the tree */
10114 +/* it is only called when tree height changes, or gets initialized */
10115 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10116 +{
10117 + return 1 + max_balance_overhead(1, height);
10118 +}
10119 +
10120 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10121 +{
10122 + return tree->estimate_one_insert;
10123 +}
10124 +
10125 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10126 + perform insertion of one unit into an item in the tree */
10127 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10128 +{
10129 + /* estimate insert into item just like item insertion */
10130 + return tree->estimate_one_insert;
10131 +}
10132 +
10133 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10134 +{
10135 + /* on item removal reiser4 does not try to pack nodes more complact, so, only one node may be dirtied on leaf
10136 + level */
10137 + return tree->estimate_one_insert;
10138 +}
10139 +
10140 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10141 + both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10142 + levels */
10143 +reiser4_block_nr estimate_insert_flow(tree_level height)
10144 +{
10145 + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10146 + CARRY_FLOW_NEW_NODES_LIMIT,
10147 + height);
10148 +}
10149 +
10150 +/* returnes max number of nodes can be occupied by disk cluster */
10151 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10152 +{
10153 + int per_cluster;
10154 + per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10155 + return 3 + per_cluster +
10156 + max_balance_overhead(3 + per_cluster,
10157 + REISER4_MAX_ZTREE_HEIGHT);
10158 +}
10159 +
10160 +/* how many nodes might get dirty and added
10161 + during insertion of a disk cluster */
10162 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10163 +{
10164 + return estimate_cluster(inode, 1); /* 24 */
10165 +}
10166 +
10167 +/* how many nodes might get dirty and added
10168 + during update of a (prepped or unprepped) disk cluster */
10169 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10170 +{
10171 + return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10172 +}
10173 +
10174 +/* How many nodes occupied by a disk cluster might get dirty.
10175 + Note that this estimation is not precise (i.e. disk cluster
10176 + can occupy more nodes).
10177 + Q: Why we don't use precise estimation?
10178 + A: 1.Because precise estimation is fairly bad: 65536 nodes
10179 + for 64K logical cluster, it means 256M of dead space on
10180 + a partition
10181 + 2.It is a very rare case when disk cluster occupies more
10182 + nodes then this estimation returns.
10183 +*/
10184 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10185 +{
10186 + return cluster_nrpages(inode) + 4;
10187 +}
10188 +
10189 +/* Make Linus happy.
10190 + Local variables:
10191 + c-indentation-style: "K&R"
10192 + mode-name: "LC"
10193 + c-basic-offset: 8
10194 + tab-width: 8
10195 + fill-column: 120
10196 + scroll-step: 1
10197 + End:
10198 +*/
10199 diff -urN linux-2.6.23.orig/fs/reiser4/export_ops.c linux-2.6.23/fs/reiser4/export_ops.c
10200 --- linux-2.6.23.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10201 +++ linux-2.6.23/fs/reiser4/export_ops.c 2007-12-04 22:59:05.774363742 +0300
10202 @@ -0,0 +1,297 @@
10203 +/* Copyright 2005 by Hans Reiser, licensing governed by
10204 + * reiser4/README */
10205 +
10206 +#include "inode.h"
10207 +#include "plugin/plugin.h"
10208 +
10209 +/*
10210 + * Supported file-handle types
10211 + */
10212 +typedef enum {
10213 + FH_WITH_PARENT = 0x10, /* file handle with parent */
10214 + FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10215 +} reiser4_fhtype;
10216 +
10217 +#define NFSERROR (255)
10218 +
10219 +/* initialize place-holder for object */
10220 +static void object_on_wire_init(reiser4_object_on_wire *o)
10221 +{
10222 + o->plugin = NULL;
10223 +}
10224 +
10225 +/* finish with @o */
10226 +static void object_on_wire_done(reiser4_object_on_wire *o)
10227 +{
10228 + if (o->plugin != NULL)
10229 + o->plugin->wire.done(o);
10230 +}
10231 +
10232 +/*
10233 + * read serialized object identity from @addr and store information about
10234 + * object in @obj. This is dual to encode_inode().
10235 + */
10236 +static char *decode_inode(struct super_block *s, char *addr,
10237 + reiser4_object_on_wire * obj)
10238 +{
10239 + file_plugin *fplug;
10240 +
10241 + /* identifier of object plugin is stored in the first two bytes,
10242 + * followed by... */
10243 + fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10244 + if (fplug != NULL) {
10245 + addr += sizeof(d16);
10246 + obj->plugin = fplug;
10247 + assert("nikita-3520", fplug->wire.read != NULL);
10248 + /* plugin specific encoding of object identity. */
10249 + addr = fplug->wire.read(addr, obj);
10250 + } else
10251 + addr = ERR_PTR(RETERR(-EINVAL));
10252 + return addr;
10253 +}
10254 +
10255 +/**
10256 + * reiser4_decode_fh - decode_fh of export operations
10257 + * @super: super block
10258 + * @fh: nfsd file handle
10259 + * @len: length of file handle
10260 + * @fhtype: type of file handle
10261 + * @acceptable: acceptability testing function
10262 + * @context: argument for @acceptable
10263 + *
10264 + * Returns dentry referring to the same file as @fh.
10265 + */
10266 +static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10267 + int len, int fhtype,
10268 + int (*acceptable) (void *context,
10269 + struct dentry *de),
10270 + void *context)
10271 +{
10272 + reiser4_context *ctx;
10273 + reiser4_object_on_wire object;
10274 + reiser4_object_on_wire parent;
10275 + char *addr;
10276 + int with_parent;
10277 +
10278 + ctx = reiser4_init_context(super);
10279 + if (IS_ERR(ctx))
10280 + return (struct dentry *)ctx;
10281 +
10282 + assert("vs-1482",
10283 + fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10284 +
10285 + with_parent = (fhtype == FH_WITH_PARENT);
10286 +
10287 + addr = (char *)fh;
10288 +
10289 + object_on_wire_init(&object);
10290 + object_on_wire_init(&parent);
10291 +#if 0
10292 + addr = decode_inode(super, addr, &object);
10293 + if (!IS_ERR(addr)) {
10294 + if (with_parent)
10295 + addr = decode_inode(super, addr, &parent);
10296 + if (!IS_ERR(addr)) {
10297 + struct dentry *d;
10298 + typeof(super->s_export_op->find_exported_dentry) fn;
10299 +
10300 + fn = super->s_export_op->find_exported_dentry;
10301 + assert("nikita-3521", fn != NULL);
10302 + d = fn(super, &object, with_parent ? &parent : NULL,
10303 + acceptable, context);
10304 + if (d != NULL && !IS_ERR(d))
10305 + /* FIXME check for -ENOMEM */
10306 + reiser4_get_dentry_fsdata(d)->stateless = 1;
10307 + addr = (char *)d;
10308 + }
10309 + }
10310 + object_on_wire_done(&object);
10311 + object_on_wire_done(&parent);
10312 +
10313 + reiser4_exit_context(ctx);
10314 + return (void *)addr;
10315 +#else
10316 + return ERR_PTR(-EINVAL);
10317 +#endif
10318 +}
10319 +
10320 +/*
10321 + * Object serialization support.
10322 + *
10323 + * To support knfsd file system provides export_operations that are used to
10324 + * construct and interpret NFS file handles. As a generalization of this,
10325 + * reiser4 object plugins have serialization support: it provides methods to
10326 + * create on-wire representation of identity of reiser4 object, and
10327 + * re-create/locate object given its on-wire identity.
10328 + *
10329 + */
10330 +
10331 +/*
10332 + * return number of bytes that on-wire representation of @inode's identity
10333 + * consumes.
10334 + */
10335 +static int encode_inode_size(struct inode *inode)
10336 +{
10337 + assert("nikita-3514", inode != NULL);
10338 + assert("nikita-3515", inode_file_plugin(inode) != NULL);
10339 + assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10340 +
10341 + return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10342 +}
10343 +
10344 +/*
10345 + * store on-wire representation of @inode's identity at the area beginning at
10346 + * @start.
10347 + */
10348 +static char *encode_inode(struct inode *inode, char *start)
10349 +{
10350 + assert("nikita-3517", inode != NULL);
10351 + assert("nikita-3518", inode_file_plugin(inode) != NULL);
10352 + assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10353 +
10354 + /*
10355 + * first, store two-byte identifier of object plugin, then
10356 + */
10357 + save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10358 + (d16 *) start);
10359 + start += sizeof(d16);
10360 + /*
10361 + * call plugin to serialize object's identity
10362 + */
10363 + return inode_file_plugin(inode)->wire.write(inode, start);
10364 +}
10365 +
10366 +/* this returns number of 32 bit long numbers encoded in @lenp. 255 is
10367 + * returned if file handle can not be stored */
10368 +/**
10369 + * reiser4_encode_fh - encode_fh of export operations
10370 + * @dentry:
10371 + * @fh:
10372 + * @lenp:
10373 + * @need_parent:
10374 + *
10375 + */
10376 +static int
10377 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10378 + int need_parent)
10379 +{
10380 + struct inode *inode;
10381 + struct inode *parent;
10382 + char *addr;
10383 + int need;
10384 + int delta;
10385 + int result;
10386 + reiser4_context *ctx;
10387 +
10388 + /*
10389 + * knfsd asks as to serialize object in @dentry, and, optionally its
10390 + * parent (if need_parent != 0).
10391 + *
10392 + * encode_inode() and encode_inode_size() is used to build
10393 + * representation of object and its parent. All hard work is done by
10394 + * object plugins.
10395 + */
10396 + inode = dentry->d_inode;
10397 + parent = dentry->d_parent->d_inode;
10398 +
10399 + addr = (char *)fh;
10400 +
10401 + need = encode_inode_size(inode);
10402 + if (need < 0)
10403 + return NFSERROR;
10404 + if (need_parent) {
10405 + delta = encode_inode_size(parent);
10406 + if (delta < 0)
10407 + return NFSERROR;
10408 + need += delta;
10409 + }
10410 +
10411 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
10412 + if (IS_ERR(ctx))
10413 + return PTR_ERR(ctx);
10414 +
10415 + if (need <= sizeof(__u32) * (*lenp)) {
10416 + addr = encode_inode(inode, addr);
10417 + if (need_parent)
10418 + addr = encode_inode(parent, addr);
10419 +
10420 + /* store in lenp number of 32bit words required for file
10421 + * handle. */
10422 + *lenp = (need + sizeof(__u32) - 1) >> 2;
10423 + result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10424 + } else
10425 + /* no enough space in file handle */
10426 + result = NFSERROR;
10427 + reiser4_exit_context(ctx);
10428 + return result;
10429 +}
10430 +
10431 +/**
10432 + * reiser4_get_dentry_parent - get_parent of export operations
10433 + * @child:
10434 + *
10435 + */
10436 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10437 +{
10438 + struct inode *dir;
10439 + dir_plugin *dplug;
10440 +
10441 + assert("nikita-3527", child != NULL);
10442 + /* see comment in reiser4_get_dentry() about following assertion */
10443 + assert("nikita-3528", is_in_reiser4_context());
10444 +
10445 + dir = child->d_inode;
10446 + assert("nikita-3529", dir != NULL);
10447 + dplug = inode_dir_plugin(dir);
10448 + assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10449 + if (dplug != NULL)
10450 + return dplug->get_parent(dir);
10451 + else
10452 + return ERR_PTR(RETERR(-ENOTDIR));
10453 +}
10454 +
10455 +/**
10456 + * reiser4_get_dentry - get_dentry of export operations
10457 + * @super:
10458 + * @data:
10459 + *
10460 + *
10461 + */
10462 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10463 +{
10464 + reiser4_object_on_wire *o;
10465 +
10466 + assert("nikita-3522", super != NULL);
10467 + assert("nikita-3523", data != NULL);
10468 + /*
10469 + * this is only supposed to be called by
10470 + *
10471 + * reiser4_decode_fh->find_exported_dentry
10472 + *
10473 + * so, reiser4_context should be here already.
10474 + */
10475 + assert("nikita-3526", is_in_reiser4_context());
10476 +
10477 + o = (reiser4_object_on_wire *)data;
10478 + assert("nikita-3524", o->plugin != NULL);
10479 + assert("nikita-3525", o->plugin->wire.get != NULL);
10480 +
10481 + return o->plugin->wire.get(super, o);
10482 +}
10483 +
10484 +struct export_operations reiser4_export_operations = {
10485 + .encode_fh = reiser4_encode_fh,
10486 +// .decode_fh = reiser4_decode_fh,
10487 + .get_parent = reiser4_get_dentry_parent,
10488 +// .get_dentry = reiser4_get_dentry
10489 +};
10490 +
10491 +/*
10492 + * Local variables:
10493 + * c-indentation-style: "K&R"
10494 + * mode-name: "LC"
10495 + * c-basic-offset: 8
10496 + * tab-width: 8
10497 + * fill-column: 79
10498 + * End:
10499 + */
10500 diff -urN linux-2.6.23.orig/fs/reiser4/flush.c linux-2.6.23/fs/reiser4/flush.c
10501 --- linux-2.6.23.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10502 +++ linux-2.6.23/fs/reiser4/flush.c 2007-12-04 16:49:30.000000000 +0300
10503 @@ -0,0 +1,3625 @@
10504 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10505 +
10506 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10507 +
10508 +#include "forward.h"
10509 +#include "debug.h"
10510 +#include "dformat.h"
10511 +#include "key.h"
10512 +#include "coord.h"
10513 +#include "plugin/item/item.h"
10514 +#include "plugin/plugin.h"
10515 +#include "plugin/object.h"
10516 +#include "txnmgr.h"
10517 +#include "jnode.h"
10518 +#include "znode.h"
10519 +#include "block_alloc.h"
10520 +#include "tree_walk.h"
10521 +#include "carry.h"
10522 +#include "tree.h"
10523 +#include "vfs_ops.h"
10524 +#include "inode.h"
10525 +#include "page_cache.h"
10526 +#include "wander.h"
10527 +#include "super.h"
10528 +#include "entd.h"
10529 +#include "reiser4.h"
10530 +#include "flush.h"
10531 +#include "writeout.h"
10532 +
10533 +#include <asm/atomic.h>
10534 +#include <linux/fs.h> /* for struct super_block */
10535 +#include <linux/mm.h> /* for struct page */
10536 +#include <linux/bio.h> /* for struct bio */
10537 +#include <linux/pagemap.h>
10538 +#include <linux/blkdev.h>
10539 +
10540 +/* IMPLEMENTATION NOTES */
10541 +
10542 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10543 + order to the nodes of the tree in which the parent is placed before its children, which
10544 + are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10545 + describes the node that "came before in forward parent-first order". When we speak of a
10546 + "parent-first follower", it describes the node that "comes next in parent-first
10547 + order" (alternatively the node that "came before in reverse parent-first order").
10548 +
10549 + The following pseudo-code prints the nodes of a tree in forward parent-first order:
10550 +
10551 + void parent_first (node)
10552 + {
10553 + print_node (node);
10554 + if (node->level > leaf) {
10555 + for (i = 0; i < num_children; i += 1) {
10556 + parent_first (node->child[i]);
10557 + }
10558 + }
10559 + }
10560 +*/
10561 +
10562 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10563 + that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10564 + can be accomplished with sequential reads, which results in reading nodes in their
10565 + parent-first order. This is a read-optimization aspect of the flush algorithm, and
10566 + there is also a write-optimization aspect, which is that we wish to make large
10567 + sequential writes to the disk by allocating or reallocating blocks so that they can be
10568 + written in sequence. Sometimes the read-optimization and write-optimization goals
10569 + conflict with each other, as we discuss in more detail below.
10570 +*/
10571 +
10572 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10573 + the relevant jnode->state bits and their relevence to flush:
10574 +
10575 + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10576 + must be allocated first. In order to be considered allocated, the jnode must have
10577 + exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10578 + all dirtied jnodes eventually have one of these bits set during each transaction.
10579 +
10580 + JNODE_CREATED: The node was freshly created in its transaction and has no previous
10581 + block address, so it is unconditionally assigned to be relocated, although this is
10582 + mainly for code-convenience. It is not being 'relocated' from anything, but in
10583 + almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10584 + remains set even after JNODE_RELOC is set, so the actual relocate can be
10585 + distinguished from the created-and-allocated set easily: relocate-set members
10586 + (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10587 + have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10588 +
10589 + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10590 + decision to maintain the pre-existing location for this node and it will be written
10591 + to the wandered-log.
10592 +
10593 + JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10594 + not created, see note above). A block with JNODE_RELOC set is eligible for
10595 + early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10596 + bit is set on a znode, the parent node's internal item is modified and the znode is
10597 + rehashed.
10598 +
10599 + JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10600 + and calls plugin->f.squeeze() method for its items. By this technology we update disk
10601 + clusters of cryptcompress objects. Also if leftmost point that was found by flush scan
10602 + has this flag (races with write(), rare case) the flush algorythm makes the decision
10603 + to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
10604 + repeated allocation.
10605 +
10606 + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10607 + flush queue. This means the jnode is not on any clean or dirty list, instead it is
10608 + moved to one of the flush queue (see flush_queue.h) object private list. This
10609 + prevents multiple concurrent flushes from attempting to start flushing from the
10610 + same node.
10611 +
10612 + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10613 + squeeze-and-allocate on a node while its children are actively being squeezed and
10614 + allocated. This flag was created to avoid submitting a write request for a node
10615 + while its children are still being allocated and squeezed. Then flush queue was
10616 + re-implemented to allow unlimited number of nodes be queued. This flag support was
10617 + commented out in source code because we decided that there was no reason to submit
10618 + queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10619 + during a slum traversal and may submit "busy nodes" to disk. Probably we can
10620 + re-enable the JNODE_FLUSH_BUSY bit support in future.
10621 +
10622 + With these state bits, we describe a test used frequently in the code below,
10623 + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10624 + test for "flushprepped" returns true if any of the following are true:
10625 +
10626 + - The node is not dirty
10627 + - The node has JNODE_RELOC set
10628 + - The node has JNODE_OVRWR set
10629 +
10630 + If either the node is not dirty or it has already been processed by flush (and assigned
10631 + JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10632 + true then flush has work to do on that node.
10633 +*/
10634 +
10635 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10636 + flushprepped twice (unless an explicit call to flush_unprep is made as described in
10637 + detail below). For example a node is dirtied, allocated, and then early-flushed to
10638 + disk and set clean. Before the transaction commits, the page is dirtied again and, due
10639 + to memory pressure, the node is flushed again. The flush algorithm will not relocate
10640 + the node to a new disk location, it will simply write it to the same, previously
10641 + relocated position again.
10642 +*/
10643 +
10644 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10645 + start at a leaf node and allocate in parent-first order by iterating to the right. At
10646 + each step of the iteration, we check for the right neighbor. Before advancing to the
10647 + right neighbor, we check if the current position and the right neighbor share the same
10648 + parent. If they do not share the same parent, the parent is allocated before the right
10649 + neighbor.
10650 +
10651 + This process goes recursively up the tree and squeeze nodes level by level as long as
10652 + the right neighbor and the current position have different parents, then it allocates
10653 + the right-neighbors-with-different-parents on the way back down. This process is
10654 + described in more detail in flush_squalloc_changed_ancestor and the recursive function
10655 + squalloc_one_changed_ancestor. But the purpose here is not to discuss the
10656 + specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
10657 + approaches.
10658 +
10659 + The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10660 + approach, we find a starting point by scanning left along each level past dirty nodes,
10661 + then going up and repeating the process until the left node and the parent node are
10662 + clean. We then perform a parent-first traversal from the starting point, which makes
10663 + allocating in parent-first order trivial. After one subtree has been allocated in this
10664 + manner, we move to the right, try moving upward, then repeat the parent-first
10665 + traversal.
10666 +
10667 + Both approaches have problems that need to be addressed. Both are approximately the
10668 + same amount of code, but the bottom-up approach has advantages in the order it acquires
10669 + locks which, at the very least, make it the better approach. At first glance each one
10670 + makes the other one look simpler, so it is important to remember a few of the problems
10671 + with each one.
10672 +
10673 + Main problem with the top-down approach: When you encounter a clean child during the
10674 + parent-first traversal, what do you do? You would like to avoid searching through a
10675 + large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10676 + obvious solution. One of the advantages of the top-down approach is that during the
10677 + parent-first traversal you check every child of a parent to see if it is dirty. In
10678 + this way, the top-down approach easily handles the main problem of the bottom-up
10679 + approach: unallocated children.
10680 +
10681 + The unallocated children problem is that before writing a node to disk we must make
10682 + sure that all of its children are allocated. Otherwise, the writing the node means
10683 + extra I/O because the node will have to be written again when the child is finally
10684 + allocated.
10685 +
10686 + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10687 + should not cause any file system corruption, it only degrades I/O performance because a
10688 + node may be written when it is sure to be written at least one more time in the same
10689 + transaction when the remaining children are allocated. What follows is a description
10690 + of how we will solve the problem.
10691 +*/
10692 +
10693 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10694 + proceeding in parent first order, allocate some of its left-children, then encounter a
10695 + clean child in the middle of the parent. We do not allocate the clean child, but there
10696 + may remain unallocated (dirty) children to the right of the clean child. If we were to
10697 + stop flushing at this moment and write everything to disk, the parent might still
10698 + contain unallocated children.
10699 +
10700 + We could try to allocate all the descendents of every node that we allocate, but this
10701 + is not necessary. Doing so could result in allocating the entire tree: if the root
10702 + node is allocated then every unallocated node would have to be allocated before
10703 + flushing. Actually, we do not have to write a node just because we allocate it. It is
10704 + possible to allocate but not write a node during flush, when it still has unallocated
10705 + children. However, this approach is probably not optimal for the following reason.
10706 +
10707 + The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10708 + to optimize reads that occur in the same order. Thus we are read-optimizing for a
10709 + left-to-right scan through all the leaves in the system, and we are hoping to
10710 + write-optimize at the same time because those nodes will be written together in batch.
10711 + What happens, however, if we assign a block number to a node in its read-optimized
10712 + order but then avoid writing it because it has unallocated children? In that
10713 + situation, we lose out on the write-optimization aspect because a node will have to be
10714 + written again to the its location on the device, later, which likely means seeking back
10715 + to that location.
10716 +
10717 + So there are tradeoffs. We can choose either:
10718 +
10719 + A. Allocate all unallocated children to preserve both write-optimization and
10720 + read-optimization, but this is not always desirable because it may mean having to
10721 + allocate and flush very many nodes at once.
10722 +
10723 + B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10724 + but sacrifice write-optimization because those nodes will be written again.
10725 +
10726 + C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10727 + locations. Instead, choose to write-optimize them later, when they are written. To
10728 + facilitate this, we "undo" the read-optimized allocation that was given to the node so
10729 + that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10730 + case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10731 + call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10732 + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10733 + location, and set the JNODE_CREATED bit, effectively setting the node back to an
10734 + unallocated state.
10735 +
10736 + We will take the following approach in v4.0: for twig nodes we will always finish
10737 + allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10738 + writing and choose write-optimization (C).
10739 +
10740 + To summarize, there are several parts to a solution that avoids the problem with
10741 + unallocated children:
10742 +
10743 + FIXME-ZAM: Still no one approach is implemented to eliminate the "UNALLOCATED CHILDREN"
10744 + problem because there was an experiment which was done showed that we have 1-2 nodes
10745 + with unallocated children for thousands of written nodes. The experiment was simple
10746 + like coping / deletion of linux kernel sources. However the problem can arise in more
10747 + complex tests. I think we have jnode_io_hook to insert a check for unallocated
10748 + children and see what kind of problem we have.
10749 +
10750 + 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10751 + squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10752 + implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10753 + comments in that function.
10754 +
10755 + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10756 + have unallocated children. If the twig level has unallocated children it is an
10757 + assertion failure. If a higher-level node has unallocated children, then it should be
10758 + explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10759 + should be simple.
10760 +
10761 + 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10762 + CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10763 + this somewhat in the case where large sub-trees are flushed. The following observation
10764 + helps: if both the left- and right-neighbor of a node are processed by the flush
10765 + algorithm then the node itself is guaranteed to have all of its children allocated.
10766 + However, the cost of this check may not be so expensive after all: it is not needed for
10767 + leaves and flush can guarantee this property for twigs. That leaves only (level >
10768 + TWIG) nodes that have to be checked, so this optimization only helps if at least three
10769 + (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10770 + there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10771 + then the number of blocks being written will be very large, so the savings may be
10772 + insignificant. That said, the idea is to maintain both the left and right edges of
10773 + nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10774 + simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10775 + edge, the slow check is necessary, but if it is in the interior then it can be assumed
10776 + to have all of its children allocated. FIXME: medium complexity to implement, but
10777 + simple to verify given that we must have a slow check anyway.
10778 +
10779 + 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10780 + whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10781 + left-scan operation to take unallocated children into account. Normally, the left-scan
10782 + operation goes left as long as adjacent nodes are dirty up until some large maximum
10783 + value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10784 + may stop at a position where there are unallocated children to the left with the same
10785 + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10786 + FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10787 + with a rapid scan. The rapid scan skips all the interior children of a node--if the
10788 + leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10789 + twig to the left). If the left neighbor of the leftmost child is also dirty, then
10790 + continue the scan at the left twig and repeat. This option will cause flush to
10791 + allocate more twigs in a single pass, but it also has the potential to write many more
10792 + nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10793 + was partially implemented, code removed August 12, 2002 by JMACD.
10794 +*/
10795 +
10796 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10797 + starting point for flush is a leaf node, but actually the flush code cares very little
10798 + about whether or not this is true. It is possible that all the leaf nodes are flushed
10799 + and dirty parent nodes still remain, in which case jnode_flush() is called on a
10800 + non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10801 + leaf, even when it is not. This is a simple approach, and there may be a more optimal
10802 + policy but until a problem with this approach is discovered, simplest is probably best.
10803 +
10804 + NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10805 + the leaves. This is done as a matter of simplicity and there is only one (shaky)
10806 + justification. When an atom commits, it flushes all leaf level nodes first, followed
10807 + by twigs, and so on. With flushing done in this order, if flush is eventually called
10808 + on a non-leaf node it means that (somehow) we reached a point where all leaves are
10809 + clean and only internal nodes need to be flushed. If that is the case, then it means
10810 + there were no leaves that were the parent-first preceder/follower of the parent. This
10811 + is expected to be a rare case, which is why we do nothing special about it. However,
10812 + memory pressure may pass an internal node to flush when there are still dirty leaf
10813 + nodes that need to be flushed, which could prove our original assumptions
10814 + "inoperative". If this needs to be fixed, then scan_left/right should have
10815 + special checks for the non-leaf levels. For example, instead of passing from a node to
10816 + the left neighbor, it should pass from the node to the left neighbor's rightmost
10817 + descendent (if dirty).
10818 +
10819 +*/
10820 +
10821 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10822 + it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10823 + logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10824 + device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10825 + device becomes sorted such that tree order and block number order fully correlate.
10826 +
10827 + Resizing is done by shifting everything either all the way to the left or all the way
10828 + to the right, and then reporting the last block.
10829 +*/
10830 +
10831 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10832 + describes the policy from the highest level:
10833 +
10834 + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10835 + leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10836 + leaf nodes.
10837 +
10838 + Otherwise, there are two contexts in which we make a decision to relocate:
10839 +
10840 + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10841 + During the initial stages of flush, after scan-right completes, we want to ask the
10842 + question: should we relocate this leaf node and thus dirty the parent node. Then if
10843 + the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
10844 + the question at the next level up, and so on. In these cases we are moving in the
10845 + reverse-parent first direction.
10846 +
10847 + There is another case which is considered the reverse direction, which comes at the end
10848 + of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10849 + reach a point where there is a clean twig to the right with a dirty leftmost child. In
10850 + this case, we may wish to relocate the child by testing if it should be relocated
10851 + relative to its parent.
10852 +
10853 + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10854 + allocate_znode. What distinguishes the forward parent-first case from the
10855 + reverse-parent first case is that the preceder has already been allocated in the
10856 + forward case, whereas in the reverse case we don't know what the preceder is until we
10857 + finish "going in reverse". That simplifies the forward case considerably, and there we
10858 + actually use the block allocator to determine whether, e.g., a block closer to the
10859 + preceder is available.
10860 +*/
10861 +
10862 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10863 + finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10864 + squeeze the parent's left neighbor and the parent. This may change the
10865 + flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10866 + is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10867 + Note that we cannot allocate extents during this or they will be out of parent-first
10868 + order. There are also some difficult coordinate maintenance issues. We can't do a tree
10869 + search to find coordinates again (because we hold locks), we have to determine them
10870 + from the two nodes being squeezed. Looks difficult, but has potential to increase
10871 + space utilization. */
10872 +
10873 +/* Flush-scan helper functions. */
10874 +static void scan_init(flush_scan * scan);
10875 +static void scan_done(flush_scan * scan);
10876 +
10877 +/* Flush-scan algorithm. */
10878 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10879 + unsigned limit);
10880 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10881 +static int scan_common(flush_scan * scan, flush_scan * other);
10882 +static int scan_formatted(flush_scan * scan);
10883 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
10884 +static int scan_by_coord(flush_scan * scan);
10885 +
10886 +/* Initial flush-point ancestor allocation. */
10887 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
10888 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10889 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10890 +
10891 +/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10892 +static int squalloc(flush_pos_t * pos);
10893 +
10894 +/* Flush squeeze implementation. */
10895 +static int squeeze_right_non_twig(znode * left, znode * right);
10896 +static int shift_one_internal_unit(znode * left, znode * right);
10897 +
10898 +/* Flush reverse parent-first relocation routines. */
10899 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10900 + const reiser4_block_nr * nblk);
10901 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10902 + flush_pos_t * pos);
10903 +static int reverse_relocate_check_dirty_parent(jnode * node,
10904 + const coord_t * parent_coord,
10905 + flush_pos_t * pos);
10906 +
10907 +/* Flush allocate write-queueing functions: */
10908 +static int allocate_znode(znode * node, const coord_t * parent_coord,
10909 + flush_pos_t * pos);
10910 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10911 + flush_pos_t * pos);
10912 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10913 +
10914 +/* Flush helper functions: */
10915 +static int jnode_lock_parent_coord(jnode * node,
10916 + coord_t * coord,
10917 + lock_handle * parent_lh,
10918 + load_count * parent_zh,
10919 + znode_lock_mode mode, int try);
10920 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10921 + znode_lock_mode mode, int check_dirty, int expected);
10922 +static int znode_same_parents(znode * a, znode * b);
10923 +
10924 +static int znode_check_flushprepped(znode * node)
10925 +{
10926 + return jnode_check_flushprepped(ZJNODE(node));
10927 +}
10928 +
10929 +/* Flush position functions */
10930 +static void pos_init(flush_pos_t * pos);
10931 +static int pos_valid(flush_pos_t * pos);
10932 +static void pos_done(flush_pos_t * pos);
10933 +static int pos_stop(flush_pos_t * pos);
10934 +
10935 +/* check that @org is first jnode extent unit, if extent is unallocated,
10936 + * because all jnodes of unallocated extent are dirty and of the same atom. */
10937 +#define checkchild(scan) \
10938 +assert("nikita-3435", \
10939 + ergo(scan->direction == LEFT_SIDE && \
10940 + (scan->parent_coord.node->level == TWIG_LEVEL) && \
10941 + jnode_is_unformatted(scan->node) && \
10942 + extent_is_unallocated(&scan->parent_coord), \
10943 + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10944 +
10945 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
10946 + useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10947 + no static initializer function...) */
10948 +ON_DEBUG(atomic_t flush_cnt;
10949 + )
10950 +
10951 +/* check fs backing device for write congestion */
10952 +static int check_write_congestion(void)
10953 +{
10954 + struct super_block *sb;
10955 + struct backing_dev_info *bdi;
10956 +
10957 + sb = reiser4_get_current_sb();
10958 + bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
10959 + return bdi_write_congested(bdi);
10960 +}
10961 +
10962 +/* conditionally write flush queue */
10963 +static int write_prepped_nodes(flush_pos_t * pos)
10964 +{
10965 + int ret;
10966 +
10967 + assert("zam-831", pos);
10968 + assert("zam-832", pos->fq);
10969 +
10970 + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10971 + return 0;
10972 +
10973 + if (check_write_congestion())
10974 + return 0;
10975 +
10976 + ret = reiser4_write_fq(pos->fq, pos->nr_written,
10977 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10978 + return ret;
10979 +}
10980 +
10981 +/* Properly release all flush pos. resources, then move flush position to new
10982 + locked node */
10983 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10984 + load_count * new_load, const coord_t * new_coord)
10985 +{
10986 + assert("zam-857", new_lock->node == new_load->node);
10987 +
10988 + if (new_coord) {
10989 + assert("zam-858", new_coord->node == new_lock->node);
10990 + coord_dup(&pos->coord, new_coord);
10991 + } else {
10992 + coord_init_first_unit(&pos->coord, new_lock->node);
10993 + }
10994 +
10995 + if (pos->child) {
10996 + jput(pos->child);
10997 + pos->child = NULL;
10998 + }
10999 +
11000 + move_load_count(&pos->load, new_load);
11001 + done_lh(&pos->lock);
11002 + move_lh(&pos->lock, new_lock);
11003 +}
11004 +
11005 +/* delete an empty node whose link from the parent still exists. */
11006 +static int delete_empty_node(znode * node)
11007 +{
11008 + reiser4_key smallest_removed;
11009 +
11010 + assert("zam-1019", node != NULL);
11011 + assert("zam-1020", node_is_empty(node));
11012 + assert("zam-1023", znode_is_wlocked(node));
11013 +
11014 + return reiser4_delete_node(node, &smallest_removed, NULL, 1);
11015 +}
11016 +
11017 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11018 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11019 +{
11020 + int ret;
11021 + load_count load;
11022 + lock_handle lock;
11023 +
11024 + init_lh(&lock);
11025 + init_load_count(&load);
11026 +
11027 + if (jnode_is_znode(org)) {
11028 + ret = longterm_lock_znode(&lock, JZNODE(org),
11029 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11030 + if (ret)
11031 + return ret;
11032 +
11033 + ret = incr_load_count_znode(&load, JZNODE(org));
11034 + if (ret)
11035 + return ret;
11036 +
11037 + pos->state =
11038 + (jnode_get_level(org) ==
11039 + LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11040 + move_flush_pos(pos, &lock, &load, NULL);
11041 + } else {
11042 + coord_t parent_coord;
11043 + ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11044 + &load, ZNODE_WRITE_LOCK, 0);
11045 + if (ret)
11046 + goto done;
11047 + if (!item_is_extent(&parent_coord)) {
11048 + /* file was converted to tail, org became HB, we found internal
11049 + item */
11050 + ret = -EAGAIN;
11051 + goto done;
11052 + }
11053 +
11054 + pos->state = POS_ON_EPOINT;
11055 + move_flush_pos(pos, &lock, &load, &parent_coord);
11056 + pos->child = jref(org);
11057 + if (extent_is_unallocated(&parent_coord)
11058 + && extent_unit_index(&parent_coord) != index_jnode(org)) {
11059 + /* @org is not first child of its parent unit. This may happen
11060 + because the long-term lock of its parent node was released between
11061 + scan_left and scan_right. For now work around this having flush to repeat */
11062 + ret = -EAGAIN;
11063 + }
11064 + }
11065 +
11066 + done:
11067 + done_load_count(&load);
11068 + done_lh(&lock);
11069 + return ret;
11070 +}
11071 +
11072 +/* TODO LIST (no particular order): */
11073 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11074 + indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11075 + specific names mentioned instead that need to be inspected/resolved. */
11076 +/* B. There is an issue described in reverse_relocate_test having to do with an
11077 + imprecise is_preceder? check having to do with partially-dirty extents. The code that
11078 + sets preceder hints and computes the preceder is basically untested. Careful testing
11079 + needs to be done that preceder calculations are done correctly, since if it doesn't
11080 + affect correctness we will not catch this stuff during regular testing. */
11081 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11082 + considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11083 + but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11084 + Many of the calls that may produce one of these return values (i.e.,
11085 + longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11086 + values themselves and, for instance, stop flushing instead of resulting in a restart.
11087 + If any of these results are true error conditions then flush will go into a busy-loop,
11088 + as we noticed during testing when a corrupt tree caused find_child_ptr to return
11089 + ENOENT. It needs careful thought and testing of corner conditions.
11090 +*/
11091 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11092 + block is assigned a block number then early-flushed to disk. It is dirtied again and
11093 + flush is called again. Concurrently, that block is deleted, and the de-allocation of
11094 + its block number does not need to be deferred, since it is not part of the preserve set
11095 + (i.e., it didn't exist before the transaction). I think there may be a race condition
11096 + where flush writes the dirty, created block after the non-deferred deallocated block
11097 + number is re-allocated, making it possible to write deleted data on top of non-deleted
11098 + data. Its just a theory, but it needs to be thought out. */
11099 +/* F. bio_alloc() failure is not handled gracefully. */
11100 +/* G. Unallocated children. */
11101 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11102 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11103 +
11104 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11105 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11106 + neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
11107 + blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
11108 + a part of transaction commit.
11109 +
11110 + Our objective here is to prep and flush the slum the jnode belongs to. We want to
11111 + squish the slum together, and allocate the nodes in it as we squish because allocation
11112 + of children affects squishing of parents.
11113 +
11114 + The "argument" @node tells flush where to start. From there, flush finds the left edge
11115 + of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11116 + "better place" to start squalloc first we perform a flush_scan.
11117 +
11118 + Flush-scanning may be performed in both left and right directions, but for different
11119 + purposes. When scanning to the left, we are searching for a node that precedes a
11120 + sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11121 + During flush-scanning, we also take the opportunity to count the number of consecutive
11122 + leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11123 + make a decision to reallocate leaf nodes (thus favoring write-optimization).
11124 +
11125 + Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11126 + also be dirty nodes to the right of the argument. If the scan-left operation does not
11127 + count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11128 + operation to see whether there are, in fact, enough nodes to meet the relocate
11129 + threshold. Each right- and left-scan operation uses a single flush_scan object.
11130 +
11131 + After left-scan and possibly right-scan, we prepare a flush_position object with the
11132 + starting flush point or parent coordinate, which was determined using scan-left.
11133 +
11134 + Next we call the main flush routine, squalloc, which iterates along the
11135 + leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11136 +
11137 + After squalloc returns we take extra steps to ensure that all the children
11138 + of the final twig node are allocated--this involves repeating squalloc
11139 + until we finish at a twig with no unallocated children.
11140 +
11141 + Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11142 + any above-twig nodes during flush_empty_queue that still have unallocated children, we
11143 + flush_unprep them.
11144 +
11145 + Flush treats several "failure" cases as non-failures, essentially causing them to start
11146 + over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11147 + probably be handled properly rather than restarting, but there are a bunch of cases to
11148 + audit.
11149 +*/
11150 +
11151 +static int
11152 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11153 + flush_queue_t * fq, int flags)
11154 +{
11155 + long ret = 0;
11156 + flush_scan *right_scan;
11157 + flush_scan *left_scan;
11158 + flush_pos_t *flush_pos;
11159 + int todo;
11160 + struct super_block *sb;
11161 + reiser4_super_info_data *sbinfo;
11162 + jnode *leftmost_in_slum = NULL;
11163 +
11164 + assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11165 + assert("nikita-3022", reiser4_schedulable());
11166 +
11167 + assert("nikita-3185",
11168 + get_current_super_private()->delete_mutex_owner != current);
11169 +
11170 + /* allocate right_scan, left_scan and flush_pos */
11171 + right_scan =
11172 + kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11173 + reiser4_ctx_gfp_mask_get());
11174 + if (right_scan == NULL)
11175 + return RETERR(-ENOMEM);
11176 + left_scan = right_scan + 1;
11177 + flush_pos = (flush_pos_t *) (left_scan + 1);
11178 +
11179 + sb = reiser4_get_current_sb();
11180 + sbinfo = get_super_private(sb);
11181 +
11182 + /* Flush-concurrency debug code */
11183 +#if REISER4_DEBUG
11184 + atomic_inc(&flush_cnt);
11185 +#endif
11186 +
11187 + reiser4_enter_flush(sb);
11188 +
11189 + /* Initialize a flush position. */
11190 + pos_init(flush_pos);
11191 +
11192 + flush_pos->nr_written = nr_written;
11193 + flush_pos->fq = fq;
11194 + flush_pos->flags = flags;
11195 + flush_pos->nr_to_write = nr_to_write;
11196 +
11197 + scan_init(right_scan);
11198 + scan_init(left_scan);
11199 +
11200 + /* First scan left and remember the leftmost scan position. If the leftmost
11201 + position is unformatted we remember its parent_coord. We scan until counting
11202 + FLUSH_SCAN_MAXNODES.
11203 +
11204 + If starting @node is unformatted, at the beginning of left scan its
11205 + parent (twig level node, containing extent item) will be long term
11206 + locked and lock handle will be stored in the
11207 + @right_scan->parent_lock. This lock is used to start the rightward
11208 + scan without redoing the tree traversal (necessary to find parent)
11209 + and, hence, is kept during leftward scan. As a result, we have to
11210 + use try-lock when taking long term locks during the leftward scan.
11211 + */
11212 + ret = scan_left(left_scan, right_scan,
11213 + node, sbinfo->flush.scan_maxnodes);
11214 + if (ret != 0)
11215 + goto failed;
11216 +
11217 + leftmost_in_slum = jref(left_scan->node);
11218 + scan_done(left_scan);
11219 +
11220 + /* Then possibly go right to decide if we will use a policy of relocating leaves.
11221 + This is only done if we did not scan past (and count) enough nodes during the
11222 + leftward scan. If we do scan right, we only care to go far enough to establish
11223 + that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11224 + scan limit is the difference between left_scan.count and the threshold. */
11225 +
11226 + todo = sbinfo->flush.relocate_threshold - left_scan->count;
11227 + /* scan right is inherently deadlock prone, because we are
11228 + * (potentially) holding a lock on the twig node at this moment.
11229 + * FIXME: this is incorrect comment: lock is not held */
11230 + if (todo > 0) {
11231 + ret = scan_right(right_scan, node, (unsigned)todo);
11232 + if (ret != 0)
11233 + goto failed;
11234 + }
11235 +
11236 + /* Only the right-scan count is needed, release any rightward locks right away. */
11237 + scan_done(right_scan);
11238 +
11239 + /* ... and the answer is: we should relocate leaf nodes if at least
11240 + FLUSH_RELOCATE_THRESHOLD nodes were found. */
11241 + flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11242 + (left_scan->count + right_scan->count >=
11243 + sbinfo->flush.relocate_threshold);
11244 +
11245 + /* Funny business here. We set the 'point' in the flush_position just prior to
11246 + starting squalloc regardless of whether the first point is
11247 + formatted or unformatted. Without this there would be an invariant, in the
11248 + rest of the code, that if the flush_position is unformatted then
11249 + flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11250 + and if the flush_position is formatted then flush_position->point is non-NULL
11251 + and no parent info is set.
11252 +
11253 + This seems lazy, but it makes the initial calls to reverse_relocate_test
11254 + (which ask "is it the pos->point the leftmost child of its parent") much easier
11255 + because we know the first child already. Nothing is broken by this, but the
11256 + reasoning is subtle. Holding an extra reference on a jnode during flush can
11257 + cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11258 + removed from sibling lists until they have zero reference count. Flush would
11259 + never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11260 + deleted to the right. So if nothing is broken, why fix it?
11261 +
11262 + NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11263 + point and in any moment, because of the concurrent file system
11264 + activity (for example, truncate). */
11265 +
11266 + /* Check jnode state after flush_scan completed. Having a lock on this
11267 + node or its parent (in case of unformatted) helps us in case of
11268 + concurrent flushing. */
11269 + if (jnode_check_flushprepped(leftmost_in_slum)
11270 + && !jnode_convertible(leftmost_in_slum)) {
11271 + ret = 0;
11272 + goto failed;
11273 + }
11274 +
11275 + /* Now setup flush_pos using scan_left's endpoint. */
11276 + ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11277 + if (ret)
11278 + goto failed;
11279 +
11280 + if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11281 + && node_is_empty(flush_pos->coord.node)) {
11282 + znode *empty = flush_pos->coord.node;
11283 +
11284 + assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11285 + ret = delete_empty_node(empty);
11286 + goto failed;
11287 + }
11288 +
11289 + if (jnode_check_flushprepped(leftmost_in_slum)
11290 + && !jnode_convertible(leftmost_in_slum)) {
11291 + ret = 0;
11292 + goto failed;
11293 + }
11294 +
11295 + /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11296 + ret = alloc_pos_and_ancestors(flush_pos);
11297 + if (ret)
11298 + goto failed;
11299 +
11300 + /* Do the main rightward-bottom-up squeeze and allocate loop. */
11301 + ret = squalloc(flush_pos);
11302 + pos_stop(flush_pos);
11303 + if (ret)
11304 + goto failed;
11305 +
11306 + /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11307 + First, the pos_stop() and pos_valid() routines should be modified
11308 + so that pos_stop() sets a flush_position->stop flag to 1 without
11309 + releasing the current position immediately--instead release it in
11310 + pos_done(). This is a better implementation than the current one anyway.
11311 +
11312 + It is not clear that all fields of the flush_position should not be released,
11313 + but at the very least the parent_lock, parent_coord, and parent_load should
11314 + remain held because they hold the last twig when pos_stop() is
11315 + called.
11316 +
11317 + When we reach this point in the code, if the parent_coord is set to after the
11318 + last item then we know that flush reached the end of a twig (and according to
11319 + the new flush queueing design, we will return now). If parent_coord is not
11320 + past the last item, we should check if the current twig has any unallocated
11321 + children to the right (we are not concerned with unallocated children to the
11322 + left--in that case the twig itself should not have been allocated). If the
11323 + twig has unallocated children to the right, set the parent_coord to that
11324 + position and then repeat the call to squalloc.
11325 +
11326 + Testing for unallocated children may be defined in two ways: if any internal
11327 + item has a fake block number, it is unallocated; if any extent item is
11328 + unallocated then all of its children are unallocated. But there is a more
11329 + aggressive approach: if there are any dirty children of the twig to the right
11330 + of the current position, we may wish to relocate those nodes now. Checking for
11331 + potential relocation is more expensive as it requires knowing whether there are
11332 + any dirty children that are not unallocated. The extent_needs_allocation
11333 + should be used after setting the correct preceder.
11334 +
11335 + When we reach the end of a twig at this point in the code, if the flush can
11336 + continue (when the queue is ready) it will need some information on the future
11337 + starting point. That should be stored away in the flush_handle using a seal, I
11338 + believe. Holding a jref() on the future starting point may break other code
11339 + that deletes that node.
11340 + */
11341 +
11342 + /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11343 + above the twig level. If the VM calls flush above the twig level, do nothing
11344 + and return (but figure out why this happens). The txnmgr should be modified to
11345 + only flush its leaf-level dirty list. This will do all the necessary squeeze
11346 + and allocate steps but leave unallocated branches and possibly unallocated
11347 + twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11348 + level, the remaining unallocated nodes should be given write-optimized
11349 + locations. (Possibly, the remaining unallocated twigs should be allocated just
11350 + before their leftmost child.)
11351 + */
11352 +
11353 + /* Any failure reaches this point. */
11354 + failed:
11355 +
11356 + switch (ret) {
11357 + case -E_REPEAT:
11358 + case -EINVAL:
11359 + case -E_DEADLOCK:
11360 + case -E_NO_NEIGHBOR:
11361 + case -ENOENT:
11362 + /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11363 + in each case. They already are handled in many cases. */
11364 + /* Something bad happened, but difficult to avoid... Try again! */
11365 + ret = 0;
11366 + }
11367 +
11368 + if (leftmost_in_slum)
11369 + jput(leftmost_in_slum);
11370 +
11371 + pos_done(flush_pos);
11372 + scan_done(left_scan);
11373 + scan_done(right_scan);
11374 + kfree(right_scan);
11375 +
11376 + ON_DEBUG(atomic_dec(&flush_cnt));
11377 +
11378 + reiser4_leave_flush(sb);
11379 +
11380 + return ret;
11381 +}
11382 +
11383 +/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that
11384 + * flusher should submit all prepped nodes immediately without keeping them in
11385 + * flush queues for long time. The reason for rapid flush mode is to free
11386 + * memory as fast as possible. */
11387 +
11388 +#if REISER4_USE_RAPID_FLUSH
11389 +
11390 +/**
11391 + * submit all prepped nodes if rapid flush mode is set,
11392 + * turn rapid flush mode off.
11393 + */
11394 +
11395 +static int rapid_flush(flush_pos_t * pos)
11396 +{
11397 + if (!wbq_available())
11398 + return 0;
11399 +
11400 + return write_prepped_nodes(pos);
11401 +}
11402 +
11403 +#else
11404 +
11405 +#define rapid_flush(pos) (0)
11406 +
11407 +#endif /* REISER4_USE_RAPID_FLUSH */
11408 +
11409 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11410 + flush_queue_t *fq, int *nr_queued,
11411 + int flags)
11412 +{
11413 + jnode * node;
11414 +
11415 + if (start != NULL) {
11416 + spin_lock_jnode(start);
11417 + if (!jnode_is_flushprepped(start)) {
11418 + assert("zam-1056", start->atom == atom);
11419 + node = start;
11420 + goto enter;
11421 + }
11422 + spin_unlock_jnode(start);
11423 + }
11424 + /*
11425 + * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again
11426 + * nodes. The atom spin lock is not released until all dirty nodes are processed or
11427 + * a non-prepped node is found in the atom dirty lists.
11428 + */
11429 + while ((node = find_first_dirty_jnode(atom, flags))) {
11430 + spin_lock_jnode(node);
11431 + enter:
11432 + assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11433 + assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11434 +
11435 + if (JF_ISSET(node, JNODE_WRITEBACK)) {
11436 + /* move node to the end of atom's writeback list */
11437 + list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11438 +
11439 + /*
11440 + * jnode is not necessarily on dirty list: if it was dirtied when
11441 + * it was on flush queue - it does not get moved to dirty list
11442 + */
11443 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11444 + WB_LIST, 1));
11445 +
11446 + } else if (jnode_is_znode(node)
11447 + && znode_above_root(JZNODE(node))) {
11448 + /*
11449 + * A special case for znode-above-root. The above-root (fake)
11450 + * znode is captured and dirtied when the tree height changes or
11451 + * when the root node is relocated. This causes atoms to fuse so
11452 + * that changes at the root are serialized. However, this node is
11453 + * never flushed. This special case used to be in lock.c to
11454 + * prevent the above-root node from ever being captured, but now
11455 + * that it is captured we simply prevent it from flushing. The
11456 + * log-writer code relies on this to properly log superblock
11457 + * modifications of the tree height.
11458 + */
11459 + jnode_make_wander_nolock(node);
11460 + } else if (JF_ISSET(node, JNODE_RELOC)) {
11461 + queue_jnode(fq, node);
11462 + ++(*nr_queued);
11463 + } else
11464 + break;
11465 +
11466 + spin_unlock_jnode(node);
11467 + }
11468 + return node;
11469 +}
11470 +
11471 +/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes
11472 + * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return
11473 + * other errors as they are. */
11474 +int
11475 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11476 + txn_atom ** atom, jnode *start)
11477 +{
11478 + reiser4_super_info_data *sinfo = get_current_super_private();
11479 + flush_queue_t *fq = NULL;
11480 + jnode *node;
11481 + int nr_queued;
11482 + int ret;
11483 +
11484 + assert("zam-889", atom != NULL && *atom != NULL);
11485 + assert_spin_locked(&((*atom)->alock));
11486 + assert("zam-892", get_current_context()->trans->atom == *atom);
11487 +
11488 + nr_to_write = LONG_MAX;
11489 + while (1) {
11490 + ret = reiser4_fq_by_atom(*atom, &fq);
11491 + if (ret != -E_REPEAT)
11492 + break;
11493 + *atom = get_current_atom_locked();
11494 + }
11495 + if (ret)
11496 + return ret;
11497 +
11498 + assert_spin_locked(&((*atom)->alock));
11499 +
11500 + /* parallel flushers limit */
11501 + if (sinfo->tmgr.atom_max_flushers != 0) {
11502 + while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11503 + /* An reiser4_atom_send_event() call is inside
11504 + reiser4_fq_put_nolock() which is called when flush is
11505 + finished and nr_flushers is decremented. */
11506 + reiser4_atom_wait_event(*atom);
11507 + *atom = get_current_atom_locked();
11508 + }
11509 + }
11510 +
11511 + /* count ourself as a flusher */
11512 + (*atom)->nr_flushers++;
11513 +
11514 + writeout_mode_enable();
11515 +
11516 + nr_queued = 0;
11517 + node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11518 +
11519 + if (node == NULL) {
11520 + if (nr_queued == 0) {
11521 + (*atom)->nr_flushers--;
11522 + reiser4_fq_put_nolock(fq);
11523 + reiser4_atom_send_event(*atom);
11524 + /* current atom remains locked */
11525 + writeout_mode_disable();
11526 + return 0;
11527 + }
11528 + spin_unlock_atom(*atom);
11529 + } else {
11530 + jref(node);
11531 + BUG_ON((*atom)->super != node->tree->super);
11532 + spin_unlock_atom(*atom);
11533 + spin_unlock_jnode(node);
11534 + BUG_ON(nr_to_write == 0);
11535 + ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11536 + jput(node);
11537 + }
11538 +
11539 + ret =
11540 + reiser4_write_fq(fq, nr_submitted,
11541 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11542 +
11543 + *atom = get_current_atom_locked();
11544 + (*atom)->nr_flushers--;
11545 + reiser4_fq_put_nolock(fq);
11546 + reiser4_atom_send_event(*atom);
11547 + spin_unlock_atom(*atom);
11548 +
11549 + writeout_mode_disable();
11550 +
11551 + if (ret == 0)
11552 + ret = -E_REPEAT;
11553 +
11554 + return ret;
11555 +}
11556 +
11557 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11558 +
11559 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11560 + reverse parent-first relocate context. Here all we know is the preceder and the block
11561 + number. Since we are going in reverse, the preceder may still be relocated as well, so
11562 + we can't ask the block allocator "is there a closer block available to relocate?" here.
11563 + In the _forward_ parent-first relocate context (not here) we actually call the block
11564 + allocator to try and find a closer location. */
11565 +static int
11566 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11567 + const reiser4_block_nr * nblk)
11568 +{
11569 + reiser4_block_nr dist;
11570 +
11571 + assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11572 + assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11573 + assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11574 +
11575 + /* Distance is the absolute value. */
11576 + dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11577 +
11578 + /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11579 + block, do not relocate. */
11580 + if (dist <= get_current_super_private()->flush.relocate_distance) {
11581 + return 0;
11582 + }
11583 +
11584 + return 1;
11585 +}
11586 +
11587 +/* This function is a predicate that tests for relocation. Always called in the
11588 + reverse-parent-first context, when we are asking whether the current node should be
11589 + relocated in order to expand the flush by dirtying the parent level (and thus
11590 + proceeding to flush that level). When traversing in the forward parent-first direction
11591 + (not here), relocation decisions are handled in two places: allocate_znode() and
11592 + extent_needs_allocation(). */
11593 +static int
11594 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11595 + flush_pos_t * pos)
11596 +{
11597 + reiser4_block_nr pblk = 0;
11598 + reiser4_block_nr nblk = 0;
11599 +
11600 + assert("jmacd-8989", !jnode_is_root(node));
11601 +
11602 + /*
11603 + * This function is called only from the
11604 + * reverse_relocate_check_dirty_parent() and only if the parent
11605 + * node is clean. This implies that the parent has the real (i.e., not
11606 + * fake) block number, and, so does the child, because otherwise the
11607 + * parent would be dirty.
11608 + */
11609 +
11610 + /* New nodes are treated as if they are being relocated. */
11611 + if (JF_ISSET (node, JNODE_CREATED) ||
11612 + (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11613 + return 1;
11614 + }
11615 +
11616 + /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11617 + existing node, the coord may be leftmost even though the child is not the
11618 + parent-first preceder of the parent. If the first dirty node appears somewhere
11619 + in the middle of the first extent unit, this preceder calculation is wrong.
11620 + Needs more logic in here. */
11621 + if (coord_is_leftmost_unit(parent_coord)) {
11622 + pblk = *znode_get_block(parent_coord->node);
11623 + } else {
11624 + pblk = pos->preceder.blk;
11625 + }
11626 + check_preceder(pblk);
11627 +
11628 + /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11629 + if (pblk == 0) {
11630 + return 1;
11631 + }
11632 +
11633 + nblk = *jnode_get_block(node);
11634 +
11635 + if (reiser4_blocknr_is_fake(&nblk))
11636 + /* child is unallocated, mark parent dirty */
11637 + return 1;
11638 +
11639 + return reverse_relocate_if_close_enough(&pblk, &nblk);
11640 +}
11641 +
11642 +/* This function calls reverse_relocate_test to make a reverse-parent-first
11643 + relocation decision and then, if yes, it marks the parent dirty. */
11644 +static int
11645 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11646 + flush_pos_t * pos)
11647 +{
11648 + int ret;
11649 +
11650 + if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11651 +
11652 + ret = reverse_relocate_test(node, parent_coord, pos);
11653 + if (ret < 0) {
11654 + return ret;
11655 + }
11656 +
11657 + /* FIXME-ZAM
11658 + if parent is already relocated - we do not want to grab space, right? */
11659 + if (ret == 1) {
11660 + int grabbed;
11661 +
11662 + grabbed = get_current_context()->grabbed_blocks;
11663 + if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11664 + 0)
11665 + reiser4_panic("umka-1250",
11666 + "No space left during flush.");
11667 +
11668 + assert("jmacd-18923",
11669 + znode_is_write_locked(parent_coord->node));
11670 + znode_make_dirty(parent_coord->node);
11671 + grabbed2free_mark(grabbed);
11672 + }
11673 + }
11674 +
11675 + return 0;
11676 +}
11677 +
11678 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11679 + PARENT-FIRST LOOP BEGINS) */
11680 +
11681 +/* Get the leftmost child for given coord. */
11682 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11683 +{
11684 + int ret;
11685 +
11686 + ret = item_utmost_child(coord, LEFT_SIDE, child);
11687 +
11688 + if (ret)
11689 + return ret;
11690 +
11691 + if (IS_ERR(*child))
11692 + return PTR_ERR(*child);
11693 +
11694 + return 0;
11695 +}
11696 +
11697 +/* This step occurs after the left- and right-scans are completed, before starting the
11698 + forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11699 + flush point, which means continuing in the reverse parent-first direction to the
11700 + parent, grandparent, and so on (as long as the child is a leftmost child). This
11701 + routine calls a recursive process, alloc_one_ancestor, which does the real work,
11702 + except there is special-case handling here for the first ancestor, which may be a twig.
11703 + At each level (here and alloc_one_ancestor), we check for relocation and then, if
11704 + the child is a leftmost child, repeat at the next level. On the way back down (the
11705 + recursion), we allocate the ancestors in parent-first order. */
11706 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
11707 +{
11708 + int ret = 0;
11709 + lock_handle plock;
11710 + load_count pload;
11711 + coord_t pcoord;
11712 +
11713 + if (znode_check_flushprepped(pos->lock.node))
11714 + return 0;
11715 +
11716 + coord_init_invalid(&pcoord, NULL);
11717 + init_lh(&plock);
11718 + init_load_count(&pload);
11719 +
11720 + if (pos->state == POS_ON_EPOINT) {
11721 + /* a special case for pos on twig level, where we already have
11722 + a lock on parent node. */
11723 + /* The parent may not be dirty, in which case we should decide
11724 + whether to relocate the child now. If decision is made to
11725 + relocate the child, the parent is marked dirty. */
11726 + ret =
11727 + reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11728 + pos);
11729 + if (ret)
11730 + goto exit;
11731 +
11732 + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11733 + is leftmost) and the leaf/child, so recursion is not needed.
11734 + Levels above the twig will be allocated for
11735 + write-optimization before the transaction commits. */
11736 +
11737 + /* Do the recursive step, allocating zero or more of our
11738 + * ancestors. */
11739 + ret = alloc_one_ancestor(&pos->coord, pos);
11740 +
11741 + } else {
11742 + if (!znode_is_root(pos->lock.node)) {
11743 + /* all formatted nodes except tree root */
11744 + ret =
11745 + reiser4_get_parent(&plock, pos->lock.node,
11746 + ZNODE_WRITE_LOCK);
11747 + if (ret)
11748 + goto exit;
11749 +
11750 + ret = incr_load_count_znode(&pload, plock.node);
11751 + if (ret)
11752 + goto exit;
11753 +
11754 + ret =
11755 + find_child_ptr(plock.node, pos->lock.node, &pcoord);
11756 + if (ret)
11757 + goto exit;
11758 +
11759 + ret =
11760 + reverse_relocate_check_dirty_parent(ZJNODE
11761 + (pos->lock.
11762 + node), &pcoord,
11763 + pos);
11764 + if (ret)
11765 + goto exit;
11766 +
11767 + ret = alloc_one_ancestor(&pcoord, pos);
11768 + if (ret)
11769 + goto exit;
11770 + }
11771 +
11772 + ret = allocate_znode(pos->lock.node, &pcoord, pos);
11773 + }
11774 + exit:
11775 + done_load_count(&pload);
11776 + done_lh(&plock);
11777 + return ret;
11778 +}
11779 +
11780 +/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11781 + call to set_preceder, which is the next function described, this checks if the
11782 + child is a leftmost child and returns if it is not. If the child is a leftmost child
11783 + it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11784 + step. */
11785 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11786 +{
11787 + int ret = 0;
11788 + lock_handle alock;
11789 + load_count aload;
11790 + coord_t acoord;
11791 +
11792 + /* As we ascend at the left-edge of the region to flush, take this opportunity at
11793 + the twig level to find our parent-first preceder unless we have already set
11794 + it. */
11795 + if (pos->preceder.blk == 0) {
11796 + ret = set_preceder(coord, pos);
11797 + if (ret != 0)
11798 + return ret;
11799 + }
11800 +
11801 + /* If the ancestor is clean or already allocated, or if the child is not a
11802 + leftmost child, stop going up, even leaving coord->node not flushprepped. */
11803 + if (znode_check_flushprepped(coord->node)
11804 + || !coord_is_leftmost_unit(coord))
11805 + return 0;
11806 +
11807 + init_lh(&alock);
11808 + init_load_count(&aload);
11809 + coord_init_invalid(&acoord, NULL);
11810 +
11811 + /* Only ascend to the next level if it is a leftmost child, but write-lock the
11812 + parent in case we will relocate the child. */
11813 + if (!znode_is_root(coord->node)) {
11814 +
11815 + ret =
11816 + jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11817 + &alock, &aload, ZNODE_WRITE_LOCK,
11818 + 0);
11819 + if (ret != 0) {
11820 + /* FIXME(C): check EINVAL, E_DEADLOCK */
11821 + goto exit;
11822 + }
11823 +
11824 + ret =
11825 + reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11826 + &acoord, pos);
11827 + if (ret != 0) {
11828 + goto exit;
11829 + }
11830 +
11831 + /* Recursive call. */
11832 + if (!znode_check_flushprepped(acoord.node)) {
11833 + ret = alloc_one_ancestor(&acoord, pos);
11834 + if (ret)
11835 + goto exit;
11836 + }
11837 + }
11838 +
11839 + /* Note: we call allocate with the parent write-locked (except at the root) in
11840 + case we relocate the child, in which case it will modify the parent during this
11841 + call. */
11842 + ret = allocate_znode(coord->node, &acoord, pos);
11843 +
11844 + exit:
11845 + done_load_count(&aload);
11846 + done_lh(&alock);
11847 + return ret;
11848 +}
11849 +
11850 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11851 + a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11852 + should this node be relocated (in reverse parent-first context)? We repeat this
11853 + process as long as the child is the leftmost child, eventually reaching an ancestor of
11854 + the flush point that is not a leftmost child. The preceder of that ancestor, which is
11855 + not a leftmost child, is actually on the leaf level. The preceder of that block is the
11856 + left-neighbor of the flush point. The preceder of that block is the rightmost child of
11857 + the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig
11858 + level, it stops momentarily to remember the block of the rightmost child of the twig on
11859 + the left and sets it to the flush_position's preceder_hint.
11860 +
11861 + There is one other place where we may set the flush_position's preceder hint, which is
11862 + during scan-left.
11863 +*/
11864 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11865 +{
11866 + int ret;
11867 + coord_t coord;
11868 + lock_handle left_lock;
11869 + load_count left_load;
11870 +
11871 + coord_dup(&coord, coord_in);
11872 +
11873 + init_lh(&left_lock);
11874 + init_load_count(&left_load);
11875 +
11876 + /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11877 + coord_is_leftmost_unit is not the right test if the unformatted child is in the
11878 + middle of the first extent unit. */
11879 + if (!coord_is_leftmost_unit(&coord)) {
11880 + coord_prev_unit(&coord);
11881 + } else {
11882 + ret =
11883 + reiser4_get_left_neighbor(&left_lock, coord.node,
11884 + ZNODE_READ_LOCK, GN_SAME_ATOM);
11885 + if (ret) {
11886 + /* If we fail for any reason it doesn't matter because the
11887 + preceder is only a hint. We are low-priority at this point, so
11888 + this must be the case. */
11889 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11890 + ret == -ENOENT || ret == -EINVAL
11891 + || ret == -E_DEADLOCK) {
11892 + ret = 0;
11893 + }
11894 + goto exit;
11895 + }
11896 +
11897 + ret = incr_load_count_znode(&left_load, left_lock.node);
11898 + if (ret)
11899 + goto exit;
11900 +
11901 + coord_init_last_unit(&coord, left_lock.node);
11902 + }
11903 +
11904 + ret =
11905 + item_utmost_child_real_block(&coord, RIGHT_SIDE,
11906 + &pos->preceder.blk);
11907 + exit:
11908 + check_preceder(pos->preceder.blk);
11909 + done_load_count(&left_load);
11910 + done_lh(&left_lock);
11911 + return ret;
11912 +}
11913 +
11914 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11915 +
11916 +/* This procedure implements the outer loop of the flush algorithm. To put this in
11917 + context, here is the general list of steps taken by the flush routine as a whole:
11918 +
11919 + 1. Scan-left
11920 + 2. Scan-right (maybe)
11921 + 3. Allocate initial flush position and its ancestors
11922 + 4. <handle extents>
11923 + 5. <squeeze and next position and its ancestors to-the-right,
11924 + then update position to-the-right>
11925 + 6. <repeat from #4 until flush is stopped>
11926 +
11927 + This procedure implements the loop in steps 4 through 6 in the above listing.
11928 +
11929 + Step 4: if the current flush position is an extent item (position on the twig level),
11930 + it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11931 + coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11932 + If the next coordinate is an internal item, we descend back to the leaf level,
11933 + otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11934 + brings us past the end of the twig level, then we call
11935 + reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11936 + step #5 which moves to the right.
11937 +
11938 + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11939 + tree to allocate any ancestors of the next-right flush position that are not also
11940 + ancestors of the current position. Those ancestors (in top-down order) are the next in
11941 + parent-first order. We squeeze adjacent nodes on the way up until the right node and
11942 + current node share the same parent, then allocate on the way back down. Finally, this
11943 + step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11944 +*/
11945 +
11946 +/* SQUEEZE CODE */
11947 +
11948 +/* squalloc_right_twig helper function: cut a range of extent items from
11949 +   node to->node, from the beginning up to coord @to. */
11950 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11951 + znode * left)
11952 +{
11953 + coord_t from;
11954 + reiser4_key from_key;
11955 +
11956 + coord_init_first_unit(&from, to->node);
11957 + item_key_by_coord(&from, &from_key);
11958 +
11959 + return cut_node_content(&from, to, &from_key, to_key, NULL);
11960 +}
11961 +
11962 +/* Copy as much of the leading extents from @right to @left, allocating
11963 + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
11964 + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
11965 + internal item it calls shift_one_internal_unit and may then return
11966 + SUBTREE_MOVED. */
11967 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11968 +{
11969 + int ret = SUBTREE_MOVED;
11970 + coord_t coord; /* used to iterate over items */
11971 + reiser4_key stop_key;
11972 +
11973 + assert("jmacd-2008", !node_is_empty(right));
11974 + coord_init_first_unit(&coord, right);
11975 +
11976 + /* FIXME: can be optimized to cut once */
11977 + while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11978 + ON_DEBUG(void *vp);
11979 +
11980 + assert("vs-1468", coord_is_leftmost_unit(&coord));
11981 + ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11982 +
11983 + /* stop_key is used to find what was copied and what to cut */
11984 + stop_key = *reiser4_min_key();
11985 + ret = squalloc_extent(left, &coord, pos, &stop_key);
11986 + if (ret != SQUEEZE_CONTINUE) {
11987 + ON_DEBUG(kfree(vp));
11988 + break;
11989 + }
11990 + assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11991 +
11992 + /* Helper function to do the cutting. */
11993 + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11994 + check_me("vs-1466",
11995 + squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11996 +
11997 + ON_DEBUG(shift_check(vp, left, coord.node));
11998 + }
11999 +
12000 + if (node_is_empty(coord.node))
12001 + ret = SQUEEZE_SOURCE_EMPTY;
12002 +
12003 + if (ret == SQUEEZE_TARGET_FULL) {
12004 + goto out;
12005 + }
12006 +
12007 + if (node_is_empty(right)) {
12008 + /* The whole right node was copied into @left. */
12009 + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12010 + goto out;
12011 + }
12012 +
12013 + coord_init_first_unit(&coord, right);
12014 +
12015 + if (!item_is_internal(&coord)) {
12016 + /* we do not want to squeeze anything else to left neighbor because "slum"
12017 + is over */
12018 + ret = SQUEEZE_TARGET_FULL;
12019 + goto out;
12020 + }
12021 + assert("jmacd-433", item_is_internal(&coord));
12022 +
12023 + /* Shift an internal unit. The child must be allocated before shifting any more
12024 + extents, so we stop here. */
12025 + ret = shift_one_internal_unit(left, right);
12026 +
12027 + out:
12028 + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12029 + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12030 +
12031 + if (ret == SQUEEZE_TARGET_FULL) {
12032 + /* We submit prepped nodes here and expect that this @left twig
12033 + * will not be modified again during this jnode_flush() call. */
12034 + int ret1;
12035 +
12036 + /* NOTE: seems like io is done under long term locks. */
12037 + ret1 = write_prepped_nodes(pos);
12038 + if (ret1 < 0)
12039 + return ret1;
12040 + }
12041 +
12042 + return ret;
12043 +}
12044 +
12045 +#if REISER4_DEBUG
12046 +static void item_convert_invariant(flush_pos_t * pos)
12047 +{
12048 + assert("edward-1225", coord_is_existing_item(&pos->coord));
12049 + if (chaining_data_present(pos)) {
12050 + item_plugin *iplug = item_convert_plug(pos);
12051 +
12052 + assert("edward-1000",
12053 + iplug == item_plugin_by_coord(&pos->coord));
12054 + assert("edward-1001", iplug->f.convert != NULL);
12055 + } else
12056 + assert("edward-1226", pos->child == NULL);
12057 +}
12058 +#else
12059 +
12060 +#define item_convert_invariant(pos) noop
12061 +
12062 +#endif
12063 +
12064 +/* Scan node items starting from the first one and apply for each
12065 + item its flush ->convert() method (if any). This method may
12066 + resize/kill the item so the tree will be changed.
12067 +*/
12068 +static int convert_node(flush_pos_t * pos, znode * node)
12069 +{
12070 + int ret = 0;
12071 + item_plugin *iplug;
12072 +
12073 + assert("edward-304", pos != NULL);
12074 + assert("edward-305", pos->child == NULL);
12075 + assert("edward-475", znode_convertible(node));
12076 + assert("edward-669", znode_is_wlocked(node));
12077 + assert("edward-1210", !node_is_empty(node));
12078 +
12079 + if (znode_get_level(node) != LEAF_LEVEL)
12080 + /* unsupported */
12081 + goto exit;
12082 +
12083 + coord_init_first_unit(&pos->coord, node);
12084 +
12085 + while (1) {
12086 + ret = 0;
12087 + coord_set_to_left(&pos->coord);
12088 + item_convert_invariant(pos);
12089 +
12090 + iplug = item_plugin_by_coord(&pos->coord);
12091 + assert("edward-844", iplug != NULL);
12092 +
12093 + if (iplug->f.convert) {
12094 + ret = iplug->f.convert(pos);
12095 + if (ret)
12096 + goto exit;
12097 + }
12098 + assert("edward-307", pos->child == NULL);
12099 +
12100 + if (coord_next_item(&pos->coord)) {
12101 + /* node is over */
12102 +
12103 + if (!chaining_data_present(pos))
12104 + /* finished this node */
12105 + break;
12106 + if (should_chain_next_node(pos)) {
12107 + /* go to next node */
12108 + move_chaining_data(pos, 0 /* to next node */ );
12109 + break;
12110 + }
12111 + /* repeat this node */
12112 + move_chaining_data(pos, 1 /* this node */ );
12113 + continue;
12114 + }
12115 + /* Node is not over.
12116 + Check if there is attached convert data.
12117 + If so roll one item position back and repeat
12118 + on this node
12119 + */
12120 + if (chaining_data_present(pos)) {
12121 +
12122 + if (iplug != item_plugin_by_coord(&pos->coord))
12123 + set_item_convert_count(pos, 0);
12124 +
12125 + ret = coord_prev_item(&pos->coord);
12126 + assert("edward-1003", !ret);
12127 +
12128 + move_chaining_data(pos, 1 /* this node */ );
12129 + }
12130 + }
12131 + JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12132 + znode_make_dirty(node);
12133 + exit:
12134 + assert("edward-1004", !ret);
12135 + return ret;
12136 +}
12137 +
12138 +/* Squeeze and allocate the right neighbor. This is called after @left and
12139 + its current children have been squeezed and allocated already. This
12140 +   procedure's job is to squeeze any items from @right to @left.
12141 +
12142 + If at the leaf level, use the shift_everything_left memcpy-optimized
12143 + version of shifting (squeeze_right_leaf).
12144 +
12145 + If at the twig level, extents are allocated as they are shifted from @right
12146 + to @left (squalloc_right_twig).
12147 +
12148 + At any other level, shift one internal item and return to the caller
12149 + (squalloc_parent_first) so that the shifted-subtree can be processed in
12150 + parent-first order.
12151 +
12152 + When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12153 + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12154 + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12155 + is returned.
12156 +*/
12157 +
12158 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12159 + znode * right)
12160 +{
12161 + int ret;
12162 +
12163 + /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12164 + * tree owing to error (for example, ENOSPC) in write */
12165 + /* assert("jmacd-9321", !node_is_empty(left)); */
12166 + assert("jmacd-9322", !node_is_empty(right));
12167 + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12168 +
12169 + switch (znode_get_level(left)) {
12170 + case TWIG_LEVEL:
12171 + /* Shift with extent allocating until either an internal item
12172 + is encountered or everything is shifted or no free space
12173 + left in @left */
12174 + ret = squeeze_right_twig(left, right, pos);
12175 + break;
12176 +
12177 + default:
12178 + /* All other levels can use shift_everything until we implement per-item
12179 + flush plugins. */
12180 + ret = squeeze_right_non_twig(left, right);
12181 + break;
12182 + }
12183 +
12184 + assert("jmacd-2011", (ret < 0 ||
12185 + ret == SQUEEZE_SOURCE_EMPTY
12186 + || ret == SQUEEZE_TARGET_FULL
12187 + || ret == SUBTREE_MOVED));
12188 + return ret;
12189 +}
12190 +
12191 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12192 + znode * right)
12193 +{
12194 + int ret;
12195 +
12196 + ret = squeeze_right_twig(pos->lock.node, right, pos);
12197 + if (ret < 0)
12198 + return ret;
12199 + if (ret > 0) {
12200 + coord_init_after_last_item(&pos->coord, pos->lock.node);
12201 + return ret;
12202 + }
12203 +
12204 + coord_init_last_unit(&pos->coord, pos->lock.node);
12205 + return 0;
12206 +}
12207 +
12208 +/* forward declaration */
12209 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12210 +
12211 +/* do a fast check for "same parents" condition before calling
12212 + * squalloc_upper_levels() */
12213 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12214 + znode * left,
12215 + znode * right)
12216 +{
12217 + if (znode_same_parents(left, right))
12218 + return 0;
12219 +
12220 + return squalloc_upper_levels(pos, left, right);
12221 +}
12222 +
12223 +/* Check whether the parent of given @right node needs to be processed
12224 +   ((re)allocated) prior to processing of the child. If @left and @right do not
12225 +   share the same parent, the parent of @right is after @left but before
12226 +   @right in parent-first order, so we have to (re)allocate it before @right
12227 +   gets (re)allocated. */
12228 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12229 +{
12230 + int ret;
12231 +
12232 + lock_handle left_parent_lock;
12233 + lock_handle right_parent_lock;
12234 +
12235 + load_count left_parent_load;
12236 + load_count right_parent_load;
12237 +
12238 + init_lh(&left_parent_lock);
12239 + init_lh(&right_parent_lock);
12240 +
12241 + init_load_count(&left_parent_load);
12242 + init_load_count(&right_parent_load);
12243 +
12244 + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12245 + if (ret)
12246 + goto out;
12247 +
12248 + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12249 + if (ret)
12250 + goto out;
12251 +
12252 + /* Check for same parents */
12253 + if (left_parent_lock.node == right_parent_lock.node)
12254 + goto out;
12255 +
12256 + if (znode_check_flushprepped(right_parent_lock.node)) {
12257 + /* Keep parent-first order. In the order, the right parent node stands
12258 + before the @right node. If it is already allocated, we set the
12259 + preceder (next block search start point) to its block number, @right
12260 + node should be allocated after it.
12261 +
12262 + However, preceder is set only if the right parent is on twig level.
12263 + The explanation is the following: new branch nodes are allocated over
12264 + already allocated children while the tree grows, it is difficult to
12265 +    keep tree ordered, we assume that only leaves and twigs are correctly
12266 + allocated. So, only twigs are used as a preceder for allocating of the
12267 + rest of the slum. */
12268 + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12269 + pos->preceder.blk =
12270 + *znode_get_block(right_parent_lock.node);
12271 + check_preceder(pos->preceder.blk);
12272 + }
12273 + goto out;
12274 + }
12275 +
12276 + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12277 + if (ret)
12278 + goto out;
12279 +
12280 + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12281 + if (ret)
12282 + goto out;
12283 +
12284 + ret =
12285 + squeeze_right_neighbor(pos, left_parent_lock.node,
12286 + right_parent_lock.node);
12287 +  /* We stop on error. We also stop if some items/units were shifted (ret == 0)
12288 +   * and thus @right changed its parent. It means we have not processed the
12289 +   * right_parent node prior to processing of @right. Positive return
12290 +   * values say that shifting of items did not happen because of "empty
12291 +   * source" or "target full" conditions. */
12292 + if (ret <= 0)
12293 + goto out;
12294 +
12295 + /* parent(@left) and parent(@right) may have different parents also. We
12296 + * do a recursive call for checking that. */
12297 + ret =
12298 + check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12299 + right_parent_lock.node);
12300 + if (ret)
12301 + goto out;
12302 +
12303 + /* allocate znode when going down */
12304 + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12305 +
12306 + out:
12307 + done_load_count(&left_parent_load);
12308 + done_load_count(&right_parent_load);
12309 +
12310 + done_lh(&left_parent_lock);
12311 + done_lh(&right_parent_lock);
12312 +
12313 + return ret;
12314 +}
12315 +
12316 +/* Check the leftmost child "flushprepped" status, also returns true if child
12317 + * node was not found in cache. */
12318 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12319 +{
12320 + int ret;
12321 + int prepped;
12322 +
12323 + jnode *child;
12324 +
12325 + ret = get_leftmost_child_of_unit(coord, &child);
12326 +
12327 + if (ret)
12328 + return ret;
12329 +
12330 + if (child) {
12331 + prepped = jnode_check_flushprepped(child);
12332 + jput(child);
12333 + } else {
12334 + /* We consider not existing child as a node which slum
12335 + processing should not continue to. Not cached node is clean,
12336 + so it is flushprepped. */
12337 + prepped = 1;
12338 + }
12339 +
12340 + return prepped;
12341 +}
12342 +
12343 +/* (re)allocate znode with automated getting parent node */
12344 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12345 +{
12346 + int ret;
12347 + lock_handle parent_lock;
12348 + load_count parent_load;
12349 + coord_t pcoord;
12350 +
12351 + assert("zam-851", znode_is_write_locked(node));
12352 +
12353 + init_lh(&parent_lock);
12354 + init_load_count(&parent_load);
12355 +
12356 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12357 + if (ret)
12358 + goto out;
12359 +
12360 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12361 + if (ret)
12362 + goto out;
12363 +
12364 + ret = find_child_ptr(parent_lock.node, node, &pcoord);
12365 + if (ret)
12366 + goto out;
12367 +
12368 + ret = allocate_znode(node, &pcoord, pos);
12369 +
12370 + out:
12371 + done_load_count(&parent_load);
12372 + done_lh(&parent_lock);
12373 + return ret;
12374 +}
12375 +
12376 +/* Process nodes on leaf level until unformatted node or rightmost node in the
12377 + * slum reached. */
12378 +static int handle_pos_on_formatted(flush_pos_t * pos)
12379 +{
12380 + int ret;
12381 + lock_handle right_lock;
12382 + load_count right_load;
12383 +
12384 + init_lh(&right_lock);
12385 + init_load_count(&right_load);
12386 +
12387 + if (should_convert_node(pos, pos->lock.node)) {
12388 + ret = convert_node(pos, pos->lock.node);
12389 + if (ret)
12390 + return ret;
12391 + }
12392 +
12393 + while (1) {
12394 + int expected;
12395 + expected = should_convert_next_node(pos);
12396 + ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12397 + ZNODE_WRITE_LOCK, !expected, expected);
12398 + if (ret) {
12399 + if (expected)
12400 + warning("edward-1495",
12401 + "Expected neighbor not found (ret = %d). Fsck?",
12402 + ret);
12403 + break;
12404 + }
12405 +
12406 + /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
12407 + * can be optimal. For now we choose to live with the risk that it will
12408 + * be suboptimal because it would be quite complex to code it to be
12409 + * smarter. */
12410 + if (znode_check_flushprepped(right_lock.node)
12411 + && !znode_convertible(right_lock.node)) {
12412 + assert("edward-1005", !should_convert_next_node(pos));
12413 + pos_stop(pos);
12414 + break;
12415 + }
12416 +
12417 + ret = incr_load_count_znode(&right_load, right_lock.node);
12418 + if (ret)
12419 + break;
12420 + if (should_convert_node(pos, right_lock.node)) {
12421 + ret = convert_node(pos, right_lock.node);
12422 + if (ret)
12423 + break;
12424 + if (node_is_empty(right_lock.node)) {
12425 + /* node became empty after converting, repeat */
12426 + done_load_count(&right_load);
12427 + done_lh(&right_lock);
12428 + continue;
12429 + }
12430 + }
12431 +
12432 + /* squeeze _before_ going upward. */
12433 + ret =
12434 + squeeze_right_neighbor(pos, pos->lock.node,
12435 + right_lock.node);
12436 + if (ret < 0)
12437 + break;
12438 +
12439 + if (znode_check_flushprepped(right_lock.node)) {
12440 + if (should_convert_next_node(pos)) {
12441 + /* in spite of flushprepped status of the node,
12442 + its right slum neighbor should be converted */
12443 + assert("edward-953", convert_data(pos));
12444 + assert("edward-954", item_convert_data(pos));
12445 +
12446 + if (node_is_empty(right_lock.node)) {
12447 + done_load_count(&right_load);
12448 + done_lh(&right_lock);
12449 + } else
12450 + move_flush_pos(pos, &right_lock,
12451 + &right_load, NULL);
12452 + continue;
12453 + }
12454 + pos_stop(pos);
12455 + break;
12456 + }
12457 +
12458 + if (node_is_empty(right_lock.node)) {
12459 + /* repeat if right node was squeezed completely */
12460 + done_load_count(&right_load);
12461 + done_lh(&right_lock);
12462 + continue;
12463 + }
12464 +
12465 + /* parent(right_lock.node) has to be processed before
12466 + * (right_lock.node) due to "parent-first" allocation order. */
12467 + ret =
12468 + check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12469 + right_lock.node);
12470 + if (ret)
12471 + break;
12472 + /* (re)allocate _after_ going upward */
12473 + ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12474 + if (ret)
12475 + break;
12476 + if (should_terminate_squalloc(pos)) {
12477 + set_item_convert_count(pos, 0);
12478 + break;
12479 + }
12480 +
12481 + /* advance the flush position to the right neighbor */
12482 + move_flush_pos(pos, &right_lock, &right_load, NULL);
12483 +
12484 + ret = rapid_flush(pos);
12485 + if (ret)
12486 + break;
12487 + }
12488 + check_convert_info(pos);
12489 + done_load_count(&right_load);
12490 + done_lh(&right_lock);
12491 +
12492 + /* This function indicates via pos whether to stop or go to twig or continue on current
12493 + * level. */
12494 + return ret;
12495 +
12496 +}
12497 +
12498 +/* Process nodes on leaf level until unformatted node or rightmost node in the
12499 + * slum reached. */
12500 +static int handle_pos_on_leaf(flush_pos_t * pos)
12501 +{
12502 + int ret;
12503 +
12504 + assert("zam-845", pos->state == POS_ON_LEAF);
12505 +
12506 + ret = handle_pos_on_formatted(pos);
12507 +
12508 + if (ret == -E_NO_NEIGHBOR) {
12509 + /* cannot get right neighbor, go process extents. */
12510 + pos->state = POS_TO_TWIG;
12511 + return 0;
12512 + }
12513 +
12514 + return ret;
12515 +}
12516 +
12517 +/* Process slum on level > 1 */
12518 +static int handle_pos_on_internal(flush_pos_t * pos)
12519 +{
12520 + assert("zam-850", pos->state == POS_ON_INTERNAL);
12521 + return handle_pos_on_formatted(pos);
12522 +}
12523 +
12524 +/* check whether squalloc should stop before processing given extent */
12525 +static int squalloc_extent_should_stop(flush_pos_t * pos)
12526 +{
12527 + assert("zam-869", item_is_extent(&pos->coord));
12528 +
12529 + /* pos->child is a jnode handle_pos_on_extent() should start with in
12530 + * stead of the first child of the first extent unit. */
12531 + if (pos->child) {
12532 + int prepped;
12533 +
12534 + assert("vs-1383", jnode_is_unformatted(pos->child));
12535 + prepped = jnode_check_flushprepped(pos->child);
12536 + pos->pos_in_unit =
12537 + jnode_get_index(pos->child) -
12538 + extent_unit_index(&pos->coord);
12539 + assert("vs-1470",
12540 + pos->pos_in_unit < extent_unit_width(&pos->coord));
12541 + assert("nikita-3434",
12542 + ergo(extent_is_unallocated(&pos->coord),
12543 + pos->pos_in_unit == 0));
12544 + jput(pos->child);
12545 + pos->child = NULL;
12546 +
12547 + return prepped;
12548 + }
12549 +
12550 + pos->pos_in_unit = 0;
12551 + if (extent_is_unallocated(&pos->coord))
12552 + return 0;
12553 +
12554 + return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12555 +}
12556 +
12557 +/* Handle the case when regular reiser4 tree (znodes connected one to its
12558 + * neighbors by sibling pointers) is interrupted on leaf level by one or more
12559 + * unformatted nodes. By having a lock on twig level and use extent code
12560 + * routines to process unformatted nodes we swim around an irregular part of
12561 + * reiser4 tree. */
12562 +static int handle_pos_on_twig(flush_pos_t * pos)
12563 +{
12564 + int ret;
12565 +
12566 + assert("zam-844", pos->state == POS_ON_EPOINT);
12567 + assert("zam-843", item_is_extent(&pos->coord));
12568 +
12569 + /* We decide should we continue slum processing with current extent
12570 + unit: if leftmost child of current extent unit is flushprepped
12571 + (i.e. clean or already processed by flush) we stop squalloc(). There
12572 + is a fast check for unallocated extents which we assume contain all
12573 + not flushprepped nodes. */
12574 + /* FIXME: Here we implement simple check, we are only looking on the
12575 + leftmost child. */
12576 + ret = squalloc_extent_should_stop(pos);
12577 + if (ret != 0) {
12578 + pos_stop(pos);
12579 + return ret;
12580 + }
12581 +
12582 + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12583 + && item_is_extent(&pos->coord)) {
12584 + ret = reiser4_alloc_extent(pos);
12585 + if (ret) {
12586 + break;
12587 + }
12588 + coord_next_unit(&pos->coord);
12589 + }
12590 +
12591 + if (coord_is_after_rightmost(&pos->coord)) {
12592 + pos->state = POS_END_OF_TWIG;
12593 + return 0;
12594 + }
12595 + if (item_is_internal(&pos->coord)) {
12596 + pos->state = POS_TO_LEAF;
12597 + return 0;
12598 + }
12599 +
12600 + assert("zam-860", item_is_extent(&pos->coord));
12601 +
12602 + /* "slum" is over */
12603 + pos->state = POS_INVALID;
12604 + return 0;
12605 +}
12606 +
12607 +/* When we about to return flush position from twig to leaf level we can process
12608 + * the right twig node or move position to the leaf. This processes right twig
12609 + * if it is possible and jump to leaf level if not. */
12610 +static int handle_pos_end_of_twig(flush_pos_t * pos)
12611 +{
12612 + int ret;
12613 + lock_handle right_lock;
12614 + load_count right_load;
12615 + coord_t at_right;
12616 + jnode *child = NULL;
12617 +
12618 + assert("zam-848", pos->state == POS_END_OF_TWIG);
12619 + assert("zam-849", coord_is_after_rightmost(&pos->coord));
12620 +
12621 + init_lh(&right_lock);
12622 + init_load_count(&right_load);
12623 +
12624 + /* We get a lock on the right twig node even it is not dirty because
12625 + * slum continues or discontinues on leaf level not on next twig. This
12626 + * lock on the right twig is needed for getting its leftmost child. */
12627 + ret =
12628 + reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12629 + ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12630 + if (ret)
12631 + goto out;
12632 +
12633 + ret = incr_load_count_znode(&right_load, right_lock.node);
12634 + if (ret)
12635 + goto out;
12636 +
12637 + /* right twig could be not dirty */
12638 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12639 + /* If right twig node is dirty we always attempt to squeeze it
12640 + * content to the left... */
12641 + became_dirty:
12642 + ret =
12643 + squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12644 + if (ret <= 0) {
12645 + /* pos->coord is on internal item, go to leaf level, or
12646 + * we have an error which will be caught in squalloc() */
12647 + pos->state = POS_TO_LEAF;
12648 + goto out;
12649 + }
12650 +
12651 + /* If right twig was squeezed completely we wave to re-lock
12652 + * right twig. now it is done through the top-level squalloc
12653 + * routine. */
12654 + if (node_is_empty(right_lock.node))
12655 + goto out;
12656 +
12657 + /* ... and prep it if it is not yet prepped */
12658 + if (!znode_check_flushprepped(right_lock.node)) {
12659 + /* As usual, process parent before ... */
12660 + ret =
12661 + check_parents_and_squalloc_upper_levels(pos,
12662 + pos->lock.
12663 + node,
12664 + right_lock.
12665 + node);
12666 + if (ret)
12667 + goto out;
12668 +
12669 + /* ... processing the child */
12670 + ret =
12671 + lock_parent_and_allocate_znode(right_lock.node,
12672 + pos);
12673 + if (ret)
12674 + goto out;
12675 + }
12676 + } else {
12677 + coord_init_first_unit(&at_right, right_lock.node);
12678 +
12679 + /* check first child of next twig, should we continue there ? */
12680 + ret = get_leftmost_child_of_unit(&at_right, &child);
12681 + if (ret || child == NULL || jnode_check_flushprepped(child)) {
12682 + pos_stop(pos);
12683 + goto out;
12684 + }
12685 +
12686 + /* check clean twig for possible relocation */
12687 + if (!znode_check_flushprepped(right_lock.node)) {
12688 + ret =
12689 + reverse_relocate_check_dirty_parent(child,
12690 + &at_right, pos);
12691 + if (ret)
12692 + goto out;
12693 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12694 + goto became_dirty;
12695 + }
12696 + }
12697 +
12698 + assert("zam-875", znode_check_flushprepped(right_lock.node));
12699 +
12700 + /* Update the preceder by a block number of just processed right twig
12701 + * node. The code above could miss the preceder updating because
12702 + * allocate_znode() could not be called for this node. */
12703 + pos->preceder.blk = *znode_get_block(right_lock.node);
12704 + check_preceder(pos->preceder.blk);
12705 +
12706 + coord_init_first_unit(&at_right, right_lock.node);
12707 + assert("zam-868", coord_is_existing_unit(&at_right));
12708 +
12709 + pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12710 + move_flush_pos(pos, &right_lock, &right_load, &at_right);
12711 +
12712 + out:
12713 + done_load_count(&right_load);
12714 + done_lh(&right_lock);
12715 +
12716 + if (child)
12717 + jput(child);
12718 +
12719 + return ret;
12720 +}
12721 +
12722 +/* Move the pos->lock to leaf node pointed by pos->coord, check should we
12723 + * continue there. */
12724 +static int handle_pos_to_leaf(flush_pos_t * pos)
12725 +{
12726 + int ret;
12727 + lock_handle child_lock;
12728 + load_count child_load;
12729 + jnode *child;
12730 +
12731 + assert("zam-846", pos->state == POS_TO_LEAF);
12732 + assert("zam-847", item_is_internal(&pos->coord));
12733 +
12734 + init_lh(&child_lock);
12735 + init_load_count(&child_load);
12736 +
12737 + ret = get_leftmost_child_of_unit(&pos->coord, &child);
12738 + if (ret)
12739 + return ret;
12740 + if (child == NULL) {
12741 + pos_stop(pos);
12742 + return 0;
12743 + }
12744 +
12745 + if (jnode_check_flushprepped(child)) {
12746 + pos->state = POS_INVALID;
12747 + goto out;
12748 + }
12749 +
12750 + ret =
12751 + longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12752 + ZNODE_LOCK_LOPRI);
12753 + if (ret)
12754 + goto out;
12755 +
12756 + ret = incr_load_count_znode(&child_load, JZNODE(child));
12757 + if (ret)
12758 + goto out;
12759 +
12760 + ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12761 + if (ret)
12762 + goto out;
12763 +
12764 + /* move flush position to leaf level */
12765 + pos->state = POS_ON_LEAF;
12766 + move_flush_pos(pos, &child_lock, &child_load, NULL);
12767 +
12768 + if (node_is_empty(JZNODE(child))) {
12769 + ret = delete_empty_node(JZNODE(child));
12770 + pos->state = POS_INVALID;
12771 + }
12772 + out:
12773 + done_load_count(&child_load);
12774 + done_lh(&child_lock);
12775 + jput(child);
12776 +
12777 + return ret;
12778 +}
12779 +
12780 +/* move pos from leaf to twig, and move lock from leaf to twig. */
12781 +/* Move pos->lock to upper (twig) level */
12782 +static int handle_pos_to_twig(flush_pos_t * pos)
12783 +{
12784 + int ret;
12785 +
12786 + lock_handle parent_lock;
12787 + load_count parent_load;
12788 + coord_t pcoord;
12789 +
12790 + assert("zam-852", pos->state == POS_TO_TWIG);
12791 +
12792 + init_lh(&parent_lock);
12793 + init_load_count(&parent_load);
12794 +
12795 + ret =
12796 + reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12797 + if (ret)
12798 + goto out;
12799 +
12800 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12801 + if (ret)
12802 + goto out;
12803 +
12804 + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12805 + if (ret)
12806 + goto out;
12807 +
12808 + assert("zam-870", item_is_internal(&pcoord));
12809 + coord_next_item(&pcoord);
12810 +
12811 + if (coord_is_after_rightmost(&pcoord))
12812 + pos->state = POS_END_OF_TWIG;
12813 + else if (item_is_extent(&pcoord))
12814 + pos->state = POS_ON_EPOINT;
12815 + else {
12816 + /* Here we understand that getting -E_NO_NEIGHBOR in
12817 + * handle_pos_on_leaf() was because of just a reaching edge of
12818 + * slum */
12819 + pos_stop(pos);
12820 + goto out;
12821 + }
12822 +
12823 + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12824 +
12825 + out:
12826 + done_load_count(&parent_load);
12827 + done_lh(&parent_lock);
12828 +
12829 + return ret;
12830 +}
12831 +
12832 +typedef int (*pos_state_handle_t) (flush_pos_t *);
12833 +static pos_state_handle_t flush_pos_handlers[] = {
12834 + /* process formatted nodes on leaf level, keep lock on a leaf node */
12835 + [POS_ON_LEAF] = handle_pos_on_leaf,
12836 + /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12837 + * being processed */
12838 + [POS_ON_EPOINT] = handle_pos_on_twig,
12839 + /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12840 + [POS_TO_TWIG] = handle_pos_to_twig,
12841 + /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
12842 + * pos->coord points to the leaf node we jump to */
12843 + [POS_TO_LEAF] = handle_pos_to_leaf,
12844 + /* after processing last extent in the twig node, attempting to shift items from the twigs
12845 + * right neighbor and process them while shifting */
12846 + [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12847 + /* process formatted nodes on internal level, keep lock on an internal node */
12848 + [POS_ON_INTERNAL] = handle_pos_on_internal
12849 +};
12850 +
12851 +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12852 + * encrypt) nodes and their ancestors in "parent-first" order */
12853 +static int squalloc(flush_pos_t * pos)
12854 +{
12855 + int ret = 0;
12856 +
12857 + /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12858 + * greater CPU efficiency? Measure and see.... -Hans */
12859 + while (pos_valid(pos)) {
12860 + ret = flush_pos_handlers[pos->state] (pos);
12861 + if (ret < 0)
12862 + break;
12863 +
12864 + ret = rapid_flush(pos);
12865 + if (ret)
12866 + break;
12867 + }
12868 +
12869 + /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos*
12870 + routines, -E_NO_NEIGHBOR means that slum edge was reached */
12871 + if (ret > 0 || ret == -E_NO_NEIGHBOR)
12872 + ret = 0;
12873 +
12874 + return ret;
12875 +}
12876 +
12877 +static void update_ldkey(znode * node)
12878 +{
12879 + reiser4_key ldkey;
12880 +
12881 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12882 + if (node_is_empty(node))
12883 + return;
12884 +
12885 + znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12886 +}
12887 +
12888 +/* this is to be called after calling of shift node's method to shift data from @right to
12889 + @left. It sets left delimiting keys of @left and @right to keys of first items of @left
12890 + and @right correspondingly and sets right delimiting key of @left to first key of @right */
12891 +static void update_znode_dkeys(znode * left, znode * right)
12892 +{
12893 + assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12894 + assert("vs-1629", (znode_is_write_locked(left) &&
12895 + znode_is_write_locked(right)));
12896 +
12897 + /* we need to update left delimiting of left if it was empty before shift */
12898 + update_ldkey(left);
12899 + update_ldkey(right);
12900 + if (node_is_empty(right))
12901 + znode_set_rd_key(left, znode_get_rd_key(right));
12902 + else
12903 + znode_set_rd_key(left, znode_get_ld_key(right));
12904 +}
12905 +
12906 +/* try to shift everything from @right to @left. If everything was shifted -
12907 + @right is removed from the tree. Result is the number of bytes shifted. */
12908 +static int
12909 +shift_everything_left(znode * right, znode * left, carry_level * todo)
12910 +{
12911 + coord_t from;
12912 + node_plugin *nplug;
12913 + carry_plugin_info info;
12914 +
12915 + coord_init_after_last_item(&from, right);
12916 +
12917 + nplug = node_plugin_by_node(right);
12918 + info.doing = NULL;
12919 + info.todo = todo;
12920 + return nplug->shift(&from, left, SHIFT_LEFT,
12921 + 1 /* delete @right if it becomes empty */ ,
12922 + 1
12923 + /* move coord @from to node @left if everything will be shifted */
12924 + ,
12925 + &info);
12926 +}
12927 +
12928 +/* Shift as much as possible from @right to @left using the memcpy-optimized
12929 + shift_everything_left. @left and @right are formatted neighboring nodes on
12930 + leaf level. */
12931 +static int squeeze_right_non_twig(znode * left, znode * right)
12932 +{
12933 + int ret;
12934 + carry_pool *pool;
12935 + carry_level *todo;
12936 +
12937 + assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12938 +
12939 + if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12940 + !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12941 + return SQUEEZE_TARGET_FULL;
12942 +
12943 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12944 + if (IS_ERR(pool))
12945 + return PTR_ERR(pool);
12946 + todo = (carry_level *) (pool + 1);
12947 + init_carry_level(todo, pool);
12948 +
12949 + ret = shift_everything_left(right, left, todo);
12950 + if (ret > 0) {
12951 + /* something was shifted */
12952 + reiser4_tree *tree;
12953 + __u64 grabbed;
12954 +
12955 + znode_make_dirty(left);
12956 + znode_make_dirty(right);
12957 +
12958 + /* update delimiting keys of nodes which participated in
12959 + shift. FIXME: it would be better to have this in shift
12960 + node's operation. But it can not be done there. Nobody
12961 + remembers why, though */
12962 + tree = znode_get_tree(left);
12963 + write_lock_dk(tree);
12964 + update_znode_dkeys(left, right);
12965 + write_unlock_dk(tree);
12966 +
12967 + /* Carry is called to update delimiting key and, maybe, to remove empty
12968 + node. */
12969 + grabbed = get_current_context()->grabbed_blocks;
12970 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12971 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
12972 + ret = reiser4_carry(todo, NULL /* previous level */ );
12973 + grabbed2free_mark(grabbed);
12974 + } else {
12975 + /* Shifting impossible, we return appropriate result code */
12976 + ret =
12977 + node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12978 + SQUEEZE_TARGET_FULL;
12979 + }
12980 +
12981 + done_carry_pool(pool);
12982 +
12983 + return ret;
12984 +}
12985 +
12986 +#if REISER4_DEBUG
12987 +static int sibling_link_is_ok(const znode *left, const znode *right)
12988 +{
12989 + int result;
12990 +
12991 + read_lock_tree(znode_get_tree(left));
12992 + result = (left->right == right && left == right->left);
12993 + read_unlock_tree(znode_get_tree(left));
12994 + return result;
12995 +}
12996 +#endif
12997 +
12998 +/* Shift first unit of first item if it is an internal one. Return
12999 + SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13000 + SUBTREE_MOVED. */
13001 +static int shift_one_internal_unit(znode * left, znode * right)
13002 +{
13003 + int ret;
13004 + carry_pool *pool;
13005 + carry_level *todo;
13006 + coord_t *coord;
13007 + carry_plugin_info *info;
13008 + int size, moved;
13009 +
13010 + assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13011 + assert("nikita-2435", znode_is_write_locked(left));
13012 + assert("nikita-2436", znode_is_write_locked(right));
13013 + assert("nikita-2434", sibling_link_is_ok(left, right));
13014 +
13015 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13016 + sizeof(*coord) + sizeof(*info)
13017 +#if REISER4_DEBUG
13018 + + sizeof(*coord) + 2 * sizeof(reiser4_key)
13019 +#endif
13020 + );
13021 + if (IS_ERR(pool))
13022 + return PTR_ERR(pool);
13023 + todo = (carry_level *) (pool + 1);
13024 + init_carry_level(todo, pool);
13025 +
13026 + coord = (coord_t *) (todo + 3);
13027 + coord_init_first_unit(coord, right);
13028 + info = (carry_plugin_info *) (coord + 1);
13029 +
13030 +#if REISER4_DEBUG
13031 + if (!node_is_empty(left)) {
13032 + coord_t *last;
13033 + reiser4_key *right_key;
13034 + reiser4_key *left_key;
13035 +
13036 + last = (coord_t *) (info + 1);
13037 + right_key = (reiser4_key *) (last + 1);
13038 + left_key = right_key + 1;
13039 + coord_init_last_unit(last, left);
13040 +
13041 + assert("nikita-2463",
13042 + keyle(item_key_by_coord(last, left_key),
13043 + item_key_by_coord(coord, right_key)));
13044 + }
13045 +#endif
13046 +
13047 + assert("jmacd-2007", item_is_internal(coord));
13048 +
13049 + size = item_length_by_coord(coord);
13050 + info->todo = todo;
13051 + info->doing = NULL;
13052 +
13053 + ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13054 + 1
13055 + /* delete @right if it becomes empty */
13056 + ,
13057 + 0
13058 + /* do not move coord @coord to node @left */
13059 + ,
13060 + info);
13061 +
13062 + /* If shift returns positive, then we shifted the item. */
13063 + assert("vs-423", ret <= 0 || size == ret);
13064 + moved = (ret > 0);
13065 +
13066 + if (moved) {
13067 + /* something was moved */
13068 + reiser4_tree *tree;
13069 + int grabbed;
13070 +
13071 + znode_make_dirty(left);
13072 + znode_make_dirty(right);
13073 + tree = znode_get_tree(left);
13074 + write_lock_dk(tree);
13075 + update_znode_dkeys(left, right);
13076 + write_unlock_dk(tree);
13077 +
13078 + /* reserve space for delimiting keys after shifting */
13079 + grabbed = get_current_context()->grabbed_blocks;
13080 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13081 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13082 +
13083 + ret = reiser4_carry(todo, NULL /* previous level */ );
13084 + grabbed2free_mark(grabbed);
13085 + }
13086 +
13087 + done_carry_pool(pool);
13088 +
13089 + if (ret != 0) {
13090 + /* Shift or carry operation failed. */
13091 + assert("jmacd-7325", ret < 0);
13092 + return ret;
13093 + }
13094 +
13095 + return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13096 +}
13097 +
13098 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
13099 + znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13100 +static int
13101 +allocate_znode_loaded(znode * node,
13102 + const coord_t * parent_coord, flush_pos_t * pos)
13103 +{
13104 + int ret;
13105 + reiser4_super_info_data *sbinfo = get_current_super_private();
13106 + /* FIXME(D): We have the node write-locked and should have checked for !
13107 + allocated() somewhere before reaching this point, but there can be a race, so
13108 + this assertion is bogus. */
13109 + assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13110 + assert("jmacd-7988", znode_is_write_locked(node));
13111 + assert("jmacd-7989", coord_is_invalid(parent_coord)
13112 + || znode_is_write_locked(parent_coord->node));
13113 +
13114 + if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13115 + znode_is_root(node) ||
13116 + /* We have enough nodes to relocate no matter what. */
13117 + (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13118 + /* No need to decide with new nodes, they are treated the same as
13119 + relocate. If the root node is dirty, relocate. */
13120 + if (pos->preceder.blk == 0) {
13121 + /* preceder is unknown and we have decided to relocate node --
13122 + using of default value for search start is better than search
13123 + from block #0. */
13124 + get_blocknr_hint_default(&pos->preceder.blk);
13125 + check_preceder(pos->preceder.blk);
13126 + }
13127 +
13128 + goto best_reloc;
13129 +
13130 + } else if (pos->preceder.blk == 0) {
13131 + /* If we don't know the preceder, leave it where it is. */
13132 + jnode_make_wander(ZJNODE(node));
13133 + } else {
13134 + /* Make a decision based on block distance. */
13135 + reiser4_block_nr dist;
13136 + reiser4_block_nr nblk = *znode_get_block(node);
13137 +
13138 + assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13139 + assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13140 + assert("jmacd-6174", pos->preceder.blk != 0);
13141 +
13142 + if (pos->preceder.blk == nblk - 1) {
13143 + /* Ideal. */
13144 + jnode_make_wander(ZJNODE(node));
13145 + } else {
13146 +
13147 + dist =
13148 + (nblk <
13149 + pos->preceder.blk) ? (pos->preceder.blk -
13150 + nblk) : (nblk -
13151 + pos->preceder.blk);
13152 +
13153 + /* See if we can find a closer block (forward direction only). */
13154 + pos->preceder.max_dist =
13155 + min((reiser4_block_nr) sbinfo->flush.
13156 + relocate_distance, dist);
13157 + pos->preceder.level = znode_get_level(node);
13158 +
13159 + ret = allocate_znode_update(node, parent_coord, pos);
13160 +
13161 + pos->preceder.max_dist = 0;
13162 +
13163 + if (ret && (ret != -ENOSPC))
13164 + return ret;
13165 +
13166 + if (ret == 0) {
13167 + /* Got a better allocation. */
13168 + znode_make_reloc(node, pos->fq);
13169 + } else if (dist < sbinfo->flush.relocate_distance) {
13170 + /* The present allocation is good enough. */
13171 + jnode_make_wander(ZJNODE(node));
13172 + } else {
13173 + /* Otherwise, try to relocate to the best position. */
13174 + best_reloc:
13175 + ret =
13176 + allocate_znode_update(node, parent_coord,
13177 + pos);
13178 + if (ret != 0)
13179 + return ret;
13180 +
13181 + /* set JNODE_RELOC bit _after_ node gets allocated */
13182 + znode_make_reloc(node, pos->fq);
13183 + }
13184 + }
13185 + }
13186 +
13187 + /* This is the new preceder. */
13188 + pos->preceder.blk = *znode_get_block(node);
13189 + check_preceder(pos->preceder.blk);
13190 + pos->alloc_cnt += 1;
13191 +
13192 + assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13193 +
13194 + return 0;
13195 +}
13196 +
13197 +static int
13198 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13199 +{
13200 + /*
13201 + * perform znode allocation with znode pinned in memory to avoid races
13202 + * with asynchronous emergency flush (which plays with
13203 + * JNODE_FLUSH_RESERVED bit).
13204 + */
13205 + return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13206 +}
13207 +
13208 +/* A subroutine of allocate_znode, this is called first to see if there is a close
13209 + position to relocate to. It may return ENOSPC if there is no close position. If there
13210 + is no close position it may not relocate. This takes care of updating the parent node
13211 + with the relocated block address. */
13212 +static int
13213 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13214 + flush_pos_t * pos)
13215 +{
13216 + int ret;
13217 + reiser4_block_nr blk;
13218 + lock_handle uber_lock;
13219 + int flush_reserved_used = 0;
13220 + int grabbed;
13221 + reiser4_context *ctx;
13222 + reiser4_super_info_data *sbinfo;
13223 +
13224 + init_lh(&uber_lock);
13225 +
13226 + ctx = get_current_context();
13227 + sbinfo = get_super_private(ctx->super);
13228 +
13229 + grabbed = ctx->grabbed_blocks;
13230 +
13231 + /* discard e-flush allocation */
13232 + ret = zload(node);
13233 + if (ret)
13234 + return ret;
13235 +
13236 + if (ZF_ISSET(node, JNODE_CREATED)) {
13237 + assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13238 + pos->preceder.block_stage = BLOCK_UNALLOCATED;
13239 + } else {
13240 + pos->preceder.block_stage = BLOCK_GRABBED;
13241 +
13242 + /* The disk space for relocating the @node is already reserved in "flush reserved"
13243 + * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
13244 + * space from whole disk not from only 95%). */
13245 + if (znode_get_level(node) == LEAF_LEVEL) {
13246 + /*
13247 + * earlier (during do_jnode_make_dirty()) we decided
13248 + * that @node can possibly go into overwrite set and
13249 + * reserved block for its wandering location.
13250 + */
13251 + txn_atom *atom = get_current_atom_locked();
13252 + assert("nikita-3449",
13253 + ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13254 + flush_reserved2grabbed(atom, (__u64) 1);
13255 + spin_unlock_atom(atom);
13256 + /*
13257 + * we are trying to move node into relocate
13258 + * set. Allocation of relocated position "uses"
13259 + * reserved block.
13260 + */
13261 + ZF_CLR(node, JNODE_FLUSH_RESERVED);
13262 + flush_reserved_used = 1;
13263 + } else {
13264 + ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13265 + if (ret != 0)
13266 + goto exit;
13267 + }
13268 + }
13269 +
13270 + /* We may do not use 5% of reserved disk space here and flush will not pack tightly. */
13271 + ret = reiser4_alloc_block(&pos->preceder, &blk,
13272 + BA_FORMATTED | BA_PERMANENT);
13273 + if (ret)
13274 + goto exit;
13275 +
13276 + if (!ZF_ISSET(node, JNODE_CREATED) &&
13277 + (ret =
13278 + reiser4_dealloc_block(znode_get_block(node), 0,
13279 + BA_DEFER | BA_FORMATTED)))
13280 + goto exit;
13281 +
13282 + if (likely(!znode_is_root(node))) {
13283 + item_plugin *iplug;
13284 +
13285 + iplug = item_plugin_by_coord(parent_coord);
13286 + assert("nikita-2954", iplug->f.update != NULL);
13287 + iplug->f.update(parent_coord, &blk);
13288 +
13289 + znode_make_dirty(parent_coord->node);
13290 +
13291 + } else {
13292 + reiser4_tree *tree = znode_get_tree(node);
13293 + znode *uber;
13294 +
13295 + /* We take a longterm lock on the fake node in order to change
13296 + the root block number. This may cause atom fusion. */
13297 + ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13298 + &uber_lock);
13299 + /* The fake node cannot be deleted, and we must have priority
13300 + here, and may not be confused with ENOSPC. */
13301 + assert("jmacd-74412",
13302 + ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13303 +
13304 + if (ret)
13305 + goto exit;
13306 +
13307 + uber = uber_lock.node;
13308 +
13309 + write_lock_tree(tree);
13310 + tree->root_block = blk;
13311 + write_unlock_tree(tree);
13312 +
13313 + znode_make_dirty(uber);
13314 + }
13315 +
13316 + ret = znode_rehash(node, &blk);
13317 + exit:
13318 + if (ret) {
13319 + /* Get flush reserved block back if something fails, because
13320 + * callers assume that on error block wasn't relocated and its
13321 + * flush reserved block wasn't used. */
13322 + if (flush_reserved_used) {
13323 + /*
13324 + * ok, we failed to move node into relocate
13325 + * set. Restore status quo.
13326 + */
13327 + grabbed2flush_reserved((__u64) 1);
13328 + ZF_SET(node, JNODE_FLUSH_RESERVED);
13329 + }
13330 + }
13331 + zrelse(node);
13332 + done_lh(&uber_lock);
13333 + grabbed2free_mark(grabbed);
13334 + return ret;
13335 +}
13336 +
13337 +/* JNODE INTERFACE */
13338 +
13339 +/* Lock a node (if formatted) and then get its parent locked, set the child's
13340 + coordinate in the parent. If the child is the root node, the above_root
13341 + znode is returned but the coord is not set. This function may cause atom
13342 + fusion, but it is only used for read locks (at this point) and therefore
13343 + fusion only occurs when the parent is already dirty. */
13344 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13345 + pointer in jnodes. */
13346 +static int
13347 +jnode_lock_parent_coord(jnode * node,
13348 + coord_t * coord,
13349 + lock_handle * parent_lh,
13350 + load_count * parent_zh,
13351 + znode_lock_mode parent_mode, int try)
13352 +{
13353 + int ret;
13354 +
13355 + assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13356 + assert("edward-54", jnode_is_unformatted(node)
13357 + || znode_is_any_locked(JZNODE(node)));
13358 +
13359 + if (!jnode_is_znode(node)) {
13360 + reiser4_key key;
13361 + tree_level stop_level = TWIG_LEVEL;
13362 + lookup_bias bias = FIND_EXACT;
13363 +
13364 + assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13365 +
13366 + /* The case when node is not znode, but can have parent coord
13367 + (unformatted node, node which represents cluster page,
13368 + etc..). Generate a key for the appropriate entry, search
13369 + in the tree using coord_by_key, which handles locking for
13370 + us. */
13371 +
13372 + /*
13373 + * nothing is locked at this moment, so, nothing prevents
13374 + * concurrent truncate from removing jnode from inode. To
13375 + * prevent this spin-lock jnode. jnode can be truncated just
13376 + * after call to the jnode_build_key(), but this is ok,
13377 + * because coord_by_key() will just fail to find appropriate
13378 + * extent.
13379 + */
13380 + spin_lock_jnode(node);
13381 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13382 + jnode_build_key(node, &key);
13383 + ret = 0;
13384 + } else
13385 + ret = RETERR(-ENOENT);
13386 + spin_unlock_jnode(node);
13387 +
13388 + if (ret != 0)
13389 + return ret;
13390 +
13391 + if (jnode_is_cluster_page(node))
13392 + stop_level = LEAF_LEVEL;
13393 +
13394 + assert("jmacd-1812", coord != NULL);
13395 +
13396 + ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13397 + parent_mode, bias, stop_level, stop_level,
13398 + CBK_UNIQUE, NULL /*ra_info */ );
13399 + switch (ret) {
13400 + case CBK_COORD_NOTFOUND:
13401 + assert("edward-1038",
13402 + ergo(jnode_is_cluster_page(node),
13403 + JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13404 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13405 + warning("nikita-3177", "Parent not found");
13406 + return ret;
13407 + case CBK_COORD_FOUND:
13408 + if (coord->between != AT_UNIT) {
13409 + /* FIXME: comment needed */
13410 + done_lh(parent_lh);
13411 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13412 + warning("nikita-3178",
13413 + "Found but not happy: %i",
13414 + coord->between);
13415 + }
13416 + return RETERR(-ENOENT);
13417 + }
13418 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13419 + if (ret != 0)
13420 + return ret;
13421 + /* if (jnode_is_cluster_page(node)) {
13422 + races with write() are possible
13423 + check_child_cluster (parent_lh->node);
13424 + }
13425 + */
13426 + break;
13427 + default:
13428 + return ret;
13429 + }
13430 +
13431 + } else {
13432 + int flags;
13433 + znode *z;
13434 +
13435 + z = JZNODE(node);
13436 + /* Formatted node case: */
13437 + assert("jmacd-2061", !znode_is_root(z));
13438 +
13439 + flags = GN_ALLOW_NOT_CONNECTED;
13440 + if (try)
13441 + flags |= GN_TRY_LOCK;
13442 +
13443 + ret =
13444 + reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13445 + if (ret != 0)
13446 + /* -E_REPEAT is ok here, it is handled by the caller. */
13447 + return ret;
13448 +
13449 + /* Make the child's position "hint" up-to-date. (Unless above
13450 + root, which caller must check.) */
13451 + if (coord != NULL) {
13452 +
13453 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13454 + if (ret != 0) {
13455 + warning("jmacd-976812386",
13456 + "incr_load_count_znode failed: %d",
13457 + ret);
13458 + return ret;
13459 + }
13460 +
13461 + ret = find_child_ptr(parent_lh->node, z, coord);
13462 + if (ret != 0) {
13463 + warning("jmacd-976812",
13464 + "find_child_ptr failed: %d", ret);
13465 + return ret;
13466 + }
13467 + }
13468 + }
13469 +
13470 + return 0;
13471 +}
13472 +
13473 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13474 + If there is no next neighbor or the neighbor is not in memory or if there is a
13475 + neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13476 + In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */
13477 +static int neighbor_in_slum(znode * node, /* starting point */
13478 + lock_handle * lock, /* lock on starting point */
13479 + sideof side, /* left or right direction we seek the next node in */
13480 + znode_lock_mode mode, /* kind of lock we want */
13481 + int check_dirty, /* true if the neighbor should be dirty */
13482 + int use_upper_levels /* get neighbor by going though
13483 + upper levels */)
13484 +{
13485 + int ret;
13486 + int flags;
13487 +
13488 + assert("jmacd-6334", znode_is_connected(node));
13489 +
13490 + flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13491 + if (use_upper_levels)
13492 + flags |= GN_CAN_USE_UPPER_LEVELS;
13493 +
13494 + ret = reiser4_get_neighbor(lock, node, mode, flags);
13495 + if (ret) {
13496 + /* May return -ENOENT or -E_NO_NEIGHBOR. */
13497 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13498 + if (ret == -ENOENT) {
13499 + ret = RETERR(-E_NO_NEIGHBOR);
13500 + }
13501 + return ret;
13502 + }
13503 + if (!check_dirty)
13504 + return 0;
13505 + /* Check dirty bit of locked znode, no races here */
13506 + if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13507 + return 0;
13508 +
13509 + done_lh(lock);
13510 + return RETERR(-E_NO_NEIGHBOR);
13511 +}
13512 +
13513 +/* Return true if two znodes have the same parent. This is called with both nodes
13514 + write-locked (for squeezing) so no tree lock is needed. */
13515 +static int znode_same_parents(znode * a, znode * b)
13516 +{
13517 + int result;
13518 +
13519 + assert("jmacd-7011", znode_is_write_locked(a));
13520 + assert("jmacd-7012", znode_is_write_locked(b));
13521 +
13522 + /* We lock the whole tree for this check.... I really don't like whole tree
13523 + * locks... -Hans */
13524 + read_lock_tree(znode_get_tree(a));
13525 + result = (znode_parent(a) == znode_parent(b));
13526 + read_unlock_tree(znode_get_tree(a));
13527 + return result;
13528 +}
13529 +
13530 +/* FLUSH SCAN */
13531 +
13532 +/* Initialize the flush_scan data structure. */
13533 +static void scan_init(flush_scan * scan)
13534 +{
13535 + memset(scan, 0, sizeof(*scan));
13536 + init_lh(&scan->node_lock);
13537 + init_lh(&scan->parent_lock);
13538 + init_load_count(&scan->parent_load);
13539 + init_load_count(&scan->node_load);
13540 + coord_init_invalid(&scan->parent_coord, NULL);
13541 +}
13542 +
13543 +/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13544 +static void scan_done(flush_scan * scan)
13545 +{
13546 + done_load_count(&scan->node_load);
13547 + if (scan->node != NULL) {
13548 + jput(scan->node);
13549 + scan->node = NULL;
13550 + }
13551 + done_load_count(&scan->parent_load);
13552 + done_lh(&scan->parent_lock);
13553 + done_lh(&scan->node_lock);
13554 +}
13555 +
13556 +/* Returns true if flush scanning is finished. */
13557 +int reiser4_scan_finished(flush_scan * scan)
13558 +{
13559 + return scan->stop || (scan->direction == RIGHT_SIDE &&
13560 + scan->count >= scan->max_count);
13561 +}
13562 +
13563 +/* Return true if the scan should continue to the @tonode. True if the node meets the
13564 + same_slum_check condition. If not, deref the "left" node and stop the scan. */
13565 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13566 +{
13567 + int go = same_slum_check(scan->node, tonode, 1, 0);
13568 +
13569 + if (!go) {
13570 + scan->stop = 1;
13571 + jput(tonode);
13572 + }
13573 +
13574 + return go;
13575 +}
13576 +
13577 +/* Set the current scan->node, refcount it, increment count by the @add_count (number to
13578 + count, e.g., skipped unallocated nodes), deref previous current, and copy the current
13579 + parent coordinate. */
13580 +int
13581 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13582 + const coord_t * parent)
13583 +{
13584 + /* Release the old references, take the new reference. */
13585 + done_load_count(&scan->node_load);
13586 +
13587 + if (scan->node != NULL) {
13588 + jput(scan->node);
13589 + }
13590 + scan->node = node;
13591 + scan->count += add_count;
13592 +
13593 + /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
13594 + delay this update step until it finishes and update the parent_coord only once.
13595 + It did that before, but there was a bug and this was the easiest way to make it
13596 + correct. */
13597 + if (parent != NULL) {
13598 + coord_dup(&scan->parent_coord, parent);
13599 + }
13600 +
13601 + /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13602 + is safely taken. */
13603 + return incr_load_count_jnode(&scan->node_load, node);
13604 +}
13605 +
13606 +/* Return true if scanning in the leftward direction. */
13607 +int reiser4_scanning_left(flush_scan * scan)
13608 +{
13609 + return scan->direction == LEFT_SIDE;
13610 +}
13611 +
13612 +/* Performs leftward scanning starting from either kind of node. Counts the starting
13613 + node. The right-scan object is passed in for the left-scan in order to copy the parent
13614 + of an unformatted starting position. This way we avoid searching for the unformatted
13615 + node's parent when scanning in each direction. If we search for the parent once it is
13616 + set in both scan objects. The limit parameter tells flush-scan when to stop.
13617 +
13618 + Rapid scanning is used only during scan_left, where we are interested in finding the
13619 + 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13620 + of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13621 + problem is finding a way to flush only those nodes without unallocated children, and it
13622 + is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13623 + problem can be solved by scanning left at every level as we go upward, but this would
13624 + basically bring us back to using a top-down allocation strategy, which we already tried
13625 + (see BK history from May 2002), and has a different set of problems. The top-down
13626 + strategy makes avoiding unallocated children easier, but makes it difficult to
13627 + propertly flush dirty children with clean parents that would otherwise stop the
13628 + top-down flush, only later to dirty the parent once the children are flushed. So we
13629 + solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13630 + only.
13631 +
13632 + The first step in solving the problem is this rapid leftward scan. After we determine
13633 + that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13634 + are no longer interested in the exact count, we are only interested in finding a the
13635 + best place to start the flush. We could choose one of two possibilities:
13636 +
13637 + 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13638 + This requires checking one leaf per rapid-scan twig
13639 +
13640 + 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13641 + to the left. This requires checking possibly all of the in-memory children of each
13642 + twig during the rapid scan.
13643 +
13644 + For now we implement the first policy.
13645 +*/
13646 +static int
13647 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13648 +{
13649 + int ret = 0;
13650 +
13651 + scan->max_count = limit;
13652 + scan->direction = LEFT_SIDE;
13653 +
13654 + ret = scan_set_current(scan, jref(node), 1, NULL);
13655 + if (ret != 0) {
13656 + return ret;
13657 + }
13658 +
13659 + ret = scan_common(scan, right);
13660 + if (ret != 0) {
13661 + return ret;
13662 + }
13663 +
13664 + /* Before rapid scanning, we need a lock on scan->node so that we can get its
13665 + parent, only if formatted. */
13666 + if (jnode_is_znode(scan->node)) {
13667 + ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13668 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13669 + }
13670 +
13671 + /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13672 + return ret;
13673 +}
13674 +
13675 +/* Performs rightward scanning... Does not count the starting node. The limit parameter
13676 + is described in scan_left. If the starting node is unformatted then the
13677 + parent_coord was already set during scan_left. The rapid_after parameter is not used
13678 + during right-scanning.
13679 +
13680 + scan_right is only called if the scan_left operation does not count at least
13681 + FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13682 + the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13683 + scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13684 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13685 +{
13686 + int ret;
13687 +
13688 + scan->max_count = limit;
13689 + scan->direction = RIGHT_SIDE;
13690 +
13691 + ret = scan_set_current(scan, jref(node), 0, NULL);
13692 + if (ret != 0) {
13693 + return ret;
13694 + }
13695 +
13696 + return scan_common(scan, NULL);
13697 +}
13698 +
13699 +/* Common code to perform left or right scanning. */
13700 +static int scan_common(flush_scan * scan, flush_scan * other)
13701 +{
13702 + int ret;
13703 +
13704 + assert("nikita-2376", scan->node != NULL);
13705 + assert("edward-54", jnode_is_unformatted(scan->node)
13706 + || jnode_is_znode(scan->node));
13707 +
13708 + /* Special case for starting at an unformatted node. Optimization: we only want
13709 + to search for the parent (which requires a tree traversal) once. Obviously, we
13710 + shouldn't have to call it once for the left scan and once for the right scan.
13711 + For this reason, if we search for the parent during scan-left we then duplicate
13712 + the coord/lock/load into the scan-right object. */
13713 + if (jnode_is_unformatted(scan->node)) {
13714 + ret = scan_unformatted(scan, other);
13715 + if (ret != 0)
13716 + return ret;
13717 + }
13718 + /* This loop expects to start at a formatted position and performs chaining of
13719 + formatted regions */
13720 + while (!reiser4_scan_finished(scan)) {
13721 +
13722 + ret = scan_formatted(scan);
13723 + if (ret != 0) {
13724 + return ret;
13725 + }
13726 + }
13727 +
13728 + return 0;
13729 +}
13730 +
13731 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
13732 +{
13733 + int ret = 0;
13734 + int try = 0;
13735 +
13736 + if (!coord_is_invalid(&scan->parent_coord))
13737 + goto scan;
13738 +
13739 + /* set parent coord from */
13740 + if (!jnode_is_unformatted(scan->node)) {
13741 + /* formatted position */
13742 +
13743 + lock_handle lock;
13744 + assert("edward-301", jnode_is_znode(scan->node));
13745 + init_lh(&lock);
13746 +
13747 + /*
13748 + * when flush starts from unformatted node, first thing it
13749 + * does is tree traversal to find formatted parent of starting
13750 + * node. This parent is then kept lock across scans to the
13751 + * left and to the right. This means that during scan to the
13752 + * left we cannot take left-ward lock, because this is
13753 + * dead-lock prone. So, if we are scanning to the left and
13754 + * there is already lock held by this thread,
13755 + * jnode_lock_parent_coord() should use try-lock.
13756 + */
13757 + try = reiser4_scanning_left(scan)
13758 + && !lock_stack_isclean(get_current_lock_stack());
13759 + /* Need the node locked to get the parent lock, We have to
13760 + take write lock since there is at least one call path
13761 + where this znode is already write-locked by us. */
13762 + ret =
13763 + longterm_lock_znode(&lock, JZNODE(scan->node),
13764 + ZNODE_WRITE_LOCK,
13765 + reiser4_scanning_left(scan) ?
13766 + ZNODE_LOCK_LOPRI :
13767 + ZNODE_LOCK_HIPRI);
13768 + if (ret != 0)
13769 + /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
13770 + scanned too far and can't back out, just start over. */
13771 + return ret;
13772 +
13773 + ret = jnode_lock_parent_coord(scan->node,
13774 + &scan->parent_coord,
13775 + &scan->parent_lock,
13776 + &scan->parent_load,
13777 + ZNODE_WRITE_LOCK, try);
13778 +
13779 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13780 + done_lh(&lock);
13781 + if (ret == -E_REPEAT) {
13782 + scan->stop = 1;
13783 + return 0;
13784 + }
13785 + if (ret)
13786 + return ret;
13787 +
13788 + } else {
13789 + /* unformatted position */
13790 +
13791 + ret =
13792 + jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13793 + &scan->parent_lock,
13794 + &scan->parent_load,
13795 + ZNODE_WRITE_LOCK, try);
13796 +
13797 + if (IS_CBKERR(ret))
13798 + return ret;
13799 +
13800 + if (ret == CBK_COORD_NOTFOUND)
13801 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13802 + return ret;
13803 +
13804 + /* parent was found */
13805 + assert("jmacd-8661", other != NULL);
13806 + /* Duplicate the reference into the other flush_scan. */
13807 + coord_dup(&other->parent_coord, &scan->parent_coord);
13808 + copy_lh(&other->parent_lock, &scan->parent_lock);
13809 + copy_load_count(&other->parent_load, &scan->parent_load);
13810 + }
13811 + scan:
13812 + return scan_by_coord(scan);
13813 +}
13814 +
13815 +/* Performs left- or rightward scanning starting from a formatted node. Follow left
13816 + pointers under tree lock as long as:
13817 +
13818 + - node->left/right is non-NULL
13819 + - node->left/right is connected, dirty
13820 + - node->left/right belongs to the same atom
13821 + - scan has not reached maximum count
13822 +*/
13823 +static int scan_formatted(flush_scan * scan)
13824 +{
13825 + int ret;
13826 + znode *neighbor = NULL;
13827 +
13828 + assert("jmacd-1401", !reiser4_scan_finished(scan));
13829 +
13830 + do {
13831 + znode *node = JZNODE(scan->node);
13832 +
13833 + /* Node should be connected, but if not stop the scan. */
13834 + if (!znode_is_connected(node)) {
13835 + scan->stop = 1;
13836 + break;
13837 + }
13838 +
13839 + /* Lock the tree, check-for and reference the next sibling. */
13840 + read_lock_tree(znode_get_tree(node));
13841 +
13842 + /* It may be that a node is inserted or removed between a node and its
13843 + left sibling while the tree lock is released, but the flush-scan count
13844 + does not need to be precise. Thus, we release the tree lock as soon as
13845 + we get the neighboring node. */
13846 + neighbor =
13847 + reiser4_scanning_left(scan) ? node->left : node->right;
13848 + if (neighbor != NULL) {
13849 + zref(neighbor);
13850 + }
13851 +
13852 + read_unlock_tree(znode_get_tree(node));
13853 +
13854 + /* If neighbor is NULL at the leaf level, need to check for an unformatted
13855 + sibling using the parent--break in any case. */
13856 + if (neighbor == NULL) {
13857 + break;
13858 + }
13859 +
13860 + /* Check the condition for going left, break if it is not met. This also
13861 + releases (jputs) the neighbor if false. */
13862 + if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13863 + break;
13864 + }
13865 +
13866 + /* Advance the flush_scan state to the left, repeat. */
13867 + ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13868 + if (ret != 0) {
13869 + return ret;
13870 + }
13871 +
13872 + } while (!reiser4_scan_finished(scan));
13873 +
13874 + /* If neighbor is NULL then we reached the end of a formatted region, or else the
13875 + sibling is out of memory, now check for an extent to the left (as long as
13876 + LEAF_LEVEL). */
13877 + if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13878 + || reiser4_scan_finished(scan)) {
13879 + scan->stop = 1;
13880 + return 0;
13881 + }
13882 + /* Otherwise, calls scan_by_coord for the right(left)most item of the
13883 + left(right) neighbor on the parent level, then possibly continue. */
13884 +
13885 + coord_init_invalid(&scan->parent_coord, NULL);
13886 + return scan_unformatted(scan, NULL);
13887 +}
13888 +
13889 +/* NOTE-EDWARD:
13890 + This scans adjacent items of the same type and calls scan flush plugin for each one.
13891 + Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start
13892 + from unformatted node, then we continue only if the next neighbor is also unformatted.
13893 + When called from scan_formatted, we skip first iteration (to make sure that
13894 + right(left)most item of the left(right) neighbor on the parent level is of the same
13895 + type and set appropriate coord). */
13896 +static int scan_by_coord(flush_scan * scan)
13897 +{
13898 + int ret = 0;
13899 + int scan_this_coord;
13900 + lock_handle next_lock;
13901 + load_count next_load;
13902 + coord_t next_coord;
13903 + jnode *child;
13904 + item_plugin *iplug;
13905 +
13906 + init_lh(&next_lock);
13907 + init_load_count(&next_load);
13908 + scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13909 +
13910 + /* set initial item id */
13911 + iplug = item_plugin_by_coord(&scan->parent_coord);
13912 +
13913 + for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13914 + if (scan_this_coord) {
13915 + /* Here we expect that unit is scannable. it would not be so due
13916 + * to race with extent->tail conversion. */
13917 + if (iplug->f.scan == NULL) {
13918 + scan->stop = 1;
13919 + ret = -E_REPEAT;
13920 + /* skip the check at the end. */
13921 + goto race;
13922 + }
13923 +
13924 + ret = iplug->f.scan(scan);
13925 + if (ret != 0)
13926 + goto exit;
13927 +
13928 + if (reiser4_scan_finished(scan)) {
13929 + checkchild(scan);
13930 + break;
13931 + }
13932 + } else {
13933 + /* the same race against truncate as above is possible
13934 + * here, it seems */
13935 +
13936 + /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13937 + the first coordinate. */
13938 + assert("jmacd-1231",
13939 + item_is_internal(&scan->parent_coord));
13940 + }
13941 +
13942 + if (iplug->f.utmost_child == NULL
13943 + || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13944 + /* stop this coord and continue on parrent level */
13945 + ret =
13946 + scan_set_current(scan,
13947 + ZJNODE(zref
13948 + (scan->parent_coord.node)),
13949 + 1, NULL);
13950 + if (ret != 0)
13951 + goto exit;
13952 + break;
13953 + }
13954 +
13955 + /* Either way, the invariant is that scan->parent_coord is set to the
13956 + parent of scan->node. Now get the next unit. */
13957 + coord_dup(&next_coord, &scan->parent_coord);
13958 + coord_sideof_unit(&next_coord, scan->direction);
13959 +
13960 + /* If off-the-end of the twig, try the next twig. */
13961 + if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13962 + /* We take the write lock because we may start flushing from this
13963 + * coordinate. */
13964 + ret = neighbor_in_slum(next_coord.node,
13965 + &next_lock,
13966 + scan->direction,
13967 + ZNODE_WRITE_LOCK,
13968 + 1 /* check dirty */,
13969 + 0 /* don't go though upper
13970 + levels */);
13971 + if (ret == -E_NO_NEIGHBOR) {
13972 + scan->stop = 1;
13973 + ret = 0;
13974 + break;
13975 + }
13976 +
13977 + if (ret != 0) {
13978 + goto exit;
13979 + }
13980 +
13981 + ret = incr_load_count_znode(&next_load, next_lock.node);
13982 + if (ret != 0) {
13983 + goto exit;
13984 + }
13985 +
13986 + coord_init_sideof_unit(&next_coord, next_lock.node,
13987 + sideof_reverse(scan->direction));
13988 + }
13989 +
13990 + iplug = item_plugin_by_coord(&next_coord);
13991 +
13992 + /* Get the next child. */
13993 + ret =
13994 + iplug->f.utmost_child(&next_coord,
13995 + sideof_reverse(scan->direction),
13996 + &child);
13997 + if (ret != 0)
13998 + goto exit;
13999 + /* If the next child is not in memory, or, item_utmost_child
14000 + failed (due to race with unlink, most probably), stop
14001 + here. */
14002 + if (child == NULL || IS_ERR(child)) {
14003 + scan->stop = 1;
14004 + checkchild(scan);
14005 + break;
14006 + }
14007 +
14008 + assert("nikita-2374", jnode_is_unformatted(child)
14009 + || jnode_is_znode(child));
14010 +
14011 + /* See if it is dirty, part of the same atom. */
14012 + if (!reiser4_scan_goto(scan, child)) {
14013 + checkchild(scan);
14014 + break;
14015 + }
14016 +
14017 + /* If so, make this child current. */
14018 + ret = scan_set_current(scan, child, 1, &next_coord);
14019 + if (ret != 0)
14020 + goto exit;
14021 +
14022 + /* Now continue. If formatted we release the parent lock and return, then
14023 + proceed. */
14024 + if (jnode_is_znode(child))
14025 + break;
14026 +
14027 + /* Otherwise, repeat the above loop with next_coord. */
14028 + if (next_load.node != NULL) {
14029 + done_lh(&scan->parent_lock);
14030 + move_lh(&scan->parent_lock, &next_lock);
14031 + move_load_count(&scan->parent_load, &next_load);
14032 + }
14033 + }
14034 +
14035 + assert("jmacd-6233",
14036 + reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14037 + exit:
14038 + checkchild(scan);
14039 + race: /* skip the above check */
14040 + if (jnode_is_znode(scan->node)) {
14041 + done_lh(&scan->parent_lock);
14042 + done_load_count(&scan->parent_load);
14043 + }
14044 +
14045 + done_load_count(&next_load);
14046 + done_lh(&next_lock);
14047 + return ret;
14048 +}
14049 +
14050 +/* FLUSH POS HELPERS */
14051 +
14052 +/* Initialize the fields of a flush_position. */
14053 +static void pos_init(flush_pos_t * pos)
14054 +{
14055 + memset(pos, 0, sizeof *pos);
14056 +
14057 + pos->state = POS_INVALID;
14058 + coord_init_invalid(&pos->coord, NULL);
14059 + init_lh(&pos->lock);
14060 + init_load_count(&pos->load);
14061 +
14062 + reiser4_blocknr_hint_init(&pos->preceder);
14063 +}
14064 +
14065 +/* The flush loop inside squalloc periodically checks pos_valid to
14066 + determine when "enough flushing" has been performed. This will return true until one
14067 + of the following conditions is met:
14068 +
14069 + 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14070 + parameter, meaning we have flushed as many blocks as the kernel requested. When
14071 + flushing to commit, this parameter is NULL.
14072 +
14073 + 2. pos_stop() is called because squalloc discovers that the "next" node in the
14074 + flush order is either non-existant, not dirty, or not in the same atom.
14075 +*/
14076 +
14077 +static int pos_valid(flush_pos_t * pos)
14078 +{
14079 + return pos->state != POS_INVALID;
14080 +}
14081 +
14082 +/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14083 +static void pos_done(flush_pos_t * pos)
14084 +{
14085 + pos_stop(pos);
14086 + reiser4_blocknr_hint_done(&pos->preceder);
14087 + if (convert_data(pos))
14088 + free_convert_data(pos);
14089 +}
14090 +
14091 +/* Reset the point and parent. Called during flush subroutines to terminate the
14092 + squalloc loop. */
14093 +static int pos_stop(flush_pos_t * pos)
14094 +{
14095 + pos->state = POS_INVALID;
14096 + done_lh(&pos->lock);
14097 + done_load_count(&pos->load);
14098 + coord_init_invalid(&pos->coord, NULL);
14099 +
14100 + if (pos->child) {
14101 + jput(pos->child);
14102 + pos->child = NULL;
14103 + }
14104 +
14105 + return 0;
14106 +}
14107 +
14108 +/* Return the flush_position's block allocator hint. */
14109 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14110 +{
14111 + return &pos->preceder;
14112 +}
14113 +
14114 +flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14115 +{
14116 + return pos->fq;
14117 +}
14118 +
14119 +/* Make Linus happy.
14120 + Local variables:
14121 + c-indentation-style: "K&R"
14122 + mode-name: "LC"
14123 + c-basic-offset: 8
14124 + tab-width: 8
14125 + fill-column: 90
14126 + LocalWords: preceder
14127 + End:
14128 +*/
14129 diff -urN linux-2.6.23.orig/fs/reiser4/flush.h linux-2.6.23/fs/reiser4/flush.h
14130 --- linux-2.6.23.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14131 +++ linux-2.6.23/fs/reiser4/flush.h 2007-12-04 16:49:30.000000000 +0300
14132 @@ -0,0 +1,295 @@
14133 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14134 +
14135 +/* DECLARATIONS: */
14136 +
14137 +#if !defined(__REISER4_FLUSH_H__)
14138 +#define __REISER4_FLUSH_H__
14139 +
14140 +#include "plugin/cluster.h"
14141 +
14142 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14143 + single level of the tree. A flush-scan is used for counting the number of adjacent
14144 + nodes to flush, which is used to determine whether we should relocate, and it is also
14145 + used to find a starting point for flush. A flush-scan object can scan in both right
14146 + and left directions via the scan_left() and scan_right() interfaces. The
14147 + right- and left-variations are similar but perform different functions. When scanning
14148 + left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14149 + When scanning right we are simply counting the number of adjacent, dirty nodes. */
14150 +struct flush_scan {
14151 +
14152 + /* The current number of nodes scanned on this level. */
14153 + unsigned count;
14154 +
14155 + /* There may be a maximum number of nodes for a scan on any single level. When
14156 + going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14157 + unsigned max_count;
14158 +
14159 + /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14160 + sideof direction;
14161 +
14162 + /* Initially @stop is set to false then set true once some condition stops the
14163 + search (e.g., we found a clean node before reaching max_count or we found a
14164 + node belonging to another atom). */
14165 + int stop;
14166 +
14167 + /* The current scan position. If @node is non-NULL then its reference count has
14168 + been incremented to reflect this reference. */
14169 + jnode *node;
14170 +
14171 + /* A handle for zload/zrelse of current scan position node. */
14172 + load_count node_load;
14173 +
14174 + /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14175 + node is locked using this lock handle. The endpoint needs to be locked for
14176 + transfer to the flush_position object after scanning finishes. */
14177 + lock_handle node_lock;
14178 +
14179 + /* When the position is unformatted, its parent, coordinate, and parent
14180 + zload/zrelse handle. */
14181 + lock_handle parent_lock;
14182 + coord_t parent_coord;
14183 + load_count parent_load;
14184 +
14185 + /* The block allocator preceder hint. Sometimes flush_scan determines what the
14186 + preceder is and if so it sets it here, after which it is copied into the
14187 + flush_position. Otherwise, the preceder is computed later. */
14188 + reiser4_block_nr preceder_blk;
14189 +};
14190 +
14191 +struct convert_item_info {
14192 + dc_item_stat d_cur; /* disk cluster state of the current item */
14193 + dc_item_stat d_next; /* disk cluster state of the next slum item */
14194 + struct inode *inode;
14195 + flow_t flow;
14196 +};
14197 +
14198 +struct convert_info {
14199 + int count; /* for squalloc terminating */
14200 + item_plugin *iplug; /* current item plugin */
14201 + struct convert_item_info *itm; /* current item info */
14202 + struct cluster_handle clust; /* transform cluster */
14203 +};
14204 +
14205 +typedef enum flush_position_state {
14206 + POS_INVALID, /* Invalid or stopped pos, do not continue slum
14207 + * processing */
14208 + POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14209 + * leaf level */
14210 + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14211 + * to traverse unformatted nodes */
14212 + POS_TO_LEAF, /* pos is being moved to leaf level */
14213 + POS_TO_TWIG, /* pos is being moved to twig level */
14214 + POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14215 + * rightmost unit of the current twig */
14216 + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14217 +} flushpos_state_t;
14218 +
14219 +/* An encapsulation of the current flush point and all the parameters that are passed
14220 + through the entire squeeze-and-allocate stage of the flush routine. A single
14221 + flush_position object is constructed after left- and right-scanning finishes. */
14222 +struct flush_position {
14223 + flushpos_state_t state;
14224 +
14225 + coord_t coord; /* coord to traverse unformatted nodes */
14226 + lock_handle lock; /* current lock we hold */
14227 + load_count load; /* load status for current locked formatted node */
14228 +
14229 + jnode *child; /* for passing a reference to unformatted child
14230 + * across pos state changes */
14231 +
14232 + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14233 + int leaf_relocate; /* True if enough leaf-level nodes were
14234 + * found to suggest a relocate policy. */
14235 + int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14236 + int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14237 + flush_queue_t *fq;
14238 + long *nr_written; /* number of nodes submitted to disk */
14239 + int flags; /* a copy of jnode_flush flags argument */
14240 +
14241 + znode *prev_twig; /* previous parent pointer value, used to catch
14242 + * processing of new twig node */
14243 + struct convert_info *sq; /* convert info */
14244 +
14245 + unsigned long pos_in_unit; /* for extents only. Position
14246 + within an extent unit of first
14247 + jnode of slum */
14248 + long nr_to_write; /* number of unformatted nodes to handle on flush */
14249 +};
14250 +
14251 +static inline int item_convert_count(flush_pos_t * pos)
14252 +{
14253 + return pos->sq->count;
14254 +}
14255 +static inline void inc_item_convert_count(flush_pos_t * pos)
14256 +{
14257 + pos->sq->count++;
14258 +}
14259 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14260 +{
14261 + pos->sq->count = count;
14262 +}
14263 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14264 +{
14265 + return pos->sq->iplug;
14266 +}
14267 +
14268 +static inline struct convert_info *convert_data(flush_pos_t * pos)
14269 +{
14270 + return pos->sq;
14271 +}
14272 +
14273 +static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
14274 +{
14275 + assert("edward-955", convert_data(pos));
14276 + return pos->sq->itm;
14277 +}
14278 +
14279 +static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
14280 +{
14281 + return &pos->sq->clust.tc;
14282 +}
14283 +
14284 +static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
14285 + tfm_stream_id id)
14286 +{
14287 + assert("edward-854", pos->sq != NULL);
14288 + return get_tfm_stream(tfm_cluster_sq(pos), id);
14289 +}
14290 +
14291 +static inline int chaining_data_present(flush_pos_t * pos)
14292 +{
14293 + return convert_data(pos) && item_convert_data(pos);
14294 +}
14295 +
14296 +/* Returns true if next node contains next item of the disk cluster
14297 + so item convert data should be moved to the right slum neighbor.
14298 +*/
14299 +static inline int should_chain_next_node(flush_pos_t * pos)
14300 +{
14301 + int result = 0;
14302 +
14303 + assert("edward-1007", chaining_data_present(pos));
14304 +
14305 + switch (item_convert_data(pos)->d_next) {
14306 + case DC_CHAINED_ITEM:
14307 + result = 1;
14308 + break;
14309 + case DC_AFTER_CLUSTER:
14310 + break;
14311 + default:
14312 + impossible("edward-1009", "bad state of next slum item");
14313 + }
14314 + return result;
14315 +}
14316 +
14317 +/* update item state in a disk cluster to assign conversion mode */
14318 +static inline void
14319 +move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14320 +{
14321 +
14322 + assert("edward-1010", chaining_data_present(pos));
14323 +
14324 + if (this_node == 0) {
14325 + /* next item is on the right neighbor */
14326 + assert("edward-1011",
14327 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14328 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14329 + assert("edward-1012",
14330 + item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14331 +
14332 + item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14333 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14334 + } else {
14335 + /* next item is on the same node */
14336 + assert("edward-1013",
14337 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14338 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14339 + assert("edward-1227",
14340 + item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14341 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
14342 +
14343 + item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14344 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14345 + }
14346 +}
14347 +
14348 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14349 +{
14350 + return znode_convertible(node);
14351 +}
14352 +
14353 +/* true if there is attached convert item info */
14354 +static inline int should_convert_next_node(flush_pos_t * pos)
14355 +{
14356 + return convert_data(pos) && item_convert_data(pos);
14357 +}
14358 +
14359 +#define SQUALLOC_THRESHOLD 256
14360 +
14361 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14362 +{
14363 + return convert_data(pos) &&
14364 + !item_convert_data(pos) &&
14365 + item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14366 +}
14367 +
14368 +#if 1
14369 +#define check_convert_info(pos) \
14370 +do { \
14371 + if (unlikely(should_convert_next_node(pos))){ \
14372 + warning("edward-1006", "unprocessed chained data"); \
14373 + printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
14374 + item_convert_data(pos)->d_cur, \
14375 + item_convert_data(pos)->d_next, \
14376 + item_convert_data(pos)->flow.length); \
14377 + printk("inode %llu, size = %llu, cluster %lu\n", \
14378 + (unsigned long long)get_inode_oid \
14379 + (item_convert_data(pos)->inode), \
14380 + i_size_read(item_convert_data(pos)->inode), \
14381 + convert_data(pos)->clust.index); \
14382 + } \
14383 +} while (0)
14384 +#else
14385 +#define check_convert_info(pos)
14386 +#endif /* check_convert_info */
14387 +
14388 +void free_convert_data(flush_pos_t * pos);
14389 +/* used in extent.c */
14390 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14391 + const coord_t * parent);
14392 +int reiser4_scan_finished(flush_scan * scan);
14393 +int reiser4_scanning_left(flush_scan * scan);
14394 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14395 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14396 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14397 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14398 + reiser4_key *stop_key);
14399 +extern int reiser4_init_fqs(void);
14400 +extern void reiser4_done_fqs(void);
14401 +
14402 +#if REISER4_DEBUG
14403 +
14404 +extern void reiser4_check_fq(const txn_atom *atom);
14405 +extern atomic_t flush_cnt;
14406 +
14407 +#define check_preceder(blk) \
14408 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14409 +extern void check_pos(flush_pos_t * pos);
14410 +#else
14411 +#define check_preceder(b) noop
14412 +#define check_pos(pos) noop
14413 +#endif
14414 +
14415 +/* __REISER4_FLUSH_H__ */
14416 +#endif
14417 +
14418 +/* Make Linus happy.
14419 + Local variables:
14420 + c-indentation-style: "K&R"
14421 + mode-name: "LC"
14422 + c-basic-offset: 8
14423 + tab-width: 8
14424 + fill-column: 90
14425 + LocalWords: preceder
14426 + End:
14427 +*/
14428 diff -urN linux-2.6.23.orig/fs/reiser4/flush_queue.c linux-2.6.23/fs/reiser4/flush_queue.c
14429 --- linux-2.6.23.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14430 +++ linux-2.6.23/fs/reiser4/flush_queue.c 2007-12-04 21:05:55.782803824 +0300
14431 @@ -0,0 +1,680 @@
14432 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14433 +
14434 +#include "debug.h"
14435 +#include "super.h"
14436 +#include "txnmgr.h"
14437 +#include "jnode.h"
14438 +#include "znode.h"
14439 +#include "page_cache.h"
14440 +#include "wander.h"
14441 +#include "vfs_ops.h"
14442 +#include "writeout.h"
14443 +#include "flush.h"
14444 +
14445 +#include <linux/bio.h>
14446 +#include <linux/mm.h>
14447 +#include <linux/pagemap.h>
14448 +#include <linux/blkdev.h>
14449 +#include <linux/writeback.h>
14450 +
14451 +/* A flush queue object is an accumulator for keeping jnodes prepared
14452 + by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14453 + kept on the flush queue until memory pressure or atom commit asks
14454 + flush queues to write some or all from their jnodes. */
14455 +
14456 +/*
14457 + LOCKING:
14458 +
14459 + fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped
14460 + list protected by atom spin lock. fq->prepped list uses the following
14461 + locking:
14462 +
14463 + two ways to protect fq->prepped list for read-only list traversal:
14464 +
14465 + 1. atom spin-lock atom.
14466 + 2. fq is IN_USE, atom->nr_running_queues increased.
14467 +
14468 + and one for list modification:
14469 +
14470 + 1. atom is spin-locked and one condition is true: fq is IN_USE or
14471 + atom->nr_running_queues == 0.
14472 +
14473 + The deadlock-safe order for flush queues and atoms is: first lock atom, then
14474 + lock flush queue, then lock jnode.
14475 +*/
14476 +
14477 +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14478 +#define fq_ready(fq) (!fq_in_use(fq))
14479 +
14480 +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14481 +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14482 +
14483 +/* get lock on atom from locked flush queue object */
14484 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14485 +{
14486 + /* This code is similar to jnode_get_atom(), look at it for the
14487 + * explanation. */
14488 + txn_atom *atom;
14489 +
14490 + assert_spin_locked(&(fq->guard));
14491 +
14492 + while (1) {
14493 + atom = fq->atom;
14494 + if (atom == NULL)
14495 + break;
14496 +
14497 + if (spin_trylock_atom(atom))
14498 + break;
14499 +
14500 + atomic_inc(&atom->refcount);
14501 + spin_unlock(&(fq->guard));
14502 + spin_lock_atom(atom);
14503 + spin_lock(&(fq->guard));
14504 +
14505 + if (fq->atom == atom) {
14506 + atomic_dec(&atom->refcount);
14507 + break;
14508 + }
14509 +
14510 + spin_unlock(&(fq->guard));
14511 + atom_dec_and_unlock(atom);
14512 + spin_lock(&(fq->guard));
14513 + }
14514 +
14515 + return atom;
14516 +}
14517 +
14518 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14519 +{
14520 + txn_atom *atom;
14521 +
14522 + spin_lock(&(fq->guard));
14523 + atom = atom_locked_by_fq_nolock(fq);
14524 + spin_unlock(&(fq->guard));
14525 + return atom;
14526 +}
14527 +
14528 +static void init_fq(flush_queue_t * fq)
14529 +{
14530 + memset(fq, 0, sizeof *fq);
14531 +
14532 + atomic_set(&fq->nr_submitted, 0);
14533 +
14534 + INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14535 +
14536 + init_waitqueue_head(&fq->wait);
14537 + spin_lock_init(&fq->guard);
14538 +}
14539 +
14540 +/* slab for flush queues */
14541 +static struct kmem_cache *fq_slab;
14542 +
14543 +/**
14544 + * reiser4_init_fqs - create flush queue cache
14545 + *
14546 + * Initializes slab cache of flush queues. It is part of reiser4 module
14547 + * initialization.
14548 + */
14549 +int reiser4_init_fqs(void)
14550 +{
14551 + fq_slab = kmem_cache_create("fq",
14552 + sizeof(flush_queue_t),
14553 + 0, SLAB_HWCACHE_ALIGN, NULL);
14554 + if (fq_slab == NULL)
14555 + return RETERR(-ENOMEM);
14556 + return 0;
14557 +}
14558 +
14559 +/**
14560 + * reiser4_done_fqs - delete flush queue cache
14561 + *
14562 + * This is called on reiser4 module unloading or system shutdown.
14563 + */
14564 +void reiser4_done_fqs(void)
14565 +{
14566 + destroy_reiser4_cache(&fq_slab);
14567 +}
14568 +
14569 +/* create new flush queue object */
14570 +static flush_queue_t *create_fq(gfp_t gfp)
14571 +{
14572 + flush_queue_t *fq;
14573 +
14574 + fq = kmem_cache_alloc(fq_slab, gfp);
14575 + if (fq)
14576 + init_fq(fq);
14577 +
14578 + return fq;
14579 +}
14580 +
14581 +/* adjust atom's and flush queue's counters of queued nodes */
14582 +static void count_enqueued_node(flush_queue_t * fq)
14583 +{
14584 + ON_DEBUG(fq->atom->num_queued++);
14585 +}
14586 +
14587 +static void count_dequeued_node(flush_queue_t * fq)
14588 +{
14589 + assert("zam-993", fq->atom->num_queued > 0);
14590 + ON_DEBUG(fq->atom->num_queued--);
14591 +}
14592 +
14593 +/* attach flush queue object to the atom */
14594 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14595 +{
14596 + assert_spin_locked(&(atom->alock));
14597 + list_add(&fq->alink, &atom->flush_queues);
14598 + fq->atom = atom;
14599 + ON_DEBUG(atom->nr_flush_queues++);
14600 +}
14601 +
14602 +static void detach_fq(flush_queue_t * fq)
14603 +{
14604 + assert_spin_locked(&(fq->atom->alock));
14605 +
14606 + spin_lock(&(fq->guard));
14607 + list_del_init(&fq->alink);
14608 + assert("vs-1456", fq->atom->nr_flush_queues > 0);
14609 + ON_DEBUG(fq->atom->nr_flush_queues--);
14610 + fq->atom = NULL;
14611 + spin_unlock(&(fq->guard));
14612 +}
14613 +
14614 +/* destroy flush queue object */
14615 +static void done_fq(flush_queue_t * fq)
14616 +{
14617 + assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14618 + assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14619 +
14620 + kmem_cache_free(fq_slab, fq);
14621 +}
14622 +
14623 +/* */
14624 +static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14625 +{
14626 + JF_SET(node, JNODE_FLUSH_QUEUED);
14627 + count_enqueued_node(fq);
14628 +}
14629 +
14630 +/* Putting jnode into the flush queue. Both atom and jnode should be
14631 + spin-locked. */
14632 +void queue_jnode(flush_queue_t * fq, jnode * node)
14633 +{
14634 + assert_spin_locked(&(node->guard));
14635 + assert("zam-713", node->atom != NULL);
14636 + assert_spin_locked(&(node->atom->alock));
14637 + assert("zam-716", fq->atom != NULL);
14638 + assert("zam-717", fq->atom == node->atom);
14639 + assert("zam-907", fq_in_use(fq));
14640 +
14641 + assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14642 + assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14643 + assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14644 + assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14645 +
14646 + mark_jnode_queued(fq, node);
14647 + list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14648 +
14649 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14650 + FQ_LIST, 1));
14651 +}
14652 +
14653 +/* repeatable process for waiting io completion on a flush queue object */
14654 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14655 +{
14656 + assert("zam-738", fq->atom != NULL);
14657 + assert_spin_locked(&(fq->atom->alock));
14658 + assert("zam-736", fq_in_use(fq));
14659 + assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14660 +
14661 + if (atomic_read(&fq->nr_submitted) != 0) {
14662 + struct super_block *super;
14663 +
14664 + spin_unlock_atom(fq->atom);
14665 +
14666 + assert("nikita-3013", reiser4_schedulable());
14667 +
14668 + super = reiser4_get_current_sb();
14669 +
14670 + /* FIXME: this is instead of blk_run_queues() */
14671 + blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14672 +
14673 + if (!(super->s_flags & MS_RDONLY))
14674 + wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14675 +
14676 + /* Ask the caller to re-acquire the locks and call this
14677 + function again. Note: this technique is commonly used in
14678 + the txnmgr code. */
14679 + return -E_REPEAT;
14680 + }
14681 +
14682 + *nr_io_errors += atomic_read(&fq->nr_errors);
14683 + return 0;
14684 +}
14685 +
14686 +/* wait on I/O completion, re-submit dirty nodes to write */
14687 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14688 +{
14689 + int ret;
14690 + txn_atom *atom = fq->atom;
14691 +
14692 + assert("zam-801", atom != NULL);
14693 + assert_spin_locked(&(atom->alock));
14694 + assert("zam-762", fq_in_use(fq));
14695 +
14696 + ret = wait_io(fq, nr_io_errors);
14697 + if (ret)
14698 + return ret;
14699 +
14700 + detach_fq(fq);
14701 + done_fq(fq);
14702 +
14703 + reiser4_atom_send_event(atom);
14704 +
14705 + return 0;
14706 +}
14707 +
14708 +/* wait for all i/o for given atom to be completed, actually do one iteration
14709 + on that and return -E_REPEAT if more iterations are needed */
14710 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14711 +{
14712 + flush_queue_t *fq;
14713 +
14714 + assert_spin_locked(&(atom->alock));
14715 +
14716 + if (list_empty_careful(&atom->flush_queues))
14717 + return 0;
14718 +
14719 + list_for_each_entry(fq, &atom->flush_queues, alink) {
14720 + if (fq_ready(fq)) {
14721 + int ret;
14722 +
14723 + mark_fq_in_use(fq);
14724 + assert("vs-1247", fq->owner == NULL);
14725 + ON_DEBUG(fq->owner = current);
14726 + ret = finish_fq(fq, nr_io_errors);
14727 +
14728 + if (*nr_io_errors)
14729 + reiser4_handle_error();
14730 +
14731 + if (ret) {
14732 + reiser4_fq_put(fq);
14733 + return ret;
14734 + }
14735 +
14736 + spin_unlock_atom(atom);
14737 +
14738 + return -E_REPEAT;
14739 + }
14740 + }
14741 +
14742 + /* All flush queues are in use; atom remains locked */
14743 + return -EBUSY;
14744 +}
14745 +
14746 +/* wait all i/o for current atom */
14747 +int current_atom_finish_all_fq(void)
14748 +{
14749 + txn_atom *atom;
14750 + int nr_io_errors = 0;
14751 + int ret = 0;
14752 +
14753 + do {
14754 + while (1) {
14755 + atom = get_current_atom_locked();
14756 + ret = finish_all_fq(atom, &nr_io_errors);
14757 + if (ret != -EBUSY)
14758 + break;
14759 + reiser4_atom_wait_event(atom);
14760 + }
14761 + } while (ret == -E_REPEAT);
14762 +
14763 + /* we do not need locked atom after this function finishes, SUCCESS or
14764 + -EBUSY are two return codes when atom remains locked after
14765 + finish_all_fq */
14766 + if (!ret)
14767 + spin_unlock_atom(atom);
14768 +
14769 + assert_spin_not_locked(&(atom->alock));
14770 +
14771 + if (ret)
14772 + return ret;
14773 +
14774 + if (nr_io_errors)
14775 + return RETERR(-EIO);
14776 +
14777 + return 0;
14778 +}
14779 +
14780 +/* change node->atom field for all jnode from given list */
14781 +static void
14782 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14783 +{
14784 + jnode *cur;
14785 +
14786 + list_for_each_entry(cur, list, capture_link) {
14787 + spin_lock_jnode(cur);
14788 + cur->atom = atom;
14789 + spin_unlock_jnode(cur);
14790 + }
14791 +}
14792 +
14793 +/* support for atom fusion operation */
14794 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14795 +{
14796 + flush_queue_t *fq;
14797 +
14798 + assert_spin_locked(&(to->alock));
14799 + assert_spin_locked(&(from->alock));
14800 +
14801 + list_for_each_entry(fq, &from->flush_queues, alink) {
14802 + scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14803 + spin_lock(&(fq->guard));
14804 + fq->atom = to;
14805 + spin_unlock(&(fq->guard));
14806 + }
14807 +
14808 + list_splice_init(&from->flush_queues, to->flush_queues.prev);
14809 +
14810 +#if REISER4_DEBUG
14811 + to->num_queued += from->num_queued;
14812 + to->nr_flush_queues += from->nr_flush_queues;
14813 + from->nr_flush_queues = 0;
14814 +#endif
14815 +}
14816 +
14817 +#if REISER4_DEBUG
14818 +int atom_fq_parts_are_clean(txn_atom * atom)
14819 +{
14820 + assert("zam-915", atom != NULL);
14821 + return list_empty_careful(&atom->flush_queues);
14822 +}
14823 +#endif
14824 +/* Bio i/o completion routine for reiser4 write operations. */
14825 +static int
14826 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
14827 + int err)
14828 +{
14829 + int i;
14830 + int nr_errors = 0;
14831 + flush_queue_t *fq;
14832 +
14833 + assert("zam-958", bio->bi_rw & WRITE);
14834 +
14835 + /* i/o op. is not fully completed */
14836 + if (bio->bi_size != 0)
14837 + return 1;
14838 +
14839 + if (err == -EOPNOTSUPP)
14840 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14841 +
14842 + /* we expect that bio->bi_private is set to NULL or fq object which is used
14843 + * for synchronization and error counting. */
14844 + fq = bio->bi_private;
14845 + /* Check all elements of io_vec for correct write completion. */
14846 + for (i = 0; i < bio->bi_vcnt; i += 1) {
14847 + struct page *pg = bio->bi_io_vec[i].bv_page;
14848 +
14849 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14850 + SetPageError(pg);
14851 + nr_errors++;
14852 + }
14853 +
14854 + {
14855 + /* jnode WRITEBACK ("write is in progress bit") is
14856 + * atomically cleared here. */
14857 + jnode *node;
14858 +
14859 + assert("zam-736", pg != NULL);
14860 + assert("zam-736", PagePrivate(pg));
14861 + node = jprivate(pg);
14862 +
14863 + JF_CLR(node, JNODE_WRITEBACK);
14864 + }
14865 +
14866 + end_page_writeback(pg);
14867 + page_cache_release(pg);
14868 + }
14869 +
14870 + if (fq) {
14871 + /* count i/o error in fq object */
14872 + atomic_add(nr_errors, &fq->nr_errors);
14873 +
14874 + /* If all write requests registered in this "fq" are done we up
14875 + * the waiter. */
14876 + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14877 + wake_up(&fq->wait);
14878 + }
14879 +
14880 + bio_put(bio);
14881 + return 0;
14882 +}
14883 +
14884 +/* Count I/O requests which will be submitted by @bio in the given flush queue
14885 + @fq */
14886 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14887 +{
14888 + bio->bi_private = fq;
14889 + bio->bi_end_io = end_io_handler;
14890 +
14891 + if (fq)
14892 + atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14893 +}
14894 +
14895 +/* Move all queued nodes out from @fq->prepped list. */
14896 +static void release_prepped_list(flush_queue_t * fq)
14897 +{
14898 + txn_atom *atom;
14899 +
14900 + assert("zam-904", fq_in_use(fq));
14901 + atom = atom_locked_by_fq(fq);
14902 +
14903 + while (!list_empty(ATOM_FQ_LIST(fq))) {
14904 + jnode *cur;
14905 +
14906 + cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14907 + list_del_init(&cur->capture_link);
14908 +
14909 + count_dequeued_node(fq);
14910 + spin_lock_jnode(cur);
14911 + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14912 + assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14913 + assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14914 + JF_CLR(cur, JNODE_FLUSH_QUEUED);
14915 +
14916 + if (JF_ISSET(cur, JNODE_DIRTY)) {
14917 + list_add_tail(&cur->capture_link,
14918 + ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14919 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14920 + DIRTY_LIST, 1));
14921 + } else {
14922 + list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14923 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14924 + CLEAN_LIST, 1));
14925 + }
14926 +
14927 + spin_unlock_jnode(cur);
14928 + }
14929 +
14930 + if (--atom->nr_running_queues == 0)
14931 + reiser4_atom_send_event(atom);
14932 +
14933 + spin_unlock_atom(atom);
14934 +}
14935 +
14936 +/* Submit write requests for nodes on the already filled flush queue @fq.
14937 +
14938 + @fq: flush queue object which contains jnodes we can (and will) write.
14939 + @return: number of submitted blocks (>=0) if success, otherwise -- an error
14940 + code (<0). */
14941 +int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14942 +{
14943 + int ret;
14944 + txn_atom *atom;
14945 +
14946 + while (1) {
14947 + atom = atom_locked_by_fq(fq);
14948 + assert("zam-924", atom);
14949 + /* do not write fq in parallel. */
14950 + if (atom->nr_running_queues == 0
14951 + || !(flags & WRITEOUT_SINGLE_STREAM))
14952 + break;
14953 + reiser4_atom_wait_event(atom);
14954 + }
14955 +
14956 + atom->nr_running_queues++;
14957 + spin_unlock_atom(atom);
14958 +
14959 + ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14960 + release_prepped_list(fq);
14961 +
14962 + return ret;
14963 +}
14964 +
14965 +/* Getting flush queue object for exclusive use by one thread. May require
14966 + several iterations which is indicated by -E_REPEAT return code.
14967 +
14968 + This function does not contain code for obtaining an atom lock because an
14969 + atom lock is obtained by different ways in different parts of reiser4,
14970 + usually it is current atom, but we need a possibility for getting fq for the
14971 + atom of given jnode. */
14972 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14973 +{
14974 + flush_queue_t *fq;
14975 +
14976 + assert_spin_locked(&(atom->alock));
14977 +
14978 + fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14979 + while (&atom->flush_queues != &fq->alink) {
14980 + spin_lock(&(fq->guard));
14981 +
14982 + if (fq_ready(fq)) {
14983 + mark_fq_in_use(fq);
14984 + assert("vs-1246", fq->owner == NULL);
14985 + ON_DEBUG(fq->owner = current);
14986 + spin_unlock(&(fq->guard));
14987 +
14988 + if (*new_fq)
14989 + done_fq(*new_fq);
14990 +
14991 + *new_fq = fq;
14992 +
14993 + return 0;
14994 + }
14995 +
14996 + spin_unlock(&(fq->guard));
14997 +
14998 + fq = list_entry(fq->alink.next, flush_queue_t, alink);
14999 + }
15000 +
15001 + /* Use previously allocated fq object */
15002 + if (*new_fq) {
15003 + mark_fq_in_use(*new_fq);
15004 + assert("vs-1248", (*new_fq)->owner == 0);
15005 + ON_DEBUG((*new_fq)->owner = current);
15006 + attach_fq(atom, *new_fq);
15007 +
15008 + return 0;
15009 + }
15010 +
15011 + spin_unlock_atom(atom);
15012 +
15013 + *new_fq = create_fq(gfp);
15014 +
15015 + if (*new_fq == NULL)
15016 + return RETERR(-ENOMEM);
15017 +
15018 + return RETERR(-E_REPEAT);
15019 +}
15020 +
15021 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15022 +{
15023 + return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15024 +}
15025 +
15026 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15027 + object for current atom, if success fq->atom remains locked. */
15028 +flush_queue_t *get_fq_for_current_atom(void)
15029 +{
15030 + flush_queue_t *fq = NULL;
15031 + txn_atom *atom;
15032 + int ret;
15033 +
15034 + do {
15035 + atom = get_current_atom_locked();
15036 + ret = reiser4_fq_by_atom(atom, &fq);
15037 + } while (ret == -E_REPEAT);
15038 +
15039 + if (ret)
15040 + return ERR_PTR(ret);
15041 + return fq;
15042 +}
15043 +
15044 +/* Releasing flush queue object after exclusive use */
15045 +void reiser4_fq_put_nolock(flush_queue_t *fq)
15046 +{
15047 + assert("zam-747", fq->atom != NULL);
15048 + assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15049 + mark_fq_ready(fq);
15050 + assert("vs-1245", fq->owner == current);
15051 + ON_DEBUG(fq->owner = NULL);
15052 +}
15053 +
15054 +void reiser4_fq_put(flush_queue_t * fq)
15055 +{
15056 + txn_atom *atom;
15057 +
15058 + spin_lock(&(fq->guard));
15059 + atom = atom_locked_by_fq_nolock(fq);
15060 +
15061 + assert("zam-746", atom != NULL);
15062 +
15063 + reiser4_fq_put_nolock(fq);
15064 + reiser4_atom_send_event(atom);
15065 +
15066 + spin_unlock(&(fq->guard));
15067 + spin_unlock_atom(atom);
15068 +}
15069 +
15070 +/* A part of atom object initialization related to the embedded flush queue
15071 + list head */
15072 +
15073 +void init_atom_fq_parts(txn_atom *atom)
15074 +{
15075 + INIT_LIST_HEAD(&atom->flush_queues);
15076 +}
15077 +
15078 +#if REISER4_DEBUG
15079 +
15080 +void reiser4_check_fq(const txn_atom *atom)
15081 +{
15082 + /* check number of nodes on all atom's flush queues */
15083 + flush_queue_t *fq;
15084 + int count;
15085 + struct list_head *pos;
15086 +
15087 + count = 0;
15088 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15089 + spin_lock(&(fq->guard));
15090 + /* calculate number of jnodes on fq' list of prepped jnodes */
15091 + list_for_each(pos, ATOM_FQ_LIST(fq))
15092 + count++;
15093 + spin_unlock(&(fq->guard));
15094 + }
15095 + if (count != atom->fq)
15096 + warning("", "fq counter %d, real %d\n", atom->fq, count);
15097 +
15098 +}
15099 +
15100 +#endif
15101 +
15102 +/*
15103 + * Local variables:
15104 + * c-indentation-style: "K&R"
15105 + * mode-name: "LC"
15106 + * c-basic-offset: 8
15107 + * tab-width: 8
15108 + * fill-column: 79
15109 + * scroll-step: 1
15110 + * End:
15111 + */
15112 diff -urN linux-2.6.23.orig/fs/reiser4/forward.h linux-2.6.23/fs/reiser4/forward.h
15113 --- linux-2.6.23.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15114 +++ linux-2.6.23/fs/reiser4/forward.h 2007-12-04 16:49:30.000000000 +0300
15115 @@ -0,0 +1,252 @@
15116 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15117 +
15118 +/* Forward declarations. Thank you Kernighan. */
15119 +
15120 +#if !defined( __REISER4_FORWARD_H__ )
15121 +#define __REISER4_FORWARD_H__
15122 +
15123 +#include <asm/errno.h>
15124 +#include <linux/types.h>
15125 +
15126 +typedef struct zlock zlock;
15127 +typedef struct lock_stack lock_stack;
15128 +typedef struct lock_handle lock_handle;
15129 +typedef struct znode znode;
15130 +typedef struct flow flow_t;
15131 +typedef struct coord coord_t;
15132 +typedef struct tree_access_pointer tap_t;
15133 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15134 +typedef union reiser4_plugin reiser4_plugin;
15135 +typedef __u16 reiser4_plugin_id;
15136 +typedef __u64 reiser4_plugin_groups;
15137 +typedef struct item_plugin item_plugin;
15138 +typedef struct jnode_plugin jnode_plugin;
15139 +typedef struct reiser4_item_data reiser4_item_data;
15140 +typedef union reiser4_key reiser4_key;
15141 +typedef struct reiser4_tree reiser4_tree;
15142 +typedef struct carry_cut_data carry_cut_data;
15143 +typedef struct carry_kill_data carry_kill_data;
15144 +typedef struct carry_tree_op carry_tree_op;
15145 +typedef struct carry_tree_node carry_tree_node;
15146 +typedef struct carry_plugin_info carry_plugin_info;
15147 +typedef struct reiser4_journal reiser4_journal;
15148 +typedef struct txn_atom txn_atom;
15149 +typedef struct txn_handle txn_handle;
15150 +typedef struct txn_mgr txn_mgr;
15151 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15152 +typedef struct reiser4_context reiser4_context;
15153 +typedef struct carry_level carry_level;
15154 +typedef struct blocknr_set_entry blocknr_set_entry;
15155 +/* super_block->s_fs_info points to this */
15156 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15157 +/* next two objects are fields of reiser4_super_info_data */
15158 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15159 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15160 +
15161 +typedef struct flush_scan flush_scan;
15162 +typedef struct flush_position flush_pos_t;
15163 +
15164 +typedef unsigned short pos_in_node_t;
15165 +#define MAX_POS_IN_NODE 65535
15166 +
15167 +typedef struct jnode jnode;
15168 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15169 +
15170 +typedef struct uf_coord uf_coord_t;
15171 +typedef struct hint hint_t;
15172 +
15173 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15174 +
15175 +struct inode;
15176 +struct page;
15177 +struct file;
15178 +struct dentry;
15179 +struct super_block;
15180 +
15181 +/* return values of coord_by_key(). cbk == coord_by_key */
15182 +typedef enum {
15183 + CBK_COORD_FOUND = 0,
15184 + CBK_COORD_NOTFOUND = -ENOENT,
15185 +} lookup_result;
15186 +
15187 +/* results of lookup with directory file */
15188 +typedef enum {
15189 + FILE_NAME_FOUND = 0,
15190 + FILE_NAME_NOTFOUND = -ENOENT,
15191 + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15192 + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15193 +} file_lookup_result;
15194 +
15195 +/* behaviors of lookup. If coord we are looking for is actually in a tree,
15196 + both coincide. */
15197 +typedef enum {
15198 + /* search exactly for the coord with key given */
15199 + FIND_EXACT,
15200 + /* search for coord with the maximal key not greater than one
15201 + given */
15202 + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15203 +} lookup_bias;
15204 +
15205 +typedef enum {
15206 + /* number of leaf level of the tree
15207 + The fake root has (tree_level=0). */
15208 + LEAF_LEVEL = 1,
15209 +
15210 + /* number of level one above leaf level of the tree.
15211 +
15212 + It is supposed that internal tree used by reiser4 to store file
15213 + system data and meta data will have height 2 initially (when
15214 + created by mkfs).
15215 + */
15216 + TWIG_LEVEL = 2,
15217 +} tree_level;
15218 +
15219 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15220 + array, since the zero'th level is not used. */
15221 +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15222 +
15223 +/* enumeration of possible mutual position of item and coord. This enum is
15224 + return type of ->is_in_item() item plugin method which see. */
15225 +typedef enum {
15226 + /* coord is on the left of an item */
15227 + IP_ON_THE_LEFT,
15228 + /* coord is inside item */
15229 + IP_INSIDE,
15230 + /* coord is inside item, but to the right of the rightmost unit of
15231 + this item */
15232 + IP_RIGHT_EDGE,
15233 + /* coord is on the right of an item */
15234 + IP_ON_THE_RIGHT
15235 +} interposition;
15236 +
15237 +/* type of lock to acquire on znode before returning it to caller */
15238 +typedef enum {
15239 + ZNODE_NO_LOCK = 0,
15240 + ZNODE_READ_LOCK = 1,
15241 + ZNODE_WRITE_LOCK = 2,
15242 +} znode_lock_mode;
15243 +
15244 +/* type of lock request */
15245 +typedef enum {
15246 + ZNODE_LOCK_LOPRI = 0,
15247 + ZNODE_LOCK_HIPRI = (1 << 0),
15248 +
15249 + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15250 + waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15251 + return the value -E_REPEAT. */
15252 + ZNODE_LOCK_NONBLOCK = (1 << 1),
15253 + /* An option for longterm_lock_znode which prevents atom fusion */
15254 + ZNODE_LOCK_DONT_FUSE = (1 << 2)
15255 +} znode_lock_request;
15256 +
15257 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15258 +
15259 +/* used to specify direction of shift. These must be -1 and 1 */
15260 +typedef enum {
15261 + SHIFT_LEFT = 1,
15262 + SHIFT_RIGHT = -1
15263 +} shift_direction;
15264 +
15265 +typedef enum {
15266 + LEFT_SIDE,
15267 + RIGHT_SIDE
15268 +} sideof;
15269 +
15270 +#define round_up( value, order ) \
15271 + ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15272 + ~( ( order ) - 1 ) ) )
15273 +
15274 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15275 +typedef enum {
15276 + /* unit of internal item is moved */
15277 + SUBTREE_MOVED = 0,
15278 + /* nothing else can be squeezed into left neighbor */
15279 + SQUEEZE_TARGET_FULL = 1,
15280 + /* all content of node is squeezed into its left neighbor */
15281 + SQUEEZE_SOURCE_EMPTY = 2,
15282 + /* one more item is copied (this is only returned by
15283 + allocate_and_copy_extent to squalloc_twig)) */
15284 + SQUEEZE_CONTINUE = 3
15285 +} squeeze_result;
15286 +
15287 +/* Do not change items ids. If you do - there will be format change */
15288 +typedef enum {
15289 + STATIC_STAT_DATA_ID = 0x0,
15290 + SIMPLE_DIR_ENTRY_ID = 0x1,
15291 + COMPOUND_DIR_ID = 0x2,
15292 + NODE_POINTER_ID = 0x3,
15293 + EXTENT_POINTER_ID = 0x5,
15294 + FORMATTING_ID = 0x6,
15295 + CTAIL_ID = 0x7,
15296 + BLACK_BOX_ID = 0x8,
15297 + LAST_ITEM_ID = 0x9
15298 +} item_id;
15299 +
15300 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15301 + whether commit() was called or VM memory pressure was applied. */
15302 +typedef enum {
15303 + /* submit flush queue to disk at jnode_flush completion */
15304 + JNODE_FLUSH_WRITE_BLOCKS = 1,
15305 +
15306 + /* flush is called for commit */
15307 + JNODE_FLUSH_COMMIT = 2,
15308 + /* not implemented */
15309 + JNODE_FLUSH_MEMORY_FORMATTED = 4,
15310 +
15311 + /* not implemented */
15312 + JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15313 +} jnode_flush_flags;
15314 +
15315 +/* Flags to insert/paste carry operations. Currently they only used in
15316 + flushing code, but in future, they can be used to optimize for repetitive
15317 + accesses. */
15318 +typedef enum {
15319 + /* carry is not allowed to shift data to the left when trying to find
15320 + free space */
15321 + COPI_DONT_SHIFT_LEFT = (1 << 0),
15322 + /* carry is not allowed to shift data to the right when trying to find
15323 + free space */
15324 + COPI_DONT_SHIFT_RIGHT = (1 << 1),
15325 + /* carry is not allowed to allocate new node(s) when trying to find
15326 + free space */
15327 + COPI_DONT_ALLOCATE = (1 << 2),
15328 + /* try to load left neighbor if its not in a cache */
15329 + COPI_LOAD_LEFT = (1 << 3),
15330 + /* try to load right neighbor if its not in a cache */
15331 + COPI_LOAD_RIGHT = (1 << 4),
15332 + /* shift insertion point to the left neighbor */
15333 + COPI_GO_LEFT = (1 << 5),
15334 + /* shift insertion point to the right neighbor */
15335 + COPI_GO_RIGHT = (1 << 6),
15336 + /* try to step back into original node if insertion into new node
15337 + fails after shifting data there. */
15338 + COPI_STEP_BACK = (1 << 7)
15339 +} cop_insert_flag;
15340 +
15341 +typedef enum {
15342 + SAFE_UNLINK, /* safe-link for unlink */
15343 + SAFE_TRUNCATE /* safe-link for truncate */
15344 +} reiser4_safe_link_t;
15345 +
15346 +/* this is to show on which list of atom jnode is */
15347 +typedef enum {
15348 + NOT_CAPTURED,
15349 + DIRTY_LIST,
15350 + CLEAN_LIST,
15351 + FQ_LIST,
15352 + WB_LIST,
15353 + OVRWR_LIST
15354 +} atom_list;
15355 +
15356 +/* __REISER4_FORWARD_H__ */
15357 +#endif
15358 +
15359 +/* Make Linus happy.
15360 + Local variables:
15361 + c-indentation-style: "K&R"
15362 + mode-name: "LC"
15363 + c-basic-offset: 8
15364 + tab-width: 8
15365 + fill-column: 120
15366 + End:
15367 +*/
15368 diff -urN linux-2.6.23.orig/fs/reiser4/fsdata.c linux-2.6.23/fs/reiser4/fsdata.c
15369 --- linux-2.6.23.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15370 +++ linux-2.6.23/fs/reiser4/fsdata.c 2007-12-04 16:49:30.000000000 +0300
15371 @@ -0,0 +1,804 @@
15372 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15373 + * reiser4/README */
15374 +
15375 +#include "fsdata.h"
15376 +#include "inode.h"
15377 +
15378 +
15379 +/* cache or dir_cursors */
15380 +static struct kmem_cache *d_cursor_cache;
15381 +
15382 +/* list of unused cursors */
15383 +static LIST_HEAD(cursor_cache);
15384 +
15385 +/* number of cursors in list of ununsed cursors */
15386 +static unsigned long d_cursor_unused = 0;
15387 +
15388 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15389 +DEFINE_SPINLOCK(d_lock);
15390 +
15391 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15392 +static int file_is_stateless(struct file *file);
15393 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15394 +static void kill_cursor(dir_cursor *);
15395 +
15396 +/**
15397 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15398 + * @nr: number of objects to free
15399 + * @mask: GFP mask
15400 + *
15401 + * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested
15402 + * number. Return number of still freeable cursors.
15403 + */
15404 +static int d_cursor_shrink(int nr, gfp_t mask)
15405 +{
15406 + if (nr != 0) {
15407 + dir_cursor *scan;
15408 + int killed;
15409 +
15410 + killed = 0;
15411 + spin_lock(&d_lock);
15412 + while (!list_empty(&cursor_cache)) {
15413 + scan = list_entry(cursor_cache.next, dir_cursor, alist);
15414 + assert("nikita-3567", scan->ref == 0);
15415 + kill_cursor(scan);
15416 + ++killed;
15417 + --nr;
15418 + if (nr == 0)
15419 + break;
15420 + }
15421 + spin_unlock(&d_lock);
15422 + }
15423 + return d_cursor_unused;
15424 +}
15425 +
15426 +/*
15427 + * actually, d_cursors are "priceless", because there is no way to
15428 + * recover information stored in them. On the other hand, we don't
15429 + * want to consume all kernel memory by them. As a compromise, just
15430 + * assign higher "seeks" value to d_cursor cache, so that it will be
15431 + * shrunk only if system is really tight on memory.
15432 + */
15433 +static struct shrinker d_cursor_shrinker = {
15434 + .shrink = d_cursor_shrink,
15435 + .seeks = DEFAULT_SEEKS << 3,
15436 +};
15437 +
15438 +/**
15439 + * reiser4_init_d_cursor - create d_cursor cache
15440 + *
15441 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15442 + * initialization.
15443 + */
15444 +int reiser4_init_d_cursor(void)
15445 +{
15446 + d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15447 + SLAB_HWCACHE_ALIGN, NULL);
15448 + if (d_cursor_cache == NULL)
15449 + return RETERR(-ENOMEM);
15450 +
15451 + register_shrinker(&d_cursor_shrinker);
15452 + return 0;
15453 +}
15454 +
15455 +/**
15456 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15457 + *
15458 + * This is called on reiser4 module unloading or system shutdown.
15459 + */
15460 +void reiser4_done_d_cursor(void)
15461 +{
15462 + unregister_shrinker(&d_cursor_shrinker);
15463 +
15464 + destroy_reiser4_cache(&d_cursor_cache);
15465 +}
15466 +
15467 +#define D_CURSOR_TABLE_SIZE (256)
15468 +
15469 +static inline unsigned long
15470 +d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
15471 +{
15472 + assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15473 + return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15474 +}
15475 +
15476 +static inline int d_cursor_eq(const struct d_cursor_key *k1,
15477 + const struct d_cursor_key *k2)
15478 +{
15479 + return k1->cid == k2->cid && k1->oid == k2->oid;
15480 +}
15481 +
15482 +/*
15483 + * define functions to manipulate reiser4 super block's hash table of
15484 + * dir_cursors
15485 + */
15486 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15487 +#define KFREE(ptr, size) kfree(ptr)
15488 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15489 + dir_cursor,
15490 + struct d_cursor_key,
15491 + key, hash, d_cursor_hash, d_cursor_eq);
15492 +#undef KFREE
15493 +#undef KMALLOC
15494 +
15495 +/**
15496 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15497 + * @super: super block to initialize
15498 + *
15499 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15500 + * of mount.
15501 + */
15502 +int reiser4_init_super_d_info(struct super_block *super)
15503 +{
15504 + struct d_cursor_info *p;
15505 +
15506 + p = &get_super_private(super)->d_info;
15507 +
15508 + INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15509 + return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15510 +}
15511 +
15512 +/**
15513 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15514 + * @super: super block being umounted
15515 + *
15516 + * It is called on umount. Kills all directory cursors attached to suoer block.
15517 + */
15518 +void reiser4_done_super_d_info(struct super_block *super)
15519 +{
15520 + struct d_cursor_info *d_info;
15521 + dir_cursor *cursor, *next;
15522 +
15523 + d_info = &get_super_private(super)->d_info;
15524 + for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15525 + kill_cursor(cursor);
15526 +
15527 + BUG_ON(d_info->tree.rnode != NULL);
15528 + d_cursor_hash_done(&d_info->table);
15529 +}
15530 +
15531 +/**
15532 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15533 + * @cursor: cursor to free
15534 + *
15535 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15536 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from
15537 + * indices, hash table, list of unused cursors and frees it.
15538 + */
15539 +static void kill_cursor(dir_cursor *cursor)
15540 +{
15541 + unsigned long index;
15542 +
15543 + assert("nikita-3566", cursor->ref == 0);
15544 + assert("nikita-3572", cursor->fsdata != NULL);
15545 +
15546 + index = (unsigned long)cursor->key.oid;
15547 + list_del_init(&cursor->fsdata->dir.linkage);
15548 + free_fsdata(cursor->fsdata);
15549 + cursor->fsdata = NULL;
15550 +
15551 + if (list_empty_careful(&cursor->list))
15552 + /* this is last cursor for a file. Kill radix-tree entry */
15553 + radix_tree_delete(&cursor->info->tree, index);
15554 + else {
15555 + void **slot;
15556 +
15557 + /*
15558 + * there are other cursors for the same oid.
15559 + */
15560 +
15561 + /*
15562 + * if radix tree point to the cursor being removed, re-target
15563 + * radix tree slot to the next cursor in the (non-empty as was
15564 + * checked above) element of the circular list of all cursors
15565 + * for this oid.
15566 + */
15567 + slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15568 + assert("nikita-3571", *slot != NULL);
15569 + if (*slot == cursor)
15570 + *slot = list_entry(cursor->list.next, dir_cursor, list);
15571 + /* remove cursor from circular list */
15572 + list_del_init(&cursor->list);
15573 + }
15574 + /* remove cursor from the list of unused cursors */
15575 + list_del_init(&cursor->alist);
15576 + /* remove cursor from the hash table */
15577 + d_cursor_hash_remove(&cursor->info->table, cursor);
15578 + /* and free it */
15579 + kmem_cache_free(d_cursor_cache, cursor);
15580 + --d_cursor_unused;
15581 +}
15582 +
15583 +/* possible actions that can be performed on all cursors for the given file */
15584 +enum cursor_action {
15585 + /*
15586 + * load all detached state: this is called when stat-data is loaded
15587 + * from the disk to recover information about all pending readdirs
15588 + */
15589 + CURSOR_LOAD,
15590 + /*
15591 + * detach all state from inode, leaving it in the cache. This is called
15592 + * when inode is removed form the memory by memory pressure
15593 + */
15594 + CURSOR_DISPOSE,
15595 + /*
15596 + * detach cursors from the inode, and free them. This is called when
15597 + * inode is destroyed
15598 + */
15599 + CURSOR_KILL
15600 +};
15601 +
15602 +/*
15603 + * return d_cursor data for the file system @inode is in.
15604 + */
15605 +static inline struct d_cursor_info *d_info(struct inode *inode)
15606 +{
15607 + return &get_super_private(inode->i_sb)->d_info;
15608 +}
15609 +
15610 +/*
15611 + * lookup d_cursor in the per-super-block radix tree.
15612 + */
15613 +static inline dir_cursor *lookup(struct d_cursor_info * info,
15614 + unsigned long index)
15615 +{
15616 + return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15617 +}
15618 +
15619 +/*
15620 + * attach @cursor to the radix tree. There may be multiple cursors for the
15621 + * same oid, they are chained into circular list.
15622 + */
15623 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15624 +{
15625 + dir_cursor *head;
15626 +
15627 + head = lookup(cursor->info, index);
15628 + if (head == NULL) {
15629 + /* this is the first cursor for this index */
15630 + INIT_LIST_HEAD(&cursor->list);
15631 + radix_tree_insert(&cursor->info->tree, index, cursor);
15632 + } else {
15633 + /* some cursor already exists. Chain ours */
15634 + list_add(&cursor->list, &head->list);
15635 + }
15636 +}
15637 +
15638 +/*
15639 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
15640 + * "unused" list. Called when file descriptor is not longer in active use.
15641 + */
15642 +static void clean_fsdata(struct file *file)
15643 +{
15644 + dir_cursor *cursor;
15645 + reiser4_file_fsdata *fsdata;
15646 +
15647 + assert("nikita-3570", file_is_stateless(file));
15648 +
15649 + fsdata = (reiser4_file_fsdata *) file->private_data;
15650 + if (fsdata != NULL) {
15651 + cursor = fsdata->cursor;
15652 + if (cursor != NULL) {
15653 + spin_lock(&d_lock);
15654 + --cursor->ref;
15655 + if (cursor->ref == 0) {
15656 + list_add_tail(&cursor->alist, &cursor_cache);
15657 + ++d_cursor_unused;
15658 + }
15659 + spin_unlock(&d_lock);
15660 + file->private_data = NULL;
15661 + }
15662 + }
15663 +}
15664 +
15665 +/*
15666 + * global counter used to generate "client ids". These ids are encoded into
15667 + * high bits of fpos.
15668 + */
15669 +static __u32 cid_counter = 0;
15670 +#define CID_SHIFT (20)
15671 +#define CID_MASK (0xfffffull)
15672 +
15673 +static void free_file_fsdata_nolock(struct file *);
15674 +
15675 +/**
15676 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15677 + * @cursor:
15678 + * @file:
15679 + * @inode:
15680 + *
15681 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15682 + * reiser4 super block's hash table and radix tree.
15683 + add detachable readdir
15684 + * state to the @f
15685 + */
15686 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15687 + struct inode *inode)
15688 +{
15689 + int result;
15690 + reiser4_file_fsdata *fsdata;
15691 +
15692 + memset(cursor, 0, sizeof *cursor);
15693 +
15694 + /* this is either first call to readdir, or rewind. Anyway, create new
15695 + * cursor. */
15696 + fsdata = create_fsdata(NULL);
15697 + if (fsdata != NULL) {
15698 + result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15699 + if (result == 0) {
15700 + struct d_cursor_info *info;
15701 + oid_t oid;
15702 +
15703 + info = d_info(inode);
15704 + oid = get_inode_oid(inode);
15705 + /* cid occupies higher 12 bits of f->f_pos. Don't
15706 + * allow it to become negative: this confuses
15707 + * nfsd_readdir() */
15708 + cursor->key.cid = (++cid_counter) & 0x7ff;
15709 + cursor->key.oid = oid;
15710 + cursor->fsdata = fsdata;
15711 + cursor->info = info;
15712 + cursor->ref = 1;
15713 +
15714 + spin_lock_inode(inode);
15715 + /* install cursor as @f's private_data, discarding old
15716 + * one if necessary */
15717 +#if REISER4_DEBUG
15718 + if (file->private_data)
15719 + warning("", "file has fsdata already");
15720 +#endif
15721 + clean_fsdata(file);
15722 + free_file_fsdata_nolock(file);
15723 + file->private_data = fsdata;
15724 + fsdata->cursor = cursor;
15725 + spin_unlock_inode(inode);
15726 + spin_lock(&d_lock);
15727 + /* insert cursor into hash table */
15728 + d_cursor_hash_insert(&info->table, cursor);
15729 + /* and chain it into radix-tree */
15730 + bind_cursor(cursor, (unsigned long)oid);
15731 + spin_unlock(&d_lock);
15732 + radix_tree_preload_end();
15733 + file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15734 + }
15735 + } else
15736 + result = RETERR(-ENOMEM);
15737 + return result;
15738 +}
15739 +
15740 +/**
15741 + * process_cursors - do action on each cursor attached to inode
15742 + * @inode:
15743 + * @act: action to do
15744 + *
15745 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15746 + * and performs action specified by @act on each of cursors.
15747 + */
15748 +static void process_cursors(struct inode *inode, enum cursor_action act)
15749 +{
15750 + oid_t oid;
15751 + dir_cursor *start;
15752 + struct list_head *head;
15753 + reiser4_context *ctx;
15754 + struct d_cursor_info *info;
15755 +
15756 + /* this can be called by
15757 + *
15758 + * kswapd->...->prune_icache->..reiser4_destroy_inode
15759 + *
15760 + * without reiser4_context
15761 + */
15762 + ctx = reiser4_init_context(inode->i_sb);
15763 + if (IS_ERR(ctx)) {
15764 + warning("vs-23", "failed to init context");
15765 + return;
15766 + }
15767 +
15768 + assert("nikita-3558", inode != NULL);
15769 +
15770 + info = d_info(inode);
15771 + oid = get_inode_oid(inode);
15772 + spin_lock_inode(inode);
15773 + head = get_readdir_list(inode);
15774 + spin_lock(&d_lock);
15775 + /* find any cursor for this oid: reference to it is hanging of radix
15776 + * tree */
15777 + start = lookup(info, (unsigned long)oid);
15778 + if (start != NULL) {
15779 + dir_cursor *scan;
15780 + reiser4_file_fsdata *fsdata;
15781 +
15782 + /* process circular list of cursors for this oid */
15783 + scan = start;
15784 + do {
15785 + dir_cursor *next;
15786 +
15787 + next = list_entry(scan->list.next, dir_cursor, list);
15788 + fsdata = scan->fsdata;
15789 + assert("nikita-3557", fsdata != NULL);
15790 + if (scan->key.oid == oid) {
15791 + switch (act) {
15792 + case CURSOR_DISPOSE:
15793 + list_del_init(&fsdata->dir.linkage);
15794 + break;
15795 + case CURSOR_LOAD:
15796 + list_add(&fsdata->dir.linkage, head);
15797 + break;
15798 + case CURSOR_KILL:
15799 + kill_cursor(scan);
15800 + break;
15801 + }
15802 + }
15803 + if (scan == next)
15804 + /* last cursor was just killed */
15805 + break;
15806 + scan = next;
15807 + } while (scan != start);
15808 + }
15809 + spin_unlock(&d_lock);
15810 + /* check that we killed 'em all */
15811 + assert("nikita-3568",
15812 + ergo(act == CURSOR_KILL,
15813 + list_empty_careful(get_readdir_list(inode))));
15814 + assert("nikita-3569",
15815 + ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15816 + spin_unlock_inode(inode);
15817 + reiser4_exit_context(ctx);
15818 +}
15819 +
15820 +/**
15821 + * reiser4_dispose_cursors - removes cursors from inode's list
15822 + * @inode: inode to dispose cursors of
15823 + *
15824 + * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
15825 + * attached to cursor from inode's readdir list. This is called when inode is
15826 + * removed from the memory by memory pressure.
15827 + */
15828 +void reiser4_dispose_cursors(struct inode *inode)
15829 +{
15830 + process_cursors(inode, CURSOR_DISPOSE);
15831 +}
15832 +
15833 +/**
15834 + * reiser4_load_cursors - attach cursors to inode
15835 + * @inode: inode to load cursors to
15836 + *
15837 + * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
15838 + * attached to cursor to inode's readdir list. This is done when inode is
15839 + * loaded into memory.
15840 + */
15841 +void reiser4_load_cursors(struct inode *inode)
15842 +{
15843 + process_cursors(inode, CURSOR_LOAD);
15844 +}
15845 +
15846 +/**
15847 + * reiser4_kill_cursors - kill all inode cursors
15848 + * @inode: inode to kill cursors of
15849 + *
15850 + * Frees all cursors for this inode. This is called when inode is destroyed.
15851 + */
15852 +void reiser4_kill_cursors(struct inode *inode)
15853 +{
15854 + process_cursors(inode, CURSOR_KILL);
15855 +}
15856 +
15857 +/**
15858 + * file_is_stateless -
15859 + * @file:
15860 + *
15861 + * true, if file descriptor @f is created by NFS server by "demand" to serve
15862 + * one file system operation. This means that there may be "detached state"
15863 + * for underlying inode.
15864 + */
15865 +static int file_is_stateless(struct file *file)
15866 +{
15867 + return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15868 +}
15869 +
15870 +/**
15871 + * reiser4_get_dir_fpos -
15872 + * @dir:
15873 + *
15874 + * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
15875 + * in the case of stateless directory operation (readdir-over-nfs), client id
15876 + * was encoded in the high bits of cookie and should me masked off.
15877 + */
15878 +loff_t reiser4_get_dir_fpos(struct file *dir)
15879 +{
15880 + if (file_is_stateless(dir))
15881 + return dir->f_pos & CID_MASK;
15882 + else
15883 + return dir->f_pos;
15884 +}
15885 +
15886 +/**
15887 + * reiser4_attach_fsdata - try to attach fsdata
15888 + * @file:
15889 + * @inode:
15890 + *
15891 + * Finds or creates cursor for readdir-over-nfs.
15892 + */
15893 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15894 +{
15895 + loff_t pos;
15896 + int result;
15897 + dir_cursor *cursor;
15898 +
15899 + /*
15900 + * we are serialized by inode->i_mutex
15901 + */
15902 + if (!file_is_stateless(file))
15903 + return 0;
15904 +
15905 + pos = file->f_pos;
15906 + result = 0;
15907 + if (pos == 0) {
15908 + /*
15909 + * first call to readdir (or rewind to the beginning of
15910 + * directory)
15911 + */
15912 + cursor = kmem_cache_alloc(d_cursor_cache,
15913 + reiser4_ctx_gfp_mask_get());
15914 + if (cursor != NULL)
15915 + result = insert_cursor(cursor, file, inode);
15916 + else
15917 + result = RETERR(-ENOMEM);
15918 + } else {
15919 + /* try to find existing cursor */
15920 + struct d_cursor_key key;
15921 +
15922 + key.cid = pos >> CID_SHIFT;
15923 + key.oid = get_inode_oid(inode);
15924 + spin_lock(&d_lock);
15925 + cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15926 + if (cursor != NULL) {
15927 + /* cursor was found */
15928 + if (cursor->ref == 0) {
15929 + /* move it from unused list */
15930 + list_del_init(&cursor->alist);
15931 + --d_cursor_unused;
15932 + }
15933 + ++cursor->ref;
15934 + }
15935 + spin_unlock(&d_lock);
15936 + if (cursor != NULL) {
15937 + spin_lock_inode(inode);
15938 + assert("nikita-3556", cursor->fsdata->back == NULL);
15939 + clean_fsdata(file);
15940 + free_file_fsdata_nolock(file);
15941 + file->private_data = cursor->fsdata;
15942 + spin_unlock_inode(inode);
15943 + }
15944 + }
15945 + return result;
15946 +}
15947 +
15948 +/**
15949 + * reiser4_detach_fsdata - ???
15950 + * @file:
15951 + *
15952 + * detach fsdata, if necessary
15953 + */
15954 +void reiser4_detach_fsdata(struct file *file)
15955 +{
15956 + struct inode *inode;
15957 +
15958 + if (!file_is_stateless(file))
15959 + return;
15960 +
15961 + inode = file->f_dentry->d_inode;
15962 + spin_lock_inode(inode);
15963 + clean_fsdata(file);
15964 + spin_unlock_inode(inode);
15965 +}
15966 +
15967 +/* slab for reiser4_dentry_fsdata */
15968 +static struct kmem_cache *dentry_fsdata_cache;
15969 +
15970 +/**
15971 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15972 + *
15973 + * Initializes slab cache of structures attached to denty->d_fsdata. It is
15974 + * part of reiser4 module initialization.
15975 + */
15976 +int reiser4_init_dentry_fsdata(void)
15977 +{
15978 + dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15979 + sizeof(struct reiser4_dentry_fsdata),
15980 + 0,
15981 + SLAB_HWCACHE_ALIGN |
15982 + SLAB_RECLAIM_ACCOUNT,
15983 + NULL);
15984 + if (dentry_fsdata_cache == NULL)
15985 + return RETERR(-ENOMEM);
15986 + return 0;
15987 +}
15988 +
15989 +/**
15990 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15991 + *
15992 + * This is called on reiser4 module unloading or system shutdown.
15993 + */
15994 +void reiser4_done_dentry_fsdata(void)
15995 +{
15996 + destroy_reiser4_cache(&dentry_fsdata_cache);
15997 +}
15998 +
15999 +/**
16000 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
16001 + * @dentry: queried dentry
16002 + *
16003 + * Allocates if necessary and returns per-dentry data that we attach to each
16004 + * dentry.
16005 + */
16006 +struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16007 +{
16008 + assert("nikita-1365", dentry != NULL);
16009 +
16010 + if (dentry->d_fsdata == NULL) {
16011 + dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16012 + reiser4_ctx_gfp_mask_get());
16013 + if (dentry->d_fsdata == NULL)
16014 + return ERR_PTR(RETERR(-ENOMEM));
16015 + memset(dentry->d_fsdata, 0,
16016 + sizeof(struct reiser4_dentry_fsdata));
16017 + }
16018 + return dentry->d_fsdata;
16019 +}
16020 +
16021 +/**
16022 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16023 + * @dentry: dentry to free fsdata of
16024 + *
16025 + * Detaches and frees fs-specific dentry data
16026 + */
16027 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
16028 +{
16029 + if (dentry->d_fsdata != NULL) {
16030 + kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16031 + dentry->d_fsdata = NULL;
16032 + }
16033 +}
16034 +
16035 +/* slab for reiser4_file_fsdata */
16036 +static struct kmem_cache *file_fsdata_cache;
16037 +
16038 +/**
16039 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16040 + *
16041 + * Initializes slab cache of structures attached to file->private_data. It is
16042 + * part of reiser4 module initialization.
16043 + */
16044 +int reiser4_init_file_fsdata(void)
16045 +{
16046 + file_fsdata_cache = kmem_cache_create("file_fsdata",
16047 + sizeof(reiser4_file_fsdata),
16048 + 0,
16049 + SLAB_HWCACHE_ALIGN |
16050 + SLAB_RECLAIM_ACCOUNT, NULL);
16051 + if (file_fsdata_cache == NULL)
16052 + return RETERR(-ENOMEM);
16053 + return 0;
16054 +}
16055 +
16056 +/**
16057 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16058 + *
16059 + * This is called on reiser4 module unloading or system shutdown.
16060 + */
16061 +void reiser4_done_file_fsdata(void)
16062 +{
16063 + destroy_reiser4_cache(&file_fsdata_cache);
16064 +}
16065 +
16066 +/**
16067 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16068 + * @file: what to create file_fsdata for, may be NULL
16069 + *
16070 + * Allocates and initializes reiser4_file_fsdata structure.
16071 + */
16072 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16073 +{
16074 + reiser4_file_fsdata *fsdata;
16075 +
16076 + fsdata = kmem_cache_alloc(file_fsdata_cache,
16077 + reiser4_ctx_gfp_mask_get());
16078 + if (fsdata != NULL) {
16079 + memset(fsdata, 0, sizeof *fsdata);
16080 + fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16081 + fsdata->back = file;
16082 + INIT_LIST_HEAD(&fsdata->dir.linkage);
16083 + }
16084 + return fsdata;
16085 +}
16086 +
16087 +/**
16088 + * free_fsdata - free reiser4_file_fsdata
16089 + * @fsdata: object to free
16090 + *
16091 + * Dual to create_fsdata(). Free reiser4_file_fsdata.
16092 + */
16093 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16094 +{
16095 + BUG_ON(fsdata == NULL);
16096 + kmem_cache_free(file_fsdata_cache, fsdata);
16097 +}
16098 +
16099 +/**
16100 + * reiser4_get_file_fsdata - get fs-specific file data
16101 + * @file: queried file
16102 + *
16103 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16104 + * to @file.
16105 + */
16106 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16107 +{
16108 + assert("nikita-1603", file != NULL);
16109 +
16110 + if (file->private_data == NULL) {
16111 + reiser4_file_fsdata *fsdata;
16112 + struct inode *inode;
16113 +
16114 + fsdata = create_fsdata(file);
16115 + if (fsdata == NULL)
16116 + return ERR_PTR(RETERR(-ENOMEM));
16117 +
16118 + inode = file->f_dentry->d_inode;
16119 + spin_lock_inode(inode);
16120 + if (file->private_data == NULL) {
16121 + file->private_data = fsdata;
16122 + fsdata = NULL;
16123 + }
16124 + spin_unlock_inode(inode);
16125 + if (fsdata != NULL)
16126 + /* other thread initialized ->fsdata */
16127 + kmem_cache_free(file_fsdata_cache, fsdata);
16128 + }
16129 + assert("nikita-2665", file->private_data != NULL);
16130 + return file->private_data;
16131 +}
16132 +
16133 +/**
16134 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16135 + * @file:
16136 + *
16137 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16138 + * readdir list, frees if it is not linked to d_cursor object.
16139 + */
16140 +static void free_file_fsdata_nolock(struct file *file)
16141 +{
16142 + reiser4_file_fsdata *fsdata;
16143 +
16144 + assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16145 + fsdata = file->private_data;
16146 + if (fsdata != NULL) {
16147 + list_del_init(&fsdata->dir.linkage);
16148 + if (fsdata->cursor == NULL)
16149 + free_fsdata(fsdata);
16150 + }
16151 + file->private_data = NULL;
16152 +}
16153 +
16154 +/**
16155 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16156 + * @file:
16157 + *
16158 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16159 + */
16160 +void reiser4_free_file_fsdata(struct file *file)
16161 +{
16162 + spin_lock_inode(file->f_dentry->d_inode);
16163 + free_file_fsdata_nolock(file);
16164 + spin_unlock_inode(file->f_dentry->d_inode);
16165 +}
16166 +
16167 +/*
16168 + * Local variables:
16169 + * c-indentation-style: "K&R"
16170 + * mode-name: "LC"
16171 + * c-basic-offset: 8
16172 + * tab-width: 8
16173 + * fill-column: 79
16174 + * End:
16175 + */
16176 diff -urN linux-2.6.23.orig/fs/reiser4/fsdata.h linux-2.6.23/fs/reiser4/fsdata.h
16177 --- linux-2.6.23.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16178 +++ linux-2.6.23/fs/reiser4/fsdata.h 2007-12-04 16:49:30.000000000 +0300
16179 @@ -0,0 +1,205 @@
16180 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16181 + * reiser4/README */
16182 +
16183 +#if !defined( __REISER4_FSDATA_H__ )
16184 +#define __REISER4_FSDATA_H__
16185 +
16186 +#include "debug.h"
16187 +#include "kassign.h"
16188 +#include "seal.h"
16189 +#include "type_safe_hash.h"
16190 +#include "plugin/file/file.h"
16191 +#include "readahead.h"
16192 +
16193 +/*
16194 + * comment about reiser4_dentry_fsdata
16195 + *
16196 + *
16197 + */
16198 +
16199 +/*
16200 + * locking: fields of per file descriptor readdir_pos and ->f_pos are
16201 + * protected by ->i_mutex on inode. Under this lock following invariant
16202 + * holds:
16203 + *
16204 + * file descriptor is "looking" at the entry_no-th directory entry from
16205 + * the beginning of directory. This entry has key dir_entry_key and is
16206 + * pos-th entry with duplicate-key sequence.
16207 + *
16208 + */
16209 +
16210 +/* logical position within directory */
16211 +struct dir_pos {
16212 + /* key of directory entry (actually, part of a key sufficient to
16213 + identify directory entry) */
16214 + de_id dir_entry_key;
16215 + /* ordinal number of directory entry among all entries with the same
16216 + key. (Starting from 0.) */
16217 + unsigned pos;
16218 +};
16219 +
16220 +struct readdir_pos {
16221 + /* f_pos corresponding to this readdir position */
16222 + __u64 fpos;
16223 + /* logical position within directory */
16224 + struct dir_pos position;
16225 + /* logical number of directory entry within
16226 + directory */
16227 + __u64 entry_no;
16228 +};
16229 +
16230 +/*
16231 + * this is used to speed up lookups for directory entry: on initial call to
16232 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16233 + * in struct dentry and reused later to avoid tree traversals.
16234 + */
16235 +struct de_location {
16236 + /* seal covering directory entry */
16237 + seal_t entry_seal;
16238 + /* coord of directory entry */
16239 + coord_t entry_coord;
16240 + /* ordinal number of directory entry among all entries with the same
16241 + key. (Starting from 0.) */
16242 + int pos;
16243 +};
16244 +
16245 +/**
16246 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16247 + *
16248 + * This is allocated dynamically and released in d_op->d_release()
16249 + *
16250 + * Currently it only contains cached location (hint) of directory entry, but
16251 + * it is expected that other information will be accumulated here.
16252 + */
16253 +struct reiser4_dentry_fsdata {
16254 + /*
16255 + * here will go fields filled by ->lookup() to speedup next
16256 + * create/unlink, like blocknr of znode with stat-data, or key of
16257 + * stat-data.
16258 + */
16259 + struct de_location dec;
16260 + int stateless; /* created through reiser4_decode_fh, needs special
16261 + * treatment in readdir. */
16262 +};
16263 +
16264 +extern int reiser4_init_dentry_fsdata(void);
16265 +extern void reiser4_done_dentry_fsdata(void);
16266 +extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16267 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16268 +
16269 +/**
16270 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16271 + *
16272 + * This is allocated dynamically and released in inode->i_fop->release
16273 + */
16274 +typedef struct reiser4_file_fsdata {
16275 + /*
16276 + * pointer back to the struct file which this reiser4_file_fsdata is
16277 + * part of
16278 + */
16279 + struct file *back;
16280 + /* detached cursor for stateless readdir. */
16281 + struct dir_cursor *cursor;
16282 + /*
16283 + * We need both directory and regular file parts here, because there
16284 + * are file system objects that are files and directories.
16285 + */
16286 + struct {
16287 + /*
16288 + * position in directory. It is updated each time directory is
16289 + * modified
16290 + */
16291 + struct readdir_pos readdir;
16292 + /* head of this list is reiser4_inode->lists.readdir_list */
16293 + struct list_head linkage;
16294 + } dir;
16295 + /* hints to speed up operations with regular files: read and write. */
16296 + struct {
16297 + hint_t hint;
16298 + } reg;
16299 + struct reiser4_file_ra_state ra1;
16300 +
16301 +} reiser4_file_fsdata;
16302 +
16303 +extern int reiser4_init_file_fsdata(void);
16304 +extern void reiser4_done_file_fsdata(void);
16305 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16306 +extern void reiser4_free_file_fsdata(struct file *);
16307 +
16308 +/*
16309 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16310 + * used to address problem reiser4 has with readdir accesses via NFS. See
16311 + * plugin/file_ops_readdir.c for more details.
16312 + */
16313 +struct d_cursor_key{
16314 + __u16 cid;
16315 + __u64 oid;
16316 +};
16317 +
16318 +/*
16319 + * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16320 + * maintain hash table of dir_cursor-s in reiser4's super block
16321 + */
16322 +typedef struct dir_cursor dir_cursor;
16323 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16324 +
16325 +struct dir_cursor {
16326 + int ref;
16327 + reiser4_file_fsdata *fsdata;
16328 +
16329 + /* link to reiser4 super block hash table of cursors */
16330 + d_cursor_hash_link hash;
16331 +
16332 + /*
16333 + * this is to link cursors to reiser4 super block's radix tree of
16334 + * cursors if there are more than one cursor of the same objectid
16335 + */
16336 + struct list_head list;
16337 + struct d_cursor_key key;
16338 + struct d_cursor_info *info;
16339 + /* list of unused cursors */
16340 + struct list_head alist;
16341 +};
16342 +
16343 +extern int reiser4_init_d_cursor(void);
16344 +extern void reiser4_done_d_cursor(void);
16345 +
16346 +extern int reiser4_init_super_d_info(struct super_block *);
16347 +extern void reiser4_done_super_d_info(struct super_block *);
16348 +
16349 +extern loff_t reiser4_get_dir_fpos(struct file *);
16350 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16351 +extern void reiser4_detach_fsdata(struct file *);
16352 +
16353 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16354 + more details */
16355 +void reiser4_dispose_cursors(struct inode *inode);
16356 +void reiser4_load_cursors(struct inode *inode);
16357 +void reiser4_kill_cursors(struct inode *inode);
16358 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16359 + int offset, int adj);
16360 +
16361 +/*
16362 + * this structure is embedded to reise4_super_info_data. It maintains d_cursors
16363 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16364 + */
16365 +struct d_cursor_info {
16366 + d_cursor_hash_table table;
16367 + struct radix_tree_root tree;
16368 +};
16369 +
16370 +/* spinlock protecting readdir cursors */
16371 +extern spinlock_t d_lock;
16372 +
16373 +/* __REISER4_FSDATA_H__ */
16374 +#endif
16375 +
16376 +/*
16377 + * Local variables:
16378 + * c-indentation-style: "K&R"
16379 + * mode-name: "LC"
16380 + * c-basic-offset: 8
16381 + * tab-width: 8
16382 + * fill-column: 120
16383 + * End:
16384 + */
16385 diff -urN linux-2.6.23.orig/fs/reiser4/init_super.c linux-2.6.23/fs/reiser4/init_super.c
16386 --- linux-2.6.23.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16387 +++ linux-2.6.23/fs/reiser4/init_super.c 2007-12-04 16:49:30.000000000 +0300
16388 @@ -0,0 +1,751 @@
16389 +/* Copyright by Hans Reiser, 2003 */
16390 +
16391 +#include "super.h"
16392 +#include "inode.h"
16393 +#include "plugin/plugin_set.h"
16394 +
16395 +#include <linux/swap.h>
16396 +
16397 +/**
16398 + * init_fs_info - allocate reiser4 specific super block
16399 + * @super: super block of filesystem
16400 + *
16401 + * Allocates and initialize reiser4_super_info_data, attaches it to
16402 + * super->s_fs_info, initializes structures maintaining d_cursor-s.
16403 + */
16404 +int reiser4_init_fs_info(struct super_block *super)
16405 +{
16406 + reiser4_super_info_data *sbinfo;
16407 +
16408 + sbinfo = kzalloc(sizeof(reiser4_super_info_data),
16409 + reiser4_ctx_gfp_mask_get());
16410 + if (!sbinfo)
16411 + return RETERR(-ENOMEM);
16412 +
16413 + super->s_fs_info = sbinfo;
16414 + super->s_op = NULL;
16415 +
16416 + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16417 + ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16418 +
16419 + mutex_init(&sbinfo->delete_mutex);
16420 + spin_lock_init(&(sbinfo->guard));
16421 +
16422 + /* initialize per-super-block d_cursor resources */
16423 + reiser4_init_super_d_info(super);
16424 +
16425 + return 0;
16426 +}
16427 +
16428 +/**
16429 + * reiser4_done_fs_info - free reiser4 specific super block
16430 + * @super: super block of filesystem
16431 + *
16432 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
16433 + * frees reiser4_super_info_data.
16434 + */
16435 +void reiser4_done_fs_info(struct super_block *super)
16436 +{
16437 + assert("zam-990", super->s_fs_info != NULL);
16438 +
16439 + /* release per-super-block d_cursor resources */
16440 + reiser4_done_super_d_info(super);
16441 +
16442 + /* make sure that there are not jnodes already */
16443 + assert("", list_empty(&get_super_private(super)->all_jnodes));
16444 + assert("", get_current_context()->trans->atom == NULL);
16445 + reiser4_check_block_counters(super);
16446 + kfree(super->s_fs_info);
16447 + super->s_fs_info = NULL;
16448 +}
16449 +
16450 +/* type of option parseable by parse_option() */
16451 +typedef enum {
16452 + /* value of option is arbitrary string */
16453 + OPT_STRING,
16454 +
16455 + /*
16456 + * option specifies bit in a bitmask. When option is set - bit in
16457 + * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16458 + * dont_load_bitmap, atomic_write.
16459 + */
16460 + OPT_BIT,
16461 +
16462 + /*
16463 + * value of option should conform to sprintf() format. Examples are
16464 + * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16465 + */
16466 + OPT_FORMAT,
16467 +
16468 + /*
16469 + * option can take one of predefined values. Example is onerror=panic or
16470 + * onerror=remount-ro
16471 + */
16472 + OPT_ONEOF,
16473 +} opt_type_t;
16474 +
16475 +#if 0
16476 +struct opt_bitmask_bit {
16477 + const char *bit_name;
16478 + int bit_nr;
16479 +};
16480 +#endif
16481 +
16482 +/* description of option parseable by parse_option() */
16483 +struct opt_desc {
16484 + /* option name.
16485 +
16486 + parsed portion of string has a form "name=value".
16487 + */
16488 + const char *name;
16489 + /* type of option */
16490 + opt_type_t type;
16491 + union {
16492 + /* where to store value of string option (type == OPT_STRING) */
16493 + char **string;
16494 + /* description of bits for bit option (type == OPT_BIT) */
16495 + struct {
16496 + int nr;
16497 + void *addr;
16498 + } bit;
16499 + /* description of format and targets for format option (type
16500 + == OPT_FORMAT) */
16501 + struct {
16502 + const char *format;
16503 + int nr_args;
16504 + void *arg1;
16505 + void *arg2;
16506 + void *arg3;
16507 + void *arg4;
16508 + } f;
16509 + struct {
16510 + int *result;
16511 + const char *list[10];
16512 + } oneof;
16513 + struct {
16514 + void *addr;
16515 + int nr_bits;
16516 + //struct opt_bitmask_bit *bits;
16517 + } bitmask;
16518 + } u;
16519 +};
16520 +
16521 +/**
16522 + * parse_option - parse one option
16523 + * @opt_strin: starting point of parsing
16524 + * @opt: option description
16525 + *
16526 + * foo=bar,
16527 + * ^ ^ ^
16528 + * | | +-- replaced to '\0'
16529 + * | +-- val_start
16530 + * +-- opt_string
16531 + * Figures out option type and handles option correspondingly.
16532 + */
16533 +static int parse_option(char *opt_string, struct opt_desc *opt)
16534 +{
16535 + char *val_start;
16536 + int result;
16537 + const char *err_msg;
16538 +
16539 + /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16540 +
16541 + val_start = strchr(opt_string, '=');
16542 + if (val_start != NULL) {
16543 + *val_start = '\0';
16544 + ++val_start;
16545 + }
16546 +
16547 + err_msg = NULL;
16548 + result = 0;
16549 + switch (opt->type) {
16550 + case OPT_STRING:
16551 + if (val_start == NULL) {
16552 + err_msg = "String arg missing";
16553 + result = RETERR(-EINVAL);
16554 + } else
16555 + *opt->u.string = val_start;
16556 + break;
16557 + case OPT_BIT:
16558 + if (val_start != NULL)
16559 + err_msg = "Value ignored";
16560 + else
16561 + set_bit(opt->u.bit.nr, opt->u.bit.addr);
16562 + break;
16563 + case OPT_FORMAT:
16564 + if (val_start == NULL) {
16565 + err_msg = "Formatted arg missing";
16566 + result = RETERR(-EINVAL);
16567 + break;
16568 + }
16569 + if (sscanf(val_start, opt->u.f.format,
16570 + opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16571 + opt->u.f.arg4) != opt->u.f.nr_args) {
16572 + err_msg = "Wrong conversion";
16573 + result = RETERR(-EINVAL);
16574 + }
16575 + break;
16576 + case OPT_ONEOF:
16577 + {
16578 + int i = 0;
16579 +
16580 + if (val_start == NULL) {
16581 + err_msg = "Value is missing";
16582 + result = RETERR(-EINVAL);
16583 + break;
16584 + }
16585 + err_msg = "Wrong option value";
16586 + result = RETERR(-EINVAL);
16587 + while (opt->u.oneof.list[i]) {
16588 + if (!strcmp(opt->u.oneof.list[i], val_start)) {
16589 + result = 0;
16590 + err_msg = NULL;
16591 + *opt->u.oneof.result = i;
16592 + break;
16593 + }
16594 + i++;
16595 + }
16596 + break;
16597 + }
16598 + default:
16599 + wrong_return_value("nikita-2100", "opt -> type");
16600 + break;
16601 + }
16602 + if (err_msg != NULL) {
16603 + warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16604 + err_msg, opt->name, val_start ? "=" : "",
16605 + val_start ? : "");
16606 + }
16607 + return result;
16608 +}
16609 +
16610 +/**
16611 + * parse_options - parse reiser4 mount options
16612 + * @opt_string: starting point
16613 + * @opts: array of option description
16614 + * @nr_opts: number of elements in @opts
16615 + *
16616 + * Parses comma separated list of reiser4 mount options.
16617 + */
16618 +static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16619 +{
16620 + int result;
16621 +
16622 + result = 0;
16623 + while ((result == 0) && opt_string && *opt_string) {
16624 + int j;
16625 + char *next;
16626 +
16627 + next = strchr(opt_string, ',');
16628 + if (next != NULL) {
16629 + *next = '\0';
16630 + ++next;
16631 + }
16632 + for (j = 0; j < nr_opts; ++j) {
16633 + if (!strncmp(opt_string, opts[j].name,
16634 + strlen(opts[j].name))) {
16635 + result = parse_option(opt_string, &opts[j]);
16636 + break;
16637 + }
16638 + }
16639 + if (j == nr_opts) {
16640 + warning("nikita-2307", "Unrecognized option: \"%s\"",
16641 + opt_string);
16642 + /* traditionally, -EINVAL is returned on wrong mount
16643 + option */
16644 + result = RETERR(-EINVAL);
16645 + }
16646 + opt_string = next;
16647 + }
16648 + return result;
16649 +}
16650 +
16651 +#define NUM_OPT( label, fmt, addr ) \
16652 + { \
16653 + .name = ( label ), \
16654 + .type = OPT_FORMAT, \
16655 + .u = { \
16656 + .f = { \
16657 + .format = ( fmt ), \
16658 + .nr_args = 1, \
16659 + .arg1 = ( addr ), \
16660 + .arg2 = NULL, \
16661 + .arg3 = NULL, \
16662 + .arg4 = NULL \
16663 + } \
16664 + } \
16665 + }
16666 +
16667 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16668 +
16669 +#define BIT_OPT(label, bitnr) \
16670 + { \
16671 + .name = label, \
16672 + .type = OPT_BIT, \
16673 + .u = { \
16674 + .bit = { \
16675 + .nr = bitnr, \
16676 + .addr = &sbinfo->fs_flags \
16677 + } \
16678 + } \
16679 + }
16680 +
16681 +#define MAX_NR_OPTIONS (30)
16682 +
16683 +/**
16684 + * reiser4_init_super_data - initialize reiser4 private super block
16685 + * @super: super block to initialize
16686 + * @opt_string: list of reiser4 mount options
16687 + *
16688 + * Sets various reiser4 parameters to default values. Parses mount options and
16689 + * overwrites default settings.
16690 + */
16691 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
16692 +{
16693 + int result;
16694 + struct opt_desc *opts, *p;
16695 + reiser4_super_info_data *sbinfo = get_super_private(super);
16696 +
16697 + /* initialize super, export, dentry operations */
16698 + sbinfo->ops.super = reiser4_super_operations;
16699 + sbinfo->ops.export = reiser4_export_operations;
16700 + sbinfo->ops.dentry = reiser4_dentry_operations;
16701 + super->s_op = &sbinfo->ops.super;
16702 + super->s_export_op = &sbinfo->ops.export;
16703 +
16704 + /* initialize transaction manager parameters to default values */
16705 + sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16706 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16707 + sbinfo->tmgr.atom_min_size = 256;
16708 + sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16709 +
16710 + /* initialize cbk cache parameter */
16711 + sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16712 +
16713 + /* initialize flush parameters */
16714 + sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16715 + sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16716 + sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16717 + sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16718 +
16719 + sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16720 +
16721 + /* preliminary tree initializations */
16722 + sbinfo->tree.super = super;
16723 + sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16724 + sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16725 + sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16726 + sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16727 + rwlock_init(&(sbinfo->tree.tree_lock));
16728 + spin_lock_init(&(sbinfo->tree.epoch_lock));
16729 +
16730 + /* initialize default readahead params */
16731 + sbinfo->ra_params.max = num_physpages / 4;
16732 + sbinfo->ra_params.flags = 0;
16733 +
16734 + /* allocate memory for structure describing reiser4 mount options */
16735 + opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16736 + reiser4_ctx_gfp_mask_get());
16737 + if (opts == NULL)
16738 + return RETERR(-ENOMEM);
16739 +
16740 + /* initialize structure describing reiser4 mount options */
16741 + p = opts;
16742 +
16743 +#if REISER4_DEBUG
16744 +# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
16745 + warning ("zam-1046", "opt array is overloaded"); break; \
16746 + }
16747 +#else
16748 +# define OPT_ARRAY_CHECK noop
16749 +#endif
16750 +
16751 +#define PUSH_OPT(...) \
16752 +do { \
16753 + struct opt_desc o = __VA_ARGS__; \
16754 + OPT_ARRAY_CHECK; \
16755 + *p ++ = o; \
16756 +} while (0)
16757 +
16758 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16759 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16760 +
16761 + /*
16762 + * tmgr.atom_max_size=N
16763 + * Atoms containing more than N blocks will be forced to commit. N is
16764 + * decimal.
16765 + */
16766 + PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16767 + /*
16768 + * tmgr.atom_max_age=N
16769 + * Atoms older than N seconds will be forced to commit. N is decimal.
16770 + */
16771 + PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16772 + /*
16773 + * tmgr.atom_min_size=N
16774 + * In committing an atom to free dirty pages, force the atom less than
16775 + * N in size to fuse with another one.
16776 + */
16777 + PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16778 + /*
16779 + * tmgr.atom_max_flushers=N
16780 + * limit of concurrent flushers for one atom. 0 means no limit.
16781 + */
16782 + PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16783 + /*
16784 + * tree.cbk_cache_slots=N
16785 + * Number of slots in the cbk cache.
16786 + */
16787 + PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16788 + /*
16789 + * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16790 + * leaf-level blocks it will force them to be relocated.
16791 + */
16792 + PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16793 + /*
16794 + * If flush finds can find a block allocation closer than at most
16795 + * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
16796 + * position.
16797 + */
16798 + PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16799 + /*
16800 + * If we have written this much or more blocks before encountering busy
16801 + * jnode in flush list - abort flushing hoping that next time we get
16802 + * called this jnode will be clean already, and we will save some
16803 + * seeks.
16804 + */
16805 + PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16806 + /* The maximum number of nodes to scan left on a level during flush. */
16807 + PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16808 + /* preferred IO size */
16809 + PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16810 + /* carry flags used for insertion of new nodes */
16811 + PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16812 + /* carry flags used for insertion of new extents */
16813 + PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16814 + /* carry flags used for paste operations */
16815 + PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16816 + /* carry flags used for insert operations */
16817 + PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16818 +
16819 +#ifdef CONFIG_REISER4_BADBLOCKS
16820 + /*
16821 + * Alternative master superblock location in case if it's original
16822 + * location is not writeable/accessable. This is offset in BYTES.
16823 + */
16824 + PUSH_SB_FIELD_OPT(altsuper, "%lu");
16825 +#endif
16826 +
16827 + /* turn on BSD-style gid assignment */
16828 + PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16829 + /* turn on 32 bit times */
16830 + PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16831 + /*
16832 + * Don't load all bitmap blocks at mount time, it is useful for
16833 + * machines with tiny RAM and large disks.
16834 + */
16835 + PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16836 + /* disable transaction commits during write() */
16837 + PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16838 + /* disable use of write barriers in the reiser4 log writer. */
16839 + PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16840 +
16841 + PUSH_OPT(
16842 + {
16843 + /*
16844 + * tree traversal readahead parameters:
16845 + * -o readahead:MAXNUM:FLAGS
16846 + * MAXNUM - max number fo nodes to request readahead for: -1UL
16847 + * will set it to max_sane_readahead()
16848 + * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16849 + * CONTINUE_ON_PRESENT
16850 + */
16851 + .name = "readahead",
16852 + .type = OPT_FORMAT,
16853 + .u = {
16854 + .f = {
16855 + .format = "%u:%u",
16856 + .nr_args = 2,
16857 + .arg1 = &sbinfo->ra_params.max,
16858 + .arg2 = &sbinfo->ra_params.flags,
16859 + .arg3 = NULL,
16860 + .arg4 = NULL
16861 + }
16862 + }
16863 + }
16864 + );
16865 +
16866 + /* What to do in case of fs error */
16867 + PUSH_OPT(
16868 + {
16869 + .name = "onerror",
16870 + .type = OPT_ONEOF,
16871 + .u = {
16872 + .oneof = {
16873 + .result = &sbinfo->onerror,
16874 + .list = {
16875 + "panic", "remount-ro", NULL
16876 + },
16877 + }
16878 + }
16879 + }
16880 + );
16881 +
16882 + /* modify default settings to values set by mount options */
16883 + result = parse_options(opt_string, opts, p - opts);
16884 + kfree(opts);
16885 + if (result != 0)
16886 + return result;
16887 +
16888 + /* correct settings to sanity values */
16889 + sbinfo->tmgr.atom_max_age *= HZ;
16890 + if (sbinfo->tmgr.atom_max_age <= 0)
16891 + /* overflow */
16892 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16893 +
16894 + /* round optimal io size up to 512 bytes */
16895 + sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16896 + sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16897 + if (sbinfo->optimal_io_size == 0) {
16898 + warning("nikita-2497", "optimal_io_size is too small");
16899 + return RETERR(-EINVAL);
16900 + }
16901 + return result;
16902 +}
16903 +
16904 +/**
16905 + * reiser4_init_read_super - read reiser4 master super block
16906 + * @super: super block to fill
16907 + * @silent: if 0 - print warnings
16908 + *
16909 + * Reads reiser4 master super block either from predefined location or from
16910 + * location specified by altsuper mount option, initializes disk format plugin.
16911 + */
16912 +int reiser4_init_read_super(struct super_block *super, int silent)
16913 +{
16914 + struct buffer_head *super_bh;
16915 + struct reiser4_master_sb *master_sb;
16916 + reiser4_super_info_data *sbinfo = get_super_private(super);
16917 + unsigned long blocksize;
16918 +
16919 + read_super_block:
16920 +#ifdef CONFIG_REISER4_BADBLOCKS
16921 + if (sbinfo->altsuper)
16922 + /*
16923 + * read reiser4 master super block at position specified by
16924 + * mount option
16925 + */
16926 + super_bh = sb_bread(super,
16927 + (sector_t)(sbinfo->altsuper / super->s_blocksize));
16928 + else
16929 +#endif
16930 + /* read reiser4 master super block at 16-th 4096 block */
16931 + super_bh = sb_bread(super,
16932 + (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16933 + if (!super_bh)
16934 + return RETERR(-EIO);
16935 +
16936 + master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16937 + /* check reiser4 magic string */
16938 + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16939 + sizeof(REISER4_SUPER_MAGIC_STRING))) {
16940 + /* reiser4 master super block contains filesystem blocksize */
16941 + blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16942 +
16943 + if (blocksize != PAGE_CACHE_SIZE) {
16944 + /*
16945 + * currenly reiser4's blocksize must be equal to
16946 + * pagesize
16947 + */
16948 + if (!silent)
16949 + warning("nikita-2609",
16950 + "%s: wrong block size %ld\n", super->s_id,
16951 + blocksize);
16952 + brelse(super_bh);
16953 + return RETERR(-EINVAL);
16954 + }
16955 + if (blocksize != super->s_blocksize) {
16956 + /*
16957 + * filesystem uses different blocksize. Reread master
16958 + * super block with correct blocksize
16959 + */
16960 + brelse(super_bh);
16961 + if (!sb_set_blocksize(super, (int)blocksize))
16962 + return RETERR(-EINVAL);
16963 + goto read_super_block;
16964 + }
16965 +
16966 + sbinfo->df_plug =
16967 + disk_format_plugin_by_id(
16968 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16969 + if (sbinfo->df_plug == NULL) {
16970 + if (!silent)
16971 + warning("nikita-26091",
16972 + "%s: unknown disk format plugin %d\n",
16973 + super->s_id,
16974 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16975 + brelse(super_bh);
16976 + return RETERR(-EINVAL);
16977 + }
16978 + sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16979 + brelse(super_bh);
16980 + return 0;
16981 + }
16982 +
16983 + /* there is no reiser4 on the device */
16984 + if (!silent)
16985 + warning("nikita-2608",
16986 + "%s: wrong master super block magic", super->s_id);
16987 + brelse(super_bh);
16988 + return RETERR(-EINVAL);
16989 +}
16990 +
16991 +static struct {
16992 + reiser4_plugin_type type;
16993 + reiser4_plugin_id id;
16994 +} default_plugins[PSET_LAST] = {
16995 + [PSET_FILE] = {
16996 + .type = REISER4_FILE_PLUGIN_TYPE,
16997 + .id = UNIX_FILE_PLUGIN_ID
16998 + },
16999 + [PSET_DIR] = {
17000 + .type = REISER4_DIR_PLUGIN_TYPE,
17001 + .id = HASHED_DIR_PLUGIN_ID
17002 + },
17003 + [PSET_HASH] = {
17004 + .type = REISER4_HASH_PLUGIN_TYPE,
17005 + .id = R5_HASH_ID
17006 + },
17007 + [PSET_FIBRATION] = {
17008 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
17009 + .id = FIBRATION_DOT_O
17010 + },
17011 + [PSET_PERM] = {
17012 + .type = REISER4_PERM_PLUGIN_TYPE,
17013 + .id = NULL_PERM_ID
17014 + },
17015 + [PSET_FORMATTING] = {
17016 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
17017 + .id = SMALL_FILE_FORMATTING_ID
17018 + },
17019 + [PSET_SD] = {
17020 + .type = REISER4_ITEM_PLUGIN_TYPE,
17021 + .id = STATIC_STAT_DATA_ID
17022 + },
17023 + [PSET_DIR_ITEM] = {
17024 + .type = REISER4_ITEM_PLUGIN_TYPE,
17025 + .id = COMPOUND_DIR_ID
17026 + },
17027 + [PSET_CIPHER] = {
17028 + .type = REISER4_CIPHER_PLUGIN_TYPE,
17029 + .id = NONE_CIPHER_ID
17030 + },
17031 + [PSET_DIGEST] = {
17032 + .type = REISER4_DIGEST_PLUGIN_TYPE,
17033 + .id = SHA256_32_DIGEST_ID
17034 + },
17035 + [PSET_COMPRESSION] = {
17036 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17037 + .id = LZO1_COMPRESSION_ID
17038 + },
17039 + [PSET_COMPRESSION_MODE] = {
17040 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17041 + .id = CONVX_COMPRESSION_MODE_ID
17042 + },
17043 + [PSET_CLUSTER] = {
17044 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
17045 + .id = CLUSTER_64K_ID
17046 + },
17047 + [PSET_CREATE] = {
17048 + .type = REISER4_FILE_PLUGIN_TYPE,
17049 + .id = UNIX_FILE_PLUGIN_ID
17050 + }
17051 +};
17052 +
17053 +/* access to default plugin table */
17054 +reiser4_plugin *get_default_plugin(pset_member memb)
17055 +{
17056 + return plugin_by_id(default_plugins[memb].type,
17057 + default_plugins[memb].id);
17058 +}
17059 +
17060 +/**
17061 + * reiser4_init_root_inode - obtain inode of root directory
17062 + * @super: super block of filesystem
17063 + *
17064 + * Obtains inode of root directory (reading it from disk), initializes plugin
17065 + * set it was not initialized.
17066 + */
17067 +int reiser4_init_root_inode(struct super_block *super)
17068 +{
17069 + reiser4_super_info_data *sbinfo = get_super_private(super);
17070 + struct inode *inode;
17071 + int result = 0;
17072 +
17073 + inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17074 + if (IS_ERR(inode))
17075 + return RETERR(PTR_ERR(inode));
17076 +
17077 + super->s_root = d_alloc_root(inode);
17078 + if (!super->s_root) {
17079 + iput(inode);
17080 + return RETERR(-ENOMEM);
17081 + }
17082 +
17083 + super->s_root->d_op = &sbinfo->ops.dentry;
17084 +
17085 + if (!is_inode_loaded(inode)) {
17086 + pset_member memb;
17087 + plugin_set *pset;
17088 +
17089 + pset = reiser4_inode_data(inode)->pset;
17090 + for (memb = 0; memb < PSET_LAST; ++memb) {
17091 +
17092 + if (aset_get(pset, memb) != NULL)
17093 + continue;
17094 +
17095 + result = grab_plugin_pset(inode, NULL, memb);
17096 + if (result != 0)
17097 + break;
17098 +
17099 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17100 + }
17101 +
17102 + if (result == 0) {
17103 + if (REISER4_DEBUG) {
17104 + for (memb = 0; memb < PSET_LAST; ++memb)
17105 + assert("nikita-3500",
17106 + aset_get(pset, memb) != NULL);
17107 + }
17108 + } else
17109 + warning("nikita-3448", "Cannot set plugins of root: %i",
17110 + result);
17111 + reiser4_iget_complete(inode);
17112 +
17113 + /* As the default pset kept in the root dir may has been changed
17114 + (length is unknown), call update_sd. */
17115 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17116 + result = reiser4_grab_space(
17117 + inode_file_plugin(inode)->estimate.update(inode),
17118 + BA_CAN_COMMIT);
17119 +
17120 + if (result == 0)
17121 + result = reiser4_update_sd(inode);
17122 +
17123 + all_grabbed2free();
17124 + }
17125 + }
17126 +
17127 + super->s_maxbytes = MAX_LFS_FILESIZE;
17128 + return result;
17129 +}
17130 +
17131 +/*
17132 + * Local variables:
17133 + * c-indentation-style: "K&R"
17134 + * mode-name: "LC"
17135 + * c-basic-offset: 8
17136 + * tab-width: 8
17137 + * fill-column: 79
17138 + * End:
17139 + */
17140 diff -urN linux-2.6.23.orig/fs/reiser4/inode.c linux-2.6.23/fs/reiser4/inode.c
17141 --- linux-2.6.23.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17142 +++ linux-2.6.23/fs/reiser4/inode.c 2007-12-04 16:49:30.000000000 +0300
17143 @@ -0,0 +1,709 @@
17144 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17145 +
17146 +/* Inode specific operations. */
17147 +
17148 +#include "forward.h"
17149 +#include "debug.h"
17150 +#include "key.h"
17151 +#include "kassign.h"
17152 +#include "coord.h"
17153 +#include "seal.h"
17154 +#include "dscale.h"
17155 +#include "plugin/item/item.h"
17156 +#include "plugin/security/perm.h"
17157 +#include "plugin/plugin.h"
17158 +#include "plugin/object.h"
17159 +#include "znode.h"
17160 +#include "vfs_ops.h"
17161 +#include "inode.h"
17162 +#include "super.h"
17163 +#include "reiser4.h"
17164 +
17165 +#include <linux/fs.h> /* for struct super_block, address_space */
17166 +
17167 +/* return reiser4 internal tree which inode belongs to */
17168 +/* Audited by: green(2002.06.17) */
17169 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17170 +{
17171 + assert("nikita-256", inode != NULL);
17172 + assert("nikita-257", inode->i_sb != NULL);
17173 + return reiser4_get_tree(inode->i_sb);
17174 +}
17175 +
17176 +/* return reiser4-specific inode flags */
17177 +static inline unsigned long *inode_flags(const struct inode *const inode)
17178 +{
17179 + assert("nikita-2842", inode != NULL);
17180 + return &reiser4_inode_data(inode)->flags;
17181 +}
17182 +
17183 +/* set reiser4-specific flag @f in @inode */
17184 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17185 +{
17186 + assert("nikita-2248", inode != NULL);
17187 + set_bit((int)f, inode_flags(inode));
17188 +}
17189 +
17190 +/* clear reiser4-specific flag @f in @inode */
17191 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17192 +{
17193 + assert("nikita-2250", inode != NULL);
17194 + clear_bit((int)f, inode_flags(inode));
17195 +}
17196 +
17197 +/* true if reiser4-specific flag @f is set in @inode */
17198 +int reiser4_inode_get_flag(const struct inode *inode,
17199 + reiser4_file_plugin_flags f)
17200 +{
17201 + assert("nikita-2251", inode != NULL);
17202 + return test_bit((int)f, inode_flags(inode));
17203 +}
17204 +
17205 +/* convert oid to inode number */
17206 +ino_t oid_to_ino(oid_t oid)
17207 +{
17208 + return (ino_t) oid;
17209 +}
17210 +
17211 +/* convert oid to user visible inode number */
17212 +ino_t oid_to_uino(oid_t oid)
17213 +{
17214 + /* reiser4 object is uniquely identified by oid which is 64 bit
17215 + quantity. Kernel in-memory inode is indexed (in the hash table) by
17216 + 32 bit i_ino field, but this is not a problem, because there is a
17217 + way to further distinguish inodes with identical inode numbers
17218 + (find_actor supplied to iget()).
17219 +
17220 + But user space expects unique 32 bit inode number. Obviously this
17221 + is impossible. Work-around is to somehow hash oid into user visible
17222 + inode number.
17223 + */
17224 + oid_t max_ino = (ino_t) ~ 0;
17225 +
17226 + if (REISER4_INO_IS_OID || (oid <= max_ino))
17227 + return oid;
17228 + else
17229 + /* this is remotely similar to algorithm used to find next pid
17230 + to use for process: after wrap-around start from some
17231 + offset rather than from 0. Idea is that there are some long
17232 + living objects with which we don't want to collide.
17233 + */
17234 + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17235 +}
17236 +
17237 +/* check that "inode" is on reiser4 file-system */
17238 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17239 +{
17240 + return inode != NULL && is_reiser4_super(inode->i_sb);
17241 +}
17242 +
17243 +/* Maximal length of a name that can be stored in directory @inode.
17244 +
17245 + This is used in check during file creation and lookup. */
17246 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17247 +{
17248 + assert("nikita-287", is_reiser4_inode(inode));
17249 + assert("nikita-1710", inode_dir_item_plugin(inode));
17250 + if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17251 + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17252 + else
17253 + return 255;
17254 +}
17255 +
17256 +#if REISER4_USE_COLLISION_LIMIT
17257 +/* Maximal number of hash collisions for this directory. */
17258 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17259 +{
17260 + assert("nikita-1711", dir != NULL);
17261 + return reiser4_inode_data(dir)->plugin.max_collisions;
17262 +}
17263 +#endif /* REISER4_USE_COLLISION_LIMIT */
17264 +
17265 +/* Install file, inode, and address_space operation on @inode, depending on
17266 + its mode. */
17267 +int setup_inode_ops(struct inode *inode /* inode to intialize */ ,
17268 + reiser4_object_create_data * data /* parameters to create
17269 + * object */ )
17270 +{
17271 + reiser4_super_info_data *sinfo;
17272 + file_plugin *fplug;
17273 + dir_plugin *dplug;
17274 +
17275 + fplug = inode_file_plugin(inode);
17276 + dplug = inode_dir_plugin(inode);
17277 +
17278 + sinfo = get_super_private(inode->i_sb);
17279 +
17280 + switch (inode->i_mode & S_IFMT) {
17281 + case S_IFSOCK:
17282 + case S_IFBLK:
17283 + case S_IFCHR:
17284 + case S_IFIFO:
17285 + {
17286 + dev_t rdev; /* to keep gcc happy */
17287 +
17288 + assert("vs-46", fplug != NULL);
17289 + /* ugly hack with rdev */
17290 + if (data == NULL) {
17291 + rdev = inode->i_rdev;
17292 + inode->i_rdev = 0;
17293 + } else
17294 + rdev = data->rdev;
17295 + inode->i_blocks = 0;
17296 + assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17297 + inode->i_op = file_plugins[fplug->h.id].inode_ops;
17298 + /* initialize inode->i_fop and inode->i_rdev for block and char
17299 + devices */
17300 + init_special_inode(inode, inode->i_mode, rdev);
17301 + /* all address space operations are null */
17302 + inode->i_mapping->a_ops =
17303 + file_plugins[fplug->h.id].as_ops;
17304 + break;
17305 + }
17306 + case S_IFLNK:
17307 + assert("vs-46", fplug != NULL);
17308 + assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17309 + inode->i_op = file_plugins[fplug->h.id].inode_ops;
17310 + inode->i_fop = NULL;
17311 + /* all address space operations are null */
17312 + inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17313 + break;
17314 + case S_IFDIR:
17315 + assert("vs-46", dplug != NULL);
17316 + assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17317 + dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17318 + inode->i_op = dir_plugins[dplug->h.id].inode_ops;
17319 + inode->i_fop = dir_plugins[dplug->h.id].file_ops;
17320 + inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
17321 + break;
17322 + case S_IFREG:
17323 + assert("vs-46", fplug != NULL);
17324 + assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17325 + fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17326 + inode->i_op = file_plugins[fplug->h.id].inode_ops;
17327 + inode->i_fop = file_plugins[fplug->h.id].file_ops;
17328 + inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17329 + break;
17330 + default:
17331 + warning("nikita-291", "wrong file mode: %o for %llu",
17332 + inode->i_mode,
17333 + (unsigned long long)get_inode_oid(inode));
17334 + reiser4_make_bad_inode(inode);
17335 + return RETERR(-EINVAL);
17336 + }
17337 + return 0;
17338 +}
17339 +
17340 +/* Initialize inode from disk data. Called with inode locked.
17341 + Return inode locked. */
17342 +static int init_inode(struct inode *inode /* inode to intialise */ ,
17343 + coord_t * coord /* coord of stat data */ )
17344 +{
17345 + int result;
17346 + item_plugin *iplug;
17347 + void *body;
17348 + int length;
17349 + reiser4_inode *state;
17350 +
17351 + assert("nikita-292", coord != NULL);
17352 + assert("nikita-293", inode != NULL);
17353 +
17354 + coord_clear_iplug(coord);
17355 + result = zload(coord->node);
17356 + if (result)
17357 + return result;
17358 + iplug = item_plugin_by_coord(coord);
17359 + body = item_body_by_coord(coord);
17360 + length = item_length_by_coord(coord);
17361 +
17362 + assert("nikita-295", iplug != NULL);
17363 + assert("nikita-296", body != NULL);
17364 + assert("nikita-297", length > 0);
17365 +
17366 + /* inode is under I_LOCK now */
17367 +
17368 + state = reiser4_inode_data(inode);
17369 + /* call stat-data plugin method to load sd content into inode */
17370 + result = iplug->s.sd.init_inode(inode, body, length);
17371 + set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17372 + if (result == 0) {
17373 + result = setup_inode_ops(inode, NULL);
17374 + if (result == 0 && inode->i_sb->s_root &&
17375 + inode->i_sb->s_root->d_inode)
17376 + result = finish_pset(inode);
17377 + }
17378 + zrelse(coord->node);
17379 + return result;
17380 +}
17381 +
17382 +/* read `inode' from the disk. This is what was previously in
17383 + reiserfs_read_inode2().
17384 +
17385 + Must be called with inode locked. Return inode still locked.
17386 +*/
17387 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17388 + const reiser4_key * key /* key of stat data */ ,
17389 + int silent)
17390 +{
17391 + int result;
17392 + lock_handle lh;
17393 + reiser4_inode *info;
17394 + coord_t coord;
17395 +
17396 + assert("nikita-298", inode != NULL);
17397 + assert("nikita-1945", !is_inode_loaded(inode));
17398 +
17399 + info = reiser4_inode_data(inode);
17400 + assert("nikita-300", info->locality_id != 0);
17401 +
17402 + coord_init_zero(&coord);
17403 + init_lh(&lh);
17404 + /* locate stat-data in a tree and return znode locked */
17405 + result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17406 + assert("nikita-301", !is_inode_loaded(inode));
17407 + if (result == 0) {
17408 + /* use stat-data plugin to load sd into inode. */
17409 + result = init_inode(inode, &coord);
17410 + if (result == 0) {
17411 + /* initialize stat-data seal */
17412 + spin_lock_inode(inode);
17413 + reiser4_seal_init(&info->sd_seal, &coord, key);
17414 + info->sd_coord = coord;
17415 + spin_unlock_inode(inode);
17416 +
17417 + /* call file plugin's method to initialize plugin
17418 + * specific part of inode */
17419 + if (inode_file_plugin(inode)->init_inode_data)
17420 + inode_file_plugin(inode)->init_inode_data(inode,
17421 + NULL,
17422 + 0);
17423 + /* load detached directory cursors for stateless
17424 + * directory readers (NFS). */
17425 + reiser4_load_cursors(inode);
17426 +
17427 + /* Check the opened inode for consistency. */
17428 + result =
17429 + get_super_private(inode->i_sb)->df_plug->
17430 + check_open(inode);
17431 + }
17432 + }
17433 + /* lookup_sd() doesn't release coord because we want znode
17434 + stay read-locked while stat-data fields are accessed in
17435 + init_inode() */
17436 + done_lh(&lh);
17437 +
17438 + if (result != 0)
17439 + reiser4_make_bad_inode(inode);
17440 + return result;
17441 +}
17442 +
17443 +/* initialise new reiser4 inode being inserted into hash table. */
17444 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17445 + void *opaque /* key of stat data passed to the
17446 + * iget5_locked as cookie */ )
17447 +{
17448 + reiser4_key *key;
17449 +
17450 + assert("nikita-1995", inode != NULL);
17451 + assert("nikita-1996", opaque != NULL);
17452 + key = opaque;
17453 + set_inode_oid(inode, get_key_objectid(key));
17454 + reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17455 + return 0;
17456 +}
17457 +
17458 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17459 +
17460 + This function is called by iget5_locked() to distinguish reiser4 inodes
17461 + having the same inode numbers. Such inodes can only exist due to some error
17462 + condition. One of them should be bad. Inodes with identical inode numbers
17463 + (objectids) are distinguished by their packing locality.
17464 +
17465 +*/
17466 +static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17467 + * check */ ,
17468 + void *opaque /* "cookie" passed to
17469 + * iget5_locked(). This is stat data
17470 + * key */ )
17471 +{
17472 + reiser4_key *key;
17473 +
17474 + key = opaque;
17475 + return
17476 + /* oid is unique, so first term is enough, actually. */
17477 + get_inode_oid(inode) == get_key_objectid(key) &&
17478 + /*
17479 + * also, locality should be checked, but locality is stored in
17480 + * the reiser4-specific part of the inode, and actor can be
17481 + * called against arbitrary inode that happened to be in this
17482 + * hash chain. Hence we first have to check that this is
17483 + * reiser4 inode at least. is_reiser4_inode() is probably too
17484 + * early to call, as inode may have ->i_op not yet
17485 + * initialised.
17486 + */
17487 + is_reiser4_super(inode->i_sb) &&
17488 + /*
17489 + * usually objectid is unique, but pseudo files use counter to
17490 + * generate objectid. All pseudo files are placed into special
17491 + * (otherwise unused) locality.
17492 + */
17493 + reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17494 +}
17495 +
17496 +/* hook for kmem_cache_create */
17497 +void loading_init_once(reiser4_inode * info)
17498 +{
17499 + mutex_init(&info->loading);
17500 +}
17501 +
17502 +/* for reiser4_alloc_inode */
17503 +void loading_alloc(reiser4_inode * info)
17504 +{
17505 + assert("vs-1717", !mutex_is_locked(&info->loading));
17506 +}
17507 +
17508 +/* for reiser4_destroy */
17509 +void loading_destroy(reiser4_inode * info)
17510 +{
17511 + assert("vs-1717a", !mutex_is_locked(&info->loading));
17512 +}
17513 +
17514 +static void loading_begin(reiser4_inode * info)
17515 +{
17516 + mutex_lock(&info->loading);
17517 +}
17518 +
17519 +static void loading_end(reiser4_inode * info)
17520 +{
17521 + mutex_unlock(&info->loading);
17522 +}
17523 +
17524 +/**
17525 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17526 + * @super: super block of filesystem
17527 + * @key: key of inode's stat-data
17528 + * @silent:
17529 + *
17530 + * This is our helper function a la iget(). This is be called by
17531 + * lookup_common() and reiser4_read_super(). Return inode locked or error
17532 + * encountered.
17533 + */
17534 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17535 + int silent)
17536 +{
17537 + struct inode *inode;
17538 + int result;
17539 + reiser4_inode *info;
17540 +
17541 + assert("nikita-302", super != NULL);
17542 + assert("nikita-303", key != NULL);
17543 +
17544 + result = 0;
17545 +
17546 + /* call iget(). Our ->read_inode() is dummy, so this will either
17547 + find inode in cache or return uninitialised inode */
17548 + inode = iget5_locked(super,
17549 + (unsigned long)get_key_objectid(key),
17550 + reiser4_inode_find_actor,
17551 + init_locked_inode, (reiser4_key *) key);
17552 + if (inode == NULL)
17553 + return ERR_PTR(RETERR(-ENOMEM));
17554 + if (is_bad_inode(inode)) {
17555 + warning("nikita-304", "Bad inode found");
17556 + reiser4_print_key("key", key);
17557 + iput(inode);
17558 + return ERR_PTR(RETERR(-EIO));
17559 + }
17560 +
17561 + info = reiser4_inode_data(inode);
17562 +
17563 + /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17564 + loaded and initialized inode from just allocated inode. If
17565 + REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17566 + info->loading. The place in reiser4 which uses not initialized inode
17567 + is the reiser4 repacker, see repacker-related functions in
17568 + plugin/item/extent.c */
17569 + if (!is_inode_loaded(inode)) {
17570 + loading_begin(info);
17571 + if (!is_inode_loaded(inode)) {
17572 + /* locking: iget5_locked returns locked inode */
17573 + assert("nikita-1941", !is_inode_loaded(inode));
17574 + assert("nikita-1949",
17575 + reiser4_inode_find_actor(inode,
17576 + (reiser4_key *) key));
17577 + /* now, inode has objectid as ->i_ino and locality in
17578 + reiser4-specific part. This is enough for
17579 + read_inode() to read stat data from the disk */
17580 + result = read_inode(inode, key, silent);
17581 + } else
17582 + loading_end(info);
17583 + }
17584 +
17585 + if (inode->i_state & I_NEW)
17586 + unlock_new_inode(inode);
17587 +
17588 + if (is_bad_inode(inode)) {
17589 + assert("vs-1717", result != 0);
17590 + loading_end(info);
17591 + iput(inode);
17592 + inode = ERR_PTR(result);
17593 + } else if (REISER4_DEBUG) {
17594 + reiser4_key found_key;
17595 +
17596 + assert("vs-1717", result == 0);
17597 + build_sd_key(inode, &found_key);
17598 + if (!keyeq(&found_key, key)) {
17599 + warning("nikita-305", "Wrong key in sd");
17600 + reiser4_print_key("sought for", key);
17601 + reiser4_print_key("found", &found_key);
17602 + }
17603 + if (inode->i_nlink == 0) {
17604 + warning("nikita-3559", "Unlinked inode found: %llu\n",
17605 + (unsigned long long)get_inode_oid(inode));
17606 + }
17607 + }
17608 + return inode;
17609 +}
17610 +
17611 +/* reiser4_iget() may return not fully initialized inode, this function should
17612 + * be called after one completes reiser4 inode initializing. */
17613 +void reiser4_iget_complete(struct inode *inode)
17614 +{
17615 + assert("zam-988", is_reiser4_inode(inode));
17616 +
17617 + if (!is_inode_loaded(inode)) {
17618 + reiser4_inode_set_flag(inode, REISER4_LOADED);
17619 + loading_end(reiser4_inode_data(inode));
17620 + }
17621 +}
17622 +
17623 +void reiser4_make_bad_inode(struct inode *inode)
17624 +{
17625 + assert("nikita-1934", inode != NULL);
17626 +
17627 + /* clear LOADED bit */
17628 + reiser4_inode_clr_flag(inode, REISER4_LOADED);
17629 + make_bad_inode(inode);
17630 + return;
17631 +}
17632 +
17633 +file_plugin *inode_file_plugin(const struct inode * inode)
17634 +{
17635 + assert("nikita-1997", inode != NULL);
17636 + return reiser4_inode_data(inode)->pset->file;
17637 +}
17638 +
17639 +dir_plugin *inode_dir_plugin(const struct inode * inode)
17640 +{
17641 + assert("nikita-1998", inode != NULL);
17642 + return reiser4_inode_data(inode)->pset->dir;
17643 +}
17644 +
17645 +formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17646 +{
17647 + assert("nikita-2000", inode != NULL);
17648 + return reiser4_inode_data(inode)->pset->formatting;
17649 +}
17650 +
17651 +hash_plugin *inode_hash_plugin(const struct inode * inode)
17652 +{
17653 + assert("nikita-2001", inode != NULL);
17654 + return reiser4_inode_data(inode)->pset->hash;
17655 +}
17656 +
17657 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17658 +{
17659 + assert("nikita-2001", inode != NULL);
17660 + return reiser4_inode_data(inode)->pset->fibration;
17661 +}
17662 +
17663 +cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17664 +{
17665 + assert("edward-36", inode != NULL);
17666 + return reiser4_inode_data(inode)->pset->cipher;
17667 +}
17668 +
17669 +compression_plugin *inode_compression_plugin(const struct inode * inode)
17670 +{
17671 + assert("edward-37", inode != NULL);
17672 + return reiser4_inode_data(inode)->pset->compression;
17673 +}
17674 +
17675 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17676 + inode)
17677 +{
17678 + assert("edward-1330", inode != NULL);
17679 + return reiser4_inode_data(inode)->pset->compression_mode;
17680 +}
17681 +
17682 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17683 +{
17684 + assert("edward-1328", inode != NULL);
17685 + return reiser4_inode_data(inode)->pset->cluster;
17686 +}
17687 +
17688 +file_plugin *inode_create_plugin(const struct inode * inode)
17689 +{
17690 + assert("edward-1329", inode != NULL);
17691 + return reiser4_inode_data(inode)->pset->create;
17692 +}
17693 +
17694 +digest_plugin *inode_digest_plugin(const struct inode * inode)
17695 +{
17696 + assert("edward-86", inode != NULL);
17697 + return reiser4_inode_data(inode)->pset->digest;
17698 +}
17699 +
17700 +item_plugin *inode_sd_plugin(const struct inode * inode)
17701 +{
17702 + assert("vs-534", inode != NULL);
17703 + return reiser4_inode_data(inode)->pset->sd;
17704 +}
17705 +
17706 +item_plugin *inode_dir_item_plugin(const struct inode * inode)
17707 +{
17708 + assert("vs-534", inode != NULL);
17709 + return reiser4_inode_data(inode)->pset->dir_item;
17710 +}
17711 +
17712 +file_plugin *child_create_plugin(const struct inode * inode)
17713 +{
17714 + assert("edward-1329", inode != NULL);
17715 + return reiser4_inode_data(inode)->hset->create;
17716 +}
17717 +
17718 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17719 +{
17720 + reiser4_inode *state;
17721 +
17722 + assert("nikita-2716", inode != NULL);
17723 + assert("nikita-2717", ext < LAST_SD_EXTENSION);
17724 + assert("nikita-3491", spin_inode_is_locked(inode));
17725 +
17726 + state = reiser4_inode_data(inode);
17727 + state->extmask |= 1 << ext;
17728 + /* force re-calculation of stat-data length on next call to
17729 + update_sd(). */
17730 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17731 +}
17732 +
17733 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17734 +{
17735 + reiser4_inode *state;
17736 +
17737 + assert("vpf-1926", inode != NULL);
17738 + assert("vpf-1927", ext < LAST_SD_EXTENSION);
17739 + assert("vpf-1928", spin_inode_is_locked(inode));
17740 +
17741 + state = reiser4_inode_data(inode);
17742 + state->extmask &= ~(1 << ext);
17743 + /* force re-calculation of stat-data length on next call to
17744 + update_sd(). */
17745 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17746 +}
17747 +
17748 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17749 +{
17750 + assert("edward-1287", inode != NULL);
17751 + if (!dscale_fit(old, new))
17752 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17753 + return;
17754 +}
17755 +
17756 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17757 +{
17758 + assert("nikita-2875", inode != NULL);
17759 + spin_lock_inode(inode);
17760 + inode_check_scale_nolock(inode, old, new);
17761 + spin_unlock_inode(inode);
17762 +}
17763 +
17764 +/*
17765 + * initialize ->ordering field of inode. This field defines how file stat-data
17766 + * and body is ordered within a tree with respect to other objects within the
17767 + * same parent directory.
17768 + */
17769 +void
17770 +init_inode_ordering(struct inode *inode,
17771 + reiser4_object_create_data * crd, int create)
17772 +{
17773 + reiser4_key key;
17774 +
17775 + if (create) {
17776 + struct inode *parent;
17777 +
17778 + parent = crd->parent;
17779 + assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17780 + inode_dir_plugin(parent)->build_entry_key(parent,
17781 + &crd->dentry->d_name,
17782 + &key);
17783 + } else {
17784 + coord_t *coord;
17785 +
17786 + coord = &reiser4_inode_data(inode)->sd_coord;
17787 + coord_clear_iplug(coord);
17788 + /* safe to use ->sd_coord, because node is under long term
17789 + * lock */
17790 + WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17791 + }
17792 +
17793 + set_inode_ordering(inode, get_key_ordering(&key));
17794 +}
17795 +
17796 +znode *inode_get_vroot(struct inode *inode)
17797 +{
17798 + reiser4_block_nr blk;
17799 + znode *result;
17800 +
17801 + spin_lock_inode(inode);
17802 + blk = reiser4_inode_data(inode)->vroot;
17803 + spin_unlock_inode(inode);
17804 + if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17805 + result = zlook(reiser4_tree_by_inode(inode), &blk);
17806 + else
17807 + result = NULL;
17808 + return result;
17809 +}
17810 +
17811 +void inode_set_vroot(struct inode *inode, znode *vroot)
17812 +{
17813 + spin_lock_inode(inode);
17814 + reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17815 + spin_unlock_inode(inode);
17816 +}
17817 +
17818 +#if REISER4_DEBUG
17819 +
17820 +void reiser4_inode_invariant(const struct inode *inode)
17821 +{
17822 + assert("nikita-3077", spin_inode_is_locked(inode));
17823 +}
17824 +
17825 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
17826 +{
17827 + return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17828 + r4_inode->nr_jnodes == 0;
17829 +}
17830 +
17831 +#endif
17832 +
17833 +/* true if directory is empty (only contains dot and dotdot) */
17834 +/* FIXME: shouldn't it be dir plugin method? */
17835 +int is_dir_empty(const struct inode *dir)
17836 +{
17837 + assert("nikita-1976", dir != NULL);
17838 +
17839 + /* rely on our method to maintain directory i_size being equal to the
17840 + number of entries. */
17841 + return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17842 +}
17843 +
17844 +/* Make Linus happy.
17845 + Local variables:
17846 + c-indentation-style: "K&R"
17847 + mode-name: "LC"
17848 + c-basic-offset: 8
17849 + tab-width: 8
17850 + fill-column: 120
17851 + End:
17852 +*/
17853 diff -urN linux-2.6.23.orig/fs/reiser4/inode.h linux-2.6.23/fs/reiser4/inode.h
17854 --- linux-2.6.23.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17855 +++ linux-2.6.23/fs/reiser4/inode.h 2007-12-04 16:49:30.000000000 +0300
17856 @@ -0,0 +1,449 @@
17857 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17858 +
17859 +/* Inode functions. */
17860 +
17861 +#if !defined( __REISER4_INODE_H__ )
17862 +#define __REISER4_INODE_H__
17863 +
17864 +#include "forward.h"
17865 +#include "debug.h"
17866 +#include "key.h"
17867 +#include "seal.h"
17868 +#include "plugin/plugin.h"
17869 +#include "plugin/file/cryptcompress.h"
17870 +#include "plugin/file/file.h"
17871 +#include "plugin/dir/dir.h"
17872 +#include "plugin/plugin_set.h"
17873 +#include "plugin/security/perm.h"
17874 +#include "vfs_ops.h"
17875 +#include "jnode.h"
17876 +#include "fsdata.h"
17877 +
17878 +#include <linux/types.h> /* for __u?? , ino_t */
17879 +#include <linux/fs.h> /* for struct super_block, struct
17880 + * rw_semaphore, etc */
17881 +#include <linux/spinlock.h>
17882 +#include <asm/types.h>
17883 +
17884 +/* reiser4-specific inode flags. They are "transient" and are not
17885 + supposed to be stored on disk. Used to trace "state" of
17886 + inode
17887 +*/
17888 +typedef enum {
17889 + /* this is light-weight inode, inheriting some state from its
17890 + parent */
17891 + REISER4_LIGHT_WEIGHT = 0,
17892 + /* stat data wasn't yet created */
17893 + REISER4_NO_SD = 1,
17894 + /* internal immutable flag. Currently is only used
17895 + to avoid race condition during file creation.
17896 + See comment in create_object(). */
17897 + REISER4_IMMUTABLE = 2,
17898 + /* inode was read from storage */
17899 + REISER4_LOADED = 3,
17900 + /* this bit is set for symlinks. inode->i_private points to target
17901 + name of symlink. */
17902 + REISER4_GENERIC_PTR_USED = 4,
17903 + /* set if size of stat-data item for this inode is known. If this is
17904 + * set we can avoid recalculating size of stat-data on each update. */
17905 + REISER4_SDLEN_KNOWN = 5,
17906 + /* reiser4_inode->crypt points to the crypto stat */
17907 + REISER4_CRYPTO_STAT_LOADED = 6,
17908 + /* cryptcompress_inode_data points to the secret key */
17909 + REISER4_SECRET_KEY_INSTALLED = 7,
17910 + /* File (possibly) has pages corresponding to the tail items, that
17911 + * were created by ->readpage. It is set by mmap_unix_file() and
17912 + * sendfile_unix_file(). This bit is inspected by write_unix_file and
17913 + * kill-hook of tail items. It is never cleared once set. This bit is
17914 + * modified and inspected under i_mutex. */
17915 + REISER4_HAS_MMAP = 8,
17916 + REISER4_PART_MIXED = 9,
17917 + REISER4_PART_IN_CONV = 10,
17918 + /* This flag indicates that file plugin conversion is in progress */
17919 + REISER4_FILE_CONV_IN_PROGRESS = 11
17920 +} reiser4_file_plugin_flags;
17921 +
17922 +/* state associated with each inode.
17923 + reiser4 inode.
17924 +
17925 + NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17926 + be of the same size. File-system allocates inodes by itself through
17927 + s_op->allocate_inode() method. So, it is possible to adjust size of inode
17928 + at the time of its creation.
17929 +
17930 + Invariants involving parts of this data-type:
17931 +
17932 + [inode->eflushed]
17933 +
17934 +*/
17935 +
17936 +typedef struct reiser4_inode reiser4_inode;
17937 +/* return pointer to reiser4-specific part of inode */
17938 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17939 + /* inode queried */ );
17940 +
17941 +#if BITS_PER_LONG == 64
17942 +
17943 +#define REISER4_INO_IS_OID (1)
17944 +typedef struct {;
17945 +} oid_hi_t;
17946 +
17947 +/* BITS_PER_LONG == 64 */
17948 +#else
17949 +
17950 +#define REISER4_INO_IS_OID (0)
17951 +typedef __u32 oid_hi_t;
17952 +
17953 +/* BITS_PER_LONG == 64 */
17954 +#endif
17955 +
17956 +struct reiser4_inode {
17957 + /* spin lock protecting fields of this structure. */
17958 + spinlock_t guard;
17959 + /* main plugin set that control the file
17960 + (see comments in plugin/plugin_set.c) */
17961 + plugin_set *pset;
17962 + /* plugin set for inheritance
17963 + (see comments in plugin/plugin_set.c) */
17964 + plugin_set *hset;
17965 + /* high 32 bits of object id */
17966 + oid_hi_t oid_hi;
17967 + /* seal for stat-data */
17968 + seal_t sd_seal;
17969 + /* locality id for this file */
17970 + oid_t locality_id;
17971 +#if REISER4_LARGE_KEY
17972 + __u64 ordering;
17973 +#endif
17974 + /* coord of stat-data in sealed node */
17975 + coord_t sd_coord;
17976 + /* bit-mask of stat-data extentions used by this file */
17977 + __u64 extmask;
17978 + /* bitmask of non-default plugins for this inode */
17979 + __u16 plugin_mask;
17980 + /* bitmask of set heir plugins for this inode. */
17981 + __u16 heir_mask;
17982 + union {
17983 + struct list_head readdir_list;
17984 + struct list_head not_used;
17985 + } lists;
17986 + /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17987 + unsigned long flags;
17988 + union {
17989 + /* fields specific to unix_file plugin */
17990 + struct unix_file_info unix_file_info;
17991 + /* fields specific to cryptcompress file plugin */
17992 + struct cryptcompress_info cryptcompress_info;
17993 + } file_plugin_data;
17994 +
17995 + /* this semaphore is to serialize readers and writers of @pset->file
17996 + * when file plugin conversion is enabled
17997 + */
17998 + struct rw_semaphore conv_sem;
17999 +
18000 + /* tree of jnodes. Phantom jnodes (ones not attched to any atom) are
18001 + tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18002 + struct radix_tree_root jnodes_tree;
18003 +#if REISER4_DEBUG
18004 + /* number of unformatted node jnodes of this file in jnode hash table */
18005 + unsigned long nr_jnodes;
18006 +#endif
18007 +
18008 + /* block number of virtual root for this object. See comment above
18009 + * fs/reiser4/search.c:handle_vroot() */
18010 + reiser4_block_nr vroot;
18011 + struct mutex loading;
18012 +};
18013 +
18014 +void loading_init_once(reiser4_inode *);
18015 +void loading_alloc(reiser4_inode *);
18016 +void loading_destroy(reiser4_inode *);
18017 +
18018 +struct reiser4_inode_object {
18019 + /* private part */
18020 + reiser4_inode p;
18021 + /* generic fields not specific to reiser4, but used by VFS */
18022 + struct inode vfs_inode;
18023 +};
18024 +
18025 +/* return pointer to the reiser4 specific portion of @inode */
18026 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18027 + /* inode queried */ )
18028 +{
18029 + assert("nikita-254", inode != NULL);
18030 + return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
18031 +}
18032 +
18033 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18034 + r4_inode /* inode queried */
18035 + )
18036 +{
18037 + return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode;
18038 +}
18039 +
18040 +/*
18041 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18042 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18043 + * bits.
18044 + *
18045 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18046 + * of inode, otherwise whole oid is stored in i_ino.
18047 + *
18048 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18049 + */
18050 +
18051 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18052 +
18053 +#if REISER4_INO_IS_OID
18054 +
18055 +static inline oid_t get_inode_oid(const struct inode *inode)
18056 +{
18057 + return inode->i_ino;
18058 +}
18059 +
18060 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18061 +{
18062 + inode->i_ino = oid;
18063 +}
18064 +
18065 +/* REISER4_INO_IS_OID */
18066 +#else
18067 +
18068 +static inline oid_t get_inode_oid(const struct inode *inode)
18069 +{
18070 + return
18071 + ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18072 + inode->i_ino;
18073 +}
18074 +
18075 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18076 +{
18077 + assert("nikita-2519", inode != NULL);
18078 + inode->i_ino = (ino_t) (oid);
18079 + reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18080 + assert("nikita-2521", get_inode_oid(inode) == (oid));
18081 +}
18082 +
18083 +/* REISER4_INO_IS_OID */
18084 +#endif
18085 +
18086 +static inline oid_t get_inode_locality(const struct inode *inode)
18087 +{
18088 + return reiser4_inode_data(inode)->locality_id;
18089 +}
18090 +
18091 +#if REISER4_LARGE_KEY
18092 +static inline __u64 get_inode_ordering(const struct inode *inode)
18093 +{
18094 + return reiser4_inode_data(inode)->ordering;
18095 +}
18096 +
18097 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18098 +{
18099 + reiser4_inode_data(inode)->ordering = ordering;
18100 +}
18101 +
18102 +#else
18103 +
18104 +#define get_inode_ordering(inode) (0)
18105 +#define set_inode_ordering(inode, val) noop
18106 +
18107 +#endif
18108 +
18109 +/* return inode in which @uf_info is embedded */
18110 +static inline struct inode *
18111 +unix_file_info_to_inode(const struct unix_file_info * uf_info)
18112 +{
18113 + return &container_of(uf_info, struct reiser4_inode_object,
18114 + p.file_plugin_data.unix_file_info)->vfs_inode;
18115 +}
18116 +
18117 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18118 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18119 +
18120 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18121 +
18122 +#if REISER4_DEBUG
18123 +extern void reiser4_inode_invariant(const struct inode *inode);
18124 +extern int inode_has_no_jnodes(reiser4_inode *);
18125 +#else
18126 +#define reiser4_inode_invariant(inode) noop
18127 +#endif
18128 +
18129 +static inline int spin_inode_is_locked(const struct inode *inode)
18130 +{
18131 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18132 + return 1;
18133 +}
18134 +
18135 +/**
18136 + * spin_lock_inode - lock reiser4_inode' embedded spinlock
18137 + * @inode: inode to lock
18138 + *
18139 + * In debug mode it checks that lower priority locks are not held and
18140 + * increments reiser4_context's lock counters on which lock ordering checking
18141 + * is based.
18142 + */
18143 +static inline void spin_lock_inode(struct inode *inode)
18144 +{
18145 + assert("", LOCK_CNT_NIL(spin_locked));
18146 + /* check lock ordering */
18147 + assert_spin_not_locked(&d_lock);
18148 +
18149 + spin_lock(&reiser4_inode_data(inode)->guard);
18150 +
18151 + LOCK_CNT_INC(spin_locked_inode);
18152 + LOCK_CNT_INC(spin_locked);
18153 +
18154 + reiser4_inode_invariant(inode);
18155 +}
18156 +
18157 +/**
18158 + * spin_unlock_inode - unlock reiser4_inode' embedded spinlock
18159 + * @inode: inode to unlock
18160 + *
18161 + * In debug mode it checks that spinlock is held and decrements
18162 + * reiser4_context's lock counters on which lock ordering checking is based.
18163 + */
18164 +static inline void spin_unlock_inode(struct inode *inode)
18165 +{
18166 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18167 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18168 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18169 +
18170 + reiser4_inode_invariant(inode);
18171 +
18172 + LOCK_CNT_DEC(spin_locked_inode);
18173 + LOCK_CNT_DEC(spin_locked);
18174 +
18175 + spin_unlock(&reiser4_inode_data(inode)->guard);
18176 +}
18177 +
18178 +extern znode *inode_get_vroot(struct inode *inode);
18179 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18180 +
18181 +extern int reiser4_max_filename_len(const struct inode *inode);
18182 +extern int max_hash_collisions(const struct inode *dir);
18183 +extern void reiser4_unlock_inode(struct inode *inode);
18184 +extern int is_reiser4_inode(const struct inode *inode);
18185 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18186 +extern struct inode *reiser4_iget(struct super_block *super,
18187 + const reiser4_key * key, int silent);
18188 +extern void reiser4_iget_complete(struct inode *inode);
18189 +extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18190 +extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18191 +extern int reiser4_inode_get_flag(const struct inode *inode,
18192 + reiser4_file_plugin_flags f);
18193 +
18194 +/* has inode been initialized? */
18195 +static inline int
18196 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18197 +{
18198 + assert("nikita-1120", inode != NULL);
18199 + return reiser4_inode_get_flag(inode, REISER4_LOADED);
18200 +}
18201 +
18202 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18203 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18204 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18205 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18206 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18207 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18208 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18209 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18210 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18211 + *inode);
18212 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18213 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18214 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18215 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18216 +extern file_plugin *child_create_plugin(const struct inode *inode);
18217 +
18218 +extern void reiser4_make_bad_inode(struct inode *inode);
18219 +
18220 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18221 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18222 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18223 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18224 +
18225 +#define INODE_SET_SIZE(i, value) \
18226 +({ \
18227 + struct inode *__i; \
18228 + typeof(value) __v; \
18229 + \
18230 + __i = (i); \
18231 + __v = (value); \
18232 + inode_check_scale(__i, __i->i_size, __v); \
18233 + i_size_write(__i, __v); \
18234 +})
18235 +
18236 +/*
18237 + * update field @field in inode @i to contain value @value.
18238 + */
18239 +#define INODE_SET_FIELD(i, field, value) \
18240 +({ \
18241 + struct inode *__i; \
18242 + typeof(value) __v; \
18243 + \
18244 + __i = (i); \
18245 + __v = (value); \
18246 + inode_check_scale(__i, __i->field, __v); \
18247 + __i->field = __v; \
18248 +})
18249 +
18250 +#define INODE_INC_FIELD(i, field) \
18251 +({ \
18252 + struct inode *__i; \
18253 + \
18254 + __i = (i); \
18255 + inode_check_scale(__i, __i->field, __i->field + 1); \
18256 + ++ __i->field; \
18257 +})
18258 +
18259 +#define INODE_DEC_FIELD(i, field) \
18260 +({ \
18261 + struct inode *__i; \
18262 + \
18263 + __i = (i); \
18264 + inode_check_scale(__i, __i->field, __i->field - 1); \
18265 + -- __i->field; \
18266 +})
18267 +
18268 +/* See comment before reiser4_readdir_common() for description. */
18269 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18270 +{
18271 + return &reiser4_inode_data(inode)->lists.readdir_list;
18272 +}
18273 +
18274 +extern void init_inode_ordering(struct inode *inode,
18275 + reiser4_object_create_data * crd, int create);
18276 +
18277 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18278 +{
18279 + return &reiser4_inode_data(inode)->jnodes_tree;
18280 +}
18281 +
18282 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18283 + * r4_inode)
18284 +{
18285 + return &r4_inode->jnodes_tree;
18286 +}
18287 +
18288 +#if REISER4_DEBUG
18289 +extern void print_inode(const char *prefix, const struct inode *i);
18290 +#endif
18291 +
18292 +int is_dir_empty(const struct inode *);
18293 +
18294 +/* __REISER4_INODE_H__ */
18295 +#endif
18296 +
18297 +/* Make Linus happy.
18298 + Local variables:
18299 + c-indentation-style: "K&R"
18300 + mode-name: "LC"
18301 + c-basic-offset: 8
18302 + tab-width: 8
18303 + fill-column: 120
18304 + End:
18305 +*/
18306 diff -urN linux-2.6.23.orig/fs/reiser4/ioctl.h linux-2.6.23/fs/reiser4/ioctl.h
18307 --- linux-2.6.23.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18308 +++ linux-2.6.23/fs/reiser4/ioctl.h 2007-12-04 16:49:30.000000000 +0300
18309 @@ -0,0 +1,41 @@
18310 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18311 + * reiser4/README */
18312 +
18313 +#if !defined( __REISER4_IOCTL_H__ )
18314 +#define __REISER4_IOCTL_H__
18315 +
18316 +#include <linux/fs.h>
18317 +
18318 +/*
18319 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18320 + * extents and fix in this state. This is used by applications that rely on
18321 + *
18322 + * . files being block aligned, and
18323 + *
18324 + * . files never migrating on disk
18325 + *
18326 + * for example, boot loaders (LILO) need this.
18327 + *
18328 + * This ioctl should be used as
18329 + *
18330 + * result = ioctl(fd, REISER4_IOC_UNPACK);
18331 + *
18332 + * File behind fd descriptor will be converted to the extents (if necessary),
18333 + * and its stat-data will be updated so that it will never be converted back
18334 + * into tails again.
18335 + */
18336 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18337 +
18338 +/* __REISER4_IOCTL_H__ */
18339 +#endif
18340 +
18341 +/* Make Linus happy.
18342 + Local variables:
18343 + c-indentation-style: "K&R"
18344 + mode-name: "LC"
18345 + c-basic-offset: 8
18346 + tab-width: 8
18347 + fill-column: 120
18348 + scroll-step: 1
18349 + End:
18350 +*/
18351 diff -urN linux-2.6.23.orig/fs/reiser4/jnode.c linux-2.6.23/fs/reiser4/jnode.c
18352 --- linux-2.6.23.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18353 +++ linux-2.6.23/fs/reiser4/jnode.c 2007-12-04 16:49:30.000000000 +0300
18354 @@ -0,0 +1,1924 @@
18355 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18356 + * reiser4/README */
18357 +/* Jnode manipulation functions. */
18358 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18359 +
18360 + In particular, jnodes are used to track transactional information
18361 + associated with each block. Each znode contains jnode as ->zjnode field.
18362 +
18363 + Jnode stands for either Josh or Journal node.
18364 +*/
18365 +
18366 +/*
18367 + * Taxonomy.
18368 + *
18369 + * Jnode represents block containing data or meta-data. There are jnodes
18370 + * for:
18371 + *
18372 + * unformatted blocks (jnodes proper). There are plans, however to
18373 + * have a handle per extent unit rather than per each unformatted
18374 + * block, because there are so many of them.
18375 + *
18376 + * For bitmaps. Each bitmap is actually represented by two jnodes--one
18377 + * for working and another for "commit" data, together forming bnode.
18378 + *
18379 + * For io-heads. These are used by log writer.
18380 + *
18381 + * For formatted nodes (znode). See comment at the top of znode.c for
18382 + * details specific to the formatted nodes (znodes).
18383 + *
18384 + * Node data.
18385 + *
18386 + * Jnode provides access to the data of node it represents. Data are
18387 + * stored in a page. Page is kept in a page cache. This means, that jnodes
18388 + * are highly interconnected with page cache and VM internals.
18389 + *
18390 + * jnode has a pointer to page (->pg) containing its data. Pointer to data
18391 + * themselves is cached in ->data field to avoid frequent calls to
18392 + * page_address().
18393 + *
18394 + * jnode and page are attached to each other by jnode_attach_page(). This
18395 + * function places pointer to jnode in set_page_private(), sets PG_private
18396 + * flag and increments page counter.
18397 + *
18398 + * Opposite operation is performed by page_clear_jnode().
18399 + *
18400 + * jnode->pg is protected by jnode spin lock, and page->private is
18401 + * protected by page lock. See comment at the top of page_cache.c for
18402 + * more.
18403 + *
18404 + * page can be detached from jnode for two reasons:
18405 + *
18406 + * . jnode is removed from a tree (file is truncated, of formatted
18407 + * node is removed by balancing).
18408 + *
18409 + * . during memory pressure, VM calls ->releasepage() method
18410 + * (reiser4_releasepage()) to evict page from memory.
18411 + *
18412 + * (there, of course, is also umount, but this is special case we are not
18413 + * concerned with here).
18414 + *
18415 + * To protect jnode page from eviction, one calls jload() function that
18416 + * "pins" page in memory (loading it if necessary), increments
18417 + * jnode->d_count, and kmap()s page. Page is unpinned through call to
18418 + * jrelse().
18419 + *
18420 + * Jnode life cycle.
18421 + *
18422 + * jnode is created, placed in hash table, and, optionally, in per-inode
18423 + * radix tree. Page can be attached to jnode, pinned, released, etc.
18424 + *
18425 + * When jnode is captured into atom its reference counter is
18426 + * increased. While being part of an atom, jnode can be "early
18427 + * flushed". This means that as part of flush procedure, jnode is placed
18428 + * into "relocate set", and its page is submitted to the disk. After io
18429 + * completes, page can be detached, then loaded again, re-dirtied, etc.
18430 + *
18431 + * Thread acquired reference to jnode by calling jref() and releases it by
18432 + * jput(). When last reference is removed, jnode is still retained in
18433 + * memory (cached) if it has page attached, _unless_ it is scheduled for
18434 + * destruction (has JNODE_HEARD_BANSHEE bit set).
18435 + *
18436 + * Tree read-write lock was used as "existential" lock for jnodes. That is,
18437 + * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18438 + * that is, tree lock protected unreferenced jnodes stored in the hash
18439 + * table, from recycling.
18440 + *
18441 + * This resulted in high contention on tree lock, because jref()/jput() is
18442 + * frequent operation. To ameliorate this problem, RCU is used: when jput()
18443 + * is just about to release last reference on jnode it sets JNODE_RIP bit
18444 + * on it, and then proceed with jnode destruction (removing jnode from hash
18445 + * table, cbk_cache, detaching page, etc.). All places that change jnode
18446 + * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18447 + * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18448 + * jnode_rip_check() function), and pretend that nothing was found in hash
18449 + * table if bit is set.
18450 + *
18451 + * jput defers actual return of jnode into slab cache to some later time
18452 + * (by call_rcu()), this guarantees that other threads can safely continue
18453 + * working with JNODE_RIP-ped jnode.
18454 + *
18455 + */
18456 +
18457 +#include "reiser4.h"
18458 +#include "debug.h"
18459 +#include "dformat.h"
18460 +#include "jnode.h"
18461 +#include "plugin/plugin_header.h"
18462 +#include "plugin/plugin.h"
18463 +#include "txnmgr.h"
18464 +/*#include "jnode.h"*/
18465 +#include "znode.h"
18466 +#include "tree.h"
18467 +#include "tree_walk.h"
18468 +#include "super.h"
18469 +#include "inode.h"
18470 +#include "page_cache.h"
18471 +
18472 +#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18473 +#include <linux/types.h>
18474 +#include <linux/slab.h>
18475 +#include <linux/pagemap.h>
18476 +#include <linux/swap.h>
18477 +#include <linux/fs.h> /* for struct address_space */
18478 +#include <linux/writeback.h> /* for inode_lock */
18479 +
18480 +static struct kmem_cache *_jnode_slab = NULL;
18481 +
18482 +static void jnode_set_type(jnode * node, jnode_type type);
18483 +static int jdelete(jnode * node);
18484 +static int jnode_try_drop(jnode * node);
18485 +
18486 +#if REISER4_DEBUG
18487 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18488 +#endif
18489 +
18490 +/* true if valid page is attached to jnode */
18491 +static inline int jnode_is_parsed(jnode * node)
18492 +{
18493 + return JF_ISSET(node, JNODE_PARSED);
18494 +}
18495 +
18496 +/* hash table support */
18497 +
18498 +/* compare two jnode keys for equality. Used by hash-table macros */
18499 +static inline int jnode_key_eq(const struct jnode_key * k1,
18500 + const struct jnode_key * k2)
18501 +{
18502 + assert("nikita-2350", k1 != NULL);
18503 + assert("nikita-2351", k2 != NULL);
18504 +
18505 + return (k1->index == k2->index && k1->objectid == k2->objectid);
18506 +}
18507 +
18508 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18509 +static inline __u32 jnode_key_hashfn(j_hash_table * table,
18510 + const struct jnode_key * key)
18511 +{
18512 + assert("nikita-2352", key != NULL);
18513 + assert("nikita-3346", IS_POW(table->_buckets));
18514 +
18515 + /* yes, this is remarkable simply (where not stupid) hash function. */
18516 + return (key->objectid + key->index) & (table->_buckets - 1);
18517 +}
18518 +
18519 +/* The hash table definition */
18520 +#define KMALLOC(size) reiser4_vmalloc(size)
18521 +#define KFREE(ptr, size) vfree(ptr)
18522 +TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18523 + jnode_key_hashfn, jnode_key_eq);
18524 +#undef KFREE
18525 +#undef KMALLOC
18526 +
18527 +/* call this to initialise jnode hash table */
18528 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18529 +{
18530 + assert("nikita-2359", tree != NULL);
18531 + return j_hash_init(&tree->jhash_table, 16384);
18532 +}
18533 +
18534 +/* call this to destroy jnode hash table. This is called during umount. */
18535 +int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18536 +{
18537 + j_hash_table *jtable;
18538 + jnode *node;
18539 + jnode *next;
18540 +
18541 + assert("nikita-2360", tree != NULL);
18542 +
18543 + /*
18544 + * Scan hash table and free all jnodes.
18545 + */
18546 + jtable = &tree->jhash_table;
18547 + if (jtable->_table) {
18548 + for_all_in_htable(jtable, j, node, next) {
18549 + assert("nikita-2361", !atomic_read(&node->x_count));
18550 + jdrop(node);
18551 + }
18552 +
18553 + j_hash_done(&tree->jhash_table);
18554 + }
18555 + return 0;
18556 +}
18557 +
18558 +/**
18559 + * init_jnodes - create jnode cache
18560 + *
18561 + * Initializes slab cache jnodes. It is part of reiser4 module initialization.
18562 + */
18563 +int init_jnodes(void)
18564 +{
18565 + assert("umka-168", _jnode_slab == NULL);
18566 +
18567 + _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18568 + SLAB_HWCACHE_ALIGN |
18569 + SLAB_RECLAIM_ACCOUNT, NULL);
18570 + if (_jnode_slab == NULL)
18571 + return RETERR(-ENOMEM);
18572 +
18573 + return 0;
18574 +}
18575 +
18576 +/**
18577 + * done_znodes - delete znode cache
18578 + *
18579 + * This is called on reiser4 module unloading or system shutdown.
18580 + */
18581 +void done_jnodes(void)
18582 +{
18583 + destroy_reiser4_cache(&_jnode_slab);
18584 +}
18585 +
18586 +/* Initialize a jnode. */
18587 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18588 +{
18589 + assert("umka-175", node != NULL);
18590 +
18591 + memset(node, 0, sizeof(jnode));
18592 + ON_DEBUG(node->magic = JMAGIC);
18593 + jnode_set_type(node, type);
18594 + atomic_set(&node->d_count, 0);
18595 + atomic_set(&node->x_count, 0);
18596 + spin_lock_init(&node->guard);
18597 + spin_lock_init(&node->load);
18598 + node->atom = NULL;
18599 + node->tree = tree;
18600 + INIT_LIST_HEAD(&node->capture_link);
18601 +
18602 + ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18603 +
18604 + INIT_RCU_HEAD(&node->rcu);
18605 +
18606 +#if REISER4_DEBUG
18607 + {
18608 + reiser4_super_info_data *sbinfo;
18609 +
18610 + sbinfo = get_super_private(tree->super);
18611 + spin_lock_irq(&sbinfo->all_guard);
18612 + list_add(&node->jnodes, &sbinfo->all_jnodes);
18613 + spin_unlock_irq(&sbinfo->all_guard);
18614 + }
18615 +#endif
18616 +}
18617 +
18618 +#if REISER4_DEBUG
18619 +/*
18620 + * Remove jnode from ->all_jnodes list.
18621 + */
18622 +static void jnode_done(jnode * node, reiser4_tree * tree)
18623 +{
18624 + reiser4_super_info_data *sbinfo;
18625 +
18626 + sbinfo = get_super_private(tree->super);
18627 +
18628 + spin_lock_irq(&sbinfo->all_guard);
18629 + assert("nikita-2422", !list_empty(&node->jnodes));
18630 + list_del_init(&node->jnodes);
18631 + spin_unlock_irq(&sbinfo->all_guard);
18632 +}
18633 +#endif
18634 +
18635 +/* return already existing jnode of page */
18636 +jnode *jnode_by_page(struct page *pg)
18637 +{
18638 + assert("nikita-2066", pg != NULL);
18639 + assert("nikita-2400", PageLocked(pg));
18640 + assert("nikita-2068", PagePrivate(pg));
18641 + assert("nikita-2067", jprivate(pg) != NULL);
18642 + return jprivate(pg);
18643 +}
18644 +
18645 +/* exported functions to allocate/free jnode objects outside this file */
18646 +jnode *jalloc(void)
18647 +{
18648 + jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18649 + return jal;
18650 +}
18651 +
18652 +/* return jnode back to the slab allocator */
18653 +inline void jfree(jnode * node)
18654 +{
18655 + assert("zam-449", node != NULL);
18656 +
18657 + assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18658 + NODE_LIST(node) == NOT_CAPTURED));
18659 + assert("nikita-3222", list_empty(&node->jnodes));
18660 + assert("nikita-3221", jnode_page(node) == NULL);
18661 +
18662 + /* not yet phash_jnode_destroy(node); */
18663 +
18664 + kmem_cache_free(_jnode_slab, node);
18665 +}
18666 +
18667 +/*
18668 + * This function is supplied as RCU callback. It actually frees jnode when
18669 + * last reference to it is gone.
18670 + */
18671 +static void jnode_free_actor(struct rcu_head *head)
18672 +{
18673 + jnode *node;
18674 + jnode_type jtype;
18675 +
18676 + node = container_of(head, jnode, rcu);
18677 + jtype = jnode_get_type(node);
18678 +
18679 + ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18680 +
18681 + switch (jtype) {
18682 + case JNODE_IO_HEAD:
18683 + case JNODE_BITMAP:
18684 + case JNODE_UNFORMATTED_BLOCK:
18685 + jfree(node);
18686 + break;
18687 + case JNODE_FORMATTED_BLOCK:
18688 + zfree(JZNODE(node));
18689 + break;
18690 + case JNODE_INODE:
18691 + default:
18692 + wrong_return_value("nikita-3197", "Wrong jnode type");
18693 + }
18694 +}
18695 +
18696 +/*
18697 + * Free a jnode. Post a callback to be executed later through RCU when all
18698 + * references to @node are released.
18699 + */
18700 +static inline void jnode_free(jnode * node, jnode_type jtype)
18701 +{
18702 + if (jtype != JNODE_INODE) {
18703 + /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18704 + call_rcu(&node->rcu, jnode_free_actor);
18705 + } else
18706 + jnode_list_remove(node);
18707 +}
18708 +
18709 +/* allocate new unformatted jnode */
18710 +static jnode *jnew_unformatted(void)
18711 +{
18712 + jnode *jal;
18713 +
18714 + jal = jalloc();
18715 + if (jal == NULL)
18716 + return NULL;
18717 +
18718 + jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18719 + jal->key.j.mapping = NULL;
18720 + jal->key.j.index = (unsigned long)-1;
18721 + jal->key.j.objectid = 0;
18722 + return jal;
18723 +}
18724 +
18725 +/* look for jnode with given mapping and offset within hash table */
18726 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18727 +{
18728 + struct jnode_key jkey;
18729 + jnode *node;
18730 +
18731 + assert("nikita-2353", tree != NULL);
18732 +
18733 + jkey.objectid = objectid;
18734 + jkey.index = index;
18735 +
18736 + /*
18737 + * hash table is _not_ protected by any lock during lookups. All we
18738 + * have to do is to disable preemption to keep RCU happy.
18739 + */
18740 +
18741 + rcu_read_lock();
18742 + node = j_hash_find(&tree->jhash_table, &jkey);
18743 + if (node != NULL) {
18744 + /* protect @node from recycling */
18745 + jref(node);
18746 + assert("nikita-2955", jnode_invariant(node, 0, 0));
18747 + node = jnode_rip_check(tree, node);
18748 + }
18749 + rcu_read_unlock();
18750 + return node;
18751 +}
18752 +
18753 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18754 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18755 +{
18756 + assert("vs-1694", mapping->host != NULL);
18757 +
18758 + return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18759 +}
18760 +
18761 +jnode *jfind(struct address_space * mapping, unsigned long index)
18762 +{
18763 + reiser4_tree *tree;
18764 + jnode *node;
18765 +
18766 + assert("vs-1694", mapping->host != NULL);
18767 + tree = reiser4_tree_by_inode(mapping->host);
18768 +
18769 + read_lock_tree(tree);
18770 + node = jfind_nolock(mapping, index);
18771 + if (node != NULL)
18772 + jref(node);
18773 + read_unlock_tree(tree);
18774 + return node;
18775 +}
18776 +
18777 +static void inode_attach_jnode(jnode * node)
18778 +{
18779 + struct inode *inode;
18780 + reiser4_inode *info;
18781 + struct radix_tree_root *rtree;
18782 +
18783 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18784 + assert("zam-1043", node->key.j.mapping != NULL);
18785 + inode = node->key.j.mapping->host;
18786 + info = reiser4_inode_data(inode);
18787 + rtree = jnode_tree_by_reiser4_inode(info);
18788 + if (rtree->rnode == NULL) {
18789 + /* prevent inode from being pruned when it has jnodes attached
18790 + to it */
18791 + write_lock_irq(&inode->i_data.tree_lock);
18792 + inode->i_data.nrpages++;
18793 + write_unlock_irq(&inode->i_data.tree_lock);
18794 + }
18795 + assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18796 + check_me("zam-1045",
18797 + !radix_tree_insert(rtree, node->key.j.index, node));
18798 + ON_DEBUG(info->nr_jnodes++);
18799 +}
18800 +
18801 +static void inode_detach_jnode(jnode * node)
18802 +{
18803 + struct inode *inode;
18804 + reiser4_inode *info;
18805 + struct radix_tree_root *rtree;
18806 +
18807 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18808 + assert("zam-1044", node->key.j.mapping != NULL);
18809 + inode = node->key.j.mapping->host;
18810 + info = reiser4_inode_data(inode);
18811 + rtree = jnode_tree_by_reiser4_inode(info);
18812 +
18813 + assert("zam-1051", info->nr_jnodes != 0);
18814 + assert("zam-1052", rtree->rnode != NULL);
18815 + ON_DEBUG(info->nr_jnodes--);
18816 +
18817 + /* delete jnode from inode's radix tree of jnodes */
18818 + check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18819 + if (rtree->rnode == NULL) {
18820 + /* inode can be pruned now */
18821 + write_lock_irq(&inode->i_data.tree_lock);
18822 + inode->i_data.nrpages--;
18823 + write_unlock_irq(&inode->i_data.tree_lock);
18824 + }
18825 +}
18826 +
18827 +/* put jnode into hash table (where they can be found by flush who does not know
18828 + mapping) and to inode's tree of jnodes (where they can be found (hopefully
18829 + faster) in places where mapping is known). Currently it is used by
18830 + fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
18831 + created */
18832 +static void
18833 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18834 + unsigned long index)
18835 +{
18836 + j_hash_table *jtable;
18837 +
18838 + assert("vs-1446", jnode_is_unformatted(node));
18839 + assert("vs-1442", node->key.j.mapping == 0);
18840 + assert("vs-1443", node->key.j.objectid == 0);
18841 + assert("vs-1444", node->key.j.index == (unsigned long)-1);
18842 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18843 +
18844 + node->key.j.mapping = mapping;
18845 + node->key.j.objectid = get_inode_oid(mapping->host);
18846 + node->key.j.index = index;
18847 +
18848 + jtable = &jnode_get_tree(node)->jhash_table;
18849 +
18850 + /* race with some other thread inserting jnode into the hash table is
18851 + * impossible, because we keep the page lock. */
18852 + /*
18853 + * following assertion no longer holds because of RCU: it is possible
18854 + * jnode is in the hash table, but with JNODE_RIP bit set.
18855 + */
18856 + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18857 + j_hash_insert_rcu(jtable, node);
18858 + inode_attach_jnode(node);
18859 +}
18860 +
18861 +static void unhash_unformatted_node_nolock(jnode * node)
18862 +{
18863 + assert("vs-1683", node->key.j.mapping != NULL);
18864 + assert("vs-1684",
18865 + node->key.j.objectid ==
18866 + get_inode_oid(node->key.j.mapping->host));
18867 +
18868 + /* remove jnode from hash-table */
18869 + j_hash_remove_rcu(&node->tree->jhash_table, node);
18870 + inode_detach_jnode(node);
18871 + node->key.j.mapping = NULL;
18872 + node->key.j.index = (unsigned long)-1;
18873 + node->key.j.objectid = 0;
18874 +
18875 +}
18876 +
18877 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18878 + reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18879 + reiser4_uncapture_jnode */
18880 +void unhash_unformatted_jnode(jnode * node)
18881 +{
18882 + assert("vs-1445", jnode_is_unformatted(node));
18883 +
18884 + write_lock_tree(node->tree);
18885 + unhash_unformatted_node_nolock(node);
18886 + write_unlock_tree(node->tree);
18887 +}
18888 +
18889 +/*
18890 + * search hash table for a jnode with given oid and index. If not found,
18891 + * allocate new jnode, insert it, and also insert into radix tree for the
18892 + * given inode/mapping.
18893 + */
18894 +static jnode *find_get_jnode(reiser4_tree * tree,
18895 + struct address_space *mapping,
18896 + oid_t oid, unsigned long index)
18897 +{
18898 + jnode *result;
18899 + jnode *shadow;
18900 + int preload;
18901 +
18902 + result = jnew_unformatted();
18903 +
18904 + if (unlikely(result == NULL))
18905 + return ERR_PTR(RETERR(-ENOMEM));
18906 +
18907 + preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18908 + if (preload != 0)
18909 + return ERR_PTR(preload);
18910 +
18911 + write_lock_tree(tree);
18912 + shadow = jfind_nolock(mapping, index);
18913 + if (likely(shadow == NULL)) {
18914 + /* add new jnode to hash table and inode's radix tree of jnodes */
18915 + jref(result);
18916 + hash_unformatted_jnode(result, mapping, index);
18917 + } else {
18918 + /* jnode is found in inode's radix tree of jnodes */
18919 + jref(shadow);
18920 + jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18921 + assert("vs-1498", shadow->key.j.mapping == mapping);
18922 + result = shadow;
18923 + }
18924 + write_unlock_tree(tree);
18925 +
18926 + assert("nikita-2955",
18927 + ergo(result != NULL, jnode_invariant(result, 0, 0)));
18928 + radix_tree_preload_end();
18929 + return result;
18930 +}
18931 +
18932 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18933 + creates) jnode corresponding to page @pg. jnode is attached to page and
18934 + inserted into jnode hash-table. */
18935 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18936 +{
18937 + /*
18938 + * There are two ways to create jnode: starting with pre-existing page
18939 + * and without page.
18940 + *
18941 + * When page already exists, jnode is created
18942 + * (jnode_of_page()->do_jget()) under page lock. This is done in
18943 + * ->writepage(), or when capturing anonymous page dirtied through
18944 + * mmap.
18945 + *
18946 + * Jnode without page is created by index_extent_jnode().
18947 + *
18948 + */
18949 +
18950 + jnode *result;
18951 + oid_t oid = get_inode_oid(pg->mapping->host);
18952 +
18953 + assert("umka-176", pg != NULL);
18954 + assert("nikita-2394", PageLocked(pg));
18955 +
18956 + result = jprivate(pg);
18957 + if (likely(result != NULL))
18958 + return jref(result);
18959 +
18960 + tree = reiser4_tree_by_page(pg);
18961 +
18962 + /* check hash-table first */
18963 + result = jfind(pg->mapping, pg->index);
18964 + if (unlikely(result != NULL)) {
18965 + spin_lock_jnode(result);
18966 + jnode_attach_page(result, pg);
18967 + spin_unlock_jnode(result);
18968 + result->key.j.mapping = pg->mapping;
18969 + return result;
18970 + }
18971 +
18972 + /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18973 + reiser4_ctx_gfp_mask_force(GFP_NOFS);
18974 + result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18975 + if (unlikely(IS_ERR(result)))
18976 + return result;
18977 + /* attach jnode to page */
18978 + spin_lock_jnode(result);
18979 + jnode_attach_page(result, pg);
18980 + spin_unlock_jnode(result);
18981 + return result;
18982 +}
18983 +
18984 +/*
18985 + * return jnode for @pg, creating it if necessary.
18986 + */
18987 +jnode *jnode_of_page(struct page * pg)
18988 +{
18989 + jnode *result;
18990 +
18991 + assert("umka-176", pg != NULL);
18992 + assert("nikita-2394", PageLocked(pg));
18993 +
18994 + result = do_jget(reiser4_tree_by_page(pg), pg);
18995 +
18996 + if (REISER4_DEBUG && !IS_ERR(result)) {
18997 + assert("nikita-3210", result == jprivate(pg));
18998 + assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18999 + if (jnode_is_unformatted(jprivate(pg))) {
19000 + assert("nikita-2364",
19001 + jprivate(pg)->key.j.index == pg->index);
19002 + assert("nikita-2367",
19003 + jprivate(pg)->key.j.mapping == pg->mapping);
19004 + assert("nikita-2365",
19005 + jprivate(pg)->key.j.objectid ==
19006 + get_inode_oid(pg->mapping->host));
19007 + assert("vs-1200",
19008 + jprivate(pg)->key.j.objectid ==
19009 + pg->mapping->host->i_ino);
19010 + assert("nikita-2356",
19011 + jnode_is_unformatted(jnode_by_page(pg)));
19012 + }
19013 + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19014 + }
19015 + return result;
19016 +}
19017 +
19018 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19019 + * page.*/
19020 +void jnode_attach_page(jnode * node, struct page *pg)
19021 +{
19022 + assert("nikita-2060", node != NULL);
19023 + assert("nikita-2061", pg != NULL);
19024 +
19025 + assert("nikita-2050", jprivate(pg) == 0ul);
19026 + assert("nikita-2393", !PagePrivate(pg));
19027 + assert("vs-1741", node->pg == NULL);
19028 +
19029 + assert("nikita-2396", PageLocked(pg));
19030 + assert_spin_locked(&(node->guard));
19031 +
19032 + page_cache_get(pg);
19033 + set_page_private(pg, (unsigned long)node);
19034 + node->pg = pg;
19035 + SetPagePrivate(pg);
19036 +}
19037 +
19038 +/* Dual to jnode_attach_page: break a binding between page and jnode */
19039 +void page_clear_jnode(struct page *page, jnode * node)
19040 +{
19041 + assert("nikita-2424", page != NULL);
19042 + assert("nikita-2425", PageLocked(page));
19043 + assert("nikita-2426", node != NULL);
19044 + assert_spin_locked(&(node->guard));
19045 + assert("nikita-2428", PagePrivate(page));
19046 +
19047 + assert("nikita-3551", !PageWriteback(page));
19048 +
19049 + JF_CLR(node, JNODE_PARSED);
19050 + set_page_private(page, 0ul);
19051 + ClearPagePrivate(page);
19052 + node->pg = NULL;
19053 + page_cache_release(page);
19054 +}
19055 +
19056 +#if 0
19057 +/* it is only used in one place to handle error */
19058 +void
19059 +page_detach_jnode(struct page *page, struct address_space *mapping,
19060 + unsigned long index)
19061 +{
19062 + assert("nikita-2395", page != NULL);
19063 +
19064 + lock_page(page);
19065 + if ((page->mapping == mapping) && (page->index == index)
19066 + && PagePrivate(page)) {
19067 + jnode *node;
19068 +
19069 + node = jprivate(page);
19070 + spin_lock_jnode(node);
19071 + page_clear_jnode(page, node);
19072 + spin_unlock_jnode(node);
19073 + }
19074 + unlock_page(page);
19075 +}
19076 +#endif /* 0 */
19077 +
19078 +/* return @node page locked.
19079 +
19080 + Locking ordering requires that one first takes page lock and afterwards
19081 + spin lock on node attached to this page. Sometimes it is necessary to go in
19082 + the opposite direction. This is done through standard trylock-and-release
19083 + loop.
19084 +*/
19085 +static struct page *jnode_lock_page(jnode * node)
19086 +{
19087 + struct page *page;
19088 +
19089 + assert("nikita-2052", node != NULL);
19090 + assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19091 +
19092 + while (1) {
19093 +
19094 + spin_lock_jnode(node);
19095 + page = jnode_page(node);
19096 + if (page == NULL) {
19097 + break;
19098 + }
19099 +
19100 + /* no need to page_cache_get( page ) here, because page cannot
19101 + be evicted from memory without detaching it from jnode and
19102 + this requires spin lock on jnode that we already hold.
19103 + */
19104 + if (!TestSetPageLocked(page)) {
19105 + /* We won a lock on jnode page, proceed. */
19106 + break;
19107 + }
19108 +
19109 + /* Page is locked by someone else. */
19110 + page_cache_get(page);
19111 + spin_unlock_jnode(node);
19112 + wait_on_page_locked(page);
19113 + /* it is possible that page was detached from jnode and
19114 + returned to the free pool, or re-assigned while we were
19115 + waiting on locked bit. This will be rechecked on the next
19116 + loop iteration.
19117 + */
19118 + page_cache_release(page);
19119 +
19120 + /* try again */
19121 + }
19122 + return page;
19123 +}
19124 +
19125 +/*
19126 + * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify
19127 + * validness of jnode content.
19128 + */
19129 +static inline int jparse(jnode * node)
19130 +{
19131 + int result;
19132 +
19133 + assert("nikita-2466", node != NULL);
19134 +
19135 + spin_lock_jnode(node);
19136 + if (likely(!jnode_is_parsed(node))) {
19137 + result = jnode_ops(node)->parse(node);
19138 + if (likely(result == 0))
19139 + JF_SET(node, JNODE_PARSED);
19140 + } else
19141 + result = 0;
19142 + spin_unlock_jnode(node);
19143 + return result;
19144 +}
19145 +
19146 +/* Lock a page attached to jnode, create and attach page to jnode if it had no
19147 + * one. */
19148 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19149 +{
19150 + struct page *page;
19151 +
19152 + spin_lock_jnode(node);
19153 + page = jnode_page(node);
19154 +
19155 + if (page == NULL) {
19156 + spin_unlock_jnode(node);
19157 + page = find_or_create_page(jnode_get_mapping(node),
19158 + jnode_get_index(node), gfp_flags);
19159 + if (page == NULL)
19160 + return ERR_PTR(RETERR(-ENOMEM));
19161 + } else {
19162 + if (!TestSetPageLocked(page)) {
19163 + spin_unlock_jnode(node);
19164 + return page;
19165 + }
19166 + page_cache_get(page);
19167 + spin_unlock_jnode(node);
19168 + lock_page(page);
19169 + assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19170 + }
19171 +
19172 + spin_lock_jnode(node);
19173 + if (!jnode_page(node))
19174 + jnode_attach_page(node, page);
19175 + spin_unlock_jnode(node);
19176 +
19177 + page_cache_release(page);
19178 + assert("zam-894", jnode_page(node) == page);
19179 + return page;
19180 +}
19181 +
19182 +/* Start read operation for jnode's page if page is not up-to-date. */
19183 +static int jnode_start_read(jnode * node, struct page *page)
19184 +{
19185 + assert("zam-893", PageLocked(page));
19186 +
19187 + if (PageUptodate(page)) {
19188 + unlock_page(page);
19189 + return 0;
19190 + }
19191 + return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19192 +}
19193 +
19194 +#if REISER4_DEBUG
19195 +static void check_jload(jnode * node, struct page *page)
19196 +{
19197 + if (jnode_is_znode(node)) {
19198 + node40_header *nh;
19199 + znode *z;
19200 +
19201 + z = JZNODE(node);
19202 + if (znode_is_any_locked(z)) {
19203 + nh = (node40_header *) kmap(page);
19204 + /* this only works for node40-only file systems. For
19205 + * debugging. */
19206 + assert("nikita-3253",
19207 + z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19208 + kunmap(page);
19209 + }
19210 + assert("nikita-3565", znode_invariant(z));
19211 + }
19212 +}
19213 +#else
19214 +#define check_jload(node, page) noop
19215 +#endif
19216 +
19217 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19218 + * to call jload() shortly. This will bring appropriate portion of jnode into
19219 + * CPU cache. */
19220 +void jload_prefetch(jnode * node)
19221 +{
19222 + prefetchw(&node->x_count);
19223 +}
19224 +
19225 +/* load jnode's data into memory */
19226 +int jload_gfp(jnode * node /* node to load */ ,
19227 + gfp_t gfp_flags /* allocation flags */ ,
19228 + int do_kmap /* true if page should be kmapped */ )
19229 +{
19230 + struct page *page;
19231 + int result = 0;
19232 + int parsed;
19233 +
19234 + assert("nikita-3010", reiser4_schedulable());
19235 +
19236 + prefetchw(&node->pg);
19237 +
19238 + /* taking d-reference implies taking x-reference. */
19239 + jref(node);
19240 +
19241 + /*
19242 + * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19243 + * should be atomic, otherwise there is a race against
19244 + * reiser4_releasepage().
19245 + */
19246 + spin_lock(&(node->load));
19247 + add_d_ref(node);
19248 + parsed = jnode_is_parsed(node);
19249 + spin_unlock(&(node->load));
19250 +
19251 + if (unlikely(!parsed)) {
19252 + page = jnode_get_page_locked(node, gfp_flags);
19253 + if (unlikely(IS_ERR(page))) {
19254 + result = PTR_ERR(page);
19255 + goto failed;
19256 + }
19257 +
19258 + result = jnode_start_read(node, page);
19259 + if (unlikely(result != 0))
19260 + goto failed;
19261 +
19262 + wait_on_page_locked(page);
19263 + if (unlikely(!PageUptodate(page))) {
19264 + result = RETERR(-EIO);
19265 + goto failed;
19266 + }
19267 +
19268 + if (do_kmap)
19269 + node->data = kmap(page);
19270 +
19271 + result = jparse(node);
19272 + if (unlikely(result != 0)) {
19273 + if (do_kmap)
19274 + kunmap(page);
19275 + goto failed;
19276 + }
19277 + check_jload(node, page);
19278 + } else {
19279 + page = jnode_page(node);
19280 + check_jload(node, page);
19281 + if (do_kmap)
19282 + node->data = kmap(page);
19283 + }
19284 +
19285 + if (!is_writeout_mode())
19286 + /* We do not mark pages active if jload is called as a part of
19287 + * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19288 + * and write_logs() add no value to cached data, there is no
19289 + * sense to mark pages as active when they go to disk, it just
19290 + * confuses vm scanning routines because clean page could be
19291 + * moved out from inactive list as a result of this
19292 + * mark_page_accessed() call. */
19293 + mark_page_accessed(page);
19294 +
19295 + return 0;
19296 +
19297 + failed:
19298 + jrelse_tail(node);
19299 + return result;
19300 +
19301 +}
19302 +
19303 +/* start asynchronous reading for given jnode's page. */
19304 +int jstartio(jnode * node)
19305 +{
19306 + struct page *page;
19307 +
19308 + page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19309 + if (IS_ERR(page))
19310 + return PTR_ERR(page);
19311 +
19312 + return jnode_start_read(node, page);
19313 +}
19314 +
19315 +/* Initialize a node by calling appropriate plugin instead of reading
19316 + * node from disk as in jload(). */
19317 +int jinit_new(jnode * node, gfp_t gfp_flags)
19318 +{
19319 + struct page *page;
19320 + int result;
19321 +
19322 + jref(node);
19323 + add_d_ref(node);
19324 +
19325 + page = jnode_get_page_locked(node, gfp_flags);
19326 + if (IS_ERR(page)) {
19327 + result = PTR_ERR(page);
19328 + goto failed;
19329 + }
19330 +
19331 + SetPageUptodate(page);
19332 + unlock_page(page);
19333 +
19334 + node->data = kmap(page);
19335 +
19336 + if (!jnode_is_parsed(node)) {
19337 + jnode_plugin *jplug = jnode_ops(node);
19338 + spin_lock_jnode(node);
19339 + result = jplug->init(node);
19340 + spin_unlock_jnode(node);
19341 + if (result) {
19342 + kunmap(page);
19343 + goto failed;
19344 + }
19345 + JF_SET(node, JNODE_PARSED);
19346 + }
19347 +
19348 + return 0;
19349 +
19350 + failed:
19351 + jrelse(node);
19352 + return result;
19353 +}
19354 +
19355 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19356 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19357 +{
19358 + assert("nikita-489", atomic_read(&node->d_count) > 0);
19359 + atomic_dec(&node->d_count);
19360 + /* release reference acquired in jload_gfp() or jinit_new() */
19361 + jput(node);
19362 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
19363 + LOCK_CNT_DEC(d_refs);
19364 +}
19365 +
19366 +/* drop reference to node data. When last reference is dropped, data are
19367 + unloaded. */
19368 +void jrelse(jnode * node /* jnode to release references to */ )
19369 +{
19370 + struct page *page;
19371 +
19372 + assert("nikita-487", node != NULL);
19373 + assert_spin_not_locked(&(node->guard));
19374 +
19375 + page = jnode_page(node);
19376 + if (likely(page != NULL)) {
19377 + /*
19378 + * it is safe not to lock jnode here, because at this point
19379 + * @node->d_count is greater than zero (if jrelse() is used
19380 + * correctly, that is). JNODE_PARSED may be not set yet, if,
19381 + * for example, we got here as a result of error handling path
19382 + * in jload(). Anyway, page cannot be detached by
19383 + * reiser4_releasepage(). truncate will invalidate page
19384 + * regardless, but this should not be a problem.
19385 + */
19386 + kunmap(page);
19387 + }
19388 + jrelse_tail(node);
19389 +}
19390 +
19391 +/* called from jput() to wait for io completion */
19392 +static void jnode_finish_io(jnode * node)
19393 +{
19394 + struct page *page;
19395 +
19396 + assert("nikita-2922", node != NULL);
19397 +
19398 + spin_lock_jnode(node);
19399 + page = jnode_page(node);
19400 + if (page != NULL) {
19401 + page_cache_get(page);
19402 + spin_unlock_jnode(node);
19403 + wait_on_page_writeback(page);
19404 + page_cache_release(page);
19405 + } else
19406 + spin_unlock_jnode(node);
19407 +}
19408 +
19409 +/*
19410 + * This is called by jput() when last reference to jnode is released. This is
19411 + * separate function, because we want fast path of jput() to be inline and,
19412 + * therefore, small.
19413 + */
19414 +void jput_final(jnode * node)
19415 +{
19416 + int r_i_p;
19417 +
19418 + /* A fast check for keeping node in cache. We always keep node in cache
19419 + * if its page is present and node was not marked for deletion */
19420 + if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19421 + rcu_read_unlock();
19422 + return;
19423 + }
19424 + r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19425 + /*
19426 + * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19427 + * this case it is safe to access node after unlock.
19428 + */
19429 + rcu_read_unlock();
19430 + if (r_i_p) {
19431 + jnode_finish_io(node);
19432 + if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19433 + /* node is removed from the tree. */
19434 + jdelete(node);
19435 + else
19436 + jnode_try_drop(node);
19437 + }
19438 + /* if !r_i_p some other thread is already killing it */
19439 +}
19440 +
19441 +int jwait_io(jnode * node, int rw)
19442 +{
19443 + struct page *page;
19444 + int result;
19445 +
19446 + assert("zam-447", node != NULL);
19447 + assert("zam-448", jnode_page(node) != NULL);
19448 +
19449 + page = jnode_page(node);
19450 +
19451 + result = 0;
19452 + if (rw == READ) {
19453 + wait_on_page_locked(page);
19454 + } else {
19455 + assert("nikita-2227", rw == WRITE);
19456 + wait_on_page_writeback(page);
19457 + }
19458 + if (PageError(page))
19459 + result = RETERR(-EIO);
19460 +
19461 + return result;
19462 +}
19463 +
19464 +/*
19465 + * jnode types and plugins.
19466 + *
19467 + * jnode by itself is a "base type". There are several different jnode
19468 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19469 + * has to do different things based on jnode type. In the standard reiser4 way
19470 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19471 + *
19472 + * Functions below deal with jnode types and define methods of jnode plugin.
19473 + *
19474 + */
19475 +
19476 +/* set jnode type. This is done during jnode initialization. */
19477 +static void jnode_set_type(jnode * node, jnode_type type)
19478 +{
19479 + static unsigned long type_to_mask[] = {
19480 + [JNODE_UNFORMATTED_BLOCK] = 1,
19481 + [JNODE_FORMATTED_BLOCK] = 0,
19482 + [JNODE_BITMAP] = 2,
19483 + [JNODE_IO_HEAD] = 6,
19484 + [JNODE_INODE] = 4
19485 + };
19486 +
19487 + assert("zam-647", type < LAST_JNODE_TYPE);
19488 + assert("nikita-2815", !jnode_is_loaded(node));
19489 + assert("nikita-3386", node->state == 0);
19490 +
19491 + node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19492 +}
19493 +
19494 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19495 + * specific initialization. */
19496 +static int init_noinit(jnode * node UNUSED_ARG)
19497 +{
19498 + return 0;
19499 +}
19500 +
19501 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19502 + * specific pasring. */
19503 +static int parse_noparse(jnode * node UNUSED_ARG)
19504 +{
19505 + return 0;
19506 +}
19507 +
19508 +/* ->mapping() method for unformatted jnode */
19509 +struct address_space *mapping_jnode(const jnode * node)
19510 +{
19511 + struct address_space *map;
19512 +
19513 + assert("nikita-2713", node != NULL);
19514 +
19515 + /* mapping is stored in jnode */
19516 +
19517 + map = node->key.j.mapping;
19518 + assert("nikita-2714", map != NULL);
19519 + assert("nikita-2897", is_reiser4_inode(map->host));
19520 + assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19521 + return map;
19522 +}
19523 +
19524 +/* ->index() method for unformatted jnodes */
19525 +unsigned long index_jnode(const jnode * node)
19526 +{
19527 + /* index is stored in jnode */
19528 + return node->key.j.index;
19529 +}
19530 +
19531 +/* ->remove() method for unformatted jnodes */
19532 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19533 +{
19534 + /* remove jnode from hash table and radix tree */
19535 + if (node->key.j.mapping)
19536 + unhash_unformatted_node_nolock(node);
19537 +}
19538 +
19539 +/* ->mapping() method for znodes */
19540 +static struct address_space *mapping_znode(const jnode * node)
19541 +{
19542 + /* all znodes belong to fake inode */
19543 + return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19544 +}
19545 +
19546 +/* ->index() method for znodes */
19547 +static unsigned long index_znode(const jnode * node)
19548 +{
19549 + unsigned long addr;
19550 + assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19551 +
19552 + /* index of znode is just its address (shifted) */
19553 + addr = (unsigned long)node;
19554 + return (addr - PAGE_OFFSET) >> znode_shift_order;
19555 +}
19556 +
19557 +/* ->mapping() method for bitmap jnode */
19558 +static struct address_space *mapping_bitmap(const jnode * node)
19559 +{
19560 + /* all bitmap blocks belong to special bitmap inode */
19561 + return get_super_private(jnode_get_tree(node)->super)->bitmap->
19562 + i_mapping;
19563 +}
19564 +
19565 +/* ->index() method for jnodes that are indexed by address */
19566 +static unsigned long index_is_address(const jnode * node)
19567 +{
19568 + unsigned long ind;
19569 +
19570 + ind = (unsigned long)node;
19571 + return ind - PAGE_OFFSET;
19572 +}
19573 +
19574 +/* resolve race with jput */
19575 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19576 +{
19577 + /*
19578 + * This is used as part of RCU-based jnode handling.
19579 + *
19580 + * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19581 + * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19582 + * not protected during this, so concurrent thread may execute
19583 + * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19584 + * freed in jput_final(). To avoid such races, jput_final() sets
19585 + * JNODE_RIP on jnode (under tree lock). All places that work with
19586 + * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19587 + * (first without taking tree lock), and if this bit is set, released
19588 + * reference acquired by the current thread and returns NULL.
19589 + *
19590 + * As a result, if jnode is being concurrently freed, NULL is returned
19591 + * and caller should pretend that jnode wasn't found in the first
19592 + * place.
19593 + *
19594 + * Otherwise it's safe to release "rcu-read-lock" and continue with
19595 + * jnode.
19596 + */
19597 + if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19598 + read_lock_tree(tree);
19599 + if (JF_ISSET(node, JNODE_RIP)) {
19600 + dec_x_ref(node);
19601 + node = NULL;
19602 + }
19603 + read_unlock_tree(tree);
19604 + }
19605 + return node;
19606 +}
19607 +
19608 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19609 +{
19610 + struct inode *inode;
19611 + item_plugin *iplug;
19612 + loff_t off;
19613 +
19614 + assert("nikita-3092", node != NULL);
19615 + assert("nikita-3093", key != NULL);
19616 + assert("nikita-3094", jnode_is_unformatted(node));
19617 +
19618 + off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19619 + inode = mapping_jnode(node)->host;
19620 +
19621 + if (node->parent_item_id != 0)
19622 + iplug = item_plugin_by_id(node->parent_item_id);
19623 + else
19624 + iplug = NULL;
19625 +
19626 + if (iplug != NULL && iplug->f.key_by_offset)
19627 + iplug->f.key_by_offset(inode, off, key);
19628 + else {
19629 + file_plugin *fplug;
19630 +
19631 + fplug = inode_file_plugin(inode);
19632 + assert("zam-1007", fplug != NULL);
19633 + assert("zam-1008", fplug->key_by_inode != NULL);
19634 +
19635 + fplug->key_by_inode(inode, off, key);
19636 + }
19637 +
19638 + return key;
19639 +}
19640 +
19641 +/* ->parse() method for formatted nodes */
19642 +static int parse_znode(jnode * node)
19643 +{
19644 + return zparse(JZNODE(node));
19645 +}
19646 +
19647 +/* ->delete() method for formatted nodes */
19648 +static void delete_znode(jnode * node, reiser4_tree * tree)
19649 +{
19650 + znode *z;
19651 +
19652 + assert_rw_write_locked(&(tree->tree_lock));
19653 + assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19654 +
19655 + z = JZNODE(node);
19656 + assert("vs-899", z->c_count == 0);
19657 +
19658 + /* delete znode from sibling list. */
19659 + sibling_list_remove(z);
19660 +
19661 + znode_remove(z, tree);
19662 +}
19663 +
19664 +/* ->remove() method for formatted nodes */
19665 +static int remove_znode(jnode * node, reiser4_tree * tree)
19666 +{
19667 + znode *z;
19668 +
19669 + assert_rw_write_locked(&(tree->tree_lock));
19670 + z = JZNODE(node);
19671 +
19672 + if (z->c_count == 0) {
19673 + /* detach znode from sibling list. */
19674 + sibling_list_drop(z);
19675 + /* this is called with tree spin-lock held, so call
19676 + znode_remove() directly (rather than znode_lock_remove()). */
19677 + znode_remove(z, tree);
19678 + return 0;
19679 + }
19680 + return RETERR(-EBUSY);
19681 +}
19682 +
19683 +/* ->init() method for formatted nodes */
19684 +static int init_znode(jnode * node)
19685 +{
19686 + znode *z;
19687 +
19688 + z = JZNODE(node);
19689 + /* call node plugin to do actual initialization */
19690 + return z->nplug->init(z);
19691 +}
19692 +
19693 +/* ->clone() method for formatted nodes */
19694 +static jnode *clone_formatted(jnode * node)
19695 +{
19696 + znode *clone;
19697 +
19698 + assert("vs-1430", jnode_is_znode(node));
19699 + clone = zalloc(reiser4_ctx_gfp_mask_get());
19700 + if (clone == NULL)
19701 + return ERR_PTR(RETERR(-ENOMEM));
19702 + zinit(clone, NULL, current_tree);
19703 + jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19704 + /* ZJNODE(clone)->key.z is not initialized */
19705 + clone->level = JZNODE(node)->level;
19706 +
19707 + return ZJNODE(clone);
19708 +}
19709 +
19710 +/* jplug->clone for unformatted nodes */
19711 +static jnode *clone_unformatted(jnode * node)
19712 +{
19713 + jnode *clone;
19714 +
19715 + assert("vs-1431", jnode_is_unformatted(node));
19716 + clone = jalloc();
19717 + if (clone == NULL)
19718 + return ERR_PTR(RETERR(-ENOMEM));
19719 +
19720 + jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19721 + jnode_set_block(clone, jnode_get_block(node));
19722 +
19723 + return clone;
19724 +
19725 +}
19726 +
19727 +/*
19728 + * Setup jnode plugin methods for various jnode types.
19729 + */
19730 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19731 + [JNODE_UNFORMATTED_BLOCK] = {
19732 + .h = {
19733 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19734 + .id = JNODE_UNFORMATTED_BLOCK,
19735 + .pops = NULL,
19736 + .label = "unformatted",
19737 + .desc = "unformatted node",
19738 + .linkage = {NULL, NULL}
19739 + },
19740 + .init = init_noinit,
19741 + .parse = parse_noparse,
19742 + .mapping = mapping_jnode,
19743 + .index = index_jnode,
19744 + .clone = clone_unformatted
19745 + },
19746 + [JNODE_FORMATTED_BLOCK] = {
19747 + .h = {
19748 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19749 + .id = JNODE_FORMATTED_BLOCK,
19750 + .pops = NULL,
19751 + .label = "formatted",
19752 + .desc = "formatted tree node",
19753 + .linkage = {NULL, NULL}
19754 + },
19755 + .init = init_znode,
19756 + .parse = parse_znode,
19757 + .mapping = mapping_znode,
19758 + .index = index_znode,
19759 + .clone = clone_formatted
19760 + },
19761 + [JNODE_BITMAP] = {
19762 + .h = {
19763 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19764 + .id = JNODE_BITMAP,
19765 + .pops = NULL,
19766 + .label = "bitmap",
19767 + .desc = "bitmap node",
19768 + .linkage = {NULL, NULL}
19769 + },
19770 + .init = init_noinit,
19771 + .parse = parse_noparse,
19772 + .mapping = mapping_bitmap,
19773 + .index = index_is_address,
19774 + .clone = NULL
19775 + },
19776 + [JNODE_IO_HEAD] = {
19777 + .h = {
19778 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19779 + .id = JNODE_IO_HEAD,
19780 + .pops = NULL,
19781 + .label = "io head",
19782 + .desc = "io head",
19783 + .linkage = {NULL, NULL}
19784 + },
19785 + .init = init_noinit,
19786 + .parse = parse_noparse,
19787 + .mapping = mapping_bitmap,
19788 + .index = index_is_address,
19789 + .clone = NULL
19790 + },
19791 + [JNODE_INODE] = {
19792 + .h = {
19793 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
19794 + .id = JNODE_INODE,
19795 + .pops = NULL,
19796 + .label = "inode",
19797 + .desc = "inode's builtin jnode",
19798 + .linkage = {NULL, NULL}
19799 + },
19800 + .init = NULL,
19801 + .parse = NULL,
19802 + .mapping = NULL,
19803 + .index = NULL,
19804 + .clone = NULL
19805 + }
19806 +};
19807 +
19808 +/*
19809 + * jnode destruction.
19810 + *
19811 + * Thread may use a jnode after it acquired a reference to it. References are
19812 + * counted in ->x_count field. Reference protects jnode from being
19813 + * recycled. This is different from protecting jnode data (that are stored in
19814 + * jnode page) from being evicted from memory. Data are protected by jload()
19815 + * and released by jrelse().
19816 + *
19817 + * If thread already possesses a reference to the jnode it can acquire another
19818 + * one through jref(). Initial reference is obtained (usually) by locating
19819 + * jnode in some indexing structure that depends on jnode type: formatted
19820 + * nodes are kept in global hash table, where they are indexed by block
19821 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19822 + * table, which is indexed by oid and offset within file, and in per-inode
19823 + * radix tree.
19824 + *
19825 + * Reference to jnode is released by jput(). If last reference is released,
19826 + * jput_final() is called. This function determines whether jnode has to be
19827 + * deleted (this happens when corresponding node is removed from the file
19828 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19829 + * should be just "removed" (deleted from memory).
19830 + *
19831 + * Jnode destruction is signally delicate dance because of locking and RCU.
19832 + */
19833 +
19834 +/*
19835 + * Returns true if jnode cannot be removed right now. This check is called
19836 + * under tree lock. If it returns true, jnode is irrevocably committed to be
19837 + * deleted/removed.
19838 + */
19839 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19840 +{
19841 + /* if other thread managed to acquire a reference to this jnode, don't
19842 + * free it. */
19843 + if (atomic_read(&node->x_count) > 0)
19844 + return 1;
19845 + /* also, don't free znode that has children in memory */
19846 + if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19847 + return 1;
19848 + return 0;
19849 +}
19850 +
19851 +/*
19852 + * this is called as part of removing jnode. Based on jnode type, call
19853 + * corresponding function that removes jnode from indices and returns it back
19854 + * to the appropriate slab (through RCU).
19855 + */
19856 +static inline void
19857 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19858 +{
19859 + switch (jtype) {
19860 + case JNODE_UNFORMATTED_BLOCK:
19861 + remove_jnode(node, tree);
19862 + break;
19863 + case JNODE_IO_HEAD:
19864 + case JNODE_BITMAP:
19865 + break;
19866 + case JNODE_INODE:
19867 + break;
19868 + case JNODE_FORMATTED_BLOCK:
19869 + remove_znode(node, tree);
19870 + break;
19871 + default:
19872 + wrong_return_value("nikita-3196", "Wrong jnode type");
19873 + }
19874 +}
19875 +
19876 +/*
19877 + * this is called as part of deleting jnode. Based on jnode type, call
19878 + * corresponding function that removes jnode from indices and returns it back
19879 + * to the appropriate slab (through RCU).
19880 + *
19881 + * This differs from jnode_remove() only for formatted nodes---for them
19882 + * sibling list handling is different for removal and deletion.
19883 + */
19884 +static inline void
19885 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
19886 +{
19887 + switch (jtype) {
19888 + case JNODE_UNFORMATTED_BLOCK:
19889 + remove_jnode(node, tree);
19890 + break;
19891 + case JNODE_IO_HEAD:
19892 + case JNODE_BITMAP:
19893 + break;
19894 + case JNODE_FORMATTED_BLOCK:
19895 + delete_znode(node, tree);
19896 + break;
19897 + case JNODE_INODE:
19898 + default:
19899 + wrong_return_value("nikita-3195", "Wrong jnode type");
19900 + }
19901 +}
19902 +
19903 +#if REISER4_DEBUG
19904 +/*
19905 + * remove jnode from the debugging list of all jnodes hanging off super-block.
19906 + */
19907 +void jnode_list_remove(jnode * node)
19908 +{
19909 + reiser4_super_info_data *sbinfo;
19910 +
19911 + sbinfo = get_super_private(jnode_get_tree(node)->super);
19912 +
19913 + spin_lock_irq(&sbinfo->all_guard);
19914 + assert("nikita-2422", !list_empty(&node->jnodes));
19915 + list_del_init(&node->jnodes);
19916 + spin_unlock_irq(&sbinfo->all_guard);
19917 +}
19918 +#endif
19919 +
19920 +/*
19921 + * this is called by jput_final() to remove jnode when last reference to it is
19922 + * released.
19923 + */
19924 +static int jnode_try_drop(jnode * node)
19925 +{
19926 + int result;
19927 + reiser4_tree *tree;
19928 + jnode_type jtype;
19929 +
19930 + assert("nikita-2491", node != NULL);
19931 + assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19932 +
19933 + tree = jnode_get_tree(node);
19934 + jtype = jnode_get_type(node);
19935 +
19936 + spin_lock_jnode(node);
19937 + write_lock_tree(tree);
19938 + /*
19939 + * if jnode has a page---leave it alone. Memory pressure will
19940 + * eventually kill page and jnode.
19941 + */
19942 + if (jnode_page(node) != NULL) {
19943 + write_unlock_tree(tree);
19944 + spin_unlock_jnode(node);
19945 + JF_CLR(node, JNODE_RIP);
19946 + return RETERR(-EBUSY);
19947 + }
19948 +
19949 + /* re-check ->x_count under tree lock. */
19950 + result = jnode_is_busy(node, jtype);
19951 + if (result == 0) {
19952 + assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19953 + assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19954 +
19955 + spin_unlock_jnode(node);
19956 + /* no page and no references---despatch him. */
19957 + jnode_remove(node, jtype, tree);
19958 + write_unlock_tree(tree);
19959 + jnode_free(node, jtype);
19960 + } else {
19961 + /* busy check failed: reference was acquired by concurrent
19962 + * thread. */
19963 + write_unlock_tree(tree);
19964 + spin_unlock_jnode(node);
19965 + JF_CLR(node, JNODE_RIP);
19966 + }
19967 + return result;
19968 +}
19969 +
19970 +/* jdelete() -- Delete jnode from the tree and file system */
19971 +static int jdelete(jnode * node /* jnode to finish with */ )
19972 +{
19973 + struct page *page;
19974 + int result;
19975 + reiser4_tree *tree;
19976 + jnode_type jtype;
19977 +
19978 + assert("nikita-467", node != NULL);
19979 + assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19980 +
19981 + jtype = jnode_get_type(node);
19982 +
19983 + page = jnode_lock_page(node);
19984 + assert_spin_locked(&(node->guard));
19985 +
19986 + tree = jnode_get_tree(node);
19987 +
19988 + write_lock_tree(tree);
19989 + /* re-check ->x_count under tree lock. */
19990 + result = jnode_is_busy(node, jtype);
19991 + if (likely(!result)) {
19992 + assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19993 + assert("jmacd-511", atomic_read(&node->d_count) == 0);
19994 +
19995 + /* detach page */
19996 + if (page != NULL) {
19997 + /*
19998 + * FIXME this is racy against jnode_extent_write().
19999 + */
20000 + page_clear_jnode(page, node);
20001 + }
20002 + spin_unlock_jnode(node);
20003 + /* goodbye */
20004 + jnode_delete(node, jtype, tree);
20005 + write_unlock_tree(tree);
20006 + jnode_free(node, jtype);
20007 + /* @node is no longer valid pointer */
20008 + if (page != NULL)
20009 + reiser4_drop_page(page);
20010 + } else {
20011 + /* busy check failed: reference was acquired by concurrent
20012 + * thread. */
20013 + JF_CLR(node, JNODE_RIP);
20014 + write_unlock_tree(tree);
20015 + spin_unlock_jnode(node);
20016 + if (page != NULL)
20017 + unlock_page(page);
20018 + }
20019 + return result;
20020 +}
20021 +
20022 +/* drop jnode on the floor.
20023 +
20024 + Return value:
20025 +
20026 + -EBUSY: failed to drop jnode, because there are still references to it
20027 +
20028 + 0: successfully dropped jnode
20029 +
20030 +*/
20031 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20032 +{
20033 + struct page *page;
20034 + jnode_type jtype;
20035 + int result;
20036 +
20037 + assert("zam-602", node != NULL);
20038 + assert_rw_not_read_locked(&(tree->tree_lock));
20039 + assert_rw_not_write_locked(&(tree->tree_lock));
20040 + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20041 +
20042 + jtype = jnode_get_type(node);
20043 +
20044 + page = jnode_lock_page(node);
20045 + assert_spin_locked(&(node->guard));
20046 +
20047 + write_lock_tree(tree);
20048 +
20049 + /* re-check ->x_count under tree lock. */
20050 + result = jnode_is_busy(node, jtype);
20051 + if (!result) {
20052 + assert("nikita-2488", page == jnode_page(node));
20053 + assert("nikita-2533", atomic_read(&node->d_count) == 0);
20054 + if (page != NULL) {
20055 + assert("nikita-2126", !PageDirty(page));
20056 + assert("nikita-2127", PageUptodate(page));
20057 + assert("nikita-2181", PageLocked(page));
20058 + page_clear_jnode(page, node);
20059 + }
20060 + spin_unlock_jnode(node);
20061 + jnode_remove(node, jtype, tree);
20062 + write_unlock_tree(tree);
20063 + jnode_free(node, jtype);
20064 + if (page != NULL) {
20065 + reiser4_drop_page(page);
20066 + }
20067 + } else {
20068 + /* busy check failed: reference was acquired by concurrent
20069 + * thread. */
20070 + JF_CLR(node, JNODE_RIP);
20071 + write_unlock_tree(tree);
20072 + spin_unlock_jnode(node);
20073 + if (page != NULL)
20074 + unlock_page(page);
20075 + }
20076 + return result;
20077 +}
20078 +
20079 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20080 + be 0 (where applicable). */
20081 +void jdrop(jnode * node)
20082 +{
20083 + jdrop_in_tree(node, jnode_get_tree(node));
20084 +}
20085 +
20086 +/* IO head jnode implementation; The io heads are simple j-nodes with limited
20087 + functionality (these j-nodes are not in any hash table) just for reading
20088 + from and writing to disk. */
20089 +
20090 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20091 +{
20092 + jnode *jal = jalloc();
20093 +
20094 + if (jal != NULL) {
20095 + jnode_init(jal, current_tree, JNODE_IO_HEAD);
20096 + jnode_set_block(jal, block);
20097 + }
20098 +
20099 + jref(jal);
20100 +
20101 + return jal;
20102 +}
20103 +
20104 +void reiser4_drop_io_head(jnode * node)
20105 +{
20106 + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20107 +
20108 + jput(node);
20109 + jdrop(node);
20110 +}
20111 +
20112 +/* protect keep jnode data from reiser4_releasepage() */
20113 +void pin_jnode_data(jnode * node)
20114 +{
20115 + assert("zam-671", jnode_page(node) != NULL);
20116 + page_cache_get(jnode_page(node));
20117 +}
20118 +
20119 +/* make jnode data free-able again */
20120 +void unpin_jnode_data(jnode * node)
20121 +{
20122 + assert("zam-672", jnode_page(node) != NULL);
20123 + page_cache_release(jnode_page(node));
20124 +}
20125 +
20126 +struct address_space *jnode_get_mapping(const jnode * node)
20127 +{
20128 + assert("nikita-3162", node != NULL);
20129 + return jnode_ops(node)->mapping(node);
20130 +}
20131 +
20132 +#if REISER4_DEBUG
20133 +/* debugging aid: jnode invariant */
20134 +int jnode_invariant_f(const jnode * node, char const **msg)
20135 +{
20136 +#define _ergo(ant, con) \
20137 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20138 +#define _check(exp) ((*msg) = #exp, (exp))
20139 +
20140 + return _check(node != NULL) &&
20141 + /* [jnode-queued] */
20142 + /* only relocated node can be queued, except that when znode
20143 + * is being deleted, its JNODE_RELOC bit is cleared */
20144 + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20145 + JF_ISSET(node, JNODE_RELOC) ||
20146 + JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20147 + _check(node->jnodes.prev != NULL) &&
20148 + _check(node->jnodes.next != NULL) &&
20149 + /* [jnode-dirty] invariant */
20150 + /* dirty inode is part of atom */
20151 + _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20152 + /* [jnode-oid] invariant */
20153 + /* for unformatted node ->objectid and ->mapping fields are
20154 + * consistent */
20155 + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20156 + node->key.j.objectid ==
20157 + get_inode_oid(node->key.j.mapping->host)) &&
20158 + /* [jnode-atom-valid] invariant */
20159 + /* node atom has valid state */
20160 + _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20161 + /* [jnode-page-binding] invariant */
20162 + /* if node points to page, it points back to node */
20163 + _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20164 + /* [jnode-refs] invariant */
20165 + /* only referenced jnode can be loaded */
20166 + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20167 +
20168 +}
20169 +
20170 +static const char *jnode_type_name(jnode_type type)
20171 +{
20172 + switch (type) {
20173 + case JNODE_UNFORMATTED_BLOCK:
20174 + return "unformatted";
20175 + case JNODE_FORMATTED_BLOCK:
20176 + return "formatted";
20177 + case JNODE_BITMAP:
20178 + return "bitmap";
20179 + case JNODE_IO_HEAD:
20180 + return "io head";
20181 + case JNODE_INODE:
20182 + return "inode";
20183 + case LAST_JNODE_TYPE:
20184 + return "last";
20185 + default:{
20186 + static char unknown[30];
20187 +
20188 + sprintf(unknown, "unknown %i", type);
20189 + return unknown;
20190 + }
20191 + }
20192 +}
20193 +
20194 +#define jnode_state_name( node, flag ) \
20195 + ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20196 +
20197 +/* debugging aid: output human readable information about @node */
20198 +static void info_jnode(const char *prefix /* prefix to print */ ,
20199 + const jnode * node /* node to print */ )
20200 +{
20201 + assert("umka-068", prefix != NULL);
20202 +
20203 + if (node == NULL) {
20204 + printk("%s: null\n", prefix);
20205 + return;
20206 + }
20207 +
20208 + printk
20209 + ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20210 + " block: %s, d_count: %d, x_count: %d, "
20211 + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20212 + node->state,
20213 + jnode_state_name(node, JNODE_PARSED),
20214 + jnode_state_name(node, JNODE_HEARD_BANSHEE),
20215 + jnode_state_name(node, JNODE_LEFT_CONNECTED),
20216 + jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20217 + jnode_state_name(node, JNODE_ORPHAN),
20218 + jnode_state_name(node, JNODE_CREATED),
20219 + jnode_state_name(node, JNODE_RELOC),
20220 + jnode_state_name(node, JNODE_OVRWR),
20221 + jnode_state_name(node, JNODE_DIRTY),
20222 + jnode_state_name(node, JNODE_IS_DYING),
20223 + jnode_state_name(node, JNODE_RIP),
20224 + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20225 + jnode_state_name(node, JNODE_WRITEBACK),
20226 + jnode_state_name(node, JNODE_NEW),
20227 + jnode_state_name(node, JNODE_DKSET),
20228 + jnode_state_name(node, JNODE_REPACK),
20229 + jnode_state_name(node, JNODE_CLUSTER_PAGE),
20230 + jnode_get_level(node), sprint_address(jnode_get_block(node)),
20231 + atomic_read(&node->d_count), atomic_read(&node->x_count),
20232 + jnode_page(node), node->atom, 0, 0,
20233 + jnode_type_name(jnode_get_type(node)));
20234 + if (jnode_is_unformatted(node)) {
20235 + printk("inode: %llu, index: %lu, ",
20236 + node->key.j.objectid, node->key.j.index);
20237 + }
20238 +}
20239 +
20240 +/* debugging aid: check znode invariant and panic if it doesn't hold */
20241 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20242 +{
20243 + char const *failed_msg;
20244 + int result;
20245 + reiser4_tree *tree;
20246 +
20247 + tree = jnode_get_tree(node);
20248 +
20249 + assert("umka-063312", node != NULL);
20250 + assert("umka-064321", tree != NULL);
20251 +
20252 + if (!jlocked && !tlocked)
20253 + spin_lock_jnode((jnode *) node);
20254 + if (!tlocked)
20255 + read_lock_tree(jnode_get_tree(node));
20256 + result = jnode_invariant_f(node, &failed_msg);
20257 + if (!result) {
20258 + info_jnode("corrupted node", node);
20259 + warning("jmacd-555", "Condition %s failed", failed_msg);
20260 + }
20261 + if (!tlocked)
20262 + read_unlock_tree(jnode_get_tree(node));
20263 + if (!jlocked && !tlocked)
20264 + spin_unlock_jnode((jnode *) node);
20265 + return result;
20266 +}
20267 +
20268 +#endif /* REISER4_DEBUG */
20269 +
20270 +/* Make Linus happy.
20271 + Local variables:
20272 + c-indentation-style: "K&R"
20273 + mode-name: "LC"
20274 + c-basic-offset: 8
20275 + tab-width: 8
20276 + fill-column: 80
20277 + End:
20278 +*/
20279 diff -urN linux-2.6.23.orig/fs/reiser4/jnode.h linux-2.6.23/fs/reiser4/jnode.h
20280 --- linux-2.6.23.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20281 +++ linux-2.6.23/fs/reiser4/jnode.h 2007-12-04 16:49:30.000000000 +0300
20282 @@ -0,0 +1,702 @@
20283 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20284 + * reiser4/README */
20285 +
20286 +/* Declaration of jnode. See jnode.c for details. */
20287 +
20288 +#ifndef __JNODE_H__
20289 +#define __JNODE_H__
20290 +
20291 +#include "forward.h"
20292 +#include "type_safe_hash.h"
20293 +#include "txnmgr.h"
20294 +#include "key.h"
20295 +#include "debug.h"
20296 +#include "dformat.h"
20297 +#include "page_cache.h"
20298 +#include "context.h"
20299 +
20300 +#include "plugin/plugin.h"
20301 +
20302 +#include <linux/fs.h>
20303 +#include <linux/mm.h>
20304 +#include <linux/spinlock.h>
20305 +#include <asm/atomic.h>
20306 +#include <linux/bitops.h>
20307 +#include <linux/list.h>
20308 +#include <linux/rcupdate.h>
20309 +
20310 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20311 + nodes) */
20312 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20313 +
20314 +/* declare hash table of znodes */
20315 +TYPE_SAFE_HASH_DECLARE(z, znode);
20316 +
20317 +struct jnode_key {
20318 + __u64 objectid;
20319 + unsigned long index;
20320 + struct address_space *mapping;
20321 +};
20322 +
20323 +/*
20324 + Jnode is the "base class" of other nodes in reiser4. It is also happens to
20325 + be exactly the node we use for unformatted tree nodes.
20326 +
20327 + Jnode provides following basic functionality:
20328 +
20329 + . reference counting and indexing.
20330 +
20331 + . integration with page cache. Jnode has ->pg reference to which page can
20332 + be attached.
20333 +
20334 + . interface to transaction manager. It is jnode that is kept in transaction
20335 + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20336 + means, there should be special type of jnode for inode.)
20337 +
20338 + Locking:
20339 +
20340 + Spin lock: the following fields are protected by the per-jnode spin lock:
20341 +
20342 + ->state
20343 + ->atom
20344 + ->capture_link
20345 +
20346 + Following fields are protected by the global tree lock:
20347 +
20348 + ->link
20349 + ->key.z (content of ->key.z is only changed in znode_rehash())
20350 + ->key.j
20351 +
20352 + Atomic counters
20353 +
20354 + ->x_count
20355 + ->d_count
20356 +
20357 + ->pg, and ->data are protected by spin lock for unused jnode and are
20358 + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20359 + is false).
20360 +
20361 + ->tree is immutable after creation
20362 +
20363 + Unclear
20364 +
20365 + ->blocknr: should be under jnode spin-lock, but current interface is based
20366 + on passing of block address.
20367 +
20368 + If you ever need to spin lock two nodes at once, do this in "natural"
20369 + memory order: lock znode with lower address first. (See lock_two_nodes().)
20370 +
20371 + Invariants involving this data-type:
20372 +
20373 + [jnode-dirty]
20374 + [jnode-refs]
20375 + [jnode-oid]
20376 + [jnode-queued]
20377 + [jnode-atom-valid]
20378 + [jnode-page-binding]
20379 +*/
20380 +
20381 +struct jnode {
20382 +#if REISER4_DEBUG
20383 +#define JMAGIC 0x52654973 /* "ReIs" */
20384 + int magic;
20385 +#endif
20386 + /* FIRST CACHE LINE (16 bytes): data used by jload */
20387 +
20388 + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20389 + /* 0 */ unsigned long state;
20390 +
20391 + /* lock, protecting jnode's fields. */
20392 + /* 4 */ spinlock_t load;
20393 +
20394 + /* counter of references to jnode itself. Increased on jref().
20395 + Decreased on jput().
20396 + */
20397 + /* 8 */ atomic_t x_count;
20398 +
20399 + /* counter of references to jnode's data. Pin data page(s) in
20400 + memory while this is greater than 0. Increased on jload().
20401 + Decreased on jrelse().
20402 + */
20403 + /* 12 */ atomic_t d_count;
20404 +
20405 + /* SECOND CACHE LINE: data used by hash table lookups */
20406 +
20407 + /* 16 */ union {
20408 + /* znodes are hashed by block number */
20409 + reiser4_block_nr z;
20410 + /* unformatted nodes are hashed by mapping plus offset */
20411 + struct jnode_key j;
20412 + } key;
20413 +
20414 + /* THIRD CACHE LINE */
20415 +
20416 + /* 32 */ union {
20417 + /* pointers to maintain hash-table */
20418 + z_hash_link z;
20419 + j_hash_link j;
20420 + } link;
20421 +
20422 + /* pointer to jnode page. */
20423 + /* 36 */ struct page *pg;
20424 + /* pointer to node itself. This is page_address(node->pg) when page is
20425 + attached to the jnode
20426 + */
20427 + /* 40 */ void *data;
20428 +
20429 + /* 44 */ reiser4_tree *tree;
20430 +
20431 + /* FOURTH CACHE LINE: atom related fields */
20432 +
20433 + /* 48 */ spinlock_t guard;
20434 +
20435 + /* atom the block is in, if any */
20436 + /* 52 */ txn_atom *atom;
20437 +
20438 + /* capture list */
20439 + /* 56 */ struct list_head capture_link;
20440 +
20441 + /* FIFTH CACHE LINE */
20442 +
20443 + /* 64 */ struct rcu_head rcu;
20444 + /* crosses cache line */
20445 +
20446 + /* SIXTH CACHE LINE */
20447 +
20448 + /* the real blocknr (where io is going to/from) */
20449 + /* 80 */ reiser4_block_nr blocknr;
20450 + /* Parent item type, unformatted and CRC need it for offset => key conversion. */
20451 + /* NOTE: this parent_item_id looks like jnode type. */
20452 + /* 88 */ reiser4_plugin_id parent_item_id;
20453 + /* 92 */
20454 +#if REISER4_DEBUG
20455 + /* list of all jnodes for debugging purposes. */
20456 + struct list_head jnodes;
20457 + /* how many times this jnode was written in one transaction */
20458 + int written;
20459 + /* this indicates which atom's list the jnode is on */
20460 + atom_list list;
20461 +#endif
20462 +} __attribute__ ((aligned(16)));
20463 +
20464 +/*
20465 + * jnode types. Enumeration of existing jnode types.
20466 + */
20467 +typedef enum {
20468 + JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20469 + JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20470 + JNODE_BITMAP, /* bitmap */
20471 + JNODE_IO_HEAD, /* jnode representing a block in the
20472 + * wandering log */
20473 + JNODE_INODE, /* jnode embedded into inode */
20474 + LAST_JNODE_TYPE
20475 +} jnode_type;
20476 +
20477 +/* jnode states */
20478 +typedef enum {
20479 + /* jnode's page is loaded and data checked */
20480 + JNODE_PARSED = 0,
20481 + /* node was deleted, not all locks on it were released. This
20482 + node is empty and is going to be removed from the tree
20483 + shortly. */
20484 + JNODE_HEARD_BANSHEE = 1,
20485 + /* left sibling pointer is valid */
20486 + JNODE_LEFT_CONNECTED = 2,
20487 + /* right sibling pointer is valid */
20488 + JNODE_RIGHT_CONNECTED = 3,
20489 +
20490 + /* znode was just created and doesn't yet have a pointer from
20491 + its parent */
20492 + JNODE_ORPHAN = 4,
20493 +
20494 + /* this node was created by its transaction and has not been assigned
20495 + a block address. */
20496 + JNODE_CREATED = 5,
20497 +
20498 + /* this node is currently relocated */
20499 + JNODE_RELOC = 6,
20500 + /* this node is currently wandered */
20501 + JNODE_OVRWR = 7,
20502 +
20503 + /* this znode has been modified */
20504 + JNODE_DIRTY = 8,
20505 +
20506 + /* znode lock is being invalidated */
20507 + JNODE_IS_DYING = 9,
20508 +
20509 + /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20510 +
20511 + /* jnode is queued for flushing. */
20512 + JNODE_FLUSH_QUEUED = 12,
20513 +
20514 + /* In the following bits jnode type is encoded. */
20515 + JNODE_TYPE_1 = 13,
20516 + JNODE_TYPE_2 = 14,
20517 + JNODE_TYPE_3 = 15,
20518 +
20519 + /* jnode is being destroyed */
20520 + JNODE_RIP = 16,
20521 +
20522 + /* znode was not captured during locking (it might so be because
20523 + ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20524 + JNODE_MISSED_IN_CAPTURE = 17,
20525 +
20526 + /* write is in progress */
20527 + JNODE_WRITEBACK = 18,
20528 +
20529 + /* FIXME: now it is used by crypto-compress plugin only */
20530 + JNODE_NEW = 19,
20531 +
20532 + /* delimiting keys are already set for this znode. */
20533 + JNODE_DKSET = 20,
20534 +
20535 + /* when this bit is set page and jnode can not be disconnected */
20536 + JNODE_WRITE_PREPARED = 21,
20537 +
20538 + JNODE_CLUSTER_PAGE = 22,
20539 + /* Jnode is marked for repacking, that means the reiser4 flush and the
20540 + * block allocator should process this node special way */
20541 + JNODE_REPACK = 23,
20542 + /* node should be converted by flush in squalloc phase */
20543 + JNODE_CONVERTIBLE = 24,
20544 + /*
20545 + * When jnode is dirtied for the first time in given transaction,
20546 + * do_jnode_make_dirty() checks whether this jnode can possible became
20547 + * member of overwrite set. If so, this bit is set, and one block is
20548 + * reserved in the ->flush_reserved space of atom.
20549 + *
20550 + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20551 + *
20552 + * (1) flush decides that we want this block to go into relocate
20553 + * set after all.
20554 + *
20555 + * (2) wandering log is allocated (by log writer)
20556 + *
20557 + * (3) extent is allocated
20558 + *
20559 + */
20560 + JNODE_FLUSH_RESERVED = 29
20561 +} reiser4_jnode_state;
20562 +
20563 +/* Macros for accessing the jnode state. */
20564 +
20565 +static inline void JF_CLR(jnode * j, int f)
20566 +{
20567 + assert("unknown-1", j->magic == JMAGIC);
20568 + clear_bit(f, &j->state);
20569 +}
20570 +static inline int JF_ISSET(const jnode * j, int f)
20571 +{
20572 + assert("unknown-2", j->magic == JMAGIC);
20573 + return test_bit(f, &((jnode *) j)->state);
20574 +}
20575 +static inline void JF_SET(jnode * j, int f)
20576 +{
20577 + assert("unknown-3", j->magic == JMAGIC);
20578 + set_bit(f, &j->state);
20579 +}
20580 +
20581 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20582 +{
20583 + assert("unknown-4", j->magic == JMAGIC);
20584 + return test_and_set_bit(f, &j->state);
20585 +}
20586 +
20587 +static inline void spin_lock_jnode(jnode *node)
20588 +{
20589 + /* check that spinlocks of lower priorities are not held */
20590 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20591 + LOCK_CNT_NIL(spin_locked_txnh) &&
20592 + LOCK_CNT_NIL(spin_locked_zlock) &&
20593 + LOCK_CNT_NIL(rw_locked_dk) &&
20594 + LOCK_CNT_LT(spin_locked_jnode, 2)));
20595 +
20596 + spin_lock(&(node->guard));
20597 +
20598 + LOCK_CNT_INC(spin_locked_jnode);
20599 + LOCK_CNT_INC(spin_locked);
20600 +}
20601 +
20602 +static inline void spin_unlock_jnode(jnode *node)
20603 +{
20604 + assert_spin_locked(&(node->guard));
20605 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20606 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20607 +
20608 + LOCK_CNT_DEC(spin_locked_jnode);
20609 + LOCK_CNT_DEC(spin_locked);
20610 +
20611 + spin_unlock(&(node->guard));
20612 +}
20613 +
20614 +static inline int jnode_is_in_deleteset(const jnode * node)
20615 +{
20616 + return JF_ISSET(node, JNODE_RELOC);
20617 +}
20618 +
20619 +extern int init_jnodes(void);
20620 +extern void done_jnodes(void);
20621 +
20622 +/* Jnode routines */
20623 +extern jnode *jalloc(void);
20624 +extern void jfree(jnode * node) NONNULL;
20625 +extern jnode *jclone(jnode *);
20626 +extern jnode *jlookup(reiser4_tree * tree,
20627 + oid_t objectid, unsigned long ind) NONNULL;
20628 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20629 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20630 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20631 +void jnode_attach_page(jnode * node, struct page *pg);
20632 +
20633 +void unhash_unformatted_jnode(jnode *);
20634 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20635 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20636 +extern void jnode_make_dirty(jnode * node) NONNULL;
20637 +extern void jnode_make_clean(jnode * node) NONNULL;
20638 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20639 +extern void jnode_make_wander(jnode *) NONNULL;
20640 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20641 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20642 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20643 +
20644 +/**
20645 + * jnode_get_block
20646 + * @node: jnode to query
20647 + *
20648 + */
20649 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20650 +{
20651 + assert("nikita-528", node != NULL);
20652 +
20653 + return &node->blocknr;
20654 +}
20655 +
20656 +/**
20657 + * jnode_set_block
20658 + * @node: jnode to update
20659 + * @blocknr: new block nr
20660 + */
20661 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20662 +{
20663 + assert("nikita-2020", node != NULL);
20664 + assert("umka-055", blocknr != NULL);
20665 + node->blocknr = *blocknr;
20666 +}
20667 +
20668 +
20669 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
20670 + * jnode was emergency flushed---then block number chosen by eflush is
20671 + * used. */
20672 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20673 +{
20674 + assert("nikita-2768", node != NULL);
20675 + assert_spin_locked(&(node->guard));
20676 +
20677 + return jnode_get_block(node);
20678 +}
20679 +
20680 +/* Jnode flush interface. */
20681 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20682 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20683 +
20684 +/* FIXME-VS: these are used in plugin/item/extent.c */
20685 +
20686 +/* does extent_get_block have to be called */
20687 +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20688 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20689 +
20690 +/* the node should be converted during flush squalloc phase */
20691 +#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20692 +#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20693 +
20694 +/* Macros to convert from jnode to znode, znode to jnode. These are macros
20695 + because C doesn't allow overloading of const prototypes. */
20696 +#define ZJNODE(x) (& (x) -> zjnode)
20697 +#define JZNODE(x) \
20698 +({ \
20699 + typeof (x) __tmp_x; \
20700 + \
20701 + __tmp_x = (x); \
20702 + assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20703 + (znode*) __tmp_x; \
20704 +})
20705 +
20706 +extern int jnodes_tree_init(reiser4_tree * tree);
20707 +extern int jnodes_tree_done(reiser4_tree * tree);
20708 +
20709 +#if REISER4_DEBUG
20710 +
20711 +extern int znode_is_any_locked(const znode * node);
20712 +extern void jnode_list_remove(jnode * node);
20713 +
20714 +#else
20715 +
20716 +#define jnode_list_remove(node) noop
20717 +
20718 +#endif
20719 +
20720 +int znode_is_root(const znode * node) NONNULL;
20721 +
20722 +/* bump reference counter on @node */
20723 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20724 +{
20725 + assert("nikita-1911", node != NULL);
20726 +
20727 + atomic_inc(&node->x_count);
20728 + LOCK_CNT_INC(x_refs);
20729 +}
20730 +
20731 +static inline void dec_x_ref(jnode * node)
20732 +{
20733 + assert("nikita-3215", node != NULL);
20734 + assert("nikita-3216", atomic_read(&node->x_count) > 0);
20735 +
20736 + atomic_dec(&node->x_count);
20737 + assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20738 + LOCK_CNT_DEC(x_refs);
20739 +}
20740 +
20741 +/* jref() - increase counter of references to jnode/znode (x_count) */
20742 +static inline jnode *jref(jnode * node)
20743 +{
20744 + assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20745 + add_x_ref(node);
20746 + return node;
20747 +}
20748 +
20749 +/* get the page of jnode */
20750 +static inline struct page *jnode_page(const jnode * node)
20751 +{
20752 + return node->pg;
20753 +}
20754 +
20755 +/* return pointer to jnode data */
20756 +static inline char *jdata(const jnode * node)
20757 +{
20758 + assert("nikita-1415", node != NULL);
20759 + assert("nikita-3198", jnode_page(node) != NULL);
20760 + return node->data;
20761 +}
20762 +
20763 +static inline int jnode_is_loaded(const jnode * node)
20764 +{
20765 + assert("zam-506", node != NULL);
20766 + return atomic_read(&node->d_count) > 0;
20767 +}
20768 +
20769 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20770 +
20771 +static inline void jnode_set_reloc(jnode * node)
20772 +{
20773 + assert("nikita-2431", node != NULL);
20774 + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20775 + JF_SET(node, JNODE_RELOC);
20776 +}
20777 +
20778 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20779 +
20780 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20781 +
20782 +static inline int jload(jnode *node)
20783 +{
20784 + return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20785 +}
20786 +
20787 +extern int jinit_new(jnode *, gfp_t) NONNULL;
20788 +extern int jstartio(jnode *) NONNULL;
20789 +
20790 +extern void jdrop(jnode *) NONNULL;
20791 +extern int jwait_io(jnode *, int rw) NONNULL;
20792 +
20793 +void jload_prefetch(jnode *);
20794 +
20795 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20796 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
20797 +
20798 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
20799 +{
20800 + assert("nikita-2691", node != NULL);
20801 + return node->tree;
20802 +}
20803 +
20804 +extern void pin_jnode_data(jnode *);
20805 +extern void unpin_jnode_data(jnode *);
20806 +
20807 +static inline jnode_type jnode_get_type(const jnode * node)
20808 +{
20809 + static const unsigned long state_mask =
20810 + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20811 +
20812 + static jnode_type mask_to_type[] = {
20813 + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20814 +
20815 + /* 000 */
20816 + [0] = JNODE_FORMATTED_BLOCK,
20817 + /* 001 */
20818 + [1] = JNODE_UNFORMATTED_BLOCK,
20819 + /* 010 */
20820 + [2] = JNODE_BITMAP,
20821 + /* 011 */
20822 + [3] = LAST_JNODE_TYPE, /*invalid */
20823 + /* 100 */
20824 + [4] = JNODE_INODE,
20825 + /* 101 */
20826 + [5] = LAST_JNODE_TYPE,
20827 + /* 110 */
20828 + [6] = JNODE_IO_HEAD,
20829 + /* 111 */
20830 + [7] = LAST_JNODE_TYPE, /* invalid */
20831 + };
20832 +
20833 + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20834 +}
20835 +
20836 +/* returns true if node is a znode */
20837 +static inline int jnode_is_znode(const jnode * node)
20838 +{
20839 + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20840 +}
20841 +
20842 +static inline int jnode_is_flushprepped(jnode * node)
20843 +{
20844 + assert("jmacd-78212", node != NULL);
20845 + assert_spin_locked(&(node->guard));
20846 + return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20847 + JF_ISSET(node, JNODE_OVRWR);
20848 +}
20849 +
20850 +/* Return true if @node has already been processed by the squeeze and allocate
20851 + process. This implies the block address has been finalized for the
20852 + duration of this atom (or it is clean and will remain in place). If this
20853 + returns true you may use the block number as a hint. */
20854 +static inline int jnode_check_flushprepped(jnode * node)
20855 +{
20856 + int result;
20857 +
20858 + /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20859 + spin_lock_jnode(node);
20860 + result = jnode_is_flushprepped(node);
20861 + spin_unlock_jnode(node);
20862 + return result;
20863 +}
20864 +
20865 +/* returns true if node is unformatted */
20866 +static inline int jnode_is_unformatted(const jnode * node)
20867 +{
20868 + assert("jmacd-0123", node != NULL);
20869 + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20870 +}
20871 +
20872 +/* returns true if node represents a cluster cache page */
20873 +static inline int jnode_is_cluster_page(const jnode * node)
20874 +{
20875 + assert("edward-50", node != NULL);
20876 + return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20877 +}
20878 +
20879 +/* returns true if node is a builtin inode's jnode */
20880 +static inline int jnode_is_inode(const jnode * node)
20881 +{
20882 + assert("vs-1240", node != NULL);
20883 + return jnode_get_type(node) == JNODE_INODE;
20884 +}
20885 +
20886 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20887 +{
20888 + assert("nikita-2367", type < LAST_JNODE_TYPE);
20889 + return jnode_plugin_by_id((reiser4_plugin_id) type);
20890 +}
20891 +
20892 +static inline jnode_plugin *jnode_ops(const jnode * node)
20893 +{
20894 + assert("nikita-2366", node != NULL);
20895 +
20896 + return jnode_ops_of(jnode_get_type(node));
20897 +}
20898 +
20899 +/* Get the index of a block. */
20900 +static inline unsigned long jnode_get_index(jnode * node)
20901 +{
20902 + return jnode_ops(node)->index(node);
20903 +}
20904 +
20905 +/* return true if "node" is the root */
20906 +static inline int jnode_is_root(const jnode * node)
20907 +{
20908 + return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20909 +}
20910 +
20911 +extern struct address_space *mapping_jnode(const jnode * node);
20912 +extern unsigned long index_jnode(const jnode * node);
20913 +
20914 +static inline void jput(jnode * node);
20915 +extern void jput_final(jnode * node);
20916 +
20917 +/* bump data counter on @node */
20918 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20919 +{
20920 + assert("nikita-1962", node != NULL);
20921 +
20922 + atomic_inc(&node->d_count);
20923 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
20924 + LOCK_CNT_INC(d_refs);
20925 +}
20926 +
20927 +/* jput() - decrement x_count reference counter on jnode.
20928 +
20929 + Count may drop to 0, jnode stays in cache until memory pressure causes the
20930 + eviction of its page. The c_count variable also ensures that children are
20931 + pressured out of memory before the parent. The jnode remains hashed as
20932 + long as the VM allows its page to stay in memory.
20933 +*/
20934 +static inline void jput(jnode * node)
20935 +{
20936 + assert("jmacd-509", node != NULL);
20937 + assert("jmacd-510", atomic_read(&node->x_count) > 0);
20938 + assert("zam-926", reiser4_schedulable());
20939 + LOCK_CNT_DEC(x_refs);
20940 +
20941 + rcu_read_lock();
20942 + /*
20943 + * we don't need any kind of lock here--jput_final() uses RCU.
20944 + */
20945 + if (unlikely(atomic_dec_and_test(&node->x_count))) {
20946 + jput_final(node);
20947 + } else
20948 + rcu_read_unlock();
20949 + assert("nikita-3473", reiser4_schedulable());
20950 +}
20951 +
20952 +extern void jrelse(jnode * node);
20953 +extern void jrelse_tail(jnode * node);
20954 +
20955 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20956 +
20957 +/* resolve race with jput */
20958 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20959 +{
20960 + if (unlikely(JF_ISSET(node, JNODE_RIP)))
20961 + node = jnode_rip_sync(tree, node);
20962 + return node;
20963 +}
20964 +
20965 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20966 +
20967 +#if REISER4_DEBUG
20968 +extern int jnode_invariant_f(const jnode *node, char const **msg);
20969 +#endif
20970 +
20971 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20972 +
20973 +/* __JNODE_H__ */
20974 +#endif
20975 +
20976 +/* Make Linus happy.
20977 + Local variables:
20978 + c-indentation-style: "K&R"
20979 + mode-name: "LC"
20980 + c-basic-offset: 8
20981 + tab-width: 8
20982 + fill-column: 120
20983 + End:
20984 +*/
20985 diff -urN linux-2.6.23.orig/fs/reiser4/kassign.c linux-2.6.23/fs/reiser4/kassign.c
20986 --- linux-2.6.23.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
20987 +++ linux-2.6.23/fs/reiser4/kassign.c 2007-12-04 22:59:05.790367863 +0300
20988 @@ -0,0 +1,661 @@
20989 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20990 + * reiser4/README */
20991 +
20992 +/* Key assignment policy implementation */
20993 +
20994 +/*
20995 + * In reiser4 every piece of file system data and meta-data has a key. Keys
20996 + * are used to store information in and retrieve it from reiser4 internal
20997 + * tree. In addition to this, keys define _ordering_ of all file system
20998 + * information: things having close keys are placed into the same or
20999 + * neighboring (in the tree order) nodes of the tree. As our block allocator
21000 + * tries to respect tree order (see flush.c), keys also define order in which
21001 + * things are laid out on the disk, and hence, affect performance directly.
21002 + *
21003 + * Obviously, assignment of keys to data and meta-data should be consistent
21004 + * across whole file system. Algorithm that calculates a key for a given piece
21005 + * of data or meta-data is referred to as "key assignment".
21006 + *
21007 + * Key assignment is too expensive to be implemented as a plugin (that is,
21008 + * with an ability to support different key assignment schemas in the same
21009 + * compiled kernel image). As a compromise, all key-assignment functions and
21010 + * data-structures are collected in this single file, so that modifications to
21011 + * key assignment algorithm can be localized. Additional changes may be
21012 + * required in key.[ch].
21013 + *
21014 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21015 + * may guess, there is "Plan B" too.
21016 + *
21017 + */
21018 +
21019 +/*
21020 + * Additional complication with key assignment implementation is a requirement
21021 + * to support different key length.
21022 + */
21023 +
21024 +/*
21025 + * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21026 + *
21027 + * DIRECTORY ITEMS
21028 + *
21029 + * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21030 + * +--------------+---+---+-+-------------+------------------+-----------------+
21031 + * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21032 + * +--------------+---+---+-+-------------+------------------+-----------------+
21033 + * | | | | |
21034 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21035 + *
21036 + * dirid objectid of directory this item is for
21037 + *
21038 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21039 + *
21040 + * H 1 if last 8 bytes of the key contain hash,
21041 + * 0 if last 8 bytes of the key contain prefix-3
21042 + *
21043 + * prefix-1 first 7 characters of file name.
21044 + * Padded by zeroes if name is not long enough.
21045 + *
21046 + * prefix-2 next 8 characters of the file name.
21047 + *
21048 + * prefix-3 next 8 characters of the file name.
21049 + *
21050 + * hash hash of the rest of file name (i.e., portion of file
21051 + * name not included into prefix-1 and prefix-2).
21052 + *
21053 + * File names no longer than 23 (== 7 + 8 + 8) characters are completely encoded
21054 + * in the key. Such file names are called "short". They are distinguished by H
21055 + * bit set 0 in the key.
21056 + *
21057 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21058 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21059 + * key. Last 8 bytes of the key are occupied by hash of the remaining
21060 + * characters of the name.
21061 + *
21062 + * This key assignment reaches following important goals:
21063 + *
21064 + * (1) directory entries are sorted in approximately lexicographical
21065 + * order.
21066 + *
21067 + * (2) collisions (when multiple directory items have the same key), while
21068 + * principally unavoidable in a tree with fixed length keys, are rare.
21069 + *
21070 + * STAT DATA
21071 + *
21072 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21073 + * +--------------+---+-----------------+---+--------------+-----------------+
21074 + * | locality id | 1 | ordering | 0 | objectid | 0 |
21075 + * +--------------+---+-----------------+---+--------------+-----------------+
21076 + * | | | | |
21077 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21078 + *
21079 + * locality id object id of a directory where first name was created for
21080 + * the object
21081 + *
21082 + * ordering copy of second 8-byte portion of the key of directory
21083 + * entry for the first name of this object. Ordering has a form
21084 + * {
21085 + * fibration :7;
21086 + * h :1;
21087 + * prefix1 :56;
21088 + * }
21089 + * see description of key for directory entry above.
21090 + *
21091 + * objectid object id for this object
21092 + *
21093 + * This key assignment policy is designed to keep stat-data in the same order
21094 + * as corresponding directory items, thus speeding up readdir/stat types of
21095 + * workload.
21096 + *
21097 + * FILE BODY
21098 + *
21099 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21100 + * +--------------+---+-----------------+---+--------------+-----------------+
21101 + * | locality id | 4 | ordering | 0 | objectid | offset |
21102 + * +--------------+---+-----------------+---+--------------+-----------------+
21103 + * | | | | |
21104 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21105 + *
21106 + * locality id object id of a directory where first name was created for
21107 + * the object
21108 + *
21109 + * ordering the same as in the key of stat-data for this object
21110 + *
21111 + * objectid object id for this object
21112 + *
21113 + * offset logical offset from the beginning of this file.
21114 + * Measured in bytes.
21115 + *
21116 + *
21117 + * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21118 + *
21119 + * DIRECTORY ITEMS
21120 + *
21121 + * | 60 | 4 | 7 |1| 56 | 64 |
21122 + * +--------------+---+---+-+-------------+-----------------+
21123 + * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21124 + * +--------------+---+---+-+-------------+-----------------+
21125 + * | | | |
21126 + * | 8 bytes | 8 bytes | 8 bytes |
21127 + *
21128 + * dirid objectid of directory this item is for
21129 + *
21130 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21131 + *
21132 + * H 1 if last 8 bytes of the key contain hash,
21133 + * 0 if last 8 bytes of the key contain prefix-2
21134 + *
21135 + * prefix-1 first 7 characters of file name.
21136 + * Padded by zeroes if name is not long enough.
21137 + *
21138 + * prefix-2 next 8 characters of the file name.
21139 + *
21140 + * hash hash of the rest of file name (i.e., portion of file
21141 + * name not included into prefix-1).
21142 + *
21143 + * File names no longer than 15 (== 7 + 8) characters are completely encoded in
21144 + * the key. Such file names are called "short". They are distinguished by H
21145 + * bit set 0 in the key.
21146 + *
21147 + * Other file names are "long". For long name, H bit is 1, and first 7
21148 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21149 + * key are occupied by hash of the remaining characters of the name.
21150 + *
21151 + * STAT DATA
21152 + *
21153 + * | 60 | 4 | 4 | 60 | 64 |
21154 + * +--------------+---+---+--------------+-----------------+
21155 + * | locality id | 1 | 0 | objectid | 0 |
21156 + * +--------------+---+---+--------------+-----------------+
21157 + * | | | |
21158 + * | 8 bytes | 8 bytes | 8 bytes |
21159 + *
21160 + * locality id object id of a directory where first name was created for
21161 + * the object
21162 + *
21163 + * objectid object id for this object
21164 + *
21165 + * FILE BODY
21166 + *
21167 + * | 60 | 4 | 4 | 60 | 64 |
21168 + * +--------------+---+---+--------------+-----------------+
21169 + * | locality id | 4 | 0 | objectid | offset |
21170 + * +--------------+---+---+--------------+-----------------+
21171 + * | | | |
21172 + * | 8 bytes | 8 bytes | 8 bytes |
21173 + *
21174 + * locality id object id of a directory where first name was created for
21175 + * the object
21176 + *
21177 + * objectid object id for this object
21178 + *
21179 + * offset logical offset from the beginning of this file.
21180 + * Measured in bytes.
21181 + *
21182 + *
21183 + */
21184 +
21185 +#include "debug.h"
21186 +#include "key.h"
21187 +#include "kassign.h"
21188 +#include "vfs_ops.h"
21189 +#include "inode.h"
21190 +#include "super.h"
21191 +#include "dscale.h"
21192 +
21193 +#include <linux/types.h> /* for __u?? */
21194 +#include <linux/fs.h> /* for struct super_block, etc */
21195 +
21196 +/* bitmask for H bit (see comment at the beginning of this file) */
21197 +static const __u64 longname_mark = 0x0100000000000000ull;
21198 +/* bitmask for F and H portions of the key. */
21199 +static const __u64 fibration_mask = 0xff00000000000000ull;
21200 +
21201 +/* return true if name is not completely encoded in @key */
21202 +int is_longname_key(const reiser4_key * key)
21203 +{
21204 + __u64 highpart;
21205 +
21206 + assert("nikita-2863", key != NULL);
21207 + if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21208 + reiser4_print_key("oops", key);
21209 + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21210 +
21211 + if (REISER4_LARGE_KEY)
21212 + highpart = get_key_ordering(key);
21213 + else
21214 + highpart = get_key_objectid(key);
21215 +
21216 + return (highpart & longname_mark) ? 1 : 0;
21217 +}
21218 +
21219 +/* return true if @name is too long to be completely encoded in the key */
21220 +int is_longname(const char *name UNUSED_ARG, int len)
21221 +{
21222 + if (REISER4_LARGE_KEY)
21223 + return len > 23;
21224 + else
21225 + return len > 15;
21226 +}
21227 +
21228 +/* code ascii string into __u64.
21229 +
21230 + Put characters of @name into result (@str) one after another starting
21231 + from @start_idx-th highest (arithmetically) byte. This produces
21232 + endian-safe encoding. memcpy(2) will not do.
21233 +
21234 +*/
21235 +static __u64 pack_string(const char *name /* string to encode */ ,
21236 + int start_idx /* highest byte in result from
21237 + * which to start encoding */ )
21238 +{
21239 + unsigned i;
21240 + __u64 str;
21241 +
21242 + str = 0;
21243 + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21244 + str <<= 8;
21245 + str |= (unsigned char)name[i];
21246 + }
21247 + str <<= (sizeof str - i - start_idx) << 3;
21248 + return str;
21249 +}
21250 +
21251 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21252 + * string encoded in it and stores result in @buf */
21253 +char * reiser4_unpack_string(__u64 value, char *buf)
21254 +{
21255 + do {
21256 + *buf = value >> (64 - 8);
21257 + if (*buf)
21258 + ++buf;
21259 + value <<= 8;
21260 + } while (value != 0);
21261 + *buf = 0;
21262 + return buf;
21263 +}
21264 +
21265 +/* obtain name encoded in @key and store it in @buf */
21266 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21267 +{
21268 + char *c;
21269 +
21270 + assert("nikita-2868", !is_longname_key(key));
21271 +
21272 + c = buf;
21273 + if (REISER4_LARGE_KEY) {
21274 + c = reiser4_unpack_string(get_key_ordering(key) &
21275 + ~fibration_mask, c);
21276 + c = reiser4_unpack_string(get_key_fulloid(key), c);
21277 + } else
21278 + c = reiser4_unpack_string(get_key_fulloid(key) &
21279 + ~fibration_mask, c);
21280 + reiser4_unpack_string(get_key_offset(key), c);
21281 + return buf;
21282 +}
21283 +
21284 +/**
21285 + * complete_entry_key - calculate entry key by name
21286 + * @dir: directory where entry is (or will be) in
21287 + * @name: name to calculate key of
21288 + * @len: length of name
21289 + * @result: place to store result in
21290 + *
21291 + * Sets fields of entry key @result which depend on file name.
21292 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21293 + * objectid and offset. Otherwise, objectid and offset are set.
21294 + */
21295 +void complete_entry_key(const struct inode *dir, const char *name,
21296 + int len, reiser4_key *result)
21297 +{
21298 +#if REISER4_LARGE_KEY
21299 + __u64 ordering;
21300 + __u64 objectid;
21301 + __u64 offset;
21302 +
21303 + assert("nikita-1139", dir != NULL);
21304 + assert("nikita-1142", result != NULL);
21305 + assert("nikita-2867", strlen(name) == len);
21306 +
21307 + /*
21308 + * key allocation algorithm for directory entries in case of large
21309 + * keys:
21310 + *
21311 + * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21312 + * characters into ordering field of key, next 8 characters (if any)
21313 + * into objectid field of key and next 8 ones (if any) into offset
21314 + * field of key
21315 + *
21316 + * If file name is longer than 23 characters, put first 7 characters
21317 + * into key's ordering, next 8 to objectid and hash of remaining
21318 + * characters into offset field.
21319 + *
21320 + * To distinguish above cases, in latter set up unused high bit in
21321 + * ordering field.
21322 + */
21323 +
21324 + /* [0-6] characters to ordering */
21325 + ordering = pack_string(name, 1);
21326 + if (len > 7) {
21327 + /* [7-14] characters to objectid */
21328 + objectid = pack_string(name + 7, 0);
21329 + if (len > 15) {
21330 + if (len <= 23) {
21331 + /* [15-23] characters to offset */
21332 + offset = pack_string(name + 15, 0);
21333 + } else {
21334 + /* note in a key the fact that offset contains hash. */
21335 + ordering |= longname_mark;
21336 +
21337 + /* offset is the hash of the file name's tail. */
21338 + offset = inode_hash_plugin(dir)->hash(name + 15,
21339 + len - 15);
21340 + }
21341 + } else {
21342 + offset = 0ull;
21343 + }
21344 + } else {
21345 + objectid = 0ull;
21346 + offset = 0ull;
21347 + }
21348 +
21349 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21350 + ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21351 +
21352 + set_key_ordering(result, ordering);
21353 + set_key_fulloid(result, objectid);
21354 + set_key_offset(result, offset);
21355 + return;
21356 +
21357 +#else
21358 + __u64 objectid;
21359 + __u64 offset;
21360 +
21361 + assert("nikita-1139", dir != NULL);
21362 + assert("nikita-1142", result != NULL);
21363 + assert("nikita-2867", strlen(name) == len);
21364 +
21365 + /*
21366 + * key allocation algorithm for directory entries in case of not large
21367 + * keys:
21368 + *
21369 + * If name is not longer than 7 + 8 = 15 characters, put first 7
21370 + * characters into objectid field of key, next 8 characters (if any)
21371 + * into offset field of key
21372 + *
21373 + * If file name is longer than 15 characters, put first 7 characters
21374 + * into key's objectid, and hash of remaining characters into offset
21375 + * field.
21376 + *
21377 + * To distinguish above cases, in latter set up unused high bit in
21378 + * objectid field.
21379 + */
21380 +
21381 + /* [0-6] characters to objectid */
21382 + objectid = pack_string(name, 1);
21383 + if (len > 7) {
21384 + if (len <= 15) {
21385 + /* [7-14] characters to offset */
21386 + offset = pack_string(name + 7, 0);
21387 + } else {
21388 + /* note in a key the fact that offset contains hash. */
21389 + objectid |= longname_mark;
21390 +
21391 + /* offset is the hash of the file name. */
21392 + offset = inode_hash_plugin(dir)->hash(name + 7,
21393 + len - 7);
21394 + }
21395 + } else
21396 + offset = 0ull;
21397 +
21398 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21399 + objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21400 +
21401 + set_key_fulloid(result, objectid);
21402 + set_key_offset(result, offset);
21403 + return;
21404 +#endif /* ! REISER4_LARGE_KEY */
21405 +}
21406 +
21407 +/* true, if @key is the key of "." */
21408 +int is_dot_key(const reiser4_key * key /* key to check */ )
21409 +{
21410 + assert("nikita-1717", key != NULL);
21411 + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21412 + return
21413 + (get_key_ordering(key) == 0ull) &&
21414 + (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21415 +}
21416 +
21417 +/* build key for stat-data.
21418 +
21419 + return key of stat-data of this object. This should become an sd plugin
21420 + method in the future. For now, let it be here.
21421 +
21422 +*/
21423 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21424 + reiser4_key * result /* resulting key of @target
21425 + stat-data */ )
21426 +{
21427 + assert("nikita-261", result != NULL);
21428 +
21429 + reiser4_key_init(result);
21430 + set_key_locality(result, reiser4_inode_data(target)->locality_id);
21431 + set_key_ordering(result, get_inode_ordering(target));
21432 + set_key_objectid(result, get_inode_oid(target));
21433 + set_key_type(result, KEY_SD_MINOR);
21434 + set_key_offset(result, (__u64) 0);
21435 + return result;
21436 +}
21437 +
21438 +/* encode part of key into &obj_key_id
21439 +
21440 + This encodes into @id part of @key sufficient to restore @key later,
21441 + given that latter is key of object (key of stat-data).
21442 +
21443 + See &obj_key_id
21444 +*/
21445 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21446 + obj_key_id * id /* id where key is encoded in */ )
21447 +{
21448 + assert("nikita-1151", key != NULL);
21449 + assert("nikita-1152", id != NULL);
21450 +
21451 + memcpy(id, key, sizeof *id);
21452 + return 0;
21453 +}
21454 +
21455 +/* encode reference to @obj in @id.
21456 +
21457 + This is like build_obj_key_id() above, but takes inode as parameter. */
21458 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21459 + obj_key_id * id /* result */ )
21460 +{
21461 + reiser4_key sdkey;
21462 +
21463 + assert("nikita-1166", obj != NULL);
21464 + assert("nikita-1167", id != NULL);
21465 +
21466 + build_sd_key(obj, &sdkey);
21467 + build_obj_key_id(&sdkey, id);
21468 + return 0;
21469 +}
21470 +
21471 +/* decode @id back into @key
21472 +
21473 + Restore key of object stat-data from @id. This is dual to
21474 + build_obj_key_id() above.
21475 +*/
21476 +int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21477 + * from */ ,
21478 + reiser4_key * key /* result */ )
21479 +{
21480 + assert("nikita-1153", id != NULL);
21481 + assert("nikita-1154", key != NULL);
21482 +
21483 + reiser4_key_init(key);
21484 + memcpy(key, id, sizeof *id);
21485 + return 0;
21486 +}
21487 +
21488 +/* extract objectid of directory from key of directory entry within said
21489 + directory.
21490 + */
21491 +oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21492 + * directory
21493 + * entry */ )
21494 +{
21495 + assert("nikita-1314", de_key != NULL);
21496 + return get_key_locality(de_key);
21497 +}
21498 +
21499 +/* encode into @id key of directory entry.
21500 +
21501 + Encode into @id information sufficient to later distinguish directory
21502 + entries within the same directory. This is not whole key, because all
21503 + directory entries within directory item share locality which is equal
21504 + to objectid of their directory.
21505 +
21506 +*/
21507 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21508 + const struct qstr *name /* name to be given to @obj by
21509 + * directory entry being
21510 + * constructed */ ,
21511 + de_id * id /* short key of directory entry */ )
21512 +{
21513 + reiser4_key key;
21514 +
21515 + assert("nikita-1290", dir != NULL);
21516 + assert("nikita-1292", id != NULL);
21517 +
21518 + /* NOTE-NIKITA this is suboptimal. */
21519 + inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21520 + return build_de_id_by_key(&key, id);
21521 +}
21522 +
21523 +/* encode into @id key of directory entry.
21524 +
21525 + Encode into @id information sufficient to later distinguish directory
21526 + entries within the same directory. This is not whole key, because all
21527 + directory entries within directory item share locality which is equal
21528 + to objectid of their directory.
21529 +
21530 +*/
21531 +int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21532 + * entry */ ,
21533 + de_id * id /* short key of directory entry */ )
21534 +{
21535 + memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21536 + return 0;
21537 +}
21538 +
21539 +/* restore from @id key of directory entry.
21540 +
21541 + Function dual to build_de_id(): given @id and locality, build full
21542 + key of directory entry within directory item.
21543 +
21544 +*/
21545 +int extract_key_from_de_id(const oid_t locality /* locality of directory
21546 + * entry */ ,
21547 + const de_id * id /* directory entry id */ ,
21548 + reiser4_key * key /* result */ )
21549 +{
21550 + /* no need to initialise key here: all fields are overwritten */
21551 + memcpy(((__u64 *) key) + 1, id, sizeof *id);
21552 + set_key_locality(key, locality);
21553 + set_key_type(key, KEY_FILE_NAME_MINOR);
21554 + return 0;
21555 +}
21556 +
21557 +/* compare two &de_id's */
21558 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21559 + const de_id * id2 /* second &de_id to compare */ )
21560 +{
21561 + /* NOTE-NIKITA ugly implementation */
21562 + reiser4_key k1;
21563 + reiser4_key k2;
21564 +
21565 + extract_key_from_de_id((oid_t) 0, id1, &k1);
21566 + extract_key_from_de_id((oid_t) 0, id2, &k2);
21567 + return keycmp(&k1, &k2);
21568 +}
21569 +
21570 +/* compare &de_id with key */
21571 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21572 + const reiser4_key * key /* key to compare */ )
21573 +{
21574 + cmp_t result;
21575 + reiser4_key *k1;
21576 +
21577 + k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21578 + result = KEY_DIFF_EL(k1, key, 1);
21579 + if (result == EQUAL_TO) {
21580 + result = KEY_DIFF_EL(k1, key, 2);
21581 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21582 + result = KEY_DIFF_EL(k1, key, 3);
21583 + }
21584 + }
21585 + return result;
21586 +}
21587 +
21588 +/*
21589 + * return number of bytes necessary to encode @inode identity.
21590 + */
21591 +int inode_onwire_size(const struct inode *inode)
21592 +{
21593 + int result;
21594 +
21595 + result = dscale_bytes(get_inode_oid(inode));
21596 + result += dscale_bytes(get_inode_locality(inode));
21597 +
21598 + /*
21599 + * ordering is large (it usually has highest bits set), so it makes
21600 + * little sense to dscale it.
21601 + */
21602 + if (REISER4_LARGE_KEY)
21603 + result += sizeof(get_inode_ordering(inode));
21604 + return result;
21605 +}
21606 +
21607 +/*
21608 + * encode @inode identity at @start
21609 + */
21610 +char *build_inode_onwire(const struct inode *inode, char *start)
21611 +{
21612 + start += dscale_write(start, get_inode_locality(inode));
21613 + start += dscale_write(start, get_inode_oid(inode));
21614 +
21615 + if (REISER4_LARGE_KEY) {
21616 + put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21617 + start += sizeof(get_inode_ordering(inode));
21618 + }
21619 + return start;
21620 +}
21621 +
21622 +/*
21623 + * extract key that was previously encoded by build_inode_onwire() at @addr
21624 + */
21625 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21626 +{
21627 + __u64 val;
21628 +
21629 + addr += dscale_read(addr, &val);
21630 + val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21631 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21632 + addr += dscale_read(addr, &val);
21633 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21634 +#if REISER4_LARGE_KEY
21635 + memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21636 + addr += sizeof key_id->ordering;
21637 +#endif
21638 + return addr;
21639 +}
21640 +
21641 +/* Make Linus happy.
21642 + Local variables:
21643 + c-indentation-style: "K&R"
21644 + mode-name: "LC"
21645 + c-basic-offset: 8
21646 + tab-width: 8
21647 + fill-column: 120
21648 + End:
21649 +*/
21650 diff -urN linux-2.6.23.orig/fs/reiser4/kassign.h linux-2.6.23/fs/reiser4/kassign.h
21651 --- linux-2.6.23.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21652 +++ linux-2.6.23/fs/reiser4/kassign.h 2007-12-04 22:59:05.794368893 +0300
21653 @@ -0,0 +1,110 @@
21654 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21655 + * reiser4/README */
21656 +
21657 +/* Key assignment policy interface. See kassign.c for details. */
21658 +
21659 +#if !defined( __KASSIGN_H__ )
21660 +#define __KASSIGN_H__
21661 +
21662 +#include "forward.h"
21663 +#include "key.h"
21664 +#include "dformat.h"
21665 +
21666 +#include <linux/types.h> /* for __u?? */
21667 +#include <linux/fs.h> /* for struct super_block, etc */
21668 +#include <linux/dcache.h> /* for struct qstr */
21669 +
21670 +/* key assignment functions */
21671 +
21672 +/* Information from which key of file stat-data can be uniquely
21673 + restored. This depends on key assignment policy for
21674 + stat-data. Currently it's enough to store object id and locality id
21675 + (60+60==120) bits, because minor packing locality and offset of
21676 + stat-data key are always known constants: KEY_SD_MINOR and 0
21677 + respectively. For simplicity 4 bits are wasted in each id, and just
21678 + two 64 bit integers are stored.
21679 +
21680 + This field has to be byte-aligned, because we don't want to waste
21681 + space in directory entries. There is another side of a coin of
21682 + course: we waste CPU and bus bandwidth in stead, by copying data back
21683 + and forth.
21684 +
21685 + Next optimization: &obj_key_id is mainly used to address stat data from
21686 + directory entries. Under the assumption that majority of files only have
21687 + only name (one hard link) from *the* parent directory it seems reasonable
21688 + to only store objectid of stat data and take its locality from key of
21689 + directory item.
21690 +
21691 + This requires some flag to be added to the &obj_key_id to distinguish
21692 + between these two cases. Remaining bits in flag byte are then asking to be
21693 + used to store file type.
21694 +
21695 + This optimization requires changes in directory item handling code.
21696 +
21697 +*/
21698 +typedef struct obj_key_id {
21699 + d8 locality[sizeof(__u64)];
21700 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21701 + )
21702 + d8 objectid[sizeof(__u64)];
21703 +}
21704 +obj_key_id;
21705 +
21706 +/* Information sufficient to uniquely identify directory entry within
21707 + compressed directory item.
21708 +
21709 + For alignment issues see &obj_key_id above.
21710 +*/
21711 +typedef struct de_id {
21712 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21713 + d8 objectid[sizeof(__u64)];
21714 + d8 offset[sizeof(__u64)];
21715 +}
21716 +de_id;
21717 +
21718 +extern int inode_onwire_size(const struct inode *obj);
21719 +extern char *build_inode_onwire(const struct inode *obj, char *area);
21720 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21721 +
21722 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21723 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21724 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21725 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21726 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
21727 + de_id * id);
21728 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21729 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21730 + reiser4_key * key);
21731 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21732 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21733 +
21734 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21735 +extern void build_entry_key_common(const struct inode *dir,
21736 + const struct qstr *name,
21737 + reiser4_key * result);
21738 +extern void build_entry_key_stable_entry(const struct inode *dir,
21739 + const struct qstr *name,
21740 + reiser4_key * result);
21741 +extern int is_dot_key(const reiser4_key * key);
21742 +extern reiser4_key *build_sd_key(const struct inode *target,
21743 + reiser4_key * result);
21744 +
21745 +extern int is_longname_key(const reiser4_key * key);
21746 +extern int is_longname(const char *name, int len);
21747 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21748 +extern char *reiser4_unpack_string(__u64 value, char *buf);
21749 +extern void complete_entry_key(const struct inode *dir, const char *name,
21750 + int len, reiser4_key *result);
21751 +
21752 +/* __KASSIGN_H__ */
21753 +#endif
21754 +
21755 +/* Make Linus happy.
21756 + Local variables:
21757 + c-indentation-style: "K&R"
21758 + mode-name: "LC"
21759 + c-basic-offset: 8
21760 + tab-width: 8
21761 + fill-column: 120
21762 + End:
21763 +*/
21764 diff -urN linux-2.6.23.orig/fs/reiser4/Kconfig linux-2.6.23/fs/reiser4/Kconfig
21765 --- linux-2.6.23.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21766 +++ linux-2.6.23/fs/reiser4/Kconfig 2007-12-04 16:49:30.000000000 +0300
21767 @@ -0,0 +1,34 @@
21768 +config REISER4_FS
21769 + tristate "Reiser4 (EXPERIMENTAL)"
21770 + depends on EXPERIMENTAL
21771 + select ZLIB_INFLATE
21772 + select ZLIB_DEFLATE
21773 + select LZO_COMPRESS
21774 + select LZO_DECOMPRESS
21775 + select CRYPTO
21776 + help
21777 + Reiser4 is a filesystem that performs all filesystem operations
21778 + as atomic transactions, which means that it either performs a
21779 + write, or it does not, and in the event of a crash it does not
21780 + partially perform it or corrupt it.
21781 +
21782 + It stores files in dancing trees, which are like balanced trees but
21783 + faster. It packs small files together so that they share blocks
21784 + without wasting space. This means you can use it to store really
21785 + small files. It also means that it saves you disk space. It avoids
21786 + hassling you with anachronisms like having a maximum number of
21787 + inodes, and wasting space if you use less than that number.
21788 +
21789 + Reiser4 is a distinct filesystem type from reiserfs (V3).
21790 + It's therefore not possible to use reiserfs file systems
21791 + with reiser4.
21792 +
21793 + To learn more about reiser4, go to http://www.namesys.com
21794 +
21795 +config REISER4_DEBUG
21796 + bool "Enable reiser4 debug mode"
21797 + depends on REISER4_FS
21798 + help
21799 + Don't use this unless you are debugging reiser4.
21800 +
21801 + If unsure, say N.
21802 diff -urN linux-2.6.23.orig/fs/reiser4/key.c linux-2.6.23/fs/reiser4/key.c
21803 --- linux-2.6.23.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21804 +++ linux-2.6.23/fs/reiser4/key.c 2007-12-04 16:49:30.000000000 +0300
21805 @@ -0,0 +1,137 @@
21806 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21807 +
21808 +/* Key manipulations. */
21809 +
21810 +#include "debug.h"
21811 +#include "key.h"
21812 +#include "super.h"
21813 +#include "reiser4.h"
21814 +
21815 +#include <linux/types.h> /* for __u?? */
21816 +
21817 +/* Minimal possible key: all components are zero. It is presumed that this is
21818 + independent of key scheme. */
21819 +static const reiser4_key MINIMAL_KEY = {
21820 + .el = {
21821 + 0ull,
21822 + ON_LARGE_KEY(0ull,)
21823 + 0ull,
21824 + 0ull
21825 + }
21826 +};
21827 +
21828 +/* Maximal possible key: all components are ~0. It is presumed that this is
21829 + independent of key scheme. */
21830 +static const reiser4_key MAXIMAL_KEY = {
21831 + .el = {
21832 + __constant_cpu_to_le64(~0ull),
21833 + ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21834 + __constant_cpu_to_le64(~0ull),
21835 + __constant_cpu_to_le64(~0ull)
21836 + }
21837 +};
21838 +
21839 +/* Initialize key. */
21840 +void reiser4_key_init(reiser4_key * key /* key to init */ )
21841 +{
21842 + assert("nikita-1169", key != NULL);
21843 + memset(key, 0, sizeof *key);
21844 +}
21845 +
21846 +/* minimal possible key in the tree. Return pointer to the static storage. */
21847 +const reiser4_key *reiser4_min_key(void)
21848 +{
21849 + return &MINIMAL_KEY;
21850 +}
21851 +
21852 +/* maximum possible key in the tree. Return pointer to the static storage. */
21853 +const reiser4_key *reiser4_max_key(void)
21854 +{
21855 + return &MAXIMAL_KEY;
21856 +}
21857 +
21858 +#if REISER4_DEBUG
21859 +/* debugging aid: print symbolic name of key type */
21860 +static const char *type_name(unsigned int key_type /* key type */ )
21861 +{
21862 + switch (key_type) {
21863 + case KEY_FILE_NAME_MINOR:
21864 + return "file name";
21865 + case KEY_SD_MINOR:
21866 + return "stat data";
21867 + case KEY_ATTR_NAME_MINOR:
21868 + return "attr name";
21869 + case KEY_ATTR_BODY_MINOR:
21870 + return "attr body";
21871 + case KEY_BODY_MINOR:
21872 + return "file body";
21873 + default:
21874 + return "unknown";
21875 + }
21876 +}
21877 +
21878 +/* debugging aid: print human readable information about key */
21879 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
21880 + const reiser4_key * key /* key to print */ )
21881 +{
21882 + /* turn bold on */
21883 + /* printf ("\033[1m"); */
21884 + if (key == NULL)
21885 + printk("%s: null key\n", prefix);
21886 + else {
21887 + if (REISER4_LARGE_KEY)
21888 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21889 + get_key_locality(key),
21890 + get_key_type(key),
21891 + get_key_ordering(key),
21892 + get_key_band(key),
21893 + get_key_objectid(key), get_key_offset(key));
21894 + else
21895 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21896 + get_key_locality(key),
21897 + get_key_type(key),
21898 + get_key_band(key),
21899 + get_key_objectid(key), get_key_offset(key));
21900 + /*
21901 + * if this is a key of directory entry, try to decode part of
21902 + * a name stored in the key, and output it.
21903 + */
21904 + if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21905 + char buf[DE_NAME_BUF_LEN];
21906 + char *c;
21907 +
21908 + c = buf;
21909 + c = reiser4_unpack_string(get_key_ordering(key), c);
21910 + reiser4_unpack_string(get_key_fulloid(key), c);
21911 + printk("[%s", buf);
21912 + if (is_longname_key(key))
21913 + /*
21914 + * only part of the name is stored in the key.
21915 + */
21916 + printk("...]\n");
21917 + else {
21918 + /*
21919 + * whole name is stored in the key.
21920 + */
21921 + reiser4_unpack_string(get_key_offset(key), buf);
21922 + printk("%s]\n", buf);
21923 + }
21924 + } else {
21925 + printk("[%s]\n", type_name(get_key_type(key)));
21926 + }
21927 + }
21928 + /* turn bold off */
21929 + /* printf ("\033[m\017"); */
21930 +}
21931 +
21932 +#endif
21933 +
21934 +/* Make Linus happy.
21935 + Local variables:
21936 + c-indentation-style: "K&R"
21937 + mode-name: "LC"
21938 + c-basic-offset: 8
21939 + tab-width: 8
21940 + fill-column: 120
21941 + End:
21942 +*/
21943 diff -urN linux-2.6.23.orig/fs/reiser4/key.h linux-2.6.23/fs/reiser4/key.h
21944 --- linux-2.6.23.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21945 +++ linux-2.6.23/fs/reiser4/key.h 2007-12-04 16:49:30.000000000 +0300
21946 @@ -0,0 +1,384 @@
21947 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21948 +
21949 +/* Declarations of key-related data-structures and operations on keys. */
21950 +
21951 +#if !defined( __REISER4_KEY_H__ )
21952 +#define __REISER4_KEY_H__
21953 +
21954 +#include "dformat.h"
21955 +#include "forward.h"
21956 +#include "debug.h"
21957 +
21958 +#include <linux/types.h> /* for __u?? */
21959 +
21960 +/* Operations on keys in reiser4 tree */
21961 +
21962 +/* No access to any of these fields shall be done except via a
21963 + wrapping macro/function, and that wrapping macro/function shall
21964 + convert to little endian order. Compare keys will consider cpu byte order. */
21965 +
21966 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21967 + which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21968 + within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
21969 + approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21970 + right one. */
21971 +
21972 +/* possible values for minor packing locality (4 bits required) */
21973 +typedef enum {
21974 + /* file name */
21975 + KEY_FILE_NAME_MINOR = 0,
21976 + /* stat-data */
21977 + KEY_SD_MINOR = 1,
21978 + /* file attribute name */
21979 + KEY_ATTR_NAME_MINOR = 2,
21980 + /* file attribute value */
21981 + KEY_ATTR_BODY_MINOR = 3,
21982 + /* file body (tail or extent) */
21983 + KEY_BODY_MINOR = 4,
21984 +} key_minor_locality;
21985 +
21986 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21987 + Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21988 + and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
21989 + segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21990 + block_alloc.c to check the node type when deciding where to allocate the node.
21991 +
21992 + The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
21993 + should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
21994 + current implementation tails have a different minor packing locality from extents, and no files have both extents and
21995 + tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
21996 +*/
21997 +
21998 +/* Arbitrary major packing localities can be assigned to objects using
21999 + the reiser4(filenameA/..packing<=some_number) system call.
22000 +
22001 + In reiser4, the creat() syscall creates a directory
22002 +
22003 + whose default flow (that which is referred to if the directory is
22004 + read as a file) is the traditional unix file body.
22005 +
22006 + whose directory plugin is the 'filedir'
22007 +
22008 + whose major packing locality is that of the parent of the object created.
22009 +
22010 + The static_stat item is a particular commonly used directory
22011 + compression (the one for normal unix files).
22012 +
22013 + The filedir plugin checks to see if the static_stat item exists.
22014 + There is a unique key for static_stat. If yes, then it uses the
22015 + static_stat item for all of the values that it contains. The
22016 + static_stat item contains a flag for each stat it contains which
22017 + indicates whether one should look outside the static_stat item for its
22018 + contents.
22019 +*/
22020 +
22021 +/* offset of fields in reiser4_key. Value of each element of this enum
22022 + is index within key (thought as array of __u64's) where this field
22023 + is. */
22024 +typedef enum {
22025 + /* major "locale", aka dirid. Sits in 1st element */
22026 + KEY_LOCALITY_INDEX = 0,
22027 + /* minor "locale", aka item type. Sits in 1st element */
22028 + KEY_TYPE_INDEX = 0,
22029 + ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22030 + /* "object band". Sits in 2nd element */
22031 + KEY_BAND_INDEX,
22032 + /* objectid. Sits in 2nd element */
22033 + KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22034 + /* full objectid. Sits in 2nd element */
22035 + KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22036 + /* Offset. Sits in 3rd element */
22037 + KEY_OFFSET_INDEX,
22038 + /* Name hash. Sits in 3rd element */
22039 + KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22040 + KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22041 + KEY_LAST_INDEX
22042 +} reiser4_key_field_index;
22043 +
22044 +/* key in reiser4 internal "balanced" tree. It is just array of three
22045 + 64bit integers in disk byte order (little-endian by default). This
22046 + array is actually indexed by reiser4_key_field. Each __u64 within
22047 + this array is called "element". Logical key component encoded within
22048 + elements are called "fields".
22049 +
22050 + We declare this as union with second component dummy to suppress
22051 + inconvenient array<->pointer casts implied in C. */
22052 +union reiser4_key {
22053 + __le64 el[KEY_LAST_INDEX];
22054 + int pad;
22055 +};
22056 +
22057 +/* bitmasks showing where within reiser4_key particular key is stored. */
22058 +/* major locality occupies higher 60 bits of the first element */
22059 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22060 +
22061 +/* minor locality occupies lower 4 bits of the first element */
22062 +#define KEY_TYPE_MASK 0xfull
22063 +
22064 +/* controversial band occupies higher 4 bits of the 2nd element */
22065 +#define KEY_BAND_MASK 0xf000000000000000ull
22066 +
22067 +/* objectid occupies lower 60 bits of the 2nd element */
22068 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22069 +
22070 +/* full 64bit objectid*/
22071 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22072 +
22073 +/* offset is just 3rd L.M.Nt itself */
22074 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22075 +
22076 +/* ordering is whole second element */
22077 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22078 +
22079 +/* how many bits key element should be shifted to left to get particular field */
22080 +typedef enum {
22081 + KEY_LOCALITY_SHIFT = 4,
22082 + KEY_TYPE_SHIFT = 0,
22083 + KEY_BAND_SHIFT = 60,
22084 + KEY_OBJECTID_SHIFT = 0,
22085 + KEY_FULLOID_SHIFT = 0,
22086 + KEY_OFFSET_SHIFT = 0,
22087 + KEY_ORDERING_SHIFT = 0,
22088 +} reiser4_key_field_shift;
22089 +
22090 +static inline __u64
22091 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22092 +{
22093 + assert("nikita-753", key != NULL);
22094 + assert("nikita-754", off < KEY_LAST_INDEX);
22095 + return le64_to_cpu(get_unaligned(&key->el[off]));
22096 +}
22097 +
22098 +static inline void
22099 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22100 +{
22101 + assert("nikita-755", key != NULL);
22102 + assert("nikita-756", off < KEY_LAST_INDEX);
22103 + put_unaligned(cpu_to_le64(value), &key->el[off]);
22104 +}
22105 +
22106 +/* macro to define getter and setter functions for field F with type T */
22107 +#define DEFINE_KEY_FIELD( L, U, T ) \
22108 +static inline T get_key_ ## L ( const reiser4_key *key ) \
22109 +{ \
22110 + assert( "nikita-750", key != NULL ); \
22111 + return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22112 + KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22113 +} \
22114 + \
22115 +static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22116 +{ \
22117 + __u64 el; \
22118 + \
22119 + assert( "nikita-752", key != NULL ); \
22120 + \
22121 + el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22122 + /* clear field bits in the key */ \
22123 + el &= ~KEY_ ## U ## _MASK; \
22124 + /* actually it should be \
22125 + \
22126 + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22127 + \
22128 + but we trust user to never pass values that wouldn't fit \
22129 + into field. Clearing extra bits is one operation, but this \
22130 + function is time-critical. \
22131 + But check this in assertion. */ \
22132 + assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22133 + ~KEY_ ## U ## _MASK ) == 0 ); \
22134 + el |= ( loc << KEY_ ## U ## _SHIFT ); \
22135 + set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22136 +}
22137 +
22138 +typedef __u64 oid_t;
22139 +
22140 +/* define get_key_locality(), set_key_locality() */
22141 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22142 +/* define get_key_type(), set_key_type() */
22143 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22144 +/* define get_key_band(), set_key_band() */
22145 +DEFINE_KEY_FIELD(band, BAND, __u64);
22146 +/* define get_key_objectid(), set_key_objectid() */
22147 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22148 +/* define get_key_fulloid(), set_key_fulloid() */
22149 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22150 +/* define get_key_offset(), set_key_offset() */
22151 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22152 +#if (REISER4_LARGE_KEY)
22153 +/* define get_key_ordering(), set_key_ordering() */
22154 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22155 +#else
22156 +static inline __u64 get_key_ordering(const reiser4_key * key)
22157 +{
22158 + return 0;
22159 +}
22160 +
22161 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22162 +{
22163 +}
22164 +#endif
22165 +
22166 +/* key comparison result */
22167 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22168 + EQUAL_TO = 0, /* if keys are equal */
22169 + GREATER_THAN = +1 /* if first key is greater than second */
22170 +} cmp_t;
22171 +
22172 +void reiser4_key_init(reiser4_key * key);
22173 +
22174 +/* minimal possible key in the tree. Return pointer to the static storage. */
22175 +extern const reiser4_key *reiser4_min_key(void);
22176 +extern const reiser4_key *reiser4_max_key(void);
22177 +
22178 +/* helper macro for keycmp() */
22179 +#define KEY_DIFF(k1, k2, field) \
22180 +({ \
22181 + typeof (get_key_ ## field (k1)) f1; \
22182 + typeof (get_key_ ## field (k2)) f2; \
22183 + \
22184 + f1 = get_key_ ## field (k1); \
22185 + f2 = get_key_ ## field (k2); \
22186 + \
22187 + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22188 +})
22189 +
22190 +/* helper macro for keycmp() */
22191 +#define KEY_DIFF_EL(k1, k2, off) \
22192 +({ \
22193 + __u64 e1; \
22194 + __u64 e2; \
22195 + \
22196 + e1 = get_key_el(k1, off); \
22197 + e2 = get_key_el(k2, off); \
22198 + \
22199 + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22200 +})
22201 +
22202 +/* compare `k1' and `k2'. This function is a heart of "key allocation
22203 + policy". All you need to implement new policy is to add yet another
22204 + clause here. */
22205 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22206 + const reiser4_key * k2 /* second key to compare */ )
22207 +{
22208 + cmp_t result;
22209 +
22210 + /*
22211 + * This function is the heart of reiser4 tree-routines. Key comparison
22212 + * is among most heavily used operations in the file system.
22213 + */
22214 +
22215 + assert("nikita-439", k1 != NULL);
22216 + assert("nikita-440", k2 != NULL);
22217 +
22218 + /* there is no actual branch here: condition is compile time constant
22219 + * and constant folding and propagation ensures that only one branch
22220 + * is actually compiled in. */
22221 +
22222 + if (REISER4_PLANA_KEY_ALLOCATION) {
22223 + /* if physical order of fields in a key is identical
22224 + with logical order, we can implement key comparison
22225 + as three 64bit comparisons. */
22226 + /* logical order of fields in plan-a:
22227 + locality->type->objectid->offset. */
22228 + /* compare locality and type at once */
22229 + result = KEY_DIFF_EL(k1, k2, 0);
22230 + if (result == EQUAL_TO) {
22231 + /* compare objectid (and band if it's there) */
22232 + result = KEY_DIFF_EL(k1, k2, 1);
22233 + /* compare offset */
22234 + if (result == EQUAL_TO) {
22235 + result = KEY_DIFF_EL(k1, k2, 2);
22236 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22237 + result = KEY_DIFF_EL(k1, k2, 3);
22238 + }
22239 + }
22240 + }
22241 + } else if (REISER4_3_5_KEY_ALLOCATION) {
22242 + result = KEY_DIFF(k1, k2, locality);
22243 + if (result == EQUAL_TO) {
22244 + result = KEY_DIFF(k1, k2, objectid);
22245 + if (result == EQUAL_TO) {
22246 + result = KEY_DIFF(k1, k2, type);
22247 + if (result == EQUAL_TO)
22248 + result = KEY_DIFF(k1, k2, offset);
22249 + }
22250 + }
22251 + } else
22252 + impossible("nikita-441", "Unknown key allocation scheme!");
22253 + return result;
22254 +}
22255 +
22256 +/* true if @k1 equals @k2 */
22257 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22258 + const reiser4_key * k2 /* second key to compare */ )
22259 +{
22260 + assert("nikita-1879", k1 != NULL);
22261 + assert("nikita-1880", k2 != NULL);
22262 + return !memcmp(k1, k2, sizeof *k1);
22263 +}
22264 +
22265 +/* true if @k1 is less than @k2 */
22266 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22267 + const reiser4_key * k2 /* second key to compare */ )
22268 +{
22269 + assert("nikita-1952", k1 != NULL);
22270 + assert("nikita-1953", k2 != NULL);
22271 + return keycmp(k1, k2) == LESS_THAN;
22272 +}
22273 +
22274 +/* true if @k1 is less than or equal to @k2 */
22275 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22276 + const reiser4_key * k2 /* second key to compare */ )
22277 +{
22278 + assert("nikita-1954", k1 != NULL);
22279 + assert("nikita-1955", k2 != NULL);
22280 + return keycmp(k1, k2) != GREATER_THAN;
22281 +}
22282 +
22283 +/* true if @k1 is greater than @k2 */
22284 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22285 + const reiser4_key * k2 /* second key to compare */ )
22286 +{
22287 + assert("nikita-1959", k1 != NULL);
22288 + assert("nikita-1960", k2 != NULL);
22289 + return keycmp(k1, k2) == GREATER_THAN;
22290 +}
22291 +
22292 +/* true if @k1 is greater than or equal to @k2 */
22293 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22294 + const reiser4_key * k2 /* second key to compare */ )
22295 +{
22296 + assert("nikita-1956", k1 != NULL);
22297 + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22298 + * November 3: Laika */
22299 + return keycmp(k1, k2) != LESS_THAN;
22300 +}
22301 +
22302 +static inline void prefetchkey(reiser4_key * key)
22303 +{
22304 + prefetch(key);
22305 + prefetch(&key->el[KEY_CACHELINE_END]);
22306 +}
22307 +
22308 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22309 + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22310 +/* size of a buffer suitable to hold human readable key representation */
22311 +#define KEY_BUF_LEN (80)
22312 +
22313 +#if REISER4_DEBUG
22314 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22315 +#else
22316 +#define reiser4_print_key(p,k) noop
22317 +#endif
22318 +
22319 +/* __FS_REISERFS_KEY_H__ */
22320 +#endif
22321 +
22322 +/* Make Linus happy.
22323 + Local variables:
22324 + c-indentation-style: "K&R"
22325 + mode-name: "LC"
22326 + c-basic-offset: 8
22327 + tab-width: 8
22328 + fill-column: 120
22329 + End:
22330 +*/
22331 diff -urN linux-2.6.23.orig/fs/reiser4/ktxnmgrd.c linux-2.6.23/fs/reiser4/ktxnmgrd.c
22332 --- linux-2.6.23.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22333 +++ linux-2.6.23/fs/reiser4/ktxnmgrd.c 2007-12-04 16:49:30.000000000 +0300
22334 @@ -0,0 +1,214 @@
22335 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22336 +/* Transaction manager daemon. */
22337 +
22338 +/*
22339 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22340 + * needed/important for the following reasons:
22341 + *
22342 + * 1. in reiser4 atom is not committed immediately when last transaction
22343 + * handle closes, unless atom is either too old or too large (see
22344 + * atom_should_commit()). This is done to avoid committing too frequently.
22345 + * because:
22346 + *
22347 + * 2. sometimes we don't want to commit atom when closing last transaction
22348 + * handle even if it is old and fat enough. For example, because we are at
22349 + * this point under directory semaphore, and committing would stall all
22350 + * accesses to this directory.
22351 + *
22352 + * ktxnmgrd binds its time sleeping on condition variable. When is awakes
22353 + * either due to (tunable) timeout or because it was explicitly woken up by
22354 + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
22355 + * eligible.
22356 + *
22357 + */
22358 +
22359 +#include "debug.h"
22360 +#include "txnmgr.h"
22361 +#include "tree.h"
22362 +#include "ktxnmgrd.h"
22363 +#include "super.h"
22364 +#include "reiser4.h"
22365 +
22366 +#include <linux/sched.h> /* for struct task_struct */
22367 +#include <linux/wait.h>
22368 +#include <linux/suspend.h>
22369 +#include <linux/kernel.h>
22370 +#include <linux/writeback.h>
22371 +#include <linux/kthread.h>
22372 +#include <linux/freezer.h>
22373 +
22374 +static int scan_mgr(struct super_block *);
22375 +
22376 +/*
22377 + * change current->comm so that ps, top, and friends will see changed
22378 + * state. This serves no useful purpose whatsoever, but also costs nothing. May
22379 + * be it will make lonely system administrator feeling less alone at 3 A.M.
22380 + */
22381 +#define set_comm( state ) \
22382 + snprintf( current -> comm, sizeof( current -> comm ), \
22383 + "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22384 +
22385 +/**
22386 + * ktxnmgrd - kernel txnmgr daemon
22387 + * @arg: pointer to super block
22388 + *
22389 + * The background transaction manager daemon, started as a kernel thread during
22390 + * reiser4 initialization.
22391 + */
22392 +static int ktxnmgrd(void *arg)
22393 +{
22394 + struct super_block *super;
22395 + ktxnmgrd_context *ctx;
22396 + txn_mgr *mgr;
22397 + int done = 0;
22398 +
22399 + super = arg;
22400 + mgr = &get_super_private(super)->tmgr;
22401 +
22402 + /*
22403 + * do_fork() just copies task_struct into the new thread. ->fs_context
22404 + * shouldn't be copied of course. This shouldn't be a problem for the
22405 + * rest of the code though.
22406 + */
22407 + current->journal_info = NULL;
22408 + ctx = mgr->daemon;
22409 + while (1) {
22410 + try_to_freeze();
22411 + set_comm("wait");
22412 + {
22413 + DEFINE_WAIT(__wait);
22414 +
22415 + prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22416 + if (kthread_should_stop()) {
22417 + done = 1;
22418 + } else
22419 + schedule_timeout(ctx->timeout);
22420 + finish_wait(&ctx->wait, &__wait);
22421 + }
22422 + if (done)
22423 + break;
22424 + set_comm("run");
22425 + spin_lock(&ctx->guard);
22426 + /*
22427 + * wait timed out or ktxnmgrd was woken up by explicit request
22428 + * to commit something. Scan list of atoms in txnmgr and look
22429 + * for too old atoms.
22430 + */
22431 + do {
22432 + ctx->rescan = 0;
22433 + scan_mgr(super);
22434 + spin_lock(&ctx->guard);
22435 + if (ctx->rescan) {
22436 + /*
22437 + * the list could be modified while ctx
22438 + * spinlock was released, we have to repeat
22439 + * scanning from the beginning
22440 + */
22441 + break;
22442 + }
22443 + } while (ctx->rescan);
22444 + spin_unlock(&ctx->guard);
22445 + }
22446 + return 0;
22447 +}
22448 +
22449 +#undef set_comm
22450 +
22451 +/**
22452 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22453 + * @super: pointer to super block
22454 + *
22455 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22456 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22457 + */
22458 +int reiser4_init_ktxnmgrd(struct super_block *super)
22459 +{
22460 + txn_mgr *mgr;
22461 + ktxnmgrd_context *ctx;
22462 +
22463 + mgr = &get_super_private(super)->tmgr;
22464 +
22465 + assert("zam-1014", mgr->daemon == NULL);
22466 +
22467 + ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22468 + if (!ctx)
22469 + return RETERR(-ENOMEM);
22470 +
22471 + assert("nikita-2442", ctx != NULL);
22472 +
22473 + init_waitqueue_head(&ctx->wait);
22474 +
22475 + /*kcond_init(&ctx->startup);*/
22476 + spin_lock_init(&ctx->guard);
22477 + ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22478 + ctx->rescan = 1;
22479 + mgr->daemon = ctx;
22480 +
22481 + ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22482 + if (IS_ERR(ctx->tsk)) {
22483 + int ret = PTR_ERR(ctx->tsk);
22484 + mgr->daemon = NULL;
22485 + kfree(ctx);
22486 + return RETERR(ret);
22487 + }
22488 + return 0;
22489 +}
22490 +
22491 +void ktxnmgrd_kick(txn_mgr *mgr)
22492 +{
22493 + assert("nikita-3234", mgr != NULL);
22494 + assert("nikita-3235", mgr->daemon != NULL);
22495 + wake_up(&mgr->daemon->wait);
22496 +}
22497 +
22498 +int is_current_ktxnmgrd(void)
22499 +{
22500 + return (get_current_super_private()->tmgr.daemon->tsk == current);
22501 +}
22502 +
22503 +/**
22504 + * scan_mgr - commit atoms which are to be committed
22505 + * @super: super block to commit atoms of
22506 + *
22507 + * Commits old atoms.
22508 + */
22509 +static int scan_mgr(struct super_block *super)
22510 +{
22511 + int ret;
22512 + reiser4_context ctx;
22513 +
22514 + init_stack_context(&ctx, super);
22515 +
22516 + ret = commit_some_atoms(&get_super_private(super)->tmgr);
22517 +
22518 + reiser4_exit_context(&ctx);
22519 + return ret;
22520 +}
22521 +
22522 +/**
22523 + * reiser4_done_ktxnmgrd - stop kernel thread and frees ktxnmgrd context
22524 + * @mgr:
22525 + *
22526 + * This is called on umount. Stops ktxnmgrd and free t
22527 + */
22528 +void reiser4_done_ktxnmgrd(struct super_block *super)
22529 +{
22530 + txn_mgr *mgr;
22531 +
22532 + mgr = &get_super_private(super)->tmgr;
22533 + assert("zam-1012", mgr->daemon != NULL);
22534 +
22535 + kthread_stop(mgr->daemon->tsk);
22536 + kfree(mgr->daemon);
22537 + mgr->daemon = NULL;
22538 +}
22539 +
22540 +/*
22541 + * Local variables:
22542 + * c-indentation-style: "K&R"
22543 + * mode-name: "LC"
22544 + * c-basic-offset: 8
22545 + * tab-width: 8
22546 + * fill-column: 120
22547 + * End:
22548 + */
22549 diff -urN linux-2.6.23.orig/fs/reiser4/ktxnmgrd.h linux-2.6.23/fs/reiser4/ktxnmgrd.h
22550 --- linux-2.6.23.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22551 +++ linux-2.6.23/fs/reiser4/ktxnmgrd.h 2007-12-04 16:49:30.000000000 +0300
22552 @@ -0,0 +1,52 @@
22553 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22554 + * reiser4/README */
22555 +
22556 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22557 +
22558 +#ifndef __KTXNMGRD_H__
22559 +#define __KTXNMGRD_H__
22560 +
22561 +#include "txnmgr.h"
22562 +
22563 +#include <linux/fs.h>
22564 +#include <linux/wait.h>
22565 +#include <linux/completion.h>
22566 +#include <linux/spinlock.h>
22567 +#include <asm/atomic.h>
22568 +#include <linux/sched.h> /* for struct task_struct */
22569 +
22570 +/* in this structure all data necessary to start up, shut down and communicate
22571 + * with ktxnmgrd are kept. */
22572 +struct ktxnmgrd_context {
22573 + /* wait queue head on which ktxnmgrd sleeps */
22574 + wait_queue_head_t wait;
22575 + /* spin lock protecting all fields of this structure */
22576 + spinlock_t guard;
22577 + /* timeout of sleeping on ->wait */
22578 + signed long timeout;
22579 + /* kernel thread running ktxnmgrd */
22580 + struct task_struct *tsk;
22581 + /* list of all file systems served by this ktxnmgrd */
22582 + struct list_head queue;
22583 + /* should ktxnmgrd repeat scanning of atoms? */
22584 + unsigned int rescan:1;
22585 +};
22586 +
22587 +extern int reiser4_init_ktxnmgrd(struct super_block *);
22588 +extern void reiser4_done_ktxnmgrd(struct super_block *);
22589 +
22590 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22591 +extern int is_current_ktxnmgrd(void);
22592 +
22593 +/* __KTXNMGRD_H__ */
22594 +#endif
22595 +
22596 +/* Make Linus happy.
22597 + Local variables:
22598 + c-indentation-style: "K&R"
22599 + mode-name: "LC"
22600 + c-basic-offset: 8
22601 + tab-width: 8
22602 + fill-column: 120
22603 + End:
22604 +*/
22605 diff -urN linux-2.6.23.orig/fs/reiser4/lock.c linux-2.6.23/fs/reiser4/lock.c
22606 --- linux-2.6.23.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22607 +++ linux-2.6.23/fs/reiser4/lock.c 2007-12-04 16:49:30.000000000 +0300
22608 @@ -0,0 +1,1232 @@
22609 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22610 + * reiser4/README */
22611 +
22612 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22613 + order. V4 balances the tree from the bottom up, and searches the tree from
22614 + the top down, and that is really the way we want it, so tradition won't work
22615 + for us.
22616 +
22617 + Instead we have two lock orderings, a high priority lock ordering, and a low
22618 + priority lock ordering. Each node in the tree has a lock in its znode.
22619 +
22620 + Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22621 + has a set (maybe empty) of already locked nodes ("process locked set"). Each
22622 + process may have a pending lock request to a node locked by another process.
22623 + Note: we lock and unlock, but do not transfer locks: it is possible
22624 + transferring locks instead would save some bus locking....
22625 +
22626 + Deadlock occurs when we have a loop constructed from process locked sets and
22627 + lock request vectors.
22628 +
22629 + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22630 + memory is extended with "znodes" with which we connect nodes with their left
22631 + and right neighbors using sibling pointers stored in the znodes. When we
22632 + perform balancing operations we often go from left to right and from right to
22633 + left.
22634 +
22635 + +-P1-+ +-P3-+
22636 + |+--+| V1 |+--+|
22637 + ||N1|| -------> ||N3||
22638 + |+--+| |+--+|
22639 + +----+ +----+
22640 + ^ |
22641 + |V2 |V3
22642 + | v
22643 + +---------P2---------+
22644 + |+--+ +--+|
22645 + ||N2| -------- |N4||
22646 + |+--+ +--+|
22647 + +--------------------+
22648 +
22649 + We solve this by ensuring that only low priority processes lock in top to
22650 + bottom order and from right to left, and high priority processes lock from
22651 + bottom to top and left to right.
22652 +
22653 + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22654 + kill those damn busy loops.
22655 + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22656 + stage) cannot be ordered that way. There are no rules what nodes can belong
22657 + to the atom and what nodes cannot. We cannot define what is right or left
22658 + direction, what is top or bottom. We can take immediate parent or side
22659 + neighbor of one node, but nobody guarantees that, say, left neighbor node is
22660 + not a far right neighbor for other nodes from the same atom. It breaks
22661 + deadlock avoidance rules and hi-low priority locking cannot be applied for
22662 + atom locks.
22663 +
22664 + How does it help to avoid deadlocks ?
22665 +
22666 + Suppose we have a deadlock with n processes. Processes from one priority
22667 + class never deadlock because they take locks in one consistent
22668 + order.
22669 +
22670 + So, any possible deadlock loop must have low priority as well as high
22671 + priority processes. There are no other lock priority levels except low and
22672 + high. We know that any deadlock loop contains at least one node locked by a
22673 + low priority process and requested by a high priority process. If this
22674 + situation is caught and resolved it is sufficient to avoid deadlocks.
22675 +
22676 + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22677 +
22678 + The deadlock prevention algorithm is based on comparing
22679 + priorities of node owners (processes which keep znode locked) and
22680 + requesters (processes which want to acquire a lock on znode). We
22681 + implement a scheme where low-priority owners yield locks to
22682 + high-priority requesters. We created a signal passing system that
22683 + is used to ask low-priority processes to yield one or more locked
22684 + znodes.
22685 +
22686 + The condition when a znode needs to change its owners is described by the
22687 + following formula:
22688 +
22689 + #############################################
22690 + # #
22691 + # (number of high-priority requesters) > 0 #
22692 + # AND #
22693 + # (numbers of high-priority owners) == 0 #
22694 + # #
22695 + #############################################
22696 +
22697 + Note that a low-priority process delays node releasing if another
22698 + high-priority process owns this node. So, slightly more strictly speaking,
22699 + to have a deadlock capable cycle you must have a loop in which a high
22700 + priority process is waiting on a low priority process to yield a node, which
22701 + is slightly different from saying a high priority process is waiting on a
22702 + node owned by a low priority process.
22703 +
22704 + It is enough to avoid deadlocks if we prevent any low-priority process from
22705 + falling asleep if its locked set contains a node which satisfies the
22706 + deadlock condition.
22707 +
22708 + That condition is implicitly or explicitly checked in all places where new
22709 + high-priority requests may be added or removed from node request queue or
22710 + high-priority process takes or releases a lock on node. The main
22711 + goal of these checks is to never lose the moment when node becomes "has
22712 + wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
22713 + at that time.
22714 +
22715 + The information about received signals is stored in the per-process
22716 + structure (lock stack) and analyzed before a low-priority process goes to
22717 + sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22718 + sleeping process up and forces him to re-check lock status and received
22719 + signal info. If "must-yield-this-lock" signals were received the locking
22720 + primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22721 +
22722 + V4 LOCKING DRAWBACKS
22723 +
22724 + If we have already balanced on one level, and we are propagating our changes
22725 + upward to a higher level, it could be very messy to surrender all locks on
22726 + the lower level because we put so much computational work into it, and
22727 + reverting them to their state before they were locked might be very complex.
22728 + We also don't want to acquire all locks before performing balancing because
22729 + that would either be almost as much work as the balancing, or it would be
22730 + too conservative and lock too much. We want balancing to be done only at
22731 + high priority. Yet, we might want to go to the left one node and use some
22732 + of its empty space... So we make one attempt at getting the node to the left
22733 + using try_lock, and if it fails we do without it, because we didn't really
22734 + need it, it was only a nice to have.
22735 +
22736 + LOCK STRUCTURES DESCRIPTION
22737 +
22738 + The following data structures are used in the reiser4 locking
22739 + implementation:
22740 +
22741 + All fields related to long-term locking are stored in znode->lock.
22742 +
22743 + The lock stack is a per thread object. It owns all znodes locked by the
22744 + thread. One znode may be locked by several threads in case of read lock or
22745 + one znode may be write locked by one thread several times. The special link
22746 + objects (lock handles) support n<->m relation between znodes and lock
22747 + owners.
22748 +
22749 + <Thread 1> <Thread 2>
22750 +
22751 + +---------+ +---------+
22752 + | LS1 | | LS2 |
22753 + +---------+ +---------+
22754 + ^ ^
22755 + |---------------+ +----------+
22756 + v v v v
22757 + +---------+ +---------+ +---------+ +---------+
22758 + | LH1 | | LH2 | | LH3 | | LH4 |
22759 + +---------+ +---------+ +---------+ +---------+
22760 + ^ ^ ^ ^
22761 + | +------------+ |
22762 + v v v
22763 + +---------+ +---------+ +---------+
22764 + | Z1 | | Z2 | | Z3 |
22765 + +---------+ +---------+ +---------+
22766 +
22767 + Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22768 + picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22769 + LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22770 + Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22771 + list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22772 + is locked (for read) twice by different threads and two lock handles are on
22773 + its list. Each lock handle represents a single relation of a locking of a
22774 + znode by a thread. Locking of a znode is an establishing of a locking
22775 + relation between the lock stack and the znode by adding of a new lock handle
22776 + to a list of lock handles, the lock stack. The lock stack links all lock
22777 + handles for all znodes locked by the lock stack. The znode list groups all
22778 + lock handles for all locks stacks which locked the znode.
22779 +
22780 + Yet another relation may exist between znode and lock owners. If lock
22781 + procedure cannot immediately take lock on an object it adds the lock owner
22782 + on special `requestors' list belongs to znode. That list represents a
22783 + queue of pending lock requests. Because one lock owner may request
22784 + only one lock object at a time, it is a 1->n relation between lock objects
22785 + and a lock owner implemented as it is described above. Full information
22786 + (priority, pointers to lock and link objects) about each lock request is
22787 + stored in lock owner structure in `request' field.
22788 +
22789 + SHORT_TERM LOCKING
22790 +
22791 + This is a list of primitive operations over lock stacks / lock handles /
22792 + znodes and locking descriptions for them.
22793 +
22794 + 1. locking / unlocking which is done by two list insertion/deletion, one
22795 + to/from znode's list of lock handles, another one is to/from lock stack's
22796 + list of lock handles. The first insertion is protected by
22797 + znode->lock.guard spinlock. The list owned by the lock stack can be
22798 + modified only by thread who owns the lock stack and nobody else can
22799 + modify/read it. There is nothing to be protected by a spinlock or
22800 + something else.
22801 +
22802 + 2. adding/removing a lock request to/from znode requesters list. The rule is
22803 + that znode->lock.guard spinlock should be taken for this.
22804 +
22805 + 3. we can traverse list of lock handles and use references to lock stacks who
22806 + locked given znode if znode->lock.guard spinlock is taken.
22807 +
22808 + 4. If a lock stack is associated with a znode as a lock requestor or lock
22809 + owner, its existence is guaranteed by znode->lock.guard spinlock. Some of its
22810 + (lock stack's) fields should be protected from being accessed in parallel
22811 + by two or more threads. Please look at lock_stack structure definition
22812 + for the info how those fields are protected. */
22813 +
22814 +/* Znode lock and capturing intertwining. */
22815 +/* In current implementation we capture formatted nodes before locking
22816 + them. Take a look on longterm lock znode, reiser4_try_capture() request
22817 + precedes locking requests. The longterm_lock_znode function unconditionally
22818 + captures znode before even checking of locking conditions.
22819 +
22820 + Another variant is to capture znode after locking it. It was not tested, but
22821 + at least one deadlock condition is supposed to be there. One thread has
22822 + locked a znode (Node-1) and calls reiser4_try_capture() for it.
22823 + reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22824 + Second thread is a flushing thread, its current atom is the atom Node-1
22825 + belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22826 + is locked by the first thread. The described situation is a deadlock. */
22827 +
22828 +#include "debug.h"
22829 +#include "txnmgr.h"
22830 +#include "znode.h"
22831 +#include "jnode.h"
22832 +#include "tree.h"
22833 +#include "plugin/node/node.h"
22834 +#include "super.h"
22835 +
22836 +#include <linux/spinlock.h>
22837 +
22838 +#if REISER4_DEBUG
22839 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
22840 + znode_lock_request);
22841 +#endif
22842 +
22843 +/* Returns a lock owner associated with current thread */
22844 +lock_stack *get_current_lock_stack(void)
22845 +{
22846 + return &get_current_context()->stack;
22847 +}
22848 +
22849 +/* Wakes up all low priority owners informing them about possible deadlock */
22850 +static void wake_up_all_lopri_owners(znode * node)
22851 +{
22852 + lock_handle *handle;
22853 +
22854 + assert_spin_locked(&(node->lock.guard));
22855 + list_for_each_entry(handle, &node->lock.owners, owners_link) {
22856 + assert("nikita-1832", handle->node == node);
22857 + /* count this signal in owner->nr_signaled */
22858 + if (!handle->signaled) {
22859 + handle->signaled = 1;
22860 + atomic_inc(&handle->owner->nr_signaled);
22861 + /* Wake up a single process */
22862 + reiser4_wake_up(handle->owner);
22863 + }
22864 + }
22865 +}
22866 +
22867 +/* Adds a lock to a lock owner, which means creating a link to the lock and
22868 + putting the link into the two lists all links are on (the doubly linked list
22869 + that forms the lock_stack, and the doubly linked list of links attached
22870 + to a lock.
22871 +*/
22872 +static inline void
22873 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
22874 +{
22875 + assert("jmacd-810", handle->owner == NULL);
22876 + assert_spin_locked(&(node->lock.guard));
22877 +
22878 + handle->owner = owner;
22879 + handle->node = node;
22880 +
22881 + assert("reiser4-4",
22882 + ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22883 +
22884 + /* add lock handle to the end of lock_stack's list of locks */
22885 + list_add_tail(&handle->locks_link, &owner->locks);
22886 + ON_DEBUG(owner->nr_locks++);
22887 + reiser4_ctx_gfp_mask_set();
22888 +
22889 + /* add lock handle to the head of znode's list of owners */
22890 + list_add(&handle->owners_link, &node->lock.owners);
22891 + handle->signaled = 0;
22892 +}
22893 +
22894 +/* Breaks a relation between a lock and its owner */
22895 +static inline void unlink_object(lock_handle * handle)
22896 +{
22897 + assert("zam-354", handle->owner != NULL);
22898 + assert("nikita-1608", handle->node != NULL);
22899 + assert_spin_locked(&(handle->node->lock.guard));
22900 + assert("nikita-1829", handle->owner == get_current_lock_stack());
22901 + assert("reiser4-5", handle->owner->nr_locks > 0);
22902 +
22903 + /* remove lock handle from lock_stack's list of locks */
22904 + list_del(&handle->locks_link);
22905 + ON_DEBUG(handle->owner->nr_locks--);
22906 + reiser4_ctx_gfp_mask_set();
22907 + assert("reiser4-6",
22908 + ergo(list_empty_careful(&handle->owner->locks),
22909 + handle->owner->nr_locks == 0));
22910 + /* remove lock handle from znode's list of owners */
22911 + list_del(&handle->owners_link);
22912 + /* indicates that lock handle is free now */
22913 + handle->node = NULL;
22914 +#if REISER4_DEBUG
22915 + INIT_LIST_HEAD(&handle->locks_link);
22916 + INIT_LIST_HEAD(&handle->owners_link);
22917 + handle->owner = NULL;
22918 +#endif
22919 +}
22920 +
22921 +/* Actually locks an object knowing that we are able to do this */
22922 +static void lock_object(lock_stack * owner)
22923 +{
22924 + struct lock_request *request;
22925 + znode *node;
22926 +
22927 + request = &owner->request;
22928 + node = request->node;
22929 + assert_spin_locked(&(node->lock.guard));
22930 + if (request->mode == ZNODE_READ_LOCK) {
22931 + node->lock.nr_readers++;
22932 + } else {
22933 + /* check that we didn't switch from read to write lock */
22934 + assert("nikita-1840", node->lock.nr_readers <= 0);
22935 + /* We allow recursive locking; a node can be locked several
22936 + times for write by same process */
22937 + node->lock.nr_readers--;
22938 + }
22939 +
22940 + link_object(request->handle, owner, node);
22941 +
22942 + if (owner->curpri) {
22943 + node->lock.nr_hipri_owners++;
22944 + }
22945 +}
22946 +
22947 +/* Check for recursive write locking */
22948 +static int recursive(lock_stack * owner)
22949 +{
22950 + int ret;
22951 + znode *node;
22952 + lock_handle *lh;
22953 +
22954 + node = owner->request.node;
22955 +
22956 + /* Owners list is not empty for a locked node */
22957 + assert("zam-314", !list_empty_careful(&node->lock.owners));
22958 + assert("nikita-1841", owner == get_current_lock_stack());
22959 + assert_spin_locked(&(node->lock.guard));
22960 +
22961 + lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22962 + ret = (lh->owner == owner);
22963 +
22964 + /* Recursive read locking should be done usual way */
22965 + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22966 + /* mixing of read/write locks is not allowed */
22967 + assert("zam-341", !ret || znode_is_wlocked(node));
22968 +
22969 + return ret;
22970 +}
22971 +
22972 +#if REISER4_DEBUG
22973 +/* Returns true if the lock is held by the calling thread. */
22974 +int znode_is_any_locked(const znode * node)
22975 +{
22976 + lock_handle *handle;
22977 + lock_stack *stack;
22978 + int ret;
22979 +
22980 + if (!znode_is_locked(node)) {
22981 + return 0;
22982 + }
22983 +
22984 + stack = get_current_lock_stack();
22985 +
22986 + spin_lock_stack(stack);
22987 +
22988 + ret = 0;
22989 +
22990 + list_for_each_entry(handle, &stack->locks, locks_link) {
22991 + if (handle->node == node) {
22992 + ret = 1;
22993 + break;
22994 + }
22995 + }
22996 +
22997 + spin_unlock_stack(stack);
22998 +
22999 + return ret;
23000 +}
23001 +
23002 +#endif
23003 +
23004 +/* Returns true if a write lock is held by the calling thread. */
23005 +int znode_is_write_locked(const znode * node)
23006 +{
23007 + lock_stack *stack;
23008 + lock_handle *handle;
23009 +
23010 + assert("jmacd-8765", node != NULL);
23011 +
23012 + if (!znode_is_wlocked(node)) {
23013 + return 0;
23014 + }
23015 +
23016 + stack = get_current_lock_stack();
23017 +
23018 + /*
23019 + * When znode is write locked, all owner handles point to the same lock
23020 + * stack. Get pointer to lock stack from the first lock handle from
23021 + * znode's owner list
23022 + */
23023 + handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23024 +
23025 + return (handle->owner == stack);
23026 +}
23027 +
23028 +/* This "deadlock" condition is the essential part of reiser4 locking
23029 + implementation. This condition is checked explicitly by calling
23030 + check_deadlock_condition() or implicitly in all places where znode lock
23031 + state (set of owners and request queue) is changed. Locking code is
23032 + designed to use this condition to trigger procedure of passing object from
23033 + low priority owner(s) to high priority one(s).
23034 +
23035 + The procedure results in passing an event (setting lock_handle->signaled
23036 + flag) and counting this event in nr_signaled field of owner's lock stack
23037 + object and wakeup owner's process.
23038 +*/
23039 +static inline int check_deadlock_condition(znode * node)
23040 +{
23041 + assert_spin_locked(&(node->lock.guard));
23042 + return node->lock.nr_hipri_requests > 0
23043 + && node->lock.nr_hipri_owners == 0;
23044 +}
23045 +
23046 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23047 +{
23048 + zlock * lock = &node->lock;
23049 +
23050 + return mode == ZNODE_READ_LOCK &&
23051 + lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23052 +}
23053 +
23054 +/* checks lock/request compatibility */
23055 +static int can_lock_object(lock_stack * owner)
23056 +{
23057 + znode *node = owner->request.node;
23058 +
23059 + assert_spin_locked(&(node->lock.guard));
23060 +
23061 + /* See if the node is disconnected. */
23062 + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23063 + return RETERR(-EINVAL);
23064 +
23065 + /* Do not ever try to take a lock if we are going in low priority
23066 + direction and a node has a high priority request without high
23067 + priority owners. */
23068 + if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23069 + return RETERR(-E_REPEAT);
23070 + if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23071 + return RETERR(-E_REPEAT);
23072 + if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23073 + return RETERR(-E_REPEAT);
23074 + return 0;
23075 +}
23076 +
23077 +/* Setting of a high priority to the process. It clears "signaled" flags
23078 + because znode locked by high-priority process can't satisfy our "deadlock
23079 + condition". */
23080 +static void set_high_priority(lock_stack * owner)
23081 +{
23082 + assert("nikita-1846", owner == get_current_lock_stack());
23083 + /* Do nothing if current priority is already high */
23084 + if (!owner->curpri) {
23085 + /* We don't need locking for owner->locks list, because, this
23086 + * function is only called with the lock stack of the current
23087 + * thread, and no other thread can play with owner->locks list
23088 + * and/or change ->node pointers of lock handles in this list.
23089 + *
23090 + * (Interrupts also are not involved.)
23091 + */
23092 + lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23093 + while (&owner->locks != &item->locks_link) {
23094 + znode *node = item->node;
23095 +
23096 + spin_lock_zlock(&node->lock);
23097 +
23098 + node->lock.nr_hipri_owners++;
23099 +
23100 + /* we can safely set signaled to zero, because
23101 + previous statement (nr_hipri_owners ++) guarantees
23102 + that signaled will be never set again. */
23103 + item->signaled = 0;
23104 + spin_unlock_zlock(&node->lock);
23105 +
23106 + item = list_entry(item->locks_link.next, lock_handle, locks_link);
23107 + }
23108 + owner->curpri = 1;
23109 + atomic_set(&owner->nr_signaled, 0);
23110 + }
23111 +}
23112 +
23113 +/* Sets a low priority to the process. */
23114 +static void set_low_priority(lock_stack * owner)
23115 +{
23116 + assert("nikita-3075", owner == get_current_lock_stack());
23117 + /* Do nothing if current priority is already low */
23118 + if (owner->curpri) {
23119 + /* scan all locks (lock handles) held by @owner, which is
23120 + actually current thread, and check whether we are reaching
23121 + deadlock possibility anywhere.
23122 + */
23123 + lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23124 + while (&owner->locks != &handle->locks_link) {
23125 + znode *node = handle->node;
23126 + spin_lock_zlock(&node->lock);
23127 + /* this thread just was hipri owner of @node, so
23128 + nr_hipri_owners has to be greater than zero. */
23129 + assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23130 + node->lock.nr_hipri_owners--;
23131 + /* If we have deadlock condition, adjust a nr_signaled
23132 + field. It is enough to set "signaled" flag only for
23133 + current process, other low-pri owners will be
23134 + signaled and waken up after current process unlocks
23135 + this object and any high-priority requestor takes
23136 + control. */
23137 + if (check_deadlock_condition(node)
23138 + && !handle->signaled) {
23139 + handle->signaled = 1;
23140 + atomic_inc(&owner->nr_signaled);
23141 + }
23142 + spin_unlock_zlock(&node->lock);
23143 + handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23144 + }
23145 + owner->curpri = 0;
23146 + }
23147 +}
23148 +
23149 +static void remove_lock_request(lock_stack * requestor)
23150 +{
23151 + zlock * lock = &requestor->request.node->lock;
23152 +
23153 + if (requestor->curpri) {
23154 + assert("nikita-1838", lock->nr_hipri_requests > 0);
23155 + lock->nr_hipri_requests--;
23156 + if (requestor->request.mode == ZNODE_WRITE_LOCK)
23157 + lock->nr_hipri_write_requests --;
23158 + }
23159 + list_del(&requestor->requestors_link);
23160 +}
23161 +
23162 +static void invalidate_all_lock_requests(znode * node)
23163 +{
23164 + lock_stack *requestor, *tmp;
23165 +
23166 + assert_spin_locked(&(node->lock.guard));
23167 +
23168 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23169 + remove_lock_request(requestor);
23170 + requestor->request.ret_code = -EINVAL;
23171 + reiser4_wake_up(requestor);
23172 + requestor->request.mode = ZNODE_NO_LOCK;
23173 + }
23174 +}
23175 +
23176 +static void dispatch_lock_requests(znode * node)
23177 +{
23178 + lock_stack *requestor, *tmp;
23179 +
23180 + assert_spin_locked(&(node->lock.guard));
23181 +
23182 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23183 + if (znode_is_write_locked(node))
23184 + break;
23185 + if (!can_lock_object(requestor)) {
23186 + lock_object(requestor);
23187 + remove_lock_request(requestor);
23188 + requestor->request.ret_code = 0;
23189 + reiser4_wake_up(requestor);
23190 + requestor->request.mode = ZNODE_NO_LOCK;
23191 + }
23192 + }
23193 +}
23194 +
23195 +/* release long-term lock, acquired by longterm_lock_znode() */
23196 +void longterm_unlock_znode(lock_handle * handle)
23197 +{
23198 + znode *node = handle->node;
23199 + lock_stack *oldowner = handle->owner;
23200 + int hipri;
23201 + int readers;
23202 + int rdelta;
23203 + int youdie;
23204 +
23205 + /*
23206 + * this is time-critical and highly optimized code. Modify carefully.
23207 + */
23208 +
23209 + assert("jmacd-1021", handle != NULL);
23210 + assert("jmacd-1022", handle->owner != NULL);
23211 + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23212 +
23213 + assert("zam-130", oldowner == get_current_lock_stack());
23214 +
23215 + LOCK_CNT_DEC(long_term_locked_znode);
23216 +
23217 + /*
23218 + * to minimize amount of operations performed under lock, pre-compute
23219 + * all variables used within critical section. This makes code
23220 + * obscure.
23221 + */
23222 +
23223 + /* was this lock of hi or lo priority */
23224 + hipri = oldowner->curpri ? 1 : 0;
23225 + /* number of readers */
23226 + readers = node->lock.nr_readers;
23227 + /* +1 if write lock, -1 if read lock */
23228 + rdelta = (readers > 0) ? -1 : +1;
23229 + /* true if node is to die and write lock is released */
23230 + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23231 +
23232 + spin_lock_zlock(&node->lock);
23233 +
23234 + assert("zam-101", znode_is_locked(node));
23235 +
23236 + /* Adjust a number of high priority owners of this lock */
23237 + assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23238 + node->lock.nr_hipri_owners -= hipri;
23239 +
23240 + /* Handle znode deallocation on last write-lock release. */
23241 + if (znode_is_wlocked_once(node)) {
23242 + if (youdie) {
23243 + forget_znode(handle);
23244 + assert("nikita-2191", znode_invariant(node));
23245 + zput(node);
23246 + return;
23247 + }
23248 + }
23249 +
23250 + if (handle->signaled)
23251 + atomic_dec(&oldowner->nr_signaled);
23252 +
23253 + /* Unlocking means owner<->object link deletion */
23254 + unlink_object(handle);
23255 +
23256 + /* This is enough to be sure whether an object is completely
23257 + unlocked. */
23258 + node->lock.nr_readers += rdelta;
23259 +
23260 + /* If the node is locked it must have an owners list. Likewise, if
23261 + the node is unlocked it must have an empty owners list. */
23262 + assert("zam-319", equi(znode_is_locked(node),
23263 + !list_empty_careful(&node->lock.owners)));
23264 +
23265 +#if REISER4_DEBUG
23266 + if (!znode_is_locked(node))
23267 + ++node->times_locked;
23268 +#endif
23269 +
23270 + /* If there are pending lock requests we wake up a requestor */
23271 + if (!znode_is_wlocked(node))
23272 + dispatch_lock_requests(node);
23273 + if (check_deadlock_condition(node))
23274 + wake_up_all_lopri_owners(node);
23275 + spin_unlock_zlock(&node->lock);
23276 +
23277 + /* minus one reference from handle->node */
23278 + assert("nikita-2190", znode_invariant(node));
23279 + ON_DEBUG(check_lock_data());
23280 + ON_DEBUG(check_lock_node_data(node));
23281 + zput(node);
23282 +}
23283 +
23284 +/* final portion of longterm-lock */
23285 +static int
23286 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23287 +{
23288 + znode *node = owner->request.node;
23289 +
23290 + assert_spin_locked(&(node->lock.guard));
23291 +
23292 + /* If we broke with (ok == 0) it means we can_lock, now do it. */
23293 + if (ok == 0) {
23294 + lock_object(owner);
23295 + owner->request.mode = 0;
23296 + /* count a reference from lockhandle->node
23297 +
23298 + znode was already referenced at the entry to this function,
23299 + hence taking spin-lock here is not necessary (see comment
23300 + in the zref()).
23301 + */
23302 + zref(node);
23303 +
23304 + LOCK_CNT_INC(long_term_locked_znode);
23305 + }
23306 + spin_unlock_zlock(&node->lock);
23307 + ON_DEBUG(check_lock_data());
23308 + ON_DEBUG(check_lock_node_data(node));
23309 + return ok;
23310 +}
23311 +
23312 +/*
23313 + * version of longterm_znode_lock() optimized for the most common case: read
23314 + * lock without any special flags. This is the kind of lock that any tree
23315 + * traversal takes on the root node of the tree, which is very frequent.
23316 + */
23317 +static int longterm_lock_tryfast(lock_stack * owner)
23318 +{
23319 + int result;
23320 + znode *node;
23321 + zlock *lock;
23322 +
23323 + node = owner->request.node;
23324 + lock = &node->lock;
23325 +
23326 + assert("nikita-3340", reiser4_schedulable());
23327 + assert("nikita-3341", request_is_deadlock_safe(node,
23328 + ZNODE_READ_LOCK,
23329 + ZNODE_LOCK_LOPRI));
23330 + spin_lock_zlock(lock);
23331 + result = can_lock_object(owner);
23332 + spin_unlock_zlock(lock);
23333 +
23334 + if (likely(result != -EINVAL)) {
23335 + spin_lock_znode(node);
23336 + result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23337 + spin_unlock_znode(node);
23338 + spin_lock_zlock(lock);
23339 + if (unlikely(result != 0)) {
23340 + owner->request.mode = 0;
23341 + } else {
23342 + result = can_lock_object(owner);
23343 + if (unlikely(result == -E_REPEAT)) {
23344 + /* fall back to longterm_lock_znode() */
23345 + spin_unlock_zlock(lock);
23346 + return 1;
23347 + }
23348 + }
23349 + return lock_tail(owner, result, ZNODE_READ_LOCK);
23350 + } else
23351 + return 1;
23352 +}
23353 +
23354 +/* locks given lock object */
23355 +int longterm_lock_znode(
23356 + /* local link object (allocated by lock owner thread, usually on its own
23357 + * stack) */
23358 + lock_handle * handle,
23359 + /* znode we want to lock. */
23360 + znode * node,
23361 + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23362 + znode_lock_mode mode,
23363 + /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23364 + znode_lock_request request) {
23365 + int ret;
23366 + int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23367 + int non_blocking = 0;
23368 + int has_atom;
23369 + txn_capture cap_flags;
23370 + zlock *lock;
23371 + txn_handle *txnh;
23372 + tree_level level;
23373 +
23374 + /* Get current process context */
23375 + lock_stack *owner = get_current_lock_stack();
23376 +
23377 + /* Check that the lock handle is initialized and isn't already being
23378 + * used. */
23379 + assert("jmacd-808", handle->owner == NULL);
23380 + assert("nikita-3026", reiser4_schedulable());
23381 + assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23382 + assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23383 + /* long term locks are not allowed in the VM contexts (->writepage(),
23384 + * prune_{d,i}cache()).
23385 + *
23386 + * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23387 + * bug caused by d_splice_alias() only working for directories.
23388 + */
23389 + assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23390 + assert ("zam-1055", mode != ZNODE_NO_LOCK);
23391 +
23392 + cap_flags = 0;
23393 + if (request & ZNODE_LOCK_NONBLOCK) {
23394 + cap_flags |= TXN_CAPTURE_NONBLOCKING;
23395 + non_blocking = 1;
23396 + }
23397 +
23398 + if (request & ZNODE_LOCK_DONT_FUSE)
23399 + cap_flags |= TXN_CAPTURE_DONT_FUSE;
23400 +
23401 + /* If we are changing our process priority we must adjust a number
23402 + of high priority owners for each znode that we already lock */
23403 + if (hipri) {
23404 + set_high_priority(owner);
23405 + } else {
23406 + set_low_priority(owner);
23407 + }
23408 +
23409 + level = znode_get_level(node);
23410 +
23411 + /* Fill request structure with our values. */
23412 + owner->request.mode = mode;
23413 + owner->request.handle = handle;
23414 + owner->request.node = node;
23415 +
23416 + txnh = get_current_context()->trans;
23417 + lock = &node->lock;
23418 +
23419 + if (mode == ZNODE_READ_LOCK && request == 0) {
23420 + ret = longterm_lock_tryfast(owner);
23421 + if (ret <= 0)
23422 + return ret;
23423 + }
23424 +
23425 + has_atom = (txnh->atom != NULL);
23426 +
23427 + /* Synchronize on node's zlock guard lock. */
23428 + spin_lock_zlock(lock);
23429 +
23430 + if (znode_is_locked(node) &&
23431 + mode == ZNODE_WRITE_LOCK && recursive(owner))
23432 + return lock_tail(owner, 0, mode);
23433 +
23434 + for (;;) {
23435 + /* Check the lock's availability: if it is unavailable we get
23436 + E_REPEAT, 0 indicates "can_lock", otherwise the node is
23437 + invalid. */
23438 + ret = can_lock_object(owner);
23439 +
23440 + if (unlikely(ret == -EINVAL)) {
23441 + /* @node is dying. Leave it alone. */
23442 + break;
23443 + }
23444 +
23445 + if (unlikely(ret == -E_REPEAT && non_blocking)) {
23446 + /* either locking of @node by the current thread will
23447 + * lead to the deadlock, or lock modes are
23448 + * incompatible. */
23449 + break;
23450 + }
23451 +
23452 + assert("nikita-1844", (ret == 0)
23453 + || ((ret == -E_REPEAT) && !non_blocking));
23454 + /* If we can get the lock... Try to capture first before
23455 + taking the lock. */
23456 +
23457 + /* first handle commonest case where node and txnh are already
23458 + * in the same atom. */
23459 + /* safe to do without taking locks, because:
23460 + *
23461 + * 1. read of aligned word is atomic with respect to writes to
23462 + * this word
23463 + *
23464 + * 2. false negatives are handled in reiser4_try_capture().
23465 + *
23466 + * 3. false positives are impossible.
23467 + *
23468 + * PROOF: left as an exercise to the curious reader.
23469 + *
23470 + * Just kidding. Here is one:
23471 + *
23472 + * At the time T0 txnh->atom is stored in txnh_atom.
23473 + *
23474 + * At the time T1 node->atom is stored in node_atom.
23475 + *
23476 + * At the time T2 we observe that
23477 + *
23478 + * txnh_atom != NULL && node_atom == txnh_atom.
23479 + *
23480 + * Imagine that at this moment we acquire node and txnh spin
23481 + * lock in this order. Suppose that under spin lock we have
23482 + *
23483 + * node->atom != txnh->atom, (S1)
23484 + *
23485 + * at the time T3.
23486 + *
23487 + * txnh->atom != NULL still, because txnh is open by the
23488 + * current thread.
23489 + *
23490 + * Suppose node->atom == NULL, that is, node was un-captured
23491 + * between T1, and T3. But un-capturing of formatted node is
23492 + * always preceded by the call to reiser4_invalidate_lock(),
23493 + * which marks znode as JNODE_IS_DYING under zlock spin
23494 + * lock. Contradiction, because can_lock_object() above checks
23495 + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23496 + *
23497 + * Suppose that node->atom != node_atom, that is, atom, node
23498 + * belongs to was fused into another atom: node_atom was fused
23499 + * into node->atom. Atom of txnh was equal to node_atom at T2,
23500 + * which means that under spin lock, txnh->atom == node->atom,
23501 + * because txnh->atom can only follow fusion
23502 + * chain. Contradicts S1.
23503 + *
23504 + * The same for hypothesis txnh->atom != txnh_atom. Hence,
23505 + * node->atom == node_atom == txnh_atom == txnh->atom. Again
23506 + * contradicts S1. Hence S1 is false. QED.
23507 + *
23508 + */
23509 +
23510 + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23511 + ;
23512 + } else {
23513 + /*
23514 + * unlock zlock spin lock here. It is possible for
23515 + * longterm_unlock_znode() to sneak in here, but there
23516 + * is no harm: reiser4_invalidate_lock() will mark znode
23517 + * as JNODE_IS_DYING and this will be noted by
23518 + * can_lock_object() below.
23519 + */
23520 + spin_unlock_zlock(lock);
23521 + spin_lock_znode(node);
23522 + ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23523 + spin_unlock_znode(node);
23524 + spin_lock_zlock(lock);
23525 + if (unlikely(ret != 0)) {
23526 + /* In the failure case, the txnmgr releases
23527 + the znode's lock (or in some cases, it was
23528 + released a while ago). There's no need to
23529 + reacquire it so we should return here,
23530 + avoiding release of the lock. */
23531 + owner->request.mode = 0;
23532 + break;
23533 + }
23534 +
23535 + /* Check the lock's availability again -- this is
23536 + because under some circumstances the capture code
23537 + has to release and reacquire the znode spinlock. */
23538 + ret = can_lock_object(owner);
23539 + }
23540 +
23541 + /* This time, a return of (ret == 0) means we can lock, so we
23542 + should break out of the loop. */
23543 + if (likely(ret != -E_REPEAT || non_blocking))
23544 + break;
23545 +
23546 + /* Lock is unavailable, we have to wait. */
23547 + ret = reiser4_prepare_to_sleep(owner);
23548 + if (unlikely(ret != 0))
23549 + break;
23550 +
23551 + assert_spin_locked(&(node->lock.guard));
23552 + if (hipri) {
23553 + /* If we are going in high priority direction then
23554 + increase high priority requests counter for the
23555 + node */
23556 + lock->nr_hipri_requests++;
23557 + if (mode == ZNODE_WRITE_LOCK)
23558 + lock->nr_hipri_write_requests ++;
23559 + /* If there are no high priority owners for a node,
23560 + then immediately wake up low priority owners, so
23561 + they can detect possible deadlock */
23562 + if (lock->nr_hipri_owners == 0)
23563 + wake_up_all_lopri_owners(node);
23564 + }
23565 + list_add_tail(&owner->requestors_link, &lock->requestors);
23566 +
23567 + /* Ok, here we have prepared a lock request, so unlock
23568 + a znode ... */
23569 + spin_unlock_zlock(lock);
23570 + /* ... and sleep */
23571 + reiser4_go_to_sleep(owner);
23572 + if (owner->request.mode == ZNODE_NO_LOCK)
23573 + goto request_is_done;
23574 + spin_lock_zlock(lock);
23575 + if (owner->request.mode == ZNODE_NO_LOCK) {
23576 + spin_unlock_zlock(lock);
23577 + request_is_done:
23578 + if (owner->request.ret_code == 0) {
23579 + LOCK_CNT_INC(long_term_locked_znode);
23580 + zref(node);
23581 + }
23582 + return owner->request.ret_code;
23583 + }
23584 + remove_lock_request(owner);
23585 + }
23586 +
23587 + return lock_tail(owner, ret, mode);
23588 +}
23589 +
23590 +/* lock object invalidation means changing of lock object state to `INVALID'
23591 + and waiting for all other processes to cancel their lock requests. */
23592 +void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23593 + * owner and lock
23594 + * object is being
23595 + * invalidated. */ )
23596 +{
23597 + znode *node = handle->node;
23598 + lock_stack *owner = handle->owner;
23599 +
23600 + assert("zam-325", owner == get_current_lock_stack());
23601 + assert("zam-103", znode_is_write_locked(node));
23602 + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23603 + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23604 + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23605 + assert("nikita-3097", znode_is_wlocked_once(node));
23606 + assert_spin_locked(&(node->lock.guard));
23607 +
23608 + if (handle->signaled)
23609 + atomic_dec(&owner->nr_signaled);
23610 +
23611 + ZF_SET(node, JNODE_IS_DYING);
23612 + unlink_object(handle);
23613 + node->lock.nr_readers = 0;
23614 +
23615 + invalidate_all_lock_requests(node);
23616 + spin_unlock_zlock(&node->lock);
23617 +}
23618 +
23619 +/* Initializes lock_stack. */
23620 +void init_lock_stack(lock_stack * owner /* pointer to
23621 + * allocated
23622 + * structure. */ )
23623 +{
23624 + INIT_LIST_HEAD(&owner->locks);
23625 + INIT_LIST_HEAD(&owner->requestors_link);
23626 + spin_lock_init(&owner->sguard);
23627 + owner->curpri = 1;
23628 + init_waitqueue_head(&owner->wait);
23629 +}
23630 +
23631 +/* Initializes lock object. */
23632 +void reiser4_init_lock(zlock * lock /* pointer on allocated
23633 + * uninitialized lock object
23634 + * structure. */ )
23635 +{
23636 + memset(lock, 0, sizeof(zlock));
23637 + spin_lock_init(&lock->guard);
23638 + INIT_LIST_HEAD(&lock->requestors);
23639 + INIT_LIST_HEAD(&lock->owners);
23640 +}
23641 +
23642 +/* Transfer a lock handle (presumably so that variables can be moved between stack and
23643 + heap locations). */
23644 +static void
23645 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23646 +{
23647 + znode *node = old->node;
23648 + lock_stack *owner = old->owner;
23649 + int signaled;
23650 +
23651 + /* locks_list, modified by link_object() is not protected by
23652 + anything. This is valid because only current thread ever modifies
23653 + locks_list of its lock_stack.
23654 + */
23655 + assert("nikita-1827", owner == get_current_lock_stack());
23656 + assert("nikita-1831", new->owner == NULL);
23657 +
23658 + spin_lock_zlock(&node->lock);
23659 +
23660 + signaled = old->signaled;
23661 + if (unlink_old) {
23662 + unlink_object(old);
23663 + } else {
23664 + if (node->lock.nr_readers > 0) {
23665 + node->lock.nr_readers += 1;
23666 + } else {
23667 + node->lock.nr_readers -= 1;
23668 + }
23669 + if (signaled) {
23670 + atomic_inc(&owner->nr_signaled);
23671 + }
23672 + if (owner->curpri) {
23673 + node->lock.nr_hipri_owners += 1;
23674 + }
23675 + LOCK_CNT_INC(long_term_locked_znode);
23676 +
23677 + zref(node);
23678 + }
23679 + link_object(new, owner, node);
23680 + new->signaled = signaled;
23681 +
23682 + spin_unlock_zlock(&node->lock);
23683 +}
23684 +
23685 +void move_lh(lock_handle * new, lock_handle * old)
23686 +{
23687 + move_lh_internal(new, old, /*unlink_old */ 1);
23688 +}
23689 +
23690 +void copy_lh(lock_handle * new, lock_handle * old)
23691 +{
23692 + move_lh_internal(new, old, /*unlink_old */ 0);
23693 +}
23694 +
23695 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23696 +int reiser4_check_deadlock(void)
23697 +{
23698 + lock_stack *owner = get_current_lock_stack();
23699 + return atomic_read(&owner->nr_signaled) != 0;
23700 +}
23701 +
23702 +/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock
23703 + priorities. */
23704 +int reiser4_prepare_to_sleep(lock_stack * owner)
23705 +{
23706 + assert("nikita-1847", owner == get_current_lock_stack());
23707 +
23708 + /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23709 + * counted in nr_signaled */
23710 + if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23711 + assert("zam-959", !owner->curpri);
23712 + return RETERR(-E_DEADLOCK);
23713 + }
23714 + return 0;
23715 +}
23716 +
23717 +/* Wakes up a single thread */
23718 +void __reiser4_wake_up(lock_stack * owner)
23719 +{
23720 + atomic_set(&owner->wakeup, 1);
23721 + wake_up(&owner->wait);
23722 +}
23723 +
23724 +/* Puts a thread to sleep */
23725 +void reiser4_go_to_sleep(lock_stack * owner)
23726 +{
23727 + /* Well, we might sleep here, so holding of any spinlocks is no-no */
23728 + assert("nikita-3027", reiser4_schedulable());
23729 +
23730 + wait_event(owner->wait, atomic_read(&owner->wakeup));
23731 + atomic_set(&owner->wakeup, 0);
23732 +}
23733 +
23734 +int lock_stack_isclean(lock_stack * owner)
23735 +{
23736 + if (list_empty_careful(&owner->locks)) {
23737 + assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23738 + return 1;
23739 + }
23740 +
23741 + return 0;
23742 +}
23743 +
23744 +#if REISER4_DEBUG
23745 +
23746 +/*
23747 + * debugging functions
23748 + */
23749 +
23750 +static void list_check(struct list_head *head)
23751 +{
23752 + struct list_head *pos;
23753 +
23754 + list_for_each(pos, head)
23755 + assert("", (pos->prev != NULL && pos->next != NULL &&
23756 + pos->prev->next == pos && pos->next->prev == pos));
23757 +}
23758 +
23759 +/* check consistency of locking data-structures hanging of the @stack */
23760 +static void check_lock_stack(lock_stack * stack)
23761 +{
23762 + spin_lock_stack(stack);
23763 + /* check that stack->locks is not corrupted */
23764 + list_check(&stack->locks);
23765 + spin_unlock_stack(stack);
23766 +}
23767 +
23768 +/* check consistency of locking data structures */
23769 +void check_lock_data(void)
23770 +{
23771 + check_lock_stack(&get_current_context()->stack);
23772 +}
23773 +
23774 +/* check consistency of locking data structures for @node */
23775 +void check_lock_node_data(znode * node)
23776 +{
23777 + spin_lock_zlock(&node->lock);
23778 + list_check(&node->lock.owners);
23779 + list_check(&node->lock.requestors);
23780 + spin_unlock_zlock(&node->lock);
23781 +}
23782 +
23783 +/* check that given lock request is dead lock safe. This check is, of course,
23784 + * not exhaustive. */
23785 +static int
23786 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23787 + znode_lock_request request)
23788 +{
23789 + lock_stack *owner;
23790 +
23791 + owner = get_current_lock_stack();
23792 + /*
23793 + * check that hipri lock request is not issued when there are locked
23794 + * nodes at the higher levels.
23795 + */
23796 + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23797 + znode_get_level(node) != 0) {
23798 + lock_handle *item;
23799 +
23800 + list_for_each_entry(item, &owner->locks, locks_link) {
23801 + znode *other;
23802 +
23803 + other = item->node;
23804 +
23805 + if (znode_get_level(other) == 0)
23806 + continue;
23807 + if (znode_get_level(other) > znode_get_level(node))
23808 + return 0;
23809 + }
23810 + }
23811 + return 1;
23812 +}
23813 +
23814 +#endif
23815 +
23816 +/* return pointer to static storage with name of lock_mode. For
23817 + debugging */
23818 +const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23819 +{
23820 + if (lock == ZNODE_READ_LOCK)
23821 + return "read";
23822 + else if (lock == ZNODE_WRITE_LOCK)
23823 + return "write";
23824 + else {
23825 + static char buf[30];
23826 +
23827 + sprintf(buf, "unknown: %i", lock);
23828 + return buf;
23829 + }
23830 +}
23831 +
23832 +/* Make Linus happy.
23833 + Local variables:
23834 + c-indentation-style: "K&R"
23835 + mode-name: "LC"
23836 + c-basic-offset: 8
23837 + tab-width: 8
23838 + fill-column: 79
23839 + End:
23840 +*/
23841 diff -urN linux-2.6.23.orig/fs/reiser4/lock.h linux-2.6.23/fs/reiser4/lock.h
23842 --- linux-2.6.23.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23843 +++ linux-2.6.23/fs/reiser4/lock.h 2007-12-04 16:49:30.000000000 +0300
23844 @@ -0,0 +1,249 @@
23845 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23846 +
23847 +/* Long term locking data structures. See lock.c for details. */
23848 +
23849 +#ifndef __LOCK_H__
23850 +#define __LOCK_H__
23851 +
23852 +#include "forward.h"
23853 +#include "debug.h"
23854 +#include "dformat.h"
23855 +#include "key.h"
23856 +#include "coord.h"
23857 +#include "plugin/node/node.h"
23858 +#include "txnmgr.h"
23859 +#include "readahead.h"
23860 +
23861 +#include <linux/types.h>
23862 +#include <linux/spinlock.h>
23863 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23864 +#include <asm/atomic.h>
23865 +#include <linux/wait.h>
23866 +
23867 +/* Per-znode lock object */
23868 +struct zlock {
23869 + spinlock_t guard;
23870 + /* The number of readers if positive; the number of recursively taken
23871 + write locks if negative. Protected by zlock spin lock. */
23872 + int nr_readers;
23873 + /* A number of processes (lock_stacks) that have this object
23874 + locked with high priority */
23875 + unsigned nr_hipri_owners;
23876 + /* A number of attempts to lock znode in high priority direction */
23877 + unsigned nr_hipri_requests;
23878 + /* A number of write lock requests in high priority direction */
23879 + unsigned nr_hipri_write_requests;
23880 + /* A list of lock_handle objects of all lock_stacks holding this lock */
23881 + struct list_head owners;
23882 + /* A linked list of lock_stacks that wait for this lock */
23883 + struct list_head requestors;
23884 +};
23885 +
23886 +static inline void spin_lock_zlock(zlock *lock)
23887 +{
23888 + /* check that zlock is not locked */
23889 + assert("", LOCK_CNT_NIL(spin_locked_zlock));
23890 + /* check that spinlocks of lower priorities are not held */
23891 + assert("", LOCK_CNT_NIL(spin_locked_stack));
23892 +
23893 + spin_lock(&lock->guard);
23894 +
23895 + LOCK_CNT_INC(spin_locked_zlock);
23896 + LOCK_CNT_INC(spin_locked);
23897 +}
23898 +
23899 +static inline void spin_unlock_zlock(zlock *lock)
23900 +{
23901 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23902 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23903 +
23904 + LOCK_CNT_DEC(spin_locked_zlock);
23905 + LOCK_CNT_DEC(spin_locked);
23906 +
23907 + spin_unlock(&lock->guard);
23908 +}
23909 +
23910 +#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23911 +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23912 +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23913 +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23914 +#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0)
23915 +#define lock_mode_compatible(lock, mode) \
23916 + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23917 + ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
23918 +
23919 +/* Since we have R/W znode locks we need additional bidirectional `link'
23920 + objects to implement n<->m relationship between lock owners and lock
23921 + objects. We call them `lock handles'.
23922 +
23923 + Locking: see lock.c/"SHORT-TERM LOCKING"
23924 +*/
23925 +struct lock_handle {
23926 + /* This flag indicates that a signal to yield a lock was passed to
23927 + lock owner and counted in owner->nr_signaled
23928 +
23929 + Locking: this is accessed under spin lock on ->node.
23930 + */
23931 + int signaled;
23932 + /* A link to owner of a lock */
23933 + lock_stack *owner;
23934 + /* A link to znode locked */
23935 + znode *node;
23936 + /* A list of all locks for a process */
23937 + struct list_head locks_link;
23938 + /* A list of all owners for a znode */
23939 + struct list_head owners_link;
23940 +};
23941 +
23942 +struct lock_request {
23943 + /* A pointer to uninitialized link object */
23944 + lock_handle *handle;
23945 + /* A pointer to the object we want to lock */
23946 + znode *node;
23947 + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23948 + znode_lock_mode mode;
23949 + /* how dispatch_lock_requests() returns lock request result code */
23950 + int ret_code;
23951 +};
23952 +
23953 +/* A lock stack structure for accumulating locks owned by a process */
23954 +struct lock_stack {
23955 + /* A guard lock protecting a lock stack */
23956 + spinlock_t sguard;
23957 + /* number of znodes which were requested by high priority processes */
23958 + atomic_t nr_signaled;
23959 + /* Current priority of a process
23960 +
23961 + This is only accessed by the current thread and thus requires no
23962 + locking.
23963 + */
23964 + int curpri;
23965 + /* A list of all locks owned by this process. Elements can be added to
23966 + * this list only by the current thread. ->node pointers in this list
23967 + * can be only changed by the current thread. */
23968 + struct list_head locks;
23969 + /* When lock_stack waits for the lock, it puts itself on double-linked
23970 + requestors list of that lock */
23971 + struct list_head requestors_link;
23972 + /* Current lock request info.
23973 +
23974 + This is only accessed by the current thread and thus requires no
23975 + locking.
23976 + */
23977 + struct lock_request request;
23978 + /* the following two fields are the lock stack's
23979 + * synchronization object to use with the standard linux/wait.h
23980 + * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23981 + * usage details. */
23982 + wait_queue_head_t wait;
23983 + atomic_t wakeup;
23984 +#if REISER4_DEBUG
23985 + int nr_locks; /* number of lock handles in the above list */
23986 +#endif
23987 +};
23988 +
23989 +/*
23990 + User-visible znode locking functions
23991 +*/
23992 +
23993 +extern int longterm_lock_znode(lock_handle * handle,
23994 + znode * node,
23995 + znode_lock_mode mode,
23996 + znode_lock_request request);
23997 +
23998 +extern void longterm_unlock_znode(lock_handle * handle);
23999 +
24000 +extern int reiser4_check_deadlock(void);
24001 +
24002 +extern lock_stack *get_current_lock_stack(void);
24003 +
24004 +extern void init_lock_stack(lock_stack * owner);
24005 +extern void reiser4_init_lock(zlock * lock);
24006 +
24007 +static inline void init_lh(lock_handle *lh)
24008 +{
24009 +#if REISER4_DEBUG
24010 + memset(lh, 0, sizeof *lh);
24011 + INIT_LIST_HEAD(&lh->locks_link);
24012 + INIT_LIST_HEAD(&lh->owners_link);
24013 +#else
24014 + lh->node = NULL;
24015 +#endif
24016 +}
24017 +
24018 +static inline void done_lh(lock_handle *lh)
24019 +{
24020 + assert("zam-342", lh != NULL);
24021 + if (lh->node != NULL)
24022 + longterm_unlock_znode(lh);
24023 +}
24024 +
24025 +extern void move_lh(lock_handle * new, lock_handle * old);
24026 +extern void copy_lh(lock_handle * new, lock_handle * old);
24027 +
24028 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
24029 +extern void reiser4_go_to_sleep(lock_stack * owner);
24030 +extern void __reiser4_wake_up(lock_stack * owner);
24031 +
24032 +extern int lock_stack_isclean(lock_stack * owner);
24033 +
24034 +/* zlock object state check macros: only used in assertions. Both forms imply that the
24035 + lock is held by the current thread. */
24036 +extern int znode_is_write_locked(const znode *);
24037 +extern void reiser4_invalidate_lock(lock_handle *);
24038 +
24039 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24040 +#define spin_ordering_pred_stack(stack) \
24041 + (LOCK_CNT_NIL(spin_locked_stack) && \
24042 + LOCK_CNT_NIL(spin_locked_txnmgr) && \
24043 + LOCK_CNT_NIL(spin_locked_inode) && \
24044 + LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24045 + LOCK_CNT_NIL(spin_locked_super_eflush) )
24046 +
24047 +static inline void spin_lock_stack(lock_stack *stack)
24048 +{
24049 + assert("", spin_ordering_pred_stack(stack));
24050 + spin_lock(&(stack->sguard));
24051 + LOCK_CNT_INC(spin_locked_stack);
24052 + LOCK_CNT_INC(spin_locked);
24053 +}
24054 +
24055 +static inline void spin_unlock_stack(lock_stack *stack)
24056 +{
24057 + assert_spin_locked(&(stack->sguard));
24058 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24059 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24060 + LOCK_CNT_DEC(spin_locked_stack);
24061 + LOCK_CNT_DEC(spin_locked);
24062 + spin_unlock(&(stack->sguard));
24063 +}
24064 +
24065 +static inline void reiser4_wake_up(lock_stack * owner)
24066 +{
24067 + spin_lock_stack(owner);
24068 + __reiser4_wake_up(owner);
24069 + spin_unlock_stack(owner);
24070 +}
24071 +
24072 +const char *lock_mode_name(znode_lock_mode lock);
24073 +
24074 +#if REISER4_DEBUG
24075 +extern void check_lock_data(void);
24076 +extern void check_lock_node_data(znode * node);
24077 +#else
24078 +#define check_lock_data() noop
24079 +#define check_lock_node_data() noop
24080 +#endif
24081 +
24082 +/* __LOCK_H__ */
24083 +#endif
24084 +
24085 +/* Make Linus happy.
24086 + Local variables:
24087 + c-indentation-style: "K&R"
24088 + mode-name: "LC"
24089 + c-basic-offset: 8
24090 + tab-width: 8
24091 + fill-column: 120
24092 + End:
24093 +*/
24094 diff -urN linux-2.6.23.orig/fs/reiser4/Makefile linux-2.6.23/fs/reiser4/Makefile
24095 --- linux-2.6.23.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24096 +++ linux-2.6.23/fs/reiser4/Makefile 2007-12-04 16:49:30.000000000 +0300
24097 @@ -0,0 +1,98 @@
24098 +#
24099 +# reiser4/Makefile
24100 +#
24101 +
24102 +obj-$(CONFIG_REISER4_FS) += reiser4.o
24103 +
24104 +reiser4-y := \
24105 + debug.o \
24106 + jnode.o \
24107 + znode.o \
24108 + key.o \
24109 + pool.o \
24110 + tree_mod.o \
24111 + estimate.o \
24112 + carry.o \
24113 + carry_ops.o \
24114 + lock.o \
24115 + tree.o \
24116 + context.o \
24117 + tap.o \
24118 + coord.o \
24119 + block_alloc.o \
24120 + txnmgr.o \
24121 + kassign.o \
24122 + flush.o \
24123 + wander.o \
24124 + eottl.o \
24125 + search.o \
24126 + page_cache.o \
24127 + seal.o \
24128 + dscale.o \
24129 + flush_queue.o \
24130 + ktxnmgrd.o \
24131 + blocknrset.o \
24132 + super.o \
24133 + super_ops.o \
24134 + fsdata.o \
24135 + export_ops.o \
24136 + oid.o \
24137 + tree_walk.o \
24138 + inode.o \
24139 + vfs_ops.o \
24140 + as_ops.o \
24141 + entd.o\
24142 + readahead.o \
24143 + status_flags.o \
24144 + init_super.o \
24145 + safe_link.o \
24146 + \
24147 + plugin/plugin.o \
24148 + plugin/plugin_set.o \
24149 + plugin/node/node.o \
24150 + plugin/object.o \
24151 + plugin/cluster.o \
24152 + plugin/inode_ops.o \
24153 + plugin/inode_ops_rename.o \
24154 + plugin/file_ops.o \
24155 + plugin/file_ops_readdir.o \
24156 + plugin/file_plugin_common.o \
24157 + plugin/file/file.o \
24158 + plugin/file/tail_conversion.o \
24159 + plugin/file/file_conversion.o \
24160 + plugin/file/symlink.o \
24161 + plugin/file/cryptcompress.o \
24162 + plugin/dir_plugin_common.o \
24163 + plugin/dir/hashed_dir.o \
24164 + plugin/dir/seekable_dir.o \
24165 + plugin/node/node40.o \
24166 + \
24167 + plugin/crypto/cipher.o \
24168 + plugin/crypto/digest.o \
24169 + \
24170 + plugin/compress/compress.o \
24171 + plugin/compress/compress_mode.o \
24172 + \
24173 + plugin/item/static_stat.o \
24174 + plugin/item/sde.o \
24175 + plugin/item/cde.o \
24176 + plugin/item/blackbox.o \
24177 + plugin/item/internal.o \
24178 + plugin/item/tail.o \
24179 + plugin/item/ctail.o \
24180 + plugin/item/extent.o \
24181 + plugin/item/extent_item_ops.o \
24182 + plugin/item/extent_file_ops.o \
24183 + plugin/item/extent_flush_ops.o \
24184 + \
24185 + plugin/hash.o \
24186 + plugin/fibration.o \
24187 + plugin/tail_policy.o \
24188 + plugin/item/item.o \
24189 + \
24190 + plugin/security/perm.o \
24191 + plugin/space/bitmap.o \
24192 + \
24193 + plugin/disk_format/disk_format40.o \
24194 + plugin/disk_format/disk_format.o
24195 +
24196 diff -urN linux-2.6.23.orig/fs/reiser4/oid.c linux-2.6.23/fs/reiser4/oid.c
24197 --- linux-2.6.23.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24198 +++ linux-2.6.23/fs/reiser4/oid.c 2007-12-04 16:49:30.000000000 +0300
24199 @@ -0,0 +1,141 @@
24200 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24201 +
24202 +#include "debug.h"
24203 +#include "super.h"
24204 +#include "txnmgr.h"
24205 +
24206 +/* we used to have oid allocation plugin. It was removed because it
24207 + was recognized as providing unneeded level of abstraction. If one
24208 + ever will find it useful - look at yet_unneeded_abstractions/oid
24209 +*/
24210 +
24211 +/*
24212 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24213 + * are provided by disk format plugin that reads them from the disk during
24214 + * mount.
24215 + */
24216 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24217 +{
24218 + reiser4_super_info_data *sbinfo;
24219 +
24220 + sbinfo = get_super_private(super);
24221 +
24222 + sbinfo->next_to_use = next;
24223 + sbinfo->oids_in_use = nr_files;
24224 + return 0;
24225 +}
24226 +
24227 +/*
24228 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24229 + * runs out of oids.
24230 + */
24231 +oid_t oid_allocate(struct super_block * super)
24232 +{
24233 + reiser4_super_info_data *sbinfo;
24234 + oid_t oid;
24235 +
24236 + sbinfo = get_super_private(super);
24237 +
24238 + spin_lock_reiser4_super(sbinfo);
24239 + if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24240 + oid = sbinfo->next_to_use++;
24241 + sbinfo->oids_in_use++;
24242 + } else
24243 + oid = ABSOLUTE_MAX_OID;
24244 + spin_unlock_reiser4_super(sbinfo);
24245 + return oid;
24246 +}
24247 +
24248 +/*
24249 + * Tell oid allocator that @oid is now free.
24250 + */
24251 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24252 +{
24253 + reiser4_super_info_data *sbinfo;
24254 +
24255 + sbinfo = get_super_private(super);
24256 +
24257 + spin_lock_reiser4_super(sbinfo);
24258 + sbinfo->oids_in_use--;
24259 + spin_unlock_reiser4_super(sbinfo);
24260 + return 0;
24261 +}
24262 +
24263 +/*
24264 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24265 + * without actually allocating it. This is used by disk format plugin to save
24266 + * oid allocator state on the disk.
24267 + */
24268 +oid_t oid_next(const struct super_block * super)
24269 +{
24270 + reiser4_super_info_data *sbinfo;
24271 + oid_t oid;
24272 +
24273 + sbinfo = get_super_private(super);
24274 +
24275 + spin_lock_reiser4_super(sbinfo);
24276 + oid = sbinfo->next_to_use;
24277 + spin_unlock_reiser4_super(sbinfo);
24278 + return oid;
24279 +}
24280 +
24281 +/*
24282 + * returns number of currently used oids. This is used by statfs(2) to report
24283 + * number of "inodes" and by disk format plugin to save oid allocator state on
24284 + * the disk.
24285 + */
24286 +long oids_used(const struct super_block *super)
24287 +{
24288 + reiser4_super_info_data *sbinfo;
24289 + oid_t used;
24290 +
24291 + sbinfo = get_super_private(super);
24292 +
24293 + spin_lock_reiser4_super(sbinfo);
24294 + used = sbinfo->oids_in_use;
24295 + spin_unlock_reiser4_super(sbinfo);
24296 + if (used < (__u64) ((long)~0) >> 1)
24297 + return (long)used;
24298 + else
24299 + return (long)-1;
24300 +}
24301 +
24302 +/*
24303 + * Count oid as allocated in atom. This is done after call to oid_allocate()
24304 + * at the point when we are irrevocably committed to creation of the new file
24305 + * (i.e., when oid allocation cannot be any longer rolled back due to some
24306 + * error).
24307 + */
24308 +void oid_count_allocated(void)
24309 +{
24310 + txn_atom *atom;
24311 +
24312 + atom = get_current_atom_locked();
24313 + atom->nr_objects_created++;
24314 + spin_unlock_atom(atom);
24315 +}
24316 +
24317 +/*
24318 + * Count oid as free in atom. This is done after call to oid_release() at the
24319 + * point when we are irrevocably committed to the deletion of the file (i.e.,
24320 + * when oid release cannot be any longer rolled back due to some error).
24321 + */
24322 +void oid_count_released(void)
24323 +{
24324 + txn_atom *atom;
24325 +
24326 + atom = get_current_atom_locked();
24327 + atom->nr_objects_deleted++;
24328 + spin_unlock_atom(atom);
24329 +}
24330 +
24331 +/*
24332 + Local variables:
24333 + c-indentation-style: "K&R"
24334 + mode-name: "LC"
24335 + c-basic-offset: 8
24336 + tab-width: 8
24337 + fill-column: 120
24338 + scroll-step: 1
24339 + End:
24340 +*/
24341 diff -urN linux-2.6.23.orig/fs/reiser4/page_cache.c linux-2.6.23/fs/reiser4/page_cache.c
24342 --- linux-2.6.23.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24343 +++ linux-2.6.23/fs/reiser4/page_cache.c 2007-12-04 21:05:55.806810005 +0300
24344 @@ -0,0 +1,730 @@
24345 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24346 + * reiser4/README */
24347 +
24348 +/* Memory pressure hooks. Fake inodes handling. */
24349 +
24350 +/* GLOSSARY
24351 +
24352 + . Formatted and unformatted nodes.
24353 + Elements of reiser4 balanced tree to store data and metadata.
24354 + Unformatted nodes are pointed to by extent pointers. Such nodes
24355 + are used to store data of large objects. Unlike unformatted nodes,
24356 + formatted ones have associated format described by node4X plugin.
24357 +
24358 + . Jnode (or journal node)
24359 + The in-memory header which is used to track formatted and unformatted
24360 + nodes, bitmap nodes, etc. In particular, jnodes are used to track
24361 + transactional information associated with each block(see reiser4/jnode.c
24362 + for details).
24363 +
24364 + . Znode
24365 + The in-memory header which is used to track formatted nodes. Contains
24366 + embedded jnode (see reiser4/znode.c for details).
24367 +*/
24368 +
24369 +/* We store all file system meta data (and data, of course) in the page cache.
24370 +
24371 + What does this mean? Instead of using bread/brelse we create a special
24372 + "fake" inode (one per super block) and store content of formatted nodes
24373 + into pages bound to this inode in the page cache. In newer kernels bread()
24374 + already uses inode attached to block device (bd_inode). Advantage of having
24375 + our own fake inode is that we can install appropriate methods in its
24376 + address_space operations. Such methods are called by VM on memory pressure
24377 + (or during background page flushing) and we can use them to react
24378 + appropriately.
24379 +
24380 + In initial version we only support one block per page. Support for multiple
24381 + blocks per page is complicated by relocation.
24382 +
24383 + To each page, used by reiser4, jnode is attached. jnode is analogous to
24384 + buffer head. Difference is that jnode is bound to the page permanently:
24385 + jnode cannot be removed from memory until its backing page is.
24386 +
24387 + jnode contains a pointer to the page (->pg field) and the page contains a
24388 + pointer back to the jnode in its ->private field. The pointer from jnode to page
24389 + is protected by the jnode's spinlock, and the pointer from page to jnode by the page lock
24390 + (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24391 + lock. To go into reverse direction use jnode_lock_page() function that uses
24392 + standard try-lock-and-release device.
24393 +
24394 + Properties:
24395 +
24396 + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24397 + reference counter is increased.
24398 +
24399 + 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page
24400 + reference counter is decreased.
24401 +
24402 + 3. on jload() reference counter on jnode page is increased, page is
24403 + kmapped and `referenced'.
24404 +
24405 + 4. on jrelse() inverse operations are performed.
24406 +
24407 + 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24408 +
24409 + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24410 + historically.]
24411 +
24412 + [In the following discussion, `lock' invariably means long term lock on
24413 + znode.] (What about page locks?)
24414 +
24415 + There is some special class of deadlock possibilities related to memory
24416 + pressure. Locks acquired by other reiser4 threads are accounted for in
24417 + deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24418 + invoked additional hidden arc is added to the locking graph: thread that
24419 + tries to allocate memory waits for ->vm_writeback() to finish. If this
24420 + thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24421 + prevention is useless.
24422 +
24423 + Another related problem is possibility for ->vm_writeback() to run out of
24424 + memory itself. This is not a problem for ext2 and friends, because their
24425 + ->vm_writeback() don't allocate much memory, but reiser4 flush is
24426 + definitely able to allocate huge amounts of memory.
24427 +
24428 + It seems that there is no reliable way to cope with the problems above.
24429 + Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24430 + context) wouldn't perform any flushing itself, but rather should just wake
24431 + up some auxiliary thread dedicated for this purpose (or, the same thread
24432 + that does periodic commit of old atoms (ktxnmgrd.c)).
24433 +
24434 + Details:
24435 +
24436 + 1. Page is called `reclaimable' against particular reiser4 mount F if this
24437 + page can be ultimately released by try_to_free_pages() under presumptions
24438 + that:
24439 +
24440 + a. ->vm_writeback() for F is no-op, and
24441 +
24442 + b. none of the threads accessing F are making any progress, and
24443 +
24444 + c. other reiser4 mounts obey the same memory reservation protocol as F
24445 + (described below).
24446 +
24447 + For example, clean un-pinned page, or page occupied by ext2 data are
24448 + reclaimable against any reiser4 mount.
24449 +
24450 + When there is more than one reiser4 mount in a system, condition (c) makes
24451 + reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24452 +
24453 + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24454 +
24455 + Fake inode is used to bound formatted nodes and each node is indexed within
24456 + fake inode by its block number. If the block size is smaller than page size, it
24457 + may so happen that block mapped to the page with formatted node is occupied
24458 + by unformatted node or is unallocated. This leads to some complications,
24459 + because flushing whole page can lead to an incorrect overwrite of
24460 + unformatted node that, moreover, can be cached in some other place as
24461 + part of the file body. To avoid this, buffers for unformatted nodes are
24462 + never marked dirty. Also pages in the fake are never marked dirty. This
24463 + rules out usage of ->writepage() as memory pressure hook. In stead
24464 + ->releasepage() is used.
24465 +
24466 + Josh is concerned that page->buffer is going to die. This should not pose
24467 + significant problem though, because we need to add some data structures to
24468 + the page anyway (jnode) and all necessary book keeping can be put there.
24469 +
24470 +*/
24471 +
24472 +/* Life cycle of pages/nodes.
24473 +
24474 + jnode contains reference to page and page contains reference back to
24475 + jnode. This reference is counted in page ->count. Thus, page bound to jnode
24476 + cannot be released back into free pool.
24477 +
24478 + 1. Formatted nodes.
24479 +
24480 + 1. formatted node is represented by znode. When new znode is created its
24481 + ->pg pointer is NULL initially.
24482 +
24483 + 2. when node content is loaded into znode (by call to zload()) for the
24484 + first time following happens (in call to ->read_node() or
24485 + ->allocate_node()):
24486 +
24487 + 1. new page is added to the page cache.
24488 +
24489 + 2. this page is attached to znode and its ->count is increased.
24490 +
24491 + 3. page is kmapped.
24492 +
24493 + 3. if more calls to zload() follow (without corresponding zrelses), page
24494 + counter is left intact and in its stead ->d_count is increased in znode.
24495 +
24496 + 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24497 + ->release_node() is called and page is kunmapped as result.
24498 +
24499 + 5. at some moment node can be captured by a transaction. Its ->x_count
24500 + is then increased by transaction manager.
24501 +
24502 + 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24503 + bit set) following will happen (also see comment at the top of znode.c):
24504 +
24505 + 1. when last lock is released, node will be uncaptured from
24506 + transaction. This releases the reference that the transaction manager acquired
24507 + at step 5.
24508 +
24509 + 2. when last reference is released, zput() detects that node is
24510 + actually deleted and calls ->delete_node()
24511 + operation. page_cache_delete_node() implementation detaches jnode from
24512 + page and releases page.
24513 +
24514 + 7. otherwise (node wasn't removed from the tree), last reference to
24515 + znode will be released after transaction manager committed transaction
24516 + node was in. This implies squallocing of this node (see
24517 + flush.c). Nothing special happens at this point. Znode is still in the
24518 + hash table and page is still attached to it.
24519 +
24520 + 8. znode is actually removed from the memory because of the memory
24521 + pressure, or during umount (znodes_tree_done()). Anyway, znode is
24522 + removed by the call to zdrop(). At this moment, page is detached from
24523 + znode and removed from the inode address space.
24524 +
24525 +*/
24526 +
24527 +#include "debug.h"
24528 +#include "dformat.h"
24529 +#include "key.h"
24530 +#include "txnmgr.h"
24531 +#include "jnode.h"
24532 +#include "znode.h"
24533 +#include "block_alloc.h"
24534 +#include "tree.h"
24535 +#include "vfs_ops.h"
24536 +#include "inode.h"
24537 +#include "super.h"
24538 +#include "entd.h"
24539 +#include "page_cache.h"
24540 +#include "ktxnmgrd.h"
24541 +
24542 +#include <linux/types.h>
24543 +#include <linux/fs.h>
24544 +#include <linux/mm.h> /* for struct page */
24545 +#include <linux/swap.h> /* for struct page */
24546 +#include <linux/pagemap.h>
24547 +#include <linux/bio.h>
24548 +#include <linux/writeback.h>
24549 +#include <linux/blkdev.h>
24550 +
24551 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24552 +
24553 +static struct address_space_operations formatted_fake_as_ops;
24554 +
24555 +static const oid_t fake_ino = 0x1;
24556 +static const oid_t bitmap_ino = 0x2;
24557 +static const oid_t cc_ino = 0x3;
24558 +
24559 +static void
24560 +init_fake_inode(struct super_block *super, struct inode *fake,
24561 + struct inode **pfake)
24562 +{
24563 + assert("nikita-2168", fake->i_state & I_NEW);
24564 + fake->i_mapping->a_ops = &formatted_fake_as_ops;
24565 + *pfake = fake;
24566 + /* NOTE-NIKITA something else? */
24567 + unlock_new_inode(fake);
24568 +}
24569 +
24570 +/**
24571 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24572 + * @super: super block to init fake inode for
24573 + *
24574 + * Initializes fake inode to which formatted nodes are bound in the page cache
24575 + * and inode for bitmaps.
24576 + */
24577 +int reiser4_init_formatted_fake(struct super_block *super)
24578 +{
24579 + struct inode *fake;
24580 + struct inode *bitmap;
24581 + struct inode *cc;
24582 + reiser4_super_info_data *sinfo;
24583 +
24584 + assert("nikita-1703", super != NULL);
24585 +
24586 + sinfo = get_super_private_nocheck(super);
24587 + fake = iget_locked(super, oid_to_ino(fake_ino));
24588 +
24589 + if (fake != NULL) {
24590 + init_fake_inode(super, fake, &sinfo->fake);
24591 +
24592 + bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24593 + if (bitmap != NULL) {
24594 + init_fake_inode(super, bitmap, &sinfo->bitmap);
24595 +
24596 + cc = iget_locked(super, oid_to_ino(cc_ino));
24597 + if (cc != NULL) {
24598 + init_fake_inode(super, cc, &sinfo->cc);
24599 + return 0;
24600 + } else {
24601 + iput(sinfo->fake);
24602 + iput(sinfo->bitmap);
24603 + sinfo->fake = NULL;
24604 + sinfo->bitmap = NULL;
24605 + }
24606 + } else {
24607 + iput(sinfo->fake);
24608 + sinfo->fake = NULL;
24609 + }
24610 + }
24611 + return RETERR(-ENOMEM);
24612 +}
24613 +
24614 +/**
24615 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24616 + * @super: super block to init fake inode for
24617 + *
24618 + * Releases inodes which were used as address spaces of bitmap and formatted
24619 + * nodes.
24620 + */
24621 +void reiser4_done_formatted_fake(struct super_block *super)
24622 +{
24623 + reiser4_super_info_data *sinfo;
24624 +
24625 + sinfo = get_super_private_nocheck(super);
24626 +
24627 + if (sinfo->fake != NULL) {
24628 + iput(sinfo->fake);
24629 + sinfo->fake = NULL;
24630 + }
24631 +
24632 + if (sinfo->bitmap != NULL) {
24633 + iput(sinfo->bitmap);
24634 + sinfo->bitmap = NULL;
24635 + }
24636 +
24637 + if (sinfo->cc != NULL) {
24638 + iput(sinfo->cc);
24639 + sinfo->cc = NULL;
24640 + }
24641 + return;
24642 +}
24643 +
24644 +void reiser4_wait_page_writeback(struct page *page)
24645 +{
24646 + assert("zam-783", PageLocked(page));
24647 +
24648 + do {
24649 + unlock_page(page);
24650 + wait_on_page_writeback(page);
24651 + lock_page(page);
24652 + } while (PageWriteback(page));
24653 +}
24654 +
24655 +/* return tree @page is in */
24656 +reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24657 +{
24658 + assert("nikita-2461", page != NULL);
24659 + return &get_super_private(page->mapping->host->i_sb)->tree;
24660 +}
24661 +
24662 +/* completion handler for single page bio-based read.
24663 +
24664 + mpage_end_io_read() would also do. But it's static.
24665 +
24666 +*/
24667 +static int
24668 +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24669 + int err UNUSED_ARG)
24670 +{
24671 + struct page *page;
24672 +
24673 + if (bio->bi_size != 0) {
24674 + warning("nikita-3332", "Truncated single page read: %i",
24675 + bio->bi_size);
24676 + return 1;
24677 + }
24678 +
24679 + page = bio->bi_io_vec[0].bv_page;
24680 +
24681 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24682 + SetPageUptodate(page);
24683 + } else {
24684 + ClearPageUptodate(page);
24685 + SetPageError(page);
24686 + }
24687 + unlock_page(page);
24688 + bio_put(bio);
24689 + return 0;
24690 +}
24691 +
24692 +/* completion handler for single page bio-based write.
24693 +
24694 + mpage_end_io_write() would also do. But it's static.
24695 +
24696 +*/
24697 +static int
24698 +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24699 + int err UNUSED_ARG)
24700 +{
24701 + struct page *page;
24702 +
24703 + if (bio->bi_size != 0) {
24704 + warning("nikita-3333", "Truncated single page write: %i",
24705 + bio->bi_size);
24706 + return 1;
24707 + }
24708 +
24709 + page = bio->bi_io_vec[0].bv_page;
24710 +
24711 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24712 + SetPageError(page);
24713 + end_page_writeback(page);
24714 + bio_put(bio);
24715 + return 0;
24716 +}
24717 +
24718 +/* ->readpage() method for formatted nodes */
24719 +static int formatted_readpage(struct file *f UNUSED_ARG,
24720 + struct page *page /* page to read */ )
24721 +{
24722 + assert("nikita-2412", PagePrivate(page) && jprivate(page));
24723 + return reiser4_page_io(page, jprivate(page), READ,
24724 + reiser4_ctx_gfp_mask_get());
24725 +}
24726 +
24727 +/**
24728 + * reiser4_page_io - submit single-page bio request
24729 + * @page: page to perform io for
24730 + * @node: jnode of page
24731 + * @rw: read or write
24732 + * @gfp: gfp mask for bio allocation
24733 + *
24734 + * Submits single page read or write.
24735 + */
24736 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24737 +{
24738 + struct bio *bio;
24739 + int result;
24740 +
24741 + assert("nikita-2094", page != NULL);
24742 + assert("nikita-2226", PageLocked(page));
24743 + assert("nikita-2634", node != NULL);
24744 + assert("nikita-2893", rw == READ || rw == WRITE);
24745 +
24746 + if (rw) {
24747 + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24748 + unlock_page(page);
24749 + return 0;
24750 + }
24751 + }
24752 +
24753 + bio = page_bio(page, node, rw, gfp);
24754 + if (!IS_ERR(bio)) {
24755 + if (rw == WRITE) {
24756 + set_page_writeback(page);
24757 + unlock_page(page);
24758 + }
24759 + reiser4_submit_bio(rw, bio);
24760 + result = 0;
24761 + } else {
24762 + unlock_page(page);
24763 + result = PTR_ERR(bio);
24764 + }
24765 +
24766 + return result;
24767 +}
24768 +
24769 +/* helper function to construct bio for page */
24770 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24771 +{
24772 + struct bio *bio;
24773 + assert("nikita-2092", page != NULL);
24774 + assert("nikita-2633", node != NULL);
24775 +
24776 + /* Simple implementation in the assumption that blocksize == pagesize.
24777 +
24778 + We only have to submit one block, but submit_bh() will allocate bio
24779 + anyway, so lets use all the bells-and-whistles of bio code.
24780 + */
24781 +
24782 + bio = bio_alloc(gfp, 1);
24783 + if (bio != NULL) {
24784 + int blksz;
24785 + struct super_block *super;
24786 + reiser4_block_nr blocknr;
24787 +
24788 + super = page->mapping->host->i_sb;
24789 + assert("nikita-2029", super != NULL);
24790 + blksz = super->s_blocksize;
24791 + assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24792 +
24793 + spin_lock_jnode(node);
24794 + blocknr = *jnode_get_io_block(node);
24795 + spin_unlock_jnode(node);
24796 +
24797 + assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24798 + assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24799 +
24800 + bio->bi_bdev = super->s_bdev;
24801 + /* fill bio->bi_sector before calling bio_add_page(), because
24802 + * q->merge_bvec_fn may want to inspect it (see
24803 + * drivers/md/linear.c:linear_mergeable_bvec() for example. */
24804 + bio->bi_sector = blocknr * (blksz >> 9);
24805 +
24806 + if (!bio_add_page(bio, page, blksz, 0)) {
24807 + warning("nikita-3452",
24808 + "Single page bio cannot be constructed");
24809 + return ERR_PTR(RETERR(-EINVAL));
24810 + }
24811 +
24812 + /* bio -> bi_idx is filled by bio_init() */
24813 + bio->bi_end_io = (rw == READ) ?
24814 + end_bio_single_page_read : end_bio_single_page_write;
24815 +
24816 + return bio;
24817 + } else
24818 + return ERR_PTR(RETERR(-ENOMEM));
24819 +}
24820 +
24821 +/* this function is internally called by jnode_make_dirty() */
24822 +int reiser4_set_page_dirty_internal(struct page *page)
24823 +{
24824 + struct address_space *mapping;
24825 +
24826 + mapping = page->mapping;
24827 + BUG_ON(mapping == NULL);
24828 +
24829 + if (!TestSetPageDirty(page)) {
24830 + if (mapping_cap_account_dirty(mapping))
24831 + inc_zone_page_state(page, NR_FILE_DIRTY);
24832 +
24833 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24834 + }
24835 +
24836 + /* znode must be dirty ? */
24837 + if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24838 + assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24839 + return 0;
24840 +}
24841 +
24842 +#if 0
24843 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24844 +{
24845 + if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24846 + return 1;
24847 + if (ctx->super != s)
24848 + return 1;
24849 + if (get_super_private(s)->entd.tsk == current)
24850 + return 0;
24851 + if (!lock_stack_isclean(&ctx->stack))
24852 + return 0;
24853 + if (ctx->trans->atom != NULL)
24854 + return 0;
24855 + return 1;
24856 +}
24857 +#endif
24858 +
24859 +/**
24860 + * reiser4_writepage - writepage of struct address_space_operations
24861 + * @page: page to write
24862 + * @wbc:
24863 + *
24864 + *
24865 + */
24866 +/* Common memory pressure notification. */
24867 +int reiser4_writepage(struct page *page,
24868 + struct writeback_control *wbc)
24869 +{
24870 + struct super_block *s;
24871 + reiser4_context *ctx;
24872 +
24873 + assert("vs-828", PageLocked(page));
24874 +
24875 + s = page->mapping->host->i_sb;
24876 + ctx = get_current_context_check();
24877 +
24878 + //assert("", can_hit_entd(ctx, s));
24879 + return write_page_by_ent(page, wbc);
24880 +}
24881 +
24882 +/* ->set_page_dirty() method of formatted address_space */
24883 +static int formatted_set_page_dirty(struct page *page)
24884 +{
24885 + assert("nikita-2173", page != NULL);
24886 + BUG();
24887 + return __set_page_dirty_nobuffers(page);
24888 +}
24889 +
24890 +/* writepages method of address space operations in reiser4 is used to involve
24891 + into transactions pages which are dirtied via mmap. Only regular files can
24892 + have such pages. Fake inode is used to access formatted nodes via page
24893 + cache. As formatted nodes can never be mmaped, fake inode's writepages has
24894 + nothing to do */
24895 +static int
24896 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24897 +{
24898 + return 0;
24899 +}
24900 +
24901 +/* address space operations for the fake inode */
24902 +static struct address_space_operations formatted_fake_as_ops = {
24903 + /* Perform a writeback of a single page as a memory-freeing
24904 + * operation. */
24905 + .writepage = reiser4_writepage,
24906 + /* this is called to read formatted node */
24907 + .readpage = formatted_readpage,
24908 + /* ->sync_page() method of fake inode address space operations. Called
24909 + from wait_on_page() and lock_page().
24910 +
24911 + This is most annoyingly misnomered method. Actually it is called
24912 + from wait_on_page_bit() and lock_page() and its purpose is to
24913 + actually start io by jabbing device drivers.
24914 + */
24915 + .sync_page = block_sync_page,
24916 + /* Write back some dirty pages from this mapping. Called from sync.
24917 + called during sync (pdflush) */
24918 + .writepages = writepages_fake,
24919 + /* Set a page dirty */
24920 + .set_page_dirty = formatted_set_page_dirty,
24921 + /* used for read-ahead. Not applicable */
24922 + .readpages = NULL,
24923 + .prepare_write = NULL,
24924 + .commit_write = NULL,
24925 + .bmap = NULL,
24926 + /* called just before page is being detached from inode mapping and
24927 + removed from memory. Called on truncate, cut/squeeze, and
24928 + umount. */
24929 + .invalidatepage = reiser4_invalidatepage,
24930 + /* this is called by shrink_cache() so that file system can try to
24931 + release objects (jnodes, buffers, journal heads) attached to page
24932 + and, may be made page itself free-able.
24933 + */
24934 + .releasepage = reiser4_releasepage,
24935 + .direct_IO = NULL
24936 +};
24937 +
24938 +/* called just before page is released (no longer used by reiser4). Callers:
24939 + jdelete() and extent2tail(). */
24940 +void reiser4_drop_page(struct page *page)
24941 +{
24942 + assert("nikita-2181", PageLocked(page));
24943 + clear_page_dirty_for_io(page);
24944 + ClearPageUptodate(page);
24945 +#if defined(PG_skipped)
24946 + ClearPageSkipped(page);
24947 +#endif
24948 + unlock_page(page);
24949 +}
24950 +
24951 +#define JNODE_GANG_SIZE (16)
24952 +
24953 +/* find all jnodes from range specified and invalidate them */
24954 +static int
24955 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24956 +{
24957 + reiser4_inode *info;
24958 + int truncated_jnodes;
24959 + reiser4_tree *tree;
24960 + unsigned long index;
24961 + unsigned long end;
24962 +
24963 + if (inode_file_plugin(inode) ==
24964 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24965 + /*
24966 + * No need to get rid of jnodes here: if the single jnode of
24967 + * page cluster did not have page, then it was found and killed
24968 + * before in
24969 + * truncate_complete_page_cluster()->jput()->jput_final(),
24970 + * otherwise it will be dropped by reiser4_invalidatepage()
24971 + */
24972 + return 0;
24973 + truncated_jnodes = 0;
24974 +
24975 + info = reiser4_inode_data(inode);
24976 + tree = reiser4_tree_by_inode(inode);
24977 +
24978 + index = from;
24979 + end = from + count;
24980 +
24981 + while (1) {
24982 + jnode *gang[JNODE_GANG_SIZE];
24983 + int taken;
24984 + int i;
24985 + jnode *node;
24986 +
24987 + assert("nikita-3466", index <= end);
24988 +
24989 + read_lock_tree(tree);
24990 + taken =
24991 + radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24992 + (void **)gang, index,
24993 + JNODE_GANG_SIZE);
24994 + for (i = 0; i < taken; ++i) {
24995 + node = gang[i];
24996 + if (index_jnode(node) < end)
24997 + jref(node);
24998 + else
24999 + gang[i] = NULL;
25000 + }
25001 + read_unlock_tree(tree);
25002 +
25003 + for (i = 0; i < taken; ++i) {
25004 + node = gang[i];
25005 + if (node != NULL) {
25006 + index = max(index, index_jnode(node));
25007 + spin_lock_jnode(node);
25008 + assert("edward-1457", node->pg == NULL);
25009 + /* this is always called after
25010 + truncate_inode_pages_range(). Therefore, here
25011 + jnode can not have page. New pages can not be
25012 + created because truncate_jnodes_range goes
25013 + under exclusive access on file obtained,
25014 + where as new page creation requires
25015 + non-exclusive access obtained */
25016 + JF_SET(node, JNODE_HEARD_BANSHEE);
25017 + reiser4_uncapture_jnode(node);
25018 + unhash_unformatted_jnode(node);
25019 + truncated_jnodes++;
25020 + jput(node);
25021 + } else
25022 + break;
25023 + }
25024 + if (i != taken || taken == 0)
25025 + break;
25026 + }
25027 + return truncated_jnodes;
25028 +}
25029 +
25030 +/* Truncating files in reiser4: problems and solutions.
25031 +
25032 + VFS calls fs's truncate after it has called truncate_inode_pages()
25033 + to get rid of pages corresponding to part of file being truncated.
25034 + In reiser4 it may cause existence of unallocated extents which do
25035 + not have jnodes. Flush code does not expect that. Solution of this
25036 + problem is straightforward. As vfs's truncate is implemented using
25037 + setattr operation, it seems reasonable to have ->setattr() that
25038 + will cut file body. However, flush code also does not expect dirty
25039 + pages without parent items, so it is impossible to cut all items,
25040 + then truncate all pages in two steps. We resolve this problem by
25041 + cutting items one-by-one. Each such fine-grained step performed
25042 + under longterm znode lock calls at the end ->kill_hook() method of
25043 + a killed item to remove its bound pages and jnodes.
25044 +
25045 + The following function is a common part of mentioned kill hooks.
25046 + Also, this is called before tail-to-extent conversion (to not manage
25047 + few copies of the data).
25048 +*/
25049 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25050 + unsigned long count, int even_cows)
25051 +{
25052 + loff_t from_bytes, count_bytes;
25053 +
25054 + if (count == 0)
25055 + return;
25056 + from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25057 + count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25058 +
25059 + unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25060 + truncate_inode_pages_range(mapping, from_bytes,
25061 + from_bytes + count_bytes - 1);
25062 + truncate_jnodes_range(mapping->host, from, count);
25063 +}
25064 +
25065 +/*
25066 + * Local variables:
25067 + * c-indentation-style: "K&R"
25068 + * mode-name: "LC"
25069 + * c-basic-offset: 8
25070 + * tab-width: 8
25071 + * fill-column: 120
25072 + * scroll-step: 1
25073 + * End:
25074 + */
25075 diff -urN linux-2.6.23.orig/fs/reiser4/page_cache.h linux-2.6.23/fs/reiser4/page_cache.h
25076 --- linux-2.6.23.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25077 +++ linux-2.6.23/fs/reiser4/page_cache.h 2007-12-04 16:49:30.000000000 +0300
25078 @@ -0,0 +1,68 @@
25079 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25080 + * reiser4/README */
25081 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25082 +
25083 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25084 +#define __REISER4_PAGE_CACHE_H__
25085 +
25086 +#include "forward.h"
25087 +#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25088 +
25089 +#include <linux/fs.h> /* for struct super_block, address_space */
25090 +#include <linux/mm.h> /* for struct page */
25091 +#include <linux/pagemap.h> /* for lock_page() */
25092 +#include <linux/vmalloc.h> /* for __vmalloc() */
25093 +
25094 +extern int reiser4_init_formatted_fake(struct super_block *);
25095 +extern void reiser4_done_formatted_fake(struct super_block *);
25096 +
25097 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25098 +
25099 +extern int reiser4_set_page_dirty_internal(struct page *);
25100 +
25101 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25102 +
25103 +extern void reiser4_wait_page_writeback(struct page *);
25104 +static inline void lock_and_wait_page_writeback(struct page *page)
25105 +{
25106 + lock_page(page);
25107 + if (unlikely(PageWriteback(page)))
25108 + reiser4_wait_page_writeback(page);
25109 +}
25110 +
25111 +#define jprivate(page) ((jnode *)page_private(page))
25112 +
25113 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25114 +extern void reiser4_drop_page(struct page *);
25115 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25116 + unsigned long count, int even_cows);
25117 +extern void capture_reiser4_inodes(struct super_block *,
25118 + struct writeback_control *);
25119 +static inline void * reiser4_vmalloc (unsigned long size)
25120 +{
25121 + return __vmalloc(size,
25122 + reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25123 + PAGE_KERNEL);
25124 +}
25125 +
25126 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25127 +
25128 +#if REISER4_DEBUG
25129 +extern void print_page(const char *prefix, struct page *page);
25130 +#else
25131 +#define print_page(prf, p) noop
25132 +#endif
25133 +
25134 +/* __REISER4_PAGE_CACHE_H__ */
25135 +#endif
25136 +
25137 +/* Make Linus happy.
25138 + Local variables:
25139 + c-indentation-style: "K&R"
25140 + mode-name: "LC"
25141 + c-basic-offset: 8
25142 + tab-width: 8
25143 + fill-column: 120
25144 + scroll-step: 1
25145 + End:
25146 +*/
25147 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/cluster.c linux-2.6.23/fs/reiser4/plugin/cluster.c
25148 --- linux-2.6.23.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25149 +++ linux-2.6.23/fs/reiser4/plugin/cluster.c 2007-12-04 16:49:30.000000000 +0300
25150 @@ -0,0 +1,71 @@
25151 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25152 +
25153 +/* Contains reiser4 cluster plugins (see
25154 + http://www.namesys.com/cryptcompress_design.html
25155 + "Concepts of clustering" for details). */
25156 +
25157 +#include "plugin_header.h"
25158 +#include "plugin.h"
25159 +#include "../inode.h"
25160 +
25161 +static int change_cluster(struct inode *inode,
25162 + reiser4_plugin * plugin,
25163 + pset_member memb)
25164 +{
25165 + assert("edward-1324", inode != NULL);
25166 + assert("edward-1325", plugin != NULL);
25167 + assert("edward-1326", is_reiser4_inode(inode));
25168 + assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25169 +
25170 + /* Can't change the cluster plugin for already existent regular files. */
25171 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25172 + return RETERR(-EINVAL);
25173 +
25174 + /* If matches, nothing to change. */
25175 + if (inode_hash_plugin(inode) != NULL &&
25176 + inode_hash_plugin(inode)->h.id == plugin->h.id)
25177 + return 0;
25178 +
25179 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25180 + PSET_CLUSTER, plugin);
25181 +}
25182 +
25183 +static reiser4_plugin_ops cluster_plugin_ops = {
25184 + .init = NULL,
25185 + .load = NULL,
25186 + .save_len = NULL,
25187 + .save = NULL,
25188 + .change = &change_cluster
25189 +};
25190 +
25191 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25192 + [CLUSTER_ ## ID ## _ID] = { \
25193 + .h = { \
25194 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25195 + .id = CLUSTER_ ## ID ## _ID, \
25196 + .pops = &cluster_plugin_ops, \
25197 + .label = LABEL, \
25198 + .desc = DESC, \
25199 + .linkage = {NULL, NULL} \
25200 + }, \
25201 + .shift = SHIFT \
25202 + }
25203 +
25204 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25205 + SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25206 + SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25207 + SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25208 + SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25209 + SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25210 +};
25211 +
25212 +/*
25213 + Local variables:
25214 + c-indentation-style: "K&R"
25215 + mode-name: "LC"
25216 + c-basic-offset: 8
25217 + tab-width: 8
25218 + fill-column: 120
25219 + scroll-step: 1
25220 + End:
25221 +*/
25222 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/cluster.h linux-2.6.23/fs/reiser4/plugin/cluster.h
25223 --- linux-2.6.23.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25224 +++ linux-2.6.23/fs/reiser4/plugin/cluster.h 2007-12-04 16:49:30.000000000 +0300
25225 @@ -0,0 +1,395 @@
25226 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25227 +
25228 +/* This file contains size/offset translators, modulators
25229 + and other helper functions. */
25230 +
25231 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25232 +#define __FS_REISER4_CLUSTER_H__
25233 +
25234 +#include "../inode.h"
25235 +
25236 +static inline int inode_cluster_shift(struct inode *inode)
25237 +{
25238 + assert("edward-92", inode != NULL);
25239 + assert("edward-93", reiser4_inode_data(inode) != NULL);
25240 +
25241 + return inode_cluster_plugin(inode)->shift;
25242 +}
25243 +
25244 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25245 +{
25246 + return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25247 +}
25248 +
25249 +/* cluster size in page units */
25250 +static inline unsigned cluster_nrpages(struct inode *inode)
25251 +{
25252 + return 1U << cluster_nrpages_shift(inode);
25253 +}
25254 +
25255 +static inline size_t inode_cluster_size(struct inode *inode)
25256 +{
25257 + assert("edward-96", inode != NULL);
25258 +
25259 + return 1U << inode_cluster_shift(inode);
25260 +}
25261 +
25262 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25263 +{
25264 + return idx >> cluster_nrpages_shift(inode);
25265 +}
25266 +
25267 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25268 +{
25269 + return idx << cluster_nrpages_shift(inode);
25270 +}
25271 +
25272 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25273 +{
25274 + return clust_to_pg(pg_to_clust(idx, inode), inode);
25275 +}
25276 +
25277 +static inline pgoff_t off_to_pg(loff_t off)
25278 +{
25279 + return (off >> PAGE_CACHE_SHIFT);
25280 +}
25281 +
25282 +static inline loff_t pg_to_off(pgoff_t idx)
25283 +{
25284 + return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25285 +}
25286 +
25287 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25288 +{
25289 + return off >> inode_cluster_shift(inode);
25290 +}
25291 +
25292 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25293 +{
25294 + return (loff_t) idx << inode_cluster_shift(inode);
25295 +}
25296 +
25297 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25298 +{
25299 + return clust_to_off(off_to_clust(off, inode), inode);
25300 +}
25301 +
25302 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25303 +{
25304 + return clust_to_pg(off_to_clust(off, inode), inode);
25305 +}
25306 +
25307 +static inline unsigned off_to_pgoff(loff_t off)
25308 +{
25309 + return off & (PAGE_CACHE_SIZE - 1);
25310 +}
25311 +
25312 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25313 +{
25314 + return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25315 +}
25316 +
25317 +static inline pgoff_t offset_in_clust(struct page * page)
25318 +{
25319 + assert("edward-1488", page != NULL);
25320 + assert("edward-1489", page->mapping != NULL);
25321 +
25322 + return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25323 +}
25324 +
25325 +static inline int first_page_in_cluster(struct page * page)
25326 +{
25327 + return offset_in_clust(page) == 0;
25328 +}
25329 +
25330 +static inline int last_page_in_cluster(struct page * page)
25331 +{
25332 + return offset_in_clust(page) ==
25333 + cluster_nrpages(page->mapping->host) - 1;
25334 +}
25335 +
25336 +static inline unsigned
25337 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25338 +{
25339 + return off_to_cloff(pg_to_off(idx), inode);
25340 +}
25341 +
25342 +/*********************** Size translators **************************/
25343 +
25344 +/* Translate linear size.
25345 + * New units are (1 << @blk_shift) times larger, then old ones.
25346 + * In other words, calculate number of logical blocks, occupied
25347 + * by @count elements
25348 + */
25349 +static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25350 +{
25351 + return (count + (1UL << blkbits) - 1) >> blkbits;
25352 +}
25353 +
25354 +/* size in pages */
25355 +static inline pgoff_t size_in_pages(loff_t size)
25356 +{
25357 + return size_in_blocks(size, PAGE_CACHE_SHIFT);
25358 +}
25359 +
25360 +/* size in logical clusters */
25361 +static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25362 +{
25363 + return size_in_blocks(size, inode_cluster_shift(inode));
25364 +}
25365 +
25366 +/* size in pages to the size in page clusters */
25367 +static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25368 +{
25369 + return size_in_blocks(size, cluster_nrpages_shift(inode));
25370 +}
25371 +
25372 +/*********************** Size modulators ***************************/
25373 +
25374 +/*
25375 + Modulate linear size by nominated block size and offset.
25376 +
25377 + The "finite" function (which is zero almost everywhere).
25378 + How much is a height of the figure at a position @pos,
25379 + when trying to construct rectangle of height (1 << @blkbits),
25380 + and square @size.
25381 +
25382 + ******
25383 + *******
25384 + *******
25385 + *******
25386 + ----------> pos
25387 +*/
25388 +static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25389 +{
25390 + unsigned end = size >> blkbits;
25391 + if (pos < end)
25392 + return 1U << blkbits;
25393 + if (unlikely(pos > end))
25394 + return 0;
25395 + return size & ~(~0ull << blkbits);
25396 +}
25397 +
25398 +/* the same as above, but block size is page size */
25399 +static inline unsigned __mbp(loff_t size, pgoff_t pos)
25400 +{
25401 + return __mbb(size, pos, PAGE_CACHE_SHIFT);
25402 +}
25403 +
25404 +/* number of file's bytes in the nominated logical cluster */
25405 +static inline unsigned lbytes(cloff_t index, struct inode * inode)
25406 +{
25407 + return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25408 +}
25409 +
25410 +/* number of file's bytes in the nominated page */
25411 +static inline unsigned pbytes(pgoff_t index, struct inode * inode)
25412 +{
25413 + return __mbp(i_size_read(inode), index);
25414 +}
25415 +
25416 +/* return true, if logical cluster is not occupied by the file */
25417 +static inline int new_logical_cluster(struct cluster_handle * clust,
25418 + struct inode *inode)
25419 +{
25420 + return clust_to_off(clust->index, inode) >= i_size_read(inode);
25421 +}
25422 +
25423 +/* return true, if pages @p1 and @p2 are of the same page cluster */
25424 +static inline int same_page_cluster(struct page * p1, struct page * p2)
25425 +{
25426 + assert("edward-1490", p1 != NULL);
25427 + assert("edward-1491", p2 != NULL);
25428 + assert("edward-1492", p1->mapping != NULL);
25429 + assert("edward-1493", p2->mapping != NULL);
25430 +
25431 + return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25432 + pg_to_clust(page_index(p2), p2->mapping->host));
25433 +}
25434 +
25435 +static inline int cluster_is_complete(struct cluster_handle * clust,
25436 + struct inode * inode)
25437 +{
25438 + return clust->tc.lsize == inode_cluster_size(inode);
25439 +}
25440 +
25441 +static inline void reiser4_slide_init(struct reiser4_slide * win)
25442 +{
25443 + assert("edward-1084", win != NULL);
25444 + memset(win, 0, sizeof *win);
25445 +}
25446 +
25447 +static inline tfm_action
25448 +cluster_get_tfm_act(struct tfm_cluster * tc)
25449 +{
25450 + assert("edward-1356", tc != NULL);
25451 + return tc->act;
25452 +}
25453 +
25454 +static inline void
25455 +cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act)
25456 +{
25457 + assert("edward-1356", tc != NULL);
25458 + tc->act = act;
25459 +}
25460 +
25461 +static inline void cluster_init_act(struct cluster_handle * clust,
25462 + tfm_action act,
25463 + struct reiser4_slide * window)
25464 +{
25465 + assert("edward-84", clust != NULL);
25466 + memset(clust, 0, sizeof *clust);
25467 + cluster_set_tfm_act(&clust->tc, act);
25468 + clust->dstat = INVAL_DISK_CLUSTER;
25469 + clust->win = window;
25470 +}
25471 +
25472 +static inline void cluster_init_read(struct cluster_handle * clust,
25473 + struct reiser4_slide * window)
25474 +{
25475 + cluster_init_act (clust, TFMA_READ, window);
25476 +}
25477 +
25478 +static inline void cluster_init_write(struct cluster_handle * clust,
25479 + struct reiser4_slide * window)
25480 +{
25481 + cluster_init_act (clust, TFMA_WRITE, window);
25482 +}
25483 +
25484 +/* true if @p1 and @p2 are items of the same disk cluster */
25485 +static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2)
25486 +{
25487 + /* drop this if you have other items to aggregate */
25488 + assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25489 +
25490 + return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25491 +}
25492 +
25493 +static inline int dclust_get_extension_dsize(hint_t * hint)
25494 +{
25495 + return hint->ext_coord.extension.ctail.dsize;
25496 +}
25497 +
25498 +static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25499 +{
25500 + hint->ext_coord.extension.ctail.dsize = dsize;
25501 +}
25502 +
25503 +static inline int dclust_get_extension_shift(hint_t * hint)
25504 +{
25505 + return hint->ext_coord.extension.ctail.shift;
25506 +}
25507 +
25508 +static inline int dclust_get_extension_ncount(hint_t * hint)
25509 +{
25510 + return hint->ext_coord.extension.ctail.ncount;
25511 +}
25512 +
25513 +static inline void dclust_inc_extension_ncount(hint_t * hint)
25514 +{
25515 + hint->ext_coord.extension.ctail.ncount ++;
25516 +}
25517 +
25518 +static inline void dclust_init_extension(hint_t * hint)
25519 +{
25520 + memset(&hint->ext_coord.extension.ctail, 0,
25521 + sizeof(hint->ext_coord.extension.ctail));
25522 +}
25523 +
25524 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25525 +{
25526 + assert("edward-1451", hint_is_valid(hint));
25527 + return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25528 +}
25529 +
25530 +static inline void coord_set_between_clusters(coord_t * coord)
25531 +{
25532 +#if REISER4_DEBUG
25533 + int result;
25534 + result = zload(coord->node);
25535 + assert("edward-1296", !result);
25536 +#endif
25537 + if (!coord_is_between_items(coord)) {
25538 + coord->between = AFTER_ITEM;
25539 + coord->unit_pos = 0;
25540 + }
25541 +#if REISER4_DEBUG
25542 + zrelse(coord->node);
25543 +#endif
25544 +}
25545 +
25546 +int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25547 +int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25548 + znode_lock_mode mode);
25549 +int checkout_logical_cluster(struct cluster_handle *, jnode *, struct inode *);
25550 +int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25551 +void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25552 + int even_cows);
25553 +void invalidate_hint_cluster(struct cluster_handle * clust);
25554 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode,
25555 + znode_lock_mode lock_mode);
25556 +void reset_cluster_params(struct cluster_handle * clust);
25557 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
25558 + int count);
25559 +int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust,
25560 + rw_op rw);
25561 +void put_page_cluster(struct cluster_handle * clust,
25562 + struct inode * inode, rw_op rw);
25563 +void put_cluster_handle(struct cluster_handle * clust);
25564 +int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id);
25565 +int tfm_cluster_is_uptodate(struct tfm_cluster * tc);
25566 +void tfm_cluster_set_uptodate(struct tfm_cluster * tc);
25567 +void tfm_cluster_clr_uptodate(struct tfm_cluster * tc);
25568 +
25569 +/* move cluster handle to the target position
25570 + specified by the page of index @pgidx */
25571 +static inline void move_cluster_forward(struct cluster_handle * clust,
25572 + struct inode *inode,
25573 + pgoff_t pgidx)
25574 +{
25575 + assert("edward-1297", clust != NULL);
25576 + assert("edward-1298", inode != NULL);
25577 +
25578 + reset_cluster_params(clust);
25579 + if (clust->index_valid &&
25580 + /* Hole in the indices. Hint became invalid and can not be
25581 + used by find_cluster_item() even if seal/node versions
25582 + will coincide */
25583 + pg_to_clust(pgidx, inode) != clust->index + 1) {
25584 + reiser4_unset_hint(clust->hint);
25585 + invalidate_hint_cluster(clust);
25586 + }
25587 + clust->index = pg_to_clust(pgidx, inode);
25588 + clust->index_valid = 1;
25589 +}
25590 +
25591 +static inline int alloc_clust_pages(struct cluster_handle * clust,
25592 + struct inode *inode)
25593 +{
25594 + assert("edward-791", clust != NULL);
25595 + assert("edward-792", inode != NULL);
25596 + clust->pages =
25597 + kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25598 + reiser4_ctx_gfp_mask_get());
25599 + if (!clust->pages)
25600 + return -ENOMEM;
25601 + return 0;
25602 +}
25603 +
25604 +static inline void free_clust_pages(struct cluster_handle * clust)
25605 +{
25606 + kfree(clust->pages);
25607 +}
25608 +
25609 +#endif /* __FS_REISER4_CLUSTER_H__ */
25610 +
25611 +/* Make Linus happy.
25612 + Local variables:
25613 + c-indentation-style: "K&R"
25614 + mode-name: "LC"
25615 + c-basic-offset: 8
25616 + tab-width: 8
25617 + fill-column: 120
25618 + scroll-step: 1
25619 + End:
25620 +*/
25621 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.23/fs/reiser4/plugin/compress/compress.c
25622 --- linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25623 +++ linux-2.6.23/fs/reiser4/plugin/compress/compress.c 2007-12-04 16:49:30.000000000 +0300
25624 @@ -0,0 +1,367 @@
25625 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25626 +/* reiser4 compression transform plugins */
25627 +
25628 +#include "../../debug.h"
25629 +#include "../../inode.h"
25630 +#include "../plugin.h"
25631 +
25632 +#include <linux/lzo.h>
25633 +#include <linux/zlib.h>
25634 +#include <linux/types.h>
25635 +#include <linux/hardirq.h>
25636 +
25637 +static int change_compression(struct inode *inode,
25638 + reiser4_plugin * plugin,
25639 + pset_member memb)
25640 +{
25641 + assert("edward-1316", inode != NULL);
25642 + assert("edward-1317", plugin != NULL);
25643 + assert("edward-1318", is_reiser4_inode(inode));
25644 + assert("edward-1319",
25645 + plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25646 +
25647 + /* cannot change compression plugin of already existing regular object */
25648 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25649 + return RETERR(-EINVAL);
25650 +
25651 + /* If matches, nothing to change. */
25652 + if (inode_hash_plugin(inode) != NULL &&
25653 + inode_hash_plugin(inode)->h.id == plugin->h.id)
25654 + return 0;
25655 +
25656 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25657 + PSET_COMPRESSION, plugin);
25658 +}
25659 +
25660 +static reiser4_plugin_ops compression_plugin_ops = {
25661 + .init = NULL,
25662 + .load = NULL,
25663 + .save_len = NULL,
25664 + .save = NULL,
25665 + .change = &change_compression
25666 +};
25667 +
25668 +/******************************************************************************/
25669 +/* gzip1 compression */
25670 +/******************************************************************************/
25671 +
25672 +#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25673 +#define GZIP1_DEF_WINBITS 15
25674 +#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25675 +
25676 +static int gzip1_init(void)
25677 +{
25678 + int ret = -EINVAL;
25679 +#if REISER4_ZLIB
25680 + ret = 0;
25681 +#endif
25682 + if (ret == -EINVAL)
25683 + warning("edward-1337", "Zlib not compiled into kernel");
25684 + return ret;
25685 +}
25686 +
25687 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25688 +{
25689 + return 0;
25690 +}
25691 +
25692 +static coa_t gzip1_alloc(tfm_action act)
25693 +{
25694 + coa_t coa = NULL;
25695 +#if REISER4_ZLIB
25696 + int ret = 0;
25697 + switch (act) {
25698 + case TFMA_WRITE: /* compress */
25699 + coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25700 + if (!coa) {
25701 + ret = -ENOMEM;
25702 + break;
25703 + }
25704 + break;
25705 + case TFMA_READ: /* decompress */
25706 + coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25707 + if (!coa) {
25708 + ret = -ENOMEM;
25709 + break;
25710 + }
25711 + break;
25712 + default:
25713 + impossible("edward-767",
25714 + "trying to alloc workspace for unknown tfm action");
25715 + }
25716 + if (ret) {
25717 + warning("edward-768",
25718 + "alloc workspace for gzip1 (tfm action = %d) failed\n",
25719 + act);
25720 + return ERR_PTR(ret);
25721 + }
25722 +#endif
25723 + return coa;
25724 +}
25725 +
25726 +static void gzip1_free(coa_t coa, tfm_action act)
25727 +{
25728 + assert("edward-769", coa != NULL);
25729 +
25730 + switch (act) {
25731 + case TFMA_WRITE: /* compress */
25732 + vfree(coa);
25733 + break;
25734 + case TFMA_READ: /* decompress */
25735 + vfree(coa);
25736 + break;
25737 + default:
25738 + impossible("edward-770", "unknown tfm action");
25739 + }
25740 + return;
25741 +}
25742 +
25743 +static int gzip1_min_size_deflate(void)
25744 +{
25745 + return 64;
25746 +}
25747 +
25748 +static void
25749 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25750 + __u8 * dst_first, unsigned *dst_len)
25751 +{
25752 +#if REISER4_ZLIB
25753 + int ret = 0;
25754 + struct z_stream_s stream;
25755 +
25756 + assert("edward-842", coa != NULL);
25757 + assert("edward-875", src_len != 0);
25758 +
25759 + stream.workspace = coa;
25760 + ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25761 + -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25762 + Z_DEFAULT_STRATEGY);
25763 + if (ret != Z_OK) {
25764 + warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25765 + goto rollback;
25766 + }
25767 + ret = zlib_deflateReset(&stream);
25768 + if (ret != Z_OK) {
25769 + warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25770 + goto rollback;
25771 + }
25772 + stream.next_in = src_first;
25773 + stream.avail_in = src_len;
25774 + stream.next_out = dst_first;
25775 + stream.avail_out = *dst_len;
25776 +
25777 + ret = zlib_deflate(&stream, Z_FINISH);
25778 + if (ret != Z_STREAM_END) {
25779 + if (ret != Z_OK)
25780 + warning("edward-773",
25781 + "zlib_deflate returned %d\n", ret);
25782 + goto rollback;
25783 + }
25784 + *dst_len = stream.total_out;
25785 + return;
25786 + rollback:
25787 + *dst_len = src_len;
25788 +#endif
25789 + return;
25790 +}
25791 +
25792 +static void
25793 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25794 + __u8 * dst_first, unsigned *dst_len)
25795 +{
25796 +#if REISER4_ZLIB
25797 + int ret = 0;
25798 + struct z_stream_s stream;
25799 +
25800 + assert("edward-843", coa != NULL);
25801 + assert("edward-876", src_len != 0);
25802 +
25803 + stream.workspace = coa;
25804 + ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25805 + if (ret != Z_OK) {
25806 + warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25807 + return;
25808 + }
25809 + ret = zlib_inflateReset(&stream);
25810 + if (ret != Z_OK) {
25811 + warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25812 + return;
25813 + }
25814 +
25815 + stream.next_in = src_first;
25816 + stream.avail_in = src_len;
25817 + stream.next_out = dst_first;
25818 + stream.avail_out = *dst_len;
25819 +
25820 + ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25821 + /*
25822 + * Work around a bug in zlib, which sometimes wants to taste an extra
25823 + * byte when being used in the (undocumented) raw deflate mode.
25824 + * (From USAGI).
25825 + */
25826 + if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25827 + u8 zerostuff = 0;
25828 + stream.next_in = &zerostuff;
25829 + stream.avail_in = 1;
25830 + ret = zlib_inflate(&stream, Z_FINISH);
25831 + }
25832 + if (ret != Z_STREAM_END) {
25833 + warning("edward-776", "zlib_inflate returned %d\n", ret);
25834 + return;
25835 + }
25836 + *dst_len = stream.total_out;
25837 +#endif
25838 + return;
25839 +}
25840 +
25841 +/******************************************************************************/
25842 +/* lzo1 compression */
25843 +/******************************************************************************/
25844 +
25845 +static int lzo1_init(void)
25846 +{
25847 + return 0;
25848 +}
25849 +
25850 +static int lzo1_overrun(unsigned in_len)
25851 +{
25852 + return in_len / 64 + 16 + 3;
25853 +}
25854 +
25855 +static coa_t lzo1_alloc(tfm_action act)
25856 +{
25857 + int ret = 0;
25858 + coa_t coa = NULL;
25859 +
25860 + switch (act) {
25861 + case TFMA_WRITE: /* compress */
25862 + coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
25863 + if (!coa) {
25864 + ret = -ENOMEM;
25865 + break;
25866 + }
25867 + case TFMA_READ: /* decompress */
25868 + break;
25869 + default:
25870 + impossible("edward-877",
25871 + "trying to alloc workspace for unknown tfm action");
25872 + }
25873 + if (ret) {
25874 + warning("edward-878",
25875 + "alloc workspace for lzo1 (tfm action = %d) failed\n",
25876 + act);
25877 + return ERR_PTR(ret);
25878 + }
25879 + return coa;
25880 +}
25881 +
25882 +static void lzo1_free(coa_t coa, tfm_action act)
25883 +{
25884 + assert("edward-879", coa != NULL);
25885 +
25886 + switch (act) {
25887 + case TFMA_WRITE: /* compress */
25888 + vfree(coa);
25889 + break;
25890 + case TFMA_READ: /* decompress */
25891 + impossible("edward-1304",
25892 + "trying to free non-allocated workspace");
25893 + default:
25894 + impossible("edward-880", "unknown tfm action");
25895 + }
25896 + return;
25897 +}
25898 +
25899 +static int lzo1_min_size_deflate(void)
25900 +{
25901 + return 256;
25902 +}
25903 +
25904 +static void
25905 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25906 + __u8 * dst_first, unsigned *dst_len)
25907 +{
25908 + int result;
25909 +
25910 + assert("edward-846", coa != NULL);
25911 + assert("edward-847", src_len != 0);
25912 +
25913 + result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
25914 + if (unlikely(result != LZO_E_OK)) {
25915 + warning("edward-849", "lzo1x_1_compress failed\n");
25916 + goto out;
25917 + }
25918 + if (*dst_len >= src_len) {
25919 + //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
25920 + goto out;
25921 + }
25922 + return;
25923 + out:
25924 + *dst_len = src_len;
25925 + return;
25926 +}
25927 +
25928 +static void
25929 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25930 + __u8 * dst_first, unsigned *dst_len)
25931 +{
25932 + int result;
25933 +
25934 + assert("edward-851", coa == NULL);
25935 + assert("edward-852", src_len != 0);
25936 +
25937 + result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len);
25938 + if (result != LZO_E_OK)
25939 + warning("edward-853", "lzo1x_1_decompress failed\n");
25940 + return;
25941 +}
25942 +
25943 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25944 + [LZO1_COMPRESSION_ID] = {
25945 + .h = {
25946 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25947 + .id = LZO1_COMPRESSION_ID,
25948 + .pops = &compression_plugin_ops,
25949 + .label = "lzo1",
25950 + .desc = "lzo1 compression transform",
25951 + .linkage = {NULL, NULL}
25952 + },
25953 + .init = lzo1_init,
25954 + .overrun = lzo1_overrun,
25955 + .alloc = lzo1_alloc,
25956 + .free = lzo1_free,
25957 + .min_size_deflate = lzo1_min_size_deflate,
25958 + .checksum = reiser4_adler32,
25959 + .compress = lzo1_compress,
25960 + .decompress = lzo1_decompress
25961 + },
25962 + [GZIP1_COMPRESSION_ID] = {
25963 + .h = {
25964 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25965 + .id = GZIP1_COMPRESSION_ID,
25966 + .pops = &compression_plugin_ops,
25967 + .label = "gzip1",
25968 + .desc = "gzip1 compression transform",
25969 + .linkage = {NULL, NULL}
25970 + },
25971 + .init = gzip1_init,
25972 + .overrun = gzip1_overrun,
25973 + .alloc = gzip1_alloc,
25974 + .free = gzip1_free,
25975 + .min_size_deflate = gzip1_min_size_deflate,
25976 + .checksum = reiser4_adler32,
25977 + .compress = gzip1_compress,
25978 + .decompress = gzip1_decompress
25979 + }
25980 +};
25981 +
25982 +/*
25983 + Local variables:
25984 + c-indentation-style: "K&R"
25985 + mode-name: "LC"
25986 + c-basic-offset: 8
25987 + tab-width: 8
25988 + fill-column: 120
25989 + scroll-step: 1
25990 + End:
25991 +*/
25992 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.23/fs/reiser4/plugin/compress/compress.h
25993 --- linux-2.6.23.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
25994 +++ linux-2.6.23/fs/reiser4/plugin/compress/compress.h 2007-12-04 16:49:30.000000000 +0300
25995 @@ -0,0 +1,43 @@
25996 +#if !defined( __FS_REISER4_COMPRESS_H__ )
25997 +#define __FS_REISER4_COMPRESS_H__
25998 +
25999 +#include <linux/types.h>
26000 +#include <linux/string.h>
26001 +
26002 +/* transform direction */
26003 +typedef enum {
26004 + TFMA_READ, /* decrypt, decompress */
26005 + TFMA_WRITE, /* encrypt, compress */
26006 + TFMA_LAST
26007 +} tfm_action;
26008 +
26009 +/* supported compression algorithms */
26010 +typedef enum {
26011 + LZO1_COMPRESSION_ID,
26012 + GZIP1_COMPRESSION_ID,
26013 + LAST_COMPRESSION_ID,
26014 +} reiser4_compression_id;
26015 +
26016 +/* the same as pgoff, but units are page clusters */
26017 +typedef unsigned long cloff_t;
26018 +
26019 +/* working data of a (de)compression algorithm */
26020 +typedef void *coa_t;
26021 +
26022 +/* table for all supported (de)compression algorithms */
26023 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26024 +
26025 +__u32 reiser4_adler32(char *data, __u32 len);
26026 +
26027 +#endif /* __FS_REISER4_COMPRESS_H__ */
26028 +
26029 +/* Make Linus happy.
26030 + Local variables:
26031 + c-indentation-style: "K&R"
26032 + mode-name: "LC"
26033 + c-basic-offset: 8
26034 + tab-width: 8
26035 + fill-column: 120
26036 + scroll-step: 1
26037 + End:
26038 +*/
26039 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.23/fs/reiser4/plugin/compress/compress_mode.c
26040 --- linux-2.6.23.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
26041 +++ linux-2.6.23/fs/reiser4/plugin/compress/compress_mode.c 2007-12-04 16:49:30.000000000 +0300
26042 @@ -0,0 +1,162 @@
26043 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26044 +/* This file contains Reiser4 compression mode plugins.
26045 +
26046 + Compression mode plugin is a set of handlers called by compressor
26047 + at flush time and represent some heuristics including the ones
26048 + which are to avoid compression of incompressible data, see
26049 + http://www.namesys.com/cryptcompress_design.html for more details.
26050 +*/
26051 +#include "../../inode.h"
26052 +#include "../plugin.h"
26053 +
26054 +static int should_deflate_none(struct inode * inode, cloff_t index)
26055 +{
26056 + return 0;
26057 +}
26058 +
26059 +static int should_deflate_common(struct inode * inode, cloff_t index)
26060 +{
26061 + return compression_is_on(cryptcompress_inode_data(inode));
26062 +}
26063 +
26064 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
26065 +{
26066 + turn_off_compression(cryptcompress_inode_data(inode));
26067 + return 0;
26068 +}
26069 +
26070 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
26071 +{
26072 + struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26073 +
26074 + assert("edward-1462",
26075 + get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26076 + get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26077 +
26078 + turn_off_compression(info);
26079 + if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26080 + set_lattice_factor(info, get_lattice_factor(info) << 1);
26081 + return 0;
26082 +}
26083 +
26084 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
26085 +{
26086 + turn_on_compression(cryptcompress_inode_data(inode));
26087 + set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26088 + return 0;
26089 +}
26090 +
26091 +/* Check on dynamic lattice, the adaptive compression modes which
26092 + defines the following behavior:
26093 +
26094 + Compression is on: try to compress everything and turn
26095 + it off, whenever cluster is incompressible.
26096 +
26097 + Compression is off: try to compress clusters of indexes
26098 + k * FACTOR (k = 0, 1, 2, ...) and turn it on, if some of
26099 + them is compressible. If incompressible, then increase FACTOR */
26100 +
26101 +/* check if @index belongs to one-dimensional lattice
26102 + of sparce factor @factor */
26103 +static int is_on_lattice(cloff_t index, int factor)
26104 +{
26105 + return (factor ? index % factor == 0: index == 0);
26106 +}
26107 +
26108 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
26109 +{
26110 + return should_deflate_common(inode, index) ||
26111 + is_on_lattice(index,
26112 + get_lattice_factor
26113 + (cryptcompress_inode_data(inode)));
26114 +}
26115 +
26116 +/* compression mode_plugins */
26117 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26118 + [NONE_COMPRESSION_MODE_ID] = {
26119 + .h = {
26120 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26121 + .id = NONE_COMPRESSION_MODE_ID,
26122 + .pops = NULL,
26123 + .label = "none",
26124 + .desc = "Compress nothing",
26125 + .linkage = {NULL, NULL}
26126 + },
26127 + .should_deflate = should_deflate_none,
26128 + .accept_hook = NULL,
26129 + .discard_hook = NULL
26130 + },
26131 + /* Check-on-dynamic-lattice adaptive compression mode */
26132 + [LATTD_COMPRESSION_MODE_ID] = {
26133 + .h = {
26134 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26135 + .id = LATTD_COMPRESSION_MODE_ID,
26136 + .pops = NULL,
26137 + .label = "lattd",
26138 + .desc = "Check on dynamic lattice",
26139 + .linkage = {NULL, NULL}
26140 + },
26141 + .should_deflate = should_deflate_lattd,
26142 + .accept_hook = accept_hook_lattd,
26143 + .discard_hook = discard_hook_lattd
26144 + },
26145 + /* Check-ultimately compression mode:
26146 + Turn off compression forever as soon as we meet
26147 + incompressible data */
26148 + [ULTIM_COMPRESSION_MODE_ID] = {
26149 + .h = {
26150 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26151 + .id = ULTIM_COMPRESSION_MODE_ID,
26152 + .pops = NULL,
26153 + .label = "ultim",
26154 + .desc = "Check ultimately",
26155 + .linkage = {NULL, NULL}
26156 + },
26157 + .should_deflate = should_deflate_common,
26158 + .accept_hook = NULL,
26159 + .discard_hook = discard_hook_ultim
26160 + },
26161 + /* Force-to-compress-everything compression mode */
26162 + [FORCE_COMPRESSION_MODE_ID] = {
26163 + .h = {
26164 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26165 + .id = FORCE_COMPRESSION_MODE_ID,
26166 + .pops = NULL,
26167 + .label = "force",
26168 + .desc = "Force to compress everything",
26169 + .linkage = {NULL, NULL}
26170 + },
26171 + .should_deflate = NULL,
26172 + .accept_hook = NULL,
26173 + .discard_hook = NULL
26174 + },
26175 + /* Convert-to-extent compression mode.
26176 + In this mode items will be converted to extents and management
26177 + will be passed to (classic) unix file plugin as soon as ->write()
26178 + detects that the first complete logical cluster (of index #0) is
26179 + incompressible. */
26180 + [CONVX_COMPRESSION_MODE_ID] = {
26181 + .h = {
26182 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26183 + .id = CONVX_COMPRESSION_MODE_ID,
26184 + .pops = NULL,
26185 + .label = "conv",
26186 + .desc = "Convert to extent",
26187 + .linkage = {NULL, NULL}
26188 + },
26189 + .should_deflate = should_deflate_common,
26190 + .accept_hook = NULL,
26191 + .discard_hook = NULL
26192 + }
26193 +};
26194 +
26195 +/*
26196 + Local variables:
26197 + c-indentation-style: "K&R"
26198 + mode-name: "LC"
26199 + c-basic-offset: 8
26200 + tab-width: 8
26201 + fill-column: 120
26202 + scroll-step: 1
26203 + End:
26204 +*/
26205 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.23/fs/reiser4/plugin/compress/Makefile
26206 --- linux-2.6.23.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26207 +++ linux-2.6.23/fs/reiser4/plugin/compress/Makefile 2007-12-04 16:49:30.000000000 +0300
26208 @@ -0,0 +1,5 @@
26209 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26210 +
26211 +compress_plugins-objs := \
26212 + compress.o \
26213 + compress_mode.o
26214 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.23/fs/reiser4/plugin/crypto/cipher.c
26215 --- linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
26216 +++ linux-2.6.23/fs/reiser4/plugin/crypto/cipher.c 2007-12-04 16:49:30.000000000 +0300
26217 @@ -0,0 +1,37 @@
26218 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
26219 + licensing governed by reiser4/README */
26220 +/* Reiser4 cipher transform plugins */
26221 +
26222 +#include "../../debug.h"
26223 +#include "../plugin.h"
26224 +
26225 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
26226 + [NONE_CIPHER_ID] = {
26227 + .h = {
26228 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
26229 + .id = NONE_CIPHER_ID,
26230 + .pops = NULL,
26231 + .label = "none",
26232 + .desc = "no cipher transform",
26233 + .linkage = {NULL, NULL}
26234 + },
26235 + .alloc = NULL,
26236 + .free = NULL,
26237 + .scale = NULL,
26238 + .align_stream = NULL,
26239 + .setkey = NULL,
26240 + .encrypt = NULL,
26241 + .decrypt = NULL
26242 + }
26243 +};
26244 +
26245 +/* Make Linus happy.
26246 + Local variables:
26247 + c-indentation-style: "K&R"
26248 + mode-name: "LC"
26249 + c-basic-offset: 8
26250 + tab-width: 8
26251 + fill-column: 120
26252 + scroll-step: 1
26253 + End:
26254 +*/
26255 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.23/fs/reiser4/plugin/crypto/cipher.h
26256 --- linux-2.6.23.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
26257 +++ linux-2.6.23/fs/reiser4/plugin/crypto/cipher.h 2007-12-04 16:49:30.000000000 +0300
26258 @@ -0,0 +1,55 @@
26259 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26260 +/* This file contains definitions for the objects operated
26261 + by reiser4 key manager, which is something like keyring
26262 + wrapped by appropriate reiser4 plugin */
26263 +
26264 +#if !defined( __FS_REISER4_CRYPT_H__ )
26265 +#define __FS_REISER4_CRYPT_H__
26266 +
26267 +#include <linux/crypto.h>
26268 +
26269 +/* key info imported from user space */
26270 +struct reiser4_crypto_data {
26271 + int keysize; /* uninstantiated key size */
26272 + __u8 * key; /* uninstantiated key */
26273 + int keyid_size; /* size of passphrase */
26274 + __u8 * keyid; /* passphrase */
26275 +};
26276 +
26277 +/* This object contains all needed infrastructure to implement
26278 + cipher transform. This is operated (allocating, inheriting,
26279 + validating, binding to host inode, etc..) by reiser4 key manager.
26280 +
26281 + This info can be allocated in two cases:
26282 + 1. importing a key from user space.
26283 + 2. reading inode from disk */
26284 +struct reiser4_crypto_info {
26285 + struct inode * host;
26286 + struct crypto_hash * digest;
26287 + struct crypto_blkcipher * cipher;
26288 +#if 0
26289 + cipher_key_plugin * kplug; /* key manager */
26290 +#endif
26291 + __u8 * keyid; /* key fingerprint, created by digest plugin,
26292 + using uninstantiated key and passphrase.
26293 + supposed to be stored in disk stat-data */
26294 + int inst; /* this indicates if the cipher key is
26295 + instantiated (case 1 above) */
26296 + int keysize; /* uninstantiated key size (bytes), supposed
26297 + to be stored in disk stat-data */
26298 + int keyload_count; /* number of the objects which has this
26299 + crypto-stat attached */
26300 +};
26301 +
26302 +#endif /* __FS_REISER4_CRYPT_H__ */
26303 +
26304 +/*
26305 + Local variables:
26306 + c-indentation-style: "K&R"
26307 + mode-name: "LC"
26308 + c-basic-offset: 8
26309 + tab-width: 8
26310 + fill-column: 120
26311 + scroll-step: 1
26312 + End:
26313 +*/
26314 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.23/fs/reiser4/plugin/crypto/digest.c
26315 --- linux-2.6.23.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
26316 +++ linux-2.6.23/fs/reiser4/plugin/crypto/digest.c 2007-12-04 16:49:30.000000000 +0300
26317 @@ -0,0 +1,58 @@
26318 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26319 +
26320 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
26321 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
26322 +#include "../../debug.h"
26323 +#include "../plugin_header.h"
26324 +#include "../plugin.h"
26325 +#include "../file/cryptcompress.h"
26326 +
26327 +#include <linux/types.h>
26328 +
26329 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
26330 +
26331 +static struct crypto_hash * alloc_sha256 (void)
26332 +{
26333 +#if REISER4_SHA256
26334 + return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
26335 +#else
26336 + warning("edward-1418", "sha256 unsupported");
26337 + return ERR_PTR(-EINVAL);
26338 +#endif
26339 +}
26340 +
26341 +static void free_sha256 (struct crypto_hash * tfm)
26342 +{
26343 +#if REISER4_SHA256
26344 + crypto_free_hash(tfm);
26345 +#endif
26346 + return;
26347 +}
26348 +
26349 +/* digest plugins */
26350 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
26351 + [SHA256_32_DIGEST_ID] = {
26352 + .h = {
26353 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
26354 + .id = SHA256_32_DIGEST_ID,
26355 + .pops = NULL,
26356 + .label = "sha256_32",
26357 + .desc = "sha256_32 digest transform",
26358 + .linkage = {NULL, NULL}
26359 + },
26360 + .fipsize = sizeof(__u32),
26361 + .alloc = alloc_sha256,
26362 + .free = free_sha256
26363 + }
26364 +};
26365 +
26366 +/*
26367 + Local variables:
26368 + c-indentation-style: "K&R"
26369 + mode-name: "LC"
26370 + c-basic-offset: 8
26371 + tab-width: 8
26372 + fill-column: 120
26373 + scroll-step: 1
26374 + End:
26375 +*/
26376 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.23/fs/reiser4/plugin/dir/dir.h
26377 --- linux-2.6.23.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
26378 +++ linux-2.6.23/fs/reiser4/plugin/dir/dir.h 2007-12-04 16:49:30.000000000 +0300
26379 @@ -0,0 +1,36 @@
26380 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26381 + * reiser4/README */
26382 +
26383 +/* this file contains declarations of methods implementing directory plugins */
26384 +
26385 +#if !defined( __REISER4_DIR_H__ )
26386 +#define __REISER4_DIR_H__
26387 +
26388 +/*#include "../../key.h"
26389 +
26390 +#include <linux/fs.h>*/
26391 +
26392 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
26393 +
26394 +/* "hashed" directory methods of dir plugin */
26395 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
26396 + reiser4_key *);
26397 +
26398 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
26399 +
26400 +/* "seekable" directory methods of dir plugin */
26401 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
26402 + reiser4_key *);
26403 +
26404 +/* __REISER4_DIR_H__ */
26405 +#endif
26406 +
26407 +/*
26408 + Local variables:
26409 + c-indentation-style: "K&R"
26410 + mode-name: "LC"
26411 + c-basic-offset: 8
26412 + tab-width: 8
26413 + fill-column: 120
26414 + End:
26415 +*/
26416 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.23/fs/reiser4/plugin/dir/hashed_dir.c
26417 --- linux-2.6.23.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
26418 +++ linux-2.6.23/fs/reiser4/plugin/dir/hashed_dir.c 2007-12-04 16:49:30.000000000 +0300
26419 @@ -0,0 +1,81 @@
26420 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26421 + * reiser4/README */
26422 +
26423 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
26424 + names to the files. */
26425 +
26426 +/*
26427 + * Hashed directory logically consists of persistent directory
26428 + * entries. Directory entry is a pair of a file name and a key of stat-data of
26429 + * a file that has this name in the given directory.
26430 + *
26431 + * Directory entries are stored in the tree in the form of directory
26432 + * items. Directory item should implement dir_entry_ops portion of item plugin
26433 + * interface (see plugin/item/item.h). Hashed directory interacts with
26434 + * directory item plugin exclusively through dir_entry_ops operations.
26435 + *
26436 + * Currently there are two implementations of directory items: "simple
26437 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
26438 + * (plugin/item/cde.[ch]) with the latter being the default.
26439 + *
26440 + * There is, however some delicate way through which directory code interferes
26441 + * with item plugin: key assignment policy. A key for a directory item is
26442 + * chosen by directory code, and as described in kassign.c, this key contains
26443 + * a portion of file name. Directory item uses this knowledge to avoid storing
26444 + * this portion of file name twice: in the key and in the directory item body.
26445 + *
26446 + */
26447 +
26448 +#include "../../inode.h"
26449 +
26450 +void complete_entry_key(const struct inode *, const char *name,
26451 + int len, reiser4_key * result);
26452 +
26453 +/* this is implementation of build_entry_key method of dir
26454 + plugin for HASHED_DIR_PLUGIN_ID
26455 + */
26456 +void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
26457 + * (or will be) in.*/
26458 + const struct qstr *qname, /* name of file referenced
26459 + * by this entry */
26460 + reiser4_key * result /* resulting key of directory
26461 + * entry */ )
26462 +{
26463 + const char *name;
26464 + int len;
26465 +
26466 + assert("nikita-1139", dir != NULL);
26467 + assert("nikita-1140", qname != NULL);
26468 + assert("nikita-1141", qname->name != NULL);
26469 + assert("nikita-1142", result != NULL);
26470 +
26471 + name = qname->name;
26472 + len = qname->len;
26473 +
26474 + assert("nikita-2867", strlen(name) == len);
26475 +
26476 + reiser4_key_init(result);
26477 + /* locality of directory entry's key is objectid of parent
26478 + directory */
26479 + set_key_locality(result, get_inode_oid(dir));
26480 + /* minor packing locality is constant */
26481 + set_key_type(result, KEY_FILE_NAME_MINOR);
26482 + /* dot is special case---we always want it to be first entry in
26483 + a directory. Actually, we just want to have smallest
26484 + directory entry.
26485 + */
26486 + if (len == 1 && name[0] == '.')
26487 + return;
26488 +
26489 + /* initialize part of entry key which depends on file name */
26490 + complete_entry_key(dir, name, len, result);
26491 +}
26492 +
26493 +/* Local variables:
26494 + c-indentation-style: "K&R"
26495 + mode-name: "LC"
26496 + c-basic-offset: 8
26497 + tab-width: 8
26498 + fill-column: 120
26499 + End:
26500 +*/
26501 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.23/fs/reiser4/plugin/dir/Makefile
26502 --- linux-2.6.23.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
26503 +++ linux-2.6.23/fs/reiser4/plugin/dir/Makefile 2007-12-04 16:49:30.000000000 +0300
26504 @@ -0,0 +1,5 @@
26505 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
26506 +
26507 +dir_plugins-objs := \
26508 + hashed_dir.o \
26509 + seekable_dir.o
26510 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.23/fs/reiser4/plugin/dir/seekable_dir.c
26511 --- linux-2.6.23.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
26512 +++ linux-2.6.23/fs/reiser4/plugin/dir/seekable_dir.c 2007-12-04 16:49:30.000000000 +0300
26513 @@ -0,0 +1,46 @@
26514 +/* Copyright 2005 by Hans Reiser, licensing governed by
26515 + * reiser4/README */
26516 +
26517 +#include "../../inode.h"
26518 +
26519 +/* this is implementation of build_entry_key method of dir
26520 + plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
26521 + This is for directories where we want repeatable and restartable readdir()
26522 + even in case 32bit user level struct dirent (readdir(3)).
26523 +*/
26524 +void
26525 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
26526 + reiser4_key * result)
26527 +{
26528 + oid_t objectid;
26529 +
26530 + assert("nikita-2283", dir != NULL);
26531 + assert("nikita-2284", name != NULL);
26532 + assert("nikita-2285", name->name != NULL);
26533 + assert("nikita-2286", result != NULL);
26534 +
26535 + reiser4_key_init(result);
26536 + /* locality of directory entry's key is objectid of parent
26537 + directory */
26538 + set_key_locality(result, get_inode_oid(dir));
26539 + /* minor packing locality is constant */
26540 + set_key_type(result, KEY_FILE_NAME_MINOR);
26541 + /* dot is special case---we always want it to be first entry in
26542 + a directory. Actually, we just want to have smallest
26543 + directory entry.
26544 + */
26545 + if ((name->len == 1) && (name->name[0] == '.'))
26546 + return;
26547 +
26548 + /* objectid of key is 31 lowest bits of hash. */
26549 + objectid =
26550 + inode_hash_plugin(dir)->hash(name->name,
26551 + (int)name->len) & 0x7fffffff;
26552 +
26553 + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26554 + set_key_objectid(result, objectid);
26555 +
26556 + /* offset is always 0. */
26557 + set_key_offset(result, (__u64) 0);
26558 + return;
26559 +}
26560 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.23/fs/reiser4/plugin/dir_plugin_common.c
26561 --- linux-2.6.23.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
26562 +++ linux-2.6.23/fs/reiser4/plugin/dir_plugin_common.c 2007-12-04 16:49:30.000000000 +0300
26563 @@ -0,0 +1,872 @@
26564 +/* Copyright 2005 by Hans Reiser, licensing governed by
26565 + reiser4/README */
26566 +
26567 +/* this file contains typical implementations for most of methods of
26568 + directory plugin
26569 +*/
26570 +
26571 +#include "../inode.h"
26572 +
26573 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
26574 + lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
26575 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
26576 +void check_light_weight(struct inode *inode, struct inode *parent);
26577 +
26578 +/* this is common implementation of get_parent method of dir plugin
26579 + this is used by NFS kernel server to "climb" up directory tree to
26580 + check permissions
26581 + */
26582 +struct dentry *get_parent_common(struct inode *child)
26583 +{
26584 + struct super_block *s;
26585 + struct inode *parent;
26586 + struct dentry dotdot;
26587 + struct dentry *dentry;
26588 + reiser4_key key;
26589 + int result;
26590 +
26591 + /*
26592 + * lookup dotdot entry.
26593 + */
26594 +
26595 + s = child->i_sb;
26596 + memset(&dotdot, 0, sizeof(dotdot));
26597 + dotdot.d_name.name = "..";
26598 + dotdot.d_name.len = 2;
26599 + dotdot.d_op = &get_super_private(s)->ops.dentry;
26600 +
26601 + result = reiser4_lookup_name(child, &dotdot, &key);
26602 + if (result != 0)
26603 + return ERR_PTR(result);
26604 +
26605 + parent = reiser4_iget(s, &key, 1);
26606 + if (!IS_ERR(parent)) {
26607 + /*
26608 + * FIXME-NIKITA dubious: attributes are inherited from @child
26609 + * to @parent. But:
26610 + *
26611 + * (*) this is the only this we can do
26612 + *
26613 + * (*) attributes of light-weight object are inherited
26614 + * from a parent through which object was looked up first,
26615 + * so it is ambiguous anyway.
26616 + *
26617 + */
26618 + check_light_weight(parent, child);
26619 + reiser4_iget_complete(parent);
26620 + dentry = d_alloc_anon(parent);
26621 + if (dentry == NULL) {
26622 + iput(parent);
26623 + dentry = ERR_PTR(RETERR(-ENOMEM));
26624 + } else
26625 + dentry->d_op = &get_super_private(s)->ops.dentry;
26626 + } else if (PTR_ERR(parent) == -ENOENT)
26627 + dentry = ERR_PTR(RETERR(-ESTALE));
26628 + else
26629 + dentry = (void *)parent;
26630 + return dentry;
26631 +}
26632 +
26633 +/* this is common implementation of is_name_acceptable method of dir
26634 + plugin
26635 + */
26636 +int is_name_acceptable_common(const struct inode *inode, /* directory to check */
26637 + const char *name UNUSED_ARG, /* name to check */
26638 + int len /* @name's length */ )
26639 +{
26640 + assert("nikita-733", inode != NULL);
26641 + assert("nikita-734", name != NULL);
26642 + assert("nikita-735", len > 0);
26643 +
26644 + return len <= reiser4_max_filename_len(inode);
26645 +}
26646 +
26647 +/* there is no common implementation of build_entry_key method of dir
26648 + plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
26649 + plugin/dir/seekable.c:build_entry_key_seekable() for example
26650 +*/
26651 +
26652 +/* this is common implementation of build_readdir_key method of dir
26653 + plugin
26654 + see reiser4_readdir_common for more details
26655 +*/
26656 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
26657 + reiser4_key * result /* where to store key */ )
26658 +{
26659 + reiser4_file_fsdata *fdata;
26660 + struct inode *inode;
26661 +
26662 + assert("nikita-1361", dir != NULL);
26663 + assert("nikita-1362", result != NULL);
26664 + assert("nikita-1363", dir->f_dentry != NULL);
26665 + inode = dir->f_dentry->d_inode;
26666 + assert("nikita-1373", inode != NULL);
26667 +
26668 + fdata = reiser4_get_file_fsdata(dir);
26669 + if (IS_ERR(fdata))
26670 + return PTR_ERR(fdata);
26671 + assert("nikita-1364", fdata != NULL);
26672 + return extract_key_from_de_id(get_inode_oid(inode),
26673 + &fdata->dir.readdir.position.
26674 + dir_entry_key, result);
26675 +
26676 +}
26677 +
26678 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
26679 + int adj);
26680 +
26681 +/* this is common implementation of add_entry method of dir plugin
26682 +*/
26683 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
26684 + * in */
26685 + struct dentry *where, /* new name */
26686 + reiser4_object_create_data * data, /* parameters of
26687 + * new object */
26688 + reiser4_dir_entry_desc * entry /* parameters of
26689 + * new directory
26690 + * entry */)
26691 +{
26692 + int result;
26693 + coord_t *coord;
26694 + lock_handle lh;
26695 + struct reiser4_dentry_fsdata *fsdata;
26696 + reiser4_block_nr reserve;
26697 +
26698 + assert("nikita-1114", object != NULL);
26699 + assert("nikita-1250", where != NULL);
26700 +
26701 + fsdata = reiser4_get_dentry_fsdata(where);
26702 + if (unlikely(IS_ERR(fsdata)))
26703 + return PTR_ERR(fsdata);
26704 +
26705 + reserve = inode_dir_plugin(object)->estimate.add_entry(object);
26706 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26707 + return RETERR(-ENOSPC);
26708 +
26709 + init_lh(&lh);
26710 + coord = &fsdata->dec.entry_coord;
26711 + coord_clear_iplug(coord);
26712 +
26713 + /* check for this entry in a directory. This is plugin method. */
26714 + result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
26715 + entry);
26716 + if (likely(result == -ENOENT)) {
26717 + /* add new entry. Just pass control to the directory
26718 + item plugin. */
26719 + assert("nikita-1709", inode_dir_item_plugin(object));
26720 + assert("nikita-2230", coord->node == lh.node);
26721 + reiser4_seal_done(&fsdata->dec.entry_seal);
26722 + result =
26723 + inode_dir_item_plugin(object)->s.dir.add_entry(object,
26724 + coord, &lh,
26725 + where,
26726 + entry);
26727 + if (result == 0) {
26728 + reiser4_adjust_dir_file(object, where,
26729 + fsdata->dec.pos + 1, +1);
26730 + INODE_INC_FIELD(object, i_size);
26731 + }
26732 + } else if (result == 0) {
26733 + assert("nikita-2232", coord->node == lh.node);
26734 + result = RETERR(-EEXIST);
26735 + }
26736 + done_lh(&lh);
26737 +
26738 + return result;
26739 +}
26740 +
26741 +/**
26742 + * rem_entry - remove entry from directory item
26743 + * @dir:
26744 + * @dentry:
26745 + * @entry:
26746 + * @coord:
26747 + * @lh:
26748 + *
26749 + * Checks that coordinate @coord is set properly and calls item plugin
26750 + * method to cut entry.
26751 + */
26752 +static int
26753 +rem_entry(struct inode *dir, struct dentry *dentry,
26754 + reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
26755 +{
26756 + item_plugin *iplug;
26757 + struct inode *child;
26758 +
26759 + iplug = inode_dir_item_plugin(dir);
26760 + child = dentry->d_inode;
26761 + assert("nikita-3399", child != NULL);
26762 +
26763 + /* check that we are really destroying an entry for @child */
26764 + if (REISER4_DEBUG) {
26765 + int result;
26766 + reiser4_key key;
26767 +
26768 + result = iplug->s.dir.extract_key(coord, &key);
26769 + if (result != 0)
26770 + return result;
26771 + if (get_key_objectid(&key) != get_inode_oid(child)) {
26772 + warning("nikita-3397",
26773 + "rem_entry: %#llx != %#llx\n",
26774 + get_key_objectid(&key),
26775 + (unsigned long long)get_inode_oid(child));
26776 + return RETERR(-EIO);
26777 + }
26778 + }
26779 + return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
26780 +}
26781 +
26782 +/**
26783 + * reiser4_rem_entry_common - remove entry from a directory
26784 + * @dir: directory to remove entry from
26785 + * @where: name that is being removed
26786 + * @entry: description of entry being removed
26787 + *
26788 + * This is common implementation of rem_entry method of dir plugin.
26789 + */
26790 +int reiser4_rem_entry_common(struct inode *dir,
26791 + struct dentry *dentry,
26792 + reiser4_dir_entry_desc *entry)
26793 +{
26794 + int result;
26795 + coord_t *coord;
26796 + lock_handle lh;
26797 + struct reiser4_dentry_fsdata *fsdata;
26798 + __u64 tograb;
26799 +
26800 + assert("nikita-1124", dir != NULL);
26801 + assert("nikita-1125", dentry != NULL);
26802 +
26803 + tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
26804 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
26805 + if (result != 0)
26806 + return RETERR(-ENOSPC);
26807 +
26808 + init_lh(&lh);
26809 +
26810 + /* check for this entry in a directory. This is plugin method. */
26811 + result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
26812 + fsdata = reiser4_get_dentry_fsdata(dentry);
26813 + if (IS_ERR(fsdata)) {
26814 + done_lh(&lh);
26815 + return PTR_ERR(fsdata);
26816 + }
26817 +
26818 + coord = &fsdata->dec.entry_coord;
26819 +
26820 + assert("nikita-3404",
26821 + get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
26822 + dir->i_size <= 1);
26823 +
26824 + coord_clear_iplug(coord);
26825 + if (result == 0) {
26826 + /* remove entry. Just pass control to the directory item
26827 + plugin. */
26828 + assert("vs-542", inode_dir_item_plugin(dir));
26829 + reiser4_seal_done(&fsdata->dec.entry_seal);
26830 + reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
26831 + result =
26832 + WITH_COORD(coord,
26833 + rem_entry(dir, dentry, entry, coord, &lh));
26834 + if (result == 0) {
26835 + if (dir->i_size >= 1)
26836 + INODE_DEC_FIELD(dir, i_size);
26837 + else {
26838 + warning("nikita-2509", "Dir %llu is runt",
26839 + (unsigned long long)
26840 + get_inode_oid(dir));
26841 + result = RETERR(-EIO);
26842 + }
26843 +
26844 + assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
26845 + dentry->d_inode->i_size != 2 ||
26846 + inode_dir_plugin(dentry->d_inode) == NULL);
26847 + }
26848 + }
26849 + done_lh(&lh);
26850 +
26851 + return result;
26852 +}
26853 +
26854 +static reiser4_block_nr estimate_init(struct inode *parent,
26855 + struct inode *object);
26856 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
26857 +
26858 +/* this is common implementation of init method of dir plugin
26859 + create "." and ".." entries
26860 +*/
26861 +int reiser4_dir_init_common(struct inode *object, /* new directory */
26862 + struct inode *parent, /* parent directory */
26863 + reiser4_object_create_data * data /* info passed
26864 + * to us, this
26865 + * is filled by
26866 + * reiser4()
26867 + * syscall in
26868 + * particular */)
26869 +{
26870 + reiser4_block_nr reserve;
26871 +
26872 + assert("nikita-680", object != NULL);
26873 + assert("nikita-681", S_ISDIR(object->i_mode));
26874 + assert("nikita-682", parent != NULL);
26875 + assert("nikita-684", data != NULL);
26876 + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
26877 + assert("nikita-687", object->i_mode & S_IFDIR);
26878 +
26879 + reserve = estimate_init(parent, object);
26880 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26881 + return RETERR(-ENOSPC);
26882 +
26883 + return create_dot_dotdot(object, parent);
26884 +}
26885 +
26886 +/* this is common implementation of done method of dir plugin
26887 + remove "." entry
26888 +*/
26889 +int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
26890 +{
26891 + int result;
26892 + reiser4_block_nr reserve;
26893 + struct dentry goodby_dots;
26894 + reiser4_dir_entry_desc entry;
26895 +
26896 + assert("nikita-1449", object != NULL);
26897 +
26898 + if (reiser4_inode_get_flag(object, REISER4_NO_SD))
26899 + return 0;
26900 +
26901 + /* of course, this can be rewritten to sweep everything in one
26902 + reiser4_cut_tree(). */
26903 + memset(&entry, 0, sizeof entry);
26904 +
26905 + /* FIXME: this done method is called from reiser4_delete_dir_common which
26906 + * reserved space already */
26907 + reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
26908 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
26909 + return RETERR(-ENOSPC);
26910 +
26911 + memset(&goodby_dots, 0, sizeof goodby_dots);
26912 + entry.obj = goodby_dots.d_inode = object;
26913 + goodby_dots.d_name.name = ".";
26914 + goodby_dots.d_name.len = 1;
26915 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26916 + reiser4_free_dentry_fsdata(&goodby_dots);
26917 + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
26918 + /* only worth a warning
26919 +
26920 + "values of \ eB\ f will give rise to dom!\n"
26921 + -- v6src/s2/mv.c:89
26922 + */
26923 + warning("nikita-2252", "Cannot remove dot of %lli: %i",
26924 + (unsigned long long)get_inode_oid(object), result);
26925 + return 0;
26926 +}
26927 +
26928 +/* this is common implementation of attach method of dir plugin
26929 +*/
26930 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
26931 + struct inode *parent UNUSED_ARG)
26932 +{
26933 + assert("nikita-2647", child != NULL);
26934 + assert("nikita-2648", parent != NULL);
26935 +
26936 + return 0;
26937 +}
26938 +
26939 +/* this is common implementation of detach method of dir plugin
26940 + remove "..", decrease nlink on parent
26941 +*/
26942 +int reiser4_detach_common(struct inode *object, struct inode *parent)
26943 +{
26944 + int result;
26945 + struct dentry goodby_dots;
26946 + reiser4_dir_entry_desc entry;
26947 +
26948 + assert("nikita-2885", object != NULL);
26949 + assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
26950 +
26951 + memset(&entry, 0, sizeof entry);
26952 +
26953 + /* NOTE-NIKITA this only works if @parent is -the- parent of
26954 + @object, viz. object whose key is stored in dotdot
26955 + entry. Wouldn't work with hard-links on directories. */
26956 + memset(&goodby_dots, 0, sizeof goodby_dots);
26957 + entry.obj = goodby_dots.d_inode = parent;
26958 + goodby_dots.d_name.name = "..";
26959 + goodby_dots.d_name.len = 2;
26960 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26961 + reiser4_free_dentry_fsdata(&goodby_dots);
26962 + if (result == 0) {
26963 + /* the dot should be the only entry remaining at this time... */
26964 + assert("nikita-3400",
26965 + object->i_size == 1 && object->i_nlink <= 2);
26966 +#if 0
26967 + /* and, together with the only name directory can have, they
26968 + * provides for the last 2 remaining references. If we get
26969 + * here as part of error handling during mkdir, @object
26970 + * possibly has no name yet, so its nlink == 1. If we get here
26971 + * from rename (targeting empty directory), it has no name
26972 + * already, so its nlink == 1. */
26973 + assert("nikita-3401",
26974 + object->i_nlink == 2 || object->i_nlink == 1);
26975 +#endif
26976 +
26977 + /* decrement nlink of directory removed ".." pointed
26978 + to */
26979 + reiser4_del_nlink(parent, NULL, 0);
26980 + }
26981 + return result;
26982 +}
26983 +
26984 +/* this is common implementation of estimate.add_entry method of
26985 + dir plugin
26986 + estimation of adding entry which supposes that entry is inserting a
26987 + unit into item
26988 +*/
26989 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
26990 +{
26991 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
26992 +}
26993 +
26994 +/* this is common implementation of estimate.rem_entry method of dir
26995 + plugin
26996 +*/
26997 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
26998 +{
26999 + return estimate_one_item_removal(reiser4_tree_by_inode(inode));
27000 +}
27001 +
27002 +/* this is common implementation of estimate.unlink method of dir
27003 + plugin
27004 +*/
27005 +reiser4_block_nr
27006 +dir_estimate_unlink_common(const struct inode * parent,
27007 + const struct inode * object)
27008 +{
27009 + reiser4_block_nr res;
27010 +
27011 + /* hashed_rem_entry(object) */
27012 + res = inode_dir_plugin(object)->estimate.rem_entry(object);
27013 + /* del_nlink(parent) */
27014 + res += 2 * inode_file_plugin(parent)->estimate.update(parent);
27015 +
27016 + return res;
27017 +}
27018 +
27019 +/*
27020 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
27021 + * methods: if @inode is a light-weight file, setup its credentials
27022 + * that are not stored in the stat-data in this case
27023 + */
27024 +void check_light_weight(struct inode *inode, struct inode *parent)
27025 +{
27026 + if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
27027 + inode->i_uid = parent->i_uid;
27028 + inode->i_gid = parent->i_gid;
27029 + /* clear light-weight flag. If inode would be read by any
27030 + other name, [ug]id wouldn't change. */
27031 + reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
27032 + }
27033 +}
27034 +
27035 +/* looks for name specified in @dentry in directory @parent and if name is
27036 + found - key of object found entry points to is stored in @entry->key */
27037 +int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
27038 + * name in */
27039 + struct dentry *dentry, /* name to look for */
27040 + reiser4_key * key /* place to store key */ )
27041 +{
27042 + int result;
27043 + coord_t *coord;
27044 + lock_handle lh;
27045 + const char *name;
27046 + int len;
27047 + reiser4_dir_entry_desc entry;
27048 + struct reiser4_dentry_fsdata *fsdata;
27049 +
27050 + assert("nikita-1247", parent != NULL);
27051 + assert("nikita-1248", dentry != NULL);
27052 + assert("nikita-1123", dentry->d_name.name != NULL);
27053 + assert("vs-1486",
27054 + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
27055 +
27056 + name = dentry->d_name.name;
27057 + len = dentry->d_name.len;
27058 +
27059 + if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
27060 + /* some arbitrary error code to return */
27061 + return RETERR(-ENAMETOOLONG);
27062 +
27063 + fsdata = reiser4_get_dentry_fsdata(dentry);
27064 + if (IS_ERR(fsdata))
27065 + return PTR_ERR(fsdata);
27066 +
27067 + coord = &fsdata->dec.entry_coord;
27068 + coord_clear_iplug(coord);
27069 + init_lh(&lh);
27070 +
27071 + /* find entry in a directory. This is plugin method. */
27072 + result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
27073 + &entry);
27074 + if (result == 0) {
27075 + /* entry was found, extract object key from it. */
27076 + result =
27077 + WITH_COORD(coord,
27078 + item_plugin_by_coord(coord)->s.dir.
27079 + extract_key(coord, key));
27080 + }
27081 + done_lh(&lh);
27082 + return result;
27083 +
27084 +}
27085 +
27086 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
27087 +static reiser4_block_nr
27088 +estimate_init(struct inode *parent, struct inode *object)
27089 +{
27090 + reiser4_block_nr res = 0;
27091 +
27092 + assert("vpf-321", parent != NULL);
27093 + assert("vpf-322", object != NULL);
27094 +
27095 + /* hashed_add_entry(object) */
27096 + res += inode_dir_plugin(object)->estimate.add_entry(object);
27097 + /* reiser4_add_nlink(object) */
27098 + res += inode_file_plugin(object)->estimate.update(object);
27099 + /* hashed_add_entry(object) */
27100 + res += inode_dir_plugin(object)->estimate.add_entry(object);
27101 + /* reiser4_add_nlink(parent) */
27102 + res += inode_file_plugin(parent)->estimate.update(parent);
27103 +
27104 + return 0;
27105 +}
27106 +
27107 +/* helper function for reiser4_dir_init_common(). Create "." and ".." */
27108 +static int create_dot_dotdot(struct inode *object /* object to create dot and
27109 + * dotdot for */ ,
27110 + struct inode *parent /* parent of @object */)
27111 +{
27112 + int result;
27113 + struct dentry dots_entry;
27114 + reiser4_dir_entry_desc entry;
27115 +
27116 + assert("nikita-688", object != NULL);
27117 + assert("nikita-689", S_ISDIR(object->i_mode));
27118 + assert("nikita-691", parent != NULL);
27119 +
27120 + /* We store dot and dotdot as normal directory entries. This is
27121 + not necessary, because almost all information stored in them
27122 + is already in the stat-data of directory, the only thing
27123 + being missed is objectid of grand-parent directory that can
27124 + easily be added there as extension.
27125 +
27126 + But it is done the way it is done, because not storing dot
27127 + and dotdot will lead to the following complications:
27128 +
27129 + . special case handling in ->lookup().
27130 + . addition of another extension to the sd.
27131 + . dependency on key allocation policy for stat data.
27132 +
27133 + */
27134 +
27135 + memset(&entry, 0, sizeof entry);
27136 + memset(&dots_entry, 0, sizeof dots_entry);
27137 + entry.obj = dots_entry.d_inode = object;
27138 + dots_entry.d_name.name = ".";
27139 + dots_entry.d_name.len = 1;
27140 + result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
27141 + reiser4_free_dentry_fsdata(&dots_entry);
27142 +
27143 + if (result == 0) {
27144 + result = reiser4_add_nlink(object, object, 0);
27145 + if (result == 0) {
27146 + entry.obj = dots_entry.d_inode = parent;
27147 + dots_entry.d_name.name = "..";
27148 + dots_entry.d_name.len = 2;
27149 + result = reiser4_add_entry_common(object,
27150 + &dots_entry, NULL, &entry);
27151 + reiser4_free_dentry_fsdata(&dots_entry);
27152 + /* if creation of ".." failed, iput() will delete
27153 + object with ".". */
27154 + if (result == 0) {
27155 + result = reiser4_add_nlink(parent, object, 0);
27156 + if (result != 0)
27157 + /*
27158 + * if we failed to bump i_nlink, try
27159 + * to remove ".."
27160 + */
27161 + reiser4_detach_common(object, parent);
27162 + }
27163 + }
27164 + }
27165 +
27166 + if (result != 0) {
27167 + /*
27168 + * in the case of error, at least update stat-data so that,
27169 + * ->i_nlink updates are not lingering.
27170 + */
27171 + reiser4_update_sd(object);
27172 + reiser4_update_sd(parent);
27173 + }
27174 +
27175 + return result;
27176 +}
27177 +
27178 +/*
27179 + * return 0 iff @coord contains a directory entry for the file with the name
27180 + * @name.
27181 + */
27182 +static int
27183 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
27184 +{
27185 + item_plugin *iplug;
27186 + char buf[DE_NAME_BUF_LEN];
27187 +
27188 + iplug = item_plugin_by_coord(coord);
27189 + if (iplug == NULL) {
27190 + warning("nikita-1135", "Cannot get item plugin");
27191 + print_coord("coord", coord, 1);
27192 + return RETERR(-EIO);
27193 + } else if (item_id_by_coord(coord) !=
27194 + item_id_by_plugin(inode_dir_item_plugin(dir))) {
27195 + /* item id of current item does not match to id of items a
27196 + directory is built of */
27197 + warning("nikita-1136", "Wrong item plugin");
27198 + print_coord("coord", coord, 1);
27199 + return RETERR(-EIO);
27200 + }
27201 + assert("nikita-1137", iplug->s.dir.extract_name);
27202 +
27203 + /* Compare name stored in this entry with name we are looking for.
27204 +
27205 + NOTE-NIKITA Here should go code for support of something like
27206 + unicode, code tables, etc.
27207 + */
27208 + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
27209 +}
27210 +
27211 +static int
27212 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
27213 +{
27214 + return WITH_COORD(coord, check_item(dir, coord, name->name));
27215 +}
27216 +
27217 +/*
27218 + * argument package used by entry_actor to scan entries with identical keys.
27219 + */
27220 +struct entry_actor_args {
27221 + /* name we are looking for */
27222 + const char *name;
27223 + /* key of directory entry. entry_actor() scans through sequence of
27224 + * items/units having the same key */
27225 + reiser4_key *key;
27226 + /* how many entries with duplicate key was scanned so far. */
27227 + int non_uniq;
27228 +#if REISER4_USE_COLLISION_LIMIT
27229 + /* scan limit */
27230 + int max_non_uniq;
27231 +#endif
27232 + /* return parameter: set to true, if ->name wasn't found */
27233 + int not_found;
27234 + /* what type of lock to take when moving to the next node during
27235 + * scan */
27236 + znode_lock_mode mode;
27237 +
27238 + /* last coord that was visited during scan */
27239 + coord_t last_coord;
27240 + /* last node locked during scan */
27241 + lock_handle last_lh;
27242 + /* inode of directory */
27243 + const struct inode *inode;
27244 +};
27245 +
27246 +/* Function called by reiser4_find_entry() to look for given name
27247 + in the directory. */
27248 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
27249 + coord_t * coord /* current coord */ ,
27250 + lock_handle * lh /* current lock handle */ ,
27251 + void *entry_actor_arg /* argument to scan */ )
27252 +{
27253 + reiser4_key unit_key;
27254 + struct entry_actor_args *args;
27255 +
27256 + assert("nikita-1131", tree != NULL);
27257 + assert("nikita-1132", coord != NULL);
27258 + assert("nikita-1133", entry_actor_arg != NULL);
27259 +
27260 + args = entry_actor_arg;
27261 + ++args->non_uniq;
27262 +#if REISER4_USE_COLLISION_LIMIT
27263 + if (args->non_uniq > args->max_non_uniq) {
27264 + args->not_found = 1;
27265 + /* hash collision overflow. */
27266 + return RETERR(-EBUSY);
27267 + }
27268 +#endif
27269 +
27270 + /*
27271 + * did we just reach the end of the sequence of items/units with
27272 + * identical keys?
27273 + */
27274 + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
27275 + assert("nikita-1791",
27276 + keylt(args->key, unit_key_by_coord(coord, &unit_key)));
27277 + args->not_found = 1;
27278 + args->last_coord.between = AFTER_UNIT;
27279 + return 0;
27280 + }
27281 +
27282 + coord_dup(&args->last_coord, coord);
27283 + /*
27284 + * did scan just moved to the next node?
27285 + */
27286 + if (args->last_lh.node != lh->node) {
27287 + int lock_result;
27288 +
27289 + /*
27290 + * if so, lock new node with the mode requested by the caller
27291 + */
27292 + done_lh(&args->last_lh);
27293 + assert("nikita-1896", znode_is_any_locked(lh->node));
27294 + lock_result = longterm_lock_znode(&args->last_lh, lh->node,
27295 + args->mode, ZNODE_LOCK_HIPRI);
27296 + if (lock_result != 0)
27297 + return lock_result;
27298 + }
27299 + return check_item(args->inode, coord, args->name);
27300 +}
27301 +
27302 +/* Look for given @name within directory @dir.
27303 +
27304 + This is called during lookup, creation and removal of directory
27305 + entries and on reiser4_rename_common
27306 +
27307 + First calculate key that directory entry for @name would have. Search
27308 + for this key in the tree. If such key is found, scan all items with
27309 + the same key, checking name in each directory entry along the way.
27310 +*/
27311 +int reiser4_find_entry(struct inode *dir, /* directory to scan */
27312 + struct dentry *de, /* name to search for */
27313 + lock_handle * lh, /* resulting lock handle */
27314 + znode_lock_mode mode, /* required lock mode */
27315 + reiser4_dir_entry_desc * entry /* parameters of found
27316 + directory entry */)
27317 +{
27318 + const struct qstr *name;
27319 + seal_t *seal;
27320 + coord_t *coord;
27321 + int result;
27322 + __u32 flags;
27323 + struct de_location *dec;
27324 + struct reiser4_dentry_fsdata *fsdata;
27325 +
27326 + assert("nikita-1130", lh != NULL);
27327 + assert("nikita-1128", dir != NULL);
27328 +
27329 + name = &de->d_name;
27330 + assert("nikita-1129", name != NULL);
27331 +
27332 + /* dentry private data don't require lock, because dentry
27333 + manipulations are protected by i_mutex on parent.
27334 +
27335 + This is not so for inodes, because there is no -the- parent in
27336 + inode case.
27337 + */
27338 + fsdata = reiser4_get_dentry_fsdata(de);
27339 + if (IS_ERR(fsdata))
27340 + return PTR_ERR(fsdata);
27341 + dec = &fsdata->dec;
27342 +
27343 + coord = &dec->entry_coord;
27344 + coord_clear_iplug(coord);
27345 + seal = &dec->entry_seal;
27346 + /* compose key of directory entry for @name */
27347 + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
27348 +
27349 + if (reiser4_seal_is_set(seal)) {
27350 + /* check seal */
27351 + result = reiser4_seal_validate(seal, coord, &entry->key,
27352 + lh, mode, ZNODE_LOCK_LOPRI);
27353 + if (result == 0) {
27354 + /* key was found. Check that it is really item we are
27355 + looking for. */
27356 + result = check_entry(dir, coord, name);
27357 + if (result == 0)
27358 + return 0;
27359 + }
27360 + }
27361 + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
27362 + /*
27363 + * find place in the tree where directory item should be located.
27364 + */
27365 + result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
27366 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
27367 + flags, NULL /*ra_info */ );
27368 + if (result == CBK_COORD_FOUND) {
27369 + struct entry_actor_args arg;
27370 +
27371 + /* fast path: no hash collisions */
27372 + result = check_entry(dir, coord, name);
27373 + if (result == 0) {
27374 + reiser4_seal_init(seal, coord, &entry->key);
27375 + dec->pos = 0;
27376 + } else if (result > 0) {
27377 + /* Iterate through all units with the same keys. */
27378 + arg.name = name->name;
27379 + arg.key = &entry->key;
27380 + arg.not_found = 0;
27381 + arg.non_uniq = 0;
27382 +#if REISER4_USE_COLLISION_LIMIT
27383 + arg.max_non_uniq = max_hash_collisions(dir);
27384 + assert("nikita-2851", arg.max_non_uniq > 1);
27385 +#endif
27386 + arg.mode = mode;
27387 + arg.inode = dir;
27388 + coord_init_zero(&arg.last_coord);
27389 + init_lh(&arg.last_lh);
27390 +
27391 + result = reiser4_iterate_tree
27392 + (reiser4_tree_by_inode(dir),
27393 + coord, lh,
27394 + entry_actor, &arg, mode, 1);
27395 + /* if end of the tree or extent was reached during
27396 + scanning. */
27397 + if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
27398 + /* step back */
27399 + done_lh(lh);
27400 +
27401 + result = zload(arg.last_coord.node);
27402 + if (result == 0) {
27403 + coord_clear_iplug(&arg.last_coord);
27404 + coord_dup(coord, &arg.last_coord);
27405 + move_lh(lh, &arg.last_lh);
27406 + result = RETERR(-ENOENT);
27407 + zrelse(arg.last_coord.node);
27408 + --arg.non_uniq;
27409 + }
27410 + }
27411 +
27412 + done_lh(&arg.last_lh);
27413 + if (result == 0)
27414 + reiser4_seal_init(seal, coord, &entry->key);
27415 +
27416 + if (result == 0 || result == -ENOENT) {
27417 + assert("nikita-2580", arg.non_uniq > 0);
27418 + dec->pos = arg.non_uniq - 1;
27419 + }
27420 + }
27421 + } else
27422 + dec->pos = -1;
27423 + return result;
27424 +}
27425 +
27426 +/*
27427 + Local variables:
27428 + c-indentation-style: "K&R"
27429 + mode-name: "LC"
27430 + c-basic-offset: 8
27431 + tab-width: 8
27432 + fill-column: 120
27433 + scroll-step: 1
27434 + End:
27435 +*/
27436 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.c
27437 --- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
27438 +++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.c 2007-12-04 16:49:30.000000000 +0300
27439 @@ -0,0 +1,655 @@
27440 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27441 +
27442 +#include "../../debug.h"
27443 +#include "../../dformat.h"
27444 +#include "../../key.h"
27445 +#include "../node/node.h"
27446 +#include "../space/space_allocator.h"
27447 +#include "disk_format40.h"
27448 +#include "../plugin.h"
27449 +#include "../../txnmgr.h"
27450 +#include "../../jnode.h"
27451 +#include "../../tree.h"
27452 +#include "../../super.h"
27453 +#include "../../wander.h"
27454 +#include "../../inode.h"
27455 +#include "../../ktxnmgrd.h"
27456 +#include "../../status_flags.h"
27457 +
27458 +#include <linux/types.h> /* for __u?? */
27459 +#include <linux/fs.h> /* for struct super_block */
27460 +#include <linux/buffer_head.h>
27461 +
27462 +/* reiser 4.0 default disk layout */
27463 +
27464 +/* Amount of free blocks needed to perform release_format40 when fs gets
27465 + mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
27466 + & tx record. */
27467 +#define RELEASE_RESERVED 4
27468 +
27469 +/* The greatest supported format40 version number */
27470 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
27471 +
27472 +/* This flag indicates that backup should be updated
27473 + (the update is performed by fsck) */
27474 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
27475 +
27476 +/* functions to access fields of format40_disk_super_block */
27477 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
27478 +{
27479 + return le64_to_cpu(get_unaligned(&sb->block_count));
27480 +}
27481 +
27482 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
27483 +{
27484 + return le64_to_cpu(get_unaligned(&sb->free_blocks));
27485 +}
27486 +
27487 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
27488 +{
27489 + return le64_to_cpu(get_unaligned(&sb->root_block));
27490 +}
27491 +
27492 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
27493 +{
27494 + return le16_to_cpu(get_unaligned(&sb->tree_height));
27495 +}
27496 +
27497 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
27498 +{
27499 + return le64_to_cpu(get_unaligned(&sb->file_count));
27500 +}
27501 +
27502 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
27503 +{
27504 + return le64_to_cpu(get_unaligned(&sb->oid));
27505 +}
27506 +
27507 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
27508 +{
27509 + return le32_to_cpu(get_unaligned(&sb->mkfs_id));
27510 +}
27511 +
27512 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
27513 +{
27514 + return le64_to_cpu(get_unaligned(&sb->flags));
27515 +}
27516 +
27517 +static __u32 get_format40_version(const format40_disk_super_block * sb)
27518 +{
27519 + return le32_to_cpu(get_unaligned(&sb->version)) &
27520 + ~FORMAT40_UPDATE_BACKUP;
27521 +}
27522 +
27523 +static int update_backup_version(const format40_disk_super_block * sb)
27524 +{
27525 + return (le32_to_cpu(get_unaligned(&sb->version)) &
27526 + FORMAT40_UPDATE_BACKUP);
27527 +}
27528 +
27529 +static int update_disk_version(const format40_disk_super_block * sb)
27530 +{
27531 + return (get_format40_version(sb) < FORMAT40_VERSION);
27532 +}
27533 +
27534 +static int incomplete_compatibility(const format40_disk_super_block * sb)
27535 +{
27536 + return (get_format40_version(sb) > FORMAT40_VERSION);
27537 +}
27538 +
27539 +static format40_super_info *get_sb_info(struct super_block *super)
27540 +{
27541 + return &get_super_private(super)->u.format40;
27542 +}
27543 +
27544 +static int consult_diskmap(struct super_block *s)
27545 +{
27546 + format40_super_info *info;
27547 + journal_location *jloc;
27548 +
27549 + info = get_sb_info(s);
27550 + jloc = &get_super_private(s)->jloc;
27551 + /* Default format-specific locations, if there is nothing in
27552 + * diskmap */
27553 + jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
27554 + jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
27555 + info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
27556 +#ifdef CONFIG_REISER4_BADBLOCKS
27557 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
27558 + &jloc->footer);
27559 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
27560 + &jloc->header);
27561 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
27562 + &info->loc.super);
27563 +#endif
27564 + return 0;
27565 +}
27566 +
27567 +/* find any valid super block of disk_format40 (even if the first
27568 + super block is destroyed), will change block numbers of actual journal header/footer (jf/jh)
27569 + if needed */
27570 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
27571 + *s)
27572 +{
27573 + struct buffer_head *super_bh;
27574 + format40_disk_super_block *disk_sb;
27575 + format40_super_info *info;
27576 +
27577 + assert("umka-487", s != NULL);
27578 +
27579 + info = get_sb_info(s);
27580 +
27581 + super_bh = sb_bread(s, info->loc.super);
27582 + if (super_bh == NULL)
27583 + return ERR_PTR(RETERR(-EIO));
27584 +
27585 + disk_sb = (format40_disk_super_block *) super_bh->b_data;
27586 + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
27587 + brelse(super_bh);
27588 + return ERR_PTR(RETERR(-EINVAL));
27589 + }
27590 +
27591 + reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
27592 + reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
27593 + le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27594 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27595 +
27596 + return super_bh;
27597 +}
27598 +
27599 +/* find the most recent version of super block. This is called after journal is
27600 + replayed */
27601 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
27602 +{
27603 + /* Here the most recent superblock copy has to be read. However, as
27604 + journal replay isn't complete, we are using
27605 + find_a_disk_format40_super_block() function. */
27606 + return find_a_disk_format40_super_block(s);
27607 +}
27608 +
27609 +static int get_super_jnode(struct super_block *s)
27610 +{
27611 + reiser4_super_info_data *sbinfo = get_super_private(s);
27612 + jnode *sb_jnode;
27613 + int ret;
27614 +
27615 + sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
27616 +
27617 + ret = jload(sb_jnode);
27618 +
27619 + if (ret) {
27620 + reiser4_drop_io_head(sb_jnode);
27621 + return ret;
27622 + }
27623 +
27624 + pin_jnode_data(sb_jnode);
27625 + jrelse(sb_jnode);
27626 +
27627 + sbinfo->u.format40.sb_jnode = sb_jnode;
27628 +
27629 + return 0;
27630 +}
27631 +
27632 +static void done_super_jnode(struct super_block *s)
27633 +{
27634 + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27635 +
27636 + if (sb_jnode) {
27637 + unpin_jnode_data(sb_jnode);
27638 + reiser4_drop_io_head(sb_jnode);
27639 + }
27640 +}
27641 +
27642 +typedef enum format40_init_stage {
27643 + NONE_DONE = 0,
27644 + CONSULT_DISKMAP,
27645 + FIND_A_SUPER,
27646 + INIT_JOURNAL_INFO,
27647 + INIT_STATUS,
27648 + JOURNAL_REPLAY,
27649 + READ_SUPER,
27650 + KEY_CHECK,
27651 + INIT_OID,
27652 + INIT_TREE,
27653 + JOURNAL_RECOVER,
27654 + INIT_SA,
27655 + INIT_JNODE,
27656 + ALL_DONE
27657 +} format40_init_stage;
27658 +
27659 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
27660 +{
27661 + format40_disk_super_block *sb_copy;
27662 +
27663 + sb_copy = kmalloc(sizeof(format40_disk_super_block),
27664 + reiser4_ctx_gfp_mask_get());
27665 + if (sb_copy == NULL)
27666 + return ERR_PTR(RETERR(-ENOMEM));
27667 + memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
27668 + sizeof(format40_disk_super_block));
27669 + return sb_copy;
27670 +}
27671 +
27672 +static int check_key_format(const format40_disk_super_block *sb_copy)
27673 +{
27674 + if (!equi(REISER4_LARGE_KEY,
27675 + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
27676 + warning("nikita-3228", "Key format mismatch. "
27677 + "Only %s keys are supported.",
27678 + REISER4_LARGE_KEY ? "large" : "small");
27679 + return RETERR(-EINVAL);
27680 + }
27681 + return 0;
27682 +}
27683 +
27684 +/**
27685 + * try_init_format40
27686 + * @super:
27687 + * @stage:
27688 + *
27689 + */
27690 +static int try_init_format40(struct super_block *super,
27691 + format40_init_stage *stage)
27692 +{
27693 + int result;
27694 + struct buffer_head *super_bh;
27695 + reiser4_super_info_data *sbinfo;
27696 + format40_disk_super_block *sb_copy;
27697 + tree_level height;
27698 + reiser4_block_nr root_block;
27699 + node_plugin *nplug;
27700 +
27701 + assert("vs-475", super != NULL);
27702 + assert("vs-474", get_super_private(super));
27703 +
27704 + *stage = NONE_DONE;
27705 +
27706 + result = consult_diskmap(super);
27707 + if (result)
27708 + return result;
27709 + *stage = CONSULT_DISKMAP;
27710 +
27711 + super_bh = find_a_disk_format40_super_block(super);
27712 + if (IS_ERR(super_bh))
27713 + return PTR_ERR(super_bh);
27714 + brelse(super_bh);
27715 + *stage = FIND_A_SUPER;
27716 +
27717 + /* ok, we are sure that filesystem format is a format40 format */
27718 +
27719 + /* map jnodes for journal control blocks (header, footer) to disk */
27720 + result = reiser4_init_journal_info(super);
27721 + if (result)
27722 + return result;
27723 + *stage = INIT_JOURNAL_INFO;
27724 +
27725 + /* ok, we are sure that filesystem format is a format40 format */
27726 + /* Now check it's state */
27727 + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
27728 + if (result != 0 && result != -EINVAL)
27729 + /* -EINVAL means there is no magic, so probably just old
27730 + * fs. */
27731 + return result;
27732 + *stage = INIT_STATUS;
27733 +
27734 + result = reiser4_status_query(NULL, NULL);
27735 + if (result == REISER4_STATUS_MOUNT_WARN)
27736 + notice("vpf-1363", "Warning: mounting %s with errors.",
27737 + super->s_id);
27738 + if (result == REISER4_STATUS_MOUNT_RO)
27739 + notice("vpf-1364", "Warning: mounting %s with fatal errors,"
27740 + " forcing read-only mount.", super->s_id);
27741 + result = reiser4_journal_replay(super);
27742 + if (result)
27743 + return result;
27744 + *stage = JOURNAL_REPLAY;
27745 +
27746 + super_bh = read_super_block(super);
27747 + if (IS_ERR(super_bh))
27748 + return PTR_ERR(super_bh);
27749 + *stage = READ_SUPER;
27750 +
27751 + /* allocate and make a copy of format40_disk_super_block */
27752 + sb_copy = copy_sb(super_bh);
27753 + brelse(super_bh);
27754 +
27755 + if (IS_ERR(sb_copy))
27756 + return PTR_ERR(sb_copy);
27757 + printk("reiser4: %s: found disk format 4.0.%u.\n",
27758 + super->s_id,
27759 + get_format40_version(sb_copy));
27760 + if (incomplete_compatibility(sb_copy))
27761 + printk("reiser4: Warning: The last completely supported "
27762 + "version of disk format40 is %u. Some objects of "
27763 + "the semantic tree can be unaccessible.\n",
27764 + FORMAT40_VERSION);
27765 + /* make sure that key format of kernel and filesystem match */
27766 + result = check_key_format(sb_copy);
27767 + if (result) {
27768 + kfree(sb_copy);
27769 + return result;
27770 + }
27771 + *stage = KEY_CHECK;
27772 +
27773 + result = oid_init_allocator(super, get_format40_file_count(sb_copy),
27774 + get_format40_oid(sb_copy));
27775 + if (result) {
27776 + kfree(sb_copy);
27777 + return result;
27778 + }
27779 + *stage = INIT_OID;
27780 +
27781 + /* get things necessary to init reiser4_tree */
27782 + root_block = get_format40_root_block(sb_copy);
27783 + height = get_format40_tree_height(sb_copy);
27784 + nplug = node_plugin_by_id(NODE40_ID);
27785 +
27786 + /* initialize reiser4_super_info_data */
27787 + sbinfo = get_super_private(super);
27788 + assert("", sbinfo->tree.super == super);
27789 + /* init reiser4_tree for the filesystem */
27790 + result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
27791 + if (result) {
27792 + kfree(sb_copy);
27793 + return result;
27794 + }
27795 + *stage = INIT_TREE;
27796 +
27797 + /*
27798 + * initialize reiser4_super_info_data with data from format40 super
27799 + * block
27800 + */
27801 + sbinfo->default_uid = 0;
27802 + sbinfo->default_gid = 0;
27803 + sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
27804 + /* number of blocks in filesystem and reserved space */
27805 + reiser4_set_block_count(super, get_format40_block_count(sb_copy));
27806 + sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
27807 + sbinfo->version = get_format40_version(sb_copy);
27808 + kfree(sb_copy);
27809 +
27810 + if (update_backup_version(sb_copy))
27811 + printk("reiser4: Warning: metadata backup is not updated. "
27812 + "Please run 'fsck.reiser4 --fix' on %s.\n",
27813 + super->s_id);
27814 +
27815 + sbinfo->fsuid = 0;
27816 + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
27817 + * are not supported */
27818 + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
27819 + * layout 40 are
27820 + * of one
27821 + * plugin */
27822 + /* sbinfo->tmgr is initialized already */
27823 +
27824 + /* recover sb data which were logged separately from sb block */
27825 +
27826 + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
27827 + * oid_init_allocator() and reiser4_set_free_blocks() with new
27828 + * data. What's the reason to call them above? */
27829 + result = reiser4_journal_recover_sb_data(super);
27830 + if (result != 0)
27831 + return result;
27832 + *stage = JOURNAL_RECOVER;
27833 +
27834 + /*
27835 + * Set number of used blocks. The number of used blocks is not stored
27836 + * neither in on-disk super block nor in the journal footer blocks. At
27837 + * this moment actual values of total blocks and free block counters
27838 + * are set in the reiser4 super block (in-memory structure) and we can
27839 + * calculate number of used blocks from them.
27840 + */
27841 + reiser4_set_data_blocks(super,
27842 + reiser4_block_count(super) -
27843 + reiser4_free_blocks(super));
27844 +
27845 +#if REISER4_DEBUG
27846 + sbinfo->min_blocks_used = 16 /* reserved area */ +
27847 + 2 /* super blocks */ +
27848 + 2 /* journal footer and header */ ;
27849 +#endif
27850 +
27851 + /* init disk space allocator */
27852 + result = sa_init_allocator(reiser4_get_space_allocator(super),
27853 + super, NULL);
27854 + if (result)
27855 + return result;
27856 + *stage = INIT_SA;
27857 +
27858 + result = get_super_jnode(super);
27859 + if (result == 0)
27860 + *stage = ALL_DONE;
27861 + return result;
27862 +}
27863 +
27864 +/* plugin->u.format.get_ready */
27865 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
27866 +{
27867 + int result;
27868 + format40_init_stage stage;
27869 +
27870 + result = try_init_format40(s, &stage);
27871 + switch (stage) {
27872 + case ALL_DONE:
27873 + assert("nikita-3458", result == 0);
27874 + break;
27875 + case INIT_JNODE:
27876 + done_super_jnode(s);
27877 + case INIT_SA:
27878 + sa_destroy_allocator(reiser4_get_space_allocator(s), s);
27879 + case JOURNAL_RECOVER:
27880 + case INIT_TREE:
27881 + reiser4_done_tree(&get_super_private(s)->tree);
27882 + case INIT_OID:
27883 + case KEY_CHECK:
27884 + case READ_SUPER:
27885 + case JOURNAL_REPLAY:
27886 + case INIT_STATUS:
27887 + reiser4_status_finish();
27888 + case INIT_JOURNAL_INFO:
27889 + reiser4_done_journal_info(s);
27890 + case FIND_A_SUPER:
27891 + case CONSULT_DISKMAP:
27892 + case NONE_DONE:
27893 + break;
27894 + default:
27895 + impossible("nikita-3457", "init stage: %i", stage);
27896 + }
27897 +
27898 + if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
27899 + return RETERR(-ENOSPC);
27900 +
27901 + return result;
27902 +}
27903 +
27904 +static void pack_format40_super(const struct super_block *s, char *data)
27905 +{
27906 + format40_disk_super_block *super_data =
27907 + (format40_disk_super_block *) data;
27908 +
27909 + reiser4_super_info_data *sbinfo = get_super_private(s);
27910 +
27911 + assert("zam-591", data != NULL);
27912 +
27913 + put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
27914 + &super_data->free_blocks);
27915 +
27916 + put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
27917 + &super_data->root_block);
27918 +
27919 + put_unaligned(cpu_to_le64(oid_next(s)),
27920 + &super_data->oid);
27921 +
27922 + put_unaligned(cpu_to_le64(oids_used(s)),
27923 + &super_data->file_count);
27924 +
27925 + put_unaligned(cpu_to_le16(sbinfo->tree.height),
27926 + &super_data->tree_height);
27927 +
27928 + if (update_disk_version(super_data)) {
27929 + __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
27930 +
27931 + put_unaligned(cpu_to_le32(version), &super_data->version);
27932 + }
27933 +}
27934 +
27935 +/* plugin->u.format.log_super
27936 + return a jnode which should be added to transaction when the super block
27937 + gets logged */
27938 +jnode *log_super_format40(struct super_block *s)
27939 +{
27940 + jnode *sb_jnode;
27941 +
27942 + sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27943 +
27944 + jload(sb_jnode);
27945 +
27946 + pack_format40_super(s, jdata(sb_jnode));
27947 +
27948 + jrelse(sb_jnode);
27949 +
27950 + return sb_jnode;
27951 +}
27952 +
27953 +/* plugin->u.format.release */
27954 +int release_format40(struct super_block *s)
27955 +{
27956 + int ret;
27957 + reiser4_super_info_data *sbinfo;
27958 +
27959 + sbinfo = get_super_private(s);
27960 + assert("zam-579", sbinfo != NULL);
27961 +
27962 + if (!rofs_super(s)) {
27963 + ret = reiser4_capture_super_block(s);
27964 + if (ret != 0)
27965 + warning("vs-898",
27966 + "reiser4_capture_super_block failed: %d",
27967 + ret);
27968 +
27969 + ret = txnmgr_force_commit_all(s, 1);
27970 + if (ret != 0)
27971 + warning("jmacd-74438", "txn_force failed: %d", ret);
27972 +
27973 + all_grabbed2free();
27974 + }
27975 +
27976 + sa_destroy_allocator(&sbinfo->space_allocator, s);
27977 + reiser4_done_journal_info(s);
27978 + done_super_jnode(s);
27979 +
27980 + rcu_barrier();
27981 + reiser4_done_tree(&sbinfo->tree);
27982 + /* call finish_rcu(), because some znode were "released" in
27983 + * reiser4_done_tree(). */
27984 + rcu_barrier();
27985 +
27986 + return 0;
27987 +}
27988 +
27989 +#define FORMAT40_ROOT_LOCALITY 41
27990 +#define FORMAT40_ROOT_OBJECTID 42
27991 +
27992 +/* plugin->u.format.root_dir_key */
27993 +const reiser4_key *root_dir_key_format40(const struct super_block *super
27994 + UNUSED_ARG)
27995 +{
27996 + static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
27997 + .el = {
27998 + __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
27999 +#if REISER4_LARGE_KEY
28000 + ON_LARGE_KEY(0ull,)
28001 +#endif
28002 + __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
28003 + 0ull
28004 + }
28005 + };
28006 +
28007 + return &FORMAT40_ROOT_DIR_KEY;
28008 +}
28009 +
28010 +/* plugin->u.format.check_open.
28011 + Check the opened object for validity. For now it checks for the valid oid &
28012 + locality only, can be improved later and its work may depend on the mount
28013 + options. */
28014 +int check_open_format40(const struct inode *object)
28015 +{
28016 + oid_t max, oid;
28017 +
28018 + max = oid_next(object->i_sb) - 1;
28019 +
28020 + /* Check the oid. */
28021 + oid = get_inode_oid(object);
28022 + if (oid > max) {
28023 + warning("vpf-1360", "The object with the oid %llu "
28024 + "greater then the max used oid %llu found.",
28025 + (unsigned long long)oid, (unsigned long long)max);
28026 +
28027 + return RETERR(-EIO);
28028 + }
28029 +
28030 + /* Check the locality. */
28031 + oid = reiser4_inode_data(object)->locality_id;
28032 + if (oid > max) {
28033 + warning("vpf-1361", "The object with the locality %llu "
28034 + "greater then the max used oid %llu found.",
28035 + (unsigned long long)oid, (unsigned long long)max);
28036 +
28037 + return RETERR(-EIO);
28038 + }
28039 +
28040 + return 0;
28041 +}
28042 +
28043 +/* plugin->u.format.version_update.
28044 + Perform all version update operations from the on-disk
28045 + format40_disk_super_block.version on disk to FORMAT40_VERSION.
28046 + */
28047 +int version_update_format40(struct super_block *super) {
28048 + txn_handle * trans;
28049 + lock_handle lh;
28050 + txn_atom *atom;
28051 + int ret;
28052 +
28053 + /* Nothing to do if RO mount or the on-disk version is not less. */
28054 + if (super->s_flags & MS_RDONLY)
28055 + return 0;
28056 +
28057 + if (get_super_private(super)->version >= FORMAT40_VERSION)
28058 + return 0;
28059 +
28060 + printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
28061 + "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
28062 + "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
28063 +
28064 + /* Mark the uber znode dirty to call log_super on write_logs. */
28065 + init_lh(&lh);
28066 + ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
28067 + ZNODE_LOCK_HIPRI, &lh);
28068 + if (ret != 0)
28069 + return ret;
28070 +
28071 + znode_make_dirty(lh.node);
28072 + done_lh(&lh);
28073 +
28074 + /* Update the backup blocks. */
28075 +
28076 + /* Force write_logs immediately. */
28077 + trans = get_current_context()->trans;
28078 + atom = get_current_atom_locked();
28079 + assert("vpf-1906", atom != NULL);
28080 +
28081 + spin_lock_txnh(trans);
28082 + return force_commit_atom(trans);
28083 +}
28084 +
28085 +/* Make Linus happy.
28086 + Local variables:
28087 + c-indentation-style: "K&R"
28088 + mode-name: "LC"
28089 + c-basic-offset: 8
28090 + tab-width: 8
28091 + fill-column: 120
28092 + scroll-step: 1
28093 + End:
28094 +*/
28095 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.h
28096 --- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
28097 +++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format40.h 2007-12-04 16:49:30.000000000 +0300
28098 @@ -0,0 +1,109 @@
28099 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28100 +
28101 +/* this file contains:
28102 + - definition of ondisk super block of standard disk layout for
28103 + reiser 4.0 (layout 40)
28104 + - definition of layout 40 specific portion of in-core super block
28105 + - declarations of functions implementing methods of layout plugin
28106 + for layout 40
28107 + - declarations of functions used to get/set fields in layout 40 super block
28108 +*/
28109 +
28110 +#ifndef __DISK_FORMAT40_H__
28111 +#define __DISK_FORMAT40_H__
28112 +
28113 +/* magic for default reiser4 layout */
28114 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
28115 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
28116 +
28117 +#include "../../dformat.h"
28118 +
28119 +#include <linux/fs.h> /* for struct super_block */
28120 +
28121 +typedef enum {
28122 + FORMAT40_LARGE_KEYS
28123 +} format40_flags;
28124 +
28125 +/* ondisk super block for format 40. It is 512 bytes long */
28126 +typedef struct format40_disk_super_block {
28127 + /* 0 */ d64 block_count;
28128 + /* number of block in a filesystem */
28129 + /* 8 */ d64 free_blocks;
28130 + /* number of free blocks */
28131 + /* 16 */ d64 root_block;
28132 + /* filesystem tree root block */
28133 + /* 24 */ d64 oid;
28134 + /* smallest free objectid */
28135 + /* 32 */ d64 file_count;
28136 + /* number of files in a filesystem */
28137 + /* 40 */ d64 flushes;
28138 + /* number of times super block was
28139 + flushed. Needed if format 40
28140 + will have few super blocks */
28141 + /* 48 */ d32 mkfs_id;
28142 + /* unique identifier of fs */
28143 + /* 52 */ char magic[16];
28144 + /* magic string ReIsEr40FoRmAt */
28145 + /* 68 */ d16 tree_height;
28146 + /* height of filesystem tree */
28147 + /* 70 */ d16 formatting_policy;
28148 + /* not used anymore */
28149 + /* 72 */ d64 flags;
28150 + /* 80 */ d32 version;
28151 + /* on-disk format version number
28152 + initially assigned by mkfs as the greatest format40
28153 + version number supported by reiser4progs and updated
28154 + in mount time in accordance with the greatest format40
28155 + version number supported by kernel.
28156 + Is used by fsck to catch possible corruption and
28157 + for various compatibility issues */
28158 + /* 84 */ char not_used[428];
28159 +} format40_disk_super_block;
28160 +
28161 +/* format 40 specific part of reiser4_super_info_data */
28162 +typedef struct format40_super_info {
28163 +/* format40_disk_super_block actual_sb; */
28164 + jnode *sb_jnode;
28165 + struct {
28166 + reiser4_block_nr super;
28167 + } loc;
28168 +} format40_super_info;
28169 +
28170 +/* Defines for journal header and footer respectively. */
28171 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
28172 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
28173 +
28174 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
28175 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
28176 +
28177 +#define FORMAT40_STATUS_BLOCKNR \
28178 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
28179 +
28180 +/* Diskmap declarations */
28181 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
28182 +#define FORMAT40_SUPER 1
28183 +#define FORMAT40_JH 2
28184 +#define FORMAT40_JF 3
28185 +
28186 +/* declarations of functions implementing methods of layout plugin for
28187 + format 40. The functions themselves are in disk_format40.c */
28188 +extern int init_format_format40(struct super_block *, void *data);
28189 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
28190 +extern int release_format40(struct super_block *s);
28191 +extern jnode *log_super_format40(struct super_block *s);
28192 +extern int check_open_format40(const struct inode *object);
28193 +extern int version_update_format40(struct super_block *super);
28194 +
28195 +/* __DISK_FORMAT40_H__ */
28196 +#endif
28197 +
28198 +/* Make Linus happy.
28199 + Local variables:
28200 + c-indentation-style: "K&R"
28201 + mode-name: "LC"
28202 + c-basic-offset: 8
28203 + tab-width: 8
28204 + fill-column: 120
28205 + scroll-step: 1
28206 + End:
28207 +*/
28208 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.c
28209 --- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
28210 +++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.c 2007-12-04 16:49:30.000000000 +0300
28211 @@ -0,0 +1,38 @@
28212 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28213 +
28214 +#include "../../debug.h"
28215 +#include "../plugin_header.h"
28216 +#include "disk_format40.h"
28217 +#include "disk_format.h"
28218 +#include "../plugin.h"
28219 +
28220 +/* initialization of disk layout plugins */
28221 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28222 + [FORMAT40_ID] = {
28223 + .h = {
28224 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28225 + .id = FORMAT40_ID,
28226 + .pops = NULL,
28227 + .label = "reiser40",
28228 + .desc = "standard disk layout for reiser40",
28229 + .linkage = {NULL, NULL}
28230 + },
28231 + .init_format = init_format_format40,
28232 + .root_dir_key = root_dir_key_format40,
28233 + .release = release_format40,
28234 + .log_super = log_super_format40,
28235 + .check_open = check_open_format40,
28236 + .version_update = version_update_format40
28237 + }
28238 +};
28239 +
28240 +/* Make Linus happy.
28241 + Local variables:
28242 + c-indentation-style: "K&R"
28243 + mode-name: "LC"
28244 + c-basic-offset: 8
28245 + tab-width: 8
28246 + fill-column: 120
28247 + scroll-step: 1
28248 + End:
28249 +*/
28250 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.h
28251 --- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
28252 +++ linux-2.6.23/fs/reiser4/plugin/disk_format/disk_format.h 2007-12-04 16:49:30.000000000 +0300
28253 @@ -0,0 +1,27 @@
28254 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28255 +
28256 +/* identifiers for disk layouts, they are also used as indexes in array of disk
28257 + plugins */
28258 +
28259 +#if !defined( __REISER4_DISK_FORMAT_H__ )
28260 +#define __REISER4_DISK_FORMAT_H__
28261 +
28262 +typedef enum {
28263 + /* standard reiser4 disk layout plugin id */
28264 + FORMAT40_ID,
28265 + LAST_FORMAT_ID
28266 +} disk_format_id;
28267 +
28268 +/* __REISER4_DISK_FORMAT_H__ */
28269 +#endif
28270 +
28271 +/* Make Linus happy.
28272 + Local variables:
28273 + c-indentation-style: "K&R"
28274 + mode-name: "LC"
28275 + c-basic-offset: 8
28276 + tab-width: 8
28277 + fill-column: 120
28278 + scroll-step: 1
28279 + End:
28280 +*/
28281 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.23/fs/reiser4/plugin/disk_format/Makefile
28282 --- linux-2.6.23.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
28283 +++ linux-2.6.23/fs/reiser4/plugin/disk_format/Makefile 2007-12-04 16:49:30.000000000 +0300
28284 @@ -0,0 +1,5 @@
28285 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
28286 +
28287 +df_plugins-objs := \
28288 + disk_format40.o \
28289 + disk_format.o
28290 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/fibration.c linux-2.6.23/fs/reiser4/plugin/fibration.c
28291 --- linux-2.6.23.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
28292 +++ linux-2.6.23/fs/reiser4/plugin/fibration.c 2007-12-04 16:49:30.000000000 +0300
28293 @@ -0,0 +1,175 @@
28294 +/* Copyright 2004 by Hans Reiser, licensing governed by
28295 + * reiser4/README */
28296 +
28297 +/* Directory fibrations */
28298 +
28299 +/*
28300 + * Suppose we have a directory tree with sources of some project. During
28301 + * compilation .o files are created within this tree. This makes access
28302 + * to the original source files less efficient, because source files are
28303 + * now "diluted" by object files: default directory plugin uses prefix
28304 + * of a file name as a part of the key for directory entry (and this
28305 + * part is also inherited by the key of file body). This means that
28306 + * foo.o will be located close to foo.c and foo.h in the tree.
28307 + *
28308 + * To avoid this effect the directory plugin fills the highest 7 (unused
28309 + * originally) bits of the second component of the directory entry key
28310 + * by bit-pattern depending on the file name (see
28311 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
28312 + * "fibre". Fibre of the file name key is inherited by key of stat data
28313 + * and keys of file body (in the case of REISER4_LARGE_KEY).
28314 + *
28315 + * Fibre for a given file is chosen by per-directory fibration
28316 + * plugin. Names within given fibre are ordered lexicographically.
28317 + */
28318 +
28319 +#include "../debug.h"
28320 +#include "plugin_header.h"
28321 +#include "plugin.h"
28322 +#include "../super.h"
28323 +#include "../inode.h"
28324 +
28325 +#include <linux/types.h>
28326 +
28327 +static const int fibre_shift = 57;
28328 +
28329 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
28330 +
28331 +/*
28332 + * Trivial fibration: all files of directory are just ordered
28333 + * lexicographically.
28334 + */
28335 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
28336 +{
28337 + return FIBRE_NO(0);
28338 +}
28339 +
28340 +/*
28341 + * dot-o fibration: place .o files after all others.
28342 + */
28343 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
28344 +{
28345 + /* special treatment for .*\.o */
28346 + if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
28347 + return FIBRE_NO(1);
28348 + else
28349 + return FIBRE_NO(0);
28350 +}
28351 +
28352 +/*
28353 + * ext.1 fibration: subdivide directory into 128 fibrations one for each
28354 + * 7bit extension character (file "foo.h" goes into fibre "h"), plus
28355 + * default fibre for the rest.
28356 + */
28357 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
28358 +{
28359 + if (len > 2 && name[len - 2] == '.')
28360 + return FIBRE_NO(name[len - 1]);
28361 + else
28362 + return FIBRE_NO(0);
28363 +}
28364 +
28365 +/*
28366 + * ext.3 fibration: try to separate files with different 3-character
28367 + * extensions from each other.
28368 + */
28369 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
28370 +{
28371 + if (len > 4 && name[len - 4] == '.')
28372 + return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
28373 + else
28374 + return FIBRE_NO(0);
28375 +}
28376 +
28377 +static int change_fibration(struct inode *inode,
28378 + reiser4_plugin * plugin,
28379 + pset_member memb)
28380 +{
28381 + int result;
28382 +
28383 + assert("nikita-3503", inode != NULL);
28384 + assert("nikita-3504", plugin != NULL);
28385 +
28386 + assert("nikita-3505", is_reiser4_inode(inode));
28387 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
28388 + assert("nikita-3507",
28389 + plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
28390 +
28391 + result = 0;
28392 + if (inode_fibration_plugin(inode) == NULL ||
28393 + inode_fibration_plugin(inode)->h.id != plugin->h.id) {
28394 + if (is_dir_empty(inode) == 0)
28395 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
28396 + PSET_FIBRATION, plugin);
28397 + else
28398 + result = RETERR(-ENOTEMPTY);
28399 +
28400 + }
28401 + return result;
28402 +}
28403 +
28404 +static reiser4_plugin_ops fibration_plugin_ops = {
28405 + .init = NULL,
28406 + .load = NULL,
28407 + .save_len = NULL,
28408 + .save = NULL,
28409 + .change = change_fibration
28410 +};
28411 +
28412 +/* fibration plugins */
28413 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
28414 + [FIBRATION_LEXICOGRAPHIC] = {
28415 + .h = {
28416 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28417 + .id = FIBRATION_LEXICOGRAPHIC,
28418 + .pops = &fibration_plugin_ops,
28419 + .label = "lexicographic",
28420 + .desc = "no fibration",
28421 + .linkage = {NULL, NULL}
28422 + },
28423 + .fibre = fibre_trivial
28424 + },
28425 + [FIBRATION_DOT_O] = {
28426 + .h = {
28427 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28428 + .id = FIBRATION_DOT_O,
28429 + .pops = &fibration_plugin_ops,
28430 + .label = "dot-o",
28431 + .desc = "fibrate .o files separately",
28432 + .linkage = {NULL, NULL}
28433 + },
28434 + .fibre = fibre_dot_o
28435 + },
28436 + [FIBRATION_EXT_1] = {
28437 + .h = {
28438 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28439 + .id = FIBRATION_EXT_1,
28440 + .pops = &fibration_plugin_ops,
28441 + .label = "ext-1",
28442 + .desc = "fibrate file by single character extension",
28443 + .linkage = {NULL, NULL}
28444 + },
28445 + .fibre = fibre_ext_1
28446 + },
28447 + [FIBRATION_EXT_3] = {
28448 + .h = {
28449 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28450 + .id = FIBRATION_EXT_3,
28451 + .pops = &fibration_plugin_ops,
28452 + .label = "ext-3",
28453 + .desc = "fibrate file by three character extension",
28454 + .linkage = {NULL, NULL}
28455 + },
28456 + .fibre = fibre_ext_3
28457 + }
28458 +};
28459 +
28460 +/*
28461 + * Local variables:
28462 + * c-indentation-style: "K&R"
28463 + * mode-name: "LC"
28464 + * c-basic-offset: 8
28465 + * tab-width: 8
28466 + * fill-column: 79
28467 + * End:
28468 + */
28469 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/fibration.h linux-2.6.23/fs/reiser4/plugin/fibration.h
28470 --- linux-2.6.23.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
28471 +++ linux-2.6.23/fs/reiser4/plugin/fibration.h 2007-12-04 16:49:30.000000000 +0300
28472 @@ -0,0 +1,37 @@
28473 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
28474 +
28475 +/* Fibration plugin used by hashed directory plugin to segment content
28476 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
28477 +
28478 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
28479 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
28480 +
28481 +#include "plugin_header.h"
28482 +
28483 +typedef struct fibration_plugin {
28484 + /* generic fields */
28485 + plugin_header h;
28486 +
28487 + __u64(*fibre) (const struct inode * dir, const char *name, int len);
28488 +} fibration_plugin;
28489 +
28490 +typedef enum {
28491 + FIBRATION_LEXICOGRAPHIC,
28492 + FIBRATION_DOT_O,
28493 + FIBRATION_EXT_1,
28494 + FIBRATION_EXT_3,
28495 + LAST_FIBRATION_ID
28496 +} reiser4_fibration_id;
28497 +
28498 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
28499 +#endif
28500 +
28501 +/* Make Linus happy.
28502 + Local variables:
28503 + c-indentation-style: "K&R"
28504 + mode-name: "LC"
28505 + c-basic-offset: 8
28506 + tab-width: 8
28507 + fill-column: 120
28508 + End:
28509 +*/
28510 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.c
28511 --- linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
28512 +++ linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.c 2007-12-04 23:04:00.722303973 +0300
28513 @@ -0,0 +1,3778 @@
28514 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
28515 + reiser4/README */
28516 +/*
28517 + * Written by Edward Shishkin.
28518 + *
28519 + * Implementations of inode/file/address_space operations
28520 + * specific for cryptcompress file plugin which manages
28521 + * regular files built of compressed and(or) encrypted bodies.
28522 + * See http://dev.namesys.com/CryptcompressPlugin for details.
28523 + */
28524 +
28525 +#include "../../inode.h"
28526 +#include "../cluster.h"
28527 +#include "../object.h"
28528 +#include "../../tree_walk.h"
28529 +#include "cryptcompress.h"
28530 +
28531 +#include <linux/pagevec.h>
28532 +#include <asm/uaccess.h>
28533 +#include <linux/swap.h>
28534 +#include <linux/writeback.h>
28535 +#include <linux/random.h>
28536 +#include <linux/scatterlist.h>
28537 +
28538 +/*
28539 + Managing primary and secondary caches by Reiser4
28540 + cryptcompress file plugin. Synchronization scheme.
28541 +
28542 +
28543 + +------------------+
28544 + +------------------->| tfm stream |
28545 + | | (compressed data)|
28546 + flush | +------------------+
28547 + +-----------------+ |
28548 + |(->)longterm lock| V
28549 +--+ writepages() | | +-***-+ reiser4 +---+
28550 + | | +--+ | *** | storage tree | |
28551 + | | | +-***-+ (primary cache)| |
28552 +u | write() (secondary| cache) V / | \ | |
28553 +s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
28554 +e | | | |page cluster | | | **disk cluster** | | i |
28555 +r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
28556 + | read() ^ ^ | | k |
28557 + | | (->)longterm lock| | page_io()| |
28558 + | | +------+ | |
28559 +--+ readpages() | | +---+
28560 + | V
28561 + | +------------------+
28562 + +--------------------| tfm stream |
28563 + | (plain text) |
28564 + +------------------+
28565 +*/
28566 +
28567 +/* get cryptcompress specific portion of inode */
28568 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
28569 +{
28570 + return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
28571 +}
28572 +
28573 +/* plugin->u.file.init_inode_data */
28574 +void init_inode_data_cryptcompress(struct inode *inode,
28575 + reiser4_object_create_data * crd,
28576 + int create)
28577 +{
28578 + struct cryptcompress_info *data;
28579 +
28580 + data = cryptcompress_inode_data(inode);
28581 + assert("edward-685", data != NULL);
28582 +
28583 + memset(data, 0, sizeof(*data));
28584 +
28585 + mutex_init(&data->checkin_mutex);
28586 + data->trunc_index = ULONG_MAX;
28587 + turn_on_compression(data);
28588 + set_lattice_factor(data, MIN_LATTICE_FACTOR);
28589 + init_inode_ordering(inode, crd, create);
28590 +}
28591 +
28592 +/* The following is a part of reiser4 cipher key manager
28593 + which is called when opening/creating a cryptcompress file */
28594 +
28595 +/* get/set cipher key info */
28596 +struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
28597 +{
28598 + assert("edward-90", inode != NULL);
28599 + assert("edward-91", reiser4_inode_data(inode) != NULL);
28600 + return cryptcompress_inode_data(inode)->crypt;
28601 +}
28602 +
28603 +static void set_inode_crypto_info (struct inode * inode,
28604 + struct reiser4_crypto_info * info)
28605 +{
28606 + cryptcompress_inode_data(inode)->crypt = info;
28607 +}
28608 +
28609 +/* allocate a cipher key info */
28610 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
28611 +{
28612 + struct reiser4_crypto_info *info;
28613 + int fipsize;
28614 +
28615 + info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
28616 + if (!info)
28617 + return ERR_PTR(-ENOMEM);
28618 +
28619 + fipsize = inode_digest_plugin(inode)->fipsize;
28620 + info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
28621 + if (!info->keyid) {
28622 + kfree(info);
28623 + return ERR_PTR(-ENOMEM);
28624 + }
28625 + info->host = inode;
28626 + return info;
28627 +}
28628 +
28629 +#if 0
28630 +/* allocate/free low-level info for cipher and digest
28631 + transforms */
28632 +static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
28633 +{
28634 + struct crypto_blkcipher * ctfm = NULL;
28635 + struct crypto_hash * dtfm = NULL;
28636 + cipher_plugin * cplug = inode_cipher_plugin(info->host);
28637 + digest_plugin * dplug = inode_digest_plugin(info->host);
28638 +
28639 + if (cplug->alloc) {
28640 + ctfm = cplug->alloc();
28641 + if (IS_ERR(ctfm)) {
28642 + warning("edward-1364",
28643 + "Can not allocate info for %s\n",
28644 + cplug->h.desc);
28645 + return RETERR(PTR_ERR(ctfm));
28646 + }
28647 + }
28648 + info_set_cipher(info, ctfm);
28649 + if (dplug->alloc) {
28650 + dtfm = dplug->alloc();
28651 + if (IS_ERR(dtfm)) {
28652 + warning("edward-1365",
28653 + "Can not allocate info for %s\n",
28654 + dplug->h.desc);
28655 + goto unhappy_with_digest;
28656 + }
28657 + }
28658 + info_set_digest(info, dtfm);
28659 + return 0;
28660 + unhappy_with_digest:
28661 + if (cplug->free) {
28662 + cplug->free(ctfm);
28663 + info_set_cipher(info, NULL);
28664 + }
28665 + return RETERR(PTR_ERR(dtfm));
28666 +}
28667 +#endif
28668 +
28669 +static void
28670 +free_crypto_tfms(struct reiser4_crypto_info * info)
28671 +{
28672 + assert("edward-1366", info != NULL);
28673 + if (!info_get_cipher(info)) {
28674 + assert("edward-1601", !info_get_digest(info));
28675 + return;
28676 + }
28677 + inode_cipher_plugin(info->host)->free(info_get_cipher(info));
28678 + info_set_cipher(info, NULL);
28679 + inode_digest_plugin(info->host)->free(info_get_digest(info));
28680 + info_set_digest(info, NULL);
28681 + return;
28682 +}
28683 +
28684 +#if 0
28685 +/* create a key fingerprint for disk stat-data */
28686 +static int create_keyid (struct reiser4_crypto_info * info,
28687 + struct reiser4_crypto_data * data)
28688 +{
28689 + int ret = -ENOMEM;
28690 + size_t blk, pad;
28691 + __u8 * dmem;
28692 + __u8 * cmem;
28693 + struct hash_desc ddesc;
28694 + struct blkcipher_desc cdesc;
28695 + struct scatterlist sg;
28696 +
28697 + assert("edward-1367", info != NULL);
28698 + assert("edward-1368", info->keyid != NULL);
28699 +
28700 + ddesc.tfm = info_get_digest(info);
28701 + ddesc.flags = 0;
28702 + cdesc.tfm = info_get_cipher(info);
28703 + cdesc.flags = 0;
28704 +
28705 + dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
28706 + reiser4_ctx_gfp_mask_get());
28707 + if (!dmem)
28708 + goto exit1;
28709 +
28710 + blk = crypto_blkcipher_blocksize(cdesc.tfm);
28711 +
28712 + pad = data->keyid_size % blk;
28713 + pad = (pad ? blk - pad : 0);
28714 +
28715 + cmem = kmalloc((size_t)data->keyid_size + pad,
28716 + reiser4_ctx_gfp_mask_get());
28717 + if (!cmem)
28718 + goto exit2;
28719 + memcpy(cmem, data->keyid, data->keyid_size);
28720 + memset(cmem + data->keyid_size, 0, pad);
28721 +
28722 + sg_init_one(&sg, cmem, data->keyid_size + pad);
28723 +
28724 + ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
28725 + data->keyid_size + pad);
28726 + if (ret) {
28727 + warning("edward-1369",
28728 + "encryption failed flags=%x\n", cdesc.flags);
28729 + goto exit3;
28730 + }
28731 + ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
28732 + if (ret) {
28733 + warning("edward-1602",
28734 + "digest failed flags=%x\n", ddesc.flags);
28735 + goto exit3;
28736 + }
28737 + memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
28738 + exit3:
28739 + kfree(cmem);
28740 + exit2:
28741 + kfree(dmem);
28742 + exit1:
28743 + return ret;
28744 +}
28745 +#endif
28746 +
28747 +static void destroy_keyid(struct reiser4_crypto_info * info)
28748 +{
28749 + assert("edward-1370", info != NULL);
28750 + assert("edward-1371", info->keyid != NULL);
28751 + kfree(info->keyid);
28752 + return;
28753 +}
28754 +
28755 +static void __free_crypto_info (struct inode * inode)
28756 +{
28757 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
28758 + assert("edward-1372", info != NULL);
28759 +
28760 + free_crypto_tfms(info);
28761 + destroy_keyid(info);
28762 + kfree(info);
28763 +}
28764 +
28765 +#if 0
28766 +static void instantiate_crypto_info(struct reiser4_crypto_info * info)
28767 +{
28768 + assert("edward-1373", info != NULL);
28769 + assert("edward-1374", info->inst == 0);
28770 + info->inst = 1;
28771 +}
28772 +#endif
28773 +
28774 +static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
28775 +{
28776 + assert("edward-1375", info != NULL);
28777 + info->inst = 0;
28778 +}
28779 +
28780 +#if 0
28781 +static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
28782 +{
28783 + return info->inst;
28784 +}
28785 +
28786 +static int inode_has_cipher_key(struct inode * inode)
28787 +{
28788 + assert("edward-1376", inode != NULL);
28789 + return inode_crypto_info(inode) &&
28790 + is_crypto_info_instantiated(inode_crypto_info(inode));
28791 +}
28792 +#endif
28793 +
28794 +static void free_crypto_info (struct inode * inode)
28795 +{
28796 + uninstantiate_crypto_info(inode_crypto_info(inode));
28797 + __free_crypto_info(inode);
28798 +}
28799 +
28800 +static int need_cipher(struct inode * inode)
28801 +{
28802 + return inode_cipher_plugin(inode) !=
28803 + cipher_plugin_by_id(NONE_CIPHER_ID);
28804 +}
28805 +
28806 +/* Parse @data which contains a (uninstantiated) cipher key imported
28807 + from user space, create a low-level cipher info and attach it to
28808 + the @object. If success, then info contains an instantiated key */
28809 +#if 0
28810 +struct reiser4_crypto_info * create_crypto_info(struct inode * object,
28811 + struct reiser4_crypto_data * data)
28812 +{
28813 + int ret;
28814 + struct reiser4_crypto_info * info;
28815 +
28816 + assert("edward-1377", data != NULL);
28817 + assert("edward-1378", need_cipher(object));
28818 +
28819 + if (inode_file_plugin(object) !=
28820 + file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
28821 + return ERR_PTR(-EINVAL);
28822 +
28823 + info = reiser4_alloc_crypto_info(object);
28824 + if (IS_ERR(info))
28825 + return info;
28826 + ret = alloc_crypto_tfms(info);
28827 + if (ret)
28828 + goto err;
28829 + /* instantiating a key */
28830 + ret = crypto_blkcipher_setkey(info_get_cipher(info),
28831 + data->key,
28832 + data->keysize);
28833 + if (ret) {
28834 + warning("edward-1379",
28835 + "setkey failed flags=%x",
28836 + crypto_blkcipher_get_flags(info_get_cipher(info)));
28837 + goto err;
28838 + }
28839 + info->keysize = data->keysize;
28840 + ret = create_keyid(info, data);
28841 + if (ret)
28842 + goto err;
28843 + instantiate_crypto_info(info);
28844 + return info;
28845 + err:
28846 + __free_crypto_info(object);
28847 + return ERR_PTR(ret);
28848 +}
28849 +#endif
28850 +
28851 +/* increment/decrement a load counter when
28852 + attaching/detaching the crypto-stat to any object */
28853 +static void load_crypto_info(struct reiser4_crypto_info * info)
28854 +{
28855 + assert("edward-1380", info != NULL);
28856 + inc_keyload_count(info);
28857 +}
28858 +
28859 +static void unload_crypto_info(struct inode * inode)
28860 +{
28861 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
28862 + assert("edward-1381", info->keyload_count > 0);
28863 +
28864 + dec_keyload_count(inode_crypto_info(inode));
28865 + if (info->keyload_count == 0)
28866 + /* final release */
28867 + free_crypto_info(inode);
28868 +}
28869 +
28870 +/* attach/detach an existing crypto-stat */
28871 +void reiser4_attach_crypto_info(struct inode * inode,
28872 + struct reiser4_crypto_info * info)
28873 +{
28874 + assert("edward-1382", inode != NULL);
28875 + assert("edward-1383", info != NULL);
28876 + assert("edward-1384", inode_crypto_info(inode) == NULL);
28877 +
28878 + set_inode_crypto_info(inode, info);
28879 + load_crypto_info(info);
28880 +}
28881 +
28882 +/* returns true, if crypto stat can be attached to the @host */
28883 +#if REISER4_DEBUG
28884 +static int host_allows_crypto_info(struct inode * host)
28885 +{
28886 + int ret;
28887 + file_plugin * fplug = inode_file_plugin(host);
28888 +
28889 + switch (fplug->h.id) {
28890 + case CRYPTCOMPRESS_FILE_PLUGIN_ID:
28891 + ret = 1;
28892 + break;
28893 + default:
28894 + ret = 0;
28895 + }
28896 + return ret;
28897 +}
28898 +#endif /* REISER4_DEBUG */
28899 +
28900 +static void reiser4_detach_crypto_info(struct inode * inode)
28901 +{
28902 + assert("edward-1385", inode != NULL);
28903 + assert("edward-1386", host_allows_crypto_info(inode));
28904 +
28905 + if (inode_crypto_info(inode))
28906 + unload_crypto_info(inode);
28907 + set_inode_crypto_info(inode, NULL);
28908 +}
28909 +
28910 +#if 0
28911 +
28912 +/* compare fingerprints of @child and @parent */
28913 +static int keyid_eq(struct reiser4_crypto_info * child,
28914 + struct reiser4_crypto_info * parent)
28915 +{
28916 + return !memcmp(child->keyid,
28917 + parent->keyid,
28918 + info_digest_plugin(parent)->fipsize);
28919 +}
28920 +
28921 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
28922 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
28923 +{
28924 + if (!need_cipher(child))
28925 + return 0;
28926 + /* the child is created */
28927 + if (!inode_crypto_info(child))
28928 + return 1;
28929 + /* the child is looked up */
28930 + if (!inode_crypto_info(parent))
28931 + return 0;
28932 + return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
28933 + inode_digest_plugin(child) == inode_digest_plugin(parent) &&
28934 + inode_crypto_info(child)->keysize ==
28935 + inode_crypto_info(parent)->keysize &&
28936 + keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
28937 +}
28938 +#endif
28939 +
28940 +/* helper functions for ->create() method of the cryptcompress plugin */
28941 +static int inode_set_crypto(struct inode * object)
28942 +{
28943 + reiser4_inode * info;
28944 + if (!inode_crypto_info(object)) {
28945 + if (need_cipher(object))
28946 + return RETERR(-EINVAL);
28947 + /* the file is not to be encrypted */
28948 + return 0;
28949 + }
28950 + info = reiser4_inode_data(object);
28951 + info->extmask |= (1 << CRYPTO_STAT);
28952 + return 0;
28953 +}
28954 +
28955 +static int inode_init_compression(struct inode * object)
28956 +{
28957 + int result = 0;
28958 + assert("edward-1461", object != NULL);
28959 + if (inode_compression_plugin(object)->init)
28960 + result = inode_compression_plugin(object)->init();
28961 + return result;
28962 +}
28963 +
28964 +static int inode_check_cluster(struct inode * object)
28965 +{
28966 + assert("edward-696", object != NULL);
28967 +
28968 + if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
28969 + warning("edward-1320", "Can not support '%s' "
28970 + "logical clusters (less then page size)",
28971 + inode_cluster_plugin(object)->h.label);
28972 + return RETERR(-EINVAL);
28973 + }
28974 + if (unlikely(inode_cluster_shift(object)) >= BITS_PER_BYTE*sizeof(int)){
28975 + warning("edward-1463", "Can not support '%s' "
28976 + "logical clusters (too big for transform)",
28977 + inode_cluster_plugin(object)->h.label);
28978 + return RETERR(-EINVAL);
28979 + }
28980 + return 0;
28981 +}
28982 +
28983 +/* plugin->destroy_inode() */
28984 +void destroy_inode_cryptcompress(struct inode * inode)
28985 +{
28986 + assert("edward-1464", INODE_PGCOUNT(inode) == 0);
28987 + reiser4_detach_crypto_info(inode);
28988 + return;
28989 +}
28990 +
28991 +/* plugin->create_object():
28992 +. install plugins
28993 +. attach crypto info if specified
28994 +. attach compression info if specified
28995 +. attach cluster info
28996 +*/
28997 +int create_object_cryptcompress(struct inode *object, struct inode *parent,
28998 + reiser4_object_create_data * data)
28999 +{
29000 + int result;
29001 + reiser4_inode *info;
29002 +
29003 + assert("edward-23", object != NULL);
29004 + assert("edward-24", parent != NULL);
29005 + assert("edward-30", data != NULL);
29006 + assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
29007 + assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
29008 +
29009 + info = reiser4_inode_data(object);
29010 +
29011 + assert("edward-29", info != NULL);
29012 +
29013 + /* set file bit */
29014 + info->plugin_mask |= (1 << PSET_FILE);
29015 +
29016 + /* set crypto */
29017 + result = inode_set_crypto(object);
29018 + if (result)
29019 + goto error;
29020 + /* set compression */
29021 + result = inode_init_compression(object);
29022 + if (result)
29023 + goto error;
29024 + /* set cluster */
29025 + result = inode_check_cluster(object);
29026 + if (result)
29027 + goto error;
29028 +
29029 + /* save everything in disk stat-data */
29030 + result = write_sd_by_inode_common(object);
29031 + if (!result)
29032 + return 0;
29033 + error:
29034 + reiser4_detach_crypto_info(object);
29035 + return result;
29036 +}
29037 +
29038 +/* plugin->open() */
29039 +int open_cryptcompress(struct inode * inode, struct file * file)
29040 +{
29041 + return 0;
29042 +}
29043 +
29044 +/* returns a blocksize, the attribute of a cipher algorithm */
29045 +static unsigned int
29046 +cipher_blocksize(struct inode * inode)
29047 +{
29048 + assert("edward-758", need_cipher(inode));
29049 + assert("edward-1400", inode_crypto_info(inode) != NULL);
29050 + return crypto_blkcipher_blocksize
29051 + (info_get_cipher(inode_crypto_info(inode)));
29052 +}
29053 +
29054 +/* returns offset translated by scale factor of the crypto-algorithm */
29055 +static loff_t inode_scaled_offset (struct inode * inode,
29056 + const loff_t src_off /* input offset */)
29057 +{
29058 + assert("edward-97", inode != NULL);
29059 +
29060 + if (!need_cipher(inode) ||
29061 + src_off == get_key_offset(reiser4_min_key()) ||
29062 + src_off == get_key_offset(reiser4_max_key()))
29063 + return src_off;
29064 +
29065 + return inode_cipher_plugin(inode)->scale(inode,
29066 + cipher_blocksize(inode),
29067 + src_off);
29068 +}
29069 +
29070 +/* returns disk cluster size */
29071 +size_t inode_scaled_cluster_size(struct inode * inode)
29072 +{
29073 + assert("edward-110", inode != NULL);
29074 +
29075 + return inode_scaled_offset(inode, inode_cluster_size(inode));
29076 +}
29077 +
29078 +/* set number of cluster pages */
29079 +static void set_cluster_nrpages(struct cluster_handle * clust,
29080 + struct inode *inode)
29081 +{
29082 + struct reiser4_slide * win;
29083 +
29084 + assert("edward-180", clust != NULL);
29085 + assert("edward-1040", inode != NULL);
29086 +
29087 + clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
29088 + win = clust->win;
29089 + if (!win) {
29090 + clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
29091 + return;
29092 + }
29093 + assert("edward-1176", clust->op != LC_INVAL);
29094 + assert("edward-1064", win->off + win->count + win->delta != 0);
29095 +
29096 + if (win->stat == HOLE_WINDOW &&
29097 + win->off == 0 && win->count == inode_cluster_size(inode)) {
29098 + /* special case: writing a "fake" logical cluster */
29099 + clust->nr_pages = 0;
29100 + return;
29101 + }
29102 + clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
29103 + lbytes(clust->index, inode)));
29104 + return;
29105 +}
29106 +
29107 +/* plugin->key_by_inode()
29108 + build key of a disk cluster */
29109 +int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
29110 + reiser4_key * key)
29111 +{
29112 + assert("edward-64", inode != 0);
29113 +
29114 + if (likely(off != get_key_offset(reiser4_max_key())))
29115 + off = off_to_clust_to_off(off, inode);
29116 + if (inode_crypto_info(inode))
29117 + off = inode_scaled_offset(inode, off);
29118 +
29119 + key_by_inode_and_offset_common(inode, 0, key);
29120 + set_key_offset(key, (__u64)off);
29121 + return 0;
29122 +}
29123 +
29124 +/* plugin->flow_by_inode() */
29125 +/* flow is used to read/write disk clusters */
29126 +int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
29127 + int user, /* 1: @buf is of user space,
29128 + 0: kernel space */
29129 + loff_t size, /* @buf size */
29130 + loff_t off, /* offset to start io from */
29131 + rw_op op, /* READ or WRITE */
29132 + flow_t * f /* resulting flow */)
29133 +{
29134 + assert("edward-436", f != NULL);
29135 + assert("edward-149", inode != NULL);
29136 + assert("edward-150", inode_file_plugin(inode) != NULL);
29137 + assert("edward-1465", user == 0); /* we use flow to read/write
29138 + disk clusters located in
29139 + kernel space */
29140 + f->length = size;
29141 + memcpy(&f->data, &buf, sizeof(buf));
29142 + f->user = user;
29143 + f->op = op;
29144 +
29145 + return key_by_inode_cryptcompress(inode, off, &f->key);
29146 +}
29147 +
29148 +static int
29149 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
29150 + znode_lock_mode lock_mode)
29151 +{
29152 + coord_t *coord;
29153 +
29154 + assert("edward-704", hint != NULL);
29155 + assert("edward-1089", !hint_is_valid(hint));
29156 + assert("edward-706", hint->lh.owner == NULL);
29157 +
29158 + coord = &hint->ext_coord.coord;
29159 +
29160 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
29161 + /* hint either not set or set by different operation */
29162 + return RETERR(-E_REPEAT);
29163 +
29164 + if (get_key_offset(key) != hint->offset)
29165 + /* hint is set for different key */
29166 + return RETERR(-E_REPEAT);
29167 +
29168 + assert("edward-707", reiser4_schedulable());
29169 +
29170 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
29171 + key, &hint->lh, lock_mode,
29172 + ZNODE_LOCK_LOPRI);
29173 +}
29174 +
29175 +/* reserve disk space when writing a logical cluster */
29176 +static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
29177 +{
29178 + int result = 0;
29179 +
29180 + assert("edward-965", reiser4_schedulable());
29181 + assert("edward-439", inode != NULL);
29182 + assert("edward-440", clust != NULL);
29183 + assert("edward-441", clust->pages != NULL);
29184 +
29185 + if (clust->nr_pages == 0) {
29186 + assert("edward-1152", clust->win != NULL);
29187 + assert("edward-1153", clust->win->stat == HOLE_WINDOW);
29188 + /* don't reserve disk space for fake logical cluster */
29189 + return 0;
29190 + }
29191 + assert("edward-442", jprivate(clust->pages[0]) != NULL);
29192 +
29193 + result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
29194 + estimate_update_cluster(inode),
29195 + BA_CAN_COMMIT);
29196 + if (result)
29197 + return result;
29198 + clust->reserved = 1;
29199 + grabbed2cluster_reserved(estimate_insert_cluster(inode) +
29200 + estimate_update_cluster(inode));
29201 +#if REISER4_DEBUG
29202 + clust->reserved_prepped = estimate_update_cluster(inode);
29203 + clust->reserved_unprepped = estimate_insert_cluster(inode);
29204 +#endif
29205 + /* there can be space grabbed by txnmgr_force_commit_all */
29206 + return 0;
29207 +}
29208 +
29209 +/* free reserved disk space if writing a logical cluster fails */
29210 +static void free_reserved4cluster(struct inode *inode,
29211 + struct cluster_handle *ch, int count)
29212 +{
29213 + assert("edward-967", ch->reserved == 1);
29214 +
29215 + cluster_reserved2free(count);
29216 + ch->reserved = 0;
29217 +}
29218 +
29219 +/* The core search procedure of the cryptcompress plugin.
29220 + If returned value is not cbk_errored, then current znode is locked */
29221 +static int find_cluster_item(hint_t * hint,
29222 + const reiser4_key * key, /* key of the item we are
29223 + looking for */
29224 + znode_lock_mode lock_mode /* which lock */ ,
29225 + ra_info_t * ra_info, lookup_bias bias, __u32 flags)
29226 +{
29227 + int result;
29228 + reiser4_key ikey;
29229 + int went_right = 0;
29230 + coord_t *coord = &hint->ext_coord.coord;
29231 + coord_t orig = *coord;
29232 +
29233 + assert("edward-152", hint != NULL);
29234 +
29235 + if (!hint_is_valid(hint)) {
29236 + result = cryptcompress_hint_validate(hint, key, lock_mode);
29237 + if (result == -E_REPEAT)
29238 + goto traverse_tree;
29239 + else if (result) {
29240 + assert("edward-1216", 0);
29241 + return result;
29242 + }
29243 + hint_set_valid(hint);
29244 + }
29245 + assert("edward-709", znode_is_any_locked(coord->node));
29246 +
29247 + /* In-place lookup is going here, it means we just need to
29248 + check if next item of the @coord match to the @keyhint) */
29249 +
29250 + if (equal_to_rdk(coord->node, key)) {
29251 + result = goto_right_neighbor(coord, &hint->lh);
29252 + if (result == -E_NO_NEIGHBOR) {
29253 + assert("edward-1217", 0);
29254 + return RETERR(-EIO);
29255 + }
29256 + if (result)
29257 + return result;
29258 + assert("edward-1218", equal_to_ldk(coord->node, key));
29259 + went_right = 1;
29260 + } else {
29261 + coord->item_pos++;
29262 + coord->unit_pos = 0;
29263 + coord->between = AT_UNIT;
29264 + }
29265 + result = zload(coord->node);
29266 + if (result)
29267 + return result;
29268 + assert("edward-1219", !node_is_empty(coord->node));
29269 +
29270 + if (!coord_is_existing_item(coord)) {
29271 + zrelse(coord->node);
29272 + goto not_found;
29273 + }
29274 + item_key_by_coord(coord, &ikey);
29275 + zrelse(coord->node);
29276 + if (!keyeq(key, &ikey))
29277 + goto not_found;
29278 + /* Ok, item is found, update node counts */
29279 + if (went_right)
29280 + dclust_inc_extension_ncount(hint);
29281 + return CBK_COORD_FOUND;
29282 +
29283 + not_found:
29284 + assert("edward-1220", coord->item_pos > 0);
29285 + //coord->item_pos--;
29286 + /* roll back */
29287 + *coord = orig;
29288 + ON_DEBUG(coord_update_v(coord));
29289 + return CBK_COORD_NOTFOUND;
29290 +
29291 + traverse_tree:
29292 + assert("edward-713", hint->lh.owner == NULL);
29293 + assert("edward-714", reiser4_schedulable());
29294 +
29295 + reiser4_unset_hint(hint);
29296 + dclust_init_extension(hint);
29297 + coord_init_zero(coord);
29298 + result = coord_by_key(current_tree, key, coord, &hint->lh,
29299 + lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
29300 + CBK_UNIQUE | flags, ra_info);
29301 + if (cbk_errored(result))
29302 + return result;
29303 + if(result == CBK_COORD_FOUND)
29304 + dclust_inc_extension_ncount(hint);
29305 + hint_set_valid(hint);
29306 + return result;
29307 +}
29308 +
29309 +/* This function is called by deflate[inflate] manager when
29310 + creating a transformed/plain stream to check if we should
29311 + create/cut some overhead. If this returns true, then @oh
29312 + contains the size of this overhead.
29313 + */
29314 +static int need_cut_or_align(struct inode * inode,
29315 + struct cluster_handle * ch, rw_op rw, int * oh)
29316 +{
29317 + struct tfm_cluster * tc = &ch->tc;
29318 + switch (rw) {
29319 + case WRITE_OP: /* estimate align */
29320 + *oh = tc->len % cipher_blocksize(inode);
29321 + if (*oh != 0)
29322 + return 1;
29323 + break;
29324 + case READ_OP: /* estimate cut */
29325 + *oh = *(tfm_output_data(ch) + tc->len - 1);
29326 + break;
29327 + default:
29328 + impossible("edward-1401", "bad option");
29329 + }
29330 + return (tc->len != tc->lsize);
29331 +}
29332 +
29333 +/* create/cut an overhead of transformed/plain stream */
29334 +static void align_or_cut_overhead(struct inode * inode,
29335 + struct cluster_handle * ch, rw_op rw)
29336 +{
29337 + int oh;
29338 + cipher_plugin * cplug = inode_cipher_plugin(inode);
29339 +
29340 + assert("edward-1402", need_cipher(inode));
29341 +
29342 + if (!need_cut_or_align(inode, ch, rw, &oh))
29343 + return;
29344 + switch (rw) {
29345 + case WRITE_OP: /* do align */
29346 + ch->tc.len +=
29347 + cplug->align_stream(tfm_input_data(ch) +
29348 + ch->tc.len, ch->tc.len,
29349 + cipher_blocksize(inode));
29350 + *(tfm_input_data(ch) + ch->tc.len - 1) =
29351 + cipher_blocksize(inode) - oh;
29352 + break;
29353 + case READ_OP: /* do cut */
29354 + assert("edward-1403", oh <= cipher_blocksize(inode));
29355 + ch->tc.len -= oh;
29356 + break;
29357 + default:
29358 + impossible("edward-1404", "bad option");
29359 + }
29360 + return;
29361 +}
29362 +
29363 +static unsigned max_cipher_overhead(struct inode * inode)
29364 +{
29365 + if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
29366 + return 0;
29367 + return cipher_blocksize(inode);
29368 +}
29369 +
29370 +static int deflate_overhead(struct inode *inode)
29371 +{
29372 + return (inode_compression_plugin(inode)->
29373 + checksum ? DC_CHECKSUM_SIZE : 0);
29374 +}
29375 +
29376 +static unsigned deflate_overrun(struct inode * inode, int ilen)
29377 +{
29378 + return coa_overrun(inode_compression_plugin(inode), ilen);
29379 +}
29380 +
29381 +/* Estimating compressibility of a logical cluster by various
29382 + policies represented by compression mode plugin.
29383 + If this returns false, then compressor won't be called for
29384 + the cluster of index @index.
29385 +*/
29386 +static int should_compress(struct tfm_cluster * tc, cloff_t index,
29387 + struct inode *inode)
29388 +{
29389 + compression_plugin *cplug = inode_compression_plugin(inode);
29390 + compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
29391 +
29392 + assert("edward-1321", tc->len != 0);
29393 + assert("edward-1322", cplug != NULL);
29394 + assert("edward-1323", mplug != NULL);
29395 +
29396 + return /* estimate by size */
29397 + (cplug->min_size_deflate ?
29398 + tc->len >= cplug->min_size_deflate() :
29399 + 1) &&
29400 + /* estimate by compression mode plugin */
29401 + (mplug->should_deflate ?
29402 + mplug->should_deflate(inode, index) :
29403 + 1);
29404 +}
29405 +
29406 +/* Evaluating results of compression transform.
29407 + Returns true, if we need to accept this results */
29408 +static int save_compressed(int size_before, int size_after, struct inode *inode)
29409 +{
29410 + return (size_after + deflate_overhead(inode) +
29411 + max_cipher_overhead(inode) < size_before);
29412 +}
29413 +
29414 +/* Guess result of the evaluation above */
29415 +static int need_inflate(struct cluster_handle * ch, struct inode * inode,
29416 + int encrypted /* is cluster encrypted */ )
29417 +{
29418 + struct tfm_cluster * tc = &ch->tc;
29419 +
29420 + assert("edward-142", tc != 0);
29421 + assert("edward-143", inode != NULL);
29422 +
29423 + return tc->len <
29424 + (encrypted ?
29425 + inode_scaled_offset(inode, tc->lsize) :
29426 + tc->lsize);
29427 +}
29428 +
29429 +/* If results of compression were accepted, then we add
29430 + a checksum to catch possible disk cluster corruption.
29431 + The following is a format of the data stored in disk clusters:
29432 +
29433 + data This is (transformed) logical cluster.
29434 + cipher_overhead This is created by ->align() method
29435 + of cipher plugin. May be absent.
29436 + checksum (4) This is created by ->checksum method
29437 + of compression plugin to check
29438 + integrity. May be absent.
29439 +
29440 + Crypto overhead format:
29441 +
29442 + data
29443 + control_byte (1) contains aligned overhead size:
29444 + 1 <= overhead <= cipher_blksize
29445 +*/
29446 +/* Append a checksum at the end of a transformed stream */
29447 +static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29448 +{
29449 + __u32 checksum;
29450 +
29451 + assert("edward-1309", tc != NULL);
29452 + assert("edward-1310", tc->len > 0);
29453 + assert("edward-1311", cplug->checksum != NULL);
29454 +
29455 + checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
29456 + put_unaligned(cpu_to_le32(checksum),
29457 + (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
29458 + tc->len += (int)DC_CHECKSUM_SIZE;
29459 +}
29460 +
29461 +/* Check a disk cluster checksum.
29462 + Returns 0 if checksum is correct, otherwise returns 1 */
29463 +static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29464 +{
29465 + assert("edward-1312", tc != NULL);
29466 + assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
29467 + assert("edward-1314", cplug->checksum != NULL);
29468 +
29469 + if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
29470 + tc->len - (int)DC_CHECKSUM_SIZE) !=
29471 + le32_to_cpu(get_unaligned((d32 *)
29472 + (tfm_stream_data(tc, INPUT_STREAM)
29473 + + tc->len - (int)DC_CHECKSUM_SIZE)))) {
29474 + warning("edward-156",
29475 + "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
29476 + (int)le32_to_cpu
29477 + (get_unaligned((d32 *)
29478 + (tfm_stream_data(tc, INPUT_STREAM) +
29479 + tc->len - (int)DC_CHECKSUM_SIZE))),
29480 + (int)cplug->checksum
29481 + (tfm_stream_data(tc, INPUT_STREAM),
29482 + tc->len - (int)DC_CHECKSUM_SIZE));
29483 + return 1;
29484 + }
29485 + tc->len -= (int)DC_CHECKSUM_SIZE;
29486 + return 0;
29487 +}
29488 +
29489 +/* get input/output stream for some transform action */
29490 +int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
29491 + tfm_stream_id id)
29492 +{
29493 + size_t size = inode_scaled_cluster_size(inode);
29494 +
29495 + assert("edward-901", tc != NULL);
29496 + assert("edward-1027", inode_compression_plugin(inode) != NULL);
29497 +
29498 + if (cluster_get_tfm_act(tc) == TFMA_WRITE)
29499 + size += deflate_overrun(inode, inode_cluster_size(inode));
29500 +
29501 + if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
29502 + alternate_streams(tc);
29503 + if (!get_tfm_stream(tc, id))
29504 + return alloc_tfm_stream(tc, size, id);
29505 +
29506 + assert("edward-902", tfm_stream_is_set(tc, id));
29507 +
29508 + if (tfm_stream_size(tc, id) < size)
29509 + return realloc_tfm_stream(tc, size, id);
29510 + return 0;
29511 +}
29512 +
29513 +/* Common deflate manager */
29514 +int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
29515 +{
29516 + int result = 0;
29517 + int compressed = 0;
29518 + int encrypted = 0;
29519 + struct tfm_cluster * tc = &clust->tc;
29520 + compression_plugin * coplug;
29521 +
29522 + assert("edward-401", inode != NULL);
29523 + assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
29524 + assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
29525 + assert("edward-498", !tfm_cluster_is_uptodate(tc));
29526 +
29527 + coplug = inode_compression_plugin(inode);
29528 + if (should_compress(tc, clust->index, inode)) {
29529 + /* try to compress, discard bad results */
29530 + __u32 dst_len;
29531 + compression_mode_plugin * mplug =
29532 + inode_compression_mode_plugin(inode);
29533 + assert("edward-602", coplug != NULL);
29534 + assert("edward-1423", coplug->compress != NULL);
29535 +
29536 + result = grab_coa(tc, coplug);
29537 + if (result) {
29538 + warning("edward-1424",
29539 + "alloc_coa failed with ret=%d, skipped compression",
29540 + result);
29541 + goto cipher;
29542 + }
29543 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29544 + if (result) {
29545 + warning("edward-1425",
29546 + "alloc stream failed with ret=%d, skipped compression",
29547 + result);
29548 + goto cipher;
29549 + }
29550 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
29551 + coplug->compress(get_coa(tc, coplug->h.id, tc->act),
29552 + tfm_input_data(clust), tc->len,
29553 + tfm_output_data(clust), &dst_len);
29554 + /* make sure we didn't overwrite extra bytes */
29555 + assert("edward-603",
29556 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
29557 +
29558 + /* evaluate results of compression transform */
29559 + if (save_compressed(tc->len, dst_len, inode)) {
29560 + /* good result, accept */
29561 + tc->len = dst_len;
29562 + if (mplug->accept_hook != NULL) {
29563 + result = mplug->accept_hook(inode, clust->index);
29564 + if (result)
29565 + warning("edward-1426",
29566 + "accept_hook failed with ret=%d",
29567 + result);
29568 + }
29569 + compressed = 1;
29570 + }
29571 + else {
29572 + /* bad result, discard */
29573 +#if 0
29574 + if (cluster_is_complete(clust, inode))
29575 + warning("edward-1496",
29576 + "incompressible cluster %lu (inode %llu)",
29577 + clust->index,
29578 + (unsigned long long)get_inode_oid(inode));
29579 +#endif
29580 + if (mplug->discard_hook != NULL &&
29581 + cluster_is_complete(clust, inode)) {
29582 + result = mplug->discard_hook(inode,
29583 + clust->index);
29584 + if (result)
29585 + warning("edward-1427",
29586 + "discard_hook failed with ret=%d",
29587 + result);
29588 + }
29589 + }
29590 + }
29591 + cipher:
29592 + if (need_cipher(inode)) {
29593 + cipher_plugin * ciplug;
29594 + struct blkcipher_desc desc;
29595 + struct scatterlist src;
29596 + struct scatterlist dst;
29597 +
29598 + ciplug = inode_cipher_plugin(inode);
29599 + desc.tfm = info_get_cipher(inode_crypto_info(inode));
29600 + desc.flags = 0;
29601 + if (compressed)
29602 + alternate_streams(tc);
29603 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29604 + if (result)
29605 + return result;
29606 +
29607 + align_or_cut_overhead(inode, clust, WRITE_OP);
29608 + sg_init_one(&src, tfm_input_data(clust), tc->len);
29609 + sg_init_one(&dst, tfm_output_data(clust), tc->len);
29610 +
29611 + result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
29612 + if (result) {
29613 + warning("edward-1405",
29614 + "encryption failed flags=%x\n", desc.flags);
29615 + return result;
29616 + }
29617 + encrypted = 1;
29618 + }
29619 + if (compressed && coplug->checksum != NULL)
29620 + dc_set_checksum(coplug, tc);
29621 + if (!compressed && !encrypted)
29622 + alternate_streams(tc);
29623 + return result;
29624 +}
29625 +
29626 +/* Common inflate manager. */
29627 +int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
29628 +{
29629 + int result = 0;
29630 + int transformed = 0;
29631 + struct tfm_cluster * tc = &clust->tc;
29632 + compression_plugin * coplug;
29633 +
29634 + assert("edward-905", inode != NULL);
29635 + assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
29636 + assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
29637 + assert("edward-1349", tc->act == TFMA_READ);
29638 + assert("edward-907", !tfm_cluster_is_uptodate(tc));
29639 +
29640 + /* Handle a checksum (if any) */
29641 + coplug = inode_compression_plugin(inode);
29642 + if (need_inflate(clust, inode, need_cipher(inode)) &&
29643 + coplug->checksum != NULL) {
29644 + result = dc_check_checksum(coplug, tc);
29645 + if (unlikely(result)) {
29646 + warning("edward-1460",
29647 + "Inode %llu: disk cluster %lu looks corrupted",
29648 + (unsigned long long)get_inode_oid(inode),
29649 + clust->index);
29650 + return RETERR(-EIO);
29651 + }
29652 + }
29653 + if (need_cipher(inode)) {
29654 + cipher_plugin * ciplug;
29655 + struct blkcipher_desc desc;
29656 + struct scatterlist src;
29657 + struct scatterlist dst;
29658 +
29659 + ciplug = inode_cipher_plugin(inode);
29660 + desc.tfm = info_get_cipher(inode_crypto_info(inode));
29661 + desc.flags = 0;
29662 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29663 + if (result)
29664 + return result;
29665 + assert("edward-909", tfm_cluster_is_set(tc));
29666 +
29667 + sg_init_one(&src, tfm_input_data(clust), tc->len);
29668 + sg_init_one(&dst, tfm_output_data(clust), tc->len);
29669 +
29670 + result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
29671 + if (result) {
29672 + warning("edward-1600", "decrypt failed flags=%x\n",
29673 + desc.flags);
29674 + return result;
29675 + }
29676 + align_or_cut_overhead(inode, clust, READ_OP);
29677 + transformed = 1;
29678 + }
29679 + if (need_inflate(clust, inode, 0)) {
29680 + unsigned dst_len = inode_cluster_size(inode);
29681 + if(transformed)
29682 + alternate_streams(tc);
29683 +
29684 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29685 + if (result)
29686 + return result;
29687 + assert("edward-1305", coplug->decompress != NULL);
29688 + assert("edward-910", tfm_cluster_is_set(tc));
29689 +
29690 + coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
29691 + tfm_input_data(clust), tc->len,
29692 + tfm_output_data(clust), &dst_len);
29693 + /* check length */
29694 + tc->len = dst_len;
29695 + assert("edward-157", dst_len == tc->lsize);
29696 + transformed = 1;
29697 + }
29698 + if (!transformed)
29699 + alternate_streams(tc);
29700 + return result;
29701 +}
29702 +
29703 +/* This is implementation of readpage method of struct
29704 + address_space_operations for cryptcompress plugin. */
29705 +int readpage_cryptcompress(struct file *file, struct page *page)
29706 +{
29707 + reiser4_context *ctx;
29708 + struct cluster_handle clust;
29709 + item_plugin *iplug;
29710 + int result;
29711 +
29712 + assert("edward-88", PageLocked(page));
29713 + assert("vs-976", !PageUptodate(page));
29714 + assert("edward-89", page->mapping && page->mapping->host);
29715 +
29716 + ctx = reiser4_init_context(page->mapping->host->i_sb);
29717 + if (IS_ERR(ctx)) {
29718 + unlock_page(page);
29719 + return PTR_ERR(ctx);
29720 + }
29721 + assert("edward-113",
29722 + ergo(file != NULL,
29723 + page->mapping == file->f_dentry->d_inode->i_mapping));
29724 +
29725 + if (PageUptodate(page)) {
29726 + warning("edward-1338", "page is already uptodate\n");
29727 + unlock_page(page);
29728 + reiser4_exit_context(ctx);
29729 + return 0;
29730 + }
29731 + cluster_init_read(&clust, NULL);
29732 + clust.file = file;
29733 + iplug = item_plugin_by_id(CTAIL_ID);
29734 + if (!iplug->s.file.readpage) {
29735 + unlock_page(page);
29736 + put_cluster_handle(&clust);
29737 + reiser4_exit_context(ctx);
29738 + return -EINVAL;
29739 + }
29740 + result = iplug->s.file.readpage(&clust, page);
29741 +
29742 + put_cluster_handle(&clust);
29743 + reiser4_txn_restart(ctx);
29744 + reiser4_exit_context(ctx);
29745 + return result;
29746 +}
29747 +
29748 +/* number of pages to check in */
29749 +static int get_new_nrpages(struct cluster_handle * clust)
29750 +{
29751 + switch (clust->op) {
29752 + case LC_APPOV:
29753 + return clust->nr_pages;
29754 + case LC_TRUNC:
29755 + assert("edward-1179", clust->win != NULL);
29756 + return size_in_pages(clust->win->off + clust->win->count);
29757 + default:
29758 + impossible("edward-1180", "bad page cluster option");
29759 + return 0;
29760 + }
29761 +}
29762 +
29763 +static void set_cluster_pages_dirty(struct cluster_handle * clust,
29764 + struct inode * inode)
29765 +{
29766 + int i;
29767 + struct page *pg;
29768 + int nrpages = get_new_nrpages(clust);
29769 +
29770 + for (i = 0; i < nrpages; i++) {
29771 +
29772 + pg = clust->pages[i];
29773 + assert("edward-968", pg != NULL);
29774 + lock_page(pg);
29775 + assert("edward-1065", PageUptodate(pg));
29776 + reiser4_set_page_dirty_internal(pg);
29777 + unlock_page(pg);
29778 + mark_page_accessed(pg);
29779 + }
29780 +}
29781 +
29782 +/* Grab a page cluster for read/write operations.
29783 + Attach a jnode for write operations (when preparing for modifications, which
29784 + are supposed to be committed).
29785 +
29786 + We allocate only one jnode per page cluster; this jnode is binded to the
29787 + first page of this cluster, so we have an extra-reference that will be put
29788 + as soon as jnode is evicted from memory), other references will be cleaned
29789 + up in flush time (assume that check in page cluster was successful).
29790 +*/
29791 +int grab_page_cluster(struct inode * inode,
29792 + struct cluster_handle * clust, rw_op rw)
29793 +{
29794 + int i;
29795 + int result = 0;
29796 + jnode *node = NULL;
29797 +
29798 + assert("edward-182", clust != NULL);
29799 + assert("edward-183", clust->pages != NULL);
29800 + assert("edward-1466", clust->node == NULL);
29801 + assert("edward-1428", inode != NULL);
29802 + assert("edward-1429", inode->i_mapping != NULL);
29803 + assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
29804 +
29805 + if (clust->nr_pages == 0)
29806 + return 0;
29807 +
29808 + for (i = 0; i < clust->nr_pages; i++) {
29809 +
29810 + assert("edward-1044", clust->pages[i] == NULL);
29811 +
29812 + clust->pages[i] =
29813 + find_or_create_page(inode->i_mapping,
29814 + clust_to_pg(clust->index, inode) + i,
29815 + reiser4_ctx_gfp_mask_get());
29816 + if (!clust->pages[i]) {
29817 + result = RETERR(-ENOMEM);
29818 + break;
29819 + }
29820 + if (i == 0 && rw == WRITE_OP) {
29821 + node = jnode_of_page(clust->pages[i]);
29822 + if (IS_ERR(node)) {
29823 + result = PTR_ERR(node);
29824 + unlock_page(clust->pages[i]);
29825 + break;
29826 + }
29827 + JF_SET(node, JNODE_CLUSTER_PAGE);
29828 + assert("edward-920", jprivate(clust->pages[0]));
29829 + }
29830 + INODE_PGCOUNT_INC(inode);
29831 + unlock_page(clust->pages[i]);
29832 + }
29833 + if (unlikely(result)) {
29834 + while (i) {
29835 + put_cluster_page(clust->pages[--i]);
29836 + INODE_PGCOUNT_DEC(inode);
29837 + }
29838 + if (node && !IS_ERR(node))
29839 + jput(node);
29840 + return result;
29841 + }
29842 + clust->node = node;
29843 + return 0;
29844 +}
29845 +
29846 +static void truncate_page_cluster_range(struct inode * inode,
29847 + struct page ** pages,
29848 + cloff_t index,
29849 + int from, int count,
29850 + int even_cows)
29851 +{
29852 + assert("edward-1467", count > 0);
29853 + reiser4_invalidate_pages(inode->i_mapping,
29854 + clust_to_pg(index, inode) + from,
29855 + count, even_cows);
29856 +}
29857 +
29858 +/* Put @count pages starting from @from offset */
29859 +static void __put_page_cluster(int from, int count,
29860 + struct page ** pages, struct inode * inode)
29861 +{
29862 + int i;
29863 + assert("edward-1468", pages != NULL);
29864 + assert("edward-1469", inode != NULL);
29865 + assert("edward-1470", from >= 0 && count >= 0);
29866 +
29867 + for (i = 0; i < count; i++) {
29868 + assert("edward-1471", pages[from + i] != NULL);
29869 + assert("edward-1472",
29870 + pages[from + i]->index == pages[from]->index + i);
29871 +
29872 + put_cluster_page(pages[from + i]);
29873 + INODE_PGCOUNT_DEC(inode);
29874 + }
29875 +}
29876 +
29877 +/*
29878 + * This is dual to grab_page_cluster,
29879 + * however if @rw == WRITE_OP, then we call this function
29880 + * only if something is failed before checkin page cluster.
29881 + */
29882 +void put_page_cluster(struct cluster_handle * clust,
29883 + struct inode * inode, rw_op rw)
29884 +{
29885 + assert("edward-445", clust != NULL);
29886 + assert("edward-922", clust->pages != NULL);
29887 + assert("edward-446",
29888 + ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
29889 +
29890 + __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
29891 + if (rw == WRITE_OP) {
29892 + if (unlikely(clust->node)) {
29893 + assert("edward-447",
29894 + clust->node == jprivate(clust->pages[0]));
29895 + jput(clust->node);
29896 + clust->node = NULL;
29897 + }
29898 + }
29899 +}
29900 +
29901 +#if REISER4_DEBUG
29902 +int cryptcompress_inode_ok(struct inode *inode)
29903 +{
29904 + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
29905 + return 0;
29906 + if (!cluster_shift_ok(inode_cluster_shift(inode)))
29907 + return 0;
29908 + return 1;
29909 +}
29910 +
29911 +static int window_ok(struct reiser4_slide * win, struct inode *inode)
29912 +{
29913 + assert("edward-1115", win != NULL);
29914 + assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
29915 +
29916 + return (win->off != inode_cluster_size(inode)) &&
29917 + (win->off + win->count + win->delta <= inode_cluster_size(inode));
29918 +}
29919 +
29920 +static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
29921 +{
29922 + assert("edward-279", clust != NULL);
29923 +
29924 + if (!clust->pages)
29925 + return 0;
29926 + return (clust->win ? window_ok(clust->win, inode) : 1);
29927 +}
29928 +#if 0
29929 +static int pages_truncate_ok(struct inode *inode, pgoff_t start)
29930 +{
29931 + int found;
29932 + struct page * page;
29933 +
29934 + found = find_get_pages(inode->i_mapping, start, 1, &page);
29935 + if (found)
29936 + put_cluster_page(page);
29937 + return !found;
29938 +}
29939 +#else
29940 +#define pages_truncate_ok(inode, start) 1
29941 +#endif
29942 +
29943 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
29944 +{
29945 + jnode *node;
29946 + node = jlookup(current_tree, get_inode_oid(inode),
29947 + clust_to_pg(index, inode));
29948 + if (likely(!node))
29949 + return 1;
29950 + jput(node);
29951 + return 0;
29952 +}
29953 +
29954 +static int find_fake_appended(struct inode *inode, cloff_t * index);
29955 +
29956 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
29957 +{
29958 + int result;
29959 + cloff_t raidx;
29960 +
29961 + result = find_fake_appended(inode, &raidx);
29962 + return !result && (aidx == raidx);
29963 +}
29964 +#endif
29965 +
29966 +/* guess next window stat */
29967 +static inline window_stat next_window_stat(struct reiser4_slide * win)
29968 +{
29969 + assert("edward-1130", win != NULL);
29970 + return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
29971 + HOLE_WINDOW : DATA_WINDOW);
29972 +}
29973 +
29974 +/* guess and set next cluster index and window params */
29975 +static void move_update_window(struct inode * inode,
29976 + struct cluster_handle * clust,
29977 + loff_t file_off, loff_t to_file)
29978 +{
29979 + struct reiser4_slide * win;
29980 +
29981 + assert("edward-185", clust != NULL);
29982 + assert("edward-438", clust->pages != NULL);
29983 + assert("edward-281", cluster_ok(clust, inode));
29984 +
29985 + win = clust->win;
29986 + if (!win)
29987 + return;
29988 +
29989 + switch (win->stat) {
29990 + case DATA_WINDOW:
29991 + /* increment */
29992 + clust->index++;
29993 + win->stat = DATA_WINDOW;
29994 + win->off = 0;
29995 + win->count = min((loff_t)inode_cluster_size(inode), to_file);
29996 + break;
29997 + case HOLE_WINDOW:
29998 + switch (next_window_stat(win)) {
29999 + case HOLE_WINDOW:
30000 + /* skip */
30001 + clust->index = off_to_clust(file_off, inode);
30002 + win->stat = HOLE_WINDOW;
30003 + win->off = 0;
30004 + win->count = off_to_cloff(file_off, inode);
30005 + win->delta = min((loff_t)(inode_cluster_size(inode) -
30006 + win->count), to_file);
30007 + break;
30008 + case DATA_WINDOW:
30009 + /* stay */
30010 + win->stat = DATA_WINDOW;
30011 + /* off+count+delta=inv */
30012 + win->off = win->off + win->count;
30013 + win->count = win->delta;
30014 + win->delta = 0;
30015 + break;
30016 + default:
30017 + impossible("edward-282", "wrong next window state");
30018 + }
30019 + break;
30020 + default:
30021 + impossible("edward-283", "wrong current window state");
30022 + }
30023 + assert("edward-1068", cluster_ok(clust, inode));
30024 +}
30025 +
30026 +static int update_sd_cryptcompress(struct inode *inode)
30027 +{
30028 + int result = 0;
30029 +
30030 + assert("edward-978", reiser4_schedulable());
30031 +
30032 + result = reiser4_grab_space_force(/* one for stat data update */
30033 + estimate_update_common(inode),
30034 + BA_CAN_COMMIT);
30035 + if (result)
30036 + return result;
30037 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
30038 + result = reiser4_update_sd(inode);
30039 +
30040 + return result;
30041 +}
30042 +
30043 +static void uncapture_cluster_jnode(jnode * node)
30044 +{
30045 + txn_atom *atom;
30046 +
30047 + assert_spin_locked(&(node->guard));
30048 +
30049 + atom = jnode_get_atom(node);
30050 + if (atom == NULL) {
30051 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
30052 + spin_unlock_jnode(node);
30053 + return;
30054 + }
30055 + reiser4_uncapture_block(node);
30056 + spin_unlock_atom(atom);
30057 + jput(node);
30058 +}
30059 +
30060 +static void put_found_pages(struct page **pages, int nr)
30061 +{
30062 + int i;
30063 + for (i = 0; i < nr; i++) {
30064 + assert("edward-1045", pages[i] != NULL);
30065 + put_cluster_page(pages[i]);
30066 + }
30067 +}
30068 +
30069 +/* Lifecycle of a logical cluster in the system.
30070 + *
30071 + *
30072 + * Logical cluster of a cryptcompress file is represented in the system by
30073 + * . page cluster (in memory, primary cache, contains plain text);
30074 + * . disk cluster (in memory, secondary cache, contains transformed text).
30075 + * Primary cache is to reduce number of transform operations (compression,
30076 + * encryption), i.e. to implement transform-caching strategy.
30077 + * Secondary cache is to reduce number of I/O operations, i.e. for usual
30078 + * write-caching strategy. Page cluster is a set of pages, i.e. mapping of
30079 + * a logical cluster to the primary cache. Disk cluster is a set of items
30080 + * of the same type defined by some reiser4 item plugin id.
30081 + *
30082 + * 1. Performing modifications
30083 + *
30084 + * Every modification of a cryptcompress file is considered as a set of
30085 + * operations performed on file's logical clusters. Every such "atomic"
30086 + * modification is truncate, append and(or) overwrite some bytes of a
30087 + * logical cluster performed in the primary cache with the following
30088 + * synchronization with the secondary cache (in flush time). Disk clusters,
30089 + * which live in the secondary cache, are supposed to be synchronized with
30090 + * disk. The mechanism of synchronization of primary and secondary caches
30091 + * includes so-called checkin/checkout technique described below.
30092 + *
30093 + * 2. Submitting modifications
30094 + *
30095 + * Each page cluster has associated jnode (a special in-memory header to
30096 + * keep a track of transactions in reiser4), which is attached to its first
30097 + * page when grabbing page cluster for modifications (see grab_page_cluster).
30098 + * Submitting modifications (see checkin_logical_cluster) is going per logical
30099 + * cluster and includes:
30100 + * . checkin_cluster_size;
30101 + * . checkin_page_cluster.
30102 + * checkin_cluster_size() is resolved to file size update (which completely
30103 + * defines new size of logical cluster (number of file's bytes in a logical
30104 + * cluster).
30105 + * checkin_page_cluster() captures jnode of a page cluster and installs
30106 + * jnode's dirty flag (if needed) to indicate that modifications are
30107 + * successfully checked in.
30108 + *
30109 + * 3. Checking out modifications
30110 + *
30111 + * Is going per logical cluster in flush time (see checkout_logical_cluster).
30112 + * This is the time of synchronizing primary and secondary caches.
30113 + * checkout_logical_cluster() includes:
30114 + * . checkout_page_cluster (retrieving checked in pages).
30115 + * . uncapture jnode (including clear dirty flag and unlock)
30116 + *
30117 + * 4. Committing modifications
30118 + *
30119 + * Proceeding a synchronization of primary and secondary caches. When checking
30120 + * out page cluster (the phase above) pages are locked/flushed/unlocked
30121 + * one-by-one in ascending order of their indexes to contiguous stream, which
30122 + * is supposed to be transformed (compressed, encrypted), chopped up into items
30123 + * and committed to disk as a disk cluster.
30124 + *
30125 + * 5. Managing page references
30126 + *
30127 + * Every checked in page have a special additional "control" reference,
30128 + * which is dropped at checkout. We need this to avoid unexpected evicting
30129 + * pages from memory before checkout. Control references are managed so
30130 + * they are not accumulated with every checkin:
30131 + *
30132 + * 0
30133 + * checkin -> 1
30134 + * 0 -> checkout
30135 + * checkin -> 1
30136 + * checkin -> 1
30137 + * checkin -> 1
30138 + * 0 -> checkout
30139 + * ...
30140 + *
30141 + * Every page cluster has its own unique "cluster lock". Update/drop
30142 + * references are serialized via this lock. Number of checked in cluster
30143 + * pages is calculated by i_size under cluster lock. File size is updated
30144 + * at every checkin action also under cluster lock (except cases of
30145 + * appending/truncating fake logical clusters).
30146 + *
30147 + * Proof of correctness:
30148 + *
30149 + * Since we update file size under cluster lock, in the case of non-fake
30150 + * logical cluster with its lock held we do have expected number of checked
30151 + * in pages. On the other hand, append/truncate of fake logical clusters
30152 + * doesn't change number of checked in pages of any cluster.
30153 + *
30154 + * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
30155 + * Currently, I don't see any reason to create a special lock for those
30156 + * needs.
30157 + */
30158 +
30159 +static inline void lock_cluster(jnode * node)
30160 +{
30161 + spin_lock_jnode(node);
30162 +}
30163 +
30164 +static inline void unlock_cluster(jnode * node)
30165 +{
30166 + spin_unlock_jnode(node);
30167 +}
30168 +
30169 +static inline void unlock_cluster_uncapture(jnode * node)
30170 +{
30171 + uncapture_cluster_jnode(node);
30172 +}
30173 +
30174 +/* Set new file size by window. Cluster lock is required. */
30175 +static void checkin_file_size(struct cluster_handle * clust,
30176 + struct inode * inode)
30177 +{
30178 + loff_t new_size;
30179 + struct reiser4_slide * win;
30180 +
30181 + assert("edward-1181", clust != NULL);
30182 + assert("edward-1182", inode != NULL);
30183 + assert("edward-1473", clust->pages != NULL);
30184 + assert("edward-1474", clust->pages[0] != NULL);
30185 + assert("edward-1475", jprivate(clust->pages[0]) != NULL);
30186 + assert_spin_locked(&(jprivate(clust->pages[0])->guard));
30187 +
30188 +
30189 + win = clust->win;
30190 + assert("edward-1183", win != NULL);
30191 +
30192 + new_size = clust_to_off(clust->index, inode) + win->off;
30193 +
30194 + switch (clust->op) {
30195 + case LC_APPOV:
30196 + if (new_size + win->count <= i_size_read(inode))
30197 + /* overwrite only */
30198 + return;
30199 + new_size += win->count;
30200 + break;
30201 + case LC_TRUNC:
30202 + break;
30203 + default:
30204 + impossible("edward-1184", "bad page cluster option");
30205 + break;
30206 + }
30207 + inode_check_scale_nolock(inode, i_size_read(inode), new_size);
30208 + i_size_write(inode, new_size);
30209 + return;
30210 +}
30211 +
30212 +static inline void checkin_cluster_size(struct cluster_handle * clust,
30213 + struct inode * inode)
30214 +{
30215 + if (clust->win)
30216 + checkin_file_size(clust, inode);
30217 +}
30218 +
30219 +static int checkin_page_cluster(struct cluster_handle * clust,
30220 + struct inode * inode)
30221 +{
30222 + int result;
30223 + jnode * node;
30224 + int old_nrpages = clust->old_nrpages;
30225 + int new_nrpages = get_new_nrpages(clust);
30226 +
30227 + node = clust->node;
30228 +
30229 + assert("edward-221", node != NULL);
30230 + assert("edward-971", clust->reserved == 1);
30231 + assert("edward-1263",
30232 + clust->reserved_prepped == estimate_update_cluster(inode));
30233 + assert("edward-1264", clust->reserved_unprepped == 0);
30234 +
30235 + if (JF_ISSET(node, JNODE_DIRTY)) {
30236 + /*
30237 + * page cluster was checked in, but not yet
30238 + * checked out, so release related resources
30239 + */
30240 + free_reserved4cluster(inode, clust,
30241 + estimate_update_cluster(inode));
30242 + __put_page_cluster(0, clust->old_nrpages,
30243 + clust->pages, inode);
30244 + } else {
30245 + result = capture_cluster_jnode(node);
30246 + if (unlikely(result)) {
30247 + unlock_cluster(node);
30248 + return result;
30249 + }
30250 + jnode_make_dirty_locked(node);
30251 + clust->reserved = 0;
30252 + }
30253 + unlock_cluster(node);
30254 +
30255 + if (new_nrpages < old_nrpages) {
30256 + /* truncate >= 1 complete pages */
30257 + __put_page_cluster(new_nrpages,
30258 + old_nrpages - new_nrpages,
30259 + clust->pages, inode);
30260 + truncate_page_cluster_range(inode,
30261 + clust->pages, clust->index,
30262 + new_nrpages,
30263 + old_nrpages - new_nrpages,
30264 + 0);
30265 + }
30266 +#if REISER4_DEBUG
30267 + clust->reserved_prepped -= estimate_update_cluster(inode);
30268 +#endif
30269 + return 0;
30270 +}
30271 +
30272 +/* Submit modifications of a logical cluster */
30273 +static int checkin_logical_cluster(struct cluster_handle * clust,
30274 + struct inode *inode)
30275 +{
30276 + int result = 0;
30277 + jnode * node;
30278 +
30279 + node = clust->node;
30280 +
30281 + assert("edward-1035", node != NULL);
30282 + assert("edward-1029", clust != NULL);
30283 + assert("edward-1030", clust->reserved == 1);
30284 + assert("edward-1031", clust->nr_pages != 0);
30285 + assert("edward-1032", clust->pages != NULL);
30286 + assert("edward-1033", clust->pages[0] != NULL);
30287 + assert("edward-1446", jnode_is_cluster_page(node));
30288 + assert("edward-1476", node == jprivate(clust->pages[0]));
30289 +
30290 + lock_cluster(node);
30291 + checkin_cluster_size(clust, inode);
30292 + /* this will unlock cluster */
30293 + result = checkin_page_cluster(clust, inode);
30294 + jput(node);
30295 + clust->node = NULL;
30296 + return result;
30297 +}
30298 +
30299 +/*
30300 + * Retrieve size of logical cluster that was checked in at
30301 + * the latest modifying session (cluster lock is required)
30302 + */
30303 +static inline void checkout_cluster_size(struct cluster_handle * clust,
30304 + struct inode * inode)
30305 +{
30306 + struct tfm_cluster *tc = &clust->tc;
30307 +
30308 + tc->len = lbytes(clust->index, inode);
30309 + assert("edward-1478", tc->len != 0);
30310 +}
30311 +
30312 +/*
30313 + * Retrieve a page cluster with the latest submitted modifications
30314 + * and flush its pages to previously allocated contiguous stream.
30315 + */
30316 +static void checkout_page_cluster(struct cluster_handle * clust,
30317 + jnode * node, struct inode * inode)
30318 +{
30319 + int i;
30320 + int found;
30321 + int to_put;
30322 + struct tfm_cluster *tc = &clust->tc;
30323 +
30324 + /* find and put checked in pages: cluster is locked,
30325 + * so we must get expected number (to_put) of pages
30326 + */
30327 + to_put = size_in_pages(lbytes(clust->index, inode));
30328 + found = find_get_pages(inode->i_mapping,
30329 + clust_to_pg(clust->index, inode),
30330 + to_put, clust->pages);
30331 + BUG_ON(found != to_put);
30332 +
30333 + __put_page_cluster(0, to_put, clust->pages, inode);
30334 + unlock_cluster_uncapture(node);
30335 +
30336 + /* Flush found pages.
30337 + *
30338 + * Note, that we don't disable modifications while flushing,
30339 + * moreover, some found pages can be truncated, as we have
30340 + * released cluster lock.
30341 + */
30342 + for (i = 0; i < found; i++) {
30343 + int in_page;
30344 + char * data;
30345 + assert("edward-1479",
30346 + clust->pages[i]->index == clust->pages[0]->index + i);
30347 +
30348 + lock_page(clust->pages[i]);
30349 + if (!PageUptodate(clust->pages[i])) {
30350 + /* page was truncated */
30351 + assert("edward-1480",
30352 + i_size_read(inode) <= page_offset(clust->pages[i]));
30353 + assert("edward-1481",
30354 + clust->pages[i]->mapping != inode->i_mapping);
30355 + unlock_page(clust->pages[i]);
30356 + break;
30357 + }
30358 + /* Update the number of bytes in the logical cluster,
30359 + * as it could be partially truncated. Note, that only
30360 + * partial truncate is possible (complete truncate can
30361 + * not go here, as it is performed via ->kill_hook()
30362 + * called by cut_file_items(), and the last one must
30363 + * wait for znode locked with parent coord).
30364 + */
30365 + checkout_cluster_size(clust, inode);
30366 +
30367 + /* this can be zero, as new file size is
30368 + checked in before truncating pages */
30369 + in_page = __mbp(tc->len, i);
30370 +
30371 + data = kmap(clust->pages[i]);
30372 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
30373 + data, in_page);
30374 + kunmap(clust->pages[i]);
30375 +
30376 + if (PageDirty(clust->pages[i]))
30377 + cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
30378 +
30379 + unlock_page(clust->pages[i]);
30380 +
30381 + if (in_page < PAGE_CACHE_SIZE)
30382 + /* end of the file */
30383 + break;
30384 + }
30385 + put_found_pages(clust->pages, found); /* find_get_pages */
30386 + tc->lsize = tc->len;
30387 + return;
30388 +}
30389 +
30390 +/* Check out modifications of a logical cluster */
30391 +int checkout_logical_cluster(struct cluster_handle * clust,
30392 + jnode * node, struct inode *inode)
30393 +{
30394 + int result;
30395 + struct tfm_cluster *tc = &clust->tc;
30396 +
30397 + assert("edward-980", node != NULL);
30398 + assert("edward-236", inode != NULL);
30399 + assert("edward-237", clust != NULL);
30400 + assert("edward-240", !clust->win);
30401 + assert("edward-241", reiser4_schedulable());
30402 + assert("edward-718", cryptcompress_inode_ok(inode));
30403 +
30404 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
30405 + if (result) {
30406 + warning("edward-1430", "alloc stream failed with ret=%d",
30407 + result);
30408 + return RETERR(-E_REPEAT);
30409 + }
30410 + lock_cluster(node);
30411 +
30412 + if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
30413 + /* race with another flush */
30414 + warning("edward-982",
30415 + "checking out logical cluster %lu of inode %llu: "
30416 + "jnode is not dirty", clust->index,
30417 + (unsigned long long)get_inode_oid(inode));
30418 + unlock_cluster(node);
30419 + return RETERR(-E_REPEAT);
30420 + }
30421 + cluster_reserved2grabbed(estimate_update_cluster(inode));
30422 +
30423 + /* this will unlock cluster */
30424 + checkout_page_cluster(clust, node, inode);
30425 + return 0;
30426 +}
30427 +
30428 +/* set hint for the cluster of the index @index */
30429 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
30430 + cloff_t index, znode_lock_mode mode)
30431 +{
30432 + reiser4_key key;
30433 + assert("edward-722", cryptcompress_inode_ok(inode));
30434 + assert("edward-723",
30435 + inode_file_plugin(inode) ==
30436 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
30437 +
30438 + inode_file_plugin(inode)->key_by_inode(inode,
30439 + clust_to_off(index, inode),
30440 + &key);
30441 +
30442 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
30443 + hint->offset = get_key_offset(&key);
30444 + hint->mode = mode;
30445 +}
30446 +
30447 +void invalidate_hint_cluster(struct cluster_handle * clust)
30448 +{
30449 + assert("edward-1291", clust != NULL);
30450 + assert("edward-1292", clust->hint != NULL);
30451 +
30452 + done_lh(&clust->hint->lh);
30453 + hint_clr_valid(clust->hint);
30454 +}
30455 +
30456 +static void put_hint_cluster(struct cluster_handle * clust,
30457 + struct inode *inode, znode_lock_mode mode)
30458 +{
30459 + assert("edward-1286", clust != NULL);
30460 + assert("edward-1287", clust->hint != NULL);
30461 +
30462 + set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
30463 + invalidate_hint_cluster(clust);
30464 +}
30465 +
30466 +static int balance_dirty_page_cluster(struct cluster_handle * clust,
30467 + struct inode *inode, loff_t off,
30468 + loff_t to_file)
30469 +{
30470 + int result;
30471 + struct cryptcompress_info * info;
30472 +
30473 + assert("edward-724", inode != NULL);
30474 + assert("edward-725", cryptcompress_inode_ok(inode));
30475 +
30476 + /* set next window params */
30477 + move_update_window(inode, clust, off, to_file);
30478 +
30479 + result = update_sd_cryptcompress(inode);
30480 + if (result)
30481 + return result;
30482 + assert("edward-726", clust->hint->lh.owner == NULL);
30483 + info = cryptcompress_inode_data(inode);
30484 +
30485 + mutex_unlock(&info->checkin_mutex);
30486 + reiser4_throttle_write(inode);
30487 + mutex_lock(&info->checkin_mutex);
30488 + return 0;
30489 +}
30490 +
30491 +/* set zeroes to the page cluster, proceed it, and maybe, try to capture
30492 + its pages */
30493 +static int write_hole(struct inode *inode, struct cluster_handle * clust,
30494 + loff_t file_off, loff_t to_file)
30495 +{
30496 + int result = 0;
30497 + unsigned cl_off, cl_count = 0;
30498 + unsigned to_pg, pg_off;
30499 + struct reiser4_slide * win;
30500 +
30501 + assert("edward-190", clust != NULL);
30502 + assert("edward-1069", clust->win != NULL);
30503 + assert("edward-191", inode != NULL);
30504 + assert("edward-727", cryptcompress_inode_ok(inode));
30505 + assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
30506 + assert("edward-1154",
30507 + ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
30508 +
30509 + win = clust->win;
30510 +
30511 + assert("edward-1070", win != NULL);
30512 + assert("edward-201", win->stat == HOLE_WINDOW);
30513 + assert("edward-192", cluster_ok(clust, inode));
30514 +
30515 + if (win->off == 0 && win->count == inode_cluster_size(inode)) {
30516 + /* This part of the hole will be represented by "fake"
30517 + * logical cluster, i.e. which doesn't have appropriate
30518 + * disk cluster until someone modify this logical cluster
30519 + * and make it dirty.
30520 + * So go forward here..
30521 + */
30522 + move_update_window(inode, clust, file_off, to_file);
30523 + return 0;
30524 + }
30525 + cl_count = win->count; /* number of zeroes to write */
30526 + cl_off = win->off;
30527 + pg_off = off_to_pgoff(win->off);
30528 +
30529 + while (cl_count) {
30530 + struct page *page;
30531 + page = clust->pages[off_to_pg(cl_off)];
30532 +
30533 + assert("edward-284", page != NULL);
30534 +
30535 + to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
30536 + lock_page(page);
30537 + zero_user_page(page, pg_off, to_pg, KM_USER0);
30538 + SetPageUptodate(page);
30539 + reiser4_set_page_dirty_internal(page);
30540 + mark_page_accessed(page);
30541 + unlock_page(page);
30542 +
30543 + cl_off += to_pg;
30544 + cl_count -= to_pg;
30545 + pg_off = 0;
30546 + }
30547 + if (!win->delta) {
30548 + /* only zeroes in this window, try to capture
30549 + */
30550 + result = checkin_logical_cluster(clust, inode);
30551 + if (result)
30552 + return result;
30553 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
30554 + result =
30555 + balance_dirty_page_cluster(clust, inode, file_off, to_file);
30556 + } else
30557 + move_update_window(inode, clust, file_off, to_file);
30558 + return result;
30559 +}
30560 +
30561 +/*
30562 + The main disk search procedure for cryptcompress plugin, which
30563 + . scans all items of disk cluster with the lock mode @mode
30564 + . maybe reads each one (if @read)
30565 + . maybe makes its znode dirty (if write lock mode was specified)
30566 +
30567 + NOTE-EDWARD: Callers should handle the case when disk cluster
30568 + is incomplete (-EIO)
30569 +*/
30570 +int find_disk_cluster(struct cluster_handle * clust,
30571 + struct inode *inode, int read, znode_lock_mode mode)
30572 +{
30573 + flow_t f;
30574 + hint_t *hint;
30575 + int result = 0;
30576 + int was_grabbed;
30577 + ra_info_t ra_info;
30578 + file_plugin *fplug;
30579 + item_plugin *iplug;
30580 + struct tfm_cluster *tc;
30581 + struct cryptcompress_info * info;
30582 +
30583 + assert("edward-138", clust != NULL);
30584 + assert("edward-728", clust->hint != NULL);
30585 + assert("edward-226", reiser4_schedulable());
30586 + assert("edward-137", inode != NULL);
30587 + assert("edward-729", cryptcompress_inode_ok(inode));
30588 +
30589 + hint = clust->hint;
30590 + fplug = inode_file_plugin(inode);
30591 + was_grabbed = get_current_context()->grabbed_blocks;
30592 + info = cryptcompress_inode_data(inode);
30593 + tc = &clust->tc;
30594 +
30595 + assert("edward-462", !tfm_cluster_is_uptodate(tc));
30596 + assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
30597 +
30598 + dclust_init_extension(hint);
30599 +
30600 + /* set key of the first disk cluster item */
30601 + fplug->flow_by_inode(inode,
30602 + (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
30603 + 0 /* kernel space */ ,
30604 + inode_scaled_cluster_size(inode),
30605 + clust_to_off(clust->index, inode), READ_OP, &f);
30606 + if (mode == ZNODE_WRITE_LOCK) {
30607 + /* reserve for flush to make dirty all the leaf nodes
30608 + which contain disk cluster */
30609 + result =
30610 + reiser4_grab_space_force(estimate_dirty_cluster(inode),
30611 + BA_CAN_COMMIT);
30612 + if (result)
30613 + goto out;
30614 + }
30615 +
30616 + ra_info.key_to_stop = f.key;
30617 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30618 +
30619 + while (f.length) {
30620 + result = find_cluster_item(hint, &f.key, mode,
30621 + NULL, FIND_EXACT,
30622 + (mode == ZNODE_WRITE_LOCK ?
30623 + CBK_FOR_INSERT : 0));
30624 + switch (result) {
30625 + case CBK_COORD_NOTFOUND:
30626 + result = 0;
30627 + if (inode_scaled_offset
30628 + (inode, clust_to_off(clust->index, inode)) ==
30629 + get_key_offset(&f.key)) {
30630 + /* first item not found, this is treated
30631 + as disk cluster is absent */
30632 + clust->dstat = FAKE_DISK_CLUSTER;
30633 + goto out;
30634 + }
30635 + /* we are outside the cluster, stop search here */
30636 + assert("edward-146",
30637 + f.length != inode_scaled_cluster_size(inode));
30638 + goto ok;
30639 + case CBK_COORD_FOUND:
30640 + assert("edward-148",
30641 + hint->ext_coord.coord.between == AT_UNIT);
30642 + assert("edward-460",
30643 + hint->ext_coord.coord.unit_pos == 0);
30644 +
30645 + coord_clear_iplug(&hint->ext_coord.coord);
30646 + result = zload_ra(hint->ext_coord.coord.node, &ra_info);
30647 + if (unlikely(result))
30648 + goto out;
30649 + iplug = item_plugin_by_coord(&hint->ext_coord.coord);
30650 + assert("edward-147",
30651 + item_id_by_coord(&hint->ext_coord.coord) ==
30652 + CTAIL_ID);
30653 +
30654 + result = iplug->s.file.read(NULL, &f, hint);
30655 + if (result) {
30656 + zrelse(hint->ext_coord.coord.node);
30657 + goto out;
30658 + }
30659 + if (mode == ZNODE_WRITE_LOCK) {
30660 + /* Don't make dirty more nodes then it was
30661 + estimated (see comments before
30662 + estimate_dirty_cluster). Missed nodes will be
30663 + read up in flush time if they are evicted from
30664 + memory */
30665 + if (dclust_get_extension_ncount(hint) <=
30666 + estimate_dirty_cluster(inode))
30667 + znode_make_dirty(hint->ext_coord.coord.node);
30668 +
30669 + znode_set_convertible(hint->ext_coord.coord.
30670 + node);
30671 + }
30672 + zrelse(hint->ext_coord.coord.node);
30673 + break;
30674 + default:
30675 + goto out;
30676 + }
30677 + }
30678 + ok:
30679 + /* at least one item was found */
30680 + /* NOTE-EDWARD: Callers should handle the case
30681 + when disk cluster is incomplete (-EIO) */
30682 + tc->len = inode_scaled_cluster_size(inode) - f.length;
30683 + tc->lsize = lbytes(clust->index, inode);
30684 + assert("edward-1196", tc->len > 0);
30685 + assert("edward-1406", tc->lsize > 0);
30686 +
30687 + if (hint_is_unprepped_dclust(clust->hint)) {
30688 + clust->dstat = UNPR_DISK_CLUSTER;
30689 + } else if (clust->index == info->trunc_index) {
30690 + clust->dstat = TRNC_DISK_CLUSTER;
30691 + } else {
30692 + clust->dstat = PREP_DISK_CLUSTER;
30693 + dclust_set_extension_dsize(clust->hint, tc->len);
30694 + }
30695 + out:
30696 + assert("edward-1339",
30697 + get_current_context()->grabbed_blocks >= was_grabbed);
30698 + grabbed2free(get_current_context(),
30699 + get_current_super_private(),
30700 + get_current_context()->grabbed_blocks - was_grabbed);
30701 + return result;
30702 +}
30703 +
30704 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
30705 + znode_lock_mode lock_mode)
30706 +{
30707 + reiser4_key key;
30708 + ra_info_t ra_info;
30709 +
30710 + assert("edward-730", reiser4_schedulable());
30711 + assert("edward-731", clust != NULL);
30712 + assert("edward-732", inode != NULL);
30713 +
30714 + if (hint_is_valid(clust->hint)) {
30715 + assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
30716 + assert("edward-1294",
30717 + znode_is_write_locked(clust->hint->lh.node));
30718 + /* already have a valid locked position */
30719 + return (clust->dstat ==
30720 + FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
30721 + CBK_COORD_FOUND);
30722 + }
30723 + key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
30724 + &key);
30725 + ra_info.key_to_stop = key;
30726 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30727 +
30728 + return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
30729 + CBK_FOR_INSERT);
30730 +}
30731 +
30732 +/* Read needed cluster pages before modifying.
30733 + If success, @clust->hint contains locked position in the tree.
30734 + Also:
30735 + . find and set disk cluster state
30736 + . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
30737 +*/
30738 +static int read_some_cluster_pages(struct inode * inode,
30739 + struct cluster_handle * clust)
30740 +{
30741 + int i;
30742 + int result = 0;
30743 + item_plugin *iplug;
30744 + struct reiser4_slide * win = clust->win;
30745 + znode_lock_mode mode = ZNODE_WRITE_LOCK;
30746 +
30747 + iplug = item_plugin_by_id(CTAIL_ID);
30748 +
30749 + assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
30750 +
30751 +#if REISER4_DEBUG
30752 + if (clust->nr_pages == 0) {
30753 + /* start write hole from fake disk cluster */
30754 + assert("edward-1117", win != NULL);
30755 + assert("edward-1118", win->stat == HOLE_WINDOW);
30756 + assert("edward-1119", new_logical_cluster(clust, inode));
30757 + }
30758 +#endif
30759 + if (new_logical_cluster(clust, inode)) {
30760 + /*
30761 + new page cluster is about to be written, nothing to read,
30762 + */
30763 + assert("edward-734", reiser4_schedulable());
30764 + assert("edward-735", clust->hint->lh.owner == NULL);
30765 +
30766 + if (clust->nr_pages) {
30767 + int off;
30768 + struct page * pg;
30769 + assert("edward-1419", clust->pages != NULL);
30770 + pg = clust->pages[clust->nr_pages - 1];
30771 + assert("edward-1420", pg != NULL);
30772 + off = off_to_pgoff(win->off+win->count+win->delta);
30773 + if (off) {
30774 + lock_page(pg);
30775 + zero_user_page(pg, off, PAGE_CACHE_SIZE - off,
30776 + KM_USER0);
30777 + unlock_page(pg);
30778 + }
30779 + }
30780 + clust->dstat = FAKE_DISK_CLUSTER;
30781 + return 0;
30782 + }
30783 + /*
30784 + Here we should search for disk cluster to figure out its real state.
30785 + Also there is one more important reason to do disk search: we need
30786 + to make disk cluster _dirty_ if it exists
30787 + */
30788 +
30789 + /* if windows is specified, read the only pages
30790 + that will be modified partially */
30791 +
30792 + for (i = 0; i < clust->nr_pages; i++) {
30793 + struct page *pg = clust->pages[i];
30794 +
30795 + lock_page(pg);
30796 + if (PageUptodate(pg)) {
30797 + unlock_page(pg);
30798 + continue;
30799 + }
30800 + unlock_page(pg);
30801 +
30802 + if (win &&
30803 + i >= size_in_pages(win->off) &&
30804 + i < off_to_pg(win->off + win->count + win->delta))
30805 + /* page will be completely overwritten */
30806 + continue;
30807 +
30808 + if (win && (i == clust->nr_pages - 1) &&
30809 + /* the last page is
30810 + partially modified,
30811 + not uptodate .. */
30812 + (size_in_pages(i_size_read(inode)) <= pg->index)) {
30813 + /* .. and appended,
30814 + so set zeroes to the rest */
30815 + int offset;
30816 + lock_page(pg);
30817 + assert("edward-1260",
30818 + size_in_pages(win->off + win->count +
30819 + win->delta) - 1 == i);
30820 +
30821 + offset =
30822 + off_to_pgoff(win->off + win->count + win->delta);
30823 + zero_user_page(pg, offset, PAGE_CACHE_SIZE - offset,
30824 + KM_USER0);
30825 + unlock_page(pg);
30826 + /* still not uptodate */
30827 + break;
30828 + }
30829 + lock_page(pg);
30830 + result = do_readpage_ctail(inode, clust, pg, mode);
30831 +
30832 + assert("edward-1526", ergo(!result, PageUptodate(pg)));
30833 + unlock_page(pg);
30834 + if (result) {
30835 + warning("edward-219", "do_readpage_ctail failed");
30836 + goto out;
30837 + }
30838 + }
30839 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
30840 + /* disk cluster unclaimed, but we need to make its znodes dirty
30841 + * to make flush update convert its content
30842 + */
30843 + result = find_disk_cluster(clust, inode,
30844 + 0 /* do not read items */,
30845 + mode);
30846 + }
30847 + out:
30848 + tfm_cluster_clr_uptodate(&clust->tc);
30849 + return result;
30850 +}
30851 +
30852 +static int should_create_unprepped_cluster(struct cluster_handle * clust,
30853 + struct inode * inode)
30854 +{
30855 + assert("edward-737", clust != NULL);
30856 +
30857 + switch (clust->dstat) {
30858 + case PREP_DISK_CLUSTER:
30859 + case UNPR_DISK_CLUSTER:
30860 + return 0;
30861 + case FAKE_DISK_CLUSTER:
30862 + if (clust->win &&
30863 + clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
30864 + assert("edward-1172",
30865 + new_logical_cluster(clust, inode));
30866 + return 0;
30867 + }
30868 + return 1;
30869 + default:
30870 + impossible("edward-1173", "bad disk cluster state");
30871 + return 0;
30872 + }
30873 +}
30874 +
30875 +static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
30876 + struct inode *inode)
30877 +{
30878 + int result;
30879 +
30880 + assert("edward-1123", reiser4_schedulable());
30881 + assert("edward-737", clust != NULL);
30882 + assert("edward-738", inode != NULL);
30883 + assert("edward-739", cryptcompress_inode_ok(inode));
30884 + assert("edward-1053", clust->hint != NULL);
30885 +
30886 + if (!should_create_unprepped_cluster(clust, inode)) {
30887 + if (clust->reserved) {
30888 + cluster_reserved2free(estimate_insert_cluster(inode));
30889 +#if REISER4_DEBUG
30890 + assert("edward-1267",
30891 + clust->reserved_unprepped ==
30892 + estimate_insert_cluster(inode));
30893 + clust->reserved_unprepped -=
30894 + estimate_insert_cluster(inode);
30895 +#endif
30896 + }
30897 + return 0;
30898 + }
30899 + assert("edward-1268", clust->reserved);
30900 + cluster_reserved2grabbed(estimate_insert_cluster(inode));
30901 +#if REISER4_DEBUG
30902 + assert("edward-1441",
30903 + clust->reserved_unprepped == estimate_insert_cluster(inode));
30904 + clust->reserved_unprepped -= estimate_insert_cluster(inode);
30905 +#endif
30906 + result = ctail_insert_unprepped_cluster(clust, inode);
30907 + if (result)
30908 + return result;
30909 +
30910 + inode_add_bytes(inode, inode_cluster_size(inode));
30911 +
30912 + assert("edward-743", cryptcompress_inode_ok(inode));
30913 + assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
30914 +
30915 + clust->dstat = UNPR_DISK_CLUSTER;
30916 + return 0;
30917 +}
30918 +
30919 +/* . Grab page cluster for read, write, setattr, etc. operations;
30920 + * . Truncate its complete pages, if needed;
30921 + */
30922 +int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
30923 + rw_op rw)
30924 +{
30925 + assert("edward-177", inode != NULL);
30926 + assert("edward-741", cryptcompress_inode_ok(inode));
30927 + assert("edward-740", clust->pages != NULL);
30928 +
30929 + set_cluster_nrpages(clust, inode);
30930 + reset_cluster_pgset(clust, cluster_nrpages(inode));
30931 + return grab_page_cluster(inode, clust, rw);
30932 +}
30933 +
30934 +/* Truncate complete page cluster of index @index.
30935 + * This is called by ->kill_hook() method of item
30936 + * plugin when deleting a disk cluster of such index.
30937 + */
30938 +void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
30939 + int even_cows)
30940 +{
30941 + int found;
30942 + int nr_pages;
30943 + jnode *node;
30944 + struct page *pages[MAX_CLUSTER_NRPAGES];
30945 +
30946 + node = jlookup(current_tree, get_inode_oid(inode),
30947 + clust_to_pg(index, inode));
30948 + nr_pages = size_in_pages(lbytes(index, inode));
30949 + assert("edward-1483", nr_pages != 0);
30950 + if (!node)
30951 + goto truncate;
30952 + found = find_get_pages(inode->i_mapping,
30953 + clust_to_pg(index, inode),
30954 + cluster_nrpages(inode), pages);
30955 + if (!found) {
30956 + assert("edward-1484", jnode_truncate_ok(inode, index));
30957 + return;
30958 + }
30959 + lock_cluster(node);
30960 +
30961 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
30962 + && index == 0)
30963 + /* converting to unix_file is in progress */
30964 + JF_CLR(node, JNODE_CLUSTER_PAGE);
30965 + if (JF_ISSET(node, JNODE_DIRTY)) {
30966 + /*
30967 + * @nr_pages were checked in, but not yet checked out -
30968 + * we need to release them. (also there can be pages
30969 + * attached to page cache by read(), etc. - don't take
30970 + * them into account).
30971 + */
30972 + assert("edward-1198", found >= nr_pages);
30973 +
30974 + /* free disk space grabbed for disk cluster converting */
30975 + cluster_reserved2grabbed(estimate_update_cluster(inode));
30976 + grabbed2free(get_current_context(),
30977 + get_current_super_private(),
30978 + estimate_update_cluster(inode));
30979 + __put_page_cluster(0, nr_pages, pages, inode);
30980 +
30981 + /* This will clear dirty bit, uncapture and unlock jnode */
30982 + unlock_cluster_uncapture(node);
30983 + } else
30984 + unlock_cluster(node);
30985 + jput(node); /* jlookup */
30986 + put_found_pages(pages, found); /* find_get_pages */
30987 + truncate:
30988 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
30989 + index == 0)
30990 + return;
30991 + truncate_page_cluster_range(inode, pages, index, 0,
30992 + cluster_nrpages(inode),
30993 + even_cows);
30994 + assert("edward-1201",
30995 + ergo(!reiser4_inode_get_flag(inode,
30996 + REISER4_FILE_CONV_IN_PROGRESS),
30997 + jnode_truncate_ok(inode, index)));
30998 + return;
30999 +}
31000 +
31001 +/*
31002 + * Set cluster handle @clust of a logical cluster before
31003 + * modifications which are supposed to be committed.
31004 + *
31005 + * . grab cluster pages;
31006 + * . reserve disk space;
31007 + * . maybe read pages from disk and set the disk cluster dirty;
31008 + * . maybe write hole and check in (partially zeroed) logical cluster;
31009 + * . create 'unprepped' disk cluster for new or fake logical one.
31010 + */
31011 +static int prepare_logical_cluster(struct inode *inode,
31012 + loff_t file_off, /* write position
31013 + in the file */
31014 + loff_t to_file, /* bytes of users data
31015 + to write to the file */
31016 + struct cluster_handle * clust,
31017 + logical_cluster_op op)
31018 +{
31019 + int result = 0;
31020 + struct reiser4_slide * win = clust->win;
31021 +
31022 + reset_cluster_params(clust);
31023 + cluster_set_tfm_act(&clust->tc, TFMA_READ);
31024 +#if REISER4_DEBUG
31025 + clust->ctx = get_current_context();
31026 +#endif
31027 + assert("edward-1190", op != LC_INVAL);
31028 +
31029 + clust->op = op;
31030 +
31031 + result = prepare_page_cluster(inode, clust, WRITE_OP);
31032 + if (result)
31033 + return result;
31034 + assert("edward-1447",
31035 + ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
31036 + assert("edward-1448",
31037 + ergo(clust->nr_pages != 0,
31038 + jnode_is_cluster_page(jprivate(clust->pages[0]))));
31039 +
31040 + result = reserve4cluster(inode, clust);
31041 + if (result)
31042 + goto err1;
31043 + result = read_some_cluster_pages(inode, clust);
31044 + if (result) {
31045 + free_reserved4cluster(inode,
31046 + clust,
31047 + estimate_update_cluster(inode) +
31048 + estimate_insert_cluster(inode));
31049 + goto err1;
31050 + }
31051 + assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
31052 +
31053 + result = cryptcompress_make_unprepped_cluster(clust, inode);
31054 + if (result)
31055 + goto err2;
31056 + if (win && win->stat == HOLE_WINDOW) {
31057 + result = write_hole(inode, clust, file_off, to_file);
31058 + if (result)
31059 + goto err2;
31060 + }
31061 + return 0;
31062 + err2:
31063 + free_reserved4cluster(inode, clust,
31064 + estimate_update_cluster(inode));
31065 + err1:
31066 + put_page_cluster(clust, inode, WRITE_OP);
31067 + assert("edward-1125", result == -ENOSPC);
31068 + return result;
31069 +}
31070 +
31071 +/* set window by two offsets */
31072 +static void set_window(struct cluster_handle * clust,
31073 + struct reiser4_slide * win, struct inode *inode,
31074 + loff_t o1, loff_t o2)
31075 +{
31076 + assert("edward-295", clust != NULL);
31077 + assert("edward-296", inode != NULL);
31078 + assert("edward-1071", win != NULL);
31079 + assert("edward-297", o1 <= o2);
31080 +
31081 + clust->index = off_to_clust(o1, inode);
31082 +
31083 + win->off = off_to_cloff(o1, inode);
31084 + win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
31085 + o2 - o1);
31086 + win->delta = 0;
31087 +
31088 + clust->win = win;
31089 +}
31090 +
31091 +static int set_cluster_by_window(struct inode *inode,
31092 + struct cluster_handle * clust,
31093 + struct reiser4_slide * win, size_t length,
31094 + loff_t file_off)
31095 +{
31096 + int result;
31097 +
31098 + assert("edward-197", clust != NULL);
31099 + assert("edward-1072", win != NULL);
31100 + assert("edward-198", inode != NULL);
31101 +
31102 + result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
31103 + if (result)
31104 + return result;
31105 +
31106 + if (file_off > i_size_read(inode)) {
31107 + /* Uhmm, hole in cryptcompress file... */
31108 + loff_t hole_size;
31109 + hole_size = file_off - inode->i_size;
31110 +
31111 + set_window(clust, win, inode, inode->i_size, file_off);
31112 + win->stat = HOLE_WINDOW;
31113 + if (win->off + hole_size < inode_cluster_size(inode))
31114 + /* there is also user's data to append to the hole */
31115 + win->delta = min(inode_cluster_size(inode) -
31116 + (win->off + win->count), length);
31117 + return 0;
31118 + }
31119 + set_window(clust, win, inode, file_off, file_off + length);
31120 + win->stat = DATA_WINDOW;
31121 + return 0;
31122 +}
31123 +
31124 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
31125 + int count)
31126 +{
31127 + int result = 0;
31128 + int (*setting_actor)(struct cluster_handle * clust, int count);
31129 +
31130 + assert("edward-1358", clust != NULL);
31131 + assert("edward-1359", page != NULL);
31132 + assert("edward-1360", page->mapping != NULL);
31133 + assert("edward-1361", page->mapping->host != NULL);
31134 +
31135 + setting_actor =
31136 + (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
31137 + result = setting_actor(clust, count);
31138 + clust->index = pg_to_clust(page->index, page->mapping->host);
31139 + return result;
31140 +}
31141 +
31142 +/* reset all the params that not get updated */
31143 +void reset_cluster_params(struct cluster_handle * clust)
31144 +{
31145 + assert("edward-197", clust != NULL);
31146 +
31147 + clust->dstat = INVAL_DISK_CLUSTER;
31148 + clust->tc.uptodate = 0;
31149 + clust->tc.len = 0;
31150 +}
31151 +
31152 +/* the heart of write_cryptcompress */
31153 +static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
31154 + const char __user *buf, size_t to_write,
31155 + loff_t pos, int *conv_occured)
31156 +{
31157 + int i;
31158 + hint_t *hint;
31159 + int result = 0;
31160 + size_t count;
31161 + struct reiser4_slide win;
31162 + struct cluster_handle clust;
31163 + struct cryptcompress_info * info;
31164 +
31165 + assert("edward-161", reiser4_schedulable());
31166 + assert("edward-748", cryptcompress_inode_ok(inode));
31167 + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
31168 + assert("edward-1274", get_current_context()->grabbed_blocks == 0);
31169 +
31170 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31171 + if (hint == NULL)
31172 + return RETERR(-ENOMEM);
31173 +
31174 + result = load_file_hint(file, hint);
31175 + if (result) {
31176 + kfree(hint);
31177 + return result;
31178 + }
31179 + count = to_write;
31180 +
31181 + reiser4_slide_init(&win);
31182 + cluster_init_read(&clust, &win);
31183 + clust.hint = hint;
31184 + info = cryptcompress_inode_data(inode);
31185 +
31186 + mutex_lock(&info->checkin_mutex);
31187 +
31188 + result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
31189 + if (result)
31190 + goto out;
31191 +
31192 + if (next_window_stat(&win) == HOLE_WINDOW) {
31193 + /* write hole in this iteration
31194 + separated from the loop below */
31195 + result = write_conversion_hook(file, inode,
31196 + pos,
31197 + &clust,
31198 + NULL);
31199 + if (result)
31200 + goto out;
31201 + result = prepare_logical_cluster(inode, pos, count, &clust,
31202 + LC_APPOV);
31203 + if (result)
31204 + goto out;
31205 + }
31206 + do {
31207 + const char __user * src;
31208 + unsigned page_off, to_page;
31209 +
31210 + assert("edward-750", reiser4_schedulable());
31211 +
31212 + result = write_conversion_hook(file, inode,
31213 + pos + to_write - count,
31214 + &clust,
31215 + conv_occured);
31216 + if (result || *conv_occured)
31217 + goto out;
31218 + result = prepare_logical_cluster(inode, pos, count, &clust,
31219 + LC_APPOV);
31220 + if (result)
31221 + goto out;
31222 +
31223 + assert("edward-751", cryptcompress_inode_ok(inode));
31224 + assert("edward-204", win.stat == DATA_WINDOW);
31225 + assert("edward-1288", hint_is_valid(clust.hint));
31226 + assert("edward-752",
31227 + znode_is_write_locked(hint->ext_coord.coord.node));
31228 + put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
31229 +
31230 + /* set write position in page */
31231 + page_off = off_to_pgoff(win.off);
31232 +
31233 + /* copy user's data to cluster pages */
31234 + for (i = off_to_pg(win.off), src = buf;
31235 + i < size_in_pages(win.off + win.count);
31236 + i++, src += to_page) {
31237 + to_page = __mbp(win.off + win.count, i) - page_off;
31238 + assert("edward-1039",
31239 + page_off + to_page <= PAGE_CACHE_SIZE);
31240 + assert("edward-287", clust.pages[i] != NULL);
31241 +
31242 + fault_in_pages_readable(src, to_page);
31243 +
31244 + lock_page(clust.pages[i]);
31245 + result =
31246 + __copy_from_user((char *)kmap(clust.pages[i]) +
31247 + page_off, src, to_page);
31248 + kunmap(clust.pages[i]);
31249 + if (unlikely(result)) {
31250 + unlock_page(clust.pages[i]);
31251 + result = -EFAULT;
31252 + goto err2;
31253 + }
31254 + SetPageUptodate(clust.pages[i]);
31255 + reiser4_set_page_dirty_internal(clust.pages[i]);
31256 + flush_dcache_page(clust.pages[i]);
31257 + mark_page_accessed(clust.pages[i]);
31258 + unlock_page(clust.pages[i]);
31259 + page_off = 0;
31260 + }
31261 + assert("edward-753", cryptcompress_inode_ok(inode));
31262 +
31263 + result = checkin_logical_cluster(&clust, inode);
31264 + if (result)
31265 + goto err2;
31266 +
31267 + buf += win.count;
31268 + count -= win.count;
31269 +
31270 + result = balance_dirty_page_cluster(&clust, inode, 0, count);
31271 + if (result)
31272 + goto err1;
31273 + assert("edward-755", hint->lh.owner == NULL);
31274 + reset_cluster_params(&clust);
31275 + continue;
31276 + err2:
31277 + put_page_cluster(&clust, inode, WRITE_OP);
31278 + err1:
31279 + if (clust.reserved)
31280 + free_reserved4cluster(inode,
31281 + &clust,
31282 + estimate_update_cluster(inode));
31283 + break;
31284 + } while (count);
31285 + out:
31286 + /*
31287 + * NOTE: at this point file may have
31288 + * another (unix-file) plugin installed
31289 + */
31290 + done_lh(&hint->lh);
31291 + if (result == -EEXIST)
31292 + warning("edward-1407", "write returns EEXIST!\n");
31293 +
31294 + put_cluster_handle(&clust);
31295 + save_file_hint(file, hint);
31296 + kfree(hint);
31297 + /*
31298 + * don't release cryptcompress-specific
31299 + * checkin_mutex, if conversion occured
31300 + */
31301 + if (*conv_occured == 0)
31302 + mutex_unlock(&info->checkin_mutex);
31303 + if (buf) {
31304 + /* if nothing were written - there must be an error */
31305 + assert("edward-195", ergo((to_write == count),
31306 + (result < 0 || *conv_occured)));
31307 + return (to_write - count) ? (to_write - count) : result;
31308 + }
31309 + return result;
31310 +}
31311 +
31312 +/**
31313 + * plugin->write()
31314 + * @file: file to write to
31315 + * @buf: address of user-space buffer
31316 + * @read_amount: number of bytes to write
31317 + * @off: position in file to write to
31318 + */
31319 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
31320 + size_t count, loff_t *off, int *conv)
31321 +{
31322 + ssize_t result;
31323 + struct inode *inode;
31324 + reiser4_context *ctx;
31325 + loff_t pos = *off;
31326 + struct cryptcompress_info *info;
31327 +
31328 + assert("edward-1449", *conv == 0);
31329 +
31330 + inode = file->f_dentry->d_inode;
31331 + assert("edward-196", cryptcompress_inode_ok(inode));
31332 +
31333 + info = cryptcompress_inode_data(inode);
31334 +
31335 + ctx = reiser4_init_context(inode->i_sb);
31336 + if (IS_ERR(ctx))
31337 + return PTR_ERR(ctx);
31338 +
31339 + mutex_lock(&inode->i_mutex);
31340 +
31341 + result = generic_write_checks(file, &pos, &count, 0);
31342 + if (unlikely(result != 0))
31343 + goto out;
31344 + if (unlikely(count == 0))
31345 + goto out;
31346 + result = remove_suid(file->f_dentry);
31347 + if (unlikely(result != 0))
31348 + goto out;
31349 + /* remove_suid might create a transaction */
31350 + reiser4_txn_restart(ctx);
31351 +
31352 + result = do_write_cryptcompress(file, inode, buf, count, pos, conv);
31353 +
31354 + if (result < 0)
31355 + goto out;
31356 + /* update position in a file */
31357 + *off = pos + result;
31358 + out:
31359 + mutex_unlock(&inode->i_mutex);
31360 +
31361 + context_set_commit_async(ctx);
31362 + reiser4_exit_context(ctx);
31363 + return result;
31364 +}
31365 +
31366 +/* plugin->readpages */
31367 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
31368 + struct list_head *pages, unsigned nr_pages)
31369 +{
31370 + reiser4_context * ctx;
31371 + int ret;
31372 +
31373 + ctx = reiser4_init_context(mapping->host->i_sb);
31374 + if (IS_ERR(ctx)) {
31375 + ret = PTR_ERR(ctx);
31376 + goto err;
31377 + }
31378 + /* cryptcompress file can be built of ctail items only */
31379 + ret = readpages_ctail(file, mapping, pages);
31380 + reiser4_txn_restart(ctx);
31381 + reiser4_exit_context(ctx);
31382 + if (ret) {
31383 +err:
31384 + put_pages_list(pages);
31385 + }
31386 + return ret;
31387 +}
31388 +
31389 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
31390 +{
31391 + /* reserve one block to update stat data item */
31392 + assert("edward-1193",
31393 + inode_file_plugin(inode)->estimate.update ==
31394 + estimate_update_common);
31395 + return estimate_update_common(inode);
31396 +}
31397 +
31398 +/**
31399 + * plugin->read
31400 + * @file: file to read from
31401 + * @buf: address of user-space buffer
31402 + * @read_amount: number of bytes to read
31403 + * @off: position in file to read from
31404 + */
31405 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
31406 + loff_t * off)
31407 +{
31408 + ssize_t result;
31409 + struct inode *inode;
31410 + reiser4_context *ctx;
31411 + struct cryptcompress_info *info;
31412 + reiser4_block_nr needed;
31413 +
31414 + inode = file->f_dentry->d_inode;
31415 + assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
31416 +
31417 + ctx = reiser4_init_context(inode->i_sb);
31418 + if (IS_ERR(ctx))
31419 + return PTR_ERR(ctx);
31420 +
31421 + info = cryptcompress_inode_data(inode);
31422 + needed = cryptcompress_estimate_read(inode);
31423 +
31424 + result = reiser4_grab_space(needed, BA_CAN_COMMIT);
31425 + if (result != 0) {
31426 + reiser4_exit_context(ctx);
31427 + return result;
31428 + }
31429 + result = do_sync_read(file, buf, size, off);
31430 +
31431 + context_set_commit_async(ctx);
31432 + reiser4_exit_context(ctx);
31433 +
31434 + return result;
31435 +}
31436 +
31437 +/* Look for a disk cluster and keep lookup result in @found.
31438 + * If @index > 0, then find disk cluster of the index (@index - 1);
31439 + * If @index == 0, then find the rightmost disk cluster.
31440 + * Keep incremented index of the found disk cluster in @found.
31441 + * @found == 0 means that disk cluster was not found (in the last
31442 + * case (@index == 0) it means that file doesn't have disk clusters).
31443 + */
31444 +static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
31445 + cloff_t index)
31446 +{
31447 + int result;
31448 + reiser4_key key;
31449 + loff_t offset;
31450 + hint_t *hint;
31451 + lock_handle *lh;
31452 + lookup_bias bias;
31453 + coord_t *coord;
31454 + item_plugin *iplug;
31455 +
31456 + assert("edward-1131", inode != NULL);
31457 + assert("edward-95", cryptcompress_inode_ok(inode));
31458 +
31459 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31460 + if (hint == NULL)
31461 + return RETERR(-ENOMEM);
31462 + hint_init_zero(hint);
31463 + lh = &hint->lh;
31464 +
31465 + bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
31466 + offset =
31467 + (index ? clust_to_off(index, inode) -
31468 + 1 : get_key_offset(reiser4_max_key()));
31469 +
31470 + key_by_inode_cryptcompress(inode, offset, &key);
31471 +
31472 + /* find the last item of this object */
31473 + result =
31474 + find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
31475 + bias, 0);
31476 + if (cbk_errored(result)) {
31477 + done_lh(lh);
31478 + kfree(hint);
31479 + return result;
31480 + }
31481 + if (result == CBK_COORD_NOTFOUND) {
31482 + /* no real disk clusters */
31483 + done_lh(lh);
31484 + kfree(hint);
31485 + *found = 0;
31486 + return 0;
31487 + }
31488 + /* disk cluster is found */
31489 + coord = &hint->ext_coord.coord;
31490 + coord_clear_iplug(coord);
31491 + result = zload(coord->node);
31492 + if (unlikely(result)) {
31493 + done_lh(lh);
31494 + kfree(hint);
31495 + return result;
31496 + }
31497 + iplug = item_plugin_by_coord(coord);
31498 + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
31499 + assert("edward-1202", ctail_ok(coord));
31500 +
31501 + item_key_by_coord(coord, &key);
31502 + *found = off_to_clust(get_key_offset(&key), inode) + 1;
31503 +
31504 + assert("edward-1132", ergo(index, index == *found));
31505 +
31506 + zrelse(coord->node);
31507 + done_lh(lh);
31508 + kfree(hint);
31509 + return 0;
31510 +}
31511 +
31512 +static int find_fake_appended(struct inode *inode, cloff_t * index)
31513 +{
31514 + return lookup_disk_cluster(inode, index,
31515 + 0 /* find last real one */ );
31516 +}
31517 +
31518 +/* Set left coord when unit is not found after node_lookup()
31519 + This takes into account that there can be holes in a sequence
31520 + of disk clusters */
31521 +
31522 +static void adjust_left_coord(coord_t * left_coord)
31523 +{
31524 + switch (left_coord->between) {
31525 + case AFTER_UNIT:
31526 + left_coord->between = AFTER_ITEM;
31527 + case AFTER_ITEM:
31528 + case BEFORE_UNIT:
31529 + break;
31530 + default:
31531 + impossible("edward-1204", "bad left coord to cut");
31532 + }
31533 + return;
31534 +}
31535 +
31536 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
31537 +
31538 +/* plugin->cut_tree_worker */
31539 +int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
31540 + const reiser4_key * to_key,
31541 + reiser4_key * smallest_removed,
31542 + struct inode *object, int truncate,
31543 + int *progress)
31544 +{
31545 + lock_handle next_node_lock;
31546 + coord_t left_coord;
31547 + int result;
31548 +
31549 + assert("edward-1158", tap->coord->node != NULL);
31550 + assert("edward-1159", znode_is_write_locked(tap->coord->node));
31551 + assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
31552 +
31553 + *progress = 0;
31554 + init_lh(&next_node_lock);
31555 +
31556 + while (1) {
31557 + znode *node; /* node from which items are cut */
31558 + node_plugin *nplug; /* node plugin for @node */
31559 +
31560 + node = tap->coord->node;
31561 +
31562 + /* Move next_node_lock to the next node on the left. */
31563 + result =
31564 + reiser4_get_left_neighbor(&next_node_lock, node,
31565 + ZNODE_WRITE_LOCK,
31566 + GN_CAN_USE_UPPER_LEVELS);
31567 + if (result != 0 && result != -E_NO_NEIGHBOR)
31568 + break;
31569 + /* FIXME-EDWARD: Check can we delete the node as a whole. */
31570 + result = reiser4_tap_load(tap);
31571 + if (result)
31572 + return result;
31573 +
31574 + /* Prepare the second (right) point for cut_node() */
31575 + if (*progress)
31576 + coord_init_last_unit(tap->coord, node);
31577 +
31578 + else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
31579 + /* set rightmost unit for the items without lookup method */
31580 + tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
31581 +
31582 + nplug = node->nplug;
31583 +
31584 + assert("edward-1161", nplug);
31585 + assert("edward-1162", nplug->lookup);
31586 +
31587 + /* left_coord is leftmost unit cut from @node */
31588 + result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
31589 +
31590 + if (IS_CBKERR(result))
31591 + break;
31592 +
31593 + if (result == CBK_COORD_NOTFOUND)
31594 + adjust_left_coord(&left_coord);
31595 +
31596 + /* adjust coordinates so that they are set to existing units */
31597 + if (coord_set_to_right(&left_coord)
31598 + || coord_set_to_left(tap->coord)) {
31599 + result = 0;
31600 + break;
31601 + }
31602 +
31603 + if (coord_compare(&left_coord, tap->coord) ==
31604 + COORD_CMP_ON_RIGHT) {
31605 + /* keys from @from_key to @to_key are not in the tree */
31606 + result = 0;
31607 + break;
31608 + }
31609 +
31610 + /* cut data from one node */
31611 + *smallest_removed = *reiser4_min_key();
31612 + result = kill_node_content(&left_coord,
31613 + tap->coord,
31614 + from_key,
31615 + to_key,
31616 + smallest_removed,
31617 + next_node_lock.node,
31618 + object, truncate);
31619 + reiser4_tap_relse(tap);
31620 +
31621 + if (result)
31622 + break;
31623 +
31624 + ++(*progress);
31625 +
31626 + /* Check whether all items with keys >= from_key were removed
31627 + * from the tree. */
31628 + if (keyle(smallest_removed, from_key))
31629 + /* result = 0; */
31630 + break;
31631 +
31632 + if (next_node_lock.node == NULL)
31633 + break;
31634 +
31635 + result = reiser4_tap_move(tap, &next_node_lock);
31636 + done_lh(&next_node_lock);
31637 + if (result)
31638 + break;
31639 +
31640 + /* Break long cut_tree operation (deletion of a large file) if
31641 + * atom requires commit. */
31642 + if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
31643 + && current_atom_should_commit()) {
31644 + result = -E_REPEAT;
31645 + break;
31646 + }
31647 + }
31648 + done_lh(&next_node_lock);
31649 + return result;
31650 +}
31651 +
31652 +/* Append or expand hole in two steps:
31653 + * 1) set zeroes to the rightmost page of the rightmost non-fake
31654 + * logical cluster;
31655 + * 2) expand hole via fake logical clusters (just increase i_size)
31656 + */
31657 +static int cryptcompress_append_hole(struct inode *inode /* with old size */,
31658 + loff_t new_size)
31659 +{
31660 + int result = 0;
31661 + hint_t *hint;
31662 + lock_handle *lh;
31663 + loff_t hole_size;
31664 + int nr_zeroes;
31665 + struct reiser4_slide win;
31666 + struct cluster_handle clust;
31667 +
31668 + assert("edward-1133", inode->i_size < new_size);
31669 + assert("edward-1134", reiser4_schedulable());
31670 + assert("edward-1135", cryptcompress_inode_ok(inode));
31671 + assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
31672 + assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
31673 +
31674 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31675 + if (hint == NULL)
31676 + return RETERR(-ENOMEM);
31677 + hint_init_zero(hint);
31678 + lh = &hint->lh;
31679 +
31680 + reiser4_slide_init(&win);
31681 + cluster_init_read(&clust, &win);
31682 + clust.hint = hint;
31683 +
31684 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31685 + if (result)
31686 + goto out;
31687 + if (off_to_cloff(inode->i_size, inode) == 0)
31688 + goto append_fake;
31689 + hole_size = new_size - inode->i_size;
31690 + nr_zeroes =
31691 + inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
31692 + if (hole_size < nr_zeroes)
31693 + nr_zeroes = hole_size;
31694 + set_window(&clust, &win, inode, inode->i_size,
31695 + inode->i_size + nr_zeroes);
31696 + win.stat = HOLE_WINDOW;
31697 +
31698 + assert("edward-1137",
31699 + clust.index == off_to_clust(inode->i_size, inode));
31700 +
31701 + result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
31702 +
31703 + assert("edward-1271", !result || result == -ENOSPC);
31704 + if (result)
31705 + goto out;
31706 + assert("edward-1139",
31707 + clust.dstat == PREP_DISK_CLUSTER ||
31708 + clust.dstat == UNPR_DISK_CLUSTER);
31709 +
31710 + assert("edward-1431", hole_size >= nr_zeroes);
31711 + if (hole_size == nr_zeroes)
31712 + /* nothing to append anymore */
31713 + goto out;
31714 + append_fake:
31715 + INODE_SET_SIZE(inode, new_size);
31716 + out:
31717 + done_lh(lh);
31718 + kfree(hint);
31719 + put_cluster_handle(&clust);
31720 + return result;
31721 +}
31722 +
31723 +static int
31724 +update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
31725 +{
31726 + return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
31727 + ? 0 : reiser4_update_file_size(inode, key, update_sd));
31728 +}
31729 +
31730 +/* Prune cryptcompress file in two steps:
31731 + * 1) cut all nominated logical clusters except the leftmost one which
31732 + * is to be partially truncated. Note, that there can be "holes"
31733 + * represented by fake logical clusters.
31734 + * 2) set zeroes and capture leftmost partially truncated logical
31735 + * cluster, if it is not fake; otherwise prune fake logical cluster
31736 + * (just decrease i_size).
31737 + */
31738 +static int prune_cryptcompress(struct inode *inode, loff_t new_size,
31739 + int update_sd, cloff_t aidx)
31740 +{
31741 + int result = 0;
31742 + unsigned nr_zeroes;
31743 + loff_t to_prune;
31744 + loff_t old_size;
31745 + cloff_t ridx;
31746 +
31747 + hint_t *hint;
31748 + lock_handle *lh;
31749 + struct reiser4_slide win;
31750 + struct cluster_handle clust;
31751 +
31752 + assert("edward-1140", inode->i_size >= new_size);
31753 + assert("edward-1141", reiser4_schedulable());
31754 + assert("edward-1142", cryptcompress_inode_ok(inode));
31755 + assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
31756 +
31757 + old_size = inode->i_size;
31758 +
31759 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31760 + if (hint == NULL)
31761 + return RETERR(-ENOMEM);
31762 + hint_init_zero(hint);
31763 + lh = &hint->lh;
31764 +
31765 + reiser4_slide_init(&win);
31766 + cluster_init_read(&clust, &win);
31767 + clust.hint = hint;
31768 +
31769 + /* calculate index of the rightmost logical cluster
31770 + that will be completely truncated */
31771 + ridx = size_in_lc(new_size, inode);
31772 +
31773 + /* truncate all disk clusters starting from @ridx */
31774 + assert("edward-1174", ridx <= aidx);
31775 + old_size = inode->i_size;
31776 + if (ridx != aidx) {
31777 + struct cryptcompress_info * info;
31778 + info = cryptcompress_inode_data(inode);
31779 + result = cut_file_items(inode,
31780 + clust_to_off(ridx, inode),
31781 + update_sd,
31782 + clust_to_off(aidx, inode),
31783 + update_cryptcompress_size);
31784 + info->trunc_index = ULONG_MAX;
31785 + if (result)
31786 + goto out;
31787 + }
31788 + /*
31789 + * there can be pages of fake logical clusters, truncate them
31790 + */
31791 + truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
31792 + assert("edward-1524",
31793 + pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
31794 + /*
31795 + * now perform partial truncate of last logical cluster
31796 + */
31797 + if (!off_to_cloff(new_size, inode)) {
31798 + /* no partial truncate is needed */
31799 + assert("edward-1145", inode->i_size == new_size);
31800 + goto truncate_fake;
31801 + }
31802 + assert("edward-1146", new_size < inode->i_size);
31803 +
31804 + to_prune = inode->i_size - new_size;
31805 +
31806 + /* check if the last logical cluster is fake */
31807 + result = lookup_disk_cluster(inode, &aidx, ridx);
31808 + if (result)
31809 + goto out;
31810 + if (!aidx)
31811 + /* yup, this is fake one */
31812 + goto truncate_fake;
31813 +
31814 + assert("edward-1148", aidx == ridx);
31815 +
31816 + /* do partial truncate of the last page cluster,
31817 + and try to capture this one */
31818 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31819 + if (result)
31820 + goto out;
31821 + nr_zeroes = (off_to_pgoff(new_size) ?
31822 + PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
31823 + set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
31824 + win.stat = HOLE_WINDOW;
31825 +
31826 + assert("edward-1149", clust.index == ridx - 1);
31827 +
31828 + result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
31829 + if (result)
31830 + goto out;
31831 + assert("edward-1151",
31832 + clust.dstat == PREP_DISK_CLUSTER ||
31833 + clust.dstat == UNPR_DISK_CLUSTER);
31834 +
31835 + assert("edward-1191", inode->i_size == new_size);
31836 + assert("edward-1206", body_truncate_ok(inode, ridx));
31837 + truncate_fake:
31838 + /* drop all the pages that don't have jnodes (i.e. pages
31839 + which can not be truncated by cut_file_items() because
31840 + of holes represented by fake disk clusters) including
31841 + the pages of partially truncated cluster which was
31842 + released by prepare_logical_cluster() */
31843 + INODE_SET_SIZE(inode, new_size);
31844 + truncate_inode_pages(inode->i_mapping, new_size);
31845 + out:
31846 + assert("edward-1334", !result || result == -ENOSPC);
31847 + assert("edward-1497",
31848 + pages_truncate_ok(inode, size_in_pages(new_size)));
31849 +
31850 + done_lh(lh);
31851 + kfree(hint);
31852 + put_cluster_handle(&clust);
31853 + return result;
31854 +}
31855 +
31856 +/* Prepare cryptcompress file for truncate:
31857 + * prune or append rightmost fake logical clusters (if any)
31858 + */
31859 +static int start_truncate_fake(struct inode *inode, cloff_t aidx,
31860 + loff_t new_size, int update_sd)
31861 +{
31862 + int result = 0;
31863 + int bytes;
31864 +
31865 + if (new_size > inode->i_size) {
31866 + /* append */
31867 + if (inode->i_size < clust_to_off(aidx, inode))
31868 + /* no fake bytes */
31869 + return 0;
31870 + bytes = new_size - inode->i_size;
31871 + INODE_SET_SIZE(inode, inode->i_size + bytes);
31872 + } else {
31873 + /* prune */
31874 + if (inode->i_size <= clust_to_off(aidx, inode))
31875 + /* no fake bytes */
31876 + return 0;
31877 + bytes = inode->i_size -
31878 + max(new_size, clust_to_off(aidx, inode));
31879 + if (!bytes)
31880 + return 0;
31881 + INODE_SET_SIZE(inode, inode->i_size - bytes);
31882 + /* In the case of fake prune we need to drop page cluster.
31883 + There are only 2 cases for partially truncated page:
31884 + 1. If is is dirty, therefore it is anonymous
31885 + (was dirtied via mmap), and will be captured
31886 + later via ->capture().
31887 + 2. If is clean, therefore it is filled by zeroes.
31888 + In both cases we don't need to make it dirty and
31889 + capture here.
31890 + */
31891 + truncate_inode_pages(inode->i_mapping, inode->i_size);
31892 + }
31893 + if (update_sd)
31894 + result = update_sd_cryptcompress(inode);
31895 + return result;
31896 +}
31897 +
31898 +/**
31899 + * This is called in setattr_cryptcompress when it is used to truncate,
31900 + * and in delete_object_cryptcompress
31901 + */
31902 +static int cryptcompress_truncate(struct inode *inode, /* old size */
31903 + loff_t new_size, /* new size */
31904 + int update_sd)
31905 +{
31906 + int result;
31907 + cloff_t aidx;
31908 +
31909 + result = find_fake_appended(inode, &aidx);
31910 + if (result)
31911 + return result;
31912 + assert("edward-1208",
31913 + ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
31914 +
31915 + result = start_truncate_fake(inode, aidx, new_size, update_sd);
31916 + if (result)
31917 + return result;
31918 + if (inode->i_size == new_size)
31919 + /* nothing to truncate anymore */
31920 + return 0;
31921 + result = (inode->i_size < new_size ?
31922 + cryptcompress_append_hole(inode, new_size) :
31923 + prune_cryptcompress(inode, new_size, update_sd, aidx));
31924 + if (!result && update_sd)
31925 + result = update_sd_cryptcompress(inode);
31926 + return result;
31927 +}
31928 +
31929 +/* Capture an anonymous pager cluster. (Page cluser is
31930 + * anonymous if it contains at least one anonymous page
31931 + */
31932 +static int capture_anon_page_cluster(struct cluster_handle * clust,
31933 + struct inode * inode)
31934 +{
31935 + int result;
31936 +
31937 + assert("edward-1073", clust != NULL);
31938 + assert("edward-1074", inode != NULL);
31939 + assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
31940 +
31941 + result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
31942 + if (result)
31943 + return result;
31944 + set_cluster_pages_dirty(clust, inode);
31945 + result = checkin_logical_cluster(clust, inode);
31946 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
31947 + if (unlikely(result))
31948 + put_page_cluster(clust, inode, WRITE_OP);
31949 + return result;
31950 +}
31951 +
31952 +/* Starting from @index find tagged pages of the same page cluster.
31953 + * Clear the tag for each of them. Return number of found pages.
31954 + */
31955 +static int find_anon_page_cluster(struct address_space * mapping,
31956 + pgoff_t * index, struct page ** pages)
31957 +{
31958 + int i = 0;
31959 + int found;
31960 + write_lock_irq(&mapping->tree_lock);
31961 + do {
31962 + /* looking for one page */
31963 + found = radix_tree_gang_lookup_tag(&mapping->page_tree,
31964 + (void **)&pages[i],
31965 + *index, 1,
31966 + PAGECACHE_TAG_REISER4_MOVED);
31967 + if (!found)
31968 + break;
31969 + if (!same_page_cluster(pages[0], pages[i]))
31970 + break;
31971 +
31972 + /* found */
31973 + page_cache_get(pages[i]);
31974 + *index = pages[i]->index + 1;
31975 +
31976 + radix_tree_tag_clear(&mapping->page_tree,
31977 + pages[i]->index,
31978 + PAGECACHE_TAG_REISER4_MOVED);
31979 + if (last_page_in_cluster(pages[i++]))
31980 + break;
31981 + } while (1);
31982 + write_unlock_irq(&mapping->tree_lock);
31983 + return i;
31984 +}
31985 +
31986 +#define MAX_PAGES_TO_CAPTURE (1024)
31987 +
31988 +/* Capture anonymous page clusters */
31989 +static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
31990 + int to_capture)
31991 +{
31992 + int count = 0;
31993 + int found = 0;
31994 + int result = 0;
31995 + hint_t *hint;
31996 + lock_handle *lh;
31997 + struct inode * inode;
31998 + struct cluster_handle clust;
31999 + struct page * pages[MAX_CLUSTER_NRPAGES];
32000 +
32001 + assert("edward-1127", mapping != NULL);
32002 + assert("edward-1128", mapping->host != NULL);
32003 + assert("edward-1440", mapping->host->i_mapping == mapping);
32004 +
32005 + inode = mapping->host;
32006 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32007 + if (hint == NULL)
32008 + return RETERR(-ENOMEM);
32009 + hint_init_zero(hint);
32010 + lh = &hint->lh;
32011 +
32012 + cluster_init_read(&clust, NULL);
32013 + clust.hint = hint;
32014 +
32015 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32016 + if (result)
32017 + goto out;
32018 +
32019 + while (to_capture > 0) {
32020 + found = find_anon_page_cluster(mapping, index, pages);
32021 + if (!found) {
32022 + *index = (pgoff_t) - 1;
32023 + break;
32024 + }
32025 + move_cluster_forward(&clust, inode, pages[0]->index);
32026 + result = capture_anon_page_cluster(&clust, inode);
32027 +
32028 + put_found_pages(pages, found); /* find_anon_page_cluster */
32029 + if (result)
32030 + break;
32031 + to_capture -= clust.nr_pages;
32032 + count += clust.nr_pages;
32033 + }
32034 + if (result) {
32035 + warning("edward-1077",
32036 + "Capture failed (inode %llu, result=%i, captured=%d)\n",
32037 + (unsigned long long)get_inode_oid(inode), result, count);
32038 + } else {
32039 + assert("edward-1078", ergo(found > 0, count > 0));
32040 + if (to_capture <= 0)
32041 + /* there may be left more pages */
32042 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
32043 + result = count;
32044 + }
32045 + out:
32046 + done_lh(lh);
32047 + kfree(hint);
32048 + put_cluster_handle(&clust);
32049 + return result;
32050 +}
32051 +
32052 +/* Returns true if inode's mapping has dirty pages
32053 + which do not belong to any atom */
32054 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
32055 +{
32056 + int result;
32057 + read_lock_irq(&inode->i_mapping->tree_lock);
32058 + result = radix_tree_tagged(&inode->i_mapping->page_tree,
32059 + PAGECACHE_TAG_REISER4_MOVED);
32060 + read_unlock_irq(&inode->i_mapping->tree_lock);
32061 + return result;
32062 +}
32063 +
32064 +/* plugin->writepages */
32065 +int writepages_cryptcompress(struct address_space *mapping,
32066 + struct writeback_control *wbc)
32067 +{
32068 + int result = 0;
32069 + long to_capture;
32070 + pgoff_t nrpages;
32071 + pgoff_t index = 0;
32072 + struct inode *inode;
32073 + struct cryptcompress_info *info;
32074 +
32075 + inode = mapping->host;
32076 + if (!cryptcompress_inode_has_anon_pages(inode))
32077 + goto end;
32078 + info = cryptcompress_inode_data(inode);
32079 + nrpages = size_in_pages(i_size_read(inode));
32080 +
32081 + if (wbc->sync_mode != WB_SYNC_ALL)
32082 + to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
32083 + else
32084 + to_capture = MAX_PAGES_TO_CAPTURE;
32085 + do {
32086 + reiser4_context *ctx;
32087 +
32088 + ctx = reiser4_init_context(inode->i_sb);
32089 + if (IS_ERR(ctx)) {
32090 + result = PTR_ERR(ctx);
32091 + break;
32092 + }
32093 + /* avoid recursive calls to ->sync_inodes */
32094 + ctx->nobalance = 1;
32095 +
32096 + assert("edward-1079",
32097 + lock_stack_isclean(get_current_lock_stack()));
32098 +
32099 + reiser4_txn_restart_current();
32100 +
32101 + if (get_current_context()->entd) {
32102 + if (mutex_trylock(&info->checkin_mutex) == 0) {
32103 + /* the mutex might be occupied by
32104 + entd caller */
32105 + result = RETERR(-EBUSY);
32106 + reiser4_exit_context(ctx);
32107 + break;
32108 + }
32109 + } else
32110 + mutex_lock(&info->checkin_mutex);
32111 +
32112 + result = capture_anon_pages(inode->i_mapping, &index,
32113 + to_capture);
32114 + mutex_unlock(&info->checkin_mutex);
32115 +
32116 + if (result < 0) {
32117 + reiser4_exit_context(ctx);
32118 + break;
32119 + }
32120 + wbc->nr_to_write -= result;
32121 + if (wbc->sync_mode != WB_SYNC_ALL) {
32122 + reiser4_exit_context(ctx);
32123 + break;
32124 + }
32125 + result = txnmgr_force_commit_all(inode->i_sb, 0);
32126 + reiser4_exit_context(ctx);
32127 + } while (result >= 0 && index < nrpages);
32128 +
32129 + end:
32130 + if (is_in_reiser4_context()) {
32131 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
32132 + /* there are already pages to flush, flush them out,
32133 + do not delay until end of reiser4_sync_inodes */
32134 + reiser4_writeout(inode->i_sb, wbc);
32135 + get_current_context()->nr_captured = 0;
32136 + }
32137 + }
32138 + return result;
32139 +}
32140 +
32141 +/* plugin->ioctl */
32142 +int ioctl_cryptcompress(struct inode *inode, struct file *filp,
32143 + unsigned int cmd, unsigned long arg)
32144 +{
32145 + return 0;
32146 +}
32147 +
32148 +/* plugin->mmap */
32149 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
32150 +{
32151 + int result;
32152 + struct inode *inode;
32153 + reiser4_context *ctx;
32154 +
32155 + inode = file->f_dentry->d_inode;
32156 + ctx = reiser4_init_context(inode->i_sb);
32157 + if (IS_ERR(ctx))
32158 + return PTR_ERR(ctx);
32159 + /*
32160 + * generic_file_mmap will do update_atime. Grab space for stat data
32161 + * update.
32162 + */
32163 + result = reiser4_grab_space_force
32164 + (inode_file_plugin(inode)->estimate.update(inode),
32165 + BA_CAN_COMMIT);
32166 + if (result) {
32167 + reiser4_exit_context(ctx);
32168 + return result;
32169 + }
32170 + result = generic_file_mmap(file, vma);
32171 + reiser4_exit_context(ctx);
32172 + return result;
32173 +}
32174 +
32175 +/* plugin->delete_object */
32176 +int delete_object_cryptcompress(struct inode *inode)
32177 +{
32178 + int result;
32179 + struct cryptcompress_info * info;
32180 +
32181 + assert("edward-429", inode->i_nlink == 0);
32182 +
32183 + reiser4_txn_restart_current();
32184 + info = cryptcompress_inode_data(inode);
32185 +
32186 + mutex_lock(&info->checkin_mutex);
32187 + result = cryptcompress_truncate(inode, 0, 0);
32188 + mutex_unlock(&info->checkin_mutex);
32189 +
32190 + if (result) {
32191 + warning("edward-430",
32192 + "cannot truncate cryptcompress file %lli: %i",
32193 + (unsigned long long)get_inode_oid(inode),
32194 + result);
32195 + }
32196 + truncate_inode_pages(inode->i_mapping, 0);
32197 + assert("edward-1487", pages_truncate_ok(inode, 0));
32198 + /* and remove stat data */
32199 + return reiser4_delete_object_common(inode);
32200 +}
32201 +
32202 +/*
32203 + * plugin->setattr
32204 + * This implements actual truncate (see comments in reiser4/page_cache.c)
32205 + */
32206 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
32207 +{
32208 + int result;
32209 + struct inode *inode;
32210 + struct cryptcompress_info * info;
32211 +
32212 + inode = dentry->d_inode;
32213 + info = cryptcompress_inode_data(inode);
32214 +
32215 + if (attr->ia_valid & ATTR_SIZE) {
32216 + if (i_size_read(inode) != attr->ia_size) {
32217 + reiser4_context *ctx;
32218 + loff_t old_size;
32219 +
32220 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
32221 + if (IS_ERR(ctx))
32222 + return PTR_ERR(ctx);
32223 +
32224 + old_size = i_size_read(inode);
32225 + inode_check_scale(inode, old_size, attr->ia_size);
32226 +
32227 + mutex_lock(&info->checkin_mutex);
32228 + result = cryptcompress_truncate(inode,
32229 + attr->ia_size,
32230 + 1/* update sd */);
32231 + mutex_unlock(&info->checkin_mutex);
32232 + if (result) {
32233 + warning("edward-1192",
32234 + "truncate_cryptcompress failed: oid %lli, "
32235 + "old size %lld, new size %lld, retval %d",
32236 + (unsigned long long)
32237 + get_inode_oid(inode), old_size,
32238 + attr->ia_size, result);
32239 + }
32240 + context_set_commit_async(ctx);
32241 + reiser4_exit_context(ctx);
32242 + } else
32243 + result = 0;
32244 + } else
32245 + result = reiser4_setattr_common(dentry, attr);
32246 + return result;
32247 +}
32248 +
32249 +/* plugin->release */
32250 +int release_cryptcompress(struct inode *inode, struct file *file)
32251 +{
32252 + reiser4_context *ctx = reiser4_init_context(inode->i_sb);
32253 +
32254 + if (IS_ERR(ctx))
32255 + return PTR_ERR(ctx);
32256 + reiser4_free_file_fsdata(file);
32257 + reiser4_exit_context(ctx);
32258 + return 0;
32259 +}
32260 +
32261 +/* plugin->prepare_write */
32262 +int prepare_write_cryptcompress(struct file *file, struct page *page,
32263 + unsigned from, unsigned to)
32264 +{
32265 + return -EINVAL;
32266 +}
32267 +
32268 +/* plugin->commit_write */
32269 +int commit_write_cryptcompress(struct file *file, struct page *page,
32270 + unsigned from, unsigned to)
32271 +{
32272 + BUG();
32273 + return 0;
32274 +}
32275 +
32276 +/* plugin->bmap */
32277 +sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
32278 +{
32279 + return -EINVAL;
32280 +}
32281 +
32282 +/*
32283 + Local variables:
32284 + c-indentation-style: "K&R"
32285 + mode-name: "LC"
32286 + c-basic-offset: 8
32287 + tab-width: 8
32288 + fill-column: 80
32289 + scroll-step: 1
32290 + End:
32291 +*/
32292 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.h
32293 --- linux-2.6.23.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
32294 +++ linux-2.6.23/fs/reiser4/plugin/file/cryptcompress.h 2007-12-04 16:49:30.000000000 +0300
32295 @@ -0,0 +1,604 @@
32296 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
32297 +/* See http://www.namesys.com/cryptcompress_design.html */
32298 +
32299 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
32300 +#define __FS_REISER4_CRYPTCOMPRESS_H__
32301 +
32302 +#include "../../page_cache.h"
32303 +#include "../compress/compress.h"
32304 +#include "../crypto/cipher.h"
32305 +
32306 +#include <linux/pagemap.h>
32307 +
32308 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
32309 +#define MAX_CLUSTER_SHIFT 16
32310 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
32311 +#define DC_CHECKSUM_SIZE 4
32312 +
32313 +#define MIN_LATTICE_FACTOR 1
32314 +#define MAX_LATTICE_FACTOR 32
32315 +
32316 +/* this mask contains all non-standard plugins that might
32317 + be present in reiser4-specific part of inode managed by
32318 + cryptcompress file plugin */
32319 +#define cryptcompress_mask \
32320 + ((1 << PSET_FILE) | \
32321 + (1 << PSET_CLUSTER) | \
32322 + (1 << PSET_CIPHER) | \
32323 + (1 << PSET_DIGEST) | \
32324 + (1 << PSET_COMPRESSION) | \
32325 + (1 << PSET_COMPRESSION_MODE))
32326 +
32327 +#if REISER4_DEBUG
32328 +static inline int cluster_shift_ok(int shift)
32329 +{
32330 + return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
32331 +}
32332 +#endif
32333 +
32334 +#if REISER4_DEBUG
32335 +#define INODE_PGCOUNT(inode) \
32336 +({ \
32337 + assert("edward-1530", inode_file_plugin(inode) == \
32338 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32339 + atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
32340 + })
32341 +#define INODE_PGCOUNT_INC(inode) \
32342 +do { \
32343 + assert("edward-1531", inode_file_plugin(inode) == \
32344 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
32345 + atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
32346 +} while (0)
32347 +#define INODE_PGCOUNT_DEC(inode) \
32348 +do { \
32349 + if (inode_file_plugin(inode) == \
32350 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
32351 + atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
32352 +} while (0)
32353 +#else
32354 +#define INODE_PGCOUNT(inode) (0)
32355 +#define INODE_PGCOUNT_INC(inode)
32356 +#define INODE_PGCOUNT_DEC(inode)
32357 +#endif /* REISER4_DEBUG */
32358 +
32359 +struct tfm_stream {
32360 + __u8 *data;
32361 + size_t size;
32362 +};
32363 +
32364 +typedef enum {
32365 + INPUT_STREAM,
32366 + OUTPUT_STREAM,
32367 + LAST_STREAM
32368 +} tfm_stream_id;
32369 +
32370 +typedef struct tfm_stream * tfm_unit[LAST_STREAM];
32371 +
32372 +static inline __u8 *ts_data(struct tfm_stream * stm)
32373 +{
32374 + assert("edward-928", stm != NULL);
32375 + return stm->data;
32376 +}
32377 +
32378 +static inline size_t ts_size(struct tfm_stream * stm)
32379 +{
32380 + assert("edward-929", stm != NULL);
32381 + return stm->size;
32382 +}
32383 +
32384 +static inline void set_ts_size(struct tfm_stream * stm, size_t size)
32385 +{
32386 + assert("edward-930", stm != NULL);
32387 +
32388 + stm->size = size;
32389 +}
32390 +
32391 +static inline int alloc_ts(struct tfm_stream ** stm)
32392 +{
32393 + assert("edward-931", stm);
32394 + assert("edward-932", *stm == NULL);
32395 +
32396 + *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
32397 + if (!*stm)
32398 + return -ENOMEM;
32399 + return 0;
32400 +}
32401 +
32402 +static inline void free_ts(struct tfm_stream * stm)
32403 +{
32404 + assert("edward-933", !ts_data(stm));
32405 + assert("edward-934", !ts_size(stm));
32406 +
32407 + kfree(stm);
32408 +}
32409 +
32410 +static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
32411 +{
32412 + assert("edward-935", !ts_data(stm));
32413 + assert("edward-936", !ts_size(stm));
32414 + assert("edward-937", size != 0);
32415 +
32416 + stm->data = reiser4_vmalloc(size);
32417 + if (!stm->data)
32418 + return -ENOMEM;
32419 + set_ts_size(stm, size);
32420 + return 0;
32421 +}
32422 +
32423 +static inline void free_ts_data(struct tfm_stream * stm)
32424 +{
32425 + assert("edward-938", equi(ts_data(stm), ts_size(stm)));
32426 +
32427 + if (ts_data(stm))
32428 + vfree(ts_data(stm));
32429 + memset(stm, 0, sizeof *stm);
32430 +}
32431 +
32432 +/* Write modes for item conversion in flush convert phase */
32433 +typedef enum {
32434 + CRC_APPEND_ITEM = 1,
32435 + CRC_OVERWRITE_ITEM = 2,
32436 + CRC_CUT_ITEM = 3
32437 +} cryptcompress_write_mode_t;
32438 +
32439 +typedef enum {
32440 + LC_INVAL = 0, /* invalid value */
32441 + LC_APPOV = 1, /* append and/or overwrite */
32442 + LC_TRUNC = 2 /* truncate */
32443 +} logical_cluster_op;
32444 +
32445 +/* Transform cluster.
32446 + * Intermediate state between page cluster and disk cluster
32447 + * Is used for data transform (compression/encryption)
32448 + */
32449 +struct tfm_cluster {
32450 + coa_set coa; /* compression algorithms info */
32451 + tfm_unit tun; /* plain and transformed streams */
32452 + tfm_action act;
32453 + int uptodate;
32454 + int lsize; /* number of bytes in logical cluster */
32455 + int len; /* length of the transform stream */
32456 +};
32457 +
32458 +static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32459 + tfm_action act)
32460 +{
32461 + return tc->coa[id][act];
32462 +}
32463 +
32464 +static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32465 + tfm_action act, coa_t coa)
32466 +{
32467 + tc->coa[id][act] = coa;
32468 +}
32469 +
32470 +static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32471 +{
32472 + coa_t coa;
32473 +
32474 + coa = cplug->alloc(tc->act);
32475 + if (IS_ERR(coa))
32476 + return PTR_ERR(coa);
32477 + set_coa(tc, cplug->h.id, tc->act, coa);
32478 + return 0;
32479 +}
32480 +
32481 +static inline int
32482 +grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32483 +{
32484 + return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
32485 + alloc_coa(tc, cplug) : 0);
32486 +}
32487 +
32488 +static inline void free_coa_set(struct tfm_cluster * tc)
32489 +{
32490 + tfm_action j;
32491 + reiser4_compression_id i;
32492 + compression_plugin *cplug;
32493 +
32494 + assert("edward-810", tc != NULL);
32495 +
32496 + for (j = 0; j < TFMA_LAST; j++)
32497 + for (i = 0; i < LAST_COMPRESSION_ID; i++) {
32498 + if (!get_coa(tc, i, j))
32499 + continue;
32500 + cplug = compression_plugin_by_id(i);
32501 + assert("edward-812", cplug->free != NULL);
32502 + cplug->free(get_coa(tc, i, j), j);
32503 + set_coa(tc, i, j, 0);
32504 + }
32505 + return;
32506 +}
32507 +
32508 +static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
32509 + tfm_stream_id id)
32510 +{
32511 + return tc->tun[id];
32512 +}
32513 +
32514 +static inline void set_tfm_stream(struct tfm_cluster * tc,
32515 + tfm_stream_id id, struct tfm_stream * ts)
32516 +{
32517 + tc->tun[id] = ts;
32518 +}
32519 +
32520 +static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
32521 +{
32522 + return ts_data(get_tfm_stream(tc, id));
32523 +}
32524 +
32525 +static inline void set_tfm_stream_data(struct tfm_cluster * tc,
32526 + tfm_stream_id id, __u8 * data)
32527 +{
32528 + get_tfm_stream(tc, id)->data = data;
32529 +}
32530 +
32531 +static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
32532 +{
32533 + return ts_size(get_tfm_stream(tc, id));
32534 +}
32535 +
32536 +static inline void
32537 +set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
32538 +{
32539 + get_tfm_stream(tc, id)->size = size;
32540 +}
32541 +
32542 +static inline int
32543 +alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32544 +{
32545 + assert("edward-939", tc != NULL);
32546 + assert("edward-940", !get_tfm_stream(tc, id));
32547 +
32548 + tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
32549 + reiser4_ctx_gfp_mask_get());
32550 + if (!tc->tun[id])
32551 + return -ENOMEM;
32552 + return alloc_ts_data(get_tfm_stream(tc, id), size);
32553 +}
32554 +
32555 +static inline int
32556 +realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32557 +{
32558 + assert("edward-941", tfm_stream_size(tc, id) < size);
32559 + free_ts_data(get_tfm_stream(tc, id));
32560 + return alloc_ts_data(get_tfm_stream(tc, id), size);
32561 +}
32562 +
32563 +static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
32564 +{
32565 + free_ts_data(get_tfm_stream(tc, id));
32566 + free_ts(get_tfm_stream(tc, id));
32567 + set_tfm_stream(tc, id, 0);
32568 +}
32569 +
32570 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
32571 +{
32572 + return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
32573 +}
32574 +
32575 +static inline void free_tfm_unit(struct tfm_cluster * tc)
32576 +{
32577 + tfm_stream_id id;
32578 + for (id = 0; id < LAST_STREAM; id++) {
32579 + if (!get_tfm_stream(tc, id))
32580 + continue;
32581 + free_tfm_stream(tc, id);
32582 + }
32583 +}
32584 +
32585 +static inline void put_tfm_cluster(struct tfm_cluster * tc)
32586 +{
32587 + assert("edward-942", tc != NULL);
32588 + free_coa_set(tc);
32589 + free_tfm_unit(tc);
32590 +}
32591 +
32592 +static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
32593 +{
32594 + assert("edward-943", tc != NULL);
32595 + assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
32596 + return (tc->uptodate == 1);
32597 +}
32598 +
32599 +static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
32600 +{
32601 + assert("edward-945", tc != NULL);
32602 + assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
32603 + tc->uptodate = 1;
32604 + return;
32605 +}
32606 +
32607 +static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
32608 +{
32609 + assert("edward-947", tc != NULL);
32610 + assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
32611 + tc->uptodate = 0;
32612 + return;
32613 +}
32614 +
32615 +static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
32616 +{
32617 + return (get_tfm_stream(tc, id) &&
32618 + tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
32619 +}
32620 +
32621 +static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
32622 +{
32623 + int i;
32624 + for (i = 0; i < LAST_STREAM; i++)
32625 + if (!tfm_stream_is_set(tc, i))
32626 + return 0;
32627 + return 1;
32628 +}
32629 +
32630 +static inline void alternate_streams(struct tfm_cluster * tc)
32631 +{
32632 + struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
32633 +
32634 + set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
32635 + set_tfm_stream(tc, OUTPUT_STREAM, tmp);
32636 +}
32637 +
32638 +/* Set of states to indicate a kind of data
32639 + * that will be written to the window */
32640 +typedef enum {
32641 + DATA_WINDOW, /* user's data */
32642 + HOLE_WINDOW /* zeroes (such kind of data can be written
32643 + * if we start to write from offset > i_size) */
32644 +} window_stat;
32645 +
32646 +/* Window (of logical cluster size) discretely sliding along a file.
32647 + * Is used to locate hole region in a logical cluster to be properly
32648 + * represented on disk.
32649 + * We split a write to cryptcompress file into writes to its logical
32650 + * clusters. Before writing to a logical cluster we set a window, i.e.
32651 + * calculate values of the following fields:
32652 + */
32653 +struct reiser4_slide {
32654 + unsigned off; /* offset to write from */
32655 + unsigned count; /* number of bytes to write */
32656 + unsigned delta; /* number of bytes to append to the hole */
32657 + window_stat stat; /* what kind of data will be written starting
32658 + from @off */
32659 +};
32660 +
32661 +/* Possible states of a disk cluster */
32662 +typedef enum {
32663 + INVAL_DISK_CLUSTER, /* unknown state */
32664 + PREP_DISK_CLUSTER, /* disk cluster got converted by flush
32665 + * at least 1 time */
32666 + UNPR_DISK_CLUSTER, /* disk cluster just created and should be
32667 + * converted by flush */
32668 + FAKE_DISK_CLUSTER, /* disk cluster doesn't exist neither in memory
32669 + * nor on disk */
32670 + TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
32671 +} disk_cluster_stat;
32672 +
32673 +/* The following structure represents various stages of the same logical
32674 + * cluster of index @index:
32675 + * . fixed slide
32676 + * . page cluster (stage in primary cache)
32677 + * . transform cluster (transition stage)
32678 + * . disk cluster (stage in secondary cache)
32679 + * This structure is used in transition and synchronizing operations, e.g.
32680 + * transform cluster is a transition state when synchronizing page cluster
32681 + * and disk cluster.
32682 + * FIXME: Encapsulate page cluster, disk cluster.
32683 + */
32684 +struct cluster_handle {
32685 + cloff_t index; /* offset in a file (unit is a cluster size) */
32686 + int index_valid; /* for validating the index above, if needed */
32687 + struct file *file; /* host file */
32688 +
32689 + /* logical cluster */
32690 + struct reiser4_slide *win; /* sliding window to locate holes */
32691 + logical_cluster_op op; /* logical cluster operation (truncate or
32692 + append/overwrite) */
32693 + /* transform cluster */
32694 + struct tfm_cluster tc; /* contains all needed info to synchronize
32695 + page cluster and disk cluster) */
32696 + /* page cluster */
32697 + int nr_pages; /* number of pages of current checkin action */
32698 + int old_nrpages; /* number of pages of last checkin action */
32699 + struct page **pages; /* attached pages */
32700 + jnode * node; /* jnode for capture */
32701 +
32702 + /* disk cluster */
32703 + hint_t *hint; /* current position in the tree */
32704 + disk_cluster_stat dstat; /* state of the current disk cluster */
32705 + int reserved; /* is space for disk cluster reserved */
32706 +#if REISER4_DEBUG
32707 + reiser4_context *ctx;
32708 + int reserved_prepped;
32709 + int reserved_unprepped;
32710 +#endif
32711 +
32712 +};
32713 +
32714 +static inline __u8 * tfm_input_data (struct cluster_handle * clust)
32715 +{
32716 + return tfm_stream_data(&clust->tc, INPUT_STREAM);
32717 +}
32718 +
32719 +static inline __u8 * tfm_output_data (struct cluster_handle * clust)
32720 +{
32721 + return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
32722 +}
32723 +
32724 +static inline int reset_cluster_pgset(struct cluster_handle * clust,
32725 + int nrpages)
32726 +{
32727 + assert("edward-1057", clust->pages != NULL);
32728 + memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
32729 + return 0;
32730 +}
32731 +
32732 +static inline int alloc_cluster_pgset(struct cluster_handle * clust,
32733 + int nrpages)
32734 +{
32735 + assert("edward-949", clust != NULL);
32736 + assert("edward-1362", clust->pages == NULL);
32737 + assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
32738 +
32739 + clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
32740 + reiser4_ctx_gfp_mask_get());
32741 + if (!clust->pages)
32742 + return RETERR(-ENOMEM);
32743 + return 0;
32744 +}
32745 +
32746 +static inline void free_cluster_pgset(struct cluster_handle * clust)
32747 +{
32748 + assert("edward-951", clust->pages != NULL);
32749 + kfree(clust->pages);
32750 + clust->pages = NULL;
32751 +}
32752 +
32753 +static inline void put_cluster_handle(struct cluster_handle * clust)
32754 +{
32755 + assert("edward-435", clust != NULL);
32756 +
32757 + put_tfm_cluster(&clust->tc);
32758 + if (clust->pages)
32759 + free_cluster_pgset(clust);
32760 + memset(clust, 0, sizeof *clust);
32761 +}
32762 +
32763 +static inline void inc_keyload_count(struct reiser4_crypto_info * data)
32764 +{
32765 + assert("edward-1410", data != NULL);
32766 + data->keyload_count++;
32767 +}
32768 +
32769 +static inline void dec_keyload_count(struct reiser4_crypto_info * data)
32770 +{
32771 + assert("edward-1411", data != NULL);
32772 + assert("edward-1412", data->keyload_count > 0);
32773 + data->keyload_count--;
32774 +}
32775 +
32776 +static inline int capture_cluster_jnode(jnode * node)
32777 +{
32778 + return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32779 +}
32780 +
32781 +/* cryptcompress specific part of reiser4_inode */
32782 +struct cryptcompress_info {
32783 + struct mutex checkin_mutex; /* This is to serialize
32784 + * checkin_logical_cluster operations */
32785 + cloff_t trunc_index; /* Index of the leftmost truncated disk
32786 + * cluster (to resolve races with read) */
32787 + struct reiser4_crypto_info *crypt;
32788 + /*
32789 + * the following 2 fields are controlled by compression mode plugin
32790 + */
32791 + int compress_toggle; /* Current status of compressibility */
32792 + int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
32793 + * a compression_toggle to keep the factor
32794 + */
32795 +#if REISER4_DEBUG
32796 + atomic_t pgcount; /* number of grabbed pages */
32797 +#endif
32798 +};
32799 +
32800 +static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
32801 +{
32802 + info->compress_toggle = val;
32803 +}
32804 +
32805 +static inline int get_compression_toggle (struct cryptcompress_info * info)
32806 +{
32807 + return info->compress_toggle;
32808 +}
32809 +
32810 +static inline int compression_is_on(struct cryptcompress_info * info)
32811 +{
32812 + return get_compression_toggle(info) == 1;
32813 +}
32814 +
32815 +static inline void turn_on_compression(struct cryptcompress_info * info)
32816 +{
32817 + set_compression_toggle(info, 1);
32818 +}
32819 +
32820 +static inline void turn_off_compression(struct cryptcompress_info * info)
32821 +{
32822 + set_compression_toggle(info, 0);
32823 +}
32824 +
32825 +static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
32826 +{
32827 + info->lattice_factor = val;
32828 +}
32829 +
32830 +static inline int get_lattice_factor(struct cryptcompress_info * info)
32831 +{
32832 + return info->lattice_factor;
32833 +}
32834 +
32835 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
32836 +int equal_to_rdk(znode *, const reiser4_key *);
32837 +int goto_right_neighbor(coord_t *, lock_handle *);
32838 +int cryptcompress_inode_ok(struct inode *inode);
32839 +int coord_is_unprepped_ctail(const coord_t * coord);
32840 +extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
32841 + struct page * page, znode_lock_mode mode);
32842 +extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
32843 + struct inode * inode);
32844 +extern int readpages_cryptcompress(struct file*, struct address_space*,
32845 + struct list_head*, unsigned);
32846 +int bind_cryptcompress(struct inode *child, struct inode *parent);
32847 +void destroy_inode_cryptcompress(struct inode * inode);
32848 +int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
32849 + rw_op rw);
32850 +int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
32851 + struct cluster_handle * clust, int * progress);
32852 +struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
32853 +void inherit_crypto_info_common(struct inode * parent, struct inode * object,
32854 + int (*can_inherit)(struct inode * child,
32855 + struct inode * parent));
32856 +void reiser4_attach_crypto_info(struct inode * inode,
32857 + struct reiser4_crypto_info * info);
32858 +void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
32859 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
32860 +
32861 +static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
32862 +{
32863 + return info->cipher;
32864 +}
32865 +
32866 +static inline void info_set_cipher(struct reiser4_crypto_info * info,
32867 + struct crypto_blkcipher * tfm)
32868 +{
32869 + info->cipher = tfm;
32870 +}
32871 +
32872 +static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
32873 +{
32874 + return info->digest;
32875 +}
32876 +
32877 +static inline void info_set_digest(struct reiser4_crypto_info * info,
32878 + struct crypto_hash * tfm)
32879 +{
32880 + info->digest = tfm;
32881 +}
32882 +
32883 +static inline void put_cluster_page(struct page * page)
32884 +{
32885 + page_cache_release(page);
32886 +}
32887 +
32888 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
32889 +
32890 +/* Make Linus happy.
32891 + Local variables:
32892 + c-indentation-style: "K&R"
32893 + mode-name: "LC"
32894 + c-basic-offset: 8
32895 + tab-width: 8
32896 + fill-column: 120
32897 + scroll-step: 1
32898 + End:
32899 +*/
32900 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/file.c linux-2.6.23/fs/reiser4/plugin/file/file.c
32901 --- linux-2.6.23.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
32902 +++ linux-2.6.23/fs/reiser4/plugin/file/file.c 2007-12-04 23:04:00.726305004 +0300
32903 @@ -0,0 +1,2735 @@
32904 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
32905 + * reiser4/README */
32906 +
32907 +/*
32908 + * this file contains implementations of inode/file/address_space/file plugin
32909 + * operations specific for "unix file plugin" (plugin id is
32910 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
32911 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
32912 + * no items but stat data)
32913 + */
32914 +
32915 +#include "../../inode.h"
32916 +#include "../../super.h"
32917 +#include "../../tree_walk.h"
32918 +#include "../../carry.h"
32919 +#include "../../page_cache.h"
32920 +#include "../../ioctl.h"
32921 +#include "../object.h"
32922 +#include "../cluster.h"
32923 +#include "../../safe_link.h"
32924 +
32925 +#include <linux/writeback.h>
32926 +#include <linux/pagevec.h>
32927 +#include <linux/syscalls.h>
32928 +
32929 +
32930 +static int unpack(struct file *file, struct inode *inode, int forever);
32931 +static void drop_access(struct unix_file_info *);
32932 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
32933 + znode_lock_mode lock_mode);
32934 +
32935 +/* Get exclusive access and make sure that file is not partially
32936 + * converted (It may happen that another process is doing tail
32937 + * conversion. If so, wait until it completes)
32938 + */
32939 +static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
32940 + struct inode *inode)
32941 +{
32942 + do {
32943 + get_exclusive_access(uf_info);
32944 + if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
32945 + break;
32946 + drop_exclusive_access(uf_info);
32947 + schedule();
32948 + } while (1);
32949 +}
32950 +
32951 +/* get unix file plugin specific portion of inode */
32952 +struct unix_file_info *unix_file_inode_data(const struct inode *inode)
32953 +{
32954 + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
32955 +}
32956 +
32957 +/**
32958 + * equal_to_rdk - compare key and znode's right delimiting key
32959 + * @node: node whose right delimiting key to compare with @key
32960 + * @key: key to compare with @node's right delimiting key
32961 + *
32962 + * Returns true if @key is equal to right delimiting key of @node.
32963 + */
32964 +int equal_to_rdk(znode *node, const reiser4_key *key)
32965 +{
32966 + int result;
32967 +
32968 + read_lock_dk(znode_get_tree(node));
32969 + result = keyeq(key, znode_get_rd_key(node));
32970 + read_unlock_dk(znode_get_tree(node));
32971 + return result;
32972 +}
32973 +
32974 +#if REISER4_DEBUG
32975 +
32976 +/**
32977 + * equal_to_ldk - compare key and znode's left delimiting key
32978 + * @node: node whose left delimiting key to compare with @key
32979 + * @key: key to compare with @node's left delimiting key
32980 + *
32981 + * Returns true if @key is equal to left delimiting key of @node.
32982 + */
32983 +int equal_to_ldk(znode *node, const reiser4_key *key)
32984 +{
32985 + int result;
32986 +
32987 + read_lock_dk(znode_get_tree(node));
32988 + result = keyeq(key, znode_get_ld_key(node));
32989 + read_unlock_dk(znode_get_tree(node));
32990 + return result;
32991 +}
32992 +
32993 +/**
32994 + * check_coord - check whether coord corresponds to key
32995 + * @coord: coord to check
32996 + * @key: key @coord has to correspond to
32997 + *
32998 + * Returns true if @coord is set as if it was set as result of lookup with @key
32999 + * in coord->node.
33000 + */
33001 +static int check_coord(const coord_t *coord, const reiser4_key *key)
33002 +{
33003 + coord_t twin;
33004 +
33005 + node_plugin_by_node(coord->node)->lookup(coord->node, key,
33006 + FIND_MAX_NOT_MORE_THAN, &twin);
33007 + return coords_equal(coord, &twin);
33008 +}
33009 +
33010 +#endif /* REISER4_DEBUG */
33011 +
33012 +/**
33013 + * init_uf_coord - initialize extended coord
33014 + * @uf_coord:
33015 + * @lh:
33016 + *
33017 + *
33018 + */
33019 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
33020 +{
33021 + coord_init_zero(&uf_coord->coord);
33022 + coord_clear_iplug(&uf_coord->coord);
33023 + uf_coord->lh = lh;
33024 + init_lh(lh);
33025 + memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
33026 + uf_coord->valid = 0;
33027 +}
33028 +
33029 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
33030 +{
33031 + assert("vs-1333", uf_coord->valid == 0);
33032 +
33033 + if (coord_is_between_items(&uf_coord->coord))
33034 + return;
33035 +
33036 + assert("vs-1348",
33037 + item_plugin_by_coord(&uf_coord->coord)->s.file.
33038 + init_coord_extension);
33039 +
33040 + item_body_by_coord(&uf_coord->coord);
33041 + item_plugin_by_coord(&uf_coord->coord)->s.file.
33042 + init_coord_extension(uf_coord, offset);
33043 +}
33044 +
33045 +/**
33046 + * goto_right_neighbor - lock right neighbor, drop current node lock
33047 + * @coord:
33048 + * @lh:
33049 + *
33050 + * Obtain lock on right neighbor and drop lock on current node.
33051 + */
33052 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
33053 +{
33054 + int result;
33055 + lock_handle lh_right;
33056 +
33057 + assert("vs-1100", znode_is_locked(coord->node));
33058 +
33059 + init_lh(&lh_right);
33060 + result = reiser4_get_right_neighbor(&lh_right, coord->node,
33061 + znode_is_wlocked(coord->node) ?
33062 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
33063 + GN_CAN_USE_UPPER_LEVELS);
33064 + if (result) {
33065 + done_lh(&lh_right);
33066 + return result;
33067 + }
33068 +
33069 + /*
33070 + * we hold two longterm locks on neighboring nodes. Unlock left of
33071 + * them
33072 + */
33073 + done_lh(lh);
33074 +
33075 + coord_init_first_unit_nocheck(coord, lh_right.node);
33076 + move_lh(lh, &lh_right);
33077 +
33078 + return 0;
33079 +
33080 +}
33081 +
33082 +/**
33083 + * set_file_state
33084 + * @uf_info:
33085 + * @cbk_result:
33086 + * @level:
33087 + *
33088 + * This is to be used by find_file_item and in find_file_state to
33089 + * determine real state of file
33090 + */
33091 +static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
33092 + tree_level level)
33093 +{
33094 + if (cbk_errored(cbk_result))
33095 + /* error happened in find_file_item */
33096 + return;
33097 +
33098 + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
33099 +
33100 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33101 + if (cbk_result == CBK_COORD_NOTFOUND)
33102 + uf_info->container = UF_CONTAINER_EMPTY;
33103 + else if (level == LEAF_LEVEL)
33104 + uf_info->container = UF_CONTAINER_TAILS;
33105 + else
33106 + uf_info->container = UF_CONTAINER_EXTENTS;
33107 + } else {
33108 + /*
33109 + * file state is known, check whether it is set correctly if
33110 + * file is not being tail converted
33111 + */
33112 + if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
33113 + REISER4_PART_IN_CONV)) {
33114 + assert("vs-1162",
33115 + ergo(level == LEAF_LEVEL &&
33116 + cbk_result == CBK_COORD_FOUND,
33117 + uf_info->container == UF_CONTAINER_TAILS));
33118 + assert("vs-1165",
33119 + ergo(level == TWIG_LEVEL &&
33120 + cbk_result == CBK_COORD_FOUND,
33121 + uf_info->container == UF_CONTAINER_EXTENTS));
33122 + }
33123 + }
33124 +}
33125 +
33126 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
33127 + const reiser4_key *key, znode_lock_mode lock_mode,
33128 + struct inode *inode)
33129 +{
33130 + return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
33131 + FIND_MAX_NOT_MORE_THAN,
33132 + TWIG_LEVEL, LEAF_LEVEL,
33133 + (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
33134 + (CBK_UNIQUE | CBK_FOR_INSERT),
33135 + NULL /* ra_info */ );
33136 +}
33137 +
33138 +/**
33139 + * find_file_item - look for file item in the tree
33140 + * @hint: provides coordinate, lock handle, seal
33141 + * @key: key for search
33142 + * @mode: mode of lock to put on returned node
33143 + * @ra_info:
33144 + * @inode:
33145 + *
33146 + * This finds position in the tree corresponding to @key. It first tries to use
33147 + * @hint's seal if it is set.
33148 + */
33149 +int find_file_item(hint_t *hint, const reiser4_key *key,
33150 + znode_lock_mode lock_mode,
33151 + struct inode *inode)
33152 +{
33153 + int result;
33154 + coord_t *coord;
33155 + lock_handle *lh;
33156 +
33157 + assert("nikita-3030", reiser4_schedulable());
33158 + assert("vs-1707", hint != NULL);
33159 + assert("vs-47", inode != NULL);
33160 +
33161 + coord = &hint->ext_coord.coord;
33162 + lh = hint->ext_coord.lh;
33163 + init_lh(lh);
33164 +
33165 + result = hint_validate(hint, key, 1 /* check key */, lock_mode);
33166 + if (!result) {
33167 + if (coord->between == AFTER_UNIT &&
33168 + equal_to_rdk(coord->node, key)) {
33169 + result = goto_right_neighbor(coord, lh);
33170 + if (result == -E_NO_NEIGHBOR)
33171 + return RETERR(-EIO);
33172 + if (result)
33173 + return result;
33174 + assert("vs-1152", equal_to_ldk(coord->node, key));
33175 + /*
33176 + * we moved to different node. Invalidate coord
33177 + * extension, zload is necessary to init it again
33178 + */
33179 + hint->ext_coord.valid = 0;
33180 + }
33181 +
33182 + set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
33183 + znode_get_level(coord->node));
33184 +
33185 + return CBK_COORD_FOUND;
33186 + }
33187 +
33188 + coord_init_zero(coord);
33189 + result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
33190 + set_file_state(unix_file_inode_data(inode), result,
33191 + znode_get_level(coord->node));
33192 +
33193 + /* FIXME: we might already have coord extension initialized */
33194 + hint->ext_coord.valid = 0;
33195 + return result;
33196 +}
33197 +
33198 +/* plugin->u.file.write_flow = NULL
33199 + plugin->u.file.read_flow = NULL */
33200 +
33201 +void hint_init_zero(hint_t * hint)
33202 +{
33203 + memset(hint, 0, sizeof(*hint));
33204 + init_lh(&hint->lh);
33205 + hint->ext_coord.lh = &hint->lh;
33206 +}
33207 +
33208 +static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
33209 +{
33210 + int result;
33211 + reiser4_key key;
33212 + coord_t coord;
33213 + lock_handle lh;
33214 +
33215 + assert("vs-1628", ea_obtained(uf_info));
33216 +
33217 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33218 + key_by_inode_and_offset_common(inode, 0, &key);
33219 + init_lh(&lh);
33220 + result = find_file_item_nohint(&coord, &lh, &key,
33221 + ZNODE_READ_LOCK, inode);
33222 + set_file_state(uf_info, result, znode_get_level(coord.node));
33223 + done_lh(&lh);
33224 + if (!cbk_errored(result))
33225 + result = 0;
33226 + } else
33227 + result = 0;
33228 + assert("vs-1074",
33229 + ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
33230 + reiser4_txn_restart_current();
33231 + return result;
33232 +}
33233 +
33234 +/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat
33235 + data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
33236 + if page corresponds to hole extent and unallocated one will have to be created */
33237 +static int reserve_partial_page(reiser4_tree * tree)
33238 +{
33239 + grab_space_enable();
33240 + return reiser4_grab_reserved(reiser4_get_current_sb(),
33241 + 1 +
33242 + 2 * estimate_one_insert_into_item(tree),
33243 + BA_CAN_COMMIT);
33244 +}
33245 +
33246 +/* estimate and reserve space needed to cut one item and update one stat data */
33247 +static int reserve_cut_iteration(reiser4_tree * tree)
33248 +{
33249 + __u64 estimate = estimate_one_item_removal(tree)
33250 + + estimate_one_insert_into_item(tree);
33251 +
33252 + assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
33253 +
33254 + grab_space_enable();
33255 + /* We need to double our estimate now that we can delete more than one
33256 + node. */
33257 + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
33258 + BA_CAN_COMMIT);
33259 +}
33260 +
33261 +int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
33262 + int update_sd)
33263 +{
33264 + int result = 0;
33265 +
33266 + INODE_SET_SIZE(inode, get_key_offset(key));
33267 + if (update_sd) {
33268 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33269 + result = reiser4_update_sd(inode);
33270 + }
33271 + return result;
33272 +}
33273 +
33274 +/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
33275 + and update file stat data on every single cut from the tree */
33276 +int
33277 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
33278 + loff_t cur_size, int (*update_actor) (struct inode *,
33279 + reiser4_key *, int))
33280 +{
33281 + reiser4_key from_key, to_key;
33282 + reiser4_key smallest_removed;
33283 + file_plugin *fplug = inode_file_plugin(inode);
33284 + int result;
33285 + int progress = 0;
33286 +
33287 + assert("vs-1248",
33288 + fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
33289 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33290 +
33291 + fplug->key_by_inode(inode, new_size, &from_key);
33292 + to_key = from_key;
33293 + set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
33294 + /* this loop normally runs just once */
33295 + while (1) {
33296 + result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
33297 + if (result)
33298 + break;
33299 +
33300 + result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
33301 + &smallest_removed, inode, 1,
33302 + &progress);
33303 + if (result == -E_REPEAT) {
33304 + /* -E_REPEAT is a signal to interrupt a long file truncation process */
33305 + if (progress) {
33306 + result =
33307 + update_actor(inode, &smallest_removed,
33308 + update_sd);
33309 + if (result)
33310 + break;
33311 + }
33312 +
33313 +			/* the below does up(sbinfo->delete_mutex). Do not get fooled */
33314 + reiser4_release_reserved(inode->i_sb);
33315 +
33316 + /* reiser4_cut_tree_object() was interrupted probably because
33317 + * current atom requires commit, we have to release
33318 + * transaction handle to allow atom commit. */
33319 + reiser4_txn_restart_current();
33320 + continue;
33321 + }
33322 + if (result
33323 + && !(result == CBK_COORD_NOTFOUND && new_size == 0
33324 + && inode->i_size == 0))
33325 + break;
33326 +
33327 + set_key_offset(&smallest_removed, new_size);
33328 + /* Final sd update after the file gets its correct size */
33329 + result = update_actor(inode, &smallest_removed, update_sd);
33330 + break;
33331 + }
33332 +
33333 +	/* the below does up(sbinfo->delete_mutex). Do not get fooled */
33334 + reiser4_release_reserved(inode->i_sb);
33335 +
33336 + return result;
33337 +}
33338 +
33339 +int find_or_create_extent(struct page *page);
33340 +
33341 +/* part of truncate_file_body: it is called when truncate is used to make file
33342 + shorter */
33343 +static int shorten_file(struct inode *inode, loff_t new_size)
33344 +{
33345 + int result;
33346 + struct page *page;
33347 + int padd_from;
33348 + unsigned long index;
33349 + struct unix_file_info *uf_info;
33350 +
33351 + /*
33352 + * all items of ordinary reiser4 file are grouped together. That is why
33353 + * we can use reiser4_cut_tree. Plan B files (for instance) can not be
33354 + * truncated that simply
33355 + */
33356 + result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
33357 + get_key_offset(reiser4_max_key()),
33358 + reiser4_update_file_size);
33359 + if (result)
33360 + return result;
33361 +
33362 + uf_info = unix_file_inode_data(inode);
33363 + assert("vs-1105", new_size == inode->i_size);
33364 + if (new_size == 0) {
33365 + uf_info->container = UF_CONTAINER_EMPTY;
33366 + return 0;
33367 + }
33368 +
33369 + result = find_file_state(inode, uf_info);
33370 + if (result)
33371 + return result;
33372 + if (uf_info->container == UF_CONTAINER_TAILS)
33373 + /*
33374 + * No need to worry about zeroing last page after new file
33375 + * end
33376 + */
33377 + return 0;
33378 +
33379 + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
33380 + if (!padd_from)
33381 + /* file is truncated to page boundary */
33382 + return 0;
33383 +
33384 + result = reserve_partial_page(reiser4_tree_by_inode(inode));
33385 + if (result) {
33386 + reiser4_release_reserved(inode->i_sb);
33387 + return result;
33388 + }
33389 +
33390 + /* last page is partially truncated - zero its content */
33391 + index = (inode->i_size >> PAGE_CACHE_SHIFT);
33392 + page = read_mapping_page(inode->i_mapping, index, NULL);
33393 + if (IS_ERR(page)) {
33394 + /*
33395 + * the below does up(sbinfo->delete_mutex). Do not get
33396 + * confused
33397 + */
33398 + reiser4_release_reserved(inode->i_sb);
33399 + if (likely(PTR_ERR(page) == -EINVAL)) {
33400 + /* looks like file is built of tail items */
33401 + return 0;
33402 + }
33403 + return PTR_ERR(page);
33404 + }
33405 + wait_on_page_locked(page);
33406 + if (!PageUptodate(page)) {
33407 + page_cache_release(page);
33408 + /*
33409 + * the below does up(sbinfo->delete_mutex). Do not get
33410 + * confused
33411 + */
33412 + reiser4_release_reserved(inode->i_sb);
33413 + return RETERR(-EIO);
33414 + }
33415 +
33416 + /*
33417 +	 * if page corresponds to hole extent unit - unallocated one will be
33418 + * created here. This is not necessary
33419 + */
33420 + result = find_or_create_extent(page);
33421 +
33422 + /*
33423 + * FIXME: cut_file_items has already updated inode. Probably it would
33424 + * be better to update it here when file is really truncated
33425 + */
33426 + if (result) {
33427 + page_cache_release(page);
33428 + /*
33429 + * the below does up(sbinfo->delete_mutex). Do not get
33430 + * confused
33431 + */
33432 + reiser4_release_reserved(inode->i_sb);
33433 + return result;
33434 + }
33435 +
33436 + lock_page(page);
33437 + assert("vs-1066", PageLocked(page));
33438 + zero_user_page(page, padd_from, PAGE_CACHE_SIZE - padd_from, KM_USER0);
33439 + unlock_page(page);
33440 + page_cache_release(page);
33441 + /* the below does up(sbinfo->delete_mutex). Do not get confused */
33442 + reiser4_release_reserved(inode->i_sb);
33443 + return 0;
33444 +}
33445 +
33446 +/**
33447 + * should_have_notail
33448 + * @uf_info:
33449 + * @new_size:
33450 + *
33451 + * Calls formatting plugin to see whether file of size @new_size has to be
33452 + * stored in unformatted nodes or in tail items. 0 is returned for the latter case.
33453 + */
33454 +static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
33455 +{
33456 + if (!uf_info->tplug)
33457 + return 1;
33458 + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
33459 + new_size);
33460 +
33461 +}
33462 +
33463 +/**
33464 + * truncate_file_body - change length of file
33465 + * @inode: inode of file
33466 + * @new_size: new file length
33467 + *
33468 + * Adjusts items file @inode is built of to match @new_size. It may either cut
33469 + * items or add them to represent a hole at the end of file. The caller has to
33470 + * obtain exclusive access to the file.
33471 + */
33472 +static int truncate_file_body(struct inode *inode, struct iattr *attr)
33473 +{
33474 + int result;
33475 + loff_t new_size = attr->ia_size;
33476 +
33477 + if (inode->i_size < new_size) {
33478 + /* expanding truncate */
33479 + struct file * file = attr->ia_file;
33480 + struct unix_file_info *uf_info = unix_file_inode_data(inode);
33481 +
33482 + assert("edward-1532", attr->ia_valid & ATTR_FILE);
33483 +
33484 + result = find_file_state(inode, uf_info);
33485 + if (result)
33486 + return result;
33487 +
33488 + if (should_have_notail(uf_info, new_size)) {
33489 + /*
33490 + * file of size @new_size has to be built of
33491 + * extents. If it is built of tails - convert to
33492 + * extents
33493 + */
33494 + if (uf_info->container == UF_CONTAINER_TAILS) {
33495 + /*
33496 +				 * if file is being converted by another process
33497 + * - wait until it completes
33498 + */
33499 + while (1) {
33500 + if (reiser4_inode_get_flag(inode,
33501 + REISER4_PART_IN_CONV)) {
33502 + drop_exclusive_access(uf_info);
33503 + schedule();
33504 + get_exclusive_access(uf_info);
33505 + continue;
33506 + }
33507 + break;
33508 + }
33509 +
33510 + if (uf_info->container == UF_CONTAINER_TAILS) {
33511 + result = tail2extent(uf_info);
33512 + if (result)
33513 + return result;
33514 + }
33515 + }
33516 + result = reiser4_write_extent(file, NULL, 0,
33517 + &new_size);
33518 + if (result)
33519 + return result;
33520 + uf_info->container = UF_CONTAINER_EXTENTS;
33521 + } else {
33522 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
33523 + result = reiser4_write_extent(file, NULL, 0,
33524 + &new_size);
33525 + if (result)
33526 + return result;
33527 + } else {
33528 + result = reiser4_write_tail(file, NULL, 0,
33529 + &new_size);
33530 + if (result)
33531 + return result;
33532 + uf_info->container = UF_CONTAINER_TAILS;
33533 + }
33534 + }
33535 + BUG_ON(result > 0);
33536 + INODE_SET_FIELD(inode, i_size, new_size);
33537 + file_update_time(file);
33538 + result = reiser4_update_sd(inode);
33539 + BUG_ON(result != 0);
33540 + reiser4_free_file_fsdata(file);
33541 + } else
33542 + result = shorten_file(inode, new_size);
33543 + return result;
33544 +}
33545 +
33546 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
33547 +
33548 +/**
33549 + * load_file_hint - copy hint from struct file to local variable
33550 + * @file: file to get hint from
33551 + * @hint: structure to fill
33552 + *
33553 + * Reiser4 specific portion of struct file may contain information (hint)
33554 + * stored on exiting from previous read or write. That information includes
33555 + * seal of znode and coord within that znode where previous read or write
33556 + * stopped. This function copies that information to @hint if it was stored or
33557 + * initializes @hint by 0s otherwise.
33558 + */
33559 +int load_file_hint(struct file *file, hint_t *hint)
33560 +{
33561 + reiser4_file_fsdata *fsdata;
33562 +
33563 + if (file) {
33564 + fsdata = reiser4_get_file_fsdata(file);
33565 + if (IS_ERR(fsdata))
33566 + return PTR_ERR(fsdata);
33567 +
33568 + spin_lock_inode(file->f_dentry->d_inode);
33569 + if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
33570 + *hint = fsdata->reg.hint;
33571 + init_lh(&hint->lh);
33572 + hint->ext_coord.lh = &hint->lh;
33573 + spin_unlock_inode(file->f_dentry->d_inode);
33574 + /*
33575 + * force re-validation of the coord on the first
33576 + * iteration of the read/write loop.
33577 + */
33578 + hint->ext_coord.valid = 0;
33579 + assert("nikita-19892", coords_equal(&hint->seal.coord1,
33580 + &hint->ext_coord.
33581 + coord));
33582 + return 0;
33583 + }
33584 + memset(&fsdata->reg.hint, 0, sizeof(hint_t));
33585 + spin_unlock_inode(file->f_dentry->d_inode);
33586 + }
33587 + hint_init_zero(hint);
33588 + return 0;
33589 +}
33590 +
33591 +/**
33592 + * save_file_hint - copy hint to reiser4 private struct file's part
33593 + * @file: file to save hint in
33594 + * @hint: hint to save
33595 + *
33596 + * This copies @hint to reiser4 private part of struct file. It can help
33597 + * speedup future accesses to the file.
33598 + */
33599 +void save_file_hint(struct file *file, const hint_t *hint)
33600 +{
33601 + reiser4_file_fsdata *fsdata;
33602 +
33603 + assert("edward-1337", hint != NULL);
33604 +
33605 + if (!file || !reiser4_seal_is_set(&hint->seal))
33606 + return;
33607 + fsdata = reiser4_get_file_fsdata(file);
33608 + assert("vs-965", !IS_ERR(fsdata));
33609 + assert("nikita-19891",
33610 + coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
33611 + assert("vs-30", hint->lh.owner == NULL);
33612 + spin_lock_inode(file->f_dentry->d_inode);
33613 + fsdata->reg.hint = *hint;
33614 + spin_unlock_inode(file->f_dentry->d_inode);
33615 + return;
33616 +}
33617 +
33618 +void reiser4_unset_hint(hint_t * hint)
33619 +{
33620 + assert("vs-1315", hint);
33621 + hint->ext_coord.valid = 0;
33622 + reiser4_seal_done(&hint->seal);
33623 + done_lh(&hint->lh);
33624 +}
33625 +
33626 +/* coord must be set properly. So, that reiser4_set_hint
33627 + has nothing to do */
33628 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
33629 + znode_lock_mode mode)
33630 +{
33631 + ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
33632 + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
33633 +
33634 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
33635 + hint->offset = get_key_offset(key);
33636 + hint->mode = mode;
33637 + done_lh(&hint->lh);
33638 +}
33639 +
33640 +int hint_is_set(const hint_t * hint)
33641 +{
33642 + return reiser4_seal_is_set(&hint->seal);
33643 +}
33644 +
33645 +#if REISER4_DEBUG
33646 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
33647 +{
33648 + return (get_key_locality(k1) == get_key_locality(k2) &&
33649 + get_key_type(k1) == get_key_type(k2) &&
33650 + get_key_band(k1) == get_key_band(k2) &&
33651 + get_key_ordering(k1) == get_key_ordering(k2) &&
33652 + get_key_objectid(k1) == get_key_objectid(k2));
33653 +}
33654 +#endif
33655 +
33656 +static int
33657 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33658 + znode_lock_mode lock_mode)
33659 +{
33660 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
33661 + /* hint either not set or set by different operation */
33662 + return RETERR(-E_REPEAT);
33663 +
33664 + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
33665 +
33666 + if (check_key && get_key_offset(key) != hint->offset)
33667 + /* hint is set for different key */
33668 + return RETERR(-E_REPEAT);
33669 +
33670 + assert("vs-31", hint->ext_coord.lh == &hint->lh);
33671 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
33672 + hint->ext_coord.lh, lock_mode,
33673 + ZNODE_LOCK_LOPRI);
33674 +}
33675 +
33676 +/**
33677 + * find_or_create_extent -
33678 + * @page:
33679 + *
33680 + *
33681 + */
33682 +/* look for place at twig level for extent corresponding to page, call extent's writepage method to create
33683 + unallocated extent if it does not exist yet, initialize jnode, capture page */
33684 +int find_or_create_extent(struct page *page)
33685 +{
33686 + int result;
33687 + struct inode *inode;
33688 + int plugged_hole;
33689 +
33690 + jnode *node;
33691 +
33692 + assert("vs-1065", page->mapping && page->mapping->host);
33693 + inode = page->mapping->host;
33694 +
33695 + lock_page(page);
33696 + node = jnode_of_page(page);
33697 + if (IS_ERR(node)) {
33698 + unlock_page(page);
33699 + return PTR_ERR(node);
33700 + }
33701 + JF_SET(node, JNODE_WRITE_PREPARED);
33702 + unlock_page(page);
33703 +
33704 + if (node->blocknr == 0) {
33705 + plugged_hole = 0;
33706 + result = reiser4_update_extent(inode, node, page_offset(page),
33707 + &plugged_hole);
33708 + if (result) {
33709 + JF_CLR(node, JNODE_WRITE_PREPARED);
33710 + jput(node);
33711 + warning("", "reiser4_update_extent failed: %d", result);
33712 + return result;
33713 + }
33714 + if (plugged_hole)
33715 + reiser4_update_sd(inode);
33716 + } else {
33717 + spin_lock_jnode(node);
33718 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
33719 + BUG_ON(result != 0);
33720 + jnode_make_dirty_locked(node);
33721 + spin_unlock_jnode(node);
33722 + }
33723 +
33724 + BUG_ON(node->atom == NULL);
33725 + JF_CLR(node, JNODE_WRITE_PREPARED);
33726 + jput(node);
33727 +
33728 + if (get_current_context()->entd) {
33729 + entd_context *ent = get_entd_context(node->tree->super);
33730 +
33731 + if (ent->cur_request->page == page)
33732 + ent->cur_request->node = node;
33733 + }
33734 + return 0;
33735 +}
33736 +
33737 +/**
33738 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
33739 + * @inode: inode to check
33740 + *
33741 + * Returns true if inode's mapping has dirty pages which do not belong to any
33742 + * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
33743 + * tree or were eflushed and can be found via jnodes tagged
33744 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
33745 + */
33746 +static int has_anonymous_pages(struct inode *inode)
33747 +{
33748 + int result;
33749 +
33750 + read_lock_irq(&inode->i_mapping->tree_lock);
33751 + result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
33752 + read_unlock_irq(&inode->i_mapping->tree_lock);
33753 + return result;
33754 +}
33755 +
33756 +/**
33757 + * capture_page_and_create_extent -
33758 + * @page: page to be captured
33759 + *
33760 + * Grabs space for extent creation and stat data update and calls function to
33761 + * do actual work.
33762 + */
33763 +static int capture_page_and_create_extent(struct page *page)
33764 +{
33765 + int result;
33766 + struct inode *inode;
33767 +
33768 + assert("vs-1084", page->mapping && page->mapping->host);
33769 + inode = page->mapping->host;
33770 + assert("vs-1139",
33771 + unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
33772 + /* page belongs to file */
33773 + assert("vs-1393",
33774 + inode->i_size > page_offset(page));
33775 +
33776 + /* page capture may require extent creation (if it does not exist yet)
33777 + and stat data's update (number of blocks changes on extent
33778 + creation) */
33779 + grab_space_enable();
33780 + result = reiser4_grab_space(2 * estimate_one_insert_into_item
33781 + (reiser4_tree_by_inode(inode)),
33782 + BA_CAN_COMMIT);
33783 + if (likely(!result))
33784 + result = find_or_create_extent(page);
33785 +
33786 + if (result != 0)
33787 + SetPageError(page);
33788 + return result;
33789 +}
33790 +
33791 +/* this is implementation of method commit_write of struct
33792 + address_space_operations for unix file plugin */
33793 +int
33794 +commit_write_unix_file(struct file *file, struct page *page,
33795 + unsigned from, unsigned to)
33796 +{
33797 + reiser4_context *ctx;
33798 + struct inode *inode;
33799 + int result;
33800 +
33801 + assert("umka-3101", file != NULL);
33802 + assert("umka-3102", page != NULL);
33803 + assert("umka-3093", PageLocked(page));
33804 +
33805 + SetPageUptodate(page);
33806 +
33807 + inode = page->mapping->host;
33808 + ctx = reiser4_init_context(page->mapping->host->i_sb);
33809 + if (IS_ERR(ctx))
33810 + return PTR_ERR(ctx);
33811 + page_cache_get(page);
33812 + unlock_page(page);
33813 + result = capture_page_and_create_extent(page);
33814 + lock_page(page);
33815 + page_cache_release(page);
33816 +
33817 + /* don't commit transaction under inode semaphore */
33818 + context_set_commit_async(ctx);
33819 + reiser4_exit_context(ctx);
33820 + return result;
33821 +}
33822 +
33823 +/*
33824 + * Support for "anonymous" pages and jnodes.
33825 + *
33826 + * When file is write-accessed through mmap pages can be dirtied from the user
33827 + * level. In this case kernel is not notified until one of following happens:
33828 + *
33829 + * (1) msync()
33830 + *
33831 + * (2) truncate() (either explicit or through unlink)
33832 + *
33833 + * (3) VM scanner starts reclaiming mapped pages, dirtying them before
33834 + * starting write-back.
33835 + *
33836 + * As a result of (3) ->writepage may be called on a dirty page without
33837 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
33838 + * (iozone) generate huge number of anonymous pages. Emergency flush handles
33839 + * this situation by creating jnode for anonymous page, starting IO on the
33840 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
33841 + * memory. Such jnode is also called anonymous.
33842 + *
33843 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
33844 + * tree. This is done by capture_anonymous_*() functions below.
33845 + */
33846 +
33847 +/**
33848 + * capture_anonymous_page - involve page into transaction
33849 + * @pg: page to deal with
33850 + *
33851 + * Takes care that @page has corresponding metadata in the tree, creates jnode
33852 + * for @page and captures it. On success 1 is returned.
33853 + */
33854 +static int capture_anonymous_page(struct page *page)
33855 +{
33856 + int result;
33857 +
33858 + if (PageWriteback(page))
33859 + /* FIXME: do nothing? */
33860 + return 0;
33861 +
33862 + result = capture_page_and_create_extent(page);
33863 + if (result == 0) {
33864 + result = 1;
33865 + } else
33866 + warning("nikita-3329",
33867 + "Cannot capture anon page: %i", result);
33868 +
33869 + return result;
33870 +}
33871 +
33872 +/**
33873 + * capture_anonymous_pages - find and capture pages dirtied via mmap
33874 + * @mapping: address space where to look for pages
33875 + * @index: start index
33876 + * @to_capture: maximum number of pages to capture
33877 + *
33878 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
33879 + * captures (involves into atom) them, returns number of captured pages,
33880 + * updates @index to next page after the last captured one.
33881 + */
33882 +static int
33883 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
33884 + unsigned int to_capture)
33885 +{
33886 + int result;
33887 + struct pagevec pvec;
33888 + unsigned int i, count;
33889 + int nr;
33890 +
33891 + pagevec_init(&pvec, 0);
33892 + count = min(pagevec_space(&pvec), to_capture);
33893 + nr = 0;
33894 +
33895 + /* find pages tagged MOVED */
33896 + write_lock_irq(&mapping->tree_lock);
33897 + pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
33898 + (void **)pvec.pages, *index, count,
33899 + PAGECACHE_TAG_REISER4_MOVED);
33900 + if (pagevec_count(&pvec) == 0) {
33901 + /*
33902 + * there are no pages tagged MOVED in mapping->page_tree
33903 + * starting from *index
33904 + */
33905 + write_unlock_irq(&mapping->tree_lock);
33906 + *index = (pgoff_t)-1;
33907 + return 0;
33908 + }
33909 +
33910 + /* clear MOVED tag for all found pages */
33911 + for (i = 0; i < pagevec_count(&pvec); i++) {
33912 + void *p;
33913 +
33914 + page_cache_get(pvec.pages[i]);
33915 + p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
33916 + PAGECACHE_TAG_REISER4_MOVED);
33917 + assert("vs-49", p == pvec.pages[i]);
33918 + }
33919 + write_unlock_irq(&mapping->tree_lock);
33920 +
33921 +
33922 + *index = pvec.pages[i - 1]->index + 1;
33923 +
33924 + for (i = 0; i < pagevec_count(&pvec); i++) {
33925 + /*
33926 + * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
33927 + * reiser4_set_page_dirty_internal which is called when jnode is
33928 + * captured
33929 + */
33930 + result = capture_anonymous_page(pvec.pages[i]);
33931 + if (result == 1)
33932 + nr++;
33933 + else {
33934 + if (result < 0) {
33935 + warning("vs-1454",
33936 + "failed to capture page: "
33937 + "result=%d, captured=%d)\n",
33938 + result, i);
33939 +
33940 + /*
33941 + * set MOVED tag to all pages which left not
33942 + * captured
33943 + */
33944 + write_lock_irq(&mapping->tree_lock);
33945 + for (; i < pagevec_count(&pvec); i ++) {
33946 + radix_tree_tag_set(&mapping->page_tree,
33947 + pvec.pages[i]->index,
33948 + PAGECACHE_TAG_REISER4_MOVED);
33949 + }
33950 + write_unlock_irq(&mapping->tree_lock);
33951 +
33952 + pagevec_release(&pvec);
33953 + return result;
33954 + } else {
33955 + /*
33956 + * result == 0. capture_anonymous_page returns
33957 + * 0 for Writeback-ed page. Set MOVED tag on
33958 + * that page
33959 + */
33960 + write_lock_irq(&mapping->tree_lock);
33961 + radix_tree_tag_set(&mapping->page_tree,
33962 + pvec.pages[i]->index,
33963 + PAGECACHE_TAG_REISER4_MOVED);
33964 + write_unlock_irq(&mapping->tree_lock);
33965 + if (i == 0)
33966 + *index = pvec.pages[0]->index;
33967 + else
33968 + *index = pvec.pages[i - 1]->index + 1;
33969 + }
33970 + }
33971 + }
33972 + pagevec_release(&pvec);
33973 + return nr;
33974 +}
33975 +
33976 +/**
33977 + * capture_anonymous_jnodes - find and capture anonymous jnodes
33978 + * @mapping: address space where to look for jnodes
33979 + * @from: start index
33980 + * @to: end index
33981 + * @to_capture: maximum number of jnodes to capture
33982 + *
33983 + * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
33984 + * the range of indexes @from-@to and captures them, returns number of captured
33985 + * jnodes, updates @from to next jnode after the last captured one.
33986 + */
33987 +static int
33988 +capture_anonymous_jnodes(struct address_space *mapping,
33989 + pgoff_t *from, pgoff_t to, int to_capture)
33990 +{
33991 + *from = to;
33992 + return 0;
33993 +}
33994 +
33995 +/*
33996 + * Commit atom of the jnode of a page.
33997 + */
33998 +static int sync_page(struct page *page)
33999 +{
34000 + int result;
34001 + do {
34002 + jnode *node;
34003 + txn_atom *atom;
34004 +
34005 + lock_page(page);
34006 + node = jprivate(page);
34007 + if (node != NULL) {
34008 + spin_lock_jnode(node);
34009 + atom = jnode_get_atom(node);
34010 + spin_unlock_jnode(node);
34011 + } else
34012 + atom = NULL;
34013 + unlock_page(page);
34014 + result = reiser4_sync_atom(atom);
34015 + } while (result == -E_REPEAT);
34016 + /*
34017 + * ZAM-FIXME-HANS: document the logic of this loop, is it just to
34018 + * handle the case where more pages get added to the atom while we are
34019 + * syncing it?
34020 + */
34021 + assert("nikita-3485", ergo(result == 0,
34022 + get_current_context()->trans->atom == NULL));
34023 + return result;
34024 +}
34025 +
34026 +/*
34027 + * Commit atoms of pages on @pages list.
34028 + * call sync_page for each page from mapping's page tree
34029 + */
34030 +static int sync_page_list(struct inode *inode)
34031 +{
34032 + int result;
34033 + struct address_space *mapping;
34034 + unsigned long from; /* start index for radix_tree_gang_lookup */
34035 + unsigned int found; /* return value for radix_tree_gang_lookup */
34036 +
34037 + mapping = inode->i_mapping;
34038 + from = 0;
34039 + result = 0;
34040 + read_lock_irq(&mapping->tree_lock);
34041 + while (result == 0) {
34042 + struct page *page;
34043 +
34044 + found =
34045 + radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
34046 + from, 1);
34047 + assert("", found < 2);
34048 + if (found == 0)
34049 + break;
34050 +
34051 + /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by
34052 + sys_fsync */
34053 + page_cache_get(page);
34054 + read_unlock_irq(&mapping->tree_lock);
34055 +
34056 + from = page->index + 1;
34057 +
34058 + result = sync_page(page);
34059 +
34060 + page_cache_release(page);
34061 + read_lock_irq(&mapping->tree_lock);
34062 + }
34063 +
34064 + read_unlock_irq(&mapping->tree_lock);
34065 + return result;
34066 +}
34067 +
34068 +static int commit_file_atoms(struct inode *inode)
34069 +{
34070 + int result;
34071 + struct unix_file_info *uf_info;
34072 +
34073 + uf_info = unix_file_inode_data(inode);
34074 +
34075 + get_exclusive_access(uf_info);
34076 + /*
34077 + * find what items file is made from
34078 + */
34079 + result = find_file_state(inode, uf_info);
34080 + drop_exclusive_access(uf_info);
34081 + if (result != 0)
34082 + return result;
34083 +
34084 + /*
34085 + * file state cannot change because we are under ->i_mutex
34086 + */
34087 + switch (uf_info->container) {
34088 + case UF_CONTAINER_EXTENTS:
34089 + /* find_file_state might open join an atom */
34090 + reiser4_txn_restart_current();
34091 + result =
34092 + /*
34093 + * when we are called by
34094 + * filemap_fdatawrite->
34095 + * do_writepages()->
34096 + * reiser4_writepages()
34097 + *
34098 + * inode->i_mapping->dirty_pages are spices into
34099 + * ->io_pages, leaving ->dirty_pages dirty.
34100 + *
34101 + * When we are called from
34102 + * reiser4_fsync()->sync_unix_file(), we have to
34103 + * commit atoms of all pages on the ->dirty_list.
34104 + *
34105 + * So for simplicity we just commit ->io_pages and
34106 + * ->dirty_pages.
34107 + */
34108 + sync_page_list(inode);
34109 + break;
34110 + case UF_CONTAINER_TAILS:
34111 + /*
34112 + * NOTE-NIKITA probably we can be smarter for tails. For now
34113 + * just commit all existing atoms.
34114 + */
34115 + result = txnmgr_force_commit_all(inode->i_sb, 0);
34116 + break;
34117 + case UF_CONTAINER_EMPTY:
34118 + result = 0;
34119 + break;
34120 + case UF_CONTAINER_UNKNOWN:
34121 + default:
34122 + result = -EIO;
34123 + break;
34124 + }
34125 +
34126 + /*
34127 + * commit current transaction: there can be captured nodes from
34128 + * find_file_state() and finish_conversion().
34129 + */
34130 + reiser4_txn_restart_current();
34131 + return result;
34132 +}
34133 +
34134 +/**
34135 + * writepages_unix_file - writepages of struct address_space_operations
34136 + * @mapping:
34137 + * @wbc:
34138 + *
34139 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
34140 + * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
34141 + * created by reiser4_writepage.
34142 + */
34143 +int writepages_unix_file(struct address_space *mapping,
34144 + struct writeback_control *wbc)
34145 +{
34146 + int result;
34147 + struct unix_file_info *uf_info;
34148 + pgoff_t pindex, jindex, nr_pages;
34149 + long to_capture;
34150 + struct inode *inode;
34151 +
34152 + inode = mapping->host;
34153 + if (!has_anonymous_pages(inode)) {
34154 + result = 0;
34155 + goto end;
34156 + }
34157 + jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
34158 + result = 0;
34159 + nr_pages = size_in_pages(i_size_read(inode));
34160 +
34161 + uf_info = unix_file_inode_data(inode);
34162 +
34163 + do {
34164 + reiser4_context *ctx;
34165 +
34166 + if (wbc->sync_mode != WB_SYNC_ALL)
34167 + to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
34168 + else
34169 + to_capture = CAPTURE_APAGE_BURST;
34170 +
34171 + ctx = reiser4_init_context(inode->i_sb);
34172 + if (IS_ERR(ctx)) {
34173 + result = PTR_ERR(ctx);
34174 + break;
34175 + }
34176 + /* avoid recursive calls to ->sync_inodes */
34177 + ctx->nobalance = 1;
34178 + assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
34179 + assert("", LOCK_CNT_NIL(inode_sem_w));
34180 + assert("", LOCK_CNT_NIL(inode_sem_r));
34181 +
34182 + reiser4_txn_restart_current();
34183 +
34184 + /* we have to get nonexclusive access to the file */
34185 + if (get_current_context()->entd) {
34186 + /*
34187 + * use nonblocking version of nonexclusive_access to
34188 + * avoid deadlock which might look like the following:
34189 + * process P1 holds NEA on file F1 and called entd to
34190 + * reclaim some memory. Entd works for P1 and is going
34191 + * to capture pages of file F2. To do that entd has to
34192 + * get NEA to F2. F2 is held by process P2 which also
34193 + * called entd. But entd is serving P1 at the moment
34194 + * and P2 has to wait. Process P3 trying to get EA to
34195 + * file F2. Existence of pending EA request to file F2
34196 + * makes impossible for entd to get NEA to file
34197 + * F2. Neither of these process can continue. Using
34198 + * nonblocking version of gettign NEA is supposed to
34199 + * avoid this deadlock.
34200 + */
34201 + if (try_to_get_nonexclusive_access(uf_info) == 0) {
34202 + result = RETERR(-EBUSY);
34203 + reiser4_exit_context(ctx);
34204 + break;
34205 + }
34206 + } else
34207 + get_nonexclusive_access(uf_info);
34208 +
34209 + while (to_capture > 0) {
34210 + pgoff_t start;
34211 +
34212 + assert("vs-1727", jindex <= pindex);
34213 + if (pindex == jindex) {
34214 + start = pindex;
34215 + result =
34216 + capture_anonymous_pages(inode->i_mapping,
34217 + &pindex,
34218 + to_capture);
34219 + if (result <= 0)
34220 + break;
34221 + to_capture -= result;
34222 + wbc->nr_to_write -= result;
34223 + if (start + result == pindex) {
34224 + jindex = pindex;
34225 + continue;
34226 + }
34227 + if (to_capture <= 0)
34228 + break;
34229 + }
34230 + /* deal with anonymous jnodes between jindex and pindex */
34231 + result =
34232 + capture_anonymous_jnodes(inode->i_mapping, &jindex,
34233 + pindex, to_capture);
34234 + if (result < 0)
34235 + break;
34236 + to_capture -= result;
34237 + get_current_context()->nr_captured += result;
34238 +
34239 + if (jindex == (pgoff_t) - 1) {
34240 + assert("vs-1728", pindex == (pgoff_t) - 1);
34241 + break;
34242 + }
34243 + }
34244 + if (to_capture <= 0)
34245 + /* there may be left more pages */
34246 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
34247 +
34248 + drop_nonexclusive_access(uf_info);
34249 + if (result < 0) {
34250 + /* error happened */
34251 + reiser4_exit_context(ctx);
34252 + return result;
34253 + }
34254 + if (wbc->sync_mode != WB_SYNC_ALL) {
34255 + reiser4_exit_context(ctx);
34256 + return 0;
34257 + }
34258 + result = commit_file_atoms(inode);
34259 + reiser4_exit_context(ctx);
34260 + if (pindex >= nr_pages && jindex == pindex)
34261 + break;
34262 + } while (1);
34263 +
34264 + end:
34265 + if (is_in_reiser4_context()) {
34266 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34267 + /*
34268 + * there are already pages to flush, flush them out, do
34269 + * not delay until end of reiser4_sync_inodes
34270 + */
34271 + reiser4_writeout(inode->i_sb, wbc);
34272 + get_current_context()->nr_captured = 0;
34273 + }
34274 + }
34275 + return result;
34276 +}
34277 +
34278 +/**
34279 + * readpage_unix_file_nolock - readpage of struct address_space_operations
34280 + * @file:
34281 + * @page:
34282 + *
34283 + * Compose a key and search for item containing information about @page
34284 + * data. If item is found - its readpage method is called.
34285 + */
34286 +int readpage_unix_file(struct file *file, struct page *page)
34287 +{
34288 + reiser4_context *ctx;
34289 + int result;
34290 + struct inode *inode;
34291 + reiser4_key key;
34292 + item_plugin *iplug;
34293 + hint_t *hint;
34294 + lock_handle *lh;
34295 + coord_t *coord;
34296 +
34297 + assert("vs-1062", PageLocked(page));
34298 + assert("vs-976", !PageUptodate(page));
34299 + assert("vs-1061", page->mapping && page->mapping->host);
34300 +
34301 + if (page->mapping->host->i_size <= page_offset(page)) {
34302 + /* page is out of file */
34303 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
34304 + SetPageUptodate(page);
34305 + unlock_page(page);
34306 + return 0;
34307 + }
34308 +
34309 + inode = page->mapping->host;
34310 + ctx = reiser4_init_context(inode->i_sb);
34311 + if (IS_ERR(ctx)) {
34312 + unlock_page(page);
34313 + return PTR_ERR(ctx);
34314 + }
34315 +
34316 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34317 + if (hint == NULL) {
34318 + unlock_page(page);
34319 + reiser4_exit_context(ctx);
34320 + return RETERR(-ENOMEM);
34321 + }
34322 +
34323 + result = load_file_hint(file, hint);
34324 + if (result) {
34325 + kfree(hint);
34326 + unlock_page(page);
34327 + reiser4_exit_context(ctx);
34328 + return result;
34329 + }
34330 + lh = &hint->lh;
34331 +
34332 + /* get key of first byte of the page */
34333 + key_by_inode_and_offset_common(inode, page_offset(page), &key);
34334 +
34335 + /* look for file metadata corresponding to first byte of page */
34336 + page_cache_get(page);
34337 + unlock_page(page);
34338 + result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
34339 + lock_page(page);
34340 + page_cache_release(page);
34341 +
34342 + if (page->mapping == NULL) {
34343 + /*
34344 + * readpage allows truncate to run concurrently. Page was
34345 + * truncated while it was not locked
34346 + */
34347 + done_lh(lh);
34348 + kfree(hint);
34349 + unlock_page(page);
34350 + reiser4_txn_restart(ctx);
34351 + reiser4_exit_context(ctx);
34352 + return -EINVAL;
34353 + }
34354 +
34355 + if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
34356 + if (result == CBK_COORD_FOUND &&
34357 + hint->ext_coord.coord.between != AT_UNIT)
34358 + /* file is truncated */
34359 + result = -EINVAL;
34360 + done_lh(lh);
34361 + kfree(hint);
34362 + unlock_page(page);
34363 + reiser4_txn_restart(ctx);
34364 + reiser4_exit_context(ctx);
34365 + return result;
34366 + }
34367 +
34368 + /*
34369 + * item corresponding to page is found. It can not be removed because
34370 + * znode lock is held
34371 + */
34372 + if (PageUptodate(page)) {
34373 + done_lh(lh);
34374 + kfree(hint);
34375 + unlock_page(page);
34376 + reiser4_txn_restart(ctx);
34377 + reiser4_exit_context(ctx);
34378 + return 0;
34379 + }
34380 +
34381 + coord = &hint->ext_coord.coord;
34382 + result = zload(coord->node);
34383 + if (result) {
34384 + done_lh(lh);
34385 + kfree(hint);
34386 + unlock_page(page);
34387 + reiser4_txn_restart(ctx);
34388 + reiser4_exit_context(ctx);
34389 + return result;
34390 + }
34391 +
34392 + validate_extended_coord(&hint->ext_coord, page_offset(page));
34393 +
34394 + if (!coord_is_existing_unit(coord)) {
34395 + /* this indicates corruption */
34396 + warning("vs-280",
34397 + "Looking for page %lu of file %llu (size %lli). "
34398 + "No file items found (%d). File is corrupted?\n",
34399 + page->index, (unsigned long long)get_inode_oid(inode),
34400 + inode->i_size, result);
34401 + zrelse(coord->node);
34402 + done_lh(lh);
34403 + kfree(hint);
34404 + unlock_page(page);
34405 + reiser4_txn_restart(ctx);
34406 + reiser4_exit_context(ctx);
34407 + return RETERR(-EIO);
34408 + }
34409 +
34410 + /*
34411 + * get plugin of found item or use plugin if extent if there are no
34412 + * one
34413 + */
34414 + iplug = item_plugin_by_coord(coord);
34415 + if (iplug->s.file.readpage)
34416 + result = iplug->s.file.readpage(coord, page);
34417 + else
34418 + result = RETERR(-EINVAL);
34419 +
34420 + if (!result) {
34421 + set_key_offset(&key,
34422 + (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
34423 + /* FIXME should call reiser4_set_hint() */
34424 + reiser4_unset_hint(hint);
34425 + } else {
34426 + unlock_page(page);
34427 + reiser4_unset_hint(hint);
34428 + }
34429 + assert("vs-979",
34430 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
34431 + assert("vs-9791", ergo(result != 0, !PageLocked(page)));
34432 +
34433 + zrelse(coord->node);
34434 + done_lh(lh);
34435 +
34436 + save_file_hint(file, hint);
34437 + kfree(hint);
34438 +
34439 + /*
34440 + * FIXME: explain why it is needed. HINT: page allocation in write can
34441 + * not be done when atom is not NULL because reiser4_writepage can not
34442 + * kick entd and have to eflush
34443 + */
34444 + reiser4_txn_restart(ctx);
34445 + reiser4_exit_context(ctx);
34446 + return result;
34447 +}
34448 +
34449 +struct uf_readpages_context {
34450 + lock_handle lh;
34451 + coord_t coord;
34452 +};
34453 +
34454 +/* A callback function for readpages_unix_file/read_cache_pages.
34455 + * If the file is build of tails, then return error (-ENOENT).
34456 + *
34457 + * @data -- a pointer to reiser4_readpages_context object,
34458 + * to save the twig lock and the coord between
34459 + * read_cache_page iterations.
34460 + * @page -- page to start read.
34461 + */
34462 +static int uf_readpages_filler(void * data, struct page * page)
34463 +{
34464 + struct uf_readpages_context *rc = data;
34465 + jnode * node;
34466 + int ret = 0;
34467 + reiser4_extent *ext;
34468 + __u64 ext_index;
34469 + int cbk_done = 0;
34470 + struct address_space * mapping = page->mapping;
34471 +
34472 + if (PageUptodate(page)) {
34473 + unlock_page(page);
34474 + return 0;
34475 + }
34476 + page_cache_get(page);
34477 +
34478 + if (rc->lh.node == 0) {
34479 + /* no twig lock - have to do tree search. */
34480 + reiser4_key key;
34481 + repeat:
34482 + unlock_page(page);
34483 + key_by_inode_and_offset_common(
34484 + mapping->host, page_offset(page), &key);
34485 + ret = coord_by_key(
34486 + &get_super_private(mapping->host->i_sb)->tree,
34487 + &key, &rc->coord, &rc->lh,
34488 + ZNODE_READ_LOCK, FIND_EXACT,
34489 + TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
34490 + if (unlikely(ret))
34491 + goto exit;
34492 + lock_page(page);
34493 + if (PageUptodate(page))
34494 + goto unlock;
34495 + cbk_done = 1;
34496 + }
34497 + ret = zload(rc->coord.node);
34498 + if (unlikely(ret))
34499 + goto unlock;
34500 + if (!coord_is_existing_item(&rc->coord) ||
34501 + !item_is_extent(&rc->coord)) {
34502 + zrelse(rc->coord.node);
34503 + ret = RETERR(-EIO);
34504 + goto unlock;
34505 + }
34506 + ext = extent_by_coord(&rc->coord);
34507 + ext_index = extent_unit_index(&rc->coord);
34508 + if (page->index < ext_index ||
34509 + page->index >= ext_index + extent_get_width(ext)) {
34510 + /* the page index doesn't belong to the extent unit
34511 + which the coord points to - release the lock and
34512 + repeat with tree search. */
34513 + zrelse(rc->coord.node);
34514 + done_lh(&rc->lh);
34515 + /* we can be here after a CBK call only in case of
34516 + corruption of the tree or the tree lookup algorithm bug. */
34517 + if (unlikely(cbk_done)) {
34518 + ret = RETERR(-EIO);
34519 + goto unlock;
34520 + }
34521 + goto repeat;
34522 + }
34523 + node = jnode_of_page(page);
34524 + if (unlikely(IS_ERR(node))) {
34525 + zrelse(rc->coord.node);
34526 + ret = PTR_ERR(node);
34527 + goto unlock;
34528 + }
34529 + ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
34530 + jput(node);
34531 + zrelse(rc->coord.node);
34532 + if (likely(!ret))
34533 + goto exit;
34534 + unlock:
34535 + unlock_page(page);
34536 + exit:
34537 + page_cache_release(page);
34538 + return ret;
34539 +}
34540 +
34541 +/**
34542 + * readpages_unix_file - called by the readahead code, starts reading for each
34543 + * page of given list of pages
34544 + */
34545 +int readpages_unix_file(
34546 + struct file *file, struct address_space *mapping,
34547 + struct list_head *pages, unsigned nr_pages)
34548 +{
34549 + reiser4_context *ctx;
34550 + struct uf_readpages_context rc;
34551 + int ret;
34552 +
34553 + ctx = reiser4_init_context(mapping->host->i_sb);
34554 + if (IS_ERR(ctx)) {
34555 + put_pages_list(pages);
34556 + return PTR_ERR(ctx);
34557 + }
34558 + init_lh(&rc.lh);
34559 + ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
34560 + done_lh(&rc.lh);
34561 + context_set_commit_async(ctx);
34562 + /* close the transaction to protect further page allocation from deadlocks */
34563 + reiser4_txn_restart(ctx);
34564 + reiser4_exit_context(ctx);
34565 + return ret;
34566 +}
34567 +
34568 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
34569 + loff_t count UNUSED_ARG)
34570 +{
34571 + /* We should reserve one block, because of updating of the stat data
34572 + item */
34573 + assert("vs-1249",
34574 + inode_file_plugin(inode)->estimate.update ==
34575 + estimate_update_common);
34576 + return estimate_update_common(inode);
34577 +}
34578 +
34579 +/* this is called with nonexclusive access obtained, file's container can not change */
34580 +static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */
34581 + char __user *buf, /* address of user-space buffer */
34582 + size_t count, /* number of bytes to read */
34583 + loff_t *off)
34584 +{
34585 + int result;
34586 + struct inode *inode;
34587 + flow_t flow;
34588 + int (*read_f) (struct file *, flow_t *, hint_t *);
34589 + coord_t *coord;
34590 + znode *loaded;
34591 +
34592 + inode = file->f_dentry->d_inode;
34593 +
34594 + /* build flow */
34595 + assert("vs-1250",
34596 + inode_file_plugin(inode)->flow_by_inode ==
34597 + flow_by_inode_unix_file);
34598 + result =
34599 + flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
34600 + *off, READ_OP, &flow);
34601 + if (unlikely(result))
34602 + return result;
34603 +
34604 + /* get seal and coord sealed with it from reiser4 private data
34605 + of struct file. The coord will tell us where our last read
34606 + of this file finished, and the seal will help to determine
34607 + if that location is still valid.
34608 + */
34609 + coord = &hint->ext_coord.coord;
34610 + while (flow.length && result == 0) {
34611 + result =
34612 + find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
34613 + if (cbk_errored(result))
34614 + /* error happened */
34615 + break;
34616 +
34617 + if (coord->between != AT_UNIT) {
34618 + /* there were no items corresponding to given offset */
34619 + done_lh(hint->ext_coord.lh);
34620 + break;
34621 + }
34622 +
34623 + loaded = coord->node;
34624 + result = zload(loaded);
34625 + if (unlikely(result)) {
34626 + done_lh(hint->ext_coord.lh);
34627 + break;
34628 + }
34629 +
34630 + if (hint->ext_coord.valid == 0)
34631 + validate_extended_coord(&hint->ext_coord,
34632 + get_key_offset(&flow.key));
34633 +
34634 + assert("vs-4", hint->ext_coord.valid == 1);
34635 + assert("vs-33", hint->ext_coord.lh == &hint->lh);
34636 + /* call item's read method */
34637 + read_f = item_plugin_by_coord(coord)->s.file.read;
34638 + result = read_f(file, &flow, hint);
34639 + zrelse(loaded);
34640 + done_lh(hint->ext_coord.lh);
34641 + }
34642 +
34643 + return (count - flow.length) ? (count - flow.length) : result;
34644 +}
34645 +
34646 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
34647 +
34648 +/**
34649 + * read_unix_file - read of struct file_operations
34650 + * @file: file to read from
34651 + * @buf: address of user-space buffer
34652 + * @read_amount: number of bytes to read
34653 + * @off: position in file to read from
34654 + *
34655 + * This is implementation of vfs's read method of struct file_operations for
34656 + * unix file plugin.
34657 + */
34658 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
34659 + loff_t *off)
34660 +{
34661 + reiser4_context *ctx;
34662 + ssize_t result;
34663 + struct inode *inode;
34664 + struct unix_file_info *uf_info;
34665 +
34666 + if (unlikely(read_amount == 0))
34667 + return 0;
34668 +
34669 + assert("umka-072", file != NULL);
34670 + assert("umka-074", off != NULL);
34671 + inode = file->f_dentry->d_inode;
34672 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34673 +
34674 + ctx = reiser4_init_context(inode->i_sb);
34675 + if (IS_ERR(ctx))
34676 + return PTR_ERR(ctx);
34677 + uf_info = unix_file_inode_data(inode);
34678 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34679 + get_exclusive_access(uf_info);
34680 + result = find_file_state(inode, uf_info);
34681 + if (unlikely(result != 0))
34682 + goto out;
34683 + } else
34684 + get_nonexclusive_access(uf_info);
34685 + result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
34686 + BA_CAN_COMMIT);
34687 + if (unlikely(result != 0))
34688 + goto out;
34689 + if (uf_info->container == UF_CONTAINER_EXTENTS){
34690 + result = do_sync_read(file, buf, read_amount, off);
34691 + } else if (uf_info->container == UF_CONTAINER_TAILS ||
34692 + reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
34693 + reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34694 + result = read_unix_file_container_tails(file, buf, read_amount, off);
34695 + } else {
34696 + assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
34697 + result = 0;
34698 + }
34699 +out:
34700 + drop_access(uf_info);
34701 + context_set_commit_async(ctx);
34702 + reiser4_exit_context(ctx);
34703 + return result;
34704 +}
34705 +
34706 +static ssize_t read_unix_file_container_tails(
34707 + struct file *file, char __user *buf, size_t read_amount, loff_t *off)
34708 +{
34709 + int result;
34710 + struct inode *inode;
34711 + hint_t *hint;
34712 + struct unix_file_info *uf_info;
34713 + size_t count, read, left;
34714 + loff_t size;
34715 +
34716 + assert("umka-072", file != NULL);
34717 + assert("umka-074", off != NULL);
34718 + inode = file->f_dentry->d_inode;
34719 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34720 +
34721 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34722 + if (hint == NULL)
34723 + return RETERR(-ENOMEM);
34724 +
34725 + result = load_file_hint(file, hint);
34726 + if (result) {
34727 + kfree(hint);
34728 + return result;
34729 + }
34730 +
34731 + left = read_amount;
34732 + count = 0;
34733 + uf_info = unix_file_inode_data(inode);
34734 + while (left > 0) {
34735 + reiser4_txn_restart_current();
34736 + size = i_size_read(inode);
34737 + if (*off >= size)
34738 + /* position to read from is past the end of file */
34739 + break;
34740 + if (*off + left > size)
34741 + left = size - *off;
34742 + /* faultin user page */
34743 + result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
34744 + if (result)
34745 + return RETERR(-EFAULT);
34746 +
34747 + read = read_file(hint, file, buf,
34748 + left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
34749 + off);
34750 + if (read < 0) {
34751 + result = read;
34752 + break;
34753 + }
34754 + left -= read;
34755 + buf += read;
34756 +
34757 + /* update position in a file */
34758 + *off += read;
34759 + /* total number of read bytes */
34760 + count += read;
34761 + }
34762 + done_lh(&hint->lh);
34763 + save_file_hint(file, hint);
34764 + kfree(hint);
34765 + if (count)
34766 + file_accessed(file);
34767 + /* return number of read bytes or error code if nothing is read */
34768 + return count ? count : result;
34769 +}
34770 +
34771 +/* This function takes care about @file's pages. First of all it checks if
34772 + filesystem is readonly and if so gets out. Otherwise, it throws out all
34773 + pages of file if it was mapped for read and going to be mapped for write
34774 + and consists of tails. This is done in order to not manage few copies
34775 + of the data (first in page cache and second one in tails them selves)
34776 + for the case of mapping files consisting tails.
34777 +
34778 + Here also tail2extent conversion is performed if it is allowed and file
34779 + is going to be written or mapped for write. This functions may be called
34780 + from write_unix_file() or mmap_unix_file(). */
34781 +static int check_pages_unix_file(struct file *file, struct inode *inode)
34782 +{
34783 + reiser4_invalidate_pages(inode->i_mapping, 0,
34784 + (inode->i_size + PAGE_CACHE_SIZE -
34785 + 1) >> PAGE_CACHE_SHIFT, 0);
34786 + return unpack(file, inode, 0 /* not forever */ );
34787 +}
34788 +
34789 +/**
34790 + * mmap_unix_file - mmap of struct file_operations
34791 + * @file: file to mmap
34792 + * @vma:
34793 + *
34794 + * This is implementation of vfs's mmap method of struct file_operations for
34795 + * unix file plugin. It converts file to extent if necessary. Sets
34796 + * reiser4_inode's flag - REISER4_HAS_MMAP.
34797 + */
34798 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
34799 +{
34800 + reiser4_context *ctx;
34801 + int result;
34802 + struct inode *inode;
34803 + struct unix_file_info *uf_info;
34804 + reiser4_block_nr needed;
34805 +
34806 + inode = file->f_dentry->d_inode;
34807 + ctx = reiser4_init_context(inode->i_sb);
34808 + if (IS_ERR(ctx))
34809 + return PTR_ERR(ctx);
34810 +
34811 + uf_info = unix_file_inode_data(inode);
34812 +
34813 + get_exclusive_access_careful(uf_info, inode);
34814 +
34815 + if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
34816 + /*
34817 + * we need file built of extent items. If it is still built of
34818 + * tail items we have to convert it. Find what items the file
34819 + * is built of
34820 + */
34821 + result = find_file_state(inode, uf_info);
34822 + if (result != 0) {
34823 + drop_exclusive_access(uf_info);
34824 + reiser4_exit_context(ctx);
34825 + return result;
34826 + }
34827 +
34828 + assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
34829 + uf_info->container == UF_CONTAINER_EXTENTS ||
34830 + uf_info->container == UF_CONTAINER_EMPTY));
34831 + if (uf_info->container == UF_CONTAINER_TAILS) {
34832 + /*
34833 + * invalidate all pages and convert file from tails to
34834 + * extents
34835 + */
34836 + result = check_pages_unix_file(file, inode);
34837 + if (result) {
34838 + drop_exclusive_access(uf_info);
34839 + reiser4_exit_context(ctx);
34840 + return result;
34841 + }
34842 + }
34843 + }
34844 +
34845 + /*
34846 + * generic_file_mmap will do update_atime. Grab space for stat data
34847 + * update.
34848 + */
34849 + needed = inode_file_plugin(inode)->estimate.update(inode);
34850 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
34851 + if (result) {
34852 + drop_exclusive_access(uf_info);
34853 + reiser4_exit_context(ctx);
34854 + return result;
34855 + }
34856 +
34857 + result = generic_file_mmap(file, vma);
34858 + if (result == 0) {
34859 + /* mark file as having mapping. */
34860 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
34861 + }
34862 +
34863 + drop_exclusive_access(uf_info);
34864 + reiser4_exit_context(ctx);
34865 + return result;
34866 +}
34867 +
34868 +/**
34869 + * find_first_item
34870 + * @inode:
34871 + *
34872 + * Finds file item which is responsible for first byte in the file.
34873 + */
34874 +static int find_first_item(struct inode *inode)
34875 +{
34876 + coord_t coord;
34877 + lock_handle lh;
34878 + reiser4_key key;
34879 + int result;
34880 +
34881 + coord_init_zero(&coord);
34882 + init_lh(&lh);
34883 + inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
34884 + result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
34885 + inode);
34886 + if (result == CBK_COORD_FOUND) {
34887 + if (coord.between == AT_UNIT) {
34888 + result = zload(coord.node);
34889 + if (result == 0) {
34890 + result = item_id_by_coord(&coord);
34891 + zrelse(coord.node);
34892 + if (result != EXTENT_POINTER_ID &&
34893 + result != FORMATTING_ID)
34894 + result = RETERR(-EIO);
34895 + }
34896 + } else
34897 + result = RETERR(-EIO);
34898 + }
34899 + done_lh(&lh);
34900 + return result;
34901 +}
34902 +
34903 +/**
34904 + * open_unix_file
34905 + * @inode:
34906 + * @file:
34907 + *
34908 + * If filesystem is not readonly - complete uncompleted tail conversion if
34909 + * there was one
34910 + */
34911 +int open_unix_file(struct inode *inode, struct file *file)
34912 +{
34913 + int result;
34914 + reiser4_context *ctx;
34915 + struct unix_file_info *uf_info;
34916 +
34917 + if (IS_RDONLY(inode))
34918 + return 0;
34919 +
34920 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
34921 + return 0;
34922 +
34923 + ctx = reiser4_init_context(inode->i_sb);
34924 + if (IS_ERR(ctx))
34925 + return PTR_ERR(ctx);
34926 +
34927 + uf_info = unix_file_inode_data(inode);
34928 +
34929 + get_exclusive_access_careful(uf_info, inode);
34930 +
34931 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34932 + /*
34933 + * other process completed the conversion
34934 + */
34935 + drop_exclusive_access(uf_info);
34936 + reiser4_exit_context(ctx);
34937 + return 0;
34938 + }
34939 +
34940 + /*
34941 + * file left in semi converted state after unclean shutdown or another
34942 + * thread is doing conversion and dropped exclusive access while doing
34943 + * balance dirty pages. Complete the conversion
34944 + */
34945 + result = find_first_item(inode);
34946 + if (result == EXTENT_POINTER_ID)
34947 + /*
34948 + * first item is extent, therefore there was incomplete
34949 + * tail2extent conversion. Complete it
34950 + */
34951 + result = tail2extent(unix_file_inode_data(inode));
34952 + else if (result == FORMATTING_ID)
34953 + /*
34954 + * first item is formatting item, therefore there was
34955 + * incomplete extent2tail conversion. Complete it
34956 + */
34957 + result = extent2tail(file, unix_file_inode_data(inode));
34958 + else
34959 + result = -EIO;
34960 +
34961 + assert("vs-1712",
34962 + ergo(result == 0,
34963 + (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
34964 + !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
34965 + drop_exclusive_access(uf_info);
34966 + reiser4_exit_context(ctx);
34967 + return result;
34968 +}
34969 +
34970 +#define NEITHER_OBTAINED 0
34971 +#define EA_OBTAINED 1
34972 +#define NEA_OBTAINED 2
34973 +
34974 +static void drop_access(struct unix_file_info *uf_info)
34975 +{
34976 + if (uf_info->exclusive_use)
34977 + drop_exclusive_access(uf_info);
34978 + else
34979 + drop_nonexclusive_access(uf_info);
34980 +}
34981 +
34982 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
34983 + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
34984 +
34985 +/**
34986 + * write_unix_file - write of struct file_operations
34987 + * @file: file to write to
34988 + * @buf: address of user-space buffer
34989 + * @write_amount: number of bytes to write
34990 + * @off: position in file to write to
34991 + *
34992 + * This is implementation of vfs's write method of struct file_operations for
34993 + * unix file plugin.
34994 + */
34995 +ssize_t write_unix_file(struct file *file, const char __user *buf,
34996 + size_t count, loff_t *pos, int *conv)
34997 +{
34998 + int result;
34999 + reiser4_context *ctx;
35000 + struct inode *inode;
35001 + struct unix_file_info *uf_info;
35002 + ssize_t written;
35003 + int try_free_space;
35004 + int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
35005 + size_t left;
35006 + ssize_t (*write_op)(struct file *, const char __user *, size_t,
35007 + loff_t *pos);
35008 + int ea;
35009 + loff_t new_size;
35010 +
35011 + inode = file->f_dentry->d_inode;
35012 + ctx = reiser4_init_context(inode->i_sb);
35013 + if (IS_ERR(ctx))
35014 + return PTR_ERR(ctx);
35015 +
35016 + mutex_lock(&inode->i_mutex);
35017 +
35018 + assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
35019 + assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
35020 +
35021 + /* check amount of bytes to write and writing position */
35022 + result = generic_write_checks(file, pos, &count, 0);
35023 + if (result) {
35024 + mutex_unlock(&inode->i_mutex);
35025 + context_set_commit_async(ctx);
35026 + reiser4_exit_context(ctx);
35027 + return result;
35028 + }
35029 +
35030 + result = remove_suid(file->f_dentry);
35031 + if (result) {
35032 + mutex_unlock(&inode->i_mutex);
35033 + context_set_commit_async(ctx);
35034 + reiser4_exit_context(ctx);
35035 + return result;
35036 + }
35037 + /* remove_suid might create a transaction */
35038 + reiser4_txn_restart(ctx);
35039 +
35040 + uf_info = unix_file_inode_data(inode);
35041 +
35042 + current->backing_dev_info = inode->i_mapping->backing_dev_info;
35043 + written = 0;
35044 + try_free_space = 0;
35045 + left = count;
35046 + ea = NEITHER_OBTAINED;
35047 +
35048 + new_size = i_size_read(inode);
35049 + if (*pos + count > new_size)
35050 + new_size = *pos + count;
35051 +
35052 + while (left) {
35053 + if (left < to_write)
35054 + to_write = left;
35055 +
35056 + if (uf_info->container == UF_CONTAINER_EMPTY) {
35057 + get_exclusive_access(uf_info);
35058 + ea = EA_OBTAINED;
35059 + if (uf_info->container != UF_CONTAINER_EMPTY) {
35060 + /* file is made not empty by another process */
35061 + drop_exclusive_access(uf_info);
35062 + ea = NEITHER_OBTAINED;
35063 + continue;
35064 + }
35065 + } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35066 + /*
35067 + * get exclusive access directly just to not have to
35068 + * re-obtain it if file will appear empty
35069 + */
35070 + get_exclusive_access(uf_info);
35071 + ea = EA_OBTAINED;
35072 + result = find_file_state(inode, uf_info);
35073 + if (result) {
35074 + drop_exclusive_access(uf_info);
35075 + ea = NEITHER_OBTAINED;
35076 + break;
35077 + }
35078 + } else {
35079 + get_nonexclusive_access(uf_info);
35080 + ea = NEA_OBTAINED;
35081 + }
35082 +
35083 + /* either EA or NEA is obtained. Choose item write method */
35084 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
35085 + /* file is built of extent items */
35086 + write_op = reiser4_write_extent;
35087 + } else if (uf_info->container == UF_CONTAINER_EMPTY) {
35088 + /* file is empty */
35089 + if (should_have_notail(uf_info, new_size))
35090 + write_op = reiser4_write_extent;
35091 + else
35092 + write_op = reiser4_write_tail;
35093 + } else {
35094 + /* file is built of tail items */
35095 + if (should_have_notail(uf_info, new_size)) {
35096 + if (ea == NEA_OBTAINED) {
35097 + drop_nonexclusive_access(uf_info);
35098 + get_exclusive_access(uf_info);
35099 + ea = EA_OBTAINED;
35100 + }
35101 + if (uf_info->container == UF_CONTAINER_TAILS) {
35102 + /*
35103 + * if file is being converted by another
35104 + * process - wait until it completes
35105 + */
35106 + while (1) {
35107 + if (reiser4_inode_get_flag(inode,
35108 + REISER4_PART_IN_CONV)) {
35109 + drop_exclusive_access(uf_info);
35110 + schedule();
35111 + get_exclusive_access(uf_info);
35112 + continue;
35113 + }
35114 + break;
35115 + }
35116 + if (uf_info->container == UF_CONTAINER_TAILS) {
35117 + result = tail2extent(uf_info);
35118 + if (result)
35119 + break;
35120 + }
35121 + }
35122 + drop_exclusive_access(uf_info);
35123 + ea = NEITHER_OBTAINED;
35124 + continue;
35125 + }
35126 + write_op = reiser4_write_tail;
35127 + }
35128 +
35129 + written = write_op(file, buf, to_write, pos);
35130 + if (written == -ENOSPC && try_free_space) {
35131 + drop_access(uf_info);
35132 + txnmgr_force_commit_all(inode->i_sb, 0);
35133 + try_free_space = 0;
35134 + continue;
35135 + }
35136 + if (written < 0) {
35137 + drop_access(uf_info);
35138 + result = written;
35139 + break;
35140 + }
35141 + /* something is written. */
35142 + if (uf_info->container == UF_CONTAINER_EMPTY) {
35143 + assert("", ea == EA_OBTAINED);
35144 + uf_info->container =
35145 + (write_op == reiser4_write_extent) ?
35146 + UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
35147 + } else {
35148 + assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
35149 + write_op == reiser4_write_extent));
35150 + assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
35151 + write_op == reiser4_write_tail));
35152 + }
35153 + if (*pos + written > inode->i_size)
35154 + INODE_SET_FIELD(inode, i_size, *pos + written);
35155 + file_update_time(file);
35156 + result = reiser4_update_sd(inode);
35157 + if (result) {
35158 + mutex_unlock(&inode->i_mutex);
35159 + current->backing_dev_info = NULL;
35160 + drop_access(uf_info);
35161 + context_set_commit_async(ctx);
35162 + reiser4_exit_context(ctx);
35163 + return result;
35164 + }
35165 + drop_access(uf_info);
35166 + ea = NEITHER_OBTAINED;
35167 + reiser4_txn_restart(ctx);
35168 + current->journal_info = NULL;
35169 + /*
35170 + * tell VM how many pages were dirtied. Maybe number of pages
35171 + * which were dirty already should not be counted
35172 + */
35173 + balance_dirty_pages_ratelimited_nr(inode->i_mapping,
35174 + (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
35175 + current->journal_info = ctx;
35176 +
35177 + left -= written;
35178 + buf += written;
35179 + *pos += written;
35180 + }
35181 +
35182 + mutex_unlock(&inode->i_mutex);
35183 +
35184 + if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
35185 + reiser4_txn_restart_current();
35186 + grab_space_enable();
35187 + result = reiser4_sync_file_common(file, file->f_dentry,
35188 + 0 /* data and stat data */);
35189 + if (result)
35190 + warning("reiser4-7", "failed to sync file %llu",
35191 + (unsigned long long)get_inode_oid(inode));
35192 + }
35193 +
35194 + current->backing_dev_info = NULL;
35195 +
35196 + reiser4_exit_context(ctx);
35197 +
35198 + /*
35199 + * return number of written bytes or error code if nothing is
35200 + * written. Note, that it does not work correctly in case when
35201 + * sync_unix_file returns error
35202 + */
35203 + return (count - left) ? (count - left) : result;
35204 +}
35205 +
35206 +/**
35207 + * release_unix_file - release of struct file_operations
35208 + * @inode: inode of released file
35209 + * @file: file to release
35210 + *
35211 + * Implementation of release method of struct file_operations for unix file
35212 + * plugin. If last reference to inode is released - convert all extent items
35213 + * into tail items if necessary. Frees reiser4 specific file data.
35214 + */
35215 +int release_unix_file(struct inode *inode, struct file *file)
35216 +{
35217 + reiser4_context *ctx;
35218 + struct unix_file_info *uf_info;
35219 + int result;
35220 + int in_reiser4;
35221 +
35222 + in_reiser4 = is_in_reiser4_context();
35223 +
35224 + ctx = reiser4_init_context(inode->i_sb);
35225 + if (IS_ERR(ctx))
35226 + return PTR_ERR(ctx);
35227 +
35228 + result = 0;
35229 + if (in_reiser4 == 0) {
35230 + uf_info = unix_file_inode_data(inode);
35231 +
35232 + get_exclusive_access_careful(uf_info, inode);
35233 + if (atomic_read(&file->f_dentry->d_count) == 1 &&
35234 + uf_info->container == UF_CONTAINER_EXTENTS &&
35235 + !should_have_notail(uf_info, inode->i_size) &&
35236 + !rofs_inode(inode)) {
35237 + result = extent2tail(file, uf_info);
35238 + if (result != 0) {
35239 + warning("nikita-3233",
35240 + "Failed (%d) to convert in %s (%llu)",
35241 + result, __FUNCTION__,
35242 + (unsigned long long)
35243 + get_inode_oid(inode));
35244 + }
35245 + }
35246 + drop_exclusive_access(uf_info);
35247 + } else {
35248 + /*
35249 + we are within reiser4 context already. How latter is
35250 + possible? Simple:
35251 +
35252 + (gdb) bt
35253 + #0 get_exclusive_access ()
35254 + #2 0xc01e56d3 in release_unix_file ()
35255 + #3 0xc01c3643 in reiser4_release ()
35256 + #4 0xc014cae0 in __fput ()
35257 + #5 0xc013ffc3 in remove_vm_struct ()
35258 + #6 0xc0141786 in exit_mmap ()
35259 + #7 0xc0118480 in mmput ()
35260 + #8 0xc0133205 in oom_kill ()
35261 + #9 0xc01332d1 in out_of_memory ()
35262 + #10 0xc013bc1d in try_to_free_pages ()
35263 + #11 0xc013427b in __alloc_pages ()
35264 + #12 0xc013f058 in do_anonymous_page ()
35265 + #13 0xc013f19d in do_no_page ()
35266 + #14 0xc013f60e in handle_mm_fault ()
35267 + #15 0xc01131e5 in do_page_fault ()
35268 + #16 0xc0104935 in error_code ()
35269 + #17 0xc025c0c6 in __copy_to_user_ll ()
35270 + #18 0xc01d496f in reiser4_read_tail ()
35271 + #19 0xc01e4def in read_unix_file ()
35272 + #20 0xc01c3504 in reiser4_read ()
35273 + #21 0xc014bd4f in vfs_read ()
35274 + #22 0xc014bf66 in sys_read ()
35275 + */
35276 + warning("vs-44", "out of memory?");
35277 + }
35278 +
35279 + reiser4_free_file_fsdata(file);
35280 +
35281 + reiser4_exit_context(ctx);
35282 + return result;
35283 +}
35284 +
35285 +static void set_file_notail(struct inode *inode)
35286 +{
35287 + reiser4_inode *state;
35288 + formatting_plugin *tplug;
35289 +
35290 + state = reiser4_inode_data(inode);
35291 + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
35292 + force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
35293 +}
35294 +
35295 +/* if file is built of tails - convert it to extents */
35296 +static int unpack(struct file *filp, struct inode *inode, int forever)
35297 +{
35298 + int result = 0;
35299 + struct unix_file_info *uf_info;
35300 +
35301 + uf_info = unix_file_inode_data(inode);
35302 + assert("vs-1628", ea_obtained(uf_info));
35303 +
35304 + result = find_file_state(inode, uf_info);
35305 + if (result)
35306 + return result;
35307 + assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
35308 +
35309 + if (uf_info->container == UF_CONTAINER_TAILS) {
35310 + /*
35311 + * if file is being converted by another process - wait until it
35312 + * completes
35313 + */
35314 + while (1) {
35315 + if (reiser4_inode_get_flag(inode,
35316 + REISER4_PART_IN_CONV)) {
35317 + drop_exclusive_access(uf_info);
35318 + schedule();
35319 + get_exclusive_access(uf_info);
35320 + continue;
35321 + }
35322 + break;
35323 + }
35324 + if (uf_info->container == UF_CONTAINER_TAILS) {
35325 + result = tail2extent(uf_info);
35326 + if (result)
35327 + return result;
35328 + }
35329 + }
35330 + if (forever) {
35331 + /* save new formatting plugin in stat data */
35332 + __u64 tograb;
35333 +
35334 + set_file_notail(inode);
35335 +
35336 + grab_space_enable();
35337 + tograb = inode_file_plugin(inode)->estimate.update(inode);
35338 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
35339 + result = reiser4_update_sd(inode);
35340 + }
35341 +
35342 + return result;
35343 +}
35344 +
35345 +/* implementation of vfs' ioctl method of struct file_operations for unix file
35346 + plugin
35347 +*/
35348 +int
35349 +ioctl_unix_file(struct inode *inode, struct file *filp,
35350 + unsigned int cmd, unsigned long arg UNUSED_ARG)
35351 +{
35352 + reiser4_context *ctx;
35353 + int result;
35354 +
35355 + ctx = reiser4_init_context(inode->i_sb);
35356 + if (IS_ERR(ctx))
35357 + return PTR_ERR(ctx);
35358 +
35359 + switch (cmd) {
35360 + case REISER4_IOC_UNPACK:
35361 + get_exclusive_access(unix_file_inode_data(inode));
35362 + result = unpack(filp, inode, 1 /* forever */ );
35363 + drop_exclusive_access(unix_file_inode_data(inode));
35364 + break;
35365 +
35366 + default:
35367 + result = RETERR(-ENOSYS);
35368 + break;
35369 + }
35370 + reiser4_exit_context(ctx);
35371 + return result;
35372 +}
35373 +
35374 +/* implementation of vfs' bmap method of struct address_space_operations for unix
35375 + file plugin
35376 +*/
35377 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
35378 +{
35379 + reiser4_context *ctx;
35380 + sector_t result;
35381 + reiser4_key key;
35382 + coord_t coord;
35383 + lock_handle lh;
35384 + struct inode *inode;
35385 + item_plugin *iplug;
35386 + sector_t block;
35387 +
35388 + inode = mapping->host;
35389 +
35390 + ctx = reiser4_init_context(inode->i_sb);
35391 + if (IS_ERR(ctx))
35392 + return PTR_ERR(ctx);
35393 + key_by_inode_and_offset_common(inode,
35394 + (loff_t) lblock * current_blocksize,
35395 + &key);
35396 +
35397 + init_lh(&lh);
35398 + result =
35399 + find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
35400 + if (cbk_errored(result)) {
35401 + done_lh(&lh);
35402 + reiser4_exit_context(ctx);
35403 + return result;
35404 + }
35405 +
35406 + result = zload(coord.node);
35407 + if (result) {
35408 + done_lh(&lh);
35409 + reiser4_exit_context(ctx);
35410 + return result;
35411 + }
35412 +
35413 + iplug = item_plugin_by_coord(&coord);
35414 + if (iplug->s.file.get_block) {
35415 + result = iplug->s.file.get_block(&coord, lblock, &block);
35416 + if (result == 0)
35417 + result = block;
35418 + } else
35419 + result = RETERR(-EINVAL);
35420 +
35421 + zrelse(coord.node);
35422 + done_lh(&lh);
35423 + reiser4_exit_context(ctx);
35424 + return result;
35425 +}
35426 +
35427 +/**
35428 + * flow_by_inode_unix_file - initialize structure flow
35429 + * @inode: inode of file for which read or write is about
35430 + * @buf: buffer to perform read to or write from
35431 + * @user: flag showing whether @buf is user space or kernel space
35432 + * @size: size of buffer @buf
35433 + * @off: start offset for read or write
35434 + * @op: READ or WRITE
35435 + * @flow:
35436 + *
35437 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
35438 + */
35439 +int flow_by_inode_unix_file(struct inode *inode,
35440 + const char __user *buf, int user,
35441 + loff_t size, loff_t off,
35442 + rw_op op, flow_t *flow)
35443 +{
35444 + assert("nikita-1100", inode != NULL);
35445 +
35446 + flow->length = size;
35447 + memcpy(&flow->data, &buf, sizeof(buf));
35448 + flow->user = user;
35449 + flow->op = op;
35450 + assert("nikita-1931", inode_file_plugin(inode) != NULL);
35451 + assert("nikita-1932",
35452 + inode_file_plugin(inode)->key_by_inode ==
35453 + key_by_inode_and_offset_common);
35454 + /* calculate key of write position and insert it into flow->key */
35455 + return key_by_inode_and_offset_common(inode, off, &flow->key);
35456 +}
35457 +
35458 +/* plugin->u.file.set_plug_in_sd = NULL
35459 + plugin->u.file.set_plug_in_inode = NULL
35460 + plugin->u.file.create_blank_sd = NULL */
35461 +/* plugin->u.file.delete */
35462 +/*
35463 + plugin->u.file.add_link = reiser4_add_link_common
35464 + plugin->u.file.rem_link = NULL */
35465 +
35466 +/* plugin->u.file.owns_item
35467 + this is common_file_owns_item with assertion */
35468 +/* Audited by: green(2002.06.15) */
35469 +int
35470 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
35471 + const coord_t * coord /* coord to check */ )
35472 +{
35473 + int result;
35474 +
35475 + result = owns_item_common(inode, coord);
35476 + if (!result)
35477 + return 0;
35478 + if (!plugin_of_group(item_plugin_by_coord(coord),
35479 + UNIX_FILE_METADATA_ITEM_TYPE))
35480 + return 0;
35481 + assert("vs-547",
35482 + item_id_by_coord(coord) == EXTENT_POINTER_ID ||
35483 + item_id_by_coord(coord) == FORMATTING_ID);
35484 + return 1;
35485 +}
35486 +
35487 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
35488 +{
35489 + int result;
35490 + int s_result;
35491 + loff_t old_size;
35492 + reiser4_tree *tree;
35493 +
35494 + inode_check_scale(inode, inode->i_size, attr->ia_size);
35495 +
35496 + old_size = inode->i_size;
35497 + tree = reiser4_tree_by_inode(inode);
35498 +
35499 + result = safe_link_grab(tree, BA_CAN_COMMIT);
35500 + if (result == 0)
35501 + result = safe_link_add(inode, SAFE_TRUNCATE);
35502 + if (result == 0)
35503 + result = truncate_file_body(inode, attr);
35504 + if (result)
35505 + warning("vs-1588", "truncate_file failed: oid %lli, "
35506 + "old size %lld, new size %lld, retval %d",
35507 + (unsigned long long)get_inode_oid(inode),
35508 + old_size, attr->ia_size, result);
35509 +
35510 + s_result = safe_link_grab(tree, BA_CAN_COMMIT);
35511 + if (s_result == 0)
35512 + s_result =
35513 + safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
35514 + if (s_result != 0) {
35515 + warning("nikita-3417", "Cannot kill safelink %lli: %i",
35516 + (unsigned long long)get_inode_oid(inode), s_result);
35517 + }
35518 + safe_link_release(tree);
35519 + return result;
35520 +}
35521 +
35522 +/* plugin->u.file.setattr method */
35523 +/* This calls inode_setattr and if truncate is in effect it also takes
35524 + exclusive inode access to avoid races */
35525 +int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
35526 + struct iattr *attr /* change description */ )
35527 +{
35528 + int result;
35529 +
35530 + if (attr->ia_valid & ATTR_SIZE) {
35531 + reiser4_context *ctx;
35532 + struct unix_file_info *uf_info;
35533 +
35534 + /* truncate does reservation itself and requires exclusive
35535 + access obtained */
35536 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
35537 + if (IS_ERR(ctx))
35538 + return PTR_ERR(ctx);
35539 +
35540 + uf_info = unix_file_inode_data(dentry->d_inode);
35541 + get_exclusive_access_careful(uf_info, dentry->d_inode);
35542 + result = setattr_truncate(dentry->d_inode, attr);
35543 + drop_exclusive_access(uf_info);
35544 + context_set_commit_async(ctx);
35545 + reiser4_exit_context(ctx);
35546 + } else
35547 + result = reiser4_setattr_common(dentry, attr);
35548 +
35549 + return result;
35550 +}
35551 +
35552 +/* plugin->u.file.init_inode_data */
35553 +void
35554 +init_inode_data_unix_file(struct inode *inode,
35555 + reiser4_object_create_data * crd, int create)
35556 +{
35557 + struct unix_file_info *data;
35558 +
35559 + data = unix_file_inode_data(inode);
35560 + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
35561 + init_rwsem(&data->latch);
35562 + data->tplug = inode_formatting_plugin(inode);
35563 + data->exclusive_use = 0;
35564 +
35565 +#if REISER4_DEBUG
35566 + data->ea_owner = NULL;
35567 + atomic_set(&data->nr_neas, 0);
35568 +#endif
35569 + init_inode_ordering(inode, crd, create);
35570 +}
35571 +
35572 +/**
35573 + * delete_unix_file - delete_object of file_plugin
35574 + * @inode: inode to be deleted
35575 + *
35576 + * Truncates file to length 0, removes stat data and safe link.
35577 + */
35578 +int delete_object_unix_file(struct inode *inode)
35579 +{
35580 + struct unix_file_info *uf_info;
35581 + int result;
35582 +
35583 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
35584 + return 0;
35585 +
35586 + /* truncate file body first */
35587 + uf_info = unix_file_inode_data(inode);
35588 + get_exclusive_access(uf_info);
35589 + result = shorten_file(inode, 0 /* size */ );
35590 + drop_exclusive_access(uf_info);
35591 +
35592 + if (result)
35593 + warning("", "failed to truncate file (%llu) on removal: %d",
35594 + get_inode_oid(inode), result);
35595 +
35596 + /* remove stat data and safe link */
35597 + return reiser4_delete_object_common(inode);
35598 +}
35599 +
35600 +int
35601 +prepare_write_unix_file(struct file *file, struct page *page,
35602 + unsigned from, unsigned to)
35603 +{
35604 + reiser4_context *ctx;
35605 + struct unix_file_info *uf_info;
35606 + int ret;
35607 +
35608 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
35609 + if (IS_ERR(ctx))
35610 + return PTR_ERR(ctx);
35611 +
35612 + uf_info = unix_file_inode_data(file->f_dentry->d_inode);
35613 + get_exclusive_access(uf_info);
35614 + ret = find_file_state(file->f_dentry->d_inode, uf_info);
35615 + if (ret == 0) {
35616 + if (uf_info->container == UF_CONTAINER_TAILS)
35617 + ret = -EINVAL;
35618 + else
35619 + ret = do_prepare_write(file, page, from, to);
35620 + }
35621 + drop_exclusive_access(uf_info);
35622 +
35623 + /* don't commit transaction under inode semaphore */
35624 + context_set_commit_async(ctx);
35625 + reiser4_exit_context(ctx);
35626 + return ret;
35627 +}
35628 +
35629 +/*
35630 + * Local variables:
35631 + * c-indentation-style: "K&R"
35632 + * mode-name: "LC"
35633 + * c-basic-offset: 8
35634 + * tab-width: 8
35635 + * fill-column: 79
35636 + * scroll-step: 1
35637 + * End:
35638 + */
35639 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.23/fs/reiser4/plugin/file/file_conversion.c
35640 --- linux-2.6.23.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
35641 +++ linux-2.6.23/fs/reiser4/plugin/file/file_conversion.c 2007-12-04 16:49:30.000000000 +0300
35642 @@ -0,0 +1,659 @@
35643 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
35644 + licensing governed by reiser4/README */
35645 +
35646 +/* *
35647 + * This file contains a converter cryptcompress->unix_file, and an O(1) heuristic
35648 + * which allows us to assign for a regular file the most reasonable plugin to be
35649 + * managed by. Note that we don't use back conversion because of compatibility
35650 + * reasons (see http://dev.namesys.com/Version4.X.Y for details).
35651 + *
35652 + * Currently used heuristic is very simple: if first complete logical cluster
35653 + * (64K by default) of a file is incompressible, then we decide that
35654 + * the whole file is incompressible (*). When creating a file the conversion
35655 + * is enabled by default via installing a special "permitting" compression mode
35656 + * plugin (**) (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c
35657 + * for details).
35658 + *
35659 + * The conversion is accompanied by rebuilding disk structures of a file, so it
35660 + * is important to protect them from being interacted with other plugins which
35661 + * don't expect them to be in such inconsistent state. For this to be protected
35662 + * we serialize readers and writers of pset. Writers are the processes which can
35663 + * change it with conversion purposes; other ones are readers. Serialization is
35664 + * performed via acquiring per-inode rw-semaphore (conv_sem).
35665 + *
35666 + * (*) This heuristic can be easily changed as soon as we have a new,
35667 + * better one.
35668 + * (**) Such solution allows to keep enable/disable state on disk.
35669 + */
35670 +
35671 +#include "../../inode.h"
35672 +#include "../cluster.h"
35673 +#include "file.h"
35674 +
35675 +#define conversion_enabled(inode) \
35676 + (inode_compression_mode_plugin(inode) == \
35677 + compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
35678 +
35679 +/**
35680 + * Located sections (readers and writers of @pset) are not permanently
35681 + * critical: cryptcompress file can be converted only if the conversion
35682 + * is enabled (see the macro above). Also we don't perform back
35683 + * conversion. The following helper macro is a sanity check to decide
35684 + * if we need the protection (locks are always additional overheads).
35685 + */
35686 +#define should_protect(inode) \
35687 + (inode_file_plugin(inode) == \
35688 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
35689 + conversion_enabled(inode))
35690 +/**
35691 + * We'll speak about "passive" protection for readers and "active"
35692 + * protection for writers. All methods with active or passive protection
35693 + * have the suffix "careful".
35694 + */
35695 +/* Macro for passive protection.
35696 + method_foo contains only readers */
35697 +#define PROT_PASSIVE(type, method, args) \
35698 +({ \
35699 + type _result; \
35700 + struct rw_semaphore * guard = \
35701 + &reiser4_inode_data(inode)->conv_sem; \
35702 + \
35703 + if (should_protect(inode)) { \
35704 + down_read(guard); \
35705 + if (!should_protect(inode)) \
35706 + up_read(guard); \
35707 + } \
35708 + if (inode_file_plugin(inode) == \
35709 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
35710 + _result = method ## _unix_file args; \
35711 + else \
35712 + _result = method ## _cryptcompress args; \
35713 + if (should_protect(inode)) \
35714 + up_read(guard); \
35715 + _result; \
35716 +})
35717 +
35718 +#define PROT_PASSIVE_VOID(method, args) \
35719 +({ \
35720 + struct rw_semaphore * guard = \
35721 + &reiser4_inode_data(inode)->conv_sem; \
35722 + \
35723 + if (should_protect(inode)) { \
35724 + down_read(guard); \
35725 + if (!should_protect(inode)) \
35726 + up_read(guard); \
35727 + } \
35728 + if (inode_file_plugin(inode) == \
35729 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
35730 + method ## _unix_file args; \
35731 + else \
35732 + method ## _cryptcompress args; \
35733 + if (should_protect(inode)) \
35734 + up_read(guard); \
35735 +})
35736 +
35737 +/**
35738 + * Macro for active protection.
35739 + * active_expr contains writers of pset;
35740 + * NOTE: after evaluating active_expr conversion should be disabled.
35741 + */
35742 +#define PROT_ACTIVE(type, method, args, active_expr) \
35743 +({ \
35744 + type _result = 0; \
35745 + struct rw_semaphore * guard = \
35746 + &reiser4_inode_data(inode)->conv_sem; \
35747 + reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
35748 + if (IS_ERR(ctx)) \
35749 + return PTR_ERR(ctx); \
35750 + \
35751 + if (should_protect(inode)) { \
35752 + down_write(guard); \
35753 + if (should_protect(inode)) \
35754 + _result = active_expr; \
35755 + up_write(guard); \
35756 + } \
35757 + if (_result == 0) { \
35758 + if (inode_file_plugin(inode) == \
35759 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
35760 + _result = method ## _unix_file args; \
35761 + else \
35762 + _result = method ## _cryptcompress args; \
35763 + } \
35764 + reiser4_exit_context(ctx); \
35765 + _result; \
35766 +})
35767 +
35768 +/* Pass management to the unix-file plugin with "notail" policy */
35769 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
35770 +{
35771 + int result;
35772 + reiser4_inode *info;
35773 + struct unix_file_info * uf;
35774 + info = reiser4_inode_data(inode);
35775 +
35776 + result = aset_set_unsafe(&info->pset,
35777 + PSET_FILE,
35778 + (reiser4_plugin *)
35779 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
35780 + if (result)
35781 + return result;
35782 + result = aset_set_unsafe(&info->pset,
35783 + PSET_FORMATTING,
35784 + (reiser4_plugin *)
35785 + formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
35786 + if (result)
35787 + return result;
35788 + /* get rid of non-standard plugins */
35789 + info->plugin_mask &= ~cryptcompress_mask;
35790 + /* get rid of plugin stat-data extension */
35791 + info->extmask &= ~(1 << PLUGIN_STAT);
35792 +
35793 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
35794 +
35795 + /* FIXME use init_inode_data_unix_file() instead,
35796 + but avoid init_inode_ordering() */
35797 + /* Init unix-file specific part of inode */
35798 + uf = unix_file_inode_data(inode);
35799 + uf->container = UF_CONTAINER_UNKNOWN;
35800 + init_rwsem(&uf->latch);
35801 + uf->tplug = inode_formatting_plugin(inode);
35802 + uf->exclusive_use = 0;
35803 +#if REISER4_DEBUG
35804 + uf->ea_owner = NULL;
35805 + atomic_set(&uf->nr_neas, 0);
35806 +#endif
35807 + /**
35808 + * we were careful for file_ops, inode_ops and as_ops
35809 + * to be invariant for plugin conversion, so there is
35810 + * no need to update ones already installed in the
35811 + * vfs's residence.
35812 + */
35813 + return 0;
35814 +}
35815 +
35816 +#if REISER4_DEBUG
35817 +static int disabled_conversion_inode_ok(struct inode * inode)
35818 +{
35819 + __u64 extmask = reiser4_inode_data(inode)->extmask;
35820 + __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
35821 +
35822 + return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
35823 + (extmask & (1 << UNIX_STAT)) &&
35824 + (extmask & (1 << LARGE_TIMES_STAT)) &&
35825 + (extmask & (1 << PLUGIN_STAT)) &&
35826 + (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
35827 +}
35828 +#endif
35829 +
35830 +/* Assign another mode that will control
35831 + compression at flush time only */
35832 +static int disable_conversion_no_update_sd(struct inode * inode)
35833 +{
35834 + int result;
35835 + result =
35836 + force_plugin_pset(inode,
35837 + PSET_COMPRESSION_MODE,
35838 + (reiser4_plugin *)compression_mode_plugin_by_id
35839 + (LATTD_COMPRESSION_MODE_ID));
35840 + assert("edward-1500",
35841 + ergo(!result, disabled_conversion_inode_ok(inode)));
35842 + return result;
35843 +}
35844 +
35845 +/* Disable future attempts to check/convert. This function is called by
35846 + conversion hooks. */
35847 +static int disable_conversion(struct inode * inode)
35848 +{
35849 + return disable_conversion_no_update_sd(inode);
35850 +}
35851 +
35852 +static int check_position(struct inode * inode,
35853 + loff_t pos /* position in the file to write from */,
35854 + struct cluster_handle * clust,
35855 + int * check_compress)
35856 +{
35857 + assert("edward-1505", conversion_enabled(inode));
35858 + /*
35859 + * if file size is more than cluster size, then compressible
35860 + * status must be figured out (i.e. compression was disabled,
35861 + * or file plugin was converted to unix_file)
35862 + */
35863 + assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
35864 +
35865 + if (pos > inode->i_size)
35866 + /* first logical cluster will contain a (partial) hole */
35867 + return disable_conversion(inode);
35868 + if (pos < inode_cluster_size(inode))
35869 + /* writing to the first logical cluster */
35870 + return 0;
35871 + /*
35872 + * here we have:
35873 + * cluster_size <= pos <= i_size <= cluster_size,
35874 + * and, hence, pos == i_size == cluster_size
35875 + */
35876 + assert("edward-1498",
35877 + pos == inode->i_size &&
35878 + pos == inode_cluster_size(inode));
35879 +
35880 + *check_compress = 1;
35881 + return 0;
35882 +}
35883 +
35884 +static void start_check_compressibility(struct inode * inode,
35885 + struct cluster_handle * clust,
35886 + hint_t * hint)
35887 +{
35888 + assert("edward-1507", clust->index == 1);
35889 + assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
35890 + assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
35891 +
35892 + hint_init_zero(hint);
35893 + clust->hint = hint;
35894 + clust->index --;
35895 + clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
35896 +
35897 + /* first logical cluster (of index #0) must be complete */
35898 + assert("edward-1510", lbytes(clust->index, inode) ==
35899 + inode_cluster_size(inode));
35900 +}
35901 +
35902 +static void finish_check_compressibility(struct inode * inode,
35903 + struct cluster_handle * clust,
35904 + hint_t * hint)
35905 +{
35906 + reiser4_unset_hint(clust->hint);
35907 + clust->hint = hint;
35908 + clust->index ++;
35909 +}
35910 +
35911 +#if REISER4_DEBUG
35912 +static int prepped_dclust_ok(hint_t * hint)
35913 +{
35914 + reiser4_key key;
35915 + coord_t * coord = &hint->ext_coord.coord;
35916 +
35917 + item_key_by_coord(coord, &key);
35918 + return (item_id_by_coord(coord) == CTAIL_ID &&
35919 + !coord_is_unprepped_ctail(coord) &&
35920 + (get_key_offset(&key) + nr_units_ctail(coord) ==
35921 + dclust_get_extension_dsize(hint)));
35922 +}
35923 +#endif
35924 +
35925 +#define fifty_persent(size) (size >> 1)
35926 +/* evaluation of data compressibility */
35927 +#define data_is_compressible(osize, isize) \
35928 + (osize < fifty_persent(isize))
35929 +
35930 +/**
35931 + * A simple O(1)-heuristic for compressibility.
35932 + * This is called not more than one time per file's life.
35933 + * Read first logical cluster (of index #0) and estimate its compressibility.
35934 + * Save estimation result in @compressible.
35935 + */
35936 +static int read_check_compressibility(struct inode * inode,
35937 + struct cluster_handle * clust,
35938 + int * compressible)
35939 +{
35940 + int i;
35941 + int result;
35942 + __u32 dst_len;
35943 + hint_t tmp_hint;
35944 + hint_t * cur_hint = clust->hint;
35945 +
35946 + start_check_compressibility(inode, clust, &tmp_hint);
35947 +
35948 + reset_cluster_pgset(clust, cluster_nrpages(inode));
35949 + result = grab_page_cluster(inode, clust, READ_OP);
35950 + if (result)
35951 + return result;
35952 + /* Read page cluster here */
35953 + for (i = 0; i < clust->nr_pages; i++) {
35954 + struct page *page = clust->pages[i];
35955 + lock_page(page);
35956 + result = do_readpage_ctail(inode, clust, page,
35957 + ZNODE_READ_LOCK);
35958 + unlock_page(page);
35959 + if (result)
35960 + goto error;
35961 + }
35962 + tfm_cluster_clr_uptodate(&clust->tc);
35963 +
35964 + cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
35965 +
35966 + if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
35967 + /* length of compressed data is known, no need to compress */
35968 + assert("edward-1511",
35969 + znode_is_any_locked(tmp_hint.lh.node));
35970 + assert("edward-1512",
35971 + WITH_DATA(tmp_hint.ext_coord.coord.node,
35972 + prepped_dclust_ok(&tmp_hint)));
35973 + dst_len = dclust_get_extension_dsize(&tmp_hint);
35974 + }
35975 + else {
35976 + struct tfm_cluster * tc = &clust->tc;
35977 + compression_plugin * cplug = inode_compression_plugin(inode);
35978 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
35979 + if (result)
35980 + goto error;
35981 + for (i = 0; i < clust->nr_pages; i++) {
35982 + char *data;
35983 + lock_page(clust->pages[i]);
35984 + BUG_ON(!PageUptodate(clust->pages[i]));
35985 + data = kmap(clust->pages[i]);
35986 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
35987 + data, PAGE_CACHE_SIZE);
35988 + kunmap(clust->pages[i]);
35989 + unlock_page(clust->pages[i]);
35990 + }
35991 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
35992 + if (result)
35993 + goto error;
35994 + result = grab_coa(tc, cplug);
35995 + if (result)
35996 + goto error;
35997 + tc->len = tc->lsize = lbytes(clust->index, inode);
35998 + assert("edward-1513", tc->len == inode_cluster_size(inode));
35999 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
36000 + cplug->compress(get_coa(tc, cplug->h.id, tc->act),
36001 + tfm_input_data(clust), tc->len,
36002 + tfm_output_data(clust), &dst_len);
36003 + assert("edward-1514",
36004 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
36005 + }
36006 + finish_check_compressibility(inode, clust, cur_hint);
36007 + *compressible = data_is_compressible(dst_len,
36008 + inode_cluster_size(inode));
36009 + return 0;
36010 + error:
36011 + put_page_cluster(clust, inode, READ_OP);
36012 + return result;
36013 +}
36014 +
36015 +/* Cut disk cluster of index @idx */
36016 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
36017 +{
36018 + reiser4_key from, to;
36019 + assert("edward-1515", inode_file_plugin(inode) ==
36020 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
36021 + key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
36022 + to = from;
36023 + set_key_offset(&to,
36024 + get_key_offset(&from) + inode_cluster_size(inode) - 1);
36025 + return reiser4_cut_tree(reiser4_tree_by_inode(inode),
36026 + &from, &to, inode, 0);
36027 +}
36028 +
36029 +static int reserve_cryptcompress2unixfile(struct inode *inode)
36030 +{
36031 + reiser4_block_nr unformatted_nodes;
36032 + reiser4_tree *tree;
36033 +
36034 + tree = reiser4_tree_by_inode(inode);
36035 +
36036 + /* number of unformatted nodes which will be created */
36037 + unformatted_nodes = cluster_nrpages(inode); /* N */
36038 +
36039 + /*
36040 + * space required for one iteration of extent->tail conversion:
36041 + *
36042 + * 1. kill ctail items
36043 + *
36044 + * 2. insert N unformatted nodes
36045 + *
36046 + * 3. insert N (worst-case single-block
36047 + * extents) extent units.
36048 + *
36049 + * 4. drilling to the leaf level by coord_by_key()
36050 + *
36051 + * 5. possible update of stat-data
36052 + *
36053 + */
36054 + grab_space_enable();
36055 + return reiser4_grab_space
36056 + (2 * tree->height +
36057 + unformatted_nodes +
36058 + unformatted_nodes * estimate_one_insert_into_item(tree) +
36059 + 1 + estimate_one_insert_item(tree) +
36060 + inode_file_plugin(inode)->estimate.update(inode),
36061 + BA_CAN_COMMIT);
36062 +}
36063 +
36064 +/* clear flag that indicated conversion and update
36065 + stat-data with new (unix-file - specific) info */
36066 +static int complete_file_conversion(struct inode *inode)
36067 +{
36068 + int result;
36069 +
36070 + grab_space_enable();
36071 + result =
36072 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
36073 + BA_CAN_COMMIT);
36074 + if (result == 0) {
36075 + reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36076 + result = reiser4_update_sd(inode);
36077 + }
36078 + if (result)
36079 + warning("edward-1452",
36080 + "Converting %llu to unix-file: update sd failed (%i)",
36081 + (unsigned long long)get_inode_oid(inode), result);
36082 + return 0;
36083 +}
36084 +
36085 +
36086 +/* do conversion */
36087 +static int cryptcompress2unixfile(struct file * file, struct inode * inode,
36088 + struct cluster_handle * clust)
36089 +{
36090 + int i;
36091 + int result = 0;
36092 + struct cryptcompress_info *cr_info;
36093 + struct unix_file_info *uf_info;
36094 +
36095 + assert("edward-1516", clust->pages[0]->index == 0);
36096 + assert("edward-1517", clust->hint != NULL);
36097 +
36098 + /* release all cryptcompress-specific resources */
36099 + cr_info = cryptcompress_inode_data(inode);
36100 + result = reserve_cryptcompress2unixfile(inode);
36101 + if (result)
36102 + goto out;
36103 + reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36104 + reiser4_unset_hint(clust->hint);
36105 + result = cut_disk_cluster(inode, 0);
36106 + if (result)
36107 + goto out;
36108 + /* captured jnode of cluster and associated resources (pages,
36109 + reserved disk space) were released by ->kill_hook() method
36110 + of the item plugin */
36111 +
36112 + result = __cryptcompress2unixfile(file, inode);
36113 + if (result)
36114 + goto out;
36115 + /* At this point file is managed by unix file plugin */
36116 +
36117 + uf_info = unix_file_inode_data(inode);
36118 +
36119 + assert("edward-1518",
36120 + ergo(jprivate(clust->pages[0]),
36121 + !jnode_is_cluster_page(jprivate(clust->pages[0]))));
36122 + for(i = 0; i < clust->nr_pages; i++) {
36123 + assert("edward-1519", clust->pages[i]);
36124 + assert("edward-1520", PageUptodate(clust->pages[i]));
36125 +
36126 + result = find_or_create_extent(clust->pages[i]);
36127 + if (result)
36128 + break;
36129 + }
36130 + if (!result) {
36131 + uf_info->container = UF_CONTAINER_EXTENTS;
36132 + complete_file_conversion(inode);
36133 + }
36134 + out:
36135 + all_grabbed2free();
36136 + if (result)
36137 + warning("edward-1453", "Failed to convert file %llu: ret=%i",
36138 + (unsigned long long)get_inode_oid(inode), result);
36139 + return result;
36140 +}
36141 +
36142 +/* Check, then perform or disable conversion if needed */
36143 +int write_conversion_hook(struct file * file, struct inode * inode, loff_t pos,
36144 + struct cluster_handle * clust, int * progress)
36145 +{
36146 + int result;
36147 + int check_compress = 0;
36148 + int compressible = 0;
36149 +
36150 + if (!conversion_enabled(inode))
36151 + return 0;
36152 + result = check_position(inode, pos, clust, &check_compress);
36153 + if (result || !check_compress)
36154 + return result;
36155 + result = read_check_compressibility(inode, clust, &compressible);
36156 + if (result)
36157 + return result;
36158 +
36159 + /* At this point page cluster is grabbed and uptodate */
36160 + if (!compressible) {
36161 + result = cryptcompress2unixfile(file, inode, clust);
36162 + if (result == 0)
36163 + *progress = 1;
36164 + }
36165 + else
36166 + result = disable_conversion(inode);
36167 +
36168 + reiser4_txn_restart_current();
36169 + put_page_cluster(clust, inode, READ_OP);
36170 + return result;
36171 +}
36172 +
36173 +static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
36174 +{
36175 + return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
36176 +}
36177 +
36178 +/**
36179 + * Here are wrappers with "protection", aka Reiser4 "careful" methods.
36180 + * They are used by vfs (as methods of file_ops, inode_ops or as_ops),
36181 + * which is not aware of plugin conversion performed by Reiser4.
36182 + */
36183 +
36184 +/*
36185 + * Wrappers with active protection for:
36186 + *
36187 + * ->write();
36188 + * ->setattr();
36189 + */
36190 +
36191 +/*
36192 + * Reiser4 write "careful" method. Write a file in 2 steps:
36193 + * . start write with initial file plugin,
36194 + * switch to a new (more reasonable) file plugin (if any);
36195 + * . finish write with the new plugin.
36196 + */
36197 +ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
36198 + size_t count, loff_t *off)
36199 +{
36200 + int prot = 0;
36201 + int conv = 0;
36202 + ssize_t written_old = 0; /* bytes written with old plugin */
36203 + ssize_t written_new = 0; /* bytes written with new plugin */
36204 + struct inode * inode = file->f_dentry->d_inode;
36205 + struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
36206 +
36207 + /**
36208 + * First step.
36209 + * Sanity check: if conversion is possible,
36210 + * then protect pset.
36211 + */
36212 + if (should_protect(inode)) {
36213 + prot = 1;
36214 + down_write(guard);
36215 + }
36216 + written_old = inode_file_plugin(inode)->write(file,
36217 + buf,
36218 + count,
36219 + off, &conv);
36220 + if (prot)
36221 + up_write(guard);
36222 + if (written_old < 0 || conv == 0)
36223 + return written_old;
36224 + /**
36225 + * Conversion occurred.
36226 + * Back conversion is impossible,
36227 + * so don't protect at this step.
36228 + */
36229 + assert("edward-1532",
36230 + inode_file_plugin(inode) ==
36231 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36232 +
36233 + written_new = inode_file_plugin(inode)->write(file,
36234 + buf + written_old,
36235 + count - written_old,
36236 + off, NULL);
36237 + return written_old + (written_new < 0 ? 0 : written_new);
36238 +}
36239 +
36240 +int reiser4_setattr_careful(struct dentry *dentry, struct iattr *attr)
36241 +{
36242 + struct inode * inode = dentry->d_inode;
36243 + return PROT_ACTIVE(int, setattr, (dentry, attr),
36244 + setattr_conversion_hook(inode, attr));
36245 +}
36246 +
36247 +/* Wrappers with passive protection for:
36248 + *
36249 + * ->open();
36250 + * ->read();
36251 + * ->ioctl();
36252 + * ->mmap();
36253 + * ->release();
36254 + * ->bmap().
36255 + */
36256 +
36257 +int reiser4_open_careful(struct inode *inode, struct file *file)
36258 +{
36259 + return PROT_PASSIVE(int, open, (inode, file));
36260 +}
36261 +
36262 +ssize_t reiser4_read_careful(struct file * file, char __user * buf,
36263 + size_t size, loff_t * off)
36264 +{
36265 + struct inode * inode = file->f_dentry->d_inode;
36266 + return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
36267 +}
36268 +
36269 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36270 + unsigned int cmd, unsigned long arg)
36271 +{
36272 + return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
36273 +}
36274 +
36275 +int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
36276 +{
36277 + struct inode *inode = file->f_dentry->d_inode;
36278 + return PROT_PASSIVE(int, mmap, (file, vma));
36279 +}
36280 +
36281 +int reiser4_release_careful(struct inode *inode, struct file *file)
36282 +{
36283 + return PROT_PASSIVE(int, release, (inode, file));
36284 +}
36285 +
36286 +sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
36287 +{
36288 + struct inode *inode = mapping->host;
36289 + return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
36290 +}
36291 +
36292 +/*
36293 + Local variables:
36294 + c-indentation-style: "K&R"
36295 + mode-name: "LC"
36296 + c-basic-offset: 8
36297 + tab-width: 8
36298 + fill-column: 80
36299 + scroll-step: 1
36300 + End:
36301 +*/
36302 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/file.h linux-2.6.23/fs/reiser4/plugin/file/file.h
36303 --- linux-2.6.23.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
36304 +++ linux-2.6.23/fs/reiser4/plugin/file/file.h 2007-12-04 16:49:30.000000000 +0300
36305 @@ -0,0 +1,316 @@
36306 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
36307 + * reiser4/README */
36308 +
36309 +/* this file contains declarations of methods implementing
36310 + file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
36311 + and SYMLINK_FILE_PLUGIN_ID) */
36312 +
36313 +#if !defined( __REISER4_FILE_H__ )
36314 +#define __REISER4_FILE_H__
36315 +
36316 +/**
36317 + * Declarations of common/careful/generic methods.
36318 + * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops);
36319 + * Then common reiser4 method for foo looks like reiser4_foo_common;
36320 + * careful method looks like reiser4_foo_careful;
36321 + * generic method looks like reiser4_foo.
36322 + *
36323 + * Common method is a simple instruction set eligible for more
36324 + * than one plugin id.
36325 + *
36326 + * Generic method looks at the plugin installed in inode's
36327 + * plugin set and calls its appropriate method.
36328 + *
36329 + * Careful method looks like generic method with protected pset
36330 + * (see plugin/file/file_conversion.c for details).
36331 + */
36332 +
36333 +/* inode operations */
36334 +int reiser4_setattr_careful(struct dentry *, struct iattr *);
36335 +
36336 +/* file operations */
36337 +ssize_t reiser4_read_careful(struct file *, char __user *buf,
36338 + size_t count, loff_t *off);
36339 +ssize_t reiser4_write_careful(struct file *, const char __user *buf,
36340 + size_t count, loff_t * off);
36341 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36342 + unsigned int cmd, unsigned long arg);
36343 +int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
36344 +int reiser4_open_careful(struct inode *inode, struct file *file);
36345 +int reiser4_release_careful(struct inode *, struct file *);
36346 +int reiser4_sync_file_common(struct file *, struct dentry *, int datasync);
36347 +
36348 +/* address space operations */
36349 +int reiser4_readpage(struct file *, struct page *);
36350 +int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
36351 + unsigned);
36352 +int reiser4_writepages(struct address_space *, struct writeback_control *);
36353 +int reiser4_prepare_write(struct file *, struct page *, unsigned from,
36354 + unsigned to);
36355 +int reiser4_commit_write(struct file *, struct page *, unsigned from,
36356 + unsigned to);
36357 +sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
36358 +
36359 +/*
36360 + * Private methods of unix-file plugin
36361 + * (UNIX_FILE_PLUGIN_ID)
36362 + */
36363 +
36364 +/* private inode operations */
36365 +int setattr_unix_file(struct dentry *, struct iattr *);
36366 +
36367 +/* private file operations */
36368 +
36369 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
36370 + loff_t *off);
36371 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
36372 + loff_t * off, int * conv);
36373 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
36374 + unsigned long arg);
36375 +int mmap_unix_file(struct file *, struct vm_area_struct *);
36376 +int open_unix_file(struct inode *, struct file *);
36377 +int release_unix_file(struct inode *, struct file *);
36378 +
36379 +/* private address space operations */
36380 +int readpage_unix_file(struct file *, struct page *);
36381 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
36382 +int writepages_unix_file(struct address_space *, struct writeback_control *);
36383 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
36384 + unsigned to);
36385 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
36386 + unsigned to);
36387 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
36388 +
36389 +/* other private methods */
36390 +int delete_object_unix_file(struct inode *);
36391 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
36392 + int user, loff_t, loff_t, rw_op, flow_t *);
36393 +int owns_item_unix_file(const struct inode *, const coord_t *);
36394 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
36395 + int create);
36396 +
36397 +/*
36398 + * Private methods of cryptcompress file plugin
36399 + * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
36400 + */
36401 +
36402 +/* private inode operations */
36403 +int setattr_cryptcompress(struct dentry *, struct iattr *);
36404 +
36405 +/* private file operations */
36406 +ssize_t read_cryptcompress(struct file *, char __user *buf,
36407 + size_t count, loff_t *off);
36408 +ssize_t write_cryptcompress(struct file *, const char __user *buf,
36409 + size_t count, loff_t * off, int *conv);
36410 +int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
36411 + unsigned long arg);
36412 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
36413 +int open_cryptcompress(struct inode *, struct file *);
36414 +int release_cryptcompress(struct inode *, struct file *);
36415 +
36416 +/* private address space operations */
36417 +int readpage_cryptcompress(struct file *, struct page *);
36418 +int readpages_cryptcompress(struct file*, struct address_space*,
36419 + struct list_head*, unsigned);
36420 +int writepages_cryptcompress(struct address_space *,
36421 + struct writeback_control *);
36422 +int prepare_write_cryptcompress(struct file *, struct page *, unsigned from,
36423 + unsigned to);
36424 +int commit_write_cryptcompress(struct file *, struct page *, unsigned from,
36425 + unsigned to);
36426 +sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
36427 +
36428 +/* other private methods */
36429 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
36430 + int user, loff_t, loff_t, rw_op, flow_t *);
36431 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
36432 +int create_object_cryptcompress(struct inode *, struct inode *,
36433 + reiser4_object_create_data *);
36434 +int delete_object_cryptcompress(struct inode *);
36435 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
36436 + int create);
36437 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
36438 + const reiser4_key * to_key,
36439 + reiser4_key * smallest_removed,
36440 + struct inode *object, int truncate,
36441 + int *progress);
36442 +void destroy_inode_cryptcompress(struct inode *);
36443 +
36444 +/*
36445 + * Private methods of symlink file plugin
36446 + * (SYMLINK_FILE_PLUGIN_ID)
36447 + */
36448 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
36449 + reiser4_object_create_data *);
36450 +void destroy_inode_symlink(struct inode *);
36451 +
36452 +/*
36453 + * all the write into unix file is performed by item write method. Write method
36454 + * of unix file plugin only decides which item plugin (extent or tail) and in
36455 + * which mode (one from the enum below) to call
36456 + */
36457 +typedef enum {
36458 + FIRST_ITEM = 1,
36459 + APPEND_ITEM = 2,
36460 + OVERWRITE_ITEM = 3
36461 +} write_mode_t;
36462 +
36463 +/* unix file may be in one the following states */
36464 +typedef enum {
36465 + UF_CONTAINER_UNKNOWN = 0,
36466 + UF_CONTAINER_TAILS = 1,
36467 + UF_CONTAINER_EXTENTS = 2,
36468 + UF_CONTAINER_EMPTY = 3
36469 +} file_container_t;
36470 +
36471 +struct formatting_plugin;
36472 +struct inode;
36473 +
36474 +/* unix file plugin specific part of reiser4 inode */
36475 +struct unix_file_info {
36476 + /*
36477 + * this read-write lock protects file containerization change. Accesses
36478 + * which do not change file containerization (see file_container_t)
36479 + * (read, readpage, writepage, write (until tail conversion is
36480 + * involved)) take read-lock. Accesses which modify file
36481 + * containerization (truncate, conversion from tail to extent and back)
36482 + * take write-lock.
36483 + */
36484 + struct rw_semaphore latch;
36485 + /* this enum specifies which items are used to build the file */
36486 + file_container_t container;
36487 + /*
36488 + * plugin which controls when file is to be converted to extents and
36489 + * back to tail
36490 + */
36491 + struct formatting_plugin *tplug;
36492 + /* if this is set, file is in exclusive use */
36493 + int exclusive_use;
36494 +#if REISER4_DEBUG
36495 + /* pointer to task struct of thread owning exclusive access to file */
36496 + void *ea_owner;
36497 + atomic_t nr_neas;
36498 + void *last_reader;
36499 +#endif
36500 +};
36501 +
36502 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
36503 +void get_exclusive_access(struct unix_file_info *);
36504 +void drop_exclusive_access(struct unix_file_info *);
36505 +void get_nonexclusive_access(struct unix_file_info *);
36506 +void drop_nonexclusive_access(struct unix_file_info *);
36507 +int try_to_get_nonexclusive_access(struct unix_file_info *);
36508 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
36509 + struct inode *);
36510 +int find_file_item_nohint(coord_t *, lock_handle *,
36511 + const reiser4_key *, znode_lock_mode,
36512 + struct inode *);
36513 +
36514 +int load_file_hint(struct file *, hint_t *);
36515 +void save_file_hint(struct file *, const hint_t *);
36516 +
36517 +#include "../item/extent.h"
36518 +#include "../item/tail.h"
36519 +#include "../item/ctail.h"
36520 +
36521 +struct uf_coord {
36522 + coord_t coord;
36523 + lock_handle *lh;
36524 + int valid;
36525 + union {
36526 + struct extent_coord_extension extent;
36527 + struct tail_coord_extension tail;
36528 + struct ctail_coord_extension ctail;
36529 + } extension;
36530 +};
36531 +
36532 +#include "../../forward.h"
36533 +#include "../../seal.h"
36534 +#include "../../lock.h"
36535 +
36536 +/*
36537 + * This structure is used to speed up file operations (reads and writes). A
36538 + * hint is a suggestion about where a key resolved to last time. A seal
36539 + * indicates whether a node has been modified since a hint was last recorded.
36540 + * You check the seal, and if the seal is still valid, you can use the hint
36541 + * without traversing the tree again.
36542 + */
36543 +struct hint {
36544 + seal_t seal; /* a seal over last file item accessed */
36545 + uf_coord_t ext_coord;
36546 + loff_t offset;
36547 + znode_lock_mode mode;
36548 + lock_handle lh;
36549 +};
36550 +
36551 +static inline int hint_is_valid(hint_t * hint)
36552 +{
36553 + return hint->ext_coord.valid;
36554 +}
36555 +
36556 +static inline void hint_set_valid(hint_t * hint)
36557 +{
36558 + hint->ext_coord.valid = 1;
36559 +}
36560 +
36561 +static inline void hint_clr_valid(hint_t * hint)
36562 +{
36563 + hint->ext_coord.valid = 0;
36564 +}
36565 +
36566 +int load_file_hint(struct file *, hint_t *);
36567 +void save_file_hint(struct file *, const hint_t *);
36568 +void hint_init_zero(hint_t *);
36569 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
36570 +int hint_is_set(const hint_t *);
36571 +void reiser4_unset_hint(hint_t *);
36572 +
36573 +int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
36574 +int cut_file_items(struct inode *, loff_t new_size, int update_sd,
36575 + loff_t cur_size, int (*update_actor) (struct inode *,
36576 + reiser4_key *, int));
36577 +#if REISER4_DEBUG
36578 +
36579 +/* return 1 is exclusive access is obtained, 0 - otherwise */
36580 +static inline int ea_obtained(struct unix_file_info * uf_info)
36581 +{
36582 + int ret;
36583 +
36584 + ret = down_read_trylock(&uf_info->latch);
36585 + if (ret)
36586 + up_read(&uf_info->latch);
36587 + return !ret;
36588 +}
36589 +
36590 +#endif
36591 +
36592 +#define WRITE_GRANULARITY 32
36593 +
36594 +int tail2extent(struct unix_file_info *);
36595 +int extent2tail(struct file *, struct unix_file_info *);
36596 +
36597 +int goto_right_neighbor(coord_t *, lock_handle *);
36598 +int find_or_create_extent(struct page *);
36599 +int equal_to_ldk(znode *, const reiser4_key *);
36600 +
36601 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
36602 +
36603 +static inline int cbk_errored(int cbk_result)
36604 +{
36605 + return (cbk_result != CBK_COORD_NOTFOUND
36606 + && cbk_result != CBK_COORD_FOUND);
36607 +}
36608 +
36609 +/* __REISER4_FILE_H__ */
36610 +#endif
36611 +
36612 +/*
36613 + * Local variables:
36614 + * c-indentation-style: "K&R"
36615 + * mode-name: "LC"
36616 + * c-basic-offset: 8
36617 + * tab-width: 8
36618 + * fill-column: 79
36619 + * scroll-step: 1
36620 + * End:
36621 +*/
36622 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/Makefile linux-2.6.23/fs/reiser4/plugin/file/Makefile
36623 --- linux-2.6.23.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
36624 +++ linux-2.6.23/fs/reiser4/plugin/file/Makefile 2007-12-04 16:49:30.000000000 +0300
36625 @@ -0,0 +1,7 @@
36626 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
36627 +
36628 +file_plugins-objs := \
36629 + file.o \
36630 + tail_conversion.o \
36631 + symlink.o \
36632 + cryptcompress.o
36633 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.23/fs/reiser4/plugin/file/symfile.c
36634 --- linux-2.6.23.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
36635 +++ linux-2.6.23/fs/reiser4/plugin/file/symfile.c 2007-12-04 16:49:30.000000000 +0300
36636 @@ -0,0 +1,87 @@
36637 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36638 +
36639 +/* Symfiles are a generalization of Unix symlinks.
36640 +
36641 + A symfile when read behaves as though you took its contents and
36642 + substituted them into the reiser4 naming system as the right hand side
36643 + of an assignment, and then read that which you had assigned to it.
36644 +
36645 + A key issue for symfiles is how to implement writes through to
36646 + subfiles. In general, one must have some method of determining what
36647 + of that which is written to the symfile is written to what subfile.
36648 + This can be done by use of custom plugin methods written by users, or
36649 + by using a few general methods we provide for those willing to endure
36650 + the insertion of delimiters into what is read.
36651 +
36652 + Writing to symfiles without delimiters to denote what is written to
36653 + what subfile is not supported by any plugins we provide in this
36654 + release. Our most sophisticated support for writes is that embodied
36655 + by the invert plugin (see invert.c).
36656 +
36657 + A read only version of the /etc/passwd file might be
36658 + constructed as a symfile whose contents are as follows:
36659 +
36660 + /etc/passwd/userlines/*
36661 +
36662 + or
36663 +
36664 + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
36665 +
36666 + or
36667 +
36668 + /etc/passwd/userlines/(demidov+edward+reiser+root)
36669 +
36670 + A symfile with contents
36671 +
36672 + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
36673 +
36674 + will return when read
36675 +
36676 + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
36677 +
36678 + and write of what has been read will not be possible to implement as
36679 + an identity operation because there are no delimiters denoting the
36680 + boundaries of what is to be written to what subfile.
36681 +
36682 + Note that one could make this a read/write symfile if one specified
36683 + delimiters, and the write method understood those delimiters delimited
36684 + what was written to subfiles.
36685 +
36686 + So, specifying the symfile in a manner that allows writes:
36687 +
36688 + /etc/passwd/userlines/demidov+"(
36689 + )+/etc/passwd/userlines/edward+"(
36690 + )+/etc/passwd/userlines/reiser+"(
36691 + )+/etc/passwd/userlines/root+"(
36692 + )
36693 +
36694 + or
36695 +
36696 + /etc/passwd/userlines/(demidov+"(
36697 + )+edward+"(
36698 + )+reiser+"(
36699 + )+root+"(
36700 + ))
36701 +
36702 + and the file demidov might be specified as:
36703 +
36704 + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
36705 +
36706 + or
36707 +
36708 + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
36709 +
36710 + Notice that if the file demidov has a carriage return in it, the
36711 + parsing fails, but then if you put carriage returns in the wrong place
36712 + in a normal /etc/passwd file it breaks things also.
36713 +
36714 + Note that it is forbidden to have no text between two interpolations
36715 + if one wants to be able to define what parts of a write go to what
36716 + subfiles referenced in an interpolation.
36717 +
36718 + If one wants to be able to add new lines by writing to the file, one
36719 + must either write a custom plugin for /etc/passwd that knows how to
36720 + name an added line, or one must use an invert, or one must use a more
36721 + sophisticated symfile syntax that we are not planning to write for
36722 + version 4.0.
36723 +*/
36724 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.23/fs/reiser4/plugin/file/symlink.c
36725 --- linux-2.6.23.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
36726 +++ linux-2.6.23/fs/reiser4/plugin/file/symlink.c 2007-12-04 16:49:30.000000000 +0300
36727 @@ -0,0 +1,95 @@
36728 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
36729 +
36730 +#include "../../inode.h"
36731 +
36732 +#include <linux/types.h>
36733 +#include <linux/fs.h>
36734 +
36735 +/* file plugin methods specific for symlink files
36736 + (SYMLINK_FILE_PLUGIN_ID) */
36737 +
36738 +/* this is implementation of create_object method of file plugin for
36739 + SYMLINK_FILE_PLUGIN_ID
36740 + */
36741 +
36742 +/**
36743 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
36744 + * @symlink: inode of symlink object
36745 + * @dir: inode of parent directory
36746 + * @info: parameters of new object
36747 + *
36748 + * Inserts stat data with symlink extension where into the tree.
36749 + */
36750 +int reiser4_create_symlink(struct inode *symlink,
36751 + struct inode *dir UNUSED_ARG,
36752 + reiser4_object_create_data *data /* info passed to us
36753 + * this is filled by
36754 + * reiser4() syscall
36755 + * in particular */)
36756 +{
36757 + int result;
36758 +
36759 + assert("nikita-680", symlink != NULL);
36760 + assert("nikita-681", S_ISLNK(symlink->i_mode));
36761 + assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
36762 + assert("nikita-682", dir != NULL);
36763 + assert("nikita-684", data != NULL);
36764 + assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
36765 +
36766 + /*
36767 + * stat data of symlink has symlink extension in which we store
36768 + * symlink content, that is, path symlink is pointing to.
36769 + */
36770 + reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
36771 +
36772 + assert("vs-838", symlink->i_private == NULL);
36773 + symlink->i_private = (void *)data->name;
36774 +
36775 + assert("vs-843", symlink->i_size == 0);
36776 + INODE_SET_FIELD(symlink, i_size, strlen(data->name));
36777 +
36778 + /* insert stat data appended with data->name */
36779 + result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
36780 + if (result) {
36781 + /* FIXME-VS: Make sure that symlink->i_private is not attached
36782 + to kmalloced data */
36783 + INODE_SET_FIELD(symlink, i_size, 0);
36784 + } else {
36785 + assert("vs-849", symlink->i_private
36786 + && reiser4_inode_get_flag(symlink,
36787 + REISER4_GENERIC_PTR_USED));
36788 + assert("vs-850",
36789 + !memcmp((char *)symlink->i_private, data->name,
36790 + (size_t) symlink->i_size + 1));
36791 + }
36792 + return result;
36793 +}
36794 +
36795 +/* this is implementation of destroy_inode method of file plugin for
36796 + SYMLINK_FILE_PLUGIN_ID
36797 + */
36798 +void destroy_inode_symlink(struct inode *inode)
36799 +{
36800 + assert("edward-799",
36801 + inode_file_plugin(inode) ==
36802 + file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
36803 + assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
36804 + assert("edward-801", reiser4_inode_get_flag(inode,
36805 + REISER4_GENERIC_PTR_USED));
36806 + assert("vs-839", S_ISLNK(inode->i_mode));
36807 +
36808 + kfree(inode->i_private);
36809 + inode->i_private = NULL;
36810 + reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
36811 +}
36812 +
36813 +/*
36814 + Local variables:
36815 + c-indentation-style: "K&R"
36816 + mode-name: "LC"
36817 + c-basic-offset: 8
36818 + tab-width: 8
36819 + fill-column: 80
36820 + scroll-step: 1
36821 + End:
36822 +*/
36823 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.23/fs/reiser4/plugin/file/tail_conversion.c
36824 --- linux-2.6.23.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
36825 +++ linux-2.6.23/fs/reiser4/plugin/file/tail_conversion.c 2007-12-04 16:49:30.000000000 +0300
36826 @@ -0,0 +1,726 @@
36827 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36828 +
36829 +#include "../../inode.h"
36830 +#include "../../super.h"
36831 +#include "../../page_cache.h"
36832 +#include "../../carry.h"
36833 +#include "../../safe_link.h"
36834 +#include "../../vfs_ops.h"
36835 +
36836 +#include <linux/writeback.h>
36837 +
36838 +/* this file contains:
36839 + tail2extent and extent2tail */
36840 +
36841 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
36842 +void get_exclusive_access(struct unix_file_info * uf_info)
36843 +{
36844 + assert("nikita-3028", reiser4_schedulable());
36845 + assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
36846 + assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
36847 + /*
36848 + * "deadlock avoidance": sometimes we commit a transaction under
36849 + * rw-semaphore on a file. Such commit can deadlock with another
36850 + * thread that captured some block (hence preventing atom from being
36851 + * committed) and waits on rw-semaphore.
36852 + */
36853 + reiser4_txn_restart_current();
36854 + LOCK_CNT_INC(inode_sem_w);
36855 + down_write(&uf_info->latch);
36856 + uf_info->exclusive_use = 1;
36857 + assert("vs-1713", uf_info->ea_owner == NULL);
36858 + assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
36859 + ON_DEBUG(uf_info->ea_owner = current);
36860 +}
36861 +
36862 +void drop_exclusive_access(struct unix_file_info * uf_info)
36863 +{
36864 + assert("vs-1714", uf_info->ea_owner == current);
36865 + assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
36866 + ON_DEBUG(uf_info->ea_owner = NULL);
36867 + uf_info->exclusive_use = 0;
36868 + up_write(&uf_info->latch);
36869 + assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
36870 + assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
36871 + LOCK_CNT_DEC(inode_sem_w);
36872 + reiser4_txn_restart_current();
36873 +}
36874 +
36875 +/**
36876 + * nea_grabbed - do something when file semaphore is down_read-ed
36877 + * @uf_info:
36878 + *
36879 + * This is called when nonexclisive access is obtained on file. All it does is
36880 + * for debugging purposes.
36881 + */
36882 +static void nea_grabbed(struct unix_file_info *uf_info)
36883 +{
36884 +#if REISER4_DEBUG
36885 + LOCK_CNT_INC(inode_sem_r);
36886 + assert("vs-1716", uf_info->ea_owner == NULL);
36887 + atomic_inc(&uf_info->nr_neas);
36888 + uf_info->last_reader = current;
36889 +#endif
36890 +}
36891 +
36892 +/**
36893 + * get_nonexclusive_access - get nonexclusive access to a file
36894 + * @uf_info: unix file specific part of inode to obtain access to
36895 + *
36896 + * Nonexclusive access is obtained on a file before read, write, readpage.
36897 + */
36898 +void get_nonexclusive_access(struct unix_file_info *uf_info)
36899 +{
36900 + assert("nikita-3029", reiser4_schedulable());
36901 + assert("nikita-3361", get_current_context()->trans->atom == NULL);
36902 +
36903 + down_read(&uf_info->latch);
36904 + nea_grabbed(uf_info);
36905 +}
36906 +
36907 +/**
36908 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
36909 + * @uf_info: unix file specific part of inode to obtain access to
36910 + *
36911 + * Non-blocking version of nonexclusive access obtaining.
36912 + */
36913 +int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
36914 +{
36915 + int result;
36916 +
36917 + result = down_read_trylock(&uf_info->latch);
36918 + if (result)
36919 + nea_grabbed(uf_info);
36920 + return result;
36921 +}
36922 +
36923 +void drop_nonexclusive_access(struct unix_file_info * uf_info)
36924 +{
36925 + assert("vs-1718", uf_info->ea_owner == NULL);
36926 + assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
36927 + ON_DEBUG(atomic_dec(&uf_info->nr_neas));
36928 +
36929 + up_read(&uf_info->latch);
36930 +
36931 + LOCK_CNT_DEC(inode_sem_r);
36932 + reiser4_txn_restart_current();
36933 +}
36934 +
36935 +/* part of tail2extent. Cut all items covering @count bytes starting from
36936 + @offset */
36937 +/* Audited by: green(2002.06.15) */
36938 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
36939 +{
36940 + reiser4_key from, to;
36941 +
36942 + /* AUDIT: How about putting an assertion here, what would check
36943 + all provided range is covered by tail items only? */
36944 + /* key of first byte in the range to be cut */
36945 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
36946 +
36947 + /* key of last byte in that range */
36948 + to = from;
36949 + set_key_offset(&to, (__u64) (offset + count - 1));
36950 +
36951 + /* cut everything between those keys */
36952 + return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
36953 + inode, 0);
36954 +}
36955 +
36956 +static void release_all_pages(struct page **pages, unsigned nr_pages)
36957 +{
36958 + unsigned i;
36959 +
36960 + for (i = 0; i < nr_pages; i++) {
36961 + if (pages[i] == NULL) {
36962 + unsigned j;
36963 + for (j = i + 1; j < nr_pages; j++)
36964 + assert("vs-1620", pages[j] == NULL);
36965 + break;
36966 + }
36967 + page_cache_release(pages[i]);
36968 + pages[i] = NULL;
36969 + }
36970 +}
36971 +
36972 +/* part of tail2extent. replace tail items with extent one. Content of tail
36973 + items (@count bytes) being cut are copied already into
36974 + pages. extent_writepage method is called to create extents corresponding to
36975 + those pages */
36976 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
36977 +{
36978 + int result;
36979 + unsigned i;
36980 + STORE_COUNTERS;
36981 +
36982 + if (nr_pages == 0)
36983 + return 0;
36984 +
36985 + assert("vs-596", pages[0]);
36986 +
36987 + /* cut copied items */
36988 + result = cut_formatting_items(inode, page_offset(pages[0]), count);
36989 + if (result)
36990 + return result;
36991 +
36992 + CHECK_COUNTERS;
36993 +
36994 + /* put into tree replacement for just removed items: extent item, namely */
36995 + for (i = 0; i < nr_pages; i++) {
36996 + result = add_to_page_cache_lru(pages[i], inode->i_mapping,
36997 + pages[i]->index,
36998 + mapping_gfp_mask(inode->
36999 + i_mapping));
37000 + if (result)
37001 + break;
37002 + unlock_page(pages[i]);
37003 + result = find_or_create_extent(pages[i]);
37004 + if (result)
37005 + break;
37006 + SetPageUptodate(pages[i]);
37007 + }
37008 + return result;
37009 +}
37010 +
37011 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
37012 + * items */
37013 +
37014 +static int reserve_tail2extent_iteration(struct inode *inode)
37015 +{
37016 + reiser4_block_nr unformatted_nodes;
37017 + reiser4_tree *tree;
37018 +
37019 + tree = reiser4_tree_by_inode(inode);
37020 +
37021 + /* number of unformatted nodes which will be created */
37022 + unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
37023 +
37024 + /*
37025 + * space required for one iteration of extent->tail conversion:
37026 + *
37027 + * 1. kill N tail items
37028 + *
37029 + * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
37030 + *
37031 + * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
37032 + * extents) extent units.
37033 + *
37034 + * 4. drilling to the leaf level by coord_by_key()
37035 + *
37036 + * 5. possible update of stat-data
37037 + *
37038 + */
37039 + grab_space_enable();
37040 + return reiser4_grab_space
37041 + (2 * tree->height +
37042 + TAIL2EXTENT_PAGE_NUM +
37043 + TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
37044 + 1 + estimate_one_insert_item(tree) +
37045 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37046 +}
37047 +
37048 +/* clear stat data's flag indicating that conversion is being converted */
37049 +static int complete_conversion(struct inode *inode)
37050 +{
37051 + int result;
37052 +
37053 + grab_space_enable();
37054 + result =
37055 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
37056 + BA_CAN_COMMIT);
37057 + if (result == 0) {
37058 + reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
37059 + result = reiser4_update_sd(inode);
37060 + }
37061 + if (result)
37062 + warning("vs-1696", "Failed to clear converting bit of %llu: %i",
37063 + (unsigned long long)get_inode_oid(inode), result);
37064 + return 0;
37065 +}
37066 +
37067 +/**
37068 + * find_start
37069 + * @inode:
37070 + * @id:
37071 + * @offset:
37072 + *
37073 + * this is used by tail2extent and extent2tail to detect where previous
37074 + * uncompleted conversion stopped
37075 + */
37076 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
37077 +{
37078 + int result;
37079 + lock_handle lh;
37080 + coord_t coord;
37081 + struct unix_file_info *ufo;
37082 + int found;
37083 + reiser4_key key;
37084 +
37085 + ufo = unix_file_inode_data(inode);
37086 + init_lh(&lh);
37087 + result = 0;
37088 + found = 0;
37089 + inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
37090 + do {
37091 + init_lh(&lh);
37092 + result = find_file_item_nohint(&coord, &lh, &key,
37093 + ZNODE_READ_LOCK, inode);
37094 +
37095 + if (result == CBK_COORD_FOUND) {
37096 + if (coord.between == AT_UNIT) {
37097 + /*coord_clear_iplug(&coord); */
37098 + result = zload(coord.node);
37099 + if (result == 0) {
37100 + if (item_id_by_coord(&coord) == id)
37101 + found = 1;
37102 + else
37103 + item_plugin_by_coord(&coord)->s.
37104 + file.append_key(&coord,
37105 + &key);
37106 + zrelse(coord.node);
37107 + }
37108 + } else
37109 + result = RETERR(-ENOENT);
37110 + }
37111 + done_lh(&lh);
37112 + } while (result == 0 && !found);
37113 + *offset = get_key_offset(&key);
37114 + return result;
37115 +}
37116 +
37117 +/**
37118 + * tail2extent
37119 + * @uf_info:
37120 + *
37121 + *
37122 + */
37123 +int tail2extent(struct unix_file_info *uf_info)
37124 +{
37125 + int result;
37126 + reiser4_key key; /* key of next byte to be moved to page */
37127 + char *p_data; /* data of page */
37128 + unsigned page_off = 0, /* offset within the page where to copy data */
37129 + count; /* number of bytes of item which can be
37130 + * copied to page */
37131 + struct page *pages[TAIL2EXTENT_PAGE_NUM];
37132 + struct page *page;
37133 + int done; /* set to 1 when all file is read */
37134 + char *item;
37135 + int i;
37136 + struct inode *inode;
37137 + int first_iteration;
37138 + int bytes;
37139 + __u64 offset;
37140 +
37141 + assert("nikita-3362", ea_obtained(uf_info));
37142 + inode = unix_file_info_to_inode(uf_info);
37143 + assert("nikita-3412", !IS_RDONLY(inode));
37144 + assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
37145 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37146 +
37147 + offset = 0;
37148 + first_iteration = 1;
37149 + result = 0;
37150 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37151 + /*
37152 + * file is marked on disk as there was a conversion which did
37153 + * not complete due to either crash or some error. Find which
37154 + * offset tail conversion stopped at
37155 + */
37156 + result = find_start(inode, FORMATTING_ID, &offset);
37157 + if (result == -ENOENT) {
37158 + /* no tail items found, everything is converted */
37159 + uf_info->container = UF_CONTAINER_EXTENTS;
37160 + complete_conversion(inode);
37161 + return 0;
37162 + } else if (result != 0)
37163 + /* some other error */
37164 + return result;
37165 + first_iteration = 0;
37166 + }
37167 +
37168 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37169 +
37170 + /* get key of first byte of a file */
37171 + inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
37172 +
37173 + done = 0;
37174 + while (done == 0) {
37175 + memset(pages, 0, sizeof(pages));
37176 + result = reserve_tail2extent_iteration(inode);
37177 + if (result != 0)
37178 + goto out;
37179 + if (first_iteration) {
37180 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37181 + reiser4_update_sd(inode);
37182 + first_iteration = 0;
37183 + }
37184 + bytes = 0;
37185 + for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
37186 + assert("vs-598",
37187 + (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
37188 + page = alloc_page(reiser4_ctx_gfp_mask_get());
37189 + if (!page) {
37190 + result = RETERR(-ENOMEM);
37191 + goto error;
37192 + }
37193 +
37194 + page->index =
37195 + (unsigned long)(get_key_offset(&key) >>
37196 + PAGE_CACHE_SHIFT);
37197 + /*
37198 + * usually when one is going to longterm lock znode (as
37199 + * find_file_item does, for instance) he must not hold
37200 + * locked pages. However, there is an exception for
37201 + * case tail2extent. Pages appearing here are not
37202 + * reachable to everyone else, they are clean, they do
37203 + * not have jnodes attached so keeping them locked do
37204 + * not risk deadlock appearance
37205 + */
37206 + assert("vs-983", !PagePrivate(page));
37207 + reiser4_invalidate_pages(inode->i_mapping, page->index,
37208 + 1, 0);
37209 +
37210 + for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
37211 + coord_t coord;
37212 + lock_handle lh;
37213 +
37214 + /* get next item */
37215 + /* FIXME: we might want to readahead here */
37216 + init_lh(&lh);
37217 + result =
37218 + find_file_item_nohint(&coord, &lh, &key,
37219 + ZNODE_READ_LOCK,
37220 + inode);
37221 + if (result != CBK_COORD_FOUND) {
37222 + /*
37223 + * error happened of not items of file
37224 + * were found
37225 + */
37226 + done_lh(&lh);
37227 + page_cache_release(page);
37228 + goto error;
37229 + }
37230 +
37231 + if (coord.between == AFTER_UNIT) {
37232 + /*
37233 + * end of file is reached. Padd page
37234 + * with zeros
37235 + */
37236 + done_lh(&lh);
37237 + done = 1;
37238 + p_data = kmap_atomic(page, KM_USER0);
37239 + memset(p_data + page_off, 0,
37240 + PAGE_CACHE_SIZE - page_off);
37241 + kunmap_atomic(p_data, KM_USER0);
37242 + break;
37243 + }
37244 +
37245 + result = zload(coord.node);
37246 + if (result) {
37247 + page_cache_release(page);
37248 + done_lh(&lh);
37249 + goto error;
37250 + }
37251 + assert("vs-856", coord.between == AT_UNIT);
37252 + item = ((char *)item_body_by_coord(&coord)) +
37253 + coord.unit_pos;
37254 +
37255 + /* how many bytes to copy */
37256 + count =
37257 + item_length_by_coord(&coord) -
37258 + coord.unit_pos;
37259 + /* limit length of copy to end of page */
37260 + if (count > PAGE_CACHE_SIZE - page_off)
37261 + count = PAGE_CACHE_SIZE - page_off;
37262 +
37263 + /*
37264 + * copy item (as much as will fit starting from
37265 + * the beginning of the item) into the page
37266 + */
37267 + p_data = kmap_atomic(page, KM_USER0);
37268 + memcpy(p_data + page_off, item, count);
37269 + kunmap_atomic(p_data, KM_USER0);
37270 +
37271 + page_off += count;
37272 + bytes += count;
37273 + set_key_offset(&key,
37274 + get_key_offset(&key) + count);
37275 +
37276 + zrelse(coord.node);
37277 + done_lh(&lh);
37278 + } /* end of loop which fills one page by content of
37279 + * formatting items */
37280 +
37281 + if (page_off) {
37282 + /* something was copied into page */
37283 + pages[i] = page;
37284 + } else {
37285 + page_cache_release(page);
37286 + assert("vs-1648", done == 1);
37287 + break;
37288 + }
37289 + } /* end of loop through pages of one conversion iteration */
37290 +
37291 + if (i > 0) {
37292 + result = replace(inode, pages, i, bytes);
37293 + release_all_pages(pages, sizeof_array(pages));
37294 + if (result)
37295 + goto error;
37296 + /*
37297 + * We have to drop exclusive access to avoid deadlock
37298 + * which may happen because called by reiser4_writepages
37299 + * capture_unix_file requires to get non-exclusive
37300 + * access to a file. It is safe to drop EA in the middle
37301 + * of tail2extent conversion because write_unix_file,
37302 + * setattr_unix_file(truncate), mmap_unix_file,
37303 + * release_unix_file(extent2tail) checks if conversion
37304 + * is not in progress (see comments before
37305 + * get_exclusive_access_careful().
37306 + * Other processes that acquire non-exclusive access
37307 + * (read_unix_file, reiser4_writepages, etc) should work
37308 + * on partially converted files.
37309 + */
37310 + drop_exclusive_access(uf_info);
37311 + /* throttle the conversion */
37312 + reiser4_throttle_write(inode);
37313 + get_exclusive_access(uf_info);
37314 +
37315 + /*
37316 + * nobody is allowed to complete conversion but a
37317 + * process which started it
37318 + */
37319 + assert("", reiser4_inode_get_flag(inode,
37320 + REISER4_PART_MIXED));
37321 + }
37322 + }
37323 +
37324 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37325 +
37326 + if (result == 0) {
37327 + /* file is converted to extent items */
37328 + assert("vs-1697", reiser4_inode_get_flag(inode,
37329 + REISER4_PART_MIXED));
37330 +
37331 + uf_info->container = UF_CONTAINER_EXTENTS;
37332 + complete_conversion(inode);
37333 + } else {
37334 + /*
37335 + * conversion is not complete. Inode was already marked as
37336 + * REISER4_PART_CONV and stat-data were updated at the first
37337 + * iteration of the loop above.
37338 + */
37339 + error:
37340 + release_all_pages(pages, sizeof_array(pages));
37341 + warning("nikita-2282", "Partial conversion of %llu: %i",
37342 + (unsigned long long)get_inode_oid(inode), result);
37343 + }
37344 +
37345 + out:
37346 + return result;
37347 +}
37348 +
37349 +static int reserve_extent2tail_iteration(struct inode *inode)
37350 +{
37351 + reiser4_tree *tree;
37352 +
37353 + tree = reiser4_tree_by_inode(inode);
37354 + /*
37355 + * reserve blocks for (in this order):
37356 + *
37357 + * 1. removal of extent item
37358 + *
37359 + * 2. insertion of tail by insert_flow()
37360 + *
37361 + * 3. drilling to the leaf level by coord_by_key()
37362 + *
37363 + * 4. possible update of stat-data
37364 + */
37365 + grab_space_enable();
37366 + return reiser4_grab_space
37367 + (estimate_one_item_removal(tree) +
37368 + estimate_insert_flow(tree->height) +
37369 + 1 + estimate_one_insert_item(tree) +
37370 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37371 +}
37372 +
37373 +/* for every page of file: read page, cut part of extent pointing to this page,
37374 + put data of page tree by tail item */
37375 +int extent2tail(struct file * file, struct unix_file_info *uf_info)
37376 +{
37377 + int result;
37378 + struct inode *inode;
37379 + struct page *page;
37380 + unsigned long num_pages, i;
37381 + unsigned long start_page;
37382 + reiser4_key from;
37383 + reiser4_key to;
37384 + unsigned count;
37385 + __u64 offset;
37386 +
37387 + assert("nikita-3362", ea_obtained(uf_info));
37388 + inode = unix_file_info_to_inode(uf_info);
37389 + assert("nikita-3412", !IS_RDONLY(inode));
37390 + assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
37391 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37392 +
37393 + offset = 0;
37394 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37395 + /*
37396 + * file is marked on disk as there was a conversion which did
37397 + * not complete due to either crash or some error. Find which
37398 + * offset tail conversion stopped at
37399 + */
37400 + result = find_start(inode, EXTENT_POINTER_ID, &offset);
37401 + if (result == -ENOENT) {
37402 + /* no extent found, everything is converted */
37403 + uf_info->container = UF_CONTAINER_TAILS;
37404 + complete_conversion(inode);
37405 + return 0;
37406 + } else if (result != 0)
37407 + /* some other error */
37408 + return result;
37409 + }
37410 +
37411 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37412 +
37413 + /* number of pages in the file */
37414 + num_pages =
37415 + (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37416 + start_page = offset >> PAGE_CACHE_SHIFT;
37417 +
37418 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37419 + to = from;
37420 +
37421 + result = 0;
37422 + for (i = 0; i < num_pages; i++) {
37423 + __u64 start_byte;
37424 +
37425 + result = reserve_extent2tail_iteration(inode);
37426 + if (result != 0)
37427 + break;
37428 + if (i == 0 && offset == 0) {
37429 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37430 + reiser4_update_sd(inode);
37431 + }
37432 +
37433 + page = read_mapping_page(inode->i_mapping,
37434 + (unsigned)(i + start_page), NULL);
37435 + if (IS_ERR(page)) {
37436 + result = PTR_ERR(page);
37437 + break;
37438 + }
37439 +
37440 + wait_on_page_locked(page);
37441 +
37442 + if (!PageUptodate(page)) {
37443 + page_cache_release(page);
37444 + result = RETERR(-EIO);
37445 + break;
37446 + }
37447 +
37448 + /* cut part of file we have read */
37449 + start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
37450 + set_key_offset(&from, start_byte);
37451 + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
37452 + /*
37453 + * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
37454 + * commits during over-long truncates. But
37455 + * extent->tail conversion should be performed in one
37456 + * transaction.
37457 + */
37458 + result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
37459 + &to, inode, 0);
37460 +
37461 + if (result) {
37462 + page_cache_release(page);
37463 + break;
37464 + }
37465 +
37466 + /* put page data into tree via tail_write */
37467 + count = PAGE_CACHE_SIZE;
37468 + if ((i == (num_pages - 1)) &&
37469 + (inode->i_size & ~PAGE_CACHE_MASK))
37470 + /* last page can be incompleted */
37471 + count = (inode->i_size & ~PAGE_CACHE_MASK);
37472 + while (count) {
37473 + loff_t pos = start_byte;
37474 +
37475 + assert("edward-1533",
37476 + file != NULL && file->f_dentry != NULL);
37477 + assert("edward-1534",
37478 + file->f_dentry->d_inode == inode);
37479 +
37480 + result = reiser4_write_tail(file,
37481 + (char __user *)kmap(page),
37482 + count, &pos);
37483 + reiser4_free_file_fsdata(file);
37484 + if (result <= 0) {
37485 + warning("", "reiser4_write_tail failed");
37486 + page_cache_release(page);
37487 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37488 + return result;
37489 + }
37490 + count -= result;
37491 + }
37492 +
37493 + /* release page */
37494 + lock_page(page);
37495 + /* page is already detached from jnode and mapping. */
37496 + assert("vs-1086", page->mapping == NULL);
37497 + assert("nikita-2690",
37498 + (!PagePrivate(page) && jprivate(page) == 0));
37499 + /* waiting for writeback completion with page lock held is
37500 + * perfectly valid. */
37501 + wait_on_page_writeback(page);
37502 + reiser4_drop_page(page);
37503 + /* release reference taken by read_cache_page() above */
37504 + page_cache_release(page);
37505 +
37506 + drop_exclusive_access(uf_info);
37507 + /* throttle the conversion */
37508 + reiser4_throttle_write(inode);
37509 + get_exclusive_access(uf_info);
37510 + /*
37511 + * nobody is allowed to complete conversion but a process which
37512 + * started it
37513 + */
37514 + assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
37515 + }
37516 +
37517 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37518 +
37519 + if (i == num_pages) {
37520 + /* file is converted to formatted items */
37521 + assert("vs-1698", reiser4_inode_get_flag(inode,
37522 + REISER4_PART_MIXED));
37523 + assert("vs-1260",
37524 + inode_has_no_jnodes(reiser4_inode_data(inode)));
37525 +
37526 + uf_info->container = UF_CONTAINER_TAILS;
37527 + complete_conversion(inode);
37528 + return 0;
37529 + }
37530 + /*
37531 + * conversion is not complete. Inode was already marked as
37532 + * REISER4_PART_MIXED and stat-data were updated at the first *
37533 + * iteration of the loop above.
37534 + */
37535 + warning("nikita-2282",
37536 + "Partial conversion of %llu: %lu of %lu: %i",
37537 + (unsigned long long)get_inode_oid(inode), i,
37538 + num_pages, result);
37539 +
37540 + return result;
37541 +}
37542 +
37543 +/*
37544 + * Local variables:
37545 + * c-indentation-style: "K&R"
37546 + * mode-name: "LC"
37547 + * c-basic-offset: 8
37548 + * tab-width: 8
37549 + * fill-column: 79
37550 + * scroll-step: 1
37551 + * End:
37552 + */
37553 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file_ops.c linux-2.6.23/fs/reiser4/plugin/file_ops.c
37554 --- linux-2.6.23.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300
37555 +++ linux-2.6.23/fs/reiser4/plugin/file_ops.c 2007-12-04 16:49:30.000000000 +0300
37556 @@ -0,0 +1,205 @@
37557 +/* Copyright 2005 by Hans Reiser, licensing governed by
37558 + reiser4/README */
37559 +
37560 +/* this file contains typical implementations for some of methods of
37561 + struct file_operations and of struct address_space_operations
37562 +*/
37563 +
37564 +#include "../inode.h"
37565 +#include "object.h"
37566 +
37567 +/* file operations */
37568 +
37569 +/* implementation of vfs's llseek method of struct file_operations for
37570 + typical directory can be found in readdir_common.c
37571 +*/
37572 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
37573 +
37574 +/* implementation of vfs's readdir method of struct file_operations for
37575 + typical directory can be found in readdir_common.c
37576 +*/
37577 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
37578 +
37579 +/**
37580 + * reiser4_release_dir_common - release of struct file_operations
37581 + * @inode: inode of released file
37582 + * @file: file to release
37583 + *
37584 + * Implementation of release method of struct file_operations for typical
37585 + * directory. All it does is freeing of reiser4 specific file data.
37586 +*/
37587 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
37588 +{
37589 + reiser4_context *ctx;
37590 +
37591 + ctx = reiser4_init_context(inode->i_sb);
37592 + if (IS_ERR(ctx))
37593 + return PTR_ERR(ctx);
37594 + reiser4_free_file_fsdata(file);
37595 + reiser4_exit_context(ctx);
37596 + return 0;
37597 +}
37598 +
37599 +/* this is common implementation of vfs's fsync method of struct
37600 + file_operations
37601 +*/
37602 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
37603 +{
37604 + reiser4_context *ctx;
37605 + int result;
37606 +
37607 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
37608 + if (IS_ERR(ctx))
37609 + return PTR_ERR(ctx);
37610 + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
37611 +
37612 + context_set_commit_async(ctx);
37613 + reiser4_exit_context(ctx);
37614 + return result;
37615 +}
37616 +
37617 +/*
37618 + * common sync method for regular files.
37619 + *
37620 + * We are trying to be smart here. Instead of committing all atoms (original
37621 + * solution), we scan dirty pages of this file and commit all atoms they are
37622 + * part of.
37623 + *
37624 + * Situation is complicated by anonymous pages: i.e., extent-less pages
37625 + * dirtied through mmap. Fortunately sys_fsync() first calls
37626 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37627 + * all missing extents and capture anonymous pages.
37628 + */
37629 +int reiser4_sync_file_common(struct file *file,
37630 + struct dentry *dentry, int datasync)
37631 +{
37632 + reiser4_context *ctx;
37633 + txn_atom *atom;
37634 + reiser4_block_nr reserve;
37635 +
37636 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
37637 + if (IS_ERR(ctx))
37638 + return PTR_ERR(ctx);
37639 +
37640 + reserve = estimate_update_common(dentry->d_inode);
37641 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37642 + reiser4_exit_context(ctx);
37643 + return RETERR(-ENOSPC);
37644 + }
37645 + write_sd_by_inode_common(dentry->d_inode);
37646 +
37647 + atom = get_current_atom_locked();
37648 + spin_lock_txnh(ctx->trans);
37649 + force_commit_atom(ctx->trans);
37650 + reiser4_exit_context(ctx);
37651 + return 0;
37652 +}
37653 +
37654 +/* this is common implementation of vfs's sendfile method of struct
37655 + file_operations
37656 +
37657 + Reads @count bytes from @file and calls @actor for every page read. This is
37658 + needed for loop back devices support.
37659 +*/
37660 +#if 0
37661 +ssize_t
37662 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
37663 + read_actor_t actor, void *target)
37664 +{
37665 + reiser4_context *ctx;
37666 + ssize_t result;
37667 +
37668 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37669 + if (IS_ERR(ctx))
37670 + return PTR_ERR(ctx);
37671 + result = generic_file_sendfile(file, ppos, count, actor, target);
37672 + reiser4_exit_context(ctx);
37673 + return result;
37674 +}
37675 +#endif /* 0 */
37676 +
37677 +/* address space operations */
37678 +
37679 +/* this is common implementation of vfs's prepare_write method of struct
37680 + address_space_operations
37681 +*/
37682 +int
37683 +prepare_write_common(struct file *file, struct page *page, unsigned from,
37684 + unsigned to)
37685 +{
37686 + reiser4_context *ctx;
37687 + int result;
37688 +
37689 + ctx = reiser4_init_context(page->mapping->host->i_sb);
37690 + result = do_prepare_write(file, page, from, to);
37691 +
37692 + /* don't commit transaction under inode semaphore */
37693 + context_set_commit_async(ctx);
37694 + reiser4_exit_context(ctx);
37695 +
37696 + return result;
37697 +}
37698 +
37699 +/* this is helper for prepare_write_common and prepare_write_unix_file
37700 + */
37701 +int
37702 +do_prepare_write(struct file *file, struct page *page, unsigned from,
37703 + unsigned to)
37704 +{
37705 + int result;
37706 + file_plugin *fplug;
37707 + struct inode *inode;
37708 +
37709 + assert("umka-3099", file != NULL);
37710 + assert("umka-3100", page != NULL);
37711 + assert("umka-3095", PageLocked(page));
37712 +
37713 + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
37714 + return 0;
37715 +
37716 + inode = page->mapping->host;
37717 + fplug = inode_file_plugin(inode);
37718 +
37719 + if (page->mapping->a_ops->readpage == NULL)
37720 + return RETERR(-EINVAL);
37721 +
37722 + result = page->mapping->a_ops->readpage(file, page);
37723 + if (result != 0) {
37724 + SetPageError(page);
37725 + ClearPageUptodate(page);
37726 + /* All reiser4 readpage() implementations should return the
37727 + * page locked in case of error. */
37728 + assert("nikita-3472", PageLocked(page));
37729 + } else {
37730 + /*
37731 + * ->readpage() either:
37732 + *
37733 + * 1. starts IO against @page. @page is locked for IO in
37734 + * this case.
37735 + *
37736 + * 2. doesn't start IO. @page is unlocked.
37737 + *
37738 + * In either case, page should be locked.
37739 + */
37740 + lock_page(page);
37741 + /*
37742 + * IO (if any) is completed at this point. Check for IO
37743 + * errors.
37744 + */
37745 + if (!PageUptodate(page))
37746 + result = RETERR(-EIO);
37747 + }
37748 + assert("umka-3098", PageLocked(page));
37749 + return result;
37750 +}
37751 +
37752 +/*
37753 + * Local variables:
37754 + * c-indentation-style: "K&R"
37755 + * mode-name: "LC"
37756 + * c-basic-offset: 8
37757 + * tab-width: 8
37758 + * fill-column: 79
37759 + * scroll-step: 1
37760 + * End:
37761 + */
37762 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.23/fs/reiser4/plugin/file_ops_readdir.c
37763 --- linux-2.6.23.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300
37764 +++ linux-2.6.23/fs/reiser4/plugin/file_ops_readdir.c 2007-12-04 16:49:30.000000000 +0300
37765 @@ -0,0 +1,658 @@
37766 +/* Copyright 2005 by Hans Reiser, licensing governed by
37767 + * reiser4/README */
37768 +
37769 +#include "../inode.h"
37770 +
37771 +/* return true, iff @coord points to the valid directory item that is part of
37772 + * @inode directory. */
37773 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
37774 +{
37775 + return plugin_of_group(item_plugin_by_coord(coord),
37776 + DIR_ENTRY_ITEM_TYPE) &&
37777 + inode_file_plugin(inode)->owns_item(inode, coord);
37778 +}
37779 +
37780 +/* compare two logical positions within the same directory */
37781 +static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2)
37782 +{
37783 + cmp_t result;
37784 +
37785 + assert("nikita-2534", p1 != NULL);
37786 + assert("nikita-2535", p2 != NULL);
37787 +
37788 + result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
37789 + if (result == EQUAL_TO) {
37790 + int diff;
37791 +
37792 + diff = p1->pos - p2->pos;
37793 + result =
37794 + (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
37795 + }
37796 + return result;
37797 +}
37798 +
37799 +/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
37800 + * necessary. */
37801 +static void
37802 +adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot,
37803 + const struct dir_pos * mod_point, int adj)
37804 +{
37805 + struct dir_pos *pos;
37806 +
37807 + /*
37808 + * new directory entry was added (adj == +1) or removed (adj == -1) at
37809 + * the @mod_point. Directory file descriptor @dir is doing readdir and
37810 + * is currently positioned at @readdir_spot. Latter has to be updated
37811 + * to maintain stable readdir.
37812 + */
37813 + /* directory is positioned to the beginning. */
37814 + if (readdir_spot->entry_no == 0)
37815 + return;
37816 +
37817 + pos = &readdir_spot->position;
37818 + switch (dir_pos_cmp(mod_point, pos)) {
37819 + case LESS_THAN:
37820 + /* @mod_pos is _before_ @readdir_spot, that is, entry was
37821 + * added/removed on the left (in key order) of current
37822 + * position. */
37823 + /* logical number of directory entry readdir is "looking" at
37824 + * changes */
37825 + readdir_spot->entry_no += adj;
37826 + assert("nikita-2577",
37827 + ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
37828 + if (de_id_cmp(&pos->dir_entry_key,
37829 + &mod_point->dir_entry_key) == EQUAL_TO) {
37830 + assert("nikita-2575", mod_point->pos < pos->pos);
37831 + /*
37832 + * if entry added/removed has the same key as current
37833 + * for readdir, update counter of duplicate keys in
37834 + * @readdir_spot.
37835 + */
37836 + pos->pos += adj;
37837 + }
37838 + break;
37839 + case GREATER_THAN:
37840 + /* directory is modified after @pos: nothing to do. */
37841 + break;
37842 + case EQUAL_TO:
37843 + /* cannot insert an entry readdir is looking at, because it
37844 + already exists. */
37845 + assert("nikita-2576", adj < 0);
37846 + /* directory entry to which @pos points to is being
37847 + removed.
37848 +
37849 + NOTE-NIKITA: Right thing to do is to update @pos to point
37850 + to the next entry. This is complex (we are under spin-lock
37851 + for one thing). Just rewind it to the beginning. Next
37852 + readdir will have to scan the beginning of
37853 + directory. Proper solution is to use semaphore in
37854 + spin lock's stead and use rewind_right() here.
37855 +
37856 + NOTE-NIKITA: now, semaphore is used, so...
37857 + */
37858 + memset(readdir_spot, 0, sizeof *readdir_spot);
37859 + }
37860 +}
37861 +
37862 +/* scan all file-descriptors for this directory and adjust their
37863 + positions respectively. Should be used by implementations of
37864 + add_entry and rem_entry of dir plugin */
37865 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
37866 + int offset, int adj)
37867 +{
37868 + reiser4_file_fsdata *scan;
37869 + struct dir_pos mod_point;
37870 +
37871 + assert("nikita-2536", dir != NULL);
37872 + assert("nikita-2538", de != NULL);
37873 + assert("nikita-2539", adj != 0);
37874 +
37875 + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
37876 + mod_point.pos = offset;
37877 +
37878 + spin_lock_inode(dir);
37879 +
37880 + /*
37881 + * new entry was added/removed in directory @dir. Scan all file
37882 + * descriptors for @dir that are currently involved into @readdir and
37883 + * update them.
37884 + */
37885 +
37886 + list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
37887 + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
37888 +
37889 + spin_unlock_inode(dir);
37890 +}
37891 +
37892 +/*
37893 + * traverse tree to start/continue readdir from the readdir position @pos.
37894 + */
37895 +static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37896 +{
37897 + reiser4_key key;
37898 + int result;
37899 + struct inode *inode;
37900 +
37901 + assert("nikita-2554", pos != NULL);
37902 +
37903 + inode = dir->f_dentry->d_inode;
37904 + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
37905 + if (result != 0)
37906 + return result;
37907 + result = reiser4_object_lookup(inode,
37908 + &key,
37909 + tap->coord,
37910 + tap->lh,
37911 + tap->mode,
37912 + FIND_EXACT,
37913 + LEAF_LEVEL, LEAF_LEVEL,
37914 + 0, &tap->ra_info);
37915 + if (result == CBK_COORD_FOUND)
37916 + result = rewind_right(tap, (int)pos->position.pos);
37917 + else {
37918 + tap->coord->node = NULL;
37919 + done_lh(tap->lh);
37920 + result = RETERR(-EIO);
37921 + }
37922 + return result;
37923 +}
37924 +
37925 +/*
37926 + * handling of non-unique keys: calculate at what ordinal position within
37927 + * sequence of directory items with identical keys @pos is.
37928 + */
37929 +static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap)
37930 +{
37931 + int result;
37932 + coord_t coord;
37933 + lock_handle lh;
37934 + tap_t scan;
37935 + de_id *did;
37936 + reiser4_key de_key;
37937 +
37938 + coord_init_zero(&coord);
37939 + init_lh(&lh);
37940 + reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
37941 + reiser4_tap_copy(&scan, tap);
37942 + reiser4_tap_load(&scan);
37943 + pos->position.pos = 0;
37944 +
37945 + did = &pos->position.dir_entry_key;
37946 +
37947 + if (is_valid_dir_coord(inode, scan.coord)) {
37948 +
37949 + build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
37950 +
37951 + while (1) {
37952 +
37953 + result = go_prev_unit(&scan);
37954 + if (result != 0)
37955 + break;
37956 +
37957 + if (!is_valid_dir_coord(inode, scan.coord)) {
37958 + result = -EINVAL;
37959 + break;
37960 + }
37961 +
37962 + /* get key of directory entry */
37963 + unit_key_by_coord(scan.coord, &de_key);
37964 + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
37965 + /* duplicate-sequence is over */
37966 + break;
37967 + }
37968 + pos->position.pos++;
37969 + }
37970 + } else
37971 + result = RETERR(-ENOENT);
37972 + reiser4_tap_relse(&scan);
37973 + reiser4_tap_done(&scan);
37974 + return result;
37975 +}
37976 +
37977 +/*
37978 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
37979 + */
37980 +static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37981 +{
37982 + __u64 destination;
37983 + __s64 shift;
37984 + int result;
37985 + struct inode *inode;
37986 + loff_t dirpos;
37987 +
37988 + assert("nikita-2553", dir != NULL);
37989 + assert("nikita-2548", pos != NULL);
37990 + assert("nikita-2551", tap->coord != NULL);
37991 + assert("nikita-2552", tap->lh != NULL);
37992 +
37993 + dirpos = reiser4_get_dir_fpos(dir);
37994 + shift = dirpos - pos->fpos;
37995 + /* this is logical directory entry within @dir which we are rewinding
37996 + * to */
37997 + destination = pos->entry_no + shift;
37998 +
37999 + inode = dir->f_dentry->d_inode;
38000 + if (dirpos < 0)
38001 + return RETERR(-EINVAL);
38002 + else if (destination == 0ll || dirpos == 0) {
38003 + /* rewind to the beginning of directory */
38004 + memset(pos, 0, sizeof *pos);
38005 + return dir_go_to(dir, pos, tap);
38006 + } else if (destination >= inode->i_size)
38007 + return RETERR(-ENOENT);
38008 +
38009 + if (shift < 0) {
38010 + /* I am afraid of negative numbers */
38011 + shift = -shift;
38012 + /* rewinding to the left */
38013 + if (shift <= (int)pos->position.pos) {
38014 + /* destination is within sequence of entries with
38015 + duplicate keys. */
38016 + result = dir_go_to(dir, pos, tap);
38017 + } else {
38018 + shift -= pos->position.pos;
38019 + while (1) {
38020 + /* repetitions: deadlock is possible when
38021 + going to the left. */
38022 + result = dir_go_to(dir, pos, tap);
38023 + if (result == 0) {
38024 + result = rewind_left(tap, shift);
38025 + if (result == -E_DEADLOCK) {
38026 + reiser4_tap_done(tap);
38027 + continue;
38028 + }
38029 + }
38030 + break;
38031 + }
38032 + }
38033 + } else {
38034 + /* rewinding to the right */
38035 + result = dir_go_to(dir, pos, tap);
38036 + if (result == 0)
38037 + result = rewind_right(tap, shift);
38038 + }
38039 + if (result == 0) {
38040 + result = set_pos(inode, pos, tap);
38041 + if (result == 0) {
38042 + /* update pos->position.pos */
38043 + pos->entry_no = destination;
38044 + pos->fpos = dirpos;
38045 + }
38046 + }
38047 + return result;
38048 +}
38049 +
38050 +/*
38051 + * Function that is called by common_readdir() on each directory entry while
38052 + * doing readdir. ->filldir callback may block, so we had to release long term
38053 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
38054 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
38055 + *
38056 + * Whether node is unlocked in case of any other error is undefined. It is
38057 + * guaranteed to be still locked if success (0) is returned.
38058 + *
38059 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
38060 + * unlocked.
38061 + */
38062 +static int
38063 +feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap,
38064 + filldir_t filldir, void *dirent)
38065 +{
38066 + item_plugin *iplug;
38067 + char *name;
38068 + reiser4_key sd_key;
38069 + int result;
38070 + char buf[DE_NAME_BUF_LEN];
38071 + char name_buf[32];
38072 + char *local_name;
38073 + unsigned file_type;
38074 + seal_t seal;
38075 + coord_t *coord;
38076 + reiser4_key entry_key;
38077 +
38078 + coord = tap->coord;
38079 + iplug = item_plugin_by_coord(coord);
38080 +
38081 + /* pointer to name within the node */
38082 + name = iplug->s.dir.extract_name(coord, buf);
38083 + assert("nikita-1371", name != NULL);
38084 +
38085 + /* key of object the entry points to */
38086 + if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
38087 + return RETERR(-EIO);
38088 +
38089 + /* we must release longterm znode lock before calling filldir to avoid
38090 + deadlock which may happen if filldir causes page fault. So, copy
38091 + name to intermediate buffer */
38092 + if (strlen(name) + 1 > sizeof(name_buf)) {
38093 + local_name = kmalloc(strlen(name) + 1,
38094 + reiser4_ctx_gfp_mask_get());
38095 + if (local_name == NULL)
38096 + return RETERR(-ENOMEM);
38097 + } else
38098 + local_name = name_buf;
38099 +
38100 + strcpy(local_name, name);
38101 + file_type = iplug->s.dir.extract_file_type(coord);
38102 +
38103 + unit_key_by_coord(coord, &entry_key);
38104 + reiser4_seal_init(&seal, coord, &entry_key);
38105 +
38106 + longterm_unlock_znode(tap->lh);
38107 +
38108 + /*
38109 + * send information about directory entry to the ->filldir() filler
38110 + * supplied to us by caller (VFS).
38111 + *
38112 + * ->filldir is entitled to do weird things. For example, ->filldir
38113 + * supplied by knfsd re-enters file system. Make sure no locks are
38114 + * held.
38115 + */
38116 + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
38117 +
38118 + reiser4_txn_restart_current();
38119 + result = filldir(dirent, name, (int)strlen(name),
38120 + /* offset of this entry */
38121 + f->f_pos,
38122 + /* inode number of object bounden by this entry */
38123 + oid_to_uino(get_key_objectid(&sd_key)), file_type);
38124 + if (local_name != name_buf)
38125 + kfree(local_name);
38126 + if (result < 0)
38127 + /* ->filldir() is satisfied. (no space in buffer, IOW) */
38128 + result = 1;
38129 + else
38130 + result = reiser4_seal_validate(&seal, coord, &entry_key,
38131 + tap->lh, tap->mode,
38132 + ZNODE_LOCK_HIPRI);
38133 + return result;
38134 +}
38135 +
38136 +static void move_entry(struct readdir_pos * pos, coord_t * coord)
38137 +{
38138 + reiser4_key de_key;
38139 + de_id *did;
38140 +
38141 + /* update @pos */
38142 + ++pos->entry_no;
38143 + did = &pos->position.dir_entry_key;
38144 +
38145 + /* get key of directory entry */
38146 + unit_key_by_coord(coord, &de_key);
38147 +
38148 + if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
38149 + /* we are within sequence of directory entries
38150 + with duplicate keys. */
38151 + ++pos->position.pos;
38152 + else {
38153 + pos->position.pos = 0;
38154 + build_de_id_by_key(&de_key, did);
38155 + }
38156 + ++pos->fpos;
38157 +}
38158 +
38159 +/*
38160 + * STATELESS READDIR
38161 + *
38162 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
38163 + * into reiser4_file_fsdata on each directory modification (name insertion and
38164 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
38165 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
38166 + * across client READDIR requests for the same directory.
38167 + *
38168 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
38169 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
38170 + * find detached reiser4_file_fsdata corresponding to previous readdir
38171 + * request. In other words, additional state is maintained on the
38172 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
38173 + *
38174 + * To efficiently detect when our ->readdir() method is called by NFS server,
38175 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
38176 + * file_is_stateless() function).
38177 + *
38178 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
38179 + * bits of NFS readdir cookie: when first readdir request comes to the given
38180 + * directory from the given client, cookie is set to 0. This situation is
38181 + * detected, global cid_counter is incremented, and stored in highest bits of
38182 + * all direntry offsets returned to the client, including last one. As the
38183 + * only valid readdir cookie is one obtained as direntry->offset, we are
38184 + * guaranteed that next readdir request (continuing current one) will have
38185 + * current cid in the highest bits of starting readdir cookie. All d_cursors
38186 + * are hashed into per-super-block hash table by (oid, cid) key.
38187 + *
38188 + * In addition d_cursors are placed into per-super-block radix tree where they
38189 + * are keyed by oid alone. This is necessary to efficiently remove them during
38190 + * rmdir.
38191 + *
38192 + * At last, currently unused d_cursors are linked into special list. This list
38193 + * is used d_cursor_shrink to reclaim d_cursors on memory pressure.
38194 + *
38195 + */
38196 +
38197 +/*
38198 + * prepare for readdir.
38199 + */
38200 +static int dir_readdir_init(struct file *f, tap_t * tap,
38201 + struct readdir_pos ** pos)
38202 +{
38203 + struct inode *inode;
38204 + reiser4_file_fsdata *fsdata;
38205 + int result;
38206 +
38207 + assert("nikita-1359", f != NULL);
38208 + inode = f->f_dentry->d_inode;
38209 + assert("nikita-1360", inode != NULL);
38210 +
38211 + if (!S_ISDIR(inode->i_mode))
38212 + return RETERR(-ENOTDIR);
38213 +
38214 + /* try to find detached readdir state */
38215 + result = reiser4_attach_fsdata(f, inode);
38216 + if (result != 0)
38217 + return result;
38218 +
38219 + fsdata = reiser4_get_file_fsdata(f);
38220 + assert("nikita-2571", fsdata != NULL);
38221 + if (IS_ERR(fsdata))
38222 + return PTR_ERR(fsdata);
38223 +
38224 + /* add file descriptor to the readdir list hanging of directory
38225 + * inode. This list is used to scan "readdirs-in-progress" while
38226 + * inserting or removing names in the directory. */
38227 + spin_lock_inode(inode);
38228 + if (list_empty_careful(&fsdata->dir.linkage))
38229 + list_add(&fsdata->dir.linkage, get_readdir_list(inode));
38230 + *pos = &fsdata->dir.readdir;
38231 + spin_unlock_inode(inode);
38232 +
38233 + /* move @tap to the current position */
38234 + return dir_rewind(f, *pos, tap);
38235 +}
38236 +
38237 +/* this is implementation of vfs's llseek method of struct file_operations for
38238 + typical directory
38239 + See comment before reiser4_readdir_common() for explanation.
38240 +*/
38241 +loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
38242 +{
38243 + reiser4_context *ctx;
38244 + loff_t result;
38245 + struct inode *inode;
38246 +
38247 + inode = file->f_dentry->d_inode;
38248 +
38249 + ctx = reiser4_init_context(inode->i_sb);
38250 + if (IS_ERR(ctx))
38251 + return PTR_ERR(ctx);
38252 +
38253 + mutex_lock(&inode->i_mutex);
38254 +
38255 + /* update ->f_pos */
38256 + result = default_llseek(file, off, origin);
38257 + if (result >= 0) {
38258 + int ff;
38259 + coord_t coord;
38260 + lock_handle lh;
38261 + tap_t tap;
38262 + struct readdir_pos *pos;
38263 +
38264 + coord_init_zero(&coord);
38265 + init_lh(&lh);
38266 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38267 +
38268 + ff = dir_readdir_init(file, &tap, &pos);
38269 + reiser4_detach_fsdata(file);
38270 + if (ff != 0)
38271 + result = (loff_t) ff;
38272 + reiser4_tap_done(&tap);
38273 + }
38274 + reiser4_detach_fsdata(file);
38275 + mutex_unlock(&inode->i_mutex);
38276 +
38277 + reiser4_exit_context(ctx);
38278 + return result;
38279 +}
38280 +
38281 +/* this is common implementation of vfs's readdir method of struct
38282 + file_operations
38283 +
38284 + readdir problems:
38285 +
38286 + readdir(2)/getdents(2) interface is based on implicit assumption that
38287 + readdir can be restarted from any particular point by supplying file system
38288 + with off_t-full of data. That is, file system fills ->d_off field in struct
38289 + dirent and later user passes ->d_off to the seekdir(3), which is, actually,
38290 + implemented by glibc as lseek(2) on directory.
38291 +
38292 + Reiser4 cannot restart readdir from 64 bits of data, because two last
38293 + components of the key of directory entry are unknown, which given 128 bits:
38294 + locality and type fields in the key of directory entry are always known, to
38295 + start readdir() from given point objectid and offset fields have to be
38296 + filled.
38297 +
38298 + Traditional UNIX API for scanning through directory
38299 + (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
38300 + assumption that directory is structured very much like regular file, in
38301 + particular, it is implied that each name within given directory (directory
38302 + entry) can be uniquely identified by scalar offset and that such offset is
38303 + stable across the life-time of the name is identifies.
38304 +
38305 + This is manifestly not so for reiser4. In reiser4 the only stable unique
38306 + identifies for the directory entry is its key that doesn't fit into
38307 + seekdir/telldir API.
38308 +
38309 + solution:
38310 +
38311 + Within each file descriptor participating in readdir-ing of directory
38312 + plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
38313 + the "current" directory entry that file descriptor looks at. It contains a
38314 + key of directory entry (plus some additional info to deal with non-unique
38315 + keys that we wouldn't dwell onto here) and a logical position of this
38316 + directory entry starting from the beginning of the directory, that is
38317 + ordinal number of this entry in the readdir order.
38318 +
38319 + Obviously this logical position is not stable in the face of directory
38320 + modifications. To work around this, on each addition or removal of directory
38321 + entry all file descriptors for directory inode are scanned and their
38322 + readdir_pos are updated accordingly (adjust_dir_pos()).
38323 +*/
38324 +int reiser4_readdir_common(struct file *f /* directory file being read */,
38325 + void *dirent /* opaque data passed to us by VFS */,
38326 + filldir_t filld /* filler function passed to us
38327 + * by VFS */)
38328 +{
38329 + reiser4_context *ctx;
38330 + int result;
38331 + struct inode *inode;
38332 + coord_t coord;
38333 + lock_handle lh;
38334 + tap_t tap;
38335 + struct readdir_pos *pos;
38336 +
38337 + assert("nikita-1359", f != NULL);
38338 + inode = f->f_dentry->d_inode;
38339 + assert("nikita-1360", inode != NULL);
38340 +
38341 + if (!S_ISDIR(inode->i_mode))
38342 + return RETERR(-ENOTDIR);
38343 +
38344 + ctx = reiser4_init_context(inode->i_sb);
38345 + if (IS_ERR(ctx))
38346 + return PTR_ERR(ctx);
38347 +
38348 + coord_init_zero(&coord);
38349 + init_lh(&lh);
38350 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38351 +
38352 + reiser4_readdir_readahead_init(inode, &tap);
38353 +
38354 + repeat:
38355 + result = dir_readdir_init(f, &tap, &pos);
38356 + if (result == 0) {
38357 + result = reiser4_tap_load(&tap);
38358 + /* scan entries one by one feeding them to @filld */
38359 + while (result == 0) {
38360 + coord_t *coord;
38361 +
38362 + coord = tap.coord;
38363 + assert("nikita-2572", coord_is_existing_unit(coord));
38364 + assert("nikita-3227", is_valid_dir_coord(inode, coord));
38365 +
38366 + result = feed_entry(f, pos, &tap, filld, dirent);
38367 + if (result > 0) {
38368 + break;
38369 + } else if (result == 0) {
38370 + ++f->f_pos;
38371 + result = go_next_unit(&tap);
38372 + if (result == -E_NO_NEIGHBOR ||
38373 + result == -ENOENT) {
38374 + result = 0;
38375 + break;
38376 + } else if (result == 0) {
38377 + if (is_valid_dir_coord(inode, coord))
38378 + move_entry(pos, coord);
38379 + else
38380 + break;
38381 + }
38382 + } else if (result == -E_REPEAT) {
38383 + /* feed_entry() had to restart. */
38384 + ++f->f_pos;
38385 + reiser4_tap_relse(&tap);
38386 + goto repeat;
38387 + } else
38388 + warning("vs-1617",
38389 + "reiser4_readdir_common: unexpected error %d",
38390 + result);
38391 + }
38392 + reiser4_tap_relse(&tap);
38393 +
38394 + if (result >= 0)
38395 + f->f_version = inode->i_version;
38396 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
38397 + result = 0;
38398 + reiser4_tap_done(&tap);
38399 + reiser4_detach_fsdata(f);
38400 +
38401 + /* try to update directory's atime */
38402 + if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
38403 + BA_CAN_COMMIT) != 0)
38404 + warning("", "failed to update atime on readdir: %llu",
38405 + get_inode_oid(inode));
38406 + else
38407 + file_accessed(f);
38408 +
38409 + context_set_commit_async(ctx);
38410 + reiser4_exit_context(ctx);
38411 +
38412 + return (result <= 0) ? result : 0;
38413 +}
38414 +
38415 +/*
38416 + * Local variables:
38417 + * c-indentation-style: "K&R"
38418 + * mode-name: "LC"
38419 + * c-basic-offset: 8
38420 + * tab-width: 8
38421 + * fill-column: 79
38422 + * End:
38423 + */
38424 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.23/fs/reiser4/plugin/file_plugin_common.c
38425 --- linux-2.6.23.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
38426 +++ linux-2.6.23/fs/reiser4/plugin/file_plugin_common.c 2007-12-04 22:59:05.806371984 +0300
38427 @@ -0,0 +1,1007 @@
38428 +/* Copyright 2005 by Hans Reiser, licensing governed by
38429 + reiser4/README */
38430 +
38431 +/* this file contains typical implementations for most of methods of
38432 + file plugin
38433 +*/
38434 +
38435 +#include "../inode.h"
38436 +#include "object.h"
38437 +#include "../safe_link.h"
38438 +
38439 +#include <linux/quotaops.h>
38440 +
38441 +static int insert_new_sd(struct inode *inode);
38442 +static int update_sd(struct inode *inode);
38443 +
38444 +/* this is common implementation of write_sd_by_inode method of file plugin
38445 + either insert stat data or update it
38446 + */
38447 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
38448 +{
38449 + int result;
38450 +
38451 + assert("nikita-730", inode != NULL);
38452 +
38453 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38454 + /* object doesn't have stat-data yet */
38455 + result = insert_new_sd(inode);
38456 + else
38457 + result = update_sd(inode);
38458 + if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
38459 + /* Don't issue warnings about "name is too long" */
38460 + warning("nikita-2221", "Failed to save sd for %llu: %i",
38461 + (unsigned long long)get_inode_oid(inode), result);
38462 + return result;
38463 +}
38464 +
38465 +/* this is common implementation of key_by_inode method of file plugin
38466 + */
38467 +int
38468 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
38469 + reiser4_key * key)
38470 +{
38471 + reiser4_key_init(key);
38472 + set_key_locality(key, reiser4_inode_data(inode)->locality_id);
38473 + set_key_ordering(key, get_inode_ordering(inode));
38474 + set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
38475 + set_key_type(key, KEY_BODY_MINOR);
38476 + set_key_offset(key, (__u64) off);
38477 + return 0;
38478 +}
38479 +
38480 +/* this is common implementation of set_plug_in_inode method of file plugin
38481 + */
38482 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
38483 + struct inode *parent /* parent object */ ,
38484 + reiser4_object_create_data * data /* creational
38485 + * data */ )
38486 +{
38487 + __u64 mask;
38488 +
38489 + object->i_mode = data->mode;
38490 + /* this should be plugin decision */
38491 + object->i_uid = current->fsuid;
38492 + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
38493 +
38494 + /* support for BSD style group-id assignment. See mount's manual page
38495 + description of bsdgroups ext2 mount options for more details */
38496 + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
38497 + object->i_gid = parent->i_gid;
38498 + else if (parent->i_mode & S_ISGID) {
38499 + /* parent directory has sguid bit */
38500 + object->i_gid = parent->i_gid;
38501 + if (S_ISDIR(object->i_mode))
38502 + /* sguid is inherited by sub-directories */
38503 + object->i_mode |= S_ISGID;
38504 + } else
38505 + object->i_gid = current->fsgid;
38506 +
38507 + /* this object doesn't have stat-data yet */
38508 + reiser4_inode_set_flag(object, REISER4_NO_SD);
38509 +#if 0
38510 + /* this is now called after all inode plugins are initialized:
38511 + do_create_vfs_child after adjust_to_parent */
38512 + /* setup inode and file-operations for this inode */
38513 + setup_inode_ops(object, data);
38514 +#endif
38515 + object->i_nlink = 0;
38516 + reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
38517 + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
38518 + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
38519 + mask |= (1 << LARGE_TIMES_STAT);
38520 +
38521 + reiser4_inode_data(object)->extmask = mask;
38522 + return 0;
38523 +}
38524 +
38525 +/* this is common implementation of adjust_to_parent method of file plugin for
38526 + regular files
38527 + */
38528 +int adjust_to_parent_common(struct inode *object /* new object */ ,
38529 + struct inode *parent /* parent directory */ ,
38530 + struct inode *root /* root directory */ )
38531 +{
38532 + assert("nikita-2165", object != NULL);
38533 + if (parent == NULL)
38534 + parent = root;
38535 + assert("nikita-2069", parent != NULL);
38536 +
38537 + /*
38538 + * inherit missing plugins from parent
38539 + */
38540 +
38541 + grab_plugin_pset(object, parent, PSET_FILE);
38542 + grab_plugin_pset(object, parent, PSET_SD);
38543 + grab_plugin_pset(object, parent, PSET_FORMATTING);
38544 + grab_plugin_pset(object, parent, PSET_PERM);
38545 + return 0;
38546 +}
38547 +
38548 +/* this is common implementation of adjust_to_parent method of file plugin for
38549 + typical directories
38550 + */
38551 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
38552 + struct inode *parent /* parent directory */ ,
38553 + struct inode *root /* root directory */ )
38554 +{
38555 + int result = 0;
38556 + pset_member memb;
38557 +
38558 + assert("nikita-2166", object != NULL);
38559 + if (parent == NULL)
38560 + parent = root;
38561 + assert("nikita-2167", parent != NULL);
38562 +
38563 + /*
38564 + * inherit missing plugins from parent
38565 + */
38566 + for (memb = 0; memb < PSET_LAST; ++memb) {
38567 + result = grab_plugin_pset(object, parent, memb);
38568 + if (result != 0)
38569 + break;
38570 + }
38571 + return result;
38572 +}
38573 +
38574 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
38575 + struct inode *parent /* parent directory */,
38576 + struct inode *root /* root directory */)
38577 +{
38578 + int result;
38579 + result = adjust_to_parent_common(object, parent, root);
38580 + if (result)
38581 + return result;
38582 + assert("edward-1416", parent != NULL);
38583 +
38584 + grab_plugin_pset(object, parent, PSET_CLUSTER);
38585 + grab_plugin_pset(object, parent, PSET_CIPHER);
38586 + grab_plugin_pset(object, parent, PSET_DIGEST);
38587 + grab_plugin_pset(object, parent, PSET_COMPRESSION);
38588 + grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
38589 +
38590 + return 0;
38591 +}
38592 +
38593 +/* this is common implementation of create_object method of file plugin
38594 + */
38595 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
38596 + reiser4_object_create_data * data)
38597 +{
38598 + reiser4_block_nr reserve;
38599 + assert("nikita-744", object != NULL);
38600 + assert("nikita-745", parent != NULL);
38601 + assert("nikita-747", data != NULL);
38602 + assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
38603 +
38604 + reserve = estimate_create_common(object);
38605 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
38606 + return RETERR(-ENOSPC);
38607 + return write_sd_by_inode_common(object);
38608 +}
38609 +
38610 +static int common_object_delete_no_reserve(struct inode *inode);
38611 +
38612 +/**
38613 + * reiser4_delete_object_common - delete_object of file_plugin
38614 + * @inode: inode to be deleted
38615 + *
38616 + * This is common implementation of delete_object method of file_plugin. It
38617 + * applies to object its deletion consists of removing two items - stat data
38618 + * and safe-link.
38619 + */
38620 +int reiser4_delete_object_common(struct inode *inode)
38621 +{
38622 + int result;
38623 +
38624 + assert("nikita-1477", inode != NULL);
38625 + /* FIXME: if file body deletion failed (i/o error, for instance),
38626 + inode->i_size can be != 0 here */
38627 + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
38628 + assert("nikita-3421", inode->i_nlink == 0);
38629 +
38630 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
38631 + reiser4_block_nr reserve;
38632 +
38633 + /* grab space which is needed to remove 2 items from the tree:
38634 + stat data and safe-link */
38635 + reserve = 2 *
38636 + estimate_one_item_removal(reiser4_tree_by_inode(inode));
38637 + if (reiser4_grab_space_force(reserve,
38638 + BA_RESERVED | BA_CAN_COMMIT))
38639 + return RETERR(-ENOSPC);
38640 + result = common_object_delete_no_reserve(inode);
38641 + } else
38642 + result = 0;
38643 + return result;
38644 +}
38645 +
38646 +/**
38647 + * reiser4_delete_dir_common - delete_object of file_plugin
38648 + * @inode: inode to be deleted
38649 + *
38650 + * This is common implementation of delete_object method of file_plugin for
38651 + * typical directory. It calls done method of dir_plugin to remove "." and
38652 + * removes stat data and safe-link.
38653 + */
38654 +int reiser4_delete_dir_common(struct inode *inode)
38655 +{
38656 + int result;
38657 + dir_plugin *dplug;
38658 +
38659 + assert("", (get_current_context() &&
38660 + get_current_context()->trans->atom == NULL));
38661 +
38662 + dplug = inode_dir_plugin(inode);
38663 + assert("vs-1101", dplug && dplug->done);
38664 +
38665 + /* kill cursors which might be attached to inode */
38666 + reiser4_kill_cursors(inode);
38667 +
38668 + /* grab space enough for removing two items */
38669 + if (reiser4_grab_space
38670 + (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
38671 + BA_RESERVED | BA_CAN_COMMIT))
38672 + return RETERR(-ENOSPC);
38673 +
38674 + result = dplug->done(inode);
38675 + if (!result)
38676 + result = common_object_delete_no_reserve(inode);
38677 + return result;
38678 +}
38679 +
38680 +/* this is common implementation of add_link method of file plugin
38681 + */
38682 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
38683 +{
38684 + /*
38685 + * increment ->i_nlink and update ->i_ctime
38686 + */
38687 +
38688 + INODE_INC_FIELD(object, i_nlink);
38689 + object->i_ctime = CURRENT_TIME;
38690 + return 0;
38691 +}
38692 +
38693 +/* this is common implementation of rem_link method of file plugin
38694 + */
38695 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
38696 +{
38697 + assert("nikita-2021", object != NULL);
38698 + assert("nikita-2163", object->i_nlink > 0);
38699 +
38700 + /*
38701 + * decrement ->i_nlink and update ->i_ctime
38702 + */
38703 +
38704 + INODE_DEC_FIELD(object, i_nlink);
38705 + object->i_ctime = CURRENT_TIME;
38706 + return 0;
38707 +}
38708 +
38709 +/* this is common implementation of rem_link method of file plugin for typical
38710 + directory
38711 +*/
38712 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
38713 +{
38714 + assert("nikita-20211", object != NULL);
38715 + assert("nikita-21631", object->i_nlink > 0);
38716 +
38717 + /*
38718 + * decrement ->i_nlink and update ->i_ctime
38719 + */
38720 + INODE_DEC_FIELD(object, i_nlink);
38721 + if (object->i_nlink == 1)
38722 + INODE_DEC_FIELD(object, i_nlink);
38723 + object->i_ctime = CURRENT_TIME;
38724 + return 0;
38725 +}
38726 +
38727 +/* this is common implementation of owns_item method of file plugin
38728 + compare objectids of keys in inode and coord */
38729 +int owns_item_common(const struct inode *inode, /* object to check
38730 + * against */
38731 + const coord_t * coord /* coord to check */ )
38732 +{
38733 + reiser4_key item_key;
38734 + reiser4_key file_key;
38735 +
38736 + assert("nikita-760", inode != NULL);
38737 + assert("nikita-761", coord != NULL);
38738 +
38739 + return coord_is_existing_item(coord) &&
38740 + (get_key_objectid(build_sd_key(inode, &file_key)) ==
38741 + get_key_objectid(item_key_by_coord(coord, &item_key)));
38742 +}
38743 +
38744 +/* this is common implementation of owns_item method of file plugin
38745 + for typical directory
38746 +*/
38747 +int owns_item_common_dir(const struct inode *inode, /* object to check against */
38748 + const coord_t * coord /* coord of item to check */ )
38749 +{
38750 + reiser4_key item_key;
38751 +
38752 + assert("nikita-1335", inode != NULL);
38753 + assert("nikita-1334", coord != NULL);
38754 +
38755 + if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
38756 + return get_key_locality(item_key_by_coord(coord, &item_key)) ==
38757 + get_inode_oid(inode);
38758 + else
38759 + return owns_item_common(inode, coord);
38760 +}
38761 +
38762 +/* this is common implementation of can_add_link method of file plugin
38763 + checks whether yet another hard links to this object can be added
38764 +*/
38765 +int can_add_link_common(const struct inode *object /* object to check */ )
38766 +{
38767 + assert("nikita-732", object != NULL);
38768 +
38769 + /* inode->i_nlink is unsigned int, so just check for integer
38770 + overflow */
38771 + return object->i_nlink + 1 != 0;
38772 +}
38773 +
38774 +/* this is common implementation of can_rem_link method of file plugin for
38775 + typical directory
38776 +*/
38777 +int can_rem_link_common_dir(const struct inode *inode)
38778 +{
38779 + /* is_dir_empty() returns 0 if dir is empty */
38780 + return !is_dir_empty(inode);
38781 +}
38782 +
38783 +/* this is common implementation of detach method of file plugin for typical
38784 + directory
38785 +*/
38786 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
38787 +{
38788 + dir_plugin *dplug;
38789 +
38790 + dplug = inode_dir_plugin(child);
38791 + assert("nikita-2883", dplug != NULL);
38792 + assert("nikita-2884", dplug->detach != NULL);
38793 + return dplug->detach(child, parent);
38794 +}
38795 +
38796 +/* this is common implementation of bind method of file plugin for typical
38797 + directory
38798 +*/
38799 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
38800 +{
38801 + dir_plugin *dplug;
38802 +
38803 + dplug = inode_dir_plugin(child);
38804 + assert("nikita-2646", dplug != NULL);
38805 + return dplug->attach(child, parent);
38806 +}
38807 +
38808 +static int process_truncate(struct inode *, __u64 size);
38809 +
38810 +/* this is common implementation of safelink method of file plugin
38811 + */
38812 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
38813 +{
38814 + int result;
38815 +
38816 + assert("vs-1705", get_current_context()->trans->atom == NULL);
38817 + if (link == SAFE_UNLINK)
38818 + /* nothing to do. iput() in the caller (process_safelink) will
38819 + * finish with file */
38820 + result = 0;
38821 + else if (link == SAFE_TRUNCATE)
38822 + result = process_truncate(object, value);
38823 + else {
38824 + warning("nikita-3438", "Unrecognized safe-link type: %i", link);
38825 + result = RETERR(-EIO);
38826 + }
38827 + return result;
38828 +}
38829 +
38830 +/* this is common implementation of estimate.create method of file plugin
38831 + can be used when object creation involves insertion of one item (usually stat
38832 + data) into tree
38833 +*/
38834 +reiser4_block_nr estimate_create_common(const struct inode * object)
38835 +{
38836 + return estimate_one_insert_item(reiser4_tree_by_inode(object));
38837 +}
38838 +
38839 +/* this is common implementation of estimate.create method of file plugin for
38840 + typical directory
38841 + can be used when directory creation involves insertion of two items (usually
38842 + stat data and item containing "." and "..") into tree
38843 +*/
38844 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
38845 +{
38846 + return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
38847 +}
38848 +
38849 +/* this is common implementation of estimate.update method of file plugin
38850 + can be used when stat data update does not do more than inserting a unit
38851 + into a stat data item which is probably true for most cases
38852 +*/
38853 +reiser4_block_nr estimate_update_common(const struct inode * inode)
38854 +{
38855 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
38856 +}
38857 +
38858 +/* this is common implementation of estimate.unlink method of file plugin
38859 + */
38860 +reiser4_block_nr
38861 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
38862 + const struct inode * parent UNUSED_ARG)
38863 +{
38864 + return 0;
38865 +}
38866 +
38867 +/* this is common implementation of estimate.unlink method of file plugin for
38868 + typical directory
38869 +*/
38870 +reiser4_block_nr
38871 +estimate_unlink_common_dir(const struct inode * object,
38872 + const struct inode * parent)
38873 +{
38874 + dir_plugin *dplug;
38875 +
38876 + dplug = inode_dir_plugin(object);
38877 + assert("nikita-2888", dplug != NULL);
38878 + assert("nikita-2887", dplug->estimate.unlink != NULL);
38879 + return dplug->estimate.unlink(object, parent);
38880 +}
38881 +
38882 +char *wire_write_common(struct inode *inode, char *start)
38883 +{
38884 + return build_inode_onwire(inode, start);
38885 +}
38886 +
38887 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
38888 +{
38889 + return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
38890 +}
38891 +
38892 +struct dentry *wire_get_common(struct super_block *sb,
38893 + reiser4_object_on_wire * obj)
38894 +{
38895 + struct inode *inode;
38896 + struct dentry *dentry;
38897 + reiser4_key key;
38898 +
38899 + extract_key_from_id(&obj->u.std.key_id, &key);
38900 + inode = reiser4_iget(sb, &key, 1);
38901 + if (!IS_ERR(inode)) {
38902 + reiser4_iget_complete(inode);
38903 + dentry = d_alloc_anon(inode);
38904 + if (dentry == NULL) {
38905 + iput(inode);
38906 + dentry = ERR_PTR(-ENOMEM);
38907 + } else
38908 + dentry->d_op = &get_super_private(sb)->ops.dentry;
38909 + } else if (PTR_ERR(inode) == -ENOENT)
38910 + /*
38911 + * inode wasn't found at the key encoded in the file
38912 + * handle. Hence, file handle is stale.
38913 + */
38914 + dentry = ERR_PTR(RETERR(-ESTALE));
38915 + else
38916 + dentry = (void *)inode;
38917 + return dentry;
38918 +}
38919 +
38920 +int wire_size_common(struct inode *inode)
38921 +{
38922 + return inode_onwire_size(inode);
38923 +}
38924 +
38925 +void wire_done_common(reiser4_object_on_wire * obj)
38926 +{
38927 + /* nothing to do */
38928 +}
38929 +
38930 +/* helper function to print errors */
38931 +static void key_warning(const reiser4_key * key /* key to print */ ,
38932 + const struct inode *inode,
38933 + int code /* error code to print */ )
38934 +{
38935 + assert("nikita-716", key != NULL);
38936 +
38937 + if (code != -ENOMEM) {
38938 + warning("nikita-717", "Error for inode %llu (%i)",
38939 + (unsigned long long)get_key_objectid(key), code);
38940 + reiser4_print_key("for key", key);
38941 + }
38942 +}
38943 +
38944 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
38945 +#if REISER4_DEBUG
38946 +static void
38947 +check_inode_seal(const struct inode *inode,
38948 + const coord_t * coord, const reiser4_key * key)
38949 +{
38950 + reiser4_key unit_key;
38951 +
38952 + unit_key_by_coord(coord, &unit_key);
38953 + assert("nikita-2752",
38954 + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
38955 + assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
38956 +}
38957 +
38958 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
38959 +{
38960 + reiser4_key ukey;
38961 +
38962 + coord_clear_iplug(coord);
38963 + if (zload(coord->node))
38964 + return;
38965 +
38966 + if (!coord_is_existing_unit(coord) ||
38967 + !item_plugin_by_coord(coord) ||
38968 + !keyeq(unit_key_by_coord(coord, &ukey), key) ||
38969 + (znode_get_level(coord->node) != LEAF_LEVEL) ||
38970 + !item_is_statdata(coord)) {
38971 + warning("nikita-1901", "Conspicuous seal");
38972 + reiser4_print_key("key", key);
38973 + print_coord("coord", coord, 1);
38974 + impossible("nikita-2877", "no way");
38975 + }
38976 + zrelse(coord->node);
38977 +}
38978 +
38979 +#else
38980 +#define check_inode_seal(inode, coord, key) noop
38981 +#define check_sd_coord(coord, key) noop
38982 +#endif
38983 +
38984 +/* insert new stat-data into tree. Called with inode state
38985 + locked. Return inode state locked. */
38986 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
38987 +{
38988 + int result;
38989 + reiser4_key key;
38990 + coord_t coord;
38991 + reiser4_item_data data;
38992 + char *area;
38993 + reiser4_inode *ref;
38994 + lock_handle lh;
38995 + oid_t oid;
38996 +
38997 + assert("nikita-723", inode != NULL);
38998 + assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
38999 +
39000 + ref = reiser4_inode_data(inode);
39001 + spin_lock_inode(inode);
39002 +
39003 + if (ref->plugin_mask != 0)
39004 + /* inode has non-standard plugins */
39005 + inode_set_extension(inode, PLUGIN_STAT);
39006 + /*
39007 + * prepare specification of new item to be inserted
39008 + */
39009 +
39010 + data.iplug = inode_sd_plugin(inode);
39011 + data.length = data.iplug->s.sd.save_len(inode);
39012 + spin_unlock_inode(inode);
39013 +
39014 + data.data = NULL;
39015 + data.user = 0;
39016 +/* could be optimized for case where there is only one node format in
39017 + * use in the filesystem, probably there are lots of such
39018 + * places we could optimize for only one node layout.... -Hans */
39019 + if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
39020 + /* This is silly check, but we don't know actual node where
39021 + insertion will go into. */
39022 + return RETERR(-ENAMETOOLONG);
39023 + }
39024 + oid = oid_allocate(inode->i_sb);
39025 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
39026 + if (oid == ABSOLUTE_MAX_OID)
39027 + return RETERR(-EOVERFLOW);
39028 +
39029 + set_inode_oid(inode, oid);
39030 +
39031 + coord_init_zero(&coord);
39032 + init_lh(&lh);
39033 +
39034 + result = insert_by_key(reiser4_tree_by_inode(inode),
39035 + build_sd_key(inode, &key), &data, &coord, &lh,
39036 + /* stat data lives on a leaf level */
39037 + LEAF_LEVEL, CBK_UNIQUE);
39038 +
39039 + /* we don't want to re-check that somebody didn't insert
39040 + stat-data while we were doing io, because if it did,
39041 + insert_by_key() returned error. */
39042 + /* but what _is_ possible is that plugin for inode's stat-data,
39043 + list of non-standard plugins or their state would change
39044 + during io, so that stat-data wouldn't fit into sd. To avoid
39045 + this race we keep inode_state lock. This lock has to be
39046 + taken each time you access inode in a way that would cause
39047 + changes in sd size: changing plugins etc.
39048 + */
39049 +
39050 + if (result == IBK_INSERT_OK) {
39051 + coord_clear_iplug(&coord);
39052 + result = zload(coord.node);
39053 + if (result == 0) {
39054 + /* have we really inserted stat data? */
39055 + assert("nikita-725", item_is_statdata(&coord));
39056 +
39057 + /* inode was just created. It is inserted into hash
39058 + table, but no directory entry was yet inserted into
39059 + parent. So, inode is inaccessible through
39060 + ->lookup(). All places that directly grab inode
39061 + from hash-table (like old knfsd), should check
39062 + IMMUTABLE flag that is set by common_create_child.
39063 + */
39064 + assert("nikita-3240", data.iplug != NULL);
39065 + assert("nikita-3241", data.iplug->s.sd.save != NULL);
39066 + area = item_body_by_coord(&coord);
39067 + result = data.iplug->s.sd.save(inode, &area);
39068 + znode_make_dirty(coord.node);
39069 + if (result == 0) {
39070 + /* object has stat-data now */
39071 + reiser4_inode_clr_flag(inode, REISER4_NO_SD);
39072 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39073 + /* initialise stat-data seal */
39074 + reiser4_seal_init(&ref->sd_seal, &coord, &key);
39075 + ref->sd_coord = coord;
39076 + check_inode_seal(inode, &coord, &key);
39077 + } else if (result != -ENOMEM)
39078 + /*
39079 + * convert any other error code to -EIO to
39080 + * avoid confusing user level with unexpected
39081 + * errors.
39082 + */
39083 + result = RETERR(-EIO);
39084 + zrelse(coord.node);
39085 + }
39086 + }
39087 + done_lh(&lh);
39088 +
39089 + if (result != 0)
39090 + key_warning(&key, inode, result);
39091 + else
39092 + oid_count_allocated();
39093 +
39094 + return result;
39095 +}
39096 +
39097 +/* find sd of inode in a tree, deal with errors */
39098 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
39099 + znode_lock_mode lock_mode /* lock mode */ ,
39100 + coord_t * coord /* resulting coord */ ,
39101 + lock_handle * lh /* resulting lock handle */ ,
39102 + const reiser4_key * key /* resulting key */ ,
39103 + int silent)
39104 +{
39105 + int result;
39106 + __u32 flags;
39107 +
39108 + assert("nikita-1692", inode != NULL);
39109 + assert("nikita-1693", coord != NULL);
39110 + assert("nikita-1694", key != NULL);
39111 +
39112 + /* look for the object's stat data in a tree.
39113 + This returns in "node" pointer to a locked znode and in "pos"
39114 + position of an item found in node. Both are only valid if
39115 + coord_found is returned. */
39116 + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
39117 + flags |= CBK_UNIQUE;
39118 + /*
39119 + * traverse tree to find stat data. We cannot use vroot here, because
39120 + * it only covers _body_ of the file, and stat data don't belong
39121 + * there.
39122 + */
39123 + result = coord_by_key(reiser4_tree_by_inode(inode),
39124 + key,
39125 + coord,
39126 + lh,
39127 + lock_mode,
39128 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
39129 + if (REISER4_DEBUG && result == 0)
39130 + check_sd_coord(coord, key);
39131 +
39132 + if (result != 0 && !silent)
39133 + key_warning(key, inode, result);
39134 + return result;
39135 +}
39136 +
39137 +static int
39138 +locate_inode_sd(struct inode *inode,
39139 + reiser4_key * key, coord_t * coord, lock_handle * lh)
39140 +{
39141 + reiser4_inode *state;
39142 + seal_t seal;
39143 + int result;
39144 +
39145 + assert("nikita-3483", inode != NULL);
39146 +
39147 + state = reiser4_inode_data(inode);
39148 + spin_lock_inode(inode);
39149 + *coord = state->sd_coord;
39150 + coord_clear_iplug(coord);
39151 + seal = state->sd_seal;
39152 + spin_unlock_inode(inode);
39153 +
39154 + build_sd_key(inode, key);
39155 + if (reiser4_seal_is_set(&seal)) {
39156 + /* first, try to use seal */
39157 + result = reiser4_seal_validate(&seal,
39158 + coord,
39159 + key,
39160 + lh, ZNODE_WRITE_LOCK,
39161 + ZNODE_LOCK_LOPRI);
39162 + if (result == 0)
39163 + check_sd_coord(coord, key);
39164 + } else
39165 + result = -E_REPEAT;
39166 +
39167 + if (result != 0) {
39168 + coord_init_zero(coord);
39169 + result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
39170 + }
39171 + return result;
39172 +}
39173 +
39174 +#if REISER4_DEBUG
39175 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
39176 +{
39177 + return (get_key_locality(k1) == get_key_locality(k2) &&
39178 + get_key_type(k1) == get_key_type(k2) &&
39179 + get_key_band(k1) == get_key_band(k2) &&
39180 + get_key_ordering(k1) == get_key_ordering(k2) &&
39181 + get_key_objectid(k1) == get_key_objectid(k2));
39182 +}
39183 +
39184 +#include "../tree_walk.h"
39185 +
39186 +/* make some checks before and after stat-data resize operation */
39187 +static int check_sd_resize(struct inode * inode, coord_t * coord,
39188 + int length, int progress /* 1 means after resize */)
39189 +{
39190 + int ret = 0;
39191 + lock_handle left_lock;
39192 + coord_t left_coord;
39193 + reiser4_key left_key;
39194 + reiser4_key key;
39195 +
39196 + if (inode_file_plugin(inode) !=
39197 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
39198 + return 0;
39199 + if (!length)
39200 + return 0;
39201 + if (coord->item_pos != 0)
39202 + return 0;
39203 +
39204 + init_lh(&left_lock);
39205 + ret = reiser4_get_left_neighbor(&left_lock,
39206 + coord->node,
39207 + ZNODE_WRITE_LOCK,
39208 + GN_CAN_USE_UPPER_LEVELS);
39209 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
39210 + ret == -ENOENT || ret == -EINVAL
39211 + || ret == -E_DEADLOCK) {
39212 + ret = 0;
39213 + goto exit;
39214 + }
39215 + ret = zload(left_lock.node);
39216 + if (ret)
39217 + goto exit;
39218 + coord_init_last_unit(&left_coord, left_lock.node);
39219 + item_key_by_coord(&left_coord, &left_key);
39220 + item_key_by_coord(coord, &key);
39221 +
39222 + if (all_but_offset_key_eq(&key, &left_key))
39223 + /* corruption occurred */
39224 + ret = 1;
39225 + zrelse(left_lock.node);
39226 + exit:
39227 + done_lh(&left_lock);
39228 + return ret;
39229 +}
39230 +#endif
39231 +
39232 +/* update stat-data at @coord */
39233 +static int
39234 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
39235 + lock_handle * lh)
39236 +{
39237 + int result;
39238 + reiser4_item_data data;
39239 + char *area;
39240 + reiser4_inode *state;
39241 + znode *loaded;
39242 +
39243 + state = reiser4_inode_data(inode);
39244 +
39245 + coord_clear_iplug(coord);
39246 + result = zload(coord->node);
39247 + if (result != 0)
39248 + return result;
39249 + loaded = coord->node;
39250 +
39251 + spin_lock_inode(inode);
39252 + assert("nikita-728", inode_sd_plugin(inode) != NULL);
39253 + data.iplug = inode_sd_plugin(inode);
39254 +
39255 + /* if inode has non-standard plugins, add appropriate stat data
39256 + * extension */
39257 + if (state->extmask & (1 << PLUGIN_STAT)) {
39258 + if (state->plugin_mask == 0)
39259 + inode_clr_extension(inode, PLUGIN_STAT);
39260 + } else if (state->plugin_mask != 0)
39261 + inode_set_extension(inode, PLUGIN_STAT);
39262 +
39263 + if (state->extmask & (1 << HEIR_STAT)) {
39264 + if (state->heir_mask == 0)
39265 + inode_clr_extension(inode, HEIR_STAT);
39266 + } else if (state->heir_mask != 0)
39267 + inode_set_extension(inode, HEIR_STAT);
39268 +
39269 + /* data.length is how much space to add to (or remove
39270 + from if negative) sd */
39271 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
39272 + /* recalculate stat-data length */
39273 + data.length =
39274 + data.iplug->s.sd.save_len(inode) -
39275 + item_length_by_coord(coord);
39276 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39277 + } else
39278 + data.length = 0;
39279 + spin_unlock_inode(inode);
39280 +
39281 + /* if on-disk stat data is of different length than required
39282 + for this inode, resize it */
39283 +
39284 + if (data.length != 0) {
39285 + data.data = NULL;
39286 + data.user = 0;
39287 +
39288 + assert("edward-1441",
39289 + !check_sd_resize(inode, coord,
39290 + data.length, 0/* before resize */));
39291 +
39292 + /* insertion code requires that insertion point (coord) was
39293 + * between units. */
39294 + coord->between = AFTER_UNIT;
39295 + result = reiser4_resize_item(coord, &data, key, lh,
39296 + COPI_DONT_SHIFT_LEFT);
39297 + if (result != 0) {
39298 + key_warning(key, inode, result);
39299 + zrelse(loaded);
39300 + return result;
39301 + }
39302 + if (loaded != coord->node) {
39303 + /* reiser4_resize_item moved coord to another node.
39304 + Zload it */
39305 + zrelse(loaded);
39306 + coord_clear_iplug(coord);
39307 + result = zload(coord->node);
39308 + if (result != 0)
39309 + return result;
39310 + loaded = coord->node;
39311 + }
39312 + assert("edward-1442",
39313 + !check_sd_resize(inode, coord,
39314 + data.length, 1/* after resize */));
39315 + }
39316 + area = item_body_by_coord(coord);
39317 + spin_lock_inode(inode);
39318 + result = data.iplug->s.sd.save(inode, &area);
39319 + znode_make_dirty(coord->node);
39320 +
39321 + /* re-initialise stat-data seal */
39322 +
39323 + /*
39324 + * coord.between was possibly skewed from AT_UNIT when stat-data size
39325 + * was changed and new extensions were pasted into item.
39326 + */
39327 + coord->between = AT_UNIT;
39328 + reiser4_seal_init(&state->sd_seal, coord, key);
39329 + state->sd_coord = *coord;
39330 + spin_unlock_inode(inode);
39331 + check_inode_seal(inode, coord, key);
39332 + zrelse(loaded);
39333 + return result;
39334 +}
39335 +
39336 +/* Update existing stat-data in a tree. Called with inode state locked. Return
39337 + inode state locked. */
39338 +static int update_sd(struct inode *inode /* inode to update sd for */ )
39339 +{
39340 + int result;
39341 + reiser4_key key;
39342 + coord_t coord;
39343 + lock_handle lh;
39344 +
39345 + assert("nikita-726", inode != NULL);
39346 +
39347 + /* no stat-data, nothing to update?! */
39348 + assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
39349 +
39350 + init_lh(&lh);
39351 +
39352 + result = locate_inode_sd(inode, &key, &coord, &lh);
39353 + if (result == 0)
39354 + result = update_sd_at(inode, &coord, &key, &lh);
39355 + done_lh(&lh);
39356 +
39357 + return result;
39358 +}
39359 +
39360 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
39361 + Remove object stat data. Space for that must be reserved by caller before
39362 +*/
39363 +static int
39364 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
39365 +{
39366 + int result;
39367 +
39368 + assert("nikita-1477", inode != NULL);
39369 +
39370 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39371 + reiser4_key sd_key;
39372 +
39373 + DQUOT_FREE_INODE(inode);
39374 + DQUOT_DROP(inode);
39375 +
39376 + build_sd_key(inode, &sd_key);
39377 + result =
39378 + reiser4_cut_tree(reiser4_tree_by_inode(inode),
39379 + &sd_key, &sd_key, NULL, 0);
39380 + if (result == 0) {
39381 + reiser4_inode_set_flag(inode, REISER4_NO_SD);
39382 + result = oid_release(inode->i_sb, get_inode_oid(inode));
39383 + if (result == 0) {
39384 + oid_count_released();
39385 +
39386 + result = safe_link_del(reiser4_tree_by_inode(inode),
39387 + get_inode_oid(inode),
39388 + SAFE_UNLINK);
39389 + }
39390 + }
39391 + } else
39392 + result = 0;
39393 + return result;
39394 +}
39395 +
39396 +/* helper for safelink_common */
39397 +static int process_truncate(struct inode *inode, __u64 size)
39398 +{
39399 + int result;
39400 + struct iattr attr;
39401 + file_plugin *fplug;
39402 + reiser4_context *ctx;
39403 + struct dentry dentry;
39404 +
39405 + assert("vs-21", is_in_reiser4_context());
39406 + ctx = reiser4_init_context(inode->i_sb);
39407 + assert("vs-22", !IS_ERR(ctx));
39408 +
39409 + attr.ia_size = size;
39410 + attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
39411 + fplug = inode_file_plugin(inode);
39412 +
39413 + mutex_lock(&inode->i_mutex);
39414 + assert("vs-1704", get_current_context()->trans->atom == NULL);
39415 + dentry.d_inode = inode;
39416 + result = inode->i_op->setattr(&dentry, &attr);
39417 + mutex_unlock(&inode->i_mutex);
39418 +
39419 + context_set_commit_async(ctx);
39420 + reiser4_exit_context(ctx);
39421 +
39422 + return result;
39423 +}
39424 +
39425 +/*
39426 + Local variables:
39427 + c-indentation-style: "K&R"
39428 + mode-name: "LC"
39429 + c-basic-offset: 8
39430 + tab-width: 8
39431 + fill-column: 80
39432 + scroll-step: 1
39433 + End:
39434 +*/
39435 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/hash.c linux-2.6.23/fs/reiser4/plugin/hash.c
39436 --- linux-2.6.23.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
39437 +++ linux-2.6.23/fs/reiser4/plugin/hash.c 2007-12-04 16:49:30.000000000 +0300
39438 @@ -0,0 +1,353 @@
39439 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
39440 + * reiser4/README */
39441 +
39442 +/* Hash functions */
39443 +
39444 +#include "../debug.h"
39445 +#include "plugin_header.h"
39446 +#include "plugin.h"
39447 +#include "../super.h"
39448 +#include "../inode.h"
39449 +
39450 +#include <linux/types.h>
39451 +
39452 +/* old rupasov (yura) hash */
39453 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
39454 + int len /* @name's length */ )
39455 +{
39456 + int i;
39457 + int j;
39458 + int pow;
39459 + __u64 a;
39460 + __u64 c;
39461 +
39462 + assert("nikita-672", name != NULL);
39463 + assert("nikita-673", len >= 0);
39464 +
39465 + for (pow = 1, i = 1; i < len; ++i)
39466 + pow = pow * 10;
39467 +
39468 + if (len == 1)
39469 + a = name[0] - 48;
39470 + else
39471 + a = (name[0] - 48) * pow;
39472 +
39473 + for (i = 1; i < len; ++i) {
39474 + c = name[i] - 48;
39475 + for (pow = 1, j = i; j < len - 1; ++j)
39476 + pow = pow * 10;
39477 + a = a + c * pow;
39478 + }
39479 + for (; i < 40; ++i) {
39480 + c = '0' - 48;
39481 + for (pow = 1, j = i; j < len - 1; ++j)
39482 + pow = pow * 10;
39483 + a = a + c * pow;
39484 + }
39485 +
39486 + for (; i < 256; ++i) {
39487 + c = i;
39488 + for (pow = 1, j = i; j < len - 1; ++j)
39489 + pow = pow * 10;
39490 + a = a + c * pow;
39491 + }
39492 +
39493 + a = a << 7;
39494 + return a;
39495 +}
39496 +
39497 +/* r5 hash */
39498 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
39499 + int len UNUSED_ARG /* @name's length */ )
39500 +{
39501 + __u64 a = 0;
39502 +
39503 + assert("nikita-674", name != NULL);
39504 + assert("nikita-675", len >= 0);
39505 +
39506 + while (*name) {
39507 + a += *name << 4;
39508 + a += *name >> 4;
39509 + a *= 11;
39510 + name++;
39511 + }
39512 + return a;
39513 +}
39514 +
39515 +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
39516 + H0 = Key
39517 + Hi = E Mi(Hi-1) + Hi-1
39518 +
39519 + (see Applied Cryptography, 2nd edition, p448).
39520 +
39521 + Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
39522 +
39523 + Jeremy has agreed to the contents of reiserfs/README. -Hans
39524 +
39525 + This code was blindly upgraded to __u64 by s/__u32/__u64/g.
39526 +*/
39527 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
39528 + int len /* @name's length */ )
39529 +{
39530 + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
39531 +
39532 + __u64 h0 = k[0], h1 = k[1];
39533 + __u64 a, b, c, d;
39534 + __u64 pad;
39535 + int i;
39536 +
39537 + assert("nikita-676", name != NULL);
39538 + assert("nikita-677", len >= 0);
39539 +
39540 +#define DELTA 0x9E3779B9u
39541 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
39542 +#define PARTROUNDS 6 /* 6 gets complete mixing */
39543 +
39544 +/* a, b, c, d - data; h0, h1 - accumulated hash */
39545 +#define TEACORE(rounds) \
39546 + do { \
39547 + __u64 sum = 0; \
39548 + int n = rounds; \
39549 + __u64 b0, b1; \
39550 + \
39551 + b0 = h0; \
39552 + b1 = h1; \
39553 + \
39554 + do \
39555 + { \
39556 + sum += DELTA; \
39557 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
39558 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
39559 + } while(--n); \
39560 + \
39561 + h0 += b0; \
39562 + h1 += b1; \
39563 + } while(0)
39564 +
39565 + pad = (__u64) len | ((__u64) len << 8);
39566 + pad |= pad << 16;
39567 +
39568 + while (len >= 16) {
39569 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39570 + 16 | (__u64) name[3] << 24;
39571 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39572 + 16 | (__u64) name[7] << 24;
39573 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39574 + 16 | (__u64) name[11] << 24;
39575 + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
39576 + << 16 | (__u64) name[15] << 24;
39577 +
39578 + TEACORE(PARTROUNDS);
39579 +
39580 + len -= 16;
39581 + name += 16;
39582 + }
39583 +
39584 + if (len >= 12) {
39585 + //assert(len < 16);
39586 + if (len >= 16)
39587 + *(int *)0 = 0;
39588 +
39589 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39590 + 16 | (__u64) name[3] << 24;
39591 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39592 + 16 | (__u64) name[7] << 24;
39593 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39594 + 16 | (__u64) name[11] << 24;
39595 +
39596 + d = pad;
39597 + for (i = 12; i < len; i++) {
39598 + d <<= 8;
39599 + d |= name[i];
39600 + }
39601 + } else if (len >= 8) {
39602 + //assert(len < 12);
39603 + if (len >= 12)
39604 + *(int *)0 = 0;
39605 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39606 + 16 | (__u64) name[3] << 24;
39607 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39608 + 16 | (__u64) name[7] << 24;
39609 +
39610 + c = d = pad;
39611 + for (i = 8; i < len; i++) {
39612 + c <<= 8;
39613 + c |= name[i];
39614 + }
39615 + } else if (len >= 4) {
39616 + //assert(len < 8);
39617 + if (len >= 8)
39618 + *(int *)0 = 0;
39619 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39620 + 16 | (__u64) name[3] << 24;
39621 +
39622 + b = c = d = pad;
39623 + for (i = 4; i < len; i++) {
39624 + b <<= 8;
39625 + b |= name[i];
39626 + }
39627 + } else {
39628 + //assert(len < 4);
39629 + if (len >= 4)
39630 + *(int *)0 = 0;
39631 + a = b = c = d = pad;
39632 + for (i = 0; i < len; i++) {
39633 + a <<= 8;
39634 + a |= name[i];
39635 + }
39636 + }
39637 +
39638 + TEACORE(FULLROUNDS);
39639 +
39640 +/* return 0;*/
39641 + return h0 ^ h1;
39642 +
39643 +}
39644 +
39645 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
39646 +
39647 + See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
39648 +
39649 + Excerpts:
39650 +
39651 + FNV hashes are designed to be fast while maintaining a low collision
39652 + rate.
39653 +
39654 + [This version also seems to preserve lexicographical order locally.]
39655 +
39656 + FNV hash algorithms and source code have been released into the public
39657 + domain.
39658 +
39659 +*/
39660 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
39661 + int len UNUSED_ARG /* @name's length */ )
39662 +{
39663 + unsigned long long a = 0xcbf29ce484222325ull;
39664 + const unsigned long long fnv_64_prime = 0x100000001b3ull;
39665 +
39666 + assert("nikita-678", name != NULL);
39667 + assert("nikita-679", len >= 0);
39668 +
39669 + /* FNV-1 hash each octet in the buffer */
39670 + for (; *name; ++name) {
39671 + /* multiply by the 32 bit FNV magic prime mod 2^64 */
39672 + a *= fnv_64_prime;
39673 + /* xor the bottom with the current octet */
39674 + a ^= (unsigned long long)(*name);
39675 + }
39676 + /* return our new hash value */
39677 + return a;
39678 +}
39679 +
39680 +/* degenerate hash function used to simplify testing of non-unique key
39681 + handling */
39682 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
39683 + int len UNUSED_ARG /* @name's length */ )
39684 +{
39685 + return 0xc0c0c0c010101010ull;
39686 +}
39687 +
39688 +static int change_hash(struct inode *inode,
39689 + reiser4_plugin * plugin,
39690 + pset_member memb)
39691 +{
39692 + int result;
39693 +
39694 + assert("nikita-3503", inode != NULL);
39695 + assert("nikita-3504", plugin != NULL);
39696 +
39697 + assert("nikita-3505", is_reiser4_inode(inode));
39698 + assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
39699 +
39700 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
39701 + return RETERR(-EINVAL);
39702 +
39703 + result = 0;
39704 + if (inode_hash_plugin(inode) == NULL ||
39705 + inode_hash_plugin(inode)->h.id != plugin->h.id) {
39706 + if (is_dir_empty(inode) == 0)
39707 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
39708 + PSET_HASH, plugin);
39709 + else
39710 + result = RETERR(-ENOTEMPTY);
39711 +
39712 + }
39713 + return result;
39714 +}
39715 +
39716 +static reiser4_plugin_ops hash_plugin_ops = {
39717 + .init = NULL,
39718 + .load = NULL,
39719 + .save_len = NULL,
39720 + .save = NULL,
39721 + .change = change_hash
39722 +};
39723 +
39724 +/* hash plugins */
39725 +hash_plugin hash_plugins[LAST_HASH_ID] = {
39726 + [RUPASOV_HASH_ID] = {
39727 + .h = {
39728 + .type_id = REISER4_HASH_PLUGIN_TYPE,
39729 + .id = RUPASOV_HASH_ID,
39730 + .pops = &hash_plugin_ops,
39731 + .label = "rupasov",
39732 + .desc = "Original Yura's hash",
39733 + .linkage = {NULL, NULL}
39734 + },
39735 + .hash = hash_rupasov
39736 + },
39737 + [R5_HASH_ID] = {
39738 + .h = {
39739 + .type_id = REISER4_HASH_PLUGIN_TYPE,
39740 + .id = R5_HASH_ID,
39741 + .pops = &hash_plugin_ops,
39742 + .label = "r5",
39743 + .desc = "r5 hash",
39744 + .linkage = {NULL, NULL}
39745 + },
39746 + .hash = hash_r5
39747 + },
39748 + [TEA_HASH_ID] = {
39749 + .h = {
39750 + .type_id = REISER4_HASH_PLUGIN_TYPE,
39751 + .id = TEA_HASH_ID,
39752 + .pops = &hash_plugin_ops,
39753 + .label = "tea",
39754 + .desc = "tea hash",
39755 + .linkage = {NULL, NULL}
39756 + },
39757 + .hash = hash_tea
39758 + },
39759 + [FNV1_HASH_ID] = {
39760 + .h = {
39761 + .type_id = REISER4_HASH_PLUGIN_TYPE,
39762 + .id = FNV1_HASH_ID,
39763 + .pops = &hash_plugin_ops,
39764 + .label = "fnv1",
39765 + .desc = "fnv1 hash",
39766 + .linkage = {NULL, NULL}
39767 + },
39768 + .hash = hash_fnv1
39769 + },
39770 + [DEGENERATE_HASH_ID] = {
39771 + .h = {
39772 + .type_id = REISER4_HASH_PLUGIN_TYPE,
39773 + .id = DEGENERATE_HASH_ID,
39774 + .pops = &hash_plugin_ops,
39775 + .label = "degenerate hash",
39776 + .desc = "Degenerate hash: only for testing",
39777 + .linkage = {NULL, NULL}
39778 + },
39779 + .hash = hash_deg
39780 + }
39781 +};
39782 +
39783 +/* Make Linus happy.
39784 + Local variables:
39785 + c-indentation-style: "K&R"
39786 + mode-name: "LC"
39787 + c-basic-offset: 8
39788 + tab-width: 8
39789 + fill-column: 120
39790 + End:
39791 +*/
39792 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.23/fs/reiser4/plugin/inode_ops.c
39793 --- linux-2.6.23.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
39794 +++ linux-2.6.23/fs/reiser4/plugin/inode_ops.c 2007-12-04 16:49:30.000000000 +0300
39795 @@ -0,0 +1,897 @@
39796 +/*
39797 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
39798 + */
39799 +
39800 +/*
39801 + * this file contains typical implementations for most of methods of struct
39802 + * inode_operations
39803 + */
39804 +
39805 +#include "../inode.h"
39806 +#include "../safe_link.h"
39807 +
39808 +#include <linux/quotaops.h>
39809 +#include <linux/namei.h>
39810 +
39811 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
39812 + reiser4_object_create_data *data);
39813 +
39814 +/**
39815 + * reiser4_create_common - create of inode operations
39816 + * @parent: inode of parent directory
39817 + * @dentry: dentry of new object to create
39818 + * @mode: the permissions to use
39819 + * @nameidata:
39820 + *
39821 + * This is common implementation of vfs's create method of struct
39822 + * inode_operations.
39823 + * Creates regular file using file plugin from parent directory plugin set.
39824 + */
39825 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
39826 + int mode, struct nameidata *nameidata)
39827 +{
39828 + reiser4_object_create_data data;
39829 + file_plugin *fplug;
39830 +
39831 + memset(&data, 0, sizeof data);
39832 + data.mode = S_IFREG | mode;
39833 + fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
39834 + if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
39835 + warning("vpf-1900", "'%s' is not a regular file plugin.",
39836 + fplug->h.label);
39837 + return RETERR(-EIO);
39838 + }
39839 + data.id = fplug->h.id;
39840 + return create_vfs_object(parent, dentry, &data);
39841 +}
39842 +
39843 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
39844 +void check_light_weight(struct inode *inode, struct inode *parent);
39845 +
39846 +/**
39847 + * reiser4_lookup_common - lookup of inode operations
39848 + * @parent: inode of directory to lookup into
39849 + * @dentry: name to look for
39850 + * @nameidata:
39851 + *
39852 + * This is common implementation of vfs's lookup method of struct
39853 + * inode_operations.
39854 + */
39855 +struct dentry *reiser4_lookup_common(struct inode *parent,
39856 + struct dentry *dentry,
39857 + struct nameidata *nameidata)
39858 +{
39859 + reiser4_context *ctx;
39860 + int result;
39861 + struct dentry *new;
39862 + struct inode *inode;
39863 + reiser4_dir_entry_desc entry;
39864 +
39865 + ctx = reiser4_init_context(parent->i_sb);
39866 + if (IS_ERR(ctx))
39867 + return (struct dentry *)ctx;
39868 +
39869 + /* set up operations on dentry. */
39870 + dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
39871 +
39872 + result = reiser4_lookup_name(parent, dentry, &entry.key);
39873 + if (result) {
39874 + context_set_commit_async(ctx);
39875 + reiser4_exit_context(ctx);
39876 + if (result == -ENOENT) {
39877 + /* object not found */
39878 + if (!IS_DEADDIR(parent))
39879 + d_add(dentry, NULL);
39880 + return NULL;
39881 + }
39882 + return ERR_PTR(result);
39883 + }
39884 +
39885 + inode = reiser4_iget(parent->i_sb, &entry.key, 0);
39886 + if (IS_ERR(inode)) {
39887 + context_set_commit_async(ctx);
39888 + reiser4_exit_context(ctx);
39889 + return ERR_PTR(PTR_ERR(inode));
39890 + }
39891 +
39892 + /* success */
39893 + check_light_weight(inode, parent);
39894 + new = d_splice_alias(inode, dentry);
39895 + reiser4_iget_complete(inode);
39896 +
39897 + /* prevent balance_dirty_pages() from being called: we don't want to
39898 + * do this under directory i_mutex. */
39899 + context_set_commit_async(ctx);
39900 + reiser4_exit_context(ctx);
39901 + return new;
39902 +}
39903 +
39904 +static reiser4_block_nr common_estimate_link(struct inode *parent,
39905 + struct inode *object);
39906 +int reiser4_update_dir(struct inode *);
39907 +
39908 +/**
39909 + * reiser4_link_common - link of inode operations
39910 + * @existing: dentry of object which is to get new name
39911 + * @parent: directory where new name is to be created
39912 + * @newname: new name
39913 + *
39914 + * This is common implementation of vfs's link method of struct
39915 + * inode_operations.
39916 + */
39917 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
39918 + struct dentry *newname)
39919 +{
39920 + reiser4_context *ctx;
39921 + int result;
39922 + struct inode *object;
39923 + dir_plugin *parent_dplug;
39924 + reiser4_dir_entry_desc entry;
39925 + reiser4_object_create_data data;
39926 + reiser4_block_nr reserve;
39927 +
39928 + ctx = reiser4_init_context(parent->i_sb);
39929 + if (IS_ERR(ctx))
39930 + return PTR_ERR(ctx);
39931 +
39932 + assert("nikita-1431", existing != NULL);
39933 + assert("nikita-1432", parent != NULL);
39934 + assert("nikita-1433", newname != NULL);
39935 +
39936 + object = existing->d_inode;
39937 + assert("nikita-1434", object != NULL);
39938 +
39939 + /* check for race with create_object() */
39940 + if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
39941 + context_set_commit_async(ctx);
39942 + reiser4_exit_context(ctx);
39943 + return RETERR(-E_REPEAT);
39944 + }
39945 +
39946 + parent_dplug = inode_dir_plugin(parent);
39947 +
39948 + memset(&entry, 0, sizeof entry);
39949 + entry.obj = object;
39950 +
39951 + data.mode = object->i_mode;
39952 + data.id = inode_file_plugin(object)->h.id;
39953 +
39954 + reserve = common_estimate_link(parent, existing->d_inode);
39955 + if ((__s64) reserve < 0) {
39956 + context_set_commit_async(ctx);
39957 + reiser4_exit_context(ctx);
39958 + return reserve;
39959 + }
39960 +
39961 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
39962 + context_set_commit_async(ctx);
39963 + reiser4_exit_context(ctx);
39964 + return RETERR(-ENOSPC);
39965 + }
39966 +
39967 + /*
39968 + * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
39969 + * means that link(2) can race against unlink(2) or rename(2), and
39970 + * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
39971 + *
39972 + * For such inode we have to undo special processing done in
39973 + * reiser4_unlink() viz. creation of safe-link.
39974 + */
39975 + if (unlikely(object->i_nlink == 0)) {
39976 + result = safe_link_del(reiser4_tree_by_inode(object),
39977 + get_inode_oid(object), SAFE_UNLINK);
39978 + if (result != 0) {
39979 + context_set_commit_async(ctx);
39980 + reiser4_exit_context(ctx);
39981 + return result;
39982 + }
39983 + }
39984 +
39985 + /* increment nlink of @existing and update its stat data */
39986 + result = reiser4_add_nlink(object, parent, 1);
39987 + if (result == 0) {
39988 + /* add entry to the parent */
39989 + result =
39990 + parent_dplug->add_entry(parent, newname, &data, &entry);
39991 + if (result != 0) {
39992 + /* failed to add entry to the parent, decrement nlink
39993 + of @existing */
39994 + reiser4_del_nlink(object, parent, 1);
39995 + /*
39996 + * now, if that failed, we have a file with too big
39997 + * nlink---space leak, much better than directory
39998 + * entry pointing to nowhere
39999 + */
40000 + }
40001 + }
40002 + if (result == 0) {
40003 + atomic_inc(&object->i_count);
40004 + /*
40005 + * Upon successful completion, link() shall mark for update
40006 + * the st_ctime field of the file. Also, the st_ctime and
40007 + * st_mtime fields of the directory that contains the new
40008 + * entry shall be marked for update. --SUS
40009 + */
40010 + result = reiser4_update_dir(parent);
40011 + }
40012 + if (result == 0)
40013 + d_instantiate(newname, existing->d_inode);
40014 +
40015 + context_set_commit_async(ctx);
40016 + reiser4_exit_context(ctx);
40017 + return result;
40018 +}
40019 +
40020 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
40021 +
40022 +/**
40023 + * reiser4_unlink_common - unlink of inode operations
40024 + * @parent: inode of directory to remove name from
40025 + * @victim: name to be removed
40026 + *
40027 + * This is common implementation of vfs's unlink method of struct
40028 + * inode_operations.
40029 + */
40030 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
40031 +{
40032 + reiser4_context *ctx;
40033 + int result;
40034 + struct inode *object;
40035 + file_plugin *fplug;
40036 +
40037 + ctx = reiser4_init_context(parent->i_sb);
40038 + if (IS_ERR(ctx))
40039 + return PTR_ERR(ctx);
40040 +
40041 + object = victim->d_inode;
40042 + fplug = inode_file_plugin(object);
40043 + assert("nikita-2882", fplug->detach != NULL);
40044 +
40045 + result = unlink_check_and_grab(parent, victim);
40046 + if (result != 0) {
40047 + context_set_commit_async(ctx);
40048 + reiser4_exit_context(ctx);
40049 + return result;
40050 + }
40051 +
40052 + result = fplug->detach(object, parent);
40053 + if (result == 0) {
40054 + dir_plugin *parent_dplug;
40055 + reiser4_dir_entry_desc entry;
40056 +
40057 + parent_dplug = inode_dir_plugin(parent);
40058 + memset(&entry, 0, sizeof entry);
40059 +
40060 + /* first, delete directory entry */
40061 + result = parent_dplug->rem_entry(parent, victim, &entry);
40062 + if (result == 0) {
40063 + /*
40064 + * if name was removed successfully, we _have_ to
40065 + * return 0 from this function, because upper level
40066 + * caller (vfs_{rmdir,unlink}) expect this.
40067 + *
40068 + * now that directory entry is removed, update
40069 + * stat-data
40070 + */
40071 + reiser4_del_nlink(object, parent, 1);
40072 + /*
40073 + * Upon successful completion, unlink() shall mark for
40074 + * update the st_ctime and st_mtime fields of the
40075 + * parent directory. Also, if the file's link count is
40076 + * not 0, the st_ctime field of the file shall be
40077 + * marked for update. --SUS
40078 + */
40079 + reiser4_update_dir(parent);
40080 + /* add safe-link for this file */
40081 + if (object->i_nlink == 0)
40082 + safe_link_add(object, SAFE_UNLINK);
40083 + }
40084 + }
40085 +
40086 + if (unlikely(result != 0)) {
40087 + if (result != -ENOMEM)
40088 + warning("nikita-3398", "Cannot unlink %llu (%i)",
40089 + (unsigned long long)get_inode_oid(object),
40090 + result);
40091 + /* if operation failed commit pending inode modifications to
40092 + * the stat-data */
40093 + reiser4_update_sd(object);
40094 + reiser4_update_sd(parent);
40095 + }
40096 +
40097 + reiser4_release_reserved(object->i_sb);
40098 +
40099 + /* @object's i_ctime was updated by ->rem_link() method(). */
40100 +
40101 + /* @victim can be already removed from the disk by this time. Inode is
40102 + then marked so that iput() wouldn't try to remove stat data. But
40103 + inode itself is still there.
40104 + */
40105 +
40106 + /*
40107 + * we cannot release directory semaphore here, because name has
40108 + * already been deleted, but dentry (@victim) still exists. Prevent
40109 + * balance_dirty_pages() from being called on exiting this context: we
40110 + * don't want to do this under directory i_mutex.
40111 + */
40112 + context_set_commit_async(ctx);
40113 + reiser4_exit_context(ctx);
40114 + return result;
40115 +}
40116 +
40117 +/**
40118 + * reiser4_symlink_common - symlink of inode operations
40119 + * @parent: inode of parent directory
40120 + * @dentry: dentry of object to be created
40121 + * @linkname: string symlink is to contain
40122 + *
40123 + * This is common implementation of vfs's symlink method of struct
40124 + * inode_operations.
40125 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
40126 + */
40127 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
40128 + const char *linkname)
40129 +{
40130 + reiser4_object_create_data data;
40131 +
40132 + memset(&data, 0, sizeof data);
40133 + data.name = linkname;
40134 + data.id = SYMLINK_FILE_PLUGIN_ID;
40135 + data.mode = S_IFLNK | S_IRWXUGO;
40136 + return create_vfs_object(parent, dentry, &data);
40137 +}
40138 +
40139 +/**
40140 + * reiser4_mkdir_common - mkdir of inode operations
40141 + * @parent: inode of parent directory
40142 + * @dentry: dentry of object to be created
40143 + * @mode: the permissions to use
40144 + *
40145 + * This is common implementation of vfs's mkdir method of struct
40146 + * inode_operations.
40147 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
40148 + */
40149 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
40150 +{
40151 + reiser4_object_create_data data;
40152 +
40153 + memset(&data, 0, sizeof data);
40154 + data.mode = S_IFDIR | mode;
40155 + data.id = DIRECTORY_FILE_PLUGIN_ID;
40156 + return create_vfs_object(parent, dentry, &data);
40157 +}
40158 +
40159 +/**
40160 + * reiser4_mknod_common - mknod of inode operations
40161 + * @parent: inode of parent directory
40162 + * @dentry: dentry of object to be created
40163 + * @mode: the permissions to use and file type
40164 + * @rdev: minor and major of new device file
40165 + *
40166 + * This is common implementation of vfs's mknod method of struct
40167 + * inode_operations.
40168 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
40169 + */
40170 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
40171 + int mode, dev_t rdev)
40172 +{
40173 + reiser4_object_create_data data;
40174 +
40175 + memset(&data, 0, sizeof data);
40176 + data.mode = mode;
40177 + data.rdev = rdev;
40178 + data.id = SPECIAL_FILE_PLUGIN_ID;
40179 + return create_vfs_object(parent, dentry, &data);
40180 +}
40181 +
40182 +/*
40183 + * implementation of vfs's rename method of struct inode_operations for typical
40184 + * directory is in inode_ops_rename.c
40185 + */
40186 +
40187 +/**
40188 + * reiser4_follow_link_common - follow_link of inode operations
40189 + * @dentry: dentry of symlink
40190 + * @data:
40191 + *
40192 + * This is common implementation of vfs's followlink method of struct
40193 + * inode_operations.
40194 + * Assumes that inode's i_private points to the content of symbolic link.
40195 + */
40196 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
40197 +{
40198 + assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
40199 +
40200 + if (!dentry->d_inode->i_private
40201 + || !reiser4_inode_get_flag(dentry->d_inode,
40202 + REISER4_GENERIC_PTR_USED))
40203 + return ERR_PTR(RETERR(-EINVAL));
40204 + nd_set_link(nd, dentry->d_inode->i_private);
40205 + return NULL;
40206 +}
40207 +
40208 +/**
40209 + * reiser4_permission_common - permission of inode operations
40210 + * @inode: inode to check permissions for
40211 + * @mask: mode bits to check permissions for
40212 + * @nameidata:
40213 + *
40214 + * Uses generic function to check for rwx permissions.
40215 + */
40216 +int reiser4_permission_common(struct inode *inode, int mask,
40217 + struct nameidata *nameidata)
40218 +{
40219 + return generic_permission(inode, mask, NULL);
40220 +}
40221 +
40222 +static int setattr_reserve(reiser4_tree *);
40223 +
40224 +/* this is common implementation of vfs's setattr method of struct
40225 + inode_operations
40226 +*/
40227 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
40228 +{
40229 + reiser4_context *ctx;
40230 + struct inode *inode;
40231 + int result;
40232 +
40233 + inode = dentry->d_inode;
40234 + result = inode_change_ok(inode, attr);
40235 + if (result)
40236 + return result;
40237 +
40238 + ctx = reiser4_init_context(inode->i_sb);
40239 + if (IS_ERR(ctx))
40240 + return PTR_ERR(ctx);
40241 +
40242 + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
40243 +
40244 + /*
40245 + * grab disk space and call standard inode_setattr().
40246 + */
40247 + result = setattr_reserve(reiser4_tree_by_inode(inode));
40248 + if (!result) {
40249 + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
40250 + || (attr->ia_valid & ATTR_GID
40251 + && attr->ia_gid != inode->i_gid)) {
40252 + result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
40253 + if (result) {
40254 + context_set_commit_async(ctx);
40255 + reiser4_exit_context(ctx);
40256 + return result;
40257 + }
40258 + }
40259 + result = inode_setattr(inode, attr);
40260 + if (!result)
40261 + reiser4_update_sd(inode);
40262 + }
40263 +
40264 + context_set_commit_async(ctx);
40265 + reiser4_exit_context(ctx);
40266 + return result;
40267 +}
40268 +
40269 +/* this is common implementation of vfs's getattr method of struct
40270 + inode_operations
40271 +*/
40272 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
40273 + struct dentry *dentry, struct kstat *stat)
40274 +{
40275 + struct inode *obj;
40276 +
40277 + assert("nikita-2298", dentry != NULL);
40278 + assert("nikita-2299", stat != NULL);
40279 + assert("nikita-2300", dentry->d_inode != NULL);
40280 +
40281 + obj = dentry->d_inode;
40282 +
40283 + stat->dev = obj->i_sb->s_dev;
40284 + stat->ino = oid_to_uino(get_inode_oid(obj));
40285 + stat->mode = obj->i_mode;
40286 + /* don't confuse userland with huge nlink. This is not entirely
40287 + * correct, because nlink_t is not necessary 16 bit signed. */
40288 + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
40289 + stat->uid = obj->i_uid;
40290 + stat->gid = obj->i_gid;
40291 + stat->rdev = obj->i_rdev;
40292 + stat->atime = obj->i_atime;
40293 + stat->mtime = obj->i_mtime;
40294 + stat->ctime = obj->i_ctime;
40295 + stat->size = obj->i_size;
40296 + stat->blocks =
40297 + (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
40298 + /* "preferred" blocksize for efficient file system I/O */
40299 + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
40300 +
40301 + return 0;
40302 +}
40303 +
40304 +/* Estimate the maximum amount of nodes which might be allocated or changed on
40305 + typical new object creation. Typical creation consists of calling create
40306 + method of file plugin, adding directory entry to parent and update parent
40307 + directory's stat data.
40308 +*/
40309 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
40310 + struct inode *object
40311 + /* object */ )
40312 +{
40313 + assert("vpf-309", parent != NULL);
40314 + assert("vpf-307", object != NULL);
40315 +
40316 + return
40317 + /* object creation estimation */
40318 + inode_file_plugin(object)->estimate.create(object) +
40319 + /* stat data of parent directory estimation */
40320 + inode_file_plugin(parent)->estimate.update(parent) +
40321 + /* adding entry estimation */
40322 + inode_dir_plugin(parent)->estimate.add_entry(parent) +
40323 + /* to undo in the case of failure */
40324 + inode_dir_plugin(parent)->estimate.rem_entry(parent);
40325 +}
40326 +
40327 +/* Create child in directory.
40328 +
40329 + . get object's plugin
40330 + . get fresh inode
40331 + . initialize inode
40332 + . add object's stat-data
40333 + . initialize object's directory
40334 + . add entry to the parent
40335 + . instantiate dentry
40336 +
40337 +*/
40338 +static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
40339 + object */
40340 + struct inode **retobj)
40341 +{
40342 + int result;
40343 +
40344 + struct dentry *dentry; /* parent object */
40345 + struct inode *parent; /* new name */
40346 +
40347 + dir_plugin *par_dir; /* directory plugin on the parent */
40348 + dir_plugin *obj_dir; /* directory plugin on the new object */
40349 + file_plugin *obj_plug; /* object plugin on the new object */
40350 + struct inode *object; /* new object */
40351 + reiser4_block_nr reserve;
40352 +
40353 + reiser4_dir_entry_desc entry; /* new directory entry */
40354 +
40355 + assert("nikita-1420", data != NULL);
40356 + parent = data->parent;
40357 + dentry = data->dentry;
40358 +
40359 + assert("nikita-1418", parent != NULL);
40360 + assert("nikita-1419", dentry != NULL);
40361 +
40362 + /* check, that name is acceptable for parent */
40363 + par_dir = inode_dir_plugin(parent);
40364 + if (par_dir->is_name_acceptable &&
40365 + !par_dir->is_name_acceptable(parent,
40366 + dentry->d_name.name,
40367 + (int)dentry->d_name.len))
40368 + return RETERR(-ENAMETOOLONG);
40369 +
40370 + result = 0;
40371 + obj_plug = file_plugin_by_id((int)data->id);
40372 + if (obj_plug == NULL) {
40373 + warning("nikita-430", "Cannot find plugin %i", data->id);
40374 + return RETERR(-ENOENT);
40375 + }
40376 + object = new_inode(parent->i_sb);
40377 + if (object == NULL)
40378 + return RETERR(-ENOMEM);
40379 + /* we'll update i_nlink below */
40380 + object->i_nlink = 0;
40381 + /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
40382 + * to simplify error handling: if some error occurs before i_ino is
40383 + * initialized with oid, i_ino should already be set to some
40384 + * distinguished value. */
40385 + object->i_ino = 0;
40386 +
40387 + /* So that on error iput will be called. */
40388 + *retobj = object;
40389 +
40390 + if (DQUOT_ALLOC_INODE(object)) {
40391 + DQUOT_DROP(object);
40392 + object->i_flags |= S_NOQUOTA;
40393 + return RETERR(-EDQUOT);
40394 + }
40395 +
40396 + memset(&entry, 0, sizeof entry);
40397 + entry.obj = object;
40398 +
40399 + set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
40400 + file_plugin_to_plugin(obj_plug));
40401 + result = obj_plug->set_plug_in_inode(object, parent, data);
40402 + if (result) {
40403 + warning("nikita-431", "Cannot install plugin %i on %llx",
40404 + data->id, (unsigned long long)get_inode_oid(object));
40405 + DQUOT_FREE_INODE(object);
40406 + object->i_flags |= S_NOQUOTA;
40407 + return result;
40408 + }
40409 +
40410 + /* reget plugin after installation */
40411 + obj_plug = inode_file_plugin(object);
40412 +
40413 + if (obj_plug->create_object == NULL) {
40414 + DQUOT_FREE_INODE(object);
40415 + object->i_flags |= S_NOQUOTA;
40416 + return RETERR(-EPERM);
40417 + }
40418 +
40419 + /* if any of hash, tail, sd or permission plugins for newly created
40420 + object are not set yet set them here inheriting them from parent
40421 + directory
40422 + */
40423 + assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
40424 + result = obj_plug->adjust_to_parent(object,
40425 + parent,
40426 + object->i_sb->s_root->d_inode);
40427 + if (result == 0)
40428 + result = finish_pset(object);
40429 + if (result != 0) {
40430 + warning("nikita-432", "Cannot inherit from %llx to %llx",
40431 + (unsigned long long)get_inode_oid(parent),
40432 + (unsigned long long)get_inode_oid(object));
40433 + DQUOT_FREE_INODE(object);
40434 + object->i_flags |= S_NOQUOTA;
40435 + return result;
40436 + }
40437 +
40438 + /* setup inode and file-operations for this inode */
40439 + setup_inode_ops(object, data);
40440 +
40441 + /* call file plugin's method to initialize plugin specific part of
40442 + * inode */
40443 + if (obj_plug->init_inode_data)
40444 + obj_plug->init_inode_data(object, data, 1 /*create */ );
40445 +
40446 + /* obtain directory plugin (if any) for new object. */
40447 + obj_dir = inode_dir_plugin(object);
40448 + if (obj_dir != NULL && obj_dir->init == NULL) {
40449 + DQUOT_FREE_INODE(object);
40450 + object->i_flags |= S_NOQUOTA;
40451 + return RETERR(-EPERM);
40452 + }
40453 +
40454 + reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
40455 +
40456 + reserve = estimate_create_vfs_object(parent, object);
40457 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40458 + DQUOT_FREE_INODE(object);
40459 + object->i_flags |= S_NOQUOTA;
40460 + return RETERR(-ENOSPC);
40461 + }
40462 +
40463 + /* mark inode `immutable'. We disable changes to the file being
40464 + created until valid directory entry for it is inserted. Otherwise,
40465 + if file were expanded and insertion of directory entry fails, we
40466 + have to remove file, but we only alloted enough space in
40467 + transaction to remove _empty_ file. 3.x code used to remove stat
40468 + data in different transaction thus possibly leaking disk space on
40469 + crash. This all only matters if it's possible to access file
40470 + without name, for example, by inode number
40471 + */
40472 + reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
40473 +
40474 + /* create empty object, this includes allocation of new objectid. For
40475 + directories this implies creation of dot and dotdot */
40476 + assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
40477 +
40478 + /* mark inode as `loaded'. From this point onward
40479 + reiser4_delete_inode() will try to remove its stat-data. */
40480 + reiser4_inode_set_flag(object, REISER4_LOADED);
40481 +
40482 + result = obj_plug->create_object(object, parent, data);
40483 + if (result != 0) {
40484 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40485 + if (result != -ENAMETOOLONG && result != -ENOMEM)
40486 + warning("nikita-2219",
40487 + "Failed to create sd for %llu",
40488 + (unsigned long long)get_inode_oid(object));
40489 + DQUOT_FREE_INODE(object);
40490 + object->i_flags |= S_NOQUOTA;
40491 + return result;
40492 + }
40493 +
40494 + if (obj_dir != NULL)
40495 + result = obj_dir->init(object, parent, data);
40496 + if (result == 0) {
40497 + assert("nikita-434", !reiser4_inode_get_flag(object,
40498 + REISER4_NO_SD));
40499 + /* insert inode into VFS hash table */
40500 + insert_inode_hash(object);
40501 + /* create entry */
40502 + result = par_dir->add_entry(parent, dentry, data, &entry);
40503 + if (result == 0) {
40504 + result = reiser4_add_nlink(object, parent, 0);
40505 + /* If O_CREAT is set and the file did not previously
40506 + exist, upon successful completion, open() shall
40507 + mark for update the st_atime, st_ctime, and
40508 + st_mtime fields of the file and the st_ctime and
40509 + st_mtime fields of the parent directory. --SUS
40510 + */
40511 + /* @object times are already updated by
40512 + reiser4_add_nlink() */
40513 + if (result == 0)
40514 + reiser4_update_dir(parent);
40515 + if (result != 0)
40516 + /* cleanup failure to add nlink */
40517 + par_dir->rem_entry(parent, dentry, &entry);
40518 + }
40519 + if (result != 0)
40520 + /* cleanup failure to add entry */
40521 + obj_plug->detach(object, parent);
40522 + } else if (result != -ENOMEM)
40523 + warning("nikita-2219", "Failed to initialize dir for %llu: %i",
40524 + (unsigned long long)get_inode_oid(object), result);
40525 +
40526 + /*
40527 + * update stat-data, committing all pending modifications to the inode
40528 + * fields.
40529 + */
40530 + reiser4_update_sd(object);
40531 + if (result != 0) {
40532 + DQUOT_FREE_INODE(object);
40533 + object->i_flags |= S_NOQUOTA;
40534 + /* if everything was ok (result == 0), parent stat-data is
40535 + * already updated above (update_parent_dir()) */
40536 + reiser4_update_sd(parent);
40537 + /* failure to create entry, remove object */
40538 + obj_plug->delete_object(object);
40539 + }
40540 +
40541 + /* file has name now, clear immutable flag */
40542 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40543 +
40544 + /* on error, iput() will call ->delete_inode(). We should keep track
40545 + of the existence of stat-data for this inode and avoid attempt to
40546 + remove it in reiser4_delete_inode(). This is accomplished through
40547 + REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
40548 + */
40549 + return result;
40550 +}
40551 +
40552 +/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
40553 + reiser4_mknod and reiser4_symlink
40554 +*/
40555 +static int
40556 +create_vfs_object(struct inode *parent,
40557 + struct dentry *dentry, reiser4_object_create_data * data)
40558 +{
40559 + reiser4_context *ctx;
40560 + int result;
40561 + struct inode *child;
40562 +
40563 + ctx = reiser4_init_context(parent->i_sb);
40564 + if (IS_ERR(ctx))
40565 + return PTR_ERR(ctx);
40566 + context_set_commit_async(ctx);
40567 +
40568 + data->parent = parent;
40569 + data->dentry = dentry;
40570 + child = NULL;
40571 + result = do_create_vfs_child(data, &child);
40572 + if (unlikely(result != 0)) {
40573 + if (child != NULL) {
40574 + reiser4_make_bad_inode(child);
40575 + iput(child);
40576 + }
40577 + } else
40578 + d_instantiate(dentry, child);
40579 +
40580 + reiser4_exit_context(ctx);
40581 + return result;
40582 +}
40583 +
40584 +/* helper for link_common. Estimate disk space necessary to add a link
40585 + from @parent to @object
40586 +*/
40587 +static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
40588 + struct inode *object
40589 + /* object to which new link is being cerated */
40590 + )
40591 +{
40592 + reiser4_block_nr res = 0;
40593 + file_plugin *fplug;
40594 + dir_plugin *dplug;
40595 +
40596 + assert("vpf-317", object != NULL);
40597 + assert("vpf-318", parent != NULL);
40598 +
40599 + fplug = inode_file_plugin(object);
40600 + dplug = inode_dir_plugin(parent);
40601 + /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
40602 + /* reiser4_add_nlink(object) */
40603 + res += fplug->estimate.update(object);
40604 + /* add_entry(parent) */
40605 + res += dplug->estimate.add_entry(parent);
40606 + /* reiser4_del_nlink(object) */
40607 + res += fplug->estimate.update(object);
40608 + /* update_dir(parent) */
40609 + res += inode_file_plugin(parent)->estimate.update(parent);
40610 + /* safe-link */
40611 + res += estimate_one_item_removal(reiser4_tree_by_inode(object));
40612 +
40613 + return res;
40614 +}
40615 +
40616 +/* Estimate disk space necessary to remove a link between @parent and
40617 + @object.
40618 +*/
40619 +static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
40620 + struct inode *object
40621 + /* object to which new link is being cerated */
40622 + )
40623 +{
40624 + reiser4_block_nr res = 0;
40625 + file_plugin *fplug;
40626 + dir_plugin *dplug;
40627 +
40628 + assert("vpf-317", object != NULL);
40629 + assert("vpf-318", parent != NULL);
40630 +
40631 + fplug = inode_file_plugin(object);
40632 + dplug = inode_dir_plugin(parent);
40633 +
40634 + /* rem_entry(parent) */
40635 + res += dplug->estimate.rem_entry(parent);
40636 + /* reiser4_del_nlink(object) */
40637 + res += fplug->estimate.update(object);
40638 + /* update_dir(parent) */
40639 + res += inode_file_plugin(parent)->estimate.update(parent);
40640 + /* fplug->unlink */
40641 + res += fplug->estimate.unlink(object, parent);
40642 + /* safe-link */
40643 + res += estimate_one_insert_item(reiser4_tree_by_inode(object));
40644 +
40645 + return res;
40646 +}
40647 +
40648 +/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
40649 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
40650 +{
40651 + file_plugin *fplug;
40652 + struct inode *child;
40653 + int result;
40654 +
40655 + result = 0;
40656 + child = victim->d_inode;
40657 + fplug = inode_file_plugin(child);
40658 +
40659 + /* check for race with create_object() */
40660 + if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
40661 + return RETERR(-E_REPEAT);
40662 + /* object being deleted should have stat data */
40663 + assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
40664 +
40665 + /* ask object plugin */
40666 + if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
40667 + return RETERR(-ENOTEMPTY);
40668 +
40669 + result = (int)estimate_unlink(parent, child);
40670 + if (result < 0)
40671 + return result;
40672 +
40673 + return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
40674 +}
40675 +
40676 +/* helper for reiser4_setattr_common */
40677 +static int setattr_reserve(reiser4_tree * tree)
40678 +{
40679 + assert("vs-1096", is_grab_enabled(get_current_context()));
40680 + return reiser4_grab_space(estimate_one_insert_into_item(tree),
40681 + BA_CAN_COMMIT);
40682 +}
40683 +
40684 +/* helper function. Standards require that for many file-system operations
40685 + on success ctime and mtime of parent directory is to be updated. */
40686 +int reiser4_update_dir(struct inode *dir)
40687 +{
40688 + assert("nikita-2525", dir != NULL);
40689 +
40690 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
40691 + return reiser4_update_sd(dir);
40692 +}
40693 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.23/fs/reiser4/plugin/inode_ops_rename.c
40694 --- linux-2.6.23.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
40695 +++ linux-2.6.23/fs/reiser4/plugin/inode_ops_rename.c 2007-12-04 16:49:30.000000000 +0300
40696 @@ -0,0 +1,912 @@
40697 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
40698 + * reiser4/README */
40699 +
40700 +#include "../inode.h"
40701 +#include "../safe_link.h"
40702 +
40703 +static const char *possible_leak = "Possible disk space leak.";
40704 +
40705 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
40706 +
40707 + Helper function called from hashed_rename() */
40708 +static int replace_name(struct inode *to_inode, /* inode where @from_coord is
40709 + * to be re-targeted at */
40710 + struct inode *from_dir, /* directory where @from_coord
40711 + * lives */
40712 + struct inode *from_inode, /* inode @from_coord
40713 + * originally point to */
40714 + coord_t * from_coord, /* where directory entry is in
40715 + * the tree */
40716 + lock_handle * from_lh /* lock handle on @from_coord */ )
40717 +{
40718 + item_plugin *from_item;
40719 + int result;
40720 + znode *node;
40721 +
40722 + coord_clear_iplug(from_coord);
40723 + node = from_coord->node;
40724 + result = zload(node);
40725 + if (result != 0)
40726 + return result;
40727 + from_item = item_plugin_by_coord(from_coord);
40728 + if (plugin_of_group(item_plugin_by_coord(from_coord),
40729 + DIR_ENTRY_ITEM_TYPE))
40730 + {
40731 + reiser4_key to_key;
40732 +
40733 + build_sd_key(to_inode, &to_key);
40734 +
40735 + /* everything is found and prepared to change directory entry
40736 + at @from_coord to point to @to_inode.
40737 +
40738 + @to_inode is just about to get new name, so bump its link
40739 + counter.
40740 +
40741 + */
40742 + result = reiser4_add_nlink(to_inode, from_dir, 0);
40743 + if (result != 0) {
40744 + /* Don't issue warning: this may be plain -EMLINK */
40745 + zrelse(node);
40746 + return result;
40747 + }
40748 +
40749 + result =
40750 + from_item->s.dir.update_key(from_coord, &to_key, from_lh);
40751 + if (result != 0) {
40752 + reiser4_del_nlink(to_inode, from_dir, 0);
40753 + zrelse(node);
40754 + return result;
40755 + }
40756 +
40757 + /* @from_inode just lost its name, he-he.
40758 +
40759 + If @from_inode was directory, it contained dotdot pointing
40760 + to @from_dir. @from_dir i_nlink will be decreased when
40761 + iput() will be called on @from_inode.
40762 +
40763 + If file-system is not ADG (hard-links are
40764 + supported on directories), iput(from_inode) will not remove
40765 + @from_inode, and thus above is incorrect, but hard-links on
40766 + directories are problematic in many other respects.
40767 + */
40768 + result = reiser4_del_nlink(from_inode, from_dir, 0);
40769 + if (result != 0) {
40770 + warning("nikita-2330",
40771 + "Cannot remove link from source: %i. %s",
40772 + result, possible_leak);
40773 + }
40774 + /* Has to return success, because entry is already
40775 + * modified. */
40776 + result = 0;
40777 +
40778 + /* NOTE-NIKITA consider calling plugin method in stead of
40779 + accessing inode fields directly. */
40780 + from_dir->i_mtime = CURRENT_TIME;
40781 + } else {
40782 + warning("nikita-2326", "Unexpected item type");
40783 + result = RETERR(-EIO);
40784 + }
40785 + zrelse(node);
40786 + return result;
40787 +}
40788 +
40789 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
40790 +
40791 + Helper function used by hashed_rename(). */
40792 +static int add_name(struct inode *inode, /* inode where @coord is to be
40793 + * re-targeted at */
40794 + struct inode *dir, /* directory where @coord lives */
40795 + struct dentry *name, /* new name */
40796 + coord_t * coord, /* where directory entry is in the tree */
40797 + lock_handle * lh, /* lock handle on @coord */
40798 + int is_dir /* true, if @inode is directory */ )
40799 +{
40800 + int result;
40801 + reiser4_dir_entry_desc entry;
40802 +
40803 + assert("nikita-2333", lh->node == coord->node);
40804 + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
40805 +
40806 + memset(&entry, 0, sizeof entry);
40807 + entry.obj = inode;
40808 + /* build key of directory entry description */
40809 + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
40810 +
40811 + /* ext2 does this in different order: first inserts new entry,
40812 + then increases directory nlink. We don't want do this,
40813 + because reiser4_add_nlink() calls ->add_link() plugin
40814 + method that can fail for whatever reason, leaving as with
40815 + cleanup problems.
40816 + */
40817 + /* @inode is getting new name */
40818 + reiser4_add_nlink(inode, dir, 0);
40819 + /* create @new_name in @new_dir pointing to
40820 + @old_inode */
40821 + result = WITH_COORD(coord,
40822 + inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
40823 + coord,
40824 + lh,
40825 + name,
40826 + &entry));
40827 + if (result != 0) {
40828 + int result2;
40829 + result2 = reiser4_del_nlink(inode, dir, 0);
40830 + if (result2 != 0) {
40831 + warning("nikita-2327",
40832 + "Cannot drop link on %lli %i. %s",
40833 + (unsigned long long)get_inode_oid(inode),
40834 + result2, possible_leak);
40835 + }
40836 + } else
40837 + INODE_INC_FIELD(dir, i_size);
40838 + return result;
40839 +}
40840 +
40841 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
40842 + struct dentry *old_name, /* old name */
40843 + struct inode *new_dir, /* directory where @new is located */
40844 + struct dentry *new_name /* new name */ )
40845 +{
40846 + reiser4_block_nr res1, res2;
40847 + dir_plugin *p_parent_old, *p_parent_new;
40848 + file_plugin *p_child_old, *p_child_new;
40849 +
40850 + assert("vpf-311", old_dir != NULL);
40851 + assert("vpf-312", new_dir != NULL);
40852 + assert("vpf-313", old_name != NULL);
40853 + assert("vpf-314", new_name != NULL);
40854 +
40855 + p_parent_old = inode_dir_plugin(old_dir);
40856 + p_parent_new = inode_dir_plugin(new_dir);
40857 + p_child_old = inode_file_plugin(old_name->d_inode);
40858 + if (new_name->d_inode)
40859 + p_child_new = inode_file_plugin(new_name->d_inode);
40860 + else
40861 + p_child_new = NULL;
40862 +
40863 + /* find_entry - can insert one leaf. */
40864 + res1 = res2 = 1;
40865 +
40866 + /* replace_name */
40867 + {
40868 + /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
40869 + res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
40870 + /* update key */
40871 + res1 += 1;
40872 + /* reiser4_del_nlink(p_child_new) */
40873 + if (p_child_new)
40874 + res1 += p_child_new->estimate.update(new_name->d_inode);
40875 + }
40876 +
40877 + /* else add_name */
40878 + {
40879 + /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
40880 + res2 +=
40881 + 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
40882 + /* reiser4_add_nlink(p_parent_old) */
40883 + res2 += p_child_old->estimate.update(old_name->d_inode);
40884 + /* add_entry(p_parent_new) */
40885 + res2 += p_parent_new->estimate.add_entry(new_dir);
40886 + /* reiser4_del_nlink(p_parent_old) */
40887 + res2 += p_child_old->estimate.update(old_name->d_inode);
40888 + }
40889 +
40890 + res1 = res1 < res2 ? res2 : res1;
40891 +
40892 + /* reiser4_write_sd(p_parent_new) */
40893 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40894 +
40895 + /* reiser4_write_sd(p_child_new) */
40896 + if (p_child_new)
40897 + res1 += p_child_new->estimate.update(new_name->d_inode);
40898 +
40899 + /* hashed_rem_entry(p_parent_old) */
40900 + res1 += p_parent_old->estimate.rem_entry(old_dir);
40901 +
40902 + /* reiser4_del_nlink(p_child_old) */
40903 + res1 += p_child_old->estimate.update(old_name->d_inode);
40904 +
40905 + /* replace_name */
40906 + {
40907 + /* reiser4_add_nlink(p_parent_dir_new) */
40908 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40909 + /* update_key */
40910 + res1 += 1;
40911 + /* reiser4_del_nlink(p_parent_new) */
40912 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40913 + /* reiser4_del_nlink(p_parent_old) */
40914 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
40915 + }
40916 +
40917 + /* reiser4_write_sd(p_parent_old) */
40918 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
40919 +
40920 + /* reiser4_write_sd(p_child_old) */
40921 + res1 += p_child_old->estimate.update(old_name->d_inode);
40922 +
40923 + return res1;
40924 +}
40925 +
40926 +static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
40927 + struct dentry *old_name, /* old name */
40928 + struct inode *new_dir, /* directory where @new is located */
40929 + struct dentry *new_name
40930 + /* new name */ )
40931 +{
40932 + reiser4_block_nr reserve;
40933 +
40934 + reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
40935 +
40936 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
40937 + return RETERR(-ENOSPC);
40938 +
40939 + return 0;
40940 +}
40941 +
40942 +/* check whether @old_inode and @new_inode can be moved within file system
40943 + * tree. This singles out attempts to rename pseudo-files, for example. */
40944 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
40945 + struct inode *new_dir, struct inode *new_inode)
40946 +{
40947 + file_plugin *fplug;
40948 + dir_plugin *dplug;
40949 +
40950 + assert("nikita-3370", old_inode != NULL);
40951 +
40952 + dplug = inode_dir_plugin(new_dir);
40953 + fplug = inode_file_plugin(old_inode);
40954 +
40955 + if (dplug == NULL)
40956 + return RETERR(-ENOTDIR);
40957 + else if (new_dir->i_op->create == NULL)
40958 + return RETERR(-EPERM);
40959 + else if (!fplug->can_add_link(old_inode))
40960 + return RETERR(-EMLINK);
40961 + else if (new_inode != NULL) {
40962 + fplug = inode_file_plugin(new_inode);
40963 + if (fplug->can_rem_link != NULL &&
40964 + !fplug->can_rem_link(new_inode))
40965 + return RETERR(-EBUSY);
40966 + }
40967 + return 0;
40968 +}
40969 +
40970 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
40971 + znode_lock_mode, reiser4_dir_entry_desc *);
40972 +int reiser4_update_dir(struct inode *);
40973 +
40974 +/* this is common implementation of vfs's rename method of struct
40975 + inode_operations
40976 + See comments in the body.
40977 +
40978 + It is arguable that this function can be made generic so, that it
40979 + will be applicable to any kind of directory plugin that deals with
40980 + directories composed out of directory entries. The only obstacle
40981 + here is that we don't have any data-type to represent directory
40982 + entry. This should be re-considered when more than one different
40983 + directory plugin will be implemented.
40984 +*/
40985 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
40986 + * is located */ ,
40987 + struct dentry *old_name /* old name */ ,
40988 + struct inode *new_dir /* directory where @new
40989 + * is located */ ,
40990 + struct dentry *new_name /* new name */ )
40991 +{
40992 + /* From `The Open Group Base Specifications Issue 6'
40993 +
40994 + If either the old or new argument names a symbolic link, rename()
40995 + shall operate on the symbolic link itself, and shall not resolve
40996 + the last component of the argument. If the old argument and the new
40997 + argument resolve to the same existing file, rename() shall return
40998 + successfully and perform no other action.
40999 +
41000 + [this is done by VFS: vfs_rename()]
41001 +
41002 + If the old argument points to the pathname of a file that is not a
41003 + directory, the new argument shall not point to the pathname of a
41004 + directory.
41005 +
41006 + [checked by VFS: vfs_rename->may_delete()]
41007 +
41008 + If the link named by the new argument exists, it shall
41009 + be removed and old renamed to new. In this case, a link named new
41010 + shall remain visible to other processes throughout the renaming
41011 + operation and refer either to the file referred to by new or old
41012 + before the operation began.
41013 +
41014 + [we should assure this]
41015 +
41016 + Write access permission is required for
41017 + both the directory containing old and the directory containing new.
41018 +
41019 + [checked by VFS: vfs_rename->may_delete(), may_create()]
41020 +
41021 + If the old argument points to the pathname of a directory, the new
41022 + argument shall not point to the pathname of a file that is not a
41023 + directory.
41024 +
41025 + [checked by VFS: vfs_rename->may_delete()]
41026 +
41027 + If the directory named by the new argument exists, it
41028 + shall be removed and old renamed to new. In this case, a link named
41029 + new shall exist throughout the renaming operation and shall refer
41030 + either to the directory referred to by new or old before the
41031 + operation began.
41032 +
41033 + [we should assure this]
41034 +
41035 + If new names an existing directory, it shall be
41036 + required to be an empty directory.
41037 +
41038 + [we should check this]
41039 +
41040 + If the old argument points to a pathname of a symbolic link, the
41041 + symbolic link shall be renamed. If the new argument points to a
41042 + pathname of a symbolic link, the symbolic link shall be removed.
41043 +
41044 + The new pathname shall not contain a path prefix that names
41045 + old. Write access permission is required for the directory
41046 + containing old and the directory containing new. If the old
41047 + argument points to the pathname of a directory, write access
41048 + permission may be required for the directory named by old, and, if
41049 + it exists, the directory named by new.
41050 +
41051 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
41052 +
41053 + If the link named by the new argument exists and the file's link
41054 + count becomes 0 when it is removed and no process has the file
41055 + open, the space occupied by the file shall be freed and the file
41056 + shall no longer be accessible. If one or more processes have the
41057 + file open when the last link is removed, the link shall be removed
41058 + before rename() returns, but the removal of the file contents shall
41059 + be postponed until all references to the file are closed.
41060 +
41061 + [iput() handles this, but we can do this manually, a la
41062 + reiser4_unlink()]
41063 +
41064 + Upon successful completion, rename() shall mark for update the
41065 + st_ctime and st_mtime fields of the parent directory of each file.
41066 +
41067 + [N/A]
41068 +
41069 + */
41070 + reiser4_context *ctx;
41071 + int result;
41072 + int is_dir; /* is @old_name directory */
41073 +
41074 + struct inode *old_inode;
41075 + struct inode *new_inode;
41076 + coord_t *new_coord;
41077 +
41078 + struct reiser4_dentry_fsdata *new_fsdata;
41079 + dir_plugin *dplug;
41080 + file_plugin *fplug;
41081 +
41082 + reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
41083 + lock_handle *new_lh, *dotdot_lh;
41084 + struct dentry *dotdot_name;
41085 + struct reiser4_dentry_fsdata *dataonstack;
41086 +
41087 + ctx = reiser4_init_context(old_dir->i_sb);
41088 + if (IS_ERR(ctx))
41089 + return PTR_ERR(ctx);
41090 +
41091 + old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
41092 + sizeof(*dotdot_name) + sizeof(*dataonstack),
41093 + reiser4_ctx_gfp_mask_get());
41094 + if (!old_entry) {
41095 + context_set_commit_async(ctx);
41096 + reiser4_exit_context(ctx);
41097 + return RETERR(-ENOMEM);
41098 + }
41099 +
41100 + new_entry = old_entry + 1;
41101 + dotdot_entry = old_entry + 2;
41102 + new_lh = (lock_handle *)(old_entry + 3);
41103 + dotdot_lh = new_lh + 1;
41104 + dotdot_name = (struct dentry *)(new_lh + 2);
41105 + dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
41106 +
41107 + assert("nikita-2318", old_dir != NULL);
41108 + assert("nikita-2319", new_dir != NULL);
41109 + assert("nikita-2320", old_name != NULL);
41110 + assert("nikita-2321", new_name != NULL);
41111 +
41112 + old_inode = old_name->d_inode;
41113 + new_inode = new_name->d_inode;
41114 +
41115 + dplug = inode_dir_plugin(old_dir);
41116 + fplug = NULL;
41117 +
41118 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
41119 + if (IS_ERR(new_fsdata)) {
41120 + kfree(old_entry);
41121 + context_set_commit_async(ctx);
41122 + reiser4_exit_context(ctx);
41123 + return PTR_ERR(new_fsdata);
41124 + }
41125 +
41126 + new_coord = &new_fsdata->dec.entry_coord;
41127 + coord_clear_iplug(new_coord);
41128 +
41129 + is_dir = S_ISDIR(old_inode->i_mode);
41130 +
41131 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41132 +
41133 + /* if target is existing directory and it's not empty---return error.
41134 +
41135 + This check is done specifically, because is_dir_empty() requires
41136 + tree traversal and have to be done before locks are taken.
41137 + */
41138 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41139 + kfree(old_entry);
41140 + context_set_commit_async(ctx);
41141 + reiser4_exit_context(ctx);
41142 + return RETERR(-ENOTEMPTY);
41143 + }
41144 +
41145 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
41146 + if (result != 0) {
41147 + kfree(old_entry);
41148 + context_set_commit_async(ctx);
41149 + reiser4_exit_context(ctx);
41150 + return result;
41151 + }
41152 +
41153 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
41154 + new_dir, new_name);
41155 + if (result != 0) {
41156 + kfree(old_entry);
41157 + context_set_commit_async(ctx);
41158 + reiser4_exit_context(ctx);
41159 + return result;
41160 + }
41161 +
41162 + init_lh(new_lh);
41163 +
41164 + /* find entry for @new_name */
41165 + result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
41166 + new_entry);
41167 +
41168 + if (IS_CBKERR(result)) {
41169 + done_lh(new_lh);
41170 + kfree(old_entry);
41171 + context_set_commit_async(ctx);
41172 + reiser4_exit_context(ctx);
41173 + return result;
41174 + }
41175 +
41176 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
41177 +
41178 + /* add or replace name for @old_inode as @new_name */
41179 + if (new_inode != NULL) {
41180 + /* target (@new_name) exists. */
41181 + /* Not clear what to do with objects that are
41182 + both directories and files at the same time. */
41183 + if (result == CBK_COORD_FOUND) {
41184 + result = replace_name(old_inode,
41185 + new_dir,
41186 + new_inode, new_coord, new_lh);
41187 + if (result == 0)
41188 + fplug = inode_file_plugin(new_inode);
41189 + } else if (result == CBK_COORD_NOTFOUND) {
41190 + /* VFS told us that @new_name is bound to existing
41191 + inode, but we failed to find directory entry. */
41192 + warning("nikita-2324", "Target not found");
41193 + result = RETERR(-ENOENT);
41194 + }
41195 + } else {
41196 + /* target (@new_name) doesn't exists. */
41197 + if (result == CBK_COORD_NOTFOUND)
41198 + result = add_name(old_inode,
41199 + new_dir,
41200 + new_name, new_coord, new_lh, is_dir);
41201 + else if (result == CBK_COORD_FOUND) {
41202 + /* VFS told us that @new_name is "negative" dentry,
41203 + but we found directory entry. */
41204 + warning("nikita-2331", "Target found unexpectedly");
41205 + result = RETERR(-EIO);
41206 + }
41207 + }
41208 +
41209 + assert("nikita-3462", ergo(result == 0,
41210 + old_inode->i_nlink >= 2 + !!is_dir));
41211 +
41212 + /* We are done with all modifications to the @new_dir, release lock on
41213 + node. */
41214 + done_lh(new_lh);
41215 +
41216 + if (fplug != NULL) {
41217 + /* detach @new_inode from name-space */
41218 + result = fplug->detach(new_inode, new_dir);
41219 + if (result != 0)
41220 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
41221 + (unsigned long long)get_inode_oid(new_inode),
41222 + result, possible_leak);
41223 + }
41224 +
41225 + if (new_inode != NULL)
41226 + reiser4_update_sd(new_inode);
41227 +
41228 + if (result == 0) {
41229 + old_entry->obj = old_inode;
41230 +
41231 + dplug->build_entry_key(old_dir,
41232 + &old_name->d_name, &old_entry->key);
41233 +
41234 + /* At this stage new name was introduced for
41235 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41236 + counters were updated.
41237 +
41238 + We want to remove @old_name now. If @old_inode wasn't
41239 + directory this is simple.
41240 + */
41241 + result = dplug->rem_entry(old_dir, old_name, old_entry);
41242 + if (result != 0 && result != -ENOMEM) {
41243 + warning("nikita-2335",
41244 + "Cannot remove old name: %i", result);
41245 + } else {
41246 + result = reiser4_del_nlink(old_inode, old_dir, 0);
41247 + if (result != 0 && result != -ENOMEM) {
41248 + warning("nikita-2337",
41249 + "Cannot drop link on old: %i", result);
41250 + }
41251 + }
41252 +
41253 + if (result == 0 && is_dir) {
41254 + /* @old_inode is directory. We also have to update
41255 + dotdot entry. */
41256 + coord_t *dotdot_coord;
41257 +
41258 + memset(dataonstack, 0, sizeof dataonstack);
41259 + memset(dotdot_entry, 0, sizeof dotdot_entry);
41260 + dotdot_entry->obj = old_dir;
41261 + memset(dotdot_name, 0, sizeof dotdot_name);
41262 + dotdot_name->d_name.name = "..";
41263 + dotdot_name->d_name.len = 2;
41264 + /*
41265 + * allocate ->d_fsdata on the stack to avoid using
41266 + * reiser4_get_dentry_fsdata(). Locking is not needed,
41267 + * because dentry is private to the current thread.
41268 + */
41269 + dotdot_name->d_fsdata = dataonstack;
41270 + init_lh(dotdot_lh);
41271 +
41272 + dotdot_coord = &dataonstack->dec.entry_coord;
41273 + coord_clear_iplug(dotdot_coord);
41274 +
41275 + result = reiser4_find_entry(old_inode, dotdot_name,
41276 + dotdot_lh, ZNODE_WRITE_LOCK,
41277 + dotdot_entry);
41278 + if (result == 0) {
41279 + /* replace_name() decreases i_nlink on
41280 + * @old_dir */
41281 + result = replace_name(new_dir,
41282 + old_inode,
41283 + old_dir,
41284 + dotdot_coord, dotdot_lh);
41285 + } else
41286 + result = RETERR(-EIO);
41287 + done_lh(dotdot_lh);
41288 + }
41289 + }
41290 + reiser4_update_dir(new_dir);
41291 + reiser4_update_dir(old_dir);
41292 + reiser4_update_sd(old_inode);
41293 + if (result == 0) {
41294 + file_plugin *fplug;
41295 +
41296 + if (new_inode != NULL) {
41297 + /* add safe-link for target file (in case we removed
41298 + * last reference to the poor fellow */
41299 + fplug = inode_file_plugin(new_inode);
41300 + if (new_inode->i_nlink == 0)
41301 + result = safe_link_add(new_inode, SAFE_UNLINK);
41302 + }
41303 + }
41304 + kfree(old_entry);
41305 + context_set_commit_async(ctx);
41306 + reiser4_exit_context(ctx);
41307 + return result;
41308 +}
41309 +
41310 +#if 0
41311 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
41312 + * is located */ ,
41313 + struct dentry *old_name /* old name */ ,
41314 + struct inode *new_dir /* directory where @new
41315 + * is located */ ,
41316 + struct dentry *new_name /* new name */ )
41317 +{
41318 + /* From `The Open Group Base Specifications Issue 6'
41319 +
41320 + If either the old or new argument names a symbolic link, rename()
41321 + shall operate on the symbolic link itself, and shall not resolve
41322 + the last component of the argument. If the old argument and the new
41323 + argument resolve to the same existing file, rename() shall return
41324 + successfully and perform no other action.
41325 +
41326 + [this is done by VFS: vfs_rename()]
41327 +
41328 + If the old argument points to the pathname of a file that is not a
41329 + directory, the new argument shall not point to the pathname of a
41330 + directory.
41331 +
41332 + [checked by VFS: vfs_rename->may_delete()]
41333 +
41334 + If the link named by the new argument exists, it shall
41335 + be removed and old renamed to new. In this case, a link named new
41336 + shall remain visible to other processes throughout the renaming
41337 + operation and refer either to the file referred to by new or old
41338 + before the operation began.
41339 +
41340 + [we should assure this]
41341 +
41342 + Write access permission is required for
41343 + both the directory containing old and the directory containing new.
41344 +
41345 + [checked by VFS: vfs_rename->may_delete(), may_create()]
41346 +
41347 + If the old argument points to the pathname of a directory, the new
41348 + argument shall not point to the pathname of a file that is not a
41349 + directory.
41350 +
41351 + [checked by VFS: vfs_rename->may_delete()]
41352 +
41353 + If the directory named by the new argument exists, it
41354 + shall be removed and old renamed to new. In this case, a link named
41355 + new shall exist throughout the renaming operation and shall refer
41356 + either to the directory referred to by new or old before the
41357 + operation began.
41358 +
41359 + [we should assure this]
41360 +
41361 + If new names an existing directory, it shall be
41362 + required to be an empty directory.
41363 +
41364 + [we should check this]
41365 +
41366 + If the old argument points to a pathname of a symbolic link, the
41367 + symbolic link shall be renamed. If the new argument points to a
41368 + pathname of a symbolic link, the symbolic link shall be removed.
41369 +
41370 + The new pathname shall not contain a path prefix that names
41371 + old. Write access permission is required for the directory
41372 + containing old and the directory containing new. If the old
41373 + argument points to the pathname of a directory, write access
41374 + permission may be required for the directory named by old, and, if
41375 + it exists, the directory named by new.
41376 +
41377 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
41378 +
41379 + If the link named by the new argument exists and the file's link
41380 + count becomes 0 when it is removed and no process has the file
41381 + open, the space occupied by the file shall be freed and the file
41382 + shall no longer be accessible. If one or more processes have the
41383 + file open when the last link is removed, the link shall be removed
41384 + before rename() returns, but the removal of the file contents shall
41385 + be postponed until all references to the file are closed.
41386 +
41387 + [iput() handles this, but we can do this manually, a la
41388 + reiser4_unlink()]
41389 +
41390 + Upon successful completion, rename() shall mark for update the
41391 + st_ctime and st_mtime fields of the parent directory of each file.
41392 +
41393 + [N/A]
41394 +
41395 + */
41396 + reiser4_context *ctx;
41397 + int result;
41398 + int is_dir; /* is @old_name directory */
41399 + struct inode *old_inode;
41400 + struct inode *new_inode;
41401 + reiser4_dir_entry_desc old_entry;
41402 + reiser4_dir_entry_desc new_entry;
41403 + coord_t *new_coord;
41404 + struct reiser4_dentry_fsdata *new_fsdata;
41405 + lock_handle new_lh;
41406 + dir_plugin *dplug;
41407 + file_plugin *fplug;
41408 +
41409 + ctx = reiser4_init_context(old_dir->i_sb);
41410 + if (IS_ERR(ctx))
41411 + return PTR_ERR(ctx);
41412 +
41413 + assert("nikita-2318", old_dir != NULL);
41414 + assert("nikita-2319", new_dir != NULL);
41415 + assert("nikita-2320", old_name != NULL);
41416 + assert("nikita-2321", new_name != NULL);
41417 +
41418 + old_inode = old_name->d_inode;
41419 + new_inode = new_name->d_inode;
41420 +
41421 + dplug = inode_dir_plugin(old_dir);
41422 + fplug = NULL;
41423 +
41424 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
41425 + if (IS_ERR(new_fsdata)) {
41426 + result = PTR_ERR(new_fsdata);
41427 + goto exit;
41428 + }
41429 +
41430 + new_coord = &new_fsdata->dec.entry_coord;
41431 + coord_clear_iplug(new_coord);
41432 +
41433 + is_dir = S_ISDIR(old_inode->i_mode);
41434 +
41435 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41436 +
41437 + /* if target is existing directory and it's not empty---return error.
41438 +
41439 + This check is done specifically, because is_dir_empty() requires
41440 + tree traversal and have to be done before locks are taken.
41441 + */
41442 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
41443 + return RETERR(-ENOTEMPTY);
41444 +
41445 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
41446 + if (result != 0)
41447 + goto exit;
41448 +
41449 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
41450 + new_dir, new_name);
41451 + if (result != 0)
41452 + goto exit;
41453 +
41454 + init_lh(&new_lh);
41455 +
41456 + /* find entry for @new_name */
41457 + result = reiser4_find_entry(new_dir, new_name, &new_lh,
41458 + ZNODE_WRITE_LOCK, &new_entry);
41459 +
41460 + if (IS_CBKERR(result)) {
41461 + done_lh(&new_lh);
41462 + goto exit;
41463 + }
41464 +
41465 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
41466 +
41467 + /* add or replace name for @old_inode as @new_name */
41468 + if (new_inode != NULL) {
41469 + /* target (@new_name) exists. */
41470 + /* Not clear what to do with objects that are
41471 + both directories and files at the same time. */
41472 + if (result == CBK_COORD_FOUND) {
41473 + result = replace_name(old_inode,
41474 + new_dir,
41475 + new_inode, new_coord, &new_lh);
41476 + if (result == 0)
41477 + fplug = inode_file_plugin(new_inode);
41478 + } else if (result == CBK_COORD_NOTFOUND) {
41479 + /* VFS told us that @new_name is bound to existing
41480 + inode, but we failed to find directory entry. */
41481 + warning("nikita-2324", "Target not found");
41482 + result = RETERR(-ENOENT);
41483 + }
41484 + } else {
41485 + /* target (@new_name) doesn't exists. */
41486 + if (result == CBK_COORD_NOTFOUND)
41487 + result = add_name(old_inode,
41488 + new_dir,
41489 + new_name, new_coord, &new_lh, is_dir);
41490 + else if (result == CBK_COORD_FOUND) {
41491 + /* VFS told us that @new_name is "negative" dentry,
41492 + but we found directory entry. */
41493 + warning("nikita-2331", "Target found unexpectedly");
41494 + result = RETERR(-EIO);
41495 + }
41496 + }
41497 +
41498 + assert("nikita-3462", ergo(result == 0,
41499 + old_inode->i_nlink >= 2 + !!is_dir));
41500 +
41501 + /* We are done with all modifications to the @new_dir, release lock on
41502 + node. */
41503 + done_lh(&new_lh);
41504 +
41505 + if (fplug != NULL) {
41506 + /* detach @new_inode from name-space */
41507 + result = fplug->detach(new_inode, new_dir);
41508 + if (result != 0)
41509 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
41510 + (unsigned long long)get_inode_oid(new_inode),
41511 + result, possible_leak);
41512 + }
41513 +
41514 + if (new_inode != NULL)
41515 + reiser4_update_sd(new_inode);
41516 +
41517 + if (result == 0) {
41518 + memset(&old_entry, 0, sizeof old_entry);
41519 + old_entry.obj = old_inode;
41520 +
41521 + dplug->build_entry_key(old_dir,
41522 + &old_name->d_name, &old_entry.key);
41523 +
41524 + /* At this stage new name was introduced for
41525 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41526 + counters were updated.
41527 +
41528 + We want to remove @old_name now. If @old_inode wasn't
41529 + directory this is simple.
41530 + */
41531 + result = dplug->rem_entry(old_dir, old_name, &old_entry);
41532 + /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
41533 + if (result != 0 && result != -ENOMEM) {
41534 + warning("nikita-2335",
41535 + "Cannot remove old name: %i", result);
41536 + } else {
41537 + result = reiser4_del_nlink(old_inode, old_dir, 0);
41538 + if (result != 0 && result != -ENOMEM) {
41539 + warning("nikita-2337",
41540 + "Cannot drop link on old: %i", result);
41541 + }
41542 + }
41543 +
41544 + if (result == 0 && is_dir) {
41545 + /* @old_inode is directory. We also have to update
41546 + dotdot entry. */
41547 + coord_t *dotdot_coord;
41548 + lock_handle dotdot_lh;
41549 + struct dentry dotdot_name;
41550 + reiser4_dir_entry_desc dotdot_entry;
41551 + struct reiser4_dentry_fsdata dataonstack;
41552 + struct reiser4_dentry_fsdata *fsdata;
41553 +
41554 + memset(&dataonstack, 0, sizeof dataonstack);
41555 + memset(&dotdot_entry, 0, sizeof dotdot_entry);
41556 + dotdot_entry.obj = old_dir;
41557 + memset(&dotdot_name, 0, sizeof dotdot_name);
41558 + dotdot_name.d_name.name = "..";
41559 + dotdot_name.d_name.len = 2;
41560 + /*
41561 + * allocate ->d_fsdata on the stack to avoid using
41562 + * reiser4_get_dentry_fsdata(). Locking is not needed,
41563 + * because dentry is private to the current thread.
41564 + */
41565 + dotdot_name.d_fsdata = &dataonstack;
41566 + init_lh(&dotdot_lh);
41567 +
41568 + fsdata = &dataonstack;
41569 + dotdot_coord = &fsdata->dec.entry_coord;
41570 + coord_clear_iplug(dotdot_coord);
41571 +
41572 + result = reiser4_find_entry(old_inode,
41573 + &dotdot_name,
41574 + &dotdot_lh,
41575 + ZNODE_WRITE_LOCK,
41576 + &dotdot_entry);
41577 + if (result == 0) {
41578 + /* replace_name() decreases i_nlink on
41579 + * @old_dir */
41580 + result = replace_name(new_dir,
41581 + old_inode,
41582 + old_dir,
41583 + dotdot_coord, &dotdot_lh);
41584 + } else
41585 + result = RETERR(-EIO);
41586 + done_lh(&dotdot_lh);
41587 + }
41588 + }
41589 + reiser4_update_dir(new_dir);
41590 + reiser4_update_dir(old_dir);
41591 + reiser4_update_sd(old_inode);
41592 + if (result == 0) {
41593 + file_plugin *fplug;
41594 +
41595 + if (new_inode != NULL) {
41596 + /* add safe-link for target file (in case we removed
41597 + * last reference to the poor fellow */
41598 + fplug = inode_file_plugin(new_inode);
41599 + if (new_inode->i_nlink == 0)
41600 + result = safe_link_add(new_inode, SAFE_UNLINK);
41601 + }
41602 + }
41603 + exit:
41604 + context_set_commit_async(ctx);
41605 + reiser4_exit_context(ctx);
41606 + return result;
41607 +}
41608 +#endif
41609 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/acl.h linux-2.6.23/fs/reiser4/plugin/item/acl.h
41610 --- linux-2.6.23.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
41611 +++ linux-2.6.23/fs/reiser4/plugin/item/acl.h 2007-12-04 16:49:30.000000000 +0300
41612 @@ -0,0 +1,66 @@
41613 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41614 +
41615 +/* Directory entry. */
41616 +
41617 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
41618 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
41619 +
41620 +#include "../../forward.h"
41621 +#include "../../dformat.h"
41622 +#include "../../kassign.h"
41623 +#include "../../key.h"
41624 +
41625 +#include <linux/fs.h>
41626 +#include <linux/dcache.h> /* for struct dentry */
41627 +
41628 +typedef struct directory_entry_format {
41629 + /* key of object stat-data. It's not necessary to store whole
41630 + key here, because it's always key of stat-data, so minor
41631 + packing locality and offset can be omitted here. But this
41632 + relies on particular key allocation scheme for stat-data, so,
41633 + for extensibility sake, whole key can be stored here.
41634 +
41635 + We store key as array of bytes, because we don't want 8-byte
41636 + alignment of dir entries.
41637 + */
41638 + obj_key_id id;
41639 + /* file name. Null terminated string. */
41640 + d8 name[0];
41641 +} directory_entry_format;
41642 +
41643 +void print_de(const char *prefix, coord_t * coord);
41644 +int extract_key_de(const coord_t * coord, reiser4_key * key);
41645 +int update_key_de(const coord_t * coord, const reiser4_key * key,
41646 + lock_handle * lh);
41647 +char *extract_name_de(const coord_t * coord, char *buf);
41648 +unsigned extract_file_type_de(const coord_t * coord);
41649 +int add_entry_de(struct inode *dir, coord_t * coord,
41650 + lock_handle * lh, const struct dentry *name,
41651 + reiser4_dir_entry_desc * entry);
41652 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
41653 + lock_handle * lh, reiser4_dir_entry_desc * entry);
41654 +int max_name_len_de(const struct inode *dir);
41655 +
41656 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
41657 +
41658 +char *extract_dent_name(const coord_t * coord,
41659 + directory_entry_format * dent, char *buf);
41660 +
41661 +#if REISER4_LARGE_KEY
41662 +#define DE_NAME_BUF_LEN (24)
41663 +#else
41664 +#define DE_NAME_BUF_LEN (16)
41665 +#endif
41666 +
41667 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
41668 +#endif
41669 +
41670 +/* Make Linus happy.
41671 + Local variables:
41672 + c-indentation-style: "K&R"
41673 + mode-name: "LC"
41674 + c-basic-offset: 8
41675 + tab-width: 8
41676 + fill-column: 120
41677 + End:
41678 +*/
41679 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.23/fs/reiser4/plugin/item/blackbox.c
41680 --- linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
41681 +++ linux-2.6.23/fs/reiser4/plugin/item/blackbox.c 2007-12-04 16:49:30.000000000 +0300
41682 @@ -0,0 +1,142 @@
41683 +/* Copyright 2003 by Hans Reiser, licensing governed by
41684 + * reiser4/README */
41685 +
41686 +/* Black box item implementation */
41687 +
41688 +#include "../../forward.h"
41689 +#include "../../debug.h"
41690 +#include "../../dformat.h"
41691 +#include "../../kassign.h"
41692 +#include "../../coord.h"
41693 +#include "../../tree.h"
41694 +#include "../../lock.h"
41695 +
41696 +#include "blackbox.h"
41697 +#include "item.h"
41698 +#include "../plugin.h"
41699 +
41700 +int
41701 +store_black_box(reiser4_tree * tree,
41702 + const reiser4_key * key, void *data, int length)
41703 +{
41704 + int result;
41705 + reiser4_item_data idata;
41706 + coord_t coord;
41707 + lock_handle lh;
41708 +
41709 + memset(&idata, 0, sizeof idata);
41710 +
41711 + idata.data = data;
41712 + idata.user = 0;
41713 + idata.length = length;
41714 + idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
41715 +
41716 + init_lh(&lh);
41717 + result = insert_by_key(tree, key,
41718 + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
41719 +
41720 + assert("nikita-3413",
41721 + ergo(result == 0,
41722 + WITH_COORD(&coord,
41723 + item_length_by_coord(&coord) == length)));
41724 +
41725 + done_lh(&lh);
41726 + return result;
41727 +}
41728 +
41729 +int
41730 +load_black_box(reiser4_tree * tree,
41731 + reiser4_key * key, void *data, int length, int exact)
41732 +{
41733 + int result;
41734 + coord_t coord;
41735 + lock_handle lh;
41736 +
41737 + init_lh(&lh);
41738 + result = coord_by_key(tree, key,
41739 + &coord, &lh, ZNODE_READ_LOCK,
41740 + exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
41741 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41742 +
41743 + if (result == 0) {
41744 + int ilen;
41745 +
41746 + result = zload(coord.node);
41747 + if (result == 0) {
41748 + ilen = item_length_by_coord(&coord);
41749 + if (ilen <= length) {
41750 + memcpy(data, item_body_by_coord(&coord), ilen);
41751 + unit_key_by_coord(&coord, key);
41752 + } else if (exact) {
41753 + /*
41754 + * item is larger than buffer provided by the
41755 + * user. Only issue a warning if @exact is
41756 + * set. If @exact is false, we are iterating
41757 + * over all safe-links and here we are reaching
41758 + * the end of the iteration.
41759 + */
41760 + warning("nikita-3415",
41761 + "Wrong black box length: %i > %i",
41762 + ilen, length);
41763 + result = RETERR(-EIO);
41764 + }
41765 + zrelse(coord.node);
41766 + }
41767 + }
41768 +
41769 + done_lh(&lh);
41770 + return result;
41771 +
41772 +}
41773 +
41774 +int
41775 +update_black_box(reiser4_tree * tree,
41776 + const reiser4_key * key, void *data, int length)
41777 +{
41778 + int result;
41779 + coord_t coord;
41780 + lock_handle lh;
41781 +
41782 + init_lh(&lh);
41783 + result = coord_by_key(tree, key,
41784 + &coord, &lh, ZNODE_READ_LOCK,
41785 + FIND_EXACT,
41786 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41787 + if (result == 0) {
41788 + int ilen;
41789 +
41790 + result = zload(coord.node);
41791 + if (result == 0) {
41792 + ilen = item_length_by_coord(&coord);
41793 + if (length <= ilen) {
41794 + memcpy(item_body_by_coord(&coord), data,
41795 + length);
41796 + } else {
41797 + warning("nikita-3437",
41798 + "Wrong black box length: %i < %i",
41799 + ilen, length);
41800 + result = RETERR(-EIO);
41801 + }
41802 + zrelse(coord.node);
41803 + }
41804 + }
41805 +
41806 + done_lh(&lh);
41807 + return result;
41808 +
41809 +}
41810 +
41811 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
41812 +{
41813 + return reiser4_cut_tree(tree, key, key, NULL, 1);
41814 +}
41815 +
41816 +/* Make Linus happy.
41817 + Local variables:
41818 + c-indentation-style: "K&R"
41819 + mode-name: "LC"
41820 + c-basic-offset: 8
41821 + tab-width: 8
41822 + fill-column: 120
41823 + End:
41824 +*/
41825 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.23/fs/reiser4/plugin/item/blackbox.h
41826 --- linux-2.6.23.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
41827 +++ linux-2.6.23/fs/reiser4/plugin/item/blackbox.h 2007-12-04 16:49:30.000000000 +0300
41828 @@ -0,0 +1,33 @@
41829 +/* Copyright 2003 by Hans Reiser, licensing governed by
41830 + * reiser4/README */
41831 +
41832 +/* "Black box" entry to fixed-width contain user supplied data */
41833 +
41834 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
41835 +#define __FS_REISER4_BLACK_BOX_H__
41836 +
41837 +#include "../../forward.h"
41838 +#include "../../dformat.h"
41839 +#include "../../kassign.h"
41840 +#include "../../key.h"
41841 +
41842 +extern int store_black_box(reiser4_tree * tree,
41843 + const reiser4_key * key, void *data, int length);
41844 +extern int load_black_box(reiser4_tree * tree,
41845 + reiser4_key * key, void *data, int length, int exact);
41846 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
41847 +extern int update_black_box(reiser4_tree * tree,
41848 + const reiser4_key * key, void *data, int length);
41849 +
41850 +/* __FS_REISER4_BLACK_BOX_H__ */
41851 +#endif
41852 +
41853 +/* Make Linus happy.
41854 + Local variables:
41855 + c-indentation-style: "K&R"
41856 + mode-name: "LC"
41857 + c-basic-offset: 8
41858 + tab-width: 8
41859 + fill-column: 120
41860 + End:
41861 +*/
41862 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/cde.c linux-2.6.23/fs/reiser4/plugin/item/cde.c
41863 --- linux-2.6.23.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
41864 +++ linux-2.6.23/fs/reiser4/plugin/item/cde.c 2007-12-04 16:49:30.000000000 +0300
41865 @@ -0,0 +1,1008 @@
41866 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41867 +
41868 +/* Directory entry implementation */
41869 +
41870 +/* DESCRIPTION:
41871 +
41872 + This is "compound" directory item plugin implementation. This directory
41873 + item type is compound (as opposed to the "simple directory item" in
41874 + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
41875 + entries.
41876 +
41877 + The reason behind this decision is disk space efficiency: all directory
41878 + entries inside the same directory have identical fragment in their
41879 + keys. This, of course, depends on key assignment policy. In our default key
41880 + assignment policy, all directory entries have the same locality which is
41881 + equal to the object id of their directory.
41882 +
41883 + Composing directory item out of several directory entries for the same
41884 + directory allows us to store said key fragment only once. That is, this is
41885 + some ad hoc form of key compression (stem compression) that is implemented
41886 + here, because general key compression is not supposed to be implemented in
41887 + v4.0.
41888 +
41889 + Another decision that was made regarding all directory item plugins, is
41890 + that they will store entry keys unaligned. This is for that sake of disk
41891 + space efficiency again.
41892 +
41893 + In should be noted, that storing keys unaligned increases CPU consumption,
41894 + at least on some architectures.
41895 +
41896 + Internal on-disk structure of the compound directory item is the following:
41897 +
41898 + HEADER cde_item_format. Here number of entries is stored.
41899 + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
41900 + ENTRY_HEADER_1 offset of entry body are stored.
41901 + ENTRY_HEADER_2 (basically two last parts of key)
41902 + ...
41903 + ENTRY_HEADER_N
41904 + ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
41905 + ENTRY_BODY_1 NUL-terminated name are stored.
41906 + ENTRY_BODY_2 (part of statadta key in the
41907 + sence that since all SDs have
41908 + zero offset, this offset is not
41909 + stored on disk).
41910 + ...
41911 + ENTRY_BODY_N
41912 +
41913 + When it comes to the balancing, each directory entry in compound directory
41914 + item is unit, that is, something that can be cut from one item and pasted
41915 + into another item of the same type. Handling of unit cut and paste is major
41916 + reason for the complexity of code below.
41917 +
41918 +*/
41919 +
41920 +#include "../../forward.h"
41921 +#include "../../debug.h"
41922 +#include "../../dformat.h"
41923 +#include "../../kassign.h"
41924 +#include "../../key.h"
41925 +#include "../../coord.h"
41926 +#include "sde.h"
41927 +#include "cde.h"
41928 +#include "item.h"
41929 +#include "../node/node.h"
41930 +#include "../plugin.h"
41931 +#include "../../znode.h"
41932 +#include "../../carry.h"
41933 +#include "../../tree.h"
41934 +#include "../../inode.h"
41935 +
41936 +#include <linux/fs.h> /* for struct inode */
41937 +#include <linux/dcache.h> /* for struct dentry */
41938 +#include <linux/quotaops.h>
41939 +
41940 +#if 0
41941 +#define CHECKME(coord) \
41942 +({ \
41943 + const char *message; \
41944 + coord_t dup; \
41945 + \
41946 + coord_dup_nocheck(&dup, (coord)); \
41947 + dup.unit_pos = 0; \
41948 + assert("nikita-2871", cde_check(&dup, &message) == 0); \
41949 +})
41950 +#else
41951 +#define CHECKME(coord) noop
41952 +#endif
41953 +
41954 +/* return body of compound directory item at @coord */
41955 +static inline cde_item_format *formatted_at(const coord_t * coord)
41956 +{
41957 + assert("nikita-1282", coord != NULL);
41958 + return item_body_by_coord(coord);
41959 +}
41960 +
41961 +/* return entry header at @coord */
41962 +static inline cde_unit_header *header_at(const coord_t *
41963 + coord /* coord of item */ ,
41964 + int idx /* index of unit */ )
41965 +{
41966 + assert("nikita-1283", coord != NULL);
41967 + return &formatted_at(coord)->entry[idx];
41968 +}
41969 +
41970 +/* return number of units in compound directory item at @coord */
41971 +static int units(const coord_t * coord /* coord of item */ )
41972 +{
41973 + return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
41974 +}
41975 +
41976 +/* return offset of the body of @idx-th entry in @coord */
41977 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
41978 + int idx /* index of unit */ )
41979 +{
41980 + if (idx < units(coord))
41981 + return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
41982 + else if (idx == units(coord))
41983 + return item_length_by_coord(coord);
41984 + else
41985 + impossible("nikita-1308", "Wrong idx");
41986 + return 0;
41987 +}
41988 +
41989 +/* set offset of the body of @idx-th entry in @coord */
41990 +static void set_offset(const coord_t * coord /* coord of item */ ,
41991 + int idx /* index of unit */ ,
41992 + unsigned int offset /* new offset */ )
41993 +{
41994 + put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
41995 +}
41996 +
41997 +static void adj_offset(const coord_t * coord /* coord of item */ ,
41998 + int idx /* index of unit */ ,
41999 + int delta /* offset change */ )
42000 +{
42001 + d16 *doffset;
42002 + __u16 offset;
42003 +
42004 + doffset = &header_at(coord, idx)->offset;
42005 + offset = le16_to_cpu(get_unaligned(doffset));
42006 + offset += delta;
42007 + put_unaligned(cpu_to_le16((__u16) offset), doffset);
42008 +}
42009 +
42010 +/* return pointer to @offset-th byte from the beginning of @coord */
42011 +static char *address(const coord_t * coord /* coord of item */ ,
42012 + int offset)
42013 +{
42014 + return ((char *)item_body_by_coord(coord)) + offset;
42015 +}
42016 +
42017 +/* return pointer to the body of @idx-th entry in @coord */
42018 +static directory_entry_format *entry_at(const coord_t * coord /* coord of
42019 + * item */ ,
42020 + int idx /* index of unit */ )
42021 +{
42022 + return (directory_entry_format *) address(coord,
42023 + (int)offset_of(coord, idx));
42024 +}
42025 +
42026 +/* return number of unit referenced by @coord */
42027 +static int idx_of(const coord_t * coord /* coord of item */ )
42028 +{
42029 + assert("nikita-1285", coord != NULL);
42030 + return coord->unit_pos;
42031 +}
42032 +
42033 +/* find position where entry with @entry_key would be inserted into @coord */
42034 +static int find(const coord_t * coord /* coord of item */ ,
42035 + const reiser4_key * entry_key /* key to look for */ ,
42036 + cmp_t * last /* result of last comparison */ )
42037 +{
42038 + int entries;
42039 +
42040 + int left;
42041 + int right;
42042 +
42043 + cde_unit_header *header;
42044 +
42045 + assert("nikita-1295", coord != NULL);
42046 + assert("nikita-1296", entry_key != NULL);
42047 + assert("nikita-1297", last != NULL);
42048 +
42049 + entries = units(coord);
42050 + left = 0;
42051 + right = entries - 1;
42052 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
42053 + int median;
42054 +
42055 + median = (left + right) >> 1;
42056 +
42057 + header = header_at(coord, median);
42058 + *last = de_id_key_cmp(&header->hash, entry_key);
42059 + switch (*last) {
42060 + case LESS_THAN:
42061 + left = median;
42062 + break;
42063 + case GREATER_THAN:
42064 + right = median;
42065 + break;
42066 + case EQUAL_TO:{
42067 + do {
42068 + median--;
42069 + header--;
42070 + } while (median >= 0 &&
42071 + de_id_key_cmp(&header->hash,
42072 + entry_key) == EQUAL_TO);
42073 + return median + 1;
42074 + }
42075 + }
42076 + }
42077 + header = header_at(coord, left);
42078 + for (; left < entries; ++left, ++header) {
42079 + prefetch(header + 1);
42080 + *last = de_id_key_cmp(&header->hash, entry_key);
42081 + if (*last != LESS_THAN)
42082 + break;
42083 + }
42084 + if (left < entries)
42085 + return left;
42086 + else
42087 + return RETERR(-ENOENT);
42088 +
42089 +}
42090 +
42091 +/* expand @coord as to accommodate for insertion of @no new entries starting
42092 + from @pos, with total bodies size @size. */
42093 +static int expand_item(const coord_t * coord /* coord of item */ ,
42094 + int pos /* unit position */ , int no /* number of new
42095 + * units*/ ,
42096 + int size /* total size of new units' data */ ,
42097 + unsigned int data_size /* free space already reserved
42098 + * in the item for insertion */ )
42099 +{
42100 + int entries;
42101 + cde_unit_header *header;
42102 + char *dent;
42103 + int i;
42104 +
42105 + assert("nikita-1310", coord != NULL);
42106 + assert("nikita-1311", pos >= 0);
42107 + assert("nikita-1312", no > 0);
42108 + assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
42109 + assert("nikita-1343",
42110 + item_length_by_coord(coord) >=
42111 + (int)(size + data_size + no * sizeof *header));
42112 +
42113 + entries = units(coord);
42114 +
42115 + if (pos == entries)
42116 + dent = address(coord, size);
42117 + else
42118 + dent = (char *)entry_at(coord, pos);
42119 + /* place where new header will be in */
42120 + header = header_at(coord, pos);
42121 + /* free space for new entry headers */
42122 + memmove(header + no, header,
42123 + (unsigned)(address(coord, size) - (char *)header));
42124 +       /* if adding to the end, initialise the first new header */
42125 + if (pos == entries) {
42126 + set_offset(coord, pos, (unsigned)size);
42127 + }
42128 +
42129 + /* adjust entry pointer and size */
42130 + dent = dent + no * sizeof *header;
42131 + size += no * sizeof *header;
42132 + /* free space for new entries */
42133 + memmove(dent + data_size, dent,
42134 + (unsigned)(address(coord, size) - dent));
42135 +
42136 + /* increase counter */
42137 + entries += no;
42138 + put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
42139 +
42140 + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
42141 + bytes. */
42142 + for (i = 0; i <= pos; ++i)
42143 + adj_offset(coord, i, no * sizeof *header);
42144 + /* [ pos + no ... +\infty ) entries were shifted by ( no *
42145 + sizeof *header + data_size ) bytes */
42146 + for (i = pos + no; i < entries; ++i)
42147 + adj_offset(coord, i, no * sizeof *header + data_size);
42148 + return 0;
42149 +}
42150 +
42151 +/* insert new @entry into item */
42152 +static int expand(const coord_t * coord /* coord of item */ ,
42153 + struct cde_entry * entry /* entry to insert */ ,
42154 + int len /* length of @entry data */ ,
42155 + int *pos /* position to insert */ ,
42156 + reiser4_dir_entry_desc * dir_entry /* parameters for new
42157 + * entry */ )
42158 +{
42159 + cmp_t cmp_res;
42160 + int datasize;
42161 +
42162 + *pos = find(coord, &dir_entry->key, &cmp_res);
42163 + if (*pos < 0)
42164 + *pos = units(coord);
42165 +
42166 + datasize = sizeof(directory_entry_format);
42167 + if (is_longname(entry->name->name, entry->name->len))
42168 + datasize += entry->name->len + 1;
42169 +
42170 + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
42171 + datasize);
42172 + return 0;
42173 +}
42174 +
42175 +/* paste body of @entry into item */
42176 +static int paste_entry(const coord_t * coord /* coord of item */ ,
42177 + struct cde_entry * entry /* new entry */ ,
42178 + int pos /* position to insert */ ,
42179 + reiser4_dir_entry_desc * dir_entry /* parameters for
42180 + * new entry */ )
42181 +{
42182 + cde_unit_header *header;
42183 + directory_entry_format *dent;
42184 + const char *name;
42185 + int len;
42186 +
42187 + header = header_at(coord, pos);
42188 + dent = entry_at(coord, pos);
42189 +
42190 + build_de_id_by_key(&dir_entry->key, &header->hash);
42191 + build_inode_key_id(entry->obj, &dent->id);
42192 + /* AUDIT unsafe strcpy() operation! It should be replaced with
42193 + much less CPU hungry
42194 + memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
42195 +
42196 + Also a more major thing is that there should be a way to figure out
42197 + amount of space in dent -> name and be able to check that we are
42198 + not going to overwrite more than we supposed to */
42199 + name = entry->name->name;
42200 + len = entry->name->len;
42201 + if (is_longname(name, len)) {
42202 + strcpy((unsigned char *)dent->name, name);
42203 + put_unaligned(0, &dent->name[len]);
42204 + }
42205 + return 0;
42206 +}
42207 +
42208 +/* estimate how much space is necessary in item to insert/paste set of entries
42209 + described in @data. */
42210 +int estimate_cde(const coord_t * coord /* coord of item */ ,
42211 + const reiser4_item_data * data /* parameters for new item */ )
42212 +{
42213 + struct cde_entry_data *e;
42214 + int result;
42215 + int i;
42216 +
42217 + e = (struct cde_entry_data *) data->data;
42218 +
42219 + assert("nikita-1288", e != NULL);
42220 + assert("nikita-1289", e->num_of_entries >= 0);
42221 +
42222 + if (coord == NULL)
42223 + /* insert */
42224 + result = sizeof(cde_item_format);
42225 + else
42226 + /* paste */
42227 + result = 0;
42228 +
42229 + result += e->num_of_entries *
42230 + (sizeof(cde_unit_header) + sizeof(directory_entry_format));
42231 + for (i = 0; i < e->num_of_entries; ++i) {
42232 + const char *name;
42233 + int len;
42234 +
42235 + name = e->entry[i].name->name;
42236 + len = e->entry[i].name->len;
42237 + assert("nikita-2054", strlen(name) == len);
42238 + if (is_longname(name, len))
42239 + result += len + 1;
42240 + }
42241 + ((reiser4_item_data *) data)->length = result;
42242 + return result;
42243 +}
42244 +
42245 +/* ->nr_units() method for this item plugin. */
42246 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
42247 +{
42248 + return units(coord);
42249 +}
42250 +
42251 +/* ->unit_key() method for this item plugin. */
42252 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
42253 + reiser4_key * key /* resulting key */ )
42254 +{
42255 + assert("nikita-1452", coord != NULL);
42256 + assert("nikita-1345", idx_of(coord) < units(coord));
42257 + assert("nikita-1346", key != NULL);
42258 +
42259 + item_key_by_coord(coord, key);
42260 + extract_key_from_de_id(extract_dir_id_from_key(key),
42261 + &header_at(coord, idx_of(coord))->hash, key);
42262 + return key;
42263 +}
42264 +
42265 +/* mergeable_cde(): implementation of ->mergeable() item method.
42266 +
42267 + Two directory items are mergeable iff they are from the same
42268 + directory. That simple.
42269 +
42270 +*/
42271 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
42272 + const coord_t * p2 /* coord of second item */ )
42273 +{
42274 + reiser4_key k1;
42275 + reiser4_key k2;
42276 +
42277 + assert("nikita-1339", p1 != NULL);
42278 + assert("nikita-1340", p2 != NULL);
42279 +
42280 + return
42281 + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
42282 + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
42283 + extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
42284 +
42285 +}
42286 +
42287 +/* ->max_key_inside() method for this item plugin. */
42288 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
42289 + reiser4_key * result /* resulting key */ )
42290 +{
42291 + assert("nikita-1342", coord != NULL);
42292 +
42293 + item_key_by_coord(coord, result);
42294 + set_key_ordering(result, get_key_ordering(reiser4_max_key()));
42295 + set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
42296 + set_key_offset(result, get_key_offset(reiser4_max_key()));
42297 + return result;
42298 +}
42299 +
42300 +/* @data contains data which are to be put into tree */
42301 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
42302 + const reiser4_key * key /* key to check */ ,
42303 + const reiser4_item_data * data /* parameters of new
42304 + * item/unit being
42305 + * created */ )
42306 +{
42307 + reiser4_key item_key;
42308 +
42309 + /* FIXME-VS: do not rely on anything but iplug field of @data. Only
42310 + data->iplug is initialized */
42311 + assert("vs-457", data && data->iplug);
42312 +/* assert( "vs-553", data -> user == 0 );*/
42313 + item_key_by_coord(coord, &item_key);
42314 +
42315 + return (item_plugin_by_coord(coord) == data->iplug) &&
42316 + (extract_dir_id_from_key(&item_key) ==
42317 + extract_dir_id_from_key(key));
42318 +}
42319 +
42320 +#if REISER4_DEBUG
42321 +/* cde_check ->check() method for compressed directory items
42322 +
42323 +   used for debugging; every item should provide here the most complete
42324 +   possible check of the item's consistency that its author can
42325 +   construct
42326 +*/
42327 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
42328 + const char **error /* where to store error message */)
42329 +{
42330 + int i;
42331 + int result;
42332 + char *item_start;
42333 + char *item_end;
42334 + reiser4_key key;
42335 +
42336 + coord_t c;
42337 +
42338 + assert("nikita-1357", coord != NULL);
42339 + assert("nikita-1358", error != NULL);
42340 +
42341 + if (!ergo(coord->item_pos != 0,
42342 + is_dot_key(item_key_by_coord(coord, &key)))) {
42343 + *error = "CDE doesn't start with dot";
42344 + return -1;
42345 + }
42346 + item_start = item_body_by_coord(coord);
42347 + item_end = item_start + item_length_by_coord(coord);
42348 +
42349 + coord_dup(&c, coord);
42350 + result = 0;
42351 + for (i = 0; i < units(coord); ++i) {
42352 + directory_entry_format *entry;
42353 +
42354 + if ((char *)(header_at(coord, i) + 1) >
42355 + item_end - units(coord) * sizeof *entry) {
42356 + *error = "CDE header is out of bounds";
42357 + result = -1;
42358 + break;
42359 + }
42360 + entry = entry_at(coord, i);
42361 + if ((char *)entry < item_start + sizeof(cde_item_format)) {
42362 + *error = "CDE header is too low";
42363 + result = -1;
42364 + break;
42365 + }
42366 + if ((char *)(entry + 1) > item_end) {
42367 + *error = "CDE header is too high";
42368 + result = -1;
42369 + break;
42370 + }
42371 + }
42372 +
42373 + return result;
42374 +}
42375 +#endif
42376 +
42377 +/* ->init() method for this item plugin. */
42378 +int init_cde(coord_t * coord /* coord of item */ ,
42379 + coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
42380 + UNUSED_ARG)
42381 +{
42382 + put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
42383 + return 0;
42384 +}
42385 +
42386 +/* ->lookup() method for this item plugin. */
42387 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
42388 + lookup_bias bias /* search bias */ ,
42389 + coord_t * coord /* coord of item to lookup in */ )
42390 +{
42391 + cmp_t last_comp;
42392 + int pos;
42393 +
42394 + reiser4_key utmost_key;
42395 +
42396 + assert("nikita-1293", coord != NULL);
42397 + assert("nikita-1294", key != NULL);
42398 +
42399 + CHECKME(coord);
42400 +
42401 + if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
42402 + coord->unit_pos = 0;
42403 + coord->between = BEFORE_UNIT;
42404 + return CBK_COORD_NOTFOUND;
42405 + }
42406 + pos = find(coord, key, &last_comp);
42407 + if (pos >= 0) {
42408 + coord->unit_pos = (int)pos;
42409 + switch (last_comp) {
42410 + case EQUAL_TO:
42411 + coord->between = AT_UNIT;
42412 + return CBK_COORD_FOUND;
42413 + case GREATER_THAN:
42414 + coord->between = BEFORE_UNIT;
42415 + return RETERR(-ENOENT);
42416 + case LESS_THAN:
42417 + default:
42418 + impossible("nikita-1298", "Broken find");
42419 + return RETERR(-EIO);
42420 + }
42421 + } else {
42422 + coord->unit_pos = units(coord) - 1;
42423 + coord->between = AFTER_UNIT;
42424 + return (bias ==
42425 + FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
42426 + CBK_COORD_NOTFOUND;
42427 + }
42428 +}
42429 +
42430 +/* ->paste() method for this item plugin. */
42431 +int paste_cde(coord_t * coord /* coord of item */ ,
42432 + reiser4_item_data * data /* parameters of new unit being
42433 + * inserted */ ,
42434 + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
42435 +{
42436 + struct cde_entry_data *e;
42437 + int result;
42438 + int i;
42439 +
42440 + CHECKME(coord);
42441 + e = (struct cde_entry_data *) data->data;
42442 +
42443 + result = 0;
42444 + for (i = 0; i < e->num_of_entries; ++i) {
42445 + int pos;
42446 + int phantom_size;
42447 +
42448 + phantom_size = data->length;
42449 + if (units(coord) == 0)
42450 + phantom_size -= sizeof(cde_item_format);
42451 +
42452 + result =
42453 + expand(coord, e->entry + i, phantom_size, &pos, data->arg);
42454 + if (result != 0)
42455 + break;
42456 + result = paste_entry(coord, e->entry + i, pos, data->arg);
42457 + if (result != 0)
42458 + break;
42459 + }
42460 + CHECKME(coord);
42461 + return result;
42462 +}
42463 +
42464 +/* amount of space occupied by all entries starting from @idx both headers and
42465 + bodies. */
42466 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
42467 + int idx /* index of unit */ )
42468 +{
42469 + assert("nikita-1299", coord != NULL);
42470 + assert("nikita-1300", idx < (int)units(coord));
42471 +
42472 + return sizeof(cde_item_format) +
42473 + (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
42474 + idx + 1) -
42475 + offset_of(coord, 0);
42476 +}
42477 +
42478 +/* how many but not more than @want units of @source can be merged with
42479 + item in @target node. If pend == append - we try to append last item
42480 + of @target by first units of @source. If pend == prepend - we try to
42481 + "prepend" first item in @target by last units of @source. @target
42482 + node has @free_space bytes of free space. Total size of those units
42483 + are returned via @size */
42484 +int can_shift_cde(unsigned free_space /* free space in item */ ,
42485 + coord_t * coord /* coord of source item */ ,
42486 + znode * target /* target node */ ,
42487 + shift_direction pend /* shift direction */ ,
42488 + unsigned *size /* resulting number of shifted bytes */ ,
42489 + unsigned want /* maximal number of bytes to shift */ )
42490 +{
42491 + int shift;
42492 +
42493 + CHECKME(coord);
42494 + if (want == 0) {
42495 + *size = 0;
42496 + return 0;
42497 + }
42498 +
42499 + /* pend == SHIFT_LEFT <==> shifting to the left */
42500 + if (pend == SHIFT_LEFT) {
42501 + for (shift = min((int)want - 1, units(coord)); shift >= 0;
42502 + --shift) {
42503 + *size = part_size(coord, shift);
42504 + if (target != NULL)
42505 + *size -= sizeof(cde_item_format);
42506 + if (*size <= free_space)
42507 + break;
42508 + }
42509 + shift = shift + 1;
42510 + } else {
42511 + int total_size;
42512 +
42513 + assert("nikita-1301", pend == SHIFT_RIGHT);
42514 +
42515 + total_size = item_length_by_coord(coord);
42516 + for (shift = units(coord) - want - 1; shift < units(coord) - 1;
42517 + ++shift) {
42518 + *size = total_size - part_size(coord, shift);
42519 + if (target == NULL)
42520 + *size += sizeof(cde_item_format);
42521 + if (*size <= free_space)
42522 + break;
42523 + }
42524 + shift = units(coord) - shift - 1;
42525 + }
42526 + if (shift == 0)
42527 + *size = 0;
42528 + CHECKME(coord);
42529 + return shift;
42530 +}
42531 +
42532 +/* ->copy_units() method for this item plugin. */
42533 +void copy_units_cde(coord_t * target /* coord of target item */ ,
42534 + coord_t * source /* coord of source item */ ,
42535 + unsigned from /* starting unit */ ,
42536 + unsigned count /* how many units to copy */ ,
42537 + shift_direction where_is_free_space /* shift direction */ ,
42538 + unsigned free_space /* free space in item */ )
42539 +{
42540 + char *header_from;
42541 + char *header_to;
42542 +
42543 + char *entry_from;
42544 + char *entry_to;
42545 +
42546 + int pos_in_target;
42547 + int data_size;
42548 + int data_delta;
42549 + int i;
42550 +
42551 + assert("nikita-1303", target != NULL);
42552 + assert("nikita-1304", source != NULL);
42553 + assert("nikita-1305", (int)from < units(source));
42554 + assert("nikita-1307", (int)(from + count) <= units(source));
42555 +
42556 + if (where_is_free_space == SHIFT_LEFT) {
42557 + assert("nikita-1453", from == 0);
42558 + pos_in_target = units(target);
42559 + } else {
42560 + assert("nikita-1309", (int)(from + count) == units(source));
42561 + pos_in_target = 0;
42562 + memmove(item_body_by_coord(target),
42563 + (char *)item_body_by_coord(target) + free_space,
42564 + item_length_by_coord(target) - free_space);
42565 + }
42566 +
42567 + CHECKME(target);
42568 + CHECKME(source);
42569 +
42570 + /* expand @target */
42571 + data_size =
42572 + offset_of(source, (int)(from + count)) - offset_of(source,
42573 + (int)from);
42574 +
42575 + if (units(target) == 0)
42576 + free_space -= sizeof(cde_item_format);
42577 +
42578 + expand_item(target, pos_in_target, (int)count,
42579 + (int)(item_length_by_coord(target) - free_space),
42580 + (unsigned)data_size);
42581 +
42582 + /* copy first @count units of @source into @target */
42583 + data_delta =
42584 + offset_of(target, pos_in_target) - offset_of(source, (int)from);
42585 +
42586 + /* copy entries */
42587 + entry_from = (char *)entry_at(source, (int)from);
42588 + entry_to = (char *)entry_at(source, (int)(from + count));
42589 + memmove(entry_at(target, pos_in_target), entry_from,
42590 + (unsigned)(entry_to - entry_from));
42591 +
42592 + /* copy headers */
42593 + header_from = (char *)header_at(source, (int)from);
42594 + header_to = (char *)header_at(source, (int)(from + count));
42595 + memmove(header_at(target, pos_in_target), header_from,
42596 + (unsigned)(header_to - header_from));
42597 +
42598 + /* update offsets */
42599 + for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
42600 + adj_offset(target, i, data_delta);
42601 + CHECKME(target);
42602 + CHECKME(source);
42603 +}
42604 +
42605 +/* ->cut_units() method for this item plugin. */
42606 +int cut_units_cde(coord_t * coord /* coord of item */ ,
42607 + pos_in_node_t from /* start unit pos */ ,
42608 + pos_in_node_t to /* stop unit pos */ ,
42609 + struct carry_cut_data *cdata UNUSED_ARG,
42610 + reiser4_key * smallest_removed, reiser4_key * new_first)
42611 +{
42612 + char *header_from;
42613 + char *header_to;
42614 +
42615 + char *entry_from;
42616 + char *entry_to;
42617 +
42618 + int size;
42619 + int entry_delta;
42620 + int header_delta;
42621 + int i;
42622 +
42623 + unsigned count;
42624 +
42625 + CHECKME(coord);
42626 +
42627 + count = to - from + 1;
42628 +
42629 + assert("nikita-1454", coord != NULL);
42630 + assert("nikita-1455", (int)(from + count) <= units(coord));
42631 +
42632 + if (smallest_removed)
42633 + unit_key_by_coord(coord, smallest_removed);
42634 +
42635 + if (new_first) {
42636 + coord_t next;
42637 +
42638 + /* not everything is cut from item head */
42639 + assert("vs-1527", from == 0);
42640 + assert("vs-1528", to < units(coord) - 1);
42641 +
42642 + coord_dup(&next, coord);
42643 + next.unit_pos++;
42644 + unit_key_by_coord(&next, new_first);
42645 + }
42646 +
42647 + size = item_length_by_coord(coord);
42648 + if (count == (unsigned)units(coord)) {
42649 + return size;
42650 + }
42651 +
42652 + header_from = (char *)header_at(coord, (int)from);
42653 + header_to = (char *)header_at(coord, (int)(from + count));
42654 +
42655 + entry_from = (char *)entry_at(coord, (int)from);
42656 + entry_to = (char *)entry_at(coord, (int)(from + count));
42657 +
42658 + /* move headers */
42659 + memmove(header_from, header_to,
42660 + (unsigned)(address(coord, size) - header_to));
42661 +
42662 + header_delta = header_to - header_from;
42663 +
42664 + entry_from -= header_delta;
42665 + entry_to -= header_delta;
42666 + size -= header_delta;
42667 +
42668 + /* copy entries */
42669 + memmove(entry_from, entry_to,
42670 + (unsigned)(address(coord, size) - entry_to));
42671 +
42672 + entry_delta = entry_to - entry_from;
42673 + size -= entry_delta;
42674 +
42675 + /* update offsets */
42676 +
42677 + for (i = 0; i < (int)from; ++i)
42678 + adj_offset(coord, i, -header_delta);
42679 +
42680 + for (i = from; i < units(coord) - (int)count; ++i)
42681 + adj_offset(coord, i, -header_delta - entry_delta);
42682 +
42683 + put_unaligned(cpu_to_le16((__u16) units(coord) - count),
42684 + &formatted_at(coord)->num_of_entries);
42685 +
42686 + if (from == 0) {
42687 +               /* entries from head were removed - move remaining to right */
42688 + memmove((char *)item_body_by_coord(coord) +
42689 + header_delta + entry_delta, item_body_by_coord(coord),
42690 + (unsigned)size);
42691 + if (REISER4_DEBUG)
42692 + memset(item_body_by_coord(coord), 0,
42693 + (unsigned)header_delta + entry_delta);
42694 + } else {
42695 + /* freed space is already at the end of item */
42696 + if (REISER4_DEBUG)
42697 + memset((char *)item_body_by_coord(coord) + size, 0,
42698 + (unsigned)header_delta + entry_delta);
42699 + }
42700 +
42701 + return header_delta + entry_delta;
42702 +}
42703 +
42704 +int kill_units_cde(coord_t * coord /* coord of item */ ,
42705 + pos_in_node_t from /* start unit pos */ ,
42706 + pos_in_node_t to /* stop unit pos */ ,
42707 + struct carry_kill_data *kdata UNUSED_ARG,
42708 + reiser4_key * smallest_removed, reiser4_key * new_first)
42709 +{
42710 + return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
42711 +}
42712 +
42713 +/* ->s.dir.extract_key() method for this item plugin. */
42714 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
42715 + reiser4_key * key /* resulting key */ )
42716 +{
42717 + directory_entry_format *dent;
42718 +
42719 + assert("nikita-1155", coord != NULL);
42720 + assert("nikita-1156", key != NULL);
42721 +
42722 + dent = entry_at(coord, idx_of(coord));
42723 + return extract_key_from_id(&dent->id, key);
42724 +}
42725 +
42726 +int
42727 +update_key_cde(const coord_t * coord, const reiser4_key * key,
42728 + lock_handle * lh UNUSED_ARG)
42729 +{
42730 + directory_entry_format *dent;
42731 + obj_key_id obj_id;
42732 + int result;
42733 +
42734 + assert("nikita-2344", coord != NULL);
42735 + assert("nikita-2345", key != NULL);
42736 +
42737 + dent = entry_at(coord, idx_of(coord));
42738 + result = build_obj_key_id(key, &obj_id);
42739 + if (result == 0) {
42740 + dent->id = obj_id;
42741 + znode_make_dirty(coord->node);
42742 + }
42743 + return 0;
42744 +}
42745 +
42746 +/* ->s.dir.extract_name() method for this item plugin. */
42747 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
42748 +{
42749 + directory_entry_format *dent;
42750 +
42751 + assert("nikita-1157", coord != NULL);
42752 +
42753 + dent = entry_at(coord, idx_of(coord));
42754 + return extract_dent_name(coord, dent, buf);
42755 +}
42756 +
42757 +static int cde_bytes(int pasting, const reiser4_item_data * data)
42758 +{
42759 + int result;
42760 +
42761 + result = data->length;
42762 + if (!pasting)
42763 + result -= sizeof(cde_item_format);
42764 + return result;
42765 +}
42766 +
42767 +/* ->s.dir.add_entry() method for this item plugin */
42768 +int add_entry_cde(struct inode *dir /* directory object */ ,
42769 + coord_t * coord /* coord of item */ ,
42770 + lock_handle * lh /* lock handle for insertion */ ,
42771 + const struct dentry *name /* name to insert */ ,
42772 + reiser4_dir_entry_desc * dir_entry /* parameters of new
42773 + * directory entry */ )
42774 +{
42775 + reiser4_item_data data;
42776 + struct cde_entry entry;
42777 + struct cde_entry_data edata;
42778 + int result;
42779 +
42780 + assert("nikita-1656", coord->node == lh->node);
42781 + assert("nikita-1657", znode_is_write_locked(coord->node));
42782 +
42783 + edata.num_of_entries = 1;
42784 + edata.entry = &entry;
42785 +
42786 + entry.dir = dir;
42787 + entry.obj = dir_entry->obj;
42788 + entry.name = &name->d_name;
42789 +
42790 + data.data = (char *)&edata;
42791 + data.user = 0; /* &edata is not user space */
42792 + data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
42793 + data.arg = dir_entry;
42794 + assert("nikita-1302", data.iplug != NULL);
42795 +
42796 + result = is_dot_key(&dir_entry->key);
42797 + data.length = estimate_cde(result ? coord : NULL, &data);
42798 +
42799 + /* NOTE-NIKITA quota plugin? */
42800 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
42801 + return RETERR(-EDQUOT);
42802 +
42803 + if (result)
42804 + result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
42805 + else
42806 + result = reiser4_resize_item(coord, &data, &dir_entry->key,
42807 + lh, 0);
42808 + return result;
42809 +}
42810 +
42811 +/* ->s.dir.rem_entry() */
42812 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
42813 + const struct qstr *name, coord_t * coord /* coord of item */ ,
42814 + lock_handle * lh UNUSED_ARG /* lock handle for
42815 + * removal */ ,
42816 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
42817 + * directory entry
42818 + * being removed */ )
42819 +{
42820 + coord_t shadow;
42821 + int result;
42822 + int length;
42823 + ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
42824 +
42825 + assert("nikita-2870", strlen(name->name) == name->len);
42826 + assert("nikita-2869",
42827 + !strcmp(name->name, extract_name_cde(coord, buf)));
42828 +
42829 + length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
42830 + if (is_longname(name->name, name->len))
42831 + length += name->len + 1;
42832 +
42833 + if (inode_get_bytes(dir) < length) {
42834 + warning("nikita-2628", "Dir is broke: %llu: %llu",
42835 + (unsigned long long)get_inode_oid(dir),
42836 + inode_get_bytes(dir));
42837 +
42838 + return RETERR(-EIO);
42839 + }
42840 +
42841 + /* cut_node() is supposed to take pointers to _different_
42842 + coords, because it will modify them without respect to
42843 + possible aliasing. To work around this, create temporary copy
42844 + of @coord.
42845 + */
42846 + coord_dup(&shadow, coord);
42847 + result =
42848 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
42849 + if (result == 0) {
42850 + /* NOTE-NIKITA quota plugin? */
42851 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
42852 + }
42853 + return result;
42854 +}
42855 +
42856 +/* ->s.dir.max_name_len() method for this item plugin */
42857 +int max_name_len_cde(const struct inode *dir /* directory */ )
42858 +{
42859 + return
42860 + reiser4_tree_by_inode(dir)->nplug->max_item_size() -
42861 + sizeof(directory_entry_format) - sizeof(cde_item_format) -
42862 + sizeof(cde_unit_header) - 2;
42863 +}
42864 +
42865 +/* Make Linus happy.
42866 + Local variables:
42867 + c-indentation-style: "K&R"
42868 + mode-name: "LC"
42869 + c-basic-offset: 8
42870 + tab-width: 8
42871 + fill-column: 120
42872 + End:
42873 +*/
42874 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/cde.h linux-2.6.23/fs/reiser4/plugin/item/cde.h
42875 --- linux-2.6.23.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
42876 +++ linux-2.6.23/fs/reiser4/plugin/item/cde.h 2007-12-04 16:49:30.000000000 +0300
42877 @@ -0,0 +1,87 @@
42878 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42879 +
42880 +/* Compound directory item. See cde.c for description. */
42881 +
42882 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
42883 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
42884 +
42885 +#include "../../forward.h"
42886 +#include "../../kassign.h"
42887 +#include "../../dformat.h"
42888 +
42889 +#include <linux/fs.h> /* for struct inode */
42890 +#include <linux/dcache.h> /* for struct dentry, etc */
42891 +
42892 +typedef struct cde_unit_header {
42893 + de_id hash;
42894 + d16 offset;
42895 +} cde_unit_header;
42896 +
42897 +typedef struct cde_item_format {
42898 + d16 num_of_entries;
42899 + cde_unit_header entry[0];
42900 +} cde_item_format;
42901 +
42902 +struct cde_entry {
42903 + const struct inode *dir;
42904 + const struct inode *obj;
42905 + const struct qstr *name;
42906 +};
42907 +
42908 +struct cde_entry_data {
42909 + int num_of_entries;
42910 + struct cde_entry *entry;
42911 +};
42912 +
42913 +/* plugin->item.b.* */
42914 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
42915 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
42916 + const reiser4_item_data *);
42917 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
42918 +pos_in_node_t nr_units_cde(const coord_t * coord);
42919 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
42920 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
42921 +void print_cde(const char *prefix, coord_t * coord);
42922 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
42923 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
42924 + coord_t * coord);
42925 +int paste_cde(coord_t * coord, reiser4_item_data * data,
42926 + carry_plugin_info * info UNUSED_ARG);
42927 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
42928 + shift_direction pend, unsigned *size, unsigned want);
42929 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
42930 + unsigned count, shift_direction where_is_free_space,
42931 + unsigned free_space);
42932 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
42933 + struct carry_cut_data *, reiser4_key * smallest_removed,
42934 + reiser4_key * new_first);
42935 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
42936 + struct carry_kill_data *, reiser4_key * smallest_removed,
42937 + reiser4_key * new_first);
42938 +void print_cde(const char *prefix, coord_t * coord);
42939 +int reiser4_check_cde(const coord_t * coord, const char **error);
42940 +
42941 +/* plugin->u.item.s.dir.* */
42942 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
42943 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
42944 + lock_handle * lh);
42945 +char *extract_name_cde(const coord_t * coord, char *buf);
42946 +int add_entry_cde(struct inode *dir, coord_t * coord,
42947 + lock_handle * lh, const struct dentry *name,
42948 + reiser4_dir_entry_desc * entry);
42949 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
42950 + lock_handle * lh, reiser4_dir_entry_desc * entry);
42951 +int max_name_len_cde(const struct inode *dir);
42952 +
42953 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
42954 +#endif
42955 +
42956 +/* Make Linus happy.
42957 + Local variables:
42958 + c-indentation-style: "K&R"
42959 + mode-name: "LC"
42960 + c-basic-offset: 8
42961 + tab-width: 8
42962 + fill-column: 120
42963 + End:
42964 +*/
42965 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.23/fs/reiser4/plugin/item/ctail.c
42966 --- linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
42967 +++ linux-2.6.23/fs/reiser4/plugin/item/ctail.c 2007-12-04 23:04:00.730306034 +0300
42968 @@ -0,0 +1,1615 @@
42969 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42970 +
42971 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
42972 +
42973 +/* DESCRIPTION:
42974 +
42975 +Each cryptcompress object is stored on disk as a set of clusters sliced
42976 +into ctails.
42977 +
42978 +Internal on-disk structure:
42979 +
42980 + HEADER (1) Here stored disk cluster shift
42981 + BODY
42982 +*/
42983 +
42984 +#include "../../forward.h"
42985 +#include "../../debug.h"
42986 +#include "../../dformat.h"
42987 +#include "../../kassign.h"
42988 +#include "../../key.h"
42989 +#include "../../coord.h"
42990 +#include "item.h"
42991 +#include "../node/node.h"
42992 +#include "../plugin.h"
42993 +#include "../object.h"
42994 +#include "../../znode.h"
42995 +#include "../../carry.h"
42996 +#include "../../tree.h"
42997 +#include "../../inode.h"
42998 +#include "../../super.h"
42999 +#include "../../context.h"
43000 +#include "../../page_cache.h"
43001 +#include "../cluster.h"
43002 +#include "../../flush.h"
43003 +#include "../../tree_walk.h"
43004 +
43005 +#include <linux/pagevec.h>
43006 +#include <linux/swap.h>
43007 +#include <linux/fs.h>
43008 +
43009 +/* return body of ctail item at @coord */
43010 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
43011 +{
43012 + assert("edward-60", coord != NULL);
43013 + return item_body_by_coord(coord);
43014 +}
43015 +
43016 +static int cluster_shift_by_coord(const coord_t * coord)
43017 +{
43018 + return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
43019 +}
43020 +
43021 +static inline void dclust_set_extension_shift(hint_t * hint)
43022 +{
43023 + assert("edward-1270",
43024 + item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
43025 + hint->ext_coord.extension.ctail.shift =
43026 + cluster_shift_by_coord(&hint->ext_coord.coord);
43027 +}
43028 +
43029 +static loff_t off_by_coord(const coord_t * coord)
43030 +{
43031 + reiser4_key key;
43032 + return get_key_offset(item_key_by_coord(coord, &key));
43033 +}
43034 +
43035 +int coord_is_unprepped_ctail(const coord_t * coord)
43036 +{
43037 + assert("edward-1233", coord != NULL);
43038 + assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
43039 + assert("edward-1235",
43040 + ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
43041 + nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
43042 +
43043 + return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
43044 +}
43045 +
43046 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
43047 +{
43048 + int shift;
43049 +
43050 + if (inode != NULL) {
43051 + shift = inode_cluster_shift(inode);
43052 + assert("edward-1236",
43053 + ergo(!coord_is_unprepped_ctail(coord),
43054 + shift == cluster_shift_by_coord(coord)));
43055 + } else {
43056 + assert("edward-1237", !coord_is_unprepped_ctail(coord));
43057 + shift = cluster_shift_by_coord(coord);
43058 + }
43059 + return off_by_coord(coord) >> shift;
43060 +}
43061 +
43062 +static int disk_cluster_size(const coord_t * coord)
43063 +{
43064 + assert("edward-1156",
43065 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
43066 + /* calculation of disk cluster size
43067 + is meaninless if ctail is unprepped */
43068 + assert("edward-1238", !coord_is_unprepped_ctail(coord));
43069 +
43070 + return 1 << cluster_shift_by_coord(coord);
43071 +}
43072 +
43073 +/* true if the key is of first disk cluster item */
43074 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
43075 +{
43076 + assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
43077 +
43078 + return coord_is_unprepped_ctail(coord) ||
43079 + ((get_key_offset(key) &
43080 + ((loff_t) disk_cluster_size(coord) - 1)) == 0);
43081 +}
43082 +
43083 +static char *first_unit(coord_t * coord)
43084 +{
43085 + /* FIXME: warning: pointer of type `void *' used in arithmetic */
43086 + return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
43087 +}
43088 +
43089 +/* plugin->u.item.b.max_key_inside :
43090 + tail_max_key_inside */
43091 +
43092 +/* plugin->u.item.b.can_contain_key */
43093 +int
43094 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
43095 + const reiser4_item_data * data)
43096 +{
43097 + reiser4_key item_key;
43098 +
43099 + if (item_plugin_by_coord(coord) != data->iplug)
43100 + return 0;
43101 +
43102 + item_key_by_coord(coord, &item_key);
43103 + if (get_key_locality(key) != get_key_locality(&item_key) ||
43104 + get_key_objectid(key) != get_key_objectid(&item_key))
43105 + return 0;
43106 + if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
43107 + get_key_offset(key))
43108 + return 0;
43109 + if (is_disk_cluster_key(key, coord))
43110 + return 0;
43111 + return 1;
43112 +}
43113 +
43114 +/* plugin->u.item.b.mergeable */
43115 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
43116 +{
43117 + reiser4_key key1, key2;
43118 +
43119 + assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
43120 + assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
43121 + UNIX_FILE_METADATA_ITEM_TYPE));
43122 +
43123 + if (item_id_by_coord(p2) != CTAIL_ID) {
43124 + /* second item is of another type */
43125 + return 0;
43126 + }
43127 +
43128 + item_key_by_coord(p1, &key1);
43129 + item_key_by_coord(p2, &key2);
43130 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
43131 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
43132 + get_key_type(&key1) != get_key_type(&key2)) {
43133 + /* items of different objects */
43134 + return 0;
43135 + }
43136 + if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
43137 + /* not adjacent items */
43138 + return 0;
43139 + if (is_disk_cluster_key(&key2, p2))
43140 + return 0;
43141 + return 1;
43142 +}
43143 +
43144 +/* plugin->u.item.b.nr_units */
43145 +pos_in_node_t nr_units_ctail(const coord_t * coord)
43146 +{
43147 + return (item_length_by_coord(coord) -
43148 + sizeof(ctail_formatted_at(coord)->cluster_shift));
43149 +}
43150 +
43151 +/* plugin->u.item.b.estimate:
43152 + estimate how much space is needed to insert/paste @data->length bytes
43153 + into ctail at @coord */
43154 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
43155 + const reiser4_item_data *
43156 + data /* parameters for new item */ )
43157 +{
43158 + if (coord == NULL)
43159 + /* insert */
43160 + return (sizeof(ctail_item_format) + data->length);
43161 + else
43162 + /* paste */
43163 + return data->length;
43164 +}
43165 +
43166 +/* ->init() method for this item plugin. */
43167 +int init_ctail(coord_t * to /* coord of item */ ,
43168 + coord_t * from /* old_item */ ,
43169 + reiser4_item_data * data /* structure used for insertion */ )
43170 +{
43171 + int cluster_shift; /* cpu value to convert */
43172 +
43173 + if (data) {
43174 + assert("edward-463", data->length > sizeof(ctail_item_format));
43175 + cluster_shift = *((int *)(data->arg));
43176 + data->length -= sizeof(ctail_item_format);
43177 + } else {
43178 + assert("edward-464", from != NULL);
43179 + assert("edward-855", ctail_ok(from));
43180 + cluster_shift = (int)(cluster_shift_by_coord(from));
43181 + }
43182 + put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
43183 + assert("edward-856", ctail_ok(to));
43184 + return 0;
43185 +}
43186 +
43187 +/* plugin->u.item.b.lookup:
43188 + NULL: We are looking for item keys only */
43189 +
43190 +#if REISER4_DEBUG
43191 +int ctail_ok(const coord_t * coord)
43192 +{
43193 + return coord_is_unprepped_ctail(coord) ||
43194 + cluster_shift_ok(cluster_shift_by_coord(coord));
43195 +}
43196 +
43197 +/* plugin->u.item.b.check */
43198 +int check_ctail(const coord_t * coord, const char **error)
43199 +{
43200 + if (!ctail_ok(coord)) {
43201 + if (error)
43202 + *error = "bad cluster shift in ctail";
43203 + return 1;
43204 + }
43205 + return 0;
43206 +}
43207 +#endif
43208 +
43209 +/* plugin->u.item.b.paste */
43210 +int
43211 +paste_ctail(coord_t * coord, reiser4_item_data * data,
43212 + carry_plugin_info * info UNUSED_ARG)
43213 +{
43214 + unsigned old_nr_units;
43215 +
43216 + assert("edward-268", data->data != NULL);
43217 + /* copy only from kernel space */
43218 + assert("edward-66", data->user == 0);
43219 +
43220 + old_nr_units =
43221 + item_length_by_coord(coord) - sizeof(ctail_item_format) -
43222 + data->length;
43223 +
43224 + /* ctail items never get pasted in the middle */
43225 +
43226 + if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
43227 +
43228 + /* paste at the beginning when create new item */
43229 + assert("edward-450",
43230 + item_length_by_coord(coord) ==
43231 + data->length + sizeof(ctail_item_format));
43232 + assert("edward-451", old_nr_units == 0);
43233 + } else if (coord->unit_pos == old_nr_units - 1
43234 + && coord->between == AFTER_UNIT) {
43235 +
43236 + /* paste at the end */
43237 + coord->unit_pos++;
43238 + } else
43239 + impossible("edward-453", "bad paste position");
43240 +
43241 + memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
43242 +
43243 + assert("edward-857", ctail_ok(coord));
43244 +
43245 + return 0;
43246 +}
43247 +
43248 +/* plugin->u.item.b.fast_paste */
43249 +
43250 +/* plugin->u.item.b.can_shift
43251 + number of units is returned via return value, number of bytes via @size. For
43252 + ctail items they coincide */
43253 +int
43254 +can_shift_ctail(unsigned free_space, coord_t * source,
43255 + znode * target, shift_direction direction UNUSED_ARG,
43256 + unsigned *size /* number of bytes */ , unsigned want)
43257 +{
43258 + /* make sure that that we do not want to shift more than we have */
43259 + assert("edward-68", want > 0 && want <= nr_units_ctail(source));
43260 +
43261 + *size = min(want, free_space);
43262 +
43263 + if (!target) {
43264 + /* new item will be created */
43265 + if (*size <= sizeof(ctail_item_format)) {
43266 + *size = 0;
43267 + return 0;
43268 + }
43269 + return *size - sizeof(ctail_item_format);
43270 + }
43271 + return *size;
43272 +}
43273 +
43274 +/* plugin->u.item.b.copy_units
43275 + cooperates with ->can_shift() */
43276 +void
43277 +copy_units_ctail(coord_t * target, coord_t * source,
43278 + unsigned from, unsigned count /* units */ ,
43279 + shift_direction where_is_free_space,
43280 + unsigned free_space /* bytes */ )
43281 +{
43282 + /* make sure that item @target is expanded already */
43283 + assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
43284 + assert("edward-70", free_space == count || free_space == count + 1);
43285 +
43286 + assert("edward-858", ctail_ok(source));
43287 +
43288 + if (where_is_free_space == SHIFT_LEFT) {
43289 + /* append item @target with @count first bytes of @source:
43290 + this restriction came from ordinary tails */
43291 + assert("edward-71", from == 0);
43292 + assert("edward-860", ctail_ok(target));
43293 +
43294 + memcpy(first_unit(target) + nr_units_ctail(target) - count,
43295 + first_unit(source), count);
43296 + } else {
43297 + /* target item is moved to right already */
43298 + reiser4_key key;
43299 +
43300 + assert("edward-72", nr_units_ctail(source) == from + count);
43301 +
43302 + if (free_space == count) {
43303 + init_ctail(target, source, NULL);
43304 + } else {
43305 + /* new item has been created */
43306 + assert("edward-862", ctail_ok(target));
43307 + }
43308 + memcpy(first_unit(target), first_unit(source) + from, count);
43309 +
43310 + assert("edward-863", ctail_ok(target));
43311 +
43312 + /* new units are inserted before first unit in an item,
43313 + therefore, we have to update item key */
43314 + item_key_by_coord(source, &key);
43315 + set_key_offset(&key, get_key_offset(&key) + from);
43316 +
43317 + node_plugin_by_node(target->node)->update_item_key(target, &key,
43318 + NULL /*info */);
43319 + }
43320 +}
43321 +
43322 +/* plugin->u.item.b.create_hook */
43323 +int create_hook_ctail(const coord_t * coord, void *arg)
43324 +{
43325 + assert("edward-864", znode_is_loaded(coord->node));
43326 +
43327 + znode_set_convertible(coord->node);
43328 + return 0;
43329 +}
43330 +
43331 +/* plugin->u.item.b.kill_hook */
43332 +int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
43333 + pos_in_node_t count, carry_kill_data * kdata)
43334 +{
43335 + struct inode *inode;
43336 +
43337 + assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
43338 + assert("edward-291", znode_is_write_locked(coord->node));
43339 +
43340 + inode = kdata->inode;
43341 + if (inode) {
43342 + reiser4_key key;
43343 + struct cryptcompress_info * info;
43344 + cloff_t index;
43345 +
43346 + item_key_by_coord(coord, &key);
43347 + info = cryptcompress_inode_data(inode);
43348 + index = off_to_clust(get_key_offset(&key), inode);
43349 +
43350 + if (from == 0) {
43351 + info->trunc_index = index;
43352 + if (is_disk_cluster_key(&key, coord)) {
43353 + /*
43354 + * first item of disk cluster is to be killed
43355 + */
43356 + truncate_complete_page_cluster(
43357 + inode, index, kdata->params.truncate);
43358 + inode_sub_bytes(inode,
43359 + inode_cluster_size(inode));
43360 + }
43361 + }
43362 + }
43363 + return 0;
43364 +}
43365 +
43366 +/* for shift_hook_ctail(),
43367 + return true if the first disk cluster item has dirty child
43368 +*/
43369 +static int ctail_convertible(const coord_t * coord)
43370 +{
43371 + int result;
43372 + reiser4_key key;
43373 + jnode *child = NULL;
43374 +
43375 + assert("edward-477", coord != NULL);
43376 + assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
43377 +
43378 + if (coord_is_unprepped_ctail(coord))
43379 + /* unprepped ctail should be converted */
43380 + return 1;
43381 +
43382 + item_key_by_coord(coord, &key);
43383 + child = jlookup(current_tree,
43384 + get_key_objectid(&key),
43385 + off_to_pg(off_by_coord(coord)));
43386 + if (!child)
43387 + return 0;
43388 + result = JF_ISSET(child, JNODE_DIRTY);
43389 + jput(child);
43390 + return result;
43391 +}
43392 +
43393 +/* FIXME-EDWARD */
43394 +/* plugin->u.item.b.shift_hook */
43395 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
43396 + unsigned from UNUSED_ARG /* start unit */ ,
43397 + unsigned count UNUSED_ARG /* stop unit */ ,
43398 + znode * old_node /* old parent */ )
43399 +{
43400 + assert("edward-479", item != NULL);
43401 + assert("edward-480", item->node != old_node);
43402 +
43403 + if (!znode_convertible(old_node) || znode_convertible(item->node))
43404 + return 0;
43405 + if (ctail_convertible(item))
43406 + znode_set_convertible(item->node);
43407 + return 0;
43408 +}
43409 +
43410 +static int
43411 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43412 + int cut, void *p, reiser4_key * smallest_removed,
43413 + reiser4_key * new_first)
43414 +{
43415 + pos_in_node_t count; /* number of units to cut */
43416 + char *item;
43417 +
43418 + count = to - from + 1;
43419 + item = item_body_by_coord(coord);
43420 +
43421 + assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
43422 +
43423 + if (smallest_removed) {
43424 + /* store smallest key removed */
43425 + item_key_by_coord(coord, smallest_removed);
43426 + set_key_offset(smallest_removed,
43427 + get_key_offset(smallest_removed) + from);
43428 + }
43429 +
43430 + if (new_first) {
43431 + assert("vs-1531", from == 0);
43432 +
43433 + item_key_by_coord(coord, new_first);
43434 + set_key_offset(new_first,
43435 + get_key_offset(new_first) + from + count);
43436 + }
43437 +
43438 + if (!cut)
43439 + kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
43440 +
43441 + if (from == 0) {
43442 + if (count != nr_units_ctail(coord)) {
43443 + /* part of item is removed, so move free space at the beginning
43444 + of the item and update item key */
43445 + reiser4_key key;
43446 + memcpy(item + to + 1, item, sizeof(ctail_item_format));
43447 + item_key_by_coord(coord, &key);
43448 + set_key_offset(&key, get_key_offset(&key) + count);
43449 + node_plugin_by_node(coord->node)->update_item_key(coord,
43450 + &key,
43451 + NULL);
43452 + } else {
43453 + /* cut_units should not be called to cut evrything */
43454 + assert("vs-1532", ergo(cut, 0));
43455 + /* whole item is cut, so more then amount of space occupied
43456 + by units got freed */
43457 + count += sizeof(ctail_item_format);
43458 + }
43459 + if (REISER4_DEBUG)
43460 + memset(item, 0, count);
43461 + } else if (REISER4_DEBUG)
43462 + memset(item + sizeof(ctail_item_format) + from, 0, count);
43463 + return count;
43464 +}
43465 +
43466 +/* plugin->u.item.b.cut_units */
43467 +int
43468 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43469 + carry_cut_data * cdata, reiser4_key * smallest_removed,
43470 + reiser4_key * new_first)
43471 +{
43472 + return cut_or_kill_ctail_units(item, from, to, 1, NULL,
43473 + smallest_removed, new_first);
43474 +}
43475 +
43476 +/* plugin->u.item.b.kill_units */
43477 +int
43478 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43479 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
43480 + reiser4_key * new_first)
43481 +{
43482 + return cut_or_kill_ctail_units(item, from, to, 0, kdata,
43483 + smallest_removed, new_first);
43484 +}
43485 +
43486 +/* plugin->u.item.s.file.read */
43487 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
43488 +{
43489 + uf_coord_t *uf_coord;
43490 + coord_t *coord;
43491 +
43492 + uf_coord = &hint->ext_coord;
43493 + coord = &uf_coord->coord;
43494 + assert("edward-127", f->user == 0);
43495 + assert("edward-129", coord && coord->node);
43496 + assert("edward-130", coord_is_existing_unit(coord));
43497 + assert("edward-132", znode_is_loaded(coord->node));
43498 +
43499 + /* start read only from the beginning of ctail */
43500 + assert("edward-133", coord->unit_pos == 0);
43501 + /* read only whole ctails */
43502 + assert("edward-135", nr_units_ctail(coord) <= f->length);
43503 +
43504 + assert("edward-136", reiser4_schedulable());
43505 + assert("edward-886", ctail_ok(coord));
43506 +
43507 + if (f->data)
43508 + memcpy(f->data, (char *)first_unit(coord),
43509 + (size_t) nr_units_ctail(coord));
43510 +
43511 + dclust_set_extension_shift(hint);
43512 + mark_page_accessed(znode_page(coord->node));
43513 + move_flow_forward(f, nr_units_ctail(coord));
43514 +
43515 + return 0;
43516 +}
43517 +
43518 +/**
43519 + * Prepare transform stream with plain text for page
43520 + * @page taking into account synchronization issues.
43521 + */
43522 +static int ctail_read_disk_cluster(struct cluster_handle * clust,
43523 + struct inode * inode, struct page * page,
43524 + znode_lock_mode mode)
43525 +{
43526 + int result;
43527 +
43528 + assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK);
43529 + assert("edward-671", clust->hint != NULL);
43530 + assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
43531 + assert("edward-672", cryptcompress_inode_ok(inode));
43532 + assert("edward-1527", PageLocked(page));
43533 +
43534 + unlock_page(page);
43535 +
43536 + /* set input stream */
43537 + result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
43538 + if (result) {
43539 + lock_page(page);
43540 + return result;
43541 + }
43542 + result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
43543 + lock_page(page);
43544 + if (result)
43545 + return result;
43546 + /*
43547 + * at this point we have locked position in the tree
43548 + */
43549 + assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
43550 +
43551 + if (page->mapping != inode->i_mapping) {
43552 + /* page was truncated */
43553 + reiser4_unset_hint(clust->hint);
43554 + reset_cluster_params(clust);
43555 + return AOP_TRUNCATED_PAGE;
43556 + }
43557 + if (PageUptodate(page)) {
43558 + /* disk cluster can be obsolete, don't use it! */
43559 + reiser4_unset_hint(clust->hint);
43560 + reset_cluster_params(clust);
43561 + return 0;
43562 + }
43563 + if (clust->dstat == FAKE_DISK_CLUSTER ||
43564 + clust->dstat == UNPR_DISK_CLUSTER ||
43565 + clust->dstat == TRNC_DISK_CLUSTER) {
43566 + /*
43567 + * this information about disk cluster will be valid
43568 + * as long as we keep the position in the tree locked
43569 + */
43570 + tfm_cluster_set_uptodate(&clust->tc);
43571 + return 0;
43572 + }
43573 + /* now prepare output stream.. */
43574 + result = grab_coa(&clust->tc, inode_compression_plugin(inode));
43575 + if (result)
43576 + return result;
43577 + /* ..and fill this with plain text */
43578 + result = reiser4_inflate_cluster(clust, inode);
43579 + if (result)
43580 + return result;
43581 + /*
43582 + * The stream is ready! It won't be obsolete as
43583 + * long as we keep last disk cluster item locked.
43584 + */
43585 + tfm_cluster_set_uptodate(&clust->tc);
43586 + return 0;
43587 +}
43588 +
43589 +/*
43590 + * fill one page with plain text.
43591 + */
43592 +int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
43593 + struct page *page, znode_lock_mode mode)
43594 +{
43595 + int ret;
43596 + unsigned cloff;
43597 + char *data;
43598 + size_t to_page;
43599 + struct tfm_cluster * tc = &clust->tc;
43600 +
43601 + assert("edward-212", PageLocked(page));
43602 +
43603 + if (unlikely(page->mapping != inode->i_mapping))
43604 + return AOP_TRUNCATED_PAGE;
43605 + if (PageUptodate(page))
43606 + goto exit;
43607 + to_page = pbytes(page_index(page), inode);
43608 + if (to_page == 0) {
43609 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43610 + SetPageUptodate(page);
43611 + goto exit;
43612 + }
43613 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
43614 + clust->index = pg_to_clust(page->index, inode);
43615 +
43616 + /* this will unlock/lock the page */
43617 + ret = ctail_read_disk_cluster(clust, inode, page, mode);
43618 +
43619 + assert("edward-212", PageLocked(page));
43620 + if (ret)
43621 + return ret;
43622 +
43623 + /* refresh bytes */
43624 + to_page = pbytes(page_index(page), inode);
43625 + if (to_page == 0) {
43626 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43627 + SetPageUptodate(page);
43628 + goto exit;
43629 + }
43630 + }
43631 + if (PageUptodate(page))
43632 + /* somebody else fill it already */
43633 + goto exit;
43634 +
43635 + assert("edward-119", tfm_cluster_is_uptodate(tc));
43636 + assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
43637 +
43638 + switch (clust->dstat) {
43639 + case UNPR_DISK_CLUSTER:
43640 + BUG_ON(1);
43641 + case TRNC_DISK_CLUSTER:
43642 + /*
43643 + * Race with truncate!
43644 + * We resolve it in favour of the last one (the only way,
43645 + * as in this case plain text is unrecoverable)
43646 + */
43647 + case FAKE_DISK_CLUSTER:
43648 + /* fill the page by zeroes */
43649 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43650 + SetPageUptodate(page);
43651 + break;
43652 + case PREP_DISK_CLUSTER:
43653 + /* fill page by transformed stream with plain text */
43654 + assert("edward-1058", !PageUptodate(page));
43655 + assert("edward-120", tc->len <= inode_cluster_size(inode));
43656 +
43657 + /* page index in this logical cluster */
43658 + cloff = pg_to_off_to_cloff(page->index, inode);
43659 +
43660 + data = kmap(page);
43661 + memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
43662 + memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
43663 + flush_dcache_page(page);
43664 + kunmap(page);
43665 + SetPageUptodate(page);
43666 + break;
43667 + default:
43668 + impossible("edward-1169", "bad disk cluster state");
43669 + }
43670 + exit:
43671 + return 0;
43672 +}
43673 +
43674 +/* plugin->u.item.s.file.readpage */
43675 +int readpage_ctail(void *vp, struct page *page)
43676 +{
43677 + int result;
43678 + hint_t * hint;
43679 + struct cluster_handle * clust = vp;
43680 +
43681 + assert("edward-114", clust != NULL);
43682 + assert("edward-115", PageLocked(page));
43683 + assert("edward-116", !PageUptodate(page));
43684 + assert("edward-118", page->mapping && page->mapping->host);
43685 + assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
43686 +
43687 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43688 + if (hint == NULL) {
43689 + unlock_page(page);
43690 + return RETERR(-ENOMEM);
43691 + }
43692 + clust->hint = hint;
43693 + result = load_file_hint(clust->file, hint);
43694 + if (result) {
43695 + kfree(hint);
43696 + unlock_page(page);
43697 + return result;
43698 + }
43699 + assert("vs-25", hint->ext_coord.lh == &hint->lh);
43700 +
43701 + result = do_readpage_ctail(page->mapping->host, clust, page,
43702 + ZNODE_READ_LOCK);
43703 + assert("edward-213", PageLocked(page));
43704 + assert("edward-1163", ergo(!result, PageUptodate(page)));
43705 +
43706 + unlock_page(page);
43707 + done_lh(&hint->lh);
43708 + hint->ext_coord.valid = 0;
43709 + save_file_hint(clust->file, hint);
43710 + kfree(hint);
43711 + tfm_cluster_clr_uptodate(&clust->tc);
43712 +
43713 + return result;
43714 +}
43715 +
43716 +/* Helper function for ->readpages() */
43717 +static int ctail_read_page_cluster(struct cluster_handle * clust,
43718 + struct inode *inode)
43719 +{
43720 + int i;
43721 + int result;
43722 + assert("edward-779", clust != NULL);
43723 + assert("edward-1059", clust->win == NULL);
43724 + assert("edward-780", inode != NULL);
43725 +
43726 + result = prepare_page_cluster(inode, clust, READ_OP);
43727 + if (result)
43728 + return result;
43729 +
43730 + assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
43731 +
43732 + for (i = 0; i < clust->nr_pages; i++) {
43733 + struct page *page = clust->pages[i];
43734 + lock_page(page);
43735 + result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
43736 + unlock_page(page);
43737 + if (result)
43738 + break;
43739 + }
43740 + tfm_cluster_clr_uptodate(&clust->tc);
43741 + put_page_cluster(clust, inode, READ_OP);
43742 + return result;
43743 +}
43744 +
43745 +/* filler for read_cache_pages() */
43746 +static int ctail_readpages_filler(void * data, struct page * page)
43747 +{
43748 + int ret = 0;
43749 + struct cluster_handle * clust = data;
43750 + struct inode * inode = clust->file->f_dentry->d_inode;
43751 +
43752 + assert("edward-1525", page->mapping == inode->i_mapping);
43753 +
43754 + if (PageUptodate(page)) {
43755 + unlock_page(page);
43756 + return 0;
43757 + }
43758 + if (pbytes(page_index(page), inode) == 0) {
43759 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
43760 + SetPageUptodate(page);
43761 + unlock_page(page);
43762 + return 0;
43763 + }
43764 + move_cluster_forward(clust, inode, page->index);
43765 + unlock_page(page);
43766 + /*
43767 + * read the whole page cluster
43768 + */
43769 + ret = ctail_read_page_cluster(clust, inode);
43770 +
43771 + assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
43772 + return ret;
43773 +}
43774 +
43775 +/*
43776 + * We populate a bit more then upper readahead suggests:
43777 + * with each nominated page we read the whole page cluster
43778 + * this page belongs to.
43779 + */
43780 +int readpages_ctail(struct file *file, struct address_space *mapping,
43781 + struct list_head *pages)
43782 +{
43783 + int ret = 0;
43784 + hint_t *hint;
43785 + struct cluster_handle clust;
43786 + struct inode *inode = mapping->host;
43787 +
43788 + assert("edward-1521", inode == file->f_dentry->d_inode);
43789 +
43790 + cluster_init_read(&clust, NULL);
43791 + clust.file = file;
43792 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43793 + if (hint == NULL) {
43794 + warning("vs-28", "failed to allocate hint");
43795 + ret = RETERR(-ENOMEM);
43796 + goto exit1;
43797 + }
43798 + clust.hint = hint;
43799 + ret = load_file_hint(clust.file, hint);
43800 + if (ret) {
43801 + warning("edward-1522", "failed to load hint");
43802 + goto exit2;
43803 + }
43804 + assert("vs-26", hint->ext_coord.lh == &hint->lh);
43805 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
43806 + if (ret) {
43807 + warning("edward-1523", "failed to alloc pgset");
43808 + goto exit3;
43809 + }
43810 + ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
43811 +
43812 + assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
43813 + exit3:
43814 + done_lh(&hint->lh);
43815 + save_file_hint(file, hint);
43816 + hint->ext_coord.valid = 0;
43817 + exit2:
43818 + kfree(hint);
43819 + exit1:
43820 + put_cluster_handle(&clust);
43821 + return ret;
43822 +}
43823 +
43824 +/*
43825 + plugin->u.item.s.file.append_key
43826 + key of the first item of the next disk cluster
43827 +*/
43828 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
43829 +{
43830 + assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
43831 + assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
43832 +
43833 + item_key_by_coord(coord, key);
43834 + set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
43835 + << cluster_shift_by_coord(coord));
43836 + return key;
43837 +}
43838 +
43839 +static int insert_unprepped_ctail(struct cluster_handle * clust,
43840 + struct inode *inode)
43841 +{
43842 + int result;
43843 + char buf[UCTAIL_NR_UNITS];
43844 + reiser4_item_data data;
43845 + reiser4_key key;
43846 + int shift = (int)UCTAIL_SHIFT;
43847 +
43848 + memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
43849 + result = key_by_inode_cryptcompress(inode,
43850 + clust_to_off(clust->index, inode),
43851 + &key);
43852 + if (result)
43853 + return result;
43854 + data.user = 0;
43855 + data.iplug = item_plugin_by_id(CTAIL_ID);
43856 + data.arg = &shift;
43857 + data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
43858 + data.data = buf;
43859 +
43860 + result = insert_by_coord(&clust->hint->ext_coord.coord,
43861 + &data, &key, clust->hint->ext_coord.lh, 0);
43862 + return result;
43863 +}
43864 +
43865 +static int
43866 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
43867 + struct inode *inode)
43868 +{
43869 + int result;
43870 + carry_pool *pool;
43871 + carry_level *lowest_level;
43872 + reiser4_item_data *data;
43873 + carry_op *op;
43874 + int cluster_shift = inode_cluster_shift(inode);
43875 +
43876 + pool =
43877 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
43878 + sizeof(*data));
43879 + if (IS_ERR(pool))
43880 + return PTR_ERR(pool);
43881 + lowest_level = (carry_level *) (pool + 1);
43882 + init_carry_level(lowest_level, pool);
43883 + data = (reiser4_item_data *) (lowest_level + 3);
43884 +
43885 + assert("edward-466", coord->between == AFTER_ITEM
43886 + || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
43887 + || coord->between == EMPTY_NODE
43888 + || coord->between == BEFORE_UNIT);
43889 +
43890 + if (coord->between == AFTER_UNIT) {
43891 + coord->unit_pos = 0;
43892 + coord->between = AFTER_ITEM;
43893 + }
43894 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
43895 + 0 /* operate directly on coord -> node */);
43896 + if (IS_ERR(op) || (op == NULL)) {
43897 + done_carry_pool(pool);
43898 + return RETERR(op ? PTR_ERR(op) : -EIO);
43899 + }
43900 + data->user = 0;
43901 + data->iplug = item_plugin_by_id(CTAIL_ID);
43902 + data->arg = &cluster_shift;
43903 +
43904 + data->length = 0;
43905 + data->data = NULL;
43906 +
43907 + op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
43908 + op->u.insert_flow.insert_point = coord;
43909 + op->u.insert_flow.flow = f;
43910 + op->u.insert_flow.data = data;
43911 + op->u.insert_flow.new_nodes = 0;
43912 +
43913 + lowest_level->track_type = CARRY_TRACK_CHANGE;
43914 + lowest_level->tracked = lh;
43915 +
43916 + result = reiser4_carry(lowest_level, NULL);
43917 + done_carry_pool(pool);
43918 +
43919 + return result;
43920 +}
43921 +
43922 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
43923 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
43924 + lock_handle * lh, flow_t * f,
43925 + struct inode *inode)
43926 +{
43927 + int ret;
43928 + coord_t pos;
43929 + lock_handle lock;
43930 +
43931 + assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
43932 + assert("edward-484", coord->between == AT_UNIT
43933 + || coord->between == AFTER_ITEM);
43934 + assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
43935 +
43936 + coord_dup(&pos, coord);
43937 + pos.unit_pos = 0;
43938 + pos.between = AFTER_ITEM;
43939 +
43940 + init_lh(&lock);
43941 + copy_lh(&lock, lh);
43942 +
43943 + ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
43944 + done_lh(&lock);
43945 + assert("edward-1347", znode_is_write_locked(lh->node));
43946 + assert("edward-1228", !ret);
43947 + return ret;
43948 +}
43949 +
43950 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
43951 +static int overwrite_ctail(coord_t * coord, flow_t * f)
43952 +{
43953 + unsigned count;
43954 +
43955 + assert("edward-269", f->user == 0);
43956 + assert("edward-270", f->data != NULL);
43957 + assert("edward-271", f->length > 0);
43958 + assert("edward-272", coord_is_existing_unit(coord));
43959 + assert("edward-273", coord->unit_pos == 0);
43960 + assert("edward-274", znode_is_write_locked(coord->node));
43961 + assert("edward-275", reiser4_schedulable());
43962 + assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
43963 + assert("edward-1243", ctail_ok(coord));
43964 +
43965 + count = nr_units_ctail(coord);
43966 +
43967 + if (count > f->length)
43968 + count = f->length;
43969 + memcpy(first_unit(coord), f->data, count);
43970 + move_flow_forward(f, count);
43971 + coord->unit_pos += count;
43972 + return 0;
43973 +}
43974 +
43975 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
43976 + cut ctail (part or whole) starting from next unit position */
43977 +static int cut_ctail(coord_t * coord)
43978 +{
43979 + coord_t stop;
43980 +
43981 + assert("edward-435", coord->between == AT_UNIT &&
43982 + coord->item_pos < coord_num_items(coord) &&
43983 + coord->unit_pos <= coord_num_units(coord));
43984 +
43985 + if (coord->unit_pos == coord_num_units(coord))
43986 + /* nothing to cut */
43987 + return 0;
43988 + coord_dup(&stop, coord);
43989 + stop.unit_pos = coord_last_unit_pos(coord);
43990 +
43991 + return cut_node_content(coord, &stop, NULL, NULL, NULL);
43992 +}
43993 +
43994 +int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
43995 + struct inode * inode)
43996 +{
43997 + int result;
43998 + assert("edward-1244", inode != NULL);
43999 + assert("edward-1245", clust->hint != NULL);
44000 + assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
44001 + assert("edward-1247", clust->reserved == 1);
44002 +
44003 + result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
44004 + if (cbk_errored(result))
44005 + return result;
44006 + assert("edward-1249", result == CBK_COORD_NOTFOUND);
44007 + assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
44008 +
44009 + assert("edward-1295",
44010 + clust->hint->ext_coord.lh->node ==
44011 + clust->hint->ext_coord.coord.node);
44012 +
44013 + coord_set_between_clusters(&clust->hint->ext_coord.coord);
44014 +
44015 + result = insert_unprepped_ctail(clust, inode);
44016 + all_grabbed2free();
44017 +
44018 + assert("edward-1251", !result);
44019 + assert("edward-1252", cryptcompress_inode_ok(inode));
44020 + assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
44021 + assert("edward-1254",
44022 + reiser4_clustered_blocks(reiser4_get_current_sb()));
44023 + assert("edward-1255",
44024 + znode_convertible(clust->hint->ext_coord.coord.node));
44025 +
44026 + return result;
44027 +}
44028 +
44029 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
44030 +{
44031 + int result = 0;
44032 + struct convert_item_info * info;
44033 +
44034 + assert("edward-468", pos != NULL);
44035 + assert("edward-469", pos->sq != NULL);
44036 + assert("edward-845", item_convert_data(pos) != NULL);
44037 +
44038 + info = item_convert_data(pos);
44039 + assert("edward-679", info->flow.data != NULL);
44040 +
44041 + switch (mode) {
44042 + case CRC_APPEND_ITEM:
44043 + assert("edward-1229", info->flow.length != 0);
44044 + assert("edward-1256",
44045 + cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
44046 + result =
44047 + insert_cryptcompress_flow_in_place(&pos->coord,
44048 + &pos->lock,
44049 + &info->flow,
44050 + info->inode);
44051 + break;
44052 + case CRC_OVERWRITE_ITEM:
44053 + assert("edward-1230", info->flow.length != 0);
44054 + overwrite_ctail(&pos->coord, &info->flow);
44055 + if (info->flow.length != 0)
44056 + break;
44057 + case CRC_CUT_ITEM:
44058 + assert("edward-1231", info->flow.length == 0);
44059 + result = cut_ctail(&pos->coord);
44060 + break;
44061 + default:
44062 + result = RETERR(-EIO);
44063 + impossible("edward-244", "bad convert mode");
44064 + }
44065 + return result;
44066 +}
44067 +
44068 +/* plugin->u.item.f.scan */
44069 +int scan_ctail(flush_scan * scan)
44070 +{
44071 + int result = 0;
44072 + struct page *page;
44073 + struct inode *inode;
44074 + jnode *node = scan->node;
44075 +
44076 + assert("edward-227", scan->node != NULL);
44077 + assert("edward-228", jnode_is_cluster_page(scan->node));
44078 + assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
44079 +
44080 + page = jnode_page(node);
44081 + inode = page->mapping->host;
44082 +
44083 + if (!reiser4_scanning_left(scan))
44084 + return result;
44085 + if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
44086 + znode_make_dirty(scan->parent_lock.node);
44087 +
44088 + if (!znode_convertible(scan->parent_lock.node)) {
44089 + if (JF_ISSET(scan->node, JNODE_DIRTY))
44090 + znode_set_convertible(scan->parent_lock.node);
44091 + else {
44092 + warning("edward-681",
44093 + "cluster page is already processed");
44094 + return -EAGAIN;
44095 + }
44096 + }
44097 + return result;
44098 +}
44099 +
44100 +/* If true, this function attaches children */
44101 +static int should_attach_convert_idata(flush_pos_t * pos)
44102 +{
44103 + int result;
44104 + assert("edward-431", pos != NULL);
44105 + assert("edward-432", pos->child == NULL);
44106 + assert("edward-619", znode_is_write_locked(pos->coord.node));
44107 + assert("edward-470",
44108 + item_plugin_by_coord(&pos->coord) ==
44109 + item_plugin_by_id(CTAIL_ID));
44110 +
44111 + /* check for leftmost child */
44112 + utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
44113 +
44114 + if (!pos->child)
44115 + return 0;
44116 + spin_lock_jnode(pos->child);
44117 + result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
44118 + pos->child->atom == ZJNODE(pos->coord.node)->atom);
44119 + spin_unlock_jnode(pos->child);
44120 + if (!result && pos->child) {
44121 + /* existing child isn't to attach, clear up this one */
44122 + jput(pos->child);
44123 + pos->child = NULL;
44124 + }
44125 + return result;
44126 +}
44127 +
44128 +/* plugin->init_convert_data() */
44129 +static int
44130 +init_convert_data_ctail(struct convert_item_info * idata, struct inode *inode)
44131 +{
44132 + assert("edward-813", idata != NULL);
44133 + assert("edward-814", inode != NULL);
44134 +
44135 + idata->inode = inode;
44136 + idata->d_cur = DC_FIRST_ITEM;
44137 + idata->d_next = DC_INVALID_STATE;
44138 +
44139 + return 0;
44140 +}
44141 +
44142 +static int alloc_item_convert_data(struct convert_info * sq)
44143 +{
44144 + assert("edward-816", sq != NULL);
44145 + assert("edward-817", sq->itm == NULL);
44146 +
44147 + sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
44148 + if (sq->itm == NULL)
44149 + return RETERR(-ENOMEM);
44150 + return 0;
44151 +}
44152 +
44153 +static void free_item_convert_data(struct convert_info * sq)
44154 +{
44155 + assert("edward-818", sq != NULL);
44156 + assert("edward-819", sq->itm != NULL);
44157 + assert("edward-820", sq->iplug != NULL);
44158 +
44159 + kfree(sq->itm);
44160 + sq->itm = NULL;
44161 + return;
44162 +}
44163 +
44164 +static int alloc_convert_data(flush_pos_t * pos)
44165 +{
44166 + assert("edward-821", pos != NULL);
44167 + assert("edward-822", pos->sq == NULL);
44168 +
44169 + pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
44170 + if (!pos->sq)
44171 + return RETERR(-ENOMEM);
44172 + memset(pos->sq, 0, sizeof(*pos->sq));
44173 + cluster_init_write(&pos->sq->clust, NULL);
44174 + return 0;
44175 +}
44176 +
44177 +void free_convert_data(flush_pos_t * pos)
44178 +{
44179 + struct convert_info *sq;
44180 +
44181 + assert("edward-823", pos != NULL);
44182 + assert("edward-824", pos->sq != NULL);
44183 +
44184 + sq = pos->sq;
44185 + if (sq->itm)
44186 + free_item_convert_data(sq);
44187 + put_cluster_handle(&sq->clust);
44188 + kfree(pos->sq);
44189 + pos->sq = NULL;
44190 + return;
44191 +}
44192 +
44193 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
44194 +{
44195 + struct convert_info *sq;
44196 +
44197 + assert("edward-825", pos != NULL);
44198 + assert("edward-826", pos->sq != NULL);
44199 + assert("edward-827", item_convert_data(pos) != NULL);
44200 + assert("edward-828", inode != NULL);
44201 +
44202 + sq = pos->sq;
44203 +
44204 + memset(sq->itm, 0, sizeof(*sq->itm));
44205 +
44206 + /* iplug->init_convert_data() */
44207 + return init_convert_data_ctail(sq->itm, inode);
44208 +}
44209 +
44210 +/* create and attach disk cluster info used by 'convert' phase of the flush
44211 + squalloc() */
44212 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
44213 +{
44214 + int ret = 0;
44215 + struct convert_item_info *info;
44216 + struct cluster_handle *clust;
44217 + file_plugin *fplug = inode_file_plugin(inode);
44218 + compression_plugin *cplug = inode_compression_plugin(inode);
44219 +
44220 + assert("edward-248", pos != NULL);
44221 + assert("edward-249", pos->child != NULL);
44222 + assert("edward-251", inode != NULL);
44223 + assert("edward-682", cryptcompress_inode_ok(inode));
44224 + assert("edward-252",
44225 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
44226 + assert("edward-473",
44227 + item_plugin_by_coord(&pos->coord) ==
44228 + item_plugin_by_id(CTAIL_ID));
44229 +
44230 + if (!pos->sq) {
44231 + ret = alloc_convert_data(pos);
44232 + if (ret)
44233 + return ret;
44234 + }
44235 + clust = &pos->sq->clust;
44236 + ret = grab_coa(&clust->tc, cplug);
44237 + if (ret)
44238 + goto err;
44239 + ret = set_cluster_by_page(clust,
44240 + jnode_page(pos->child),
44241 + MAX_CLUSTER_NRPAGES);
44242 + if (ret)
44243 + goto err;
44244 +
44245 + assert("edward-829", pos->sq != NULL);
44246 + assert("edward-250", item_convert_data(pos) == NULL);
44247 +
44248 + pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
44249 +
44250 + ret = alloc_item_convert_data(pos->sq);
44251 + if (ret)
44252 + goto err;
44253 + ret = init_item_convert_data(pos, inode);
44254 + if (ret)
44255 + goto err;
44256 + info = item_convert_data(pos);
44257 +
44258 + ret = checkout_logical_cluster(clust, pos->child, inode);
44259 + if (ret)
44260 + goto err;
44261 +
44262 + reiser4_deflate_cluster(clust, inode);
44263 + inc_item_convert_count(pos);
44264 +
44265 + /* prepare flow for insertion */
44266 + fplug->flow_by_inode(info->inode,
44267 + (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
44268 + 0 /* kernel space */ ,
44269 + clust->tc.len,
44270 + clust_to_off(clust->index, inode),
44271 + WRITE_OP, &info->flow);
44272 + jput(pos->child);
44273 +
44274 + assert("edward-683", cryptcompress_inode_ok(inode));
44275 + return 0;
44276 + err:
44277 + jput(pos->child);
44278 + free_convert_data(pos);
44279 + return ret;
44280 +}
44281 +
44282 +/* clear up disk cluster info */
44283 +static void detach_convert_idata(struct convert_info * sq)
44284 +{
44285 + struct convert_item_info *info;
44286 +
44287 + assert("edward-253", sq != NULL);
44288 + assert("edward-840", sq->itm != NULL);
44289 +
44290 + info = sq->itm;
44291 + assert("edward-255", info->inode != NULL);
44292 + assert("edward-1212", info->flow.length == 0);
44293 +
44294 + free_item_convert_data(sq);
44295 + return;
44296 +}
44297 +
44298 +/* plugin->u.item.f.utmost_child */
44299 +
44300 +/* This function sets leftmost child for a first cluster item,
44301 + if the child exists, and NULL in other cases.
44302 + NOTE-EDWARD: Do not call this for RIGHT_SIDE */
44303 +
44304 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
44305 +{
44306 + reiser4_key key;
44307 +
44308 + item_key_by_coord(coord, &key);
44309 +
44310 + assert("edward-257", coord != NULL);
44311 + assert("edward-258", child != NULL);
44312 + assert("edward-259", side == LEFT_SIDE);
44313 + assert("edward-260",
44314 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44315 +
44316 + if (!is_disk_cluster_key(&key, coord))
44317 + *child = NULL;
44318 + else
44319 + *child = jlookup(current_tree,
44320 + get_key_objectid(item_key_by_coord
44321 + (coord, &key)),
44322 + off_to_pg(get_key_offset(&key)));
44323 + return 0;
44324 +}
44325 +
44326 +/* Returns true if @p2 is the next item to @p1
44327 + in the _same_ disk cluster.
44328 + Disk cluster is a set of items. If ->clustered() != NULL,
44329 + with each item the whole disk cluster should be read/modified
44330 +*/
44331 +
44332 +/* Go rightward and check for next disk cluster item, set
44333 + * d_next to DC_CHAINED_ITEM, if the last one exists.
44334 + * If the current position is last item, go to right neighbor.
44335 + * Skip empty nodes. Note, that right neighbors may be not in
44336 + * the slum because of races. If so, make it dirty and
44337 + * convertible.
44338 + */
44339 +static int next_item_dc_stat(flush_pos_t * pos)
44340 +{
44341 + int ret = 0;
44342 + int stop = 0;
44343 + znode *cur;
44344 + coord_t coord;
44345 + lock_handle lh;
44346 + lock_handle right_lock;
44347 +
44348 + assert("edward-1232", !node_is_empty(pos->coord.node));
44349 + assert("edward-1014",
44350 + pos->coord.item_pos < coord_num_items(&pos->coord));
44351 + assert("edward-1015", chaining_data_present(pos));
44352 + assert("edward-1017",
44353 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
44354 +
44355 + item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
44356 +
44357 + if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
44358 + return ret;
44359 + if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
44360 + return ret;
44361 +
44362 + /* Check next slum item.
44363 + * Note, that it can not be killed by concurrent truncate,
44364 + * as the last one will want the lock held by us.
44365 + */
44366 + init_lh(&right_lock);
44367 + cur = pos->coord.node;
44368 +
44369 + while (!stop) {
44370 + init_lh(&lh);
44371 + ret = reiser4_get_right_neighbor(&lh,
44372 + cur,
44373 + ZNODE_WRITE_LOCK,
44374 + GN_CAN_USE_UPPER_LEVELS);
44375 + if (ret)
44376 + break;
44377 + ret = zload(lh.node);
44378 + if (ret) {
44379 + done_lh(&lh);
44380 + break;
44381 + }
44382 + coord_init_before_first_item(&coord, lh.node);
44383 +
44384 + if (node_is_empty(lh.node)) {
44385 + znode_make_dirty(lh.node);
44386 + znode_set_convertible(lh.node);
44387 + stop = 0;
44388 + } else if (same_disk_cluster(&pos->coord, &coord)) {
44389 +
44390 + item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
44391 +
44392 + if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
44393 + /*
44394 + warning("edward-1024",
44395 + "next slum item mergeable, "
44396 + "but znode %p isn't dirty\n",
44397 + lh.node);
44398 + */
44399 + znode_make_dirty(lh.node);
44400 + }
44401 + if (!znode_convertible(lh.node)) {
44402 + /*
44403 + warning("edward-1272",
44404 + "next slum item mergeable, "
44405 + "but znode %p isn't convertible\n",
44406 + lh.node);
44407 + */
44408 + znode_set_convertible(lh.node);
44409 + }
44410 + stop = 1;
44411 + } else
44412 + stop = 1;
44413 + zrelse(lh.node);
44414 + done_lh(&right_lock);
44415 + copy_lh(&right_lock, &lh);
44416 + done_lh(&lh);
44417 + cur = right_lock.node;
44418 + }
44419 + done_lh(&right_lock);
44420 +
44421 + if (ret == -E_NO_NEIGHBOR)
44422 + ret = 0;
44423 + return ret;
44424 +}
44425 +
44426 +static int
44427 +assign_convert_mode(struct convert_item_info * idata,
44428 + cryptcompress_write_mode_t * mode)
44429 +{
44430 + int result = 0;
44431 +
44432 + assert("edward-1025", idata != NULL);
44433 +
44434 + if (idata->flow.length) {
44435 + /* append or overwrite */
44436 + switch (idata->d_cur) {
44437 + case DC_FIRST_ITEM:
44438 + case DC_CHAINED_ITEM:
44439 + *mode = CRC_OVERWRITE_ITEM;
44440 + break;
44441 + case DC_AFTER_CLUSTER:
44442 + *mode = CRC_APPEND_ITEM;
44443 + break;
44444 + default:
44445 + impossible("edward-1018", "wrong current item state");
44446 + }
44447 + } else {
44448 + /* cut or invalidate */
44449 + switch (idata->d_cur) {
44450 + case DC_FIRST_ITEM:
44451 + case DC_CHAINED_ITEM:
44452 + *mode = CRC_CUT_ITEM;
44453 + break;
44454 + case DC_AFTER_CLUSTER:
44455 + result = 1;
44456 + break;
44457 + default:
44458 + impossible("edward-1019", "wrong current item state");
44459 + }
44460 + }
44461 + return result;
44462 +}
44463 +
44464 +/* plugin->u.item.f.convert */
44465 +/* write ctail in guessed mode */
44466 +int convert_ctail(flush_pos_t * pos)
44467 +{
44468 + int result;
44469 + int nr_items;
44470 + cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
44471 +
44472 + assert("edward-1020", pos != NULL);
44473 + assert("edward-1213", coord_num_items(&pos->coord) != 0);
44474 + assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
44475 + assert("edward-1258", ctail_ok(&pos->coord));
44476 + assert("edward-261", pos->coord.node != NULL);
44477 +
44478 + nr_items = coord_num_items(&pos->coord);
44479 + if (!chaining_data_present(pos)) {
44480 + if (should_attach_convert_idata(pos)) {
44481 + /* attach convert item info */
44482 + struct inode *inode;
44483 +
44484 + assert("edward-264", pos->child != NULL);
44485 + assert("edward-265", jnode_page(pos->child) != NULL);
44486 + assert("edward-266",
44487 + jnode_page(pos->child)->mapping != NULL);
44488 +
44489 + inode = jnode_page(pos->child)->mapping->host;
44490 +
44491 + assert("edward-267", inode != NULL);
44492 +
44493 + /* attach item convert info by child and put the last one */
44494 + result = attach_convert_idata(pos, inode);
44495 + pos->child = NULL;
44496 + if (result == -E_REPEAT) {
44497 + /* jnode became clean, or there is no dirty
44498 + pages (nothing to update in disk cluster) */
44499 + warning("edward-1021",
44500 + "convert_ctail: nothing to attach");
44501 + return 0;
44502 + }
44503 + if (result != 0)
44504 + return result;
44505 + } else
44506 + /* unconvertible */
44507 + return 0;
44508 + } else {
44509 + /* use old convert info */
44510 +
44511 + struct convert_item_info *idata;
44512 +
44513 + idata = item_convert_data(pos);
44514 +
44515 + result = assign_convert_mode(idata, &mode);
44516 + if (result) {
44517 + /* disk cluster is over,
44518 + nothing to update anymore */
44519 + detach_convert_idata(pos->sq);
44520 + return 0;
44521 + }
44522 + }
44523 +
44524 + assert("edward-433", chaining_data_present(pos));
44525 + assert("edward-1022",
44526 + pos->coord.item_pos < coord_num_items(&pos->coord));
44527 +
44528 + /* check if next item is of current disk cluster */
44529 + result = next_item_dc_stat(pos);
44530 + if (result) {
44531 + detach_convert_idata(pos->sq);
44532 + return result;
44533 + }
44534 + result = do_convert_ctail(pos, mode);
44535 + if (result) {
44536 + detach_convert_idata(pos->sq);
44537 + return result;
44538 + }
44539 + switch (mode) {
44540 + case CRC_CUT_ITEM:
44541 + assert("edward-1214", item_convert_data(pos)->flow.length == 0);
44542 + assert("edward-1215",
44543 + coord_num_items(&pos->coord) == nr_items ||
44544 + coord_num_items(&pos->coord) == nr_items - 1);
44545 + if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
44546 + break;
44547 + if (coord_num_items(&pos->coord) != nr_items) {
44548 + /* the item was killed, no more chained items */
44549 + detach_convert_idata(pos->sq);
44550 + if (!node_is_empty(pos->coord.node))
44551 + /* make sure the next item will be scanned */
44552 + coord_init_before_item(&pos->coord);
44553 + break;
44554 + }
44555 + case CRC_APPEND_ITEM:
44556 + assert("edward-434", item_convert_data(pos)->flow.length == 0);
44557 + detach_convert_idata(pos->sq);
44558 + break;
44559 + case CRC_OVERWRITE_ITEM:
44560 + if (coord_is_unprepped_ctail(&pos->coord)) {
44561 + /* convert unpprepped ctail to prepped one */
44562 + int shift;
44563 + shift =
44564 + inode_cluster_shift(item_convert_data(pos)->inode);
44565 + assert("edward-1259", cluster_shift_ok(shift));
44566 + put_unaligned((d8)shift,
44567 + &ctail_formatted_at(&pos->coord)->
44568 + cluster_shift);
44569 + }
44570 + break;
44571 + }
44572 + return result;
44573 +}
44574 +
44575 +/* Make Linus happy.
44576 + Local variables:
44577 + c-indentation-style: "K&R"
44578 + mode-name: "LC"
44579 + c-basic-offset: 8
44580 + tab-width: 8
44581 + fill-column: 120
44582 + End:
44583 +*/
44584 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.23/fs/reiser4/plugin/item/ctail.h
44585 --- linux-2.6.23.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
44586 +++ linux-2.6.23/fs/reiser4/plugin/item/ctail.h 2007-12-04 16:49:30.000000000 +0300
44587 @@ -0,0 +1,102 @@
44588 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44589 +
44590 +/* Ctail items are fragments (or bodies) of special tipe to provide
44591 + optimal storage of encrypted and(or) compressed files. */
44592 +
44593 +
44594 +#if !defined( __FS_REISER4_CTAIL_H__ )
44595 +#define __FS_REISER4_CTAIL_H__
44596 +
44597 +/* Disk format of ctail item */
44598 +typedef struct ctail_item_format {
44599 + /* packed shift;
44600 + if its value is different from UCTAIL_SHIFT (see below), then
44601 + size of disk cluster is calculated as (1 << cluster_shift) */
44602 + d8 cluster_shift;
44603 + /* ctail body */
44604 + d8 body[0];
44605 +} __attribute__ ((packed)) ctail_item_format;
44606 +
44607 +/* "Unprepped" disk cluster is represented by a single ctail item
44608 + with the following "magic" attributes: */
44609 +/* "magic" cluster_shift */
44610 +#define UCTAIL_SHIFT 0xff
44611 +/* How many units unprepped ctail item has */
44612 +#define UCTAIL_NR_UNITS 1
44613 +
44614 +/* The following is a set of various item states in a disk cluster.
44615 + Disk cluster is a set of items whose keys belong to the interval
44616 + [dc_key , dc_key + disk_cluster_size - 1] */
44617 +typedef enum {
44618 + DC_INVALID_STATE = 0,
44619 + DC_FIRST_ITEM = 1,
44620 + DC_CHAINED_ITEM = 2,
44621 + DC_AFTER_CLUSTER = 3
44622 +} dc_item_stat;
44623 +
44624 +/* ctail-specific extension.
44625 + In particular this describes parameters of disk cluster an item belongs to */
44626 +struct ctail_coord_extension {
44627 + int shift; /* this contains cluster_shift extracted from
44628 + ctail_item_format (above), or UCTAIL_SHIFT
44629 + (the last one is the "magic" of unprepped disk clusters)*/
44630 + int dsize; /* size of a prepped disk cluster */
44631 + int ncount; /* count of nodes occupied by a disk cluster */
44632 +};
44633 +
44634 +struct cut_list;
44635 +
44636 +/* plugin->item.b.* */
44637 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
44638 + const reiser4_item_data *);
44639 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
44640 +pos_in_node_t nr_units_ctail(const coord_t * coord);
44641 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
44642 +void print_ctail(const char *prefix, coord_t * coord);
44643 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
44644 +
44645 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
44646 + carry_plugin_info * info UNUSED_ARG);
44647 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
44648 +int can_shift_ctail(unsigned free_space, coord_t * coord,
44649 + znode * target, shift_direction pend, unsigned *size,
44650 + unsigned want);
44651 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
44652 + unsigned count, shift_direction where_is_free_space,
44653 + unsigned free_space);
44654 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44655 + carry_cut_data *, reiser4_key * smallest_removed,
44656 + reiser4_key * new_first);
44657 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44658 + carry_kill_data *, reiser4_key * smallest_removed,
44659 + reiser4_key * new_first);
44660 +int ctail_ok(const coord_t * coord);
44661 +int check_ctail(const coord_t * coord, const char **error);
44662 +
44663 +/* plugin->u.item.s.* */
44664 +int read_ctail(struct file *, flow_t *, hint_t *);
44665 +int readpage_ctail(void *, struct page *);
44666 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
44667 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
44668 +int create_hook_ctail(const coord_t * coord, void *arg);
44669 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
44670 + carry_kill_data *);
44671 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
44672 +
44673 +/* plugin->u.item.f */
44674 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
44675 +int scan_ctail(flush_scan *);
44676 +int convert_ctail(flush_pos_t *);
44677 +size_t inode_scaled_cluster_size(struct inode *);
44678 +
44679 +#endif /* __FS_REISER4_CTAIL_H__ */
44680 +
44681 +/* Make Linus happy.
44682 + Local variables:
44683 + c-indentation-style: "K&R"
44684 + mode-name: "LC"
44685 + c-basic-offset: 8
44686 + tab-width: 8
44687 + fill-column: 120
44688 + End:
44689 +*/
44690 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent.c linux-2.6.23/fs/reiser4/plugin/item/extent.c
44691 --- linux-2.6.23.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
44692 +++ linux-2.6.23/fs/reiser4/plugin/item/extent.c 2007-12-04 16:49:30.000000000 +0300
44693 @@ -0,0 +1,197 @@
44694 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44695 +
44696 +#include "item.h"
44697 +#include "../../key.h"
44698 +#include "../../super.h"
44699 +#include "../../carry.h"
44700 +#include "../../inode.h"
44701 +#include "../../page_cache.h"
44702 +#include "../../flush.h"
44703 +#include "../object.h"
44704 +
44705 +/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */
44706 +/* Audited by: green(2002.06.13) */
44707 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
44708 + int nr_extents)
44709 +{
44710 + data->data = ext_unit;
44711 + /* data->data is kernel space */
44712 + data->user = 0;
44713 + data->length = sizeof(reiser4_extent) * nr_extents;
44714 + data->arg = NULL;
44715 + data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
44716 + return data;
44717 +}
44718 +
44719 +/* how many bytes are addressed by @nr first extents of the extent item */
44720 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
44721 +{
44722 + pos_in_node_t i;
44723 + reiser4_block_nr blocks;
44724 + reiser4_extent *ext;
44725 +
44726 + ext = item_body_by_coord(coord);
44727 + assert("vs-263", nr <= nr_units_extent(coord));
44728 +
44729 + blocks = 0;
44730 + for (i = 0; i < nr; i++, ext++) {
44731 + blocks += extent_get_width(ext);
44732 + }
44733 +
44734 + return blocks * current_blocksize;
44735 +}
44736 +
44737 +extent_state state_of_extent(reiser4_extent * ext)
44738 +{
44739 + switch ((int)extent_get_start(ext)) {
44740 + case 0:
44741 + return HOLE_EXTENT;
44742 + case 1:
44743 + return UNALLOCATED_EXTENT;
44744 + default:
44745 + break;
44746 + }
44747 + return ALLOCATED_EXTENT;
44748 +}
44749 +
44750 +int extent_is_unallocated(const coord_t * item)
44751 +{
44752 + assert("jmacd-5133", item_is_extent(item));
44753 +
44754 + return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
44755 +}
44756 +
44757 +/* set extent's start and width */
44758 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
44759 + reiser4_block_nr width)
44760 +{
44761 + extent_set_start(ext, start);
44762 + extent_set_width(ext, width);
44763 +}
44764 +
44765 +/**
44766 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
44767 + * @un_extent: coordinate of extent to be overwritten
44768 + * @lh: need better comment
44769 + * @key: need better comment
44770 + * @exts_to_add: data prepared for insertion into tree
44771 + * @replace: need better comment
44772 + * @flags: need better comment
44773 + * @return_insert_position: need better comment
44774 + *
44775 + * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If
44776 + * @return_inserted_position is 1 - @un_extent and @lh are returned set to
44777 + * first of newly inserted units, if it is 0 - @un_extent and @lh are returned
44778 + * set to extent which was overwritten.
44779 + */
44780 +int reiser4_replace_extent(struct replace_handle *h,
44781 + int return_inserted_position)
44782 +{
44783 + int result;
44784 + znode *orig_znode;
44785 + /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
44786 +
44787 + assert("vs-990", coord_is_existing_unit(h->coord));
44788 + assert("vs-1375", znode_is_write_locked(h->coord->node));
44789 + assert("vs-1426", extent_get_width(&h->overwrite) != 0);
44790 + assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
44791 + assert("vs-1427", ergo(h->nr_new_extents == 2,
44792 + extent_get_width(&h->new_extents[1]) != 0));
44793 +
44794 + /* compose structure for paste */
44795 + init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
44796 +
44797 + coord_dup(&h->coord_after, h->coord);
44798 + init_lh(&h->lh_after);
44799 + copy_lh(&h->lh_after, h->lh);
44800 + reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
44801 + reiser4_tap_monitor(&h->watch);
44802 +
44803 + ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
44804 + orig_znode = h->coord->node;
44805 +
44806 +#if REISER4_DEBUG
44807 + /* make sure that key is set properly */
44808 + unit_key_by_coord(h->coord, &h->tmp);
44809 + set_key_offset(&h->tmp,
44810 + get_key_offset(&h->tmp) +
44811 + extent_get_width(&h->overwrite) * current_blocksize);
44812 + assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
44813 +#endif
44814 +
44815 + /* set insert point after unit to be replaced */
44816 + h->coord->between = AFTER_UNIT;
44817 +
44818 + result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
44819 + &h->paste_key, &h->item, h->flags);
44820 + if (!result) {
44821 + /* now we have to replace the unit after which new units were
44822 + inserted. Its position is tracked by @watch */
44823 + reiser4_extent *ext;
44824 + znode *node;
44825 +
44826 + node = h->coord_after.node;
44827 + if (node != orig_znode) {
44828 + coord_clear_iplug(&h->coord_after);
44829 + result = zload(node);
44830 + }
44831 +
44832 + if (likely(!result)) {
44833 + ext = extent_by_coord(&h->coord_after);
44834 +
44835 + assert("vs-987", znode_is_loaded(node));
44836 + assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
44837 +
44838 + /* overwrite extent unit */
44839 + memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
44840 + znode_make_dirty(node);
44841 +
44842 + if (node != orig_znode)
44843 + zrelse(node);
44844 +
44845 + if (return_inserted_position == 0) {
44846 + /* coord and lh are to be set to overwritten
44847 + extent */
44848 + assert("vs-1662",
44849 + WITH_DATA(node, !memcmp(&h->overwrite,
44850 + extent_by_coord(
44851 + &h->coord_after),
44852 + sizeof(reiser4_extent))));
44853 +
44854 + *h->coord = h->coord_after;
44855 + done_lh(h->lh);
44856 + copy_lh(h->lh, &h->lh_after);
44857 + } else {
44858 + /* h->coord and h->lh are to be set to first of
44859 + inserted units */
44860 + assert("vs-1663",
44861 + WITH_DATA(h->coord->node,
44862 + !memcmp(&h->new_extents[0],
44863 + extent_by_coord(h->coord),
44864 + sizeof(reiser4_extent))));
44865 + assert("vs-1664", h->lh->node == h->coord->node);
44866 + }
44867 + }
44868 + }
44869 + reiser4_tap_done(&h->watch);
44870 +
44871 + return result;
44872 +}
44873 +
44874 +lock_handle *znode_lh(znode *node)
44875 +{
44876 + assert("vs-1371", znode_is_write_locked(node));
44877 + assert("vs-1372", znode_is_wlocked_once(node));
44878 + return list_entry(node->lock.owners.next, lock_handle, owners_link);
44879 +}
44880 +
44881 +/*
44882 + * Local variables:
44883 + * c-indentation-style: "K&R"
44884 + * mode-name: "LC"
44885 + * c-basic-offset: 8
44886 + * tab-width: 8
44887 + * fill-column: 79
44888 + * scroll-step: 1
44889 + * End:
44890 + */
44891 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.23/fs/reiser4/plugin/item/extent_file_ops.c
44892 --- linux-2.6.23.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
44893 +++ linux-2.6.23/fs/reiser4/plugin/item/extent_file_ops.c 2007-12-04 23:04:00.738308094 +0300
44894 @@ -0,0 +1,1453 @@
44895 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44896 +
44897 +#include "item.h"
44898 +#include "../../inode.h"
44899 +#include "../../page_cache.h"
44900 +#include "../object.h"
44901 +
44902 +#include <linux/quotaops.h>
44903 +#include <linux/swap.h>
44904 +
44905 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
44906 +{
44907 + reiser4_extent *ext;
44908 +
44909 + ext = (reiser4_extent *) (zdata(node) + offset);
44910 + return ext;
44911 +}
44912 +
44913 +/**
44914 + * check_uf_coord - verify coord extension
44915 + * @uf_coord:
44916 + * @key:
44917 + *
44918 + * Makes sure that all fields of @uf_coord are set properly. If @key is
44919 + * specified - check whether @uf_coord is set correspondingly.
44920 + */
44921 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
44922 +{
44923 +#if REISER4_DEBUG
44924 + const coord_t *coord;
44925 + const struct extent_coord_extension *ext_coord;
44926 + reiser4_extent *ext;
44927 +
44928 + coord = &uf_coord->coord;
44929 + ext_coord = &uf_coord->extension.extent;
44930 + ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
44931 +
44932 + assert("",
44933 + WITH_DATA(coord->node,
44934 + (uf_coord->valid == 1 &&
44935 + coord_is_iplug_set(coord) &&
44936 + item_is_extent(coord) &&
44937 + ext_coord->nr_units == nr_units_extent(coord) &&
44938 + ext == extent_by_coord(coord) &&
44939 + ext_coord->width == extent_get_width(ext) &&
44940 + coord->unit_pos < ext_coord->nr_units &&
44941 + ext_coord->pos_in_unit < ext_coord->width &&
44942 + memcmp(ext, &ext_coord->extent,
44943 + sizeof(reiser4_extent)) == 0)));
44944 + if (key) {
44945 + reiser4_key coord_key;
44946 +
44947 + unit_key_by_coord(&uf_coord->coord, &coord_key);
44948 + set_key_offset(&coord_key,
44949 + get_key_offset(&coord_key) +
44950 + (uf_coord->extension.extent.
44951 + pos_in_unit << PAGE_CACHE_SHIFT));
44952 + assert("", keyeq(key, &coord_key));
44953 + }
44954 +#endif
44955 +}
44956 +
44957 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
44958 +{
44959 + check_uf_coord(uf_coord, NULL);
44960 +
44961 + return ext_by_offset(uf_coord->coord.node,
44962 + uf_coord->extension.extent.ext_offset);
44963 +}
44964 +
44965 +#if REISER4_DEBUG
44966 +
44967 +/**
44968 + * offset_is_in_unit
44969 + *
44970 + *
44971 + *
44972 + */
44973 +/* return 1 if offset @off is inside of the extent unit pointed to by
44974 + @coord, 0 otherwise */
44975 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
44976 +{
44977 + reiser4_key unit_key;
44978 + __u64 unit_off;
44979 + reiser4_extent *ext;
44980 +
44981 + ext = extent_by_coord(coord);
44982 +
44983 + unit_key_extent(coord, &unit_key);
44984 + unit_off = get_key_offset(&unit_key);
44985 + if (off < unit_off)
44986 + return 0;
44987 + if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
44988 + return 0;
44989 + return 1;
44990 +}
44991 +
44992 +static int
44993 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
44994 +{
44995 + reiser4_key item_key;
44996 +
44997 + assert("vs-771", coord_is_existing_unit(coord));
44998 + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
44999 + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
45000 +
45001 + return offset_is_in_unit(coord, get_key_offset(key));
45002 +}
45003 +
45004 +#endif
45005 +
45006 +/**
45007 + * can_append -
45008 + * @key:
45009 + * @coord:
45010 + *
45011 + * Returns 1 if @key is equal to an append key of item @coord is set to
45012 + */
45013 +static int can_append(const reiser4_key *key, const coord_t *coord)
45014 +{
45015 + reiser4_key append_key;
45016 +
45017 + return keyeq(key, append_key_extent(coord, &append_key));
45018 +}
45019 +
45020 +/**
45021 + * append_hole
45022 + * @coord:
45023 + * @lh:
45024 + * @key:
45025 + *
45026 + */
45027 +static int append_hole(coord_t *coord, lock_handle *lh,
45028 + const reiser4_key *key)
45029 +{
45030 + reiser4_key append_key;
45031 + reiser4_block_nr hole_width;
45032 + reiser4_extent *ext, new_ext;
45033 + reiser4_item_data idata;
45034 +
45035 + /* last item of file may have to be appended with hole */
45036 + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
45037 + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
45038 +
45039 + /* key of first byte which is not addressed by this extent */
45040 + append_key_extent(coord, &append_key);
45041 +
45042 + assert("", keyle(&append_key, key));
45043 +
45044 + /*
45045 + * extent item has to be appended with hole. Calculate length of that
45046 + * hole
45047 + */
45048 + hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
45049 + current_blocksize - 1) >> current_blocksize_bits);
45050 + assert("vs-954", hole_width > 0);
45051 +
45052 + /* set coord after last unit */
45053 + coord_init_after_item_end(coord);
45054 +
45055 + /* get last extent in the item */
45056 + ext = extent_by_coord(coord);
45057 + if (state_of_extent(ext) == HOLE_EXTENT) {
45058 + /*
45059 + * last extent of a file is hole extent. Widen that extent by
45060 + * @hole_width blocks. Note that we do not worry about
45061 + * overflowing - extent width is 64 bits
45062 + */
45063 + reiser4_set_extent(ext, HOLE_EXTENT_START,
45064 + extent_get_width(ext) + hole_width);
45065 + znode_make_dirty(coord->node);
45066 + return 0;
45067 + }
45068 +
45069 + /* append last item of the file with hole extent unit */
45070 + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
45071 + state_of_extent(ext) == UNALLOCATED_EXTENT));
45072 +
45073 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45074 + init_new_extent(&idata, &new_ext, 1);
45075 + return insert_into_item(coord, lh, &append_key, &idata, 0);
45076 +}
45077 +
45078 +/**
45079 + * check_jnodes
45080 + * @twig: longterm locked twig node
45081 + * @key:
45082 + *
45083 + */
45084 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
45085 +{
45086 +#if REISER4_DEBUG
45087 + coord_t c;
45088 + reiser4_key node_key, jnode_key;
45089 +
45090 + jnode_key = *key;
45091 +
45092 + assert("", twig != NULL);
45093 + assert("", znode_get_level(twig) == TWIG_LEVEL);
45094 + assert("", znode_is_write_locked(twig));
45095 +
45096 + zload(twig);
45097 + /* get the smallest key in twig node */
45098 + coord_init_first_unit(&c, twig);
45099 + unit_key_by_coord(&c, &node_key);
45100 + assert("", keyle(&node_key, &jnode_key));
45101 +
45102 + coord_init_last_unit(&c, twig);
45103 + unit_key_by_coord(&c, &node_key);
45104 + if (item_plugin_by_coord(&c)->s.file.append_key)
45105 + item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
45106 + set_key_offset(&jnode_key,
45107 + get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
45108 + assert("", keylt(&jnode_key, &node_key));
45109 + zrelse(twig);
45110 +#endif
45111 +}
45112 +
45113 +/**
45114 + * append_last_extent - append last file item
45115 + * @uf_coord: coord to start insertion from
45116 + * @jnodes: array of jnodes
45117 + * @count: number of jnodes in the array
45118 + *
45119 + * There is already at least one extent item of the file in the tree. Append
45120 + * the last of them with unallocated extent unit of width @count. Assign
45121 + * fake block numbers to jnodes corresponding to the inserted extent.
45122 + */
45123 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45124 + jnode **jnodes, int count)
45125 +{
45126 + int result;
45127 + reiser4_extent new_ext;
45128 + reiser4_item_data idata;
45129 + coord_t *coord;
45130 + struct extent_coord_extension *ext_coord;
45131 + reiser4_extent *ext;
45132 + reiser4_block_nr block;
45133 + jnode *node;
45134 + int i;
45135 +
45136 + coord = &uf_coord->coord;
45137 + ext_coord = &uf_coord->extension.extent;
45138 + ext = ext_by_ext_coord(uf_coord);
45139 +
45140 + /* check correctness of position in the item */
45141 + assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
45142 + assert("vs-1311", coord->between == AFTER_UNIT);
45143 + assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
45144 +
45145 + if (!can_append(key, coord)) {
45146 + /* hole extent has to be inserted */
45147 + result = append_hole(coord, uf_coord->lh, key);
45148 + uf_coord->valid = 0;
45149 + return result;
45150 + }
45151 +
45152 + if (count == 0)
45153 + return 0;
45154 +
45155 + assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
45156 +
45157 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
45158 + count);
45159 + BUG_ON(result != 0);
45160 +
45161 + switch (state_of_extent(ext)) {
45162 + case UNALLOCATED_EXTENT:
45163 + /*
45164 + * last extent unit of the file is unallocated one. Increase
45165 + * its width by @count
45166 + */
45167 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
45168 + extent_get_width(ext) + count);
45169 + znode_make_dirty(coord->node);
45170 +
45171 + /* update coord extension */
45172 + ext_coord->width += count;
45173 + ON_DEBUG(extent_set_width
45174 + (&uf_coord->extension.extent.extent,
45175 + ext_coord->width));
45176 + break;
45177 +
45178 + case HOLE_EXTENT:
45179 + case ALLOCATED_EXTENT:
45180 + /*
45181 + * last extent unit of the file is either hole or allocated
45182 + * one. Append one unallocated extent of width @count
45183 + */
45184 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45185 + init_new_extent(&idata, &new_ext, 1);
45186 + result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
45187 + uf_coord->valid = 0;
45188 + if (result)
45189 + return result;
45190 + break;
45191 +
45192 + default:
45193 + return RETERR(-EIO);
45194 + }
45195 +
45196 + /*
45197 + * make sure that we hold long term locked twig node containing all
45198 + * jnodes we are about to capture
45199 + */
45200 + check_jnodes(uf_coord->lh->node, key, count);
45201 +
45202 + /*
45203 + * assign fake block numbers to all jnodes. FIXME: check whether
45204 + * twig node containing inserted extent item is locked
45205 + */
45206 + block = fake_blocknr_unformatted(count);
45207 + for (i = 0; i < count; i ++, block ++) {
45208 + node = jnodes[i];
45209 + spin_lock_jnode(node);
45210 + JF_SET(node, JNODE_CREATED);
45211 + jnode_set_block(node, &block);
45212 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45213 + BUG_ON(result != 0);
45214 + jnode_make_dirty_locked(node);
45215 + spin_unlock_jnode(node);
45216 + }
45217 + return count;
45218 +}
45219 +
45220 +/**
45221 + * insert_first_hole - insert hole extent into tree
45222 + * @coord:
45223 + * @lh:
45224 + * @key:
45225 + *
45226 + *
45227 + */
45228 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
45229 + const reiser4_key *key)
45230 +{
45231 + reiser4_extent new_ext;
45232 + reiser4_item_data idata;
45233 + reiser4_key item_key;
45234 + reiser4_block_nr hole_width;
45235 +
45236 + /* @coord must be set for inserting of new item */
45237 + assert("vs-711", coord_is_between_items(coord));
45238 +
45239 + item_key = *key;
45240 + set_key_offset(&item_key, 0ull);
45241 +
45242 + hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
45243 + current_blocksize_bits);
45244 + assert("vs-710", hole_width > 0);
45245 +
45246 + /* compose body of hole extent and insert item into tree */
45247 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45248 + init_new_extent(&idata, &new_ext, 1);
45249 + return insert_extent_by_coord(coord, &idata, &item_key, lh);
45250 +}
45251 +
45252 +
45253 +/**
45254 + * insert_first_extent - insert first file item
45255 + * @uf_coord: coord to start insertion from
45256 + * @key: key of the first byte to be written
45257 + * @jnodes: array of jnodes
45258 + * @count: number of jnodes in the array
45259 + * @inode: inode of file
45260 + *
45261 + * There are no items of file @inode in the tree yet. Insert unallocated extent
45262 + * of width @count into tree or hole extent if writing not to the
45263 + * beginning. Assign fake block numbers to jnodes corresponding to the inserted
45264 + * unallocated extent. Returns number of jnodes or error code.
45265 + */
45266 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45267 + jnode **jnodes, int count,
45268 + struct inode *inode)
45269 +{
45270 + int result;
45271 + int i;
45272 + reiser4_extent new_ext;
45273 + reiser4_item_data idata;
45274 + reiser4_block_nr block;
45275 + struct unix_file_info *uf_info;
45276 + jnode *node;
45277 +
45278 + /* first extent insertion starts at leaf level */
45279 + assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
45280 + assert("vs-711", coord_is_between_items(&uf_coord->coord));
45281 +
45282 + if (get_key_offset(key) != 0) {
45283 + result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
45284 + uf_coord->valid = 0;
45285 + uf_info = unix_file_inode_data(inode);
45286 +
45287 + /*
45288 + * first item insertion is only possible when writing to empty
45289 + * file or performing tail conversion
45290 + */
45291 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
45292 + (reiser4_inode_get_flag(inode,
45293 + REISER4_PART_MIXED) &&
45294 + reiser4_inode_get_flag(inode,
45295 + REISER4_PART_IN_CONV))));
45296 + /* if file was empty - update its state */
45297 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
45298 + uf_info->container = UF_CONTAINER_EXTENTS;
45299 + return result;
45300 + }
45301 +
45302 + if (count == 0)
45303 + return 0;
45304 +
45305 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
45306 + BUG_ON(result != 0);
45307 +
45308 + /*
45309 + * prepare for tree modification: compose body of item and item data
45310 + * structure needed for insertion
45311 + */
45312 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45313 + init_new_extent(&idata, &new_ext, 1);
45314 +
45315 + /* insert extent item into the tree */
45316 + result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
45317 + uf_coord->lh);
45318 + if (result)
45319 + return result;
45320 +
45321 + /*
45322 + * make sure that we hold long term locked twig node containing all
45323 + * jnodes we are about to capture
45324 + */
45325 + check_jnodes(uf_coord->lh->node, key, count);
45326 + /*
45327 + * assign fake block numbers to all jnodes, capture and mark them dirty
45328 + */
45329 + block = fake_blocknr_unformatted(count);
45330 + for (i = 0; i < count; i ++, block ++) {
45331 + node = jnodes[i];
45332 + spin_lock_jnode(node);
45333 + JF_SET(node, JNODE_CREATED);
45334 + jnode_set_block(node, &block);
45335 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45336 + BUG_ON(result != 0);
45337 + jnode_make_dirty_locked(node);
45338 + spin_unlock_jnode(node);
45339 + }
45340 +
45341 + /*
45342 + * invalidate coordinate, research must be performed to continue
45343 + * because write will continue on twig level
45344 + */
45345 + uf_coord->valid = 0;
45346 + return count;
45347 +}
45348 +
45349 +/**
45350 + * plug_hole - replace part of a hole extent with unallocated and hole extents
45351 + * @uf_coord:
45352 + * @key:
45353 + * @node:
45354 + * @h: structure containing coordinate, lock handle, key, etc
45355 + *
45356 + * Creates an unallocated extent of width 1 within a hole. In worst case two
45357 + * additional extents can be created.
45358 + */
45359 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
45360 +{
45361 + struct replace_handle rh;
45362 + reiser4_extent *ext;
45363 + reiser4_block_nr width, pos_in_unit;
45364 + coord_t *coord;
45365 + struct extent_coord_extension *ext_coord;
45366 + int return_inserted_position;
45367 +
45368 + check_uf_coord(uf_coord, key);
45369 +
45370 + rh.coord = coord_by_uf_coord(uf_coord);
45371 + rh.lh = uf_coord->lh;
45372 + rh.flags = 0;
45373 +
45374 + coord = coord_by_uf_coord(uf_coord);
45375 + ext_coord = ext_coord_by_uf_coord(uf_coord);
45376 + ext = ext_by_ext_coord(uf_coord);
45377 +
45378 + width = ext_coord->width;
45379 + pos_in_unit = ext_coord->pos_in_unit;
45380 +
45381 + *how = 0;
45382 + if (width == 1) {
45383 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
45384 + znode_make_dirty(coord->node);
45385 + /* update uf_coord */
45386 + ON_DEBUG(ext_coord->extent = *ext);
45387 + *how = 1;
45388 + return 0;
45389 + } else if (pos_in_unit == 0) {
45390 + /* we deal with first element of extent */
45391 + if (coord->unit_pos) {
45392 + /* there is an extent to the left */
45393 + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
45394 + /*
45395 + * left neighboring unit is an unallocated
45396 + * extent. Increase its width and decrease
45397 + * width of hole
45398 + */
45399 + extent_set_width(ext - 1,
45400 + extent_get_width(ext - 1) + 1);
45401 + extent_set_width(ext, width - 1);
45402 + znode_make_dirty(coord->node);
45403 +
45404 + /* update coord extension */
45405 + coord->unit_pos--;
45406 + ext_coord->width = extent_get_width(ext - 1);
45407 + ext_coord->pos_in_unit = ext_coord->width - 1;
45408 + ext_coord->ext_offset -= sizeof(reiser4_extent);
45409 + ON_DEBUG(ext_coord->extent =
45410 + *extent_by_coord(coord));
45411 + *how = 2;
45412 + return 0;
45413 + }
45414 + }
45415 + /* extent for replace */
45416 + reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
45417 + /* extent to be inserted */
45418 + reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
45419 + width - 1);
45420 + rh.nr_new_extents = 1;
45421 +
45422 + /* have reiser4_replace_extent to return with @coord and
45423 + @uf_coord->lh set to unit which was replaced */
45424 + return_inserted_position = 0;
45425 + *how = 3;
45426 + } else if (pos_in_unit == width - 1) {
45427 + /* we deal with last element of extent */
45428 + if (coord->unit_pos < nr_units_extent(coord) - 1) {
45429 + /* there is an extent unit to the right */
45430 + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
45431 + /*
45432 + * right neighboring unit is an unallocated
45433 + * extent. Increase its width and decrease
45434 + * width of hole
45435 + */
45436 + extent_set_width(ext + 1,
45437 + extent_get_width(ext + 1) + 1);
45438 + extent_set_width(ext, width - 1);
45439 + znode_make_dirty(coord->node);
45440 +
45441 + /* update coord extension */
45442 + coord->unit_pos++;
45443 + ext_coord->width = extent_get_width(ext + 1);
45444 + ext_coord->pos_in_unit = 0;
45445 + ext_coord->ext_offset += sizeof(reiser4_extent);
45446 + ON_DEBUG(ext_coord->extent =
45447 + *extent_by_coord(coord));
45448 + *how = 4;
45449 + return 0;
45450 + }
45451 + }
45452 + /* extent for replace */
45453 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
45454 + /* extent to be inserted */
45455 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45456 + 1);
45457 + rh.nr_new_extents = 1;
45458 +
45459 + /* have reiser4_replace_extent to return with @coord and
45460 + @uf_coord->lh set to unit which was inserted */
45461 + return_inserted_position = 1;
45462 + *how = 5;
45463 + } else {
45464 + /* extent for replace */
45465 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
45466 + pos_in_unit);
45467 + /* extents to be inserted */
45468 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45469 + 1);
45470 + reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
45471 + width - pos_in_unit - 1);
45472 + rh.nr_new_extents = 2;
45473 +
45474 + /* have reiser4_replace_extent to return with @coord and
45475 + @uf_coord->lh set to first of units which were inserted */
45476 + return_inserted_position = 1;
45477 + *how = 6;
45478 + }
45479 + unit_key_by_coord(coord, &rh.paste_key);
45480 + set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
45481 + extent_get_width(&rh.overwrite) * current_blocksize);
45482 +
45483 + uf_coord->valid = 0;
45484 + return reiser4_replace_extent(&rh, return_inserted_position);
45485 +}
45486 +
45487 +/**
45488 + * overwrite_one_block -
45489 + * @uf_coord:
45490 + * @key:
45491 + * @node:
45492 + *
45493 + * If @node corresponds to hole extent - create unallocated extent for it and
45494 + * assign fake block number. If @node corresponds to allocated extent - assign
45495 + * block number of jnode
45496 + */
45497 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
45498 + jnode *node, int *hole_plugged)
45499 +{
45500 + int result;
45501 + struct extent_coord_extension *ext_coord;
45502 + reiser4_extent *ext;
45503 + reiser4_block_nr block;
45504 + int how;
45505 +
45506 + assert("vs-1312", uf_coord->coord.between == AT_UNIT);
45507 +
45508 + result = 0;
45509 + ext_coord = ext_coord_by_uf_coord(uf_coord);
45510 + ext = ext_by_ext_coord(uf_coord);
45511 + assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
45512 +
45513 + switch (state_of_extent(ext)) {
45514 + case ALLOCATED_EXTENT:
45515 + block = extent_get_start(ext) + ext_coord->pos_in_unit;
45516 + break;
45517 +
45518 + case HOLE_EXTENT:
45519 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
45520 + BUG_ON(result != 0);
45521 + result = plug_hole(uf_coord, key, &how);
45522 + if (result)
45523 + return result;
45524 + block = fake_blocknr_unformatted(1);
45525 + if (hole_plugged)
45526 + *hole_plugged = 1;
45527 + JF_SET(node, JNODE_CREATED);
45528 + break;
45529 +
45530 + default:
45531 + return RETERR(-EIO);
45532 + }
45533 +
45534 + jnode_set_block(node, &block);
45535 + return 0;
45536 +}
45537 +
45538 +/**
45539 + * move_coord - move coordinate forward
45540 + * @uf_coord:
45541 + *
45542 + * Move coordinate one data block pointer forward. Return 1 if coord is set to
45543 + * the last one already or is invalid.
45544 + */
45545 +static int move_coord(uf_coord_t *uf_coord)
45546 +{
45547 + struct extent_coord_extension *ext_coord;
45548 +
45549 + if (uf_coord->valid == 0)
45550 + return 1;
45551 + ext_coord = &uf_coord->extension.extent;
45552 + ext_coord->pos_in_unit ++;
45553 + if (ext_coord->pos_in_unit < ext_coord->width)
45554 + /* coordinate moved within the unit */
45555 + return 0;
45556 +
45557 + /* end of unit is reached. Try to move to next unit */
45558 + ext_coord->pos_in_unit = 0;
45559 + uf_coord->coord.unit_pos ++;
45560 + if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
45561 + /* coordinate moved to next unit */
45562 + ext_coord->ext_offset += sizeof(reiser4_extent);
45563 + ext_coord->width =
45564 + extent_get_width(ext_by_offset
45565 + (uf_coord->coord.node,
45566 + ext_coord->ext_offset));
45567 + ON_DEBUG(ext_coord->extent =
45568 + *ext_by_offset(uf_coord->coord.node,
45569 + ext_coord->ext_offset));
45570 + return 0;
45571 + }
45572 + /* end of item is reached */
45573 + uf_coord->valid = 0;
45574 + return 1;
45575 +}
45576 +
45577 +/**
45578 + * overwrite_extent -
45579 + * @uf_coord: coord to start overwrite from
45580 + *
45581 + * Returns number of handled jnodes.
45582 + */
45583 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45584 + jnode **jnodes, int count, int *plugged_hole)
45585 +{
45586 + int result;
45587 + reiser4_key k;
45588 + int i;
45589 + jnode *node;
45590 +
45591 + k = *key;
45592 + for (i = 0; i < count; i ++) {
45593 + node = jnodes[i];
45594 + if (*jnode_get_block(node) == 0) {
45595 + result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
45596 + if (result)
45597 + return result;
45598 + }
45599 + /*
45600 + * make sure that we hold long term locked twig node containing
45601 + * all jnodes we are about to capture
45602 + */
45603 + check_jnodes(uf_coord->lh->node, &k, 1);
45604 + /*
45605 + * capture all jnodes and mark them dirty; block numbers were
45606 + * already assigned in overwrite_one_block
45607 + */
45608 + spin_lock_jnode(node);
45609 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45610 + BUG_ON(result != 0);
45611 + jnode_make_dirty_locked(node);
45612 + spin_unlock_jnode(node);
45613 +
45614 + if (uf_coord->valid == 0)
45615 + return i + 1;
45616 +
45617 + check_uf_coord(uf_coord, &k);
45618 +
45619 + if (move_coord(uf_coord)) {
45620 + /*
45621 + * failed to move to the next node pointer. Either end
45622 + * of file or end of twig node is reached. In the latter
45623 + * case we might go to the right neighbor.
45624 + */
45625 + uf_coord->valid = 0;
45626 + return i + 1;
45627 + }
45628 + set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
45629 + }
45630 +
45631 + return count;
45632 +}
45633 +
45634 +/**
45635 + * reiser4_update_extent
45636 + * @inode:
45637 + * @node:
45638 + * @pos:
45639 + * @plugged_hole:
45640 + *
45641 + */
45642 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
45643 + int *plugged_hole)
45644 +{
45645 + int result;
45646 + znode *loaded;
45647 + uf_coord_t uf_coord;
45648 + coord_t *coord;
45649 + lock_handle lh;
45650 + reiser4_key key;
45651 +
45652 + assert("", reiser4_lock_counters()->d_refs == 0);
45653 +
45654 + key_by_inode_and_offset_common(inode, pos, &key);
45655 +
45656 + init_uf_coord(&uf_coord, &lh);
45657 + coord = &uf_coord.coord;
45658 + result = find_file_item_nohint(coord, &lh, &key,
45659 + ZNODE_WRITE_LOCK, inode);
45660 + if (IS_CBKERR(result)) {
45661 + assert("", reiser4_lock_counters()->d_refs == 0);
45662 + return result;
45663 + }
45664 +
45665 + result = zload(coord->node);
45666 + BUG_ON(result != 0);
45667 + loaded = coord->node;
45668 +
45669 + if (coord->between == AFTER_UNIT) {
45670 + /*
45671 + * append existing extent item with unallocated extent of width
45672 + * nr_jnodes
45673 + */
45674 + init_coord_extension_extent(&uf_coord,
45675 + get_key_offset(&key));
45676 + result = append_last_extent(&uf_coord, &key,
45677 + &node, 1);
45678 + } else if (coord->between == AT_UNIT) {
45679 + /*
45680 + * overwrite
45681 + * not optimal yet. Will be optimized if new write will show
45682 + * performance win.
45683 + */
45684 + init_coord_extension_extent(&uf_coord,
45685 + get_key_offset(&key));
45686 + result = overwrite_extent(&uf_coord, &key,
45687 + &node, 1, plugged_hole);
45688 + } else {
45689 + /*
45690 + * there are no items of this file in the tree yet. Create
45691 + * first item of the file inserting one unallocated extent of
45692 + * width nr_jnodes
45693 + */
45694 + result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
45695 + }
45696 + assert("", result == 1 || result < 0);
45697 + zrelse(loaded);
45698 + done_lh(&lh);
45699 + assert("", reiser4_lock_counters()->d_refs == 0);
45700 + return (result == 1) ? 0 : result;
45701 +}
45702 +
45703 +/**
45704 + * update_extents
45705 + * @file:
45706 + * @jnodes:
45707 + * @count:
45708 + * @pos:
45709 + *
45710 + */
45711 +static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
45712 +{
45713 + struct inode *inode;
45714 + struct hint hint;
45715 + reiser4_key key;
45716 + int result;
45717 + znode *loaded;
45718 +
45719 + result = load_file_hint(file, &hint);
45720 + BUG_ON(result != 0);
45721 +
45722 + inode = file->f_dentry->d_inode;
45723 + if (count != 0)
45724 + /*
45725 + * count == 0 is special case: expanding truncate
45726 + */
45727 + pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
45728 + key_by_inode_and_offset_common(inode, pos, &key);
45729 +
45730 + assert("", reiser4_lock_counters()->d_refs == 0);
45731 +
45732 + do {
45733 + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
45734 + if (IS_CBKERR(result)) {
45735 + assert("", reiser4_lock_counters()->d_refs == 0);
45736 + return result;
45737 + }
45738 +
45739 + result = zload(hint.ext_coord.coord.node);
45740 + BUG_ON(result != 0);
45741 + loaded = hint.ext_coord.coord.node;
45742 +
45743 + if (hint.ext_coord.coord.between == AFTER_UNIT) {
45744 + /*
45745 + * append existing extent item with unallocated extent
45746 + * of width nr_jnodes
45747 + */
45748 + if (hint.ext_coord.valid == 0)
45749 + /* NOTE: get statistics on this */
45750 + init_coord_extension_extent(&hint.ext_coord,
45751 + get_key_offset(&key));
45752 + result = append_last_extent(&hint.ext_coord, &key,
45753 + jnodes, count);
45754 + } else if (hint.ext_coord.coord.between == AT_UNIT) {
45755 + /*
45756 + * overwrite
45757 + * not optimal yet. Will be optimized if new write will
45758 + * show performance win.
45759 + */
45760 + if (hint.ext_coord.valid == 0)
45761 + /* NOTE: get statistics on this */
45762 + init_coord_extension_extent(&hint.ext_coord,
45763 + get_key_offset(&key));
45764 + result = overwrite_extent(&hint.ext_coord, &key,
45765 + jnodes, count, NULL);
45766 + } else {
45767 + /*
45768 + * there are no items of this file in the tree
45769 + * yet. Create first item of the file inserting one
45770 + * unallocated extent of width nr_jnodes
45771 + */
45772 + result = insert_first_extent(&hint.ext_coord, &key,
45773 + jnodes, count, inode);
45774 + }
45775 + zrelse(loaded);
45776 + if (result < 0) {
45777 + done_lh(hint.ext_coord.lh);
45778 + break;
45779 + }
45780 +
45781 + jnodes += result;
45782 + count -= result;
45783 + set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
45784 +
45785 + /* seal and unlock znode */
45786 + if (hint.ext_coord.valid)
45787 + reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
45788 + else
45789 + reiser4_unset_hint(&hint);
45790 +
45791 + } while (count > 0);
45792 +
45793 + save_file_hint(file, &hint);
45794 + assert("", reiser4_lock_counters()->d_refs == 0);
45795 + return result;
45796 +}
45797 +
45798 +/**
45799 + * write_extent_reserve_space - reserve space for extent write operation
45800 + * @inode:
45801 + *
45802 + * Estimates and reserves space which may be required for writing
45803 + * WRITE_GRANULARITY pages of file.
45804 + */
45805 +static int write_extent_reserve_space(struct inode *inode)
45806 +{
45807 + __u64 count;
45808 + reiser4_tree *tree;
45809 +
45810 + /*
45811 + * to write WRITE_GRANULARITY pages to a file by extents we have to
45812 + * reserve disk space for:
45813 +
45814 + * 1. find_file_item may have to insert empty node to the tree (empty
45815 + * leaf node between two extent items). This requires 1 block and
45816 + * number of blocks which are necessary to perform insertion of an
45817 + * internal item into twig level.
45818 +
45819 + * 2. for each of written pages there might be needed 1 block and
45820 + * number of blocks which might be necessary to perform insertion of or
45821 + * paste to an extent item.
45822 +
45823 + * 3. stat data update
45824 + */
45825 + tree = reiser4_tree_by_inode(inode);
45826 + count = estimate_one_insert_item(tree) +
45827 + WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
45828 + estimate_one_insert_item(tree);
45829 + grab_space_enable();
45830 + return reiser4_grab_space(count, 0 /* flags */);
45831 +}
45832 +
45833 +/*
45834 + * filemap_copy_from_user no longer exists in generic code, because it
45835 + * is deadlocky (copying from user while holding the page lock is bad).
45836 + * As a temporary fix for reiser4, just define it here.
45837 + */
45838 +static inline size_t
45839 +filemap_copy_from_user(struct page *page, unsigned long offset,
45840 + const char __user *buf, unsigned bytes)
45841 +{
45842 + char *kaddr;
45843 + int left;
45844 +
45845 + kaddr = kmap_atomic(page, KM_USER0);
45846 + left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
45847 + kunmap_atomic(kaddr, KM_USER0);
45848 +
45849 + if (left != 0) {
45850 + /* Do it the slow way */
45851 + kaddr = kmap(page);
45852 + left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
45853 + kunmap(page);
45854 + }
45855 + return bytes - left;
45856 +}
45857 +
45858 +/**
45859 + * reiser4_write_extent - write method of extent item plugin
45860 + * @file: file to write to
45861 + * @buf: address of user-space buffer
45862 + * @count: number of bytes to write
45863 + * @pos: position in file to write to
45864 + *
45865 + */
45866 +ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
45867 + size_t count, loff_t *pos)
45868 +{
45869 + int have_to_update_extent;
45870 + int nr_pages, nr_dirty;
45871 + struct page *page;
45872 + jnode *jnodes[WRITE_GRANULARITY + 1];
45873 + struct inode *inode;
45874 + unsigned long index;
45875 + unsigned long end;
45876 + int i;
45877 + int to_page, page_off;
45878 + size_t left, written;
45879 + int result = 0;
45880 +
45881 + inode = file->f_dentry->d_inode;
45882 + if (write_extent_reserve_space(inode))
45883 + return RETERR(-ENOSPC);
45884 +
45885 + if (count == 0) {
45886 + /* truncate case */
45887 + update_extents(file, jnodes, 0, *pos);
45888 + return 0;
45889 + }
45890 +
45891 + BUG_ON(get_current_context()->trans->atom != NULL);
45892 +
45893 + left = count;
45894 + index = *pos >> PAGE_CACHE_SHIFT;
45895 + /* calculate number of pages which are to be written */
45896 + end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
45897 + nr_pages = end - index + 1;
45898 + nr_dirty = 0;
45899 + assert("", nr_pages <= WRITE_GRANULARITY + 1);
45900 +
45901 + /* get pages and jnodes */
45902 + for (i = 0; i < nr_pages; i ++) {
45903 + page = find_or_create_page(inode->i_mapping, index + i,
45904 + reiser4_ctx_gfp_mask_get());
45905 + if (page == NULL) {
45906 + nr_pages = i;
45907 + result = RETERR(-ENOMEM);
45908 + goto out;
45909 + }
45910 +
45911 + jnodes[i] = jnode_of_page(page);
45912 + if (IS_ERR(jnodes[i])) {
45913 + unlock_page(page);
45914 + page_cache_release(page);
45915 + nr_pages = i;
45916 + result = RETERR(-ENOMEM);
45917 + goto out;
45918 + }
45919 + /* prevent jnode and page from disconnecting */
45920 + JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
45921 + unlock_page(page);
45922 + }
45923 +
45924 + BUG_ON(get_current_context()->trans->atom != NULL);
45925 +
45926 + have_to_update_extent = 0;
45927 +
45928 + page_off = (*pos & (PAGE_CACHE_SIZE - 1));
45929 + for (i = 0; i < nr_pages; i ++) {
45930 + to_page = PAGE_CACHE_SIZE - page_off;
45931 + if (to_page > left)
45932 + to_page = left;
45933 + page = jnode_page(jnodes[i]);
45934 + if (page_offset(page) < inode->i_size &&
45935 + !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
45936 + /*
45937 + * the above is not optimal for partial write to last
45938 + * page of file when file size is not at boundary of
45939 + * page
45940 + */
45941 + lock_page(page);
45942 + if (!PageUptodate(page)) {
45943 + result = readpage_unix_file(NULL, page);
45944 + BUG_ON(result != 0);
45945 + /* wait for read completion */
45946 + lock_page(page);
45947 + BUG_ON(!PageUptodate(page));
45948 + } else
45949 + result = 0;
45950 + unlock_page(page);
45951 + }
45952 +
45953 + BUG_ON(get_current_context()->trans->atom != NULL);
45954 + fault_in_pages_readable(buf, to_page);
45955 + BUG_ON(get_current_context()->trans->atom != NULL);
45956 +
45957 + lock_page(page);
45958 + if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
45959 + simple_prepare_write(file, page, page_off,
45960 + page_off + to_page);
45961 +
45962 + written = filemap_copy_from_user(page, page_off, buf, to_page);
45963 + if (unlikely(written != to_page)) {
45964 + unlock_page(page);
45965 + result = RETERR(-EFAULT);
45966 + break;
45967 + }
45968 +
45969 + flush_dcache_page(page);
45970 + reiser4_set_page_dirty_internal(page);
45971 + unlock_page(page);
45972 + nr_dirty++;
45973 +
45974 + mark_page_accessed(page);
45975 + SetPageUptodate(page);
45976 +
45977 + if (jnodes[i]->blocknr == 0)
45978 + have_to_update_extent ++;
45979 +
45980 + page_off = 0;
45981 + buf += to_page;
45982 + left -= to_page;
45983 + BUG_ON(get_current_context()->trans->atom != NULL);
45984 + }
45985 +
45986 + if (have_to_update_extent) {
45987 + update_extents(file, jnodes, nr_dirty, *pos);
45988 + } else {
45989 + for (i = 0; i < nr_dirty; i ++) {
45990 + int ret;
45991 + spin_lock_jnode(jnodes[i]);
45992 + ret = reiser4_try_capture(jnodes[i],
45993 + ZNODE_WRITE_LOCK, 0);
45994 + BUG_ON(ret != 0);
45995 + jnode_make_dirty_locked(jnodes[i]);
45996 + spin_unlock_jnode(jnodes[i]);
45997 + }
45998 + }
45999 +out:
46000 + for (i = 0; i < nr_pages; i ++) {
46001 + page_cache_release(jnode_page(jnodes[i]));
46002 + JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
46003 + jput(jnodes[i]);
46004 + }
46005 +
46006 + /* the only errors handled so far is ENOMEM and
46007 + EFAULT on copy_from_user */
46008 +
46009 + return (count - left) ? (count - left) : result;
46010 +}
46011 +
46012 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
46013 + struct page *page)
46014 +{
46015 + jnode *j;
46016 + struct address_space *mapping;
46017 + unsigned long index;
46018 + oid_t oid;
46019 + reiser4_block_nr block;
46020 +
46021 + mapping = page->mapping;
46022 + oid = get_inode_oid(mapping->host);
46023 + index = page->index;
46024 +
46025 + switch (state_of_extent(ext)) {
46026 + case HOLE_EXTENT:
46027 + /*
46028 + * it is possible to have hole page with jnode, if page was
46029 + * eflushed previously.
46030 + */
46031 + j = jfind(mapping, index);
46032 + if (j == NULL) {
46033 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46034 + SetPageUptodate(page);
46035 + unlock_page(page);
46036 + return 0;
46037 + }
46038 + spin_lock_jnode(j);
46039 + if (!jnode_page(j)) {
46040 + jnode_attach_page(j, page);
46041 + } else {
46042 + BUG_ON(jnode_page(j) != page);
46043 + assert("vs-1504", jnode_page(j) == page);
46044 + }
46045 + block = *jnode_get_io_block(j);
46046 + spin_unlock_jnode(j);
46047 + if (block == 0) {
46048 + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46049 + SetPageUptodate(page);
46050 + unlock_page(page);
46051 + jput(j);
46052 + return 0;
46053 + }
46054 + break;
46055 +
46056 + case ALLOCATED_EXTENT:
46057 + j = jnode_of_page(page);
46058 + if (IS_ERR(j))
46059 + return PTR_ERR(j);
46060 + if (*jnode_get_block(j) == 0) {
46061 + reiser4_block_nr blocknr;
46062 +
46063 + blocknr = extent_get_start(ext) + pos;
46064 + jnode_set_block(j, &blocknr);
46065 + } else
46066 + assert("vs-1403",
46067 + j->blocknr == extent_get_start(ext) + pos);
46068 + break;
46069 +
46070 + case UNALLOCATED_EXTENT:
46071 + j = jfind(mapping, index);
46072 + assert("nikita-2688", j);
46073 + assert("vs-1426", jnode_page(j) == NULL);
46074 +
46075 + spin_lock_jnode(j);
46076 + jnode_attach_page(j, page);
46077 + spin_unlock_jnode(j);
46078 + break;
46079 +
46080 + default:
46081 + warning("vs-957", "wrong extent\n");
46082 + return RETERR(-EIO);
46083 + }
46084 +
46085 + BUG_ON(j == 0);
46086 + reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
46087 + jput(j);
46088 + return 0;
46089 +}
46090 +
46091 +/* Implements plugin->u.item.s.file.read operation for extent items. */
46092 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
46093 +{
46094 + int result;
46095 + struct page *page;
46096 + unsigned long cur_page, next_page;
46097 + unsigned long page_off, count;
46098 + struct address_space *mapping;
46099 + loff_t file_off;
46100 + uf_coord_t *uf_coord;
46101 + coord_t *coord;
46102 + struct extent_coord_extension *ext_coord;
46103 + unsigned long nr_pages;
46104 + char *kaddr;
46105 +
46106 + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
46107 + assert("vs-572", flow->user == 1);
46108 + assert("vs-1351", flow->length > 0);
46109 +
46110 + uf_coord = &hint->ext_coord;
46111 +
46112 + check_uf_coord(uf_coord, NULL);
46113 + assert("vs-33", uf_coord->lh == &hint->lh);
46114 +
46115 + coord = &uf_coord->coord;
46116 + assert("vs-1119", znode_is_rlocked(coord->node));
46117 + assert("vs-1120", znode_is_loaded(coord->node));
46118 + assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
46119 +
46120 + mapping = file->f_dentry->d_inode->i_mapping;
46121 + ext_coord = &uf_coord->extension.extent;
46122 +
46123 + /* offset in a file to start read from */
46124 + file_off = get_key_offset(&flow->key);
46125 + /* offset within the page to start read from */
46126 + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
46127 + /* bytes which can be read from the page which contains file_off */
46128 + count = PAGE_CACHE_SIZE - page_off;
46129 +
46130 + /* index of page containing offset read is to start from */
46131 + cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
46132 + next_page = cur_page;
46133 + /* number of pages flow spans over */
46134 + nr_pages =
46135 + ((file_off + flow->length + PAGE_CACHE_SIZE -
46136 + 1) >> PAGE_CACHE_SHIFT) - cur_page;
46137 +
46138 + /* we start having twig node read locked. However, we do not want to
46139 + keep that lock all the time readahead works. So, set a sel and
46140 + release twig node. */
46141 + reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
46142 + /* &hint->lh is done-ed */
46143 +
46144 + do {
46145 + reiser4_txn_restart_current();
46146 + page = read_mapping_page(mapping, cur_page, file);
46147 + if (IS_ERR(page))
46148 + return PTR_ERR(page);
46149 + lock_page(page);
46150 + if (!PageUptodate(page)) {
46151 + unlock_page(page);
46152 + page_cache_release(page);
46153 + warning("jmacd-97178", "extent_read: page is not up to date");
46154 + return RETERR(-EIO);
46155 + }
46156 + mark_page_accessed(page);
46157 + unlock_page(page);
46158 +
46159 + /* If users can be writing to this page using arbitrary virtual
46160 + addresses, take care about potential aliasing before reading
46161 + the page on the kernel side.
46162 + */
46163 + if (mapping_writably_mapped(mapping))
46164 + flush_dcache_page(page);
46165 +
46166 + assert("nikita-3034", reiser4_schedulable());
46167 +
46168 + /* number of bytes which are to be read from the page */
46169 + if (count > flow->length)
46170 + count = flow->length;
46171 +
46172 + result = fault_in_pages_writeable(flow->data, count);
46173 + if (result) {
46174 + page_cache_release(page);
46175 + return RETERR(-EFAULT);
46176 + }
46177 +
46178 + kaddr = kmap_atomic(page, KM_USER0);
46179 + result = __copy_to_user_inatomic(flow->data,
46180 + kaddr + page_off, count);
46181 + kunmap_atomic(kaddr, KM_USER0);
46182 + if (result != 0) {
46183 + kaddr = kmap(page);
46184 + result = __copy_to_user(flow->data, kaddr + page_off, count);
46185 + kunmap(page);
46186 + if (unlikely(result))
46187 + return RETERR(-EFAULT);
46188 + }
46189 +
46190 + page_cache_release(page);
46191 +
46192 + /* increase key (flow->key), update user area pointer (flow->data) */
46193 + move_flow_forward(flow, count);
46194 +
46195 + page_off = 0;
46196 + cur_page ++;
46197 + count = PAGE_CACHE_SIZE;
46198 + nr_pages--;
46199 + } while (flow->length);
46200 +
46201 + return 0;
46202 +}
46203 +
46204 +/*
46205 + plugin->s.file.readpage
46206 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
46207 + or
46208 + filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent
46209 +
46210 + At the beginning: coord->node is read locked, zloaded, page is
46211 + locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index)
46212 +*/
46213 +int reiser4_readpage_extent(void *vp, struct page *page)
46214 +{
46215 + uf_coord_t *uf_coord = vp;
46216 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
46217 + ON_DEBUG(reiser4_key key);
46218 +
46219 + assert("vs-1040", PageLocked(page));
46220 + assert("vs-1050", !PageUptodate(page));
46221 + assert("vs-1039", page->mapping && page->mapping->host);
46222 +
46223 + assert("vs-1044", znode_is_loaded(coord->node));
46224 + assert("vs-758", item_is_extent(coord));
46225 + assert("vs-1046", coord_is_existing_unit(coord));
46226 + assert("vs-1045", znode_is_rlocked(coord->node));
46227 + assert("vs-1047",
46228 + page->mapping->host->i_ino ==
46229 + get_key_objectid(item_key_by_coord(coord, &key)));
46230 + check_uf_coord(uf_coord, NULL);
46231 +
46232 + return reiser4_do_readpage_extent(
46233 + ext_by_ext_coord(uf_coord),
46234 + uf_coord->extension.extent.pos_in_unit, page);
46235 +}
46236 +
46237 +/**
46238 + * get_block_address_extent
46239 + * @coord:
46240 + * @block:
46241 + * @result:
46242 + *
46243 + *
46244 + */
46245 +int get_block_address_extent(const coord_t *coord, sector_t block,
46246 + sector_t *result)
46247 +{
46248 + reiser4_extent *ext;
46249 +
46250 + if (!coord_is_existing_unit(coord))
46251 + return RETERR(-EINVAL);
46252 +
46253 + ext = extent_by_coord(coord);
46254 +
46255 + if (state_of_extent(ext) != ALLOCATED_EXTENT)
46256 + /* FIXME: bad things may happen if it is unallocated extent */
46257 + *result = 0;
46258 + else {
46259 + reiser4_key key;
46260 +
46261 + unit_key_by_coord(coord, &key);
46262 + assert("vs-1645",
46263 + block >= get_key_offset(&key) >> current_blocksize_bits);
46264 + assert("vs-1646",
46265 + block <
46266 + (get_key_offset(&key) >> current_blocksize_bits) +
46267 + extent_get_width(ext));
46268 + *result =
46269 + extent_get_start(ext) + (block -
46270 + (get_key_offset(&key) >>
46271 + current_blocksize_bits));
46272 + }
46273 + return 0;
46274 +}
46275 +
46276 +/*
46277 + plugin->u.item.s.file.append_key
46278 + key of first byte which is the next to last byte by addressed by this extent
46279 +*/
46280 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
46281 +{
46282 + item_key_by_coord(coord, key);
46283 + set_key_offset(key,
46284 + get_key_offset(key) + reiser4_extent_size(coord,
46285 + nr_units_extent
46286 + (coord)));
46287 +
46288 + assert("vs-610", get_key_offset(key)
46289 + && (get_key_offset(key) & (current_blocksize - 1)) == 0);
46290 + return key;
46291 +}
46292 +
46293 +/* plugin->u.item.s.file.init_coord_extension */
46294 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
46295 +{
46296 + coord_t *coord;
46297 + struct extent_coord_extension *ext_coord;
46298 + reiser4_key key;
46299 + loff_t offset;
46300 +
46301 + assert("vs-1295", uf_coord->valid == 0);
46302 +
46303 + coord = &uf_coord->coord;
46304 + assert("vs-1288", coord_is_iplug_set(coord));
46305 + assert("vs-1327", znode_is_loaded(coord->node));
46306 +
46307 + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
46308 + return;
46309 +
46310 + ext_coord = &uf_coord->extension.extent;
46311 + ext_coord->nr_units = nr_units_extent(coord);
46312 + ext_coord->ext_offset =
46313 + (char *)extent_by_coord(coord) - zdata(coord->node);
46314 + ext_coord->width = extent_get_width(extent_by_coord(coord));
46315 + ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
46316 + uf_coord->valid = 1;
46317 +
46318 + /* pos_in_unit is the only uninitialized field in extended coord */
46319 + if (coord->between == AFTER_UNIT) {
46320 + assert("vs-1330",
46321 + coord->unit_pos == nr_units_extent(coord) - 1);
46322 +
46323 + ext_coord->pos_in_unit = ext_coord->width - 1;
46324 + } else {
46325 + /* AT_UNIT */
46326 + unit_key_by_coord(coord, &key);
46327 + offset = get_key_offset(&key);
46328 +
46329 + assert("vs-1328", offset <= lookuped);
46330 + assert("vs-1329",
46331 + lookuped <
46332 + offset + ext_coord->width * current_blocksize);
46333 + ext_coord->pos_in_unit =
46334 + ((lookuped - offset) >> current_blocksize_bits);
46335 + }
46336 +}
46337 +
46338 +/*
46339 + * Local variables:
46340 + * c-indentation-style: "K&R"
46341 + * mode-name: "LC"
46342 + * c-basic-offset: 8
46343 + * tab-width: 8
46344 + * fill-column: 79
46345 + * scroll-step: 1
46346 + * End:
46347 + */
46348 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.23/fs/reiser4/plugin/item/extent_flush_ops.c
46349 --- linux-2.6.23.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
46350 +++ linux-2.6.23/fs/reiser4/plugin/item/extent_flush_ops.c 2007-12-04 16:49:30.000000000 +0300
46351 @@ -0,0 +1,1028 @@
46352 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46353 +
46354 +#include "item.h"
46355 +#include "../../tree.h"
46356 +#include "../../jnode.h"
46357 +#include "../../super.h"
46358 +#include "../../flush.h"
46359 +#include "../../carry.h"
46360 +#include "../object.h"
46361 +
46362 +#include <linux/pagemap.h>
46363 +
46364 +static reiser4_block_nr extent_unit_start(const coord_t * item);
46365 +
46366 +/* Return either first or last extent (depending on @side) of the item
46367 + @coord is set to. Set @pos_in_unit either to first or to last block
46368 + of extent. */
46369 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
46370 + reiser4_block_nr * pos_in_unit)
46371 +{
46372 + reiser4_extent *ext;
46373 +
46374 + if (side == LEFT_SIDE) {
46375 + /* get first extent of item */
46376 + ext = extent_item(coord);
46377 + *pos_in_unit = 0;
46378 + } else {
46379 + /* get last extent of item and last position within it */
46380 + assert("vs-363", side == RIGHT_SIDE);
46381 + ext = extent_item(coord) + coord_last_unit_pos(coord);
46382 + *pos_in_unit = extent_get_width(ext) - 1;
46383 + }
46384 +
46385 + return ext;
46386 +}
46387 +
46388 +/* item_plugin->f.utmost_child */
46389 +/* Return the child. Coord is set to extent item. Find jnode corresponding
46390 + either to first or to last unformatted node pointed by the item */
46391 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
46392 +{
46393 + reiser4_extent *ext;
46394 + reiser4_block_nr pos_in_unit;
46395 +
46396 + ext = extent_utmost_ext(coord, side, &pos_in_unit);
46397 +
46398 + switch (state_of_extent(ext)) {
46399 + case HOLE_EXTENT:
46400 + *childp = NULL;
46401 + return 0;
46402 + case ALLOCATED_EXTENT:
46403 + case UNALLOCATED_EXTENT:
46404 + break;
46405 + default:
46406 + /* this should never happen */
46407 + assert("vs-1417", 0);
46408 + }
46409 +
46410 + {
46411 + reiser4_key key;
46412 + reiser4_tree *tree;
46413 + unsigned long index;
46414 +
46415 + if (side == LEFT_SIDE) {
46416 + /* get key of first byte addressed by the extent */
46417 + item_key_by_coord(coord, &key);
46418 + } else {
46419 + /* get key of byte which next after last byte addressed by the extent */
46420 + append_key_extent(coord, &key);
46421 + }
46422 +
46423 + assert("vs-544",
46424 + (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
46425 + /* index of first or last (depending on @side) page addressed
46426 + by the extent */
46427 + index =
46428 + (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
46429 + if (side == RIGHT_SIDE)
46430 + index--;
46431 +
46432 + tree = coord->node->zjnode.tree;
46433 + *childp = jlookup(tree, get_key_objectid(&key), index);
46434 + }
46435 +
46436 + return 0;
46437 +}
46438 +
46439 +/* item_plugin->f.utmost_child_real_block */
46440 +/* Return the child's block, if allocated. */
46441 +int
46442 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
46443 + reiser4_block_nr * block)
46444 +{
46445 + reiser4_extent *ext;
46446 +
46447 + ext = extent_by_coord(coord);
46448 +
46449 + switch (state_of_extent(ext)) {
46450 + case ALLOCATED_EXTENT:
46451 + *block = extent_get_start(ext);
46452 + if (side == RIGHT_SIDE)
46453 + *block += extent_get_width(ext) - 1;
46454 + break;
46455 + case HOLE_EXTENT:
46456 + case UNALLOCATED_EXTENT:
46457 + *block = 0;
46458 + break;
46459 + default:
46460 + /* this should never happen */
46461 + assert("vs-1418", 0);
46462 + }
46463 +
46464 + return 0;
46465 +}
46466 +
46467 +/* item_plugin->f.scan */
46468 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
46469 + This scan continues, advancing the parent coordinate, until either it encounters a
46470 + formatted child or it finishes scanning this node.
46471 +
46472 + If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
46473 + not sure this is last property (same atom) is enforced, but it should be the case since
46474 + one atom must write the parent and the others must read the parent, thus fusing?). In
46475 + any case, the code below asserts this case for unallocated extents. Unallocated
46476 + extents are thus optimized because we can skip to the endpoint when scanning.
46477 +
46478 + It returns control to reiser4_scan_extent, handles these terminating conditions,
46479 + e.g., by loading the next twig.
46480 +*/
46481 +int reiser4_scan_extent(flush_scan * scan)
46482 +{
46483 + coord_t coord;
46484 + jnode *neighbor;
46485 + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
46486 + reiser4_block_nr unit_start;
46487 + __u64 oid;
46488 + reiser4_key key;
46489 + int ret = 0, allocated, incr;
46490 + reiser4_tree *tree;
46491 +
46492 + if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
46493 + scan->stop = 1;
46494 + return 0; /* Race with truncate, this node is already
46495 + * truncated. */
46496 + }
46497 +
46498 + coord_dup(&coord, &scan->parent_coord);
46499 +
46500 + assert("jmacd-1404", !reiser4_scan_finished(scan));
46501 + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
46502 + assert("jmacd-1406", jnode_is_unformatted(scan->node));
46503 +
46504 + /* The scan_index variable corresponds to the current page index of the
46505 + unformatted block scan position. */
46506 + scan_index = index_jnode(scan->node);
46507 +
46508 + assert("jmacd-7889", item_is_extent(&coord));
46509 +
46510 + repeat:
46511 + /* objectid of file */
46512 + oid = get_key_objectid(item_key_by_coord(&coord, &key));
46513 +
46514 + allocated = !extent_is_unallocated(&coord);
46515 + /* Get the values of this extent unit: */
46516 + unit_index = extent_unit_index(&coord);
46517 + unit_width = extent_unit_width(&coord);
46518 + unit_start = extent_unit_start(&coord);
46519 +
46520 + assert("jmacd-7187", unit_width > 0);
46521 + assert("jmacd-7188", scan_index >= unit_index);
46522 + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
46523 +
46524 + /* Depending on the scan direction, we set different maximum values for scan_index
46525 + (scan_max) and the number of nodes that would be passed if the scan goes the
46526 + entire way (scan_dist). Incr is an integer reflecting the incremental
46527 + direction of scan_index. */
46528 + if (reiser4_scanning_left(scan)) {
46529 + scan_max = unit_index;
46530 + scan_dist = scan_index - unit_index;
46531 + incr = -1;
46532 + } else {
46533 + scan_max = unit_index + unit_width - 1;
46534 + scan_dist = scan_max - unit_index;
46535 + incr = +1;
46536 + }
46537 +
46538 + tree = coord.node->zjnode.tree;
46539 +
46540 + /* If the extent is allocated we have to check each of its blocks. If the extent
46541 + is unallocated we can skip to the scan_max. */
46542 + if (allocated) {
46543 + do {
46544 + neighbor = jlookup(tree, oid, scan_index);
46545 + if (neighbor == NULL)
46546 + goto stop_same_parent;
46547 +
46548 + if (scan->node != neighbor
46549 + && !reiser4_scan_goto(scan, neighbor)) {
46550 + /* @neighbor was jput() by reiser4_scan_goto */
46551 + goto stop_same_parent;
46552 + }
46553 +
46554 + ret = scan_set_current(scan, neighbor, 1, &coord);
46555 + if (ret != 0) {
46556 + goto exit;
46557 + }
46558 +
46559 + /* reference to @neighbor is stored in @scan, no need
46560 + to jput(). */
46561 + scan_index += incr;
46562 +
46563 + } while (incr + scan_max != scan_index);
46564 +
46565 + } else {
46566 + /* Optimized case for unallocated extents, skip to the end. */
46567 + neighbor = jlookup(tree, oid, scan_max /*index */ );
46568 + if (neighbor == NULL) {
46569 + /* Race with truncate */
46570 + scan->stop = 1;
46571 + ret = 0;
46572 + goto exit;
46573 + }
46574 +
46575 + assert("zam-1043",
46576 + reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
46577 +
46578 + ret = scan_set_current(scan, neighbor, scan_dist, &coord);
46579 + if (ret != 0) {
46580 + goto exit;
46581 + }
46582 + }
46583 +
46584 + if (coord_sideof_unit(&coord, scan->direction) == 0
46585 + && item_is_extent(&coord)) {
46586 + /* Continue as long as there are more extent units. */
46587 +
46588 + scan_index =
46589 + extent_unit_index(&coord) +
46590 + (reiser4_scanning_left(scan) ?
46591 + extent_unit_width(&coord) - 1 : 0);
46592 + goto repeat;
46593 + }
46594 +
46595 + if (0) {
46596 + stop_same_parent:
46597 +
46598 + /* If we are scanning left and we stop in the middle of an allocated
46599 + extent, we know the preceder immediately.. */
46600 + /* middle of extent is (scan_index - unit_index) != 0. */
46601 + if (reiser4_scanning_left(scan) &&
46602 + (scan_index - unit_index) != 0) {
46603 + /* FIXME(B): Someone should step-through and verify that this preceder
46604 + calculation is indeed correct. */
46605 + /* @unit_start is starting block (number) of extent
46606 + unit. Flush stopped at the @scan_index block from
46607 + the beginning of the file, which is (scan_index -
46608 + unit_index) block within extent.
46609 + */
46610 + if (unit_start) {
46611 + /* skip preceder update when we are at hole */
46612 + scan->preceder_blk =
46613 + unit_start + scan_index - unit_index;
46614 + check_preceder(scan->preceder_blk);
46615 + }
46616 + }
46617 +
46618 + /* In this case, we leave coord set to the parent of scan->node. */
46619 + scan->stop = 1;
46620 +
46621 + } else {
46622 + /* In this case, we are still scanning, coord is set to the next item which is
46623 + either off-the-end of the node or not an extent. */
46624 + assert("jmacd-8912", scan->stop == 0);
46625 + assert("jmacd-7812",
46626 + (coord_is_after_sideof_unit(&coord, scan->direction)
46627 + || !item_is_extent(&coord)));
46628 + }
46629 +
46630 + ret = 0;
46631 + exit:
46632 + return ret;
46633 +}
46634 +
46635 +/* ask block allocator for some blocks */
46636 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
46637 + reiser4_block_nr wanted_count,
46638 + reiser4_block_nr *first_allocated,
46639 + reiser4_block_nr *allocated,
46640 + block_stage_t block_stage)
46641 +{
46642 + *allocated = wanted_count;
46643 + preceder->max_dist = 0; /* scan whole disk, if needed */
46644 +
46645 + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
46646 + preceder->block_stage = block_stage;
46647 +
46648 + /* FIXME: we do not handle errors here now */
46649 + check_me("vs-420",
46650 + reiser4_alloc_blocks(preceder, first_allocated, allocated,
46651 + BA_PERMANENT) == 0);
46652 + /* update flush_pos's preceder to last allocated block number */
46653 + preceder->blk = *first_allocated + *allocated - 1;
46654 +}
46655 +
46656 +/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent
46657 + will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have
46658 + to add new nodes into tree. Space for that is taken from inviolable reserve (5%). */
46659 +static reiser4_block_nr reserve_replace(void)
46660 +{
46661 + reiser4_block_nr grabbed, needed;
46662 +
46663 + grabbed = get_current_context()->grabbed_blocks;
46664 + needed = estimate_one_insert_into_item(current_tree);
46665 + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
46666 + return grabbed;
46667 +}
46668 +
46669 +static void free_replace_reserved(reiser4_block_nr grabbed)
46670 +{
46671 + reiser4_context *ctx;
46672 +
46673 + ctx = get_current_context();
46674 + grabbed2free(ctx, get_super_private(ctx->super),
46675 + ctx->grabbed_blocks - grabbed);
46676 +}
46677 +
46678 +/* Block offset of first block addressed by unit */
46679 +__u64 extent_unit_index(const coord_t * item)
46680 +{
46681 + reiser4_key key;
46682 +
46683 + assert("vs-648", coord_is_existing_unit(item));
46684 + unit_key_by_coord(item, &key);
46685 + return get_key_offset(&key) >> current_blocksize_bits;
46686 +}
46687 +
46688 +/* AUDIT shouldn't return value be of reiser4_block_nr type?
46689 + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
46690 +__u64 extent_unit_width(const coord_t * item)
46691 +{
46692 + assert("vs-649", coord_is_existing_unit(item));
46693 + return width_by_coord(item);
46694 +}
46695 +
46696 +/* Starting block location of this unit */
46697 +static reiser4_block_nr extent_unit_start(const coord_t * item)
46698 +{
46699 + return extent_get_start(extent_by_coord(item));
46700 +}
46701 +
46702 +/**
46703 + * split_allocated_extent -
46704 + * @coord:
46705 + * @pos_in_unit:
46706 + *
46707 + * replace allocated extent with two allocated extents
46708 + */
46709 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
46710 +{
46711 + int result;
46712 + struct replace_handle *h;
46713 + reiser4_extent *ext;
46714 + reiser4_block_nr grabbed;
46715 +
46716 + ext = extent_by_coord(coord);
46717 + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
46718 + assert("vs-1411", extent_get_width(ext) > pos_in_unit);
46719 +
46720 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46721 + if (h == NULL)
46722 + return RETERR(-ENOMEM);
46723 + h->coord = coord;
46724 + h->lh = znode_lh(coord->node);
46725 + h->pkey = &h->key;
46726 + unit_key_by_coord(coord, h->pkey);
46727 + set_key_offset(h->pkey,
46728 + (get_key_offset(h->pkey) +
46729 + pos_in_unit * current_blocksize));
46730 + reiser4_set_extent(&h->overwrite, extent_get_start(ext),
46731 + pos_in_unit);
46732 + reiser4_set_extent(&h->new_extents[0],
46733 + extent_get_start(ext) + pos_in_unit,
46734 + extent_get_width(ext) - pos_in_unit);
46735 + h->nr_new_extents = 1;
46736 + h->flags = COPI_DONT_SHIFT_LEFT;
46737 + h->paste_key = h->key;
46738 +
46739 + /* reserve space for extent unit paste, @grabbed is reserved before */
46740 + grabbed = reserve_replace();
46741 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46742 + extent */);
46743 + /* restore reserved */
46744 + free_replace_reserved(grabbed);
46745 + kfree(h);
46746 + return result;
46747 +}
46748 +
46749 +/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
46750 + one). Return 1 if it succeeded, 0 - otherwise */
46751 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
46752 + reiser4_extent *replace)
46753 +{
46754 + assert("vs-1415", extent_by_coord(coord) == ext);
46755 +
46756 + if (coord->unit_pos == 0
46757 + || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
46758 + /* @ext either does not exist or is not allocated extent */
46759 + return 0;
46760 + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
46761 + extent_get_start(replace))
46762 + return 0;
46763 +
46764 + /* we can glue, widen previous unit */
46765 + extent_set_width(ext - 1,
46766 + extent_get_width(ext - 1) + extent_get_width(replace));
46767 +
46768 + if (extent_get_width(ext) != extent_get_width(replace)) {
46769 + /* make current extent narrower */
46770 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
46771 + extent_set_start(ext,
46772 + extent_get_start(ext) +
46773 + extent_get_width(replace));
46774 + extent_set_width(ext,
46775 + extent_get_width(ext) -
46776 + extent_get_width(replace));
46777 + } else {
46778 + /* current extent completely glued with its left neighbor, remove it */
46779 + coord_t from, to;
46780 +
46781 + coord_dup(&from, coord);
46782 + from.unit_pos = nr_units_extent(coord) - 1;
46783 + coord_dup(&to, &from);
46784 +
46785 + /* currently cut from extent can cut either from the beginning or from the end. Move place which got
46786 + freed after unit removal to end of item */
46787 + memmove(ext, ext + 1,
46788 + (from.unit_pos -
46789 + coord->unit_pos) * sizeof(reiser4_extent));
46790 + /* wipe part of item which is going to be cut, so that node_check will not be confused */
46791 + cut_node_content(&from, &to, NULL, NULL, NULL);
46792 + }
46793 + znode_make_dirty(coord->node);
46794 + /* move coord back */
46795 + coord->unit_pos--;
46796 + return 1;
46797 +}
46798 +
46799 +/**
46800 + * conv_extent - replace extent with 2 ones
46801 + * @coord: coordinate of extent to be replaced
46802 + * @replace: extent to overwrite the one @coord is set to
46803 + *
46804 + * Overwrites extent @coord is set to and paste one extent unit after
46805 + * overwritten one if @replace is shorter than initial extent
46806 + */
46807 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
46808 +{
46809 + int result;
46810 + struct replace_handle *h;
46811 + reiser4_extent *ext;
46812 + reiser4_block_nr start, width, new_width;
46813 + reiser4_block_nr grabbed;
46814 + extent_state state;
46815 +
46816 + ext = extent_by_coord(coord);
46817 + state = state_of_extent(ext);
46818 + start = extent_get_start(ext);
46819 + width = extent_get_width(ext);
46820 + new_width = extent_get_width(replace);
46821 +
46822 + assert("vs-1458", (state == UNALLOCATED_EXTENT ||
46823 + state == ALLOCATED_EXTENT));
46824 + assert("vs-1459", width >= new_width);
46825 +
46826 + if (try_to_merge_with_left(coord, ext, replace)) {
46827 + /* merged @replace with left neighbor. Current unit is either
46828 + removed or narrowed */
46829 + return 0;
46830 + }
46831 +
46832 + if (width == new_width) {
46833 + /* replace current extent with @replace */
46834 + *ext = *replace;
46835 + znode_make_dirty(coord->node);
46836 + return 0;
46837 + }
46838 +
46839 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46840 + if (h == NULL)
46841 + return RETERR(-ENOMEM);
46842 + h->coord = coord;
46843 + h->lh = znode_lh(coord->node);
46844 + h->pkey = &h->key;
46845 + unit_key_by_coord(coord, h->pkey);
46846 + set_key_offset(h->pkey,
46847 + (get_key_offset(h->pkey) + new_width * current_blocksize));
46848 + h->overwrite = *replace;
46849 +
46850 + /* replace @ext with @replace and padding extent */
46851 + reiser4_set_extent(&h->new_extents[0],
46852 + (state == ALLOCATED_EXTENT) ?
46853 + (start + new_width) :
46854 + UNALLOCATED_EXTENT_START,
46855 + width - new_width);
46856 + h->nr_new_extents = 1;
46857 + h->flags = COPI_DONT_SHIFT_LEFT;
46858 + h->paste_key = h->key;
46859 +
46860 + /* reserve space for extent unit paste, @grabbed is reserved before */
46861 + grabbed = reserve_replace();
46862 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46863 + extent */);
46864 +
46865 + /* restore reserved */
46866 + free_replace_reserved(grabbed);
46867 + kfree(h);
46868 + return result;
46869 +}
46870 +
46871 +/**
46872 + * assign_real_blocknrs
46873 + * @flush_pos:
46874 + * @oid: objectid of file jnodes to assign block number to belongs to
46875 + * @index: first jnode on the range
46876 + * @count: number of jnodes to assign block numbers to
46877 + * @first: start of allocated block range
46878 + *
46879 + * Assigns block numbers to each of @count jnodes. Index of first jnode is
46880 + * @index. Jnodes get lookuped with jlookup.
46881 + */
46882 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
46883 + unsigned long index, reiser4_block_nr count,
46884 + reiser4_block_nr first)
46885 +{
46886 + unsigned long i;
46887 + reiser4_tree *tree;
46888 + txn_atom *atom;
46889 + int nr;
46890 +
46891 + atom = atom_locked_by_fq(flush_pos->fq);
46892 + assert("vs-1468", atom);
46893 + BUG_ON(atom == NULL);
46894 +
46895 + nr = 0;
46896 + tree = current_tree;
46897 + for (i = 0; i < count; ++i, ++index) {
46898 + jnode *node;
46899 +
46900 + node = jlookup(tree, oid, index);
46901 + assert("", node != NULL);
46902 + BUG_ON(node == NULL);
46903 +
46904 + spin_lock_jnode(node);
46905 + assert("", !jnode_is_flushprepped(node));
46906 + assert("vs-1475", node->atom == atom);
46907 + assert("vs-1476", atomic_read(&node->x_count) > 0);
46908 +
46909 + JF_CLR(node, JNODE_FLUSH_RESERVED);
46910 + jnode_set_block(node, &first);
46911 + unformatted_make_reloc(node, flush_pos->fq);
46912 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
46913 + FQ_LIST, 0));
46914 + spin_unlock_jnode(node);
46915 + first++;
46916 +
46917 + atomic_dec(&node->x_count);
46918 + nr ++;
46919 + }
46920 +
46921 + spin_unlock_atom(atom);
46922 + return;
46923 +}
46924 +
46925 +/**
46926 + * make_node_ovrwr - assign node to overwrite set
46927 + * @jnodes: overwrite set list head
46928 + * @node: jnode to belong to overwrite set
46929 + *
46930 + * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
46931 + * which is an accumulator for nodes before they get to overwrite set list of
46932 + * atom.
46933 + */
46934 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
46935 +{
46936 + spin_lock_jnode(node);
46937 +
46938 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
46939 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
46940 +
46941 + JF_SET(node, JNODE_OVRWR);
46942 + list_move_tail(&node->capture_link, jnodes);
46943 + ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
46944 +
46945 + spin_unlock_jnode(node);
46946 +}
46947 +
46948 +/**
46949 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
46950 + * @flush_pos: flush position
46951 + * @oid: objectid of file jnodes belong to
46952 + * @index: starting index
46953 + * @width: extent width
46954 + *
46955 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
46956 + * overwrite set. Starting from the one with index @index. If end of slum is
46957 + * detected (node is not found or flushprepped) - stop iterating and set flush
46958 + * position's state to POS_INVALID.
46959 + */
46960 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
46961 + unsigned long index, reiser4_block_nr width)
46962 +{
46963 + unsigned long i;
46964 + reiser4_tree *tree;
46965 + jnode *node;
46966 + txn_atom *atom;
46967 + LIST_HEAD(jnodes);
46968 +
46969 + tree = current_tree;
46970 +
46971 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
46972 + assert("vs-1478", atom);
46973 +
46974 + for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
46975 + node = jlookup(tree, oid, index);
46976 + if (!node) {
46977 + flush_pos->state = POS_INVALID;
46978 + break;
46979 + }
46980 + if (jnode_check_flushprepped(node)) {
46981 + flush_pos->state = POS_INVALID;
46982 + atomic_dec(&node->x_count);
46983 + break;
46984 + }
46985 + if (node->atom != atom) {
46986 + flush_pos->state = POS_INVALID;
46987 + atomic_dec(&node->x_count);
46988 + break;
46989 + }
46990 + make_node_ovrwr(&jnodes, node);
46991 + atomic_dec(&node->x_count);
46992 + }
46993 +
46994 + list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
46995 + spin_unlock_atom(atom);
46996 +}
46997 +
46998 +/**
46999 + * allocated_extent_slum_size
47000 + * @flush_pos:
47001 + * @oid:
47002 + * @index:
47003 + * @count:
47004 + *
47005 + *
47006 + */
47007 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
47008 + unsigned long index, unsigned long count)
47009 +{
47010 + unsigned long i;
47011 + reiser4_tree *tree;
47012 + txn_atom *atom;
47013 + int nr;
47014 +
47015 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47016 + assert("vs-1468", atom);
47017 +
47018 + nr = 0;
47019 + tree = current_tree;
47020 + for (i = 0; i < count; ++i, ++index) {
47021 + jnode *node;
47022 +
47023 + node = jlookup(tree, oid, index);
47024 + if (!node)
47025 + break;
47026 +
47027 + if (jnode_check_flushprepped(node)) {
47028 + atomic_dec(&node->x_count);
47029 + break;
47030 + }
47031 +
47032 + if (node->atom != atom) {
47033 + /*
47034 + * this is possible on overwrite: extent_write may
47035 + * capture several unformatted nodes without capturing
47036 + * any formatted nodes.
47037 + */
47038 + atomic_dec(&node->x_count);
47039 + break;
47040 + }
47041 +
47042 + assert("vs-1476", atomic_read(&node->x_count) > 1);
47043 + atomic_dec(&node->x_count);
47044 + nr ++;
47045 + }
47046 +
47047 + spin_unlock_atom(atom);
47048 + return nr;
47049 +}
47050 +
47051 +/**
47052 + * alloc_extent
47053 + * @flush_pos:
47054 + *
47055 + *
47056 + * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord
47057 + * is set to. It is to prepare for flushing sequence of not flushprepped nodes
47058 + * (slum). It supposes that slum starts at flush_pos->pos_in_unit position
47059 + * within the extent. Slum gets to relocate set if flush_pos->leaf_relocate is
47060 + * set to 1 and to overwrite set otherwise
47061 + */
47062 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
47063 +{
47064 + coord_t *coord;
47065 + reiser4_extent *ext;
47066 + reiser4_extent replace_ext;
47067 + oid_t oid;
47068 + reiser4_block_nr protected;
47069 + reiser4_block_nr start;
47070 + __u64 index;
47071 + __u64 width;
47072 + extent_state state;
47073 + int result;
47074 + reiser4_block_nr first_allocated;
47075 + __u64 allocated;
47076 + reiser4_key key;
47077 + block_stage_t block_stage;
47078 +
47079 + assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
47080 + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
47081 + && item_is_extent(&flush_pos->coord));
47082 +
47083 + coord = &flush_pos->coord;
47084 +
47085 + ext = extent_by_coord(coord);
47086 + state = state_of_extent(ext);
47087 + if (state == HOLE_EXTENT) {
47088 + flush_pos->state = POS_INVALID;
47089 + return 0;
47090 + }
47091 +
47092 + item_key_by_coord(coord, &key);
47093 + oid = get_key_objectid(&key);
47094 + index = extent_unit_index(coord) + flush_pos->pos_in_unit;
47095 + start = extent_get_start(ext);
47096 + width = extent_get_width(ext);
47097 +
47098 + assert("vs-1457", width > flush_pos->pos_in_unit);
47099 +
47100 + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
47101 + /* relocate */
47102 + if (flush_pos->pos_in_unit) {
47103 + /* split extent unit into two */
47104 + result =
47105 + split_allocated_extent(coord,
47106 + flush_pos->pos_in_unit);
47107 + flush_pos->pos_in_unit = 0;
47108 + return result;
47109 + }
47110 +
47111 + /* limit number of nodes to allocate */
47112 + if (flush_pos->nr_to_write < width)
47113 + width = flush_pos->nr_to_write;
47114 +
47115 + if (state == ALLOCATED_EXTENT) {
47116 + /*
47117 + * all protected nodes are not flushprepped, therefore
47118 + * they are counted as flush_reserved
47119 + */
47120 + block_stage = BLOCK_FLUSH_RESERVED;
47121 + protected = allocated_extent_slum_size(flush_pos, oid,
47122 + index, width);
47123 + if (protected == 0) {
47124 + flush_pos->state = POS_INVALID;
47125 + flush_pos->pos_in_unit = 0;
47126 + return 0;
47127 + }
47128 + } else {
47129 + block_stage = BLOCK_UNALLOCATED;
47130 + protected = width;
47131 + }
47132 +
47133 + /*
47134 + * look at previous unit if possible. If it is allocated, make
47135 + * preceder more precise
47136 + */
47137 + if (coord->unit_pos &&
47138 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47139 + reiser4_pos_hint(flush_pos)->blk =
47140 + extent_get_start(ext - 1) +
47141 + extent_get_width(ext - 1);
47142 +
47143 + /* allocate new block numbers for protected nodes */
47144 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47145 + protected,
47146 + &first_allocated, &allocated,
47147 + block_stage);
47148 +
47149 + if (state == ALLOCATED_EXTENT)
47150 + /*
47151 + * on relocating - free nodes which are going to be
47152 + * relocated
47153 + */
47154 + reiser4_dealloc_blocks(&start, &allocated,
47155 + BLOCK_ALLOCATED, BA_DEFER);
47156 +
47157 + /* assign new block numbers to protected nodes */
47158 + assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
47159 +
47160 + /* prepare extent which will replace current one */
47161 + reiser4_set_extent(&replace_ext, first_allocated, allocated);
47162 +
47163 + /* adjust extent item */
47164 + result = conv_extent(coord, &replace_ext);
47165 + if (result != 0 && result != -ENOMEM) {
47166 + warning("vs-1461",
47167 + "Failed to allocate extent. Should not happen\n");
47168 + return result;
47169 + }
47170 +
47171 + /*
47172 + * break flush: we prepared for flushing as many blocks as we
47173 + * were asked for
47174 + */
47175 + if (flush_pos->nr_to_write == allocated)
47176 + flush_pos->state = POS_INVALID;
47177 + } else {
47178 + /* overwrite */
47179 + mark_jnodes_overwrite(flush_pos, oid, index, width);
47180 + }
47181 + flush_pos->pos_in_unit = 0;
47182 + return 0;
47183 +}
47184 +
47185 +/* if @key is glueable to the item @coord is set to */
47186 +static int must_insert(const coord_t *coord, const reiser4_key *key)
47187 +{
47188 + reiser4_key last;
47189 +
47190 + if (item_id_by_coord(coord) == EXTENT_POINTER_ID
47191 + && keyeq(append_key_extent(coord, &last), key))
47192 + return 0;
47193 + return 1;
47194 +}
47195 +
47196 +/* copy extent @copy to the end of @node. It may have to either insert new item after the last one, or append last item,
47197 + or modify last unit of last item to have greater width */
47198 +static int put_unit_to_end(znode *node, const reiser4_key *key,
47199 + reiser4_extent *copy_ext)
47200 +{
47201 + int result;
47202 + coord_t coord;
47203 + cop_insert_flag flags;
47204 + reiser4_extent *last_ext;
47205 + reiser4_item_data data;
47206 +
47207 + /* set coord after last unit in an item */
47208 + coord_init_last_unit(&coord, node);
47209 + coord.between = AFTER_UNIT;
47210 +
47211 + flags =
47212 + COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
47213 + if (must_insert(&coord, key)) {
47214 + result =
47215 + insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
47216 + key, NULL /*lh */ , flags);
47217 +
47218 + } else {
47219 + /* try to glue with last unit */
47220 + last_ext = extent_by_coord(&coord);
47221 + if (state_of_extent(last_ext) &&
47222 + extent_get_start(last_ext) + extent_get_width(last_ext) ==
47223 + extent_get_start(copy_ext)) {
47224 + /* widen last unit of node */
47225 + extent_set_width(last_ext,
47226 + extent_get_width(last_ext) +
47227 + extent_get_width(copy_ext));
47228 + znode_make_dirty(node);
47229 + return 0;
47230 + }
47231 +
47232 + /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
47233 + result =
47234 + insert_into_item(&coord, NULL /*lh */ , key,
47235 + init_new_extent(&data, copy_ext, 1),
47236 + flags);
47237 + }
47238 +
47239 + assert("vs-438", result == 0 || result == -E_NODE_FULL);
47240 + return result;
47241 +}
47242 +
47243 +/* @coord is set to extent unit */
47244 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
47245 + flush_pos_t *flush_pos,
47246 + reiser4_key *stop_key)
47247 +{
47248 + reiser4_extent *ext;
47249 + __u64 index;
47250 + __u64 width;
47251 + reiser4_block_nr start;
47252 + extent_state state;
47253 + oid_t oid;
47254 + reiser4_block_nr first_allocated;
47255 + __u64 allocated;
47256 + __u64 protected;
47257 + reiser4_extent copy_extent;
47258 + reiser4_key key;
47259 + int result;
47260 + block_stage_t block_stage;
47261 +
47262 + assert("vs-1457", flush_pos->pos_in_unit == 0);
47263 + assert("vs-1467", coord_is_leftmost_unit(coord));
47264 + assert("vs-1467", item_is_extent(coord));
47265 +
47266 + ext = extent_by_coord(coord);
47267 + index = extent_unit_index(coord);
47268 + start = extent_get_start(ext);
47269 + width = extent_get_width(ext);
47270 + state = state_of_extent(ext);
47271 + unit_key_by_coord(coord, &key);
47272 + oid = get_key_objectid(&key);
47273 +
47274 + if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
47275 + (state == UNALLOCATED_EXTENT)) {
47276 + /* relocate */
47277 + if (state == ALLOCATED_EXTENT) {
47278 + /* all protected nodes are not flushprepped, therefore
47279 + * they are counted as flush_reserved */
47280 + block_stage = BLOCK_FLUSH_RESERVED;
47281 + protected = allocated_extent_slum_size(flush_pos, oid,
47282 + index, width);
47283 + if (protected == 0) {
47284 + flush_pos->state = POS_INVALID;
47285 + flush_pos->pos_in_unit = 0;
47286 + return 0;
47287 + }
47288 + } else {
47289 + block_stage = BLOCK_UNALLOCATED;
47290 + protected = width;
47291 + }
47292 +
47293 + /*
47294 + * look at previous unit if possible. If it is allocated, make
47295 + * preceder more precise
47296 + */
47297 + if (coord->unit_pos &&
47298 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47299 + reiser4_pos_hint(flush_pos)->blk =
47300 + extent_get_start(ext - 1) +
47301 + extent_get_width(ext - 1);
47302 +
47303 + /* allocate new block numbers for protected nodes */
47304 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47305 + protected,
47306 + &first_allocated, &allocated,
47307 + block_stage);
47308 +
47309 + /* prepare extent which will be copied to left */
47310 + reiser4_set_extent(&copy_extent, first_allocated, allocated);
47311 +
47312 + result = put_unit_to_end(left, &key, &copy_extent);
47313 + if (result == -E_NODE_FULL) {
47314 + int target_block_stage;
47315 +
47316 + /* free blocks which were just allocated */
47317 + target_block_stage =
47318 + (state ==
47319 + ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
47320 + BLOCK_UNALLOCATED;
47321 + reiser4_dealloc_blocks(&first_allocated, &allocated,
47322 + target_block_stage,
47323 + BA_PERMANENT);
47324 +
47325 + /* rewind the preceder. */
47326 + flush_pos->preceder.blk = first_allocated;
47327 + check_preceder(flush_pos->preceder.blk);
47328 +
47329 + return SQUEEZE_TARGET_FULL;
47330 + }
47331 +
47332 + if (state == ALLOCATED_EXTENT) {
47333 + /* free nodes which were relocated */
47334 + reiser4_dealloc_blocks(&start, &allocated,
47335 + BLOCK_ALLOCATED, BA_DEFER);
47336 + }
47337 +
47338 + /* assign new block numbers to protected nodes */
47339 + assign_real_blocknrs(flush_pos, oid, index, allocated,
47340 + first_allocated);
47341 +
47342 + set_key_offset(&key,
47343 + get_key_offset(&key) +
47344 + (allocated << current_blocksize_bits));
47345 + } else {
47346 + /*
47347 + * overwrite: try to copy unit as it is to left neighbor and
47348 + * make all first not flushprepped nodes overwrite nodes
47349 + */
47350 + reiser4_set_extent(&copy_extent, start, width);
47351 + result = put_unit_to_end(left, &key, &copy_extent);
47352 + if (result == -E_NODE_FULL)
47353 + return SQUEEZE_TARGET_FULL;
47354 +
47355 + if (state != HOLE_EXTENT)
47356 + mark_jnodes_overwrite(flush_pos, oid, index, width);
47357 + set_key_offset(&key,
47358 + get_key_offset(&key) +
47359 + (width << current_blocksize_bits));
47360 + }
47361 + *stop_key = key;
47362 + return SQUEEZE_CONTINUE;
47363 +}
47364 +
47365 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
47366 +{
47367 + return key_by_inode_and_offset_common(inode, off, key);
47368 +}
47369 +
47370 +/*
47371 + * Local variables:
47372 + * c-indentation-style: "K&R"
47373 + * mode-name: "LC"
47374 + * c-basic-offset: 8
47375 + * tab-width: 8
47376 + * fill-column: 79
47377 + * scroll-step: 1
47378 + * End:
47379 + */
47380 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent.h linux-2.6.23/fs/reiser4/plugin/item/extent.h
47381 --- linux-2.6.23.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
47382 +++ linux-2.6.23/fs/reiser4/plugin/item/extent.h 2007-12-04 16:49:30.000000000 +0300
47383 @@ -0,0 +1,231 @@
47384 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47385 +
47386 +#ifndef __REISER4_EXTENT_H__
47387 +#define __REISER4_EXTENT_H__
47388 +
47389 +/* on disk extent */
47390 +typedef struct {
47391 + reiser4_dblock_nr start;
47392 + reiser4_dblock_nr width;
47393 +} reiser4_extent;
47394 +
47395 +struct extent_stat {
47396 + int unallocated_units;
47397 + int unallocated_blocks;
47398 + int allocated_units;
47399 + int allocated_blocks;
47400 + int hole_units;
47401 + int hole_blocks;
47402 +};
47403 +
47404 +/* extents in an extent item can be either holes, or unallocated or allocated
47405 + extents */
47406 +typedef enum {
47407 + HOLE_EXTENT,
47408 + UNALLOCATED_EXTENT,
47409 + ALLOCATED_EXTENT
47410 +} extent_state;
47411 +
47412 +#define HOLE_EXTENT_START 0
47413 +#define UNALLOCATED_EXTENT_START 1
47414 +#define UNALLOCATED_EXTENT_START2 2
47415 +
47416 +struct extent_coord_extension {
47417 + reiser4_block_nr pos_in_unit;
47418 + reiser4_block_nr width; /* width of current unit */
47419 + pos_in_node_t nr_units; /* number of units */
47420 + int ext_offset; /* offset from the beginning of zdata() */
47421 + unsigned long expected_page;
47422 +#if REISER4_DEBUG
47423 + reiser4_extent extent;
47424 +#endif
47425 +};
47426 +
47427 +/* macros to set/get fields of on-disk extent */
47428 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47429 +{
47430 + return le64_to_cpu(ext->start);
47431 +}
47432 +
47433 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47434 +{
47435 + return le64_to_cpu(ext->width);
47436 +}
47437 +
47438 +extern __u64 reiser4_current_block_count(void);
47439 +
47440 +static inline void
47441 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47442 +{
47443 + cassert(sizeof(ext->start) == 8);
47444 + assert("nikita-2510",
47445 + ergo(start > 1, start < reiser4_current_block_count()));
47446 + put_unaligned(cpu_to_le64(start), &ext->start);
47447 +}
47448 +
47449 +static inline void
47450 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47451 +{
47452 + cassert(sizeof(ext->width) == 8);
47453 + assert("", width > 0);
47454 + put_unaligned(cpu_to_le64(width), &ext->width);
47455 + assert("nikita-2511",
47456 + ergo(extent_get_start(ext) > 1,
47457 + extent_get_start(ext) + width <=
47458 + reiser4_current_block_count()));
47459 +}
47460 +
47461 +#define extent_item(coord) \
47462 +({ \
47463 + assert("nikita-3143", item_is_extent(coord)); \
47464 + ((reiser4_extent *)item_body_by_coord (coord)); \
47465 +})
47466 +
47467 +#define extent_by_coord(coord) \
47468 +({ \
47469 + assert("nikita-3144", item_is_extent(coord)); \
47470 + (extent_item (coord) + (coord)->unit_pos); \
47471 +})
47472 +
47473 +#define width_by_coord(coord) \
47474 +({ \
47475 + assert("nikita-3145", item_is_extent(coord)); \
47476 + extent_get_width (extent_by_coord(coord)); \
47477 +})
47478 +
47479 +struct carry_cut_data;
47480 +struct carry_kill_data;
47481 +
47482 +/* plugin->u.item.b.* */
47483 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47484 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47485 + const reiser4_item_data *);
47486 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
47487 +pos_in_node_t nr_units_extent(const coord_t *);
47488 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47489 +void init_coord_extent(coord_t *);
47490 +int init_extent(coord_t *, reiser4_item_data *);
47491 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47492 +int can_shift_extent(unsigned free_space,
47493 + coord_t * source, znode * target, shift_direction,
47494 + unsigned *size, unsigned want);
47495 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47496 + unsigned count, shift_direction where_is_free_space,
47497 + unsigned free_space);
47498 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47499 + struct carry_kill_data *);
47500 +int create_hook_extent(const coord_t * coord, void *arg);
47501 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47502 + struct carry_cut_data *, reiser4_key * smallest_removed,
47503 + reiser4_key * new_first);
47504 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47505 + struct carry_kill_data *, reiser4_key * smallest_removed,
47506 + reiser4_key * new_first);
47507 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47508 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47509 +void print_extent(const char *, coord_t *);
47510 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47511 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47512 + reiser4_block_nr * block);
47513 +void item_stat_extent(const coord_t * coord, void *vp);
47514 +int reiser4_check_extent(const coord_t * coord, const char **error);
47515 +
47516 +/* plugin->u.item.s.file.* */
47517 +ssize_t reiser4_write_extent(struct file *, const char __user *,
47518 + size_t, loff_t *);
47519 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
47520 +int reiser4_readpage_extent(void *, struct page *);
47521 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
47522 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47523 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47524 +int get_block_address_extent(const coord_t *, sector_t block,
47525 + sector_t * result);
47526 +
47527 +/* these are used in flush.c
47528 + FIXME-VS: should they be somewhere in item_plugin? */
47529 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47530 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47531 + reiser4_key * stop_key);
47532 +
47533 +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47534 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47535 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47536 +
47537 +/* plugin->u.item.f. */
47538 +int reiser4_scan_extent(flush_scan * scan);
47539 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47540 +
47541 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47542 + int nr_extents);
47543 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
47544 +extent_state state_of_extent(reiser4_extent * ext);
47545 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
47546 + reiser4_block_nr width);
47547 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
47548 + int *plugged_hole);
47549 +
47550 +#include "../../coord.h"
47551 +#include "../../lock.h"
47552 +#include "../../tap.h"
47553 +
47554 +struct replace_handle {
47555 + /* these are to be set before calling reiser4_replace_extent */
47556 + coord_t *coord;
47557 + lock_handle *lh;
47558 + reiser4_key key;
47559 + reiser4_key *pkey;
47560 + reiser4_extent overwrite;
47561 + reiser4_extent new_extents[2];
47562 + int nr_new_extents;
47563 + unsigned flags;
47564 +
47565 + /* these are used by reiser4_replace_extent */
47566 + reiser4_item_data item;
47567 + coord_t coord_after;
47568 + lock_handle lh_after;
47569 + tap_t watch;
47570 + reiser4_key paste_key;
47571 +#if REISER4_DEBUG
47572 + reiser4_extent orig_ext;
47573 + reiser4_key tmp;
47574 +#endif
47575 +};
47576 +
47577 +/* this structure is kmalloced before calling make_extent to avoid excessive
47578 + stack consumption on plug_hole->reiser4_replace_extent */
47579 +struct make_extent_handle {
47580 + uf_coord_t *uf_coord;
47581 + reiser4_block_nr blocknr;
47582 + int created;
47583 + struct inode *inode;
47584 + union {
47585 + struct {
47586 + } append;
47587 + struct replace_handle replace;
47588 + } u;
47589 +};
47590 +
47591 +int reiser4_replace_extent(struct replace_handle *,
47592 + int return_inserted_position);
47593 +lock_handle *znode_lh(znode *);
47594 +
47595 +/* the reiser4 repacker support */
47596 +struct repacker_cursor;
47597 +extern int process_extent_backward_for_repacking(tap_t *,
47598 + struct repacker_cursor *);
47599 +extern int mark_extent_for_repacking(tap_t *, int);
47600 +
47601 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47602 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47603 +
47604 +/* __REISER4_EXTENT_H__ */
47605 +#endif
47606 +/*
47607 + Local variables:
47608 + c-indentation-style: "K&R"
47609 + mode-name: "LC"
47610 + c-basic-offset: 8
47611 + tab-width: 8
47612 + fill-column: 120
47613 + End:
47614 +*/
47615 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.23/fs/reiser4/plugin/item/extent_item_ops.c
47616 --- linux-2.6.23.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
47617 +++ linux-2.6.23/fs/reiser4/plugin/item/extent_item_ops.c 2007-12-04 16:49:30.000000000 +0300
47618 @@ -0,0 +1,889 @@
47619 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47620 +
47621 +#include "item.h"
47622 +#include "../../inode.h"
47623 +#include "../../tree_walk.h" /* check_sibling_list() */
47624 +#include "../../page_cache.h"
47625 +#include "../../carry.h"
47626 +
47627 +#include <linux/quotaops.h>
47628 +
47629 +/* item_plugin->b.max_key_inside */
47630 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
47631 +{
47632 + item_key_by_coord(coord, key);
47633 + set_key_offset(key, get_key_offset(reiser4_max_key()));
47634 + return key;
47635 +}
47636 +
47637 +/* item_plugin->b.can_contain_key
47638 + this checks whether @key of @data is matching to position set by @coord */
47639 +int
47640 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47641 + const reiser4_item_data * data)
47642 +{
47643 + reiser4_key item_key;
47644 +
47645 + if (item_plugin_by_coord(coord) != data->iplug)
47646 + return 0;
47647 +
47648 + item_key_by_coord(coord, &item_key);
47649 + if (get_key_locality(key) != get_key_locality(&item_key) ||
47650 + get_key_objectid(key) != get_key_objectid(&item_key) ||
47651 + get_key_ordering(key) != get_key_ordering(&item_key))
47652 + return 0;
47653 +
47654 + return 1;
47655 +}
47656 +
47657 +/* item_plugin->b.mergeable
47658 + first item is of extent type */
47659 +/* Audited by: green(2002.06.13) */
47660 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
47661 +{
47662 + reiser4_key key1, key2;
47663 +
47664 + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
47665 + /* FIXME-VS: Which is it? Assert or return 0 */
47666 + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
47667 + return 0;
47668 + }
47669 +
47670 + item_key_by_coord(p1, &key1);
47671 + item_key_by_coord(p2, &key2);
47672 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
47673 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
47674 + get_key_ordering(&key1) != get_key_ordering(&key2) ||
47675 + get_key_type(&key1) != get_key_type(&key2))
47676 + return 0;
47677 + if (get_key_offset(&key1) +
47678 + reiser4_extent_size(p1, nr_units_extent(p1)) !=
47679 + get_key_offset(&key2))
47680 + return 0;
47681 + return 1;
47682 +}
47683 +
47684 +/* item_plugin->b.nr_units */
47685 +pos_in_node_t nr_units_extent(const coord_t * coord)
47686 +{
47687 + /* length of extent item has to be multiple of extent size */
47688 + assert("vs-1424",
47689 + (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
47690 + return item_length_by_coord(coord) / sizeof(reiser4_extent);
47691 +}
47692 +
47693 +/* item_plugin->b.lookup */
47694 +lookup_result
47695 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
47696 + coord_t * coord)
47697 +{ /* znode and item_pos are
47698 + set to an extent item to
47699 + look through */
47700 + reiser4_key item_key;
47701 + reiser4_block_nr lookuped, offset;
47702 + unsigned i, nr_units;
47703 + reiser4_extent *ext;
47704 + unsigned blocksize;
47705 + unsigned char blocksize_bits;
47706 +
47707 + item_key_by_coord(coord, &item_key);
47708 + offset = get_key_offset(&item_key);
47709 +
47710 + /* key we are looking for must be greater than key of item @coord */
47711 + assert("vs-414", keygt(key, &item_key));
47712 +
47713 + assert("umka-99945",
47714 + !keygt(key, max_key_inside_extent(coord, &item_key)));
47715 +
47716 + ext = extent_item(coord);
47717 + assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
47718 +
47719 + blocksize = current_blocksize;
47720 + blocksize_bits = current_blocksize_bits;
47721 +
47722 + /* offset we are looking for */
47723 + lookuped = get_key_offset(key);
47724 +
47725 + nr_units = nr_units_extent(coord);
47726 + /* go through all extents until the one which address given offset */
47727 + for (i = 0; i < nr_units; i++, ext++) {
47728 + offset += (extent_get_width(ext) << blocksize_bits);
47729 + if (offset > lookuped) {
47730 + /* desired byte is somewhere in this extent */
47731 + coord->unit_pos = i;
47732 + coord->between = AT_UNIT;
47733 + return CBK_COORD_FOUND;
47734 + }
47735 + }
47736 +
47737 + /* set coord after last unit */
47738 + coord->unit_pos = nr_units - 1;
47739 + coord->between = AFTER_UNIT;
47740 + return CBK_COORD_FOUND;
47741 +}
47742 +
47743 +/* item_plugin->b.paste
47744 + item @coord is set to has been appended with @data->length of free
47745 + space. data->data contains data to be pasted into the item in position
47746 + @coord->in_item.unit_pos. It must fit into that free space.
47747 + @coord must be set between units.
47748 +*/
47749 +int
47750 +paste_extent(coord_t * coord, reiser4_item_data * data,
47751 + carry_plugin_info * info UNUSED_ARG)
47752 +{
47753 + unsigned old_nr_units;
47754 + reiser4_extent *ext;
47755 + int item_length;
47756 +
47757 + ext = extent_item(coord);
47758 + item_length = item_length_by_coord(coord);
47759 + old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
47760 +
47761 + /* this is also used to copy extent into newly created item, so
47762 + old_nr_units could be 0 */
47763 + assert("vs-260", item_length >= data->length);
47764 +
47765 + /* make sure that coord is set properly */
47766 + assert("vs-35",
47767 + ((!coord_is_existing_unit(coord))
47768 + || (!old_nr_units && !coord->unit_pos)));
47769 +
47770 + /* first unit to be moved */
47771 + switch (coord->between) {
47772 + case AFTER_UNIT:
47773 + coord->unit_pos++;
47774 + case BEFORE_UNIT:
47775 + coord->between = AT_UNIT;
47776 + break;
47777 + case AT_UNIT:
47778 + assert("vs-331", !old_nr_units && !coord->unit_pos);
47779 + break;
47780 + default:
47781 + impossible("vs-330", "coord is set improperly");
47782 + }
47783 +
47784 + /* prepare space for new units */
47785 + memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
47786 + ext + coord->unit_pos,
47787 + (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
47788 +
47789 + /* copy new data from kernel space */
47790 + assert("vs-556", data->user == 0);
47791 + memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
47792 +
47793 + /* after paste @coord is set to first of pasted units */
47794 + assert("vs-332", coord_is_existing_unit(coord));
47795 + assert("vs-333",
47796 + !memcmp(data->data, extent_by_coord(coord),
47797 + (unsigned)data->length));
47798 + return 0;
47799 +}
47800 +
47801 +/* item_plugin->b.can_shift */
47802 +int
47803 +can_shift_extent(unsigned free_space, coord_t * source,
47804 + znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
47805 + unsigned *size, unsigned want)
47806 +{
47807 + *size = item_length_by_coord(source);
47808 + if (*size > free_space)
47809 + /* never split a unit of extent item */
47810 + *size = free_space - free_space % sizeof(reiser4_extent);
47811 +
47812 + /* we can shift *size bytes, calculate how many do we want to shift */
47813 + if (*size > want * sizeof(reiser4_extent))
47814 + *size = want * sizeof(reiser4_extent);
47815 +
47816 + if (*size % sizeof(reiser4_extent) != 0)
47817 + impossible("vs-119", "Wrong extent size: %i %zd", *size,
47818 + sizeof(reiser4_extent));
47819 + return *size / sizeof(reiser4_extent);
47820 +
47821 +}
47822 +
47823 +/* item_plugin->b.copy_units */
47824 +void
47825 +copy_units_extent(coord_t * target, coord_t * source,
47826 + unsigned from, unsigned count,
47827 + shift_direction where_is_free_space, unsigned free_space)
47828 +{
47829 + char *from_ext, *to_ext;
47830 +
47831 + assert("vs-217", free_space == count * sizeof(reiser4_extent));
47832 +
47833 + from_ext = item_body_by_coord(source);
47834 + to_ext = item_body_by_coord(target);
47835 +
47836 + if (where_is_free_space == SHIFT_LEFT) {
47837 + assert("vs-215", from == 0);
47838 +
47839 + /* At this moment, item length was already updated in the item
47840 + header by shifting code, hence nr_units_extent() will
47841 + return "new" number of units---one we obtain after copying
47842 + units.
47843 + */
47844 + to_ext +=
47845 + (nr_units_extent(target) - count) * sizeof(reiser4_extent);
47846 + } else {
47847 + reiser4_key key;
47848 + coord_t coord;
47849 +
47850 + assert("vs-216",
47851 + from + count == coord_last_unit_pos(source) + 1);
47852 +
47853 + from_ext += item_length_by_coord(source) - free_space;
47854 +
47855 + /* new units are inserted before first unit in an item,
47856 + therefore, we have to update item key */
47857 + coord = *source;
47858 + coord.unit_pos = from;
47859 + unit_key_extent(&coord, &key);
47860 +
47861 + node_plugin_by_node(target->node)->update_item_key(target, &key,
47862 + NULL /*info */);
47863 + }
47864 +
47865 + memcpy(to_ext, from_ext, free_space);
47866 +}
47867 +
47868 +/* item_plugin->b.create_hook
47869 + @arg is znode of leaf node for which we need to update right delimiting key */
47870 +int create_hook_extent(const coord_t * coord, void *arg)
47871 +{
47872 + coord_t *child_coord;
47873 + znode *node;
47874 + reiser4_key key;
47875 + reiser4_tree *tree;
47876 +
47877 + if (!arg)
47878 + return 0;
47879 +
47880 + child_coord = arg;
47881 + tree = znode_get_tree(coord->node);
47882 +
47883 + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
47884 +
47885 + write_lock_tree(tree);
47886 + write_lock_dk(tree);
47887 + /* find a node on the left level for which right delimiting key has to
47888 + be updated */
47889 + if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
47890 + assert("vs-411", znode_is_left_connected(child_coord->node));
47891 + node = child_coord->node->left;
47892 + } else {
47893 + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
47894 + node = child_coord->node;
47895 + assert("nikita-3314", node != NULL);
47896 + }
47897 +
47898 + if (node != NULL) {
47899 + znode_set_rd_key(node, item_key_by_coord(coord, &key));
47900 +
47901 + assert("nikita-3282", check_sibling_list(node));
47902 + /* break sibling links */
47903 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
47904 + ON_DEBUG(node->right->left_version =
47905 + atomic_inc_return(&delim_key_version);
47906 + node->right_version =
47907 + atomic_inc_return(&delim_key_version););
47908 +
47909 + node->right->left = NULL;
47910 + node->right = NULL;
47911 + }
47912 + }
47913 + write_unlock_dk(tree);
47914 + write_unlock_tree(tree);
47915 + return 0;
47916 +}
47917 +
47918 +#define ITEM_TAIL_KILLED 0
47919 +#define ITEM_HEAD_KILLED 1
47920 +#define ITEM_KILLED 2
47921 +
47922 +/* item_plugin->b.kill_hook
47923 + this is called when @count units starting from @from-th one are going to be removed
47924 + */
47925 +int
47926 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
47927 + struct carry_kill_data *kdata)
47928 +{
47929 + reiser4_extent *ext;
47930 + reiser4_block_nr start, length;
47931 + const reiser4_key *pfrom_key, *pto_key;
47932 + struct inode *inode;
47933 + reiser4_tree *tree;
47934 + pgoff_t from_off, to_off, offset, skip;
47935 + int retval;
47936 +
47937 + /* these are located in memory kmalloc-ed by kill_node_content */
47938 + reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
47939 + coord_t *dup, *next;
47940 +
47941 + assert("zam-811", znode_is_write_locked(coord->node));
47942 + assert("nikita-3315", kdata != NULL);
47943 + assert("vs-34", kdata->buf != NULL);
47944 +
47945 + /* map structures to kdata->buf */
47946 + min_item_key = (reiser4_key *) (kdata->buf);
47947 + max_item_key = min_item_key + 1;
47948 + from_key = max_item_key + 1;
47949 + to_key = from_key + 1;
47950 + key = to_key + 1;
47951 + dup = (coord_t *) (key + 1);
47952 + next = dup + 1;
47953 +
47954 + item_key_by_coord(coord, min_item_key);
47955 + max_item_key_by_coord(coord, max_item_key);
47956 +
47957 + if (kdata->params.from_key) {
47958 + pfrom_key = kdata->params.from_key;
47959 + pto_key = kdata->params.to_key;
47960 + } else {
47961 + assert("vs-1549", from == coord->unit_pos);
47962 + unit_key_by_coord(coord, from_key);
47963 + pfrom_key = from_key;
47964 +
47965 + coord_dup(dup, coord);
47966 + dup->unit_pos = from + count - 1;
47967 + max_unit_key_by_coord(dup, to_key);
47968 + pto_key = to_key;
47969 + }
47970 +
47971 + if (!keylt(pto_key, max_item_key)) {
47972 + if (!keygt(pfrom_key, min_item_key)) {
47973 + znode *left, *right;
47974 +
47975 + /* item is to be removed completely */
47976 + assert("nikita-3316", kdata->left != NULL
47977 + && kdata->right != NULL);
47978 +
47979 + left = kdata->left->node;
47980 + right = kdata->right->node;
47981 +
47982 + tree = current_tree;
47983 + /* we have to do two things:
47984 + *
47985 + * 1. link left and right formatted neighbors of
47986 + * extent being removed, and
47987 + *
47988 + * 2. update their delimiting keys.
47989 + *
47990 + * atomicity of these operations is protected by
47991 + * taking dk-lock and tree-lock.
47992 + */
47993 + /* if neighbors of item being removed are znodes -
47994 + * link them */
47995 + write_lock_tree(tree);
47996 + write_lock_dk(tree);
47997 + link_left_and_right(left, right);
47998 + if (left) {
47999 + /* update right delimiting key of left
48000 + * neighbor of extent item */
48001 + /*coord_t next;
48002 + reiser4_key key; */
48003 +
48004 + coord_dup(next, coord);
48005 +
48006 + if (coord_next_item(next))
48007 + *key = *znode_get_rd_key(coord->node);
48008 + else
48009 + item_key_by_coord(next, key);
48010 + znode_set_rd_key(left, key);
48011 + }
48012 + write_unlock_dk(tree);
48013 + write_unlock_tree(tree);
48014 +
48015 + from_off =
48016 + get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
48017 + to_off =
48018 + (get_key_offset(max_item_key) +
48019 + 1) >> PAGE_CACHE_SHIFT;
48020 + retval = ITEM_KILLED;
48021 + } else {
48022 + /* tail of item is to be removed */
48023 + from_off =
48024 + (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
48025 + 1) >> PAGE_CACHE_SHIFT;
48026 + to_off =
48027 + (get_key_offset(max_item_key) +
48028 + 1) >> PAGE_CACHE_SHIFT;
48029 + retval = ITEM_TAIL_KILLED;
48030 + }
48031 + } else {
48032 + /* head of item is to be removed */
48033 + assert("vs-1571", keyeq(pfrom_key, min_item_key));
48034 + assert("vs-1572",
48035 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
48036 + 0);
48037 + assert("vs-1573",
48038 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48039 + 1)) == 0);
48040 +
48041 + if (kdata->left->node) {
48042 + /* update right delimiting key of left neighbor of extent item */
48043 + /*reiser4_key key; */
48044 +
48045 + *key = *pto_key;
48046 + set_key_offset(key, get_key_offset(pto_key) + 1);
48047 +
48048 + write_lock_dk(current_tree);
48049 + znode_set_rd_key(kdata->left->node, key);
48050 + write_unlock_dk(current_tree);
48051 + }
48052 +
48053 + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
48054 + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
48055 + retval = ITEM_HEAD_KILLED;
48056 + }
48057 +
48058 + inode = kdata->inode;
48059 + assert("vs-1545", inode != NULL);
48060 + if (inode != NULL)
48061 + /* take care of pages and jnodes corresponding to part of item being killed */
48062 + reiser4_invalidate_pages(inode->i_mapping, from_off,
48063 + to_off - from_off,
48064 + kdata->params.truncate);
48065 +
48066 + ext = extent_item(coord) + from;
48067 + offset =
48068 + (get_key_offset(min_item_key) +
48069 + reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
48070 +
48071 + assert("vs-1551", from_off >= offset);
48072 + assert("vs-1552", from_off - offset <= extent_get_width(ext));
48073 + skip = from_off - offset;
48074 + offset = from_off;
48075 +
48076 + while (offset < to_off) {
48077 + length = extent_get_width(ext) - skip;
48078 + if (state_of_extent(ext) == HOLE_EXTENT) {
48079 + skip = 0;
48080 + offset += length;
48081 + ext++;
48082 + continue;
48083 + }
48084 +
48085 + if (offset + length > to_off) {
48086 + length = to_off - offset;
48087 + }
48088 +
48089 + DQUOT_FREE_BLOCK_NODIRTY(inode, length);
48090 +
48091 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48092 + /* some jnodes corresponding to this unallocated extent */
48093 + fake_allocated2free(length, 0 /* unformatted */ );
48094 +
48095 + skip = 0;
48096 + offset += length;
48097 + ext++;
48098 + continue;
48099 + }
48100 +
48101 + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
48102 +
48103 + if (length != 0) {
48104 + start = extent_get_start(ext) + skip;
48105 +
48106 + /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
48107 + immediately */
48108 + reiser4_dealloc_blocks(&start, &length,
48109 + 0 /* not used */ ,
48110 + BA_DEFER
48111 + /* unformatted with defer */ );
48112 + }
48113 + skip = 0;
48114 + offset += length;
48115 + ext++;
48116 + }
48117 + return retval;
48118 +}
48119 +
48120 +/* item_plugin->b.kill_units */
48121 +int
48122 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48123 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
48124 + reiser4_key * new_first)
48125 +{
48126 + reiser4_extent *ext;
48127 + reiser4_key item_key;
48128 + pos_in_node_t count;
48129 + reiser4_key from_key, to_key;
48130 + const reiser4_key *pfrom_key, *pto_key;
48131 + loff_t off;
48132 + int result;
48133 +
48134 + assert("vs-1541",
48135 + ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
48136 + || (kdata->params.from_key != NULL
48137 + && kdata->params.to_key != NULL)));
48138 +
48139 + if (kdata->params.from_key) {
48140 + pfrom_key = kdata->params.from_key;
48141 + pto_key = kdata->params.to_key;
48142 + } else {
48143 + coord_t dup;
48144 +
48145 + /* calculate key range of kill */
48146 + assert("vs-1549", from == coord->unit_pos);
48147 + unit_key_by_coord(coord, &from_key);
48148 + pfrom_key = &from_key;
48149 +
48150 + coord_dup(&dup, coord);
48151 + dup.unit_pos = to;
48152 + max_unit_key_by_coord(&dup, &to_key);
48153 + pto_key = &to_key;
48154 + }
48155 +
48156 + item_key_by_coord(coord, &item_key);
48157 +
48158 +#if REISER4_DEBUG
48159 + {
48160 + reiser4_key max_item_key;
48161 +
48162 + max_item_key_by_coord(coord, &max_item_key);
48163 +
48164 + if (new_first) {
48165 + /* head of item is to be cut */
48166 + assert("vs-1542", keyeq(pfrom_key, &item_key));
48167 + assert("vs-1538", keylt(pto_key, &max_item_key));
48168 + } else {
48169 + /* tail of item is to be cut */
48170 + assert("vs-1540", keygt(pfrom_key, &item_key));
48171 + assert("vs-1543", !keylt(pto_key, &max_item_key));
48172 + }
48173 + }
48174 +#endif
48175 +
48176 + if (smallest_removed)
48177 + *smallest_removed = *pfrom_key;
48178 +
48179 + if (new_first) {
48180 + /* item head is cut. Item key will change. This new key is calculated here */
48181 + assert("vs-1556",
48182 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48183 + (PAGE_CACHE_SIZE - 1));
48184 + *new_first = *pto_key;
48185 + set_key_offset(new_first, get_key_offset(new_first) + 1);
48186 + }
48187 +
48188 + count = to - from + 1;
48189 + result = kill_hook_extent(coord, from, count, kdata);
48190 + if (result == ITEM_TAIL_KILLED) {
48191 + assert("vs-1553",
48192 + get_key_offset(pfrom_key) >=
48193 + get_key_offset(&item_key) +
48194 + reiser4_extent_size(coord, from));
48195 + off =
48196 + get_key_offset(pfrom_key) -
48197 + (get_key_offset(&item_key) +
48198 + reiser4_extent_size(coord, from));
48199 + if (off) {
48200 + /* unit @from is to be cut partially. Its width decreases */
48201 + ext = extent_item(coord) + from;
48202 + extent_set_width(ext,
48203 + (off + PAGE_CACHE_SIZE -
48204 + 1) >> PAGE_CACHE_SHIFT);
48205 + count--;
48206 + }
48207 + } else {
48208 + __u64 max_to_offset;
48209 + __u64 rest;
48210 +
48211 + assert("vs-1575", result == ITEM_HEAD_KILLED);
48212 + assert("", from == 0);
48213 + assert("",
48214 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48215 + 1)) == 0);
48216 + assert("",
48217 + get_key_offset(pto_key) + 1 >
48218 + get_key_offset(&item_key) +
48219 + reiser4_extent_size(coord, to));
48220 + max_to_offset =
48221 + get_key_offset(&item_key) +
48222 + reiser4_extent_size(coord, to + 1) - 1;
48223 + assert("", get_key_offset(pto_key) <= max_to_offset);
48224 +
48225 + rest =
48226 + (max_to_offset -
48227 + get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
48228 + if (rest) {
48229 + /* unit @to is to be cut partially */
48230 + ext = extent_item(coord) + to;
48231 +
48232 + assert("", extent_get_width(ext) > rest);
48233 +
48234 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
48235 + extent_set_start(ext,
48236 + extent_get_start(ext) +
48237 + (extent_get_width(ext) -
48238 + rest));
48239 +
48240 + extent_set_width(ext, rest);
48241 + count--;
48242 + }
48243 + }
48244 + return count * sizeof(reiser4_extent);
48245 +}
48246 +
48247 +/* item_plugin->b.cut_units
48248 + this is too similar to kill_units_extent */
48249 +int
48250 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48251 + struct carry_cut_data *cdata, reiser4_key * smallest_removed,
48252 + reiser4_key * new_first)
48253 +{
48254 + reiser4_extent *ext;
48255 + reiser4_key item_key;
48256 + pos_in_node_t count;
48257 + reiser4_key from_key, to_key;
48258 + const reiser4_key *pfrom_key, *pto_key;
48259 + loff_t off;
48260 +
48261 + assert("vs-1541",
48262 + ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
48263 + || (cdata->params.from_key != NULL
48264 + && cdata->params.to_key != NULL)));
48265 +
48266 + if (cdata->params.from_key) {
48267 + pfrom_key = cdata->params.from_key;
48268 + pto_key = cdata->params.to_key;
48269 + } else {
48270 + coord_t dup;
48271 +
48272 + /* calculate key range of kill */
48273 + coord_dup(&dup, coord);
48274 + dup.unit_pos = from;
48275 + unit_key_by_coord(&dup, &from_key);
48276 +
48277 + dup.unit_pos = to;
48278 + max_unit_key_by_coord(&dup, &to_key);
48279 +
48280 + pfrom_key = &from_key;
48281 + pto_key = &to_key;
48282 + }
48283 +
48284 + assert("vs-1555",
48285 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
48286 + assert("vs-1556",
48287 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48288 + (PAGE_CACHE_SIZE - 1));
48289 +
48290 + item_key_by_coord(coord, &item_key);
48291 +
48292 +#if REISER4_DEBUG
48293 + {
48294 + reiser4_key max_item_key;
48295 +
48296 + assert("vs-1584",
48297 + get_key_locality(pfrom_key) ==
48298 + get_key_locality(&item_key));
48299 + assert("vs-1585",
48300 + get_key_type(pfrom_key) == get_key_type(&item_key));
48301 + assert("vs-1586",
48302 + get_key_objectid(pfrom_key) ==
48303 + get_key_objectid(&item_key));
48304 + assert("vs-1587",
48305 + get_key_ordering(pfrom_key) ==
48306 + get_key_ordering(&item_key));
48307 +
48308 + max_item_key_by_coord(coord, &max_item_key);
48309 +
48310 + if (new_first != NULL) {
48311 + /* head of item is to be cut */
48312 + assert("vs-1542", keyeq(pfrom_key, &item_key));
48313 + assert("vs-1538", keylt(pto_key, &max_item_key));
48314 + } else {
48315 + /* tail of item is to be cut */
48316 + assert("vs-1540", keygt(pfrom_key, &item_key));
48317 + assert("vs-1543", keyeq(pto_key, &max_item_key));
48318 + }
48319 + }
48320 +#endif
48321 +
48322 + if (smallest_removed)
48323 + *smallest_removed = *pfrom_key;
48324 +
48325 + if (new_first) {
48326 + /* item head is cut. Item key will change. This new key is calculated here */
48327 + *new_first = *pto_key;
48328 + set_key_offset(new_first, get_key_offset(new_first) + 1);
48329 + }
48330 +
48331 + count = to - from + 1;
48332 +
48333 + assert("vs-1553",
48334 + get_key_offset(pfrom_key) >=
48335 + get_key_offset(&item_key) + reiser4_extent_size(coord, from));
48336 + off =
48337 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
48338 + reiser4_extent_size(coord, from));
48339 + if (off) {
48340 + /* tail of unit @from is to be cut partially. Its width decreases */
48341 + assert("vs-1582", new_first == NULL);
48342 + ext = extent_item(coord) + from;
48343 + extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
48344 + count--;
48345 + }
48346 +
48347 + assert("vs-1554",
48348 + get_key_offset(pto_key) <=
48349 + get_key_offset(&item_key) +
48350 + reiser4_extent_size(coord, to + 1) - 1);
48351 + off =
48352 + (get_key_offset(&item_key) +
48353 + reiser4_extent_size(coord, to + 1) - 1) -
48354 + get_key_offset(pto_key);
48355 + if (off) {
48356 + /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
48357 + and width decreased. */
48358 + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
48359 + ext = extent_item(coord) + to;
48360 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
48361 + extent_set_start(ext,
48362 + extent_get_start(ext) +
48363 + (extent_get_width(ext) -
48364 + (off >> PAGE_CACHE_SHIFT)));
48365 +
48366 + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
48367 + count--;
48368 + }
48369 + return count * sizeof(reiser4_extent);
48370 +}
48371 +
48372 +/* item_plugin->b.unit_key */
48373 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
48374 +{
48375 + assert("vs-300", coord_is_existing_unit(coord));
48376 +
48377 + item_key_by_coord(coord, key);
48378 + set_key_offset(key,
48379 + (get_key_offset(key) +
48380 + reiser4_extent_size(coord, coord->unit_pos)));
48381 +
48382 + return key;
48383 +}
48384 +
48385 +/* item_plugin->b.max_unit_key */
48386 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
48387 +{
48388 + assert("vs-300", coord_is_existing_unit(coord));
48389 +
48390 + item_key_by_coord(coord, key);
48391 + set_key_offset(key,
48392 + (get_key_offset(key) +
48393 + reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
48394 + return key;
48395 +}
48396 +
48397 +/* item_plugin->b.estimate
48398 + item_plugin->b.item_data_by_flow */
48399 +
48400 +#if REISER4_DEBUG
48401 +
48402 +/* item_plugin->b.check
48403 + used for debugging, every item should have here the most complete
48404 + possible check of the consistency of the item that the inventor can
48405 + construct
48406 +*/
48407 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
48408 + const char **error /* where to store error message */)
48409 +{
48410 + reiser4_extent *ext, *first;
48411 + unsigned i, j;
48412 + reiser4_block_nr start, width, blk_cnt;
48413 + unsigned num_units;
48414 + reiser4_tree *tree;
48415 + oid_t oid;
48416 + reiser4_key key;
48417 + coord_t scan;
48418 +
48419 + assert("vs-933", REISER4_DEBUG);
48420 +
48421 + if (znode_get_level(coord->node) != TWIG_LEVEL) {
48422 + *error = "Extent on the wrong level";
48423 + return -1;
48424 + }
48425 + if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
48426 + *error = "Wrong item size";
48427 + return -1;
48428 + }
48429 + ext = first = extent_item(coord);
48430 + blk_cnt = reiser4_block_count(reiser4_get_current_sb());
48431 + num_units = coord_num_units(coord);
48432 + tree = znode_get_tree(coord->node);
48433 + item_key_by_coord(coord, &key);
48434 + oid = get_key_objectid(&key);
48435 + coord_dup(&scan, coord);
48436 +
48437 + for (i = 0; i < num_units; ++i, ++ext) {
48438 + __u64 index;
48439 +
48440 + scan.unit_pos = i;
48441 + index = extent_unit_index(&scan);
48442 +
48443 +#if 0
48444 + /* check that all jnodes are present for the unallocated
48445 + * extent */
48446 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48447 + for (j = 0; j < extent_get_width(ext); j++) {
48448 + jnode *node;
48449 +
48450 + node = jlookup(tree, oid, index + j);
48451 + if (node == NULL) {
48452 + print_coord("scan", &scan, 0);
48453 + *error = "Jnode missing";
48454 + return -1;
48455 + }
48456 + jput(node);
48457 + }
48458 + }
48459 +#endif
48460 +
48461 + start = extent_get_start(ext);
48462 + if (start < 2)
48463 + continue;
48464 + /* extent is allocated one */
48465 + width = extent_get_width(ext);
48466 + if (start >= blk_cnt) {
48467 + *error = "Start too large";
48468 + return -1;
48469 + }
48470 + if (start + width > blk_cnt) {
48471 + *error = "End too large";
48472 + return -1;
48473 + }
48474 + /* make sure that this extent does not overlap with other
48475 + allocated extents extents */
48476 + for (j = 0; j < i; j++) {
48477 + if (state_of_extent(first + j) != ALLOCATED_EXTENT)
48478 + continue;
48479 + if (!
48480 + ((extent_get_start(ext) >=
48481 + extent_get_start(first + j) +
48482 + extent_get_width(first + j))
48483 + || (extent_get_start(ext) +
48484 + extent_get_width(ext) <=
48485 + extent_get_start(first + j)))) {
48486 + *error = "Extent overlaps with others";
48487 + return -1;
48488 + }
48489 + }
48490 +
48491 + }
48492 +
48493 + return 0;
48494 +}
48495 +
48496 +#endif /* REISER4_DEBUG */
48497 +
48498 +/*
48499 + Local variables:
48500 + c-indentation-style: "K&R"
48501 + mode-name: "LC"
48502 + c-basic-offset: 8
48503 + tab-width: 8
48504 + fill-column: 120
48505 + scroll-step: 1
48506 + End:
48507 +*/
48508 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/internal.c linux-2.6.23/fs/reiser4/plugin/item/internal.c
48509 --- linux-2.6.23.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
48510 +++ linux-2.6.23/fs/reiser4/plugin/item/internal.c 2007-12-04 16:49:30.000000000 +0300
48511 @@ -0,0 +1,396 @@
48512 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48513 +
48514 +/* Implementation of internal-item plugin methods. */
48515 +
48516 +#include "../../forward.h"
48517 +#include "../../debug.h"
48518 +#include "../../dformat.h"
48519 +#include "../../key.h"
48520 +#include "../../coord.h"
48521 +#include "internal.h"
48522 +#include "item.h"
48523 +#include "../node/node.h"
48524 +#include "../plugin.h"
48525 +#include "../../jnode.h"
48526 +#include "../../znode.h"
48527 +#include "../../tree_walk.h"
48528 +#include "../../tree_mod.h"
48529 +#include "../../tree.h"
48530 +#include "../../super.h"
48531 +#include "../../block_alloc.h"
48532 +
48533 +/* see internal.h for explanation */
48534 +
48535 +/* plugin->u.item.b.mergeable */
48536 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
48537 + const coord_t * p2 UNUSED_ARG /* second item */ )
48538 +{
48539 + /* internal items are not mergeable */
48540 + return 0;
48541 +}
48542 +
48543 +/* ->lookup() method for internal items */
48544 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
48545 + lookup_bias bias UNUSED_ARG /* lookup bias */ ,
48546 + coord_t * coord /* coord of item */ )
48547 +{
48548 + reiser4_key ukey;
48549 +
48550 + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
48551 + default:
48552 + impossible("", "keycmp()?!");
48553 + case LESS_THAN:
48554 + /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
48555 + item plugin can not be taken using coord set this way */
48556 + assert("vs-681", coord->unit_pos == 0);
48557 + coord->between = AFTER_UNIT;
48558 + case EQUAL_TO:
48559 + return CBK_COORD_FOUND;
48560 + case GREATER_THAN:
48561 + return CBK_COORD_NOTFOUND;
48562 + }
48563 +}
48564 +
48565 +/* return body of internal item at @coord */
48566 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
48567 + * item */ )
48568 +{
48569 + assert("nikita-607", coord != NULL);
48570 + assert("nikita-1650",
48571 + item_plugin_by_coord(coord) ==
48572 + item_plugin_by_id(NODE_POINTER_ID));
48573 + return (internal_item_layout *) item_body_by_coord(coord);
48574 +}
48575 +
48576 +void reiser4_update_internal(const coord_t * coord,
48577 + const reiser4_block_nr * blocknr)
48578 +{
48579 + internal_item_layout *item = internal_at(coord);
48580 + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
48581 +
48582 + put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
48583 +}
48584 +
48585 +/* return child block number stored in the internal item at @coord */
48586 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
48587 +{
48588 + assert("nikita-608", coord != NULL);
48589 + return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
48590 +}
48591 +
48592 +/* get znode pointed to by internal @item */
48593 +static znode *znode_at(const coord_t * item /* coord of item */ ,
48594 + znode * parent /* parent node */ )
48595 +{
48596 + return child_znode(item, parent, 1, 0);
48597 +}
48598 +
48599 +/* store pointer from internal item into "block". Implementation of
48600 + ->down_link() method */
48601 +void down_link_internal(const coord_t * coord /* coord of item */ ,
48602 + const reiser4_key * key UNUSED_ARG /* key to get
48603 + * pointer for */ ,
48604 + reiser4_block_nr * block /* resulting block number */ )
48605 +{
48606 + ON_DEBUG(reiser4_key item_key);
48607 +
48608 + assert("nikita-609", coord != NULL);
48609 + assert("nikita-611", block != NULL);
48610 + assert("nikita-612", (key == NULL) ||
48611 + /* twig horrors */
48612 + (znode_get_level(coord->node) == TWIG_LEVEL)
48613 + || keyle(item_key_by_coord(coord, &item_key), key));
48614 +
48615 + *block = pointer_at(coord);
48616 + assert("nikita-2960", reiser4_blocknr_is_sane(block));
48617 +}
48618 +
48619 +/* Get the child's block number, or 0 if the block is unallocated. */
48620 +int
48621 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
48622 + reiser4_block_nr * block)
48623 +{
48624 + assert("jmacd-2059", coord != NULL);
48625 +
48626 + *block = pointer_at(coord);
48627 + assert("nikita-2961", reiser4_blocknr_is_sane(block));
48628 +
48629 + if (reiser4_blocknr_is_fake(block)) {
48630 + *block = 0;
48631 + }
48632 +
48633 + return 0;
48634 +}
48635 +
48636 +/* Return the child. */
48637 +int
48638 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
48639 + jnode ** childp)
48640 +{
48641 + reiser4_block_nr block = pointer_at(coord);
48642 + znode *child;
48643 +
48644 + assert("jmacd-2059", childp != NULL);
48645 + assert("nikita-2962", reiser4_blocknr_is_sane(&block));
48646 +
48647 + child = zlook(znode_get_tree(coord->node), &block);
48648 +
48649 + if (IS_ERR(child)) {
48650 + return PTR_ERR(child);
48651 + }
48652 +
48653 + *childp = ZJNODE(child);
48654 +
48655 + return 0;
48656 +}
48657 +
48658 +#if REISER4_DEBUG
48659 +
48660 +static void check_link(znode * left, znode * right)
48661 +{
48662 + znode *scan;
48663 +
48664 + for (scan = left; scan != right; scan = scan->right) {
48665 + if (ZF_ISSET(scan, JNODE_RIP))
48666 + break;
48667 + if (znode_is_right_connected(scan) && scan->right != NULL) {
48668 + if (ZF_ISSET(scan->right, JNODE_RIP))
48669 + break;
48670 + assert("nikita-3285",
48671 + znode_is_left_connected(scan->right));
48672 + assert("nikita-3265",
48673 + ergo(scan != left,
48674 + ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
48675 + assert("nikita-3284", scan->right->left == scan);
48676 + } else
48677 + break;
48678 + }
48679 +}
48680 +
48681 +int check__internal(const coord_t * coord, const char **error)
48682 +{
48683 + reiser4_block_nr blk;
48684 + znode *child;
48685 + coord_t cpy;
48686 +
48687 + blk = pointer_at(coord);
48688 + if (!reiser4_blocknr_is_sane(&blk)) {
48689 + *error = "Invalid pointer";
48690 + return -1;
48691 + }
48692 + coord_dup(&cpy, coord);
48693 + child = znode_at(&cpy, cpy.node);
48694 + if (child != NULL) {
48695 + znode *left_child;
48696 + znode *right_child;
48697 +
48698 + left_child = right_child = NULL;
48699 +
48700 + assert("nikita-3256", znode_invariant(child));
48701 + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
48702 + left_child = znode_at(&cpy, cpy.node);
48703 + if (left_child != NULL) {
48704 + read_lock_tree(znode_get_tree(child));
48705 + check_link(left_child, child);
48706 + read_unlock_tree(znode_get_tree(child));
48707 + zput(left_child);
48708 + }
48709 + }
48710 + coord_dup(&cpy, coord);
48711 + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
48712 + right_child = znode_at(&cpy, cpy.node);
48713 + if (right_child != NULL) {
48714 + read_lock_tree(znode_get_tree(child));
48715 + check_link(child, right_child);
48716 + read_unlock_tree(znode_get_tree(child));
48717 + zput(right_child);
48718 + }
48719 + }
48720 + zput(child);
48721 + }
48722 + return 0;
48723 +}
48724 +
48725 +#endif /* REISER4_DEBUG */
48726 +
48727 +/* return true only if this item really points to "block" */
48728 +/* Audited by: green(2002.06.14) */
48729 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
48730 + const reiser4_block_nr * block /* block number to
48731 + * check */ )
48732 +{
48733 + assert("nikita-613", coord != NULL);
48734 + assert("nikita-614", block != NULL);
48735 +
48736 + return pointer_at(coord) == *block;
48737 +}
48738 +
48739 +/* hook called by ->create_item() method of node plugin after new internal
48740 + item was just created.
48741 +
48742 + This is point where pointer to new node is inserted into tree. Initialize
48743 + parent pointer in child znode, insert child into sibling list and slum.
48744 +
48745 +*/
48746 +int create_hook_internal(const coord_t * item /* coord of item */ ,
48747 + void *arg /* child's left neighbor, if any */ )
48748 +{
48749 + znode *child;
48750 + __u64 child_ptr;
48751 +
48752 + assert("nikita-1252", item != NULL);
48753 + assert("nikita-1253", item->node != NULL);
48754 + assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
48755 + assert("nikita-1450", item->unit_pos == 0);
48756 +
48757 + /*
48758 + * preparing to item insertion build_child_ptr_data sets pointer to
48759 + * data to be inserted to jnode's blocknr which is in cpu byte
48760 + * order. Node's create_item simply copied those data. As result we
48761 + * have child pointer in cpu's byte order. Convert content of internal
48762 + * item to little endian byte order.
48763 + */
48764 + child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
48765 + reiser4_update_internal(item, &child_ptr);
48766 +
48767 + child = znode_at(item, item->node);
48768 + if (child != NULL && !IS_ERR(child)) {
48769 + znode *left;
48770 + int result = 0;
48771 + reiser4_tree *tree;
48772 +
48773 + left = arg;
48774 + tree = znode_get_tree(item->node);
48775 + write_lock_tree(tree);
48776 + write_lock_dk(tree);
48777 + assert("nikita-1400", (child->in_parent.node == NULL)
48778 + || (znode_above_root(child->in_parent.node)));
48779 + ++item->node->c_count;
48780 + coord_to_parent_coord(item, &child->in_parent);
48781 + sibling_list_insert_nolock(child, left);
48782 +
48783 + assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
48784 + ZF_CLR(child, JNODE_ORPHAN);
48785 +
48786 + if ((left != NULL) && !keyeq(znode_get_rd_key(left),
48787 + znode_get_rd_key(child))) {
48788 + znode_set_rd_key(child, znode_get_rd_key(left));
48789 + }
48790 + write_unlock_dk(tree);
48791 + write_unlock_tree(tree);
48792 + zput(child);
48793 + return result;
48794 + } else {
48795 + if (child == NULL)
48796 + child = ERR_PTR(-EIO);
48797 + return PTR_ERR(child);
48798 + }
48799 +}
48800 +
48801 +/* hook called by ->cut_and_kill() method of node plugin just before internal
48802 + item is removed.
48803 +
48804 + This is point where empty node is removed from the tree. Clear parent
48805 + pointer in child, and mark node for pending deletion.
48806 +
48807 +   Node will be actually deleted later and in several stages:
48808 +
48809 + . when last lock on this node will be released, node will be removed from
48810 + the sibling list and its lock will be invalidated
48811 +
48812 + . when last reference to this node will be dropped, bitmap will be updated
48813 + and node will be actually removed from the memory.
48814 +
48815 +*/
48816 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
48817 + pos_in_node_t from UNUSED_ARG /* start unit */ ,
48818 + pos_in_node_t count UNUSED_ARG /* stop unit */ ,
48819 + struct carry_kill_data *p UNUSED_ARG)
48820 +{
48821 + znode *child;
48822 +
48823 + assert("nikita-1222", item != NULL);
48824 + assert("nikita-1224", from == 0);
48825 + assert("nikita-1225", count == 1);
48826 +
48827 + child = znode_at(item, item->node);
48828 + if (IS_ERR(child))
48829 + return PTR_ERR(child);
48830 + else if (node_is_empty(child)) {
48831 + reiser4_tree *tree;
48832 +
48833 + assert("nikita-1397", znode_is_write_locked(child));
48834 + assert("nikita-1398", child->c_count == 0);
48835 + assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
48836 +
48837 + tree = znode_get_tree(item->node);
48838 + write_lock_tree(tree);
48839 + init_parent_coord(&child->in_parent, NULL);
48840 + --item->node->c_count;
48841 + write_unlock_tree(tree);
48842 + zput(child);
48843 + return 0;
48844 + } else {
48845 + warning("nikita-1223",
48846 + "Cowardly refuse to remove link to non-empty node");
48847 + zput(child);
48848 + return RETERR(-EIO);
48849 + }
48850 +}
48851 +
48852 +/* hook called by ->shift() node plugin method when internal item was just
48853 + moved from one node to another.
48854 +
48855 + Update parent pointer in child and c_counts in old and new parent
48856 +
48857 +*/
48858 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
48859 + unsigned from UNUSED_ARG /* start unit */ ,
48860 + unsigned count UNUSED_ARG /* stop unit */ ,
48861 + znode * old_node /* old parent */ )
48862 +{
48863 + znode *child;
48864 + znode *new_node;
48865 + reiser4_tree *tree;
48866 +
48867 + assert("nikita-1276", item != NULL);
48868 + assert("nikita-1277", from == 0);
48869 + assert("nikita-1278", count == 1);
48870 + assert("nikita-1451", item->unit_pos == 0);
48871 +
48872 + new_node = item->node;
48873 + assert("nikita-2132", new_node != old_node);
48874 + tree = znode_get_tree(item->node);
48875 + child = child_znode(item, old_node, 1, 0);
48876 + if (child == NULL)
48877 + return 0;
48878 + if (!IS_ERR(child)) {
48879 + write_lock_tree(tree);
48880 + ++new_node->c_count;
48881 + assert("nikita-1395", znode_parent(child) == old_node);
48882 + assert("nikita-1396", old_node->c_count > 0);
48883 + coord_to_parent_coord(item, &child->in_parent);
48884 + assert("nikita-1781", znode_parent(child) == new_node);
48885 + assert("nikita-1782",
48886 + check_tree_pointer(item, child) == NS_FOUND);
48887 + --old_node->c_count;
48888 + write_unlock_tree(tree);
48889 + zput(child);
48890 + return 0;
48891 + } else
48892 + return PTR_ERR(child);
48893 +}
48894 +
48895 +/* plugin->u.item.b.max_key_inside - not defined */
48896 +
48897 +/* plugin->u.item.b.nr_units - item.c:single_unit */
48898 +
48899 +/* Make Linus happy.
48900 + Local variables:
48901 + c-indentation-style: "K&R"
48902 + mode-name: "LC"
48903 + c-basic-offset: 8
48904 + tab-width: 8
48905 + fill-column: 120
48906 + End:
48907 +*/
48908 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/internal.h linux-2.6.23/fs/reiser4/plugin/item/internal.h
48909 --- linux-2.6.23.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
48910 +++ linux-2.6.23/fs/reiser4/plugin/item/internal.h 2007-12-04 16:49:30.000000000 +0300
48911 @@ -0,0 +1,57 @@
48912 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48913 +/* Internal item contains down-link to the child of the internal/twig
48914 + node in a tree. It is internal items that are actually used during
48915 + tree traversal. */
48916 +
48917 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
48918 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
48919 +
48920 +#include "../../forward.h"
48921 +#include "../../dformat.h"
48922 +
48923 +/* on-disk layout of internal item */
48924 +typedef struct internal_item_layout {
48925 + /* 0 */ reiser4_dblock_nr pointer;
48926 + /* 4 */
48927 +} internal_item_layout;
48928 +
48929 +struct cut_list;
48930 +
48931 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
48932 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
48933 + coord_t * coord);
48934 +/* store pointer from internal item into "block". Implementation of
48935 + ->down_link() method */
48936 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
48937 + reiser4_block_nr * block);
48938 +extern int has_pointer_to_internal(const coord_t * coord,
48939 + const reiser4_block_nr * block);
48940 +extern int create_hook_internal(const coord_t * item, void *arg);
48941 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
48942 + pos_in_node_t count, struct carry_kill_data *);
48943 +extern int shift_hook_internal(const coord_t * item, unsigned from,
48944 + unsigned count, znode * old_node);
48945 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
48946 +
48947 +extern int utmost_child_internal(const coord_t * coord, sideof side,
48948 + jnode ** child);
48949 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
48950 + reiser4_block_nr * block);
48951 +
48952 +extern void reiser4_update_internal(const coord_t * coord,
48953 + const reiser4_block_nr * blocknr);
48954 +/* FIXME: reiserfs has check_internal */
48955 +extern int check__internal(const coord_t * coord, const char **error);
48956 +
48957 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
48958 +#endif
48959 +
48960 +/* Make Linus happy.
48961 + Local variables:
48962 + c-indentation-style: "K&R"
48963 + mode-name: "LC"
48964 + c-basic-offset: 8
48965 + tab-width: 8
48966 + fill-column: 120
48967 + End:
48968 +*/
48969 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/item.c linux-2.6.23/fs/reiser4/plugin/item/item.c
48970 --- linux-2.6.23.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
48971 +++ linux-2.6.23/fs/reiser4/plugin/item/item.c 2007-12-04 16:49:30.000000000 +0300
48972 @@ -0,0 +1,719 @@
48973 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48974 +
48975 +/* definition of item plugins. */
48976 +
48977 +#include "../../forward.h"
48978 +#include "../../debug.h"
48979 +#include "../../key.h"
48980 +#include "../../coord.h"
48981 +#include "../plugin_header.h"
48982 +#include "sde.h"
48983 +#include "internal.h"
48984 +#include "item.h"
48985 +#include "static_stat.h"
48986 +#include "../plugin.h"
48987 +#include "../../znode.h"
48988 +#include "../../tree.h"
48989 +#include "../../context.h"
48990 +#include "ctail.h"
48991 +
48992 +/* return pointer to item body */
48993 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
48994 +{
48995 + assert("nikita-324", coord != NULL);
48996 + assert("nikita-325", coord->node != NULL);
48997 + assert("nikita-326", znode_is_loaded(coord->node));
48998 + assert("nikita-3200", coord->offset == INVALID_OFFSET);
48999 +
49000 + coord->offset =
49001 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
49002 + zdata(coord->node);
49003 + ON_DEBUG(coord->body_v = coord->node->times_locked);
49004 +}
49005 +
49006 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
49007 +{
49008 + return zdata(coord->node) + coord->offset;
49009 +}
49010 +
49011 +#if REISER4_DEBUG
49012 +
49013 +int item_body_is_valid(const coord_t * coord)
49014 +{
49015 + return
49016 + coord->offset ==
49017 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
49018 + zdata(coord->node);
49019 +}
49020 +
49021 +#endif
49022 +
49023 +/* return length of item at @coord */
49024 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
49025 +{
49026 + int len;
49027 +
49028 + assert("nikita-327", coord != NULL);
49029 + assert("nikita-328", coord->node != NULL);
49030 + assert("nikita-329", znode_is_loaded(coord->node));
49031 +
49032 + len = node_plugin_by_node(coord->node)->length_by_coord(coord);
49033 + return len;
49034 +}
49035 +
49036 +void obtain_item_plugin(const coord_t * coord)
49037 +{
49038 + assert("nikita-330", coord != NULL);
49039 + assert("nikita-331", coord->node != NULL);
49040 + assert("nikita-332", znode_is_loaded(coord->node));
49041 +
49042 + coord_set_iplug((coord_t *) coord,
49043 + node_plugin_by_node(coord->node)->
49044 + plugin_by_coord(coord));
49045 + assert("nikita-2479",
49046 + coord_iplug(coord) ==
49047 + node_plugin_by_node(coord->node)->plugin_by_coord(coord));
49048 +}
49049 +
49050 +/* return id of item */
49051 +/* Audited by: green(2002.06.15) */
49052 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
49053 +{
49054 + assert("vs-539", coord != NULL);
49055 + assert("vs-538", coord->node != NULL);
49056 + assert("vs-537", znode_is_loaded(coord->node));
49057 + assert("vs-536", item_plugin_by_coord(coord) != NULL);
49058 + assert("vs-540",
49059 + item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
49060 +
49061 + return item_id_by_plugin(item_plugin_by_coord(coord));
49062 +}
49063 +
49064 +/* return key of item at @coord */
49065 +/* Audited by: green(2002.06.15) */
49066 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
49067 + reiser4_key * key /* result */ )
49068 +{
49069 + assert("nikita-338", coord != NULL);
49070 + assert("nikita-339", coord->node != NULL);
49071 + assert("nikita-340", znode_is_loaded(coord->node));
49072 +
49073 + return node_plugin_by_node(coord->node)->key_at(coord, key);
49074 +}
49075 +
49076 +/* this returns max key in the item */
49077 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
49078 + reiser4_key * key /* result */ )
49079 +{
49080 + coord_t last;
49081 +
49082 + assert("nikita-338", coord != NULL);
49083 + assert("nikita-339", coord->node != NULL);
49084 + assert("nikita-340", znode_is_loaded(coord->node));
49085 +
49086 + /* make coord pointing to last item's unit */
49087 + coord_dup(&last, coord);
49088 + last.unit_pos = coord_num_units(&last) - 1;
49089 + assert("vs-1560", coord_is_existing_unit(&last));
49090 +
49091 + max_unit_key_by_coord(&last, key);
49092 + return key;
49093 +}
49094 +
49095 +/* return key of unit at @coord */
49096 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49097 + reiser4_key * key /* result */ )
49098 +{
49099 + assert("nikita-772", coord != NULL);
49100 + assert("nikita-774", coord->node != NULL);
49101 + assert("nikita-775", znode_is_loaded(coord->node));
49102 +
49103 + if (item_plugin_by_coord(coord)->b.unit_key != NULL)
49104 + return item_plugin_by_coord(coord)->b.unit_key(coord, key);
49105 + else
49106 + return item_key_by_coord(coord, key);
49107 +}
49108 +
49109 +/* return the biggest key contained the unit @coord */
49110 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49111 + reiser4_key * key /* result */ )
49112 +{
49113 + assert("nikita-772", coord != NULL);
49114 + assert("nikita-774", coord->node != NULL);
49115 + assert("nikita-775", znode_is_loaded(coord->node));
49116 +
49117 + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
49118 + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
49119 + else
49120 + return unit_key_by_coord(coord, key);
49121 +}
49122 +
49123 +/* ->max_key_inside() method for items consisting of exactly one key (like
49124 + stat-data) */
49125 +static reiser4_key *max_key_inside_single_key(const coord_t *
49126 + coord /* coord of item */ ,
49127 + reiser4_key *
49128 + result /* resulting key */ )
49129 +{
49130 + assert("nikita-604", coord != NULL);
49131 +
49132 + /* coord -> key is starting key of this item and it has to be already
49133 + filled in */
49134 + return unit_key_by_coord(coord, result);
49135 +}
49136 +
49137 +/* ->nr_units() method for items consisting of exactly one unit always */
49138 +pos_in_node_t
49139 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
49140 +{
49141 + return 1;
49142 +}
49143 +
49144 +static int
49145 +paste_no_paste(coord_t * coord UNUSED_ARG,
49146 + reiser4_item_data * data UNUSED_ARG,
49147 + carry_plugin_info * info UNUSED_ARG)
49148 +{
49149 + return 0;
49150 +}
49151 +
49152 +/* default ->fast_paste() method */
49153 +static int
49154 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
49155 +{
49156 + return 1;
49157 +}
49158 +
49159 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
49160 + const reiser4_key * key /* key to check */ ,
49161 + const reiser4_item_data * data /* parameters of item
49162 + * being created */ )
49163 +{
49164 + item_plugin *iplug;
49165 + reiser4_key min_key_in_item;
49166 + reiser4_key max_key_in_item;
49167 +
49168 + assert("nikita-1658", item != NULL);
49169 + assert("nikita-1659", key != NULL);
49170 +
49171 + iplug = item_plugin_by_coord(item);
49172 + if (iplug->b.can_contain_key != NULL)
49173 + return iplug->b.can_contain_key(item, key, data);
49174 + else {
49175 + assert("nikita-1681", iplug->b.max_key_inside != NULL);
49176 + item_key_by_coord(item, &min_key_in_item);
49177 + iplug->b.max_key_inside(item, &max_key_in_item);
49178 +
49179 + /* can contain key if
49180 + min_key_in_item <= key &&
49181 + key <= max_key_in_item
49182 + */
49183 + return keyle(&min_key_in_item, key)
49184 + && keyle(key, &max_key_in_item);
49185 + }
49186 +}
49187 +
49188 +/* mergeable method for non mergeable items */
49189 +static int
49190 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
49191 +{
49192 + return 0;
49193 +}
49194 +
49195 +/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
49196 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
49197 + const coord_t * i2 /* coord of second item */ )
49198 +{
49199 + item_plugin *iplug;
49200 + reiser4_key k1;
49201 + reiser4_key k2;
49202 +
49203 + assert("nikita-1336", i1 != NULL);
49204 + assert("nikita-1337", i2 != NULL);
49205 +
49206 + iplug = item_plugin_by_coord(i1);
49207 + assert("nikita-1338", iplug != NULL);
49208 +
49209 + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
49210 + shifting code when nodes are in "suspended" state. */
49211 + assert("nikita-1663",
49212 + keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
49213 +
49214 + if (iplug->b.mergeable != NULL) {
49215 + return iplug->b.mergeable(i1, i2);
49216 + } else if (iplug->b.max_key_inside != NULL) {
49217 + iplug->b.max_key_inside(i1, &k1);
49218 + item_key_by_coord(i2, &k2);
49219 +
49220 + /* mergeable if ->max_key_inside() >= key of i2; */
49221 + return keyge(iplug->b.max_key_inside(i1, &k1),
49222 + item_key_by_coord(i2, &k2));
49223 + } else {
49224 + item_key_by_coord(i1, &k1);
49225 + item_key_by_coord(i2, &k2);
49226 +
49227 + return
49228 + (get_key_locality(&k1) == get_key_locality(&k2)) &&
49229 + (get_key_objectid(&k1) == get_key_objectid(&k2))
49230 + && (iplug == item_plugin_by_coord(i2));
49231 + }
49232 +}
49233 +
49234 +int item_is_extent(const coord_t * item)
49235 +{
49236 + assert("vs-482", coord_is_existing_item(item));
49237 + return item_id_by_coord(item) == EXTENT_POINTER_ID;
49238 +}
49239 +
49240 +int item_is_tail(const coord_t * item)
49241 +{
49242 + assert("vs-482", coord_is_existing_item(item));
49243 + return item_id_by_coord(item) == FORMATTING_ID;
49244 +}
49245 +
49246 +#if REISER4_DEBUG
49247 +
49248 +int item_is_statdata(const coord_t * item)
49249 +{
49250 + assert("vs-516", coord_is_existing_item(item));
49251 + return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
49252 +}
49253 +
49254 +int item_is_ctail(const coord_t * item)
49255 +{
49256 + assert("edward-xx", coord_is_existing_item(item));
49257 + return item_id_by_coord(item) == CTAIL_ID;
49258 +}
49259 +
49260 +#endif /* REISER4_DEBUG */
49261 +
49262 +static int change_item(struct inode *inode,
49263 + reiser4_plugin * plugin,
49264 + pset_member memb)
49265 +{
49266 + /* cannot change constituent item (sd, or dir_item) */
49267 + return RETERR(-EINVAL);
49268 +}
49269 +
49270 +static reiser4_plugin_ops item_plugin_ops = {
49271 + .init = NULL,
49272 + .load = NULL,
49273 + .save_len = NULL,
49274 + .save = NULL,
49275 + .change = change_item
49276 +};
49277 +
49278 +item_plugin item_plugins[LAST_ITEM_ID] = {
49279 + [STATIC_STAT_DATA_ID] = {
49280 + .h = {
49281 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49282 + .id = STATIC_STAT_DATA_ID,
49283 + .groups = (1 << STAT_DATA_ITEM_TYPE),
49284 + .pops = &item_plugin_ops,
49285 + .label = "sd",
49286 + .desc = "stat-data",
49287 + .linkage = {NULL, NULL}
49288 + },
49289 + .b = {
49290 + .max_key_inside = max_key_inside_single_key,
49291 + .can_contain_key = NULL,
49292 + .mergeable = not_mergeable,
49293 + .nr_units = nr_units_single_unit,
49294 + .lookup = NULL,
49295 + .init = NULL,
49296 + .paste = paste_no_paste,
49297 + .fast_paste = NULL,
49298 + .can_shift = NULL,
49299 + .copy_units = NULL,
49300 + .create_hook = NULL,
49301 + .kill_hook = NULL,
49302 + .shift_hook = NULL,
49303 + .cut_units = NULL,
49304 + .kill_units = NULL,
49305 + .unit_key = NULL,
49306 + .max_unit_key = NULL,
49307 + .estimate = NULL,
49308 + .item_data_by_flow = NULL,
49309 +#if REISER4_DEBUG
49310 + .check = NULL
49311 +#endif
49312 + },
49313 + .f = {
49314 + .utmost_child = NULL,
49315 + .utmost_child_real_block = NULL,
49316 + .update = NULL,
49317 + .scan = NULL,
49318 + .convert = NULL
49319 + },
49320 + .s = {
49321 + .sd = {
49322 + .init_inode = init_inode_static_sd,
49323 + .save_len = save_len_static_sd,
49324 + .save = save_static_sd
49325 + }
49326 + }
49327 + },
49328 + [SIMPLE_DIR_ENTRY_ID] = {
49329 + .h = {
49330 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49331 + .id = SIMPLE_DIR_ENTRY_ID,
49332 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49333 + .pops = &item_plugin_ops,
49334 + .label = "de",
49335 + .desc = "directory entry",
49336 + .linkage = {NULL, NULL}
49337 + },
49338 + .b = {
49339 + .max_key_inside = max_key_inside_single_key,
49340 + .can_contain_key = NULL,
49341 + .mergeable = NULL,
49342 + .nr_units = nr_units_single_unit,
49343 + .lookup = NULL,
49344 + .init = NULL,
49345 + .paste = NULL,
49346 + .fast_paste = NULL,
49347 + .can_shift = NULL,
49348 + .copy_units = NULL,
49349 + .create_hook = NULL,
49350 + .kill_hook = NULL,
49351 + .shift_hook = NULL,
49352 + .cut_units = NULL,
49353 + .kill_units = NULL,
49354 + .unit_key = NULL,
49355 + .max_unit_key = NULL,
49356 + .estimate = NULL,
49357 + .item_data_by_flow = NULL,
49358 +#if REISER4_DEBUG
49359 + .check = NULL
49360 +#endif
49361 + },
49362 + .f = {
49363 + .utmost_child = NULL,
49364 + .utmost_child_real_block = NULL,
49365 + .update = NULL,
49366 + .scan = NULL,
49367 + .convert = NULL
49368 + },
49369 + .s = {
49370 + .dir = {
49371 + .extract_key = extract_key_de,
49372 + .update_key = update_key_de,
49373 + .extract_name = extract_name_de,
49374 + .extract_file_type = extract_file_type_de,
49375 + .add_entry = add_entry_de,
49376 + .rem_entry = rem_entry_de,
49377 + .max_name_len = max_name_len_de
49378 + }
49379 + }
49380 + },
49381 + [COMPOUND_DIR_ID] = {
49382 + .h = {
49383 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49384 + .id = COMPOUND_DIR_ID,
49385 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49386 + .pops = &item_plugin_ops,
49387 + .label = "cde",
49388 + .desc = "compressed directory entry",
49389 + .linkage = {NULL, NULL}
49390 + },
49391 + .b = {
49392 + .max_key_inside = max_key_inside_cde,
49393 + .can_contain_key = can_contain_key_cde,
49394 + .mergeable = mergeable_cde,
49395 + .nr_units = nr_units_cde,
49396 + .lookup = lookup_cde,
49397 + .init = init_cde,
49398 + .paste = paste_cde,
49399 + .fast_paste = agree_to_fast_op,
49400 + .can_shift = can_shift_cde,
49401 + .copy_units = copy_units_cde,
49402 + .create_hook = NULL,
49403 + .kill_hook = NULL,
49404 + .shift_hook = NULL,
49405 + .cut_units = cut_units_cde,
49406 + .kill_units = kill_units_cde,
49407 + .unit_key = unit_key_cde,
49408 + .max_unit_key = unit_key_cde,
49409 + .estimate = estimate_cde,
49410 + .item_data_by_flow = NULL,
49411 +#if REISER4_DEBUG
49412 + .check = reiser4_check_cde
49413 +#endif
49414 + },
49415 + .f = {
49416 + .utmost_child = NULL,
49417 + .utmost_child_real_block = NULL,
49418 + .update = NULL,
49419 + .scan = NULL,
49420 + .convert = NULL
49421 + },
49422 + .s = {
49423 + .dir = {
49424 + .extract_key = extract_key_cde,
49425 + .update_key = update_key_cde,
49426 + .extract_name = extract_name_cde,
49427 + .extract_file_type = extract_file_type_de,
49428 + .add_entry = add_entry_cde,
49429 + .rem_entry = rem_entry_cde,
49430 + .max_name_len = max_name_len_cde
49431 + }
49432 + }
49433 + },
49434 + [NODE_POINTER_ID] = {
49435 + .h = {
49436 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49437 + .id = NODE_POINTER_ID,
49438 + .groups = (1 << INTERNAL_ITEM_TYPE),
49439 + .pops = NULL,
49440 + .label = "internal",
49441 + .desc = "internal item",
49442 + .linkage = {NULL, NULL}
49443 + },
49444 + .b = {
49445 + .max_key_inside = NULL,
49446 + .can_contain_key = NULL,
49447 + .mergeable = mergeable_internal,
49448 + .nr_units = nr_units_single_unit,
49449 + .lookup = lookup_internal,
49450 + .init = NULL,
49451 + .paste = NULL,
49452 + .fast_paste = NULL,
49453 + .can_shift = NULL,
49454 + .copy_units = NULL,
49455 + .create_hook = create_hook_internal,
49456 + .kill_hook = kill_hook_internal,
49457 + .shift_hook = shift_hook_internal,
49458 + .cut_units = NULL,
49459 + .kill_units = NULL,
49460 + .unit_key = NULL,
49461 + .max_unit_key = NULL,
49462 + .estimate = NULL,
49463 + .item_data_by_flow = NULL,
49464 +#if REISER4_DEBUG
49465 + .check = check__internal
49466 +#endif
49467 + },
49468 + .f = {
49469 + .utmost_child = utmost_child_internal,
49470 + .utmost_child_real_block =
49471 + utmost_child_real_block_internal,
49472 + .update = reiser4_update_internal,
49473 + .scan = NULL,
49474 + .convert = NULL
49475 + },
49476 + .s = {
49477 + .internal = {
49478 + .down_link = down_link_internal,
49479 + .has_pointer_to = has_pointer_to_internal
49480 + }
49481 + }
49482 + },
49483 + [EXTENT_POINTER_ID] = {
49484 + .h = {
49485 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49486 + .id = EXTENT_POINTER_ID,
49487 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49488 + .pops = NULL,
49489 + .label = "extent",
49490 + .desc = "extent item",
49491 + .linkage = {NULL, NULL}
49492 + },
49493 + .b = {
49494 + .max_key_inside = max_key_inside_extent,
49495 + .can_contain_key = can_contain_key_extent,
49496 + .mergeable = mergeable_extent,
49497 + .nr_units = nr_units_extent,
49498 + .lookup = lookup_extent,
49499 + .init = NULL,
49500 + .paste = paste_extent,
49501 + .fast_paste = agree_to_fast_op,
49502 + .can_shift = can_shift_extent,
49503 + .create_hook = create_hook_extent,
49504 + .copy_units = copy_units_extent,
49505 + .kill_hook = kill_hook_extent,
49506 + .shift_hook = NULL,
49507 + .cut_units = cut_units_extent,
49508 + .kill_units = kill_units_extent,
49509 + .unit_key = unit_key_extent,
49510 + .max_unit_key = max_unit_key_extent,
49511 + .estimate = NULL,
49512 + .item_data_by_flow = NULL,
49513 +#if REISER4_DEBUG
49514 + .check = reiser4_check_extent
49515 +#endif
49516 + },
49517 + .f = {
49518 + .utmost_child = utmost_child_extent,
49519 + .utmost_child_real_block =
49520 + utmost_child_real_block_extent,
49521 + .update = NULL,
49522 + .scan = reiser4_scan_extent,
49523 + .convert = NULL,
49524 + .key_by_offset = key_by_offset_extent
49525 + },
49526 + .s = {
49527 + .file = {
49528 + .write = reiser4_write_extent,
49529 + .read = reiser4_read_extent,
49530 + .readpage = reiser4_readpage_extent,
49531 + .get_block = get_block_address_extent,
49532 + .append_key = append_key_extent,
49533 + .init_coord_extension =
49534 + init_coord_extension_extent
49535 + }
49536 + }
49537 + },
49538 + [FORMATTING_ID] = {
49539 + .h = {
49540 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49541 + .id = FORMATTING_ID,
49542 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49543 + .pops = NULL,
49544 + .label = "body",
49545 + .desc = "body (or tail?) item",
49546 + .linkage = {NULL, NULL}
49547 + },
49548 + .b = {
49549 + .max_key_inside = max_key_inside_tail,
49550 + .can_contain_key = can_contain_key_tail,
49551 + .mergeable = mergeable_tail,
49552 + .nr_units = nr_units_tail,
49553 + .lookup = lookup_tail,
49554 + .init = NULL,
49555 + .paste = paste_tail,
49556 + .fast_paste = agree_to_fast_op,
49557 + .can_shift = can_shift_tail,
49558 + .create_hook = NULL,
49559 + .copy_units = copy_units_tail,
49560 + .kill_hook = kill_hook_tail,
49561 + .shift_hook = NULL,
49562 + .cut_units = cut_units_tail,
49563 + .kill_units = kill_units_tail,
49564 + .unit_key = unit_key_tail,
49565 + .max_unit_key = unit_key_tail,
49566 + .estimate = NULL,
49567 + .item_data_by_flow = NULL,
49568 +#if REISER4_DEBUG
49569 + .check = NULL
49570 +#endif
49571 + },
49572 + .f = {
49573 + .utmost_child = NULL,
49574 + .utmost_child_real_block = NULL,
49575 + .update = NULL,
49576 + .scan = NULL,
49577 + .convert = NULL
49578 + },
49579 + .s = {
49580 + .file = {
49581 + .write = reiser4_write_tail,
49582 + .read = reiser4_read_tail,
49583 + .readpage = readpage_tail,
49584 + .get_block = get_block_address_tail,
49585 + .append_key = append_key_tail,
49586 + .init_coord_extension =
49587 + init_coord_extension_tail
49588 + }
49589 + }
49590 + },
49591 + [CTAIL_ID] = {
49592 + .h = {
49593 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49594 + .id = CTAIL_ID,
49595 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49596 + .pops = NULL,
49597 + .label = "ctail",
49598 + .desc = "cryptcompress tail item",
49599 + .linkage = {NULL, NULL}
49600 + },
49601 + .b = {
49602 + .max_key_inside = max_key_inside_tail,
49603 + .can_contain_key = can_contain_key_ctail,
49604 + .mergeable = mergeable_ctail,
49605 + .nr_units = nr_units_ctail,
49606 + .lookup = NULL,
49607 + .init = init_ctail,
49608 + .paste = paste_ctail,
49609 + .fast_paste = agree_to_fast_op,
49610 + .can_shift = can_shift_ctail,
49611 + .create_hook = create_hook_ctail,
49612 + .copy_units = copy_units_ctail,
49613 + .kill_hook = kill_hook_ctail,
49614 + .shift_hook = shift_hook_ctail,
49615 + .cut_units = cut_units_ctail,
49616 + .kill_units = kill_units_ctail,
49617 + .unit_key = unit_key_tail,
49618 + .max_unit_key = unit_key_tail,
49619 + .estimate = estimate_ctail,
49620 + .item_data_by_flow = NULL,
49621 +#if REISER4_DEBUG
49622 + .check = check_ctail
49623 +#endif
49624 + },
49625 + .f = {
49626 + .utmost_child = utmost_child_ctail,
49627 + /* FIXME-EDWARD: write this */
49628 + .utmost_child_real_block = NULL,
49629 + .update = NULL,
49630 + .scan = scan_ctail,
49631 + .convert = convert_ctail
49632 + },
49633 + .s = {
49634 + .file = {
49635 + .write = NULL,
49636 + .read = read_ctail,
49637 + .readpage = readpage_ctail,
49638 + .get_block = get_block_address_tail,
49639 + .append_key = append_key_ctail,
49640 + .init_coord_extension =
49641 + init_coord_extension_tail
49642 + }
49643 + }
49644 + },
49645 + [BLACK_BOX_ID] = {
49646 + .h = {
49647 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
49648 + .id = BLACK_BOX_ID,
49649 + .groups = (1 << OTHER_ITEM_TYPE),
49650 + .pops = NULL,
49651 + .label = "blackbox",
49652 + .desc = "black box item",
49653 + .linkage = {NULL, NULL}
49654 + },
49655 + .b = {
49656 + .max_key_inside = NULL,
49657 + .can_contain_key = NULL,
49658 + .mergeable = not_mergeable,
49659 + .nr_units = nr_units_single_unit,
49660 + /* to need for ->lookup method */
49661 + .lookup = NULL,
49662 + .init = NULL,
49663 + .paste = NULL,
49664 + .fast_paste = NULL,
49665 + .can_shift = NULL,
49666 + .copy_units = NULL,
49667 + .create_hook = NULL,
49668 + .kill_hook = NULL,
49669 + .shift_hook = NULL,
49670 + .cut_units = NULL,
49671 + .kill_units = NULL,
49672 + .unit_key = NULL,
49673 + .max_unit_key = NULL,
49674 + .estimate = NULL,
49675 + .item_data_by_flow = NULL,
49676 +#if REISER4_DEBUG
49677 + .check = NULL
49678 +#endif
49679 + }
49680 + }
49681 +};
49682 +
49683 +/* Make Linus happy.
49684 + Local variables:
49685 + c-indentation-style: "K&R"
49686 + mode-name: "LC"
49687 + c-basic-offset: 8
49688 + tab-width: 8
49689 + fill-column: 120
49690 + End:
49691 +*/
49692 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/item.h linux-2.6.23/fs/reiser4/plugin/item/item.h
49693 --- linux-2.6.23.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
49694 +++ linux-2.6.23/fs/reiser4/plugin/item/item.h 2007-12-04 16:49:30.000000000 +0300
49695 @@ -0,0 +1,397 @@
49696 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49697 +
49698 +/* first read balance.c comments before reading this */
49699 +
49700 +/* An item_plugin implements all of the operations required for
49701 + balancing that are item specific. */
49702 +
49703 +/* an item plugin also implements other operations that are specific to that
49704 + item. These go into the item specific operations portion of the item
49705 + handler, and all of the item specific portions of the item handler are put
49706 + into a union. */
49707 +
49708 +#if !defined( __REISER4_ITEM_H__ )
49709 +#define __REISER4_ITEM_H__
49710 +
49711 +#include "../../forward.h"
49712 +#include "../plugin_header.h"
49713 +#include "../../dformat.h"
49714 +#include "../../seal.h"
49715 +#include "../../plugin/file/file.h"
49716 +
49717 +#include <linux/fs.h> /* for struct file, struct inode */
49718 +#include <linux/mm.h> /* for struct page */
49719 +#include <linux/dcache.h> /* for struct dentry */
49720 +
49721 +typedef enum {
49722 + STAT_DATA_ITEM_TYPE,
49723 + DIR_ENTRY_ITEM_TYPE,
49724 + INTERNAL_ITEM_TYPE,
49725 + UNIX_FILE_METADATA_ITEM_TYPE,
49726 + OTHER_ITEM_TYPE
49727 +} item_type_id;
49728 +
49729 +/* this is the part of each item plugin that all items are expected to
49730 + support or at least explicitly fail to support by setting the
49731 + pointer to null. */
49732 +struct balance_ops {
49733 + /* operations called by balancing
49734 +
49735 + It is interesting to consider that some of these item
49736 + operations could be given sources or targets that are not
49737 + really items in nodes. This could be ok/useful.
49738 +
49739 + */
49740 + /* maximal key that can _possibly_ be occupied by this item
49741 +
49742 + When inserting, and node ->lookup() method (called by
49743 + coord_by_key()) reaches an item after binary search,
49744 + the ->max_key_inside() item plugin method is used to determine
49745 + whether new item should pasted into existing item
49746 + (new_key<=max_key_inside()) or new item has to be created
49747 + (new_key>max_key_inside()).
49748 +
49749 + For items that occupy exactly one key (like stat-data)
49750 + this method should return this key. For items that can
49751 + grow indefinitely (extent, directory item) this should
49752 + return reiser4_max_key().
49753 +
49754 + For example extent with the key
49755 +
49756 + (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
49757 +
49758 + ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and
49759 + */
49760 + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
49761 +
49762 + /* true if item @coord can merge data at @key. */
49763 + int (*can_contain_key) (const coord_t *, const reiser4_key *,
49764 + const reiser4_item_data *);
49765 + /* mergeable() - check items for mergeability
49766 +
49767 + Optional method. Returns true if two items can be merged.
49768 +
49769 + */
49770 + int (*mergeable) (const coord_t *, const coord_t *);
49771 +
49772 + /* number of atomic things in an item.
49773 + NOTE FOR CONTRIBUTORS: use a generic method
49774 + nr_units_single_unit() for solid (atomic) items, as
49775 + tree operations use it as a criterion of solidness
49776 + (see is_solid_item macro) */
49777 + pos_in_node_t(*nr_units) (const coord_t *);
49778 +
49779 + /* search within item for a unit within the item, and return a
49780 + pointer to it. This can be used to calculate how many
49781 + bytes to shrink an item if you use pointer arithmetic and
49782 + compare to the start of the item body if the item's data
49783 + are continuous in the node, if the item's data are not
49784 + continuous in the node, all sorts of other things are maybe
49785 + going to break as well. */
49786 + lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
49787 + /* method called by ode_plugin->create_item() to initialise new
49788 + item */
49789 + int (*init) (coord_t * target, coord_t * from,
49790 + reiser4_item_data * data);
49791 + /* method called (e.g., by reiser4_resize_item()) to place new data
49792 + into item when it grows */
49793 + int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
49794 + /* return true if paste into @coord is allowed to skip
49795 + carry. That is, if such paste would require any changes
49796 + at the parent level
49797 + */
49798 + int (*fast_paste) (const coord_t *);
49799 + /* how many but not more than @want units of @source can be
49800 + shifted into @target node. If pend == append - we try to
49801 + append last item of @target by first units of @source. If
49802 + pend == prepend - we try to "prepend" first item in @target
49803 + by last units of @source. @target node has @free_space
49804 + bytes of free space. Total size of those units are returned
49805 + via @size.
49806 +
49807 + @target is not NULL if shifting to the mergeable item and
49808 + NULL is new item will be created during shifting.
49809 + */
49810 + int (*can_shift) (unsigned free_space, coord_t *,
49811 + znode *, shift_direction, unsigned *size,
49812 + unsigned want);
49813 +
49814 + /* starting off @from-th unit of item @source append or
49815 + prepend @count units to @target. @target has been already
49816 + expanded by @free_space bytes. That must be exactly what is
49817 + needed for those items in @target. If @where_is_free_space
49818 + == SHIFT_LEFT - free space is at the end of @target item,
49819 + othersize - it is in the beginning of it. */
49820 + void (*copy_units) (coord_t *, coord_t *,
49821 + unsigned from, unsigned count,
49822 + shift_direction where_is_free_space,
49823 + unsigned free_space);
49824 +
49825 + int (*create_hook) (const coord_t *, void *);
49826 + /* do whatever is necessary to do when @count units starting
49827 + from @from-th one are removed from the tree */
49828 + /* FIXME-VS: this is used to be here for, in particular,
49829 + extents and items of internal type to free blocks they point
49830 + to at the same time with removing items from a
49831 + tree. Problems start, however, when dealloc_block fails due
49832 + to some reason. Item gets removed, but blocks it pointed to
49833 + are not freed. It is not clear how to fix this for items of
49834 + internal type because a need to remove internal item may
49835 + appear in the middle of balancing, and there is no way to
49836 + undo changes made. OTOH, if space allocator involves
49837 + balancing to perform dealloc_block - this will probably
49838 + break balancing due to deadlock issues
49839 + */
49840 + int (*kill_hook) (const coord_t *, pos_in_node_t from,
49841 + pos_in_node_t count, struct carry_kill_data *);
49842 + int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
49843 + znode * _node);
49844 +
49845 + /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
49846 + including boundaries. When units are cut from item beginning - move space which gets freed to head of
49847 + item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
49848 + item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
49849 + @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
49850 + */
49851 + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49852 + struct carry_cut_data *,
49853 + reiser4_key * smallest_removed,
49854 + reiser4_key * new_first_key);
49855 +
49856 + /* like cut_units, except that these units are removed from the
49857 + tree, not only from a node */
49858 + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49859 + struct carry_kill_data *,
49860 + reiser4_key * smallest_removed,
49861 + reiser4_key * new_first);
49862 +
49863 + /* if @key_of_coord == 1 - returned key of coord, otherwise -
49864 + key of unit is returned. If @coord is not set to certain
49865 + unit - ERR_PTR(-ENOENT) is returned */
49866 + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
49867 + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
49868 + /* estimate how much space is needed for paste @data into item at
49869 + @coord. if @coord==0 - estimate insertion, otherwise - estimate
49870 + pasting
49871 + */
49872 + int (*estimate) (const coord_t *, const reiser4_item_data *);
49873 +
49874 + /* converts flow @f to item data. @coord == 0 on insert */
49875 + int (*item_data_by_flow) (const coord_t *, const flow_t *,
49876 + reiser4_item_data *);
49877 +
49878 + /*void (*show) (struct seq_file *, coord_t *); */
49879 +
49880 +#if REISER4_DEBUG
49881 + /* used for debugging, every item should have here the most
49882 + complete possible check of the consistency of the item that
49883 + the inventor can construct */
49884 + int (*check) (const coord_t *, const char **error);
49885 +#endif
49886 +
49887 +};
49888 +
49889 +struct flush_ops {
49890 + /* return the right or left child of @coord, only if it is in memory */
49891 + int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
49892 +
49893 + /* return whether the right or left child of @coord has a non-fake
49894 + block number. */
49895 + int (*utmost_child_real_block) (const coord_t *, sideof side,
49896 + reiser4_block_nr *);
49897 + /* relocate child at @coord to the @block */
49898 + void (*update) (const coord_t *, const reiser4_block_nr *);
49899 + /* count unformatted nodes per item for leave relocation policy, etc.. */
49900 + int (*scan) (flush_scan * scan);
49901 + /* convert item by flush */
49902 + int (*convert) (flush_pos_t * pos);
49903 + /* backward mapping from jnode offset to a key. */
49904 + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
49905 +};
49906 +
49907 +/* operations specific to the directory item */
49908 +struct dir_entry_iops {
49909 + /* extract stat-data key from directory entry at @coord and place it
49910 + into @key. */
49911 + int (*extract_key) (const coord_t *, reiser4_key * key);
49912 + /* update object key in item. */
49913 + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
49914 + /* extract name from directory entry at @coord and return it */
49915 + char *(*extract_name) (const coord_t *, char *buf);
49916 + /* extract file type (DT_* stuff) from directory entry at @coord and
49917 + return it */
49918 + unsigned (*extract_file_type) (const coord_t *);
49919 + int (*add_entry) (struct inode * dir,
49920 + coord_t *, lock_handle *,
49921 + const struct dentry * name,
49922 + reiser4_dir_entry_desc * entry);
49923 + int (*rem_entry) (struct inode * dir, const struct qstr * name,
49924 + coord_t *, lock_handle *,
49925 + reiser4_dir_entry_desc * entry);
49926 + int (*max_name_len) (const struct inode * dir);
49927 +};
49928 +
49929 +/* operations specific to items regular (unix) file metadata are built of */
49930 +struct file_iops{
49931 + int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
49932 + int (*read) (struct file *, flow_t *, hint_t *);
49933 + int (*readpage) (void *, struct page *);
49934 + int (*get_block) (const coord_t *, sector_t, sector_t *);
49935 + /*
49936 + * key of first byte which is not addressed by the item @coord is set
49937 + * to.
49938 + * For example, for extent item with the key
49939 + *
49940 + * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
49941 + *
49942 + * ->append_key is
49943 + *
49944 + * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
49945 + */
49946 + reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
49947 +
49948 + void (*init_coord_extension) (uf_coord_t *, loff_t);
49949 +};
49950 +
49951 +/* operations specific to items of stat data type */
49952 +struct sd_iops {
49953 + int (*init_inode) (struct inode * inode, char *sd, int len);
49954 + int (*save_len) (struct inode * inode);
49955 + int (*save) (struct inode * inode, char **area);
49956 +};
49957 +
49958 +/* operations specific to internal item */
49959 +struct internal_iops{
49960 + /* all tree traversal want to know from internal item is where
49961 + to go next. */
49962 + void (*down_link) (const coord_t * coord,
49963 + const reiser4_key * key, reiser4_block_nr * block);
49964 + /* check that given internal item contains given pointer. */
49965 + int (*has_pointer_to) (const coord_t * coord,
49966 + const reiser4_block_nr * block);
49967 +};
49968 +
49969 +struct item_plugin {
49970 + /* generic fields */
49971 + plugin_header h;
49972 + /* methods common for all item types */
49973 + struct balance_ops b; /* balance operations */
49974 + struct flush_ops f; /* flush operates with items via this methods */
49975 +
49976 + /* methods specific to particular type of item */
49977 + union {
49978 + struct dir_entry_iops dir;
49979 + struct file_iops file;
49980 + struct sd_iops sd;
49981 + struct internal_iops internal;
49982 + } s;
49983 +};
49984 +
49985 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
49986 +
49987 +static inline item_id item_id_by_plugin(item_plugin * plugin)
49988 +{
49989 + return plugin->h.id;
49990 +}
49991 +
49992 +static inline char get_iplugid(item_plugin * iplug)
49993 +{
49994 + assert("nikita-2838", iplug != NULL);
49995 + assert("nikita-2839", iplug->h.id < 0xff);
49996 + return (char)item_id_by_plugin(iplug);
49997 +}
49998 +
49999 +extern unsigned long znode_times_locked(const znode * z);
50000 +
50001 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
50002 +{
50003 + assert("nikita-2837", coord != NULL);
50004 + assert("nikita-2838", iplug != NULL);
50005 + coord->iplugid = get_iplugid(iplug);
50006 + ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
50007 +}
50008 +
50009 +static inline item_plugin *coord_iplug(const coord_t * coord)
50010 +{
50011 + assert("nikita-2833", coord != NULL);
50012 + assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
50013 + assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
50014 + return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
50015 + coord->iplugid);
50016 +}
50017 +
50018 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
50019 + const reiser4_item_data *);
50020 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
50021 +extern int item_is_extent(const coord_t *);
50022 +extern int item_is_tail(const coord_t *);
50023 +extern int item_is_statdata(const coord_t * item);
50024 +extern int item_is_ctail(const coord_t *);
50025 +
50026 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
50027 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
50028 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
50029 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
50030 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
50031 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
50032 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
50033 + reiser4_key * key);
50034 +extern void obtain_item_plugin(const coord_t * coord);
50035 +
50036 +#if defined(REISER4_DEBUG)
50037 +extern int znode_is_loaded(const znode * node);
50038 +#endif
50039 +
50040 +/* return plugin of item at @coord */
50041 +static inline item_plugin *item_plugin_by_coord(const coord_t *
50042 + coord /* coord to query */ )
50043 +{
50044 + assert("nikita-330", coord != NULL);
50045 + assert("nikita-331", coord->node != NULL);
50046 + assert("nikita-332", znode_is_loaded(coord->node));
50047 +
50048 + if (unlikely(!coord_is_iplug_set(coord)))
50049 + obtain_item_plugin(coord);
50050 + return coord_iplug(coord);
50051 +}
50052 +
50053 +/* this returns true if item is of internal type */
50054 +static inline int item_is_internal(const coord_t * item)
50055 +{
50056 + assert("vs-483", coord_is_existing_item(item));
50057 + return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
50058 +}
50059 +
50060 +extern void item_body_by_coord_hard(coord_t * coord);
50061 +extern void *item_body_by_coord_easy(const coord_t * coord);
50062 +#if REISER4_DEBUG
50063 +extern int item_body_is_valid(const coord_t * coord);
50064 +#endif
50065 +
50066 +/* return pointer to item body */
50067 +static inline void *item_body_by_coord(const coord_t *
50068 + coord /* coord to query */ )
50069 +{
50070 + assert("nikita-324", coord != NULL);
50071 + assert("nikita-325", coord->node != NULL);
50072 + assert("nikita-326", znode_is_loaded(coord->node));
50073 +
50074 + if (coord->offset == INVALID_OFFSET)
50075 + item_body_by_coord_hard((coord_t *) coord);
50076 + assert("nikita-3201", item_body_is_valid(coord));
50077 + assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
50078 + return item_body_by_coord_easy(coord);
50079 +}
50080 +
50081 +/* __REISER4_ITEM_H__ */
50082 +#endif
50083 +/* Make Linus happy.
50084 + Local variables:
50085 + c-indentation-style: "K&R"
50086 + mode-name: "LC"
50087 + c-basic-offset: 8
50088 + tab-width: 8
50089 + fill-column: 120
50090 + scroll-step: 1
50091 + End:
50092 +*/
50093 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/Makefile linux-2.6.23/fs/reiser4/plugin/item/Makefile
50094 --- linux-2.6.23.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
50095 +++ linux-2.6.23/fs/reiser4/plugin/item/Makefile 2007-12-04 16:49:30.000000000 +0300
50096 @@ -0,0 +1,18 @@
50097 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
50098 +
50099 +item_plugins-objs := \
50100 + item.o \
50101 + static_stat.o \
50102 + sde.o \
50103 + cde.o \
50104 + blackbox.o \
50105 + internal.o \
50106 + tail.o \
50107 + ctail.o \
50108 + extent.o \
50109 + extent_item_ops.o \
50110 + extent_file_ops.o \
50111 + extent_flush_ops.o
50112 +
50113 +
50114 +
50115 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/sde.c linux-2.6.23/fs/reiser4/plugin/item/sde.c
50116 --- linux-2.6.23.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
50117 +++ linux-2.6.23/fs/reiser4/plugin/item/sde.c 2007-12-04 16:49:30.000000000 +0300
50118 @@ -0,0 +1,190 @@
50119 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50120 +
50121 +/* Directory entry implementation */
50122 +#include "../../forward.h"
50123 +#include "../../debug.h"
50124 +#include "../../dformat.h"
50125 +#include "../../kassign.h"
50126 +#include "../../coord.h"
50127 +#include "sde.h"
50128 +#include "item.h"
50129 +#include "../plugin.h"
50130 +#include "../../znode.h"
50131 +#include "../../carry.h"
50132 +#include "../../tree.h"
50133 +#include "../../inode.h"
50134 +
50135 +#include <linux/fs.h> /* for struct inode */
50136 +#include <linux/dcache.h> /* for struct dentry */
50137 +#include <linux/quotaops.h>
50138 +
50139 +/* ->extract_key() method of simple directory item plugin. */
50140 +int extract_key_de(const coord_t * coord /* coord of item */ ,
50141 + reiser4_key * key /* resulting key */ )
50142 +{
50143 + directory_entry_format *dent;
50144 +
50145 + assert("nikita-1458", coord != NULL);
50146 + assert("nikita-1459", key != NULL);
50147 +
50148 + dent = (directory_entry_format *) item_body_by_coord(coord);
50149 + assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
50150 + return extract_key_from_id(&dent->id, key);
50151 +}
50152 +
50153 +int
50154 +update_key_de(const coord_t * coord, const reiser4_key * key,
50155 + lock_handle * lh UNUSED_ARG)
50156 +{
50157 + directory_entry_format *dent;
50158 + obj_key_id obj_id;
50159 + int result;
50160 +
50161 + assert("nikita-2342", coord != NULL);
50162 + assert("nikita-2343", key != NULL);
50163 +
50164 + dent = (directory_entry_format *) item_body_by_coord(coord);
50165 + result = build_obj_key_id(key, &obj_id);
50166 + if (result == 0) {
50167 + dent->id = obj_id;
50168 + znode_make_dirty(coord->node);
50169 + }
50170 + return 0;
50171 +}
50172 +
50173 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
50174 + char *buf)
50175 +{
50176 + reiser4_key key;
50177 +
50178 + unit_key_by_coord(coord, &key);
50179 + if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
50180 + reiser4_print_address("oops", znode_get_block(coord->node));
50181 + if (!is_longname_key(&key)) {
50182 + if (is_dot_key(&key))
50183 + return (char *)".";
50184 + else
50185 + return extract_name_from_key(&key, buf);
50186 + } else
50187 + return (char *)dent->name;
50188 +}
50189 +
50190 +/* ->extract_name() method of simple directory item plugin. */
50191 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
50192 +{
50193 + directory_entry_format *dent;
50194 +
50195 + assert("nikita-1460", coord != NULL);
50196 +
50197 + dent = (directory_entry_format *) item_body_by_coord(coord);
50198 + return extract_dent_name(coord, dent, buf);
50199 +}
50200 +
50201 +/* ->extract_file_type() method of simple directory item plugin. */
50202 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
50203 + * item */ )
50204 +{
50205 + assert("nikita-1764", coord != NULL);
50206 + /* we don't store file type in the directory entry yet.
50207 +
50208 + But see comments at kassign.h:obj_key_id
50209 + */
50210 + return DT_UNKNOWN;
50211 +}
50212 +
50213 +int add_entry_de(struct inode *dir /* directory of item */ ,
50214 + coord_t * coord /* coord of item */ ,
50215 + lock_handle * lh /* insertion lock handle */ ,
50216 + const struct dentry *de /* name to add */ ,
50217 + reiser4_dir_entry_desc * entry /* parameters of new directory
50218 + * entry */ )
50219 +{
50220 + reiser4_item_data data;
50221 + directory_entry_format *dent;
50222 + int result;
50223 + const char *name;
50224 + int len;
50225 + int longname;
50226 +
50227 + name = de->d_name.name;
50228 + len = de->d_name.len;
50229 + assert("nikita-1163", strlen(name) == len);
50230 +
50231 + longname = is_longname(name, len);
50232 +
50233 + data.length = sizeof *dent;
50234 + if (longname)
50235 + data.length += len + 1;
50236 + data.data = NULL;
50237 + data.user = 0;
50238 + data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
50239 +
50240 + /* NOTE-NIKITA quota plugin */
50241 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
50242 + return -EDQUOT;
50243 +
50244 + result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
50245 + if (result != 0)
50246 + return result;
50247 +
50248 + dent = (directory_entry_format *) item_body_by_coord(coord);
50249 + build_inode_key_id(entry->obj, &dent->id);
50250 + if (longname) {
50251 + memcpy(dent->name, name, len);
50252 + put_unaligned(0, &dent->name[len]);
50253 + }
50254 + return 0;
50255 +}
50256 +
50257 +int rem_entry_de(struct inode *dir /* directory of item */ ,
50258 + const struct qstr *name UNUSED_ARG,
50259 + coord_t * coord /* coord of item */ ,
50260 + lock_handle * lh UNUSED_ARG /* lock handle for
50261 + * removal */ ,
50262 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
50263 + * directory entry
50264 + * being removed */ )
50265 +{
50266 + coord_t shadow;
50267 + int result;
50268 + int length;
50269 +
50270 + length = item_length_by_coord(coord);
50271 + if (inode_get_bytes(dir) < length) {
50272 + warning("nikita-2627", "Dir is broke: %llu: %llu",
50273 + (unsigned long long)get_inode_oid(dir),
50274 + inode_get_bytes(dir));
50275 +
50276 + return RETERR(-EIO);
50277 + }
50278 +
50279 + /* cut_node() is supposed to take pointers to _different_
50280 + coords, because it will modify them without respect to
50281 + possible aliasing. To work around this, create temporary copy
50282 + of @coord.
50283 + */
50284 + coord_dup(&shadow, coord);
50285 + result =
50286 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
50287 + if (result == 0) {
50288 + /* NOTE-NIKITA quota plugin */
50289 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
50290 + }
50291 + return result;
50292 +}
50293 +
50294 +int max_name_len_de(const struct inode *dir)
50295 +{
50296 + return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
50297 + sizeof(directory_entry_format) - 2;
50298 +}
50299 +
50300 +/* Make Linus happy.
50301 + Local variables:
50302 + c-indentation-style: "K&R"
50303 + mode-name: "LC"
50304 + c-basic-offset: 8
50305 + tab-width: 8
50306 + fill-column: 120
50307 + End:
50308 +*/
50309 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/sde.h linux-2.6.23/fs/reiser4/plugin/item/sde.h
50310 --- linux-2.6.23.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
50311 +++ linux-2.6.23/fs/reiser4/plugin/item/sde.h 2007-12-04 16:49:30.000000000 +0300
50312 @@ -0,0 +1,66 @@
50313 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50314 +
50315 +/* Directory entry. */
50316 +
50317 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50318 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50319 +
50320 +#include "../../forward.h"
50321 +#include "../../dformat.h"
50322 +#include "../../kassign.h"
50323 +#include "../../key.h"
50324 +
50325 +#include <linux/fs.h>
50326 +#include <linux/dcache.h> /* for struct dentry */
50327 +
50328 +typedef struct directory_entry_format {
50329 + /* key of object stat-data. It's not necessary to store whole
50330 + key here, because it's always key of stat-data, so minor
50331 + packing locality and offset can be omitted here. But this
50332 + relies on particular key allocation scheme for stat-data, so,
50333 + for extensibility sake, whole key can be stored here.
50334 +
50335 + We store key as array of bytes, because we don't want 8-byte
50336 + alignment of dir entries.
50337 + */
50338 + obj_key_id id;
50339 + /* file name. Null terminated string. */
50340 + d8 name[0];
50341 +} directory_entry_format;
50342 +
50343 +void print_de(const char *prefix, coord_t * coord);
50344 +int extract_key_de(const coord_t * coord, reiser4_key * key);
50345 +int update_key_de(const coord_t * coord, const reiser4_key * key,
50346 + lock_handle * lh);
50347 +char *extract_name_de(const coord_t * coord, char *buf);
50348 +unsigned extract_file_type_de(const coord_t * coord);
50349 +int add_entry_de(struct inode *dir, coord_t * coord,
50350 + lock_handle * lh, const struct dentry *name,
50351 + reiser4_dir_entry_desc * entry);
50352 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
50353 + lock_handle * lh, reiser4_dir_entry_desc * entry);
50354 +int max_name_len_de(const struct inode *dir);
50355 +
50356 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50357 +
50358 +char *extract_dent_name(const coord_t * coord,
50359 + directory_entry_format * dent, char *buf);
50360 +
50361 +#if REISER4_LARGE_KEY
50362 +#define DE_NAME_BUF_LEN (24)
50363 +#else
50364 +#define DE_NAME_BUF_LEN (16)
50365 +#endif
50366 +
50367 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50368 +#endif
50369 +
50370 +/* Make Linus happy.
50371 + Local variables:
50372 + c-indentation-style: "K&R"
50373 + mode-name: "LC"
50374 + c-basic-offset: 8
50375 + tab-width: 8
50376 + fill-column: 120
50377 + End:
50378 +*/
50379 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.23/fs/reiser4/plugin/item/static_stat.c
50380 --- linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
50381 +++ linux-2.6.23/fs/reiser4/plugin/item/static_stat.c 2007-12-04 16:49:30.000000000 +0300
50382 @@ -0,0 +1,1107 @@
50383 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50384 +
50385 +/* stat data manipulation. */
50386 +
50387 +#include "../../forward.h"
50388 +#include "../../super.h"
50389 +#include "../../vfs_ops.h"
50390 +#include "../../inode.h"
50391 +#include "../../debug.h"
50392 +#include "../../dformat.h"
50393 +#include "../object.h"
50394 +#include "../plugin.h"
50395 +#include "../plugin_header.h"
50396 +#include "static_stat.h"
50397 +#include "item.h"
50398 +
50399 +#include <linux/types.h>
50400 +#include <linux/fs.h>
50401 +
50402 +/* see static_stat.h for explanation */
50403 +
50404 +/* helper function used while we are dumping/loading inode/plugin state
50405 + to/from the stat-data. */
50406 +
50407 +static void move_on(int *length /* space remaining in stat-data */ ,
50408 + char **area /* current coord in stat data */ ,
50409 + int size_of /* how many bytes to move forward */ )
50410 +{
50411 + assert("nikita-615", length != NULL);
50412 + assert("nikita-616", area != NULL);
50413 +
50414 + *length -= size_of;
50415 + *area += size_of;
50416 +
50417 + assert("nikita-617", *length >= 0);
50418 +}
50419 +
50420 +/* helper function used while loading inode/plugin state from stat-data.
50421 + Complain if there is less space in stat-data than was expected.
50422 + Can only happen on disk corruption. */
50423 +static int not_enough_space(struct inode *inode /* object being processed */ ,
50424 + const char *where /* error message */ )
50425 +{
50426 + assert("nikita-618", inode != NULL);
50427 +
50428 + warning("nikita-619", "Not enough space in %llu while loading %s",
50429 + (unsigned long long)get_inode_oid(inode), where);
50430 +
50431 + return RETERR(-EINVAL);
50432 +}
50433 +
50434 +/* helper function used while loading inode/plugin state from
50435 + stat-data. Call it if invalid plugin id was found. */
50436 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
50437 + struct inode *inode /* object being processed */ )
50438 +{
50439 + warning("nikita-620", "Unknown plugin %i in %llu",
50440 + id, (unsigned long long)get_inode_oid(inode));
50441 +
50442 + return RETERR(-EINVAL);
50443 +}
50444 +
50445 +/* this is installed as ->init_inode() method of
50446 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
50447 + Copies data from on-disk stat-data format into inode.
50448 + Handles stat-data extensions. */
50449 +/* was sd_load */
50450 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
50451 + char *sd /* stat-data body */ ,
50452 + int len /* length of stat-data */ )
50453 +{
50454 + int result;
50455 + int bit;
50456 + int chunk;
50457 + __u16 mask;
50458 + __u64 bigmask;
50459 + reiser4_stat_data_base *sd_base;
50460 + reiser4_inode *state;
50461 +
50462 + assert("nikita-625", inode != NULL);
50463 + assert("nikita-626", sd != NULL);
50464 +
50465 + result = 0;
50466 + sd_base = (reiser4_stat_data_base *) sd;
50467 + state = reiser4_inode_data(inode);
50468 + mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
50469 + bigmask = mask;
50470 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
50471 +
50472 + move_on(&len, &sd, sizeof *sd_base);
50473 + for (bit = 0, chunk = 0;
50474 + mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
50475 + ++bit, mask >>= 1) {
50476 + if (((bit + 1) % 16) != 0) {
50477 + /* handle extension */
50478 + sd_ext_plugin *sdplug;
50479 +
50480 + if (bit >= LAST_SD_EXTENSION) {
50481 + warning("vpf-1904",
50482 + "No such extension %i in inode %llu",
50483 + bit,
50484 + (unsigned long long)
50485 + get_inode_oid(inode));
50486 +
50487 + result = RETERR(-EINVAL);
50488 + break;
50489 + }
50490 +
50491 + sdplug = sd_ext_plugin_by_id(bit);
50492 + if (sdplug == NULL) {
50493 + warning("nikita-627",
50494 + "No such extension %i in inode %llu",
50495 + bit,
50496 + (unsigned long long)
50497 + get_inode_oid(inode));
50498 +
50499 + result = RETERR(-EINVAL);
50500 + break;
50501 + }
50502 + if (mask & 1) {
50503 + assert("nikita-628", sdplug->present);
50504 + /* alignment is not supported in node layout
50505 + plugin yet.
50506 + result = align( inode, &len, &sd,
50507 + sdplug -> alignment );
50508 + if( result != 0 )
50509 + return result; */
50510 + result = sdplug->present(inode, &sd, &len);
50511 + } else if (sdplug->absent != NULL)
50512 + result = sdplug->absent(inode);
50513 + if (result)
50514 + break;
50515 + /* else, we are looking at the last bit in 16-bit
50516 + portion of bitmask */
50517 + } else if (mask & 1) {
50518 + /* next portion of bitmask */
50519 + if (len < (int)sizeof(d16)) {
50520 + warning("nikita-629",
50521 + "No space for bitmap in inode %llu",
50522 + (unsigned long long)
50523 + get_inode_oid(inode));
50524 +
50525 + result = RETERR(-EINVAL);
50526 + break;
50527 + }
50528 + mask = le16_to_cpu(get_unaligned((d16 *)sd));
50529 + bigmask <<= 16;
50530 + bigmask |= mask;
50531 + move_on(&len, &sd, sizeof(d16));
50532 + ++chunk;
50533 + if (chunk == 3) {
50534 + if (!(mask & 0x8000)) {
50535 + /* clear last bit */
50536 + mask &= ~0x8000;
50537 + continue;
50538 + }
50539 + /* too much */
50540 + warning("nikita-630",
50541 + "Too many extensions in %llu",
50542 + (unsigned long long)
50543 + get_inode_oid(inode));
50544 +
50545 + result = RETERR(-EINVAL);
50546 + break;
50547 + }
50548 + } else
50549 + /* bitmask exhausted */
50550 + break;
50551 + }
50552 + state->extmask = bigmask;
50553 + /* common initialisations */
50554 + if (len - (bit / 16 * sizeof(d16)) > 0) {
50555 + /* alignment in save_len_static_sd() is taken into account
50556 + -edward */
50557 + warning("nikita-631", "unused space in inode %llu",
50558 + (unsigned long long)get_inode_oid(inode));
50559 + }
50560 +
50561 + return result;
50562 +}
50563 +
50564 +/* estimates size of stat-data required to store inode.
50565 + Installed as ->save_len() method of
50566 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50567 +/* was sd_len */
50568 +int save_len_static_sd(struct inode *inode /* object being processed */ )
50569 +{
50570 + unsigned int result;
50571 + __u64 mask;
50572 + int bit;
50573 +
50574 + assert("nikita-632", inode != NULL);
50575 +
50576 + result = sizeof(reiser4_stat_data_base);
50577 + mask = reiser4_inode_data(inode)->extmask;
50578 + for (bit = 0; mask != 0; ++bit, mask >>= 1) {
50579 + if (mask & 1) {
50580 + sd_ext_plugin *sdplug;
50581 +
50582 + sdplug = sd_ext_plugin_by_id(bit);
50583 + assert("nikita-633", sdplug != NULL);
50584 + /* no aligment support
50585 + result +=
50586 + round_up( result, sdplug -> alignment ) - result; */
50587 + result += sdplug->save_len(inode);
50588 + }
50589 + }
50590 + result += bit / 16 * sizeof(d16);
50591 + return result;
50592 +}
50593 +
50594 +/* saves inode into stat-data.
50595 + Installed as ->save() method of
50596 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50597 +/* was sd_save */
50598 +int save_static_sd(struct inode *inode /* object being processed */ ,
50599 + char **area /* where to save stat-data */ )
50600 +{
50601 + int result;
50602 + __u64 emask;
50603 + int bit;
50604 + unsigned int len;
50605 + reiser4_stat_data_base *sd_base;
50606 +
50607 + assert("nikita-634", inode != NULL);
50608 + assert("nikita-635", area != NULL);
50609 +
50610 + result = 0;
50611 + emask = reiser4_inode_data(inode)->extmask;
50612 + sd_base = (reiser4_stat_data_base *) * area;
50613 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
50614 + /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
50615 +
50616 + *area += sizeof *sd_base;
50617 + len = 0xffffffffu;
50618 + for (bit = 0; emask != 0; ++bit, emask >>= 1) {
50619 + if (emask & 1) {
50620 + if ((bit + 1) % 16 != 0) {
50621 + sd_ext_plugin *sdplug;
50622 + sdplug = sd_ext_plugin_by_id(bit);
50623 + assert("nikita-636", sdplug != NULL);
50624 + /* no alignment support yet
50625 + align( inode, &len, area,
50626 + sdplug -> alignment ); */
50627 + result = sdplug->save(inode, area);
50628 + if (result)
50629 + break;
50630 + } else {
50631 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
50632 + (d16 *)(*area));
50633 + /*cputod16((unsigned)(emask & 0xffff),
50634 + (d16 *) * area);*/
50635 + *area += sizeof(d16);
50636 + }
50637 + }
50638 + }
50639 + return result;
50640 +}
50641 +
50642 +/* stat-data extension handling functions. */
50643 +
50644 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
50645 + char **area /* position in stat-data */ ,
50646 + int *len /* remaining length */ )
50647 +{
50648 + if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
50649 + reiser4_light_weight_stat *sd_lw;
50650 +
50651 + sd_lw = (reiser4_light_weight_stat *) * area;
50652 +
50653 + inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
50654 + inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
50655 + inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
50656 + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
50657 + inode->i_mode &= ~S_IFIFO;
50658 + warning("", "partially converted file is encountered");
50659 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
50660 + }
50661 + move_on(len, area, sizeof *sd_lw);
50662 + return 0;
50663 + } else
50664 + return not_enough_space(inode, "lw sd");
50665 +}
50666 +
50667 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
50668 + * processed */ )
50669 +{
50670 + return sizeof(reiser4_light_weight_stat);
50671 +}
50672 +
50673 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
50674 + char **area /* position in stat-data */ )
50675 +{
50676 + reiser4_light_weight_stat *sd;
50677 + mode_t delta;
50678 +
50679 + assert("nikita-2705", inode != NULL);
50680 + assert("nikita-2706", area != NULL);
50681 + assert("nikita-2707", *area != NULL);
50682 +
50683 + sd = (reiser4_light_weight_stat *) * area;
50684 +
50685 + delta = (reiser4_inode_get_flag(inode,
50686 + REISER4_PART_MIXED) ? S_IFIFO : 0);
50687 + put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
50688 + put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
50689 + put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
50690 + *area += sizeof *sd;
50691 + return 0;
50692 +}
50693 +
50694 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
50695 + char **area /* position in stat-data */ ,
50696 + int *len /* remaining length */ )
50697 +{
50698 + assert("nikita-637", inode != NULL);
50699 + assert("nikita-638", area != NULL);
50700 + assert("nikita-639", *area != NULL);
50701 + assert("nikita-640", len != NULL);
50702 + assert("nikita-641", *len > 0);
50703 +
50704 + if (*len >= (int)sizeof(reiser4_unix_stat)) {
50705 + reiser4_unix_stat *sd;
50706 +
50707 + sd = (reiser4_unix_stat *) * area;
50708 +
50709 + inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
50710 + inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
50711 + inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
50712 + inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
50713 + inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
50714 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50715 + inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
50716 + else
50717 + inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
50718 + move_on(len, area, sizeof *sd);
50719 + return 0;
50720 + } else
50721 + return not_enough_space(inode, "unix sd");
50722 +}
50723 +
50724 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
50725 +{
50726 + inode->i_uid = get_super_private(inode->i_sb)->default_uid;
50727 + inode->i_gid = get_super_private(inode->i_sb)->default_gid;
50728 + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
50729 + inode_set_bytes(inode, inode->i_size);
50730 + /* mark inode as lightweight, so that caller (lookup_common) will
50731 + complete initialisation by copying [ug]id from a parent. */
50732 + reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
50733 + return 0;
50734 +}
50735 +
50736 +/* Audited by: green(2002.06.14) */
50737 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
50738 + * processed */ )
50739 +{
50740 + return sizeof(reiser4_unix_stat);
50741 +}
50742 +
50743 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
50744 + char **area /* position in stat-data */ )
50745 +{
50746 + reiser4_unix_stat *sd;
50747 +
50748 + assert("nikita-642", inode != NULL);
50749 + assert("nikita-643", area != NULL);
50750 + assert("nikita-644", *area != NULL);
50751 +
50752 + sd = (reiser4_unix_stat *) * area;
50753 + put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
50754 + put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
50755 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
50756 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
50757 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
50758 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50759 + put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
50760 + else
50761 + put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
50762 + *area += sizeof *sd;
50763 + return 0;
50764 +}
50765 +
50766 +static int
50767 +present_large_times_sd(struct inode *inode /* object being processed */ ,
50768 + char **area /* position in stat-data */ ,
50769 + int *len /* remaining length */ )
50770 +{
50771 + if (*len >= (int)sizeof(reiser4_large_times_stat)) {
50772 + reiser4_large_times_stat *sd_lt;
50773 +
50774 + sd_lt = (reiser4_large_times_stat *) * area;
50775 +
50776 + inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
50777 + inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
50778 + inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
50779 +
50780 + move_on(len, area, sizeof *sd_lt);
50781 + return 0;
50782 + } else
50783 + return not_enough_space(inode, "large times sd");
50784 +}
50785 +
50786 +static int
50787 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
50788 + /* object being processed */ )
50789 +{
50790 + return sizeof(reiser4_large_times_stat);
50791 +}
50792 +
50793 +static int
50794 +save_large_times_sd(struct inode *inode /* object being processed */ ,
50795 + char **area /* position in stat-data */ )
50796 +{
50797 + reiser4_large_times_stat *sd;
50798 +
50799 + assert("nikita-2817", inode != NULL);
50800 + assert("nikita-2818", area != NULL);
50801 + assert("nikita-2819", *area != NULL);
50802 +
50803 + sd = (reiser4_large_times_stat *) * area;
50804 +
50805 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
50806 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
50807 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
50808 +
50809 + *area += sizeof *sd;
50810 + return 0;
50811 +}
50812 +
50813 +/* symlink stat data extension */
50814 +
50815 +/* allocate memory for symlink target and attach it to inode->i_private */
50816 +static int
50817 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
50818 +{
50819 + assert("vs-845", inode->i_private == NULL);
50820 + assert("vs-846", !reiser4_inode_get_flag(inode,
50821 + REISER4_GENERIC_PTR_USED));
50822 + /* FIXME-VS: this is prone to deadlock. Not more than other similar
50823 + places, though */
50824 + inode->i_private = kmalloc((size_t) len + 1,
50825 + reiser4_ctx_gfp_mask_get());
50826 + if (!inode->i_private)
50827 + return RETERR(-ENOMEM);
50828 +
50829 + memcpy((char *)(inode->i_private), target, (size_t) len);
50830 + ((char *)(inode->i_private))[len] = 0;
50831 + reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
50832 + return 0;
50833 +}
50834 +
50835 +/* this is called on read_inode. There is nothing to do actually, but some
50836 + sanity checks */
50837 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
50838 +{
50839 + int result;
50840 + int length;
50841 + reiser4_symlink_stat *sd;
50842 +
50843 + length = (int)inode->i_size;
50844 + /*
50845 + * *len is number of bytes in stat data item from *area to the end of
50846 + * item. It must be not less than size of symlink + 1 for ending 0
50847 + */
50848 + if (length > *len)
50849 + return not_enough_space(inode, "symlink");
50850 +
50851 + if (*(*area + length) != 0) {
50852 + warning("vs-840", "Symlink is not zero terminated");
50853 + return RETERR(-EIO);
50854 + }
50855 +
50856 + sd = (reiser4_symlink_stat *) * area;
50857 + result = symlink_target_to_inode(inode, sd->body, length);
50858 +
50859 + move_on(len, area, length + 1);
50860 + return result;
50861 +}
50862 +
50863 +static int save_len_symlink_sd(struct inode *inode)
50864 +{
50865 + return inode->i_size + 1;
50866 +}
50867 +
50868 +/* this is called on create and update stat data. Do nothing on update but
50869 + update @area */
50870 +static int save_symlink_sd(struct inode *inode, char **area)
50871 +{
50872 + int result;
50873 + int length;
50874 + reiser4_symlink_stat *sd;
50875 +
50876 + length = (int)inode->i_size;
50877 + /* inode->i_size must be set already */
50878 + assert("vs-841", length);
50879 +
50880 + result = 0;
50881 + sd = (reiser4_symlink_stat *) * area;
50882 + if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
50883 + const char *target;
50884 +
50885 + target = (const char *)(inode->i_private);
50886 + inode->i_private = NULL;
50887 +
50888 + result = symlink_target_to_inode(inode, target, length);
50889 +
50890 + /* copy symlink to stat data */
50891 + memcpy(sd->body, target, (size_t) length);
50892 + (*area)[length] = 0;
50893 + } else {
50894 + /* there is nothing to do in update but move area */
50895 + assert("vs-844",
50896 + !memcmp(inode->i_private, sd->body,
50897 + (size_t) length + 1));
50898 + }
50899 +
50900 + *area += (length + 1);
50901 + return result;
50902 +}
50903 +
50904 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
50905 + char **area /* position in stat-data */ ,
50906 + int *len /* remaining length */ )
50907 +{
50908 + assert("nikita-645", inode != NULL);
50909 + assert("nikita-646", area != NULL);
50910 + assert("nikita-647", *area != NULL);
50911 + assert("nikita-648", len != NULL);
50912 + assert("nikita-649", *len > 0);
50913 +
50914 + if (*len >= (int)sizeof(reiser4_flags_stat)) {
50915 + reiser4_flags_stat *sd;
50916 +
50917 + sd = (reiser4_flags_stat *) * area;
50918 + inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
50919 + move_on(len, area, sizeof *sd);
50920 + return 0;
50921 + } else
50922 + return not_enough_space(inode, "generation and attrs");
50923 +}
50924 +
50925 +/* Audited by: green(2002.06.14) */
50926 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
50927 + * processed */ )
50928 +{
50929 + return sizeof(reiser4_flags_stat);
50930 +}
50931 +
50932 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
50933 + char **area /* position in stat-data */ )
50934 +{
50935 + reiser4_flags_stat *sd;
50936 +
50937 + assert("nikita-650", inode != NULL);
50938 + assert("nikita-651", area != NULL);
50939 + assert("nikita-652", *area != NULL);
50940 +
50941 + sd = (reiser4_flags_stat *) * area;
50942 + put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
50943 + *area += sizeof *sd;
50944 + return 0;
50945 +}
50946 +
50947 +static int absent_plugin_sd(struct inode *inode);
50948 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
50949 + char **area /* position in stat-data */ ,
50950 + int *len /* remaining length */,
50951 + int is_pset /* 1 if plugin set, 0 if heir set. */)
50952 +{
50953 + reiser4_plugin_stat *sd;
50954 + reiser4_plugin *plugin;
50955 + reiser4_inode *info;
50956 + int i;
50957 + __u16 mask;
50958 + int result;
50959 + int num_of_plugins;
50960 +
50961 + assert("nikita-653", inode != NULL);
50962 + assert("nikita-654", area != NULL);
50963 + assert("nikita-655", *area != NULL);
50964 + assert("nikita-656", len != NULL);
50965 + assert("nikita-657", *len > 0);
50966 +
50967 + if (*len < (int)sizeof(reiser4_plugin_stat))
50968 + return not_enough_space(inode, "plugin");
50969 +
50970 + sd = (reiser4_plugin_stat *) * area;
50971 + info = reiser4_inode_data(inode);
50972 +
50973 + mask = 0;
50974 + num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
50975 + move_on(len, area, sizeof *sd);
50976 + result = 0;
50977 + for (i = 0; i < num_of_plugins; ++i) {
50978 + reiser4_plugin_slot *slot;
50979 + reiser4_plugin_type type;
50980 + pset_member memb;
50981 +
50982 + slot = (reiser4_plugin_slot *) * area;
50983 + if (*len < (int)sizeof *slot)
50984 + return not_enough_space(inode, "additional plugin");
50985 +
50986 + memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
50987 + type = aset_member_to_type_unsafe(memb);
50988 +
50989 + if (type == REISER4_PLUGIN_TYPES) {
50990 + warning("nikita-3502",
50991 + "wrong %s member (%i) for %llu", is_pset ?
50992 + "pset" : "hset", memb,
50993 + (unsigned long long)get_inode_oid(inode));
50994 + return RETERR(-EINVAL);
50995 + }
50996 + plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
50997 + type, &slot->id);
50998 + if (plugin == NULL)
50999 + return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
51000 +
51001 + /* plugin is loaded into inode, mark this into inode's
51002 + bitmask of loaded non-standard plugins */
51003 + if (!(mask & (1 << memb))) {
51004 + mask |= (1 << memb);
51005 + } else {
51006 + warning("nikita-658", "duplicate plugin for %llu",
51007 + (unsigned long long)get_inode_oid(inode));
51008 + return RETERR(-EINVAL);
51009 + }
51010 + move_on(len, area, sizeof *slot);
51011 + /* load plugin data, if any */
51012 + if (plugin->h.pops != NULL && plugin->h.pops->load)
51013 + result = plugin->h.pops->load(inode, plugin, area, len);
51014 + else
51015 + result = aset_set_unsafe(is_pset ? &info->pset :
51016 + &info->hset, memb, plugin);
51017 + if (result)
51018 + return result;
51019 + }
51020 + if (is_pset) {
51021 + /* if object plugin wasn't loaded from stat-data, guess it by
51022 + mode bits */
51023 + plugin = file_plugin_to_plugin(inode_file_plugin(inode));
51024 + if (plugin == NULL)
51025 + result = absent_plugin_sd(inode);
51026 + info->plugin_mask = mask;
51027 + } else
51028 + info->heir_mask = mask;
51029 +
51030 + return result;
51031 +}
51032 +
51033 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
51034 + return present_plugin_sd(inode, area, len, 1 /* pset */);
51035 +}
51036 +
51037 +/* Determine object plugin for @inode based on i_mode.
51038 +
51039 + Many objects in reiser4 file system are controlled by standard object
51040 + plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
51041 +
51042 + For such files we don't explicitly store plugin id in object stat
51043 + data. Rather required plugin is guessed from mode bits, where file "type"
51044 + is encoded (see stat(2)).
51045 +*/
51046 +static int
51047 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
51048 +{
51049 + int fplug_id;
51050 + int dplug_id;
51051 + reiser4_inode *info;
51052 +
51053 + assert("nikita-736", inode != NULL);
51054 +
51055 + dplug_id = fplug_id = -1;
51056 +
51057 + switch (inode->i_mode & S_IFMT) {
51058 + case S_IFSOCK:
51059 + case S_IFBLK:
51060 + case S_IFCHR:
51061 + case S_IFIFO:
51062 + fplug_id = SPECIAL_FILE_PLUGIN_ID;
51063 + break;
51064 + case S_IFLNK:
51065 + fplug_id = SYMLINK_FILE_PLUGIN_ID;
51066 + break;
51067 + case S_IFDIR:
51068 + fplug_id = DIRECTORY_FILE_PLUGIN_ID;
51069 + dplug_id = HASHED_DIR_PLUGIN_ID;
51070 + break;
51071 + default:
51072 + warning("nikita-737", "wrong file mode: %o", inode->i_mode);
51073 + return RETERR(-EIO);
51074 + case S_IFREG:
51075 + fplug_id = UNIX_FILE_PLUGIN_ID;
51076 + break;
51077 + }
51078 + info = reiser4_inode_data(inode);
51079 + set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
51080 + plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
51081 + set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
51082 + plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
51083 + return 0;
51084 +}
51085 +
51086 +/* Audited by: green(2002.06.14) */
51087 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
51088 +{
51089 + int result;
51090 +
51091 + assert("nikita-659", inode != NULL);
51092 +
51093 + result = guess_plugin_by_mode(inode);
51094 + /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
51095 + but setup_inode_ops() will call make_bad_inode().
51096 + Another, more logical but bit more complex solution is to add
51097 + "bad-file plugin". */
51098 + /* FIXME-VS: activate was called here */
51099 + return result;
51100 +}
51101 +
51102 +/* helper function for plugin_sd_save_len(): calculate how much space
51103 + required to save state of given plugin */
51104 +/* Audited by: green(2002.06.14) */
51105 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
51106 + struct inode *inode /* object being processed */ ,
51107 + pset_member memb,
51108 + int len, int is_pset)
51109 +{
51110 + reiser4_inode *info;
51111 + assert("nikita-661", inode != NULL);
51112 +
51113 + if (plugin == NULL)
51114 + return len;
51115 +
51116 + info = reiser4_inode_data(inode);
51117 + if (is_pset ?
51118 + info->plugin_mask & (1 << memb) :
51119 + info->heir_mask & (1 << memb)) {
51120 + len += sizeof(reiser4_plugin_slot);
51121 + if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
51122 + /* non-standard plugin, call method */
51123 + /* commented as it is incompatible with alignment
51124 + * policy in save_plug() -edward */
51125 + /* len = round_up(len, plugin->h.pops->alignment); */
51126 + len += plugin->h.pops->save_len(inode, plugin);
51127 + }
51128 + }
51129 + return len;
51130 +}
51131 +
51132 +/* calculate how much space is required to save state of all plugins,
51133 + associated with inode */
51134 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
51135 + int is_pset)
51136 +{
51137 + int len;
51138 + int last;
51139 + reiser4_inode *state;
51140 + pset_member memb;
51141 +
51142 + assert("nikita-663", inode != NULL);
51143 +
51144 + state = reiser4_inode_data(inode);
51145 +
51146 + /* common case: no non-standard plugins */
51147 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51148 + return 0;
51149 + len = sizeof(reiser4_plugin_stat);
51150 + last = PSET_LAST;
51151 +
51152 + for (memb = 0; memb < last; ++memb) {
51153 + len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
51154 + inode, memb, len, is_pset);
51155 + }
51156 + assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
51157 + return len;
51158 +}
51159 +
51160 +static int save_len_pset_sd(struct inode *inode) {
51161 + return save_len_plugin_sd(inode, 1 /* pset */);
51162 +}
51163 +
51164 +/* helper function for plugin_sd_save(): save plugin, associated with
51165 + inode. */
51166 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
51167 + struct inode *inode /* object being processed */ ,
51168 + int memb /* what element of pset is saved */ ,
51169 + char **area /* position in stat-data */ ,
51170 + int *count /* incremented if plugin were actually saved. */,
51171 + int is_pset /* 1 for plugin set, 0 for heir set */)
51172 +{
51173 + reiser4_plugin_slot *slot;
51174 + int fake_len;
51175 + int result;
51176 +
51177 + assert("nikita-665", inode != NULL);
51178 + assert("nikita-666", area != NULL);
51179 + assert("nikita-667", *area != NULL);
51180 +
51181 + if (plugin == NULL)
51182 + return 0;
51183 +
51184 + if (is_pset ?
51185 + !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
51186 + !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
51187 + return 0;
51188 + slot = (reiser4_plugin_slot *) * area;
51189 + put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
51190 + put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
51191 + fake_len = (int)0xffff;
51192 + move_on(&fake_len, area, sizeof *slot);
51193 + ++*count;
51194 + result = 0;
51195 + if (plugin->h.pops != NULL) {
51196 + if (plugin->h.pops->save != NULL)
51197 + result = plugin->h.pops->save(inode, plugin, area);
51198 + }
51199 + return result;
51200 +}
51201 +
51202 +/* save state of all non-standard plugins associated with inode */
51203 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
51204 + char **area /* position in stat-data */,
51205 + int is_pset /* 1 for pset, 0 for hset */)
51206 +{
51207 + int fake_len;
51208 + int result = 0;
51209 + int num_of_plugins;
51210 + reiser4_plugin_stat *sd;
51211 + reiser4_inode *state;
51212 + pset_member memb;
51213 +
51214 + assert("nikita-669", inode != NULL);
51215 + assert("nikita-670", area != NULL);
51216 + assert("nikita-671", *area != NULL);
51217 +
51218 + state = reiser4_inode_data(inode);
51219 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51220 + return 0;
51221 + sd = (reiser4_plugin_stat *) * area;
51222 + fake_len = (int)0xffff;
51223 + move_on(&fake_len, area, sizeof *sd);
51224 +
51225 + num_of_plugins = 0;
51226 + for (memb = 0; memb < PSET_LAST; ++memb) {
51227 + result = save_plug(aset_get(is_pset ? state->pset : state->hset,
51228 + memb),
51229 + inode, memb, area, &num_of_plugins, is_pset);
51230 + if (result != 0)
51231 + break;
51232 + }
51233 +
51234 + put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
51235 + return result;
51236 +}
51237 +
51238 +static int save_pset_sd(struct inode *inode, char **area) {
51239 + return save_plugin_sd(inode, area, 1 /* pset */);
51240 +}
51241 +
51242 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
51243 + return present_plugin_sd(inode, area, len, 0 /* hset */);
51244 +}
51245 +
51246 +static int save_len_hset_sd(struct inode *inode) {
51247 + return save_len_plugin_sd(inode, 0 /* pset */);
51248 +}
51249 +
51250 +static int save_hset_sd(struct inode *inode, char **area) {
51251 + return save_plugin_sd(inode, area, 0 /* hset */);
51252 +}
51253 +
51254 +/* helper function for crypto_sd_present(), crypto_sd_save.
51255 + Extract crypto info from stat-data and attach it to inode */
51256 +static int extract_crypto_info (struct inode * inode,
51257 + reiser4_crypto_stat * sd)
51258 +{
51259 + struct reiser4_crypto_info * info;
51260 + assert("edward-11", !inode_crypto_info(inode));
51261 + assert("edward-1413",
51262 + !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
51263 + /* create and attach a crypto-stat without secret key loaded */
51264 + info = reiser4_alloc_crypto_info(inode);
51265 + if (IS_ERR(info))
51266 + return PTR_ERR(info);
51267 + info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
51268 + memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
51269 + reiser4_attach_crypto_info(inode, info);
51270 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51271 + return 0;
51272 +}
51273 +
51274 +/* crypto stat-data extension */
51275 +
51276 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
51277 +{
51278 + int result;
51279 + reiser4_crypto_stat *sd;
51280 + digest_plugin *dplug = inode_digest_plugin(inode);
51281 +
51282 + assert("edward-06", dplug != NULL);
51283 + assert("edward-684", dplug->fipsize);
51284 + assert("edward-07", area != NULL);
51285 + assert("edward-08", *area != NULL);
51286 + assert("edward-09", len != NULL);
51287 + assert("edward-10", *len > 0);
51288 +
51289 + if (*len < (int)sizeof(reiser4_crypto_stat)) {
51290 + return not_enough_space(inode, "crypto-sd");
51291 + }
51292 + /* *len is number of bytes in stat data item from *area to the end of
51293 + item. It must be not less than size of this extension */
51294 + assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
51295 +
51296 + sd = (reiser4_crypto_stat *) * area;
51297 + result = extract_crypto_info(inode, sd);
51298 + move_on(len, area, sizeof(*sd) + dplug->fipsize);
51299 +
51300 + return result;
51301 +}
51302 +
51303 +static int save_len_crypto_sd(struct inode *inode)
51304 +{
51305 + return sizeof(reiser4_crypto_stat) +
51306 + inode_digest_plugin(inode)->fipsize;
51307 +}
51308 +
51309 +static int save_crypto_sd(struct inode *inode, char **area)
51310 +{
51311 + int result = 0;
51312 + reiser4_crypto_stat *sd;
51313 + struct reiser4_crypto_info * info = inode_crypto_info(inode);
51314 + digest_plugin *dplug = inode_digest_plugin(inode);
51315 +
51316 + assert("edward-12", dplug != NULL);
51317 + assert("edward-13", area != NULL);
51318 + assert("edward-14", *area != NULL);
51319 + assert("edward-15", info != NULL);
51320 + assert("edward-1414", info->keyid != NULL);
51321 + assert("edward-1415", info->keysize != 0);
51322 + assert("edward-76", reiser4_inode_data(inode) != NULL);
51323 +
51324 + if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
51325 + /* file is just created */
51326 + sd = (reiser4_crypto_stat *) *area;
51327 + /* copy everything but private key to the disk stat-data */
51328 + put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
51329 + memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
51330 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51331 + }
51332 + *area += (sizeof(*sd) + dplug->fipsize);
51333 + return result;
51334 +}
51335 +
51336 +static int eio(struct inode *inode, char **area, int *len)
51337 +{
51338 + return RETERR(-EIO);
51339 +}
51340 +
51341 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
51342 + [LIGHT_WEIGHT_STAT] = {
51343 + .h = {
51344 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51345 + .id = LIGHT_WEIGHT_STAT,
51346 + .pops = NULL,
51347 + .label = "light-weight sd",
51348 + .desc = "sd for light-weight files",
51349 + .linkage = {NULL,NULL}
51350 + },
51351 + .present = present_lw_sd,
51352 + .absent = NULL,
51353 + .save_len = save_len_lw_sd,
51354 + .save = save_lw_sd,
51355 + .alignment = 8
51356 + },
51357 + [UNIX_STAT] = {
51358 + .h = {
51359 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51360 + .id = UNIX_STAT,
51361 + .pops = NULL,
51362 + .label = "unix-sd",
51363 + .desc = "unix stat-data fields",
51364 + .linkage = {NULL,NULL}
51365 + },
51366 + .present = present_unix_sd,
51367 + .absent = absent_unix_sd,
51368 + .save_len = save_len_unix_sd,
51369 + .save = save_unix_sd,
51370 + .alignment = 8
51371 + },
51372 + [LARGE_TIMES_STAT] = {
51373 + .h = {
51374 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51375 + .id = LARGE_TIMES_STAT,
51376 + .pops = NULL,
51377 + .label = "64time-sd",
51378 + .desc = "nanosecond resolution for times",
51379 + .linkage = {NULL,NULL}
51380 + },
51381 + .present = present_large_times_sd,
51382 + .absent = NULL,
51383 + .save_len = save_len_large_times_sd,
51384 + .save = save_large_times_sd,
51385 + .alignment = 8
51386 + },
51387 + [SYMLINK_STAT] = {
51388 + /* stat data of symlink has this extension */
51389 + .h = {
51390 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51391 + .id = SYMLINK_STAT,
51392 + .pops = NULL,
51393 + .label = "symlink-sd",
51394 + .desc =
51395 + "stat data is appended with symlink name",
51396 + .linkage = {NULL,NULL}
51397 + },
51398 + .present = present_symlink_sd,
51399 + .absent = NULL,
51400 + .save_len = save_len_symlink_sd,
51401 + .save = save_symlink_sd,
51402 + .alignment = 8
51403 + },
51404 + [PLUGIN_STAT] = {
51405 + .h = {
51406 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51407 + .id = PLUGIN_STAT,
51408 + .pops = NULL,
51409 + .label = "plugin-sd",
51410 + .desc = "plugin stat-data fields",
51411 + .linkage = {NULL,NULL}
51412 + },
51413 + .present = present_pset_sd,
51414 + .absent = absent_plugin_sd,
51415 + .save_len = save_len_pset_sd,
51416 + .save = save_pset_sd,
51417 + .alignment = 8
51418 + },
51419 + [HEIR_STAT] = {
51420 + .h = {
51421 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51422 + .id = HEIR_STAT,
51423 + .pops = NULL,
51424 + .label = "heir-plugin-sd",
51425 + .desc = "heir plugin stat-data fields",
51426 + .linkage = {NULL,NULL}
51427 + },
51428 + .present = present_hset_sd,
51429 + .absent = NULL,
51430 + .save_len = save_len_hset_sd,
51431 + .save = save_hset_sd,
51432 + .alignment = 8
51433 + },
51434 + [FLAGS_STAT] = {
51435 + .h = {
51436 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51437 + .id = FLAGS_STAT,
51438 + .pops = NULL,
51439 + .label = "flags-sd",
51440 + .desc = "inode bit flags",
51441 + .linkage = {NULL, NULL}
51442 + },
51443 + .present = present_flags_sd,
51444 + .absent = NULL,
51445 + .save_len = save_len_flags_sd,
51446 + .save = save_flags_sd,
51447 + .alignment = 8
51448 + },
51449 + [CAPABILITIES_STAT] = {
51450 + .h = {
51451 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51452 + .id = CAPABILITIES_STAT,
51453 + .pops = NULL,
51454 + .label = "capabilities-sd",
51455 + .desc = "capabilities",
51456 + .linkage = {NULL, NULL}
51457 + },
51458 + .present = eio,
51459 + .absent = NULL,
51460 + .save_len = save_len_flags_sd,
51461 + .save = save_flags_sd,
51462 + .alignment = 8
51463 + },
51464 + [CRYPTO_STAT] = {
51465 + .h = {
51466 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51467 + .id = CRYPTO_STAT,
51468 + .pops = NULL,
51469 + .label = "crypto-sd",
51470 + .desc = "secret key size and id",
51471 + .linkage = {NULL, NULL}
51472 + },
51473 + .present = present_crypto_sd,
51474 + .absent = NULL,
51475 + .save_len = save_len_crypto_sd,
51476 + .save = save_crypto_sd,
51477 + .alignment = 8
51478 + }
51479 +};
51480 +
51481 +/* Make Linus happy.
51482 + Local variables:
51483 + c-indentation-style: "K&R"
51484 + mode-name: "LC"
51485 + c-basic-offset: 8
51486 + tab-width: 8
51487 + fill-column: 120
51488 + End:
51489 +*/
51490 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.23/fs/reiser4/plugin/item/static_stat.h
51491 --- linux-2.6.23.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
51492 +++ linux-2.6.23/fs/reiser4/plugin/item/static_stat.h 2007-12-04 16:49:30.000000000 +0300
51493 @@ -0,0 +1,224 @@
51494 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51495 +
51496 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
51497 +
51498 +In the case where each file has not less than the fields needed by the
51499 +stat() syscall, it is more compact to store those fields in this
51500 +struct.
51501 +
51502 +If this item does not exist, then all stats are dynamically resolved.
51503 +At the moment, we either resolve all stats dynamically or all of them
51504 +statically. If you think this is not fully optimal, and the rest of
51505 +reiser4 is working, then fix it...:-)
51506 +
51507 +*/
51508 +
51509 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
51510 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
51511 +
51512 +#include "../../forward.h"
51513 +#include "../../dformat.h"
51514 +
51515 +#include <linux/fs.h> /* for struct inode */
51516 +
51517 +/* Stat data layout: goals and implementation.
51518 +
51519 + We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
51520 + them, including not having semantic metadata attached to them.
51521 +
51522 + There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
51523 + want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
51524 + sized structure because the statically sized structure knows without recording it what the names and lengths of the
51525 + attributes are.
51526 +
51527 + This leads to a natural compromise, which is to special case those files which have simply the standard unix file
51528 + attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
51529 + file in their use of file attributes.
51530 +
51531 + Yet this compromise deserves to be compromised a little.
51532 +
51533 + We accommodate the case where you have no more than the standard unix file attributes by using an "extension
51534 + bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
51535 +
51536 + If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited
51537 + from parent directory (as uid, gid) or initialised to some sane values.
51538 +
51539 + To capitalize on existing code infrastructure, extensions are
51540 + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
51541 + Each stat-data extension plugin implements four methods:
51542 +
51543 + ->present() called by sd_load() when this extension is found in stat-data
51544 + ->absent() called by sd_load() when this extension is not found in stat-data
51545 + ->save_len() called by sd_len() to calculate total length of stat-data
51546 + ->save() called by sd_save() to store extension data into stat-data
51547 +
51548 + Implementation is in fs/reiser4/plugin/item/static_stat.c
51549 +*/
51550 +
51551 +/* stat-data extension. Please order this by presumed frequency of use */
51552 +typedef enum {
51553 + /* support for light-weight files */
51554 + LIGHT_WEIGHT_STAT,
51555 + /* data required to implement unix stat(2) call. Layout is in
51556 + reiser4_unix_stat. If this is not present, file is light-weight */
51557 + UNIX_STAT,
51558 + /* this contains additional set of 32bit [anc]time fields to implement
51559 + nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
51560 + if this extension is governed by 32bittimes mount option. */
51561 + LARGE_TIMES_STAT,
51562 + /* stat data has link name included */
51563 + SYMLINK_STAT,
51564 + /* on-disk slots of non-standard plugins for main plugin table
51565 + (@reiser4_inode->pset), that is, plugins that cannot be deduced
51566 + from file mode bits), for example, aggregation, interpolation etc. */
51567 + PLUGIN_STAT,
51568 + /* this extension contains persistent inode flags. These flags are
51569 + single bits: immutable, append, only, etc. Layout is in
51570 + reiser4_flags_stat. */
51571 + FLAGS_STAT,
51572 + /* this extension contains capabilities sets, associated with this
51573 + file. Layout is in reiser4_capabilities_stat */
51574 + CAPABILITIES_STAT,
51575 + /* this extension contains size and public id of the secret key.
51576 + Layout is in reiser4_crypto_stat */
51577 + CRYPTO_STAT,
51578 + /* on-disk slots of non-default plugins for inheritance, which
51579 + are extracted to special plugin table (@reiser4_inode->hset).
51580 + By default, children of the object will inherit plugins from
51581 + its main plugin table (pset). */
51582 + HEIR_STAT,
51583 + LAST_SD_EXTENSION,
51584 + /*
51585 + * init_inode_static_sd() iterates over extension mask until all
51586 + * non-zero bits are processed. This means, that neither ->present(),
51587 + * nor ->absent() methods will be called for stat-data extensions that
51588 + * go after last present extension. But some basic extensions, we want
51589 + * either ->absent() or ->present() method to be called, because these
51590 + * extensions set up something in inode even when they are not
51591 + * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
51592 + * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
51593 + * ->present(), or ->absent() method will be called, independently of
51594 + * what other extensions are present.
51595 + */
51596 + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
51597 +} sd_ext_bits;
51598 +
51599 +/* minimal stat-data. This allows to support light-weight files. */
51600 +typedef struct reiser4_stat_data_base {
51601 + /* 0 */ __le16 extmask;
51602 + /* 2 */
51603 +} PACKED reiser4_stat_data_base;
51604 +
51605 +typedef struct reiser4_light_weight_stat {
51606 + /* 0 */ __le16 mode;
51607 + /* 2 */ __le32 nlink;
51608 + /* 6 */ __le64 size;
51609 + /* size in bytes */
51610 + /* 14 */
51611 +} PACKED reiser4_light_weight_stat;
51612 +
51613 +typedef struct reiser4_unix_stat {
51614 + /* owner id */
51615 + /* 0 */ __le32 uid;
51616 + /* group id */
51617 + /* 4 */ __le32 gid;
51618 + /* access time */
51619 + /* 8 */ __le32 atime;
51620 + /* modification time */
51621 + /* 12 */ __le32 mtime;
51622 + /* change time */
51623 + /* 16 */ __le32 ctime;
51624 + union {
51625 + /* minor:major for device files */
51626 + /* 20 */ __le64 rdev;
51627 + /* bytes used by file */
51628 + /* 20 */ __le64 bytes;
51629 + } u;
51630 + /* 28 */
51631 +} PACKED reiser4_unix_stat;
51632 +
51633 +/* symlink stored as part of inode */
51634 +typedef struct reiser4_symlink_stat {
51635 + char body[0];
51636 +} PACKED reiser4_symlink_stat;
51637 +
51638 +typedef struct reiser4_plugin_slot {
51639 + /* 0 */ __le16 pset_memb;
51640 + /* 2 */ __le16 id;
51641 + /* 4 *//* here plugin stores its persistent state */
51642 +} PACKED reiser4_plugin_slot;
51643 +
51644 +/* stat-data extension for files with non-standard plugin. */
51645 +typedef struct reiser4_plugin_stat {
51646 + /* number of additional plugins, associated with this object */
51647 + /* 0 */ __le16 plugins_no;
51648 + /* 2 */ reiser4_plugin_slot slot[0];
51649 + /* 2 */
51650 +} PACKED reiser4_plugin_stat;
51651 +
51652 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
51653 + * bit mask. If need arise, this can be replaced with variable width
51654 + * bitmask. */
51655 +typedef struct reiser4_flags_stat {
51656 + /* 0 */ __le32 flags;
51657 + /* 4 */
51658 +} PACKED reiser4_flags_stat;
51659 +
51660 +typedef struct reiser4_capabilities_stat {
51661 + /* 0 */ __le32 effective;
51662 + /* 8 */ __le32 permitted;
51663 + /* 16 */
51664 +} PACKED reiser4_capabilities_stat;
51665 +
51666 +typedef struct reiser4_cluster_stat {
51667 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
51668 + /* 0 */ d8 cluster_shift;
51669 + /* 1 */
51670 +} PACKED reiser4_cluster_stat;
51671 +
51672 +typedef struct reiser4_crypto_stat {
51673 + /* secret key size, bits */
51674 + /* 0 */ d16 keysize;
51675 + /* secret key id */
51676 + /* 2 */ d8 keyid[0];
51677 + /* 2 */
51678 +} PACKED reiser4_crypto_stat;
51679 +
51680 +typedef struct reiser4_large_times_stat {
51681 + /* access time */
51682 + /* 0 */ d32 atime;
51683 + /* modification time */
51684 + /* 4 */ d32 mtime;
51685 + /* change time */
51686 + /* 8 */ d32 ctime;
51687 + /* 12 */
51688 +} PACKED reiser4_large_times_stat;
51689 +
51690 +/* this structure is filled by sd_item_stat */
51691 +typedef struct sd_stat {
51692 + int dirs;
51693 + int files;
51694 + int others;
51695 +} sd_stat;
51696 +
51697 +/* plugin->item.common.* */
51698 +extern void print_sd(const char *prefix, coord_t * coord);
51699 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
51700 +
51701 +/* plugin->item.s.sd.* */
51702 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
51703 +extern int save_len_static_sd(struct inode *inode);
51704 +extern int save_static_sd(struct inode *inode, char **area);
51705 +
51706 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
51707 +#endif
51708 +
51709 +/* Make Linus happy.
51710 + Local variables:
51711 + c-indentation-style: "K&R"
51712 + mode-name: "LC"
51713 + c-basic-offset: 8
51714 + tab-width: 8
51715 + fill-column: 120
51716 + End:
51717 +*/
51718 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/tail.c linux-2.6.23/fs/reiser4/plugin/item/tail.c
51719 --- linux-2.6.23.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
51720 +++ linux-2.6.23/fs/reiser4/plugin/item/tail.c 2007-12-04 23:04:00.738308094 +0300
51721 @@ -0,0 +1,809 @@
51722 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51723 +
51724 +#include "item.h"
51725 +#include "../../inode.h"
51726 +#include "../../page_cache.h"
51727 +#include "../../carry.h"
51728 +#include "../../vfs_ops.h"
51729 +
51730 +#include <linux/quotaops.h>
51731 +#include <asm/uaccess.h>
51732 +#include <linux/swap.h>
51733 +#include <linux/writeback.h>
51734 +
51735 +/* plugin->u.item.b.max_key_inside */
51736 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
51737 +{
51738 + item_key_by_coord(coord, key);
51739 + set_key_offset(key, get_key_offset(reiser4_max_key()));
51740 + return key;
51741 +}
51742 +
51743 +/* plugin->u.item.b.can_contain_key */
51744 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
51745 + const reiser4_item_data *data)
51746 +{
51747 + reiser4_key item_key;
51748 +
51749 + if (item_plugin_by_coord(coord) != data->iplug)
51750 + return 0;
51751 +
51752 + item_key_by_coord(coord, &item_key);
51753 + if (get_key_locality(key) != get_key_locality(&item_key) ||
51754 + get_key_objectid(key) != get_key_objectid(&item_key))
51755 + return 0;
51756 +
51757 + return 1;
51758 +}
51759 +
51760 +/* plugin->u.item.b.mergeable
51761 + first item is of tail type */
51762 +/* Audited by: green(2002.06.14) */
51763 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
51764 +{
51765 + reiser4_key key1, key2;
51766 +
51767 + assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
51768 + UNIX_FILE_METADATA_ITEM_TYPE));
51769 + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
51770 +
51771 + if (item_id_by_coord(p2) != FORMATTING_ID) {
51772 + /* second item is of another type */
51773 + return 0;
51774 + }
51775 +
51776 + item_key_by_coord(p1, &key1);
51777 + item_key_by_coord(p2, &key2);
51778 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
51779 + get_key_objectid(&key1) != get_key_objectid(&key2)
51780 + || get_key_type(&key1) != get_key_type(&key2)) {
51781 + /* items of different objects */
51782 + return 0;
51783 + }
51784 + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
51785 + /* not adjacent items */
51786 + return 0;
51787 + }
51788 + return 1;
51789 +}
51790 +
51791 +/* plugin->u.item.b.print
51792 + plugin->u.item.b.check */
51793 +
51794 +/* plugin->u.item.b.nr_units */
51795 +pos_in_node_t nr_units_tail(const coord_t * coord)
51796 +{
51797 + return item_length_by_coord(coord);
51798 +}
51799 +
51800 +/* plugin->u.item.b.lookup */
51801 +lookup_result
51802 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
51803 +{
51804 + reiser4_key item_key;
51805 + __u64 lookuped, offset;
51806 + unsigned nr_units;
51807 +
51808 + item_key_by_coord(coord, &item_key);
51809 + offset = get_key_offset(item_key_by_coord(coord, &item_key));
51810 + nr_units = nr_units_tail(coord);
51811 +
51812 + /* key we are looking for must be greater than key of item @coord */
51813 + assert("vs-416", keygt(key, &item_key));
51814 +
51815 + /* offset we are looking for */
51816 + lookuped = get_key_offset(key);
51817 +
51818 + if (lookuped >= offset && lookuped < offset + nr_units) {
51819 + /* byte we are looking for is in this item */
51820 + coord->unit_pos = lookuped - offset;
51821 + coord->between = AT_UNIT;
51822 + return CBK_COORD_FOUND;
51823 + }
51824 +
51825 + /* set coord after last unit */
51826 + coord->unit_pos = nr_units - 1;
51827 + coord->between = AFTER_UNIT;
51828 + return bias ==
51829 + FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
51830 +}
51831 +
51832 +/* plugin->u.item.b.paste */
51833 +int
51834 +paste_tail(coord_t *coord, reiser4_item_data *data,
51835 + carry_plugin_info *info UNUSED_ARG)
51836 +{
51837 + unsigned old_item_length;
51838 + char *item;
51839 +
51840 + /* length the item had before resizing has been performed */
51841 + old_item_length = item_length_by_coord(coord) - data->length;
51842 +
51843 + /* tail items never get pasted in the middle */
51844 + assert("vs-363",
51845 + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
51846 + (coord->unit_pos == old_item_length - 1 &&
51847 + coord->between == AFTER_UNIT) ||
51848 + (coord->unit_pos == 0 && old_item_length == 0
51849 + && coord->between == AT_UNIT));
51850 +
51851 + item = item_body_by_coord(coord);
51852 + if (coord->unit_pos == 0)
51853 + /* make space for pasted data when pasting at the beginning of
51854 + the item */
51855 + memmove(item + data->length, item, old_item_length);
51856 +
51857 + if (coord->between == AFTER_UNIT)
51858 + coord->unit_pos++;
51859 +
51860 + if (data->data) {
51861 + assert("vs-554", data->user == 0 || data->user == 1);
51862 + if (data->user) {
51863 + assert("nikita-3035", reiser4_schedulable());
51864 + /* copy from user space */
51865 + if (__copy_from_user(item + coord->unit_pos,
51866 + (const char __user *)data->data,
51867 + (unsigned)data->length))
51868 + return RETERR(-EFAULT);
51869 + } else
51870 + /* copy from kernel space */
51871 + memcpy(item + coord->unit_pos, data->data,
51872 + (unsigned)data->length);
51873 + } else {
51874 + memset(item + coord->unit_pos, 0, (unsigned)data->length);
51875 + }
51876 + return 0;
51877 +}
51878 +
51879 +/* plugin->u.item.b.fast_paste */
51880 +
51881 +/* plugin->u.item.b.can_shift
51882 + number of units is returned via return value, number of bytes via @size. For
51883 + tail items they coincide */
51884 +int
51885 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
51886 + znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
51887 + unsigned *size, unsigned want)
51888 +{
51889 + /* make sure that that we do not want to shift more than we have */
51890 + assert("vs-364", want > 0
51891 + && want <= (unsigned)item_length_by_coord(source));
51892 +
51893 + *size = min(want, free_space);
51894 + return *size;
51895 +}
51896 +
51897 +/* plugin->u.item.b.copy_units */
51898 +void
51899 +copy_units_tail(coord_t * target, coord_t * source,
51900 + unsigned from, unsigned count,
51901 + shift_direction where_is_free_space,
51902 + unsigned free_space UNUSED_ARG)
51903 +{
51904 + /* make sure that item @target is expanded already */
51905 + assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
51906 + assert("vs-370", free_space >= count);
51907 +
51908 + if (where_is_free_space == SHIFT_LEFT) {
51909 + /* append item @target with @count first bytes of @source */
51910 + assert("vs-365", from == 0);
51911 +
51912 + memcpy((char *)item_body_by_coord(target) +
51913 + item_length_by_coord(target) - count,
51914 + (char *)item_body_by_coord(source), count);
51915 + } else {
51916 + /* target item is moved to right already */
51917 + reiser4_key key;
51918 +
51919 + assert("vs-367",
51920 + (unsigned)item_length_by_coord(source) == from + count);
51921 +
51922 + memcpy((char *)item_body_by_coord(target),
51923 + (char *)item_body_by_coord(source) + from, count);
51924 +
51925 + /* new units are inserted before first unit in an item,
51926 + therefore, we have to update item key */
51927 + item_key_by_coord(source, &key);
51928 + set_key_offset(&key, get_key_offset(&key) + from);
51929 +
51930 + node_plugin_by_node(target->node)->update_item_key(target, &key,
51931 + NULL /*info */);
51932 + }
51933 +}
51934 +
51935 +/* plugin->u.item.b.create_hook */
51936 +
51937 +/* item_plugin->b.kill_hook
51938 + this is called when @count units starting from @from-th one are going to be removed
51939 + */
51940 +int
51941 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
51942 + pos_in_node_t count, struct carry_kill_data *kdata)
51943 +{
51944 + reiser4_key key;
51945 + loff_t start, end;
51946 +
51947 + assert("vs-1577", kdata);
51948 + assert("vs-1579", kdata->inode);
51949 +
51950 + item_key_by_coord(coord, &key);
51951 + start = get_key_offset(&key) + from;
51952 + end = start + count;
51953 + fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
51954 + return 0;
51955 +}
51956 +
51957 +/* plugin->u.item.b.shift_hook */
51958 +
51959 +/* helper for kill_units_tail and cut_units_tail */
51960 +static int
51961 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51962 + reiser4_key * smallest_removed, reiser4_key * new_first)
51963 +{
51964 + pos_in_node_t count;
51965 +
51966 + /* this method is only called to remove part of item */
51967 + assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
51968 + /* tails items are never cut from the middle of an item */
51969 + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
51970 + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
51971 +
51972 + count = to - from + 1;
51973 +
51974 + if (smallest_removed) {
51975 + /* store smallest key removed */
51976 + item_key_by_coord(coord, smallest_removed);
51977 + set_key_offset(smallest_removed,
51978 + get_key_offset(smallest_removed) + from);
51979 + }
51980 + if (new_first) {
51981 + /* head of item is cut */
51982 + assert("vs-1529", from == 0);
51983 +
51984 + item_key_by_coord(coord, new_first);
51985 + set_key_offset(new_first,
51986 + get_key_offset(new_first) + from + count);
51987 + }
51988 +
51989 + if (REISER4_DEBUG)
51990 + memset((char *)item_body_by_coord(coord) + from, 0, count);
51991 + return count;
51992 +}
51993 +
51994 +/* plugin->u.item.b.cut_units */
51995 +int
51996 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51997 + struct carry_cut_data *cdata UNUSED_ARG,
51998 + reiser4_key * smallest_removed, reiser4_key * new_first)
51999 +{
52000 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52001 +}
52002 +
52003 +/* plugin->u.item.b.kill_units */
52004 +int
52005 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52006 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
52007 + reiser4_key * new_first)
52008 +{
52009 + kill_hook_tail(coord, from, to - from + 1, kdata);
52010 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52011 +}
52012 +
52013 +/* plugin->u.item.b.unit_key */
52014 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
52015 +{
52016 + assert("vs-375", coord_is_existing_unit(coord));
52017 +
52018 + item_key_by_coord(coord, key);
52019 + set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
52020 +
52021 + return key;
52022 +}
52023 +
52024 +/* plugin->u.item.b.estimate
52025 + plugin->u.item.b.item_data_by_flow */
52026 +
52027 +/* tail redpage function. It is called from readpage_tail(). */
52028 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
52029 +{
52030 + tap_t tap;
52031 + int result;
52032 + coord_t coord;
52033 + lock_handle lh;
52034 + int count, mapped;
52035 + struct inode *inode;
52036 + char *pagedata;
52037 +
52038 + /* saving passed coord in order to do not move it by tap. */
52039 + init_lh(&lh);
52040 + copy_lh(&lh, uf_coord->lh);
52041 + inode = page->mapping->host;
52042 + coord_dup(&coord, &uf_coord->coord);
52043 +
52044 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
52045 +
52046 + if ((result = reiser4_tap_load(&tap)))
52047 + goto out_tap_done;
52048 +
52049 + /* lookup until page is filled up. */
52050 + for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
52051 + /* number of bytes to be copied to page */
52052 + count = item_length_by_coord(&coord) - coord.unit_pos;
52053 + if (count > PAGE_CACHE_SIZE - mapped)
52054 + count = PAGE_CACHE_SIZE - mapped;
52055 +
52056 + /* attach @page to address space and get data address */
52057 + pagedata = kmap_atomic(page, KM_USER0);
52058 +
52059 + /* copy tail item to page */
52060 + memcpy(pagedata + mapped,
52061 + ((char *)item_body_by_coord(&coord) + coord.unit_pos),
52062 + count);
52063 + mapped += count;
52064 +
52065 + flush_dcache_page(page);
52066 +
52067 + /* dettach page from address space */
52068 + kunmap_atomic(pagedata, KM_USER0);
52069 +
52070 + /* Getting next tail item. */
52071 + if (mapped < PAGE_CACHE_SIZE) {
52072 + /*
52073 + * unlock page in order to avoid keep it locked
52074 + * during tree lookup, which takes long term locks
52075 + */
52076 + unlock_page(page);
52077 +
52078 + /* getting right neighbour. */
52079 + result = go_dir_el(&tap, RIGHT_SIDE, 0);
52080 +
52081 + /* lock page back */
52082 + lock_page(page);
52083 + if (PageUptodate(page)) {
52084 + /*
52085 + * another thread read the page, we have
52086 + * nothing to do
52087 + */
52088 + result = 0;
52089 + goto out_unlock_page;
52090 + }
52091 +
52092 + if (result) {
52093 + if (result == -E_NO_NEIGHBOR) {
52094 + /*
52095 + * rigth neighbor is not a formatted
52096 + * node
52097 + */
52098 + result = 0;
52099 + goto done;
52100 + } else {
52101 + goto out_tap_relse;
52102 + }
52103 + } else {
52104 + if (!inode_file_plugin(inode)->
52105 + owns_item(inode, &coord)) {
52106 + /* item of another file is found */
52107 + result = 0;
52108 + goto done;
52109 + }
52110 + }
52111 + }
52112 + }
52113 +
52114 + done:
52115 + if (mapped != PAGE_CACHE_SIZE)
52116 + zero_user_page(page, mapped, PAGE_CACHE_SIZE - mapped,
52117 + KM_USER0);
52118 + SetPageUptodate(page);
52119 + out_unlock_page:
52120 + unlock_page(page);
52121 + out_tap_relse:
52122 + reiser4_tap_relse(&tap);
52123 + out_tap_done:
52124 + reiser4_tap_done(&tap);
52125 + return result;
52126 +}
52127 +
52128 +/*
52129 + plugin->s.file.readpage
52130 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
52131 + or
52132 + filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_tail
52133 +
52134 + At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
52135 + item. */
52136 +int readpage_tail(void *vp, struct page *page)
52137 +{
52138 + uf_coord_t *uf_coord = vp;
52139 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
52140 + ON_DEBUG(reiser4_key key);
52141 +
52142 + assert("umka-2515", PageLocked(page));
52143 + assert("umka-2516", !PageUptodate(page));
52144 + assert("umka-2517", !jprivate(page) && !PagePrivate(page));
52145 + assert("umka-2518", page->mapping && page->mapping->host);
52146 +
52147 + assert("umka-2519", znode_is_loaded(coord->node));
52148 + assert("umka-2520", item_is_tail(coord));
52149 + assert("umka-2521", coord_is_existing_unit(coord));
52150 + assert("umka-2522", znode_is_rlocked(coord->node));
52151 + assert("umka-2523",
52152 + page->mapping->host->i_ino ==
52153 + get_key_objectid(item_key_by_coord(coord, &key)));
52154 +
52155 + return do_readpage_tail(uf_coord, page);
52156 +}
52157 +
52158 +/**
52159 + * overwrite_tail
52160 + * @flow:
52161 + * @coord:
52162 + *
52163 + * Overwrites tail item or its part by user data. Returns number of bytes
52164 + * written or error code.
52165 + */
52166 +static int overwrite_tail(flow_t *flow, coord_t *coord)
52167 +{
52168 + unsigned count;
52169 +
52170 + assert("vs-570", flow->user == 1);
52171 + assert("vs-946", flow->data);
52172 + assert("vs-947", coord_is_existing_unit(coord));
52173 + assert("vs-948", znode_is_write_locked(coord->node));
52174 + assert("nikita-3036", reiser4_schedulable());
52175 +
52176 + count = item_length_by_coord(coord) - coord->unit_pos;
52177 + if (count > flow->length)
52178 + count = flow->length;
52179 +
52180 + if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
52181 + (const char __user *)flow->data, count))
52182 + return RETERR(-EFAULT);
52183 +
52184 + znode_make_dirty(coord->node);
52185 + return count;
52186 +}
52187 +
52188 +/**
52189 + * insert_first_tail
52190 + * @inode:
52191 + * @flow:
52192 + * @coord:
52193 + * @lh:
52194 + *
52195 + * Returns number of bytes written or error code.
52196 + */
52197 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
52198 + coord_t *coord, lock_handle *lh)
52199 +{
52200 + int result;
52201 + loff_t to_write;
52202 + struct unix_file_info *uf_info;
52203 +
52204 + if (get_key_offset(&flow->key) != 0) {
52205 + /*
52206 + * file is empty and we have to write not to the beginning of
52207 + * file. Create a hole at the beginning of file. On success
52208 + * insert_flow returns 0 as number of written bytes which is
52209 + * what we have to return on padding a file with holes
52210 + */
52211 + flow->data = NULL;
52212 + flow->length = get_key_offset(&flow->key);
52213 + set_key_offset(&flow->key, 0);
52214 + /*
52215 + * holes in files built of tails are stored just like if there
52216 + * were real data which are all zeros. Therefore we have to
52217 + * allocate quota here as well
52218 + */
52219 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52220 + return RETERR(-EDQUOT);
52221 + result = reiser4_insert_flow(coord, lh, flow);
52222 + if (flow->length)
52223 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52224 +
52225 + uf_info = unix_file_inode_data(inode);
52226 +
52227 + /*
52228 + * first item insertion is only possible when writing to empty
52229 + * file or performing tail conversion
52230 + */
52231 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
52232 + (reiser4_inode_get_flag(inode,
52233 + REISER4_PART_MIXED) &&
52234 + reiser4_inode_get_flag(inode,
52235 + REISER4_PART_IN_CONV))));
52236 + /* if file was empty - update its state */
52237 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
52238 + uf_info->container = UF_CONTAINER_TAILS;
52239 + return result;
52240 + }
52241 +
52242 + /* check quota before appending data */
52243 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52244 + return RETERR(-EDQUOT);
52245 +
52246 + to_write = flow->length;
52247 + result = reiser4_insert_flow(coord, lh, flow);
52248 + if (flow->length)
52249 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52250 + return (to_write - flow->length) ? (to_write - flow->length) : result;
52251 +}
52252 +
52253 +/**
52254 + * append_tail
52255 + * @inode:
52256 + * @flow:
52257 + * @coord:
52258 + * @lh:
52259 + *
52260 + * Returns number of bytes written or error code.
52261 + */
52262 +static ssize_t append_tail(struct inode *inode,
52263 + flow_t *flow, coord_t *coord, lock_handle *lh)
52264 +{
52265 + int result;
52266 + reiser4_key append_key;
52267 + loff_t to_write;
52268 +
52269 + if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
52270 + flow->data = NULL;
52271 + flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
52272 + set_key_offset(&flow->key, get_key_offset(&append_key));
52273 + /*
52274 + * holes in files built of tails are stored just like if there
52275 + * were real data which are all zeros. Therefore we have to
52276 + * allocate quota here as well
52277 + */
52278 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52279 + return RETERR(-EDQUOT);
52280 + result = reiser4_insert_flow(coord, lh, flow);
52281 + if (flow->length)
52282 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52283 + return result;
52284 + }
52285 +
52286 + /* check quota before appending data */
52287 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52288 + return RETERR(-EDQUOT);
52289 +
52290 + to_write = flow->length;
52291 + result = reiser4_insert_flow(coord, lh, flow);
52292 + if (flow->length)
52293 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52294 + return (to_write - flow->length) ? (to_write - flow->length) : result;
52295 +}
52296 +
52297 +/**
52298 + * write_extent_reserve_space - reserve space for tail write operation
52299 + * @inode:
52300 + *
52301 + * Estimates and reserves space which may be required for writing one flow to a
52302 + * file
52303 + */
52304 +static int write_extent_reserve_space(struct inode *inode)
52305 +{
52306 + __u64 count;
52307 + reiser4_tree *tree;
52308 +
52309 + /*
52310 + * to write one flow to a file by tails we have to reserve disk space for:
52311 +
52312 + * 1. find_file_item may have to insert empty node to the tree (empty
52313 + * leaf node between two extent items). This requires 1 block and
52314 + * number of blocks which are necessary to perform insertion of an
52315 + * internal item into twig level.
52316 + *
52317 + * 2. flow insertion
52318 + *
52319 + * 3. stat data update
52320 + */
52321 + tree = reiser4_tree_by_inode(inode);
52322 + count = estimate_one_insert_item(tree) +
52323 + estimate_insert_flow(tree->height) +
52324 + estimate_one_insert_item(tree);
52325 + grab_space_enable();
52326 + return reiser4_grab_space(count, 0 /* flags */);
52327 +}
52328 +
52329 +#define PAGE_PER_FLOW 4
52330 +
52331 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
52332 +{
52333 + loff_t faulted;
52334 + int to_fault;
52335 +
52336 + if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
52337 + count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
52338 + faulted = 0;
52339 + while (count > 0) {
52340 + to_fault = PAGE_CACHE_SIZE;
52341 + if (count < to_fault)
52342 + to_fault = count;
52343 + fault_in_pages_readable(buf + faulted, to_fault);
52344 + count -= to_fault;
52345 + faulted += to_fault;
52346 + }
52347 + return faulted;
52348 +}
52349 +
52350 +/**
52351 + * reiser4_write_tail - write method of tail item plugin
52352 + * @file: file to write to
52353 + * @buf: address of user-space buffer
52354 + * @count: number of bytes to write
52355 + * @pos: position in file to write to
52356 + *
52357 + * Returns number of written bytes or error code.
52358 + */
52359 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
52360 + size_t count, loff_t *pos)
52361 +{
52362 + struct inode *inode;
52363 + struct hint hint;
52364 + int result;
52365 + flow_t flow;
52366 + coord_t *coord;
52367 + lock_handle *lh;
52368 + znode *loaded;
52369 +
52370 + inode = file->f_dentry->d_inode;
52371 +
52372 + if (write_extent_reserve_space(inode))
52373 + return RETERR(-ENOSPC);
52374 +
52375 + result = load_file_hint(file, &hint);
52376 + BUG_ON(result != 0);
52377 +
52378 + flow.length = faultin_user_pages(buf, count);
52379 + flow.user = 1;
52380 + memcpy(&flow.data, &buf, sizeof(buf));
52381 + flow.op = WRITE_OP;
52382 + key_by_inode_and_offset_common(inode, *pos, &flow.key);
52383 +
52384 + result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
52385 + if (IS_CBKERR(result))
52386 + return result;
52387 +
52388 + coord = &hint.ext_coord.coord;
52389 + lh = hint.ext_coord.lh;
52390 +
52391 + result = zload(coord->node);
52392 + BUG_ON(result != 0);
52393 + loaded = coord->node;
52394 +
52395 + if (coord->between == AFTER_UNIT) {
52396 + /* append with data or hole */
52397 + result = append_tail(inode, &flow, coord, lh);
52398 + } else if (coord->between == AT_UNIT) {
52399 + /* overwrite */
52400 + result = overwrite_tail(&flow, coord);
52401 + } else {
52402 + /* no items of this file yet. insert data or hole */
52403 + result = insert_first_tail(inode, &flow, coord, lh);
52404 + }
52405 + zrelse(loaded);
52406 + if (result < 0) {
52407 + done_lh(lh);
52408 + return result;
52409 + }
52410 +
52411 + /* seal and unlock znode; NOTE(review): valid is forced to 0, set_hint branch is dead */
52412 + hint.ext_coord.valid = 0;
52413 + if (hint.ext_coord.valid)
52414 + reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
52415 + else
52416 + reiser4_unset_hint(&hint);
52417 +
52418 + save_file_hint(file, &hint);
52419 + return result;
52420 +}
52421 +
52422 +#if REISER4_DEBUG
52423 +
52424 +static int
52425 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
52426 +{
52427 + reiser4_key item_key;
52428 +
52429 + assert("vs-1356", coord_is_existing_unit(coord));
52430 + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
52431 + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
52432 + return get_key_offset(key) ==
52433 + get_key_offset(&item_key) + coord->unit_pos;
52434 +
52435 +}
52436 +
52437 +#endif
52438 +
52439 +/* plugin->u.item.s.file.read */
52440 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52441 +{
52442 + unsigned count;
52443 + int item_length;
52444 + coord_t *coord;
52445 + uf_coord_t *uf_coord;
52446 +
52447 + uf_coord = &hint->ext_coord;
52448 + coord = &uf_coord->coord;
52449 +
52450 + assert("vs-571", f->user == 1);
52451 + assert("vs-571", f->data);
52452 + assert("vs-967", coord && coord->node);
52453 + assert("vs-1117", znode_is_rlocked(coord->node));
52454 + assert("vs-1118", znode_is_loaded(coord->node));
52455 +
52456 + assert("nikita-3037", reiser4_schedulable());
52457 + assert("vs-1357", coord_matches_key_tail(coord, &f->key));
52458 +
52459 + /* calculate number of bytes to read off the item */
52460 + item_length = item_length_by_coord(coord);
52461 + count = item_length_by_coord(coord) - coord->unit_pos;
52462 + if (count > f->length)
52463 + count = f->length;
52464 +
52465 + /* user page has to be brought in so that major page fault does not
52466 + * occur here when longterm lock is held */
52467 + if (__copy_to_user((char __user *)f->data,
52468 + ((char *)item_body_by_coord(coord) + coord->unit_pos),
52469 + count))
52470 + return RETERR(-EFAULT);
52471 +
52472 + /* probably mark_page_accessed() should only be called if
52473 + * coord->unit_pos is zero. */
52474 + mark_page_accessed(znode_page(coord->node));
52475 + move_flow_forward(f, count);
52476 +
52477 + coord->unit_pos += count;
52478 + if (item_length == coord->unit_pos) {
52479 + coord->unit_pos--;
52480 + coord->between = AFTER_UNIT;
52481 + }
52482 + reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
52483 + return 0;
52484 +}
52485 +
52486 +/*
52487 + plugin->u.item.s.file.append_key
52488 + key of the first byte which is next after the last byte addressed by this item
52489 +*/
52490 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
52491 +{
52492 + item_key_by_coord(coord, key);
52493 + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
52494 + return key;
52495 +}
52496 +
52497 +/* plugin->u.item.s.file.init_coord_extension */
52498 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
52499 +{
52500 + uf_coord->valid = 1;
52501 +}
52502 +
52503 +/*
52504 + plugin->u.item.s.file.get_block
52505 +*/
52506 +int
52507 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
52508 +{
52509 + assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
52510 +
52511 + if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
52512 + /* if node hasn't obtained its block number yet, return 0.
52513 + * Let's avoid upsetting users with some cosmic numbers beyond
52514 + * the device capacity.*/
52515 + *block = 0;
52516 + else
52517 + *block = *znode_get_block(coord->node);
52518 + return 0;
52519 +}
52520 +
52521 +/*
52522 + * Local variables:
52523 + * c-indentation-style: "K&R"
52524 + * mode-name: "LC"
52525 + * c-basic-offset: 8
52526 + * tab-width: 8
52527 + * fill-column: 79
52528 + * scroll-step: 1
52529 + * End:
52530 + */
52531 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/item/tail.h linux-2.6.23/fs/reiser4/plugin/item/tail.h
52532 --- linux-2.6.23.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
52533 +++ linux-2.6.23/fs/reiser4/plugin/item/tail.h 2007-12-04 16:49:30.000000000 +0300
52534 @@ -0,0 +1,58 @@
52535 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52536 +
52537 +#if !defined( __REISER4_TAIL_H__ )
52538 +#define __REISER4_TAIL_H__
52539 +
52540 +struct tail_coord_extension {
52541 + int not_used;
52542 +};
52543 +
52544 +struct cut_list;
52545 +
52546 +/* plugin->u.item.b.* */
52547 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
52548 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
52549 + const reiser4_item_data *);
52550 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
52551 +pos_in_node_t nr_units_tail(const coord_t *);
52552 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
52553 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
52554 +int can_shift_tail(unsigned free_space, coord_t * source,
52555 + znode * target, shift_direction, unsigned *size,
52556 + unsigned want);
52557 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
52558 + unsigned count, shift_direction, unsigned free_space);
52559 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
52560 + struct carry_kill_data *);
52561 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52562 + struct carry_cut_data *, reiser4_key * smallest_removed,
52563 + reiser4_key * new_first);
52564 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52565 + struct carry_kill_data *, reiser4_key * smallest_removed,
52566 + reiser4_key * new_first);
52567 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
52568 +
52569 +/* plugin->u.item.s.* */
52570 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
52571 + size_t count, loff_t *pos);
52572 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
52573 +int readpage_tail(void *vp, struct page *page);
52574 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
52575 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
52576 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
52577 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
52578 + hint_t *, int back_to_dirty, int set_hint);
52579 +
52580 +/* __REISER4_TAIL_H__ */
52581 +#endif
52582 +
52583 +/* Make Linus happy.
52584 + Local variables:
52585 + c-indentation-style: "K&R"
52586 + mode-name: "LC"
52587 + c-basic-offset: 8
52588 + tab-width: 8
52589 + fill-column: 120
52590 + scroll-step: 1
52591 + End:
52592 +*/
52593 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/Makefile linux-2.6.23/fs/reiser4/plugin/Makefile
52594 --- linux-2.6.23.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
52595 +++ linux-2.6.23/fs/reiser4/plugin/Makefile 2007-12-04 16:49:30.000000000 +0300
52596 @@ -0,0 +1,26 @@
52597 +obj-$(CONFIG_REISER4_FS) += plugins.o
52598 +
52599 +plugins-objs := \
52600 + plugin.o \
52601 + plugin_set.o \
52602 + object.o \
52603 + inode_ops.o \
52604 + inode_ops_rename.o \
52605 + file_ops.o \
52606 + file_ops_readdir.o \
52607 + file_plugin_common.o \
52608 + dir_plugin_common.o \
52609 + digest.o \
52610 + hash.o \
52611 + fibration.o \
52612 + tail_policy.o \
52613 + regular.o
52614 +
52615 +obj-$(CONFIG_REISER4_FS) += item/
52616 +obj-$(CONFIG_REISER4_FS) += file/
52617 +obj-$(CONFIG_REISER4_FS) += dir/
52618 +obj-$(CONFIG_REISER4_FS) += node/
52619 +obj-$(CONFIG_REISER4_FS) += compress/
52620 +obj-$(CONFIG_REISER4_FS) += space/
52621 +obj-$(CONFIG_REISER4_FS) += disk_format/
52622 +obj-$(CONFIG_REISER4_FS) += security/
52623 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/Makefile linux-2.6.23/fs/reiser4/plugin/node/Makefile
52624 --- linux-2.6.23.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
52625 +++ linux-2.6.23/fs/reiser4/plugin/node/Makefile 2007-12-04 16:49:30.000000000 +0300
52626 @@ -0,0 +1,5 @@
52627 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
52628 +
52629 +node_plugins-objs := \
52630 + node.o \
52631 + node40.o
52632 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node40.c linux-2.6.23/fs/reiser4/plugin/node/node40.c
52633 --- linux-2.6.23.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
52634 +++ linux-2.6.23/fs/reiser4/plugin/node/node40.c 2007-12-04 16:49:30.000000000 +0300
52635 @@ -0,0 +1,2924 @@
52636 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52637 +
52638 +#include "../../debug.h"
52639 +#include "../../key.h"
52640 +#include "../../coord.h"
52641 +#include "../plugin_header.h"
52642 +#include "../item/item.h"
52643 +#include "node.h"
52644 +#include "node40.h"
52645 +#include "../plugin.h"
52646 +#include "../../jnode.h"
52647 +#include "../../znode.h"
52648 +#include "../../pool.h"
52649 +#include "../../carry.h"
52650 +#include "../../tap.h"
52651 +#include "../../tree.h"
52652 +#include "../../super.h"
52653 +#include "../../reiser4.h"
52654 +
52655 +#include <asm/uaccess.h>
52656 +#include <linux/types.h>
52657 +#include <linux/prefetch.h>
52658 +
52659 +/* leaf 40 format:
52660 +
52661 + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
52662 + plugin_id (16) key
52663 + free_space (16) pluginid (16)
52664 + free_space_start (16) offset (16)
52665 + level (8)
52666 + num_items (16)
52667 + magic (32)
52668 + flush_time (32)
52669 +*/
52670 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
52671 +/* magic number that is stored in ->magic field of node header */
52672 +static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
52673 +
52674 +static int prepare_for_update(znode * left, znode * right,
52675 + carry_plugin_info * info);
52676 +
52677 +/* header of node of reiser40 format is at the beginning of node */
52678 +static inline node40_header *node40_node_header(const znode * node /* node to
52679 + * query */ )
52680 +{
52681 + assert("nikita-567", node != NULL);
52682 + assert("nikita-568", znode_page(node) != NULL);
52683 + assert("nikita-569", zdata(node) != NULL);
52684 + return (node40_header *) zdata(node);
52685 +}
52686 +
52687 +/* functions to get/set fields of node40_header */
52688 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
52689 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
52690 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
52691 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
52692 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
52693 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
52694 +
52695 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
52696 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
52697 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
52698 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
52699 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
52700 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
52701 +
52702 +/* plugin field of node header should be read/set by
52703 + plugin_by_disk_id/save_disk_plugin */
52704 +
52705 +/* array of item headers is at the end of node */
52706 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
52707 +{
52708 + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
52709 +}
52710 +
52711 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
52712 + */
52713 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
52714 +{
52715 + return (item_header40 *) (zdata(coord->node) +
52716 + znode_size(coord->node)) - (coord->item_pos) -
52717 + 1;
52718 +}
52719 +
52720 +/* functions to get/set fields of item_header40 */
52721 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
52722 +
52723 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
52724 +
52725 +/* plugin field of item header should be read/set by
52726 + plugin_by_disk_id/save_disk_plugin */
52727 +
52728 +/* plugin methods */
52729 +
52730 +/* plugin->u.node.item_overhead
52731 + look for description of this method in plugin/node/node.h */
52732 +size_t
52733 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
52734 +{
52735 + return sizeof(item_header40);
52736 +}
52737 +
52738 +/* plugin->u.node.free_space
52739 + look for description of this method in plugin/node/node.h */
52740 +size_t free_space_node40(znode * node)
52741 +{
52742 + assert("nikita-577", node != NULL);
52743 + assert("nikita-578", znode_is_loaded(node));
52744 + assert("nikita-579", zdata(node) != NULL);
52745 +
52746 + return nh40_get_free_space(node40_node_header(node));
52747 +}
52748 +
52749 +/* private inline version of node40_num_of_items() for use in this file. This
52750 + is necessary, because address of node40_num_of_items() is taken and it is
52751 + never inlined as a result. */
52752 +static inline short node40_num_of_items_internal(const znode * node)
52753 +{
52754 + return nh40_get_num_items(node40_node_header(node));
52755 +}
52756 +
52757 +#if REISER4_DEBUG
52758 +static inline void check_num_items(const znode * node)
52759 +{
52760 + assert("nikita-2749",
52761 + node40_num_of_items_internal(node) == node->nr_items);
52762 + assert("nikita-2746", znode_is_write_locked(node));
52763 +}
52764 +#else
52765 +#define check_num_items(node) noop
52766 +#endif
52767 +
52768 +/* plugin->u.node.num_of_items
52769 + look for description of this method in plugin/node/node.h */
52770 +int num_of_items_node40(const znode * node)
52771 +{
52772 + return node40_num_of_items_internal(node);
52773 +}
52774 +
52775 +static void
52776 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
52777 +{
52778 + assert("nikita-2751", node != NULL);
52779 + assert("nikita-2750", nh == node40_node_header(node));
52780 +
52781 + check_num_items(node);
52782 + nh40_set_num_items(nh, value);
52783 + node->nr_items = value;
52784 + check_num_items(node);
52785 +}
52786 +
52787 +/* plugin->u.node.item_by_coord
52788 + look for description of this method in plugin/node/node.h */
52789 +char *item_by_coord_node40(const coord_t * coord)
52790 +{
52791 + item_header40 *ih;
52792 + char *p;
52793 +
52794 + /* @coord is set to existing item */
52795 + assert("nikita-596", coord != NULL);
52796 + assert("vs-255", coord_is_existing_item(coord));
52797 +
52798 + ih = node40_ih_at_coord(coord);
52799 + p = zdata(coord->node) + ih40_get_offset(ih);
52800 + return p;
52801 +}
52802 +
52803 +/* plugin->u.node.length_by_coord
52804 + look for description of this method in plugin/node/node.h */
52805 +int length_by_coord_node40(const coord_t * coord)
52806 +{
52807 + item_header40 *ih;
52808 + int result;
52809 +
52810 + /* @coord is set to existing item */
52811 + assert("vs-256", coord != NULL);
52812 + assert("vs-257", coord_is_existing_item(coord));
52813 +
52814 + ih = node40_ih_at_coord(coord);
52815 + if ((int)coord->item_pos ==
52816 + node40_num_of_items_internal(coord->node) - 1)
52817 + result =
52818 + nh40_get_free_space_start(node40_node_header(coord->node)) -
52819 + ih40_get_offset(ih);
52820 + else
52821 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52822 +
52823 + return result;
52824 +}
52825 +
52826 +static pos_in_node_t
52827 +node40_item_length(const znode * node, pos_in_node_t item_pos)
52828 +{
52829 + item_header40 *ih;
52830 + pos_in_node_t result;
52831 +
52832 + /* @coord is set to existing item */
52833 + assert("vs-256", node != NULL);
52834 + assert("vs-257", node40_num_of_items_internal(node) > item_pos);
52835 +
52836 + ih = node40_ih_at(node, item_pos);
52837 + if (item_pos == node40_num_of_items_internal(node) - 1)
52838 + result =
52839 + nh40_get_free_space_start(node40_node_header(node)) -
52840 + ih40_get_offset(ih);
52841 + else
52842 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52843 +
52844 + return result;
52845 +}
52846 +
52847 +/* plugin->u.node.plugin_by_coord
52848 + look for description of this method in plugin/node/node.h */
52849 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
52850 +{
52851 + item_header40 *ih;
52852 + item_plugin *result;
52853 +
52854 + /* @coord is set to existing item */
52855 + assert("vs-258", coord != NULL);
52856 + assert("vs-259", coord_is_existing_item(coord));
52857 +
52858 + ih = node40_ih_at_coord(coord);
52859 + /* pass NULL instead of current tree. This is a time-critical call. */
52860 + result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
52861 + return result;
52862 +}
52863 +
52864 +/* plugin->u.node.key_at
52865 + look for description of this method in plugin/node/node.h */
52866 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
52867 +{
52868 + item_header40 *ih;
52869 +
52870 + assert("nikita-1765", coord_is_existing_item(coord));
52871 +
52872 + /* @coord is set to existing item */
52873 + ih = node40_ih_at_coord(coord);
52874 + memcpy(key, &ih->key, sizeof(reiser4_key));
52875 + return key;
52876 +}
52877 +
52878 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
52879 +
52880 +#define NODE_INCSTAT(n, counter) \
52881 + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
52882 +
52883 +#define NODE_ADDSTAT(n, counter, val) \
52884 + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
52885 +
52886 +/* plugin->u.node.lookup
52887 + look for description of this method in plugin/node/node.h */
52888 +node_search_result lookup_node40(znode * node /* node to query */ ,
52889 + const reiser4_key * key /* key to look for */ ,
52890 + lookup_bias bias /* search bias */ ,
52891 + coord_t * coord /* resulting coord */ )
52892 +{
52893 + int left;
52894 + int right;
52895 + int found;
52896 + int items;
52897 +
52898 + item_header40 *lefth;
52899 + item_header40 *righth;
52900 +
52901 + item_plugin *iplug;
52902 + item_header40 *bstop;
52903 + item_header40 *ih;
52904 + cmp_t order;
52905 +
52906 + assert("nikita-583", node != NULL);
52907 + assert("nikita-584", key != NULL);
52908 + assert("nikita-585", coord != NULL);
52909 + assert("nikita-2693", znode_is_any_locked(node));
52910 + cassert(REISER4_SEQ_SEARCH_BREAK > 2);
52911 +
52912 + items = node_num_items(node);
52913 +
52914 + if (unlikely(items == 0)) {
52915 + coord_init_first_unit(coord, node);
52916 + return NS_NOT_FOUND;
52917 + }
52918 +
52919 + /* binary search for item that can contain given key */
52920 + left = 0;
52921 + right = items - 1;
52922 + coord->node = node;
52923 + coord_clear_iplug(coord);
52924 + found = 0;
52925 +
52926 + lefth = node40_ih_at(node, left);
52927 + righth = node40_ih_at(node, right);
52928 +
52929 + /* It is known that for small arrays sequential search is on average
52930 + more efficient than binary. This is because sequential search is
52931 + coded as tight loop that can be better optimized by compilers and
52932 + for small array size gain from this optimization makes sequential
52933 + search the winner. Another, maybe more important, reason for this,
52934 + is that sequential array is more CPU cache friendly, whereas binary
52935 + search effectively destroys CPU caching.
52936 +
52937 + Critical here is the notion of "smallness". Reasonable value of
52938 + REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
52939 + fs/reiser4/ulevel/ulevel.c:test_search().
52940 +
52941 + Don't try to further optimize sequential search by scanning from
52942 + right to left in attempt to use more efficient loop termination
52943 + condition (comparison with 0). This doesn't work.
52944 +
52945 + */
52946 +
52947 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
52948 + int median;
52949 + item_header40 *medianh;
52950 +
52951 + median = (left + right) / 2;
52952 + medianh = node40_ih_at(node, median);
52953 +
52954 + assert("nikita-1084", median >= 0);
52955 + assert("nikita-1085", median < items);
52956 + switch (keycmp(key, &medianh->key)) {
52957 + case LESS_THAN:
52958 + right = median;
52959 + righth = medianh;
52960 + break;
52961 + default:
52962 + wrong_return_value("nikita-586", "keycmp");
52963 + case GREATER_THAN:
52964 + left = median;
52965 + lefth = medianh;
52966 + break;
52967 + case EQUAL_TO:
52968 + do {
52969 + --median;
52970 + /* headers are ordered from right to left */
52971 + ++medianh;
52972 + } while (median >= 0 && keyeq(key, &medianh->key));
52973 + right = left = median + 1;
52974 + ih = lefth = righth = medianh - 1;
52975 + found = 1;
52976 + break;
52977 + }
52978 + }
52979 + /* sequential scan. Item headers, and, therefore, keys are stored at
52980 + the rightmost part of a node from right to left. We are trying to
52981 + access memory from left to right, and hence, scan in _descending_
52982 + order of item numbers.
52983 + */
52984 + if (!found) {
52985 + for (left = right, ih = righth; left >= 0; ++ih, --left) {
52986 + cmp_t comparison;
52987 +
52988 + prefetchkey(&(ih + 1)->key);
52989 + comparison = keycmp(&ih->key, key);
52990 + if (comparison == GREATER_THAN)
52991 + continue;
52992 + if (comparison == EQUAL_TO) {
52993 + found = 1;
52994 + do {
52995 + --left;
52996 + ++ih;
52997 + } while (left >= 0 && keyeq(&ih->key, key));
52998 + ++left;
52999 + --ih;
53000 + } else {
53001 + assert("nikita-1256", comparison == LESS_THAN);
53002 + }
53003 + break;
53004 + }
53005 + if (unlikely(left < 0))
53006 + left = 0;
53007 + }
53008 +
53009 + assert("nikita-3212", right >= left);
53010 + assert("nikita-3214",
53011 + equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
53012 +
53013 + coord_set_item_pos(coord, left);
53014 + coord->unit_pos = 0;
53015 + coord->between = AT_UNIT;
53016 +
53017 + /* key < leftmost key in a node, or node is corrupted and keys
53018 + are not sorted */
53019 + bstop = node40_ih_at(node, (unsigned)left);
53020 + order = keycmp(&bstop->key, key);
53021 + if (unlikely(order == GREATER_THAN)) {
53022 + if (unlikely(left != 0)) {
53023 + /* screw up */
53024 + warning("nikita-587", "Key less than %i key in a node",
53025 + left);
53026 + reiser4_print_key("key", key);
53027 + reiser4_print_key("min", &bstop->key);
53028 + print_coord_content("coord", coord);
53029 + return RETERR(-EIO);
53030 + } else {
53031 + coord->between = BEFORE_UNIT;
53032 + return NS_NOT_FOUND;
53033 + }
53034 + }
53035 + /* left <= key, ok */
53036 + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
53037 +
53038 + if (unlikely(iplug == NULL)) {
53039 + warning("nikita-588", "Unknown plugin %i",
53040 + le16_to_cpu(get_unaligned(&bstop->plugin_id)));
53041 + reiser4_print_key("key", key);
53042 + print_coord_content("coord", coord);
53043 + return RETERR(-EIO);
53044 + }
53045 +
53046 + coord_set_iplug(coord, iplug);
53047 +
53048 + /* if exact key from item header was found by binary search, no
53049 + further checks are necessary. */
53050 + if (found) {
53051 + assert("nikita-1259", order == EQUAL_TO);
53052 + return NS_FOUND;
53053 + }
53054 + if (iplug->b.max_key_inside != NULL) {
53055 + reiser4_key max_item_key;
53056 +
53057 + /* key > max_item_key --- outside of an item */
53058 + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
53059 + coord->unit_pos = 0;
53060 + coord->between = AFTER_ITEM;
53061 + /* FIXME-VS: key we are looking for does not fit into
53062 + found item. Return NS_NOT_FOUND then. Without that
53063 + the following case does not work: there is extent of
53064 + file 10000, 10001. File 10000, 10002 has been just
53065 + created. When writing to position 0 in that file -
53066 + traverse_tree will stop here on twig level. When we
53067 + want it to go down to leaf level
53068 + */
53069 + return NS_NOT_FOUND;
53070 + }
53071 + }
53072 +
53073 + if (iplug->b.lookup != NULL) {
53074 + return iplug->b.lookup(key, bias, coord);
53075 + } else {
53076 + assert("nikita-1260", order == LESS_THAN);
53077 + coord->between = AFTER_UNIT;
53078 + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
53079 + }
53080 +}
53081 +
53082 +#undef NODE_ADDSTAT
53083 +#undef NODE_INCSTAT
53084 +
53085 +/* plugin->u.node.estimate
53086 + look for description of this method in plugin/node/node.h */
53087 +size_t estimate_node40(znode * node)
53088 +{
53089 + size_t result;
53090 +
53091 + assert("nikita-597", node != NULL);
53092 +
53093 + result = free_space_node40(node) - sizeof(item_header40);
53094 +
53095 + return (result > 0) ? result : 0;
53096 +}
53097 +
53098 +/* plugin->u.node.check
53099 + look for description of this method in plugin/node/node.h */
53100 +int check_node40(const znode * node /* node to check */ ,
53101 + __u32 flags /* check flags */ ,
53102 + const char **error /* where to store error message */ )
53103 +{
53104 + int nr_items;
53105 + int i;
53106 + reiser4_key prev;
53107 + unsigned old_offset;
53108 + tree_level level;
53109 + coord_t coord;
53110 + int result;
53111 +
53112 + assert("nikita-580", node != NULL);
53113 + assert("nikita-581", error != NULL);
53114 + assert("nikita-2948", znode_is_loaded(node));
53115 +
53116 + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
53117 + return 0;
53118 +
53119 + assert("nikita-582", zdata(node) != NULL);
53120 +
53121 + nr_items = node40_num_of_items_internal(node);
53122 + if (nr_items < 0) {
53123 + *error = "Negative number of items";
53124 + return -1;
53125 + }
53126 +
53127 + if (flags & REISER4_NODE_DKEYS)
53128 + prev = *znode_get_ld_key((znode *) node);
53129 + else
53130 + prev = *reiser4_min_key();
53131 +
53132 + old_offset = 0;
53133 + coord_init_zero(&coord);
53134 + coord.node = (znode *) node;
53135 + coord.unit_pos = 0;
53136 + coord.between = AT_UNIT;
53137 + level = znode_get_level(node);
53138 + for (i = 0; i < nr_items; i++) {
53139 + item_header40 *ih;
53140 + reiser4_key unit_key;
53141 + unsigned j;
53142 +
53143 + ih = node40_ih_at(node, (unsigned)i);
53144 + coord_set_item_pos(&coord, i);
53145 + if ((ih40_get_offset(ih) >=
53146 + znode_size(node) - nr_items * sizeof(item_header40)) ||
53147 + (ih40_get_offset(ih) < sizeof(node40_header))) {
53148 + *error = "Offset is out of bounds";
53149 + return -1;
53150 + }
53151 + if (ih40_get_offset(ih) <= old_offset) {
53152 + *error = "Offsets are in wrong order";
53153 + return -1;
53154 + }
53155 + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
53156 + *error = "Wrong offset of first item";
53157 + return -1;
53158 + }
53159 + old_offset = ih40_get_offset(ih);
53160 +
53161 + if (keygt(&prev, &ih->key)) {
53162 + *error = "Keys are in wrong order";
53163 + return -1;
53164 + }
53165 + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
53166 + *error = "Wrong key of first unit";
53167 + return -1;
53168 + }
53169 + prev = ih->key;
53170 + for (j = 0; j < coord_num_units(&coord); ++j) {
53171 + coord.unit_pos = j;
53172 + unit_key_by_coord(&coord, &unit_key);
53173 + if (keygt(&prev, &unit_key)) {
53174 + *error = "Unit keys are in wrong order";
53175 + return -1;
53176 + }
53177 + prev = unit_key;
53178 + }
53179 + coord.unit_pos = 0;
53180 + if (level != TWIG_LEVEL && item_is_extent(&coord)) {
53181 + *error = "extent on the wrong level";
53182 + return -1;
53183 + }
53184 + if (level == LEAF_LEVEL && item_is_internal(&coord)) {
53185 + *error = "internal item on the wrong level";
53186 + return -1;
53187 + }
53188 + if (level != LEAF_LEVEL &&
53189 + !item_is_internal(&coord) && !item_is_extent(&coord)) {
53190 + *error = "wrong item on the internal level";
53191 + return -1;
53192 + }
53193 + if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
53194 + *error = "non-internal item on the internal level";
53195 + return -1;
53196 + }
53197 +#if REISER4_DEBUG
53198 + if (item_plugin_by_coord(&coord)->b.check
53199 + && item_plugin_by_coord(&coord)->b.check(&coord, error))
53200 + return -1;
53201 +#endif
53202 + if (i) {
53203 + coord_t prev_coord;
53204 + /* two neighboring items can not be mergeable */
53205 + coord_dup(&prev_coord, &coord);
53206 + coord_prev_item(&prev_coord);
53207 + if (are_items_mergeable(&prev_coord, &coord)) {
53208 + *error = "mergeable items in one node";
53209 + return -1;
53210 + }
53211 +
53212 + }
53213 + }
53214 +
53215 + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
53216 + coord_t coord;
53217 + item_plugin *iplug;
53218 +
53219 + coord_init_last_unit(&coord, node);
53220 + iplug = item_plugin_by_coord(&coord);
53221 + if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
53222 + iplug->s.file.append_key != NULL) {
53223 + reiser4_key mkey;
53224 +
53225 + iplug->s.file.append_key(&coord, &mkey);
53226 + set_key_offset(&mkey, get_key_offset(&mkey) - 1);
53227 + read_lock_dk(current_tree);
53228 + result = keygt(&mkey, znode_get_rd_key((znode *) node));
53229 + read_unlock_dk(current_tree);
53230 + if (result) {
53231 + *error = "key of rightmost item is too large";
53232 + return -1;
53233 + }
53234 + }
53235 + }
53236 + if (flags & REISER4_NODE_DKEYS) {
53237 + read_lock_tree(current_tree);
53238 + read_lock_dk(current_tree);
53239 +
53240 + flags |= REISER4_NODE_TREE_STABLE;
53241 +
53242 + if (keygt(&prev, znode_get_rd_key((znode *) node))) {
53243 + if (flags & REISER4_NODE_TREE_STABLE) {
53244 + *error = "Last key is greater than rdkey";
53245 + read_unlock_dk(current_tree);
53246 + read_unlock_tree(current_tree);
53247 + return -1;
53248 + }
53249 + }
53250 + if (keygt
53251 + (znode_get_ld_key((znode *) node),
53252 + znode_get_rd_key((znode *) node))) {
53253 + *error = "ldkey is greater than rdkey";
53254 + read_unlock_dk(current_tree);
53255 + read_unlock_tree(current_tree);
53256 + return -1;
53257 + }
53258 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
53259 + (node->left != NULL) &&
53260 + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
53261 + ergo(flags & REISER4_NODE_TREE_STABLE,
53262 + !keyeq(znode_get_rd_key(node->left),
53263 + znode_get_ld_key((znode *) node)))
53264 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53265 + keygt(znode_get_rd_key(node->left),
53266 + znode_get_ld_key((znode *) node)))) {
53267 + *error = "left rdkey or ldkey is wrong";
53268 + read_unlock_dk(current_tree);
53269 + read_unlock_tree(current_tree);
53270 + return -1;
53271 + }
53272 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
53273 + (node->right != NULL) &&
53274 + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
53275 + ergo(flags & REISER4_NODE_TREE_STABLE,
53276 + !keyeq(znode_get_rd_key((znode *) node),
53277 + znode_get_ld_key(node->right)))
53278 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53279 + keygt(znode_get_rd_key((znode *) node),
53280 + znode_get_ld_key(node->right)))) {
53281 + *error = "rdkey or right ldkey is wrong";
53282 + read_unlock_dk(current_tree);
53283 + read_unlock_tree(current_tree);
53284 + return -1;
53285 + }
53286 +
53287 + read_unlock_dk(current_tree);
53288 + read_unlock_tree(current_tree);
53289 + }
53290 +
53291 + return 0;
53292 +}
53293 +
53294 +/* plugin->u.node.parse
53295 + look for description of this method in plugin/node/node.h */
53296 +int parse_node40(znode * node /* node to parse */ )
53297 +{
53298 + node40_header *header;
53299 + int result;
53300 + d8 level;
53301 +
53302 + header = node40_node_header((znode *) node);
53303 + result = -EIO;
53304 + level = nh40_get_level(header);
53305 + if (unlikely(((__u8) znode_get_level(node)) != level))
53306 + warning("nikita-494", "Wrong level found in node: %i != %i",
53307 + znode_get_level(node), level);
53308 + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
53309 + warning("nikita-495",
53310 + "Wrong magic in tree node: want %x, got %x",
53311 + REISER4_NODE_MAGIC, nh40_get_magic(header));
53312 + else {
53313 + node->nr_items = node40_num_of_items_internal(node);
53314 + result = 0;
53315 + }
53316 + return RETERR(result);
53317 +}
53318 +
53319 +/* plugin->u.node.init
53320 + look for description of this method in plugin/node/node.h */
53321 +int init_node40(znode * node /* node to initialise */ )
53322 +{
53323 + node40_header *header;
53324 +
53325 + assert("nikita-570", node != NULL);
53326 + assert("nikita-572", zdata(node) != NULL);
53327 +
53328 + header = node40_node_header(node);
53329 + memset(header, 0, sizeof(node40_header));
53330 + nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
53331 + nh40_set_free_space_start(header, sizeof(node40_header));
53332 + /* sane hypothesis: 0 in CPU format is 0 in disk format */
53333 + /* items: 0 */
53334 + save_plugin_id(node_plugin_to_plugin(node->nplug),
53335 + &header->common_header.plugin_id);
53336 + nh40_set_level(header, znode_get_level(node));
53337 + nh40_set_magic(header, REISER4_NODE_MAGIC);
53338 + node->nr_items = 0;
53339 + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
53340 +
53341 + /* flags: 0 */
53342 + return 0;
53343 +}
53344 +
53345 +#ifdef GUESS_EXISTS
53346 +int guess_node40(const znode * node /* node to guess plugin of */ )
53347 +{
53348 + node40_header *nethack;
53349 +
53350 + assert("nikita-1058", node != NULL);
53351 + nethack = node40_node_header(node);
53352 + return
53353 + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
53354 + (plugin_by_disk_id(znode_get_tree(node),
53355 + REISER4_NODE_PLUGIN_TYPE,
53356 + &nethack->common_header.plugin_id)->h.id ==
53357 + NODE40_ID);
53358 +}
53359 +#endif
53360 +
53361 +/* plugin->u.node.chage_item_size
53362 + look for description of this method in plugin/node/node.h */
53363 +void change_item_size_node40(coord_t * coord, int by)
53364 +{
53365 + node40_header *nh;
53366 + item_header40 *ih;
53367 + char *item_data;
53368 + int item_length;
53369 + unsigned i;
53370 +
53371 + /* make sure that @item is coord of existing item */
53372 + assert("vs-210", coord_is_existing_item(coord));
53373 +
53374 + nh = node40_node_header(coord->node);
53375 +
53376 + item_data = item_by_coord_node40(coord);
53377 + item_length = length_by_coord_node40(coord);
53378 +
53379 + /* move item bodies */
53380 + ih = node40_ih_at_coord(coord);
53381 + memmove(item_data + item_length + by, item_data + item_length,
53382 + nh40_get_free_space_start(node40_node_header(coord->node)) -
53383 + (ih40_get_offset(ih) + item_length));
53384 +
53385 + /* update offsets of moved items */
53386 + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
53387 + ih = node40_ih_at(coord->node, i);
53388 + ih40_set_offset(ih, ih40_get_offset(ih) + by);
53389 + }
53390 +
53391 + /* update node header */
53392 + nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
53393 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
53394 +}
53395 +
53396 +static int should_notify_parent(const znode * node)
53397 +{
53398 + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
53399 + return !disk_addr_eq(znode_get_block(node),
53400 + &znode_get_tree(node)->root_block);
53401 +}
53402 +
53403 +/* plugin->u.node.create_item
53404 + look for description of this method in plugin/node/node.h */
53405 +int
53406 +create_item_node40(coord_t *target, const reiser4_key *key,
53407 + reiser4_item_data *data, carry_plugin_info *info)
53408 +{
53409 + node40_header *nh;
53410 + item_header40 *ih;
53411 + unsigned offset;
53412 + unsigned i;
53413 +
53414 + nh = node40_node_header(target->node);
53415 +
53416 + assert("vs-212", coord_is_between_items(target));
53417 + /* node must have enough free space */
53418 + assert("vs-254",
53419 + free_space_node40(target->node) >=
53420 + data->length + sizeof(item_header40));
53421 + assert("vs-1410", data->length >= 0);
53422 +
53423 + if (coord_set_to_right(target))
53424 + /* there are not items to the right of @target, so, new item
53425 + will be inserted after last one */
53426 + coord_set_item_pos(target, nh40_get_num_items(nh));
53427 +
53428 + if (target->item_pos < nh40_get_num_items(nh)) {
53429 + /* there are items to be moved to prepare space for new
53430 + item */
53431 + ih = node40_ih_at_coord(target);
53432 + /* new item will start at this offset */
53433 + offset = ih40_get_offset(ih);
53434 +
53435 + memmove(zdata(target->node) + offset + data->length,
53436 + zdata(target->node) + offset,
53437 + nh40_get_free_space_start(nh) - offset);
53438 + /* update headers of moved items */
53439 + for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
53440 + ih = node40_ih_at(target->node, i);
53441 + ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
53442 + }
53443 +
53444 + /* @ih is set to item header of the last item, move item headers */
53445 + memmove(ih - 1, ih,
53446 + sizeof(item_header40) * (nh40_get_num_items(nh) -
53447 + target->item_pos));
53448 + } else {
53449 + /* new item will start at this offset */
53450 + offset = nh40_get_free_space_start(nh);
53451 + }
53452 +
53453 + /* make item header for the new item */
53454 + ih = node40_ih_at_coord(target);
53455 + memcpy(&ih->key, key, sizeof(reiser4_key));
53456 + ih40_set_offset(ih, offset);
53457 + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
53458 +
53459 + /* update node header */
53460 + nh40_set_free_space(nh,
53461 + nh40_get_free_space(nh) - data->length -
53462 + sizeof(item_header40));
53463 + nh40_set_free_space_start(nh,
53464 + nh40_get_free_space_start(nh) + data->length);
53465 + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
53466 +
53467 + /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */
53468 + target->unit_pos = 0;
53469 + target->between = AT_UNIT;
53470 + coord_clear_iplug(target);
53471 +
53472 + /* initialize item */
53473 + if (data->iplug->b.init != NULL) {
53474 + data->iplug->b.init(target, NULL, data);
53475 + }
53476 + /* copy item body */
53477 + if (data->iplug->b.paste != NULL) {
53478 + data->iplug->b.paste(target, data, info);
53479 + } else if (data->data != NULL) {
53480 + if (data->user) {
53481 + /* AUDIT: Are we really should not check that pointer
53482 + from userspace was valid and data bytes were
53483 + available? How will we return -EFAULT of some kind
53484 + without this check? */
53485 + assert("nikita-3038", reiser4_schedulable());
53486 + /* copy data from user space */
53487 + __copy_from_user(zdata(target->node) + offset,
53488 + (const char __user *)data->data,
53489 + (unsigned)data->length);
53490 + } else
53491 + /* copy from kernel space */
53492 + memcpy(zdata(target->node) + offset, data->data,
53493 + (unsigned)data->length);
53494 + }
53495 +
53496 + if (target->item_pos == 0) {
53497 + /* left delimiting key has to be updated */
53498 + prepare_for_update(NULL, target->node, info);
53499 + }
53500 +
53501 + if (item_plugin_by_coord(target)->b.create_hook != NULL) {
53502 + item_plugin_by_coord(target)->b.create_hook(target, data->arg);
53503 + }
53504 +
53505 + return 0;
53506 +}
53507 +
53508 +/* plugin->u.node.update_item_key
53509 + look for description of this method in plugin/node/node.h */
53510 +void
53511 +update_item_key_node40(coord_t * target, const reiser4_key * key,
53512 + carry_plugin_info * info)
53513 +{
53514 + item_header40 *ih;
53515 +
53516 + ih = node40_ih_at_coord(target);
53517 + memcpy(&ih->key, key, sizeof(reiser4_key));
53518 +
53519 + if (target->item_pos == 0) {
53520 + prepare_for_update(NULL, target->node, info);
53521 + }
53522 +}
53523 +
53524 +/* this bits encode cut mode */
53525 +#define CMODE_TAIL 1
53526 +#define CMODE_WHOLE 2
53527 +#define CMODE_HEAD 4
53528 +
53529 +struct cut40_info {
53530 + int mode;
53531 + pos_in_node_t tail_removed; /* position of item which gets tail removed */
53532 + pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */
53533 + pos_in_node_t removed_count; /* number of items removed completely */
53534 + pos_in_node_t head_removed; /* position of item which gets head removed */
53535 +
53536 + pos_in_node_t freed_space_start;
53537 + pos_in_node_t freed_space_end;
53538 + pos_in_node_t first_moved;
53539 + pos_in_node_t head_removed_location;
53540 +};
53541 +
53542 +static void init_cinfo(struct cut40_info *cinfo)
53543 +{
53544 + cinfo->mode = 0;
53545 + cinfo->tail_removed = MAX_POS_IN_NODE;
53546 + cinfo->first_removed = MAX_POS_IN_NODE;
53547 + cinfo->removed_count = MAX_POS_IN_NODE;
53548 + cinfo->head_removed = MAX_POS_IN_NODE;
53549 + cinfo->freed_space_start = MAX_POS_IN_NODE;
53550 + cinfo->freed_space_end = MAX_POS_IN_NODE;
53551 + cinfo->first_moved = MAX_POS_IN_NODE;
53552 + cinfo->head_removed_location = MAX_POS_IN_NODE;
53553 +}
53554 +
53555 +/* complete cut_node40/kill_node40 content by removing the gap created by */
53556 +static void compact(znode * node, struct cut40_info *cinfo)
53557 +{
53558 + node40_header *nh;
53559 + item_header40 *ih;
53560 + pos_in_node_t freed;
53561 + pos_in_node_t pos, nr_items;
53562 +
53563 + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
53564 + cinfo->freed_space_end != MAX_POS_IN_NODE &&
53565 + cinfo->first_moved != MAX_POS_IN_NODE));
53566 + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
53567 +
53568 + nh = node40_node_header(node);
53569 + nr_items = nh40_get_num_items(nh);
53570 +
53571 + /* remove gap made up by removal */
53572 + memmove(zdata(node) + cinfo->freed_space_start,
53573 + zdata(node) + cinfo->freed_space_end,
53574 + nh40_get_free_space_start(nh) - cinfo->freed_space_end);
53575 +
53576 + /* update item headers of moved items - change their locations */
53577 + pos = cinfo->first_moved;
53578 + ih = node40_ih_at(node, pos);
53579 + if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
53580 + assert("vs-1580", pos == cinfo->head_removed);
53581 + ih40_set_offset(ih, cinfo->head_removed_location);
53582 + pos++;
53583 + ih--;
53584 + }
53585 +
53586 + freed = cinfo->freed_space_end - cinfo->freed_space_start;
53587 + for (; pos < nr_items; pos++, ih--) {
53588 + assert("vs-1581", ih == node40_ih_at(node, pos));
53589 + ih40_set_offset(ih, ih40_get_offset(ih) - freed);
53590 + }
53591 +
53592 + /* free space start moved to right */
53593 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
53594 +
53595 + if (cinfo->removed_count != MAX_POS_IN_NODE) {
53596 + /* number of items changed. Remove item headers of those items */
53597 + ih = node40_ih_at(node, nr_items - 1);
53598 + memmove(ih + cinfo->removed_count, ih,
53599 + sizeof(item_header40) * (nr_items -
53600 + cinfo->removed_count -
53601 + cinfo->first_removed));
53602 + freed += sizeof(item_header40) * cinfo->removed_count;
53603 + node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
53604 + }
53605 +
53606 + /* total amount of free space increased */
53607 + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
53608 +}
53609 +
53610 +int shrink_item_node40(coord_t * coord, int delta)
53611 +{
53612 + node40_header *nh;
53613 + item_header40 *ih;
53614 + pos_in_node_t pos;
53615 + pos_in_node_t nr_items;
53616 + char *end;
53617 + znode *node;
53618 + int off;
53619 +
53620 + assert("nikita-3487", coord != NULL);
53621 + assert("nikita-3488", delta >= 0);
53622 +
53623 + node = coord->node;
53624 + nh = node40_node_header(node);
53625 + nr_items = nh40_get_num_items(nh);
53626 +
53627 + ih = node40_ih_at_coord(coord);
53628 + assert("nikita-3489", delta <= length_by_coord_node40(coord));
53629 + off = ih40_get_offset(ih) + length_by_coord_node40(coord);
53630 + end = zdata(node) + off;
53631 +
53632 + /* remove gap made up by removal */
53633 + memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
53634 +
53635 + /* update item headers of moved items - change their locations */
53636 + pos = coord->item_pos + 1;
53637 + ih = node40_ih_at(node, pos);
53638 + for (; pos < nr_items; pos++, ih--) {
53639 + assert("nikita-3490", ih == node40_ih_at(node, pos));
53640 + ih40_set_offset(ih, ih40_get_offset(ih) - delta);
53641 + }
53642 +
53643 + /* free space start moved to left */
53644 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
53645 + /* total amount of free space increased */
53646 + nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
53647 + /*
53648 + * This method does _not_ changes number of items. Hence, it cannot
53649 + * make node empty. Also it doesn't remove items at all, which means
53650 + * that no keys have to be updated either.
53651 + */
53652 + return 0;
53653 +}
53654 +
53655 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
53656 + of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the
53657 + rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item
53658 + getting head cut. Function returns 0 in this case */
53659 +static int
53660 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
53661 +{
53662 + reiser4_key left_key, right_key;
53663 + reiser4_key min_from_key, max_to_key;
53664 + const reiser4_key *from_key, *to_key;
53665 +
53666 + init_cinfo(cinfo);
53667 +
53668 + /* calculate minimal key stored in first item of items to be cut (params->from) */
53669 + item_key_by_coord(params->from, &min_from_key);
53670 + /* and max key stored in last item of items to be cut (params->to) */
53671 + max_item_key_by_coord(params->to, &max_to_key);
53672 +
53673 + /* if cut key range is not defined in input parameters - define it using cut coord range */
53674 + if (params->from_key == NULL) {
53675 + assert("vs-1513", params->to_key == NULL);
53676 + unit_key_by_coord(params->from, &left_key);
53677 + from_key = &left_key;
53678 + max_unit_key_by_coord(params->to, &right_key);
53679 + to_key = &right_key;
53680 + } else {
53681 + from_key = params->from_key;
53682 + to_key = params->to_key;
53683 + }
53684 +
53685 + if (params->from->item_pos == params->to->item_pos) {
53686 + if (keylt(&min_from_key, from_key)
53687 + && keylt(to_key, &max_to_key))
53688 + return 1;
53689 +
53690 + if (keygt(from_key, &min_from_key)) {
53691 + /* tail of item is to be cut cut */
53692 + cinfo->tail_removed = params->from->item_pos;
53693 + cinfo->mode |= CMODE_TAIL;
53694 + } else if (keylt(to_key, &max_to_key)) {
53695 + /* head of item is to be cut */
53696 + cinfo->head_removed = params->from->item_pos;
53697 + cinfo->mode |= CMODE_HEAD;
53698 + } else {
53699 + /* item is removed completely */
53700 + cinfo->first_removed = params->from->item_pos;
53701 + cinfo->removed_count = 1;
53702 + cinfo->mode |= CMODE_WHOLE;
53703 + }
53704 + } else {
53705 + cinfo->first_removed = params->from->item_pos + 1;
53706 + cinfo->removed_count =
53707 + params->to->item_pos - params->from->item_pos - 1;
53708 +
53709 + if (keygt(from_key, &min_from_key)) {
53710 + /* first item is not cut completely */
53711 + cinfo->tail_removed = params->from->item_pos;
53712 + cinfo->mode |= CMODE_TAIL;
53713 + } else {
53714 + cinfo->first_removed--;
53715 + cinfo->removed_count++;
53716 + }
53717 + if (keylt(to_key, &max_to_key)) {
53718 + /* last item is not cut completely */
53719 + cinfo->head_removed = params->to->item_pos;
53720 + cinfo->mode |= CMODE_HEAD;
53721 + } else {
53722 + cinfo->removed_count++;
53723 + }
53724 + if (cinfo->removed_count)
53725 + cinfo->mode |= CMODE_WHOLE;
53726 + }
53727 +
53728 + return 0;
53729 +}
53730 +
53731 +static void
53732 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
53733 + carry_kill_data * kdata)
53734 +{
53735 + coord_t coord;
53736 + item_plugin *iplug;
53737 + pos_in_node_t pos;
53738 +
53739 + coord.node = node;
53740 + coord.unit_pos = 0;
53741 + coord.between = AT_UNIT;
53742 + for (pos = 0; pos < count; pos++) {
53743 + coord_set_item_pos(&coord, from + pos);
53744 + coord.unit_pos = 0;
53745 + coord.between = AT_UNIT;
53746 + iplug = item_plugin_by_coord(&coord);
53747 + if (iplug->b.kill_hook) {
53748 + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
53749 + kdata);
53750 + }
53751 + }
53752 +}
53753 +
53754 +/* this is used to kill item partially */
53755 +static pos_in_node_t
53756 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53757 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
53758 +{
53759 + struct carry_kill_data *kdata;
53760 + item_plugin *iplug;
53761 +
53762 + kdata = data;
53763 + iplug = item_plugin_by_coord(coord);
53764 +
53765 + assert("vs-1524", iplug->b.kill_units);
53766 + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
53767 + new_first_key);
53768 +}
53769 +
53770 +/* call item plugin to cut tail of file */
53771 +static pos_in_node_t
53772 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53773 +{
53774 + struct carry_kill_data *kdata;
53775 + pos_in_node_t to;
53776 +
53777 + kdata = data;
53778 + to = coord_last_unit_pos(coord);
53779 + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
53780 + NULL);
53781 +}
53782 +
53783 +/* call item plugin to cut head of item */
53784 +static pos_in_node_t
53785 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53786 + reiser4_key * new_first_key)
53787 +{
53788 + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
53789 + new_first_key);
53790 +}
53791 +
53792 +/* this is used to cut item partially */
53793 +static pos_in_node_t
53794 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53795 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
53796 +{
53797 + carry_cut_data *cdata;
53798 + item_plugin *iplug;
53799 +
53800 + cdata = data;
53801 + iplug = item_plugin_by_coord(coord);
53802 + assert("vs-302", iplug->b.cut_units);
53803 + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
53804 + new_first_key);
53805 +}
53806 +
53807 +/* call item plugin to cut tail of file */
53808 +static pos_in_node_t
53809 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53810 +{
53811 + carry_cut_data *cdata;
53812 + pos_in_node_t to;
53813 +
53814 + cdata = data;
53815 + to = coord_last_unit_pos(cdata->params.from);
53816 + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
53817 +}
53818 +
53819 +/* call item plugin to cut head of item */
53820 +static pos_in_node_t
53821 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53822 + reiser4_key * new_first_key)
53823 +{
53824 + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
53825 + new_first_key);
53826 +}
53827 +
53828 +/* this returns 1 of key of first item changed, 0 - if it did not */
53829 +static int
53830 +prepare_for_compact(struct cut40_info *cinfo,
53831 + const struct cut_kill_params *params, int is_cut,
53832 + void *data, carry_plugin_info * info)
53833 +{
53834 + znode *node;
53835 + item_header40 *ih;
53836 + pos_in_node_t freed;
53837 + pos_in_node_t item_pos;
53838 + coord_t coord;
53839 + reiser4_key new_first_key;
53840 + pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
53841 + void *, reiser4_key *, reiser4_key *);
53842 + pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
53843 + pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
53844 + reiser4_key *);
53845 + int retval;
53846 +
53847 + retval = 0;
53848 +
53849 + node = params->from->node;
53850 +
53851 + assert("vs-184", node == params->to->node);
53852 + assert("vs-312", !node_is_empty(node));
53853 + assert("vs-297",
53854 + coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
53855 +
53856 + if (is_cut) {
53857 + kill_units_f = cut_units;
53858 + kill_tail_f = cut_tail;
53859 + kill_head_f = cut_head;
53860 + } else {
53861 + kill_units_f = kill_units;
53862 + kill_tail_f = kill_tail;
53863 + kill_head_f = kill_head;
53864 + }
53865 +
53866 + if (parse_cut(cinfo, params) == 1) {
53867 + /* cut from the middle of item */
53868 + freed =
53869 + kill_units_f(params->from, params->from->unit_pos,
53870 + params->to->unit_pos, data,
53871 + params->smallest_removed, NULL);
53872 +
53873 + item_pos = params->from->item_pos;
53874 + ih = node40_ih_at(node, item_pos);
53875 + cinfo->freed_space_start =
53876 + ih40_get_offset(ih) + node40_item_length(node,
53877 + item_pos) - freed;
53878 + cinfo->freed_space_end = cinfo->freed_space_start + freed;
53879 + cinfo->first_moved = item_pos + 1;
53880 + } else {
53881 + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
53882 + cinfo->first_removed != MAX_POS_IN_NODE ||
53883 + cinfo->head_removed != MAX_POS_IN_NODE));
53884 +
53885 + switch (cinfo->mode) {
53886 + case CMODE_TAIL:
53887 + /* one item gets cut partially from its end */
53888 + assert("vs-1562",
53889 + cinfo->tail_removed == params->from->item_pos);
53890 +
53891 + freed =
53892 + kill_tail_f(params->from, data,
53893 + params->smallest_removed);
53894 +
53895 + item_pos = cinfo->tail_removed;
53896 + ih = node40_ih_at(node, item_pos);
53897 + cinfo->freed_space_start =
53898 + ih40_get_offset(ih) + node40_item_length(node,
53899 + item_pos) -
53900 + freed;
53901 + cinfo->freed_space_end =
53902 + cinfo->freed_space_start + freed;
53903 + cinfo->first_moved = cinfo->tail_removed + 1;
53904 + break;
53905 +
53906 + case CMODE_WHOLE:
53907 + /* one or more items get removed completely */
53908 + assert("vs-1563",
53909 + cinfo->first_removed == params->from->item_pos);
53910 + assert("vs-1564", cinfo->removed_count > 0
53911 + && cinfo->removed_count != MAX_POS_IN_NODE);
53912 +
53913 + /* call kill hook for all items removed completely */
53914 + if (is_cut == 0)
53915 + call_kill_hooks(node, cinfo->first_removed,
53916 + cinfo->removed_count, data);
53917 +
53918 + item_pos = cinfo->first_removed;
53919 + ih = node40_ih_at(node, item_pos);
53920 +
53921 + if (params->smallest_removed)
53922 + memcpy(params->smallest_removed, &ih->key,
53923 + sizeof(reiser4_key));
53924 +
53925 + cinfo->freed_space_start = ih40_get_offset(ih);
53926 +
53927 + item_pos += (cinfo->removed_count - 1);
53928 + ih -= (cinfo->removed_count - 1);
53929 + cinfo->freed_space_end =
53930 + ih40_get_offset(ih) + node40_item_length(node,
53931 + item_pos);
53932 + cinfo->first_moved = item_pos + 1;
53933 + if (cinfo->first_removed == 0)
53934 + /* key of first item of the node changes */
53935 + retval = 1;
53936 + break;
53937 +
53938 + case CMODE_HEAD:
53939 + /* one item gets cut partially from its head */
53940 + assert("vs-1565",
53941 + cinfo->head_removed == params->from->item_pos);
53942 +
53943 + freed =
53944 + kill_head_f(params->to, data,
53945 + params->smallest_removed,
53946 + &new_first_key);
53947 +
53948 + item_pos = cinfo->head_removed;
53949 + ih = node40_ih_at(node, item_pos);
53950 + cinfo->freed_space_start = ih40_get_offset(ih);
53951 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
53952 + cinfo->first_moved = cinfo->head_removed + 1;
53953 +
53954 + /* item head is removed, therefore, item key changed */
53955 + coord.node = node;
53956 + coord_set_item_pos(&coord, item_pos);
53957 + coord.unit_pos = 0;
53958 + coord.between = AT_UNIT;
53959 + update_item_key_node40(&coord, &new_first_key, NULL);
53960 + if (item_pos == 0)
53961 + /* key of first item of the node changes */
53962 + retval = 1;
53963 + break;
53964 +
53965 + case CMODE_TAIL | CMODE_WHOLE:
53966 + /* one item gets cut from its end and one or more items get removed completely */
53967 + assert("vs-1566",
53968 + cinfo->tail_removed == params->from->item_pos);
53969 + assert("vs-1567",
53970 + cinfo->first_removed == cinfo->tail_removed + 1);
53971 + assert("vs-1564", cinfo->removed_count > 0
53972 + && cinfo->removed_count != MAX_POS_IN_NODE);
53973 +
53974 + freed =
53975 + kill_tail_f(params->from, data,
53976 + params->smallest_removed);
53977 +
53978 + item_pos = cinfo->tail_removed;
53979 + ih = node40_ih_at(node, item_pos);
53980 + cinfo->freed_space_start =
53981 + ih40_get_offset(ih) + node40_item_length(node,
53982 + item_pos) -
53983 + freed;
53984 +
53985 + /* call kill hook for all items removed completely */
53986 + if (is_cut == 0)
53987 + call_kill_hooks(node, cinfo->first_removed,
53988 + cinfo->removed_count, data);
53989 +
53990 + item_pos += cinfo->removed_count;
53991 + ih -= cinfo->removed_count;
53992 + cinfo->freed_space_end =
53993 + ih40_get_offset(ih) + node40_item_length(node,
53994 + item_pos);
53995 + cinfo->first_moved = item_pos + 1;
53996 + break;
53997 +
53998 + case CMODE_WHOLE | CMODE_HEAD:
53999 + /* one or more items get removed completely and one item gets cut partially from its head */
54000 + assert("vs-1568",
54001 + cinfo->first_removed == params->from->item_pos);
54002 + assert("vs-1564", cinfo->removed_count > 0
54003 + && cinfo->removed_count != MAX_POS_IN_NODE);
54004 + assert("vs-1569",
54005 + cinfo->head_removed ==
54006 + cinfo->first_removed + cinfo->removed_count);
54007 +
54008 + /* call kill hook for all items removed completely */
54009 + if (is_cut == 0)
54010 + call_kill_hooks(node, cinfo->first_removed,
54011 + cinfo->removed_count, data);
54012 +
54013 + item_pos = cinfo->first_removed;
54014 + ih = node40_ih_at(node, item_pos);
54015 +
54016 + if (params->smallest_removed)
54017 + memcpy(params->smallest_removed, &ih->key,
54018 + sizeof(reiser4_key));
54019 +
54020 + freed =
54021 + kill_head_f(params->to, data, NULL, &new_first_key);
54022 +
54023 + cinfo->freed_space_start = ih40_get_offset(ih);
54024 +
54025 + ih = node40_ih_at(node, cinfo->head_removed);
54026 + /* this is the most complex case. Item which got head removed and items which are to be moved
54027 + intact change their location differently. */
54028 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54029 + cinfo->first_moved = cinfo->head_removed;
54030 + cinfo->head_removed_location = cinfo->freed_space_start;
54031 +
54032 + /* item head is removed, therefore, item key changed */
54033 + coord.node = node;
54034 + coord_set_item_pos(&coord, cinfo->head_removed);
54035 + coord.unit_pos = 0;
54036 + coord.between = AT_UNIT;
54037 + update_item_key_node40(&coord, &new_first_key, NULL);
54038 +
54039 + assert("vs-1579", cinfo->first_removed == 0);
54040 + /* key of first item of the node changes */
54041 + retval = 1;
54042 + break;
54043 +
54044 + case CMODE_TAIL | CMODE_HEAD:
54045 + /* one item get cut from its end and its neighbor gets cut from its tail */
54046 + impossible("vs-1576", "this can not happen currently");
54047 + break;
54048 +
54049 + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
54050 + impossible("vs-1577", "this can not happen currently");
54051 + break;
54052 + default:
54053 + impossible("vs-1578", "unexpected cut mode");
54054 + break;
54055 + }
54056 + }
54057 + return retval;
54058 +}
54059 +
54060 +/* plugin->u.node.kill
54061 + return value is number of items removed completely */
54062 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
54063 +{
54064 + znode *node;
54065 + struct cut40_info cinfo;
54066 + int first_key_changed;
54067 +
54068 + node = kdata->params.from->node;
54069 +
54070 + first_key_changed =
54071 + prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
54072 + info);
54073 + compact(node, &cinfo);
54074 +
54075 + if (info) {
54076 + /* it is not called by node40_shift, so we have to take care
54077 + of changes on upper levels */
54078 + if (node_is_empty(node)
54079 + && !(kdata->flags & DELETE_RETAIN_EMPTY))
54080 + /* all contents of node is deleted */
54081 + prepare_removal_node40(node, info);
54082 + else if (first_key_changed) {
54083 + prepare_for_update(NULL, node, info);
54084 + }
54085 + }
54086 +
54087 + coord_clear_iplug(kdata->params.from);
54088 + coord_clear_iplug(kdata->params.to);
54089 +
54090 + znode_make_dirty(node);
54091 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54092 +}
54093 +
54094 +/* plugin->u.node.cut
54095 + return value is number of items removed completely */
54096 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
54097 +{
54098 + znode *node;
54099 + struct cut40_info cinfo;
54100 + int first_key_changed;
54101 +
54102 + node = cdata->params.from->node;
54103 +
54104 + first_key_changed =
54105 + prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
54106 + info);
54107 + compact(node, &cinfo);
54108 +
54109 + if (info) {
54110 + /* it is not called by node40_shift, so we have to take care
54111 + of changes on upper levels */
54112 + if (node_is_empty(node))
54113 + /* all contents of node is deleted */
54114 + prepare_removal_node40(node, info);
54115 + else if (first_key_changed) {
54116 + prepare_for_update(NULL, node, info);
54117 + }
54118 + }
54119 +
54120 + coord_clear_iplug(cdata->params.from);
54121 + coord_clear_iplug(cdata->params.to);
54122 +
54123 + znode_make_dirty(node);
54124 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54125 +}
54126 +
54127 +/* this structure is used by shift method of node40 plugin */
54128 +struct shift_params {
54129 + shift_direction pend; /* when @pend == append - we are shifting to
54130 + left, when @pend == prepend - to right */
54131 + coord_t wish_stop; /* when shifting to left this is last unit we
54132 + want shifted, when shifting to right - this
54133 + is set to unit we want to start shifting
54134 + from */
54135 + znode *target;
54136 + int everything; /* it is set to 1 if everything we have to shift is
54137 + shifted, 0 - otherwise */
54138 +
54139 + /* FIXME-VS: get rid of read_stop */
54140 +
54141 + /* these are set by estimate_shift */
54142 + coord_t real_stop; /* this will be set to last unit which will be
54143 + really shifted */
54144 +
54145 + /* coordinate in source node before operation of unit which becomes
54146 + first after shift to left of last after shift to right */
54147 + union {
54148 + coord_t future_first;
54149 + coord_t future_last;
54150 + } u;
54151 +
54152 + unsigned merging_units; /* number of units of first item which have to
54153 + be merged with last item of target node */
54154 + unsigned merging_bytes; /* number of bytes in those units */
54155 +
54156 + unsigned entire; /* items shifted in their entirety */
54157 + unsigned entire_bytes; /* number of bytes in those items */
54158 +
54159 + unsigned part_units; /* number of units of partially copied item */
54160 + unsigned part_bytes; /* number of bytes in those units */
54161 +
54162 + unsigned shift_bytes; /* total number of bytes in items shifted (item
54163 + headers not included) */
54164 +
54165 +};
54166 +
54167 +static int item_creation_overhead(coord_t *item)
54168 +{
54169 + return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
54170 +}
54171 +
54172 +/* how many units are there in @source starting from source->unit_pos
54173 + but not further than @stop_coord */
54174 +static int
54175 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
54176 +{
54177 + if (pend == SHIFT_LEFT) {
54178 + assert("vs-181", source->unit_pos == 0);
54179 + } else {
54180 + assert("vs-182",
54181 + source->unit_pos == coord_last_unit_pos(source));
54182 + }
54183 +
54184 + if (source->item_pos != stop_coord->item_pos) {
54185 + /* @source and @stop_coord are different items */
54186 + return coord_last_unit_pos(source) + 1;
54187 + }
54188 +
54189 + if (pend == SHIFT_LEFT) {
54190 + return stop_coord->unit_pos + 1;
54191 + } else {
54192 + return source->unit_pos - stop_coord->unit_pos + 1;
54193 + }
54194 +}
54195 +
54196 +/* this calculates what can be copied from @shift->wish_stop.node to
54197 + @shift->target */
54198 +static void
54199 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
54200 +{
54201 + unsigned target_free_space, size;
54202 + pos_in_node_t stop_item; /* item which estimating should not consider */
54203 + unsigned want; /* number of units of item we want shifted */
54204 + coord_t source; /* item being estimated */
54205 + item_plugin *iplug;
54206 +
54207 + /* shifting to left/right starts from first/last units of
54208 + @shift->wish_stop.node */
54209 + if (shift->pend == SHIFT_LEFT) {
54210 + coord_init_first_unit(&source, shift->wish_stop.node);
54211 + } else {
54212 + coord_init_last_unit(&source, shift->wish_stop.node);
54213 + }
54214 + shift->real_stop = source;
54215 +
54216 + /* free space in target node and number of items in source */
54217 + target_free_space = znode_free_space(shift->target);
54218 +
54219 + shift->everything = 0;
54220 + if (!node_is_empty(shift->target)) {
54221 + /* target node is not empty, check for boundary items
54222 + mergeability */
54223 + coord_t to;
54224 +
54225 + /* item we try to merge @source with */
54226 + if (shift->pend == SHIFT_LEFT) {
54227 + coord_init_last_unit(&to, shift->target);
54228 + } else {
54229 + coord_init_first_unit(&to, shift->target);
54230 + }
54231 +
54232 + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
54233 + &source) :
54234 + are_items_mergeable(&source, &to)) {
54235 + /* how many units of @source do we want to merge to
54236 + item @to */
54237 + want =
54238 + wanted_units(&source, &shift->wish_stop,
54239 + shift->pend);
54240 +
54241 + /* how many units of @source we can merge to item
54242 + @to */
54243 + iplug = item_plugin_by_coord(&source);
54244 + if (iplug->b.can_shift != NULL)
54245 + shift->merging_units =
54246 + iplug->b.can_shift(target_free_space,
54247 + &source, shift->target,
54248 + shift->pend, &size,
54249 + want);
54250 + else {
54251 + shift->merging_units = 0;
54252 + size = 0;
54253 + }
54254 + shift->merging_bytes = size;
54255 + shift->shift_bytes += size;
54256 + /* update stop coord to be set to last unit of @source
54257 + we can merge to @target */
54258 + if (shift->merging_units)
54259 + /* at least one unit can be shifted */
54260 + shift->real_stop.unit_pos =
54261 + (shift->merging_units - source.unit_pos -
54262 + 1) * shift->pend;
54263 + else {
54264 + /* nothing can be shifted */
54265 + if (shift->pend == SHIFT_LEFT)
54266 + coord_init_before_first_item(&shift->
54267 + real_stop,
54268 + source.
54269 + node);
54270 + else
54271 + coord_init_after_last_item(&shift->
54272 + real_stop,
54273 + source.node);
54274 + }
54275 + assert("nikita-2081", shift->real_stop.unit_pos + 1);
54276 +
54277 + if (shift->merging_units != want) {
54278 + /* we could not copy as many as we want, so,
54279 + there is no reason for estimating any
54280 + longer */
54281 + return;
54282 + }
54283 +
54284 + target_free_space -= size;
54285 + coord_add_item_pos(&source, shift->pend);
54286 + }
54287 + }
54288 +
54289 + /* position of the item, no part of which we want to shift */
54290 + stop_item = shift->wish_stop.item_pos + shift->pend;
54291 +
54292 + /* calculate how many items can be copied into given free
54293 + space as whole */
54294 + for (; source.item_pos != stop_item;
54295 + coord_add_item_pos(&source, shift->pend)) {
54296 + if (shift->pend == SHIFT_RIGHT)
54297 + source.unit_pos = coord_last_unit_pos(&source);
54298 +
54299 + /* how many units of @source do we want to copy */
54300 + want = wanted_units(&source, &shift->wish_stop, shift->pend);
54301 +
54302 + if (want == coord_last_unit_pos(&source) + 1) {
54303 + /* we want this item to be copied entirely */
54304 + size =
54305 + item_length_by_coord(&source) +
54306 + item_creation_overhead(&source);
54307 + if (size <= target_free_space) {
54308 + /* item fits into target node as whole */
54309 + target_free_space -= size;
54310 + shift->shift_bytes +=
54311 + size - item_creation_overhead(&source);
54312 + shift->entire_bytes +=
54313 + size - item_creation_overhead(&source);
54314 + shift->entire++;
54315 +
54316 + /* update shift->real_stop coord to be set to
54317 + last unit of @source we can merge to
54318 + @target */
54319 + shift->real_stop = source;
54320 + if (shift->pend == SHIFT_LEFT)
54321 + shift->real_stop.unit_pos =
54322 + coord_last_unit_pos(&shift->
54323 + real_stop);
54324 + else
54325 + shift->real_stop.unit_pos = 0;
54326 + continue;
54327 + }
54328 + }
54329 +
54330 + /* we reach here only for an item which does not fit into
54331 + target node in its entirety. This item may be either
54332 + partially shifted, or not shifted at all. We will have to
54333 + create new item in target node, so decrease amount of free
54334 + space by an item creation overhead. We can reach here also
54335 + if stop coord is in this item */
54336 + if (target_free_space >=
54337 + (unsigned)item_creation_overhead(&source)) {
54338 + target_free_space -= item_creation_overhead(&source);
54339 + iplug = item_plugin_by_coord(&source);
54340 + if (iplug->b.can_shift) {
54341 + shift->part_units = iplug->b.can_shift(target_free_space,
54342 + &source,
54343 + NULL, /* target */
54344 + shift->pend,
54345 + &size,
54346 + want);
54347 + } else {
54348 + target_free_space = 0;
54349 + shift->part_units = 0;
54350 + size = 0;
54351 + }
54352 + } else {
54353 + target_free_space = 0;
54354 + shift->part_units = 0;
54355 + size = 0;
54356 + }
54357 + shift->part_bytes = size;
54358 + shift->shift_bytes += size;
54359 +
54360 + /* set @shift->real_stop to last unit of @source we can merge
54361 + to @shift->target */
54362 + if (shift->part_units) {
54363 + shift->real_stop = source;
54364 + shift->real_stop.unit_pos =
54365 + (shift->part_units - source.unit_pos -
54366 + 1) * shift->pend;
54367 + assert("nikita-2082", shift->real_stop.unit_pos + 1);
54368 + }
54369 +
54370 + if (want != shift->part_units)
54371 + /* not everything wanted were shifted */
54372 + return;
54373 + break;
54374 + }
54375 +
54376 + shift->everything = 1;
54377 +}
54378 +
54379 +static void
54380 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
54381 + shift_direction dir, unsigned free_space)
54382 +{
54383 + item_plugin *iplug;
54384 +
54385 + assert("nikita-1463", target != NULL);
54386 + assert("nikita-1464", source != NULL);
54387 + assert("nikita-1465", from + count <= coord_num_units(source));
54388 +
54389 + iplug = item_plugin_by_coord(source);
54390 + assert("nikita-1468", iplug == item_plugin_by_coord(target));
54391 + iplug->b.copy_units(target, source, from, count, dir, free_space);
54392 +
54393 + if (dir == SHIFT_RIGHT) {
54394 + /* FIXME-VS: this looks not necessary. update_item_key was
54395 + called already by copy_units method */
54396 + reiser4_key split_key;
54397 +
54398 + assert("nikita-1469", target->unit_pos == 0);
54399 +
54400 + unit_key_by_coord(target, &split_key);
54401 + node_plugin_by_coord(target)->update_item_key(target,
54402 + &split_key, NULL);
54403 + }
54404 +}
54405 +
54406 +/* copy part of @shift->real_stop.node starting either from its beginning or
54407 + from its end and ending at @shift->real_stop to either the end or the
54408 + beginning of @shift->target */
54409 +static void copy(struct shift_params *shift)
54410 +{
54411 + node40_header *nh;
54412 + coord_t from;
54413 + coord_t to;
54414 + item_header40 *from_ih, *to_ih;
54415 + int free_space_start;
54416 + int new_items;
54417 + unsigned old_items;
54418 + int old_offset;
54419 + unsigned i;
54420 +
54421 + nh = node40_node_header(shift->target);
54422 + free_space_start = nh40_get_free_space_start(nh);
54423 + old_items = nh40_get_num_items(nh);
54424 + new_items = shift->entire + (shift->part_units ? 1 : 0);
54425 + assert("vs-185",
54426 + shift->shift_bytes ==
54427 + shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
54428 +
54429 + from = shift->wish_stop;
54430 +
54431 + coord_init_first_unit(&to, shift->target);
54432 +
54433 + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
54434 + hence to.between is set to EMPTY_NODE above. Looks like we want it
54435 + to be AT_UNIT.
54436 +
54437 + Oh, wonders of ->betweenness...
54438 +
54439 + */
54440 + to.between = AT_UNIT;
54441 +
54442 + if (shift->pend == SHIFT_LEFT) {
54443 + /* copying to left */
54444 +
54445 + coord_set_item_pos(&from, 0);
54446 + from_ih = node40_ih_at(from.node, 0);
54447 +
54448 + coord_set_item_pos(&to,
54449 + node40_num_of_items_internal(to.node) - 1);
54450 + if (shift->merging_units) {
54451 + /* expand last item, so that plugin methods will see
54452 + correct data */
54453 + free_space_start += shift->merging_bytes;
54454 + nh40_set_free_space_start(nh,
54455 + (unsigned)free_space_start);
54456 + nh40_set_free_space(nh,
54457 + nh40_get_free_space(nh) -
54458 + shift->merging_bytes);
54459 +
54460 + /* appending last item of @target */
54461 + copy_units(&to, &from, 0, /* starting from 0-th unit */
54462 + shift->merging_units, SHIFT_LEFT,
54463 + shift->merging_bytes);
54464 + coord_inc_item_pos(&from);
54465 + from_ih--;
54466 + coord_inc_item_pos(&to);
54467 + }
54468 +
54469 + to_ih = node40_ih_at(shift->target, old_items);
54470 + if (shift->entire) {
54471 + /* copy @entire items entirely */
54472 +
54473 + /* copy item headers */
54474 + memcpy(to_ih - shift->entire + 1,
54475 + from_ih - shift->entire + 1,
54476 + shift->entire * sizeof(item_header40));
54477 + /* update item header offset */
54478 + old_offset = ih40_get_offset(from_ih);
54479 + /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
54480 + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
54481 + ih40_set_offset(to_ih,
54482 + ih40_get_offset(from_ih) -
54483 + old_offset + free_space_start);
54484 +
54485 + /* copy item bodies */
54486 + memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
54487 + shift->entire_bytes);
54488 +
54489 + coord_add_item_pos(&from, (int)shift->entire);
54490 + coord_add_item_pos(&to, (int)shift->entire);
54491 + }
54492 +
54493 + nh40_set_free_space_start(nh,
54494 + free_space_start +
54495 + shift->shift_bytes -
54496 + shift->merging_bytes);
54497 + nh40_set_free_space(nh,
54498 + nh40_get_free_space(nh) -
54499 + (shift->shift_bytes - shift->merging_bytes +
54500 + sizeof(item_header40) * new_items));
54501 +
54502 + /* update node header */
54503 + node40_set_num_items(shift->target, nh, old_items + new_items);
54504 + assert("vs-170",
54505 + nh40_get_free_space(nh) < znode_size(shift->target));
54506 +
54507 + if (shift->part_units) {
54508 + /* copy heading part (@part units) of @source item as
54509 + a new item into @target->node */
54510 +
54511 + /* copy item header of partially copied item */
54512 + coord_set_item_pos(&to,
54513 + node40_num_of_items_internal(to.node)
54514 + - 1);
54515 + memcpy(to_ih, from_ih, sizeof(item_header40));
54516 + ih40_set_offset(to_ih,
54517 + nh40_get_free_space_start(nh) -
54518 + shift->part_bytes);
54519 + if (item_plugin_by_coord(&to)->b.init)
54520 + item_plugin_by_coord(&to)->b.init(&to, &from,
54521 + NULL);
54522 + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
54523 + shift->part_bytes);
54524 + }
54525 +
54526 + } else {
54527 + /* copying to right */
54528 +
54529 + coord_set_item_pos(&from,
54530 + node40_num_of_items_internal(from.node) - 1);
54531 + from_ih = node40_ih_at_coord(&from);
54532 +
54533 + coord_set_item_pos(&to, 0);
54534 +
54535 + /* prepare space for new items */
54536 + memmove(zdata(to.node) + sizeof(node40_header) +
54537 + shift->shift_bytes,
54538 + zdata(to.node) + sizeof(node40_header),
54539 + free_space_start - sizeof(node40_header));
54540 + /* update item headers of moved items */
54541 + to_ih = node40_ih_at(to.node, 0);
54542 + /* first item gets @merging_bytes longer. free space appears
54543 + at its beginning */
54544 + if (!node_is_empty(to.node))
54545 + ih40_set_offset(to_ih,
54546 + ih40_get_offset(to_ih) +
54547 + shift->shift_bytes -
54548 + shift->merging_bytes);
54549 +
54550 + for (i = 1; i < old_items; i++)
54551 + ih40_set_offset(to_ih - i,
54552 + ih40_get_offset(to_ih - i) +
54553 + shift->shift_bytes);
54554 +
54555 + /* move item headers to make space for new items */
54556 + memmove(to_ih - old_items + 1 - new_items,
54557 + to_ih - old_items + 1,
54558 + sizeof(item_header40) * old_items);
54559 + to_ih -= (new_items - 1);
54560 +
54561 + nh40_set_free_space_start(nh,
54562 + free_space_start +
54563 + shift->shift_bytes);
54564 + nh40_set_free_space(nh,
54565 + nh40_get_free_space(nh) -
54566 + (shift->shift_bytes +
54567 + sizeof(item_header40) * new_items));
54568 +
54569 + /* update node header */
54570 + node40_set_num_items(shift->target, nh, old_items + new_items);
54571 + assert("vs-170",
54572 + nh40_get_free_space(nh) < znode_size(shift->target));
54573 +
54574 + if (shift->merging_units) {
54575 + coord_add_item_pos(&to, new_items);
54576 + to.unit_pos = 0;
54577 + to.between = AT_UNIT;
54578 + /* prepend first item of @to */
54579 + copy_units(&to, &from,
54580 + coord_last_unit_pos(&from) -
54581 + shift->merging_units + 1,
54582 + shift->merging_units, SHIFT_RIGHT,
54583 + shift->merging_bytes);
54584 + coord_dec_item_pos(&from);
54585 + from_ih++;
54586 + }
54587 +
54588 + if (shift->entire) {
54589 + /* copy @entire items entirely */
54590 +
54591 + /* copy item headers */
54592 + memcpy(to_ih, from_ih,
54593 + shift->entire * sizeof(item_header40));
54594 +
54595 + /* update item header offset */
54596 + old_offset =
54597 + ih40_get_offset(from_ih + shift->entire - 1);
54598 + /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
54599 + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
54600 + ih40_set_offset(to_ih,
54601 + ih40_get_offset(from_ih) -
54602 + old_offset +
54603 + sizeof(node40_header) +
54604 + shift->part_bytes);
54605 + /* copy item bodies */
54606 + coord_add_item_pos(&from, -(int)(shift->entire - 1));
54607 + memcpy(zdata(to.node) + sizeof(node40_header) +
54608 + shift->part_bytes, item_by_coord_node40(&from),
54609 + shift->entire_bytes);
54610 + coord_dec_item_pos(&from);
54611 + }
54612 +
54613 + if (shift->part_units) {
54614 + coord_set_item_pos(&to, 0);
54615 + to.unit_pos = 0;
54616 + to.between = AT_UNIT;
54617 + /* copy heading part (@part units) of @source item as
54618 + a new item into @target->node */
54619 +
54620 + /* copy item header of partially copied item */
54621 + memcpy(to_ih, from_ih, sizeof(item_header40));
54622 + ih40_set_offset(to_ih, sizeof(node40_header));
54623 + if (item_plugin_by_coord(&to)->b.init)
54624 + item_plugin_by_coord(&to)->b.init(&to, &from,
54625 + NULL);
54626 + copy_units(&to, &from,
54627 + coord_last_unit_pos(&from) -
54628 + shift->part_units + 1, shift->part_units,
54629 + SHIFT_RIGHT, shift->part_bytes);
54630 + }
54631 + }
54632 +}
54633 +
54634 +/* remove everything either before or after @fact_stop. Number of items
54635 + removed completely is returned */
54636 +static int delete_copied(struct shift_params *shift)
54637 +{
54638 + coord_t from;
54639 + coord_t to;
54640 + struct carry_cut_data cdata;
54641 +
54642 + if (shift->pend == SHIFT_LEFT) {
54643 + /* we were shifting to left, remove everything from the
54644 + beginning of @shift->wish_stop.node up to
54645 + @shift->wish_stop */
54646 + coord_init_first_unit(&from, shift->real_stop.node);
54647 + to = shift->real_stop;
54648 +
54649 + /* store old coordinate of unit which will be first after
54650 + shift to left */
54651 + shift->u.future_first = to;
54652 + coord_next_unit(&shift->u.future_first);
54653 + } else {
54654 + /* we were shifting to right, remove everything from
54655 + @shift->stop_coord up to the end of
54656 + @shift->stop_coord->node */
54657 + from = shift->real_stop;
54658 + coord_init_last_unit(&to, from.node);
54659 +
54660 + /* store old coordinate of unit which will be last after
54661 + shift to right */
54662 + shift->u.future_last = from;
54663 + coord_prev_unit(&shift->u.future_last);
54664 + }
54665 +
54666 + cdata.params.from = &from;
54667 + cdata.params.to = &to;
54668 + cdata.params.from_key = NULL;
54669 + cdata.params.to_key = NULL;
54670 + cdata.params.smallest_removed = NULL;
54671 + return cut_node40(&cdata, NULL);
54672 +}
54673 +
54674 +/* something was moved between @left and @right. Add carry operation to @info
54675 + list to have carry to update delimiting key between them */
54676 +static int
54677 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
54678 +{
54679 + carry_op *op;
54680 + carry_node *cn;
54681 +
54682 + if (info == NULL)
54683 + /* nowhere to send operation to. */
54684 + return 0;
54685 +
54686 + if (!should_notify_parent(right))
54687 + return 0;
54688 +
54689 + op = node_post_carry(info, COP_UPDATE, right, 1);
54690 + if (IS_ERR(op) || op == NULL)
54691 + return op ? PTR_ERR(op) : -EIO;
54692 +
54693 + if (left != NULL) {
54694 + carry_node *reference;
54695 +
54696 + if (info->doing)
54697 + reference = insert_carry_node(info->doing,
54698 + info->todo, left);
54699 + else
54700 + reference = op->node;
54701 + assert("nikita-2992", reference != NULL);
54702 + cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
54703 + if (IS_ERR(cn))
54704 + return PTR_ERR(cn);
54705 + cn->parent = 1;
54706 + cn->node = left;
54707 + if (ZF_ISSET(left, JNODE_ORPHAN))
54708 + cn->left_before = 1;
54709 + op->u.update.left = cn;
54710 + } else
54711 + op->u.update.left = NULL;
54712 + return 0;
54713 +}
54714 +
54715 +/* plugin->u.node.prepare_removal
54716 + to delete a pointer to @empty from the tree add corresponding carry
54717 + operation (delete) to @info list */
54718 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
54719 +{
54720 + carry_op *op;
54721 + reiser4_tree *tree;
54722 +
54723 + if (!should_notify_parent(empty))
54724 + return 0;
54725 + /* already on a road to Styx */
54726 + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
54727 + return 0;
54728 + op = node_post_carry(info, COP_DELETE, empty, 1);
54729 + if (IS_ERR(op) || op == NULL)
54730 + return RETERR(op ? PTR_ERR(op) : -EIO);
54731 +
54732 + op->u.delete.child = NULL;
54733 + op->u.delete.flags = 0;
54734 +
54735 + /* fare thee well */
54736 + tree = znode_get_tree(empty);
54737 + read_lock_tree(tree);
54738 + write_lock_dk(tree);
54739 + znode_set_ld_key(empty, znode_get_rd_key(empty));
54740 + if (znode_is_left_connected(empty) && empty->left)
54741 + znode_set_rd_key(empty->left, znode_get_rd_key(empty));
54742 + write_unlock_dk(tree);
54743 + read_unlock_tree(tree);
54744 +
54745 + ZF_SET(empty, JNODE_HEARD_BANSHEE);
54746 + return 0;
54747 +}
54748 +
54749 +/* something was shifted from @insert_coord->node to @shift->target, update
54750 + @insert_coord correspondingly */
54751 +static void
54752 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
54753 + int including_insert_coord)
54754 +{
54755 + /* item plugin was invalidated by shifting */
54756 + coord_clear_iplug(insert_coord);
54757 +
54758 + if (node_is_empty(shift->wish_stop.node)) {
54759 + assert("vs-242", shift->everything);
54760 + if (including_insert_coord) {
54761 + if (shift->pend == SHIFT_RIGHT) {
54762 + /* set @insert_coord before first unit of
54763 + @shift->target node */
54764 + coord_init_before_first_item(insert_coord,
54765 + shift->target);
54766 + } else {
54767 + /* set @insert_coord after last in target node */
54768 + coord_init_after_last_item(insert_coord,
54769 + shift->target);
54770 + }
54771 + } else {
54772 + /* set @insert_coord inside of empty node. There is
54773 + only one possible coord within an empty
54774 + node. init_first_unit will set that coord */
54775 + coord_init_first_unit(insert_coord,
54776 + shift->wish_stop.node);
54777 + }
54778 + return;
54779 + }
54780 +
54781 + if (shift->pend == SHIFT_RIGHT) {
54782 + /* there was shifting to right */
54783 + if (shift->everything) {
54784 + /* everything wanted was shifted */
54785 + if (including_insert_coord) {
54786 + /* @insert_coord is set before first unit of
54787 + @to node */
54788 + coord_init_before_first_item(insert_coord,
54789 + shift->target);
54790 + insert_coord->between = BEFORE_UNIT;
54791 + } else {
54792 + /* @insert_coord is set after last unit of
54793 + @insert->node */
54794 + coord_init_last_unit(insert_coord,
54795 + shift->wish_stop.node);
54796 + insert_coord->between = AFTER_UNIT;
54797 + }
54798 + }
54799 + return;
54800 + }
54801 +
54802 + /* there was shifting to left */
54803 + if (shift->everything) {
54804 + /* everything wanted was shifted */
54805 + if (including_insert_coord) {
54806 + /* @insert_coord is set after last unit in @to node */
54807 + coord_init_after_last_item(insert_coord, shift->target);
54808 + } else {
54809 + /* @insert_coord is set before first unit in the same
54810 + node */
54811 + coord_init_before_first_item(insert_coord,
54812 + shift->wish_stop.node);
54813 + }
54814 + return;
54815 + }
54816 +
54817 + /* FIXME-VS: the code below is complicated because with between ==
54818 + AFTER_ITEM unit_pos is set to 0 */
54819 +
54820 + if (!removed) {
54821 + /* no items were shifted entirely */
54822 + assert("vs-195", shift->merging_units == 0
54823 + || shift->part_units == 0);
54824 +
54825 + if (shift->real_stop.item_pos == insert_coord->item_pos) {
54826 + if (shift->merging_units) {
54827 + if (insert_coord->between == AFTER_UNIT) {
54828 + assert("nikita-1441",
54829 + insert_coord->unit_pos >=
54830 + shift->merging_units);
54831 + insert_coord->unit_pos -=
54832 + shift->merging_units;
54833 + } else if (insert_coord->between == BEFORE_UNIT) {
54834 + assert("nikita-2090",
54835 + insert_coord->unit_pos >
54836 + shift->merging_units);
54837 + insert_coord->unit_pos -=
54838 + shift->merging_units;
54839 + }
54840 +
54841 + assert("nikita-2083",
54842 + insert_coord->unit_pos + 1);
54843 + } else {
54844 + if (insert_coord->between == AFTER_UNIT) {
54845 + assert("nikita-1442",
54846 + insert_coord->unit_pos >=
54847 + shift->part_units);
54848 + insert_coord->unit_pos -=
54849 + shift->part_units;
54850 + } else if (insert_coord->between == BEFORE_UNIT) {
54851 + assert("nikita-2089",
54852 + insert_coord->unit_pos >
54853 + shift->part_units);
54854 + insert_coord->unit_pos -=
54855 + shift->part_units;
54856 + }
54857 +
54858 + assert("nikita-2084",
54859 + insert_coord->unit_pos + 1);
54860 + }
54861 + }
54862 + return;
54863 + }
54864 +
54865 + /* we shifted to left and there was no enough space for everything */
54866 + switch (insert_coord->between) {
54867 + case AFTER_UNIT:
54868 + case BEFORE_UNIT:
54869 + if (shift->real_stop.item_pos == insert_coord->item_pos)
54870 + insert_coord->unit_pos -= shift->part_units;
54871 + case AFTER_ITEM:
54872 + coord_add_item_pos(insert_coord, -removed);
54873 + break;
54874 + default:
54875 + impossible("nikita-2087", "not ready");
54876 + }
54877 + assert("nikita-2085", insert_coord->unit_pos + 1);
54878 +}
54879 +
54880 +static int call_shift_hooks(struct shift_params *shift)
54881 +{
54882 + unsigned i, shifted;
54883 + coord_t coord;
54884 + item_plugin *iplug;
54885 +
54886 + assert("vs-275", !node_is_empty(shift->target));
54887 +
54888 + /* number of items shift touches */
54889 + shifted =
54890 + shift->entire + (shift->merging_units ? 1 : 0) +
54891 + (shift->part_units ? 1 : 0);
54892 +
54893 + if (shift->pend == SHIFT_LEFT) {
54894 + /* moved items are at the end */
54895 + coord_init_last_unit(&coord, shift->target);
54896 + coord.unit_pos = 0;
54897 +
54898 + assert("vs-279", shift->pend == 1);
54899 + for (i = 0; i < shifted; i++) {
54900 + unsigned from, count;
54901 +
54902 + iplug = item_plugin_by_coord(&coord);
54903 + if (i == 0 && shift->part_units) {
54904 + assert("vs-277",
54905 + coord_num_units(&coord) ==
54906 + shift->part_units);
54907 + count = shift->part_units;
54908 + from = 0;
54909 + } else if (i == shifted - 1 && shift->merging_units) {
54910 + count = shift->merging_units;
54911 + from = coord_num_units(&coord) - count;
54912 + } else {
54913 + count = coord_num_units(&coord);
54914 + from = 0;
54915 + }
54916 +
54917 + if (iplug->b.shift_hook) {
54918 + iplug->b.shift_hook(&coord, from, count,
54919 + shift->wish_stop.node);
54920 + }
54921 + coord_add_item_pos(&coord, -shift->pend);
54922 + }
54923 + } else {
54924 + /* moved items are at the beginning */
54925 + coord_init_first_unit(&coord, shift->target);
54926 +
54927 + assert("vs-278", shift->pend == -1);
54928 + for (i = 0; i < shifted; i++) {
54929 + unsigned from, count;
54930 +
54931 + iplug = item_plugin_by_coord(&coord);
54932 + if (i == 0 && shift->part_units) {
54933 + assert("vs-277",
54934 + coord_num_units(&coord) ==
54935 + shift->part_units);
54936 + count = coord_num_units(&coord);
54937 + from = 0;
54938 + } else if (i == shifted - 1 && shift->merging_units) {
54939 + count = shift->merging_units;
54940 + from = 0;
54941 + } else {
54942 + count = coord_num_units(&coord);
54943 + from = 0;
54944 + }
54945 +
54946 + if (iplug->b.shift_hook) {
54947 + iplug->b.shift_hook(&coord, from, count,
54948 + shift->wish_stop.node);
54949 + }
54950 + coord_add_item_pos(&coord, -shift->pend);
54951 + }
54952 + }
54953 +
54954 + return 0;
54955 +}
54956 +
54957 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
54958 +static int
54959 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
54960 +{
54961 + assert("vs-944", shift->real_stop.node == old->node);
54962 +
54963 + if (shift->real_stop.item_pos < old->item_pos)
54964 + return 0;
54965 + if (shift->real_stop.item_pos == old->item_pos) {
54966 + if (shift->real_stop.unit_pos < old->unit_pos)
54967 + return 0;
54968 + }
54969 + return 1;
54970 +}
54971 +
54972 +/* shift to right is completed. Return 1 if unit @old was moved to right
54973 + neighbor */
54974 +static int
54975 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
54976 +{
54977 + assert("vs-944", shift->real_stop.node == old->node);
54978 +
54979 + if (shift->real_stop.item_pos > old->item_pos)
54980 + return 0;
54981 + if (shift->real_stop.item_pos == old->item_pos) {
54982 + if (shift->real_stop.unit_pos > old->unit_pos)
54983 + return 0;
54984 + }
54985 + return 1;
54986 +}
54987 +
54988 +/* coord @old was set in node from which shift was performed. What was shifted
54989 + is stored in @shift. Update @old correspondingly to performed shift */
54990 +static coord_t *adjust_coord2(const struct shift_params *shift,
54991 + const coord_t * old, coord_t * new)
54992 +{
54993 + coord_clear_iplug(new);
54994 + new->between = old->between;
54995 +
54996 + coord_clear_iplug(new);
54997 + if (old->node == shift->target) {
54998 + if (shift->pend == SHIFT_LEFT) {
54999 + /* coord which is set inside of left neighbor does not
55000 + change during shift to left */
55001 + coord_dup(new, old);
55002 + return new;
55003 + }
55004 + new->node = old->node;
55005 + coord_set_item_pos(new,
55006 + old->item_pos + shift->entire +
55007 + (shift->part_units ? 1 : 0));
55008 + new->unit_pos = old->unit_pos;
55009 + if (old->item_pos == 0 && shift->merging_units)
55010 + new->unit_pos += shift->merging_units;
55011 + return new;
55012 + }
55013 +
55014 + assert("vs-977", old->node == shift->wish_stop.node);
55015 + if (shift->pend == SHIFT_LEFT) {
55016 + if (unit_moved_left(shift, old)) {
55017 + /* unit @old moved to left neighbor. Calculate its
55018 + coordinate there */
55019 + new->node = shift->target;
55020 + coord_set_item_pos(new,
55021 + node_num_items(shift->target) -
55022 + shift->entire -
55023 + (shift->part_units ? 1 : 0) +
55024 + old->item_pos);
55025 +
55026 + new->unit_pos = old->unit_pos;
55027 + if (shift->merging_units) {
55028 + coord_dec_item_pos(new);
55029 + if (old->item_pos == 0) {
55030 + /* unit_pos only changes if item got
55031 + merged */
55032 + new->unit_pos =
55033 + coord_num_units(new) -
55034 + (shift->merging_units -
55035 + old->unit_pos);
55036 + }
55037 + }
55038 + } else {
55039 + /* unit @old did not move to left neighbor.
55040 +
55041 + Use _nocheck, because @old is outside of its node.
55042 + */
55043 + coord_dup_nocheck(new, old);
55044 + coord_add_item_pos(new,
55045 + -shift->u.future_first.item_pos);
55046 + if (new->item_pos == 0)
55047 + new->unit_pos -= shift->u.future_first.unit_pos;
55048 + }
55049 + } else {
55050 + if (unit_moved_right(shift, old)) {
55051 + /* unit @old moved to right neighbor */
55052 + new->node = shift->target;
55053 + coord_set_item_pos(new,
55054 + old->item_pos -
55055 + shift->real_stop.item_pos);
55056 + if (new->item_pos == 0) {
55057 + /* unit @old might change unit pos */
55058 + coord_set_item_pos(new,
55059 + old->unit_pos -
55060 + shift->real_stop.unit_pos);
55061 + }
55062 + } else {
55063 + /* unit @old did not move to right neighbor, therefore
55064 + it did not change */
55065 + coord_dup(new, old);
55066 + }
55067 + }
55068 + coord_set_iplug(new, item_plugin_by_coord(new));
55069 + return new;
55070 +}
55071 +
55072 +/* this is called when shift is completed (something of source node is copied
55073 + to target and deleted in source) to update all taps set in current
55074 + context */
55075 +static void update_taps(const struct shift_params *shift)
55076 +{
55077 + tap_t *tap;
55078 + coord_t new;
55079 +
55080 + for_all_taps(tap) {
55081 + /* update only taps set to nodes participating in shift */
55082 + if (tap->coord->node == shift->wish_stop.node
55083 + || tap->coord->node == shift->target)
55084 + tap_to_coord(tap,
55085 + adjust_coord2(shift, tap->coord, &new));
55086 + }
55087 +}
55088 +
55089 +#if REISER4_DEBUG
55090 +
55091 +struct shift_check {
55092 + reiser4_key key;
55093 + __u16 plugin_id;
55094 + union {
55095 + __u64 bytes;
55096 + __u64 entries;
55097 + void *unused;
55098 + } u;
55099 +};
55100 +
55101 +void *shift_check_prepare(const znode * left, const znode * right)
55102 +{
55103 + pos_in_node_t i, nr_items;
55104 + int mergeable;
55105 + struct shift_check *data;
55106 + item_header40 *ih;
55107 +
55108 + if (node_is_empty(left) || node_is_empty(right))
55109 + mergeable = 0;
55110 + else {
55111 + coord_t l, r;
55112 +
55113 + coord_init_last_unit(&l, left);
55114 + coord_init_first_unit(&r, right);
55115 + mergeable = are_items_mergeable(&l, &r);
55116 + }
55117 + nr_items =
55118 + node40_num_of_items_internal(left) +
55119 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55120 + data =
55121 + kmalloc(sizeof(struct shift_check) * nr_items,
55122 + reiser4_ctx_gfp_mask_get());
55123 + if (data != NULL) {
55124 + coord_t coord;
55125 + pos_in_node_t item_pos;
55126 +
55127 + coord_init_first_unit(&coord, left);
55128 + i = 0;
55129 +
55130 + for (item_pos = 0;
55131 + item_pos < node40_num_of_items_internal(left);
55132 + item_pos++) {
55133 +
55134 + coord_set_item_pos(&coord, item_pos);
55135 + ih = node40_ih_at_coord(&coord);
55136 +
55137 + data[i].key = ih->key;
55138 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55139 + switch (data[i].plugin_id) {
55140 + case CTAIL_ID:
55141 + case FORMATTING_ID:
55142 + data[i].u.bytes = coord_num_units(&coord);
55143 + break;
55144 + case EXTENT_POINTER_ID:
55145 + data[i].u.bytes =
55146 + reiser4_extent_size(&coord,
55147 + coord_num_units(&coord));
55148 + break;
55149 + case COMPOUND_DIR_ID:
55150 + data[i].u.entries = coord_num_units(&coord);
55151 + break;
55152 + default:
55153 + data[i].u.unused = NULL;
55154 + break;
55155 + }
55156 + i++;
55157 + }
55158 +
55159 + coord_init_first_unit(&coord, right);
55160 +
55161 + if (mergeable) {
55162 + assert("vs-1609", i != 0);
55163 +
55164 + ih = node40_ih_at_coord(&coord);
55165 +
55166 + assert("vs-1589",
55167 + data[i - 1].plugin_id ==
55168 + le16_to_cpu(get_unaligned(&ih->plugin_id)));
55169 + switch (data[i - 1].plugin_id) {
55170 + case CTAIL_ID:
55171 + case FORMATTING_ID:
55172 + data[i - 1].u.bytes += coord_num_units(&coord);
55173 + break;
55174 + case EXTENT_POINTER_ID:
55175 + data[i - 1].u.bytes +=
55176 + reiser4_extent_size(&coord,
55177 + coord_num_units(&coord));
55178 + break;
55179 + case COMPOUND_DIR_ID:
55180 + data[i - 1].u.entries +=
55181 + coord_num_units(&coord);
55182 + break;
55183 + default:
55184 + impossible("vs-1605", "wrong mergeable item");
55185 + break;
55186 + }
55187 + item_pos = 1;
55188 + } else
55189 + item_pos = 0;
55190 + for (; item_pos < node40_num_of_items_internal(right);
55191 + item_pos++) {
55192 +
55193 + assert("vs-1604", i < nr_items);
55194 + coord_set_item_pos(&coord, item_pos);
55195 + ih = node40_ih_at_coord(&coord);
55196 +
55197 + data[i].key = ih->key;
55198 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55199 + switch (data[i].plugin_id) {
55200 + case CTAIL_ID:
55201 + case FORMATTING_ID:
55202 + data[i].u.bytes = coord_num_units(&coord);
55203 + break;
55204 + case EXTENT_POINTER_ID:
55205 + data[i].u.bytes =
55206 + reiser4_extent_size(&coord,
55207 + coord_num_units(&coord));
55208 + break;
55209 + case COMPOUND_DIR_ID:
55210 + data[i].u.entries = coord_num_units(&coord);
55211 + break;
55212 + default:
55213 + data[i].u.unused = NULL;
55214 + break;
55215 + }
55216 + i++;
55217 + }
55218 + assert("vs-1606", i == nr_items);
55219 + }
55220 + return data;
55221 +}
55222 +
55223 +void shift_check(void *vp, const znode * left, const znode * right)
55224 +{
55225 + pos_in_node_t i, nr_items;
55226 + coord_t coord;
55227 + __u64 last_bytes;
55228 + int mergeable;
55229 + item_header40 *ih;
55230 + pos_in_node_t item_pos;
55231 + struct shift_check *data;
55232 +
55233 + data = (struct shift_check *)vp;
55234 +
55235 + if (data == NULL)
55236 + return;
55237 +
55238 + if (node_is_empty(left) || node_is_empty(right))
55239 + mergeable = 0;
55240 + else {
55241 + coord_t l, r;
55242 +
55243 + coord_init_last_unit(&l, left);
55244 + coord_init_first_unit(&r, right);
55245 + mergeable = are_items_mergeable(&l, &r);
55246 + }
55247 +
55248 + nr_items =
55249 + node40_num_of_items_internal(left) +
55250 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55251 +
55252 + i = 0;
55253 + last_bytes = 0;
55254 +
55255 + coord_init_first_unit(&coord, left);
55256 +
55257 + for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
55258 + item_pos++) {
55259 +
55260 + coord_set_item_pos(&coord, item_pos);
55261 + ih = node40_ih_at_coord(&coord);
55262 +
55263 + assert("vs-1611", i == item_pos);
55264 + assert("vs-1590", keyeq(&ih->key, &data[i].key));
55265 + assert("vs-1591",
55266 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55267 + if ((i < (node40_num_of_items_internal(left) - 1))
55268 + || !mergeable) {
55269 + switch (data[i].plugin_id) {
55270 + case CTAIL_ID:
55271 + case FORMATTING_ID:
55272 + assert("vs-1592",
55273 + data[i].u.bytes ==
55274 + coord_num_units(&coord));
55275 + break;
55276 + case EXTENT_POINTER_ID:
55277 + assert("vs-1593",
55278 + data[i].u.bytes ==
55279 + reiser4_extent_size(&coord,
55280 + coord_num_units
55281 + (&coord)));
55282 + break;
55283 + case COMPOUND_DIR_ID:
55284 + assert("vs-1594",
55285 + data[i].u.entries ==
55286 + coord_num_units(&coord));
55287 + break;
55288 + default:
55289 + break;
55290 + }
55291 + }
55292 + if (item_pos == (node40_num_of_items_internal(left) - 1)
55293 + && mergeable) {
55294 + switch (data[i].plugin_id) {
55295 + case CTAIL_ID:
55296 + case FORMATTING_ID:
55297 + last_bytes = coord_num_units(&coord);
55298 + break;
55299 + case EXTENT_POINTER_ID:
55300 + last_bytes =
55301 + reiser4_extent_size(&coord,
55302 + coord_num_units(&coord));
55303 + break;
55304 + case COMPOUND_DIR_ID:
55305 + last_bytes = coord_num_units(&coord);
55306 + break;
55307 + default:
55308 + impossible("vs-1595", "wrong mergeable item");
55309 + break;
55310 + }
55311 + }
55312 + i++;
55313 + }
55314 +
55315 + coord_init_first_unit(&coord, right);
55316 + if (mergeable) {
55317 + ih = node40_ih_at_coord(&coord);
55318 +
55319 + assert("vs-1589",
55320 + data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
55321 + assert("vs-1608", last_bytes != 0);
55322 + switch (data[i - 1].plugin_id) {
55323 + case CTAIL_ID:
55324 + case FORMATTING_ID:
55325 + assert("vs-1596",
55326 + data[i - 1].u.bytes ==
55327 + last_bytes + coord_num_units(&coord));
55328 + break;
55329 +
55330 + case EXTENT_POINTER_ID:
55331 + assert("vs-1597",
55332 + data[i - 1].u.bytes ==
55333 + last_bytes + reiser4_extent_size(&coord,
55334 + coord_num_units
55335 + (&coord)));
55336 + break;
55337 +
55338 + case COMPOUND_DIR_ID:
55339 + assert("vs-1598",
55340 + data[i - 1].u.bytes ==
55341 + last_bytes + coord_num_units(&coord));
55342 + break;
55343 + default:
55344 + impossible("vs-1599", "wrong mergeable item");
55345 + break;
55346 + }
55347 + item_pos = 1;
55348 + } else
55349 + item_pos = 0;
55350 +
55351 + for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
55352 +
55353 + coord_set_item_pos(&coord, item_pos);
55354 + ih = node40_ih_at_coord(&coord);
55355 +
55356 + assert("vs-1612", keyeq(&ih->key, &data[i].key));
55357 + assert("vs-1613",
55358 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55359 + switch (data[i].plugin_id) {
55360 + case CTAIL_ID:
55361 + case FORMATTING_ID:
55362 + assert("vs-1600",
55363 + data[i].u.bytes == coord_num_units(&coord));
55364 + break;
55365 + case EXTENT_POINTER_ID:
55366 + assert("vs-1601",
55367 + data[i].u.bytes ==
55368 + reiser4_extent_size(&coord,
55369 + coord_num_units
55370 + (&coord)));
55371 + break;
55372 + case COMPOUND_DIR_ID:
55373 + assert("vs-1602",
55374 + data[i].u.entries == coord_num_units(&coord));
55375 + break;
55376 + default:
55377 + break;
55378 + }
55379 + i++;
55380 + }
55381 +
55382 + assert("vs-1603", i == nr_items);
55383 + kfree(data);
55384 +}
55385 +
55386 +#endif
55387 +
55388 +/* plugin->u.node.shift
55389 + look for description of this method in plugin/node/node.h */
55390 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
55391 + deleted from the tree if this is set to 1 */
55392 + int including_stop_coord, carry_plugin_info * info)
55393 +{
55394 + struct shift_params shift;
55395 + int result;
55396 + znode *left, *right;
55397 + znode *source;
55398 + int target_empty;
55399 +
55400 + assert("nikita-2161", coord_check(from));
55401 +
55402 + memset(&shift, 0, sizeof(shift));
55403 + shift.pend = pend;
55404 + shift.wish_stop = *from;
55405 + shift.target = to;
55406 +
55407 + assert("nikita-1473", znode_is_write_locked(from->node));
55408 + assert("nikita-1474", znode_is_write_locked(to));
55409 +
55410 + source = from->node;
55411 +
55412 + /* set @shift.wish_stop to rightmost/leftmost unit among units we want
55413 + shifted */
55414 + if (pend == SHIFT_LEFT) {
55415 + result = coord_set_to_left(&shift.wish_stop);
55416 + left = to;
55417 + right = from->node;
55418 + } else {
55419 + result = coord_set_to_right(&shift.wish_stop);
55420 + left = from->node;
55421 + right = to;
55422 + }
55423 +
55424 + if (result) {
55425 + /* move insertion coord even if there is nothing to move */
55426 + if (including_stop_coord) {
55427 + /* move insertion coord (@from) */
55428 + if (pend == SHIFT_LEFT) {
55429 + /* after last item in target node */
55430 + coord_init_after_last_item(from, to);
55431 + } else {
55432 + /* before first item in target node */
55433 + coord_init_before_first_item(from, to);
55434 + }
55435 + }
55436 +
55437 + if (delete_child && node_is_empty(shift.wish_stop.node))
55438 + result =
55439 + prepare_removal_node40(shift.wish_stop.node, info);
55440 + else
55441 + result = 0;
55442 + /* there is nothing to shift */
55443 + assert("nikita-2078", coord_check(from));
55444 + return result;
55445 + }
55446 +
55447 + target_empty = node_is_empty(to);
55448 +
55449 + /* when first node plugin with item body compression is implemented,
55450 + this must be changed to call node specific plugin */
55451 +
55452 + /* shift->stop_coord is updated to last unit which really will be
55453 + shifted */
55454 + estimate_shift(&shift, get_current_context());
55455 + if (!shift.shift_bytes) {
55456 + /* we could not shift anything */
55457 + assert("nikita-2079", coord_check(from));
55458 + return 0;
55459 + }
55460 +
55461 + copy(&shift);
55462 +
55463 + /* result value of this is important. It is used by adjust_coord below */
55464 + result = delete_copied(&shift);
55465 +
55466 + assert("vs-1610", result >= 0);
55467 + assert("vs-1471",
55468 + ((reiser4_context *) current->journal_info)->magic ==
55469 + context_magic);
55470 +
55471 + /* item which has been moved from one node to another might want to do
55472 + something on that event. This can be done by item's shift_hook
55473 + method, which will be now called for every moved items */
55474 + call_shift_hooks(&shift);
55475 +
55476 + assert("vs-1472",
55477 + ((reiser4_context *) current->journal_info)->magic ==
55478 + context_magic);
55479 +
55480 + update_taps(&shift);
55481 +
55482 + assert("vs-1473",
55483 + ((reiser4_context *) current->journal_info)->magic ==
55484 + context_magic);
55485 +
55486 + /* adjust @from pointer in accordance with @including_stop_coord flag
55487 + and amount of data which was really shifted */
55488 + adjust_coord(from, &shift, result, including_stop_coord);
55489 +
55490 + if (target_empty)
55491 + /*
55492 + * items were shifted into empty node. Update delimiting key.
55493 + */
55494 + result = prepare_for_update(NULL, left, info);
55495 +
55496 + /* add update operation to @info, which is the list of operations to
55497 + be performed on a higher level */
55498 + result = prepare_for_update(left, right, info);
55499 + if (!result && node_is_empty(source) && delete_child) {
55500 + /* all contents of @from->node is moved to @to and @from->node
55501 + has to be removed from the tree, so, on higher level we
55502 + will be removing the pointer to node @from->node */
55503 + result = prepare_removal_node40(source, info);
55504 + }
55505 + assert("nikita-2080", coord_check(from));
55506 + return result ? result : (int)shift.shift_bytes;
55507 +}
55508 +
55509 +/* plugin->u.node.fast_insert()
55510 + look for description of this method in plugin/node/node.h */
55511 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55512 +{
55513 + return 1;
55514 +}
55515 +
55516 +/* plugin->u.node.fast_paste()
55517 + look for description of this method in plugin/node/node.h */
55518 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55519 +{
55520 + return 1;
55521 +}
55522 +
55523 +/* plugin->u.node.fast_cut()
55524 + look for description of this method in plugin/node/node.h */
55525 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55526 +{
55527 + return 1;
55528 +}
55529 +
55530 +/* plugin->u.node.modify - not defined */
55531 +
55532 +/* plugin->u.node.max_item_size */
55533 +int max_item_size_node40(void)
55534 +{
55535 + return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
55536 + sizeof(item_header40);
55537 +}
55538 +
55539 +/* plugin->u.node.set_item_plugin */
55540 +int set_item_plugin_node40(coord_t *coord, item_id id)
55541 +{
55542 + item_header40 *ih;
55543 +
55544 + ih = node40_ih_at_coord(coord);
55545 + put_unaligned(cpu_to_le16(id), &ih->plugin_id);
55546 + coord->iplugid = id;
55547 + return 0;
55548 +}
55549 +
55550 +/*
55551 + Local variables:
55552 + c-indentation-style: "K&R"
55553 + mode-name: "LC"
55554 + c-basic-offset: 8
55555 + tab-width: 8
55556 + fill-column: 120
55557 + scroll-step: 1
55558 + End:
55559 +*/
55560 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node40.h linux-2.6.23/fs/reiser4/plugin/node/node40.h
55561 --- linux-2.6.23.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
55562 +++ linux-2.6.23/fs/reiser4/plugin/node/node40.h 2007-12-04 16:49:30.000000000 +0300
55563 @@ -0,0 +1,125 @@
55564 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55565 +
55566 +#if !defined( __REISER4_NODE40_H__ )
55567 +#define __REISER4_NODE40_H__
55568 +
55569 +#include "../../forward.h"
55570 +#include "../../dformat.h"
55571 +#include "node.h"
55572 +
55573 +#include <linux/types.h>
55574 +
55575 +/* format of node header for 40 node layouts. Keep bloat out of this struct. */
55576 +typedef struct node40_header {
55577 + /* identifier of node plugin. Must be located at the very beginning
55578 + of a node. */
55579 + common_node_header common_header; /* this is 16 bits */
55580 + /* number of items. Should be first element in the node header,
55581 + because we haven't yet finally decided whether it shouldn't go into
55582 + common_header.
55583 + */
55584 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
55585 + * node format at compile time, and it is this one, accesses do not function dereference when
55586 + * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
55587 + d16 nr_items;
55588 + /* free space in node measured in bytes */
55589 + d16 free_space;
55590 + /* offset to start of free space in node */
55591 + d16 free_space_start;
55592 + /* for reiser4_fsck. When information about what is a free
55593 + block is corrupted, and we try to recover everything even
55594 + if marked as freed, then old versions of data may
55595 + duplicate newer versions, and this field allows us to
55596 + restore the newer version. Also useful for when users
55597 + who don't have the new trashcan installed on their linux distro
55598 + delete the wrong files and send us desperate emails
55599 + offering $25 for them back. */
55600 +
55601 + /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
55602 + d32 magic;
55603 + /* flushstamp is made of mk_id and write_counter. mk_id is an
55604 + id generated randomly at mkreiserfs time. So we can just
55605 + skip all nodes with different mk_id. write_counter is d64
55606 + incrementing counter of writes on disk. It is used for
55607 + choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
55608 +
55609 + d32 mkfs_id;
55610 + d64 flush_id;
55611 + /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
55612 + and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
55613 + d16 flags;
55614 +
55615 + /* 1 is leaf level, 2 is twig level, root is the numerically
55616 + largest level */
55617 + d8 level;
55618 +
55619 + d8 pad;
55620 +} PACKED node40_header;
55621 +
55622 +/* item headers are not standard across all node layouts, pass
55623 + pos_in_node to functions instead */
55624 +typedef struct item_header40 {
55625 + /* key of item */
55626 + /* 0 */ reiser4_key key;
55627 + /* offset from start of a node measured in 8-byte chunks */
55628 + /* 24 */ d16 offset;
55629 + /* 26 */ d16 flags;
55630 + /* 28 */ d16 plugin_id;
55631 +} PACKED item_header40;
55632 +
55633 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
55634 +size_t free_space_node40(znode * node);
55635 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
55636 + lookup_bias bias, coord_t * coord);
55637 +int num_of_items_node40(const znode * node);
55638 +char *item_by_coord_node40(const coord_t * coord);
55639 +int length_by_coord_node40(const coord_t * coord);
55640 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
55641 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
55642 +size_t estimate_node40(znode * node);
55643 +int check_node40(const znode * node, __u32 flags, const char **error);
55644 +int parse_node40(znode * node);
55645 +int init_node40(znode * node);
55646 +#ifdef GUESS_EXISTS
55647 +int guess_node40(const znode * node);
55648 +#endif
55649 +void change_item_size_node40(coord_t * coord, int by);
55650 +int create_item_node40(coord_t * target, const reiser4_key * key,
55651 + reiser4_item_data * data, carry_plugin_info * info);
55652 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
55653 + carry_plugin_info * info);
55654 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
55655 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
55656 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
55657 + /* if @from->node becomes
55658 + empty - it will be deleted from
55659 + the tree if this is set to 1
55660 + */
55661 + int delete_child, int including_stop_coord,
55662 + carry_plugin_info * info);
55663 +
55664 +int fast_insert_node40(const coord_t * coord);
55665 +int fast_paste_node40(const coord_t * coord);
55666 +int fast_cut_node40(const coord_t * coord);
55667 +int max_item_size_node40(void);
55668 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
55669 +int set_item_plugin_node40(coord_t * coord, item_id id);
55670 +int shrink_item_node40(coord_t * coord, int delta);
55671 +
55672 +#if REISER4_DEBUG
55673 +void *shift_check_prepare(const znode *left, const znode *right);
55674 +void shift_check(void *vp, const znode *left, const znode *right);
55675 +#endif
55676 +
55677 +/* __REISER4_NODE40_H__ */
55678 +#endif
55679 +/*
55680 + Local variables:
55681 + c-indentation-style: "K&R"
55682 + mode-name: "LC"
55683 + c-basic-offset: 8
55684 + tab-width: 8
55685 + fill-column: 120
55686 + scroll-step: 1
55687 + End:
55688 +*/
55689 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node.c linux-2.6.23/fs/reiser4/plugin/node/node.c
55690 --- linux-2.6.23.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
55691 +++ linux-2.6.23/fs/reiser4/plugin/node/node.c 2007-12-04 16:49:30.000000000 +0300
55692 @@ -0,0 +1,131 @@
55693 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55694 +
55695 +/* Node plugin interface.
55696 +
55697 + Description: The tree provides the abstraction of flows, which it
55698 + internally fragments into items which it stores in nodes.
55699 +
55700 + A key_atom is a piece of data bound to a single key.
55701 +
55702 + For reasonable space efficiency to be achieved it is often
55703 + necessary to store key_atoms in the nodes in the form of items, where
55704 + an item is a sequence of key_atoms of the same or similar type. It is
55705 + more space-efficient, because the item can implement (very)
55706 + efficient compression of key_atom's bodies using internal knowledge
55707 + about their semantics, and it can often avoid having a key for each
55708 + key_atom. Each type of item has specific operations implemented by its
55709 + item handler (see balance.c).
55710 +
55711 + Rationale: the rest of the code (specifically balancing routines)
55712 + accesses leaf level nodes through this interface. This way we can
55713 + implement various block layouts and even combine various layouts
55714 + within the same tree. Balancing/allocating algorithms should not
55715 + care about peculiarities of splitting/merging specific item types,
55716 + but rather should leave that to the item's item handler.
55717 +
55718 + Items, including those that provide the abstraction of flows, have
55719 + the property that if you move them in part or in whole to another
55720 + node, the balancing code invokes their is_left_mergeable()
55721 + item_operation to determine if they are mergeable with their new
55722 + neighbor in the node you have moved them to. For some items the
55723 + is_left_mergeable() function always returns null.
55724 +
55725 + When moving the bodies of items from one node to another:
55726 +
55727 + if a partial item is shifted to another node the balancing code invokes
55728 + an item handler method to handle the item splitting.
55729 +
55730 + if the balancing code needs to merge with an item in the node it
55731 + is shifting to, it will invoke an item handler method to handle
55732 + the item merging.
55733 +
55734 + if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
55735 + adjusting the item headers after the move is done using the node handler.
55736 +*/
55737 +
55738 +#include "../../forward.h"
55739 +#include "../../debug.h"
55740 +#include "../../key.h"
55741 +#include "../../coord.h"
55742 +#include "../plugin_header.h"
55743 +#include "../item/item.h"
55744 +#include "node.h"
55745 +#include "../plugin.h"
55746 +#include "../../znode.h"
55747 +#include "../../tree.h"
55748 +#include "../../super.h"
55749 +#include "../../reiser4.h"
55750 +
55751 +/**
55752 + * leftmost_key_in_node - get the smallest key in node
55753 + * @node:
55754 + * @key: store result here
55755 + *
55756 + * Stores the leftmost key of @node in @key.
55757 + */
55758 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55759 +{
55760 + assert("nikita-1634", node != NULL);
55761 + assert("nikita-1635", key != NULL);
55762 +
55763 + if (!node_is_empty(node)) {
55764 + coord_t first_item;
55765 +
55766 + coord_init_first_unit(&first_item, (znode *) node);
55767 + item_key_by_coord(&first_item, key);
55768 + } else
55769 + *key = *reiser4_max_key();
55770 + return key;
55771 +}
55772 +
55773 +node_plugin node_plugins[LAST_NODE_ID] = {
55774 + [NODE40_ID] = {
55775 + .h = {
55776 + .type_id = REISER4_NODE_PLUGIN_TYPE,
55777 + .id = NODE40_ID,
55778 + .pops = NULL,
55779 + .label = "unified",
55780 + .desc = "unified node layout",
55781 + .linkage = {NULL, NULL}
55782 + },
55783 + .item_overhead = item_overhead_node40,
55784 + .free_space = free_space_node40,
55785 + .lookup = lookup_node40,
55786 + .num_of_items = num_of_items_node40,
55787 + .item_by_coord = item_by_coord_node40,
55788 + .length_by_coord = length_by_coord_node40,
55789 + .plugin_by_coord = plugin_by_coord_node40,
55790 + .key_at = key_at_node40,
55791 + .estimate = estimate_node40,
55792 + .check = check_node40,
55793 + .parse = parse_node40,
55794 + .init = init_node40,
55795 +#ifdef GUESS_EXISTS
55796 + .guess = guess_node40,
55797 +#endif
55798 + .change_item_size = change_item_size_node40,
55799 + .create_item = create_item_node40,
55800 + .update_item_key = update_item_key_node40,
55801 + .cut_and_kill = kill_node40,
55802 + .cut = cut_node40,
55803 + .shift = shift_node40,
55804 + .shrink_item = shrink_item_node40,
55805 + .fast_insert = fast_insert_node40,
55806 + .fast_paste = fast_paste_node40,
55807 + .fast_cut = fast_cut_node40,
55808 + .max_item_size = max_item_size_node40,
55809 + .prepare_removal = prepare_removal_node40,
55810 + .set_item_plugin = set_item_plugin_node40
55811 + }
55812 +};
55813 +
55814 +/*
55815 + Local variables:
55816 + c-indentation-style: "K&R"
55817 + mode-name: "LC"
55818 + c-basic-offset: 8
55819 + tab-width: 8
55820 + fill-column: 120
55821 + scroll-step: 1
55822 + End:
55823 +*/
55824 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/node/node.h linux-2.6.23/fs/reiser4/plugin/node/node.h
55825 --- linux-2.6.23.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
55826 +++ linux-2.6.23/fs/reiser4/plugin/node/node.h 2007-12-04 16:49:30.000000000 +0300
55827 @@ -0,0 +1,272 @@
55828 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55829 +
55830 +/* We need a definition of the default node layout here. */
55831 +
55832 +/* Generally speaking, it is best to have free space in the middle of the
55833 + node so that two sets of things can grow towards it, and to have the
55834 + item bodies on the left so that the last one of them grows into free
55835 + space. We optimize for the case where we append new items to the end
55836 + of the node, or grow the last item, because it hurts nothing to so
55837 + optimize and it is a common special case to do massive insertions in
55838 + increasing key order (and one of cases more likely to have a real user
55839 + notice the delay time for).
55840 +
55841 + formatted leaf default layout: (leaf1)
55842 +
55843 + |node header:item bodies:free space:key + pluginid + item offset|
55844 +
55845 + We grow towards the middle, optimizing layout for the case where we
55846 + append new items to the end of the node. The node header is fixed
55847 + length. Keys, and item offsets plus pluginids for the items
55848 + corresponding to them are in increasing key order, and are fixed
55849 + length. Item offsets are relative to start of node (16 bits creating
55850 + a node size limit of 64k, 12 bits might be a better choice....). Item
55851 + bodies are in decreasing key order. Item bodies have a variable size.
55852 + There is a one to one to one mapping of keys to item offsets to item
55853 + bodies. Item offsets consist of pointers to the zeroth byte of the
55854 + item body. Item length equals the start of the next item minus the
55855 + start of this item, except the zeroth item whose length equals the end
55856 + of the node minus the start of that item (plus a byte). In other
55857 + words, the item length is not recorded anywhere, and it does not need
55858 + to be since it is computable.
55859 +
55860 + Leaf variable length items and keys layout : (lvar)
55861 +
55862 + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55863 +
55864 + We grow towards the middle, optimizing layout for the case where we
55865 + append new items to the end of the node. The node header is fixed
55866 + length. Keys and item offsets for the items corresponding to them are
55867 + in increasing key order, and keys are variable length. Item offsets
55868 + are relative to start of node (16 bits). Item bodies are in
55869 + decreasing key order. Item bodies have a variable size. There is a
55870 + one to one to one mapping of keys to item offsets to item bodies.
55871 + Item offsets consist of pointers to the zeroth byte of the item body.
55872 + Item length equals the start of the next item's key minus the start of
55873 + this item, except the zeroth item whose length equals the end of the
55874 + node minus the start of that item (plus a byte).
55875 +
55876 + leaf compressed keys layout: (lcomp)
55877 +
55878 + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55879 +
55880 + We grow towards the middle, optimizing layout for the case where we
55881 + append new items to the end of the node. The node header is fixed
55882 + length. Keys and item offsets for the items corresponding to them are
55883 + in increasing key order, and keys are variable length. The "key
55884 + inherit" field indicates how much of the key prefix is identical to
55885 + the previous key (stem compression as described in "Managing
55886 + Gigabytes" is used). key_inherit is a one byte integer. The
55887 + intra-node searches performed through this layout are linear searches,
55888 + and this is theorized to not hurt performance much due to the high
55889 + cost of processor stalls on modern CPUs, and the small number of keys
55890 + in a single node. Item offsets are relative to start of node (16
55891 + bits). Item bodies are in decreasing key order. Item bodies have a
55892 + variable size. There is a one to one to one mapping of keys to item
55893 + offsets to item bodies. Item offsets consist of pointers to the
55894 + zeroth byte of the item body. Item length equals the start of the
55895 + next item minus the start of this item, except the zeroth item whose
55896 + length equals the end of the node minus the start of that item (plus a
55897 + byte). In other words, item length and key length is not recorded
55898 + anywhere, and it does not need to be since it is computable.
55899 +
55900 + internal node default layout: (idef1)
55901 +
55902 + just like ldef1 except that item bodies are either blocknrs of
55903 + children or extents, and moving them may require updating parent
55904 + pointers in the nodes that they point to.
55905 +*/
55906 +
55907 +/* There is an inherent 3-way tradeoff between optimizing and
55908 + exchanging disks between different architectures and code
55909 + complexity. This is optimal and simple and inexchangeable.
55910 + Someone else can do the code for exchanging disks and make it
55911 + complex. It would not be that hard. Using other than the PAGE_SIZE
55912 + might be suboptimal.
55913 +*/
55914 +
55915 +#if !defined( __REISER4_NODE_H__ )
55916 +#define __REISER4_NODE_H__
55917 +
55918 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55919 +
55920 +#include "../../dformat.h"
55921 +#include "../plugin_header.h"
55922 +
55923 +#include <linux/types.h>
55924 +
55925 +typedef enum {
55926 + NS_FOUND = 0,
55927 + NS_NOT_FOUND = -ENOENT
55928 +} node_search_result;
55929 +
55930 +/* Maximal possible space overhead for creation of new item in a node */
55931 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55932 +
55933 +typedef enum {
55934 + REISER4_NODE_DKEYS = (1 << 0),
55935 + REISER4_NODE_TREE_STABLE = (1 << 1)
55936 +} reiser4_node_check_flag;
55937 +
55938 +/* cut and cut_and_kill have too long list of parameters. This structure is just to safe some space on stack */
55939 +struct cut_list {
55940 + coord_t *from;
55941 + coord_t *to;
55942 + const reiser4_key *from_key;
55943 + const reiser4_key *to_key;
55944 + reiser4_key *smallest_removed;
55945 + carry_plugin_info *info;
55946 + __u32 flags;
55947 + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55948 + lock_handle *left;
55949 + lock_handle *right;
55950 +};
55951 +
55952 +struct carry_cut_data;
55953 +struct carry_kill_data;
55954 +
55955 +/* The responsibility of the node plugin is to store and give access
55956 + to the sequence of items within the node. */
55957 +typedef struct node_plugin {
55958 + /* generic plugin fields */
55959 + plugin_header h;
55960 +
55961 + /* calculates the amount of space that will be required to store an
55962 + item which is in addition to the space consumed by the item body.
55963 + (the space consumed by the item body can be gotten by calling
55964 + item->estimate) */
55965 + size_t(*item_overhead) (const znode * node, flow_t * f);
55966 +
55967 + /* returns free space by looking into node (i.e., without using
55968 + znode->free_space). */
55969 + size_t(*free_space) (znode * node);
55970 + /* search within the node for the one item which might
55971 + contain the key, invoking item->search_within to search within
55972 + that item to see if it is in there */
55973 + node_search_result(*lookup) (znode * node, const reiser4_key * key,
55974 + lookup_bias bias, coord_t * coord);
55975 + /* number of items in node */
55976 + int (*num_of_items) (const znode * node);
55977 +
55978 + /* store information about item in @coord in @data */
55979 + /* break into several node ops, don't add any more uses of this before doing so */
55980 + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55981 + char *(*item_by_coord) (const coord_t * coord);
55982 + int (*length_by_coord) (const coord_t * coord);
55983 + item_plugin *(*plugin_by_coord) (const coord_t * coord);
55984 +
55985 + /* store item key in @key */
55986 + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55987 + /* conservatively estimate whether unit of what size can fit
55988 + into node. This estimation should be performed without
55989 + actually looking into the node's content (free space is saved in
55990 + znode). */
55991 + size_t(*estimate) (znode * node);
55992 +
55993 + /* performs every consistency check the node plugin author could
55994 + imagine. Optional. */
55995 + int (*check) (const znode * node, __u32 flags, const char **error);
55996 +
55997 + /* Called when node is read into memory and node plugin is
55998 + already detected. This should read some data into znode (like free
55999 + space counter) and, optionally, check data consistency.
56000 + */
56001 + int (*parse) (znode * node);
56002 + /* This method is called on a new node to initialise plugin specific
56003 + data (header, etc.) */
56004 + int (*init) (znode * node);
56005 + /* Check whether @node content conforms to this plugin format.
56006 + Probably only useful after support for old V3.x formats is added.
56007 + Uncomment after 4.0 only.
56008 + */
56009 + /* int ( *guess )( const znode *node ); */
56010 +#if REISER4_DEBUG
56011 + void (*print) (const char *prefix, const znode * node, __u32 flags);
56012 +#endif
56013 + /* change size of @item by @by bytes. @item->node has enough free
56014 + space. When @by > 0 - free space is appended to end of item. When
56015 + @by < 0 - item is truncated - it is assumed that last @by bytes if
56016 + the item are freed already */
56017 + void (*change_item_size) (coord_t * item, int by);
56018 +
56019 + /* create new item @length bytes long in coord @target */
56020 + int (*create_item) (coord_t * target, const reiser4_key * key,
56021 + reiser4_item_data * data, carry_plugin_info * info);
56022 +
56023 + /* update key of item. */
56024 + void (*update_item_key) (coord_t * target, const reiser4_key * key,
56025 + carry_plugin_info * info);
56026 +
56027 + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
56028 + int (*cut) (struct carry_cut_data *, carry_plugin_info *);
56029 +
56030 + /*
56031 + * shrink item pointed to by @coord by @delta bytes.
56032 + */
56033 + int (*shrink_item) (coord_t * coord, int delta);
56034 +
56035 + /* copy as much as possible but not more than up to @stop from
56036 + @stop->node to @target. If (pend == append) then data from beginning of
56037 + @stop->node are copied to the end of @target. If (pend == prepend) then
56038 + data from the end of @stop->node are copied to the beginning of
56039 + @target. Copied data are removed from @stop->node. Information
56040 + about what to do on upper level is stored in @todo */
56041 + int (*shift) (coord_t * stop, znode * target, shift_direction pend,
56042 + int delete_node, int including_insert_coord,
56043 + carry_plugin_info * info);
56044 + /* return true if this node allows skip carry() in some situations
56045 + (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56046 + emulation doesn't.
56047 +
56048 + This will speedup insertions that doesn't require updates to the
56049 + parent, by bypassing initialisation of carry() structures. It's
56050 + believed that majority of insertions will fit there.
56051 +
56052 + */
56053 + int (*fast_insert) (const coord_t * coord);
56054 + int (*fast_paste) (const coord_t * coord);
56055 + int (*fast_cut) (const coord_t * coord);
56056 + /* this limits max size of item which can be inserted into a node and
56057 + number of bytes item in a node may be appended with */
56058 + int (*max_item_size) (void);
56059 + int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56060 + /* change plugin id of items which are in a node already. Currently it is Used in tail conversion for regular
56061 + * files */
56062 + int (*set_item_plugin) (coord_t * coord, item_id);
56063 +} node_plugin;
56064 +
56065 +typedef enum {
56066 + /* standard unified node layout used for both leaf and internal
56067 + nodes */
56068 + NODE40_ID,
56069 + LAST_NODE_ID
56070 +} reiser4_node_id;
56071 +
56072 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56073 +#if REISER4_DEBUG
56074 +extern void print_node_content(const char *prefix, const znode * node,
56075 + __u32 flags);
56076 +#endif
56077 +
56078 +extern void indent_znode(const znode * node);
56079 +
56080 +typedef struct common_node_header {
56081 + /*
56082 + * identifier of node plugin. Must be located at the very beginning of
56083 + * a node.
56084 + */
56085 + __le16 plugin_id;
56086 +} common_node_header;
56087 +
56088 +/* __REISER4_NODE_H__ */
56089 +#endif
56090 +/*
56091 + * Local variables:
56092 + * c-indentation-style: "K&R"
56093 + * mode-name: "LC"
56094 + * c-basic-offset: 8
56095 + * tab-width: 8
56096 + * fill-column: 79
56097 + * scroll-step: 1
56098 + * End:
56099 + */
56100 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/object.c linux-2.6.23/fs/reiser4/plugin/object.c
56101 --- linux-2.6.23.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
56102 +++ linux-2.6.23/fs/reiser4/plugin/object.c 2007-12-04 18:49:45.000000000 +0300
56103 @@ -0,0 +1,531 @@
56104 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56105 + * reiser4/README */
56106 +
56107 +/*
56108 + * Examples of object plugins: file, directory, symlink, special file.
56109 + *
56110 + * Plugins associated with inode:
56111 + *
56112 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
56113 + * stat-data. How we store this plugin in in-core inode is not
56114 + * important. Currently pointers are used, another variant is to store offsets
56115 + * and do array lookup on each access.
56116 + *
56117 + * Now, each inode has one selected plugin: object plugin that
56118 + * determines what type of file this object is: directory, regular etc.
56119 + *
56120 + * This main plugin can use other plugins that are thus subordinated to
56121 + * it. Directory instance of object plugin uses hash; regular file
56122 + * instance uses tail policy plugin.
56123 + *
56124 + * Object plugin is either taken from id in stat-data or guessed from
56125 + * i_mode bits. Once it is established we ask it to install its
56126 + * subordinate plugins, by looking again in stat-data or inheriting them
56127 + * from parent.
56128 + *
56129 + * How new inode is initialized during ->read_inode():
56130 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
56131 + * i_generation, capabilities etc.
56132 + * 2 read plugin id from stat data or try to guess plugin id
56133 + * from inode->i_mode bits if plugin id is missing.
56134 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
56135 + *
56136 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
56137 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
56138 + *
56139 + * 4 Call ->activate() method of object's plugin. Plugin is either read from
56140 + * from stat-data or guessed from mode bits
56141 + * 5 Call ->inherit() method of object plugin to inherit as yet un initialized
56142 + * plugins from parent.
56143 + *
56144 + * Easy induction proves that on last step all plugins of inode would be
56145 + * initialized.
56146 + *
56147 + * When creating new object:
56148 + * 1 obtain object plugin id (see next period)
56149 + * NIKITA-FIXME-HANS: period?
56150 + * 2 ->install() this plugin
56151 + * 3 ->inherit() the rest from the parent
56152 + *
56153 + * We need some examples of creating an object with default and non-default
56154 + * plugin ids. Nikita, please create them.
56155 + */
56156 +
56157 +#include "../inode.h"
56158 +
56159 +static int _bugop(void)
56160 +{
56161 + BUG_ON(1);
56162 + return 0;
56163 +}
56164 +
56165 +#define bugop ((void *)_bugop)
56166 +
56167 +static int _dummyop(void)
56168 +{
56169 + return 0;
56170 +}
56171 +
56172 +#define dummyop ((void *)_dummyop)
56173 +
56174 +static int change_file(struct inode *inode,
56175 + reiser4_plugin * plugin,
56176 + pset_member memb)
56177 +{
56178 + /* cannot change object plugin of already existing object */
56179 + if (memb == PSET_FILE)
56180 + return RETERR(-EINVAL);
56181 +
56182 + /* Change PSET_CREATE */
56183 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
56184 +}
56185 +
56186 +static reiser4_plugin_ops file_plugin_ops = {
56187 + .change = change_file
56188 +};
56189 +
56190 +static struct inode_operations null_i_ops = {.create = NULL};
56191 +static struct file_operations null_f_ops = {.owner = NULL};
56192 +static struct address_space_operations null_a_ops = {.writepage = NULL};
56193 +
56194 +/* VFS methods for regular files */
56195 +static struct inode_operations regular_file_i_ops = {
56196 + .permission = reiser4_permission_common,
56197 + .setattr = reiser4_setattr_careful,
56198 + .getattr = reiser4_getattr_common
56199 +};
56200 +static struct file_operations regular_file_f_ops = {
56201 + .llseek = generic_file_llseek,
56202 + .read = reiser4_read_careful,
56203 + .write = reiser4_write_careful,
56204 + .aio_read = generic_file_aio_read,
56205 + .ioctl = reiser4_ioctl_careful,
56206 + .mmap = reiser4_mmap_careful,
56207 + .open = reiser4_open_careful,
56208 + .release = reiser4_release_careful,
56209 + .fsync = reiser4_sync_file_common,
56210 + .splice_read = generic_file_splice_read,
56211 + .splice_write = generic_file_splice_write
56212 +};
56213 +static struct address_space_operations regular_file_a_ops = {
56214 + .writepage = reiser4_writepage,
56215 + .readpage = reiser4_readpage,
56216 + .sync_page = block_sync_page,
56217 + .writepages = reiser4_writepages,
56218 + .set_page_dirty = reiser4_set_page_dirty,
56219 + .readpages = reiser4_readpages,
56220 + .prepare_write = reiser4_prepare_write,
56221 + .commit_write = reiser4_commit_write,
56222 + .bmap = reiser4_bmap_careful,
56223 + .invalidatepage = reiser4_invalidatepage,
56224 + .releasepage = reiser4_releasepage
56225 +};
56226 +
56227 +/* VFS methods for symlink files */
56228 +static struct inode_operations symlink_file_i_ops = {
56229 + .readlink = generic_readlink,
56230 + .follow_link = reiser4_follow_link_common,
56231 + .permission = reiser4_permission_common,
56232 + .setattr = reiser4_setattr_common,
56233 + .getattr = reiser4_getattr_common
56234 +};
56235 +
56236 +/* VFS methods for special files */
56237 +static struct inode_operations special_file_i_ops = {
56238 + .permission = reiser4_permission_common,
56239 + .setattr = reiser4_setattr_common,
56240 + .getattr = reiser4_getattr_common
56241 +};
56242 +
56243 +/* VFS methods for directories */
56244 +static struct inode_operations directory_i_ops = {
56245 + .create = reiser4_create_common,
56246 + .lookup = reiser4_lookup_common,
56247 + .link = reiser4_link_common,
56248 + .unlink = reiser4_unlink_common,
56249 + .symlink = reiser4_symlink_common,
56250 + .mkdir = reiser4_mkdir_common,
56251 + .rmdir = reiser4_unlink_common,
56252 + .mknod = reiser4_mknod_common,
56253 + .rename = reiser4_rename_common,
56254 + .permission = reiser4_permission_common,
56255 + .setattr = reiser4_setattr_common,
56256 + .getattr = reiser4_getattr_common
56257 +};
56258 +static struct file_operations directory_f_ops = {
56259 + .llseek = reiser4_llseek_dir_common,
56260 + .read = generic_read_dir,
56261 + .readdir = reiser4_readdir_common,
56262 + .release = reiser4_release_dir_common,
56263 + .fsync = reiser4_sync_common
56264 +};
56265 +static struct address_space_operations directory_a_ops = {
56266 + .writepage = bugop,
56267 + .sync_page = bugop,
56268 + .writepages = dummyop,
56269 + .set_page_dirty = bugop,
56270 + .readpages = bugop,
56271 + .prepare_write = bugop,
56272 + .commit_write = bugop,
56273 + .bmap = bugop,
56274 + .invalidatepage = bugop,
56275 + .releasepage = bugop
56276 +};
56277 +
56278 +/*
56279 + * Definitions of object plugins.
56280 + */
56281 +
56282 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
56283 + [UNIX_FILE_PLUGIN_ID] = {
56284 + .h = {
56285 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56286 + .id = UNIX_FILE_PLUGIN_ID,
56287 + .groups = (1 << REISER4_REGULAR_FILE),
56288 + .pops = &file_plugin_ops,
56289 + .label = "reg",
56290 + .desc = "regular file",
56291 + .linkage = {NULL, NULL},
56292 + },
56293 + /*
56294 + * invariant vfs ops
56295 + */
56296 + .inode_ops = &regular_file_i_ops,
56297 + .file_ops = &regular_file_f_ops,
56298 + .as_ops = &regular_file_a_ops,
56299 + /*
56300 + * private i_ops
56301 + */
56302 + .setattr = setattr_unix_file,
56303 + .open = open_unix_file,
56304 + .read = read_unix_file,
56305 + .write = write_unix_file,
56306 + .ioctl = ioctl_unix_file,
56307 + .mmap = mmap_unix_file,
56308 + .release = release_unix_file,
56309 + /*
56310 + * private f_ops
56311 + */
56312 + .readpage = readpage_unix_file,
56313 + .readpages = readpages_unix_file,
56314 + .writepages = writepages_unix_file,
56315 + .prepare_write = prepare_write_unix_file,
56316 + .commit_write = commit_write_unix_file,
56317 + /*
56318 + * private a_ops
56319 + */
56320 + .bmap = bmap_unix_file,
56321 + /*
56322 + * other private methods
56323 + */
56324 + .write_sd_by_inode = write_sd_by_inode_common,
56325 + .flow_by_inode = flow_by_inode_unix_file,
56326 + .key_by_inode = key_by_inode_and_offset_common,
56327 + .set_plug_in_inode = set_plug_in_inode_common,
56328 + .adjust_to_parent = adjust_to_parent_common,
56329 + .create_object = reiser4_create_object_common,
56330 + .delete_object = delete_object_unix_file,
56331 + .add_link = reiser4_add_link_common,
56332 + .rem_link = reiser4_rem_link_common,
56333 + .owns_item = owns_item_unix_file,
56334 + .can_add_link = can_add_link_common,
56335 + .detach = dummyop,
56336 + .bind = dummyop,
56337 + .safelink = safelink_common,
56338 + .estimate = {
56339 + .create = estimate_create_common,
56340 + .update = estimate_update_common,
56341 + .unlink = estimate_unlink_common
56342 + },
56343 + .init_inode_data = init_inode_data_unix_file,
56344 + .cut_tree_worker = cut_tree_worker_common,
56345 + .wire = {
56346 + .write = wire_write_common,
56347 + .read = wire_read_common,
56348 + .get = wire_get_common,
56349 + .size = wire_size_common,
56350 + .done = wire_done_common
56351 + }
56352 + },
56353 + [DIRECTORY_FILE_PLUGIN_ID] = {
56354 + .h = {
56355 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56356 + .id = DIRECTORY_FILE_PLUGIN_ID,
56357 + .groups = (1 << REISER4_DIRECTORY_FILE),
56358 + .pops = &file_plugin_ops,
56359 + .label = "dir",
56360 + .desc = "directory",
56361 + .linkage = {NULL, NULL}
56362 + },
56363 + .inode_ops = &null_i_ops,
56364 + .file_ops = &null_f_ops,
56365 + .as_ops = &null_a_ops,
56366 +
56367 + .write_sd_by_inode = write_sd_by_inode_common,
56368 + .flow_by_inode = bugop,
56369 + .key_by_inode = bugop,
56370 + .set_plug_in_inode = set_plug_in_inode_common,
56371 + .adjust_to_parent = adjust_to_parent_common_dir,
56372 + .create_object = reiser4_create_object_common,
56373 + .delete_object = reiser4_delete_dir_common,
56374 + .add_link = reiser4_add_link_common,
56375 + .rem_link = rem_link_common_dir,
56376 + .owns_item = owns_item_common_dir,
56377 + .can_add_link = can_add_link_common,
56378 + .can_rem_link = can_rem_link_common_dir,
56379 + .detach = reiser4_detach_common_dir,
56380 + .bind = reiser4_bind_common_dir,
56381 + .safelink = safelink_common,
56382 + .estimate = {
56383 + .create = estimate_create_common_dir,
56384 + .update = estimate_update_common,
56385 + .unlink = estimate_unlink_common_dir
56386 + },
56387 + .wire = {
56388 + .write = wire_write_common,
56389 + .read = wire_read_common,
56390 + .get = wire_get_common,
56391 + .size = wire_size_common,
56392 + .done = wire_done_common
56393 + },
56394 + .init_inode_data = init_inode_ordering,
56395 + .cut_tree_worker = cut_tree_worker_common,
56396 + },
56397 + [SYMLINK_FILE_PLUGIN_ID] = {
56398 + .h = {
56399 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56400 + .id = SYMLINK_FILE_PLUGIN_ID,
56401 + .groups = (1 << REISER4_SYMLINK_FILE),
56402 + .pops = &file_plugin_ops,
56403 + .label = "symlink",
56404 + .desc = "symbolic link",
56405 + .linkage = {NULL,NULL}
56406 + },
56407 + .inode_ops = &symlink_file_i_ops,
56408 + /* inode->i_fop of symlink is initialized
56409 + by NULL in setup_inode_ops */
56410 + .file_ops = &null_f_ops,
56411 + .as_ops = &null_a_ops,
56412 +
56413 + .write_sd_by_inode = write_sd_by_inode_common,
56414 + .set_plug_in_inode = set_plug_in_inode_common,
56415 + .adjust_to_parent = adjust_to_parent_common,
56416 + .create_object = reiser4_create_symlink,
56417 + .delete_object = reiser4_delete_object_common,
56418 + .add_link = reiser4_add_link_common,
56419 + .rem_link = reiser4_rem_link_common,
56420 + .can_add_link = can_add_link_common,
56421 + .detach = dummyop,
56422 + .bind = dummyop,
56423 + .safelink = safelink_common,
56424 + .estimate = {
56425 + .create = estimate_create_common,
56426 + .update = estimate_update_common,
56427 + .unlink = estimate_unlink_common
56428 + },
56429 + .init_inode_data = init_inode_ordering,
56430 + .cut_tree_worker = cut_tree_worker_common,
56431 + .destroy_inode = destroy_inode_symlink,
56432 + .wire = {
56433 + .write = wire_write_common,
56434 + .read = wire_read_common,
56435 + .get = wire_get_common,
56436 + .size = wire_size_common,
56437 + .done = wire_done_common
56438 + }
56439 + },
56440 + [SPECIAL_FILE_PLUGIN_ID] = {
56441 + .h = {
56442 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56443 + .id = SPECIAL_FILE_PLUGIN_ID,
56444 + .groups = (1 << REISER4_SPECIAL_FILE),
56445 + .pops = &file_plugin_ops,
56446 + .label = "special",
56447 + .desc =
56448 + "special: fifo, device or socket",
56449 + .linkage = {NULL, NULL}
56450 + },
56451 + .inode_ops = &special_file_i_ops,
56452 + /* file_ops of special files (sockets, block, char, fifo) are
56453 + initialized by init_special_inode. */
56454 + .file_ops = &null_f_ops,
56455 + .as_ops = &null_a_ops,
56456 +
56457 + .write_sd_by_inode = write_sd_by_inode_common,
56458 + .set_plug_in_inode = set_plug_in_inode_common,
56459 + .adjust_to_parent = adjust_to_parent_common,
56460 + .create_object = reiser4_create_object_common,
56461 + .delete_object = reiser4_delete_object_common,
56462 + .add_link = reiser4_add_link_common,
56463 + .rem_link = reiser4_rem_link_common,
56464 + .owns_item = owns_item_common,
56465 + .can_add_link = can_add_link_common,
56466 + .detach = dummyop,
56467 + .bind = dummyop,
56468 + .safelink = safelink_common,
56469 + .estimate = {
56470 + .create = estimate_create_common,
56471 + .update = estimate_update_common,
56472 + .unlink = estimate_unlink_common
56473 + },
56474 + .init_inode_data = init_inode_ordering,
56475 + .cut_tree_worker = cut_tree_worker_common,
56476 + .wire = {
56477 + .write = wire_write_common,
56478 + .read = wire_read_common,
56479 + .get = wire_get_common,
56480 + .size = wire_size_common,
56481 + .done = wire_done_common
56482 + }
56483 + },
56484 + [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
56485 + .h = {
56486 + .type_id = REISER4_FILE_PLUGIN_TYPE,
56487 + .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
56488 + .groups = (1 << REISER4_REGULAR_FILE),
56489 + .pops = &file_plugin_ops,
56490 + .label = "cryptcompress",
56491 + .desc = "cryptcompress file",
56492 + .linkage = {NULL, NULL}
56493 + },
56494 + .inode_ops = &regular_file_i_ops,
56495 + .file_ops = &regular_file_f_ops,
56496 + .as_ops = &regular_file_a_ops,
56497 +
56498 + .setattr = setattr_cryptcompress,
56499 + .open = open_cryptcompress,
56500 + .read = read_cryptcompress,
56501 + .write = write_cryptcompress,
56502 + .ioctl = ioctl_cryptcompress,
56503 + .mmap = mmap_cryptcompress,
56504 + .release = release_cryptcompress,
56505 +
56506 + .readpage = readpage_cryptcompress,
56507 + .readpages = readpages_cryptcompress,
56508 + .writepages = writepages_cryptcompress,
56509 + .prepare_write = prepare_write_cryptcompress,
56510 + .commit_write = commit_write_cryptcompress,
56511 +
56512 + .bmap = bmap_cryptcompress,
56513 +
56514 + .write_sd_by_inode = write_sd_by_inode_common,
56515 + .flow_by_inode = flow_by_inode_cryptcompress,
56516 + .key_by_inode = key_by_inode_cryptcompress,
56517 + .set_plug_in_inode = set_plug_in_inode_common,
56518 + .adjust_to_parent = adjust_to_parent_cryptcompress,
56519 + .create_object = create_object_cryptcompress,
56520 + .delete_object = delete_object_cryptcompress,
56521 + .add_link = reiser4_add_link_common,
56522 + .rem_link = reiser4_rem_link_common,
56523 + .owns_item = owns_item_common,
56524 + .can_add_link = can_add_link_common,
56525 + .detach = dummyop,
56526 + .bind = dummyop,
56527 + .safelink = safelink_common,
56528 + .estimate = {
56529 + .create = estimate_create_common,
56530 + .update = estimate_update_common,
56531 + .unlink = estimate_unlink_common
56532 + },
56533 + .init_inode_data = init_inode_data_cryptcompress,
56534 + .cut_tree_worker = cut_tree_worker_cryptcompress,
56535 + .destroy_inode = destroy_inode_cryptcompress,
56536 + .wire = {
56537 + .write = wire_write_common,
56538 + .read = wire_read_common,
56539 + .get = wire_get_common,
56540 + .size = wire_size_common,
56541 + .done = wire_done_common
56542 + }
56543 + }
56544 +};
56545 +
56546 +static int change_dir(struct inode *inode,
56547 + reiser4_plugin * plugin,
56548 + pset_member memb)
56549 +{
56550 + /* cannot change dir plugin of already existing object */
56551 + return RETERR(-EINVAL);
56552 +}
56553 +
56554 +static reiser4_plugin_ops dir_plugin_ops = {
56555 + .change = change_dir
56556 +};
56557 +
56558 +/*
56559 + * definition of directory plugins
56560 + */
56561 +
56562 +dir_plugin dir_plugins[LAST_DIR_ID] = {
56563 + /* standard hashed directory plugin */
56564 + [HASHED_DIR_PLUGIN_ID] = {
56565 + .h = {
56566 + .type_id = REISER4_DIR_PLUGIN_TYPE,
56567 + .id = HASHED_DIR_PLUGIN_ID,
56568 + .pops = &dir_plugin_ops,
56569 + .label = "dir",
56570 + .desc = "hashed directory",
56571 + .linkage = {NULL, NULL}
56572 + },
56573 + .inode_ops = &directory_i_ops,
56574 + .file_ops = &directory_f_ops,
56575 + .as_ops = &directory_a_ops,
56576 +
56577 + .get_parent = get_parent_common,
56578 + .is_name_acceptable = is_name_acceptable_common,
56579 + .build_entry_key = build_entry_key_hashed,
56580 + .build_readdir_key = build_readdir_key_common,
56581 + .add_entry = reiser4_add_entry_common,
56582 + .rem_entry = reiser4_rem_entry_common,
56583 + .init = reiser4_dir_init_common,
56584 + .done = reiser4_dir_done_common,
56585 + .attach = reiser4_attach_common,
56586 + .detach = reiser4_detach_common,
56587 + .estimate = {
56588 + .add_entry = estimate_add_entry_common,
56589 + .rem_entry = estimate_rem_entry_common,
56590 + .unlink = dir_estimate_unlink_common
56591 + }
56592 + },
56593 + /* hashed directory for which seekdir/telldir are guaranteed to
56594 + * work. Brain-damage. */
56595 + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
56596 + .h = {
56597 + .type_id = REISER4_DIR_PLUGIN_TYPE,
56598 + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
56599 + .pops = &dir_plugin_ops,
56600 + .label = "dir32",
56601 + .desc = "directory hashed with 31 bit hash",
56602 + .linkage = {NULL, NULL}
56603 + },
56604 + .inode_ops = &directory_i_ops,
56605 + .file_ops = &directory_f_ops,
56606 + .as_ops = &directory_a_ops,
56607 +
56608 + .get_parent = get_parent_common,
56609 + .is_name_acceptable = is_name_acceptable_common,
56610 + .build_entry_key = build_entry_key_seekable,
56611 + .build_readdir_key = build_readdir_key_common,
56612 + .add_entry = reiser4_add_entry_common,
56613 + .rem_entry = reiser4_rem_entry_common,
56614 + .init = reiser4_dir_init_common,
56615 + .done = reiser4_dir_done_common,
56616 + .attach = reiser4_attach_common,
56617 + .detach = reiser4_detach_common,
56618 + .estimate = {
56619 + .add_entry = estimate_add_entry_common,
56620 + .rem_entry = estimate_rem_entry_common,
56621 + .unlink = dir_estimate_unlink_common
56622 + }
56623 + }
56624 +};
56625 +
56626 +/* Make Linus happy.
56627 + Local variables:
56628 + c-indentation-style: "K&R"
56629 + mode-name: "LC"
56630 + c-basic-offset: 8
56631 + tab-width: 8
56632 + fill-column: 120
56633 + End:
56634 +*/
56635 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/object.h linux-2.6.23/fs/reiser4/plugin/object.h
56636 --- linux-2.6.23.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
56637 +++ linux-2.6.23/fs/reiser4/plugin/object.h 2007-12-04 16:49:30.000000000 +0300
56638 @@ -0,0 +1,121 @@
56639 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
56640 + * reiser4/README */
56641 +
56642 +/* Declaration of object plugin functions. */
56643 +
56644 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
56645 +#define __FS_REISER4_PLUGIN_OBJECT_H__
56646 +
56647 +#include "../type_safe_hash.h"
56648 +
56649 +/* common implementations of inode operations */
56650 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
56651 + int mode, struct nameidata *);
56652 +struct dentry * reiser4_lookup_common(struct inode *parent,
56653 + struct dentry *dentry,
56654 + struct nameidata *nameidata);
56655 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
56656 + struct dentry *newname);
56657 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
56658 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
56659 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
56660 + const char *linkname);
56661 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
56662 + int mode, dev_t rdev);
56663 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
56664 + struct inode *new_dir, struct dentry *new_name);
56665 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
56666 +int reiser4_permission_common(struct inode *, int mask,
56667 + struct nameidata *nameidata);
56668 +int reiser4_setattr_common(struct dentry *, struct iattr *);
56669 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
56670 + struct kstat *);
56671 +
56672 +/* common implementations of file operations */
56673 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
56674 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
56675 +int reiser4_release_dir_common(struct inode *, struct file *);
56676 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
56677 +
56678 +/* common implementations of address space operations */
56679 +int prepare_write_common(struct file *, struct page *, unsigned from,
56680 + unsigned to);
56681 +
56682 +/* file plugin operations: common implementations */
56683 +int write_sd_by_inode_common(struct inode *);
56684 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
56685 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
56686 + reiser4_object_create_data *);
56687 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
56688 + struct inode *root);
56689 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
56690 + struct inode *root);
56691 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
56692 + struct inode *root);
56693 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
56694 + reiser4_object_create_data *);
56695 +int reiser4_delete_object_common(struct inode *);
56696 +int reiser4_delete_dir_common(struct inode *);
56697 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
56698 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
56699 +int rem_link_common_dir(struct inode *object, struct inode *parent);
56700 +int owns_item_common(const struct inode *, const coord_t *);
56701 +int owns_item_common_dir(const struct inode *, const coord_t *);
56702 +int can_add_link_common(const struct inode *);
56703 +int can_rem_link_common_dir(const struct inode *);
56704 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
56705 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
56706 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
56707 +reiser4_block_nr estimate_create_common(const struct inode *);
56708 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
56709 +reiser4_block_nr estimate_update_common(const struct inode *);
56710 +reiser4_block_nr estimate_unlink_common(const struct inode *,
56711 + const struct inode *);
56712 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
56713 + const struct inode *);
56714 +char *wire_write_common(struct inode *, char *start);
56715 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
56716 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
56717 +int wire_size_common(struct inode *);
56718 +void wire_done_common(reiser4_object_on_wire *);
56719 +
56720 +/* dir plugin operations: common implementations */
56721 +struct dentry *get_parent_common(struct inode *child);
56722 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
56723 +void build_entry_key_common(const struct inode *,
56724 + const struct qstr *qname, reiser4_key *);
56725 +int build_readdir_key_common(struct file *dir, reiser4_key *);
56726 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
56727 + reiser4_object_create_data *, reiser4_dir_entry_desc *);
56728 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
56729 + reiser4_dir_entry_desc *);
56730 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
56731 + reiser4_object_create_data *);
56732 +int reiser4_dir_done_common(struct inode *);
56733 +int reiser4_attach_common(struct inode *child, struct inode *parent);
56734 +int reiser4_detach_common(struct inode *object, struct inode *parent);
56735 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
56736 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
56737 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
56738 + const struct inode *);
56739 +
56740 +/* these are essential parts of common implementations, they are to make
56741 + customized implementations easier */
56742 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
56743 +
56744 +/* merely useful functions */
56745 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
56746 + const reiser4_key *, int silent);
56747 +
56748 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
56749 +#endif
56750 +
56751 +/* Make Linus happy.
56752 + Local variables:
56753 + c-indentation-style: "K&R"
56754 + mode-name: "LC"
56755 + c-basic-offset: 8
56756 + tab-width: 8
56757 + fill-column: 120
56758 + End:
56759 +*/
56760 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin.c linux-2.6.23/fs/reiser4/plugin/plugin.c
56761 --- linux-2.6.23.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
56762 +++ linux-2.6.23/fs/reiser4/plugin/plugin.c 2007-12-04 16:49:30.000000000 +0300
56763 @@ -0,0 +1,559 @@
56764 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56765 + * reiser4/README */
56766 +
56767 +/* Basic plugin infrastructure, lookup etc. */
56768 +
56769 +/* PLUGINS:
56770 +
56771 + Plugins are internal Reiser4 "modules" or "objects" used to increase
56772 + extensibility and allow external users to easily adapt reiser4 to
56773 + their needs.
56774 +
56775 + Plugins are classified into several disjoint "types". Plugins
56776 + belonging to the particular plugin type are termed "instances" of
56777 + this type. Existing types are listed by enum reiser4_plugin_type
56778 + (see plugin/plugin_header.h)
56779 +
56780 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
56781 +
56782 + Object (file) plugin determines how given file-system object serves
56783 + standard VFS requests for read, write, seek, mmap etc. Instances of
56784 + file plugins are: regular file, directory, symlink. Another example
56785 + of file plugin is audit plugin, that optionally records accesses to
56786 + underlying object and forwards requests to it.
56787 +
56788 + Hash plugins compute hashes used by reiser4 to store and locate
56789 + files within directories. Instances of hash plugin type are: r5,
56790 + tea, rupasov.
56791 +
56792 + Tail plugins (or, more precisely, tail policy plugins) determine
56793 + when last part of the file should be stored in a formatted item.
56794 +
56795 + Scope and lookup:
56796 +
56797 + label such that pair ( type_label, plugin_label ) is unique. This
56798 + pair is a globally persistent and user-visible plugin
56799 + identifier. Internally kernel maintains plugins and plugin types in
56800 + arrays using an index into those arrays as plugin and plugin type
56801 + identifiers. File-system in turn, also maintains persistent
56802 + "dictionary" which is mapping from plugin label to numerical
56803 + identifier which is stored in file-system objects. That is, we
56804 + store the offset into the plugin array for that plugin type as the
56805 + plugin id in the stat data of the filesystem object.
56806 +
56807 + Internal kernel plugin type identifier (index in plugins[] array) is
56808 + of type reiser4_plugin_type. Set of available plugin types is
56809 + currently static, but dynamic loading doesn't seem to pose
56810 + insurmountable problems.
56811 +
56812 + Within each type plugins are addressed by the identifiers of type
56813 + reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
56814 + Such identifiers are only required to be unique within one type,
56815 + not globally.
56816 +
56817 + Thus, plugin in memory is uniquely identified by the pair (type_id,
56818 + id).
56819 +
56820 + Usage:
56821 +
56822 + There exists only one instance of each plugin instance, but this
56823 + single instance can be associated with many entities (file-system
56824 + objects, items, nodes, transactions, file-descriptors etc.). Entity
56825 + to which plugin of given type is termed (due to the lack of
56826 + imagination) "subject" of this plugin type and, by abuse of
56827 + terminology, subject of particular instance of this type to which
56828 + it's attached currently. For example, inode is subject of object
56829 + plugin type. Inode representing directory is subject of directory
56830 + plugin, hash plugin type and some particular instance of hash plugin
56831 + type. Inode, representing regular file is subject of "regular file"
56832 + plugin, tail-policy plugin type etc.
56833 +
56834 + With each subject the plugin possibly stores some state. For example,
56835 + the state of a directory plugin (instance of object plugin type) is pointer
56836 + to hash plugin (if directories always use hashing that is).
56837 +
56838 + Interface:
56839 +
56840 + In addition to a scalar identifier, each plugin type and plugin
56841 + proper has a "label": short string and a "description"---longer
56842 + descriptive string. Labels and descriptions of plugin types are
56843 + hard-coded into plugins[] array, declared and defined in
56844 + plugin.c. Label and description of plugin are stored in .label and
56845 + .desc fields of reiser4_plugin_header respectively. It's possible to
56846 + locate plugin by the pair of labels.
56847 +
56848 + Features (not implemented):
56849 +
56850 + . user-level plugin manipulations:
56851 + + reiser4("filename/..file_plugin<='audit'");
56852 + + write(open("filename/..file_plugin"), "audit", 8);
56853 +
56854 + . user level utilities lsplug and chplug to manipulate plugins.
56855 + Utilities are not of primary priority. Possibly they will be not
56856 + working on v4.0
56857 +
56858 + NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
56859 + option, do you agree? I don't think that specifying it at mount time,
56860 + and then changing it with each mount, is a good model for usage.
56861 +
56862 + . mount option "plug" to set-up plugins of root-directory.
56863 + "plug=foo:bar" will set "bar" as default plugin of type "foo".
56864 +
56865 + Limitations:
56866 +
56867 + . each plugin type has to provide at least one builtin
56868 + plugin. This is technical limitation and it can be lifted in the
56869 + future.
56870 +
56871 + TODO:
56872 +
56873 + New plugin types/plugings:
56874 + Things we should be able to separately choose to inherit:
56875 +
56876 + security plugins
56877 +
56878 + stat data
56879 +
56880 + file bodies
56881 +
56882 + file plugins
56883 +
56884 + dir plugins
56885 +
56886 + . perm:acl
56887 +
56888 + . audi---audit plugin intercepting and possibly logging all
56889 + accesses to object. Requires to put stub functions in file_operations
56890 + in stead of generic_file_*.
56891 +
56892 +NIKITA-FIXME-HANS: why make overflows a plugin?
56893 + . over---handle hash overflows
56894 +
56895 + . sqnt---handle different access patterns and instruments read-ahead
56896 +
56897 +NIKITA-FIXME-HANS: describe the line below in more detail.
56898 +
56899 + . hier---handle inheritance of plugins along file-system hierarchy
56900 +
56901 + Different kinds of inheritance: on creation vs. on access.
56902 + Compatible/incompatible plugins.
56903 + Inheritance for multi-linked files.
56904 + Layered plugins.
56905 + Notion of plugin context is abandoned.
56906 +
56907 +Each file is associated
56908 + with one plugin and dependant plugins (hash, etc.) are stored as
56909 + main plugin state. Now, if we have plugins used for regular files
56910 + but not for directories, how such plugins would be inherited?
56911 + . always store them with directories also
56912 +
56913 +NIKTIA-FIXME-HANS: Do the line above. It is not exclusive of doing
56914 +the line below which is also useful.
56915 +
56916 + . use inheritance hierarchy, independent of file-system namespace
56917 +*/
56918 +
56919 +#include "../debug.h"
56920 +#include "../dformat.h"
56921 +#include "plugin_header.h"
56922 +#include "item/static_stat.h"
56923 +#include "node/node.h"
56924 +#include "security/perm.h"
56925 +#include "space/space_allocator.h"
56926 +#include "disk_format/disk_format.h"
56927 +#include "plugin.h"
56928 +#include "../reiser4.h"
56929 +#include "../jnode.h"
56930 +#include "../inode.h"
56931 +
56932 +#include <linux/fs.h> /* for struct super_block */
56933 +
56934 +/*
56935 + * init_plugins - initialize plugin sub-system.
56936 + * Just call this once on reiser4 startup.
56937 + *
56938 + * Initializes plugin sub-system. It is part of reiser4 module
56939 + * initialization. For each plugin of each type init method is called and each
56940 + * plugin is put into list of plugins.
56941 + */
56942 +int init_plugins(void)
56943 +{
56944 + reiser4_plugin_type type_id;
56945 +
56946 + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
56947 + struct reiser4_plugin_type_data *ptype;
56948 + int i;
56949 +
56950 + ptype = &plugins[type_id];
56951 + assert("nikita-3508", ptype->label != NULL);
56952 + assert("nikita-3509", ptype->type_id == type_id);
56953 +
56954 + INIT_LIST_HEAD(&ptype->plugins_list);
56955 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
56956 + for (i = 0; i < ptype->builtin_num; ++i) {
56957 + reiser4_plugin *plugin;
56958 +
56959 + plugin = plugin_at(ptype, i);
56960 +
56961 + if (plugin->h.label == NULL)
56962 + /* uninitialized slot encountered */
56963 + continue;
56964 + assert("nikita-3445", plugin->h.type_id == type_id);
56965 + plugin->h.id = i;
56966 + if (plugin->h.pops != NULL &&
56967 + plugin->h.pops->init != NULL) {
56968 + int result;
56969 +
56970 + result = plugin->h.pops->init(plugin);
56971 + if (result != 0)
56972 + return result;
56973 + }
56974 + INIT_LIST_HEAD(&plugin->h.linkage);
56975 + list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
56976 + }
56977 + }
56978 + return 0;
56979 +}
56980 +
56981 +/* true if plugin type id is valid */
56982 +int is_plugin_type_valid(reiser4_plugin_type type)
56983 +{
56984 + /* "type" is unsigned, so no comparison with 0 is
56985 + necessary */
56986 + return (type < REISER4_PLUGIN_TYPES);
56987 +}
56988 +
56989 +/* true if plugin id is valid */
56990 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
56991 +{
56992 + assert("nikita-1653", is_plugin_type_valid(type));
56993 + return id < plugins[type].builtin_num;
56994 +}
56995 +
56996 +/* return plugin by its @type and @id.
56997 +
56998 + Both arguments are checked for validness: this is supposed to be called
56999 + from user-level.
57000 +
57001 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
57002 +user space, and passed to the filesystem by use of method files? Your
57003 +comment really confused me on the first reading....
57004 +
57005 +*/
57006 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
57007 + * unchecked */,
57008 + reiser4_plugin_id id /* plugin id,
57009 + * unchecked */)
57010 +{
57011 + if (is_plugin_type_valid(type)) {
57012 + if (is_plugin_id_valid(type, id))
57013 + return plugin_at(&plugins[type], id);
57014 + else
57015 + /* id out of bounds */
57016 + warning("nikita-2913",
57017 + "Invalid plugin id: [%i:%i]", type, id);
57018 + } else
57019 + /* type_id out of bounds */
57020 + warning("nikita-2914", "Invalid type_id: %i", type);
57021 + return NULL;
57022 +}
57023 +
57024 +/**
57025 + * save_plugin_id - store plugin id in disk format
57026 + * @plugin: plugin to convert
57027 + * @area: where to store result
57028 + *
57029 + * Puts id of @plugin in little endian format to address @area.
57030 + */
57031 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
57032 + d16 *area /* where to store result */ )
57033 +{
57034 + assert("nikita-1261", plugin != NULL);
57035 + assert("nikita-1262", area != NULL);
57036 +
57037 + put_unaligned(cpu_to_le16(plugin->h.id), area);
57038 + return 0;
57039 +}
57040 +
57041 +/* list of all plugins of given type */
57042 +struct list_head *get_plugin_list(reiser4_plugin_type type)
57043 +{
57044 + assert("nikita-1056", is_plugin_type_valid(type));
57045 + return &plugins[type].plugins_list;
57046 +}
57047 +
57048 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
57049 +{
57050 + struct dentry *rootdir;
57051 + reiser4_inode *root;
57052 +
57053 + assert("edward-1443", memb != PSET_FILE);
57054 +
57055 + rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
57056 + if (rootdir != NULL) {
57057 + root = reiser4_inode_data(rootdir->d_inode);
57058 + /*
57059 + * if inode is different from the default one, or we are
57060 + * changing plugin of root directory, update plugin_mask
57061 + */
57062 + if (aset_get(info->pset, memb) !=
57063 + aset_get(root->pset, memb) ||
57064 + info == root)
57065 + info->plugin_mask |= (1 << memb);
57066 + else
57067 + info->plugin_mask &= ~(1 << memb);
57068 + }
57069 +}
57070 +
57071 +/* Get specified plugin set member from parent,
57072 + or from fs-defaults (if no parent is given) and
57073 + install the result to pset of @self */
57074 +int grab_plugin_pset(struct inode *self,
57075 + struct inode *ancestor,
57076 + pset_member memb)
57077 +{
57078 + reiser4_plugin *plug;
57079 + reiser4_inode *info;
57080 + int result = 0;
57081 +
57082 + /* Do not grab if initialised already. */
57083 + info = reiser4_inode_data(self);
57084 + if (aset_get(info->pset, memb) != NULL)
57085 + return 0;
57086 + if (ancestor) {
57087 + reiser4_inode *parent;
57088 +
57089 + parent = reiser4_inode_data(ancestor);
57090 + plug = aset_get(parent->hset, memb) ? :
57091 + aset_get(parent->pset, memb);
57092 + }
57093 + else
57094 + plug = get_default_plugin(memb);
57095 +
57096 + result = set_plugin(&info->pset, memb, plug);
57097 + if (result == 0) {
57098 + if (!ancestor || self->i_sb->s_root->d_inode != self)
57099 + update_pset_mask(info, memb);
57100 + }
57101 + return result;
57102 +}
57103 +
57104 +/* Take missing pset members from root inode */
57105 +int finish_pset(struct inode *inode)
57106 +{
57107 + reiser4_plugin *plug;
57108 + reiser4_inode *root;
57109 + reiser4_inode *info;
57110 + pset_member memb;
57111 + int result = 0;
57112 +
57113 + root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
57114 + info = reiser4_inode_data(inode);
57115 +
57116 + assert("edward-1455", root != NULL);
57117 + assert("edward-1456", info != NULL);
57118 +
57119 + /* file and directory plugins are already initialized. */
57120 + for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
57121 +
57122 + /* Do not grab if initialised already. */
57123 + if (aset_get(info->pset, memb) != NULL)
57124 + continue;
57125 +
57126 + plug = aset_get(root->pset, memb);
57127 + result = set_plugin(&info->pset, memb, plug);
57128 + if (result != 0)
57129 + break;
57130 + }
57131 + if (result != 0) {
57132 + warning("nikita-3447",
57133 + "Cannot set up plugins for %lli",
57134 + (unsigned long long)
57135 + get_inode_oid(inode));
57136 + }
57137 + return result;
57138 +}
57139 +
57140 +int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
57141 +{
57142 + reiser4_inode *info;
57143 + int result = 0;
57144 +
57145 + if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
57146 + /* Changing pset in the root object. */
57147 + return RETERR(-EINVAL);
57148 + }
57149 +
57150 + info = reiser4_inode_data(self);
57151 + if (plug->h.pops != NULL && plug->h.pops->change != NULL)
57152 + result = plug->h.pops->change(self, plug, memb);
57153 + else
57154 + result = aset_set_unsafe(&info->pset, memb, plug);
57155 + if (result == 0) {
57156 + __u16 oldmask = info->plugin_mask;
57157 +
57158 + update_pset_mask(info, memb);
57159 + if (oldmask != info->plugin_mask)
57160 + reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
57161 + }
57162 + return result;
57163 +}
57164 +
57165 +struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
57166 + /* C90 initializers */
57167 + [REISER4_FILE_PLUGIN_TYPE] = {
57168 + .type_id = REISER4_FILE_PLUGIN_TYPE,
57169 + .label = "file",
57170 + .desc = "Object plugins",
57171 + .builtin_num = sizeof_array(file_plugins),
57172 + .builtin = file_plugins,
57173 + .plugins_list = {NULL, NULL},
57174 + .size = sizeof(file_plugin)
57175 + },
57176 + [REISER4_DIR_PLUGIN_TYPE] = {
57177 + .type_id = REISER4_DIR_PLUGIN_TYPE,
57178 + .label = "dir",
57179 + .desc = "Directory plugins",
57180 + .builtin_num = sizeof_array(dir_plugins),
57181 + .builtin = dir_plugins,
57182 + .plugins_list = {NULL, NULL},
57183 + .size = sizeof(dir_plugin)
57184 + },
57185 + [REISER4_HASH_PLUGIN_TYPE] = {
57186 + .type_id = REISER4_HASH_PLUGIN_TYPE,
57187 + .label = "hash",
57188 + .desc = "Directory hashes",
57189 + .builtin_num = sizeof_array(hash_plugins),
57190 + .builtin = hash_plugins,
57191 + .plugins_list = {NULL, NULL},
57192 + .size = sizeof(hash_plugin)
57193 + },
57194 + [REISER4_FIBRATION_PLUGIN_TYPE] = {
57195 + .type_id =
57196 + REISER4_FIBRATION_PLUGIN_TYPE,
57197 + .label = "fibration",
57198 + .desc = "Directory fibrations",
57199 + .builtin_num = sizeof_array(fibration_plugins),
57200 + .builtin = fibration_plugins,
57201 + .plugins_list = {NULL, NULL},
57202 + .size = sizeof(fibration_plugin)
57203 + },
57204 + [REISER4_CIPHER_PLUGIN_TYPE] = {
57205 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
57206 + .label = "cipher",
57207 + .desc = "Cipher plugins",
57208 + .builtin_num = sizeof_array(cipher_plugins),
57209 + .builtin = cipher_plugins,
57210 + .plugins_list = {NULL, NULL},
57211 + .size = sizeof(cipher_plugin)
57212 + },
57213 + [REISER4_DIGEST_PLUGIN_TYPE] = {
57214 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
57215 + .label = "digest",
57216 + .desc = "Digest plugins",
57217 + .builtin_num = sizeof_array(digest_plugins),
57218 + .builtin = digest_plugins,
57219 + .plugins_list = {NULL, NULL},
57220 + .size = sizeof(digest_plugin)
57221 + },
57222 + [REISER4_COMPRESSION_PLUGIN_TYPE] = {
57223 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
57224 + .label = "compression",
57225 + .desc = "Compression plugins",
57226 + .builtin_num = sizeof_array(compression_plugins),
57227 + .builtin = compression_plugins,
57228 + .plugins_list = {NULL, NULL},
57229 + .size = sizeof(compression_plugin)
57230 + },
57231 + [REISER4_FORMATTING_PLUGIN_TYPE] = {
57232 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
57233 + .label = "formatting",
57234 + .desc = "Tail inlining policies",
57235 + .builtin_num = sizeof_array(formatting_plugins),
57236 + .builtin = formatting_plugins,
57237 + .plugins_list = {NULL, NULL},
57238 + .size = sizeof(formatting_plugin)
57239 + },
57240 + [REISER4_PERM_PLUGIN_TYPE] = {
57241 + .type_id = REISER4_PERM_PLUGIN_TYPE,
57242 + .label = "perm",
57243 + .desc = "Permission checks",
57244 + .builtin_num = sizeof_array(perm_plugins),
57245 + .builtin = perm_plugins,
57246 + .plugins_list = {NULL, NULL},
57247 + .size = sizeof(perm_plugin)
57248 + },
57249 + [REISER4_ITEM_PLUGIN_TYPE] = {
57250 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
57251 + .label = "item",
57252 + .desc = "Item handlers",
57253 + .builtin_num = sizeof_array(item_plugins),
57254 + .builtin = item_plugins,
57255 + .plugins_list = {NULL, NULL},
57256 + .size = sizeof(item_plugin)
57257 + },
57258 + [REISER4_NODE_PLUGIN_TYPE] = {
57259 + .type_id = REISER4_NODE_PLUGIN_TYPE,
57260 + .label = "node",
57261 + .desc = "node layout handlers",
57262 + .builtin_num = sizeof_array(node_plugins),
57263 + .builtin = node_plugins,
57264 + .plugins_list = {NULL, NULL},
57265 + .size = sizeof(node_plugin)
57266 + },
57267 + [REISER4_SD_EXT_PLUGIN_TYPE] = {
57268 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
57269 + .label = "sd_ext",
57270 + .desc = "Parts of stat-data",
57271 + .builtin_num = sizeof_array(sd_ext_plugins),
57272 + .builtin = sd_ext_plugins,
57273 + .plugins_list = {NULL, NULL},
57274 + .size = sizeof(sd_ext_plugin)
57275 + },
57276 + [REISER4_FORMAT_PLUGIN_TYPE] = {
57277 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
57278 + .label = "disk_layout",
57279 + .desc = "defines filesystem on disk layout",
57280 + .builtin_num = sizeof_array(format_plugins),
57281 + .builtin = format_plugins,
57282 + .plugins_list = {NULL, NULL},
57283 + .size = sizeof(disk_format_plugin)
57284 + },
57285 + [REISER4_JNODE_PLUGIN_TYPE] = {
57286 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
57287 + .label = "jnode",
57288 + .desc = "defines kind of jnode",
57289 + .builtin_num = sizeof_array(jnode_plugins),
57290 + .builtin = jnode_plugins,
57291 + .plugins_list = {NULL, NULL},
57292 + .size = sizeof(jnode_plugin)
57293 + },
57294 + [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
57295 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
57296 + .label = "compression_mode",
57297 + .desc = "Defines compression mode",
57298 + .builtin_num = sizeof_array(compression_mode_plugins),
57299 + .builtin = compression_mode_plugins,
57300 + .plugins_list = {NULL, NULL},
57301 + .size = sizeof(compression_mode_plugin)
57302 + },
57303 + [REISER4_CLUSTER_PLUGIN_TYPE] = {
57304 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
57305 + .label = "cluster",
57306 + .desc = "Defines cluster size",
57307 + .builtin_num = sizeof_array(cluster_plugins),
57308 + .builtin = cluster_plugins,
57309 + .plugins_list = {NULL, NULL},
57310 + .size = sizeof(cluster_plugin)
57311 + }
57312 +};
57313 +
57314 +/*
57315 + * Local variables:
57316 + * c-indentation-style: "K&R"
57317 + * mode-name: "LC"
57318 + * c-basic-offset: 8
57319 + * tab-width: 8
57320 + * fill-column: 120
57321 + * End:
57322 + */
57323 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin.h linux-2.6.23/fs/reiser4/plugin/plugin.h
57324 --- linux-2.6.23.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
57325 +++ linux-2.6.23/fs/reiser4/plugin/plugin.h 2007-12-04 16:49:30.000000000 +0300
57326 @@ -0,0 +1,936 @@
57327 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57328 +
57329 +/* Basic plugin data-types.
57330 + see fs/reiser4/plugin/plugin.c for details */
57331 +
57332 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
57333 +#define __FS_REISER4_PLUGIN_TYPES_H__
57334 +
57335 +#include "../forward.h"
57336 +#include "../debug.h"
57337 +#include "../dformat.h"
57338 +#include "../key.h"
57339 +#include "compress/compress.h"
57340 +#include "crypto/cipher.h"
57341 +#include "plugin_header.h"
57342 +#include "item/static_stat.h"
57343 +#include "item/internal.h"
57344 +#include "item/sde.h"
57345 +#include "item/cde.h"
57346 +#include "item/item.h"
57347 +#include "node/node.h"
57348 +#include "node/node40.h"
57349 +#include "security/perm.h"
57350 +#include "fibration.h"
57351 +
57352 +#include "space/bitmap.h"
57353 +#include "space/space_allocator.h"
57354 +
57355 +#include "disk_format/disk_format40.h"
57356 +#include "disk_format/disk_format.h"
57357 +
57358 +#include <linux/fs.h> /* for struct super_block, address_space */
57359 +#include <linux/mm.h> /* for struct page */
57360 +#include <linux/buffer_head.h> /* for struct buffer_head */
57361 +#include <linux/dcache.h> /* for struct dentry */
57362 +#include <linux/types.h>
57363 +#include <linux/crypto.h>
57364 +
57365 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
57366 +
57367 +/*
57368 + * File plugin. Defines the set of methods that file plugins implement, some
57369 + * of which are optional.
57370 + *
57371 + * A file plugin offers to the caller an interface for IO ( writing to and/or
57372 + * reading from) to what the caller sees as one sequence of bytes. An IO to it
57373 + * may affect more than one physical sequence of bytes, or no physical sequence
57374 + * of bytes, it may affect sequences of bytes offered by other file plugins to
57375 + * the semantic layer, and the file plugin may invoke other plugins and
57376 + * delegate work to them, but its interface is structured for offering the
57377 + * caller the ability to read and/or write what the caller sees as being a
57378 + * single sequence of bytes.
57379 + *
57380 + * The file plugin must present a sequence of bytes to the caller, but it does
57381 + * not necessarily have to store a sequence of bytes, it does not necessarily
57382 + * have to support efficient tree traversal to any offset in the sequence of
57383 + * bytes (tail and extent items, whose keys contain offsets, do however provide
57384 + * efficient non-sequential lookup of any offset in the sequence of bytes).
57385 + *
57386 + * Directory plugins provide methods for selecting file plugins by resolving a
57387 + * name for them.
57388 + *
57389 + * The functionality other filesystems call an attribute, and rigidly tie
57390 + * together, we decompose into orthogonal selectable features of files. Using
57391 + * the terminology we will define next, an attribute is a perhaps constrained,
57392 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
57393 + * which might be grandparent-major-packed, and whose parent has a deletion
57394 + * method that deletes it.
57395 + *
57396 + * File plugins can implement constraints.
57397 + *
57398 + * Files can be of variable length (e.g. regular unix files), or of static
57399 + * length (e.g. static sized attributes).
57400 + *
57401 + * An object may have many sequences of bytes, and many file plugins, but, it
57402 + * has exactly one objectid. It is usually desirable that an object has a
57403 + * deletion method which deletes every item with that objectid. Items cannot
57404 + * in general be found by just their objectids. This means that an object must
57405 + * have either a method built into its deletion plugin method for knowing what
57406 + * items need to be deleted, or links stored with the object that provide the
57407 + * plugin with a method for finding those items. Deleting a file within an
57408 + * object may or may not have the effect of deleting the entire object,
57409 + * depending on the file plugin's deletion method.
57410 + *
57411 + * LINK TAXONOMY:
57412 + *
57413 + * Many objects have a reference count, and when the reference count reaches 0
57414 + * the object's deletion method is invoked. Some links embody a reference
57415 + * count increase ("countlinks"), and others do not ("nocountlinks").
57416 + *
57417 + * Some links are bi-directional links ("bilinks"), and some are
57418 + * uni-directional("unilinks").
57419 + *
57420 + * Some links are between parts of the same object ("intralinks"), and some are
57421 + * between different objects ("interlinks").
57422 + *
57423 + * PACKING TAXONOMY:
57424 + *
57425 + * Some items of an object are stored with a major packing locality based on
57426 + * their object's objectid (e.g. unix directory items in plan A), and these are
57427 + * called "self-major-packed".
57428 + *
57429 + * Some items of an object are stored with a major packing locality based on
57430 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
57431 + * and these are called "parent-major-packed".
57432 + *
57433 + * Some items of an object are stored with a major packing locality based on
57434 + * their semantic grandparent, and these are called "grandparent-major-packed".
57435 + * Now carefully notice that we run into trouble with key length if we have to
57436 + * store a 8 byte major+minor grandparent based packing locality, an 8 byte
57437 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
57438 + * a 24 byte key. One of these fields must be sacrificed if an item is to be
57439 + * grandparent-major-packed, and which to sacrifice is left to the item author
57440 + * choosing to make the item grandparent-major-packed. You cannot make tail
57441 + * items and extent items grandparent-major-packed, though you could make them
57442 + * self-major-packed (usually they are parent-major-packed).
57443 + *
57444 + * In the case of ACLs (which are composed of fixed length ACEs which consist
57445 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
57446 + * to not have an offset field in the ACE item key, and to allow duplicate keys
57447 + * for ACEs. Thus, the set of ACES for a given file is found by looking for a
57448 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
57449 + * a directory together), the minor packing locality of ACE, the objectid of
57450 + * the file, and 0.
57451 + *
57452 + * IO involves moving data from one location to another, which means that two
57453 + * locations must be specified, source and destination.
57454 + *
57455 + * This source and destination can be in the filesystem, or they can be a
57456 + * pointer in the user process address space plus a byte count.
57457 + *
57458 + * If both source and destination are in the filesystem, then at least one of
57459 + * them must be representable as a pure stream of bytes (which we call a flow,
57460 + * and define as a struct containing a key, a data pointer, and a length).
57461 + * This may mean converting one of them into a flow. We provide a generic
57462 + * cast_into_flow() method, which will work for any plugin supporting
57463 + * read_flow(), though it is inefficiently implemented in that it temporarily
57464 + * stores the flow in a buffer (Question: what to do with huge flows that
57465 + * cannot fit into memory? Answer: we must not convert them all at once. )
57466 + *
57467 + * Performing a write requires resolving the write request into a flow defining
57468 + * the source, and a method that performs the write, and a key that defines
57469 + * where in the tree the write is to go.
57470 + *
57471 + * Performing a read requires resolving the read request into a flow defining
57472 + * the target, and a method that performs the read, and a key that defines
57473 + * where in the tree the read is to come from.
57474 + *
57475 + * There will exist file plugins which have no pluginid stored on the disk for
57476 + * them, and which are only invoked by other plugins.
57477 + */
57478 +
57479 +/* This should be incremented with each new contributed
57480 + pair (plugin type, plugin id).
57481 + NOTE: Make sure there is a release of reiser4progs
57482 + with the corresponding version number */
57483 +#define PLUGIN_LIBRARY_VERSION 0
57484 +
57485 + /* enumeration of fields within plugin_set */
57486 +typedef enum {
57487 + PSET_FILE,
57488 + PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
57489 + * inode.c:read_inode() depends on this. */
57490 + PSET_PERM,
57491 + PSET_FORMATTING,
57492 + PSET_HASH,
57493 + PSET_FIBRATION,
57494 + PSET_SD,
57495 + PSET_DIR_ITEM,
57496 + PSET_CIPHER,
57497 + PSET_DIGEST,
57498 + PSET_COMPRESSION,
57499 + PSET_COMPRESSION_MODE,
57500 + PSET_CLUSTER,
57501 + PSET_CREATE,
57502 + PSET_LAST
57503 +} pset_member;
57504 +
57505 +/* builtin file-plugins */
57506 +typedef enum {
57507 + /* regular file */
57508 + UNIX_FILE_PLUGIN_ID,
57509 + /* directory */
57510 + DIRECTORY_FILE_PLUGIN_ID,
57511 + /* symlink */
57512 + SYMLINK_FILE_PLUGIN_ID,
57513 + /* for objects completely handled by the VFS: fifos, devices,
57514 + sockets */
57515 + SPECIAL_FILE_PLUGIN_ID,
57516 + /* regular cryptcompress file */
57517 + CRYPTCOMPRESS_FILE_PLUGIN_ID,
57518 + /* number of file plugins. Used as size of arrays to hold
57519 + file plugins. */
57520 + LAST_FILE_PLUGIN_ID
57521 +} reiser4_file_id;
57522 +
57523 +typedef struct file_plugin {
57524 +
57525 + /* generic fields */
57526 + plugin_header h;
57527 +
57528 + /* VFS methods.
57529 + * Must be invariant with respect to plugin conversion.
57530 + * It can be achieved by using "common" methods, which
57531 + * are the same for all plugins that take participation in
57532 + * conversion, or by using "generic" or "careful" methods,
57533 + * which provide automatic redirection to proper private
57534 + * plugin methods ("careful" are the same as "generic",
57535 + * but with protection of pset and other disk structures
57536 + * from being rebuilt during conversion.
57537 + */
57538 + struct inode_operations * inode_ops;
57539 + struct file_operations * file_ops;
57540 + struct address_space_operations * as_ops;
57541 + /**
57542 + * Private methods. These are optional. If used they will allow you
57543 + * to minimize the amount of code needed to implement a deviation
57544 + * from some other method that also uses them.
57545 + */
57546 + /*
57547 + * private inode_ops
57548 + */
57549 + int (*setattr)(struct dentry *, struct iattr *);
57550 + /*
57551 + * private file_ops
57552 + */
57553 + /* do whatever is necessary to do when object is opened */
57554 + int (*open) (struct inode * inode, struct file * file);
57555 + ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
57556 + loff_t *off);
57557 + /* write a file;
57558 + * perform file plugin conversion (if needed);
57559 + * set @*conv to 1, if the conversion occurred */
57560 + ssize_t (*write) (struct file *, const char __user *buf,
57561 + size_t write_amount, loff_t * off, int * conv);
57562 + int (*ioctl) (struct inode *inode, struct file *filp,
57563 + unsigned int cmd, unsigned long arg);
57564 + int (*mmap) (struct file *, struct vm_area_struct *);
57565 + int (*release) (struct inode *, struct file *);
57566 + /*
57567 + * private a_ops
57568 + */
57569 + int (*readpage) (struct file *file, struct page *page);
57570 + int (*readpages)(struct file *file, struct address_space *mapping,
57571 + struct list_head *pages, unsigned nr_pages);
57572 + int (*writepages)(struct address_space *mapping,
57573 + struct writeback_control *wbc);
57574 + int (*prepare_write)(struct file *file, struct page *page,
57575 + unsigned from, unsigned to);
57576 + int (*commit_write)(struct file *file, struct page *page,
57577 + unsigned from, unsigned to);
57578 + sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
57579 + /* other private methods */
57580 + /* save inode cached stat-data onto disk. It was called
57581 + reiserfs_update_sd() in 3.x */
57582 + int (*write_sd_by_inode) (struct inode *);
57583 + /*
57584 + * Construct flow into @flow according to user-supplied data.
57585 + *
57586 + * This is used by read/write methods to construct a flow to
57587 + * write/read. ->flow_by_inode() is plugin method, rather than single
57588 + * global implementation, because key in a flow used by plugin may
57589 + * depend on data in a @buf.
57590 + *
57591 + * NIKITA-FIXME-HANS: please create statistics on what functions are
57592 + * dereferenced how often for the mongo benchmark. You can supervise
57593 + * Elena doing this for you if that helps. Email me the list of the
57594 + * top 10, with their counts, and an estimate of the total number of
57595 + * CPU cycles spent dereferencing as a percentage of CPU cycles spent
57596 + * processing (non-idle processing). If the total percent is, say,
57597 + * less than 1%, it will make our coding discussions much easier, and
57598 + * keep me from questioning whether functions like the below are too
57599 + * frequently called to be dereferenced. If the total percent is more
57600 + * than 1%, perhaps private methods should be listed in a "required"
57601 + * comment at the top of each plugin (with stern language about how if
57602 + * the comment is missing it will not be accepted by the maintainer),
57603 + * and implemented using macros not dereferenced functions. How about
57604 + * replacing this whole private methods part of the struct with a
57605 + * thorough documentation of what the standard helper functions are for
57606 + * use in constructing plugins? I think users have been asking for
57607 + * that, though not in so many words.
57608 + */
57609 + int (*flow_by_inode) (struct inode *, const char __user *buf,
57610 + int user, loff_t size,
57611 + loff_t off, rw_op op, flow_t *);
57612 + /*
57613 + * Return the key used to retrieve an offset of a file. It is used by
57614 + * default implementation of ->flow_by_inode() method
57615 + * (common_build_flow()) and, among other things, to get to the extent
57616 + * from jnode of unformatted node.
57617 + */
57618 + int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
57619 +
57620 + /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
57621 + /*
57622 + * set the plugin for a file. Called during file creation in creat()
57623 + * but not reiser4() unless an inode already exists for the file.
57624 + */
57625 + int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
57626 + reiser4_object_create_data *);
57627 +
57628 + /* NIKITA-FIXME-HANS: comment and name seem to say different things,
57629 + * are you setting up the object itself also or just adjusting the
57630 + * parent?.... */
57631 + /* set up plugins for new @object created in @parent. @root is root
57632 + directory. */
57633 + int (*adjust_to_parent) (struct inode *object, struct inode *parent,
57634 + struct inode *root);
57635 + /*
57636 + * this does whatever is necessary to do when object is created. For
57637 + * instance, for unix files stat data is inserted. It is supposed to be
57638 + * called by create of struct inode_operations.
57639 + */
57640 + int (*create_object) (struct inode *object, struct inode *parent,
57641 + reiser4_object_create_data *);
57642 + /*
57643 + * this method should check REISER4_NO_SD and set REISER4_NO_SD on
57644 + * success. Deletion of an object usually includes removal of items
57645 + * building file body (for directories this is removal of "." and "..")
57646 + * and removal of stat-data item.
57647 + */
57648 + int (*delete_object) (struct inode *);
57649 +
57650 + /* add link from @parent to @object */
57651 + int (*add_link) (struct inode *object, struct inode *parent);
57652 +
57653 + /* remove link from @parent to @object */
57654 + int (*rem_link) (struct inode *object, struct inode *parent);
57655 +
57656 + /*
57657 + * return true if item addressed by @coord belongs to @inode. This is
57658 + * used by read/write to properly slice flow into items in presence of
57659 + * multiple key assignment policies, because items of a file are not
57660 + * necessarily contiguous in a key space, for example, in a plan-b.
57661 + */
57662 + int (*owns_item) (const struct inode *, const coord_t *);
57663 +
57664 + /* checks whether yet another hard links to this object can be
57665 + added */
57666 + int (*can_add_link) (const struct inode *);
57667 +
57668 + /* checks whether hard links to this object can be removed */
57669 + int (*can_rem_link) (const struct inode *);
57670 +
57671 + /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
57672 + detach of directory plugin to remove ".." */
57673 + int (*detach) (struct inode * child, struct inode * parent);
57674 +
57675 + /* called when @child was just looked up in the @parent. It is not
57676 + empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
57677 + directory plugin */
57678 + int (*bind) (struct inode * child, struct inode * parent);
57679 +
57680 + /* process safe-link during mount */
57681 + int (*safelink) (struct inode * object, reiser4_safe_link_t link,
57682 + __u64 value);
57683 +
57684 + /* The couple of estimate methods for all file operations */
57685 + struct {
57686 + reiser4_block_nr(*create) (const struct inode *);
57687 + reiser4_block_nr(*update) (const struct inode *);
57688 + reiser4_block_nr(*unlink) (const struct inode *,
57689 + const struct inode *);
57690 + } estimate;
57691 +
57692 + /*
57693 + * reiser4 specific part of inode has a union of structures which are
57694 + * specific to a plugin. This method is called when inode is read
57695 + * (read_inode) and when file is created (common_create_child) so that
57696 + * file plugin could initialize its inode data
57697 + */
57698 + void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
57699 + int);
57700 +
57701 + /*
57702 + * This method performs progressive deletion of items and whole nodes
57703 + * from right to left.
57704 + *
57705 + * @tap: the point deletion process begins from,
57706 + * @from_key: the beginning of the deleted key range,
57707 + * @to_key: the end of the deleted key range,
57708 + * @smallest_removed: the smallest removed key,
57709 + *
57710 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
57711 + * operation was interrupted for allowing atom commit .
57712 + */
57713 + int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
57714 + const reiser4_key * to_key,
57715 + reiser4_key * smallest_removed, struct inode *,
57716 + int, int *);
57717 +
57718 + /* called from ->destroy_inode() */
57719 + void (*destroy_inode) (struct inode *);
57720 +
57721 + /*
57722 + * methods to serialize object identify. This is used, for example, by
57723 + * reiser4_{en,de}code_fh().
57724 + */
57725 + struct {
57726 + /* store object's identity at @area */
57727 + char *(*write) (struct inode * inode, char *area);
57728 + /* parse object from wire to the @obj */
57729 + char *(*read) (char *area, reiser4_object_on_wire * obj);
57730 + /* given object identity in @obj, find or create its dentry */
57731 + struct dentry *(*get) (struct super_block * s,
57732 + reiser4_object_on_wire * obj);
57733 + /* how many bytes ->wire.write() consumes */
57734 + int (*size) (struct inode * inode);
57735 + /* finish with object identify */
57736 + void (*done) (reiser4_object_on_wire * obj);
57737 + } wire;
57738 +} file_plugin;
57739 +
57740 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
57741 +
57742 +struct reiser4_object_on_wire {
57743 + file_plugin *plugin;
57744 + union {
57745 + struct {
57746 + obj_key_id key_id;
57747 + } std;
57748 + void *generic;
57749 + } u;
57750 +};
57751 +
57752 +/* builtin dir-plugins */
57753 +typedef enum {
57754 + HASHED_DIR_PLUGIN_ID,
57755 + SEEKABLE_HASHED_DIR_PLUGIN_ID,
57756 + LAST_DIR_ID
57757 +} reiser4_dir_id;
57758 +
57759 +typedef struct dir_plugin {
57760 + /* generic fields */
57761 + plugin_header h;
57762 +
57763 + struct inode_operations * inode_ops;
57764 + struct file_operations * file_ops;
57765 + struct address_space_operations * as_ops;
57766 +
57767 + /*
57768 + * private methods: These are optional. If used they will allow you to
57769 + * minimize the amount of code needed to implement a deviation from
57770 + * some other method that uses them. You could logically argue that
57771 + * they should be a separate type of plugin.
57772 + */
57773 +
57774 + struct dentry *(*get_parent) (struct inode * childdir);
57775 +
57776 + /*
57777 + * check whether "name" is acceptable name to be inserted into this
57778 + * object. Optionally implemented by directory-like objects. Can check
57779 + * for maximal length, reserved symbols etc
57780 + */
57781 + int (*is_name_acceptable) (const struct inode * inode, const char *name,
57782 + int len);
57783 +
57784 + void (*build_entry_key) (const struct inode * dir /* directory where
57785 + * entry is (or will
57786 + * be) in.*/ ,
57787 + const struct qstr * name /* name of file
57788 + * referenced by this
57789 + * entry */ ,
57790 + reiser4_key * result /* resulting key of
57791 + * directory entry */ );
57792 + int (*build_readdir_key) (struct file * dir, reiser4_key * result);
57793 + int (*add_entry) (struct inode * object, struct dentry * where,
57794 + reiser4_object_create_data * data,
57795 + reiser4_dir_entry_desc * entry);
57796 + int (*rem_entry) (struct inode * object, struct dentry * where,
57797 + reiser4_dir_entry_desc * entry);
57798 +
57799 + /*
57800 + * initialize directory structure for newly created object. For normal
57801 + * unix directories, insert dot and dotdot.
57802 + */
57803 + int (*init) (struct inode * object, struct inode * parent,
57804 + reiser4_object_create_data * data);
57805 +
57806 + /* destroy directory */
57807 + int (*done) (struct inode * child);
57808 +
57809 + /* called when @subdir was just looked up in the @dir */
57810 + int (*attach) (struct inode * subdir, struct inode * dir);
57811 + int (*detach) (struct inode * subdir, struct inode * dir);
57812 +
57813 + struct {
57814 + reiser4_block_nr(*add_entry) (const struct inode *);
57815 + reiser4_block_nr(*rem_entry) (const struct inode *);
57816 + reiser4_block_nr(*unlink) (const struct inode *,
57817 + const struct inode *);
57818 + } estimate;
57819 +} dir_plugin;
57820 +
57821 +extern dir_plugin dir_plugins[LAST_DIR_ID];
57822 +
57823 +typedef struct formatting_plugin {
57824 + /* generic fields */
57825 + plugin_header h;
57826 + /* returns non-zero iff file's tail has to be stored
57827 + in a direct item. */
57828 + int (*have_tail) (const struct inode * inode, loff_t size);
57829 +} formatting_plugin;
57830 +
57831 +typedef struct hash_plugin {
57832 + /* generic fields */
57833 + plugin_header h;
57834 + /* computes hash of the given name */
57835 + __u64(*hash) (const unsigned char *name, int len);
57836 +} hash_plugin;
57837 +
57838 +typedef struct cipher_plugin {
57839 + /* generic fields */
57840 + plugin_header h;
57841 + struct crypto_blkcipher * (*alloc) (void);
57842 + void (*free) (struct crypto_blkcipher * tfm);
57843 + /* Offset translator. For each offset this returns (k * offset), where
57844 + k (k >= 1) is an expansion factor of the cipher algorithm.
57845 + For all symmetric algorithms k == 1. For asymmetric algorithms (which
57846 + inflate data) offset translation guarantees that all disk cluster's
57847 + units will have keys smaller then next cluster's one.
57848 + */
57849 + loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
57850 + /* Cipher algorithms can accept data only by chunks of cipher block
57851 + size. This method is to align any flow up to cipher block size when
57852 + we pass it to cipher algorithm. To align means to append padding of
57853 + special format specific to the cipher algorithm */
57854 + int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
57855 + /* low-level key manager (check, install, etc..) */
57856 + int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
57857 + unsigned int keylen);
57858 + /* main text processing procedures */
57859 + void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57860 + void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57861 +} cipher_plugin;
57862 +
57863 +typedef struct digest_plugin {
57864 + /* generic fields */
57865 + plugin_header h;
57866 + /* fingerprint size in bytes */
57867 + int fipsize;
57868 + struct crypto_hash * (*alloc) (void);
57869 + void (*free) (struct crypto_hash * tfm);
57870 +} digest_plugin;
57871 +
57872 +typedef struct compression_plugin {
57873 + /* generic fields */
57874 + plugin_header h;
57875 + int (*init) (void);
57876 + /* the maximum number of bytes the size of the "compressed" data can
57877 + * exceed the uncompressed data. */
57878 + int (*overrun) (unsigned src_len);
57879 + coa_t(*alloc) (tfm_action act);
57880 + void (*free) (coa_t coa, tfm_action act);
57881 + /* minimal size of the flow we still try to compress */
57882 + int (*min_size_deflate) (void);
57883 + __u32(*checksum) (char *data, __u32 length);
57884 + /* main transform procedures */
57885 + void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
57886 + __u8 * dst_first, unsigned *dst_len);
57887 + void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
57888 + __u8 * dst_first, unsigned *dst_len);
57889 +} compression_plugin;
57890 +
57891 +typedef struct compression_mode_plugin {
57892 + /* generic fields */
57893 + plugin_header h;
57894 + /* this is called when estimating compressibility
57895 + of a logical cluster by its content */
57896 + int (*should_deflate) (struct inode * inode, cloff_t index);
57897 + /* this is called when results of compression should be saved */
57898 + int (*accept_hook) (struct inode * inode, cloff_t index);
57899 + /* this is called when results of compression should be discarded */
57900 + int (*discard_hook) (struct inode * inode, cloff_t index);
57901 +} compression_mode_plugin;
57902 +
57903 +typedef struct cluster_plugin {
57904 + /* generic fields */
57905 + plugin_header h;
57906 + int shift;
57907 +} cluster_plugin;
57908 +
57909 +typedef struct sd_ext_plugin {
57910 + /* generic fields */
57911 + plugin_header h;
57912 + int (*present) (struct inode * inode, char **area, int *len);
57913 + int (*absent) (struct inode * inode);
57914 + int (*save_len) (struct inode * inode);
57915 + int (*save) (struct inode * inode, char **area);
57916 + /* alignment requirement for this stat-data part */
57917 + int alignment;
57918 +} sd_ext_plugin;
57919 +
57920 +/* this plugin contains methods to allocate objectid for newly created files,
57921 + to deallocate objectid when file gets removed, to report number of used and
57922 + free objectids */
57923 +typedef struct oid_allocator_plugin {
57924 + /* generic fields */
57925 + plugin_header h;
57926 + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
57927 + __u64 oids);
57928 + /* used to report statfs->f_files */
57929 + __u64(*oids_used) (reiser4_oid_allocator * map);
57930 + /* get next oid to use */
57931 + __u64(*next_oid) (reiser4_oid_allocator * map);
57932 + /* used to report statfs->f_ffree */
57933 + __u64(*oids_free) (reiser4_oid_allocator * map);
57934 + /* allocate new objectid */
57935 + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
57936 + /* release objectid */
57937 + int (*release_oid) (reiser4_oid_allocator * map, oid_t);
57938 + /* how many pages to reserve in transaction for allocation of new
57939 + objectid */
57940 + int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
57941 + /* how many pages to reserve in transaction for freeing of an
57942 + objectid */
57943 + int (*oid_reserve_release) (reiser4_oid_allocator * map);
57944 + void (*print_info) (const char *, reiser4_oid_allocator *);
57945 +} oid_allocator_plugin;
57946 +
57947 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
57948 + are any) locations, etc */
57949 +typedef struct disk_format_plugin {
57950 + /* generic fields */
57951 + plugin_header h;
57952 + /* replay journal, initialize super_info_data, etc */
57953 + int (*init_format) (struct super_block *, void *data);
57954 +
57955 + /* key of root directory stat data */
57956 + const reiser4_key *(*root_dir_key) (const struct super_block *);
57957 +
57958 + int (*release) (struct super_block *);
57959 + jnode *(*log_super) (struct super_block *);
57960 + int (*check_open) (const struct inode * object);
57961 + int (*version_update) (struct super_block *);
57962 +} disk_format_plugin;
57963 +
57964 +struct jnode_plugin {
57965 + /* generic fields */
57966 + plugin_header h;
57967 + int (*init) (jnode * node);
57968 + int (*parse) (jnode * node);
57969 + struct address_space *(*mapping) (const jnode * node);
57970 + unsigned long (*index) (const jnode * node);
57971 + jnode *(*clone) (jnode * node);
57972 +};
57973 +
57974 +/* plugin instance. */
57975 +/* */
57976 +/* This is "wrapper" union for all types of plugins. Most of the code uses */
57977 +/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */
57978 +/* operates with pointers to reiser4_plugin. This union is only used in */
57979 +/* some generic code in plugin/plugin.c that operates on all */
57980 +/* plugins. Technically speaking purpose of this union is to add type */
57981 +/* safety to said generic code: each plugin type (file_plugin, for */
57982 +/* example), contains plugin_header as its first memeber. This first member */
57983 +/* is located at the same place in memory as .h member of */
57984 +/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */
57985 +/* looks in the .h which is header of plugin type located in union. This */
57986 +/* allows to avoid type-casts. */
57987 +union reiser4_plugin {
57988 + /* generic fields */
57989 + plugin_header h;
57990 + /* file plugin */
57991 + file_plugin file;
57992 + /* directory plugin */
57993 + dir_plugin dir;
57994 + /* hash plugin, used by directory plugin */
57995 + hash_plugin hash;
57996 + /* fibration plugin used by directory plugin */
57997 + fibration_plugin fibration;
57998 + /* cipher transform plugin, used by file plugin */
57999 + cipher_plugin cipher;
58000 + /* digest transform plugin, used by file plugin */
58001 + digest_plugin digest;
58002 + /* compression transform plugin, used by file plugin */
58003 + compression_plugin compression;
58004 + /* tail plugin, used by file plugin */
58005 + formatting_plugin formatting;
58006 + /* permission plugin */
58007 + perm_plugin perm;
58008 + /* node plugin */
58009 + node_plugin node;
58010 + /* item plugin */
58011 + item_plugin item;
58012 + /* stat-data extension plugin */
58013 + sd_ext_plugin sd_ext;
58014 + /* disk layout plugin */
58015 + disk_format_plugin format;
58016 + /* object id allocator plugin */
58017 + oid_allocator_plugin oid_allocator;
58018 + /* plugin for different jnode types */
58019 + jnode_plugin jnode;
58020 + /* compression mode plugin, used by object plugin */
58021 + compression_mode_plugin compression_mode;
58022 + /* cluster plugin, used by object plugin */
58023 + cluster_plugin clust;
58024 + /* place-holder for new plugin types that can be registered
58025 + dynamically, and used by other dynamically loaded plugins. */
58026 + void *generic;
58027 +};
58028 +
58029 +struct reiser4_plugin_ops {
58030 + /* called when plugin is initialized */
58031 + int (*init) (reiser4_plugin * plugin);
58032 + /* called when plugin is unloaded */
58033 + int (*done) (reiser4_plugin * plugin);
58034 + /* load given plugin from disk */
58035 + int (*load) (struct inode * inode,
58036 + reiser4_plugin * plugin, char **area, int *len);
58037 + /* how many space is required to store this plugin's state
58038 + in stat-data */
58039 + int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
58040 + /* save persistent plugin-data to disk */
58041 + int (*save) (struct inode * inode, reiser4_plugin * plugin,
58042 + char **area);
58043 + /* alignment requirement for on-disk state of this plugin
58044 + in number of bytes */
58045 + int alignment;
58046 + /* install itself into given inode. This can return error
58047 + (e.g., you cannot change hash of non-empty directory). */
58048 + int (*change) (struct inode * inode, reiser4_plugin * plugin,
58049 + pset_member memb);
58050 + /* install itself into given inode. This can return error
58051 + (e.g., you cannot change hash of non-empty directory). */
58052 + int (*inherit) (struct inode * inode, struct inode * parent,
58053 + reiser4_plugin * plugin);
58054 +};
58055 +
58056 +/* functions implemented in fs/reiser4/plugin/plugin.c */
58057 +
58058 +/* stores plugin reference in reiser4-specific part of inode */
58059 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
58060 +extern int init_plugins(void);
58061 +
58062 +/* builtin plugins */
58063 +
58064 +/* builtin hash-plugins */
58065 +
58066 +typedef enum {
58067 + RUPASOV_HASH_ID,
58068 + R5_HASH_ID,
58069 + TEA_HASH_ID,
58070 + FNV1_HASH_ID,
58071 + DEGENERATE_HASH_ID,
58072 + LAST_HASH_ID
58073 +} reiser4_hash_id;
58074 +
58075 +/* builtin cipher plugins */
58076 +
58077 +typedef enum {
58078 + NONE_CIPHER_ID,
58079 + LAST_CIPHER_ID
58080 +} reiser4_cipher_id;
58081 +
58082 +/* builtin digest plugins */
58083 +
58084 +typedef enum {
58085 + SHA256_32_DIGEST_ID,
58086 + LAST_DIGEST_ID
58087 +} reiser4_digest_id;
58088 +
58089 +/* builtin compression mode plugins */
58090 +typedef enum {
58091 + NONE_COMPRESSION_MODE_ID,
58092 + LATTD_COMPRESSION_MODE_ID,
58093 + ULTIM_COMPRESSION_MODE_ID,
58094 + FORCE_COMPRESSION_MODE_ID,
58095 + CONVX_COMPRESSION_MODE_ID,
58096 + LAST_COMPRESSION_MODE_ID
58097 +} reiser4_compression_mode_id;
58098 +
58099 +/* builtin cluster plugins */
58100 +typedef enum {
58101 + CLUSTER_64K_ID,
58102 + CLUSTER_32K_ID,
58103 + CLUSTER_16K_ID,
58104 + CLUSTER_8K_ID,
58105 + CLUSTER_4K_ID,
58106 + LAST_CLUSTER_ID
58107 +} reiser4_cluster_id;
58108 +
58109 +/* builtin tail-plugins */
58110 +
58111 +typedef enum {
58112 + NEVER_TAILS_FORMATTING_ID,
58113 + ALWAYS_TAILS_FORMATTING_ID,
58114 + SMALL_FILE_FORMATTING_ID,
58115 + LAST_TAIL_FORMATTING_ID
58116 +} reiser4_formatting_id;
58117 +
58118 +/* data type used to pack parameters that we pass to vfs object creation
58119 + function create_object() */
58120 +struct reiser4_object_create_data {
58121 + /* plugin to control created object */
58122 + reiser4_file_id id;
58123 + /* mode of regular file, directory or special file */
58124 +/* what happens if some other sort of perm plugin is in use? */
58125 + int mode;
58126 + /* rdev of special file */
58127 + dev_t rdev;
58128 + /* symlink target */
58129 + const char *name;
58130 + /* add here something for non-standard objects you invent, like
58131 + query for interpolation file etc. */
58132 +
58133 + struct reiser4_crypto_info * crypto;
58134 +
58135 + struct inode *parent;
58136 + struct dentry *dentry;
58137 +};
58138 +
58139 +/* description of directory entry being created/destroyed/sought for
58140 +
58141 + It is passed down to the directory plugin and farther to the
58142 + directory item plugin methods. Creation of new directory is done in
58143 + several stages: first we search for an entry with the same name, then
58144 + create new one. reiser4_dir_entry_desc is used to store some information
58145 + collected at some stage of this process and required later: key of
58146 + item that we want to insert/delete and pointer to an object that will
58147 + be bound by the new directory entry. Probably some more fields will
58148 + be added there.
58149 +
58150 +*/
58151 +struct reiser4_dir_entry_desc {
58152 + /* key of directory entry */
58153 + reiser4_key key;
58154 + /* object bound by this entry. */
58155 + struct inode *obj;
58156 +};
58157 +
58158 +#define MAX_PLUGIN_TYPE_LABEL_LEN 32
58159 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32
58160 +
58161 +#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
58162 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
58163 +{ \
58164 + reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
58165 + return plugin ? & plugin -> FIELD : NULL; \
58166 +} \
58167 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
58168 +{ \
58169 + reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
58170 + return plugin ? & plugin -> FIELD : NULL; \
58171 +} \
58172 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
58173 +{ \
58174 + reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
58175 + return plugin ? & plugin -> FIELD : NULL; \
58176 +} \
58177 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
58178 +{ \
58179 + return ( reiser4_plugin * ) plugin; \
58180 +} \
58181 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
58182 +{ \
58183 + return TYPE ## _to_plugin (plugin) -> h.id; \
58184 +} \
58185 +typedef struct { int foo; } TYPE ## _plugin_dummy
58186 +
58187 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
58188 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
58189 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
58190 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
58191 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
58192 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
58193 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
58194 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
58195 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
58196 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
58197 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
58198 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
58199 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
58200 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
58201 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58202 + compression_mode);
58203 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
58204 +
58205 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
58206 +
58207 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
58208 +
58209 +#define for_all_plugins(ptype, plugin) \
58210 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
58211 + get_plugin_list(ptype) != &plugin->h.linkage; \
58212 + plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
58213 +
58214 +
58215 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
58216 +extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
58217 +extern int finish_pset(struct inode *inode);
58218 +
58219 +/* defined in fs/reiser4/plugin/object.c */
58220 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58221 +/* defined in fs/reiser4/plugin/object.c */
58222 +extern dir_plugin dir_plugins[LAST_DIR_ID];
58223 +/* defined in fs/reiser4/plugin/item/static_stat.c */
58224 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
58225 +/* defined in fs/reiser4/plugin/hash.c */
58226 +extern hash_plugin hash_plugins[LAST_HASH_ID];
58227 +/* defined in fs/reiser4/plugin/fibration.c */
58228 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
58229 +/* defined in fs/reiser4/plugin/crypt.c */
58230 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
58231 +/* defined in fs/reiser4/plugin/digest.c */
58232 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
58233 +/* defined in fs/reiser4/plugin/compress/compress.c */
58234 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
58235 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
58236 +extern compression_mode_plugin
58237 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
58238 +/* defined in fs/reiser4/plugin/cluster.c */
58239 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
58240 +/* defined in fs/reiser4/plugin/tail.c */
58241 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
58242 +/* defined in fs/reiser4/plugin/security/security.c */
58243 +extern perm_plugin perm_plugins[LAST_PERM_ID];
58244 +/* defined in fs/reiser4/plugin/item/item.c */
58245 +extern item_plugin item_plugins[LAST_ITEM_ID];
58246 +/* defined in fs/reiser4/plugin/node/node.c */
58247 +extern node_plugin node_plugins[LAST_NODE_ID];
58248 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
58249 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
58250 +
58251 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
58252 +#endif
58253 +
58254 +/* Make Linus happy.
58255 + Local variables:
58256 + c-indentation-style: "K&R"
58257 + mode-name: "LC"
58258 + c-basic-offset: 8
58259 + tab-width: 8
58260 + fill-column: 120
58261 + End:
58262 +*/
58263 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.23/fs/reiser4/plugin/plugin_header.h
58264 --- linux-2.6.23.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
58265 +++ linux-2.6.23/fs/reiser4/plugin/plugin_header.h 2007-12-04 16:49:30.000000000 +0300
58266 @@ -0,0 +1,155 @@
58267 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58268 +
58269 +/* plugin header. Data structures required by all plugin types. */
58270 +
58271 +#if !defined( __PLUGIN_HEADER_H__ )
58272 +#define __PLUGIN_HEADER_H__
58273 +
58274 +/* plugin data-types and constants */
58275 +
58276 +#include "../debug.h"
58277 +#include "../dformat.h"
58278 +
58279 +/* Every plugin type can be considered as a class of virtual objects
58280 + {(type, i) | i = 0, 1, ...}, which has one the following categories
58281 + of virtualization:
58282 + A - no virtualization;
58283 + F - per-file virtualization;
58284 + S - per-superblock virtualization;
58285 + FIXME-EDWARD: Define every such category */
58286 +
58287 +/* Supported plugin types: (id, (virtualization category), short description) */
58288 +typedef enum {
58289 + REISER4_FILE_PLUGIN_TYPE, /* (F) service VFS enry-points */
58290 + REISER4_DIR_PLUGIN_TYPE, /* (F) service VFS enry-points */
58291 + REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */
58292 + REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */
58293 + REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */
58294 + REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */
58295 + REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */
58296 + REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
58297 + REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */
58298 + REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */
58299 + REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */
58300 + REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */
58301 + REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */
58302 + REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */
58303 + REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
58304 + REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */
58305 + REISER4_PLUGIN_TYPES
58306 +} reiser4_plugin_type;
58307 +
58308 +/* Supported plugin groups */
58309 +typedef enum {
58310 + REISER4_DIRECTORY_FILE,
58311 + REISER4_REGULAR_FILE,
58312 + REISER4_SYMLINK_FILE,
58313 + REISER4_SPECIAL_FILE,
58314 +} file_plugin_group;
58315 +
58316 +struct reiser4_plugin_ops;
58317 +/* generic plugin operations, supported by each
58318 + plugin type. */
58319 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
58320 +
58321 +/* the common part of all plugin instances. */
58322 +typedef struct plugin_header {
58323 + /* plugin type */
58324 + reiser4_plugin_type type_id;
58325 + /* id of this plugin */
58326 + reiser4_plugin_id id;
58327 + /* bitmask of groups the plugin belongs to. */
58328 + reiser4_plugin_groups groups;
58329 + /* plugin operations */
58330 + reiser4_plugin_ops *pops;
58331 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
58332 + /* short label of this plugin */
58333 + const char *label;
58334 + /* descriptive string.. */
58335 + const char *desc;
58336 + /* list linkage */
58337 + struct list_head linkage;
58338 +} plugin_header;
58339 +
58340 +#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
58341 +
58342 +/* PRIVATE INTERFACES */
58343 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
58344 +/* plugin type representation. */
58345 +struct reiser4_plugin_type_data {
58346 + /* internal plugin type identifier. Should coincide with
58347 + index of this item in plugins[] array. */
58348 + reiser4_plugin_type type_id;
58349 + /* short symbolic label of this plugin type. Should be no longer
58350 + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
58351 + const char *label;
58352 + /* plugin type description longer than .label */
58353 + const char *desc;
58354 +
58355 +/* NIKITA-FIXME-HANS: define built-in */
58356 + /* number of built-in plugin instances of this type */
58357 + int builtin_num;
58358 + /* array of built-in plugins */
58359 + void *builtin;
58360 + struct list_head plugins_list;
58361 + size_t size;
58362 +};
58363 +
58364 +extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
58365 +
58366 +int is_plugin_type_valid(reiser4_plugin_type type);
58367 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
58368 +
58369 +static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype,
58370 + int i)
58371 +{
58372 + char *builtin;
58373 +
58374 + builtin = ptype->builtin;
58375 + return (reiser4_plugin *) (builtin + i * ptype->size);
58376 +}
58377 +
58378 +/* return plugin by its @type_id and @id */
58379 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
58380 + reiser4_plugin_id id)
58381 +{
58382 + assert("nikita-1651", is_plugin_type_valid(type));
58383 + assert("nikita-1652", is_plugin_id_valid(type, id));
58384 + return plugin_at(&plugins[type], id);
58385 +}
58386 +
58387 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
58388 + reiser4_plugin_id id);
58389 +
58390 +/**
58391 + * plugin_by_disk_id - get reiser4_plugin
58392 + * @type_id: plugin type id
58393 + * @did: plugin id in disk format
58394 + *
58395 + * Returns reiser4_plugin by plugin type id an dplugin_id.
58396 + */
58397 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
58398 + reiser4_plugin_type type_id,
58399 + __le16 *plugin_id)
58400 +{
58401 + /*
58402 + * what we should do properly is to maintain within each file-system a
58403 + * dictionary that maps on-disk plugin ids to "universal" ids. This
58404 + * dictionary will be resolved on mount time, so that this function
58405 + * will perform just one additional array lookup.
58406 + */
58407 + return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
58408 +}
58409 +
58410 +/* __PLUGIN_HEADER_H__ */
58411 +#endif
58412 +
58413 +/*
58414 + * Local variables:
58415 + * c-indentation-style: "K&R"
58416 + * mode-name: "LC"
58417 + * c-basic-offset: 8
58418 + * tab-width: 8
58419 + * fill-column: 79
58420 + * End:
58421 + */
58422 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.23/fs/reiser4/plugin/plugin_set.c
58423 --- linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
58424 +++ linux-2.6.23/fs/reiser4/plugin/plugin_set.c 2007-12-04 16:49:30.000000000 +0300
58425 @@ -0,0 +1,379 @@
58426 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58427 + * reiser4/README */
58428 +/* This file contains Reiser4 plugin set operations */
58429 +
58430 +/* plugin sets
58431 + *
58432 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
58433 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
58434 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
58435 + * set of plugins (so called pset) is described by structure plugin_set (see
58436 + * plugin/plugin_set.h), which contains pointers to all required plugins.
58437 + *
58438 + * Children can inherit some pset members from their parent, however sometimes
58439 + * it is useful to specify members different from parent ones. Since object's
58440 + * pset can not be easily changed without fatal consequences, we use for this
58441 + * purpose another special plugin table (so called hset, or heir set) described
58442 + * by the same structure.
58443 + *
58444 + * Inode only stores a pointers to pset and hset. Different inodes with the
58445 + * same set of pset (hset) members point to the same pset (hset). This is
58446 + * archived by storing psets and hsets in global hash table. Races are avoided
58447 + * by simple (and efficient so far) solution of never recycling psets, even
58448 + * when last inode pointing to it is destroyed.
58449 + */
58450 +
58451 +#include "../debug.h"
58452 +#include "../super.h"
58453 +#include "plugin_set.h"
58454 +
58455 +#include <linux/slab.h>
58456 +#include <linux/stddef.h>
58457 +
58458 +/* slab for plugin sets */
58459 +static struct kmem_cache *plugin_set_slab;
58460 +
58461 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
58462 + [0 ... 7] = SPIN_LOCK_UNLOCKED
58463 +};
58464 +
58465 +/* hash table support */
58466 +
58467 +#define PS_TABLE_SIZE (32)
58468 +
58469 +static inline plugin_set *cast_to(const unsigned long *a)
58470 +{
58471 + return container_of(a, plugin_set, hashval);
58472 +}
58473 +
58474 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
58475 +{
58476 + plugin_set *set1;
58477 + plugin_set *set2;
58478 +
58479 + /* make sure fields are not missed in the code below */
58480 + cassert(sizeof *set1 ==
58481 + sizeof set1->hashval +
58482 + sizeof set1->link +
58483 + sizeof set1->file +
58484 + sizeof set1->dir +
58485 + sizeof set1->perm +
58486 + sizeof set1->formatting +
58487 + sizeof set1->hash +
58488 + sizeof set1->fibration +
58489 + sizeof set1->sd +
58490 + sizeof set1->dir_item +
58491 + sizeof set1->cipher +
58492 + sizeof set1->digest +
58493 + sizeof set1->compression +
58494 + sizeof set1->compression_mode +
58495 + sizeof set1->cluster +
58496 + sizeof set1->create);
58497 +
58498 + set1 = cast_to(a1);
58499 + set2 = cast_to(a2);
58500 + return
58501 + set1->hashval == set2->hashval &&
58502 + set1->file == set2->file &&
58503 + set1->dir == set2->dir &&
58504 + set1->perm == set2->perm &&
58505 + set1->formatting == set2->formatting &&
58506 + set1->hash == set2->hash &&
58507 + set1->fibration == set2->fibration &&
58508 + set1->sd == set2->sd &&
58509 + set1->dir_item == set2->dir_item &&
58510 + set1->cipher == set2->cipher &&
58511 + set1->digest == set2->digest &&
58512 + set1->compression == set2->compression &&
58513 + set1->compression_mode == set2->compression_mode &&
58514 + set1->cluster == set2->cluster &&
58515 + set1->create == set2->create;
58516 +}
58517 +
58518 +#define HASH_FIELD(hash, set, field) \
58519 +({ \
58520 + (hash) += (unsigned long)(set)->field >> 2; \
58521 +})
58522 +
58523 +static inline unsigned long calculate_hash(const plugin_set * set)
58524 +{
58525 + unsigned long result;
58526 +
58527 + result = 0;
58528 + HASH_FIELD(result, set, file);
58529 + HASH_FIELD(result, set, dir);
58530 + HASH_FIELD(result, set, perm);
58531 + HASH_FIELD(result, set, formatting);
58532 + HASH_FIELD(result, set, hash);
58533 + HASH_FIELD(result, set, fibration);
58534 + HASH_FIELD(result, set, sd);
58535 + HASH_FIELD(result, set, dir_item);
58536 + HASH_FIELD(result, set, cipher);
58537 + HASH_FIELD(result, set, digest);
58538 + HASH_FIELD(result, set, compression);
58539 + HASH_FIELD(result, set, compression_mode);
58540 + HASH_FIELD(result, set, cluster);
58541 + HASH_FIELD(result, set, create);
58542 + return result & (PS_TABLE_SIZE - 1);
58543 +}
58544 +
58545 +static inline unsigned long
58546 +pshash(ps_hash_table * table, const unsigned long *a)
58547 +{
58548 + return *a;
58549 +}
58550 +
58551 +/* The hash table definition */
58552 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
58553 +#define KFREE(ptr, size) kfree(ptr)
58554 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
58555 + pseq);
58556 +#undef KFREE
58557 +#undef KMALLOC
58558 +
58559 +static ps_hash_table ps_table;
58560 +static plugin_set empty_set = {
58561 + .hashval = 0,
58562 + .file = NULL,
58563 + .dir = NULL,
58564 + .perm = NULL,
58565 + .formatting = NULL,
58566 + .hash = NULL,
58567 + .fibration = NULL,
58568 + .sd = NULL,
58569 + .dir_item = NULL,
58570 + .cipher = NULL,
58571 + .digest = NULL,
58572 + .compression = NULL,
58573 + .compression_mode = NULL,
58574 + .cluster = NULL,
58575 + .create = NULL,
58576 + .link = {NULL}
58577 +};
58578 +
58579 +plugin_set *plugin_set_get_empty(void)
58580 +{
58581 + return &empty_set;
58582 +}
58583 +
58584 +void plugin_set_put(plugin_set * set)
58585 +{
58586 +}
58587 +
58588 +static inline unsigned long *pset_field(plugin_set * set, int offset)
58589 +{
58590 + return (unsigned long *)(((char *)set) + offset);
58591 +}
58592 +
58593 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
58594 + const int offset)
58595 +{
58596 + unsigned long *spot;
58597 + spinlock_t *lock;
58598 + plugin_set replica;
58599 + plugin_set *twin;
58600 + plugin_set *psal;
58601 + plugin_set *orig;
58602 +
58603 + assert("nikita-2902", set != NULL);
58604 + assert("nikita-2904", *set != NULL);
58605 +
58606 + spot = pset_field(*set, offset);
58607 + if (unlikely(*spot == val))
58608 + return 0;
58609 +
58610 + replica = *(orig = *set);
58611 + *pset_field(&replica, offset) = val;
58612 + replica.hashval = calculate_hash(&replica);
58613 + rcu_read_lock();
58614 + twin = ps_hash_find(&ps_table, &replica.hashval);
58615 + if (unlikely(twin == NULL)) {
58616 + rcu_read_unlock();
58617 + psal = kmem_cache_alloc(plugin_set_slab,
58618 + reiser4_ctx_gfp_mask_get());
58619 + if (psal == NULL)
58620 + return RETERR(-ENOMEM);
58621 + *psal = replica;
58622 + lock = &plugin_set_lock[replica.hashval & 7];
58623 + spin_lock(lock);
58624 + twin = ps_hash_find(&ps_table, &replica.hashval);
58625 + if (likely(twin == NULL)) {
58626 + *set = psal;
58627 + ps_hash_insert_rcu(&ps_table, psal);
58628 + } else {
58629 + *set = twin;
58630 + kmem_cache_free(plugin_set_slab, psal);
58631 + }
58632 + spin_unlock(lock);
58633 + } else {
58634 + rcu_read_unlock();
58635 + *set = twin;
58636 + }
58637 + return 0;
58638 +}
58639 +
58640 +static struct {
58641 + int offset;
58642 + reiser4_plugin_groups groups;
58643 + reiser4_plugin_type type;
58644 +} pset_descr[PSET_LAST] = {
58645 + [PSET_FILE] = {
58646 + .offset = offsetof(plugin_set, file),
58647 + .type = REISER4_FILE_PLUGIN_TYPE,
58648 + .groups = 0
58649 + },
58650 + [PSET_DIR] = {
58651 + .offset = offsetof(plugin_set, dir),
58652 + .type = REISER4_DIR_PLUGIN_TYPE,
58653 + .groups = 0
58654 + },
58655 + [PSET_PERM] = {
58656 + .offset = offsetof(plugin_set, perm),
58657 + .type = REISER4_PERM_PLUGIN_TYPE,
58658 + .groups = 0
58659 + },
58660 + [PSET_FORMATTING] = {
58661 + .offset = offsetof(plugin_set, formatting),
58662 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
58663 + .groups = 0
58664 + },
58665 + [PSET_HASH] = {
58666 + .offset = offsetof(plugin_set, hash),
58667 + .type = REISER4_HASH_PLUGIN_TYPE,
58668 + .groups = 0
58669 + },
58670 + [PSET_FIBRATION] = {
58671 + .offset = offsetof(plugin_set, fibration),
58672 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
58673 + .groups = 0
58674 + },
58675 + [PSET_SD] = {
58676 + .offset = offsetof(plugin_set, sd),
58677 + .type = REISER4_ITEM_PLUGIN_TYPE,
58678 + .groups = (1 << STAT_DATA_ITEM_TYPE)
58679 + },
58680 + [PSET_DIR_ITEM] = {
58681 + .offset = offsetof(plugin_set, dir_item),
58682 + .type = REISER4_ITEM_PLUGIN_TYPE,
58683 + .groups = (1 << DIR_ENTRY_ITEM_TYPE)
58684 + },
58685 + [PSET_CIPHER] = {
58686 + .offset = offsetof(plugin_set, cipher),
58687 + .type = REISER4_CIPHER_PLUGIN_TYPE,
58688 + .groups = 0
58689 + },
58690 + [PSET_DIGEST] = {
58691 + .offset = offsetof(plugin_set, digest),
58692 + .type = REISER4_DIGEST_PLUGIN_TYPE,
58693 + .groups = 0
58694 + },
58695 + [PSET_COMPRESSION] = {
58696 + .offset = offsetof(plugin_set, compression),
58697 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
58698 + .groups = 0
58699 + },
58700 + [PSET_COMPRESSION_MODE] = {
58701 + .offset = offsetof(plugin_set, compression_mode),
58702 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58703 + .groups = 0
58704 + },
58705 + [PSET_CLUSTER] = {
58706 + .offset = offsetof(plugin_set, cluster),
58707 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
58708 + .groups = 0
58709 + },
58710 + [PSET_CREATE] = {
58711 + .offset = offsetof(plugin_set, create),
58712 + .type = REISER4_FILE_PLUGIN_TYPE,
58713 + .groups = (1 << REISER4_REGULAR_FILE)
58714 + }
58715 +};
58716 +
58717 +#define DEFINE_PSET_OPS(PREFIX) \
58718 + reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
58719 +{ \
58720 + if (memb > PSET_LAST) \
58721 + return REISER4_PLUGIN_TYPES; \
58722 + return pset_descr[memb].type; \
58723 +} \
58724 + \
58725 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
58726 + reiser4_plugin * plugin) \
58727 +{ \
58728 + assert("nikita-3492", set != NULL); \
58729 + assert("nikita-3493", *set != NULL); \
58730 + assert("nikita-3494", plugin != NULL); \
58731 + assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
58732 + assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
58733 + \
58734 + if (pset_descr[memb].groups) \
58735 + if (!(pset_descr[memb].groups & plugin->h.groups)) \
58736 + return -EINVAL; \
58737 + \
58738 + return plugin_set_field(set, \
58739 + (unsigned long)plugin, pset_descr[memb].offset); \
58740 +} \
58741 + \
58742 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
58743 +{ \
58744 + assert("nikita-3497", set != NULL); \
58745 + assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
58746 + \
58747 + return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
58748 +}
58749 +
58750 +DEFINE_PSET_OPS(aset);
58751 +
58752 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
58753 + return plugin_set_field(set,
58754 + (unsigned long)plugin, pset_descr[memb].offset);
58755 +}
58756 +
58757 +/**
58758 + * init_plugin_set - create plugin set cache and hash table
58759 + *
58760 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
58761 + * reiser4 module initialization.
58762 + */
58763 +int init_plugin_set(void)
58764 +{
58765 + int result;
58766 +
58767 + result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
58768 + if (result == 0) {
58769 + plugin_set_slab = kmem_cache_create("plugin_set",
58770 + sizeof(plugin_set), 0,
58771 + SLAB_HWCACHE_ALIGN,
58772 + NULL);
58773 + if (plugin_set_slab == NULL)
58774 + result = RETERR(-ENOMEM);
58775 + }
58776 + return result;
58777 +}
58778 +
58779 +/**
58780 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
58781 + *
58782 + * This is called on reiser4 module unloading or system shutdown.
58783 + */
58784 +void done_plugin_set(void)
58785 +{
58786 + plugin_set *cur, *next;
58787 +
58788 + for_all_in_htable(&ps_table, ps, cur, next) {
58789 + ps_hash_remove(&ps_table, cur);
58790 + kmem_cache_free(plugin_set_slab, cur);
58791 + }
58792 + destroy_reiser4_cache(&plugin_set_slab);
58793 + ps_hash_done(&ps_table);
58794 +}
58795 +
58796 +/*
58797 + * Local variables:
58798 + * c-indentation-style: "K&R"
58799 + * mode-name: "LC"
58800 + * c-basic-offset: 8
58801 + * tab-width: 8
58802 + * fill-column: 120
58803 + * End:
58804 + */
58805 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.23/fs/reiser4/plugin/plugin_set.h
58806 --- linux-2.6.23.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
58807 +++ linux-2.6.23/fs/reiser4/plugin/plugin_set.h 2007-12-04 16:49:30.000000000 +0300
58808 @@ -0,0 +1,77 @@
58809 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58810 +
58811 +/* Reiser4 plugin set definition.
58812 + See fs/reiser4/plugin/plugin_set.c for details */
58813 +
58814 +#if !defined( __PLUGIN_SET_H__ )
58815 +#define __PLUGIN_SET_H__
58816 +
58817 +#include "../type_safe_hash.h"
58818 +#include "plugin.h"
58819 +
58820 +#include <linux/rcupdate.h>
58821 +
58822 +struct plugin_set;
58823 +typedef struct plugin_set plugin_set;
58824 +
58825 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
58826 +
58827 +struct plugin_set {
58828 + unsigned long hashval;
58829 + /* plugin of file */
58830 + file_plugin *file;
58831 + /* plugin of dir */
58832 + dir_plugin *dir;
58833 + /* perm plugin for this file */
58834 + perm_plugin *perm;
58835 + /* tail policy plugin. Only meaningful for regular files */
58836 + formatting_plugin *formatting;
58837 + /* hash plugin. Only meaningful for directories. */
58838 + hash_plugin *hash;
58839 + /* fibration plugin. Only meaningful for directories. */
58840 + fibration_plugin *fibration;
58841 + /* plugin of stat-data */
58842 + item_plugin *sd;
58843 + /* plugin of items a directory is built of */
58844 + item_plugin *dir_item;
58845 + /* cipher plugin */
58846 + cipher_plugin *cipher;
58847 + /* digest plugin */
58848 + digest_plugin *digest;
58849 + /* compression plugin */
58850 + compression_plugin *compression;
58851 + /* compression mode plugin */
58852 + compression_mode_plugin *compression_mode;
58853 + /* cluster plugin */
58854 + cluster_plugin *cluster;
58855 + /* this specifies file plugin of regular children.
58856 + only meaningful for directories */
58857 + file_plugin *create;
58858 + ps_hash_link link;
58859 +};
58860 +
58861 +extern plugin_set *plugin_set_get_empty(void);
58862 +extern void plugin_set_put(plugin_set * set);
58863 +
58864 +extern int init_plugin_set(void);
58865 +extern void done_plugin_set(void);
58866 +
58867 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
58868 +extern int set_plugin(plugin_set ** set, pset_member memb,
58869 + reiser4_plugin * plugin);
58870 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
58871 + reiser4_plugin * plugin);
58872 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
58873 +
58874 +/* __PLUGIN_SET_H__ */
58875 +#endif
58876 +
58877 +/* Make Linus happy.
58878 + Local variables:
58879 + c-indentation-style: "K&R"
58880 + mode-name: "LC"
58881 + c-basic-offset: 8
58882 + tab-width: 8
58883 + fill-column: 120
58884 + End:
58885 +*/
58886 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/security/Makefile linux-2.6.23/fs/reiser4/plugin/security/Makefile
58887 --- linux-2.6.23.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
58888 +++ linux-2.6.23/fs/reiser4/plugin/security/Makefile 2007-12-04 16:49:30.000000000 +0300
58889 @@ -0,0 +1,4 @@
58890 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
58891 +
58892 +security_plugins-objs := \
58893 + perm.o
58894 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/security/perm.c linux-2.6.23/fs/reiser4/plugin/security/perm.c
58895 --- linux-2.6.23.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
58896 +++ linux-2.6.23/fs/reiser4/plugin/security/perm.c 2007-12-04 16:49:30.000000000 +0300
58897 @@ -0,0 +1,33 @@
58898 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58899 +
58900 +/*
58901 + * This file contains implementation of permission plugins.
58902 + * See the comments in perm.h
58903 + */
58904 +
58905 +#include "../plugin.h"
58906 +#include "../plugin_header.h"
58907 +#include "../../debug.h"
58908 +
58909 +perm_plugin perm_plugins[LAST_PERM_ID] = {
58910 + [NULL_PERM_ID] = {
58911 + .h = {
58912 + .type_id = REISER4_PERM_PLUGIN_TYPE,
58913 + .id = NULL_PERM_ID,
58914 + .pops = NULL,
58915 + .label = "null",
58916 + .desc = "stub permission plugin",
58917 + .linkage = {NULL, NULL}
58918 + }
58919 + }
58920 +};
58921 +
58922 +/*
58923 + * Local variables:
58924 + * c-indentation-style: "K&R"
58925 + * mode-name: "LC"
58926 + * c-basic-offset: 8
58927 + * tab-width: 8
58928 + * fill-column: 79
58929 + * End:
58930 + */
58931 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/security/perm.h linux-2.6.23/fs/reiser4/plugin/security/perm.h
58932 --- linux-2.6.23.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
58933 +++ linux-2.6.23/fs/reiser4/plugin/security/perm.h 2007-12-04 16:49:30.000000000 +0300
58934 @@ -0,0 +1,38 @@
58935 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58936 +
58937 +/* Perm (short for "permissions") plugins common stuff. */
58938 +
58939 +#if !defined( __REISER4_PERM_H__ )
58940 +#define __REISER4_PERM_H__
58941 +
58942 +#include "../../forward.h"
58943 +#include "../plugin_header.h"
58944 +
58945 +#include <linux/types.h>
58946 +
58947 +/* Definition of permission plugin */
58948 +/* NIKITA-FIXME-HANS: define what this is targeted for.
58949 + It does not seem to be intended for use with sys_reiser4. Explain. */
58950 +
58951 +/* NOTE-EDWARD: This seems to be intended for deprecated sys_reiser4.
58952 + Consider it like a temporary "seam" and reserved pset member.
58953 + If you have something usefull to add, then rename this plugin and add here */
58954 +typedef struct perm_plugin {
58955 + /* generic plugin fields */
58956 + plugin_header h;
58957 +} perm_plugin;
58958 +
58959 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
58960 +
58961 +/* __REISER4_PERM_H__ */
58962 +#endif
58963 +
58964 +/* Make Linus happy.
58965 + Local variables:
58966 + c-indentation-style: "K&R"
58967 + mode-name: "LC"
58968 + c-basic-offset: 8
58969 + tab-width: 8
58970 + fill-column: 120
58971 + End:
58972 +*/
58973 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.23/fs/reiser4/plugin/space/bitmap.c
58974 --- linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
58975 +++ linux-2.6.23/fs/reiser4/plugin/space/bitmap.c 2007-12-04 16:49:30.000000000 +0300
58976 @@ -0,0 +1,1585 @@
58977 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58978 +
58979 +#include "../../debug.h"
58980 +#include "../../dformat.h"
58981 +#include "../../txnmgr.h"
58982 +#include "../../jnode.h"
58983 +#include "../../block_alloc.h"
58984 +#include "../../tree.h"
58985 +#include "../../super.h"
58986 +#include "../plugin.h"
58987 +#include "space_allocator.h"
58988 +#include "bitmap.h"
58989 +
58990 +#include <linux/types.h>
58991 +#include <linux/fs.h> /* for struct super_block */
58992 +#include <linux/mutex.h>
58993 +#include <asm/div64.h>
58994 +
58995 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
58996 + * blocks
58997 +
58998 + A useful optimization of reiser4 bitmap handling would be dynamic bitmap
58999 + blocks loading/unloading which is different from v3.x where all bitmap
59000 + blocks are loaded at mount time.
59001 +
59002 + To implement bitmap blocks unloading we need to count bitmap block usage
59003 + and detect currently unused blocks allowing them to be unloaded. It is not
59004 + a simple task since we allow several threads to modify one bitmap block
59005 + simultaneously.
59006 +
59007 + Briefly speaking, the following schema is proposed: we count in special
59008 + variable associated with each bitmap block. That is for counting of block
59009 + alloc/dealloc operations on that bitmap block. With a deferred block
59010 + deallocation feature of reiser4 all those operation will be represented in
59011 + atom dirty/deleted lists as jnodes for freshly allocated or deleted
59012 + nodes.
59013 +
59014 + So, we increment usage counter for each new node allocated or deleted, and
59015 + decrement it at atom commit one time for each node from the dirty/deleted
59016 + atom's list. Of course, freshly allocated node deletion and node reusing
59017 + from atom deleted (if we do so) list should decrement bitmap usage counter
59018 + also.
59019 +
59020 + This schema seems to be working but that reference counting is
59021 + not easy to debug. I think we should agree with Hans and do not implement
59022 + it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
59023 +
59024 + For simplicity all bitmap nodes (both commit and working bitmap blocks) are
59025 + loaded into memory on fs mount time or each bitmap nodes are loaded at the
59026 + first access to it, the "dont_load_bitmap" mount option controls whether
59027 + bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap
59028 + nodes currently is not supported. */
59029 +
59030 +#define CHECKSUM_SIZE 4
59031 +
59032 +#define BYTES_PER_LONG (sizeof(long))
59033 +
59034 +#if BITS_PER_LONG == 64
59035 +# define LONG_INT_SHIFT (6)
59036 +#else
59037 +# define LONG_INT_SHIFT (5)
59038 +#endif
59039 +
59040 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
59041 +
59042 +typedef unsigned long ulong_t;
59043 +
59044 +#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
59045 +#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
59046 +
59047 +/* Block allocation/deallocation are done through special bitmap objects which
59048 + are allocated in an array at fs mount. */
59049 +struct bitmap_node {
59050 + struct mutex mutex; /* long term lock object */
59051 +
59052 + jnode *wjnode; /* j-nodes for WORKING ... */
59053 + jnode *cjnode; /* ... and COMMIT bitmap blocks */
59054 +
59055 + bmap_off_t first_zero_bit; /* for skip_busy option implementation */
59056 +
59057 + atomic_t loaded; /* a flag which shows that bnode is loaded
59058 + * already */
59059 +};
59060 +
59061 +static inline char *bnode_working_data(struct bitmap_node *bnode)
59062 +{
59063 + char *data;
59064 +
59065 + data = jdata(bnode->wjnode);
59066 + assert("zam-429", data != NULL);
59067 +
59068 + return data + CHECKSUM_SIZE;
59069 +}
59070 +
59071 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
59072 +{
59073 + char *data;
59074 +
59075 + data = jdata(bnode->cjnode);
59076 + assert("zam-430", data != NULL);
59077 +
59078 + return data + CHECKSUM_SIZE;
59079 +}
59080 +
59081 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
59082 +{
59083 + char *data;
59084 +
59085 + data = jdata(bnode->cjnode);
59086 + assert("vpf-261", data != NULL);
59087 +
59088 + return le32_to_cpu(get_unaligned((d32 *)data));
59089 +}
59090 +
59091 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
59092 +{
59093 + char *data;
59094 +
59095 + data = jdata(bnode->cjnode);
59096 + assert("vpf-261", data != NULL);
59097 +
59098 + put_unaligned(cpu_to_le32(crc), (d32 *)data);
59099 +}
59100 +
59101 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
59102 + * written the code, does this added abstraction still have */
59103 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
59104 + * reiser4_space_allocator structure) */
59105 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
59106 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
59107 + * someday?". What they about? If there is a reason to have a union, it should
59108 + * be a union, if not, it should not be a union. "..might be someday" means no
59109 + * reason. */
59110 +struct bitmap_allocator_data {
59111 + /* an array for bitmap blocks direct access */
59112 + struct bitmap_node *bitmap;
59113 +};
59114 +
59115 +#define get_barray(super) \
59116 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
59117 +
59118 +#define get_bnode(super, i) (get_barray(super) + i)
59119 +
59120 +/* allocate and initialize jnode with JNODE_BITMAP type */
59121 +static jnode *bnew(void)
59122 +{
59123 + jnode *jal = jalloc();
59124 +
59125 + if (jal)
59126 + jnode_init(jal, current_tree, JNODE_BITMAP);
59127 +
59128 + return jal;
59129 +}
59130 +
59131 +/* this file contains:
59132 + - bitmap based implementation of space allocation plugin
59133 + - all the helper functions like set bit, find_first_zero_bit, etc */
59134 +
59135 +/* Audited by: green(2002.06.12) */
59136 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
59137 +{
59138 + ulong_t mask = 1UL << start_bit;
59139 + int i = start_bit;
59140 +
59141 + while ((word & mask) != 0) {
59142 + mask <<= 1;
59143 + if (++i >= BITS_PER_LONG)
59144 + break;
59145 + }
59146 +
59147 + return i;
59148 +}
59149 +
59150 +#include <linux/bitops.h>
59151 +
59152 +#if BITS_PER_LONG == 64
59153 +
59154 +#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
59155 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
59156 +
59157 +static inline void reiser4_set_bit(int nr, void *addr)
59158 +{
59159 + ext2_set_bit(nr + OFF(addr), BASE(addr));
59160 +}
59161 +
59162 +static inline void reiser4_clear_bit(int nr, void *addr)
59163 +{
59164 + ext2_clear_bit(nr + OFF(addr), BASE(addr));
59165 +}
59166 +
59167 +static inline int reiser4_test_bit(int nr, void *addr)
59168 +{
59169 + return ext2_test_bit(nr + OFF(addr), BASE(addr));
59170 +}
59171 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
59172 + int offset)
59173 +{
59174 + int off = OFF(addr);
59175 +
59176 + return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
59177 + offset + off) - off;
59178 +}
59179 +
59180 +#else
59181 +
59182 +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
59183 +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
59184 +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
59185 +
59186 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
59187 +ext2_find_next_zero_bit(addr, maxoffset, offset)
59188 +#endif
59189 +
59190 +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
59191 + * are counted from @addr, return the offset of the first bit if it is found,
59192 + * @maxoffset otherwise. */
59193 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59194 + bmap_off_t start_offset)
59195 +{
59196 + ulong_t *base = addr;
59197 + /* start_offset is in bits, convert it to byte offset within bitmap. */
59198 + int word_nr = start_offset >> LONG_INT_SHIFT;
59199 + /* bit number within the byte. */
59200 + int bit_nr = start_offset & LONG_INT_MASK;
59201 + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
59202 +
59203 + assert("zam-387", max_offset != 0);
59204 +
59205 + /* Unaligned @start_offset case. */
59206 + if (bit_nr != 0) {
59207 + bmap_nr_t nr;
59208 +
59209 + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
59210 +
59211 + if (nr < BITS_PER_LONG)
59212 + return (word_nr << LONG_INT_SHIFT) + nr;
59213 +
59214 + ++word_nr;
59215 + }
59216 +
59217 + /* Fast scan trough aligned words. */
59218 + while (word_nr <= max_word_nr) {
59219 + if (base[word_nr] != 0) {
59220 + return (word_nr << LONG_INT_SHIFT)
59221 + + find_next_zero_bit_in_word(~(base[word_nr]), 0);
59222 + }
59223 +
59224 + ++word_nr;
59225 + }
59226 +
59227 + return max_offset;
59228 +}
59229 +
59230 +#if BITS_PER_LONG == 64
59231 +
59232 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59233 + bmap_off_t start_offset)
59234 +{
59235 + bmap_off_t off = OFF(addr);
59236 +
59237 + return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
59238 + start_offset + off) - off;
59239 +}
59240 +
59241 +#else
59242 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
59243 + __reiser4_find_next_set_bit(addr, max_offset, start_offset)
59244 +#endif
59245 +
59246 +/* search for the first set bit in single word. */
59247 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
59248 +{
59249 + ulong_t bit_mask;
59250 + int nr = start_bit;
59251 +
59252 + assert("zam-965", start_bit < BITS_PER_LONG);
59253 + assert("zam-966", start_bit >= 0);
59254 +
59255 + bit_mask = (1UL << nr);
59256 +
59257 + while (bit_mask != 0) {
59258 + if (bit_mask & word)
59259 + return nr;
59260 + bit_mask >>= 1;
59261 + nr--;
59262 + }
59263 + return BITS_PER_LONG;
59264 +}
59265 +
59266 +/* Search bitmap for a set bit in backward direction from the end to the
59267 + * beginning of given region
59268 + *
59269 + * @result: result offset of the last set bit
59270 + * @addr: base memory address,
59271 + * @low_off: low end of the search region, edge bit included into the region,
59272 + * @high_off: high end of the search region, edge bit included into the region,
59273 + *
59274 + * @return: 0 - set bit was found, -1 otherwise.
59275 + */
59276 +static int
59277 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59278 + bmap_off_t high_off)
59279 +{
59280 + ulong_t *base = addr;
59281 + int last_word;
59282 + int first_word;
59283 + int last_bit;
59284 + int nr;
59285 +
59286 + assert("zam-962", high_off >= low_off);
59287 +
59288 + last_word = high_off >> LONG_INT_SHIFT;
59289 + last_bit = high_off & LONG_INT_MASK;
59290 + first_word = low_off >> LONG_INT_SHIFT;
59291 +
59292 + if (last_bit < BITS_PER_LONG) {
59293 + nr = find_last_set_bit_in_word(base[last_word], last_bit);
59294 + if (nr < BITS_PER_LONG) {
59295 + *result = (last_word << LONG_INT_SHIFT) + nr;
59296 + return 0;
59297 + }
59298 + --last_word;
59299 + }
59300 + while (last_word >= first_word) {
59301 + if (base[last_word] != 0x0) {
59302 + last_bit =
59303 + find_last_set_bit_in_word(base[last_word],
59304 + BITS_PER_LONG - 1);
59305 + assert("zam-972", last_bit < BITS_PER_LONG);
59306 + *result = (last_word << LONG_INT_SHIFT) + last_bit;
59307 + return 0;
59308 + }
59309 + --last_word;
59310 + }
59311 +
59312 + return -1; /* set bit not found */
59313 +}
59314 +
59315 +/* Search bitmap for a clear bit in backward direction from the end to the
59316 + * beginning of given region */
59317 +static int
59318 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59319 + bmap_off_t high_off)
59320 +{
59321 + ulong_t *base = addr;
59322 + int last_word;
59323 + int first_word;
59324 + int last_bit;
59325 + int nr;
59326 +
59327 + last_word = high_off >> LONG_INT_SHIFT;
59328 + last_bit = high_off & LONG_INT_MASK;
59329 + first_word = low_off >> LONG_INT_SHIFT;
59330 +
59331 + if (last_bit < BITS_PER_LONG) {
59332 + nr = find_last_set_bit_in_word(~base[last_word], last_bit);
59333 + if (nr < BITS_PER_LONG) {
59334 + *result = (last_word << LONG_INT_SHIFT) + nr;
59335 + return 0;
59336 + }
59337 + --last_word;
59338 + }
59339 + while (last_word >= first_word) {
59340 + if (base[last_word] != (ulong_t) (-1)) {
59341 + *result = (last_word << LONG_INT_SHIFT) +
59342 + find_last_set_bit_in_word(~base[last_word],
59343 + BITS_PER_LONG - 1);
59344 + return 0;
59345 + }
59346 + --last_word;
59347 + }
59348 +
59349 + return -1; /* zero bit not found */
59350 +}
59351 +
59352 +/* Audited by: green(2002.06.12) */
59353 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
59354 +{
59355 + int first_byte;
59356 + int last_byte;
59357 +
59358 + unsigned char first_byte_mask = 0xFF;
59359 + unsigned char last_byte_mask = 0xFF;
59360 +
59361 + assert("zam-410", start < end);
59362 +
59363 + first_byte = start >> 3;
59364 + last_byte = (end - 1) >> 3;
59365 +
59366 + if (last_byte > first_byte + 1)
59367 + memset(addr + first_byte + 1, 0,
59368 + (size_t) (last_byte - first_byte - 1));
59369 +
59370 + first_byte_mask >>= 8 - (start & 0x7);
59371 + last_byte_mask <<= ((end - 1) & 0x7) + 1;
59372 +
59373 + if (first_byte == last_byte) {
59374 + addr[first_byte] &= (first_byte_mask | last_byte_mask);
59375 + } else {
59376 + addr[first_byte] &= first_byte_mask;
59377 + addr[last_byte] &= last_byte_mask;
59378 + }
59379 +}
59380 +
59381 +/* Audited by: green(2002.06.12) */
59382 +/* ZAM-FIXME-HANS: comment this */
59383 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
59384 +{
59385 + int first_byte;
59386 + int last_byte;
59387 +
59388 + unsigned char first_byte_mask = 0xFF;
59389 + unsigned char last_byte_mask = 0xFF;
59390 +
59391 + assert("zam-386", start < end);
59392 +
59393 + first_byte = start >> 3;
59394 + last_byte = (end - 1) >> 3;
59395 +
59396 + if (last_byte > first_byte + 1)
59397 + memset(addr + first_byte + 1, 0xFF,
59398 + (size_t) (last_byte - first_byte - 1));
59399 +
59400 + first_byte_mask <<= start & 0x7;
59401 + last_byte_mask >>= 7 - ((end - 1) & 0x7);
59402 +
59403 + if (first_byte == last_byte) {
59404 + addr[first_byte] |= (first_byte_mask & last_byte_mask);
59405 + } else {
59406 + addr[first_byte] |= first_byte_mask;
59407 + addr[last_byte] |= last_byte_mask;
59408 + }
59409 +}
59410 +
59411 +#define ADLER_BASE 65521
59412 +#define ADLER_NMAX 5552
59413 +
59414 +/* Calculates the adler32 checksum for the data pointed by `data` of the
59415 + length `len`. This function was originally taken from zlib, version 1.1.3,
59416 + July 9th, 1998.
59417 +
59418 + Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
59419 +
59420 + This software is provided 'as-is', without any express or implied
59421 + warranty. In no event will the authors be held liable for any damages
59422 + arising from the use of this software.
59423 +
59424 + Permission is granted to anyone to use this software for any purpose,
59425 + including commercial applications, and to alter it and redistribute it
59426 + freely, subject to the following restrictions:
59427 +
59428 + 1. The origin of this software must not be misrepresented; you must not
59429 + claim that you wrote the original software. If you use this software
59430 + in a product, an acknowledgment in the product documentation would be
59431 + appreciated but is not required.
59432 + 2. Altered source versions must be plainly marked as such, and must not be
59433 + misrepresented as being the original software.
59434 + 3. This notice may not be removed or altered from any source distribution.
59435 +
59436 + Jean-loup Gailly Mark Adler
59437 + jloup@gzip.org madler@alumni.caltech.edu
59438 +
59439 + The above comment applies only to the reiser4_adler32 function.
59440 +*/
59441 +
59442 +__u32 reiser4_adler32(char *data, __u32 len)
59443 +{
59444 + unsigned char *t = data;
59445 + __u32 s1 = 1;
59446 + __u32 s2 = 0;
59447 + int k;
59448 +
59449 + while (len > 0) {
59450 + k = len < ADLER_NMAX ? len : ADLER_NMAX;
59451 + len -= k;
59452 +
59453 + while (k--) {
59454 + s1 += *t++;
59455 + s2 += s1;
59456 + }
59457 +
59458 + s1 %= ADLER_BASE;
59459 + s2 %= ADLER_BASE;
59460 + }
59461 + return (s2 << 16) | s1;
59462 +}
59463 +
59464 +#define sb_by_bnode(bnode) \
59465 + ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
59466 +
59467 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
59468 +{
59469 + return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
59470 +}
59471 +
59472 +static int
59473 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
59474 +{
59475 + if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
59476 + bmap_nr_t bmap;
59477 +
59478 + bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
59479 +
59480 + warning("vpf-263",
59481 + "Checksum for the bitmap block %llu is incorrect",
59482 + bmap);
59483 +
59484 + return RETERR(-EIO);
59485 + }
59486 +
59487 + return 0;
59488 +}
59489 +
59490 +#define REISER4_CHECK_BMAP_CRC (0)
59491 +
59492 +#if REISER4_CHECK_BMAP_CRC
59493 +static int bnode_check_crc(const struct bitmap_node *bnode)
59494 +{
59495 + return bnode_check_adler32(bnode,
59496 + bmap_size(sb_by_bnode(bnode)->s_blocksize));
59497 +}
59498 +
59499 +/* REISER4_CHECK_BMAP_CRC */
59500 +#else
59501 +
59502 +#define bnode_check_crc(bnode) (0)
59503 +
59504 +/* REISER4_CHECK_BMAP_CRC */
59505 +#endif
59506 +
59507 +/* Recalculates the adler32 checksum for only 1 byte change.
59508 + adler - previous adler checksum
59509 + old_data, data - old, new byte values.
59510 + tail == (chunk - offset) : length, checksum was calculated for, - offset of
59511 + the changed byte within this chunk.
59512 + This function can be used for checksum calculation optimisation.
59513 +*/
59514 +
59515 +static __u32
59516 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
59517 + __u32 tail)
59518 +{
59519 + __u32 delta = data - old_data + 2 * ADLER_BASE;
59520 + __u32 s1 = adler & 0xffff;
59521 + __u32 s2 = (adler >> 16) & 0xffff;
59522 +
59523 + s1 = (delta + s1) % ADLER_BASE;
59524 + s2 = (delta * tail + s2) % ADLER_BASE;
59525 +
59526 + return (s2 << 16) | s1;
59527 +}
59528 +
59529 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
59530 +
59531 +/**
59532 + * get_nr_bitmap - calculate number of bitmap blocks
59533 + * @super: super block with initialized blocksize and block count
59534 + *
59535 + * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
59536 + * maintain free disk space. It assumes that each bitmap addresses the same
59537 + * number of blocks which is calculated by bmap_block_count macro defined in
59538 + * above. Number of blocks in the filesystem has to be initialized in reiser4
59539 + * private data of super block already so that it can be obtained via
59540 + * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
59541 + * is not power of 2 because 4 bytes are used for checksum. Therefore, we have
59542 + * to use special function to divide and modulo 64bits filesystem block
59543 + * counters.
59544 + *
59545 + * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap
59546 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
59547 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
59548 + */
59549 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
59550 +{
59551 + u64 quotient;
59552 +
59553 + assert("zam-393", reiser4_block_count(super) != 0);
59554 +
59555 + quotient = reiser4_block_count(super) - 1;
59556 + do_div(quotient, bmap_bit_count(super->s_blocksize));
59557 + return quotient + 1;
59558 +}
59559 +
59560 +/**
59561 + * parse_blocknr - calculate bitmap number and offset in it by block number
59562 + * @block: pointer to block number to calculate location in bitmap of
59563 + * @bmap: pointer where to store bitmap block number
59564 + * @offset: pointer where to store offset within bitmap block
59565 + *
59566 + * Calculates location of bit which is responsible for allocation/freeing of
59567 + * block @*block. That location is represented by bitmap block number and offset
59568 + * within that bitmap block.
59569 + */
59570 +static void
59571 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
59572 + bmap_off_t *offset)
59573 +{
59574 + struct super_block *super = get_current_context()->super;
59575 + u64 quotient = *block;
59576 +
59577 + *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
59578 + *bmap = quotient;
59579 +
59580 + assert("zam-433", *bmap < get_nr_bmap(super));
59581 + assert("", *offset < bmap_bit_count(super->s_blocksize));
59582 +}
59583 +
59584 +#if REISER4_DEBUG
59585 +/* Audited by: green(2002.06.12) */
59586 +static void
59587 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
59588 +{
59589 + struct super_block *sb = reiser4_get_current_sb();
59590 +
59591 + assert("zam-436", sb != NULL);
59592 +
59593 + assert("zam-455", start != NULL);
59594 + assert("zam-437", *start != 0);
59595 + assert("zam-541", !reiser4_blocknr_is_fake(start));
59596 + assert("zam-441", *start < reiser4_block_count(sb));
59597 +
59598 + if (len != NULL) {
59599 + assert("zam-438", *len != 0);
59600 + assert("zam-442", *start + *len <= reiser4_block_count(sb));
59601 + }
59602 +}
59603 +
59604 +static void check_bnode_loaded(const struct bitmap_node *bnode)
59605 +{
59606 + assert("zam-485", bnode != NULL);
59607 + assert("zam-483", jnode_page(bnode->wjnode) != NULL);
59608 + assert("zam-484", jnode_page(bnode->cjnode) != NULL);
59609 + assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
59610 + assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
59611 +}
59612 +
59613 +#else
59614 +
59615 +# define check_block_range(start, len) do { /* nothing */} while(0)
59616 +# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
59617 +
59618 +#endif
59619 +
59620 +/* modify bnode->first_zero_bit (if we free bits before); bnode should be
59621 + spin-locked */
59622 +static inline void
59623 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
59624 +{
59625 + if (offset < bnode->first_zero_bit)
59626 + bnode->first_zero_bit = offset;
59627 +}
59628 +
59629 +/* return a physical disk address for logical bitmap number @bmap */
59630 +/* FIXME-VS: this is somehow related to disk layout? */
59631 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
59632 + * per block allocation so that performance is not affected. Probably this
59633 + * whole file should be considered part of the disk layout plugin, and other
59634 + * disk layouts can use other defines and efficiency will not be significantly
59635 + * affected. */
59636 +
59637 +#define REISER4_FIRST_BITMAP_BLOCK \
59638 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
59639 +
59640 +/* Audited by: green(2002.06.12) */
59641 +static void
59642 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
59643 + reiser4_block_nr * bnr)
59644 +{
59645 +
59646 + assert("zam-390", bmap < get_nr_bmap(super));
59647 +
59648 +#ifdef CONFIG_REISER4_BADBLOCKS
59649 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
59650 + /* Check if the diskmap have this already, first. */
59651 + if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
59652 + return; /* Found it in diskmap */
59653 +#endif
59654 + /* FIXME_ZAM: before discussing of disk layouts and disk format
59655 + plugins I implement bitmap location scheme which is close to scheme
59656 + used in reiser 3.6 */
59657 + if (bmap == 0) {
59658 + *bnr = REISER4_FIRST_BITMAP_BLOCK;
59659 + } else {
59660 + *bnr = bmap * bmap_bit_count(super->s_blocksize);
59661 + }
59662 +}
59663 +
59664 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
59665 +/* Audited by: green(2002.06.12) */
59666 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
59667 +{
59668 + *bnr =
59669 + (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
59670 + REISER4_BITMAP_BLOCKS_STATUS_VALUE);
59671 +}
59672 +
59673 +/* bnode structure initialization */
59674 +static void
59675 +init_bnode(struct bitmap_node *bnode,
59676 + struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
59677 +{
59678 + memset(bnode, 0, sizeof(struct bitmap_node));
59679 +
59680 + mutex_init(&bnode->mutex);
59681 + atomic_set(&bnode->loaded, 0);
59682 +}
59683 +
59684 +static void release(jnode * node)
59685 +{
59686 + jrelse(node);
59687 + JF_SET(node, JNODE_HEARD_BANSHEE);
59688 + jput(node);
59689 +}
59690 +
59691 +/* This function is for internal bitmap.c use because it assumes that jnode is
59692 + in under full control of this thread */
59693 +static void done_bnode(struct bitmap_node *bnode)
59694 +{
59695 + if (bnode) {
59696 + atomic_set(&bnode->loaded, 0);
59697 + if (bnode->wjnode != NULL)
59698 + release(bnode->wjnode);
59699 + if (bnode->cjnode != NULL)
59700 + release(bnode->cjnode);
59701 + bnode->wjnode = bnode->cjnode = NULL;
59702 + }
59703 +}
59704 +
59705 +/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
59706 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
59707 + jnode **wjnode_ret)
59708 +{
59709 + struct super_block *super;
59710 + jnode *cjnode;
59711 + jnode *wjnode;
59712 + bmap_nr_t bmap;
59713 + int ret;
59714 +
59715 + super = reiser4_get_current_sb();
59716 +
59717 + *wjnode_ret = wjnode = bnew();
59718 + if (wjnode == NULL) {
59719 + *cjnode_ret = NULL;
59720 + return RETERR(-ENOMEM);
59721 + }
59722 +
59723 + *cjnode_ret = cjnode = bnew();
59724 + if (cjnode == NULL)
59725 + return RETERR(-ENOMEM);
59726 +
59727 + bmap = bnode - get_bnode(super, 0);
59728 +
59729 + get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
59730 + get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
59731 +
59732 + jref(cjnode);
59733 + jref(wjnode);
59734 +
59735 + /* load commit bitmap */
59736 + ret = jload_gfp(cjnode, GFP_NOFS, 1);
59737 +
59738 + if (ret)
59739 + goto error;
59740 +
59741 + /* allocate memory for working bitmap block. Note that for
59742 + * bitmaps jinit_new() doesn't actually modifies node content,
59743 + * so parallel calls to this are ok. */
59744 + ret = jinit_new(wjnode, GFP_NOFS);
59745 +
59746 + if (ret != 0) {
59747 + jrelse(cjnode);
59748 + goto error;
59749 + }
59750 +
59751 + return 0;
59752 +
59753 + error:
59754 + jput(cjnode);
59755 + jput(wjnode);
59756 + *wjnode_ret = *cjnode_ret = NULL;
59757 + return ret;
59758 +
59759 +}
59760 +
59761 +/* Check the bnode data on read. */
59762 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
59763 +{
59764 + void *data;
59765 + int ret;
59766 +
59767 + /* Check CRC */
59768 + ret = bnode_check_adler32(bnode, blksize);
59769 +
59770 + if (ret) {
59771 + return ret;
59772 + }
59773 +
59774 + data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
59775 +
59776 + /* Check the very first bit -- it must be busy. */
59777 + if (!reiser4_test_bit(0, data)) {
59778 + warning("vpf-1362", "The allocator block %llu is not marked "
59779 + "as used.", (unsigned long long)bnode->cjnode->blocknr);
59780 +
59781 + return -EINVAL;
59782 + }
59783 +
59784 + return 0;
59785 +}
59786 +
59787 +/* load bitmap blocks "on-demand" */
59788 +static int load_and_lock_bnode(struct bitmap_node *bnode)
59789 +{
59790 + int ret;
59791 +
59792 + jnode *cjnode;
59793 + jnode *wjnode;
59794 +
59795 + assert("nikita-3040", reiser4_schedulable());
59796 +
59797 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
59798 + * need to be atomic, right? Just leave a comment that if bitmaps were
59799 + * unloadable, this would need to be atomic. */
59800 + if (atomic_read(&bnode->loaded)) {
59801 + /* bitmap is already loaded, nothing to do */
59802 + check_bnode_loaded(bnode);
59803 + mutex_lock(&bnode->mutex);
59804 + assert("nikita-2827", atomic_read(&bnode->loaded));
59805 + return 0;
59806 + }
59807 +
59808 + ret = prepare_bnode(bnode, &cjnode, &wjnode);
59809 + if (ret == 0) {
59810 + mutex_lock(&bnode->mutex);
59811 +
59812 + if (!atomic_read(&bnode->loaded)) {
59813 + assert("nikita-2822", cjnode != NULL);
59814 + assert("nikita-2823", wjnode != NULL);
59815 + assert("nikita-2824", jnode_is_loaded(cjnode));
59816 + assert("nikita-2825", jnode_is_loaded(wjnode));
59817 +
59818 + bnode->wjnode = wjnode;
59819 + bnode->cjnode = cjnode;
59820 +
59821 + ret = check_struct_bnode(bnode, current_blocksize);
59822 + if (!ret) {
59823 + cjnode = wjnode = NULL;
59824 + atomic_set(&bnode->loaded, 1);
59825 + /* working bitmap is initialized by on-disk
59826 + * commit bitmap. This should be performed
59827 + * under mutex. */
59828 + memcpy(bnode_working_data(bnode),
59829 + bnode_commit_data(bnode),
59830 + bmap_size(current_blocksize));
59831 + } else
59832 + mutex_unlock(&bnode->mutex);
59833 + } else
59834 + /* race: someone already loaded bitmap while we were
59835 + * busy initializing data. */
59836 + check_bnode_loaded(bnode);
59837 + }
59838 +
59839 + if (wjnode != NULL) {
59840 + release(wjnode);
59841 + bnode->wjnode = NULL;
59842 + }
59843 + if (cjnode != NULL) {
59844 + release(cjnode);
59845 + bnode->cjnode = NULL;
59846 + }
59847 +
59848 + return ret;
59849 +}
59850 +
59851 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
59852 +{
59853 + check_bnode_loaded(bnode);
59854 + mutex_unlock(&bnode->mutex);
59855 +}
59856 +
59857 +/* This function does all block allocation work but only for one bitmap
59858 + block.*/
59859 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
59860 + block responsibility zone boundaries. This had no sense in v3.6 but may
59861 + have it in v4.x */
59862 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
59863 +static int
59864 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
59865 + bmap_off_t max_offset, int min_len, int max_len)
59866 +{
59867 + struct super_block *super = get_current_context()->super;
59868 + struct bitmap_node *bnode = get_bnode(super, bmap);
59869 +
59870 + char *data;
59871 +
59872 + bmap_off_t search_end;
59873 + bmap_off_t start;
59874 + bmap_off_t end;
59875 +
59876 + int set_first_zero_bit = 0;
59877 +
59878 + int ret;
59879 +
59880 + assert("zam-364", min_len > 0);
59881 + assert("zam-365", max_len >= min_len);
59882 + assert("zam-366", *offset <= max_offset);
59883 +
59884 + ret = load_and_lock_bnode(bnode);
59885 +
59886 + if (ret)
59887 + return ret;
59888 +
59889 + data = bnode_working_data(bnode);
59890 +
59891 + start = *offset;
59892 +
59893 + if (bnode->first_zero_bit >= start) {
59894 + start = bnode->first_zero_bit;
59895 + set_first_zero_bit = 1;
59896 + }
59897 +
59898 + while (start + min_len < max_offset) {
59899 +
59900 + start =
59901 + reiser4_find_next_zero_bit((long *)data, max_offset, start);
59902 + if (set_first_zero_bit) {
59903 + bnode->first_zero_bit = start;
59904 + set_first_zero_bit = 0;
59905 + }
59906 + if (start >= max_offset)
59907 + break;
59908 +
59909 + search_end = LIMIT(start + max_len, max_offset);
59910 + end =
59911 + reiser4_find_next_set_bit((long *)data, search_end, start);
59912 + if (end >= start + min_len) {
59913 + /* we can't trust find_next_set_bit result if set bit
59914 + was not fount, result may be bigger than
59915 + max_offset */
59916 + if (end > search_end)
59917 + end = search_end;
59918 +
59919 + ret = end - start;
59920 + *offset = start;
59921 +
59922 + reiser4_set_bits(data, start, end);
59923 +
59924 + /* FIXME: we may advance first_zero_bit if [start,
59925 + end] region overlaps the first_zero_bit point */
59926 +
59927 + break;
59928 + }
59929 +
59930 + start = end + 1;
59931 + }
59932 +
59933 + release_and_unlock_bnode(bnode);
59934 +
59935 + return ret;
59936 +}
59937 +
59938 +static int
59939 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
59940 + bmap_off_t end_offset, int min_len, int max_len)
59941 +{
59942 + struct super_block *super = get_current_context()->super;
59943 + struct bitmap_node *bnode = get_bnode(super, bmap);
59944 + char *data;
59945 + bmap_off_t start;
59946 + int ret;
59947 +
59948 + assert("zam-958", min_len > 0);
59949 + assert("zam-959", max_len >= min_len);
59950 + assert("zam-960", *start_offset >= end_offset);
59951 +
59952 + ret = load_and_lock_bnode(bnode);
59953 + if (ret)
59954 + return ret;
59955 +
59956 + data = bnode_working_data(bnode);
59957 + start = *start_offset;
59958 +
59959 + while (1) {
59960 + bmap_off_t end, search_end;
59961 +
59962 + /* Find the beginning of the zero filled region */
59963 + if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
59964 + break;
59965 + /* Is there more than `min_len' bits from `start' to
59966 + * `end_offset'? */
59967 + if (start < end_offset + min_len - 1)
59968 + break;
59969 +
59970 + /* Do not search to `end_offset' if we need to find less than
59971 + * `max_len' zero bits. */
59972 + if (end_offset + max_len - 1 < start)
59973 + search_end = start - max_len + 1;
59974 + else
59975 + search_end = end_offset;
59976 +
59977 + if (reiser4_find_last_set_bit(&end, data, search_end, start))
59978 + end = search_end;
59979 + else
59980 + end++;
59981 +
59982 + if (end + min_len <= start + 1) {
59983 + if (end < search_end)
59984 + end = search_end;
59985 + ret = start - end + 1;
59986 + *start_offset = end; /* `end' is lowest offset */
59987 + assert("zam-987",
59988 + reiser4_find_next_set_bit(data, start + 1,
59989 + end) >= start + 1);
59990 + reiser4_set_bits(data, end, start + 1);
59991 + break;
59992 + }
59993 +
59994 + if (end <= end_offset)
59995 + /* left search boundary reached. */
59996 + break;
59997 + start = end - 1;
59998 + }
59999 +
60000 + release_and_unlock_bnode(bnode);
60001 + return ret;
60002 +}
60003 +
60004 +/* allocate contiguous range of blocks in bitmap */
60005 +static int bitmap_alloc_forward(reiser4_block_nr * start,
60006 + const reiser4_block_nr * end, int min_len,
60007 + int max_len)
60008 +{
60009 + bmap_nr_t bmap, end_bmap;
60010 + bmap_off_t offset, end_offset;
60011 + int len;
60012 +
60013 + reiser4_block_nr tmp;
60014 +
60015 + struct super_block *super = get_current_context()->super;
60016 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60017 +
60018 + parse_blocknr(start, &bmap, &offset);
60019 +
60020 + tmp = *end - 1;
60021 + parse_blocknr(&tmp, &end_bmap, &end_offset);
60022 + ++end_offset;
60023 +
60024 + assert("zam-358", end_bmap >= bmap);
60025 + assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
60026 +
60027 + for (; bmap < end_bmap; bmap++, offset = 0) {
60028 + len =
60029 + search_one_bitmap_forward(bmap, &offset, max_offset,
60030 + min_len, max_len);
60031 + if (len != 0)
60032 + goto out;
60033 + }
60034 +
60035 + len =
60036 + search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
60037 + max_len);
60038 + out:
60039 + *start = bmap * max_offset + offset;
60040 + return len;
60041 +}
60042 +
60043 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
60044 + * backward direction) */
60045 +static int bitmap_alloc_backward(reiser4_block_nr * start,
60046 + const reiser4_block_nr * end, int min_len,
60047 + int max_len)
60048 +{
60049 + bmap_nr_t bmap, end_bmap;
60050 + bmap_off_t offset, end_offset;
60051 + int len;
60052 + struct super_block *super = get_current_context()->super;
60053 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60054 +
60055 + parse_blocknr(start, &bmap, &offset);
60056 + parse_blocknr(end, &end_bmap, &end_offset);
60057 +
60058 + assert("zam-961", end_bmap <= bmap);
60059 + assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
60060 +
60061 + for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
60062 + len =
60063 + search_one_bitmap_backward(bmap, &offset, 0, min_len,
60064 + max_len);
60065 + if (len != 0)
60066 + goto out;
60067 + }
60068 +
60069 + len =
60070 + search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
60071 + max_len);
60072 + out:
60073 + *start = bmap * max_offset + offset;
60074 + return len;
60075 +}
60076 +
60077 +/* plugin->u.space_allocator.alloc_blocks() */
60078 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
60079 + reiser4_block_nr *start, reiser4_block_nr *len)
60080 +{
60081 + struct super_block *super = get_current_context()->super;
60082 + int actual_len;
60083 +
60084 + reiser4_block_nr search_start;
60085 + reiser4_block_nr search_end;
60086 +
60087 + assert("zam-398", super != NULL);
60088 + assert("zam-412", hint != NULL);
60089 + assert("zam-397", hint->blk <= reiser4_block_count(super));
60090 +
60091 + if (hint->max_dist == 0)
60092 + search_end = reiser4_block_count(super);
60093 + else
60094 + search_end =
60095 + LIMIT(hint->blk + hint->max_dist,
60096 + reiser4_block_count(super));
60097 +
60098 + /* We use @hint -> blk as a search start and search from it to the end
60099 + of the disk or in given region if @hint -> max_dist is not zero */
60100 + search_start = hint->blk;
60101 +
60102 + actual_len =
60103 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60104 +
60105 + /* There is only one bitmap search if max_dist was specified or first
60106 + pass was from the beginning of the bitmap. We also do one pass for
60107 + scanning bitmap in backward direction. */
60108 + if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
60109 + /* next step is a scanning from 0 to search_start */
60110 + search_end = search_start;
60111 + search_start = 0;
60112 + actual_len =
60113 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60114 + }
60115 + if (actual_len == 0)
60116 + return RETERR(-ENOSPC);
60117 + if (actual_len < 0)
60118 + return RETERR(actual_len);
60119 + *len = actual_len;
60120 + *start = search_start;
60121 + return 0;
60122 +}
60123 +
60124 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
60125 + reiser4_block_nr * start,
60126 + reiser4_block_nr * len)
60127 +{
60128 + reiser4_block_nr search_start;
60129 + reiser4_block_nr search_end;
60130 + int actual_len;
60131 +
60132 + ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
60133 +
60134 + assert("zam-969", super != NULL);
60135 + assert("zam-970", hint != NULL);
60136 + assert("zam-971", hint->blk <= reiser4_block_count(super));
60137 +
60138 + search_start = hint->blk;
60139 + if (hint->max_dist == 0 || search_start <= hint->max_dist)
60140 + search_end = 0;
60141 + else
60142 + search_end = search_start - hint->max_dist;
60143 +
60144 + actual_len =
60145 + bitmap_alloc_backward(&search_start, &search_end, 1, needed);
60146 + if (actual_len == 0)
60147 + return RETERR(-ENOSPC);
60148 + if (actual_len < 0)
60149 + return RETERR(actual_len);
60150 + *len = actual_len;
60151 + *start = search_start;
60152 + return 0;
60153 +}
60154 +
60155 +/* plugin->u.space_allocator.alloc_blocks() */
60156 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
60157 + reiser4_blocknr_hint * hint, int needed,
60158 + reiser4_block_nr * start, reiser4_block_nr * len)
60159 +{
60160 + if (hint->backward)
60161 + return alloc_blocks_backward(hint, needed, start, len);
60162 + return alloc_blocks_forward(hint, needed, start, len);
60163 +}
60164 +
60165 +/* plugin->u.space_allocator.dealloc_blocks(). */
60166 +/* It just frees blocks in WORKING BITMAP. Usually deletion of formatted and
60167 + unformatted nodes is deferred until transaction commit. However, deallocation
60168 + of temporary objects like wandered blocks and transaction commit records
60169 + requires immediate node deletion from WORKING BITMAP.*/
60170 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
60171 + reiser4_block_nr start, reiser4_block_nr len)
60172 +{
60173 + struct super_block *super = reiser4_get_current_sb();
60174 +
60175 + bmap_nr_t bmap;
60176 + bmap_off_t offset;
60177 +
60178 + struct bitmap_node *bnode;
60179 + int ret;
60180 +
60181 + assert("zam-468", len != 0);
60182 + check_block_range(&start, &len);
60183 +
60184 + parse_blocknr(&start, &bmap, &offset);
60185 +
60186 + assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
60187 +
60188 + bnode = get_bnode(super, bmap);
60189 +
60190 + assert("zam-470", bnode != NULL);
60191 +
60192 + ret = load_and_lock_bnode(bnode);
60193 + assert("zam-481", ret == 0);
60194 +
60195 + reiser4_clear_bits(bnode_working_data(bnode), offset,
60196 + (bmap_off_t) (offset + len));
60197 +
60198 + adjust_first_zero_bit(bnode, offset);
60199 +
60200 + release_and_unlock_bnode(bnode);
60201 +}
60202 +
60203 +/* plugin->u.space_allocator.check_blocks(). */
60204 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
60205 + const reiser4_block_nr * len, int desired)
60206 +{
60207 +#if REISER4_DEBUG
60208 + struct super_block *super = reiser4_get_current_sb();
60209 +
60210 + bmap_nr_t bmap;
60211 + bmap_off_t start_offset;
60212 + bmap_off_t end_offset;
60213 +
60214 + struct bitmap_node *bnode;
60215 + int ret;
60216 +
60217 + assert("zam-622", len != NULL);
60218 + check_block_range(start, len);
60219 + parse_blocknr(start, &bmap, &start_offset);
60220 +
60221 + end_offset = start_offset + *len;
60222 + assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
60223 +
60224 + bnode = get_bnode(super, bmap);
60225 +
60226 + assert("nikita-2215", bnode != NULL);
60227 +
60228 + ret = load_and_lock_bnode(bnode);
60229 + assert("zam-626", ret == 0);
60230 +
60231 + assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
60232 +
60233 + if (desired) {
60234 + assert("zam-623",
60235 + reiser4_find_next_zero_bit(bnode_working_data(bnode),
60236 + end_offset, start_offset)
60237 + >= end_offset);
60238 + } else {
60239 + assert("zam-624",
60240 + reiser4_find_next_set_bit(bnode_working_data(bnode),
60241 + end_offset, start_offset)
60242 + >= end_offset);
60243 + }
60244 +
60245 + release_and_unlock_bnode(bnode);
60246 +#endif
60247 +}
60248 +
60249 +/* conditional insertion of @node into atom's overwrite set if it was not there */
60250 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
60251 +{
60252 + assert("zam-546", atom != NULL);
60253 + assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
60254 + assert("zam-548", node != NULL);
60255 +
60256 + spin_lock_atom(atom);
60257 + spin_lock_jnode(node);
60258 +
60259 + if (node->atom == NULL) {
60260 + JF_SET(node, JNODE_OVRWR);
60261 + insert_into_atom_ovrwr_list(atom, node);
60262 + } else {
60263 + assert("zam-549", node->atom == atom);
60264 + }
60265 +
60266 + spin_unlock_jnode(node);
60267 + spin_unlock_atom(atom);
60268 +}
60269 +
60270 +/* an actor which applies delete set to COMMIT bitmap pages and link modified
60271 + pages in a single-linked list */
60272 +static int
60273 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
60274 + const reiser4_block_nr * len, void *data)
60275 +{
60276 +
60277 + bmap_nr_t bmap;
60278 + bmap_off_t offset;
60279 + int ret;
60280 +
60281 + long long *blocks_freed_p = data;
60282 +
60283 + struct bitmap_node *bnode;
60284 +
60285 + struct super_block *sb = reiser4_get_current_sb();
60286 +
60287 + check_block_range(start, len);
60288 +
60289 + parse_blocknr(start, &bmap, &offset);
60290 +
60291 + /* FIXME-ZAM: we assume that all block ranges are allocated by this
60292 + bitmap-based allocator and each block range can't go over a zone of
60293 + responsibility of one bitmap block; same assumption is used in
60294 + other journal hooks in bitmap code. */
60295 + bnode = get_bnode(sb, bmap);
60296 + assert("zam-448", bnode != NULL);
60297 +
60298 + /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
60299 + assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
60300 + ret = load_and_lock_bnode(bnode);
60301 + if (ret)
60302 + return ret;
60303 +
60304 + /* put bnode into atom's overwrite set */
60305 + cond_add_to_overwrite_set(atom, bnode->cjnode);
60306 +
60307 + data = bnode_commit_data(bnode);
60308 +
60309 + ret = bnode_check_crc(bnode);
60310 + if (ret != 0)
60311 + return ret;
60312 +
60313 + if (len != NULL) {
60314 + /* FIXME-ZAM: a check that all bits are set should be there */
60315 + assert("zam-443",
60316 + offset + *len <= bmap_bit_count(sb->s_blocksize));
60317 + reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
60318 +
60319 + (*blocks_freed_p) += *len;
60320 + } else {
60321 + reiser4_clear_bit(offset, data);
60322 + (*blocks_freed_p)++;
60323 + }
60324 +
60325 + bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
60326 +
60327 + release_and_unlock_bnode(bnode);
60328 +
60329 + return 0;
60330 +}
60331 +
60332 +/* plugin->u.space_allocator.pre_commit_hook(). */
60333 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
60334 + rest is done by transaction manager (allocate wandered locations for COMMIT
60335 + BITMAP blocks, copy COMMIT BITMAP blocks data). */
60336 +/* Only one instance of this function can be running at one given time, because
60337 + only one transaction can be committed at a time, therefore it is safe to access
60338 + some global variables without any locking */
60339 +
60340 +int reiser4_pre_commit_hook_bitmap(void)
60341 +{
60342 +	struct super_block *super = reiser4_get_current_sb();
60343 +	txn_atom *atom;
60344 +
60345 +	long long blocks_freed = 0;
60346 +
60347 +	atom = get_current_atom_locked();
60348 +	assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
60349 +	spin_unlock_atom(atom);
60350 +
60351 +	{	/* scan atom's captured list and find all freshly allocated nodes,
60352 +		 * mark corresponding bits in COMMIT BITMAP as used */
60353 +		struct list_head *head = ATOM_CLEAN_LIST(atom);
60354 +		jnode *node = list_entry(head->next, jnode, capture_link);
60355 +
60356 +		while (head != &node->capture_link) {
60357 +			/* we detect freshly allocated jnodes */
60358 +			if (JF_ISSET(node, JNODE_RELOC)) {
60359 +				int ret;
60360 +				bmap_nr_t bmap;
60361 +
60362 +				bmap_off_t offset;
60363 +				bmap_off_t index;
60364 +				struct bitmap_node *bn;
60365 +				__u32 size = bmap_size(super->s_blocksize);
60366 +				__u32 crc;
60367 +				char byte;
60368 +
60369 +				assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
60370 +				assert("zam-460",
60371 +				       !reiser4_blocknr_is_fake(&node->blocknr));
60372 +
60373 +				parse_blocknr(&node->blocknr, &bmap, &offset);
60374 +				bn = get_bnode(super, bmap);
60375 +
60376 +				index = offset >> 3;
60377 +				assert("vpf-276", index < size);
60378 +
60379 +				ret = bnode_check_crc(bn); /* was "bnode": not declared in this scope */
60380 +				if (ret != 0)
60381 +					return ret;
60382 +
60383 +				check_bnode_loaded(bn);
60384 +				load_and_lock_bnode(bn);
60385 +
60386 +				byte = *(bnode_commit_data(bn) + index);
60387 +				reiser4_set_bit(offset, bnode_commit_data(bn));
60388 +
60389 +				crc = adler32_recalc(bnode_commit_crc(bn), byte,
60390 +						     *(bnode_commit_data(bn) +
60391 +						       index),
60392 +						     size - index); /* was ',' (comma operator), meant ';' */
60393 +				bnode_set_commit_crc(bn, crc);
60394 +
60395 +				release_and_unlock_bnode(bn);
60396 +
60397 +				ret = bnode_check_crc(bn);
60398 +				if (ret != 0)
60399 +					return ret;
60400 +
60401 +				/* working of this depends on how it inserts
60402 +				   new j-node into clean list, because we are
60403 +				   scanning the same list now. It is OK, if
60404 +				   insertion is done to the list front */
60405 +				cond_add_to_overwrite_set(atom, bn->cjnode);
60406 +			}
60407 +
60408 +			node = list_entry(node->capture_link.next, jnode, capture_link);
60409 +		}
60410 +	}
60411 +
60412 +	blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
60413 +			     &blocks_freed, 0);
60414 +
60415 +	blocks_freed -= atom->nr_blocks_allocated;
60416 +
60417 +	{
60418 +		reiser4_super_info_data *sbinfo;
60419 +
60420 +		sbinfo = get_super_private(super);
60421 +
60422 +		spin_lock_reiser4_super(sbinfo);
60423 +		sbinfo->blocks_free_committed += blocks_freed;
60424 +		spin_unlock_reiser4_super(sbinfo);
60425 +	}
60426 +
60427 +	return 0;
60428 +}
60429 +
60430 +/* plugin->u.space_allocator.init_allocator
60431 + constructor of reiser4_space_allocator object. It is called on fs mount */
60432 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
60433 + struct super_block *super, void *arg)
60434 +{
60435 + struct bitmap_allocator_data *data = NULL;
60436 + bmap_nr_t bitmap_blocks_nr;
60437 + bmap_nr_t i;
60438 +
60439 + assert("nikita-3039", reiser4_schedulable());
60440 +
60441 + /* getting memory for bitmap allocator private data holder */
60442 + data =
60443 + kmalloc(sizeof(struct bitmap_allocator_data),
60444 + reiser4_ctx_gfp_mask_get());
60445 +
60446 + if (data == NULL)
60447 + return RETERR(-ENOMEM);
60448 +
60449 + /* allocation and initialization for the array of bnodes */
60450 + bitmap_blocks_nr = get_nr_bmap(super);
60451 +
60452 + /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
60453 + which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
60454 + may I never meet someone who still uses the ia32 architecture when
60455 + storage devices of that size enter the market, and wants to use ia32
60456 + with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
60457 + probably, another dynamic data structure should replace a static
60458 + array of bnodes. */
60459 + /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
60460 + data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
60461 + if (data->bitmap == NULL) {
60462 + kfree(data);
60463 + return RETERR(-ENOMEM);
60464 + }
60465 +
60466 + for (i = 0; i < bitmap_blocks_nr; i++)
60467 + init_bnode(data->bitmap + i, super, i);
60468 +
60469 + allocator->u.generic = data;
60470 +
60471 +#if REISER4_DEBUG
60472 + get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
60473 +#endif
60474 +
60475 + /* Load all bitmap blocks at mount time. */
60476 + if (!test_bit
60477 + (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
60478 + __u64 start_time, elapsed_time;
60479 + struct bitmap_node *bnode;
60480 + int ret;
60481 +
60482 + if (REISER4_DEBUG)
60483 + printk(KERN_INFO "loading reiser4 bitmap...");
60484 + start_time = jiffies;
60485 +
60486 + for (i = 0; i < bitmap_blocks_nr; i++) {
60487 + bnode = data->bitmap + i;
60488 + ret = load_and_lock_bnode(bnode);
60489 + if (ret) {
60490 + reiser4_destroy_allocator_bitmap(allocator,
60491 + super);
60492 + return ret;
60493 + }
60494 + release_and_unlock_bnode(bnode);
60495 + }
60496 +
60497 + elapsed_time = jiffies - start_time;
60498 + if (REISER4_DEBUG)
60499 + printk("...done (%llu jiffies)\n",
60500 + (unsigned long long)elapsed_time);
60501 + }
60502 +
60503 + return 0;
60504 +}
60505 +
60506 +/* plugin->u.space_allocator.destroy_allocator
60507 + destructor. It is called on fs unmount */
60508 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
60509 +				      struct super_block *super)
60510 +{
60511 +	bmap_nr_t bitmap_blocks_nr;
60512 +	bmap_nr_t i;
60513 +
60514 +	struct bitmap_allocator_data *data = allocator->u.generic;
60515 +
60516 +	assert("zam-414", data != NULL);
60517 +	assert("zam-376", data->bitmap != NULL);
60518 +
60519 +	bitmap_blocks_nr = get_nr_bmap(super);
60520 +
60521 +	for (i = 0; i < bitmap_blocks_nr; i++) {
60522 +		struct bitmap_node *bnode = data->bitmap + i;
60523 +
60524 +		mutex_lock(&bnode->mutex);
60525 +
60526 +#if REISER4_DEBUG
60527 +		if (atomic_read(&bnode->loaded)) {
60528 +			jnode *wj = bnode->wjnode;
60529 +			jnode *cj = bnode->cjnode;
60530 +
60531 +			assert("zam-480", jnode_page(cj) != NULL);
60532 +			assert("zam-633", jnode_page(wj) != NULL);
60533 +
60534 +			assert("zam-634", /* working bitmap must match commit bitmap at unmount */
60535 +			       memcmp(jdata(wj), jdata(cj), /* was jdata(wj) twice: self-compare, always 0 */
60536 +				      bmap_size(super->s_blocksize)) == 0);
60537 +
60538 +		}
60539 +#endif
60540 +		done_bnode(bnode);
60541 +		mutex_unlock(&bnode->mutex);
60542 +	}
60543 +
60544 +	vfree(data->bitmap);
60545 +	kfree(data);
60546 +
60547 +	allocator->u.generic = NULL;
60548 +
60549 +	return 0;
60550 +}
60551 +
60552 +/*
60553 + * Local variables:
60554 + * c-indentation-style: "K&R"
60555 + * mode-name: "LC"
60556 + * c-basic-offset: 8
60557 + * tab-width: 8
60558 + * fill-column: 79
60559 + * scroll-step: 1
60560 + * End:
60561 + */
60562 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.23/fs/reiser4/plugin/space/bitmap.h
60563 --- linux-2.6.23.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
60564 +++ linux-2.6.23/fs/reiser4/plugin/space/bitmap.h 2007-12-04 16:49:30.000000000 +0300
60565 @@ -0,0 +1,47 @@
60566 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60567 +
60568 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
60569 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
60570 +
60571 +#include "../../dformat.h"
60572 +#include "../../block_alloc.h"
60573 +
60574 +#include <linux/types.h> /* for __u?? */
60575 +#include <linux/fs.h> /* for struct super_block */
60576 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
60577 +/* declarations of functions implementing methods of space allocator plugin for
60578 + bitmap based allocator. The functions themselves are in bitmap.c */
60579 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
60580 + struct super_block *, void *);
60581 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
60582 + struct super_block *);
60583 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
60584 + reiser4_blocknr_hint *, int needed,
60585 + reiser4_block_nr * start,
60586 + reiser4_block_nr * len);
60587 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
60588 + const reiser4_block_nr *, int);
60589 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
60590 + reiser4_block_nr,
60591 + reiser4_block_nr);
60592 +extern int reiser4_pre_commit_hook_bitmap(void);
60593 +
60594 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
60595 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
60596 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
60597 +
60598 +typedef __u64 bmap_nr_t;
60599 +typedef __u32 bmap_off_t;
60600 +
60601 +#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
60602 +
60603 +/* Make Linus happy.
60604 + Local variables:
60605 + c-indentation-style: "K&R"
60606 + mode-name: "LC"
60607 + c-basic-offset: 8
60608 + tab-width: 8
60609 + fill-column: 120
60610 + scroll-step: 1
60611 + End:
60612 +*/
60613 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/Makefile linux-2.6.23/fs/reiser4/plugin/space/Makefile
60614 --- linux-2.6.23.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
60615 +++ linux-2.6.23/fs/reiser4/plugin/space/Makefile 2007-12-04 16:49:30.000000000 +0300
60616 @@ -0,0 +1,4 @@
60617 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
60618 +
60619 +space_plugins-objs := \
60620 + bitmap.o
60621 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.23/fs/reiser4/plugin/space/space_allocator.h
60622 --- linux-2.6.23.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
60623 +++ linux-2.6.23/fs/reiser4/plugin/space/space_allocator.h 2007-12-04 16:49:30.000000000 +0300
60624 @@ -0,0 +1,80 @@
60625 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60626 +
60627 +#ifndef __SPACE_ALLOCATOR_H__
60628 +#define __SPACE_ALLOCATOR_H__
60629 +
60630 +#include "../../forward.h"
60631 +#include "bitmap.h"
60632 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
60633 + * but... */
60634 +#define DEF_SPACE_ALLOCATOR(allocator) \
60635 + \
60636 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
60637 +{ \
60638 + return reiser4_init_allocator_##allocator (al, s, opaque); \
60639 +} \
60640 + \
60641 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
60642 +{ \
60643 + reiser4_destroy_allocator_##allocator (al, s); \
60644 +} \
60645 + \
60646 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
60647 + int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
60648 +{ \
60649 + return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
60650 +} \
60651 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
60652 +{ \
60653 + reiser4_dealloc_blocks_##allocator (al, start, len); \
60654 +} \
60655 + \
60656 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
60657 +{ \
60658 + reiser4_check_blocks_##allocator (start, end, desired); \
60659 +} \
60660 + \
60661 +static inline void sa_pre_commit_hook (void) \
60662 +{ \
60663 + reiser4_pre_commit_hook_##allocator (); \
60664 +} \
60665 + \
60666 +static inline void sa_post_commit_hook (void) \
60667 +{ \
60668 + reiser4_post_commit_hook_##allocator (); \
60669 +} \
60670 + \
60671 +static inline void sa_post_write_back_hook (void) \
60672 +{ \
60673 + reiser4_post_write_back_hook_##allocator(); \
60674 +} \
60675 + \
60676 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
60677 +{ \
60678 + reiser4_print_info_##allocator (prefix, al); \
60679 +}
60680 +
60681 +DEF_SPACE_ALLOCATOR(bitmap)
60682 +
60683 +/* this object is part of reiser4 private in-core super block */
60684 +struct reiser4_space_allocator {
60685 + union {
60686 + /* space allocators might use this pointer to reference their
60687 + * data. */
60688 + void *generic;
60689 + } u;
60690 +};
60691 +
60692 +/* __SPACE_ALLOCATOR_H__ */
60693 +#endif
60694 +
60695 +/* Make Linus happy.
60696 + Local variables:
60697 + c-indentation-style: "K&R"
60698 + mode-name: "LC"
60699 + c-basic-offset: 8
60700 + tab-width: 8
60701 + fill-column: 120
60702 + scroll-step: 1
60703 + End:
60704 +*/
60705 diff -urN linux-2.6.23.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.23/fs/reiser4/plugin/tail_policy.c
60706 --- linux-2.6.23.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
60707 +++ linux-2.6.23/fs/reiser4/plugin/tail_policy.c 2007-12-04 16:49:30.000000000 +0300
60708 @@ -0,0 +1,113 @@
60709 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60710 + * reiser4/README */
60711 +
60712 +/* Formatting policy plugins */
60713 +
60714 +/*
60715 + * Formatting policy plugin is used by object plugin (of regular file) to
60716 + * convert file between two representations.
60717 + *
60718 + * Currently following policies are implemented:
60719 + * never store file in formatted nodes
60720 + * always store file in formatted nodes
60721 + * store file in formatted nodes if file is smaller than 4 blocks (default)
60722 + */
60723 +
60724 +#include "../tree.h"
60725 +#include "../inode.h"
60726 +#include "../super.h"
60727 +#include "object.h"
60728 +#include "plugin.h"
60729 +#include "node/node.h"
60730 +#include "plugin_header.h"
60731 +
60732 +#include <linux/pagemap.h>
60733 +#include <linux/fs.h> /* For struct inode */
60734 +
60735 +/**
60736 + * have_formatting_never -
60737 + * @inode:
60738 + * @size:
60739 + *
60740 + *
60741 + */
60742 +/* Never store file's tail as direct item */
60743 +/* Audited by: green(2002.06.12) */
60744 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
60745 + /* inode to operate on */ ,
60746 + loff_t size UNUSED_ARG /* new object size */ )
60747 +{
60748 + return 0;
60749 +}
60750 +
60751 +/* Always store file's tail as direct item */
60752 +/* Audited by: green(2002.06.12) */
60753 +static int
60754 +have_formatting_always(const struct inode *inode UNUSED_ARG
60755 + /* inode to operate on */ ,
60756 + loff_t size UNUSED_ARG /* new object size */ )
60757 +{
60758 + return 1;
60759 +}
60760 +
60761 +/* This function tests whether the file denoted by @inode should be stored as
60762 + tails only or as extents only. */
60763 +static int
60764 +have_formatting_default(const struct inode *inode UNUSED_ARG
60765 + /* inode to operate on */ ,
60766 + loff_t size /* new object size */ )
60767 +{
60768 + assert("umka-1253", inode != NULL);
60769 +
60770 + if (size > inode->i_sb->s_blocksize * 4)
60771 + return 0;
60772 +
60773 + return 1;
60774 +}
60775 +
60776 +/* tail plugins */
60777 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
60778 + [NEVER_TAILS_FORMATTING_ID] = {
60779 + .h = {
60780 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60781 + .id = NEVER_TAILS_FORMATTING_ID,
60782 + .pops = NULL,
60783 + .label = "never",
60784 + .desc = "Never store file's tail",
60785 + .linkage = {NULL, NULL}
60786 + },
60787 + .have_tail = have_formatting_never
60788 + },
60789 + [ALWAYS_TAILS_FORMATTING_ID] = {
60790 + .h = {
60791 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60792 + .id = ALWAYS_TAILS_FORMATTING_ID,
60793 + .pops = NULL,
60794 + .label = "always",
60795 + .desc = "Always store file's tail",
60796 + .linkage = {NULL, NULL}
60797 + },
60798 + .have_tail = have_formatting_always
60799 + },
60800 + [SMALL_FILE_FORMATTING_ID] = {
60801 + .h = {
60802 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60803 + .id = SMALL_FILE_FORMATTING_ID,
60804 + .pops = NULL,
60805 + .label = "4blocks",
60806 + .desc = "store files shorter than 4 blocks in tail items",
60807 + .linkage = {NULL, NULL}
60808 + },
60809 + .have_tail = have_formatting_default
60810 + }
60811 +};
60812 +
60813 +/*
60814 + * Local variables:
60815 + * c-indentation-style: "K&R"
60816 + * mode-name: "LC"
60817 + * c-basic-offset: 8
60818 + * tab-width: 8
60819 + * fill-column: 79
60820 + * End:
60821 + */
60822 diff -urN linux-2.6.23.orig/fs/reiser4/pool.c linux-2.6.23/fs/reiser4/pool.c
60823 --- linux-2.6.23.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
60824 +++ linux-2.6.23/fs/reiser4/pool.c 2007-12-04 16:49:30.000000000 +0300
60825 @@ -0,0 +1,231 @@
60826 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60827 + * reiser4/README */
60828 +
60829 +/* Fast pool allocation.
60830 +
60831 + There are situations when some sub-system normally asks memory allocator
60832 + for only few objects, but under some circumstances could require much
60833 + more. Typical and actually motivating example is tree balancing. It needs
60834 + to keep track of nodes that were involved into it, and it is well-known
60835 + that in reasonable packed balanced tree most (92.938121%) percent of all
60836 + balancings end up after working with only few nodes (3.141592 on
60837 + average). But in rare cases balancing can involve much more nodes
60838 + (3*tree_height+1 in extremal situation).
60839 +
60840 + On the one hand, we don't want to resort to dynamic allocation (slab,
60841 + malloc(), etc.) to allocate data structures required to keep track of
60842 + nodes during balancing. On the other hand, we cannot statically allocate
60843 + required amount of space on the stack, because first: it is useless wastage
60844 + of precious resource, and second: this amount is unknown in advance (tree
60845 + height can change).
60846 +
60847 + Pools, implemented in this file are solution for this problem:
60848 +
60849 + - some configurable amount of objects is statically preallocated on the
60850 + stack
60851 +
60852 + - if this preallocated pool is exhausted and more objects is requested
60853 + they are allocated dynamically.
60854 +
60855 + Pools encapsulate distinction between statically and dynamically allocated
60856 + objects. Both allocation and recycling look exactly the same.
60857 +
60858 + To keep track of dynamically allocated objects, pool adds its own linkage
60859 + to each object.
60860 +
60861 + NOTE-NIKITA This linkage also contains some balancing-specific data. This
60862 + is not perfect. On the other hand, balancing is currently the only client
60863 + of pool code.
60864 +
60865 + NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
60866 + functions in the style of tslist/tshash, i.e., make them unreadable, but
60867 + type-safe.
60868 +
60869 +*/
60870 +
60871 +#include "debug.h"
60872 +#include "pool.h"
60873 +#include "super.h"
60874 +
60875 +#include <linux/types.h>
60876 +#include <linux/err.h>
60877 +
60878 +/* initialize new pool object @h */
60879 +static void reiser4_init_pool_obj(struct reiser4_pool_header * h)
60880 +{
60881 + INIT_LIST_HEAD(&h->usage_linkage);
60882 + INIT_LIST_HEAD(&h->level_linkage);
60883 + INIT_LIST_HEAD(&h->extra_linkage);
60884 +}
60885 +
60886 +/* initialize new pool */
60887 +void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ ,
60888 + size_t obj_size /* size of objects in @pool */ ,
60889 + int num_of_objs /* number of preallocated objects */ ,
60890 + char *data /* area for preallocated objects */ )
60891 +{
60892 + struct reiser4_pool_header *h;
60893 + int i;
60894 +
60895 + assert("nikita-955", pool != NULL);
60896 + assert("nikita-1044", obj_size > 0);
60897 + assert("nikita-956", num_of_objs >= 0);
60898 + assert("nikita-957", data != NULL);
60899 +
60900 + memset(pool, 0, sizeof *pool);
60901 + pool->obj_size = obj_size;
60902 + pool->data = data;
60903 + INIT_LIST_HEAD(&pool->free);
60904 + INIT_LIST_HEAD(&pool->used);
60905 + INIT_LIST_HEAD(&pool->extra);
60906 + memset(data, 0, obj_size * num_of_objs);
60907 + for (i = 0; i < num_of_objs; ++i) {
60908 + h = (struct reiser4_pool_header *) (data + i * obj_size);
60909 + reiser4_init_pool_obj(h);
60910 + /* add pool header to the end of pool's free list */
60911 + list_add_tail(&h->usage_linkage, &pool->free);
60912 + }
60913 +}
60914 +
60915 +/* release pool resources
60916 +
60917 + Release all resources acquired by this pool, specifically, dynamically
60918 + allocated objects.
60919 +
60920 +*/
60921 +void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG)
60922 +{
60923 +}
60924 +
60925 +/* allocate carry object from @pool
60926 +
60927 + First, try to get preallocated object. If this fails, resort to dynamic
60928 + allocation.
60929 +
60930 +*/
60931 +static void *reiser4_pool_alloc(struct reiser4_pool * pool)
60932 +{
60933 + struct reiser4_pool_header *result;
60934 +
60935 + assert("nikita-959", pool != NULL);
60936 +
60937 + if (!list_empty(&pool->free)) {
60938 + struct list_head *linkage;
60939 +
60940 + linkage = pool->free.next;
60941 + list_del(linkage);
60942 + INIT_LIST_HEAD(linkage);
60943 + result = list_entry(linkage, struct reiser4_pool_header,
60944 + usage_linkage);
60945 + BUG_ON(!list_empty(&result->level_linkage) ||
60946 + !list_empty(&result->extra_linkage));
60947 + } else {
60948 + /* pool is empty. Extra allocations don't deserve dedicated
60949 + slab to be served from, as they are expected to be rare. */
60950 + result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
60951 + if (result != 0) {
60952 + reiser4_init_pool_obj(result);
60953 + list_add(&result->extra_linkage, &pool->extra);
60954 + } else
60955 + return ERR_PTR(RETERR(-ENOMEM));
60956 + BUG_ON(!list_empty(&result->usage_linkage) ||
60957 + !list_empty(&result->level_linkage));
60958 + }
60959 + ++pool->objs;
60960 + list_add(&result->usage_linkage, &pool->used);
60961 + memset(result + 1, 0, pool->obj_size - sizeof *result);
60962 + return result;
60963 +}
60964 +
60965 +/* return object back to the pool */
60966 +void reiser4_pool_free(struct reiser4_pool * pool,
60967 + struct reiser4_pool_header * h)
60968 +{
60969 + assert("nikita-961", h != NULL);
60970 + assert("nikita-962", pool != NULL);
60971 +
60972 + --pool->objs;
60973 + assert("nikita-963", pool->objs >= 0);
60974 +
60975 + list_del_init(&h->usage_linkage);
60976 + list_del_init(&h->level_linkage);
60977 +
60978 + if (list_empty(&h->extra_linkage))
60979 + /*
60980 + * pool header is not an extra one. Push it onto free list
60981 + * using usage_linkage
60982 + */
60983 + list_add(&h->usage_linkage, &pool->free);
60984 + else {
60985 + /* remove pool header from pool's extra list and kfree it */
60986 + list_del(&h->extra_linkage);
60987 + kfree(h);
60988 + }
60989 +}
60990 +
60991 +/* add new object to the carry level list
60992 +
60993 + Carry level is FIFO most of the time, but not always. Complications arise
60994 + when make_space() function tries to go to the left neighbor and thus adds
60995 + carry node before existing nodes, and also, when updating delimiting keys
60996 + after moving data between two nodes, we want left node to be locked before
60997 + right node.
60998 +
60999 + The latter case is confusing at first glance. The problem is that the COP_UPDATE
61000 + operation that updates delimiting keys is sometimes called with two nodes
61001 + (when data are moved between two nodes) and sometimes with only one node
61002 + (when leftmost item is deleted in a node). In any case operation is
61003 + supplied with at least node whose left delimiting key is to be updated
61004 + (that is "right" node).
61005 +
61006 + @pool - from which to allocate new object;
61007 + @list - where to add object;
61008 + @reference - after (or before) which existing object to add
61009 +*/
61010 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61011 + struct list_head *list,
61012 + pool_ordering order,
61013 + struct reiser4_pool_header * reference)
61014 +{
61015 + struct reiser4_pool_header *result;
61016 +
61017 + assert("nikita-972", pool != NULL);
61018 +
61019 + result = reiser4_pool_alloc(pool);
61020 + if (IS_ERR(result))
61021 + return result;
61022 +
61023 + assert("nikita-973", result != NULL);
61024 +
61025 + switch (order) {
61026 + case POOLO_BEFORE:
61027 + __list_add(&result->level_linkage,
61028 + reference->level_linkage.prev,
61029 + &reference->level_linkage);
61030 + break;
61031 + case POOLO_AFTER:
61032 + __list_add(&result->level_linkage,
61033 + &reference->level_linkage,
61034 + reference->level_linkage.next);
61035 + break;
61036 + case POOLO_LAST:
61037 + list_add_tail(&result->level_linkage, list);
61038 + break;
61039 + case POOLO_FIRST:
61040 + list_add(&result->level_linkage, list);
61041 + break;
61042 + default:
61043 + wrong_return_value("nikita-927", "order");
61044 + }
61045 + return result;
61046 +}
61047 +
61048 +/* Make Linus happy.
61049 + Local variables:
61050 + c-indentation-style: "K&R"
61051 + mode-name: "LC"
61052 + c-basic-offset: 8
61053 + tab-width: 8
61054 + fill-column: 120
61055 + End:
61056 +*/
61057 diff -urN linux-2.6.23.orig/fs/reiser4/pool.h linux-2.6.23/fs/reiser4/pool.h
61058 --- linux-2.6.23.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
61059 +++ linux-2.6.23/fs/reiser4/pool.h 2007-12-04 16:49:30.000000000 +0300
61060 @@ -0,0 +1,56 @@
61061 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61062 +
61063 +/* Fast pool allocation */
61064 +
61065 +#ifndef __REISER4_POOL_H__
61066 +#define __REISER4_POOL_H__
61067 +
61068 +#include <linux/types.h>
61069 +
61070 +struct reiser4_pool {
61071 + size_t obj_size;
61072 + int objs;
61073 + char *data;
61074 + struct list_head free;
61075 + struct list_head used;
61076 + struct list_head extra;
61077 +};
61078 +
61079 +struct reiser4_pool_header {
61080 + /* object is either on free or "used" lists */
61081 + struct list_head usage_linkage;
61082 + struct list_head level_linkage;
61083 + struct list_head extra_linkage;
61084 +};
61085 +
61086 +typedef enum {
61087 + POOLO_BEFORE,
61088 + POOLO_AFTER,
61089 + POOLO_LAST,
61090 + POOLO_FIRST
61091 +} pool_ordering;
61092 +
61093 +/* pool manipulation functions */
61094 +
61095 +extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size,
61096 + int num_of_objs, char *data);
61097 +extern void reiser4_done_pool(struct reiser4_pool * pool);
61098 +extern void reiser4_pool_free(struct reiser4_pool * pool,
61099 + struct reiser4_pool_header * h);
61100 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61101 + struct list_head * list,
61102 + pool_ordering order,
61103 + struct reiser4_pool_header *reference);
61104 +
61105 +/* __REISER4_POOL_H__ */
61106 +#endif
61107 +
61108 +/* Make Linus happy.
61109 + Local variables:
61110 + c-indentation-style: "K&R"
61111 + mode-name: "LC"
61112 + c-basic-offset: 8
61113 + tab-width: 8
61114 + fill-column: 120
61115 + End:
61116 +*/
61117 diff -urN linux-2.6.23.orig/fs/reiser4/readahead.c linux-2.6.23/fs/reiser4/readahead.c
61118 --- linux-2.6.23.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
61119 +++ linux-2.6.23/fs/reiser4/readahead.c 2007-12-04 16:49:30.000000000 +0300
61120 @@ -0,0 +1,138 @@
61121 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61122 + * reiser4/README */
61123 +
61124 +#include "forward.h"
61125 +#include "tree.h"
61126 +#include "tree_walk.h"
61127 +#include "super.h"
61128 +#include "inode.h"
61129 +#include "key.h"
61130 +#include "znode.h"
61131 +
61132 +#include <linux/swap.h> /* for totalram_pages */
61133 +
61134 +void reiser4_init_ra_info(ra_info_t * rai)
61135 +{
61136 + rai->key_to_stop = *reiser4_min_key();
61137 +}
61138 +
61139 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
61140 +static inline int ra_adjacent_only(int flags)
61141 +{
61142 + return flags & RA_ADJACENT_ONLY;
61143 +}
61144 +
61145 +/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1
61146 + if right neighbor's first key is less or equal to readahead's stop key */
61147 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
61148 +{
61149 + int result;
61150 +
61151 + read_lock_dk(znode_get_tree(node));
61152 + result = keyle(znode_get_rd_key(node), &info->key_to_stop);
61153 + read_unlock_dk(znode_get_tree(node));
61154 + return result;
61155 +}
61156 +
61157 +#define LOW_MEM_PERCENTAGE (5)
61158 +
61159 +static int low_on_memory(void)
61160 +{
61161 + unsigned int freepages;
61162 +
61163 + freepages = nr_free_pages();
61164 + return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
61165 +}
61166 +
61167 +/* start read for @node and for a few of its right neighbors */
61168 +void formatted_readahead(znode * node, ra_info_t * info)
61169 +{
61170 + struct formatted_ra_params *ra_params;
61171 + znode *cur;
61172 + int i;
61173 + int grn_flags;
61174 + lock_handle next_lh;
61175 +
61176 + /* do nothing if node block number has not been assigned to node (which means it is still in cache). */
61177 + if (reiser4_blocknr_is_fake(znode_get_block(node)))
61178 + return;
61179 +
61180 + ra_params = get_current_super_ra_params();
61181 +
61182 + if (znode_page(node) == NULL)
61183 + jstartio(ZJNODE(node));
61184 +
61185 + if (znode_get_level(node) != LEAF_LEVEL)
61186 + return;
61187 +
61188 + /* don't waste memory for read-ahead when low on memory */
61189 + if (low_on_memory())
61190 + return;
61191 +
61192 + /* We can have locked nodes on upper tree levels, in this situation lock
61193 + priorities do not help to resolve deadlocks, we have to use TRY_LOCK
61194 + here. */
61195 + grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
61196 +
61197 + i = 0;
61198 + cur = zref(node);
61199 + init_lh(&next_lh);
61200 + while (i < ra_params->max) {
61201 + const reiser4_block_nr *nextblk;
61202 +
61203 + if (!should_readahead_neighbor(cur, info))
61204 + break;
61205 +
61206 + if (reiser4_get_right_neighbor
61207 + (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
61208 + break;
61209 +
61210 + nextblk = znode_get_block(next_lh.node);
61211 + if (reiser4_blocknr_is_fake(nextblk) ||
61212 + (ra_adjacent_only(ra_params->flags)
61213 + && *nextblk != *znode_get_block(cur) + 1)) {
61214 + break;
61215 + }
61216 +
61217 + zput(cur);
61218 + cur = zref(next_lh.node);
61219 + done_lh(&next_lh);
61220 + if (znode_page(cur) == NULL)
61221 + jstartio(ZJNODE(cur));
61222 + else
61223 + /* Do not scan read-ahead window if pages already
61224 + * allocated (and i/o already started). */
61225 + break;
61226 +
61227 + i++;
61228 + }
61229 + zput(cur);
61230 + done_lh(&next_lh);
61231 +}
61232 +
61233 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
61234 +{
61235 + reiser4_key *stop_key;
61236 +
61237 + assert("nikita-3542", dir != NULL);
61238 + assert("nikita-3543", tap != NULL);
61239 +
61240 + stop_key = &tap->ra_info.key_to_stop;
61241 + /* initialize readdir readahead information: include into readahead
61242 + * stat data of all files of the directory */
61243 + set_key_locality(stop_key, get_inode_oid(dir));
61244 + set_key_type(stop_key, KEY_SD_MINOR);
61245 + set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
61246 + set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
61247 + set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
61248 +}
61249 +
61250 +/*
61251 + Local variables:
61252 + c-indentation-style: "K&R"
61253 + mode-name: "LC"
61254 + c-basic-offset: 8
61255 + tab-width: 8
61256 + fill-column: 80
61257 + End:
61258 +*/
61259 diff -urN linux-2.6.23.orig/fs/reiser4/readahead.h linux-2.6.23/fs/reiser4/readahead.h
61260 --- linux-2.6.23.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
61261 +++ linux-2.6.23/fs/reiser4/readahead.h 2007-12-04 16:49:30.000000000 +0300
61262 @@ -0,0 +1,51 @@
61263 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61264 +
61265 +#ifndef __READAHEAD_H__
61266 +#define __READAHEAD_H__
61267 +
61268 +#include "key.h"
61269 +
61270 +typedef enum {
61271 + RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
61272 + Default is NO (not only adjacent) */
61273 +} ra_global_flags;
61274 +
61275 +/* reiser4 super block has a field of this type.
61276 + It controls readahead during tree traversals */
61277 +struct formatted_ra_params {
61278 + unsigned long max; /* request not more than this amount of nodes.
61279 + Default is totalram_pages / 4 */
61280 + int flags;
61281 +};
61282 +
61283 +typedef struct {
61284 + reiser4_key key_to_stop;
61285 +} ra_info_t;
61286 +
61287 +void formatted_readahead(znode *, ra_info_t *);
61288 +void reiser4_init_ra_info(ra_info_t * rai);
61289 +
61290 +struct reiser4_file_ra_state {
61291 + loff_t start; /* Current window */
61292 + loff_t size;
61293 + loff_t next_size; /* Next window size */
61294 + loff_t ahead_start; /* Ahead window */
61295 + loff_t ahead_size;
61296 + loff_t max_window_size; /* Maximum readahead window */
61297 + loff_t slow_start; /* enlarging r/a size algorithm. */
61298 +};
61299 +
61300 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
61301 +
61302 +/* __READAHEAD_H__ */
61303 +#endif
61304 +
61305 +/*
61306 + Local variables:
61307 + c-indentation-style: "K&R"
61308 + mode-name: "LC"
61309 + c-basic-offset: 8
61310 + tab-width: 8
61311 + fill-column: 120
61312 + End:
61313 +*/
61314 diff -urN linux-2.6.23.orig/fs/reiser4/README linux-2.6.23/fs/reiser4/README
61315 --- linux-2.6.23.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
61316 +++ linux-2.6.23/fs/reiser4/README 2007-12-04 16:49:30.000000000 +0300
61317 @@ -0,0 +1,128 @@
61318 +[LICENSING]
61319 +
61320 +Reiser4 is hereby licensed under the GNU General
61321 +Public License version 2.
61322 +
61323 +Source code files that contain the phrase "licensing governed by
61324 +reiser4/README" are "governed files" throughout this file. Governed
61325 +files are licensed under the GPL. The portions of them owned by Hans
61326 +Reiser, or authorized to be licensed by him, have been in the past,
61327 +and likely will be in the future, licensed to other parties under
61328 +other licenses. If you add your code to governed files, and don't
61329 +want it to be owned by Hans Reiser, put your copyright label on that
61330 +code so the poor blight and his customers can keep things straight.
61331 +All portions of governed files not labeled otherwise are owned by Hans
61332 +Reiser, and by adding your code to it, widely distributing it to
61333 +others or sending us a patch, and leaving the sentence in stating that
61334 +licensing is governed by the statement in this file, you accept this.
61335 +It will be a kindness if you identify whether Hans Reiser is allowed
61336 +to license code labeled as owned by you on your behalf other than
61337 +under the GPL, because he wants to know if it is okay to do so and put
61338 +a check in the mail to you (for non-trivial improvements) when he
61339 +makes his next sale. He makes no guarantees as to the amount if any,
61340 +though he feels motivated to motivate contributors, and you can surely
61341 +discuss this with him before or after contributing. You have the
61342 +right to decline to allow him to license your code contribution other
61343 +than under the GPL.
61344 +
61345 +Further licensing options are available for commercial and/or other
61346 +interests directly from Hans Reiser: reiser@namesys.com. If you interpret
61347 +the GPL as not allowing those additional licensing options, you read
61348 +it wrongly, and Richard Stallman agrees with me, when carefully read
61349 +you can see that those restrictions on additional terms do not apply
61350 +to the owner of the copyright, and my interpretation of this shall
61351 +govern for this license.
61352 +
61353 +[END LICENSING]
61354 +
61355 +Reiser4 is a file system based on dancing tree algorithms, and is
61356 +described at http://www.namesys.com
61357 +
61358 +mkfs.reiser4 and other utilities are on our webpage or wherever your
61359 +Linux provider put them. You really want to be running the latest
61360 +version off the website if you use fsck.
61361 +
61362 +Yes, if you update your reiser4 kernel module you do have to
61363 +recompile your kernel, most of the time. The errors you get will be
61364 +quite cryptic if your forget to do so.
61365 +
61366 +Hideous Commercial Pitch: Spread your development costs across other OS
61367 +vendors. Select from the best in the world, not the best in your
61368 +building, by buying from third party OS component suppliers. Leverage
61369 +the software component development power of the internet. Be the most
61370 +aggressive in taking advantage of the commercial possibilities of
61371 +decentralized internet development, and add value through your branded
61372 +integration that you sell as an operating system. Let your competitors
61373 +be the ones to compete against the entire internet by themselves. Be
61374 +hip, get with the new economic trend, before your competitors do. Send
61375 +email to reiser@namesys.com
61376 +
61377 +Hans Reiser was the primary architect of Reiser4, but a whole team
61378 +chipped their ideas in. He invested everything he had into Namesys
61379 +for 5.5 dark years of no money before Reiser3 finally started to work well
61380 +enough to bring in money. He owns the copyright.
61381 +
61382 +DARPA was the primary sponsor of Reiser4. DARPA does not endorse
61383 +Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
61384 +opinion, unique in its willingness to invest into things more
61385 +theoretical than the VC community can readily understand, and more
61386 +longterm than allows them to be sure that they will be the ones to
61387 +extract the economic benefits from. DARPA also integrated us into a
61388 +security community that transformed our security worldview.
61389 +
61390 +Vladimir Saveliev is our lead programmer, with us from the beginning,
61391 +and he worked long hours writing the cleanest code. This is why he is
61392 +now the lead programmer after years of commitment to our work. He
61393 +always made the effort to be the best he could be, and to make his
61394 +code the best that it could be. What resulted was quite remarkable. I
61395 +don't think that money can ever motivate someone to work the way he
61396 +did, he is one of the most selfless men I know.
61397 +
61398 +Alexander Lyamin was our sysadmin, and helped to educate us in
61399 +security issues. Moscow State University and IMT were very generous
61400 +in the internet access they provided us, and in lots of other little
61401 +ways that a generous institution can be.
61402 +
61403 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
61404 +locking code, the block allocator, and finished the flushing code.
61405 +His code is always crystal clean and well structured.
61406 +
61407 +Nikita Danilov wrote the core of the balancing code, the core of the
61408 +plugins code, and the directory code. He worked a steady pace of long
61409 +hours that produced a whole lot of well abstracted code. He is our
61410 +senior computer scientist.
61411 +
61412 +Vladimir Demidov wrote the parser. Writing an in kernel parser is
61413 +something very few persons have the skills for, and it is thanks to
61414 +him that we can say that the parser is really not so big compared to
61415 +various bits of our other code, and making a parser work in the kernel
61416 +was not so complicated as everyone would imagine mainly because it was
61417 +him doing it...
61418 +
61419 +Joshua McDonald wrote the transaction manager, and the flush code.
61420 +The flush code unexpectedly turned out be extremely hairy for reasons
61421 +you can read about on our web page, and he did a great job on an
61422 +extremely difficult task.
61423 +
61424 +Nina Reiser handled our accounting, government relations, and much
61425 +more.
61426 +
61427 +Ramon Reiser developed our website.
61428 +
61429 +Beverly Palmer drew our graphics.
61430 +
61431 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
61432 +and worked with Umka on developing libreiser4 and userspace plugins.
61433 +
61434 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
61435 +userspace tools (reiser4progs).
61436 +
61437 +Oleg Drokin (aka Green) is the release manager who fixes everything.
61438 +It is so nice to have someone like that on the team. He (plus Chris
61439 +and Jeff) make it possible for the entire rest of the Namesys team to
61440 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
61441 +is just amazing to watch his talent for spotting bugs in action.
61442 +
61443 +Edward Shishkin wrote cryptcompress file plugin (which manages files
61444 +built of encrypted and(or) compressed bodies) and other plugins related
61445 +to transparent encryption and compression support.
61446 diff -urN linux-2.6.23.orig/fs/reiser4/reiser4.h linux-2.6.23/fs/reiser4/reiser4.h
61447 --- linux-2.6.23.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
61448 +++ linux-2.6.23/fs/reiser4/reiser4.h 2007-12-04 16:49:30.000000000 +0300
61449 @@ -0,0 +1,269 @@
61450 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61451 + * reiser4/README */
61452 +
61453 +/* definitions of common constants used by reiser4 */
61454 +
61455 +#if !defined( __REISER4_H__ )
61456 +#define __REISER4_H__
61457 +
61458 +#include <asm/param.h> /* for HZ */
61459 +#include <linux/errno.h>
61460 +#include <linux/types.h>
61461 +#include <linux/fs.h>
61462 +#include <linux/hardirq.h>
61463 +#include <linux/sched.h>
61464 +
61465 +/*
61466 + * reiser4 compilation options.
61467 + */
61468 +
61469 +#if defined(CONFIG_REISER4_DEBUG)
61470 +/* turn on assertion checks */
61471 +#define REISER4_DEBUG (1)
61472 +#else
61473 +#define REISER4_DEBUG (0)
61474 +#endif
61475 +
61476 +#if defined(CONFIG_ZLIB_INFLATE)
61477 +/* turn on zlib */
61478 +#define REISER4_ZLIB (1)
61479 +#else
61480 +#define REISER4_ZLIB (0)
61481 +#endif
61482 +
61483 +#if defined(CONFIG_CRYPTO_SHA256)
61484 +#define REISER4_SHA256 (1)
61485 +#else
61486 +#define REISER4_SHA256 (0)
61487 +#endif
61488 +
61489 +/*
61490 + * Turn on large keys mode. In his mode (which is default), reiser4 key has 4
61491 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
61492 + * components. Additional component, referred to as "ordering" is used to
61493 + * order items from which given object is composed of. As such, ordering is
61494 + * placed between locality and objectid. For directory item ordering contains
61495 + * initial prefix of the file name this item is for. This sorts all directory
61496 + * items within given directory lexicographically (but see
61497 + * fibration.[ch]). For file body and stat-data, ordering contains initial
61498 + * prefix of the name file was initially created with. In the common case
61499 + * (files with single name) this allows to order file bodies and stat-datas in
61500 + * the same order as their respective directory entries, thus speeding up
61501 + * readdir.
61502 + *
61503 + * Note, that kernel can only mount file system with the same key size as one
61504 + * it is compiled for, so flipping this option may render your data
61505 + * inaccessible.
61506 + */
61507 +#define REISER4_LARGE_KEY (1)
61508 +/*#define REISER4_LARGE_KEY (0)*/
61509 +
61510 +/*#define GUESS_EXISTS 1*/
61511 +
61512 +/*
61513 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
61514 + * option
61515 + */
61516 +
61517 +extern const char *REISER4_SUPER_MAGIC_STRING;
61518 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
61519 + * beginning of device */
61520 +
61521 +/* here go tunable parameters that are not worth special entry in kernel
61522 + configuration */
61523 +
61524 +/* default number of slots in coord-by-key caches */
61525 +#define CBK_CACHE_SLOTS (16)
61526 +/* how many elementary tree operation to carry on the next level */
61527 +#define CARRIES_POOL_SIZE (5)
61528 +/* size of pool of preallocated nodes for carry process. */
61529 +#define NODES_LOCKED_POOL_SIZE (5)
61530 +
61531 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61532 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61533 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
61534 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
61535 +
61536 +/* we are supporting reservation of disk space on uid basis */
61537 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
61538 +/* we are supporting reservation of disk space for groups */
61539 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
61540 +/* we are supporting reservation of disk space for root */
61541 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
61542 +/* we use rapid flush mode, see flush.c for comments. */
61543 +#define REISER4_USE_RAPID_FLUSH (1)
61544 +
61545 +/*
61546 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
61547 + */
61548 +#define REISER4_USE_ENTD (1)
61549 +
61550 +/* key allocation is Plan-A */
61551 +#define REISER4_PLANA_KEY_ALLOCATION (1)
61552 +/* key allocation follows good old 3.x scheme */
61553 +#define REISER4_3_5_KEY_ALLOCATION (0)
61554 +
61555 +/* size of hash-table for znodes */
61556 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
61557 +
61558 +/* number of buckets in lnode hash-table */
61559 +#define LNODE_HTABLE_BUCKETS (1024)
61560 +
61561 +/* some ridiculously high maximal limit on height of znode tree. This
61562 + is used in declaration of various per level arrays and
61563 + to allocate stattistics gathering array for per-level stats. */
61564 +#define REISER4_MAX_ZTREE_HEIGHT (8)
61565 +
61566 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
61567 +
61568 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
61569 + sequential search is on average faster than binary. This is because
61570 + of better optimization and because sequential search is more CPU
61571 + cache friendly. This number (25) was found by experiments on dual AMD
61572 + Athlon(tm), 1400MHz.
61573 +
61574 + NOTE: testing in kernel has shown that binary search is more effective than
61575 + implied by results of the user level benchmarking. Probably because in the
61576 + node keys are separated by other data. So value was adjusted after few
61577 + tests. More thorough tuning is needed.
61578 +*/
61579 +#define REISER4_SEQ_SEARCH_BREAK (3)
61580 +
61581 +/* don't allow tree to be lower than this */
61582 +#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
61583 +
61584 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
61585 + * available memory. */
61586 +/* Default value of maximal atom size. Can be ovewritten by
61587 + tmgr.atom_max_size mount option. By default infinity. */
61588 +#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
61589 +
61590 +/* Default value of maximal atom age (in jiffies). After reaching this age
61591 + atom will be forced to commit, either synchronously or asynchronously. Can
61592 + be overwritten by tmgr.atom_max_age mount option. */
61593 +#define REISER4_ATOM_MAX_AGE (600 * HZ)
61594 +
61595 +/* sleeping period for ktxnmrgd */
61596 +#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
61597 +
61598 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
61599 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
61600 +
61601 +/* start complaining after that many restarts in coord_by_key().
61602 +
61603 + This either means incredibly heavy contention for this part of a tree, or
61604 + some corruption or bug.
61605 +*/
61606 +#define REISER4_CBK_ITERATIONS_LIMIT (100)
61607 +
61608 +/* return -EIO after that many iterations in coord_by_key().
61609 +
61610 + I have witnessed more than 800 iterations (in 30 thread test) before cbk
61611 + finished. --nikita
61612 +*/
61613 +#define REISER4_MAX_CBK_ITERATIONS 500000
61614 +
61615 +/* put a per-inode limit on maximal number of directory entries with identical
61616 + keys in hashed directory.
61617 +
61618 + Disable this until inheritance interfaces stabilize: we need some way to
61619 + set per directory limit.
61620 +*/
61621 +#define REISER4_USE_COLLISION_LIMIT (0)
61622 +
61623 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
61624 + will force them to be relocated. */
61625 +#define FLUSH_RELOCATE_THRESHOLD 64
61626 +/* If flush finds can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
61627 + from the preceder it will relocate to that position. */
61628 +#define FLUSH_RELOCATE_DISTANCE 64
61629 +
61630 +/* If we have written this much or more blocks before encountering busy jnode
61631 + in flush list - abort flushing hoping that next time we get called
61632 + this jnode will be clean already, and we will save some seeks. */
61633 +#define FLUSH_WRITTEN_THRESHOLD 50
61634 +
61635 +/* The maximum number of nodes to scan left on a level during flush. */
61636 +#define FLUSH_SCAN_MAXNODES 10000
61637 +
61638 +/* per-atom limit of flushers */
61639 +#define ATOM_MAX_FLUSHERS (1)
61640 +
61641 +/* default tracing buffer size */
61642 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
61643 +
61644 +/* what size units of IO we would like cp, etc., to use, in writing to
61645 + reiser4. In bytes.
61646 +
61647 + Can be overwritten by optimal_io_size mount option.
61648 +*/
61649 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
61650 +
61651 +/* see comments in inode.c:oid_to_uino() */
61652 +#define REISER4_UINO_SHIFT (1 << 30)
61653 +
61654 +/* Mark function argument as unused to avoid compiler warnings. */
61655 +#define UNUSED_ARG __attribute__((unused))
61656 +
61657 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
61658 +#define NONNULL __attribute__((nonnull))
61659 +#else
61660 +#define NONNULL
61661 +#endif
61662 +
61663 +/* master super block offset in bytes.*/
61664 +#define REISER4_MASTER_OFFSET 65536
61665 +
61666 +/* size of VFS block */
61667 +#define VFS_BLKSIZE 512
61668 +/* number of bits in size of VFS block (512==2^9) */
61669 +#define VFS_BLKSIZE_BITS 9
61670 +
61671 +#define REISER4_I reiser4_inode_data
61672 +
61673 +/* implication */
61674 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
61675 +/* logical equivalence */
61676 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
61677 +
61678 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
61679 +
61680 +#define NOT_YET (0)
61681 +
61682 +/** Reiser4 specific error codes **/
61683 +
61684 +#define REISER4_ERROR_CODE_BASE 10000
61685 +
61686 +/* Neighbor is not available (side neighbor or parent) */
61687 +#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
61688 +
61689 +/* Node was not found in cache */
61690 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
61691 +
61692 +/* node has no free space enough for completion of balancing operation */
61693 +#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
61694 +
61695 +/* repeat operation */
61696 +#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
61697 +
61698 +/* deadlock happens */
61699 +#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
61700 +
61701 +/* operation cannot be performed, because it would block and non-blocking mode
61702 + * was requested. */
61703 +#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
61704 +
61705 +/* wait some event (depends on context), then repeat */
61706 +#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
61707 +
61708 +#endif /* __REISER4_H__ */
61709 +
61710 +/* Make Linus happy.
61711 + Local variables:
61712 + c-indentation-style: "K&R"
61713 + mode-name: "LC"
61714 + c-basic-offset: 8
61715 + tab-width: 8
61716 + fill-column: 120
61717 + End:
61718 +*/
61719 diff -urN linux-2.6.23.orig/fs/reiser4/safe_link.c linux-2.6.23/fs/reiser4/safe_link.c
61720 --- linux-2.6.23.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
61721 +++ linux-2.6.23/fs/reiser4/safe_link.c 2007-12-04 16:49:30.000000000 +0300
61722 @@ -0,0 +1,352 @@
61723 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
61724 + * reiser4/README */
61725 +
61726 +/* Safe-links. */
61727 +
61728 +/*
61729 + * Safe-links are used to maintain file system consistency during operations
61730 + * that spawns multiple transactions. For example:
61731 + *
61732 + * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
61733 + * without user-visible names in the file system, but still opened by some
61734 + * active process. What happens here is that unlink proper (i.e., removal
61735 + * of the last file name) and file deletion (truncate of file body to zero
61736 + * and deletion of stat-data, that happens when last file descriptor is
61737 + * closed), may belong to different transactions T1 and T2. If a crash
61738 + * happens after T1 commit, but before T2 commit, on-disk file system has
61739 + * a file without name, that is, disk space leak.
61740 + *
61741 + * 2. Truncate. Truncate of large file may spawn multiple transactions. If
61742 + * system crashes while truncate was in-progress, file is left partially
61743 + * truncated, which violates "atomicity guarantees" of reiser4, viz. that
61744 + * every system is atomic.
61745 + *
61746 + * Safe-links address both above cases. Basically, safe-link is a way post
61747 + * some operation to be executed during commit of some other transaction than
61748 + * current one. (Another way to look at the safe-link is to interpret it as a
61749 + * logical logging.)
61750 + *
61751 + * Specifically, at the beginning of unlink safe-link in inserted in the
61752 + * tree. This safe-link is normally removed by file deletion code (during
61753 + * transaction T2 in the above terms). Truncate also inserts safe-link that is
61754 + * normally removed when truncate operation is finished.
61755 + *
61756 + * This means, that in the case of "clean umount" there are no safe-links in
61757 + * the tree. If safe-links are observed during mount, it means that (a) system
61758 + * was terminated abnormally, and (b) safe-link correspond to the "pending"
61759 + * (i.e., not finished) operations that were in-progress during system
61760 + * termination. Each safe-link record enough information to complete
61761 + * corresponding operation, and mount simply "replays" them (hence, the
61762 + * analogy with the logical logging).
61763 + *
61764 + * Safe-links are implemented as blackbox items (see
61765 + * plugin/item/blackbox.[ch]).
61766 + *
61767 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
61768 + * list" there.
61769 + */
61770 +
61771 +#include "safe_link.h"
61772 +#include "debug.h"
61773 +#include "inode.h"
61774 +
61775 +#include "plugin/item/blackbox.h"
61776 +
61777 +#include <linux/fs.h>
61778 +
61779 +/*
61780 + * On-disk format of safe-link.
61781 + */
61782 +typedef struct safelink {
61783 + reiser4_key sdkey; /* key of stat-data for the file safe-link is
61784 + * for */
61785 + d64 size; /* size to which file should be truncated */
61786 +} safelink_t;
61787 +
61788 +/*
61789 + * locality where safe-link items are stored. Next to the objectid of root
61790 + * directory.
61791 + */
61792 +static oid_t safe_link_locality(reiser4_tree * tree)
61793 +{
61794 + return get_key_objectid(get_super_private(tree->super)->df_plug->
61795 + root_dir_key(tree->super)) + 1;
61796 +}
61797 +
61798 +/*
61799 + Construct a key for the safe-link. Key has the following format:
61800 +
61801 +| 60 | 4 | 64 | 4 | 60 | 64 |
61802 ++---------------+---+------------------+---+---------------+------------------+
61803 +| locality | 0 | 0 | 0 | objectid | link type |
61804 ++---------------+---+------------------+---+---------------+------------------+
61805 +| | | | |
61806 +| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
61807 +
61808 + This is in large keys format. In small keys format second 8 byte chunk is
61809 + out. Locality is a constant returned by safe_link_locality(). objectid is
61810 + an oid of a file on which operation protected by this safe-link is
61811 + performed. link-type is used to distinguish safe-links for different
61812 + operations.
61813 +
61814 + */
61815 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
61816 + reiser4_safe_link_t link, reiser4_key * key)
61817 +{
61818 + reiser4_key_init(key);
61819 + set_key_locality(key, safe_link_locality(tree));
61820 + set_key_objectid(key, oid);
61821 + set_key_offset(key, link);
61822 + return key;
61823 +}
61824 +
61825 +/*
61826 + * how much disk space is necessary to insert and remove (in the
61827 + * error-handling path) safe-link.
61828 + */
61829 +static __u64 safe_link_tograb(reiser4_tree * tree)
61830 +{
61831 + return
61832 + /* insert safe link */
61833 + estimate_one_insert_item(tree) +
61834 + /* remove safe link */
61835 + estimate_one_item_removal(tree) +
61836 + /* drill to the leaf level during insertion */
61837 + 1 + estimate_one_insert_item(tree) +
61838 + /*
61839 + * possible update of existing safe-link. Actually, if
61840 + * safe-link existed already (we failed to remove it), then no
61841 + * insertion is necessary, so this term is already "covered",
61842 + * but for simplicity let's leave it.
61843 + */
61844 + 1;
61845 +}
61846 +
61847 +/*
61848 + * grab enough disk space to insert and remove (in the error-handling path)
61849 + * safe-link.
61850 + */
61851 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
61852 +{
61853 + int result;
61854 +
61855 + grab_space_enable();
61856 + /* The sbinfo->delete_mutex can be taken here.
61857 + * safe_link_release() should be called before leaving reiser4
61858 + * context. */
61859 + result =
61860 + reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
61861 + grab_space_enable();
61862 + return result;
61863 +}
61864 +
61865 +/*
61866 + * release unused disk space reserved by safe_link_grab().
61867 + */
61868 +void safe_link_release(reiser4_tree * tree)
61869 +{
61870 + reiser4_release_reserved(tree->super);
61871 +}
61872 +
61873 +/*
61874 + * insert into tree safe-link for operation @link on inode @inode.
61875 + */
61876 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
61877 +{
61878 + reiser4_key key;
61879 + safelink_t sl;
61880 + int length;
61881 + int result;
61882 + reiser4_tree *tree;
61883 +
61884 + build_sd_key(inode, &sl.sdkey);
61885 + length = sizeof sl.sdkey;
61886 +
61887 + if (link == SAFE_TRUNCATE) {
61888 + /*
61889 + * for truncate we have to store final file length also,
61890 + * expand item.
61891 + */
61892 + length += sizeof(sl.size);
61893 + put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
61894 + }
61895 + tree = reiser4_tree_by_inode(inode);
61896 + build_link_key(tree, get_inode_oid(inode), link, &key);
61897 +
61898 + result = store_black_box(tree, &key, &sl, length);
61899 + if (result == -EEXIST)
61900 + result = update_black_box(tree, &key, &sl, length);
61901 + return result;
61902 +}
61903 +
61904 +/*
61905 + * remove safe-link corresponding to the operation @link on inode @inode from
61906 + * the tree.
61907 + */
61908 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
61909 +{
61910 + reiser4_key key;
61911 +
61912 + return kill_black_box(tree, build_link_key(tree, oid, link, &key));
61913 +}
61914 +
61915 +/*
61916 + * in-memory structure to keep information extracted from safe-link. This is
61917 + * used to iterate over all safe-links.
61918 + */
61919 +struct safe_link_context {
61920 + reiser4_tree *tree; /* internal tree */
61921 + reiser4_key key; /* safe-link key */
61922 + reiser4_key sdkey; /* key of object stat-data */
61923 + reiser4_safe_link_t link; /* safe-link type */
61924 + oid_t oid; /* object oid */
61925 + __u64 size; /* final size for truncate */
61926 +};
61927 +
61928 +/*
61929 + * start iterating over all safe-links.
61930 + */
61931 +static void safe_link_iter_begin(reiser4_tree * tree,
61932 + struct safe_link_context * ctx)
61933 +{
61934 + ctx->tree = tree;
61935 + reiser4_key_init(&ctx->key);
61936 + set_key_locality(&ctx->key, safe_link_locality(tree));
61937 + set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
61938 + set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
61939 +}
61940 +
61941 +/*
61942 + * return next safe-link.
61943 + */
61944 +static int safe_link_iter_next(struct safe_link_context * ctx)
61945 +{
61946 + int result;
61947 + safelink_t sl;
61948 +
61949 + result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
61950 + if (result == 0) {
61951 + ctx->oid = get_key_objectid(&ctx->key);
61952 + ctx->link = get_key_offset(&ctx->key);
61953 + ctx->sdkey = sl.sdkey;
61954 + if (ctx->link == SAFE_TRUNCATE)
61955 + ctx->size = le64_to_cpu(get_unaligned(&sl.size));
61956 + }
61957 + return result;
61958 +}
61959 +
61960 +/*
61961 + * check whether there are any more safe-links left in the tree.
61962 + */
61963 +static int safe_link_iter_finished(struct safe_link_context * ctx)
61964 +{
61965 + return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
61966 +}
61967 +
61968 +/*
61969 + * finish safe-link iteration.
61970 + */
61971 +static void safe_link_iter_end(struct safe_link_context * ctx)
61972 +{
61973 + /* nothing special */
61974 +}
61975 +
61976 +/*
61977 + * process single safe-link.
61978 + */
61979 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
61980 + reiser4_key * sdkey, oid_t oid, __u64 size)
61981 +{
61982 + struct inode *inode;
61983 + int result;
61984 +
61985 + /*
61986 + * obtain object inode by reiser4_iget(), then call object plugin
61987 + * ->safelink() method to do actual work, then delete safe-link on
61988 + * success.
61989 + */
61990 + inode = reiser4_iget(super, sdkey, 1);
61991 + if (!IS_ERR(inode)) {
61992 + file_plugin *fplug;
61993 +
61994 + fplug = inode_file_plugin(inode);
61995 + assert("nikita-3428", fplug != NULL);
61996 + assert("", oid == get_inode_oid(inode));
61997 + if (fplug->safelink != NULL) {
61998 + /* reiser4_txn_restart_current is not necessary because
61999 + * mounting is single-threaded. However, without it
62000 + * deadlock detection code will complain (see
62001 + * nikita-3361). */
62002 + reiser4_txn_restart_current();
62003 + result = fplug->safelink(inode, link, size);
62004 + } else {
62005 + warning("nikita-3430",
62006 + "Cannot handle safelink for %lli",
62007 + (unsigned long long)oid);
62008 + reiser4_print_key("key", sdkey);
62009 + result = 0;
62010 + }
62011 + if (result != 0) {
62012 + warning("nikita-3431",
62013 + "Error processing safelink for %lli: %i",
62014 + (unsigned long long)oid, result);
62015 + }
62016 + reiser4_iget_complete(inode);
62017 + iput(inode);
62018 + if (result == 0) {
62019 + result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
62020 + if (result == 0)
62021 + result =
62022 + safe_link_del(reiser4_get_tree(super), oid, link);
62023 + safe_link_release(reiser4_get_tree(super));
62024 + /*
62025 + * restart transaction: if there was large number of
62026 + * safe-links, their processing may fail to fit into
62027 + * single transaction.
62028 + */
62029 + if (result == 0)
62030 + reiser4_txn_restart_current();
62031 + }
62032 + } else
62033 + result = PTR_ERR(inode);
62034 + return result;
62035 +}
62036 +
62037 +/*
62038 + * iterate over all safe-links in the file-system processing them one by one.
62039 + */
62040 +int process_safelinks(struct super_block *super)
62041 +{
62042 + struct safe_link_context ctx;
62043 + int result;
62044 +
62045 + if (rofs_super(super))
62046 + /* do nothing on the read-only file system */
62047 + return 0;
62048 + safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
62049 + result = 0;
62050 + do {
62051 + result = safe_link_iter_next(&ctx);
62052 + if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
62053 + result = 0;
62054 + break;
62055 + }
62056 + if (result == 0)
62057 + result = process_safelink(super, ctx.link,
62058 + &ctx.sdkey, ctx.oid,
62059 + ctx.size);
62060 + } while (result == 0);
62061 + safe_link_iter_end(&ctx);
62062 + return result;
62063 +}
62064 +
62065 +/* Make Linus happy.
62066 + Local variables:
62067 + c-indentation-style: "K&R"
62068 + mode-name: "LC"
62069 + c-basic-offset: 8
62070 + tab-width: 8
62071 + fill-column: 120
62072 + scroll-step: 1
62073 + End:
62074 +*/
62075 diff -urN linux-2.6.23.orig/fs/reiser4/safe_link.h linux-2.6.23/fs/reiser4/safe_link.h
62076 --- linux-2.6.23.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
62077 +++ linux-2.6.23/fs/reiser4/safe_link.h 2007-12-04 16:49:30.000000000 +0300
62078 @@ -0,0 +1,29 @@
62079 +/* Copyright 2003 by Hans Reiser, licensing governed by
62080 + * reiser4/README */
62081 +
62082 +/* Safe-links. See safe_link.c for details. */
62083 +
62084 +#if !defined( __FS_SAFE_LINK_H__ )
62085 +#define __FS_SAFE_LINK_H__
62086 +
62087 +#include "tree.h"
62088 +
62089 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
62090 +void safe_link_release(reiser4_tree * tree);
62091 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
62092 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
62093 +
62094 +int process_safelinks(struct super_block *super);
62095 +
62096 +/* __FS_SAFE_LINK_H__ */
62097 +#endif
62098 +
62099 +/* Make Linus happy.
62100 + Local variables:
62101 + c-indentation-style: "K&R"
62102 + mode-name: "LC"
62103 + c-basic-offset: 8
62104 + tab-width: 8
62105 + fill-column: 120
62106 + End:
62107 +*/
62108 diff -urN linux-2.6.23.orig/fs/reiser4/seal.c linux-2.6.23/fs/reiser4/seal.c
62109 --- linux-2.6.23.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
62110 +++ linux-2.6.23/fs/reiser4/seal.c 2007-12-04 16:49:30.000000000 +0300
62111 @@ -0,0 +1,218 @@
62112 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62113 +/* Seals implementation. */
62114 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
62115 + allowing to bypass tree traversal. But normal usage of coords implies that
62116 + node pointed to by coord is locked, whereas seals don't keep a lock (or
62117 + even a reference) to znode. Instead, each znode contains a version number,
62118 + increased on each znode modification. This version number is copied into a
62119 + seal when seal is created. Later, one can "validate" seal by calling
62120 + reiser4_seal_validate(). If znode is in cache and its version number is
62121 + still the same, seal is "pristine" and coord associated with it can be
62122 + re-used immediately.
62123 +
62124 + If, on the other hand, znode is out of cache, or it is obviously different
62125 + one from the znode seal was initially attached to (for example, it is on
62126 + the different level, or is being removed from the tree), seal is
62127 + irreparably invalid ("burned") and tree traversal has to be repeated.
62128 +
62129 + Otherwise, there is some hope, that while znode was modified (and seal was
62130 + "broken" as a result), key attached to the seal is still in the node. This
62131 + is checked by first comparing this key with delimiting keys of node and, if
62132 + key is ok, doing intra-node lookup.
62133 +
62134 + Znode version is maintained in the following way:
62135 +
62136 + there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
62137 + znode_epoch is incremented and its new value is stored in ->version field
62138 + of new znode. Whenever znode is dirtied (which means it was probably
62139 + modified), znode_epoch is also incremented and its new value is stored in
62140 + znode->version. This is done so, because just incrementing znode->version
62141 + on each update is not enough: it may so happen, that znode get deleted, new
62142 + znode is allocated for the same disk block and gets the same version
62143 + counter, tricking seal code into false positive.
62144 +*/
62145 +
62146 +#include "forward.h"
62147 +#include "debug.h"
62148 +#include "key.h"
62149 +#include "coord.h"
62150 +#include "seal.h"
62151 +#include "plugin/item/item.h"
62152 +#include "plugin/node/node.h"
62153 +#include "jnode.h"
62154 +#include "znode.h"
62155 +#include "super.h"
62156 +
62157 +static znode *seal_node(const seal_t * seal);
62158 +static int seal_matches(const seal_t * seal, znode * node);
62159 +
62160 +/* initialise seal. This can be called several times on the same seal. @coord
62161 + and @key can be NULL. */
62162 +void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
62163 + const coord_t * coord /* coord @seal will be
62164 + * attached to */ ,
62165 + const reiser4_key * key UNUSED_ARG /* key @seal will be
62166 + * attached to */ )
62167 +{
62168 + assert("nikita-1886", seal != NULL);
62169 + memset(seal, 0, sizeof *seal);
62170 + if (coord != NULL) {
62171 + znode *node;
62172 +
62173 + node = coord->node;
62174 + assert("nikita-1987", node != NULL);
62175 + spin_lock_znode(node);
62176 + seal->version = node->version;
62177 + assert("nikita-1988", seal->version != 0);
62178 + seal->block = *znode_get_block(node);
62179 +#if REISER4_DEBUG
62180 + seal->coord1 = *coord;
62181 + if (key != NULL)
62182 + seal->key = *key;
62183 +#endif
62184 + spin_unlock_znode(node);
62185 + }
62186 +}
62187 +
62188 +/* finish with seal */
62189 +void reiser4_seal_done(seal_t * seal /* seal to clear */ )
62190 +{
62191 + assert("nikita-1887", seal != NULL);
62192 + seal->version = 0;
62193 +}
62194 +
62195 +/* true if seal was initialised */
62196 +int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
62197 +{
62198 + assert("nikita-1890", seal != NULL);
62199 + return seal->version != 0;
62200 +}
62201 +
62202 +#if REISER4_DEBUG
62203 +/* helper function for reiser4_seal_validate(). It checks that item at @coord
62204 + * has expected key. This is to detect cases where node was modified but wasn't
62205 + * marked dirty. */
62206 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
62207 + const reiser4_key * k /* expected key */ )
62208 +{
62209 + reiser4_key ukey;
62210 +
62211 + return (coord->between != AT_UNIT) ||
62212 + /* FIXME-VS: we only can compare keys for items whose units
62213 + represent exactly one key */
62214 + ((coord_is_existing_unit(coord))
62215 + && (item_is_extent(coord)
62216 + || keyeq(k, unit_key_by_coord(coord, &ukey))))
62217 + || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
62218 + && keyge(k, unit_key_by_coord(coord, &ukey)));
62219 +}
62220 +#endif
62221 +
62222 +/* this is used by reiser4_seal_validate. It accepts return value of
62223 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
62224 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
62225 + * reiser4_seal_validate returns -E_REPEAT and caller will call tree search.
62226 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
62227 + * distinguish between -EINVAL and -E_REPEAT. */
62228 +static int should_repeat(int return_code)
62229 +{
62230 + return return_code == -EINVAL;
62231 +}
62232 +
62233 +/* (re-)validate seal.
62234 +
62235 + Checks whether seal is pristine, and try to revalidate it if possible.
62236 +
62237 + If seal was burned, or broken irreparably, return -E_REPEAT.
62238 +
62239 + NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are
62240 + looking for is in range of keys covered by the sealed node, but item wasn't
62241 + found by node ->lookup() method. Alternative is to return -ENOENT in this
62242 + case, but this would complicate callers logic.
62243 +
62244 +*/
62245 +int reiser4_seal_validate(seal_t * seal /* seal to validate */,
62246 + coord_t * coord /* coord to validate against */,
62247 + const reiser4_key * key /* key to validate against */,
62248 + lock_handle * lh /* resulting lock handle */,
62249 + znode_lock_mode mode /* lock node */,
62250 + znode_lock_request request /* locking priority */)
62251 +{
62252 + znode *node;
62253 + int result;
62254 +
62255 + assert("nikita-1889", seal != NULL);
62256 + assert("nikita-1881", reiser4_seal_is_set(seal));
62257 + assert("nikita-1882", key != NULL);
62258 + assert("nikita-1883", coord != NULL);
62259 + assert("nikita-1884", lh != NULL);
62260 + assert("nikita-1885", keyeq(&seal->key, key));
62261 + assert("nikita-1989", coords_equal(&seal->coord1, coord));
62262 +
62263 + /* obtain znode by block number */
62264 + node = seal_node(seal);
62265 + if (node != NULL) {
62266 + /* znode was in cache, lock it */
62267 + result = longterm_lock_znode(lh, node, mode, request);
62268 + zput(node);
62269 + if (result == 0) {
62270 + if (seal_matches(seal, node)) {
62271 + /* if seal version and znode version
62272 + coincide */
62273 + ON_DEBUG(coord_update_v(coord));
62274 + assert("nikita-1990",
62275 + node == seal->coord1.node);
62276 + assert("nikita-1898",
62277 + WITH_DATA_RET(coord->node, 1,
62278 + check_seal_match(coord,
62279 + key)));
62280 + } else
62281 + result = RETERR(-E_REPEAT);
62282 + }
62283 + if (result != 0) {
62284 + if (should_repeat(result))
62285 + result = RETERR(-E_REPEAT);
62286 + /* unlock node on failure */
62287 + done_lh(lh);
62288 + }
62289 + } else {
62290 + /* znode wasn't in cache */
62291 + result = RETERR(-E_REPEAT);
62292 + }
62293 + return result;
62294 +}
62295 +
62296 +/* helpers functions */
62297 +
62298 +/* obtain reference to znode seal points to, if in cache */
62299 +static znode *seal_node(const seal_t * seal /* seal to query */ )
62300 +{
62301 + assert("nikita-1891", seal != NULL);
62302 + return zlook(current_tree, &seal->block);
62303 +}
62304 +
62305 +/* true if @seal version and @node version coincide */
62306 +static int seal_matches(const seal_t * seal /* seal to check */ ,
62307 + znode * node /* node to check */ )
62308 +{
62309 + int result;
62310 +
62311 + assert("nikita-1991", seal != NULL);
62312 + assert("nikita-1993", node != NULL);
62313 +
62314 + spin_lock_znode(node);
62315 + result = (seal->version == node->version);
62316 + spin_unlock_znode(node);
62317 + return result;
62318 +}
62319 +
62320 +/* Make Linus happy.
62321 + Local variables:
62322 + c-indentation-style: "K&R"
62323 + mode-name: "LC"
62324 + c-basic-offset: 8
62325 + tab-width: 8
62326 + fill-column: 120
62327 + scroll-step: 1
62328 + End:
62329 +*/
62330 diff -urN linux-2.6.23.orig/fs/reiser4/seal.h linux-2.6.23/fs/reiser4/seal.h
62331 --- linux-2.6.23.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
62332 +++ linux-2.6.23/fs/reiser4/seal.h 2007-12-04 16:49:30.000000000 +0300
62333 @@ -0,0 +1,49 @@
62334 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62335 +
62336 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
62337 +
62338 +#ifndef __SEAL_H__
62339 +#define __SEAL_H__
62340 +
62341 +#include "forward.h"
62342 +#include "debug.h"
62343 +#include "dformat.h"
62344 +#include "key.h"
62345 +#include "coord.h"
62346 +
62347 +/* for __u?? types */
62348 +/*#include <linux/types.h>*/
62349 +
62350 +/* seal. See comment at the top of seal.c */
62351 +typedef struct seal_s {
62352 + /* version of znode recorder at the time of seal creation */
62353 + __u64 version;
62354 + /* block number of znode attached to this seal */
62355 + reiser4_block_nr block;
62356 +#if REISER4_DEBUG
62357 + /* coord this seal is attached to. For debugging. */
62358 + coord_t coord1;
62359 + /* key this seal is attached to. For debugging. */
62360 + reiser4_key key;
62361 +#endif
62362 +} seal_t;
62363 +
62364 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
62365 +extern void reiser4_seal_done(seal_t *);
62366 +extern int reiser4_seal_is_set(const seal_t *);
62367 +extern int reiser4_seal_validate(seal_t *, coord_t *,
62368 + const reiser4_key *, lock_handle *,
62369 + znode_lock_mode mode, znode_lock_request request);
62370 +
62371 +/* __SEAL_H__ */
62372 +#endif
62373 +
62374 +/* Make Linus happy.
62375 + Local variables:
62376 + c-indentation-style: "K&R"
62377 + mode-name: "LC"
62378 + c-basic-offset: 8
62379 + tab-width: 8
62380 + fill-column: 120
62381 + End:
62382 +*/
62383 diff -urN linux-2.6.23.orig/fs/reiser4/search.c linux-2.6.23/fs/reiser4/search.c
62384 --- linux-2.6.23.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
62385 +++ linux-2.6.23/fs/reiser4/search.c 2007-12-04 16:49:30.000000000 +0300
62386 @@ -0,0 +1,1611 @@
62387 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62388 + * reiser4/README */
62389 +
62390 +#include "forward.h"
62391 +#include "debug.h"
62392 +#include "dformat.h"
62393 +#include "key.h"
62394 +#include "coord.h"
62395 +#include "seal.h"
62396 +#include "plugin/item/item.h"
62397 +#include "plugin/node/node.h"
62398 +#include "plugin/plugin.h"
62399 +#include "jnode.h"
62400 +#include "znode.h"
62401 +#include "block_alloc.h"
62402 +#include "tree_walk.h"
62403 +#include "tree.h"
62404 +#include "reiser4.h"
62405 +#include "super.h"
62406 +#include "inode.h"
62407 +
62408 +#include <linux/slab.h>
62409 +
62410 +static const char *bias_name(lookup_bias bias);
62411 +
62412 +/* tree searching algorithm, intranode searching algorithms are in
62413 + plugin/node/ */
62414 +
62415 +/* tree lookup cache
62416 + *
62417 + * The coord by key cache consists of small list of recently accessed nodes
62418 + * maintained according to the LRU discipline. Before doing real top-to-down
62419 + * tree traversal this cache is scanned for nodes that can contain key
62420 + * requested.
62421 + *
62422 + * The efficiency of coord cache depends heavily on locality of reference for
62423 + * tree accesses. Our user level simulations show reasonably good hit ratios
62424 + * for coord cache under most loads so far.
62425 + */
62426 +
62427 +/* Initialise coord cache slot */
62428 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
62429 +{
62430 + assert("nikita-345", slot != NULL);
62431 +
62432 + INIT_LIST_HEAD(&slot->lru);
62433 + slot->node = NULL;
62434 +}
62435 +
62436 +/* Initialize coord cache */
62437 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
62438 +{
62439 + int i;
62440 +
62441 + assert("nikita-346", cache != NULL);
62442 +
62443 + cache->slot =
62444 + kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
62445 + reiser4_ctx_gfp_mask_get());
62446 + if (cache->slot == NULL)
62447 + return RETERR(-ENOMEM);
62448 +
62449 + INIT_LIST_HEAD(&cache->lru);
62450 + for (i = 0; i < cache->nr_slots; ++i) {
62451 + cbk_cache_init_slot(cache->slot + i);
62452 + list_add_tail(&((cache->slot + i)->lru), &cache->lru);
62453 + }
62454 + rwlock_init(&cache->guard);
62455 + return 0;
62456 +}
62457 +
62458 +/* free cbk cache data */
62459 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
62460 +{
62461 + assert("nikita-2493", cache != NULL);
62462 + if (cache->slot != NULL) {
62463 + kfree(cache->slot);
62464 + cache->slot = NULL;
62465 + }
62466 +}
62467 +
62468 +/* macro to iterate over all cbk cache slots */
62469 +#define for_all_slots(cache, slot) \
62470 + for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
62471 + &(cache)->lru != &(slot)->lru; \
62472 + (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
62473 +
62474 +#if REISER4_DEBUG
62475 +/* this function assures that [cbk-cache-invariant] invariant holds */
62476 +static int cbk_cache_invariant(const cbk_cache *cache)
62477 +{
62478 + cbk_cache_slot *slot;
62479 + int result;
62480 + int unused;
62481 +
62482 + if (cache->nr_slots == 0)
62483 + return 1;
62484 +
62485 + assert("nikita-2469", cache != NULL);
62486 + unused = 0;
62487 + result = 1;
62488 + read_lock(&((cbk_cache *)cache)->guard);
62489 + for_all_slots(cache, slot) {
62490 + /* in LRU first go all `used' slots followed by `unused' */
62491 + if (unused && (slot->node != NULL))
62492 + result = 0;
62493 + if (slot->node == NULL)
62494 + unused = 1;
62495 + else {
62496 + cbk_cache_slot *scan;
62497 +
62498 + /* all cached nodes are different */
62499 + scan = slot;
62500 + while (result) {
62501 + scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
62502 + if (&cache->lru == &scan->lru)
62503 + break;
62504 + if (slot->node == scan->node)
62505 + result = 0;
62506 + }
62507 + }
62508 + if (!result)
62509 + break;
62510 + }
62511 + read_unlock(&((cbk_cache *)cache)->guard);
62512 + return result;
62513 +}
62514 +
62515 +#endif
62516 +
62517 +/* Remove references, if any, to @node from coord cache */
62518 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
62519 + reiser4_tree * tree /* tree to remove node from */ )
62520 +{
62521 + cbk_cache_slot *slot;
62522 + cbk_cache *cache;
62523 + int i;
62524 +
62525 + assert("nikita-350", node != NULL);
62526 + assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
62527 +
62528 + cache = &tree->cbk_cache;
62529 + assert("nikita-2470", cbk_cache_invariant(cache));
62530 +
62531 + write_lock(&(cache->guard));
62532 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62533 + if (slot->node == node) {
62534 + list_move_tail(&slot->lru, &cache->lru);
62535 + slot->node = NULL;
62536 + break;
62537 + }
62538 + }
62539 + write_unlock(&(cache->guard));
62540 + assert("nikita-2471", cbk_cache_invariant(cache));
62541 +}
62542 +
62543 +/* add to the cbk-cache in the "tree" information about "node". This
62544 + can actually be update of existing slot in a cache. */
62545 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
62546 +{
62547 + cbk_cache *cache;
62548 + cbk_cache_slot *slot;
62549 + int i;
62550 +
62551 + assert("nikita-352", node != NULL);
62552 +
62553 + cache = &znode_get_tree(node)->cbk_cache;
62554 + assert("nikita-2472", cbk_cache_invariant(cache));
62555 +
62556 + if (cache->nr_slots == 0)
62557 + return;
62558 +
62559 + write_lock(&(cache->guard));
62560 + /* find slot to update/add */
62561 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62562 + /* oops, this node is already in a cache */
62563 + if (slot->node == node)
62564 + break;
62565 + }
62566 + /* if all slots are used, reuse least recently used one */
62567 + if (i == cache->nr_slots) {
62568 + slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
62569 + slot->node = (znode *) node;
62570 + }
62571 + list_move(&slot->lru, &cache->lru);
62572 + write_unlock(&(cache->guard));
62573 + assert("nikita-2473", cbk_cache_invariant(cache));
62574 +}
62575 +
62576 +static int setup_delimiting_keys(cbk_handle * h);
62577 +static lookup_result coord_by_handle(cbk_handle * handle);
62578 +static lookup_result traverse_tree(cbk_handle * h);
62579 +static int cbk_cache_search(cbk_handle * h);
62580 +
62581 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
62582 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
62583 +
62584 +/* helper functions */
62585 +
62586 +static void update_stale_dk(reiser4_tree * tree, znode * node);
62587 +
62588 +/* release parent node during traversal */
62589 +static void put_parent(cbk_handle * h);
62590 +/* check consistency of fields */
62591 +static int sanity_check(cbk_handle * h);
62592 +/* release resources in handle */
62593 +static void hput(cbk_handle * h);
62594 +
62595 +static level_lookup_result search_to_left(cbk_handle * h);
62596 +
62597 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
62598 + * cbk_handle */
62599 +static cbk_handle *cbk_pack(cbk_handle * handle,
62600 + reiser4_tree * tree,
62601 + const reiser4_key * key,
62602 + coord_t * coord,
62603 + lock_handle * active_lh,
62604 + lock_handle * parent_lh,
62605 + znode_lock_mode lock_mode,
62606 + lookup_bias bias,
62607 + tree_level lock_level,
62608 + tree_level stop_level,
62609 + __u32 flags, ra_info_t * info)
62610 +{
62611 + memset(handle, 0, sizeof *handle);
62612 +
62613 + handle->tree = tree;
62614 + handle->key = key;
62615 + handle->lock_mode = lock_mode;
62616 + handle->bias = bias;
62617 + handle->lock_level = lock_level;
62618 + handle->stop_level = stop_level;
62619 + handle->coord = coord;
62620 + /* set flags. See comment in tree.h:cbk_flags */
62621 + handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
62622 +
62623 + handle->active_lh = active_lh;
62624 + handle->parent_lh = parent_lh;
62625 + handle->ra_info = info;
62626 + return handle;
62627 +}
62628 +
62629 +/* main tree lookup procedure
62630 +
62631 + Check coord cache. If key we are looking for is not found there, call cbk()
62632 + to do real tree traversal.
62633 +
62634 + As we have extents on the twig level, @lock_level and @stop_level can
62635 + be different from LEAF_LEVEL and each other.
62636 +
62637 + Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
62638 + long term locks) while calling this.
62639 +*/
62640 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
62641 + * in. Usually this tree is
62642 + * part of file-system
62643 + * super-block */ ,
62644 + const reiser4_key * key /* key to look for */ ,
62645 + coord_t * coord /* where to store found
62646 + * position in a tree. Fields
62647 + * in "coord" are only valid if
62648 + * coord_by_key() returned
62649 + * "CBK_COORD_FOUND" */ ,
62650 + lock_handle * lh, /* resulting lock handle */
62651 + znode_lock_mode lock_mode /* type of lookup we
62652 + * want on node. Pass
62653 + * ZNODE_READ_LOCK here
62654 + * if you only want to
62655 + * read item found and
62656 + * ZNODE_WRITE_LOCK if
62657 + * you want to modify
62658 + * it */ ,
62659 + lookup_bias bias /* what to return if coord
62660 + * with exactly the @key is
62661 + * not in the tree */ ,
62662 + tree_level lock_level /* tree level where to start
62663 + * taking @lock type of
62664 + * locks */ ,
62665 + tree_level stop_level /* tree level to stop. Pass
62666 + * LEAF_LEVEL or TWIG_LEVEL
62667 + * here Item being looked
62668 + * for has to be between
62669 + * @lock_level and
62670 + * @stop_level, inclusive */ ,
62671 + __u32 flags /* search flags */ ,
62672 + ra_info_t *
62673 + info
62674 + /* information about desired tree traversal readahead */
62675 + )
62676 +{
62677 + cbk_handle handle;
62678 + lock_handle parent_lh;
62679 + lookup_result result;
62680 +
62681 + init_lh(lh);
62682 + init_lh(&parent_lh);
62683 +
62684 + assert("nikita-3023", reiser4_schedulable());
62685 +
62686 + assert("nikita-353", tree != NULL);
62687 + assert("nikita-354", key != NULL);
62688 + assert("nikita-355", coord != NULL);
62689 + assert("nikita-356", (bias == FIND_EXACT)
62690 + || (bias == FIND_MAX_NOT_MORE_THAN));
62691 + assert("nikita-357", stop_level >= LEAF_LEVEL);
62692 + /* no locks can be held during tree traversal */
62693 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62694 +
62695 + cbk_pack(&handle,
62696 + tree,
62697 + key,
62698 + coord,
62699 + lh,
62700 + &parent_lh,
62701 + lock_mode, bias, lock_level, stop_level, flags, info);
62702 +
62703 + result = coord_by_handle(&handle);
62704 + assert("nikita-3247",
62705 + ergo(!IS_CBKERR(result), coord->node == lh->node));
62706 + return result;
62707 +}
62708 +
62709 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
62710 + * from tree root. */
62711 +lookup_result reiser4_object_lookup(struct inode * object,
62712 + const reiser4_key * key,
62713 + coord_t * coord,
62714 + lock_handle * lh,
62715 + znode_lock_mode lock_mode,
62716 + lookup_bias bias,
62717 + tree_level lock_level,
62718 + tree_level stop_level, __u32 flags,
62719 + ra_info_t * info)
62720 +{
62721 + cbk_handle handle;
62722 + lock_handle parent_lh;
62723 + lookup_result result;
62724 +
62725 + init_lh(lh);
62726 + init_lh(&parent_lh);
62727 +
62728 + assert("nikita-3023", reiser4_schedulable());
62729 +
62730 + assert("nikita-354", key != NULL);
62731 + assert("nikita-355", coord != NULL);
62732 + assert("nikita-356", (bias == FIND_EXACT)
62733 + || (bias == FIND_MAX_NOT_MORE_THAN));
62734 + assert("nikita-357", stop_level >= LEAF_LEVEL);
62735 + /* no locks can be held during tree search by key */
62736 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62737 +
62738 + cbk_pack(&handle,
62739 + object != NULL ? reiser4_tree_by_inode(object) : current_tree,
62740 + key,
62741 + coord,
62742 + lh,
62743 + &parent_lh,
62744 + lock_mode, bias, lock_level, stop_level, flags, info);
62745 + handle.object = object;
62746 +
62747 + result = coord_by_handle(&handle);
62748 + assert("nikita-3247",
62749 + ergo(!IS_CBKERR(result), coord->node == lh->node));
62750 + return result;
62751 +}
62752 +
62753 +/* lookup by cbk_handle. Common part of coord_by_key() and
62754 + reiser4_object_lookup(). */
62755 +static lookup_result coord_by_handle(cbk_handle * handle)
62756 +{
62757 + /*
62758 + * first check cbk_cache (which is look-aside cache for our tree) and
62759 + * of this fails, start traversal.
62760 + */
62761 + /* first check whether "key" is in cache of recent lookups. */
62762 + if (cbk_cache_search(handle) == 0)
62763 + return handle->result;
62764 + else
62765 + return traverse_tree(handle);
62766 +}
62767 +
62768 +/* Execute actor for each item (or unit, depending on @through_units_p),
62769 + starting from @coord, right-ward, until either:
62770 +
62771 + - end of the tree is reached
62772 + - unformatted node is met
62773 + - error occurred
62774 + - @actor returns 0 or less
62775 +
62776 + Error code, or last actor return value is returned.
62777 +
62778 + This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
62779 + sequence of entries with identical keys and alikes.
62780 +*/
62781 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
62782 + coord_t * coord /* coord to start from */ ,
62783 + lock_handle * lh /* lock handle to start with and to
62784 + * update along the way */ ,
62785 + tree_iterate_actor_t actor /* function to call on each
62786 + * item/unit */ ,
62787 + void *arg /* argument to pass to @actor */ ,
62788 + znode_lock_mode mode /* lock mode on scanned nodes */ ,
62789 + int through_units_p /* call @actor on each item or on
62790 + * each unit */ )
62791 +{
62792 + int result;
62793 +
62794 + assert("nikita-1143", tree != NULL);
62795 + assert("nikita-1145", coord != NULL);
62796 + assert("nikita-1146", lh != NULL);
62797 + assert("nikita-1147", actor != NULL);
62798 +
62799 + result = zload(coord->node);
62800 + coord_clear_iplug(coord);
62801 + if (result != 0)
62802 + return result;
62803 + if (!coord_is_existing_unit(coord)) {
62804 + zrelse(coord->node);
62805 + return -ENOENT;
62806 + }
62807 + while ((result = actor(tree, coord, lh, arg)) > 0) {
62808 + /* move further */
62809 + if ((through_units_p && coord_next_unit(coord)) ||
62810 + (!through_units_p && coord_next_item(coord))) {
62811 + do {
62812 + lock_handle couple;
62813 +
62814 + /* move to the next node */
62815 + init_lh(&couple);
62816 + result =
62817 + reiser4_get_right_neighbor(&couple,
62818 + coord->node,
62819 + (int)mode,
62820 + GN_CAN_USE_UPPER_LEVELS);
62821 + zrelse(coord->node);
62822 + if (result == 0) {
62823 +
62824 + result = zload(couple.node);
62825 + if (result != 0) {
62826 + done_lh(&couple);
62827 + return result;
62828 + }
62829 +
62830 + coord_init_first_unit(coord,
62831 + couple.node);
62832 + done_lh(lh);
62833 + move_lh(lh, &couple);
62834 + } else
62835 + return result;
62836 + } while (node_is_empty(coord->node));
62837 + }
62838 +
62839 + assert("nikita-1149", coord_is_existing_unit(coord));
62840 + }
62841 + zrelse(coord->node);
62842 + return result;
62843 +}
62844 +
62845 +/* return locked uber znode for @tree */
62846 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
62847 + znode_lock_request pri, lock_handle * lh)
62848 +{
62849 + int result;
62850 +
62851 + result = longterm_lock_znode(lh, tree->uber, mode, pri);
62852 + return result;
62853 +}
62854 +
62855 +/* true if @key is strictly within @node
62856 +
62857 + we are looking for possibly non-unique key and it is item is at the edge of
62858 + @node. May be it is in the neighbor.
62859 +*/
62860 +static int znode_contains_key_strict(znode * node /* node to check key
62861 + * against */ ,
62862 + const reiser4_key *
62863 + key /* key to check */ ,
62864 + int isunique)
62865 +{
62866 + int answer;
62867 +
62868 + assert("nikita-1760", node != NULL);
62869 + assert("nikita-1722", key != NULL);
62870 +
62871 + if (keyge(key, &node->rd_key))
62872 + return 0;
62873 +
62874 + answer = keycmp(&node->ld_key, key);
62875 +
62876 + if (isunique)
62877 + return answer != GREATER_THAN;
62878 + else
62879 + return answer == LESS_THAN;
62880 +}
62881 +
62882 +/*
62883 + * Virtual Root (vroot) code.
62884 + *
62885 + * For given file system object (e.g., regular file or directory) let's
62886 + * define its "virtual root" as lowest in the tree (that is, furtherest
62887 + * from the tree root) node such that all body items of said object are
62888 + * located in a tree rooted at this node.
62889 + *
62890 + * Once vroot of object is found all tree lookups for items within body of
62891 + * this object ("object lookups") can be started from its vroot rather
62892 + * than from real root. This has following advantages:
62893 + *
62894 + * 1. amount of nodes traversed during lookup (and, hence, amount of
62895 + * key comparisons made) decreases, and
62896 + *
62897 + * 2. contention on tree root is decreased. This latter was actually
62898 + * motivating reason behind vroot, because spin lock of root node,
62899 + * which is taken when acquiring long-term lock on root node is the
62900 + * hottest lock in the reiser4.
62901 + *
62902 + * How to find vroot.
62903 + *
62904 + * When vroot of object F is not yet determined, all object lookups start
62905 + * from the root of the tree. At each tree level during traversal we have
62906 + * a node N such that a key we are looking for (which is the key inside
62907 + * object's body) is located within N. In function handle_vroot() called
62908 + * from cbk_level_lookup() we check whether N is possible vroot for
62909 + * F. Check is trivial---if neither leftmost nor rightmost item of N
62910 + * belongs to F (and we already have helpful ->owns_item() method of
62911 + * object plugin for this), then N is possible vroot of F. This, of
62912 + * course, relies on the assumption that each object occupies contiguous
62913 + * range of keys in the tree.
62914 + *
62915 + * Thus, traversing tree downward and checking each node as we go, we can
62916 + * find lowest such node, which, by definition, is vroot.
62917 + *
62918 + * How to track vroot.
62919 + *
62920 + * Nohow. If actual vroot changes, next object lookup will just restart
62921 + * from the actual tree root, refreshing object's vroot along the way.
62922 + *
62923 + */
62924 +
62925 +/*
62926 + * Check whether @node is possible vroot of @object.
62927 + */
62928 +static void handle_vroot(struct inode *object, znode * node)
62929 +{
62930 + file_plugin *fplug;
62931 + coord_t coord;
62932 +
62933 + fplug = inode_file_plugin(object);
62934 + assert("nikita-3353", fplug != NULL);
62935 + assert("nikita-3354", fplug->owns_item != NULL);
62936 +
62937 + if (unlikely(node_is_empty(node)))
62938 + return;
62939 +
62940 + coord_init_first_unit(&coord, node);
62941 + /*
62942 + * if leftmost item of @node belongs to @object, we cannot be sure
62943 + * that @node is vroot of @object, because, some items of @object are
62944 + * probably in the sub-tree rooted at the left neighbor of @node.
62945 + */
62946 + if (fplug->owns_item(object, &coord))
62947 + return;
62948 + coord_init_last_unit(&coord, node);
62949 + /* mutatis mutandis for the rightmost item */
62950 + if (fplug->owns_item(object, &coord))
62951 + return;
62952 + /* otherwise, @node is possible vroot of @object */
62953 + inode_set_vroot(object, node);
62954 +}
62955 +
62956 +/*
62957 + * helper function used by traverse tree to start tree traversal not from the
62958 + * tree root, but from @h->object's vroot, if possible.
62959 + */
62960 +static int prepare_object_lookup(cbk_handle * h)
62961 +{
62962 + znode *vroot;
62963 + int result;
62964 +
62965 + vroot = inode_get_vroot(h->object);
62966 + if (vroot == NULL) {
62967 + /*
62968 + * object doesn't have known vroot, start from real tree root.
62969 + */
62970 + return LOOKUP_CONT;
62971 + }
62972 +
62973 + h->level = znode_get_level(vroot);
62974 + /* take a long-term lock on vroot */
62975 + h->result = longterm_lock_znode(h->active_lh, vroot,
62976 + cbk_lock_mode(h->level, h),
62977 + ZNODE_LOCK_LOPRI);
62978 + result = LOOKUP_REST;
62979 + if (h->result == 0) {
62980 + int isunique;
62981 + int inside;
62982 +
62983 + isunique = h->flags & CBK_UNIQUE;
62984 + /* check that key is inside vroot */
62985 + read_lock_dk(h->tree);
62986 + inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
62987 + !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
62988 + read_unlock_dk(h->tree);
62989 + if (inside) {
62990 + h->result = zload(vroot);
62991 + if (h->result == 0) {
62992 + /* search for key in vroot. */
62993 + result = cbk_node_lookup(h);
62994 + zrelse(vroot); /*h->active_lh->node); */
62995 + if (h->active_lh->node != vroot) {
62996 + result = LOOKUP_REST;
62997 + } else if (result == LOOKUP_CONT) {
62998 + move_lh(h->parent_lh, h->active_lh);
62999 + h->flags &= ~CBK_DKSET;
63000 + }
63001 + }
63002 + }
63003 + }
63004 +
63005 + zput(vroot);
63006 +
63007 + if (IS_CBKERR(h->result) || result == LOOKUP_REST)
63008 + hput(h);
63009 + return result;
63010 +}
63011 +
63012 +/* main function that handles common parts of tree traversal: starting
63013 + (fake znode handling), restarts, error handling, completion */
63014 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
63015 +{
63016 + int done;
63017 + int iterations;
63018 + int vroot_used;
63019 +
63020 + assert("nikita-365", h != NULL);
63021 + assert("nikita-366", h->tree != NULL);
63022 + assert("nikita-367", h->key != NULL);
63023 + assert("nikita-368", h->coord != NULL);
63024 + assert("nikita-369", (h->bias == FIND_EXACT)
63025 + || (h->bias == FIND_MAX_NOT_MORE_THAN));
63026 + assert("nikita-370", h->stop_level >= LEAF_LEVEL);
63027 + assert("nikita-2949", !(h->flags & CBK_DKSET));
63028 + assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
63029 +
63030 + done = 0;
63031 + iterations = 0;
63032 + vroot_used = 0;
63033 +
63034 + /* loop for restarts */
63035 + restart:
63036 +
63037 + assert("nikita-3024", reiser4_schedulable());
63038 +
63039 + h->result = CBK_COORD_FOUND;
63040 + /* connect_znode() needs it */
63041 + h->ld_key = *reiser4_min_key();
63042 + h->rd_key = *reiser4_max_key();
63043 + h->flags |= CBK_DKSET;
63044 + h->error = NULL;
63045 +
63046 + if (!vroot_used && h->object != NULL) {
63047 + vroot_used = 1;
63048 + done = prepare_object_lookup(h);
63049 + if (done == LOOKUP_REST) {
63050 + goto restart;
63051 + } else if (done == LOOKUP_DONE)
63052 + return h->result;
63053 + }
63054 + if (h->parent_lh->node == NULL) {
63055 + done =
63056 + get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
63057 + h->parent_lh);
63058 +
63059 + assert("nikita-1637", done != -E_DEADLOCK);
63060 +
63061 + h->block = h->tree->root_block;
63062 + h->level = h->tree->height;
63063 + h->coord->node = h->parent_lh->node;
63064 +
63065 + if (done != 0)
63066 + return done;
63067 + }
63068 +
63069 + /* loop descending a tree */
63070 + while (!done) {
63071 +
63072 + if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
63073 + IS_POW(iterations))) {
63074 + warning("nikita-1481", "Too many iterations: %i",
63075 + iterations);
63076 + reiser4_print_key("key", h->key);
63077 + ++iterations;
63078 + } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
63079 + h->error =
63080 + "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
63081 + h->result = RETERR(-EIO);
63082 + break;
63083 + }
63084 + switch (cbk_level_lookup(h)) {
63085 + case LOOKUP_CONT:
63086 + move_lh(h->parent_lh, h->active_lh);
63087 + continue;
63088 + default:
63089 + wrong_return_value("nikita-372", "cbk_level");
63090 + case LOOKUP_DONE:
63091 + done = 1;
63092 + break;
63093 + case LOOKUP_REST:
63094 + hput(h);
63095 + /* deadlock avoidance is normal case. */
63096 + if (h->result != -E_DEADLOCK)
63097 + ++iterations;
63098 + reiser4_preempt_point();
63099 + goto restart;
63100 + }
63101 + }
63102 + /* that's all. The rest is error handling */
63103 + if (unlikely(h->error != NULL)) {
63104 + warning("nikita-373", "%s: level: %i, "
63105 + "lock_level: %i, stop_level: %i "
63106 + "lock_mode: %s, bias: %s",
63107 + h->error, h->level, h->lock_level, h->stop_level,
63108 + lock_mode_name(h->lock_mode), bias_name(h->bias));
63109 + reiser4_print_address("block", &h->block);
63110 + reiser4_print_key("key", h->key);
63111 + print_coord_content("coord", h->coord);
63112 + }
63113 + /* `unlikely' error case */
63114 + if (unlikely(IS_CBKERR(h->result))) {
63115 + /* failure. do cleanup */
63116 + hput(h);
63117 + } else {
63118 + assert("nikita-1605", WITH_DATA_RET
63119 + (h->coord->node, 1,
63120 + ergo((h->result == CBK_COORD_FOUND) &&
63121 + (h->bias == FIND_EXACT) &&
63122 + (!node_is_empty(h->coord->node)),
63123 + coord_is_existing_item(h->coord))));
63124 + }
63125 + return h->result;
63126 +}
63127 +
63128 +/* find delimiting keys of child
63129 +
63130 + Determine left and right delimiting keys for child pointed to by
63131 + @parent_coord.
63132 +
63133 +*/
63134 +static void find_child_delimiting_keys(znode * parent /* parent znode, passed
63135 + * locked */ ,
63136 + const coord_t * parent_coord /* coord where
63137 + * pointer to
63138 + * child is
63139 + * stored */ ,
63140 + reiser4_key * ld /* where to store left
63141 + * delimiting key */ ,
63142 + reiser4_key * rd /* where to store right
63143 + * delimiting key */ )
63144 +{
63145 + coord_t neighbor;
63146 +
63147 + assert("nikita-1484", parent != NULL);
63148 + assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
63149 +
63150 + coord_dup(&neighbor, parent_coord);
63151 +
63152 + if (neighbor.between == AT_UNIT)
63153 + /* imitate item ->lookup() behavior. */
63154 + neighbor.between = AFTER_UNIT;
63155 +
63156 + if (coord_set_to_left(&neighbor) == 0)
63157 + unit_key_by_coord(&neighbor, ld);
63158 + else {
63159 + assert("nikita-14851", 0);
63160 + *ld = *znode_get_ld_key(parent);
63161 + }
63162 +
63163 + coord_dup(&neighbor, parent_coord);
63164 + if (neighbor.between == AT_UNIT)
63165 + neighbor.between = AFTER_UNIT;
63166 + if (coord_set_to_right(&neighbor) == 0)
63167 + unit_key_by_coord(&neighbor, rd);
63168 + else
63169 + *rd = *znode_get_rd_key(parent);
63170 +}
63171 +
63172 +/*
63173 + * setup delimiting keys for a child
63174 + *
63175 + * @parent parent node
63176 + *
63177 + * @coord location in @parent where pointer to @child is
63178 + *
63179 + * @child child node
63180 + */
63181 +int
63182 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
63183 +{
63184 + reiser4_tree *tree;
63185 +
63186 + assert("nikita-2952",
63187 + znode_get_level(parent) == znode_get_level(coord->node));
63188 +
63189 + /* fast check without taking dk lock. This is safe, because
63190 + * JNODE_DKSET is never cleared once set. */
63191 + if (!ZF_ISSET(child, JNODE_DKSET)) {
63192 + tree = znode_get_tree(parent);
63193 + write_lock_dk(tree);
63194 + if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
63195 + find_child_delimiting_keys(parent, coord,
63196 + &child->ld_key,
63197 + &child->rd_key);
63198 + ON_DEBUG(child->ld_key_version =
63199 + atomic_inc_return(&delim_key_version);
63200 + child->rd_key_version =
63201 + atomic_inc_return(&delim_key_version););
63202 + ZF_SET(child, JNODE_DKSET);
63203 + }
63204 + write_unlock_dk(tree);
63205 + return 1;
63206 + }
63207 + return 0;
63208 +}
63209 +
63210 +/* Perform tree lookup at one level. This is called from cbk_traverse()
63211 + function that drives lookup through tree and calls cbk_node_lookup() to
63212 + perform lookup within one node.
63213 +
63214 + See comments in a code.
63215 +*/
63216 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
63217 +{
63218 + int ret;
63219 + int setdk;
63220 + int ldkeyset = 0;
63221 + reiser4_key ldkey;
63222 + reiser4_key key;
63223 + znode *active;
63224 +
63225 + assert("nikita-3025", reiser4_schedulable());
63226 +
63227 + /* acquire reference to @active node */
63228 + active =
63229 + zget(h->tree, &h->block, h->parent_lh->node, h->level,
63230 + reiser4_ctx_gfp_mask_get());
63231 +
63232 + if (IS_ERR(active)) {
63233 + h->result = PTR_ERR(active);
63234 + return LOOKUP_DONE;
63235 + }
63236 +
63237 + /* lock @active */
63238 + h->result = longterm_lock_znode(h->active_lh,
63239 + active,
63240 + cbk_lock_mode(h->level, h),
63241 + ZNODE_LOCK_LOPRI);
63242 + /* longterm_lock_znode() acquires additional reference to znode (which
63243 + will be later released by longterm_unlock_znode()). Release
63244 + reference acquired by zget().
63245 + */
63246 + zput(active);
63247 + if (unlikely(h->result != 0))
63248 + goto fail_or_restart;
63249 +
63250 + setdk = 0;
63251 + /* if @active is accessed for the first time, setup delimiting keys on
63252 + it. Delimiting keys are taken from the parent node. See
63253 + setup_delimiting_keys() for details.
63254 + */
63255 + if (h->flags & CBK_DKSET) {
63256 + setdk = setup_delimiting_keys(h);
63257 + h->flags &= ~CBK_DKSET;
63258 + } else {
63259 + znode *parent;
63260 +
63261 + parent = h->parent_lh->node;
63262 + h->result = zload(parent);
63263 + if (unlikely(h->result != 0))
63264 + goto fail_or_restart;
63265 +
63266 + if (!ZF_ISSET(active, JNODE_DKSET))
63267 + setdk = set_child_delimiting_keys(parent,
63268 + h->coord, active);
63269 + else {
63270 + read_lock_dk(h->tree);
63271 + find_child_delimiting_keys(parent, h->coord, &ldkey,
63272 + &key);
63273 + read_unlock_dk(h->tree);
63274 + ldkeyset = 1;
63275 + }
63276 + zrelse(parent);
63277 + }
63278 +
63279 + /* this is ugly kludge. Reminder: this is necessary, because
63280 + ->lookup() method returns coord with ->between field probably set
63281 + to something different from AT_UNIT.
63282 + */
63283 + h->coord->between = AT_UNIT;
63284 +
63285 + if (znode_just_created(active) && (h->coord->node != NULL)) {
63286 + write_lock_tree(h->tree);
63287 + /* if we are going to load znode right now, setup
63288 + ->in_parent: coord where pointer to this node is stored in
63289 + parent.
63290 + */
63291 + coord_to_parent_coord(h->coord, &active->in_parent);
63292 + write_unlock_tree(h->tree);
63293 + }
63294 +
63295 + /* check connectedness without holding tree lock---false negatives
63296 + * will be re-checked by connect_znode(), and false positives are
63297 + * impossible---@active cannot suddenly turn into unconnected
63298 + * state. */
63299 + if (!znode_is_connected(active)) {
63300 + h->result = connect_znode(h->coord, active);
63301 + if (unlikely(h->result != 0)) {
63302 + put_parent(h);
63303 + goto fail_or_restart;
63304 + }
63305 + }
63306 +
63307 + jload_prefetch(ZJNODE(active));
63308 +
63309 + if (setdk)
63310 + update_stale_dk(h->tree, active);
63311 +
63312 + /* put_parent() cannot be called earlier, because connect_znode()
63313 + assumes parent node is referenced; */
63314 + put_parent(h);
63315 +
63316 + if ((!znode_contains_key_lock(active, h->key) &&
63317 + (h->flags & CBK_TRUST_DK))
63318 + || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
63319 + /* 1. key was moved out of this node while this thread was
63320 + waiting for the lock. Restart. More elaborate solution is
63321 + to determine where key moved (to the left, or to the right)
63322 + and try to follow it through sibling pointers.
63323 +
63324 + 2. or, node itself is going to be removed from the
63325 + tree. Release lock and restart.
63326 + */
63327 + h->result = -E_REPEAT;
63328 + }
63329 + if (h->result == -E_REPEAT)
63330 + return LOOKUP_REST;
63331 +
63332 + h->result = zload_ra(active, h->ra_info);
63333 + if (h->result) {
63334 + return LOOKUP_DONE;
63335 + }
63336 +
63337 + /* sanity checks */
63338 + if (sanity_check(h)) {
63339 + zrelse(active);
63340 + return LOOKUP_DONE;
63341 + }
63342 +
63343 + /* check that key of leftmost item in the @active is the same as in
63344 + * its parent */
63345 + if (ldkeyset && !node_is_empty(active) &&
63346 + !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
63347 + warning("vs-3533", "Keys are inconsistent. Fsck?");
63348 + reiser4_print_key("inparent", &ldkey);
63349 + reiser4_print_key("inchild", &key);
63350 + h->result = RETERR(-EIO);
63351 + zrelse(active);
63352 + return LOOKUP_DONE;
63353 + }
63354 +
63355 + if (h->object != NULL)
63356 + handle_vroot(h->object, active);
63357 +
63358 + ret = cbk_node_lookup(h);
63359 +
63360 + /* h->active_lh->node might change, but active is yet to be zrelsed */
63361 + zrelse(active);
63362 +
63363 + return ret;
63364 +
63365 + fail_or_restart:
63366 + if (h->result == -E_DEADLOCK)
63367 + return LOOKUP_REST;
63368 + return LOOKUP_DONE;
63369 +}
63370 +
63371 +#if REISER4_DEBUG
63372 +/* check left and right delimiting keys of a znode */
63373 +void check_dkeys(znode * node)
63374 +{
63375 + znode *left;
63376 + znode *right;
63377 +
63378 + read_lock_tree(current_tree);
63379 + read_lock_dk(current_tree);
63380 +
63381 + assert("vs-1710", znode_is_any_locked(node));
63382 + assert("vs-1197",
63383 + !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
63384 +
63385 + left = node->left;
63386 + right = node->right;
63387 +
63388 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63389 + && left != NULL && ZF_ISSET(left, JNODE_DKSET))
63390 + /* check left neighbor. Note that left neighbor is not locked,
63391 + so it might get wrong delimiting keys therefore */
63392 + assert("vs-1198",
63393 + (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
63394 + || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
63395 +
63396 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63397 + && right != NULL && ZF_ISSET(right, JNODE_DKSET))
63398 + /* check right neighbor. Note that right neighbor is not
63399 + locked, so it might get wrong delimiting keys therefore */
63400 + assert("vs-1199",
63401 + (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
63402 + || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
63403 +
63404 + read_unlock_dk(current_tree);
63405 + read_unlock_tree(current_tree);
63406 +}
63407 +#endif
63408 +
63409 +/* true if @key is left delimiting key of @node */
63410 +static int key_is_ld(znode * node, const reiser4_key * key)
63411 +{
63412 + int ld;
63413 +
63414 + assert("nikita-1716", node != NULL);
63415 + assert("nikita-1758", key != NULL);
63416 +
63417 + read_lock_dk(znode_get_tree(node));
63418 + assert("nikita-1759", znode_contains_key(node, key));
63419 + ld = keyeq(znode_get_ld_key(node), key);
63420 + read_unlock_dk(znode_get_tree(node));
63421 + return ld;
63422 +}
63423 +
63424 +/* Process one node during tree traversal.
63425 +
63426 + This is called by cbk_level_lookup(). */
63427 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
63428 +{
63429 + /* node plugin of @active */
63430 + node_plugin *nplug;
63431 + /* item plugin of item that was found */
63432 + item_plugin *iplug;
63433 + /* search bias */
63434 + lookup_bias node_bias;
63435 + /* node we are operating upon */
63436 + znode *active;
63437 + /* tree we are searching in */
63438 + reiser4_tree *tree;
63439 + /* result */
63440 + int result;
63441 +
63442 + assert("nikita-379", h != NULL);
63443 +
63444 + active = h->active_lh->node;
63445 + tree = h->tree;
63446 +
63447 + nplug = active->nplug;
63448 + assert("nikita-380", nplug != NULL);
63449 +
63450 + ON_DEBUG(check_dkeys(active));
63451 +
63452 + /* return item from "active" node with maximal key not greater than
63453 + "key" */
63454 + node_bias = h->bias;
63455 + result = nplug->lookup(active, h->key, node_bias, h->coord);
63456 + if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
63457 + /* error occurred */
63458 + h->result = result;
63459 + return LOOKUP_DONE;
63460 + }
63461 + if (h->level == h->stop_level) {
63462 + /* welcome to the stop level */
63463 + assert("nikita-381", h->coord->node == active);
63464 + if (result == NS_FOUND) {
63465 + /* success of tree lookup */
63466 + if (!(h->flags & CBK_UNIQUE)
63467 + && key_is_ld(active, h->key)) {
63468 + return search_to_left(h);
63469 + } else
63470 + h->result = CBK_COORD_FOUND;
63471 + } else {
63472 + h->result = CBK_COORD_NOTFOUND;
63473 + }
63474 + if (!(h->flags & CBK_IN_CACHE))
63475 + cbk_cache_add(active);
63476 + return LOOKUP_DONE;
63477 + }
63478 +
63479 + if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
63480 + h->error = "not found on internal node";
63481 + h->result = result;
63482 + return LOOKUP_DONE;
63483 + }
63484 +
63485 + assert("vs-361", h->level > h->stop_level);
63486 +
63487 + if (handle_eottl(h, &result)) {
63488 + assert("vs-1674", (result == LOOKUP_DONE ||
63489 + result == LOOKUP_REST));
63490 + return result;
63491 + }
63492 +
63493 + /* go down to next level */
63494 + check_me("vs-12", zload(h->coord->node) == 0);
63495 + assert("nikita-2116", item_is_internal(h->coord));
63496 + iplug = item_plugin_by_coord(h->coord);
63497 + iplug->s.internal.down_link(h->coord, h->key, &h->block);
63498 + zrelse(h->coord->node);
63499 + --h->level;
63500 + return LOOKUP_CONT; /* continue */
63501 +}
63502 +
63503 +/* scan cbk_cache slots looking for a match for @h */
63504 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
63505 +{
63506 + level_lookup_result llr;
63507 + znode *node;
63508 + reiser4_tree *tree;
63509 + cbk_cache_slot *slot;
63510 + cbk_cache *cache;
63511 + tree_level level;
63512 + int isunique;
63513 + const reiser4_key *key;
63514 + int result;
63515 +
63516 + assert("nikita-1317", h != NULL);
63517 + assert("nikita-1315", h->tree != NULL);
63518 + assert("nikita-1316", h->key != NULL);
63519 +
63520 + tree = h->tree;
63521 + cache = &tree->cbk_cache;
63522 + if (cache->nr_slots == 0)
63523 + /* size of cbk cache was set to 0 by mount time option. */
63524 + return RETERR(-ENOENT);
63525 +
63526 + assert("nikita-2474", cbk_cache_invariant(cache));
63527 + node = NULL; /* to keep gcc happy */
63528 + level = h->level;
63529 + key = h->key;
63530 + isunique = h->flags & CBK_UNIQUE;
63531 + result = RETERR(-ENOENT);
63532 +
63533 + /*
63534 + * this is time-critical function and dragons had, hence, been settled
63535 + * here.
63536 + *
63537 + * Loop below scans cbk cache slots trying to find matching node with
63538 + * suitable range of delimiting keys and located at the h->level.
63539 + *
63540 + * Scan is done under cbk cache spin lock that protects slot->node
63541 + * pointers. If suitable node is found we want to pin it in
63542 + * memory. But slot->node can point to the node with x_count 0
63543 + * (unreferenced). Such node can be recycled at any moment, or can
63544 + * already be in the process of being recycled (within jput()).
63545 + *
63546 + * As we found node in the cbk cache, it means that jput() hasn't yet
63547 + * called cbk_cache_invalidate().
63548 + *
63549 + * We acquire reference to the node without holding tree lock, and
63550 + * later, check node's RIP bit. This avoids races with jput().
63551 + */
63552 +
63553 + rcu_read_lock();
63554 + read_lock(&((cbk_cache *)cache)->guard);
63555 +
63556 + slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
63557 + slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
63558 + BUG_ON(&slot->lru != &cache->lru);/*????*/
63559 + while (1) {
63560 +
63561 + slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
63562 +
63563 + if (&cache->lru != &slot->lru)
63564 + node = slot->node;
63565 + else
63566 + node = NULL;
63567 +
63568 + if (unlikely(node == NULL))
63569 + break;
63570 +
63571 + /*
63572 + * this is (hopefully) the only place in the code where we are
63573 + * working with delimiting keys without holding dk lock. This
63574 + * is fine here, because this is only "guess" anyway---keys
63575 + * are rechecked under dk lock below.
63576 + */
63577 + if (znode_get_level(node) == level &&
63578 + /* reiser4_min_key < key < reiser4_max_key */
63579 + znode_contains_key_strict(node, key, isunique)) {
63580 + zref(node);
63581 + result = 0;
63582 + spin_lock_prefetch(&tree->tree_lock);
63583 + break;
63584 + }
63585 + }
63586 + read_unlock(&((cbk_cache *)cache)->guard);
63587 +
63588 + assert("nikita-2475", cbk_cache_invariant(cache));
63589 +
63590 + if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
63591 + result = -ENOENT;
63592 +
63593 + rcu_read_unlock();
63594 +
63595 + if (result != 0) {
63596 + h->result = CBK_COORD_NOTFOUND;
63597 + return RETERR(-ENOENT);
63598 + }
63599 +
63600 + result =
63601 + longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
63602 + ZNODE_LOCK_LOPRI);
63603 + zput(node);
63604 + if (result != 0)
63605 + return result;
63606 + result = zload(node);
63607 + if (result != 0)
63608 + return result;
63609 +
63610 + /* recheck keys */
63611 + read_lock_dk(tree);
63612 + result = (znode_contains_key_strict(node, key, isunique) &&
63613 + !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
63614 + read_unlock_dk(tree);
63615 + if (result) {
63616 + /* do lookup inside node */
63617 + llr = cbk_node_lookup(h);
63618 + /* if cbk_node_lookup() wandered to another node (due to eottl
63619 + or non-unique keys), adjust @node */
63620 + /*node = h->active_lh->node; */
63621 +
63622 + if (llr != LOOKUP_DONE) {
63623 + /* restart or continue on the next level */
63624 + result = RETERR(-ENOENT);
63625 + } else if (IS_CBKERR(h->result))
63626 + /* io or oom */
63627 + result = RETERR(-ENOENT);
63628 + else {
63629 + /* good. Either item found or definitely not found. */
63630 + result = 0;
63631 +
63632 + write_lock(&(cache->guard));
63633 + if (slot->node == h->active_lh->node /*node */ ) {
63634 + /* if this node is still in cbk cache---move
63635 + its slot to the head of the LRU list. */
63636 + list_move(&slot->lru, &cache->lru);
63637 + }
63638 + write_unlock(&(cache->guard));
63639 + }
63640 + } else {
63641 + /* race. While this thread was waiting for the lock, node was
63642 + rebalanced and item we are looking for, shifted out of it
63643 + (if it ever was here).
63644 +
63645 + Continuing scanning is almost hopeless: node key range was
63646 + moved to, is almost certainly at the beginning of the LRU
63647 + list at this time, because it's hot, but restarting
63648 + scanning from the very beginning is complex. Just return,
63649 + so that cbk() will be performed. This is not that
63650 + important, because such races should be rare. Are they?
63651 + */
63652 + result = RETERR(-ENOENT); /* -ERAUGHT */
63653 + }
63654 + zrelse(node);
63655 + assert("nikita-2476", cbk_cache_invariant(cache));
63656 + return result;
63657 +}
63658 +
63659 +/* look for item with given key in the coord cache
63660 +
63661 + This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
63662 + which is a small LRU list of znodes accessed lately. For each znode in
63663 + znode in this list, it checks whether key we are looking for fits into key
63664 + range covered by this node. If so, and in addition, node lies at allowed
63665 + level (this is to handle extents on a twig level), node is locked, and
63666 + lookup inside it is performed.
63667 +
63668 + we need a measurement of the cost of this cache search compared to the cost
63669 + of coord_by_key.
63670 +
63671 +*/
63672 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
63673 +{
63674 + int result = 0;
63675 + tree_level level;
63676 +
63677 + /* add CBK_IN_CACHE to the handle flags. This means that
63678 + * cbk_node_lookup() assumes that cbk_cache is scanned and would add
63679 + * found node to the cache. */
63680 + h->flags |= CBK_IN_CACHE;
63681 + for (level = h->stop_level; level <= h->lock_level; ++level) {
63682 + h->level = level;
63683 + result = cbk_cache_scan_slots(h);
63684 + if (result != 0) {
63685 + done_lh(h->active_lh);
63686 + done_lh(h->parent_lh);
63687 + } else {
63688 + assert("nikita-1319", !IS_CBKERR(h->result));
63689 + break;
63690 + }
63691 + }
63692 + h->flags &= ~CBK_IN_CACHE;
63693 + return result;
63694 +}
63695 +
63696 +/* type of lock we want to obtain during tree traversal. On stop level
63697 + we want type of lock user asked for, on upper levels: read lock. */
63698 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
63699 +{
63700 + assert("nikita-382", h != NULL);
63701 +
63702 + return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
63703 +}
63704 +
63705 +/* update outdated delimiting keys */
63706 +static void stale_dk(reiser4_tree * tree, znode * node)
63707 +{
63708 + znode *right;
63709 +
63710 + read_lock_tree(tree);
63711 + write_lock_dk(tree);
63712 + right = node->right;
63713 +
63714 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63715 + right && ZF_ISSET(right, JNODE_DKSET) &&
63716 + !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
63717 + znode_set_rd_key(node, znode_get_ld_key(right));
63718 +
63719 + write_unlock_dk(tree);
63720 + read_unlock_tree(tree);
63721 +}
63722 +
63723 +/* check for possibly outdated delimiting keys, and update them if
63724 + * necessary. */
63725 +static void update_stale_dk(reiser4_tree * tree, znode * node)
63726 +{
63727 + znode *right;
63728 + reiser4_key rd;
63729 +
63730 + read_lock_tree(tree);
63731 + read_lock_dk(tree);
63732 + rd = *znode_get_rd_key(node);
63733 + right = node->right;
63734 + if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63735 + right && ZF_ISSET(right, JNODE_DKSET) &&
63736 + !keyeq(&rd, znode_get_ld_key(right)))) {
63737 + assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
63738 + read_unlock_dk(tree);
63739 + read_unlock_tree(tree);
63740 + stale_dk(tree, node);
63741 + return;
63742 + }
63743 + read_unlock_dk(tree);
63744 + read_unlock_tree(tree);
63745 +}
63746 +
63747 +/*
63748 + * handle searches a the non-unique key.
63749 + *
63750 + * Suppose that we are looking for an item with possibly non-unique key 100.
63751 + *
63752 + * Root node contains two pointers: one to a node with left delimiting key 0,
63753 + * and another to a node with left delimiting key 100. Item we interested in
63754 + * may well happen in the sub-tree rooted at the first pointer.
63755 + *
63756 + * To handle this search_to_left() is called when search reaches stop
63757 + * level. This function checks it is _possible_ that item we are looking for
63758 + * is in the left neighbor (this can be done by comparing delimiting keys) and
63759 + * if so, tries to lock left neighbor (this is low priority lock, so it can
63760 + * deadlock, tree traversal is just restarted if it did) and then checks
63761 + * whether left neighbor actually contains items with our key.
63762 + *
63763 + * Note that this is done on the stop level only. It is possible to try such
63764 + * left-check on each level, but as duplicate keys are supposed to be rare
63765 + * (very unlikely that more than one node is completely filled with items with
63766 + * duplicate keys), it sis cheaper to scan to the left on the stop level once.
63767 + *
63768 + */
63769 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
63770 +{
63771 + level_lookup_result result;
63772 + coord_t *coord;
63773 + znode *node;
63774 + znode *neighbor;
63775 +
63776 + lock_handle lh;
63777 +
63778 + assert("nikita-1761", h != NULL);
63779 + assert("nikita-1762", h->level == h->stop_level);
63780 +
63781 + init_lh(&lh);
63782 + coord = h->coord;
63783 + node = h->active_lh->node;
63784 + assert("nikita-1763", coord_is_leftmost_unit(coord));
63785 +
63786 + h->result =
63787 + reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
63788 + GN_CAN_USE_UPPER_LEVELS);
63789 + neighbor = NULL;
63790 + switch (h->result) {
63791 + case -E_DEADLOCK:
63792 + result = LOOKUP_REST;
63793 + break;
63794 + case 0:{
63795 + node_plugin *nplug;
63796 + coord_t crd;
63797 + lookup_bias bias;
63798 +
63799 + neighbor = lh.node;
63800 + h->result = zload(neighbor);
63801 + if (h->result != 0) {
63802 + result = LOOKUP_DONE;
63803 + break;
63804 + }
63805 +
63806 + nplug = neighbor->nplug;
63807 +
63808 + coord_init_zero(&crd);
63809 + bias = h->bias;
63810 + h->bias = FIND_EXACT;
63811 + h->result =
63812 + nplug->lookup(neighbor, h->key, h->bias, &crd);
63813 + h->bias = bias;
63814 +
63815 + if (h->result == NS_NOT_FOUND) {
63816 + case -E_NO_NEIGHBOR:
63817 + h->result = CBK_COORD_FOUND;
63818 + if (!(h->flags & CBK_IN_CACHE))
63819 + cbk_cache_add(node);
63820 + default: /* some other error */
63821 + result = LOOKUP_DONE;
63822 + } else if (h->result == NS_FOUND) {
63823 + read_lock_dk(znode_get_tree(neighbor));
63824 + h->rd_key = *znode_get_ld_key(node);
63825 + leftmost_key_in_node(neighbor, &h->ld_key);
63826 + read_unlock_dk(znode_get_tree(neighbor));
63827 + h->flags |= CBK_DKSET;
63828 +
63829 + h->block = *znode_get_block(neighbor);
63830 + /* clear coord -> node so that cbk_level_lookup()
63831 + wouldn't overwrite parent hint in neighbor.
63832 +
63833 + Parent hint was set up by
63834 + reiser4_get_left_neighbor()
63835 + */
63836 + /* FIXME: why do we have to spinlock here? */
63837 + write_lock_tree(znode_get_tree(neighbor));
63838 + h->coord->node = NULL;
63839 + write_unlock_tree(znode_get_tree(neighbor));
63840 + result = LOOKUP_CONT;
63841 + } else {
63842 + result = LOOKUP_DONE;
63843 + }
63844 + if (neighbor != NULL)
63845 + zrelse(neighbor);
63846 + }
63847 + }
63848 + done_lh(&lh);
63849 + return result;
63850 +}
63851 +
63852 +/* debugging aid: return symbolic name of search bias */
63853 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
63854 +{
63855 + if (bias == FIND_EXACT)
63856 + return "exact";
63857 + else if (bias == FIND_MAX_NOT_MORE_THAN)
63858 + return "left-slant";
63859 +/* else if( bias == RIGHT_SLANT_BIAS ) */
63860 +/* return "right-bias"; */
63861 + else {
63862 + static char buf[30];
63863 +
63864 + sprintf(buf, "unknown: %i", bias);
63865 + return buf;
63866 + }
63867 +}
63868 +
63869 +#if REISER4_DEBUG
63870 +/* debugging aid: print human readable information about @p */
63871 +void print_coord_content(const char *prefix /* prefix to print */ ,
63872 + coord_t * p /* coord to print */ )
63873 +{
63874 + reiser4_key key;
63875 +
63876 + if (p == NULL) {
63877 + printk("%s: null\n", prefix);
63878 + return;
63879 + }
63880 + if ((p->node != NULL) && znode_is_loaded(p->node)
63881 + && coord_is_existing_item(p))
63882 + printk("%s: data: %p, length: %i\n", prefix,
63883 + item_body_by_coord(p), item_length_by_coord(p));
63884 + if (znode_is_loaded(p->node)) {
63885 + item_key_by_coord(p, &key);
63886 + reiser4_print_key(prefix, &key);
63887 + }
63888 +}
63889 +
63890 +/* debugging aid: print human readable information about @block */
63891 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
63892 + const reiser4_block_nr * block /* block number to print */ )
63893 +{
63894 + printk("%s: %s\n", prefix, sprint_address(block));
63895 +}
63896 +#endif
63897 +
63898 +/* return string containing human readable representation of @block */
63899 +char *sprint_address(const reiser4_block_nr *
63900 + block /* block number to print */ )
63901 +{
63902 + static char address[30];
63903 +
63904 + if (block == NULL)
63905 + sprintf(address, "null");
63906 + else if (reiser4_blocknr_is_fake(block))
63907 + sprintf(address, "%llx", (unsigned long long)(*block));
63908 + else
63909 + sprintf(address, "%llu", (unsigned long long)(*block));
63910 + return address;
63911 +}
63912 +
63913 +/* release parent node during traversal */
63914 +static void put_parent(cbk_handle * h /* search handle */ )
63915 +{
63916 + assert("nikita-383", h != NULL);
63917 + if (h->parent_lh->node != NULL) {
63918 + longterm_unlock_znode(h->parent_lh);
63919 + }
63920 +}
63921 +
63922 +/* helper function used by coord_by_key(): release reference to parent znode
63923 + stored in handle before processing its child. */
63924 +static void hput(cbk_handle * h /* search handle */ )
63925 +{
63926 + assert("nikita-385", h != NULL);
63927 + done_lh(h->parent_lh);
63928 + done_lh(h->active_lh);
63929 +}
63930 +
63931 +/* Helper function used by cbk(): update delimiting keys of child node (stored
63932 + in h->active_lh->node) using key taken from parent on the parent level. */
63933 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
63934 +{
63935 + znode *active;
63936 + reiser4_tree *tree;
63937 +
63938 + assert("nikita-1088", h != NULL);
63939 +
63940 + active = h->active_lh->node;
63941 +
63942 + /* fast check without taking dk lock. This is safe, because
63943 + * JNODE_DKSET is never cleared once set. */
63944 + if (!ZF_ISSET(active, JNODE_DKSET)) {
63945 + tree = znode_get_tree(active);
63946 + write_lock_dk(tree);
63947 + if (!ZF_ISSET(active, JNODE_DKSET)) {
63948 + znode_set_ld_key(active, &h->ld_key);
63949 + znode_set_rd_key(active, &h->rd_key);
63950 + ZF_SET(active, JNODE_DKSET);
63951 + }
63952 + write_unlock_dk(tree);
63953 + return 1;
63954 + }
63955 + return 0;
63956 +}
63957 +
63958 +/* true if @block makes sense for the @tree. Used to detect corrupted node
63959 + * pointers */
63960 +static int
63961 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
63962 + reiser4_tree * tree /* tree to check against */ )
63963 +{
63964 + assert("nikita-757", block != NULL);
63965 + assert("nikita-758", tree != NULL);
63966 +
63967 + /* check to see if it exceeds the size of the device. */
63968 + return reiser4_blocknr_is_sane_for(tree->super, block);
63969 +}
63970 +
63971 +/* check consistency of fields */
63972 +static int sanity_check(cbk_handle * h /* search handle */ )
63973 +{
63974 + assert("nikita-384", h != NULL);
63975 +
63976 + if (h->level < h->stop_level) {
63977 + h->error = "Buried under leaves";
63978 + h->result = RETERR(-EIO);
63979 + return LOOKUP_DONE;
63980 + } else if (!block_nr_is_correct(&h->block, h->tree)) {
63981 + h->error = "bad block number";
63982 + h->result = RETERR(-EIO);
63983 + return LOOKUP_DONE;
63984 + } else
63985 + return 0;
63986 +}
63987 +
63988 +/* Make Linus happy.
63989 + Local variables:
63990 + c-indentation-style: "K&R"
63991 + mode-name: "LC"
63992 + c-basic-offset: 8
63993 + tab-width: 8
63994 + fill-column: 120
63995 + scroll-step: 1
63996 + End:
63997 +*/
63998 diff -urN linux-2.6.23.orig/fs/reiser4/status_flags.c linux-2.6.23/fs/reiser4/status_flags.c
63999 --- linux-2.6.23.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
64000 +++ linux-2.6.23/fs/reiser4/status_flags.c 2007-12-04 21:05:55.810811035 +0300
64001 @@ -0,0 +1,175 @@
64002 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64003 + * reiser4/README */
64004 +
64005 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
64006 +
64007 +#include <linux/bio.h>
64008 +#include <linux/highmem.h>
64009 +#include <linux/fs.h>
64010 +#include <linux/blkdev.h>
64011 +#include "debug.h"
64012 +#include "dformat.h"
64013 +#include "status_flags.h"
64014 +#include "super.h"
64015 +
64016 +/* This is our end I/O handler that marks page uptodate if IO was successful. It also
64017 + unconditionally unlocks the page, so we can see that io was done.
64018 + We do not free bio, because we hope to reuse that. */
64019 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
64020 + int err)
64021 +{
64022 + if (bio->bi_size)
64023 + return 1;
64024 +
64025 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
64026 + SetPageUptodate(bio->bi_io_vec->bv_page);
64027 + } else {
64028 + ClearPageUptodate(bio->bi_io_vec->bv_page);
64029 + SetPageError(bio->bi_io_vec->bv_page);
64030 + }
64031 + unlock_page(bio->bi_io_vec->bv_page);
64032 + return 0;
64033 +}
64034 +
64035 +/* Initialise status code. This is expected to be called from the disk format
64036 + code. block paremeter is where status block lives. */
64037 +int reiser4_status_init(reiser4_block_nr block)
64038 +{
64039 + struct super_block *sb = reiser4_get_current_sb();
64040 + struct reiser4_status *statuspage;
64041 + struct bio *bio;
64042 + struct page *page;
64043 +
64044 + get_super_private(sb)->status_page = NULL;
64045 + get_super_private(sb)->status_bio = NULL;
64046 +
64047 + page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
64048 + if (!page)
64049 + return -ENOMEM;
64050 +
64051 + bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
64052 + if (bio != NULL) {
64053 + bio->bi_sector = block * (sb->s_blocksize >> 9);
64054 + bio->bi_bdev = sb->s_bdev;
64055 + bio->bi_io_vec[0].bv_page = page;
64056 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64057 + bio->bi_io_vec[0].bv_offset = 0;
64058 + bio->bi_vcnt = 1;
64059 + bio->bi_size = sb->s_blocksize;
64060 + bio->bi_end_io = reiser4_status_endio;
64061 + } else {
64062 + __free_pages(page, 0);
64063 + return -ENOMEM;
64064 + }
64065 + lock_page(page);
64066 + submit_bio(READ, bio);
64067 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64068 + wait_on_page_locked(page);
64069 + if (!PageUptodate(page)) {
64070 + warning("green-2007",
64071 + "I/O error while tried to read status page\n");
64072 + return -EIO;
64073 + }
64074 +
64075 + statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
64076 + if (memcmp
64077 + (statuspage->magic, REISER4_STATUS_MAGIC,
64078 + sizeof(REISER4_STATUS_MAGIC))) {
64079 + /* Magic does not match. */
64080 + kunmap_atomic((char *)statuspage, KM_USER0);
64081 + warning("green-2008", "Wrong magic in status block\n");
64082 + __free_pages(page, 0);
64083 + bio_put(bio);
64084 + return -EINVAL;
64085 + }
64086 + kunmap_atomic((char *)statuspage, KM_USER0);
64087 +
64088 + get_super_private(sb)->status_page = page;
64089 + get_super_private(sb)->status_bio = bio;
64090 + return 0;
64091 +}
64092 +
64093 +/* Query the status of fs. Returns if the FS can be safely mounted.
64094 + Also if "status" and "extended" parameters are given, it will fill
64095 + actual parts of status from disk there. */
64096 +int reiser4_status_query(u64 * status, u64 * extended)
64097 +{
64098 + struct super_block *sb = reiser4_get_current_sb();
64099 + struct reiser4_status *statuspage;
64100 + int retval;
64101 +
64102 + if (!get_super_private(sb)->status_page) { // No status page?
64103 + return REISER4_STATUS_MOUNT_UNKNOWN;
64104 + }
64105 + statuspage = (struct reiser4_status *)
64106 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64107 + switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
64108 + case REISER4_STATUS_OK:
64109 + retval = REISER4_STATUS_MOUNT_OK;
64110 + break;
64111 + case REISER4_STATUS_CORRUPTED:
64112 + retval = REISER4_STATUS_MOUNT_WARN;
64113 + break;
64114 + case REISER4_STATUS_DAMAGED:
64115 + case REISER4_STATUS_DESTROYED:
64116 + case REISER4_STATUS_IOERROR:
64117 + retval = REISER4_STATUS_MOUNT_RO;
64118 + break;
64119 + default:
64120 + retval = REISER4_STATUS_MOUNT_UNKNOWN;
64121 + break;
64122 + }
64123 +
64124 + if (status)
64125 + *status = le64_to_cpu(get_unaligned(&statuspage->status));
64126 + if (extended)
64127 + *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
64128 +
64129 + kunmap_atomic((char *)statuspage, KM_USER0);
64130 + return retval;
64131 +}
64132 +
64133 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
64134 + It fills the status structure and tries to push it to disk. */
64135 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
64136 +{
64137 + struct super_block *sb = reiser4_get_current_sb();
64138 + struct reiser4_status *statuspage;
64139 + struct bio *bio = get_super_private(sb)->status_bio;
64140 +
64141 + if (!get_super_private(sb)->status_page) { // No status page?
64142 + return -1;
64143 + }
64144 + statuspage = (struct reiser4_status *)
64145 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64146 +
64147 + put_unaligned(cpu_to_le64(status), &statuspage->status);
64148 + put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
64149 + strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
64150 +
64151 + kunmap_atomic((char *)statuspage, KM_USER0);
64152 + bio->bi_bdev = sb->s_bdev;
64153 + bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
64154 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64155 + bio->bi_io_vec[0].bv_offset = 0;
64156 + bio->bi_vcnt = 1;
64157 + bio->bi_size = sb->s_blocksize;
64158 + bio->bi_end_io = reiser4_status_endio;
64159 + lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
64160 + /* We can block now, but we have no other choice anyway */
64161 + submit_bio(WRITE, bio);
64162 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64163 + return 0; // We do not wait for io to finish.
64164 +}
64165 +
64166 +/* Frees the page with status and bio structure. Should be called by disk format at umount time */
64167 +int reiser4_status_finish(void)
64168 +{
64169 + struct super_block *sb = reiser4_get_current_sb();
64170 +
64171 + __free_pages(get_super_private(sb)->status_page, 0);
64172 + get_super_private(sb)->status_page = NULL;
64173 + bio_put(get_super_private(sb)->status_bio);
64174 + get_super_private(sb)->status_bio = NULL;
64175 + return 0;
64176 +}
64177 diff -urN linux-2.6.23.orig/fs/reiser4/status_flags.h linux-2.6.23/fs/reiser4/status_flags.h
64178 --- linux-2.6.23.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
64179 +++ linux-2.6.23/fs/reiser4/status_flags.h 2007-12-04 16:49:30.000000000 +0300
64180 @@ -0,0 +1,43 @@
64181 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64182 + * reiser4/README */
64183 +
64184 +/* Here we declare structures and flags that store reiser4 status on disk.
64185 + The status that helps us to find out if the filesystem is valid or if it
64186 + contains some critical, or not so critical errors */
64187 +
64188 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
64189 +#define __REISER4_STATUS_FLAGS_H__
64190 +
64191 +#include "dformat.h"
64192 +/* These are major status flags */
64193 +#define REISER4_STATUS_OK 0
64194 +#define REISER4_STATUS_CORRUPTED 0x1
64195 +#define REISER4_STATUS_DAMAGED 0x2
64196 +#define REISER4_STATUS_DESTROYED 0x4
64197 +#define REISER4_STATUS_IOERROR 0x8
64198 +
64199 +/* Return values for reiser4_status_query() */
64200 +#define REISER4_STATUS_MOUNT_OK 0
64201 +#define REISER4_STATUS_MOUNT_WARN 1
64202 +#define REISER4_STATUS_MOUNT_RO 2
64203 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
64204 +
64205 +#define REISER4_TEXTERROR_LEN 256
64206 +
64207 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
64208 +/* We probably need to keep its size under sector size which is 512 bytes */
64209 +struct reiser4_status {
64210 + char magic[16];
64211 + d64 status; /* Current FS state */
64212 + d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g.
64213 + last sector where io error happened if status is "io error encountered" */
64214 + d64 stacktrace[10]; /* Last ten functional calls made (addresses) */
64215 + char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
64216 +};
64217 +
64218 +int reiser4_status_init(reiser4_block_nr block);
64219 +int reiser4_status_query(u64 * status, u64 * extended);
64220 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
64221 +int reiser4_status_finish(void);
64222 +
64223 +#endif
64224 diff -urN linux-2.6.23.orig/fs/reiser4/super.c linux-2.6.23/fs/reiser4/super.c
64225 --- linux-2.6.23.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
64226 +++ linux-2.6.23/fs/reiser4/super.c 2007-12-04 16:49:30.000000000 +0300
64227 @@ -0,0 +1,316 @@
64228 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64229 + * reiser4/README */
64230 +
64231 +/* Super-block manipulations. */
64232 +
64233 +#include "debug.h"
64234 +#include "dformat.h"
64235 +#include "key.h"
64236 +#include "plugin/security/perm.h"
64237 +#include "plugin/space/space_allocator.h"
64238 +#include "plugin/plugin.h"
64239 +#include "tree.h"
64240 +#include "vfs_ops.h"
64241 +#include "super.h"
64242 +#include "reiser4.h"
64243 +
64244 +#include <linux/types.h> /* for __u?? */
64245 +#include <linux/fs.h> /* for struct super_block */
64246 +
64247 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
64248 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
64249 +static __u64 reserved_for_root(const struct super_block *super);
64250 +
64251 +/* Return reiser4-specific part of super block */
64252 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
64253 + * queried */ )
64254 +{
64255 + return (reiser4_super_info_data *) super->s_fs_info;
64256 +}
64257 +
64258 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
64259 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
64260 +{
64261 + assert("nikita-448", super != NULL);
64262 + assert("nikita-449", is_reiser4_super(super));
64263 + return (long)REISER4_SUPER_MAGIC;
64264 +}
64265 +
64266 +/* functions to read/modify fields of reiser4_super_info_data */
64267 +
64268 +/* get number of blocks in file system */
64269 +__u64 reiser4_block_count(const struct super_block *super /* super block
64270 + queried */ )
64271 +{
64272 + assert("vs-494", super != NULL);
64273 + assert("vs-495", is_reiser4_super(super));
64274 + return get_super_private(super)->block_count;
64275 +}
64276 +
64277 +#if REISER4_DEBUG
64278 +/*
64279 + * number of blocks in the current file system
64280 + */
64281 +__u64 reiser4_current_block_count(void)
64282 +{
64283 + return get_current_super_private()->block_count;
64284 +}
64285 +#endif /* REISER4_DEBUG */
64286 +
64287 +/* set number of block in filesystem */
64288 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
64289 +{
64290 + assert("vs-501", super != NULL);
64291 + assert("vs-502", is_reiser4_super(super));
64292 + get_super_private(super)->block_count = nr;
64293 + /*
64294 + * The proper calculation of the reserved space counter (%5 of device
64295 + * block counter) we need a 64 bit division which is missing in Linux
64296 + * on i386 platform. Because we do not need a precise calculation here
64297 + * we can replace a div64 operation by this combination of
64298 + * multiplication and shift: 51. / (2^10) == .0498 .
64299 + * FIXME: this is a bug. It comes up only for very small filesystems
64300 + * which probably are never used. Nevertheless, it is a bug. Number of
64301 + * reserved blocks must be not less than maximal number of blocks which
64302 + * get grabbed with BA_RESERVED.
64303 + */
64304 + get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
64305 +}
64306 +
64307 +/* amount of blocks used (allocated for data) in file system */
64308 +__u64 reiser4_data_blocks(const struct super_block *super /* super block
64309 + queried */ )
64310 +{
64311 + assert("nikita-452", super != NULL);
64312 + assert("nikita-453", is_reiser4_super(super));
64313 + return get_super_private(super)->blocks_used;
64314 +}
64315 +
64316 +/* set number of block used in filesystem */
64317 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
64318 +{
64319 + assert("vs-503", super != NULL);
64320 + assert("vs-504", is_reiser4_super(super));
64321 + get_super_private(super)->blocks_used = nr;
64322 +}
64323 +
64324 +/* amount of free blocks in file system */
64325 +__u64 reiser4_free_blocks(const struct super_block *super /* super block
64326 + queried */ )
64327 +{
64328 + assert("nikita-454", super != NULL);
64329 + assert("nikita-455", is_reiser4_super(super));
64330 + return get_super_private(super)->blocks_free;
64331 +}
64332 +
64333 +/* set number of blocks free in filesystem */
64334 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
64335 +{
64336 + assert("vs-505", super != NULL);
64337 + assert("vs-506", is_reiser4_super(super));
64338 + get_super_private(super)->blocks_free = nr;
64339 +}
64340 +
64341 +/* get mkfs unique identifier */
64342 +__u32 reiser4_mkfs_id(const struct super_block *super /* super block
64343 + queried */ )
64344 +{
64345 + assert("vpf-221", super != NULL);
64346 + assert("vpf-222", is_reiser4_super(super));
64347 + return get_super_private(super)->mkfs_id;
64348 +}
64349 +
64350 +/* amount of free blocks in file system */
64351 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
64352 +{
64353 + assert("vs-497", super != NULL);
64354 + assert("vs-498", is_reiser4_super(super));
64355 + return get_super_private(super)->blocks_free_committed;
64356 +}
64357 +
64358 +/* amount of blocks in the file system reserved for @uid and @gid */
64359 +long reiser4_reserved_blocks(const struct super_block *super /* super block
64360 + queried */ ,
64361 + uid_t uid /* user id */ ,
64362 + gid_t gid /* group id */ )
64363 +{
64364 + long reserved;
64365 +
64366 + assert("nikita-456", super != NULL);
64367 + assert("nikita-457", is_reiser4_super(super));
64368 +
64369 + reserved = 0;
64370 + if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
64371 + reserved += reserved_for_gid(super, gid);
64372 + if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
64373 + reserved += reserved_for_uid(super, uid);
64374 + if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
64375 + reserved += reserved_for_root(super);
64376 + return reserved;
64377 +}
64378 +
64379 +/* get/set value of/to grabbed blocks counter */
64380 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
64381 +{
64382 + assert("zam-512", super != NULL);
64383 + assert("zam-513", is_reiser4_super(super));
64384 +
64385 + return get_super_private(super)->blocks_grabbed;
64386 +}
64387 +
64388 +__u64 reiser4_flush_reserved(const struct super_block * super)
64389 +{
64390 + assert("vpf-285", super != NULL);
64391 + assert("vpf-286", is_reiser4_super(super));
64392 +
64393 + return get_super_private(super)->blocks_flush_reserved;
64394 +}
64395 +
64396 +/* get/set value of/to counter of fake allocated formatted blocks */
64397 +__u64 reiser4_fake_allocated(const struct super_block * super)
64398 +{
64399 + assert("zam-516", super != NULL);
64400 + assert("zam-517", is_reiser4_super(super));
64401 +
64402 + return get_super_private(super)->blocks_fake_allocated;
64403 +}
64404 +
64405 +/* get/set value of/to counter of fake allocated unformatted blocks */
64406 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
64407 +{
64408 + assert("zam-516", super != NULL);
64409 + assert("zam-517", is_reiser4_super(super));
64410 +
64411 + return get_super_private(super)->blocks_fake_allocated_unformatted;
64412 +}
64413 +
64414 +/* get/set value of/to counter of clustered blocks */
64415 +__u64 reiser4_clustered_blocks(const struct super_block * super)
64416 +{
64417 + assert("edward-601", super != NULL);
64418 + assert("edward-602", is_reiser4_super(super));
64419 +
64420 + return get_super_private(super)->blocks_clustered;
64421 +}
64422 +
64423 +/* space allocator used by this file system */
64424 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
64425 + *super)
64426 +{
64427 + assert("nikita-1965", super != NULL);
64428 + assert("nikita-1966", is_reiser4_super(super));
64429 + return &get_super_private(super)->space_allocator;
64430 +}
64431 +
64432 +/* return fake inode used to bind formatted nodes in the page cache */
64433 +struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
64434 + queried */ )
64435 +{
64436 + assert("nikita-1757", super != NULL);
64437 + return get_super_private(super)->fake;
64438 +}
64439 +
64440 +/* return fake inode used to bind copied on capture nodes in the page cache */
64441 +struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
64442 + queried */ )
64443 +{
64444 + assert("nikita-1757", super != NULL);
64445 + return get_super_private(super)->cc;
64446 +}
64447 +
64448 +/* return fake inode used to bind bitmaps and journlal heads */
64449 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
64450 +{
64451 + assert("nikita-17571", super != NULL);
64452 + return get_super_private(super)->bitmap;
64453 +}
64454 +
64455 +/* tree used by this file system */
64456 +reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
64457 + * queried */ )
64458 +{
64459 + assert("nikita-460", super != NULL);
64460 + assert("nikita-461", is_reiser4_super(super));
64461 + return &get_super_private(super)->tree;
64462 +}
64463 +
64464 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
64465 + use in assertions. */
64466 +int is_reiser4_super(const struct super_block *super /* super block
64467 + * queried */ )
64468 +{
64469 + return
64470 + super != NULL &&
64471 + get_super_private(super) != NULL &&
64472 + super->s_op == &(get_super_private(super)->ops.super);
64473 +}
64474 +
64475 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
64476 +{
64477 + return test_bit((int)f, &get_super_private(super)->fs_flags);
64478 +}
64479 +
64480 +/* amount of blocks reserved for given group in file system */
64481 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
64482 + * block
64483 + * queried */ ,
64484 + gid_t gid UNUSED_ARG /* group id */ )
64485 +{
64486 + return 0;
64487 +}
64488 +
64489 +/* amount of blocks reserved for given user in file system */
64490 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
64491 + block
64492 + queried */ ,
64493 + uid_t uid UNUSED_ARG /* user id */ )
64494 +{
64495 + return 0;
64496 +}
64497 +
64498 +/* amount of blocks reserved for super user in file system */
64499 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
64500 + block
64501 + queried */ )
64502 +{
64503 + return 0;
64504 +}
64505 +
64506 +/*
64507 + * true if block number @blk makes sense for the file system at @super.
64508 + */
64509 +int
64510 +reiser4_blocknr_is_sane_for(const struct super_block *super,
64511 + const reiser4_block_nr * blk)
64512 +{
64513 + reiser4_super_info_data *sbinfo;
64514 +
64515 + assert("nikita-2957", super != NULL);
64516 + assert("nikita-2958", blk != NULL);
64517 +
64518 + if (reiser4_blocknr_is_fake(blk))
64519 + return 1;
64520 +
64521 + sbinfo = get_super_private(super);
64522 + return *blk < sbinfo->block_count;
64523 +}
64524 +
64525 +#if REISER4_DEBUG
64526 +/*
64527 + * true, if block number @blk makes sense for the current file system
64528 + */
64529 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
64530 +{
64531 + return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
64532 +}
64533 +#endif /* REISER4_DEBUG */
64534 +
64535 +/* Make Linus happy.
64536 + Local variables:
64537 + c-indentation-style: "K&R"
64538 + mode-name: "LC"
64539 + c-basic-offset: 8
64540 + tab-width: 8
64541 + fill-column: 120
64542 + End:
64543 +*/
64544 diff -urN linux-2.6.23.orig/fs/reiser4/super.h linux-2.6.23/fs/reiser4/super.h
64545 --- linux-2.6.23.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
64546 +++ linux-2.6.23/fs/reiser4/super.h 2007-12-04 16:49:30.000000000 +0300
64547 @@ -0,0 +1,466 @@
64548 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64549 + * reiser4/README */
64550 +
64551 +/* Super-block functions. See super.c for details. */
64552 +
64553 +#if !defined( __REISER4_SUPER_H__ )
64554 +#define __REISER4_SUPER_H__
64555 +
64556 +#include <linux/exportfs.h>
64557 +
64558 +#include "tree.h"
64559 +#include "entd.h"
64560 +#include "wander.h"
64561 +#include "fsdata.h"
64562 +#include "plugin/object.h"
64563 +#include "plugin/space/space_allocator.h"
64564 +
64565 +/*
64566 + * Flush algorithms parameters.
64567 + */
64568 +struct flush_params {
64569 + unsigned relocate_threshold;
64570 + unsigned relocate_distance;
64571 + unsigned written_threshold;
64572 + unsigned scan_maxnodes;
64573 +};
64574 +
64575 +typedef enum {
64576 + /*
64577 + * True if this file system doesn't support hard-links (multiple names)
64578 + * for directories: this is default UNIX behavior.
64579 + *
64580 + * If hard-links on directoires are not allowed, file system is Acyclic
64581 + * Directed Graph (modulo dot, and dotdot, of course).
64582 + *
64583 + * This is used by reiser4_link().
64584 + */
64585 + REISER4_ADG = 0,
64586 + /*
64587 + * set if all nodes in internal tree have the same node layout plugin.
64588 + * If so, znode_guess_plugin() will return tree->node_plugin in stead
64589 + * of guessing plugin by plugin id stored in the node.
64590 + */
64591 + REISER4_ONE_NODE_PLUGIN = 1,
64592 + /* if set, bsd gid assignment is supported. */
64593 + REISER4_BSD_GID = 2,
64594 + /* [mac]_time are 32 bit in inode */
64595 + REISER4_32_BIT_TIMES = 3,
64596 + /* load all bitmap blocks at mount time */
64597 + REISER4_DONT_LOAD_BITMAP = 5,
64598 + /* enforce atomicity during write(2) */
64599 + REISER4_ATOMIC_WRITE = 6,
64600 + /* don't use write barriers in the log writer code. */
64601 + REISER4_NO_WRITE_BARRIER = 7
64602 +} reiser4_fs_flag;
64603 +
64604 +/*
64605 + * VFS related operation vectors.
64606 + */
64607 +struct object_ops {
64608 + struct super_operations super;
64609 + struct dentry_operations dentry;
64610 + struct export_operations export;
64611 +};
64612 +
64613 +/* reiser4-specific part of super block
64614 +
64615 + Locking
64616 +
64617 + Fields immutable after mount:
64618 +
64619 + ->oid*
64620 + ->space*
64621 + ->default_[ug]id
64622 + ->mkfs_id
64623 + ->trace_flags
64624 + ->debug_flags
64625 + ->fs_flags
64626 + ->df_plug
64627 + ->optimal_io_size
64628 + ->plug
64629 + ->flush
64630 + ->u (bad name)
64631 + ->txnmgr
64632 + ->ra_params
64633 + ->fsuid
64634 + ->journal_header
64635 + ->journal_footer
64636 +
64637 + Fields protected by ->lnode_guard
64638 +
64639 + ->lnode_htable
64640 +
64641 + Fields protected by per-super block spin lock
64642 +
64643 + ->block_count
64644 + ->blocks_used
64645 + ->blocks_free
64646 + ->blocks_free_committed
64647 + ->blocks_grabbed
64648 + ->blocks_fake_allocated_unformatted
64649 + ->blocks_fake_allocated
64650 + ->blocks_flush_reserved
64651 + ->eflushed
64652 + ->blocknr_hint_default
64653 +
64654 + After journal replaying during mount,
64655 +
64656 + ->last_committed_tx
64657 +
64658 + is protected by ->tmgr.commit_mutex
64659 +
64660 + Invariants involving this data-type:
64661 +
64662 + [sb-block-counts]
64663 + [sb-grabbed]
64664 + [sb-fake-allocated]
64665 +*/
64666 +struct reiser4_super_info_data {
64667 + /*
64668 + * guard spinlock which protects reiser4 super block fields (currently
64669 + * blocks_free, blocks_free_committed)
64670 + */
64671 + spinlock_t guard;
64672 +
64673 + /* next oid that will be returned by oid_allocate() */
64674 + oid_t next_to_use;
64675 + /* total number of used oids */
64676 + oid_t oids_in_use;
64677 +
64678 + /* space manager plugin */
64679 + reiser4_space_allocator space_allocator;
64680 +
64681 + /* reiser4 internal tree */
64682 + reiser4_tree tree;
64683 +
64684 + /*
64685 + * default user id used for light-weight files without their own
64686 + * stat-data.
64687 + */
64688 + uid_t default_uid;
64689 +
64690 + /*
64691 + * default group id used for light-weight files without their own
64692 + * stat-data.
64693 + */
64694 + gid_t default_gid;
64695 +
64696 + /* mkfs identifier generated at mkfs time. */
64697 + __u32 mkfs_id;
64698 + /* amount of blocks in a file system */
64699 + __u64 block_count;
64700 +
64701 + /* inviolable reserve */
64702 + __u64 blocks_reserved;
64703 +
64704 + /* amount of blocks used by file system data and meta-data. */
64705 + __u64 blocks_used;
64706 +
64707 + /*
64708 + * amount of free blocks. This is "working" free blocks counter. It is
64709 + * like "working" bitmap, please see block_alloc.c for description.
64710 + */
64711 + __u64 blocks_free;
64712 +
64713 + /*
64714 + * free block count for fs committed state. This is "commit" version of
64715 + * free block counter.
64716 + */
64717 + __u64 blocks_free_committed;
64718 +
64719 + /*
64720 + * number of blocks reserved for further allocation, for all
64721 + * threads.
64722 + */
64723 + __u64 blocks_grabbed;
64724 +
64725 + /* number of fake allocated unformatted blocks in tree. */
64726 + __u64 blocks_fake_allocated_unformatted;
64727 +
64728 + /* number of fake allocated formatted blocks in tree. */
64729 + __u64 blocks_fake_allocated;
64730 +
64731 + /* number of blocks reserved for flush operations. */
64732 + __u64 blocks_flush_reserved;
64733 +
64734 + /* number of blocks reserved for cluster operations. */
64735 + __u64 blocks_clustered;
64736 +
64737 + /* unique file-system identifier */
64738 + __u32 fsuid;
64739 +
64740 + /* On-disk format version. If does not equal to the disk_format
64741 + plugin version, some format updates (e.g. enlarging plugin
64742 + set, etc) may have place on mount. */
64743 + int version;
64744 +
64745 + /* file-system wide flags. See reiser4_fs_flag enum */
64746 + unsigned long fs_flags;
64747 +
64748 + /* transaction manager */
64749 + txn_mgr tmgr;
64750 +
64751 + /* ent thread */
64752 + entd_context entd;
64753 +
64754 + /* fake inode used to bind formatted nodes */
64755 + struct inode *fake;
64756 + /* inode used to bind bitmaps (and journal heads) */
64757 + struct inode *bitmap;
64758 + /* inode used to bind copied on capture nodes */
64759 + struct inode *cc;
64760 +
64761 + /* disk layout plugin */
64762 + disk_format_plugin *df_plug;
64763 +
64764 + /* disk layout specific part of reiser4 super info data */
64765 + union {
64766 + format40_super_info format40;
64767 + } u;
64768 +
64769 + /* value we return in st_blksize on stat(2) */
64770 + unsigned long optimal_io_size;
64771 +
64772 + /* parameters for the flush algorithm */
64773 + struct flush_params flush;
64774 +
64775 + /* pointers to jnodes for journal header and footer */
64776 + jnode *journal_header;
64777 + jnode *journal_footer;
64778 +
64779 + journal_location jloc;
64780 +
64781 + /* head block number of last committed transaction */
64782 + __u64 last_committed_tx;
64783 +
64784 + /*
64785 + * we remember last written location for using as a hint for new block
64786 + * allocation
64787 + */
64788 + __u64 blocknr_hint_default;
64789 +
64790 + /* committed number of files (oid allocator state variable ) */
64791 + __u64 nr_files_committed;
64792 +
64793 + struct formatted_ra_params ra_params;
64794 +
64795 + /*
64796 + * A mutex for serializing cut tree operation if out-of-free-space:
64797 + * the only one cut_tree thread is allowed to grab space from reserved
64798 + * area (it is 5% of disk space)
64799 + */
64800 + struct mutex delete_mutex;
64801 + /* task owning ->delete_mutex */
64802 + struct task_struct *delete_mutex_owner;
64803 +
64804 + /* Diskmap's blocknumber */
64805 + __u64 diskmap_block;
64806 +
64807 + /* What to do in case of error */
64808 + int onerror;
64809 +
64810 + /* operations for objects on this file system */
64811 + struct object_ops ops;
64812 +
64813 + /*
64814 + * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
64815 + * more details
64816 + */
64817 + struct d_cursor_info d_info;
64818 +
64819 +#ifdef CONFIG_REISER4_BADBLOCKS
64820 + /* Alternative master superblock offset (in bytes) */
64821 + unsigned long altsuper;
64822 +#endif
64823 + struct repacker *repacker;
64824 + struct page *status_page;
64825 + struct bio *status_bio;
64826 +
64827 +#if REISER4_DEBUG
64828 + /*
64829 + * minimum used blocks value (includes super blocks, bitmap blocks and
64830 + * other fs reserved areas), depends on fs format and fs size.
64831 + */
64832 + __u64 min_blocks_used;
64833 +
64834 + /*
64835 + * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
64836 + * are kept on a list anchored at sbinfo->all_jnodes. This list is
64837 + * protected by sbinfo->all_guard spin lock. This lock should be taken
64838 + * with _irq modifier, because it is also modified from interrupt
64839 + * contexts (by RCU).
64840 + */
64841 + spinlock_t all_guard;
64842 + /* list of all jnodes */
64843 + struct list_head all_jnodes;
64844 +#endif
64845 + struct dentry *debugfs_root;
64846 +};
64847 +
64848 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
64849 + super_block *super);
64850 +
64851 +/* Return reiser4-specific part of super block */
64852 +static inline reiser4_super_info_data *get_super_private(const struct
64853 + super_block *super)
64854 +{
64855 + assert("nikita-447", super != NULL);
64856 +
64857 + return (reiser4_super_info_data *) super->s_fs_info;
64858 +}
64859 +
64860 +/* get ent context for the @super */
64861 +static inline entd_context *get_entd_context(struct super_block *super)
64862 +{
64863 + return &get_super_private(super)->entd;
64864 +}
64865 +
64866 +/* "Current" super-block: main super block used during current system
64867 + call. Reference to this super block is stored in reiser4_context. */
64868 +static inline struct super_block *reiser4_get_current_sb(void)
64869 +{
64870 + return get_current_context()->super;
64871 +}
64872 +
64873 +/* Reiser4-specific part of "current" super-block: main super block used
64874 + during current system call. Reference to this super block is stored in
64875 + reiser4_context. */
64876 +static inline reiser4_super_info_data *get_current_super_private(void)
64877 +{
64878 + return get_super_private(reiser4_get_current_sb());
64879 +}
64880 +
64881 +static inline struct formatted_ra_params *get_current_super_ra_params(void)
64882 +{
64883 + return &(get_current_super_private()->ra_params);
64884 +}
64885 +
64886 +/*
64887 + * true, if file system on @super is read-only
64888 + */
64889 +static inline int rofs_super(struct super_block *super)
64890 +{
64891 + return super->s_flags & MS_RDONLY;
64892 +}
64893 +
64894 +/*
64895 + * true, if @tree represents read-only file system
64896 + */
64897 +static inline int rofs_tree(reiser4_tree * tree)
64898 +{
64899 + return rofs_super(tree->super);
64900 +}
64901 +
64902 +/*
64903 + * true, if file system where @inode lives on, is read-only
64904 + */
64905 +static inline int rofs_inode(struct inode *inode)
64906 +{
64907 + return rofs_super(inode->i_sb);
64908 +}
64909 +
64910 +/*
64911 + * true, if file system where @node lives on, is read-only
64912 + */
64913 +static inline int rofs_jnode(jnode * node)
64914 +{
64915 + return rofs_tree(jnode_get_tree(node));
64916 +}
64917 +
64918 +extern __u64 reiser4_current_block_count(void);
64919 +
64920 +extern void build_object_ops(struct super_block *super, struct object_ops * ops);
64921 +
64922 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
64923 +
64924 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
64925 +{
64926 + spin_lock(&(sbinfo->guard));
64927 +}
64928 +
64929 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
64930 +{
64931 + assert_spin_locked(&(sbinfo->guard));
64932 + spin_unlock(&(sbinfo->guard));
64933 +}
64934 +
64935 +extern __u64 reiser4_flush_reserved(const struct super_block *);
64936 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
64937 +extern long reiser4_statfs_type(const struct super_block *super);
64938 +extern __u64 reiser4_block_count(const struct super_block *super);
64939 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
64940 +extern __u64 reiser4_data_blocks(const struct super_block *super);
64941 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
64942 +extern __u64 reiser4_free_blocks(const struct super_block *super);
64943 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
64944 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
64945 +
64946 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
64947 +
64948 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
64949 +extern __u64 reiser4_fake_allocated(const struct super_block *);
64950 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
64951 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
64952 +
64953 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
64954 + gid_t gid);
64955 +
64956 +extern reiser4_space_allocator *
64957 +reiser4_get_space_allocator(const struct super_block *super);
64958 +extern reiser4_oid_allocator *
64959 +reiser4_get_oid_allocator(const struct super_block *super);
64960 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
64961 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
64962 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
64963 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
64964 +extern int is_reiser4_super(const struct super_block *super);
64965 +
64966 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
64967 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
64968 + const reiser4_block_nr * blk);
64969 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
64970 +extern int reiser4_done_super(struct super_block *s);
64971 +
64972 +/* step of fill super */
64973 +extern int reiser4_init_fs_info(struct super_block *);
64974 +extern void reiser4_done_fs_info(struct super_block *);
64975 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
64976 +extern int reiser4_init_read_super(struct super_block *, int silent);
64977 +extern int reiser4_init_root_inode(struct super_block *);
64978 +extern reiser4_plugin *get_default_plugin(pset_member memb);
64979 +
64980 +/* Maximal possible object id. */
64981 +#define ABSOLUTE_MAX_OID ((oid_t)~0)
64982 +
64983 +#define OIDS_RESERVED ( 1 << 16 )
64984 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
64985 +oid_t oid_allocate(struct super_block *);
64986 +int oid_release(struct super_block *, oid_t);
64987 +oid_t oid_next(const struct super_block *);
64988 +void oid_count_allocated(void);
64989 +void oid_count_released(void);
64990 +long oids_used(const struct super_block *);
64991 +
64992 +#if REISER4_DEBUG
64993 +void print_fs_info(const char *prefix, const struct super_block *);
64994 +#endif
64995 +
64996 +extern void destroy_reiser4_cache(struct kmem_cache **);
64997 +
64998 +extern struct super_operations reiser4_super_operations;
64999 +extern struct export_operations reiser4_export_operations;
65000 +extern struct dentry_operations reiser4_dentry_operations;
65001 +
65002 +/* __REISER4_SUPER_H__ */
65003 +#endif
65004 +
65005 +/*
65006 + * Local variables:
65007 + * c-indentation-style: "K&R"
65008 + * mode-name: "LC"
65009 + * c-basic-offset: 8
65010 + * tab-width: 8
65011 + * fill-column: 120
65012 + * End:
65013 + */
65014 diff -urN linux-2.6.23.orig/fs/reiser4/super_ops.c linux-2.6.23/fs/reiser4/super_ops.c
65015 --- linux-2.6.23.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
65016 +++ linux-2.6.23/fs/reiser4/super_ops.c 2007-12-04 22:57:33.998734400 +0300
65017 @@ -0,0 +1,724 @@
65018 +/* Copyright 2005 by Hans Reiser, licensing governed by
65019 + * reiser4/README */
65020 +
65021 +#include "inode.h"
65022 +#include "page_cache.h"
65023 +#include "ktxnmgrd.h"
65024 +#include "flush.h"
65025 +#include "safe_link.h"
65026 +
65027 +#include <linux/vfs.h>
65028 +#include <linux/writeback.h>
65029 +#include <linux/mount.h>
65030 +#include <linux/seq_file.h>
65031 +#include <linux/debugfs.h>
65032 +
65033 +/* slab cache for inodes */
65034 +static struct kmem_cache *inode_cache;
65035 +
65036 +static struct dentry *reiser4_debugfs_root = NULL;
65037 +
65038 +/**
65039 + * init_once - constructor for reiser4 inodes
65040 + * @cache: cache @obj belongs to
65041 + * @obj: inode to be initialized
65042 + *
65043 + * Initialization function to be called when new page is allocated by reiser4
65044 + * inode cache. It is set on inode cache creation.
65045 + */
65046 +static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
65047 +{
65048 + struct reiser4_inode_object *info;
65049 +
65050 + info = obj;
65051 +
65052 + /* initialize vfs inode */
65053 + inode_init_once(&info->vfs_inode);
65054 +
65055 + /*
65056 + * initialize reiser4 specific part fo inode.
65057 + * NOTE-NIKITA add here initializations for locks, list heads,
65058 + * etc. that will be added to our private inode part.
65059 + */
65060 + INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
65061 + init_rwsem(&info->p.conv_sem);
65062 + /* init semaphore which is used during inode loading */
65063 + loading_init_once(&info->p);
65064 + INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
65065 + GFP_ATOMIC);
65066 +#if REISER4_DEBUG
65067 + info->p.nr_jnodes = 0;
65068 +#endif
65069 +}
65070 +
65071 +/**
65072 + * init_inodes - create znode cache
65073 + *
65074 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
65075 + */
65076 +static int init_inodes(void)
65077 +{
65078 + inode_cache = kmem_cache_create("reiser4_inode",
65079 + sizeof(struct reiser4_inode_object),
65080 + 0,
65081 + SLAB_HWCACHE_ALIGN |
65082 + SLAB_RECLAIM_ACCOUNT, init_once);
65083 + if (inode_cache == NULL)
65084 + return RETERR(-ENOMEM);
65085 + return 0;
65086 +}
65087 +
65088 +/**
65089 + * done_inodes - delete inode cache
65090 + *
65091 + * This is called on reiser4 module unloading or system shutdown.
65092 + */
65093 +static void done_inodes(void)
65094 +{
65095 + destroy_reiser4_cache(&inode_cache);
65096 +}
65097 +
65098 +/**
65099 + * reiser4_alloc_inode - alloc_inode of super operations
65100 + * @super: super block new inode is allocated for
65101 + *
65102 + * Allocates new inode, initializes reiser4 specific part of it.
65103 + */
65104 +static struct inode *reiser4_alloc_inode(struct super_block *super)
65105 +{
65106 + struct reiser4_inode_object *obj;
65107 +
65108 + assert("nikita-1696", super != NULL);
65109 + obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
65110 + if (obj != NULL) {
65111 + reiser4_inode *info;
65112 +
65113 + info = &obj->p;
65114 +
65115 + info->pset = plugin_set_get_empty();
65116 + info->hset = plugin_set_get_empty();
65117 + info->extmask = 0;
65118 + info->locality_id = 0ull;
65119 + info->plugin_mask = 0;
65120 + info->heir_mask = 0;
65121 +#if !REISER4_INO_IS_OID
65122 + info->oid_hi = 0;
65123 +#endif
65124 + reiser4_seal_init(&info->sd_seal, NULL, NULL);
65125 + coord_init_invalid(&info->sd_coord, NULL);
65126 + info->flags = 0;
65127 + spin_lock_init(&info->guard);
65128 + /* this deals with info's loading semaphore */
65129 + loading_alloc(info);
65130 + info->vroot = UBER_TREE_ADDR;
65131 + return &obj->vfs_inode;
65132 + } else
65133 + return NULL;
65134 +}
65135 +
65136 +/**
65137 + * reiser4_destroy_inode - destroy_inode of super operations
65138 + * @inode: inode being destroyed
65139 + *
65140 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
65141 + */
65142 +static void reiser4_destroy_inode(struct inode *inode)
65143 +{
65144 + reiser4_inode *info;
65145 +
65146 + info = reiser4_inode_data(inode);
65147 +
65148 + assert("vs-1220", inode_has_no_jnodes(info));
65149 +
65150 + if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
65151 + file_plugin *fplug = inode_file_plugin(inode);
65152 + if (fplug->destroy_inode != NULL)
65153 + fplug->destroy_inode(inode);
65154 + }
65155 + reiser4_dispose_cursors(inode);
65156 + if (info->pset)
65157 + plugin_set_put(info->pset);
65158 + if (info->hset)
65159 + plugin_set_put(info->hset);
65160 +
65161 + /*
65162 + * cannot add similar assertion about ->i_list as prune_icache return
65163 + * inode into slab with dangling ->list.{next,prev}. This is safe,
65164 + * because they are re-initialized in the new_inode().
65165 + */
65166 + assert("nikita-2895", list_empty(&inode->i_dentry));
65167 + assert("nikita-2896", hlist_unhashed(&inode->i_hash));
65168 + assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
65169 +
65170 + /* this deals with info's loading semaphore */
65171 + loading_destroy(info);
65172 +
65173 + kmem_cache_free(inode_cache,
65174 + container_of(info, struct reiser4_inode_object, p));
65175 +}
65176 +
65177 +/**
65178 + * reiser4_dirty_inode - dirty_inode of super operations
65179 + * @inode: inode being dirtied
65180 + *
65181 + * Updates stat data.
65182 + */
65183 +static void reiser4_dirty_inode(struct inode *inode)
65184 +{
65185 + int result;
65186 +
65187 + if (!is_in_reiser4_context())
65188 + return;
65189 + assert("", !IS_RDONLY(inode));
65190 + assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
65191 + get_current_context()->grabbed_blocks));
65192 +
65193 + result = reiser4_update_sd(inode);
65194 + if (result)
65195 + warning("", "failed to dirty inode for %llu: %d",
65196 + get_inode_oid(inode), result);
65197 +}
65198 +
65199 +/**
65200 + * reiser4_delete_inode - delete_inode of super operations
65201 + * @inode: inode to delete
65202 + *
65203 + * Calls file plugin's delete_object method to delete object items from
65204 + * filesystem tree and calls clear_inode.
65205 + */
65206 +static void reiser4_delete_inode(struct inode *inode)
65207 +{
65208 + reiser4_context *ctx;
65209 + file_plugin *fplug;
65210 +
65211 + ctx = reiser4_init_context(inode->i_sb);
65212 + if (IS_ERR(ctx)) {
65213 + warning("vs-15", "failed to init context");
65214 + return;
65215 + }
65216 +
65217 + if (is_inode_loaded(inode)) {
65218 + fplug = inode_file_plugin(inode);
65219 + if (fplug != NULL && fplug->delete_object != NULL)
65220 + fplug->delete_object(inode);
65221 + }
65222 +
65223 + truncate_inode_pages(&inode->i_data, 0);
65224 + inode->i_blocks = 0;
65225 + clear_inode(inode);
65226 + reiser4_exit_context(ctx);
65227 +}
65228 +
65229 +/**
65230 + * reiser4_put_super - put_super of super operations
65231 + * @super: super block to free
65232 + *
65233 + * Stops daemons, release resources, umounts in short.
65234 + */
65235 +static void reiser4_put_super(struct super_block *super)
65236 +{
65237 + reiser4_super_info_data *sbinfo;
65238 + reiser4_context *ctx;
65239 +
65240 + sbinfo = get_super_private(super);
65241 + assert("vs-1699", sbinfo);
65242 +
65243 + debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
65244 + debugfs_remove(sbinfo->tmgr.debugfs_id_count);
65245 + debugfs_remove(sbinfo->debugfs_root);
65246 +
65247 + ctx = reiser4_init_context(super);
65248 + if (IS_ERR(ctx)) {
65249 + warning("vs-17", "failed to init context");
65250 + return;
65251 + }
65252 +
65253 + /* have disk format plugin to free its resources */
65254 + if (get_super_private(super)->df_plug->release)
65255 + get_super_private(super)->df_plug->release(super);
65256 +
65257 + reiser4_done_formatted_fake(super);
65258 +
65259 + /* stop daemons: ktxnmgr and entd */
65260 + reiser4_done_entd(super);
65261 + reiser4_done_ktxnmgrd(super);
65262 + reiser4_done_txnmgr(&sbinfo->tmgr);
65263 +
65264 + reiser4_done_fs_info(super);
65265 + reiser4_exit_context(ctx);
65266 +}
65267 +
65268 +/**
65269 + * reiser4_write_super - write_super of super operations
65270 + * @super: super block to write
65271 + *
65272 + * Captures znode associated with super block, comit all transactions.
65273 + */
65274 +static void reiser4_write_super(struct super_block *super)
65275 +{
65276 + int ret;
65277 + reiser4_context *ctx;
65278 +
65279 + assert("vs-1700", !rofs_super(super));
65280 +
65281 + ctx = reiser4_init_context(super);
65282 + if (IS_ERR(ctx)) {
65283 + warning("vs-16", "failed to init context");
65284 + return;
65285 + }
65286 +
65287 + ret = reiser4_capture_super_block(super);
65288 + if (ret != 0)
65289 + warning("vs-1701",
65290 + "reiser4_capture_super_block failed in write_super: %d",
65291 + ret);
65292 + ret = txnmgr_force_commit_all(super, 0);
65293 + if (ret != 0)
65294 + warning("jmacd-77113",
65295 + "txn_force failed in write_super: %d", ret);
65296 +
65297 + super->s_dirt = 0;
65298 +
65299 + reiser4_exit_context(ctx);
65300 +}
65301 +
65302 +/**
65303 + * reiser4_statfs - statfs of super operations
65304 + * @super: super block of file system in queried
65305 + * @stafs: buffer to fill with statistics
65306 + *
65307 + * Returns information about filesystem.
65308 + */
65309 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
65310 +{
65311 + sector_t total;
65312 + sector_t reserved;
65313 + sector_t free;
65314 + sector_t forroot;
65315 + sector_t deleted;
65316 + reiser4_context *ctx;
65317 + struct super_block *super = dentry->d_sb;
65318 +
65319 + assert("nikita-408", super != NULL);
65320 + assert("nikita-409", statfs != NULL);
65321 +
65322 + ctx = reiser4_init_context(super);
65323 + if (IS_ERR(ctx))
65324 + return PTR_ERR(ctx);
65325 +
65326 + statfs->f_type = reiser4_statfs_type(super);
65327 + statfs->f_bsize = super->s_blocksize;
65328 +
65329 + /*
65330 + * 5% of total block space is reserved. This is needed for flush and
65331 + * for truncates (so that we are able to perform truncate/unlink even
65332 + * on the otherwise completely full file system). If this reservation
65333 + * is hidden from statfs(2), users will mistakenly guess that they
65334 + * have enough free space to complete some operation, which is
65335 + * frustrating.
65336 + *
65337 + * Another possible solution is to subtract ->blocks_reserved from
65338 + * ->f_bfree, but changing available space seems less intrusive than
65339 + * letting user to see 5% of disk space to be used directly after
65340 + * mkfs.
65341 + */
65342 + total = reiser4_block_count(super);
65343 + reserved = get_super_private(super)->blocks_reserved;
65344 + deleted = txnmgr_count_deleted_blocks();
65345 + free = reiser4_free_blocks(super) + deleted;
65346 + forroot = reiser4_reserved_blocks(super, 0, 0);
65347 +
65348 + /*
65349 + * These counters may be in inconsistent state because we take the
65350 + * values without keeping any global spinlock. Here we do a sanity
65351 + * check that free block counter does not exceed the number of all
65352 + * blocks.
65353 + */
65354 + if (free > total)
65355 + free = total;
65356 + statfs->f_blocks = total - reserved;
65357 + /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
65358 + if (free > reserved)
65359 + free -= reserved;
65360 + else
65361 + free = 0;
65362 + statfs->f_bfree = free;
65363 +
65364 + if (free > forroot)
65365 + free -= forroot;
65366 + else
65367 + free = 0;
65368 + statfs->f_bavail = free;
65369 +
65370 + statfs->f_files = 0;
65371 + statfs->f_ffree = 0;
65372 +
65373 + /* maximal acceptable name length depends on directory plugin. */
65374 + assert("nikita-3351", super->s_root->d_inode != NULL);
65375 + statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
65376 + reiser4_exit_context(ctx);
65377 + return 0;
65378 +}
65379 +
65380 +/**
65381 + * reiser4_clear_inode - clear_inode of super operation
65382 + * @inode: inode about to destroy
65383 + *
65384 + * Does sanity checks: being destroyed should have all jnodes detached.
65385 + */
65386 +static void reiser4_clear_inode(struct inode *inode)
65387 +{
65388 +#if REISER4_DEBUG
65389 + reiser4_inode *r4_inode;
65390 +
65391 + r4_inode = reiser4_inode_data(inode);
65392 + if (!inode_has_no_jnodes(r4_inode))
65393 + warning("vs-1732", "reiser4 inode has %ld jnodes\n",
65394 + r4_inode->nr_jnodes);
65395 +#endif
65396 +}
65397 +
65398 +/**
65399 + * reiser4_sync_inodes - sync_inodes of super operations
65400 + * @super:
65401 + * @wbc:
65402 + *
65403 + * This method is called by background and non-backgound writeback. Reiser4's
65404 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
65405 + * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
65406 + * mapping - dirty pages get into atoms. Writeout is called to flush some
65407 + * atoms.
65408 + */
65409 +static void reiser4_sync_inodes(struct super_block *super,
65410 + struct writeback_control *wbc)
65411 +{
65412 + reiser4_context *ctx;
65413 + long to_write;
65414 +
65415 + if (wbc->for_kupdate)
65416 + /* reiser4 has its own means of periodical write-out */
65417 + return;
65418 +
65419 + to_write = wbc->nr_to_write;
65420 + assert("vs-49", wbc->older_than_this == NULL);
65421 +
65422 + ctx = reiser4_init_context(super);
65423 + if (IS_ERR(ctx)) {
65424 + warning("vs-13", "failed to init context");
65425 + return;
65426 + }
65427 +
65428 + /*
65429 + * call reiser4_writepages for each of dirty inodes to turn dirty pages
65430 + * into transactions if they were not yet.
65431 + */
65432 + generic_sync_sb_inodes(super, wbc);
65433 +
65434 + /* flush goes here */
65435 + wbc->nr_to_write = to_write;
65436 + reiser4_writeout(super, wbc);
65437 +
65438 + /* avoid recursive calls to ->sync_inodes */
65439 + context_set_commit_async(ctx);
65440 + reiser4_exit_context(ctx);
65441 +}
65442 +
65443 +/**
65444 + * reiser4_show_options - show_options of super operations
65445 + * @m: file where to write information
65446 + * @mnt: mount structure
65447 + *
65448 + * Makes reiser4 mount options visible in /proc/mounts.
65449 + */
65450 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
65451 +{
65452 + struct super_block *super;
65453 + reiser4_super_info_data *sbinfo;
65454 +
65455 + super = mnt->mnt_sb;
65456 + sbinfo = get_super_private(super);
65457 +
65458 + seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
65459 + seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
65460 + seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
65461 + seq_printf(m, ",atom_max_flushers=0x%x",
65462 + sbinfo->tmgr.atom_max_flushers);
65463 + seq_printf(m, ",cbk_cache_slots=0x%x",
65464 + sbinfo->tree.cbk_cache.nr_slots);
65465 +
65466 + return 0;
65467 +}
65468 +
65469 +struct super_operations reiser4_super_operations = {
65470 + .alloc_inode = reiser4_alloc_inode,
65471 + .destroy_inode = reiser4_destroy_inode,
65472 + .dirty_inode = reiser4_dirty_inode,
65473 + .delete_inode = reiser4_delete_inode,
65474 + .put_super = reiser4_put_super,
65475 + .write_super = reiser4_write_super,
65476 + .statfs = reiser4_statfs,
65477 + .clear_inode = reiser4_clear_inode,
65478 + .sync_inodes = reiser4_sync_inodes,
65479 + .show_options = reiser4_show_options
65480 +};
65481 +
65482 +/**
65483 + * fill_super - initialize super block on mount
65484 + * @super: super block to fill
65485 + * @data: reiser4 specific mount option
65486 + * @silent:
65487 + *
65488 + * This is to be called by reiser4_get_sb. Mounts filesystem.
65489 + */
65490 +static int fill_super(struct super_block *super, void *data, int silent)
65491 +{
65492 + reiser4_context ctx;
65493 + int result;
65494 + reiser4_super_info_data *sbinfo;
65495 +
65496 + assert("zam-989", super != NULL);
65497 +
65498 + super->s_op = NULL;
65499 + init_stack_context(&ctx, super);
65500 +
65501 + /* allocate reiser4 specific super block */
65502 + if ((result = reiser4_init_fs_info(super)) != 0)
65503 + goto failed_init_sinfo;
65504 +
65505 + sbinfo = get_super_private(super);
65506 + /* initialize various reiser4 parameters, parse mount options */
65507 + if ((result = reiser4_init_super_data(super, data)) != 0)
65508 + goto failed_init_super_data;
65509 +
65510 + /* read reiser4 master super block, initialize disk format plugin */
65511 + if ((result = reiser4_init_read_super(super, silent)) != 0)
65512 + goto failed_init_read_super;
65513 +
65514 + /* initialize transaction manager */
65515 + reiser4_init_txnmgr(&sbinfo->tmgr);
65516 +
65517 + /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
65518 + if ((result = reiser4_init_ktxnmgrd(super)) != 0)
65519 + goto failed_init_ktxnmgrd;
65520 +
65521 + /* initialize entd context and start kernel thread entd */
65522 + if ((result = reiser4_init_entd(super)) != 0)
65523 + goto failed_init_entd;
65524 +
65525 + /* initialize address spaces for formatted nodes and bitmaps */
65526 + if ((result = reiser4_init_formatted_fake(super)) != 0)
65527 + goto failed_init_formatted_fake;
65528 +
65529 + /* initialize disk format plugin */
65530 + if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
65531 + goto failed_init_disk_format;
65532 +
65533 + /*
65534 + * There are some 'committed' versions of reiser4 super block counters,
65535 + * which correspond to reiser4 on-disk state. These counters are
65536 + * initialized here
65537 + */
65538 + sbinfo->blocks_free_committed = sbinfo->blocks_free;
65539 + sbinfo->nr_files_committed = oids_used(super);
65540 +
65541 + /* get inode of root directory */
65542 + if ((result = reiser4_init_root_inode(super)) != 0)
65543 + goto failed_init_root_inode;
65544 +
65545 + if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
65546 + goto failed_update_format_version;
65547 +
65548 + process_safelinks(super);
65549 + reiser4_exit_context(&ctx);
65550 +
65551 + sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
65552 + reiser4_debugfs_root);
65553 + if (sbinfo->debugfs_root) {
65554 + sbinfo->tmgr.debugfs_atom_count =
65555 + debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
65556 + sbinfo->debugfs_root,
65557 + &sbinfo->tmgr.atom_count);
65558 + sbinfo->tmgr.debugfs_id_count =
65559 + debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
65560 + sbinfo->debugfs_root,
65561 + &sbinfo->tmgr.id_count);
65562 + }
65563 + return 0;
65564 +
65565 + failed_update_format_version:
65566 + failed_init_root_inode:
65567 + if (sbinfo->df_plug->release)
65568 + sbinfo->df_plug->release(super);
65569 + failed_init_disk_format:
65570 + reiser4_done_formatted_fake(super);
65571 + failed_init_formatted_fake:
65572 + reiser4_done_entd(super);
65573 + failed_init_entd:
65574 + reiser4_done_ktxnmgrd(super);
65575 + failed_init_ktxnmgrd:
65576 + reiser4_done_txnmgr(&sbinfo->tmgr);
65577 + failed_init_read_super:
65578 + failed_init_super_data:
65579 + reiser4_done_fs_info(super);
65580 + failed_init_sinfo:
65581 + reiser4_exit_context(&ctx);
65582 + return result;
65583 +}
65584 +
65585 +/**
65586 + * reiser4_get_sb - get_sb of file_system_type operations
65587 + * @fs_type:
65588 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
65589 + * @dev_name: block device file name
65590 + * @data: specific mount options
65591 + *
65592 + * Reiser4 mount entry.
65593 + */
65594 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
65595 + const char *dev_name, void *data, struct vfsmount *mnt)
65596 +{
65597 + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
65598 +}
65599 +
65600 +/* structure describing the reiser4 filesystem implementation */
65601 +static struct file_system_type reiser4_fs_type = {
65602 + .owner = THIS_MODULE,
65603 + .name = "reiser4",
65604 + .fs_flags = FS_REQUIRES_DEV,
65605 + .get_sb = reiser4_get_sb,
65606 + .kill_sb = kill_block_super,
65607 + .next = NULL
65608 +};
65609 +
65610 +void destroy_reiser4_cache(struct kmem_cache **cachep)
65611 +{
65612 + BUG_ON(*cachep == NULL);
65613 + kmem_cache_destroy(*cachep);
65614 + *cachep = NULL;
65615 +}
65616 +
65617 +/**
65618 + * init_reiser4 - reiser4 initialization entry point
65619 + *
65620 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
65621 + * on kernel initialization or during reiser4 module load.
65622 + */
65623 +static int __init init_reiser4(void)
65624 +{
65625 + int result;
65626 +
65627 + printk(KERN_INFO
65628 + "Loading Reiser4. "
65629 + "See www.namesys.com for a description of Reiser4.\n");
65630 +
65631 + /* initialize slab cache of inodes */
65632 + if ((result = init_inodes()) != 0)
65633 + goto failed_inode_cache;
65634 +
65635 + /* initialize cache of znodes */
65636 + if ((result = init_znodes()) != 0)
65637 + goto failed_init_znodes;
65638 +
65639 + /* initialize all plugins */
65640 + if ((result = init_plugins()) != 0)
65641 + goto failed_init_plugins;
65642 +
65643 + /* initialize cache of plugin_set-s and plugin_set's hash table */
65644 + if ((result = init_plugin_set()) != 0)
65645 + goto failed_init_plugin_set;
65646 +
65647 + /* initialize caches of txn_atom-s and txn_handle-s */
65648 + if ((result = init_txnmgr_static()) != 0)
65649 + goto failed_init_txnmgr_static;
65650 +
65651 + /* initialize cache of jnodes */
65652 + if ((result = init_jnodes()) != 0)
65653 + goto failed_init_jnodes;
65654 +
65655 + /* initialize cache of flush queues */
65656 + if ((result = reiser4_init_fqs()) != 0)
65657 + goto failed_init_fqs;
65658 +
65659 + /* initialize cache of structures attached to dentry->d_fsdata */
65660 + if ((result = reiser4_init_dentry_fsdata()) != 0)
65661 + goto failed_init_dentry_fsdata;
65662 +
65663 + /* initialize cache of structures attached to file->private_data */
65664 + if ((result = reiser4_init_file_fsdata()) != 0)
65665 + goto failed_init_file_fsdata;
65666 +
65667 + /*
65668 + * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
65669 + * more details
65670 + */
65671 + if ((result = reiser4_init_d_cursor()) != 0)
65672 + goto failed_init_d_cursor;
65673 +
65674 + if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
65675 + reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
65676 + return 0;
65677 + }
65678 +
65679 + reiser4_done_d_cursor();
65680 + failed_init_d_cursor:
65681 + reiser4_done_file_fsdata();
65682 + failed_init_file_fsdata:
65683 + reiser4_done_dentry_fsdata();
65684 + failed_init_dentry_fsdata:
65685 + reiser4_done_fqs();
65686 + failed_init_fqs:
65687 + done_jnodes();
65688 + failed_init_jnodes:
65689 + done_txnmgr_static();
65690 + failed_init_txnmgr_static:
65691 + done_plugin_set();
65692 + failed_init_plugin_set:
65693 + failed_init_plugins:
65694 + done_znodes();
65695 + failed_init_znodes:
65696 + done_inodes();
65697 + failed_inode_cache:
65698 + return result;
65699 +}
65700 +
65701 +/**
65702 + * done_reiser4 - reiser4 exit entry point
65703 + *
65704 + * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
65705 + * or at module unload.
65706 + */
65707 +static void __exit done_reiser4(void)
65708 +{
65709 + int result;
65710 +
65711 + debugfs_remove(reiser4_debugfs_root);
65712 + result = unregister_filesystem(&reiser4_fs_type);
65713 + BUG_ON(result != 0);
65714 + reiser4_done_d_cursor();
65715 + reiser4_done_file_fsdata();
65716 + reiser4_done_dentry_fsdata();
65717 + reiser4_done_fqs();
65718 + done_jnodes();
65719 + done_txnmgr_static();
65720 + done_plugin_set();
65721 + done_znodes();
65722 + destroy_reiser4_cache(&inode_cache);
65723 +}
65724 +
65725 +module_init(init_reiser4);
65726 +module_exit(done_reiser4);
65727 +
65728 +MODULE_DESCRIPTION("Reiser4 filesystem");
65729 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
65730 +
65731 +MODULE_LICENSE("GPL");
65732 +
65733 +/*
65734 + * Local variables:
65735 + * c-indentation-style: "K&R"
65736 + * mode-name: "LC"
65737 + * c-basic-offset: 8
65738 + * tab-width: 8
65739 + * fill-column: 79
65740 + * End:
65741 + */
65742 diff -urN linux-2.6.23.orig/fs/reiser4/tap.c linux-2.6.23/fs/reiser4/tap.c
65743 --- linux-2.6.23.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
65744 +++ linux-2.6.23/fs/reiser4/tap.c 2007-12-04 16:49:30.000000000 +0300
65745 @@ -0,0 +1,377 @@
65746 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65747 + * reiser4/README */
65748 +
65749 +/*
65750 + Tree Access Pointer (tap).
65751 +
65752 + tap is data structure combining coord and lock handle (mostly). It is
65753 + useful when one has to scan tree nodes (for example, in readdir, or flush),
65754 + for tap functions allow moving the tap in either direction, transparently
65755 + crossing unit/item/node borders.
65756 +
65757 + Tap doesn't provide automatic synchronization of its fields as it is
65758 + supposed to be per-thread object.
65759 +*/
65760 +
65761 +#include "forward.h"
65762 +#include "debug.h"
65763 +#include "coord.h"
65764 +#include "tree.h"
65765 +#include "context.h"
65766 +#include "tap.h"
65767 +#include "znode.h"
65768 +#include "tree_walk.h"
65769 +
65770 +#if REISER4_DEBUG
65771 +static int tap_invariant(const tap_t * tap);
65772 +static void tap_check(const tap_t * tap);
65773 +#else
65774 +#define tap_check(tap) noop
65775 +#endif
65776 +
65777 +/** load node tap is pointing to, if not loaded already */
65778 +int reiser4_tap_load(tap_t * tap)
65779 +{
65780 + tap_check(tap);
65781 + if (tap->loaded == 0) {
65782 + int result;
65783 +
65784 + result = zload_ra(tap->coord->node, &tap->ra_info);
65785 + if (result != 0)
65786 + return result;
65787 + coord_clear_iplug(tap->coord);
65788 + }
65789 + ++tap->loaded;
65790 + tap_check(tap);
65791 + return 0;
65792 +}
65793 +
65794 +/** release node tap is pointing to. Dual to tap_load() */
65795 +void reiser4_tap_relse(tap_t * tap)
65796 +{
65797 + tap_check(tap);
65798 + if (tap->loaded > 0) {
65799 + --tap->loaded;
65800 + if (tap->loaded == 0) {
65801 + zrelse(tap->coord->node);
65802 + }
65803 + }
65804 + tap_check(tap);
65805 +}
65806 +
65807 +/**
65808 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
65809 + * @mode
65810 + */
65811 +void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
65812 + znode_lock_mode mode)
65813 +{
65814 + tap->coord = coord;
65815 + tap->lh = lh;
65816 + tap->mode = mode;
65817 + tap->loaded = 0;
65818 + INIT_LIST_HEAD(&tap->linkage);
65819 + reiser4_init_ra_info(&tap->ra_info);
65820 +}
65821 +
65822 +/** add @tap to the per-thread list of all taps */
65823 +void reiser4_tap_monitor(tap_t * tap)
65824 +{
65825 + assert("nikita-2623", tap != NULL);
65826 + tap_check(tap);
65827 + list_add(&tap->linkage, reiser4_taps_list());
65828 + tap_check(tap);
65829 +}
65830 +
65831 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
65832 + * loaded. */
65833 +void reiser4_tap_copy(tap_t * dst, tap_t * src)
65834 +{
65835 + assert("nikita-3193", src != NULL);
65836 + assert("nikita-3194", dst != NULL);
65837 +
65838 + *dst->coord = *src->coord;
65839 + if (src->lh->node)
65840 + copy_lh(dst->lh, src->lh);
65841 + dst->mode = src->mode;
65842 + dst->loaded = 0;
65843 + INIT_LIST_HEAD(&dst->linkage);
65844 + dst->ra_info = src->ra_info;
65845 +}
65846 +
65847 +/** finish with @tap */
65848 +void reiser4_tap_done(tap_t * tap)
65849 +{
65850 + assert("nikita-2565", tap != NULL);
65851 + tap_check(tap);
65852 + if (tap->loaded > 0)
65853 + zrelse(tap->coord->node);
65854 + done_lh(tap->lh);
65855 + tap->loaded = 0;
65856 + list_del_init(&tap->linkage);
65857 + tap->coord->node = NULL;
65858 +}
65859 +
65860 +/**
65861 + * move @tap to the new node, locked with @target. Load @target, if @tap was
65862 + * already loaded.
65863 + */
65864 +int reiser4_tap_move(tap_t * tap, lock_handle * target)
65865 +{
65866 + int result = 0;
65867 +
65868 + assert("nikita-2567", tap != NULL);
65869 + assert("nikita-2568", target != NULL);
65870 + assert("nikita-2570", target->node != NULL);
65871 + assert("nikita-2569", tap->coord->node == tap->lh->node);
65872 +
65873 + tap_check(tap);
65874 + if (tap->loaded > 0)
65875 + result = zload_ra(target->node, &tap->ra_info);
65876 +
65877 + if (result == 0) {
65878 + if (tap->loaded > 0)
65879 + zrelse(tap->coord->node);
65880 + done_lh(tap->lh);
65881 + copy_lh(tap->lh, target);
65882 + tap->coord->node = target->node;
65883 + coord_clear_iplug(tap->coord);
65884 + }
65885 + tap_check(tap);
65886 + return result;
65887 +}
65888 +
65889 +/**
65890 + * move @tap to @target. Acquire lock on @target, if @tap was already
65891 + * loaded.
65892 + */
65893 +static int tap_to(tap_t * tap, znode * target)
65894 +{
65895 + int result;
65896 +
65897 + assert("nikita-2624", tap != NULL);
65898 + assert("nikita-2625", target != NULL);
65899 +
65900 + tap_check(tap);
65901 + result = 0;
65902 + if (tap->coord->node != target) {
65903 + lock_handle here;
65904 +
65905 + init_lh(&here);
65906 + result = longterm_lock_znode(&here, target,
65907 + tap->mode, ZNODE_LOCK_HIPRI);
65908 + if (result == 0) {
65909 + result = reiser4_tap_move(tap, &here);
65910 + done_lh(&here);
65911 + }
65912 + }
65913 + tap_check(tap);
65914 + return result;
65915 +}
65916 +
65917 +/**
65918 + * move @tap to given @target, loading and locking @target->node if
65919 + * necessary
65920 + */
65921 +int tap_to_coord(tap_t * tap, coord_t * target)
65922 +{
65923 + int result;
65924 +
65925 + tap_check(tap);
65926 + result = tap_to(tap, target->node);
65927 + if (result == 0)
65928 + coord_dup(tap->coord, target);
65929 + tap_check(tap);
65930 + return result;
65931 +}
65932 +
65933 +/** return list of all taps */
65934 +struct list_head *reiser4_taps_list(void)
65935 +{
65936 + return &get_current_context()->taps;
65937 +}
65938 +
65939 +/** helper function for go_{next,prev}_{item,unit,node}() */
65940 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
65941 +{
65942 + coord_t dup;
65943 + coord_t *coord;
65944 + int result;
65945 +
65946 + int (*coord_dir) (coord_t *);
65947 + int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
65948 + void (*coord_init) (coord_t *, const znode *);
65949 + ON_DEBUG(int (*coord_check) (const coord_t *));
65950 +
65951 + assert("nikita-2556", tap != NULL);
65952 + assert("nikita-2557", tap->coord != NULL);
65953 + assert("nikita-2558", tap->lh != NULL);
65954 + assert("nikita-2559", tap->coord->node != NULL);
65955 +
65956 + tap_check(tap);
65957 + if (dir == LEFT_SIDE) {
65958 + coord_dir = units_p ? coord_prev_unit : coord_prev_item;
65959 + get_dir_neighbor = reiser4_get_left_neighbor;
65960 + coord_init = coord_init_last_unit;
65961 + } else {
65962 + coord_dir = units_p ? coord_next_unit : coord_next_item;
65963 + get_dir_neighbor = reiser4_get_right_neighbor;
65964 + coord_init = coord_init_first_unit;
65965 + }
65966 + ON_DEBUG(coord_check =
65967 + units_p ? coord_is_existing_unit : coord_is_existing_item);
65968 + assert("nikita-2560", coord_check(tap->coord));
65969 +
65970 + coord = tap->coord;
65971 + coord_dup(&dup, coord);
65972 + if (coord_dir(&dup) != 0) {
65973 + do {
65974 + /* move to the left neighboring node */
65975 + lock_handle dup;
65976 +
65977 + init_lh(&dup);
65978 + result =
65979 + get_dir_neighbor(&dup, coord->node, (int)tap->mode,
65980 + GN_CAN_USE_UPPER_LEVELS);
65981 + if (result == 0) {
65982 + result = reiser4_tap_move(tap, &dup);
65983 + if (result == 0)
65984 + coord_init(tap->coord, dup.node);
65985 + done_lh(&dup);
65986 + }
65987 + /* skip empty nodes */
65988 + } while ((result == 0) && node_is_empty(coord->node));
65989 + } else {
65990 + result = 0;
65991 + coord_dup(coord, &dup);
65992 + }
65993 + assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
65994 + tap_check(tap);
65995 + return result;
65996 +}
65997 +
65998 +/**
65999 + * move @tap to the next unit, transparently crossing item and node
66000 + * boundaries
66001 + */
66002 +int go_next_unit(tap_t * tap)
66003 +{
66004 + return go_dir_el(tap, RIGHT_SIDE, 1);
66005 +}
66006 +
66007 +/**
66008 + * move @tap to the previous unit, transparently crossing item and node
66009 + * boundaries
66010 + */
66011 +int go_prev_unit(tap_t * tap)
66012 +{
66013 + return go_dir_el(tap, LEFT_SIDE, 1);
66014 +}
66015 +
66016 +/**
66017 + * @shift times apply @actor to the @tap. This is used to move @tap by
66018 + * @shift units (or items, or nodes) in either direction.
66019 + */
66020 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
66021 +{
66022 + int result;
66023 +
66024 + assert("nikita-2555", shift >= 0);
66025 + assert("nikita-2562", tap->coord->node == tap->lh->node);
66026 +
66027 + tap_check(tap);
66028 + result = reiser4_tap_load(tap);
66029 + if (result != 0)
66030 + return result;
66031 +
66032 + for (; shift > 0; --shift) {
66033 + result = actor(tap);
66034 + assert("nikita-2563", tap->coord->node == tap->lh->node);
66035 + if (result != 0)
66036 + break;
66037 + }
66038 + reiser4_tap_relse(tap);
66039 + tap_check(tap);
66040 + return result;
66041 +}
66042 +
66043 +/** move @tap @shift units rightward */
66044 +int rewind_right(tap_t * tap, int shift)
66045 +{
66046 + return rewind_to(tap, go_next_unit, shift);
66047 +}
66048 +
66049 +/** move @tap @shift units leftward */
66050 +int rewind_left(tap_t * tap, int shift)
66051 +{
66052 + return rewind_to(tap, go_prev_unit, shift);
66053 +}
66054 +
66055 +#if REISER4_DEBUG
66056 +/** debugging function: print @tap content in human readable form */
66057 +static void print_tap(const char *prefix, const tap_t * tap)
66058 +{
66059 + if (tap == NULL) {
66060 + printk("%s: null tap\n", prefix);
66061 + return;
66062 + }
66063 + printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
66064 + tap->loaded, (&tap->linkage == tap->linkage.next &&
66065 + &tap->linkage == tap->linkage.prev),
66066 + tap->lh->node,
66067 + lock_mode_name(tap->mode));
66068 + print_coord("\tcoord", tap->coord, 0);
66069 +}
66070 +
66071 +/** check [tap-sane] invariant */
66072 +static int tap_invariant(const tap_t * tap)
66073 +{
66074 + /* [tap-sane] invariant */
66075 +
66076 + if (tap == NULL)
66077 + return 1;
66078 + /* tap->mode is one of
66079 + *
66080 + * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
66081 + */
66082 + if (tap->mode != ZNODE_NO_LOCK &&
66083 + tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
66084 + return 2;
66085 + /* tap->coord != NULL, and */
66086 + if (tap->coord == NULL)
66087 + return 3;
66088 + /* tap->lh != NULL, and */
66089 + if (tap->lh == NULL)
66090 + return 4;
66091 + /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
66092 + if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
66093 + return 5;
66094 + /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
66095 + if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
66096 + return 6;
66097 + return 0;
66098 +}
66099 +
66100 +/** debugging function: check internal @tap consistency */
66101 +static void tap_check(const tap_t * tap)
66102 +{
66103 + int result;
66104 +
66105 + result = tap_invariant(tap);
66106 + if (result != 0) {
66107 + print_tap("broken", tap);
66108 + reiser4_panic("nikita-2831", "tap broken: %i\n", result);
66109 + }
66110 +}
66111 +#endif
66112 +
66113 +/* Make Linus happy.
66114 + Local variables:
66115 + c-indentation-style: "K&R"
66116 + mode-name: "LC"
66117 + c-basic-offset: 8
66118 + tab-width: 8
66119 + fill-column: 120
66120 + scroll-step: 1
66121 + End:
66122 +*/
66123 diff -urN linux-2.6.23.orig/fs/reiser4/tap.h linux-2.6.23/fs/reiser4/tap.h
66124 --- linux-2.6.23.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
66125 +++ linux-2.6.23/fs/reiser4/tap.h 2007-12-04 16:49:30.000000000 +0300
66126 @@ -0,0 +1,70 @@
66127 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
66128 +
66129 +/* Tree Access Pointers. See tap.c for more details. */
66130 +
66131 +#if !defined( __REISER4_TAP_H__ )
66132 +#define __REISER4_TAP_H__
66133 +
66134 +#include "forward.h"
66135 +#include "readahead.h"
66136 +
66137 +/**
66138 + tree_access_pointer aka tap. Data structure combining coord_t and lock
66139 + handle.
66140 + Invariants involving this data-type, see doc/lock-ordering for details:
66141 +
66142 + [tap-sane]
66143 + */
66144 +struct tree_access_pointer {
66145 + /* coord tap is at */
66146 + coord_t *coord;
66147 + /* lock handle on ->coord->node */
66148 + lock_handle *lh;
66149 + /* mode of lock acquired by this tap */
66150 + znode_lock_mode mode;
66151 + /* incremented by reiser4_tap_load().
66152 + Decremented by reiser4_tap_relse(). */
66153 + int loaded;
66154 + /* list of taps */
66155 + struct list_head linkage;
66156 + /* read-ahead hint */
66157 + ra_info_t ra_info;
66158 +};
66159 +
66160 +typedef int (*go_actor_t) (tap_t * tap);
66161 +
66162 +extern int reiser4_tap_load(tap_t * tap);
66163 +extern void reiser4_tap_relse(tap_t * tap);
66164 +extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
66165 + znode_lock_mode mode);
66166 +extern void reiser4_tap_monitor(tap_t * tap);
66167 +extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
66168 +extern void reiser4_tap_done(tap_t * tap);
66169 +extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
66170 +extern int tap_to_coord(tap_t * tap, coord_t * target);
66171 +
66172 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
66173 +extern int go_next_unit(tap_t * tap);
66174 +extern int go_prev_unit(tap_t * tap);
66175 +extern int rewind_right(tap_t * tap, int shift);
66176 +extern int rewind_left(tap_t * tap, int shift);
66177 +
66178 +extern struct list_head *reiser4_taps_list(void);
66179 +
66180 +#define for_all_taps(tap) \
66181 + for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
66182 + reiser4_taps_list() != &tap->linkage; \
66183 + tap = list_entry(tap->linkage.next, tap_t, linkage))
66184 +
66185 +/* __REISER4_TAP_H__ */
66186 +#endif
66187 +/* Make Linus happy.
66188 + Local variables:
66189 + c-indentation-style: "K&R"
66190 + mode-name: "LC"
66191 + c-basic-offset: 8
66192 + tab-width: 8
66193 + fill-column: 120
66194 + scroll-step: 1
66195 + End:
66196 +*/
66197 diff -urN linux-2.6.23.orig/fs/reiser4/tree.c linux-2.6.23/fs/reiser4/tree.c
66198 --- linux-2.6.23.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
66199 +++ linux-2.6.23/fs/reiser4/tree.c 2007-12-04 16:49:30.000000000 +0300
66200 @@ -0,0 +1,1876 @@
66201 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66202 + * reiser4/README */
66203 +
66204 +/*
66205 + * KEYS IN A TREE.
66206 + *
66207 + * The tree consists of nodes located on the disk. Node in the tree is either
66208 + * formatted or unformatted. Formatted node is one that has structure
66209 + * understood by the tree balancing and traversal code. Formatted nodes are
66210 + * further classified into leaf and internal nodes. Latter distinctions is
66211 + * (almost) of only historical importance: general structure of leaves and
66212 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
66213 + * that are part of bodies of ordinary files and attributes.
66214 + *
66215 + * Each node in the tree spans some interval in the key space. Key ranges for
66216 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
66217 + * sense, because of the non-unique keys: intersection of key ranges for
66218 + * different nodes is either empty, or consists of exactly one key.
66219 + *
66220 + * Formatted node consists of a sequence of items. Each item spans some
66221 + * interval in key space. Key ranges for all items in a tree are disjoint,
66222 + * modulo non-unique keys again. Items within nodes are ordered in the key
66223 + * order of the smallest key in a item.
66224 + *
66225 + * Particular type of item can be further split into units. Unit is piece of
66226 + * item that can be cut from item and moved into another item of the same
66227 + * time. Units are used by balancing code to repack data during balancing.
66228 + *
66229 + * Unit can be further split into smaller entities (for example, extent unit
66230 + * represents several pages, and it is natural for extent code to operate on
66231 + * particular pages and even bytes within one unit), but this is of no
66232 + * relevance to the generic balancing and lookup code.
66233 + *
66234 + * Although item is said to "span" a range or interval of keys, it is not
66235 + * necessary that item contains piece of data addressable by each and every
66236 + * key in this range. For example, compound directory item, consisting of
66237 + * units corresponding to directory entries and keyed by hashes of file names,
66238 + * looks more as having "discrete spectrum": only some disjoint keys inside
66239 + * range occupied by this item really address data.
66240 + *
66241 + * Nonetheless, each item always has a well-defined least (minimal) key, that
66242 + * is recorded in item header, stored in the node this item is in. Also, item
66243 + * plugin can optionally define method ->max_key_inside() returning maximal
66244 + * key that can _possibly_ be located within this item. This method is used
66245 + * (mainly) to determine when given piece of data should be merged into
66246 + * existing item, instead of creating a new one. Because of this, even though
66247 + * ->max_key_inside() can be larger than any key actually located in the item,
66248 + * intervals
66249 + *
66250 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
66251 + *
66252 + * are still disjoint for all items within the _same_ node.
66253 + *
66254 + * In memory node is represented by znode. It plays several roles:
66255 + *
66256 + * . something locks are taken on
66257 + *
66258 + * . something tracked by transaction manager (this is going to change)
66259 + *
66260 + * . something used to access node data
66261 + *
66262 + * . something used to maintain tree structure in memory: sibling and
66263 + * parental linkage.
66264 + *
66265 + * . something used to organize nodes into "slums"
66266 + *
66267 + * More on znodes see in znode.[ch]
66268 + *
66269 + * DELIMITING KEYS
66270 + *
66271 + * To simplify balancing, allow some flexibility in locking and speed up
66272 + * important coord cache optimization, we keep delimiting keys of nodes in
66273 + * memory. Depending on disk format (implemented by appropriate node plugin)
66274 + * node on disk can record both left and right delimiting key, only one of
66275 + * them, or none. Still, our balancing and tree traversal code keep both
66276 + * delimiting keys for a node that is in memory stored in the znode. When
66277 + * node is first brought into memory during tree traversal, its left
66278 + * delimiting key is taken from its parent, and its right delimiting key is
66279 + * either next key in its parent, or is right delimiting key of parent if
66280 + * node is the rightmost child of parent.
66281 + *
66282 + * Physical consistency of delimiting key is protected by special dk
66283 + * read-write lock. That is, delimiting keys can only be inspected or
66284 + * modified under this lock. But dk lock is only sufficient for fast
66285 + * "pessimistic" check, because to simplify code and to decrease lock
66286 + * contention, balancing (carry) only updates delimiting keys right before
66287 + * unlocking all locked nodes on the given tree level. For example,
66288 + * coord-by-key cache scans LRU list of recently accessed znodes. For each
66289 + * node it first does fast check under dk spin lock. If key looked for is
66290 + * not between delimiting keys for this node, next node is inspected and so
66291 + * on. If key is inside of the key range, long term lock is taken on node
66292 + * and key range is rechecked.
66293 + *
66294 + * COORDINATES
66295 + *
66296 + * To find something in the tree, you supply a key, and the key is resolved
66297 + * by coord_by_key() into a coord (coordinate) that is valid as long as the
66298 + * node the coord points to remains locked. As mentioned above trees
66299 + * consist of nodes that consist of items that consist of units. A unit is
66300 + * the smallest and indivisible piece of tree as far as balancing and tree
66301 + * search are concerned. Each node, item, and unit can be addressed by
66302 + * giving its level in the tree and the key occupied by this entity. A node
66303 + * knows what the key ranges are of the items within it, and how to find its
66304 + * items and invoke their item handlers, but it does not know how to access
66305 + * individual units within its items except through the item handlers.
66306 + * coord is a structure containing a pointer to the node, the ordinal number
66307 + * of the item within this node (a sort of item offset), and the ordinal
66308 + * number of the unit within this item.
66309 + *
66310 + * TREE LOOKUP
66311 + *
66312 + * There are two types of access to the tree: lookup and modification.
66313 + *
66314 + * Lookup is a search for the key in the tree. Search can look for either
66315 + * exactly the key given to it, or for the largest key that is not greater
66316 + * than the key given to it. This distinction is determined by "bias"
66317 + * parameter of search routine (coord_by_key()). coord_by_key() either
66318 + * returns error (key is not in the tree, or some kind of external error
66319 + * occurred), or successfully resolves key into coord.
66320 + *
66321 + * This resolution is done by traversing tree top-to-bottom from root level
66322 + * to the desired level. On levels above twig level (level one above the
66323 + * leaf level) nodes consist exclusively of internal items. Internal item is
66324 + * nothing more than pointer to the tree node on the child level. On twig
66325 + * level nodes consist of internal items intermixed with extent
66326 + * items. Internal items form normal search tree structure used by traversal
66327 + * to descent through the tree.
66328 + *
66329 + * TREE LOOKUP OPTIMIZATIONS
66330 + *
66331 + * Tree lookup described above is expensive even if all nodes traversed are
66332 + * already in the memory: for each node binary search within it has to be
66333 + * performed and binary searches are CPU consuming and tend to destroy CPU
66334 + * caches.
66335 + *
66336 + * Several optimizations are used to work around this:
66337 + *
66338 + * . cbk_cache (look-aside cache for tree traversals, see search.c for
66339 + * details)
66340 + *
66341 + * . seals (see seal.[ch])
66342 + *
66343 + * . vroot (see search.c)
66344 + *
66345 + * General search-by-key is layered thusly:
66346 + *
66347 + * [check seal, if any] --ok--> done
66348 + * |
66349 + * failed
66350 + * |
66351 + * V
66352 + * [vroot defined] --no--> node = tree_root
66353 + * | |
66354 + * yes |
66355 + * | |
66356 + * V |
66357 + * node = vroot |
66358 + * | |
66359 + * | |
66360 + * | |
66361 + * V V
66362 + * [check cbk_cache for key] --ok--> done
66363 + * |
66364 + * failed
66365 + * |
66366 + * V
66367 + * [start tree traversal from node]
66368 + *
66369 + */
66370 +
66371 +#include "forward.h"
66372 +#include "debug.h"
66373 +#include "dformat.h"
66374 +#include "key.h"
66375 +#include "coord.h"
66376 +#include "plugin/item/static_stat.h"
66377 +#include "plugin/item/item.h"
66378 +#include "plugin/node/node.h"
66379 +#include "plugin/plugin.h"
66380 +#include "txnmgr.h"
66381 +#include "jnode.h"
66382 +#include "znode.h"
66383 +#include "block_alloc.h"
66384 +#include "tree_walk.h"
66385 +#include "carry.h"
66386 +#include "carry_ops.h"
66387 +#include "tap.h"
66388 +#include "tree.h"
66389 +#include "vfs_ops.h"
66390 +#include "page_cache.h"
66391 +#include "super.h"
66392 +#include "reiser4.h"
66393 +#include "inode.h"
66394 +
66395 +#include <linux/fs.h> /* for struct super_block */
66396 +#include <linux/spinlock.h>
66397 +
66398 +/* Disk address (block number) never ever used for any real tree node. This is
66399 + used as block number of "uber" znode.
66400 +
66401 + Invalid block addresses are 0 by tradition.
66402 +
66403 +*/
66404 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
66405 +
66406 +#define CUT_TREE_MIN_ITERATIONS 64
66407 +
66408 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
66409 +
66410 +/* return node plugin of coord->node */
66411 +node_plugin *node_plugin_by_coord(const coord_t * coord)
66412 +{
66413 + assert("vs-1", coord != NULL);
66414 + assert("vs-2", coord->node != NULL);
66415 +
66416 + return coord->node->nplug;
66417 +}
66418 +
66419 +/* insert item into tree. Fields of @coord are updated so that they can be
66420 + * used by consequent insert operation. */
66421 +insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
66422 + * into */ ,
66423 + const reiser4_key * key /* key of new item */ ,
66424 + reiser4_item_data * data /* parameters for item
66425 + * creation */ ,
66426 + coord_t * coord /* resulting insertion coord */ ,
66427 + lock_handle * lh /* resulting lock
66428 + * handle */ ,
66429 + tree_level stop_level /** level where to insert */ ,
66430 + __u32 flags /* insertion flags */ )
66431 +{
66432 + int result;
66433 +
66434 + assert("nikita-358", tree != NULL);
66435 + assert("nikita-360", coord != NULL);
66436 +
66437 + result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
66438 + FIND_EXACT, stop_level, stop_level,
66439 + flags | CBK_FOR_INSERT, NULL /*ra_info */ );
66440 + switch (result) {
66441 + default:
66442 + break;
66443 + case CBK_COORD_FOUND:
66444 + result = IBK_ALREADY_EXISTS;
66445 + break;
66446 + case CBK_COORD_NOTFOUND:
66447 + assert("nikita-2017", coord->node != NULL);
66448 + result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
66449 + break;
66450 + }
66451 + return result;
66452 +}
66453 +
66454 +/* insert item by calling carry. Helper function called if short-cut
66455 + insertion failed */
66456 +static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
66457 + lock_handle * lh, /* lock handle of insertion
66458 + * node */
66459 + reiser4_item_data * data, /* parameters of new
66460 + * item */
66461 + const reiser4_key * key, /* key of new item */
66462 + carry_opcode cop, /* carry operation to perform */
66463 + cop_insert_flag flags
66464 + /* carry flags */ )
66465 +{
66466 + int result;
66467 + carry_pool *pool;
66468 + carry_level *lowest_level;
66469 + carry_insert_data *cdata;
66470 + carry_op *op;
66471 +
66472 + assert("umka-314", coord != NULL);
66473 +
66474 + /* allocate carry_pool and 3 carry_level-s */
66475 + pool =
66476 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66477 + sizeof(*cdata));
66478 + if (IS_ERR(pool))
66479 + return PTR_ERR(pool);
66480 + lowest_level = (carry_level *) (pool + 1);
66481 + init_carry_level(lowest_level, pool);
66482 +
66483 + op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
66484 + if (IS_ERR(op) || (op == NULL)) {
66485 + done_carry_pool(pool);
66486 + return RETERR(op ? PTR_ERR(op) : -EIO);
66487 + }
66488 + cdata = (carry_insert_data *) (lowest_level + 3);
66489 + cdata->coord = coord;
66490 + cdata->data = data;
66491 + cdata->key = key;
66492 + op->u.insert.d = cdata;
66493 + if (flags == 0)
66494 + flags = znode_get_tree(coord->node)->carry.insert_flags;
66495 + op->u.insert.flags = flags;
66496 + op->u.insert.type = COPT_ITEM_DATA;
66497 + op->u.insert.child = NULL;
66498 + if (lh != NULL) {
66499 + assert("nikita-3245", lh->node == coord->node);
66500 + lowest_level->track_type = CARRY_TRACK_CHANGE;
66501 + lowest_level->tracked = lh;
66502 + }
66503 +
66504 + result = reiser4_carry(lowest_level, NULL);
66505 + done_carry_pool(pool);
66506 +
66507 + return result;
66508 +}
66509 +
66510 +/* form carry queue to perform paste of @data with @key at @coord, and launch
66511 + its execution by calling carry().
66512 +
66513 + Instruct carry to update @lh it after balancing insertion coord moves into
66514 + different block.
66515 +
66516 +*/
66517 +static int paste_with_carry(coord_t * coord, /* coord of paste */
66518 + lock_handle * lh, /* lock handle of node
66519 + * where item is
66520 + * pasted */
66521 + reiser4_item_data * data, /* parameters of new
66522 + * item */
66523 + const reiser4_key * key, /* key of new item */
66524 + unsigned flags /* paste flags */ )
66525 +{
66526 + int result;
66527 + carry_pool *pool;
66528 + carry_level *lowest_level;
66529 + carry_insert_data *cdata;
66530 + carry_op *op;
66531 +
66532 + assert("umka-315", coord != NULL);
66533 + assert("umka-316", key != NULL);
66534 +
66535 + pool =
66536 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66537 + sizeof(*cdata));
66538 + if (IS_ERR(pool))
66539 + return PTR_ERR(pool);
66540 + lowest_level = (carry_level *) (pool + 1);
66541 + init_carry_level(lowest_level, pool);
66542 +
66543 + op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
66544 + if (IS_ERR(op) || (op == NULL)) {
66545 + done_carry_pool(pool);
66546 + return RETERR(op ? PTR_ERR(op) : -EIO);
66547 + }
66548 + cdata = (carry_insert_data *) (lowest_level + 3);
66549 + cdata->coord = coord;
66550 + cdata->data = data;
66551 + cdata->key = key;
66552 + op->u.paste.d = cdata;
66553 + if (flags == 0)
66554 + flags = znode_get_tree(coord->node)->carry.paste_flags;
66555 + op->u.paste.flags = flags;
66556 + op->u.paste.type = COPT_ITEM_DATA;
66557 + if (lh != NULL) {
66558 + lowest_level->track_type = CARRY_TRACK_CHANGE;
66559 + lowest_level->tracked = lh;
66560 + }
66561 +
66562 + result = reiser4_carry(lowest_level, NULL);
66563 + done_carry_pool(pool);
66564 +
66565 + return result;
66566 +}
66567 +
66568 +/* insert item at the given coord.
66569 +
66570 + First try to skip carry by directly calling ->create_item() method of node
66571 + plugin. If this is impossible (there is not enough free space in the node,
66572 + or leftmost item in the node is created), call insert_with_carry_by_coord()
66573 + that will do full carry().
66574 +
66575 +*/
66576 +insert_result insert_by_coord(coord_t * coord /* coord where to
66577 + * insert. coord->node has
66578 + * to be write locked by
66579 + * caller */ ,
66580 + reiser4_item_data * data /* data to be
66581 + * inserted */ ,
66582 + const reiser4_key * key /* key of new item */ ,
66583 + lock_handle * lh /* lock handle of write
66584 + * lock on node */ ,
66585 + __u32 flags /* insertion flags */ )
66586 +{
66587 + unsigned item_size;
66588 + int result;
66589 + znode *node;
66590 +
66591 + assert("vs-247", coord != NULL);
66592 + assert("vs-248", data != NULL);
66593 + assert("vs-249", data->length >= 0);
66594 + assert("nikita-1191", znode_is_write_locked(coord->node));
66595 +
66596 + node = coord->node;
66597 + coord_clear_iplug(coord);
66598 + result = zload(node);
66599 + if (result != 0)
66600 + return result;
66601 +
66602 + item_size = space_needed(node, NULL, data, 1);
66603 + if (item_size > znode_free_space(node) &&
66604 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66605 + && (flags & COPI_DONT_ALLOCATE)) {
66606 + /* we are forced to use free space of coord->node and new item
66607 + does not fit into it.
66608 +
66609 + Currently we get here only when we allocate and copy units
66610 + of extent item from a node to its left neighbor during
66611 + "squalloc"-ing. If @node (this is left neighbor) does not
66612 + have enough free space - we do not want to attempt any
66613 + shifting and allocations because we are in squeezing and
66614 + everything to the left of @node is tightly packed.
66615 + */
66616 + result = -E_NODE_FULL;
66617 + } else if ((item_size <= znode_free_space(node)) &&
66618 + !coord_is_before_leftmost(coord) &&
66619 + (node_plugin_by_node(node)->fast_insert != NULL)
66620 + && node_plugin_by_node(node)->fast_insert(coord)) {
66621 + /* shortcut insertion without carry() overhead.
66622 +
66623 + Only possible if:
66624 +
66625 + - there is enough free space
66626 +
66627 + - insertion is not into the leftmost position in a node
66628 + (otherwise it would require updating of delimiting key in a
66629 + parent)
66630 +
66631 + - node plugin agrees with this
66632 +
66633 + */
66634 + result =
66635 + node_plugin_by_node(node)->create_item(coord, key, data,
66636 + NULL);
66637 + znode_make_dirty(node);
66638 + } else {
66639 + /* otherwise do full-fledged carry(). */
66640 + result =
66641 + insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
66642 + flags);
66643 + }
66644 + zrelse(node);
66645 + return result;
66646 +}
66647 +
66648 +/* @coord is set to leaf level and @data is to be inserted to twig level */
66649 +insert_result
66650 +insert_extent_by_coord(coord_t *
66651 + coord
66652 + /* coord where to insert. coord->node * has to be write * locked by caller */
66653 + ,
66654 + reiser4_item_data * data /* data to be inserted */ ,
66655 + const reiser4_key * key /* key of new item */ ,
66656 + lock_handle *
66657 + lh /* lock handle of write lock on * node */ )
66658 +{
66659 + assert("vs-405", coord != NULL);
66660 + assert("vs-406", data != NULL);
66661 + assert("vs-407", data->length > 0);
66662 + assert("vs-408", znode_is_write_locked(coord->node));
66663 + assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
66664 +
66665 + return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
66666 + 0 /*flags */ );
66667 +}
66668 +
66669 +/* Insert into the item at the given coord.
66670 +
66671 + First try to skip carry by directly calling ->paste() method of item
66672 + plugin. If this is impossible (there is not enough free space in the node,
66673 + or we are pasting into leftmost position in the node), call
66674 + paste_with_carry() that will do full carry().
66675 +
66676 +*/
66677 +/* paste_into_item */
66678 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
66679 + lock_handle * lh /* lock handle on node involved */ ,
66680 + const reiser4_key * key /* key of unit being pasted */ ,
66681 + reiser4_item_data * data /* parameters for new unit */ ,
66682 + unsigned flags /* insert/paste flags */ )
66683 +{
66684 + int result;
66685 + int size_change;
66686 + node_plugin *nplug;
66687 + item_plugin *iplug;
66688 +
66689 + assert("umka-317", coord != NULL);
66690 + assert("umka-318", key != NULL);
66691 +
66692 + iplug = item_plugin_by_coord(coord);
66693 + nplug = node_plugin_by_coord(coord);
66694 +
66695 + assert("nikita-1480", iplug == data->iplug);
66696 +
66697 + size_change = space_needed(coord->node, coord, data, 0);
66698 + if (size_change > (int)znode_free_space(coord->node) &&
66699 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66700 + && (flags & COPI_DONT_ALLOCATE)) {
66701 + /* we are forced to use free space of coord->node and new data
66702 + does not fit into it. */
66703 + return -E_NODE_FULL;
66704 + }
66705 +
66706 + /* shortcut paste without carry() overhead.
66707 +
66708 + Only possible if:
66709 +
66710 + - there is enough free space
66711 +
66712 + - paste is not into the leftmost unit in a node (otherwise
66713 + it would require updating of delimiting key in a parent)
66714 +
66715 + - node plugin agrees with this
66716 +
66717 + - item plugin agrees with us
66718 + */
66719 + if (size_change <= (int)znode_free_space(coord->node) &&
66720 + (coord->item_pos != 0 ||
66721 + coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
66722 + coord->unit_pos != 0 && nplug->fast_paste != NULL &&
66723 + nplug->fast_paste(coord) &&
66724 + iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
66725 + if (size_change > 0)
66726 + nplug->change_item_size(coord, size_change);
66727 + /* NOTE-NIKITA: huh? where @key is used? */
66728 + result = iplug->b.paste(coord, data, NULL);
66729 + if (size_change < 0)
66730 + nplug->change_item_size(coord, size_change);
66731 + znode_make_dirty(coord->node);
66732 + } else
66733 + /* otherwise do full-fledged carry(). */
66734 + result = paste_with_carry(coord, lh, data, key, flags);
66735 + return result;
66736 +}
66737 +
66738 +/* this either appends or truncates item @coord */
66739 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
66740 + reiser4_item_data * data /* parameters of resize */ ,
66741 + reiser4_key * key /* key of new unit */ ,
66742 + lock_handle * lh /* lock handle of node
66743 + * being modified */ ,
66744 + cop_insert_flag flags /* carry flags */ )
66745 +{
66746 + int result;
66747 + znode *node;
66748 +
66749 + assert("nikita-362", coord != NULL);
66750 + assert("nikita-363", data != NULL);
66751 + assert("vs-245", data->length != 0);
66752 +
66753 + node = coord->node;
66754 + coord_clear_iplug(coord);
66755 + result = zload(node);
66756 + if (result != 0)
66757 + return result;
66758 +
66759 + if (data->length < 0)
66760 + result = node_plugin_by_coord(coord)->shrink_item(coord,
66761 + -data->length);
66762 + else
66763 + result = insert_into_item(coord, lh, key, data, flags);
66764 +
66765 + zrelse(node);
66766 + return result;
66767 +}
66768 +
66769 +/* insert flow @f */
66770 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
66771 +{
66772 + int result;
66773 + carry_pool *pool;
66774 + carry_level *lowest_level;
66775 + reiser4_item_data *data;
66776 + carry_op *op;
66777 +
66778 + pool =
66779 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66780 + sizeof(*data));
66781 + if (IS_ERR(pool))
66782 + return PTR_ERR(pool);
66783 + lowest_level = (carry_level *) (pool + 1);
66784 + init_carry_level(lowest_level, pool);
66785 +
66786 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
66787 + 0 /* operate directly on coord -> node */ );
66788 + if (IS_ERR(op) || (op == NULL)) {
66789 + done_carry_pool(pool);
66790 + return RETERR(op ? PTR_ERR(op) : -EIO);
66791 + }
66792 +
66793 + /* these are permanent during insert_flow */
66794 + data = (reiser4_item_data *) (lowest_level + 3);
66795 + data->user = 1;
66796 + data->iplug = item_plugin_by_id(FORMATTING_ID);
66797 + data->arg = NULL;
66798 + /* data.length and data.data will be set before calling paste or
66799 + insert */
66800 + data->length = 0;
66801 + data->data = NULL;
66802 +
66803 + op->u.insert_flow.flags = 0;
66804 + op->u.insert_flow.insert_point = coord;
66805 + op->u.insert_flow.flow = f;
66806 + op->u.insert_flow.data = data;
66807 + op->u.insert_flow.new_nodes = 0;
66808 +
66809 + lowest_level->track_type = CARRY_TRACK_CHANGE;
66810 + lowest_level->tracked = lh;
66811 +
66812 + result = reiser4_carry(lowest_level, NULL);
66813 + done_carry_pool(pool);
66814 +
66815 + return result;
66816 +}
66817 +
66818 +/* Given a coord in parent node, obtain a znode for the corresponding child */
66819 +znode *child_znode(const coord_t * parent_coord /* coord of pointer to
66820 + * child */ ,
66821 + znode * parent /* parent of child */ ,
66822 + int incore_p /* if !0 only return child if already in
66823 + * memory */ ,
66824 + int setup_dkeys_p /* if !0 update delimiting keys of
66825 + * child */ )
66826 +{
66827 + znode *child;
66828 +
66829 + assert("nikita-1374", parent_coord != NULL);
66830 + assert("nikita-1482", parent != NULL);
66831 +#if REISER4_DEBUG
66832 + if (setup_dkeys_p)
66833 + assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
66834 +#endif
66835 + assert("nikita-2947", znode_is_any_locked(parent));
66836 +
66837 + if (znode_get_level(parent) <= LEAF_LEVEL) {
66838 + /* trying to get child of leaf node */
66839 + warning("nikita-1217", "Child of maize?");
66840 + return ERR_PTR(RETERR(-EIO));
66841 + }
66842 + if (item_is_internal(parent_coord)) {
66843 + reiser4_block_nr addr;
66844 + item_plugin *iplug;
66845 + reiser4_tree *tree;
66846 +
66847 + iplug = item_plugin_by_coord(parent_coord);
66848 + assert("vs-512", iplug->s.internal.down_link);
66849 + iplug->s.internal.down_link(parent_coord, NULL, &addr);
66850 +
66851 + tree = znode_get_tree(parent);
66852 + if (incore_p)
66853 + child = zlook(tree, &addr);
66854 + else
66855 + child =
66856 + zget(tree, &addr, parent,
66857 + znode_get_level(parent) - 1,
66858 + reiser4_ctx_gfp_mask_get());
66859 + if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
66860 + set_child_delimiting_keys(parent, parent_coord, child);
66861 + } else {
66862 + warning("nikita-1483", "Internal item expected");
66863 + child = ERR_PTR(RETERR(-EIO));
66864 + }
66865 + return child;
66866 +}
66867 +
66868 +/* remove znode from transaction */
66869 +static void uncapture_znode(znode * node)
66870 +{
66871 + struct page *page;
66872 +
66873 + assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66874 +
66875 + if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
66876 + int ret;
66877 +
66878 + /* An already allocated block goes right to the atom's delete set. */
66879 + ret =
66880 + reiser4_dealloc_block(znode_get_block(node), 0,
66881 + BA_DEFER | BA_FORMATTED);
66882 + if (ret)
66883 + warning("zam-942",
66884 + "can\'t add a block (%llu) number to atom's delete set\n",
66885 + (unsigned long long)(*znode_get_block(node)));
66886 +
66887 + spin_lock_znode(node);
66888 + /* Here we return flush reserved block which was reserved at the
66889 + * moment when this allocated node was marked dirty and still
66890 + * not used by flush in node relocation procedure. */
66891 + if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
66892 + txn_atom *atom;
66893 +
66894 + atom = jnode_get_atom(ZJNODE(node));
66895 + assert("zam-939", atom != NULL);
66896 + spin_unlock_znode(node);
66897 + flush_reserved2grabbed(atom, (__u64) 1);
66898 + spin_unlock_atom(atom);
66899 + } else
66900 + spin_unlock_znode(node);
66901 + } else {
66902 + /* znode has assigned block which is counted as "fake
66903 + allocated". Return it back to "free blocks") */
66904 + fake_allocated2free((__u64) 1, BA_FORMATTED);
66905 + }
66906 +
66907 + /*
66908 + * uncapture page from transaction. There is a possibility of a race
66909 + * with ->releasepage(): reiser4_releasepage() detaches page from this
66910 + * jnode and we have nothing to uncapture. To avoid this, get
66911 + * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
66912 + * will deal with released page itself.
66913 + */
66914 + spin_lock_znode(node);
66915 + page = znode_page(node);
66916 + if (likely(page != NULL)) {
66917 + /*
66918 + * reiser4_uncapture_page() can only be called when we are sure
66919 + * that znode is pinned in memory, which we are, because
66920 + * forget_znode() is only called from longterm_unlock_znode().
66921 + */
66922 + page_cache_get(page);
66923 + spin_unlock_znode(node);
66924 + lock_page(page);
66925 + reiser4_uncapture_page(page);
66926 + unlock_page(page);
66927 + page_cache_release(page);
66928 + } else {
66929 + txn_atom *atom;
66930 +
66931 + /* handle "flush queued" znodes */
66932 + while (1) {
66933 + atom = jnode_get_atom(ZJNODE(node));
66934 + assert("zam-943", atom != NULL);
66935 +
66936 + if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
66937 + || !atom->nr_running_queues)
66938 + break;
66939 +
66940 + spin_unlock_znode(node);
66941 + reiser4_atom_wait_event(atom);
66942 + spin_lock_znode(node);
66943 + }
66944 +
66945 + reiser4_uncapture_block(ZJNODE(node));
66946 + spin_unlock_atom(atom);
66947 + zput(node);
66948 + }
66949 +}
66950 +
66951 +/* This is called from longterm_unlock_znode() when last lock is released from
66952 + the node that has been removed from the tree. At this point node is removed
66953 + from sibling list and its lock is invalidated. */
66954 +void forget_znode(lock_handle * handle)
66955 +{
66956 + znode *node;
66957 + reiser4_tree *tree;
66958 +
66959 + assert("umka-319", handle != NULL);
66960 +
66961 + node = handle->node;
66962 + tree = znode_get_tree(node);
66963 +
66964 + assert("vs-164", znode_is_write_locked(node));
66965 + assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66966 + assert_rw_locked(&(node->lock.guard));
66967 +
66968 + /* We assume that this node was detached from its parent before
66969 + * unlocking, it gives no way to reach this node from parent through a
66970 + * down link. The node should have no children and, thereby, can't be
66971 + * reached from them by their parent pointers. The only way to obtain a
66972 + * reference to the node is to use sibling pointers from its left and
66973 + * right neighbors. In the next several lines we remove the node from
66974 + * the sibling list. */
66975 +
66976 + write_lock_tree(tree);
66977 + sibling_list_remove(node);
66978 + znode_remove(node, tree);
66979 + write_unlock_tree(tree);
66980 +
66981 + /* Here we set JNODE_DYING and cancel all pending lock requests. It
66982 + * forces all lock requestor threads to repeat iterations of getting
66983 + * lock on a child, neighbor or parent node. But, those threads can't
66984 + * come to this node again, because this node is no longer a child,
66985 + * neighbor or parent of any other node. This order of znode
66986 + * invalidation does not allow other threads to waste cpu time is a busy
66987 + * loop, trying to lock dying object. The exception is in the flush
66988 + * code when we take node directly from atom's capture list.*/
66989 + reiser4_invalidate_lock(handle);
66990 + uncapture_znode(node);
66991 +}
66992 +
66993 +/* Check that internal item at @pointer really contains pointer to @child. */
66994 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
66995 + * @child */ ,
66996 + const znode * child /* child znode */ )
66997 +{
66998 + assert("nikita-1016", pointer != NULL);
66999 + assert("nikita-1017", child != NULL);
67000 + assert("nikita-1018", pointer->node != NULL);
67001 +
67002 + assert("nikita-1325", znode_is_any_locked(pointer->node));
67003 +
67004 + assert("nikita-2985",
67005 + znode_get_level(pointer->node) == znode_get_level(child) + 1);
67006 +
67007 + coord_clear_iplug((coord_t *) pointer);
67008 +
67009 + if (coord_is_existing_unit(pointer)) {
67010 + item_plugin *iplug;
67011 + reiser4_block_nr addr;
67012 +
67013 + if (item_is_internal(pointer)) {
67014 + iplug = item_plugin_by_coord(pointer);
67015 + assert("vs-513", iplug->s.internal.down_link);
67016 + iplug->s.internal.down_link(pointer, NULL, &addr);
67017 + /* check that cached value is correct */
67018 + if (disk_addr_eq(&addr, znode_get_block(child))) {
67019 + return NS_FOUND;
67020 + }
67021 + }
67022 + }
67023 + /* warning ("jmacd-1002", "tree pointer incorrect"); */
67024 + return NS_NOT_FOUND;
67025 +}
67026 +
67027 +/* find coord of pointer to new @child in @parent.
67028 +
67029 + Find the &coord_t in the @parent where pointer to a given @child will
67030 + be in.
67031 +
67032 +*/
67033 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
67034 + znode *
67035 + child UNUSED_ARG /* child znode, passed locked */ ,
67036 + znode * left /* left brother of new node */ ,
67037 + coord_t * result /* where result is stored in */ )
67038 +{
67039 + int ret;
67040 +
67041 + assert("nikita-1486", parent != NULL);
67042 + assert("nikita-1487", child != NULL);
67043 + assert("nikita-1488", result != NULL);
67044 +
67045 + ret = find_child_ptr(parent, left, result);
67046 + if (ret != NS_FOUND) {
67047 + warning("nikita-1489", "Cannot find brother position: %i", ret);
67048 + return RETERR(-EIO);
67049 + } else {
67050 + result->between = AFTER_UNIT;
67051 + return RETERR(NS_NOT_FOUND);
67052 + }
67053 +}
67054 +
67055 +/* find coord of pointer to @child in @parent.
67056 +
67057 + Find the &coord_t in the @parent where pointer to a given @child is in.
67058 +
67059 +*/
67060 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
67061 + znode * child /* child znode, passed locked */ ,
67062 + coord_t * result /* where result is stored in */ )
67063 +{
67064 + int lookup_res;
67065 + node_plugin *nplug;
67066 + /* left delimiting key of a child */
67067 + reiser4_key ld;
67068 + reiser4_tree *tree;
67069 +
67070 + assert("nikita-934", parent != NULL);
67071 + assert("nikita-935", child != NULL);
67072 + assert("nikita-936", result != NULL);
67073 + assert("zam-356", znode_is_loaded(parent));
67074 +
67075 + coord_init_zero(result);
67076 + result->node = parent;
67077 +
67078 + nplug = parent->nplug;
67079 + assert("nikita-939", nplug != NULL);
67080 +
67081 + tree = znode_get_tree(parent);
67082 + /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
67083 + * not aliased to ->in_parent of some znode. Otherwise,
67084 + * parent_coord_to_coord() below would modify data protected by tree
67085 + * lock. */
67086 + read_lock_tree(tree);
67087 + /* fast path. Try to use cached value. Lock tree to keep
67088 + node->pos_in_parent and pos->*_blocknr consistent. */
67089 + if (child->in_parent.item_pos + 1 != 0) {
67090 + parent_coord_to_coord(&child->in_parent, result);
67091 + if (check_tree_pointer(result, child) == NS_FOUND) {
67092 + read_unlock_tree(tree);
67093 + return NS_FOUND;
67094 + }
67095 +
67096 + child->in_parent.item_pos = (unsigned short)~0;
67097 + }
67098 + read_unlock_tree(tree);
67099 +
67100 + /* is above failed, find some key from @child. We are looking for the
67101 + least key in a child. */
67102 + read_lock_dk(tree);
67103 + ld = *znode_get_ld_key(child);
67104 + read_unlock_dk(tree);
67105 + /*
67106 + * now, lookup parent with key just found. Note, that left delimiting
67107 + * key doesn't identify node uniquely, because (in extremely rare
67108 + * case) two nodes can have equal left delimiting keys, if one of them
67109 + * is completely filled with directory entries that all happened to be
67110 + * hash collision. But, we check block number in check_tree_pointer()
67111 + * and, so, are safe.
67112 + */
67113 + lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
67114 + /* update cached pos_in_node */
67115 + if (lookup_res == NS_FOUND) {
67116 + write_lock_tree(tree);
67117 + coord_to_parent_coord(result, &child->in_parent);
67118 + write_unlock_tree(tree);
67119 + lookup_res = check_tree_pointer(result, child);
67120 + }
67121 + if (lookup_res == NS_NOT_FOUND)
67122 + lookup_res = find_child_by_addr(parent, child, result);
67123 + return lookup_res;
67124 +}
67125 +
67126 +/* find coord of pointer to @child in @parent by scanning
67127 +
67128 + Find the &coord_t in the @parent where pointer to a given @child
67129 + is in by scanning all internal items in @parent and comparing block
67130 + numbers in them with that of @child.
67131 +
67132 +*/
67133 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
67134 + znode * child /* child znode, passed locked */ ,
67135 + coord_t * result /* where result is stored in */ )
67136 +{
67137 + int ret;
67138 +
67139 + assert("nikita-1320", parent != NULL);
67140 + assert("nikita-1321", child != NULL);
67141 + assert("nikita-1322", result != NULL);
67142 +
67143 + ret = NS_NOT_FOUND;
67144 +
67145 + for_all_units(result, parent) {
67146 + if (check_tree_pointer(result, child) == NS_FOUND) {
67147 + write_lock_tree(znode_get_tree(parent));
67148 + coord_to_parent_coord(result, &child->in_parent);
67149 + write_unlock_tree(znode_get_tree(parent));
67150 + ret = NS_FOUND;
67151 + break;
67152 + }
67153 + }
67154 + return ret;
67155 +}
67156 +
67157 +/* true, if @addr is "unallocated block number", which is just address, with
67158 + highest bit set. */
67159 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
67160 + * check */ )
67161 +{
67162 + assert("nikita-1766", addr != NULL);
67163 + cassert(sizeof(reiser4_block_nr) == 8);
67164 + return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
67165 + REISER4_UNALLOCATED_STATUS_VALUE;
67166 +}
67167 +
67168 +/* returns true if removing bytes of given range of key [from_key, to_key]
67169 + causes removing of whole item @from */
67170 +static int
67171 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
67172 + const reiser4_key * to_key)
67173 +{
67174 + item_plugin *iplug;
67175 + reiser4_key key_in_item;
67176 +
67177 + assert("umka-325", from != NULL);
67178 + assert("", item_is_extent(from));
67179 +
67180 + /* check first key just for case */
67181 + item_key_by_coord(from, &key_in_item);
67182 + if (keygt(from_key, &key_in_item))
67183 + return 0;
67184 +
67185 + /* check last key */
67186 + iplug = item_plugin_by_coord(from);
67187 + assert("vs-611", iplug && iplug->s.file.append_key);
67188 +
67189 + iplug->s.file.append_key(from, &key_in_item);
67190 + set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
67191 +
67192 + if (keylt(to_key, &key_in_item))
67193 + /* last byte is not removed */
67194 + return 0;
67195 + return 1;
67196 +}
67197 +
67198 +/* helper function for prepare_twig_kill(): @left and @right are formatted
67199 + * neighbors of extent item being completely removed. Load and lock neighbors
67200 + * and store lock handles into @cdata for later use by kill_hook_extent() */
67201 +static int
67202 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
67203 +{
67204 + int result;
67205 + int left_loaded;
67206 + int right_loaded;
67207 +
67208 + result = 0;
67209 + left_loaded = right_loaded = 0;
67210 +
67211 + if (left != NULL) {
67212 + result = zload(left);
67213 + if (result == 0) {
67214 + left_loaded = 1;
67215 + result = longterm_lock_znode(kdata->left, left,
67216 + ZNODE_READ_LOCK,
67217 + ZNODE_LOCK_LOPRI);
67218 + }
67219 + }
67220 + if (result == 0 && right != NULL) {
67221 + result = zload(right);
67222 + if (result == 0) {
67223 + right_loaded = 1;
67224 + result = longterm_lock_znode(kdata->right, right,
67225 + ZNODE_READ_LOCK,
67226 + ZNODE_LOCK_HIPRI |
67227 + ZNODE_LOCK_NONBLOCK);
67228 + }
67229 + }
67230 + if (result != 0) {
67231 + done_lh(kdata->left);
67232 + done_lh(kdata->right);
67233 + if (left_loaded != 0)
67234 + zrelse(left);
67235 + if (right_loaded != 0)
67236 + zrelse(right);
67237 + }
67238 + return result;
67239 +}
67240 +
67241 +static void done_children(carry_kill_data * kdata)
67242 +{
67243 + if (kdata->left != NULL && kdata->left->node != NULL) {
67244 + zrelse(kdata->left->node);
67245 + done_lh(kdata->left);
67246 + }
67247 + if (kdata->right != NULL && kdata->right->node != NULL) {
67248 + zrelse(kdata->right->node);
67249 + done_lh(kdata->right);
67250 + }
67251 +}
67252 +
67253 +/* part of cut_node. It is called when cut_node is called to remove or cut part
67254 + of extent item. When head of that item is removed - we have to update right
67255 + delimiting of left neighbor of extent. When item is removed completely - we
67256 + have to set sibling link between left and right neighbor of removed
67257 + extent. This may return -E_DEADLOCK because of trying to get left neighbor
67258 + locked. So, caller should repeat an attempt
67259 +*/
67260 +/* Audited by: umka (2002.06.16) */
67261 +static int
67262 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
67263 +{
67264 + int result;
67265 + reiser4_key key;
67266 + lock_handle left_lh;
67267 + lock_handle right_lh;
67268 + coord_t left_coord;
67269 + coord_t *from;
67270 + znode *left_child;
67271 + znode *right_child;
67272 + reiser4_tree *tree;
67273 + int left_zloaded_here, right_zloaded_here;
67274 +
67275 + from = kdata->params.from;
67276 + assert("umka-326", from != NULL);
67277 + assert("umka-327", kdata->params.to != NULL);
67278 +
67279 + /* for one extent item only yet */
67280 + assert("vs-591", item_is_extent(from));
67281 + assert("vs-592", from->item_pos == kdata->params.to->item_pos);
67282 +
67283 + if ((kdata->params.from_key
67284 + && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
67285 + || from->unit_pos != 0) {
67286 + /* head of item @from is not removed, there is nothing to
67287 + worry about */
67288 + return 0;
67289 + }
67290 +
67291 + result = 0;
67292 + left_zloaded_here = 0;
67293 + right_zloaded_here = 0;
67294 +
67295 + left_child = right_child = NULL;
67296 +
67297 + coord_dup(&left_coord, from);
67298 + init_lh(&left_lh);
67299 + init_lh(&right_lh);
67300 + if (coord_prev_unit(&left_coord)) {
67301 + /* @from is leftmost item in its node */
67302 + if (!locked_left_neighbor) {
67303 + result =
67304 + reiser4_get_left_neighbor(&left_lh, from->node,
67305 + ZNODE_READ_LOCK,
67306 + GN_CAN_USE_UPPER_LEVELS);
67307 + switch (result) {
67308 + case 0:
67309 + break;
67310 + case -E_NO_NEIGHBOR:
67311 + /* there is no formatted node to the left of
67312 + from->node */
67313 + warning("vs-605",
67314 + "extent item has smallest key in "
67315 + "the tree and it is about to be removed");
67316 + return 0;
67317 + case -E_DEADLOCK:
67318 + /* need to restart */
67319 + default:
67320 + return result;
67321 + }
67322 +
67323 + /* we have acquired left neighbor of from->node */
67324 + result = zload(left_lh.node);
67325 + if (result)
67326 + goto done;
67327 +
67328 + locked_left_neighbor = left_lh.node;
67329 + } else {
67330 + /* squalloc_right_twig_cut should have supplied locked
67331 + * left neighbor */
67332 + assert("vs-834",
67333 + znode_is_write_locked(locked_left_neighbor));
67334 + result = zload(locked_left_neighbor);
67335 + if (result)
67336 + return result;
67337 + }
67338 +
67339 + left_zloaded_here = 1;
67340 + coord_init_last_unit(&left_coord, locked_left_neighbor);
67341 + }
67342 +
67343 + if (!item_is_internal(&left_coord)) {
67344 + /* what else but extent can be on twig level */
67345 + assert("vs-606", item_is_extent(&left_coord));
67346 +
67347 + /* there is no left formatted child */
67348 + if (left_zloaded_here)
67349 + zrelse(locked_left_neighbor);
67350 + done_lh(&left_lh);
67351 + return 0;
67352 + }
67353 +
67354 + tree = znode_get_tree(left_coord.node);
67355 + left_child = child_znode(&left_coord, left_coord.node, 1, 0);
67356 +
67357 + if (IS_ERR(left_child)) {
67358 + result = PTR_ERR(left_child);
67359 + goto done;
67360 + }
67361 +
67362 + /* left child is acquired, calculate new right delimiting key for it
67363 + and get right child if it is necessary */
67364 + if (item_removed_completely
67365 + (from, kdata->params.from_key, kdata->params.to_key)) {
67366 + /* try to get right child of removed item */
67367 + coord_t right_coord;
67368 +
67369 + assert("vs-607",
67370 + kdata->params.to->unit_pos ==
67371 + coord_last_unit_pos(kdata->params.to));
67372 + coord_dup(&right_coord, kdata->params.to);
67373 + if (coord_next_unit(&right_coord)) {
67374 + /* @to is rightmost unit in the node */
67375 + result =
67376 + reiser4_get_right_neighbor(&right_lh, from->node,
67377 + ZNODE_READ_LOCK,
67378 + GN_CAN_USE_UPPER_LEVELS);
67379 + switch (result) {
67380 + case 0:
67381 + result = zload(right_lh.node);
67382 + if (result)
67383 + goto done;
67384 +
67385 + right_zloaded_here = 1;
67386 + coord_init_first_unit(&right_coord,
67387 + right_lh.node);
67388 + item_key_by_coord(&right_coord, &key);
67389 + break;
67390 +
67391 + case -E_NO_NEIGHBOR:
67392 + /* there is no formatted node to the right of
67393 + from->node */
67394 + read_lock_dk(tree);
67395 + key = *znode_get_rd_key(from->node);
67396 + read_unlock_dk(tree);
67397 + right_coord.node = NULL;
67398 + result = 0;
67399 + break;
67400 + default:
67401 + /* real error */
67402 + goto done;
67403 + }
67404 + } else {
67405 + /* there is an item to the right of @from - take its key */
67406 + item_key_by_coord(&right_coord, &key);
67407 + }
67408 +
67409 + /* try to get right child of @from */
67410 + if (right_coord.node && /* there is right neighbor of @from */
67411 + item_is_internal(&right_coord)) { /* it is internal item */
67412 + right_child = child_znode(&right_coord,
67413 + right_coord.node, 1, 0);
67414 +
67415 + if (IS_ERR(right_child)) {
67416 + result = PTR_ERR(right_child);
67417 + goto done;
67418 + }
67419 +
67420 + }
67421 + /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
67422 + update of right delimiting key of left_child */
67423 + result = prepare_children(left_child, right_child, kdata);
67424 + } else {
67425 + /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */
67426 + result = prepare_children(left_child, NULL, kdata);
67427 + }
67428 +
67429 + done:
67430 + if (right_child)
67431 + zput(right_child);
67432 + if (right_zloaded_here)
67433 + zrelse(right_lh.node);
67434 + done_lh(&right_lh);
67435 +
67436 + if (left_child)
67437 + zput(left_child);
67438 + if (left_zloaded_here)
67439 + zrelse(locked_left_neighbor);
67440 + done_lh(&left_lh);
67441 + return result;
67442 +}
67443 +
67444 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
67445 + are to be cut completely */
67446 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
67447 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
67448 + const reiser4_key * to_key, /* last key to be removed */
67449 + reiser4_key *
67450 + smallest_removed /* smallest key actually removed */ )
67451 +{
67452 + int result;
67453 + carry_pool *pool;
67454 + carry_level *lowest_level;
67455 + carry_cut_data *cut_data;
67456 + carry_op *op;
67457 +
67458 + assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
67459 +
67460 + pool =
67461 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67462 + sizeof(*cut_data));
67463 + if (IS_ERR(pool))
67464 + return PTR_ERR(pool);
67465 + lowest_level = (carry_level *) (pool + 1);
67466 + init_carry_level(lowest_level, pool);
67467 +
67468 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67469 + assert("vs-1509", op != 0);
67470 + if (IS_ERR(op)) {
67471 + done_carry_pool(pool);
67472 + return PTR_ERR(op);
67473 + }
67474 +
67475 + cut_data = (carry_cut_data *) (lowest_level + 3);
67476 + cut_data->params.from = from;
67477 + cut_data->params.to = to;
67478 + cut_data->params.from_key = from_key;
67479 + cut_data->params.to_key = to_key;
67480 + cut_data->params.smallest_removed = smallest_removed;
67481 +
67482 + op->u.cut_or_kill.is_cut = 1;
67483 + op->u.cut_or_kill.u.cut = cut_data;
67484 +
67485 + result = reiser4_carry(lowest_level, NULL);
67486 + done_carry_pool(pool);
67487 +
67488 + return result;
67489 +}
67490 +
67491 +/* cut part of the node
67492 +
67493 + Cut part or whole content of node.
67494 +
67495 + cut data between @from and @to of @from->node and call carry() to make
67496 + corresponding changes in the tree. @from->node may become empty. If so -
67497 + pointer to it will be removed. Neighboring nodes are not changed. Smallest
67498 + removed key is stored in @smallest_removed
67499 +
67500 +*/
67501 +int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
67502 + coord_t * to, /* coord of the last unit/item that will be eliminated */
67503 + const reiser4_key * from_key, /* first key to be removed */
67504 + const reiser4_key * to_key, /* last key to be removed */
67505 + reiser4_key * smallest_removed, /* smallest key actually removed */
67506 + znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
67507 + * locked (in squalloc_right_twig_cut, namely) */
67508 + struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
67509 + invalidate pages together with item pointing to them */
67510 + int truncate)
67511 +{ /* this call is made for file truncate) */
67512 + int result;
67513 + carry_pool *pool;
67514 + carry_level *lowest_level;
67515 + carry_kill_data *kdata;
67516 + lock_handle *left_child;
67517 + lock_handle *right_child;
67518 + carry_op *op;
67519 +
67520 + assert("umka-328", from != NULL);
67521 + assert("vs-316", !node_is_empty(from->node));
67522 + assert("nikita-1812", coord_is_existing_unit(from)
67523 + && coord_is_existing_unit(to));
67524 +
67525 + /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
67526 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67527 + sizeof(carry_kill_data) +
67528 + 2 * sizeof(lock_handle) +
67529 + 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
67530 + if (IS_ERR(pool))
67531 + return PTR_ERR(pool);
67532 +
67533 + lowest_level = (carry_level *) (pool + 1);
67534 + init_carry_level(lowest_level, pool);
67535 +
67536 + kdata = (carry_kill_data *) (lowest_level + 3);
67537 + left_child = (lock_handle *) (kdata + 1);
67538 + right_child = left_child + 1;
67539 +
67540 + init_lh(left_child);
67541 + init_lh(right_child);
67542 +
67543 + kdata->params.from = from;
67544 + kdata->params.to = to;
67545 + kdata->params.from_key = from_key;
67546 + kdata->params.to_key = to_key;
67547 + kdata->params.smallest_removed = smallest_removed;
67548 + kdata->params.truncate = truncate;
67549 + kdata->flags = 0;
67550 + kdata->inode = inode;
67551 + kdata->left = left_child;
67552 + kdata->right = right_child;
67553 + /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
67554 + kdata->buf = (char *)(right_child + 1);
67555 +
67556 + if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
67557 + /* left child of extent item may have to get updated right
67558 + delimiting key and to get linked with right child of extent
67559 + @from if it will be removed completely */
67560 + result = prepare_twig_kill(kdata, locked_left_neighbor);
67561 + if (result) {
67562 + done_children(kdata);
67563 + done_carry_pool(pool);
67564 + return result;
67565 + }
67566 + }
67567 +
67568 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67569 + if (IS_ERR(op) || (op == NULL)) {
67570 + done_children(kdata);
67571 + done_carry_pool(pool);
67572 + return RETERR(op ? PTR_ERR(op) : -EIO);
67573 + }
67574 +
67575 + op->u.cut_or_kill.is_cut = 0;
67576 + op->u.cut_or_kill.u.kill = kdata;
67577 +
67578 + result = reiser4_carry(lowest_level, NULL);
67579 +
67580 + done_children(kdata);
67581 + done_carry_pool(pool);
67582 + return result;
67583 +}
67584 +
67585 +void
67586 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
67587 +{
67588 + if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
67589 + pgoff_t start_pg, end_pg;
67590 +
67591 + start_pg = start >> PAGE_CACHE_SHIFT;
67592 + end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
67593 +
67594 + if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
67595 + /*
67596 + * kill up to the page boundary.
67597 + */
67598 + assert("vs-123456", start_pg == end_pg);
67599 + reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
67600 + truncate);
67601 + } else if (start_pg != end_pg) {
67602 + /*
67603 + * page boundary is within killed portion of node.
67604 + */
67605 + assert("vs-654321", end_pg - start_pg == 1);
67606 + reiser4_invalidate_pages(inode->i_mapping, end_pg,
67607 + end_pg - start_pg, 1);
67608 + }
67609 + }
67610 + inode_sub_bytes(inode, end - start);
67611 +}
67612 +
67613 +/**
67614 + * Delete whole @node from the reiser4 tree without loading it.
67615 + *
67616 + * @left: locked left neighbor,
67617 + * @node: node to be deleted,
67618 + * @smallest_removed: leftmost key of deleted node,
67619 + * @object: inode pointer, if we truncate a file body.
67620 + * @truncate: true if called for file truncate.
67621 + *
67622 + * @return: 0 if success, error code otherwise.
67623 + *
67624 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
67625 + * contains the right value of the smallest removed key from the previous
67626 + * cut_worker() iteration. This is needed for proper accounting of
67627 + * "i_blocks" and "i_bytes" fields of the @object.
67628 + */
67629 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
67630 + struct inode *object, int truncate)
67631 +{
67632 + lock_handle parent_lock;
67633 + coord_t cut_from;
67634 + coord_t cut_to;
67635 + reiser4_tree *tree;
67636 + int ret;
67637 +
67638 + assert("zam-937", node != NULL);
67639 + assert("zam-933", znode_is_write_locked(node));
67640 + assert("zam-999", smallest_removed != NULL);
67641 +
67642 + init_lh(&parent_lock);
67643 +
67644 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
67645 + if (ret)
67646 + return ret;
67647 +
67648 + assert("zam-934", !znode_above_root(parent_lock.node));
67649 +
67650 + ret = zload(parent_lock.node);
67651 + if (ret)
67652 + goto failed_nozrelse;
67653 +
67654 + ret = find_child_ptr(parent_lock.node, node, &cut_from);
67655 + if (ret)
67656 + goto failed;
67657 +
67658 + /* decrement child counter and set parent pointer to NULL before
67659 + deleting the list from parent node because of checks in
67660 + internal_kill_item_hook (we can delete the last item from the parent
67661 + node, the parent node is going to be deleted and its c_count should
67662 + be zero). */
67663 +
67664 + tree = znode_get_tree(node);
67665 + write_lock_tree(tree);
67666 + init_parent_coord(&node->in_parent, NULL);
67667 + --parent_lock.node->c_count;
67668 + write_unlock_tree(tree);
67669 +
67670 + assert("zam-989", item_is_internal(&cut_from));
67671 +
67672 + /* @node should be deleted after unlocking. */
67673 + ZF_SET(node, JNODE_HEARD_BANSHEE);
67674 +
67675 + /* remove a pointer from the parent node to the node being deleted. */
67676 + coord_dup(&cut_to, &cut_from);
67677 + /* FIXME: shouldn't this be kill_node_content */
67678 + ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
67679 + if (ret)
67680 + /* FIXME(Zam): Should we re-connect the node to its parent if
67681 + * cut_node fails? */
67682 + goto failed;
67683 +
67684 + {
67685 + reiser4_tree *tree = current_tree;
67686 + __u64 start_offset = 0, end_offset = 0;
67687 +
67688 + read_lock_tree(tree);
67689 + write_lock_dk(tree);
67690 + if (object) {
67691 + /* We use @smallest_removed and the left delimiting of
67692 + * the current node for @object->i_blocks, i_bytes
67693 + * calculation. We assume that the items after the
67694 + * *@smallest_removed key have been deleted from the
67695 + * file body. */
67696 + start_offset = get_key_offset(znode_get_ld_key(node));
67697 + end_offset = get_key_offset(smallest_removed);
67698 + }
67699 +
67700 + assert("zam-1021", znode_is_connected(node));
67701 + if (node->left)
67702 + znode_set_rd_key(node->left, znode_get_rd_key(node));
67703 +
67704 + *smallest_removed = *znode_get_ld_key(node);
67705 +
67706 + write_unlock_dk(tree);
67707 + read_unlock_tree(tree);
67708 +
67709 + if (object) {
67710 + /* we used to perform actions which are to be performed on items on their removal from tree in
67711 + special item method - kill_hook. Here for optimization reasons we avoid reading node
67712 + containing item we remove and can not call item's kill hook. Instead we call function which
67713 + does exactly the same things as tail kill hook in assumption that node we avoid reading
67714 + contains only one item and that item is a tail one. */
67715 + fake_kill_hook_tail(object, start_offset, end_offset,
67716 + truncate);
67717 + }
67718 + }
67719 + failed:
67720 + zrelse(parent_lock.node);
67721 + failed_nozrelse:
67722 + done_lh(&parent_lock);
67723 +
67724 + return ret;
67725 +}
67726 +
67727 +static int can_delete(const reiser4_key *key, znode *node)
67728 +{
67729 + int result;
67730 +
67731 + read_lock_dk(current_tree);
67732 + result = keyle(key, znode_get_ld_key(node));
67733 + read_unlock_dk(current_tree);
67734 + return result;
67735 +}
67736 +
67737 +/**
67738 + * This subroutine is not optimal but implementation seems to
67739 + * be easier).
67740 + *
67741 + * @tap: the point deletion process begins from,
67742 + * @from_key: the beginning of the deleted key range,
67743 + * @to_key: the end of the deleted key range,
67744 + * @smallest_removed: the smallest removed key,
67745 + * @truncate: true if called for file truncate.
67746 + * @progress: return true if a progress in file items deletions was made,
67747 + * @smallest_removed value is actual in that case.
67748 + *
67749 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long
67750 + * reiser4_cut_tree operation was interrupted for allowing atom commit.
67751 + */
67752 +int
67753 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
67754 + const reiser4_key * to_key,
67755 + reiser4_key * smallest_removed, struct inode *object,
67756 + int truncate, int *progress)
67757 +{
67758 + lock_handle next_node_lock;
67759 + coord_t left_coord;
67760 + int result;
67761 +
67762 + assert("zam-931", tap->coord->node != NULL);
67763 + assert("zam-932", znode_is_write_locked(tap->coord->node));
67764 +
67765 + *progress = 0;
67766 + init_lh(&next_node_lock);
67767 +
67768 + while (1) {
67769 + znode *node; /* node from which items are cut */
67770 + node_plugin *nplug; /* node plugin for @node */
67771 +
67772 + node = tap->coord->node;
67773 +
67774 + /* Move next_node_lock to the next node on the left. */
67775 + result =
67776 + reiser4_get_left_neighbor(&next_node_lock, node,
67777 + ZNODE_WRITE_LOCK,
67778 + GN_CAN_USE_UPPER_LEVELS);
67779 + if (result != 0 && result != -E_NO_NEIGHBOR)
67780 + break;
67781 + /* Check can we delete the node as a whole. */
67782 + if (*progress && znode_get_level(node) == LEAF_LEVEL &&
67783 + can_delete(from_key, node)) {
67784 + result = reiser4_delete_node(node, smallest_removed,
67785 + object, truncate);
67786 + } else {
67787 + result = reiser4_tap_load(tap);
67788 + if (result)
67789 + return result;
67790 +
67791 + /* Prepare the second (right) point for cut_node() */
67792 + if (*progress)
67793 + coord_init_last_unit(tap->coord, node);
67794 +
67795 + else if (item_plugin_by_coord(tap->coord)->b.lookup ==
67796 + NULL)
67797 + /* set rightmost unit for the items without lookup method */
67798 + tap->coord->unit_pos =
67799 + coord_last_unit_pos(tap->coord);
67800 +
67801 + nplug = node->nplug;
67802 +
67803 + assert("vs-686", nplug);
67804 + assert("vs-687", nplug->lookup);
67805 +
67806 + /* left_coord is leftmost unit cut from @node */
67807 + result = nplug->lookup(node, from_key,
67808 + FIND_MAX_NOT_MORE_THAN,
67809 + &left_coord);
67810 +
67811 + if (IS_CBKERR(result))
67812 + break;
67813 +
67814 + /* adjust coordinates so that they are set to existing units */
67815 + if (coord_set_to_right(&left_coord)
67816 + || coord_set_to_left(tap->coord)) {
67817 + result = 0;
67818 + break;
67819 + }
67820 +
67821 + if (coord_compare(&left_coord, tap->coord) ==
67822 + COORD_CMP_ON_RIGHT) {
67823 + /* keys from @from_key to @to_key are not in the tree */
67824 + result = 0;
67825 + break;
67826 + }
67827 +
67828 + if (left_coord.item_pos != tap->coord->item_pos) {
67829 + /* do not allow to cut more than one item. It is added to solve problem of truncating
67830 + partially converted files. If file is partially converted there may exist a twig node
67831 + containing both internal item or items pointing to leaf nodes with formatting items
67832 + and extent item. We do not want to kill internal items being at twig node here
67833 + because cut_tree_worker assumes killing them from level level */
67834 + coord_dup(&left_coord, tap->coord);
67835 + assert("vs-1652",
67836 + coord_is_existing_unit(&left_coord));
67837 + left_coord.unit_pos = 0;
67838 + }
67839 +
67840 + /* cut data from one node */
67841 + // *smallest_removed = *reiser4_min_key();
67842 + result =
67843 + kill_node_content(&left_coord, tap->coord, from_key,
67844 + to_key, smallest_removed,
67845 + next_node_lock.node, object,
67846 + truncate);
67847 + reiser4_tap_relse(tap);
67848 + }
67849 + if (result)
67850 + break;
67851 +
67852 + ++(*progress);
67853 +
67854 + /* Check whether all items with keys >= from_key were removed
67855 + * from the tree. */
67856 + if (keyle(smallest_removed, from_key))
67857 + /* result = 0; */
67858 + break;
67859 +
67860 + if (next_node_lock.node == NULL)
67861 + break;
67862 +
67863 + result = reiser4_tap_move(tap, &next_node_lock);
67864 + done_lh(&next_node_lock);
67865 + if (result)
67866 + break;
67867 +
67868 + /* Break long reiser4_cut_tree operation (deletion of a large
67869 + file) if atom requires commit. */
67870 + if (*progress > CUT_TREE_MIN_ITERATIONS
67871 + && current_atom_should_commit()) {
67872 + result = -E_REPEAT;
67873 + break;
67874 + }
67875 + }
67876 + done_lh(&next_node_lock);
67877 + // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
67878 + return result;
67879 +}
67880 +
67881 +/* there is a fundamental problem with optimizing deletes: VFS does it
67882 + one file at a time. Another problem is that if an item can be
67883 + anything, then deleting items must be done one at a time. It just
67884 + seems clean to writes this to specify a from and a to key, and cut
67885 + everything between them though. */
67886 +
67887 +/* use this function with care if deleting more than what is part of a single file. */
67888 +/* do not use this when cutting a single item, it is suboptimal for that */
67889 +
67890 +/* You are encouraged to write plugin specific versions of this. It
67891 + cannot be optimal for all plugins because it works item at a time,
67892 + and some plugins could sometimes work node at a time. Regular files
67893 + however are not optimizable to work node at a time because of
67894 + extents needing to free the blocks they point to.
67895 +
67896 + Optimizations compared to v3 code:
67897 +
67898 + It does not balance (that task is left to memory pressure code).
67899 +
67900 + Nodes are deleted only if empty.
67901 +
67902 + Uses extents.
67903 +
67904 + Performs read-ahead of formatted nodes whose contents are part of
67905 + the deletion.
67906 +*/
67907 +
67908 +/**
67909 + * Delete everything from the reiser4 tree between two keys: @from_key and
67910 + * @to_key.
67911 + *
67912 + * @from_key: the beginning of the deleted key range,
67913 + * @to_key: the end of the deleted key range,
67914 + * @smallest_removed: the smallest removed key,
67915 + * @object: owner of cutting items.
67916 + * @truncate: true if called for file truncate.
67917 + * @progress: return true if a progress in file items deletions was made,
67918 + * @smallest_removed value is actual in that case.
67919 + *
67920 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
67921 + * operation was interrupted for allowing atom commit .
67922 + */
67923 +
67924 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
67925 + const reiser4_key * to_key,
67926 + reiser4_key * smallest_removed_p,
67927 + struct inode *object, int truncate, int *progress)
67928 +{
67929 + lock_handle lock;
67930 + int result;
67931 + tap_t tap;
67932 + coord_t right_coord;
67933 + reiser4_key smallest_removed;
67934 + int (*cut_tree_worker) (tap_t *, const reiser4_key *,
67935 + const reiser4_key *, reiser4_key *,
67936 + struct inode *, int, int *);
67937 + STORE_COUNTERS;
67938 +
67939 + assert("umka-329", tree != NULL);
67940 + assert("umka-330", from_key != NULL);
67941 + assert("umka-331", to_key != NULL);
67942 + assert("zam-936", keyle(from_key, to_key));
67943 +
67944 + if (smallest_removed_p == NULL)
67945 + smallest_removed_p = &smallest_removed;
67946 +
67947 + init_lh(&lock);
67948 +
67949 + do {
67950 + /* Find rightmost item to cut away from the tree. */
67951 + result = reiser4_object_lookup(object, to_key, &right_coord,
67952 + &lock, ZNODE_WRITE_LOCK,
67953 + FIND_MAX_NOT_MORE_THAN,
67954 + TWIG_LEVEL, LEAF_LEVEL,
67955 + CBK_UNIQUE, NULL /*ra_info */);
67956 + if (result != CBK_COORD_FOUND)
67957 + break;
67958 + if (object == NULL
67959 + || inode_file_plugin(object)->cut_tree_worker == NULL)
67960 + cut_tree_worker = cut_tree_worker_common;
67961 + else
67962 + cut_tree_worker =
67963 + inode_file_plugin(object)->cut_tree_worker;
67964 + reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
67965 + result =
67966 + cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
67967 + object, truncate, progress);
67968 + reiser4_tap_done(&tap);
67969 +
67970 + reiser4_preempt_point();
67971 +
67972 + } while (0);
67973 +
67974 + done_lh(&lock);
67975 +
67976 + if (result) {
67977 + switch (result) {
67978 + case -E_NO_NEIGHBOR:
67979 + result = 0;
67980 + break;
67981 + case -E_DEADLOCK:
67982 + result = -E_REPEAT;
67983 + case -E_REPEAT:
67984 + case -ENOMEM:
67985 + case -ENOENT:
67986 + break;
67987 + default:
67988 + warning("nikita-2861", "failure: %i", result);
67989 + }
67990 + }
67991 +
67992 + CHECK_COUNTERS;
67993 + return result;
67994 +}
67995 +
67996 +/* repeat reiser4_cut_tree_object until everything is deleted.
67997 + * unlike cut_file_items, it does not end current transaction if -E_REPEAT
67998 + * is returned by cut_tree_object. */
67999 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68000 + const reiser4_key * to, struct inode *inode, int truncate)
68001 +{
68002 + int result;
68003 + int progress;
68004 +
68005 + do {
68006 + result = reiser4_cut_tree_object(tree, from, to, NULL,
68007 + inode, truncate, &progress);
68008 + } while (result == -E_REPEAT);
68009 +
68010 + return result;
68011 +}
68012 +
68013 +/* finishing reiser4 initialization */
68014 +int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
68015 + * initialized */ ,
68016 + const reiser4_block_nr * root_block /* address of a root block
68017 + * on a disk */ ,
68018 + tree_level height /* height of a tree */ ,
68019 + node_plugin * nplug /* default node plugin */ )
68020 +{
68021 + int result;
68022 +
68023 + assert("nikita-306", tree != NULL);
68024 + assert("nikita-307", root_block != NULL);
68025 + assert("nikita-308", height > 0);
68026 + assert("nikita-309", nplug != NULL);
68027 + assert("zam-587", tree->super != NULL);
68028 +
68029 + tree->root_block = *root_block;
68030 + tree->height = height;
68031 + tree->estimate_one_insert = calc_estimate_one_insert(height);
68032 + tree->nplug = nplug;
68033 +
68034 + tree->znode_epoch = 1ull;
68035 +
68036 + cbk_cache_init(&tree->cbk_cache);
68037 +
68038 + result = znodes_tree_init(tree);
68039 + if (result == 0)
68040 + result = jnodes_tree_init(tree);
68041 + if (result == 0) {
68042 + tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
68043 + reiser4_ctx_gfp_mask_get());
68044 + if (IS_ERR(tree->uber)) {
68045 + result = PTR_ERR(tree->uber);
68046 + tree->uber = NULL;
68047 + }
68048 + }
68049 + return result;
68050 +}
68051 +
68052 +/* release resources associated with @tree */
68053 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
68054 +{
68055 + if (tree == NULL)
68056 + return;
68057 +
68058 + if (tree->uber != NULL) {
68059 + zput(tree->uber);
68060 + tree->uber = NULL;
68061 + }
68062 + znodes_tree_done(tree);
68063 + jnodes_tree_done(tree);
68064 + cbk_cache_done(&tree->cbk_cache);
68065 +}
68066 +
68067 +/* Make Linus happy.
68068 + Local variables:
68069 + c-indentation-style: "K&R"
68070 + mode-name: "LC"
68071 + c-basic-offset: 8
68072 + tab-width: 8
68073 + fill-column: 120
68074 + scroll-step: 1
68075 + End:
68076 +*/
68077 diff -urN linux-2.6.23.orig/fs/reiser4/tree.h linux-2.6.23/fs/reiser4/tree.h
68078 --- linux-2.6.23.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
68079 +++ linux-2.6.23/fs/reiser4/tree.h 2007-12-04 16:49:30.000000000 +0300
68080 @@ -0,0 +1,577 @@
68081 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68082 + * reiser4/README */
68083 +
68084 +/* Tree operations. See fs/reiser4/tree.c for comments */
68085 +
68086 +#if !defined( __REISER4_TREE_H__ )
68087 +#define __REISER4_TREE_H__
68088 +
68089 +#include "forward.h"
68090 +#include "debug.h"
68091 +#include "dformat.h"
68092 +#include "plugin/node/node.h"
68093 +#include "plugin/plugin.h"
68094 +#include "znode.h"
68095 +#include "tap.h"
68096 +
68097 +#include <linux/types.h> /* for __u?? */
68098 +#include <linux/fs.h> /* for struct super_block */
68099 +#include <linux/spinlock.h>
68100 +#include <linux/sched.h> /* for struct task_struct */
68101 +
68102 +/* fictive block number never actually used */
68103 +extern const reiser4_block_nr UBER_TREE_ADDR;
68104 +
68105 +/* &cbk_cache_slot - entry in a coord cache.
68106 +
68107 + This is entry in a coord_by_key (cbk) cache, represented by
68108 + &cbk_cache.
68109 +
68110 +*/
68111 +typedef struct cbk_cache_slot {
68112 + /* cached node */
68113 + znode *node;
68114 + /* linkage to the next cbk cache slot in a LRU order */
68115 + struct list_head lru;
68116 +} cbk_cache_slot;
68117 +
68118 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
68119 +
68120 + cbk_cache is supposed to speed up tree lookups by caching results of recent
68121 + successful lookups (we don't cache negative results as dentry cache
68122 + does). Cache consists of relatively small number of entries kept in a LRU
68123 + order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
68124 + which we can obtain a range of keys that covered by this znode. Before
68125 + embarking into real tree traversal we scan cbk_cache slot by slot and for
68126 + each slot check whether key we are looking for is between minimal and
68127 + maximal keys for node pointed to by this slot. If no match is found, real
68128 + tree traversal is performed and if result is successful, appropriate entry
68129 + is inserted into cache, possibly pulling least recently used entry out of
68130 + it.
68131 +
68132 + Tree spin lock is used to protect coord cache. If contention for this
68133 + lock proves to be too high, more finer grained locking can be added.
68134 +
68135 + Invariants involving parts of this data-type:
68136 +
68137 + [cbk-cache-invariant]
68138 +*/
68139 +typedef struct cbk_cache {
68140 + /* serializator */
68141 + rwlock_t guard;
68142 + int nr_slots;
68143 + /* head of LRU list of cache slots */
68144 + struct list_head lru;
68145 + /* actual array of slots */
68146 + cbk_cache_slot *slot;
68147 +} cbk_cache;
68148 +
68149 +/* level_lookup_result - possible outcome of looking up key at some level.
68150 + This is used by coord_by_key when traversing tree downward. */
68151 +typedef enum {
68152 + /* continue to the next level */
68153 + LOOKUP_CONT,
68154 + /* done. Either required item was found, or we can prove it
68155 + doesn't exist, or some error occurred. */
68156 + LOOKUP_DONE,
68157 + /* restart traversal from the root. Infamous "repetition". */
68158 + LOOKUP_REST
68159 +} level_lookup_result;
68160 +
68161 +/* This is representation of internal reiser4 tree where all file-system
68162 + data and meta-data are stored. This structure is passed to all tree
68163 + manipulation functions. It's different from the super block because:
68164 + we don't want to limit ourselves to strictly one to one mapping
68165 + between super blocks and trees, and, because they are logically
68166 + different: there are things in a super block that have no relation to
68167 + the tree (bitmaps, journalling area, mount options, etc.) and there
68168 + are things in a tree that bear no relation to the super block, like
68169 + tree of znodes.
68170 +
68171 + At this time, there is only one tree
68172 + per filesystem, and this struct is part of the super block. We only
68173 + call the super block the super block for historical reasons (most
68174 + other filesystems call the per filesystem metadata the super block).
68175 +*/
68176 +
68177 +struct reiser4_tree {
68178 + /* block_nr == 0 is fake znode. Write lock it, while changing
68179 + tree height. */
68180 + /* disk address of root node of a tree */
68181 + reiser4_block_nr root_block;
68182 +
68183 + /* level of the root node. If this is 1, tree consists of root
68184 + node only */
68185 + tree_level height;
68186 +
68187 + /*
68188 + * this is cached here to avoid calling plugins through function
68189 + * dereference all the time.
68190 + */
68191 + __u64 estimate_one_insert;
68192 +
68193 + /* cache of recent tree lookup results */
68194 + cbk_cache cbk_cache;
68195 +
68196 + /* hash table to look up znodes by block number. */
68197 + z_hash_table zhash_table;
68198 + z_hash_table zfake_table;
68199 + /* hash table to look up jnodes by inode and offset. */
68200 + j_hash_table jhash_table;
68201 +
68202 + /* lock protecting:
68203 + - parent pointers,
68204 + - sibling pointers,
68205 + - znode hash table
68206 + - coord cache
68207 + */
68208 + /* NOTE: The "giant" tree lock can be replaced by more spin locks,
68209 + hoping they will be less contended. We can use one spin lock per
68210 + znode hash bucket. With the addition of some code complexity, sibling
68211 + pointers can be protected by both znode spin locks. However, although
68212 + it looks more SMP scalable, we should test this locking change on n-way
68213 + (n > 4) SMP machines. The current 4-way machine test does not show that
68214 + the tree lock is contended or that it is a bottleneck (2003.07.25). */
68215 +
68216 + rwlock_t tree_lock;
68217 +
68218 + /* lock protecting delimiting keys */
68219 + rwlock_t dk_lock;
68220 +
68221 + /* spin lock protecting znode_epoch */
68222 + spinlock_t epoch_lock;
68223 + /* version stamp used to mark znode updates. See seal.[ch] for more
68224 + * information. */
68225 + __u64 znode_epoch;
68226 +
68227 + znode *uber;
68228 + node_plugin *nplug;
68229 + struct super_block *super;
68230 + struct {
68231 + /* carry flags used for insertion of new nodes */
68232 + __u32 new_node_flags;
68233 + /* carry flags used for insertion of new extents */
68234 + __u32 new_extent_flags;
68235 + /* carry flags used for paste operations */
68236 + __u32 paste_flags;
68237 + /* carry flags used for insert operations */
68238 + __u32 insert_flags;
68239 + } carry;
68240 +};
68241 +
68242 +extern int reiser4_init_tree(reiser4_tree * tree,
68243 + const reiser4_block_nr * root_block,
68244 + tree_level height, node_plugin * default_plugin);
68245 +extern void reiser4_done_tree(reiser4_tree * tree);
68246 +
68247 +/* cbk flags: options for coord_by_key() */
68248 +typedef enum {
68249 + /* coord_by_key() is called for insertion. This is necessary because
68250 + of extents being located at the twig level. For explanation, see
68251 + comment just above is_next_item_internal().
68252 + */
68253 + CBK_FOR_INSERT = (1 << 0),
68254 + /* coord_by_key() is called with key that is known to be unique */
68255 + CBK_UNIQUE = (1 << 1),
68256 + /* coord_by_key() can trust delimiting keys. This option is not user
68257 + accessible. coord_by_key() will set it automatically. It will be
68258 + only cleared by special-case in extents-on-the-twig-level handling
68259 + where it is necessary to insert item with a key smaller than
68260 + leftmost key in a node. This is necessary because of extents being
68261 + located at the twig level. For explanation, see comment just above
68262 + is_next_item_internal().
68263 + */
68264 + CBK_TRUST_DK = (1 << 2),
68265 + CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
68266 + CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
68267 + CBK_DKSET = (1 << 5),
68268 + CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
68269 + CBK_IN_CACHE = (1 << 7), /* node is already in cache */
68270 + CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of long term
68271 + * lock */
68272 +} cbk_flags;
68273 +
68274 +/* insertion outcome. IBK = insert by key */
68275 +typedef enum {
68276 + IBK_INSERT_OK = 0,
68277 + IBK_ALREADY_EXISTS = -EEXIST,
68278 + IBK_IO_ERROR = -EIO,
68279 + IBK_NO_SPACE = -E_NODE_FULL,
68280 + IBK_OOM = -ENOMEM
68281 +} insert_result;
68282 +
68283 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
68284 +
68285 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
68286 + lock_handle * lh, void *arg);
68287 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
68288 + lock_handle * lh,
68289 + tree_iterate_actor_t actor, void *arg,
68290 + znode_lock_mode mode, int through_units_p);
68291 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
68292 + znode_lock_request pri, lock_handle * lh);
68293 +
68294 +/* return node plugin of @node */
68295 +static inline node_plugin *node_plugin_by_node(const znode *
68296 + node /* node to query */ )
68297 +{
68298 + assert("vs-213", node != NULL);
68299 + assert("vs-214", znode_is_loaded(node));
68300 +
68301 + return node->nplug;
68302 +}
68303 +
68304 +/* number of items in @node */
68305 +static inline pos_in_node_t node_num_items(const znode * node)
68306 +{
68307 + assert("nikita-2754", znode_is_loaded(node));
68308 + assert("nikita-2468",
68309 + node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
68310 +
68311 + return node->nr_items;
68312 +}
68313 +
68314 +/* Return the number of items at the present node. Asserts coord->node !=
68315 + NULL. */
68316 +static inline unsigned coord_num_items(const coord_t * coord)
68317 +{
68318 + assert("jmacd-9805", coord->node != NULL);
68319 +
68320 + return node_num_items(coord->node);
68321 +}
68322 +
68323 +/* true if @node is empty */
68324 +static inline int node_is_empty(const znode * node)
68325 +{
68326 + return node_num_items(node) == 0;
68327 +}
68328 +
68329 +typedef enum {
68330 + SHIFTED_SOMETHING = 0,
68331 + SHIFT_NO_SPACE = -E_NODE_FULL,
68332 + SHIFT_IO_ERROR = -EIO,
68333 + SHIFT_OOM = -ENOMEM,
68334 +} shift_result;
68335 +
68336 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
68337 +extern int is_coord_in_node(const coord_t * coord);
68338 +extern int key_in_node(const reiser4_key *, const coord_t *);
68339 +extern void coord_item_move_to(coord_t * coord, int items);
68340 +extern void coord_unit_move_to(coord_t * coord, int units);
68341 +
68342 +/* there are two types of repetitive accesses (ra): intra-syscall
68343 + (local) and inter-syscall (global). Local ra is used when
68344 + during single syscall we add/delete several items and units in the
68345 + same place in a tree. Note that plan-A fragments local ra by
68346 + separating stat-data and file body in key-space. Global ra is
68347 + used when user does repetitive modifications in the same place in a
68348 + tree.
68349 +
68350 + Our ra implementation serves following purposes:
68351 + 1 it affects balancing decisions so that next operation in a row
68352 + can be performed faster;
68353 + 2 it affects lower-level read-ahead in page-cache;
68354 + 3 it allows to avoid unnecessary lookups by maintaining some state
68355 + across several operations (this is only for local ra);
68356 + 4 it leaves room for lazy-micro-balancing: when we start a sequence of
68357 + operations they are performed without actually doing any intra-node
68358 + shifts, until we finish sequence or scope of sequence leaves
68359 + current node, only then we really pack node (local ra only).
68360 +*/
68361 +
68362 +/* another thing that can be useful is to keep per-tree and/or
68363 + per-process cache of recent lookups. This cache can be organised as a
68364 + list of block numbers of formatted nodes sorted by starting key in
68365 + this node. Balancings should invalidate appropriate parts of this
68366 + cache.
68367 +*/
68368 +
68369 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
68370 + coord_t * coord, lock_handle * handle,
68371 + znode_lock_mode lock, lookup_bias bias,
68372 + tree_level lock_level, tree_level stop_level,
68373 + __u32 flags, ra_info_t *);
68374 +
68375 +lookup_result reiser4_object_lookup(struct inode *object,
68376 + const reiser4_key * key,
68377 + coord_t * coord,
68378 + lock_handle * lh,
68379 + znode_lock_mode lock_mode,
68380 + lookup_bias bias,
68381 + tree_level lock_level,
68382 + tree_level stop_level,
68383 + __u32 flags, ra_info_t * info);
68384 +
68385 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
68386 + reiser4_item_data * data, coord_t * coord,
68387 + lock_handle * lh,
68388 + tree_level stop_level, __u32 flags);
68389 +insert_result insert_by_coord(coord_t * coord,
68390 + reiser4_item_data * data, const reiser4_key * key,
68391 + lock_handle * lh, __u32);
68392 +insert_result insert_extent_by_coord(coord_t * coord,
68393 + reiser4_item_data * data,
68394 + const reiser4_key * key, lock_handle * lh);
68395 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
68396 + const reiser4_key * to_key,
68397 + reiser4_key * smallest_removed);
68398 +int kill_node_content(coord_t * from, coord_t * to,
68399 + const reiser4_key * from_key, const reiser4_key * to_key,
68400 + reiser4_key * smallest_removed,
68401 + znode * locked_left_neighbor, struct inode *inode,
68402 + int truncate);
68403 +
68404 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
68405 + reiser4_key * key, lock_handle * lh, cop_insert_flag);
68406 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
68407 + reiser4_item_data * data, unsigned);
68408 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
68409 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
68410 + coord_t * result);
68411 +
68412 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
68413 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
68414 +
68415 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
68416 +
68417 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
68418 + const reiser4_key *, reiser4_key *,
68419 + struct inode *, int, int *);
68420 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
68421 + const reiser4_key *, reiser4_key *,
68422 + struct inode *, int, int *);
68423 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68424 + const reiser4_key * to, struct inode *, int);
68425 +
68426 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
68427 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
68428 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
68429 + znode * left, coord_t * result);
68430 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
68431 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
68432 + znode * child);
68433 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
68434 + int incore_p, int setup_dkeys_p);
68435 +
68436 +extern int cbk_cache_init(cbk_cache * cache);
68437 +extern void cbk_cache_done(cbk_cache * cache);
68438 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
68439 +
68440 +extern char *sprint_address(const reiser4_block_nr * block);
68441 +
68442 +#if REISER4_DEBUG
68443 +extern void print_coord_content(const char *prefix, coord_t * p);
68444 +extern void reiser4_print_address(const char *prefix,
68445 + const reiser4_block_nr * block);
68446 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
68447 + __u32 flags);
68448 +extern void check_dkeys(znode *node);
68449 +#else
68450 +#define print_coord_content(p, c) noop
68451 +#define reiser4_print_address(p, b) noop
68452 +#endif
68453 +
68454 +extern void forget_znode(lock_handle * handle);
68455 +extern int deallocate_znode(znode * node);
68456 +
68457 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
68458 +
68459 +/* struct used internally to pack all numerous arguments of tree lookup.
68460 + Used to avoid passing a lot of arguments to helper functions. */
68461 +typedef struct cbk_handle {
68462 + /* tree we are in */
68463 + reiser4_tree *tree;
68464 + /* key we are going after */
68465 + const reiser4_key *key;
68466 + /* coord we will store result in */
68467 + coord_t *coord;
68468 + /* type of lock to take on target node */
68469 + znode_lock_mode lock_mode;
68470 + /* lookup bias. See comments at the declaration of lookup_bias */
68471 + lookup_bias bias;
68472 + /* lock level: level starting from which tree traversal starts taking
68473 + * write locks. */
68474 + tree_level lock_level;
68475 + /* level where search will stop. Either item will be found between
68476 + lock_level and stop_level, or CBK_COORD_NOTFOUND will be
68477 + returned.
68478 + */
68479 + tree_level stop_level;
68480 + /* level we are currently at */
68481 + tree_level level;
68482 + /* block number of @active node. Tree traversal operates on two
68483 + nodes: active and parent. */
68484 + reiser4_block_nr block;
68485 + /* put here error message to be printed by caller */
68486 + const char *error;
68487 + /* result passed back to caller */
68488 + lookup_result result;
68489 + /* lock handles for active and parent */
68490 + lock_handle *parent_lh;
68491 + lock_handle *active_lh;
68492 + reiser4_key ld_key;
68493 + reiser4_key rd_key;
68494 + /* flags, passed to the cbk routine. Bits of this bitmask are defined
68495 + in tree.h:cbk_flags enum. */
68496 + __u32 flags;
68497 + ra_info_t *ra_info;
68498 + struct inode *object;
68499 +} cbk_handle;
68500 +
68501 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
68502 +
68503 +/* eottl.c */
68504 +extern int handle_eottl(cbk_handle *h, int *outcome);
68505 +
68506 +int lookup_multikey(cbk_handle * handle, int nr_keys);
68507 +int lookup_couple(reiser4_tree * tree,
68508 + const reiser4_key * key1, const reiser4_key * key2,
68509 + coord_t * coord1, coord_t * coord2,
68510 + lock_handle * lh1, lock_handle * lh2,
68511 + znode_lock_mode lock_mode, lookup_bias bias,
68512 + tree_level lock_level, tree_level stop_level, __u32 flags,
68513 + int *result1, int *result2);
68514 +
68515 +static inline void read_lock_tree(reiser4_tree *tree)
68516 +{
68517 + /* check that tree is not locked */
68518 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68519 + LOCK_CNT_NIL(read_locked_tree) &&
68520 + LOCK_CNT_NIL(write_locked_tree)));
68521 + /* check that spinlocks of lower priorities are not held */
68522 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68523 + LOCK_CNT_NIL(rw_locked_dk) &&
68524 + LOCK_CNT_NIL(spin_locked_stack)));
68525 +
68526 + read_lock(&(tree->tree_lock));
68527 +
68528 + LOCK_CNT_INC(read_locked_tree);
68529 + LOCK_CNT_INC(rw_locked_tree);
68530 + LOCK_CNT_INC(spin_locked);
68531 +}
68532 +
68533 +static inline void read_unlock_tree(reiser4_tree *tree)
68534 +{
68535 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
68536 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68537 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68538 +
68539 + LOCK_CNT_DEC(read_locked_tree);
68540 + LOCK_CNT_DEC(rw_locked_tree);
68541 + LOCK_CNT_DEC(spin_locked);
68542 +
68543 + read_unlock(&(tree->tree_lock));
68544 +}
68545 +
68546 +static inline void write_lock_tree(reiser4_tree *tree)
68547 +{
68548 + /* check that tree is not locked */
68549 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68550 + LOCK_CNT_NIL(read_locked_tree) &&
68551 + LOCK_CNT_NIL(write_locked_tree)));
68552 + /* check that spinlocks of lower priorities are not held */
68553 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68554 + LOCK_CNT_NIL(rw_locked_dk) &&
68555 + LOCK_CNT_NIL(spin_locked_stack)));
68556 +
68557 + write_lock(&(tree->tree_lock));
68558 +
68559 + LOCK_CNT_INC(write_locked_tree);
68560 + LOCK_CNT_INC(rw_locked_tree);
68561 + LOCK_CNT_INC(spin_locked);
68562 +}
68563 +
68564 +static inline void write_unlock_tree(reiser4_tree *tree)
68565 +{
68566 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
68567 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68568 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68569 +
68570 + LOCK_CNT_DEC(write_locked_tree);
68571 + LOCK_CNT_DEC(rw_locked_tree);
68572 + LOCK_CNT_DEC(spin_locked);
68573 +
68574 + write_unlock(&(tree->tree_lock));
68575 +}
68576 +
68577 +static inline void read_lock_dk(reiser4_tree *tree)
68578 +{
68579 + /* check that dk is not locked */
68580 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68581 + LOCK_CNT_NIL(read_locked_dk) &&
68582 + LOCK_CNT_NIL(write_locked_dk)));
68583 + /* check that spinlocks of lower priorities are not held */
68584 + assert("", LOCK_CNT_NIL(spin_locked_stack));
68585 +
68586 + read_lock(&((tree)->dk_lock));
68587 +
68588 + LOCK_CNT_INC(read_locked_dk);
68589 + LOCK_CNT_INC(rw_locked_dk);
68590 + LOCK_CNT_INC(spin_locked);
68591 +}
68592 +
68593 +static inline void read_unlock_dk(reiser4_tree *tree)
68594 +{
68595 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
68596 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68597 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68598 +
68599 + LOCK_CNT_DEC(read_locked_dk);
68600 + LOCK_CNT_DEC(rw_locked_dk);
68601 + LOCK_CNT_DEC(spin_locked);
68602 +
68603 + read_unlock(&(tree->dk_lock));
68604 +}
68605 +
68606 +static inline void write_lock_dk(reiser4_tree *tree)
68607 +{
68608 + /* check that dk is not locked */
68609 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68610 + LOCK_CNT_NIL(read_locked_dk) &&
68611 + LOCK_CNT_NIL(write_locked_dk)));
68612 + /* check that spinlocks of lower priorities are not held */
68613 + assert("", LOCK_CNT_NIL(spin_locked_stack));
68614 +
68615 + write_lock(&((tree)->dk_lock));
68616 +
68617 + LOCK_CNT_INC(write_locked_dk);
68618 + LOCK_CNT_INC(rw_locked_dk);
68619 + LOCK_CNT_INC(spin_locked);
68620 +}
68621 +
68622 +static inline void write_unlock_dk(reiser4_tree *tree)
68623 +{
68624 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
68625 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68626 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68627 +
68628 + LOCK_CNT_DEC(write_locked_dk);
68629 + LOCK_CNT_DEC(rw_locked_dk);
68630 + LOCK_CNT_DEC(spin_locked);
68631 +
68632 + write_unlock(&(tree->dk_lock));
68633 +}
68634 +
68635 +/* estimate api. Implementation is in estimate.c */
68636 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
68637 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
68638 +reiser4_block_nr estimate_insert_flow(tree_level);
68639 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
68640 +reiser4_block_nr calc_estimate_one_insert(tree_level);
68641 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
68642 +reiser4_block_nr estimate_insert_cluster(struct inode *);
68643 +reiser4_block_nr estimate_update_cluster(struct inode *);
68644 +
68645 +/* __REISER4_TREE_H__ */
68646 +#endif
68647 +
68648 +/* Make Linus happy.
68649 + Local variables:
68650 + c-indentation-style: "K&R"
68651 + mode-name: "LC"
68652 + c-basic-offset: 8
68653 + tab-width: 8
68654 + fill-column: 120
68655 + scroll-step: 1
68656 + End:
68657 +*/
68658 diff -urN linux-2.6.23.orig/fs/reiser4/tree_mod.c linux-2.6.23/fs/reiser4/tree_mod.c
68659 --- linux-2.6.23.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
68660 +++ linux-2.6.23/fs/reiser4/tree_mod.c 2007-12-04 16:49:30.000000000 +0300
68661 @@ -0,0 +1,386 @@
68662 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68663 + * reiser4/README */
68664 +
68665 +/*
68666 + * Functions to add/delete new nodes to/from the tree.
68667 + *
68668 + * Functions from this file are used by carry (see carry*) to handle:
68669 + *
68670 + * . insertion of new formatted node into tree
68671 + *
68672 + * . addition of new tree root, increasing tree height
68673 + *
68674 + * . removing tree root, decreasing tree height
68675 + *
68676 + */
68677 +
68678 +#include "forward.h"
68679 +#include "debug.h"
68680 +#include "dformat.h"
68681 +#include "key.h"
68682 +#include "coord.h"
68683 +#include "plugin/plugin.h"
68684 +#include "jnode.h"
68685 +#include "znode.h"
68686 +#include "tree_mod.h"
68687 +#include "block_alloc.h"
68688 +#include "tree_walk.h"
68689 +#include "tree.h"
68690 +#include "super.h"
68691 +
68692 +#include <linux/err.h>
68693 +
68694 +static int add_child_ptr(znode * parent, znode * child);
68695 +/* warning only issued if error is not -E_REPEAT */
68696 +#define ewarning( error, ... ) \
68697 + if( ( error ) != -E_REPEAT ) \
68698 + warning( __VA_ARGS__ )
68699 +
68700 +/* allocate new node on the @level and immediately on the right of @brother. */
68701 +znode * reiser4_new_node(znode * brother /* existing left neighbor
68702 + * of new node */,
68703 + tree_level level /* tree level at which new node is to
68704 + * be allocated */)
68705 +{
68706 + znode *result;
68707 + int retcode;
68708 + reiser4_block_nr blocknr;
68709 +
68710 + assert("nikita-930", brother != NULL);
68711 + assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
68712 +
68713 + retcode = assign_fake_blocknr_formatted(&blocknr);
68714 + if (retcode == 0) {
68715 + result =
68716 + zget(znode_get_tree(brother), &blocknr, NULL, level,
68717 + reiser4_ctx_gfp_mask_get());
68718 + if (IS_ERR(result)) {
68719 + ewarning(PTR_ERR(result), "nikita-929",
68720 + "Cannot allocate znode for carry: %li",
68721 + PTR_ERR(result));
68722 + return result;
68723 + }
68724 + /* cheap test, can be executed even when debugging is off */
68725 + if (!znode_just_created(result)) {
68726 + warning("nikita-2213",
68727 + "Allocated already existing block: %llu",
68728 + (unsigned long long)blocknr);
68729 + zput(result);
68730 + return ERR_PTR(RETERR(-EIO));
68731 + }
68732 +
68733 + assert("nikita-931", result != NULL);
68734 + result->nplug = znode_get_tree(brother)->nplug;
68735 + assert("nikita-933", result->nplug != NULL);
68736 +
68737 + retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
68738 + if (retcode == 0) {
68739 + ZF_SET(result, JNODE_CREATED);
68740 + zrelse(result);
68741 + } else {
68742 + zput(result);
68743 + result = ERR_PTR(retcode);
68744 + }
68745 + } else {
68746 + /* failure to allocate new node during balancing.
68747 + This should never happen. Ever. Returning -E_REPEAT
68748 + is not viable solution, because "out of disk space"
68749 + is not transient error that will go away by itself.
68750 + */
68751 + ewarning(retcode, "nikita-928",
68752 + "Cannot allocate block for carry: %i", retcode);
68753 + result = ERR_PTR(retcode);
68754 + }
68755 + assert("nikita-1071", result != NULL);
68756 + return result;
68757 +}
68758 +
68759 +/* allocate new root and add it to the tree
68760 +
68761 + This helper function is called by add_new_root().
68762 +
68763 +*/
68764 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
68765 + znode * fake /* "fake" znode */ )
68766 +{
68767 + reiser4_tree *tree = znode_get_tree(old_root);
68768 + znode *new_root = NULL; /* to shut gcc up */
68769 + int result;
68770 +
68771 + assert("nikita-1069", old_root != NULL);
68772 + assert("umka-262", fake != NULL);
68773 + assert("umka-263", tree != NULL);
68774 +
68775 + /* "fake" znode---one always hanging just above current root. This
68776 + node is locked when new root is created or existing root is
68777 + deleted. Downward tree traversal takes lock on it before taking
68778 + lock on a root node. This avoids race conditions with root
68779 + manipulations.
68780 +
68781 + */
68782 + assert("nikita-1348", znode_above_root(fake));
68783 + assert("nikita-1211", znode_is_root(old_root));
68784 +
68785 + result = 0;
68786 + if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
68787 + warning("nikita-1344", "Tree is too tall: %i", tree->height);
68788 + /* ext2 returns -ENOSPC when it runs out of free inodes with a
68789 + following comment (fs/ext2/ialloc.c:441): Is it really
68790 + ENOSPC?
68791 +
68792 + -EXFULL? -EINVAL?
68793 + */
68794 + result = RETERR(-ENOSPC);
68795 + } else {
68796 + /* Allocate block for new root. It's not that
68797 + important where it will be allocated, as root is
68798 + almost always in memory. Moreover, allocate on
68799 + flush can be going here.
68800 + */
68801 + assert("nikita-1448", znode_is_root(old_root));
68802 + new_root = reiser4_new_node(fake, tree->height + 1);
68803 + if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
68804 + lock_handle rlh;
68805 +
68806 + init_lh(&rlh);
68807 + result =
68808 + longterm_lock_znode(&rlh, new_root,
68809 + ZNODE_WRITE_LOCK,
68810 + ZNODE_LOCK_LOPRI);
68811 + if (result == 0) {
68812 + parent_coord_t *in_parent;
68813 +
68814 + znode_make_dirty(fake);
68815 +
68816 + /* new root is a child of "fake" node */
68817 + write_lock_tree(tree);
68818 +
68819 + ++tree->height;
68820 +
68821 + /* recalculate max balance overhead */
68822 + tree->estimate_one_insert =
68823 + estimate_one_insert_item(tree);
68824 +
68825 + tree->root_block = *znode_get_block(new_root);
68826 + in_parent = &new_root->in_parent;
68827 + init_parent_coord(in_parent, fake);
68828 + /* manually insert new root into sibling
68829 + * list. With this all nodes involved into
68830 + * balancing are connected after balancing is
68831 + * done---useful invariant to check. */
68832 + sibling_list_insert_nolock(new_root, NULL);
68833 + write_unlock_tree(tree);
68834 +
68835 + /* insert into new root pointer to the
68836 + @old_root. */
68837 + assert("nikita-1110",
68838 + WITH_DATA(new_root,
68839 + node_is_empty(new_root)));
68840 + write_lock_dk(tree);
68841 + znode_set_ld_key(new_root, reiser4_min_key());
68842 + znode_set_rd_key(new_root, reiser4_max_key());
68843 + write_unlock_dk(tree);
68844 + if (REISER4_DEBUG) {
68845 + ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
68846 + ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
68847 + ZF_SET(old_root, JNODE_ORPHAN);
68848 + }
68849 + result = add_child_ptr(new_root, old_root);
68850 + done_lh(&rlh);
68851 + }
68852 + zrelse(new_root);
68853 + }
68854 + }
68855 + if (result != 0)
68856 + new_root = ERR_PTR(result);
68857 + return new_root;
68858 +}
68859 +
68860 +/* build &reiser4_item_data for inserting child pointer
68861 +
68862 + Build &reiser4_item_data that can be later used to insert pointer to @child
68863 + in its parent.
68864 +
68865 +*/
68866 +void build_child_ptr_data(znode * child /* node pointer to which will be
68867 + * inserted */ ,
68868 + reiser4_item_data * data /* where to store result */ )
68869 +{
68870 + assert("nikita-1116", child != NULL);
68871 + assert("nikita-1117", data != NULL);
68872 +
68873 + /*
68874 + * NOTE: use address of child's blocknr as address of data to be
68875 + * inserted. As result of this data gets into on-disk structure in cpu
68876 + * byte order. internal's create_hook converts it to little endian byte
68877 + * order.
68878 + */
68879 + data->data = (char *)znode_get_block(child);
68880 + /* data -> data is kernel space */
68881 + data->user = 0;
68882 + data->length = sizeof(reiser4_block_nr);
68883 + /* FIXME-VS: hardcoded internal item? */
68884 +
68885 + /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
68886 + data->iplug = item_plugin_by_id(NODE_POINTER_ID);
68887 +}
68888 +
68889 +/* add pointer to @child into empty @parent.
68890 +
68891 + This is used when pointer to old root is inserted into new root which is
68892 + empty.
68893 +*/
68894 +static int add_child_ptr(znode * parent, znode * child)
68895 +{
68896 + coord_t coord;
68897 + reiser4_item_data data;
68898 + int result;
68899 + reiser4_key key;
68900 +
68901 + assert("nikita-1111", parent != NULL);
68902 + assert("nikita-1112", child != NULL);
68903 + assert("nikita-1115",
68904 + znode_get_level(parent) == znode_get_level(child) + 1);
68905 +
68906 + result = zload(parent);
68907 + if (result != 0)
68908 + return result;
68909 + assert("nikita-1113", node_is_empty(parent));
68910 + coord_init_first_unit(&coord, parent);
68911 +
68912 + build_child_ptr_data(child, &data);
68913 + data.arg = NULL;
68914 +
68915 + read_lock_dk(znode_get_tree(parent));
68916 + key = *znode_get_ld_key(child);
68917 + read_unlock_dk(znode_get_tree(parent));
68918 +
68919 + result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
68920 + NULL);
68921 + znode_make_dirty(parent);
68922 + zrelse(parent);
68923 + return result;
68924 +}
68925 +
68926 +/* actually remove tree root */
68927 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
68928 + * being removed */,
68929 + znode * old_root /* root node that is being
68930 + * removed */ ,
68931 + znode * new_root /* new root---sole child of
68932 + * @old_root */,
68933 + const reiser4_block_nr * new_root_blk /* disk address of
68934 + * @new_root */)
68935 +{
68936 + znode *uber;
68937 + int result;
68938 + lock_handle handle_for_uber;
68939 +
68940 + assert("umka-265", tree != NULL);
68941 + assert("nikita-1198", new_root != NULL);
68942 + assert("nikita-1199",
68943 + znode_get_level(new_root) + 1 == znode_get_level(old_root));
68944 +
68945 + assert("nikita-1201", znode_is_write_locked(old_root));
68946 +
68947 + assert("nikita-1203",
68948 + disk_addr_eq(new_root_blk, znode_get_block(new_root)));
68949 +
68950 + init_lh(&handle_for_uber);
68951 + /* obtain and lock "fake" znode protecting changes in tree height. */
68952 + result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
68953 + &handle_for_uber);
68954 + if (result == 0) {
68955 + uber = handle_for_uber.node;
68956 +
68957 + znode_make_dirty(uber);
68958 +
68959 + /* don't take a long term lock on @new_root. Take spinlock. */
68960 +
68961 + write_lock_tree(tree);
68962 +
68963 + tree->root_block = *new_root_blk;
68964 + --tree->height;
68965 +
68966 + /* recalculate max balance overhead */
68967 + tree->estimate_one_insert = estimate_one_insert_item(tree);
68968 +
68969 + assert("nikita-1202",
68970 + tree->height == znode_get_level(new_root));
68971 +
68972 + /* new root is child on "fake" node */
68973 + init_parent_coord(&new_root->in_parent, uber);
68974 + ++uber->c_count;
68975 +
68976 + /* sibling_list_insert_nolock(new_root, NULL); */
68977 + write_unlock_tree(tree);
68978 +
68979 + /* reinitialise old root. */
68980 + result = node_plugin_by_node(old_root)->init(old_root);
68981 + znode_make_dirty(old_root);
68982 + if (result == 0) {
68983 + assert("nikita-1279", node_is_empty(old_root));
68984 + ZF_SET(old_root, JNODE_HEARD_BANSHEE);
68985 + old_root->c_count = 0;
68986 + }
68987 + }
68988 + done_lh(&handle_for_uber);
68989 +
68990 + return result;
68991 +}
68992 +
68993 +/* remove tree root
68994 +
68995 + This function removes tree root, decreasing tree height by one. Tree root
68996 + and its only child (that is going to become new tree root) are write locked
68997 + at the entry.
68998 +
68999 + To remove tree root we need to take lock on special "fake" znode that
69000 + protects changes of tree height. See comments in reiser4_add_tree_root() for
69001 + more on this.
69002 +
69003 + Also parent pointers have to be updated in
69004 + old and new root. To simplify code, function is split into two parts: outer
69005 + reiser4_kill_tree_root() collects all necessary arguments and calls
69006 + reiser4_kill_root() to do the actual job.
69007 +
69008 +*/
69009 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
69010 + removing*/)
69011 +{
69012 + int result;
69013 + coord_t down_link;
69014 + znode *new_root;
69015 + reiser4_tree *tree;
69016 +
69017 + assert("umka-266", current_tree != NULL);
69018 + assert("nikita-1194", old_root != NULL);
69019 + assert("nikita-1196", znode_is_root(old_root));
69020 + assert("nikita-1200", node_num_items(old_root) == 1);
69021 + assert("nikita-1401", znode_is_write_locked(old_root));
69022 +
69023 + coord_init_first_unit(&down_link, old_root);
69024 +
69025 + tree = znode_get_tree(old_root);
69026 + new_root = child_znode(&down_link, old_root, 0, 1);
69027 + if (!IS_ERR(new_root)) {
69028 + result =
69029 + reiser4_kill_root(tree, old_root, new_root,
69030 + znode_get_block(new_root));
69031 + zput(new_root);
69032 + } else
69033 + result = PTR_ERR(new_root);
69034 +
69035 + return result;
69036 +}
69037 +
69038 +/* Make Linus happy.
69039 + Local variables:
69040 + c-indentation-style: "K&R"
69041 + mode-name: "LC"
69042 + c-basic-offset: 8
69043 + tab-width: 8
69044 + fill-column: 120
69045 + scroll-step: 1
69046 + End:
69047 +*/
69048 diff -urN linux-2.6.23.orig/fs/reiser4/tree_mod.h linux-2.6.23/fs/reiser4/tree_mod.h
69049 --- linux-2.6.23.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
69050 +++ linux-2.6.23/fs/reiser4/tree_mod.h 2007-12-04 16:49:30.000000000 +0300
69051 @@ -0,0 +1,29 @@
69052 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69053 + * reiser4/README */
69054 +
69055 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
69056 + * comments. */
69057 +
69058 +#if !defined( __REISER4_TREE_MOD_H__ )
69059 +#define __REISER4_TREE_MOD_H__
69060 +
69061 +#include "forward.h"
69062 +
69063 +znode *reiser4_new_node(znode * brother, tree_level level);
69064 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
69065 +int reiser4_kill_tree_root(znode * old_root);
69066 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
69067 +
69068 +/* __REISER4_TREE_MOD_H__ */
69069 +#endif
69070 +
69071 +/* Make Linus happy.
69072 + Local variables:
69073 + c-indentation-style: "K&R"
69074 + mode-name: "LC"
69075 + c-basic-offset: 8
69076 + tab-width: 8
69077 + fill-column: 120
69078 + scroll-step: 1
69079 + End:
69080 +*/
69081 diff -urN linux-2.6.23.orig/fs/reiser4/tree_walk.c linux-2.6.23/fs/reiser4/tree_walk.c
69082 --- linux-2.6.23.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
69083 +++ linux-2.6.23/fs/reiser4/tree_walk.c 2007-12-04 16:49:30.000000000 +0300
69084 @@ -0,0 +1,927 @@
69085 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69086 + * reiser4/README */
69087 +
69088 +/* Routines and macros to:
69089 +
69090 + get_left_neighbor()
69091 +
69092 + get_right_neighbor()
69093 +
69094 + get_parent()
69095 +
69096 + get_first_child()
69097 +
69098 + get_last_child()
69099 +
69100 + various routines to walk the whole tree and do things to it like
69101 + repack it, or move it to tertiary storage. Please make them as
69102 + generic as is reasonable.
69103 +
69104 +*/
69105 +
69106 +#include "forward.h"
69107 +#include "debug.h"
69108 +#include "dformat.h"
69109 +#include "coord.h"
69110 +#include "plugin/item/item.h"
69111 +#include "jnode.h"
69112 +#include "znode.h"
69113 +#include "tree_walk.h"
69114 +#include "tree.h"
69115 +#include "super.h"
69116 +
69117 +/* These macros are used internally in tree_walk.c in attempt to make
69118 + lock_neighbor() code usable to build lock_parent(), lock_right_neighbor,
69119 + lock_left_neighbor */
69120 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
69121 +#define FIELD_OFFSET(name) offsetof(znode, name)
69122 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
69123 +#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
69124 +#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
69125 +
69126 +/* This is the generic procedure to get and lock `generic' neighbor (left or
69127 + right neighbor or parent). It implements common algorithm for all cases of
69128 + getting lock on neighbor node, only znode structure field is different in
69129 + each case. This is parameterized by ptr_offset argument, which is byte
69130 + offset for the pointer to the desired neighbor within the current node's
69131 + znode structure. This function should be called with the tree lock held */
69132 +static int lock_neighbor(
69133 + /* resulting lock handle */
69134 + lock_handle * result,
69135 + /* znode to lock */
69136 + znode * node,
69137 + /* pointer to neighbor (or parent) znode field offset, in bytes from
69138 + the base address of znode structure */
69139 + int ptr_offset,
69140 + /* lock mode for longterm_lock_znode call */
69141 + znode_lock_mode mode,
69142 + /* lock request for longterm_lock_znode call */
69143 + znode_lock_request req,
69144 + /* GN_* flags */
69145 + int flags, int rlocked)
69146 +{
69147 + reiser4_tree *tree = znode_get_tree(node);
69148 + znode *neighbor;
69149 + int ret;
69150 +
69151 + assert("umka-236", node != NULL);
69152 + assert("umka-237", tree != NULL);
69153 + assert_rw_locked(&(tree->tree_lock));
69154 +
69155 + if (flags & GN_TRY_LOCK)
69156 + req |= ZNODE_LOCK_NONBLOCK;
69157 + if (flags & GN_SAME_ATOM)
69158 + req |= ZNODE_LOCK_DONT_FUSE;
69159 +
69160 + /* get neighbor's address by using of sibling link, quit while loop
69161 + (and return) if link is not available. */
69162 + while (1) {
69163 + neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
69164 +
69165 + /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
69166 + * node pointed by it is not connected.
69167 + *
69168 + * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
69169 + * check and allows passing reference to not connected znode to
69170 + * subsequent longterm_lock_znode() call. This kills possible
69171 + * busy loop if we are trying to get longterm lock on locked but
69172 + * not yet connected parent node. */
69173 + if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
69174 + || znode_is_connected(neighbor))) {
69175 + return RETERR(-E_NO_NEIGHBOR);
69176 + }
69177 +
69178 + /* protect it from deletion. */
69179 + zref(neighbor);
69180 +
69181 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69182 +
69183 + ret = longterm_lock_znode(result, neighbor, mode, req);
69184 +
69185 + /* The lock handle obtains its own reference, release the one from above. */
69186 + zput(neighbor);
69187 +
69188 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69189 +
69190 + /* restart if node we got reference to is being
69191 + invalidated. we should not get reference to this node
69192 + again. */
69193 + if (ret == -EINVAL)
69194 + continue;
69195 + if (ret)
69196 + return ret;
69197 +
69198 + /* check if neighbor link still points to just locked znode;
69199 + the link could have been changed while the process slept. */
69200 + if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
69201 + return 0;
69202 +
69203 + /* znode was locked by mistake; unlock it and restart locking
69204 + process from beginning. */
69205 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69206 + longterm_unlock_znode(result);
69207 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69208 + }
69209 +}
69210 +
69211 +/* get parent node with longterm lock, accepts GN* flags. */
69212 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
69213 + znode * node /* child node */ ,
69214 + znode_lock_mode mode
69215 + /* type of lock: read or write */ ,
69216 + int flags /* GN_* flags */ )
69217 +{
69218 + int result;
69219 +
69220 + read_lock_tree(znode_get_tree(node));
69221 + result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
69222 + ZNODE_LOCK_HIPRI, flags, 1);
69223 + read_unlock_tree(znode_get_tree(node));
69224 + return result;
69225 +}
69226 +
69227 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
69228 + bit in @flags parameter */
69229 +/* Audited by: umka (2002.06.14) */
69230 +static inline int
69231 +lock_side_neighbor(lock_handle * result,
69232 + znode * node, znode_lock_mode mode, int flags, int rlocked)
69233 +{
69234 + int ret;
69235 + int ptr_offset;
69236 + znode_lock_request req;
69237 +
69238 + if (flags & GN_GO_LEFT) {
69239 + ptr_offset = LEFT_PTR_OFFSET;
69240 + req = ZNODE_LOCK_LOPRI;
69241 + } else {
69242 + ptr_offset = RIGHT_PTR_OFFSET;
69243 + req = ZNODE_LOCK_HIPRI;
69244 + }
69245 +
69246 + ret =
69247 + lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
69248 +
69249 + if (ret == -E_NO_NEIGHBOR) /* if we walk left or right -E_NO_NEIGHBOR does not
69250 + * guarantee that neighbor is absent in the
69251 + * tree; in this case we return -ENOENT --
69252 + * means neighbor at least not found in
69253 + * cache */
69254 + return RETERR(-ENOENT);
69255 +
69256 + return ret;
69257 +}
69258 +
69259 +#if REISER4_DEBUG
69260 +
69261 +int check_sibling_list(znode * node)
69262 +{
69263 + znode *scan;
69264 + znode *next;
69265 +
69266 + assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
69267 +
69268 + if (node == NULL)
69269 + return 1;
69270 +
69271 + if (ZF_ISSET(node, JNODE_RIP))
69272 + return 1;
69273 +
69274 + assert("nikita-3270", node != NULL);
69275 + assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
69276 +
69277 + for (scan = node; znode_is_left_connected(scan); scan = next) {
69278 + next = scan->left;
69279 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69280 + assert("nikita-3271", znode_is_right_connected(next));
69281 + assert("nikita-3272", next->right == scan);
69282 + } else
69283 + break;
69284 + }
69285 + for (scan = node; znode_is_right_connected(scan); scan = next) {
69286 + next = scan->right;
69287 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69288 + assert("nikita-3273", znode_is_left_connected(next));
69289 + assert("nikita-3274", next->left == scan);
69290 + } else
69291 + break;
69292 + }
69293 + return 1;
69294 +}
69295 +
69296 +#endif
69297 +
69298 +/* Znode sibling pointers maintenence. */
69299 +
69300 +/* Znode sibling pointers are established between any neighbored nodes which are
69301 + in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
69302 + JNODE_RIGHT_CONNECTED), if left or right sibling pointer contains actual
69303 + value (even NULL), corresponded JNODE_*_CONNECTED bit is set.
69304 +
69305 + Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
69306 + take care about searching (hash table lookup may be required) of znode
69307 + neighbors, establishing sibling pointers between them and setting
69308 + JNODE_*_CONNECTED state bits. */
69309 +
69310 +/* adjusting of sibling pointers and `connected' states for two
69311 + neighbors; works if one neighbor is NULL (was not found). */
69312 +
69313 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
69314 +void link_left_and_right(znode * left, znode * right)
69315 +{
69316 + assert("nikita-3275", check_sibling_list(left));
69317 + assert("nikita-3275", check_sibling_list(right));
69318 +
69319 + if (left != NULL) {
69320 + if (left->right == NULL) {
69321 + left->right = right;
69322 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
69323 +
69324 + ON_DEBUG(left->right_version =
69325 + atomic_inc_return(&delim_key_version);
69326 + );
69327 +
69328 + } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
69329 + && left->right != right) {
69330 +
69331 + ON_DEBUG(left->right->left_version =
69332 + atomic_inc_return(&delim_key_version);
69333 + left->right_version =
69334 + atomic_inc_return(&delim_key_version););
69335 +
69336 + left->right->left = NULL;
69337 + left->right = right;
69338 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
69339 + } else
69340 + /*
69341 + * there is a race condition in renew_sibling_link()
69342 + * and assertions below check that it is only one
69343 + * there. Thread T1 calls renew_sibling_link() without
69344 + * GN_NO_ALLOC flag. zlook() doesn't find neighbor
69345 + * node, but before T1 gets to the
69346 + * link_left_and_right(), another thread T2 creates
69347 + * neighbor node and connects it. check for
69348 + * left->right == NULL above protects T1 from
69349 + * overwriting correct left->right pointer installed
69350 + * by T2.
69351 + */
69352 + assert("nikita-3302",
69353 + right == NULL || left->right == right);
69354 + }
69355 + if (right != NULL) {
69356 + if (right->left == NULL) {
69357 + right->left = left;
69358 + ZF_SET(right, JNODE_LEFT_CONNECTED);
69359 +
69360 + ON_DEBUG(right->left_version =
69361 + atomic_inc_return(&delim_key_version);
69362 + );
69363 +
69364 + } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
69365 + && right->left != left) {
69366 +
69367 + ON_DEBUG(right->left->right_version =
69368 + atomic_inc_return(&delim_key_version);
69369 + right->left_version =
69370 + atomic_inc_return(&delim_key_version););
69371 +
69372 + right->left->right = NULL;
69373 + right->left = left;
69374 + ZF_SET(right, JNODE_LEFT_CONNECTED);
69375 +
69376 + } else
69377 + assert("nikita-3303",
69378 + left == NULL || right->left == left);
69379 + }
69380 + assert("nikita-3275", check_sibling_list(left));
69381 + assert("nikita-3275", check_sibling_list(right));
69382 +}
69383 +
69384 +/* Audited by: umka (2002.06.14) */
69385 +static void link_znodes(znode * first, znode * second, int to_left)
69386 +{
69387 + if (to_left)
69388 + link_left_and_right(second, first);
69389 + else
69390 + link_left_and_right(first, second);
69391 +}
69392 +
69393 +/* getting of next (to left or to right, depend on gn_to_left bit in flags)
69394 + coord's unit position in horizontal direction, even across node
69395 + boundary. Should be called under tree lock, it protects nonexistence of
69396 + sibling link on parent level, if lock_side_neighbor() fails with
69397 + -ENOENT. */
69398 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
69399 +{
69400 + int ret;
69401 + znode *node;
69402 + reiser4_tree *tree;
69403 +
69404 + assert("umka-243", coord != NULL);
69405 + assert("umka-244", handle != NULL);
69406 + assert("zam-1069", handle->node == NULL);
69407 +
69408 + ret =
69409 + (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
69410 + coord_next_unit(coord);
69411 + if (!ret)
69412 + return 0;
69413 +
69414 + ret =
69415 + lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
69416 + if (ret)
69417 + return ret;
69418 +
69419 + node = handle->node;
69420 + tree = znode_get_tree(node);
69421 + write_unlock_tree(tree);
69422 +
69423 + coord_init_zero(coord);
69424 +
69425 + /* We avoid synchronous read here if it is specified by flag. */
69426 + if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
69427 + ret = jstartio(ZJNODE(handle->node));
69428 + if (!ret)
69429 + ret = -E_REPEAT;
69430 + goto error_locked;
69431 + }
69432 +
69433 + /* corresponded zrelse() should be called by the clients of
69434 + far_next_coord(), in place when this node gets unlocked. */
69435 + ret = zload(handle->node);
69436 + if (ret)
69437 + goto error_locked;
69438 +
69439 + if (flags & GN_GO_LEFT)
69440 + coord_init_last_unit(coord, node);
69441 + else
69442 + coord_init_first_unit(coord, node);
69443 +
69444 + if (0) {
69445 + error_locked:
69446 + longterm_unlock_znode(handle);
69447 + }
69448 + write_lock_tree(tree);
69449 + return ret;
69450 +}
69451 +
69452 +/* Very significant function which performs a step in horizontal direction
69453 + when sibling pointer is not available. Actually, it is only function which
69454 + does it.
69455 + Note: this function does not restore locking status at exit,
69456 + caller should does care about proper unlocking and zrelsing */
69457 +static int
69458 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
69459 + tree_level level, int flags, int *nr_locked)
69460 +{
69461 + int ret;
69462 + int to_left = flags & GN_GO_LEFT;
69463 + reiser4_block_nr da;
69464 + /* parent of the neighbor node; we set it to parent until not sharing
69465 + of one parent between child and neighbor node is detected */
69466 + znode *side_parent = coord->node;
69467 + reiser4_tree *tree = znode_get_tree(child);
69468 + znode *neighbor = NULL;
69469 +
69470 + assert("umka-245", coord != NULL);
69471 + assert("umka-246", handle != NULL);
69472 + assert("umka-247", child != NULL);
69473 + assert("umka-303", tree != NULL);
69474 +
69475 + init_lh(handle);
69476 + write_lock_tree(tree);
69477 + ret = far_next_coord(coord, handle, flags);
69478 +
69479 + if (ret) {
69480 + if (ret != -ENOENT) {
69481 + write_unlock_tree(tree);
69482 + return ret;
69483 + }
69484 + } else {
69485 + item_plugin *iplug;
69486 +
69487 + if (handle->node != NULL) {
69488 + (*nr_locked)++;
69489 + side_parent = handle->node;
69490 + }
69491 +
69492 + /* does coord object points to internal item? We do not
69493 + support sibling pointers between znode for formatted and
69494 + unformatted nodes and return -E_NO_NEIGHBOR in that case. */
69495 + iplug = item_plugin_by_coord(coord);
69496 + if (!item_is_internal(coord)) {
69497 + link_znodes(child, NULL, to_left);
69498 + write_unlock_tree(tree);
69499 + /* we know there can't be formatted neighbor */
69500 + return RETERR(-E_NO_NEIGHBOR);
69501 + }
69502 + write_unlock_tree(tree);
69503 +
69504 + iplug->s.internal.down_link(coord, NULL, &da);
69505 +
69506 + if (flags & GN_NO_ALLOC) {
69507 + neighbor = zlook(tree, &da);
69508 + } else {
69509 + neighbor =
69510 + zget(tree, &da, side_parent, level,
69511 + reiser4_ctx_gfp_mask_get());
69512 + }
69513 +
69514 + if (IS_ERR(neighbor)) {
69515 + ret = PTR_ERR(neighbor);
69516 + return ret;
69517 + }
69518 +
69519 + if (neighbor)
69520 + /* update delimiting keys */
69521 + set_child_delimiting_keys(coord->node, coord, neighbor);
69522 +
69523 + write_lock_tree(tree);
69524 + }
69525 +
69526 + if (likely(neighbor == NULL ||
69527 + (znode_get_level(child) == znode_get_level(neighbor)
69528 + && child != neighbor)))
69529 + link_znodes(child, neighbor, to_left);
69530 + else {
69531 + warning("nikita-3532",
69532 + "Sibling nodes on the different levels: %i != %i\n",
69533 + znode_get_level(child), znode_get_level(neighbor));
69534 + ret = RETERR(-EIO);
69535 + }
69536 +
69537 + write_unlock_tree(tree);
69538 +
69539 + /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
69540 + if (neighbor != NULL && (flags & GN_NO_ALLOC))
69541 + /* atomic_dec(&ZJNODE(neighbor)->x_count); */
69542 + zput(neighbor);
69543 +
69544 + return ret;
69545 +}
69546 +
69547 +/* This function is for establishing of one side relation. */
69548 +/* Audited by: umka (2002.06.14) */
69549 +static int connect_one_side(coord_t * coord, znode * node, int flags)
69550 +{
69551 + coord_t local;
69552 + lock_handle handle;
69553 + int nr_locked;
69554 + int ret;
69555 +
69556 + assert("umka-248", coord != NULL);
69557 + assert("umka-249", node != NULL);
69558 +
69559 + coord_dup_nocheck(&local, coord);
69560 +
69561 + init_lh(&handle);
69562 +
69563 + ret =
69564 + renew_sibling_link(&local, &handle, node, znode_get_level(node),
69565 + flags | GN_NO_ALLOC, &nr_locked);
69566 +
69567 + if (handle.node != NULL) {
69568 + /* complementary operations for zload() and lock() in far_next_coord() */
69569 + zrelse(handle.node);
69570 + longterm_unlock_znode(&handle);
69571 + }
69572 +
69573 + /* we catch error codes which are not interesting for us because we
69574 + run renew_sibling_link() only for znode connection. */
69575 + if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
69576 + return 0;
69577 +
69578 + return ret;
69579 +}
69580 +
69581 +/* if @child is not in `connected' state, performs hash searches for left and
69582 + right neighbor nodes and establishes horizontal sibling links */
69583 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69584 +int connect_znode(coord_t * parent_coord, znode * child)
69585 +{
69586 + reiser4_tree *tree = znode_get_tree(child);
69587 + int ret = 0;
69588 +
69589 + assert("zam-330", parent_coord != NULL);
69590 + assert("zam-331", child != NULL);
69591 + assert("zam-332", parent_coord->node != NULL);
69592 + assert("umka-305", tree != NULL);
69593 +
69594 + /* it is trivial to `connect' root znode because it can't have
69595 + neighbors */
69596 + if (znode_above_root(parent_coord->node)) {
69597 + child->left = NULL;
69598 + child->right = NULL;
69599 + ZF_SET(child, JNODE_LEFT_CONNECTED);
69600 + ZF_SET(child, JNODE_RIGHT_CONNECTED);
69601 +
69602 + ON_DEBUG(child->left_version =
69603 + atomic_inc_return(&delim_key_version);
69604 + child->right_version =
69605 + atomic_inc_return(&delim_key_version););
69606 +
69607 + return 0;
69608 + }
69609 +
69610 + /* load parent node */
69611 + coord_clear_iplug(parent_coord);
69612 + ret = zload(parent_coord->node);
69613 +
69614 + if (ret != 0)
69615 + return ret;
69616 +
69617 + /* protect `connected' state check by tree_lock */
69618 + read_lock_tree(tree);
69619 +
69620 + if (!znode_is_right_connected(child)) {
69621 + read_unlock_tree(tree);
69622 + /* connect right (default is right) */
69623 + ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
69624 + if (ret)
69625 + goto zrelse_and_ret;
69626 +
69627 + read_lock_tree(tree);
69628 + }
69629 +
69630 + ret = znode_is_left_connected(child);
69631 +
69632 + read_unlock_tree(tree);
69633 +
69634 + if (!ret) {
69635 + ret =
69636 + connect_one_side(parent_coord, child,
69637 + GN_NO_ALLOC | GN_GO_LEFT);
69638 + } else
69639 + ret = 0;
69640 +
69641 + zrelse_and_ret:
69642 + zrelse(parent_coord->node);
69643 +
69644 + return ret;
69645 +}
69646 +
69647 +/* this function is like renew_sibling_link() but allocates neighbor node if
69648 + it doesn't exist and `connects' it. It may require making two steps in
69649 + horizontal direction, first one for neighbor node finding/allocation,
69650 + second one is for finding neighbor of neighbor to connect freshly allocated
69651 + znode. */
69652 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69653 +static int
69654 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
69655 +{
69656 + coord_t local;
69657 + lock_handle empty[2];
69658 + reiser4_tree *tree = znode_get_tree(node);
69659 + znode *neighbor = NULL;
69660 + int nr_locked = 0;
69661 + int ret;
69662 +
69663 + assert("umka-250", coord != NULL);
69664 + assert("umka-251", node != NULL);
69665 + assert("umka-307", tree != NULL);
69666 + assert("umka-308", level <= tree->height);
69667 +
69668 + /* umka (2002.06.14)
69669 + Here probably should be a check for given "level" validness.
69670 + Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
69671 + */
69672 +
69673 + coord_dup(&local, coord);
69674 +
69675 + ret =
69676 + renew_sibling_link(&local, &empty[0], node, level,
69677 + flags & ~GN_NO_ALLOC, &nr_locked);
69678 + if (ret)
69679 + goto out;
69680 +
69681 + /* tree lock is not needed here because we keep parent node(s) locked
69682 + and reference to neighbor znode incremented */
69683 + neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
69684 +
69685 + read_lock_tree(tree);
69686 + ret = znode_is_connected(neighbor);
69687 + read_unlock_tree(tree);
69688 + if (ret) {
69689 + ret = 0;
69690 + goto out;
69691 + }
69692 +
69693 + ret =
69694 + renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
69695 + flags | GN_NO_ALLOC, &nr_locked);
69696 + /* second renew_sibling_link() call is used for znode connection only,
69697 + so we can live with these errors */
69698 + if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
69699 + ret = 0;
69700 +
69701 + out:
69702 +
69703 + for (--nr_locked; nr_locked >= 0; --nr_locked) {
69704 + zrelse(empty[nr_locked].node);
69705 + longterm_unlock_znode(&empty[nr_locked]);
69706 + }
69707 +
69708 + if (neighbor != NULL)
69709 + /* decrement znode reference counter without actually
69710 + releasing it. */
69711 + atomic_dec(&ZJNODE(neighbor)->x_count);
69712 +
69713 + return ret;
69714 +}
69715 +
69716 +/*
69717 + reiser4_get_neighbor() -- lock node's neighbor.
69718 +
69719 + reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
69720 + given parameter) using sibling link to it. If sibling link is not available
69721 + (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one
69722 + level up for information about neighbor's disk address. We lock node's
69723 + parent, if it is common parent for both 'node' and its neighbor, neighbor's
69724 + disk address is in next (to left or to right) down link from link that points
69725 + to original node. If not, we need to lock parent's neighbor, read its content
69726 + and take first(last) downlink with neighbor's disk address. That locking
69727 + could be done by using sibling link and lock_neighbor() function, if sibling
69728 + link exists. In another case we have to go level up again until we find
69729 + common parent or valid sibling link. Then go down
69730 + allocating/connecting/locking/reading nodes until neighbor of first one is
69731 + locked.
69732 +
69733 + @neighbor: result lock handle,
69734 + @node: a node which we lock neighbor of,
69735 + @lock_mode: lock mode {LM_READ, LM_WRITE},
69736 + @flags: logical OR of {GN_*} (see description above) subset.
69737 +
69738 + @return: 0 if success, negative value if lock was impossible due to an error
69739 + or lack of neighbor node.
69740 +*/
69741 +
69742 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69743 +int
69744 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
69745 + znode_lock_mode lock_mode, int flags)
69746 +{
69747 + reiser4_tree *tree = znode_get_tree(node);
69748 + lock_handle path[REAL_MAX_ZTREE_HEIGHT];
69749 +
69750 + coord_t coord;
69751 +
69752 + tree_level base_level;
69753 + tree_level h = 0;
69754 + int ret;
69755 +
69756 + assert("umka-252", tree != NULL);
69757 + assert("umka-253", neighbor != NULL);
69758 + assert("umka-254", node != NULL);
69759 +
69760 + base_level = znode_get_level(node);
69761 +
69762 + assert("umka-310", base_level <= tree->height);
69763 +
69764 + coord_init_zero(&coord);
69765 +
69766 + again:
69767 + /* first, we try to use simple lock_neighbor() which requires sibling
69768 + link existence */
69769 + read_lock_tree(tree);
69770 + ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
69771 + read_unlock_tree(tree);
69772 + if (!ret) {
69773 + /* load znode content if it was specified */
69774 + if (flags & GN_LOAD_NEIGHBOR) {
69775 + ret = zload(node);
69776 + if (ret)
69777 + longterm_unlock_znode(neighbor);
69778 + }
69779 + return ret;
69780 + }
69781 +
69782 + /* only -ENOENT means we may look upward and try to connect
69783 + @node with its neighbor (if @flags allow us to do it) */
69784 + if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
69785 + return ret;
69786 +
69787 + /* before establishing of sibling link we lock parent node; it is
69788 + required by renew_neighbor() to work. */
69789 + init_lh(&path[0]);
69790 + ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
69791 + if (ret)
69792 + return ret;
69793 + if (znode_above_root(path[0].node)) {
69794 + longterm_unlock_znode(&path[0]);
69795 + return RETERR(-E_NO_NEIGHBOR);
69796 + }
69797 +
69798 + while (1) {
69799 + znode *child = (h == 0) ? node : path[h - 1].node;
69800 + znode *parent = path[h].node;
69801 +
69802 + ret = zload(parent);
69803 + if (ret)
69804 + break;
69805 +
69806 + ret = find_child_ptr(parent, child, &coord);
69807 +
69808 + if (ret) {
69809 + zrelse(parent);
69810 + break;
69811 + }
69812 +
69813 + /* try to establish missing sibling link */
69814 + ret = renew_neighbor(&coord, child, h + base_level, flags);
69815 +
69816 + zrelse(parent);
69817 +
69818 + switch (ret) {
69819 + case 0:
69820 + /* unlocking of parent znode prevents simple
69821 + deadlock situation */
69822 + done_lh(&path[h]);
69823 +
69824 + /* depend on tree level we stay on we repeat first
69825 + locking attempt ... */
69826 + if (h == 0)
69827 + goto again;
69828 +
69829 + /* ... or repeat establishing of sibling link at
69830 + one level below. */
69831 + --h;
69832 + break;
69833 +
69834 + case -ENOENT:
69835 + /* sibling link is not available -- we go
69836 + upward. */
69837 + init_lh(&path[h + 1]);
69838 + ret =
69839 + reiser4_get_parent(&path[h + 1], parent,
69840 + ZNODE_READ_LOCK);
69841 + if (ret)
69842 + goto fail;
69843 + ++h;
69844 + if (znode_above_root(path[h].node)) {
69845 + ret = RETERR(-E_NO_NEIGHBOR);
69846 + goto fail;
69847 + }
69848 + break;
69849 +
69850 + case -E_DEADLOCK:
69851 + /* there was lock request from hi-pri locker. if
69852 + it is possible we unlock last parent node and
69853 + re-lock it again. */
69854 + for (; reiser4_check_deadlock(); h--) {
69855 + done_lh(&path[h]);
69856 + if (h == 0)
69857 + goto fail;
69858 + }
69859 +
69860 + break;
69861 +
69862 + default: /* other errors. */
69863 + goto fail;
69864 + }
69865 + }
69866 + fail:
69867 + ON_DEBUG(check_lock_node_data(node));
69868 + ON_DEBUG(check_lock_data());
69869 +
69870 + /* unlock path */
69871 + do {
69872 + /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
69873 + fail; path[0] is already done_lh-ed, therefore
69874 + longterm_unlock_znode(&path[h]); is not applicable */
69875 + done_lh(&path[h]);
69876 + --h;
69877 + } while (h + 1 != 0);
69878 +
69879 + return ret;
69880 +}
69881 +
69882 +/* remove node from sibling list */
69883 +/* Audited by: umka (2002.06.14) */
69884 +void sibling_list_remove(znode * node)
69885 +{
69886 + reiser4_tree *tree;
69887 +
69888 + tree = znode_get_tree(node);
69889 + assert("umka-255", node != NULL);
69890 + assert_rw_write_locked(&(tree->tree_lock));
69891 + assert("nikita-3275", check_sibling_list(node));
69892 +
69893 + write_lock_dk(tree);
69894 + if (znode_is_right_connected(node) && node->right != NULL &&
69895 + znode_is_left_connected(node) && node->left != NULL) {
69896 + assert("zam-32245",
69897 + keyeq(znode_get_rd_key(node),
69898 + znode_get_ld_key(node->right)));
69899 + znode_set_rd_key(node->left, znode_get_ld_key(node->right));
69900 + }
69901 + write_unlock_dk(tree);
69902 +
69903 + if (znode_is_right_connected(node) && node->right != NULL) {
69904 + assert("zam-322", znode_is_left_connected(node->right));
69905 + node->right->left = node->left;
69906 + ON_DEBUG(node->right->left_version =
69907 + atomic_inc_return(&delim_key_version);
69908 + );
69909 + }
69910 + if (znode_is_left_connected(node) && node->left != NULL) {
69911 + assert("zam-323", znode_is_right_connected(node->left));
69912 + node->left->right = node->right;
69913 + ON_DEBUG(node->left->right_version =
69914 + atomic_inc_return(&delim_key_version);
69915 + );
69916 + }
69917 +
69918 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
69919 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
69920 + ON_DEBUG(node->left = node->right = NULL;
69921 + node->left_version = atomic_inc_return(&delim_key_version);
69922 + node->right_version = atomic_inc_return(&delim_key_version););
69923 + assert("nikita-3276", check_sibling_list(node));
69924 +}
69925 +
69926 +/* disconnect node from sibling list */
69927 +void sibling_list_drop(znode * node)
69928 +{
69929 + znode *right;
69930 + znode *left;
69931 +
69932 + assert("nikita-2464", node != NULL);
69933 + assert("nikita-3277", check_sibling_list(node));
69934 +
69935 + right = node->right;
69936 + if (right != NULL) {
69937 + assert("nikita-2465", znode_is_left_connected(right));
69938 + right->left = NULL;
69939 + ON_DEBUG(right->left_version =
69940 + atomic_inc_return(&delim_key_version);
69941 + );
69942 + }
69943 + left = node->left;
69944 + if (left != NULL) {
69945 + assert("zam-323", znode_is_right_connected(left));
69946 + left->right = NULL;
69947 + ON_DEBUG(left->right_version =
69948 + atomic_inc_return(&delim_key_version);
69949 + );
69950 + }
69951 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
69952 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
69953 + ON_DEBUG(node->left = node->right = NULL;
69954 + node->left_version = atomic_inc_return(&delim_key_version);
69955 + node->right_version = atomic_inc_return(&delim_key_version););
69956 +}
69957 +
69958 +/* Insert new node into sibling list. Regular balancing inserts new node
69959 + after (at right side) existing and locked node (@before), except one case
69960 + of adding new tree root node. @before should be NULL in that case. */
69961 +void sibling_list_insert_nolock(znode * new, znode * before)
69962 +{
69963 + assert("zam-334", new != NULL);
69964 + assert("nikita-3298", !znode_is_left_connected(new));
69965 + assert("nikita-3299", !znode_is_right_connected(new));
69966 + assert("nikita-3300", new->left == NULL);
69967 + assert("nikita-3301", new->right == NULL);
69968 + assert("nikita-3278", check_sibling_list(new));
69969 + assert("nikita-3279", check_sibling_list(before));
69970 +
69971 + if (before != NULL) {
69972 + assert("zam-333", znode_is_connected(before));
69973 + new->right = before->right;
69974 + new->left = before;
69975 + ON_DEBUG(new->right_version =
69976 + atomic_inc_return(&delim_key_version);
69977 + new->left_version =
69978 + atomic_inc_return(&delim_key_version););
69979 + if (before->right != NULL) {
69980 + before->right->left = new;
69981 + ON_DEBUG(before->right->left_version =
69982 + atomic_inc_return(&delim_key_version);
69983 + );
69984 + }
69985 + before->right = new;
69986 + ON_DEBUG(before->right_version =
69987 + atomic_inc_return(&delim_key_version);
69988 + );
69989 + } else {
69990 + new->right = NULL;
69991 + new->left = NULL;
69992 + ON_DEBUG(new->right_version =
69993 + atomic_inc_return(&delim_key_version);
69994 + new->left_version =
69995 + atomic_inc_return(&delim_key_version););
69996 + }
69997 + ZF_SET(new, JNODE_LEFT_CONNECTED);
69998 + ZF_SET(new, JNODE_RIGHT_CONNECTED);
69999 + assert("nikita-3280", check_sibling_list(new));
70000 + assert("nikita-3281", check_sibling_list(before));
70001 +}
70002 +
70003 +/*
70004 + Local variables:
70005 + c-indentation-style: "K&R"
70006 + mode-name: "LC"
70007 + c-basic-offset: 8
70008 + tab-width: 8
70009 + fill-column: 80
70010 + End:
70011 +*/
70012 diff -urN linux-2.6.23.orig/fs/reiser4/tree_walk.h linux-2.6.23/fs/reiser4/tree_walk.h
70013 --- linux-2.6.23.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
70014 +++ linux-2.6.23/fs/reiser4/tree_walk.h 2007-12-04 16:49:30.000000000 +0300
70015 @@ -0,0 +1,125 @@
70016 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
70017 +
70018 +/* definitions of reiser4 tree walk functions */
70019 +
70020 +#ifndef __FS_REISER4_TREE_WALK_H__
70021 +#define __FS_REISER4_TREE_WALK_H__
70022 +
70023 +#include "debug.h"
70024 +#include "forward.h"
70025 +
70026 +/* establishes horizontal links between cached znodes */
70027 +int connect_znode(coord_t * coord, znode * node);
70028 +
70029 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
70030 + have the following common arguments:
70031 +
70032 + return codes:
70033 +
70034 + @return : 0 - OK,
70035 +
70036 +ZAM-FIXME-HANS: wrong return code name. Change them all.
70037 + -ENOENT - neighbor is not in cache, what is detected by sibling
70038 + link absence.
70039 +
70040 + -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
70041 + found (because we are left-/right- most node of the
70042 + tree, for example). Also, this return code is for
70043 + reiser4_get_parent() when we see no parent link -- it
70044 + means that our node is root node.
70045 +
70046 + -E_DEADLOCK - deadlock detected (request from high-priority process
70047 + received), other error codes are conformed to
70048 + /usr/include/asm/errno.h .
70049 +*/
70050 +
70051 +int
70052 +reiser4_get_parent_flags(lock_handle * result, znode * node,
70053 + znode_lock_mode mode, int flags);
70054 +
70055 +/* bits definition for reiser4_get_neighbor function `flags' arg. */
70056 +typedef enum {
70057 + /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
70058 + * find not allocated not connected neigbor by going though upper
70059 + * levels */
70060 + GN_CAN_USE_UPPER_LEVELS = 0x1,
70061 + /* locking left neighbor instead of right one */
70062 + GN_GO_LEFT = 0x2,
70063 + /* automatically load neighbor node content */
70064 + GN_LOAD_NEIGHBOR = 0x4,
70065 + /* return -E_REPEAT if can't lock */
70066 + GN_TRY_LOCK = 0x8,
70067 + /* used internally in tree_walk.c, causes renew_sibling to not
70068 + allocate neighbor znode, but only search for it in znode cache */
70069 + GN_NO_ALLOC = 0x10,
70070 + /* do not go across atom boundaries */
70071 + GN_SAME_ATOM = 0x20,
70072 + /* allow to lock not connected nodes */
70073 + GN_ALLOW_NOT_CONNECTED = 0x40,
70074 + /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
70075 + GN_ASYNC = 0x80
70076 +} znode_get_neigbor_flags;
70077 +
70078 +/* A commonly used wrapper for reiser4_get_parent_flags(). */
70079 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
70080 + znode_lock_mode mode)
70081 +{
70082 + return reiser4_get_parent_flags(result, node, mode,
70083 + GN_ALLOW_NOT_CONNECTED);
70084 +}
70085 +
70086 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70087 + znode_lock_mode lock_mode, int flags);
70088 +
70089 +/* there are wrappers for most common usages of reiser4_get_neighbor() */
70090 +static inline int
70091 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
70092 + int flags)
70093 +{
70094 + return reiser4_get_neighbor(result, node, lock_mode,
70095 + flags | GN_GO_LEFT);
70096 +}
70097 +
70098 +static inline int
70099 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
70100 + int flags)
70101 +{
70102 + ON_DEBUG(check_lock_node_data(node));
70103 + ON_DEBUG(check_lock_data());
70104 + return reiser4_get_neighbor(result, node, lock_mode,
70105 + flags & (~GN_GO_LEFT));
70106 +}
70107 +
70108 +extern void sibling_list_remove(znode * node);
70109 +extern void sibling_list_drop(znode * node);
70110 +extern void sibling_list_insert_nolock(znode * new, znode * before);
70111 +extern void link_left_and_right(znode * left, znode * right);
70112 +
70113 +/* Functions called by tree_walk() when tree_walk() ... */
70114 +struct tree_walk_actor {
70115 + /* ... meets a formatted node, */
70116 + int (*process_znode) (tap_t *, void *);
70117 + /* ... meets an extent, */
70118 + int (*process_extent) (tap_t *, void *);
70119 + /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
70120 + * node or extent processing functions. */
70121 + int (*before) (void *);
70122 +};
70123 +
70124 +#if REISER4_DEBUG
70125 +int check_sibling_list(znode * node);
70126 +#else
70127 +#define check_sibling_list(n) (1)
70128 +#endif
70129 +
70130 +#endif /* __FS_REISER4_TREE_WALK_H__ */
70131 +
70132 +/*
70133 + Local variables:
70134 + c-indentation-style: "K&R"
70135 + mode-name: "LC"
70136 + c-basic-offset: 8
70137 + tab-width: 8
70138 + fill-column: 120
70139 + End:
70140 +*/
70141 diff -urN linux-2.6.23.orig/fs/reiser4/txnmgr.c linux-2.6.23/fs/reiser4/txnmgr.c
70142 --- linux-2.6.23.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
70143 +++ linux-2.6.23/fs/reiser4/txnmgr.c 2007-12-04 16:49:30.000000000 +0300
70144 @@ -0,0 +1,3164 @@
70145 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70146 + * reiser4/README */
70147 +
70148 +/* Joshua MacDonald wrote the first draft of this code. */
70149 +
70150 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
70151 +filesystem scales only as well as its worst locking design. You need to
70152 +substantially restructure this code. Josh was not as experienced a programmer
70153 +as you. Particularly review how the locking style differs from what you did
70154 +for znodes usingt hi-lo priority locking, and present to me an opinion on
70155 +whether the differences are well founded. */
70156 +
70157 +/* I cannot help but to disagree with the sentiment above. Locking of
70158 + * transaction manager is _not_ badly designed, and, at the very least, is not
70159 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
70160 + * locking on znodes, especially on the root node of the tree. --nikita,
70161 + * 2003.10.13 */
70162 +
70163 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
70164 + txnmgr processes capture_block requests and manages the relationship between jnodes and
70165 + atoms through the various stages of a transcrash, and it also oversees the fusion and
70166 + capture-on-copy processes. The main difficulty with this task is maintaining a
70167 + deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
70168 + difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
70169 + must be broken. The main requirement is that atom-fusion be deadlock free, so once you
70170 + hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
70171 + that any time you check the atom-pointer of a jnode or handle and then try to lock that
70172 + atom, you must use trylock() and possibly reverse the order.
70173 +
70174 + This code implements the design documented at:
70175 +
70176 + http://namesys.com/txn-doc.html
70177 +
70178 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
70179 +above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
70180 +topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
70181 +year old --- define all technical terms used.
70182 +
70183 +*/
70184 +
70185 +/* Thoughts on the external transaction interface:
70186 +
70187 + In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
70188 + creates state that lasts for the duration of a system call and is called at the start
70189 + of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
70190 + occupying the scope of a single system call. We wish to give certain applications an
70191 + interface to begin and close (commit) transactions. Since our implementation of
70192 + transactions does not yet support isolation, allowing an application to open a
70193 + transaction implies trusting it to later close the transaction. Part of the
70194 + transaction interface will be aimed at enabling that trust, but the interface for
70195 + actually using transactions is fairly narrow.
70196 +
70197 + BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
70198 + this identifier into a string that a shell-script could use, allowing you to start a
70199 + transaction by issuing a command. Once open, the transcrash should be set in the task
70200 + structure, and there should be options (I suppose) to allow it to be carried across
70201 + fork/exec. A transcrash has several options:
70202 +
70203 + - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
70204 + on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
70205 + capture on reads as well, it should set READ_FUSING.
70206 +
70207 + - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
70208 + eventually close (or else the machine must crash). If the application dies an
70209 + unexpected death with an open transcrash, for example, or if it hangs for a long
70210 + duration, one solution (to avoid crashing the machine) is to simply close it anyway.
70211 + This is a dangerous option, but it is one way to solve the problem until isolated
70212 + transcrashes are available for untrusted applications.
70213 +
70214 + It seems to be what databases do, though it is unclear how one avoids a DoS attack
70215 + creating a vulnerability based on resource starvation. Guaranteeing that some
70216 + minimum amount of computational resources are made available would seem more correct
70217 + than guaranteeing some amount of time. When we again have someone to code the work,
70218 + this issue should be considered carefully. -Hans
70219 +
70220 + RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
70221 + many dirty blocks it expects. The reserve_blocks interface should be called at a point
70222 + where it is safe for the application to fail, because the system may not be able to
70223 + grant the allocation and the application must be able to back-out. For this reason,
70224 + the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
70225 + the application may also wish to extend the allocation after beginning its transcrash.
70226 +
70227 + CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
70228 + modifications that require transaction protection. When isolated transactions are
70229 + supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
70230 + RESERVE_BLOCKS call fails for the application, it should "abort" by calling
70231 + CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
70232 + why, for safety, the application should call RESERVE_BLOCKS before making any changes).
70233 +
70234 + For actually implementing these out-of-system-call-scopped transcrashes, the
70235 + reiser4_context has a "txn_handle *trans" pointer that may be set to an open
70236 + transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
70237 + "struct kmem_cache *_txnh_slab" created for that purpose in this file.
70238 +*/
70239 +
70240 +/* Extending the other system call interfaces for future transaction features:
70241 +
70242 + Specialized applications may benefit from passing flags to the ordinary system call
70243 + interface such as read(), write(), or stat(). For example, the application specifies
70244 + WRITE_FUSING by default but wishes to add that a certain read() command should be
70245 + treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
70246 + read, or the file-data read? These issues are straight-forward, but there are a lot of
70247 + them and adding the necessary flags-passing code will be tedious.
70248 +
70249 + When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
70250 + flag, which specifies that although it is a read operation being requested, a
70251 + write-lock should be taken. The reason is that read-locks are shared while write-locks
70252 + are exclusive, so taking a read-lock when a later-write is known in advance will often
70253 + leads to deadlock. If a reader knows it will write later, it should issue read
70254 + requests with the RMW flag set.
70255 +*/
70256 +
70257 +/*
70258 + The znode/atom deadlock avoidance.
70259 +
70260 + FIXME(Zam): writing of this comment is in progress.
70261 +
70262 + The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's
70263 + long-term locking, which makes reiser4 locking scheme more complex. It had
70264 + deadlocks until we implement deadlock avoidance algorithms. That deadlocks
70265 + looked as the following: one stopped thread waits for a long-term lock on
70266 + znode, the thread who owns that lock waits when fusion with another atom will
70267 + be allowed.
70268 +
70269 + The source of the deadlocks is an optimization of not capturing index nodes
70270 + for read. Let's prove it. Suppose we have dumb node capturing scheme which
70271 + unconditionally captures each block before locking it.
70272 +
70273 + That scheme has no deadlocks. Let's begin with the thread which stage is
70274 + ASTAGE_CAPTURE_WAIT and it waits for a znode lock. The thread can't wait for
70275 + a capture because it's stage allows fusion with any atom except which are
70276 + being committed currently. A process of atom commit can't deadlock because
70277 + atom commit procedure does not acquire locks and does not fuse with other
70278 + atoms. Reiser4 does capturing right before going to sleep inside the
70279 + longtertm_lock_znode() function, it means the znode which we want to lock is
70280 + already captured and its atom is in ASTAGE_CAPTURE_WAIT stage. If we
70281 + continue the analysis we understand that no one process in the sequence may
70282 + waits atom fusion. Thereby there are no deadlocks of described kind.
70283 +
70284 + The capturing optimization makes the deadlocks possible. A thread can wait a
70285 + lock which owner did not captured that node. The lock owner's current atom
70286 + is not fused with the first atom and it does not get a ASTAGE_CAPTURE_WAIT
70287 + state. A deadlock is possible when that atom meets another one which is in
70288 + ASTAGE_CAPTURE_WAIT already.
70289 +
70290 + The deadlock avoidance scheme includes two algorithms:
70291 +
70292 + First algorithm is used when a thread captures a node which is locked but not
70293 + captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at the
70294 + moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE) is
70295 + being captured by a thread with current atom is in ASTAGE_CAPTURE_WAIT, the
70296 + routine which forces all lock owners to join with current atom is executed.
70297 +
70298 + Second algorithm does not allow to skip capturing of already captured nodes.
70299 +
70300 + Both algorithms together prevent waiting a longterm lock without atom fusion
70301 + with atoms of all lock owners, which is a key thing for getting atom/znode
70302 + locking deadlocks.
70303 +*/
70304 +
70305 +/*
70306 + * Transactions and mmap(2).
70307 + *
70308 + * 1. Transactions are not supported for accesses through mmap(2), because
70309 + * this would effectively amount to user-level transactions whose duration
70310 + * is beyond control of the kernel.
70311 + *
70312 + * 2. That said, we still want to preserve some decency with regard to
70313 + * mmap(2). During normal write(2) call, following sequence of events
70314 + * happens:
70315 + *
70316 + * 1. page is created;
70317 + *
70318 + * 2. jnode is created, dirtied and captured into current atom.
70319 + *
70320 + * 3. extent is inserted and modified.
70321 + *
70322 + * Steps (2) and (3) take place under long term lock on the twig node.
70323 + *
70324 + * When file is accessed through mmap(2) page is always created during
70325 + * page fault.
70326 + * After this (in reiser4_readpage()->reiser4_readpage_extent()):
70327 + *
70328 + * 1. if access is made to non-hole page new jnode is created, (if
70329 + * necessary)
70330 + *
70331 + * 2. if access is made to the hole page, jnode is not created (XXX
70332 + * not clear why).
70333 + *
70334 + * Also, even if page is created by write page fault it is not marked
70335 + * dirty immediately by handle_mm_fault(). Probably this is to avoid races
70336 + * with page write-out.
70337 + *
70338 + * Dirty bit installed by hardware is only transferred to the struct page
70339 + * later, when page is unmapped (in zap_pte_range(), or
70340 + * try_to_unmap_one()).
70341 + *
70342 + * So, with mmap(2) we have to handle following irksome situations:
70343 + *
70344 + * 1. there exists modified page (clean or dirty) without jnode
70345 + *
70346 + * 2. there exists modified page (clean or dirty) with clean jnode
70347 + *
70348 + * 3. clean page which is a part of atom can be transparently modified
70349 + * at any moment through mapping without becoming dirty.
70350 + *
70351 + * (1) and (2) can lead to the out-of-memory situation: ->writepage()
70352 + * doesn't know what to do with such pages and ->sync_sb()/->writepages()
70353 + * don't see them, because these methods operate on atoms.
70354 + *
70355 + * (3) can lead to the loss of data: suppose we have dirty page with dirty
70356 + * captured jnode captured by some atom. As part of early flush (for
70357 + * example) page was written out. Dirty bit was cleared on both page and
70358 + * jnode. After this page is modified through mapping, but kernel doesn't
70359 + * notice and just discards page and jnode as part of commit. (XXX
70360 + * actually it doesn't, because to reclaim page ->releasepage() has to be
70361 + * called and before this dirty bit will be transferred to the struct
70362 + * page).
70363 + *
70364 + */
70365 +
70366 +#include "debug.h"
70367 +#include "txnmgr.h"
70368 +#include "jnode.h"
70369 +#include "znode.h"
70370 +#include "block_alloc.h"
70371 +#include "tree.h"
70372 +#include "wander.h"
70373 +#include "ktxnmgrd.h"
70374 +#include "super.h"
70375 +#include "page_cache.h"
70376 +#include "reiser4.h"
70377 +#include "vfs_ops.h"
70378 +#include "inode.h"
70379 +#include "flush.h"
70380 +
70381 +#include <asm/atomic.h>
70382 +#include <linux/types.h>
70383 +#include <linux/fs.h>
70384 +#include <linux/mm.h>
70385 +#include <linux/slab.h>
70386 +#include <linux/pagemap.h>
70387 +#include <linux/writeback.h>
70388 +#include <linux/swap.h> /* for totalram_pages */
70389 +
70390 +static void atom_free(txn_atom * atom);
70391 +
70392 +static int commit_txnh(txn_handle * txnh);
70393 +
70394 +static void wakeup_atom_waitfor_list(txn_atom * atom);
70395 +static void wakeup_atom_waiting_list(txn_atom * atom);
70396 +
70397 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
70398 +
70399 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
70400 +
70401 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
70402 +
70403 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
70404 + txn_capture mode);
70405 +
70406 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
70407 +
70408 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
70409 +
70410 +void reiser4_invalidate_list(struct list_head *);
70411 +
70412 +/* GENERIC STRUCTURES */
70413 +
70414 +typedef struct _txn_wait_links txn_wait_links;
70415 +
70416 +struct _txn_wait_links {
70417 + lock_stack *_lock_stack;
70418 + struct list_head _fwaitfor_link;
70419 + struct list_head _fwaiting_link;
70420 + int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70421 + int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70422 +};
70423 +
70424 +/* FIXME: In theory, we should be using the slab cache init & destructor
70425 + methods instead of, e.g., jnode_init, etc. */
70426 +static struct kmem_cache *_atom_slab = NULL;
70427 +/* this is for user-visible, cross system-call transactions. */
70428 +static struct kmem_cache *_txnh_slab = NULL;
70429 +
70430 +/**
70431 + * init_txnmgr_static - create transaction manager slab caches
70432 + *
70433 + * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
70434 + * initialization.
70435 + */
70436 +int init_txnmgr_static(void)
70437 +{
70438 + assert("jmacd-600", _atom_slab == NULL);
70439 + assert("jmacd-601", _txnh_slab == NULL);
70440 +
70441 + ON_DEBUG(atomic_set(&flush_cnt, 0));
70442 +
70443 + _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
70444 + SLAB_HWCACHE_ALIGN |
70445 + SLAB_RECLAIM_ACCOUNT, NULL);
70446 + if (_atom_slab == NULL)
70447 + return RETERR(-ENOMEM);
70448 +
70449 + _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
70450 + SLAB_HWCACHE_ALIGN, NULL);
70451 + if (_txnh_slab == NULL) {
70452 + kmem_cache_destroy(_atom_slab);
70453 + _atom_slab = NULL;
70454 + return RETERR(-ENOMEM);
70455 + }
70456 +
70457 + return 0;
70458 +}
70459 +
70460 +/**
70461 + * done_txnmgr_static - delete txn_atom and txn_handle caches
70462 + *
70463 + * This is called on reiser4 module unloading or system shutdown.
70464 + */
70465 +void done_txnmgr_static(void)
70466 +{
70467 + destroy_reiser4_cache(&_atom_slab);
70468 + destroy_reiser4_cache(&_txnh_slab);
70469 +}
70470 +
70471 +/**
70472 + * init_txnmgr - initialize a new transaction manager
70473 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70474 + *
70475 + * This is called on mount. Makes necessary initializations.
70476 + */
70477 +void reiser4_init_txnmgr(txn_mgr *mgr)
70478 +{
70479 + assert("umka-169", mgr != NULL);
70480 +
70481 + mgr->atom_count = 0;
70482 + mgr->id_count = 1;
70483 + INIT_LIST_HEAD(&mgr->atoms_list);
70484 + spin_lock_init(&mgr->tmgr_lock);
70485 + mutex_init(&mgr->commit_mutex);
70486 +}
70487 +
70488 +/**
70489 + * reiser4_done_txnmgr - stop transaction manager
70490 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70491 + *
70492 + * This is called on umount. Does sanity checks.
70493 + */
70494 +void reiser4_done_txnmgr(txn_mgr *mgr)
70495 +{
70496 + assert("umka-170", mgr != NULL);
70497 + assert("umka-1701", list_empty_careful(&mgr->atoms_list));
70498 + assert("umka-1702", mgr->atom_count == 0);
70499 +}
70500 +
70501 +/* Initialize a transaction handle. */
70502 +/* Audited by: umka (2002.06.13) */
70503 +static void txnh_init(txn_handle * txnh, txn_mode mode)
70504 +{
70505 + assert("umka-171", txnh != NULL);
70506 +
70507 + txnh->mode = mode;
70508 + txnh->atom = NULL;
70509 + reiser4_ctx_gfp_mask_set();
70510 + txnh->flags = 0;
70511 + spin_lock_init(&txnh->hlock);
70512 + INIT_LIST_HEAD(&txnh->txnh_link);
70513 +}
70514 +
70515 +#if REISER4_DEBUG
70516 +/* Check if a transaction handle is clean. */
70517 +static int txnh_isclean(txn_handle * txnh)
70518 +{
70519 + assert("umka-172", txnh != NULL);
70520 + return txnh->atom == NULL &&
70521 + LOCK_CNT_NIL(spin_locked_txnh);
70522 +}
70523 +#endif
70524 +
70525 +/* Initialize an atom. */
70526 +static void atom_init(txn_atom * atom)
70527 +{
70528 + int level;
70529 +
70530 + assert("umka-173", atom != NULL);
70531 +
70532 + memset(atom, 0, sizeof(txn_atom));
70533 +
70534 + atom->stage = ASTAGE_FREE;
70535 + atom->start_time = jiffies;
70536 +
70537 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
70538 + INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
70539 +
70540 + INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
70541 + INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
70542 + INIT_LIST_HEAD(ATOM_WB_LIST(atom));
70543 + INIT_LIST_HEAD(&atom->inodes);
70544 + spin_lock_init(&(atom->alock));
70545 + /* list of transaction handles */
70546 + INIT_LIST_HEAD(&atom->txnh_list);
70547 + /* link to transaction manager's list of atoms */
70548 + INIT_LIST_HEAD(&atom->atom_link);
70549 + INIT_LIST_HEAD(&atom->fwaitfor_list);
70550 + INIT_LIST_HEAD(&atom->fwaiting_list);
70551 + blocknr_set_init(&atom->delete_set);
70552 + blocknr_set_init(&atom->wandered_map);
70553 +
70554 + init_atom_fq_parts(atom);
70555 +}
70556 +
70557 +#if REISER4_DEBUG
70558 +/* Check if an atom is clean. */
70559 +static int atom_isclean(txn_atom * atom)
70560 +{
70561 + int level;
70562 +
70563 + assert("umka-174", atom != NULL);
70564 +
70565 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
70566 + if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
70567 + return 0;
70568 + }
70569 + }
70570 +
70571 + return atom->stage == ASTAGE_FREE &&
70572 + atom->txnh_count == 0 &&
70573 + atom->capture_count == 0 &&
70574 + atomic_read(&atom->refcount) == 0 &&
70575 + (&atom->atom_link == atom->atom_link.next &&
70576 + &atom->atom_link == atom->atom_link.prev) &&
70577 + list_empty_careful(&atom->txnh_list) &&
70578 + list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
70579 + list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
70580 + list_empty_careful(ATOM_WB_LIST(atom)) &&
70581 + list_empty_careful(&atom->fwaitfor_list) &&
70582 + list_empty_careful(&atom->fwaiting_list) &&
70583 + atom_fq_parts_are_clean(atom);
70584 +}
70585 +#endif
70586 +
70587 +/* Begin a transaction in this context. Currently this uses the reiser4_context's
70588 + trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
70589 + this will be extended to allow transaction handles to span several contexts. */
70590 +/* Audited by: umka (2002.06.13) */
70591 +void reiser4_txn_begin(reiser4_context * context)
70592 +{
70593 + assert("jmacd-544", context->trans == NULL);
70594 +
70595 + context->trans = &context->trans_in_ctx;
70596 +
70597 + /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
70598 + transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
70599 + stack allocated right now, but we would like to allow for dynamically allocated
70600 + transcrashes that span multiple system calls.
70601 + */
70602 + txnh_init(context->trans, TXN_WRITE_FUSING);
70603 +}
70604 +
70605 +/* Finish a transaction handle context. */
70606 +int reiser4_txn_end(reiser4_context * context)
70607 +{
70608 + long ret = 0;
70609 + txn_handle *txnh;
70610 +
70611 + assert("umka-283", context != NULL);
70612 + assert("nikita-3012", reiser4_schedulable());
70613 + assert("vs-24", context == get_current_context());
70614 + assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
70615 +
70616 + txnh = context->trans;
70617 + if (txnh != NULL) {
70618 + if (txnh->atom != NULL)
70619 + ret = commit_txnh(txnh);
70620 + assert("jmacd-633", txnh_isclean(txnh));
70621 + context->trans = NULL;
70622 + }
70623 + return ret;
70624 +}
70625 +
70626 +void reiser4_txn_restart(reiser4_context * context)
70627 +{
70628 + reiser4_txn_end(context);
70629 + reiser4_preempt_point();
70630 + reiser4_txn_begin(context);
70631 +}
70632 +
70633 +void reiser4_txn_restart_current(void)
70634 +{
70635 + reiser4_txn_restart(get_current_context());
70636 +}
70637 +
70638 +/* TXN_ATOM */
70639 +
70640 +/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom
70641 + is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May
70642 + return NULL. */
70643 +static txn_atom *txnh_get_atom(txn_handle * txnh)
70644 +{
70645 + txn_atom *atom;
70646 +
70647 + assert("umka-180", txnh != NULL);
70648 + assert_spin_not_locked(&(txnh->hlock));
70649 +
70650 + while (1) {
70651 + spin_lock_txnh(txnh);
70652 + atom = txnh->atom;
70653 +
70654 + if (atom == NULL)
70655 + break;
70656 +
70657 + if (spin_trylock_atom(atom))
70658 + break;
70659 +
70660 + atomic_inc(&atom->refcount);
70661 +
70662 + spin_unlock_txnh(txnh);
70663 + spin_lock_atom(atom);
70664 + spin_lock_txnh(txnh);
70665 +
70666 + if (txnh->atom == atom) {
70667 + atomic_dec(&atom->refcount);
70668 + break;
70669 + }
70670 +
70671 + spin_unlock_txnh(txnh);
70672 + atom_dec_and_unlock(atom);
70673 + }
70674 +
70675 + return atom;
70676 +}
70677 +
70678 +/* Get the current atom and spinlock it if current atom present. May return NULL */
70679 +txn_atom *get_current_atom_locked_nocheck(void)
70680 +{
70681 + reiser4_context *cx;
70682 + txn_atom *atom;
70683 + txn_handle *txnh;
70684 +
70685 + cx = get_current_context();
70686 + assert("zam-437", cx != NULL);
70687 +
70688 + txnh = cx->trans;
70689 + assert("zam-435", txnh != NULL);
70690 +
70691 + atom = txnh_get_atom(txnh);
70692 +
70693 + spin_unlock_txnh(txnh);
70694 + return atom;
70695 +}
70696 +
70697 +/* Get the atom belonging to a jnode, which is initially locked. Return with
70698 + both jnode and atom locked. This performs the necessary spin_trylock to
70699 + break the lock-ordering cycle. Assumes the jnode is already locked, and
70700 + returns NULL if atom is not set. */
70701 +txn_atom *jnode_get_atom(jnode * node)
70702 +{
70703 + txn_atom *atom;
70704 +
70705 + assert("umka-181", node != NULL);
70706 +
70707 + while (1) {
70708 + assert_spin_locked(&(node->guard));
70709 +
70710 + atom = node->atom;
70711 + /* node is not in any atom */
70712 + if (atom == NULL)
70713 + break;
70714 +
70715 + /* If atom is not locked, grab the lock and return */
70716 + if (spin_trylock_atom(atom))
70717 + break;
70718 +
70719 + /* At least one jnode belongs to this atom it guarantees that
70720 + * atom->refcount > 0, we can safely increment refcount. */
70721 + atomic_inc(&atom->refcount);
70722 + spin_unlock_jnode(node);
70723 +
70724 + /* re-acquire spin locks in the right order */
70725 + spin_lock_atom(atom);
70726 + spin_lock_jnode(node);
70727 +
70728 + /* check if node still points to the same atom. */
70729 + if (node->atom == atom) {
70730 + atomic_dec(&atom->refcount);
70731 + break;
70732 + }
70733 +
70734 + /* releasing of atom lock and reference requires not holding
70735 + * locks on jnodes. */
70736 + spin_unlock_jnode(node);
70737 +
70738 + /* We do not sure that this atom has extra references except our
70739 + * one, so we should call proper function which may free atom if
70740 + * last reference is released. */
70741 + atom_dec_and_unlock(atom);
70742 +
70743 + /* lock jnode again for getting valid node->atom pointer
70744 + * value. */
70745 + spin_lock_jnode(node);
70746 + }
70747 +
70748 + return atom;
70749 +}
70750 +
70751 +/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
70752 + by flush code to indicate whether the next node (in some direction) is suitable for
70753 + flushing. */
70754 +int
70755 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
70756 +{
70757 + int compat;
70758 + txn_atom *atom;
70759 +
70760 + assert("umka-182", node != NULL);
70761 + assert("umka-183", check != NULL);
70762 +
70763 + /* Not sure what this function is supposed to do if supplied with @check that is
70764 + neither formatted nor unformatted (bitmap or so). */
70765 + assert("nikita-2373", jnode_is_znode(check)
70766 + || jnode_is_unformatted(check));
70767 +
70768 + /* Need a lock on CHECK to get its atom and to check various state bits.
70769 + Don't need a lock on NODE once we get the atom lock. */
70770 + /* It is not enough to lock two nodes and check (node->atom ==
70771 + check->atom) because atom could be locked and being fused at that
70772 + moment, jnodes of the atom of that state (being fused) can point to
70773 + different objects, but the atom is the same. */
70774 + spin_lock_jnode(check);
70775 +
70776 + atom = jnode_get_atom(check);
70777 +
70778 + if (atom == NULL) {
70779 + compat = 0;
70780 + } else {
70781 + compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
70782 +
70783 + if (compat && jnode_is_znode(check)) {
70784 + compat &= znode_is_connected(JZNODE(check));
70785 + }
70786 +
70787 + if (compat && alloc_check) {
70788 + compat &= (alloc_value == jnode_is_flushprepped(check));
70789 + }
70790 +
70791 + spin_unlock_atom(atom);
70792 + }
70793 +
70794 + spin_unlock_jnode(check);
70795 +
70796 + return compat;
70797 +}
70798 +
70799 +/* Decrement the atom's reference count and if it falls to zero, free it. */
70800 +void atom_dec_and_unlock(txn_atom * atom)
70801 +{
70802 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70803 +
70804 + assert("umka-186", atom != NULL);
70805 + assert_spin_locked(&(atom->alock));
70806 + assert("zam-1039", atomic_read(&atom->refcount) > 0);
70807 +
70808 + if (atomic_dec_and_test(&atom->refcount)) {
70809 + /* take txnmgr lock and atom lock in proper order. */
70810 + if (!spin_trylock_txnmgr(mgr)) {
70811 + /* This atom should exist after we re-acquire its
70812 + * spinlock, so we increment its reference counter. */
70813 + atomic_inc(&atom->refcount);
70814 + spin_unlock_atom(atom);
70815 + spin_lock_txnmgr(mgr);
70816 + spin_lock_atom(atom);
70817 +
70818 + if (!atomic_dec_and_test(&atom->refcount)) {
70819 + spin_unlock_atom(atom);
70820 + spin_unlock_txnmgr(mgr);
70821 + return;
70822 + }
70823 + }
70824 + assert_spin_locked(&(mgr->tmgr_lock));
70825 + atom_free(atom);
70826 + spin_unlock_txnmgr(mgr);
70827 + } else
70828 + spin_unlock_atom(atom);
70829 +}
70830 +
70831 +/* Create new atom and connect it to given transaction handle. This adds the
70832 + atom to the transaction manager's list and sets its reference count to 1, an
70833 + artificial reference which is kept until it commits. We play strange games
70834 + to avoid allocation under jnode & txnh spinlocks.*/
70835 +
70836 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
70837 +{
70838 + txn_atom *atom;
70839 + txn_mgr *mgr;
70840 +
70841 + if (REISER4_DEBUG && rofs_tree(current_tree)) {
70842 + warning("nikita-3366", "Creating atom on rofs");
70843 + dump_stack();
70844 + }
70845 +
70846 + if (*atom_alloc == NULL) {
70847 + (*atom_alloc) = kmem_cache_alloc(_atom_slab,
70848 + reiser4_ctx_gfp_mask_get());
70849 +
70850 + if (*atom_alloc == NULL)
70851 + return RETERR(-ENOMEM);
70852 + }
70853 +
70854 + /* and, also, txnmgr spin lock should be taken before jnode and txnh
70855 + locks. */
70856 + mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70857 + spin_lock_txnmgr(mgr);
70858 + spin_lock_txnh(txnh);
70859 +
70860 + /* Check whether new atom still needed */
70861 + if (txnh->atom != NULL) {
70862 + /* NOTE-NIKITA probably it is rather better to free
70863 + * atom_alloc here than thread it up to reiser4_try_capture() */
70864 +
70865 + spin_unlock_txnh(txnh);
70866 + spin_unlock_txnmgr(mgr);
70867 +
70868 + return -E_REPEAT;
70869 + }
70870 +
70871 + atom = *atom_alloc;
70872 + *atom_alloc = NULL;
70873 +
70874 + atom_init(atom);
70875 +
70876 + assert("jmacd-17", atom_isclean(atom));
70877 +
70878 + /*
70879 + * lock ordering is broken here. It is ok, as long as @atom is new
70880 + * and inaccessible for others. We can't use spin_lock_atom or
70881 + * spin_lock(&atom->alock) because they care about locking
70882 + * dependencies. spin_trylock_lock doesn't.
70883 + */
70884 + check_me("", spin_trylock_atom(atom));
70885 +
70886 + /* add atom to the end of transaction manager's list of atoms */
70887 + list_add_tail(&atom->atom_link, &mgr->atoms_list);
70888 + atom->atom_id = mgr->id_count++;
70889 + mgr->atom_count += 1;
70890 +
70891 + /* Release txnmgr lock */
70892 + spin_unlock_txnmgr(mgr);
70893 +
70894 + /* One reference until it commits. */
70895 + atomic_inc(&atom->refcount);
70896 + atom->stage = ASTAGE_CAPTURE_FUSE;
70897 + atom->super = reiser4_get_current_sb();
70898 + capture_assign_txnh_nolock(atom, txnh);
70899 +
70900 + spin_unlock_atom(atom);
70901 + spin_unlock_txnh(txnh);
70902 +
70903 + return -E_REPEAT;
70904 +}
70905 +
70906 +/* Return true if an atom is currently "open". */
70907 +static int atom_isopen(const txn_atom * atom)
70908 +{
70909 + assert("umka-185", atom != NULL);
70910 +
70911 + return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
70912 +}
70913 +
70914 +/* Return the number of pointers to this atom that must be updated during fusion. This
70915 + approximates the amount of work to be done. Fusion chooses the atom with fewer
70916 + pointers to fuse into the atom with more pointers. */
70917 +static int atom_pointer_count(const txn_atom * atom)
70918 +{
70919 + assert("umka-187", atom != NULL);
70920 +
70921 + /* This is a measure of the amount of work needed to fuse this atom
70922 + * into another. */
70923 + return atom->txnh_count + atom->capture_count;
70924 +}
70925 +
70926 +/* Called holding the atom lock, this removes the atom from the transaction manager list
70927 + and frees it. */
70928 +static void atom_free(txn_atom * atom)
70929 +{
70930 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70931 +
70932 + assert("umka-188", atom != NULL);
70933 + assert_spin_locked(&(atom->alock));
70934 +
70935 + /* Remove from the txn_mgr's atom list */
70936 + assert_spin_locked(&(mgr->tmgr_lock));
70937 + mgr->atom_count -= 1;
70938 + list_del_init(&atom->atom_link);
70939 +
70940 + /* Clean the atom */
70941 + assert("jmacd-16",
70942 + (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
70943 + atom->stage = ASTAGE_FREE;
70944 +
70945 + blocknr_set_destroy(&atom->delete_set);
70946 + blocknr_set_destroy(&atom->wandered_map);
70947 +
70948 + assert("jmacd-16", atom_isclean(atom));
70949 +
70950 + spin_unlock_atom(atom);
70951 +
70952 + kmem_cache_free(_atom_slab, atom);
70953 +}
70954 +
70955 +static int atom_is_dotard(const txn_atom * atom)
70956 +{
70957 + return time_after(jiffies, atom->start_time +
70958 + get_current_super_private()->tmgr.atom_max_age);
70959 +}
70960 +
70961 +static int atom_can_be_committed(txn_atom * atom)
70962 +{
70963 + assert_spin_locked(&(atom->alock));
70964 + assert("zam-885", atom->txnh_count > atom->nr_waiters);
70965 + return atom->txnh_count == atom->nr_waiters + 1;
70966 +}
70967 +
70968 +/* Return true if an atom should commit now. This is determined by aging, atom
70969 + size or atom flags. */
70970 +static int atom_should_commit(const txn_atom * atom)
70971 +{
70972 + assert("umka-189", atom != NULL);
70973 + return
70974 + (atom->flags & ATOM_FORCE_COMMIT) ||
70975 + ((unsigned)atom_pointer_count(atom) >
70976 + get_current_super_private()->tmgr.atom_max_size)
70977 + || atom_is_dotard(atom);
70978 +}
70979 +
70980 +/* return 1 if current atom exists and requires commit. */
70981 +int current_atom_should_commit(void)
70982 +{
70983 + txn_atom *atom;
70984 + int result = 0;
70985 +
70986 + atom = get_current_atom_locked_nocheck();
70987 + if (atom) {
70988 + result = atom_should_commit(atom);
70989 + spin_unlock_atom(atom);
70990 + }
70991 + return result;
70992 +}
70993 +
70994 +static int atom_should_commit_asap(const txn_atom * atom)
70995 +{
70996 + unsigned int captured;
70997 + unsigned int pinnedpages;
70998 +
70999 + assert("nikita-3309", atom != NULL);
71000 +
71001 + captured = (unsigned)atom->capture_count;
71002 + pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
71003 +
71004 + return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
71005 +}
71006 +
71007 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
71008 +{
71009 + jnode *first_dirty;
71010 +
71011 + list_for_each_entry(first_dirty, head, capture_link) {
71012 + if (!(flags & JNODE_FLUSH_COMMIT)) {
71013 + /*
71014 + * skip jnodes which "heard banshee" or having active
71015 + * I/O
71016 + */
71017 + if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
71018 + JF_ISSET(first_dirty, JNODE_WRITEBACK))
71019 + continue;
71020 + }
71021 + return first_dirty;
71022 + }
71023 + return NULL;
71024 +}
71025 +
71026 +/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty
71027 + nodes on atom's lists */
71028 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
71029 +{
71030 + jnode *first_dirty;
71031 + tree_level level;
71032 +
71033 + assert_spin_locked(&(atom->alock));
71034 +
71035 + /* The flush starts from LEAF_LEVEL (=1). */
71036 + for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
71037 + if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
71038 + continue;
71039 +
71040 + first_dirty =
71041 + find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
71042 + flags);
71043 + if (first_dirty)
71044 + return first_dirty;
71045 + }
71046 +
71047 + /* znode-above-root is on the list #0. */
71048 + return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
71049 +}
71050 +
71051 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
71052 +{
71053 + jnode *cur;
71054 +
71055 + assert("zam-905", atom_is_protected(atom));
71056 +
71057 + cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
71058 + while (ATOM_WB_LIST(atom) != &cur->capture_link) {
71059 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
71060 +
71061 + spin_lock_jnode(cur);
71062 + if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
71063 + if (JF_ISSET(cur, JNODE_DIRTY)) {
71064 + queue_jnode(fq, cur);
71065 + } else {
71066 + /* move jnode to atom's clean list */
71067 + list_move_tail(&cur->capture_link,
71068 + ATOM_CLEAN_LIST(atom));
71069 + }
71070 + }
71071 + spin_unlock_jnode(cur);
71072 +
71073 + cur = next;
71074 + }
71075 +}
71076 +
71077 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
71078 + * jnodes to disk. */
71079 +static int submit_wb_list(void)
71080 +{
71081 + int ret;
71082 + flush_queue_t *fq;
71083 +
71084 + fq = get_fq_for_current_atom();
71085 + if (IS_ERR(fq))
71086 + return PTR_ERR(fq);
71087 +
71088 + dispatch_wb_list(fq->atom, fq);
71089 + spin_unlock_atom(fq->atom);
71090 +
71091 + ret = reiser4_write_fq(fq, NULL, 1);
71092 + reiser4_fq_put(fq);
71093 +
71094 + return ret;
71095 +}
71096 +
71097 +/* Wait completion of all writes, re-submit atom writeback list if needed. */
71098 +static int current_atom_complete_writes(void)
71099 +{
71100 + int ret;
71101 +
71102 + /* Each jnode from that list was modified and dirtied when it had i/o
71103 + * request running already. After i/o completion we have to resubmit
71104 + * them to disk again.*/
71105 + ret = submit_wb_list();
71106 + if (ret < 0)
71107 + return ret;
71108 +
71109 + /* Wait all i/o completion */
71110 + ret = current_atom_finish_all_fq();
71111 + if (ret)
71112 + return ret;
71113 +
71114 + /* Scan wb list again; all i/o should be completed, we re-submit dirty
71115 + * nodes to disk */
71116 + ret = submit_wb_list();
71117 + if (ret < 0)
71118 + return ret;
71119 +
71120 + /* Wait all nodes we just submitted */
71121 + return current_atom_finish_all_fq();
71122 +}
71123 +
71124 +#if REISER4_DEBUG
71125 +
71126 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
71127 +{
71128 + if (atom == NULL) {
71129 + printk("%s: no atom\n", prefix);
71130 + return;
71131 + }
71132 +
71133 + printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
71134 + " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
71135 + atomic_read(&atom->refcount), atom->atom_id, atom->flags,
71136 + atom->txnh_count, atom->capture_count, atom->stage,
71137 + atom->start_time, atom->flushed);
71138 +}
71139 +
71140 +#else /* REISER4_DEBUG */
71141 +
71142 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
71143 +
71144 +#endif /* REISER4_DEBUG */
71145 +
71146 +#define TOOMANYFLUSHES (1 << 13)
71147 +
71148 +/* Called with the atom locked and no open "active" transaction handlers except
71149 + ours, this function calls flush_current_atom() until all dirty nodes are
71150 + processed. Then it initiates commit processing.
71151 +
71152 + Called by the single remaining open "active" txnh, which is closing. Other
71153 + open txnhs belong to processes which wait atom commit in commit_txnh()
71154 + routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
71155 + long as we hold the atom lock none of the jnodes can be captured and/or
71156 + locked.
71157 +
71158 + Return value is an error code if commit fails.
71159 +*/
71160 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
71161 +{
71162 + reiser4_super_info_data *sbinfo = get_current_super_private();
71163 + long ret = 0;
71164 + /* how many times jnode_flush() was called as a part of attempt to
71165 + * commit this atom. */
71166 + int flushiters;
71167 +
71168 + assert("zam-888", atom != NULL && *atom != NULL);
71169 + assert_spin_locked(&((*atom)->alock));
71170 + assert("zam-887", get_current_context()->trans->atom == *atom);
71171 + assert("jmacd-151", atom_isopen(*atom));
71172 +
71173 + assert("nikita-3184",
71174 + get_current_super_private()->delete_mutex_owner != current);
71175 +
71176 + for (flushiters = 0;; ++flushiters) {
71177 + ret =
71178 + flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
71179 + JNODE_FLUSH_COMMIT,
71180 + LONG_MAX /* nr_to_write */ ,
71181 + nr_submitted, atom, NULL);
71182 + if (ret != -E_REPEAT)
71183 + break;
71184 +
71185 + /* if atom's dirty list contains one znode which is
71186 + HEARD_BANSHEE and is locked we have to allow lock owner to
71187 + continue and uncapture that znode */
71188 + reiser4_preempt_point();
71189 +
71190 + *atom = get_current_atom_locked();
71191 + if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
71192 + warning("nikita-3176",
71193 + "Flushing like mad: %i", flushiters);
71194 + reiser4_info_atom("atom", *atom);
71195 + DEBUGON(flushiters > (1 << 20));
71196 + }
71197 + }
71198 +
71199 + if (ret)
71200 + return ret;
71201 +
71202 + assert_spin_locked(&((*atom)->alock));
71203 +
71204 + if (!atom_can_be_committed(*atom)) {
71205 + spin_unlock_atom(*atom);
71206 + return RETERR(-E_REPEAT);
71207 + }
71208 +
71209 + if ((*atom)->capture_count == 0)
71210 + goto done;
71211 +
71212 + /* Up to this point we have been flushing and after flush is called we
71213 + return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
71214 + at this point, commit should be successful. */
71215 + reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
71216 + ON_DEBUG(((*atom)->committer = current));
71217 + spin_unlock_atom(*atom);
71218 +
71219 + ret = current_atom_complete_writes();
71220 + if (ret)
71221 + return ret;
71222 +
71223 + assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
71224 +
71225 + /* isolate critical code path which should be executed by only one
71226 + * thread using tmgr mutex */
71227 + mutex_lock(&sbinfo->tmgr.commit_mutex);
71228 +
71229 + ret = reiser4_write_logs(nr_submitted);
71230 + if (ret < 0)
71231 + reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
71232 +
71233 + /* The atom->ovrwr_nodes list is processed under commit mutex held
71234 + because of bitmap nodes which are captured by special way in
71235 + reiser4_pre_commit_hook_bitmap(), that way does not include
71236 + capture_fuse_wait() as a capturing of other nodes does -- the commit
71237 + mutex is used for transaction isolation instead. */
71238 + reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
71239 + mutex_unlock(&sbinfo->tmgr.commit_mutex);
71240 +
71241 + reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
71242 + reiser4_invalidate_list(ATOM_WB_LIST(*atom));
71243 + assert("zam-927", list_empty(&(*atom)->inodes));
71244 +
71245 + spin_lock_atom(*atom);
71246 + done:
71247 + reiser4_atom_set_stage(*atom, ASTAGE_DONE);
71248 + ON_DEBUG((*atom)->committer = NULL);
71249 +
71250 + /* Atom's state changes, so wake up everybody waiting for this
71251 + event. */
71252 + wakeup_atom_waiting_list(*atom);
71253 +
71254 + /* Decrement the "until commit" reference, at least one txnh (the caller) is
71255 + still open. */
71256 + atomic_dec(&(*atom)->refcount);
71257 +
71258 + assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
71259 + assert("jmacd-1062", (*atom)->capture_count == 0);
71260 + BUG_ON((*atom)->capture_count != 0);
71261 + assert_spin_locked(&((*atom)->alock));
71262 +
71263 + return ret;
71264 +}
71265 +
71266 +/* TXN_TXNH */
71267 +
71268 +/**
71269 + * force_commit_atom - commit current atom and wait commit completion
71270 + * @txnh:
71271 + *
71272 + * Commits current atom and wait commit completion; current atom and @txnh have
71273 + * to be spinlocked before call, this function unlocks them on exit.
71274 + */
71275 +int force_commit_atom(txn_handle *txnh)
71276 +{
71277 + txn_atom *atom;
71278 +
71279 + assert("zam-837", txnh != NULL);
71280 + assert_spin_locked(&(txnh->hlock));
71281 + assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
71282 +
71283 + atom = txnh->atom;
71284 +
71285 + assert("zam-834", atom != NULL);
71286 + assert_spin_locked(&(atom->alock));
71287 +
71288 + /*
71289 + * Set flags for atom and txnh: forcing atom commit and waiting for
71290 + * commit completion
71291 + */
71292 + txnh->flags |= TXNH_WAIT_COMMIT;
71293 + atom->flags |= ATOM_FORCE_COMMIT;
71294 +
71295 + spin_unlock_txnh(txnh);
71296 + spin_unlock_atom(atom);
71297 +
71298 + /* commit is here */
71299 + reiser4_txn_restart_current();
71300 + return 0;
71301 +}
71302 +
71303 +/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
71304 + * should we commit all atoms including new ones which are created after this
71305 + * functions is called. */
71306 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
71307 +{
71308 + int ret;
71309 + txn_atom *atom;
71310 + txn_mgr *mgr;
71311 + txn_handle *txnh;
71312 + unsigned long start_time = jiffies;
71313 + reiser4_context *ctx = get_current_context();
71314 +
71315 + assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
71316 + assert("nikita-3058", reiser4_commit_check_locks());
71317 +
71318 + reiser4_txn_restart_current();
71319 +
71320 + mgr = &get_super_private(super)->tmgr;
71321 +
71322 + txnh = ctx->trans;
71323 +
71324 + again:
71325 +
71326 + spin_lock_txnmgr(mgr);
71327 +
71328 + list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
71329 + spin_lock_atom(atom);
71330 +
71331 + /* Commit any atom which can be committed. If @commit_new_atoms
71332 + * is not set we commit only atoms which were created before
71333 + * this call is started. */
71334 + if (commit_all_atoms
71335 + || time_before_eq(atom->start_time, start_time)) {
71336 + if (atom->stage <= ASTAGE_POST_COMMIT) {
71337 + spin_unlock_txnmgr(mgr);
71338 +
71339 + if (atom->stage < ASTAGE_PRE_COMMIT) {
71340 + spin_lock_txnh(txnh);
71341 + /* Add force-context txnh */
71342 + capture_assign_txnh_nolock(atom, txnh);
71343 + ret = force_commit_atom(txnh);
71344 + if (ret)
71345 + return ret;
71346 + } else
71347 + /* wait atom commit */
71348 + reiser4_atom_wait_event(atom);
71349 +
71350 + goto again;
71351 + }
71352 + }
71353 +
71354 + spin_unlock_atom(atom);
71355 + }
71356 +
71357 +#if REISER4_DEBUG
71358 + if (commit_all_atoms) {
71359 + reiser4_super_info_data *sbinfo = get_super_private(super);
71360 + spin_lock_reiser4_super(sbinfo);
71361 + assert("zam-813",
71362 + sbinfo->blocks_fake_allocated_unformatted == 0);
71363 + assert("zam-812", sbinfo->blocks_fake_allocated == 0);
71364 + spin_unlock_reiser4_super(sbinfo);
71365 + }
71366 +#endif
71367 +
71368 + spin_unlock_txnmgr(mgr);
71369 +
71370 + return 0;
71371 +}
71372 +
71373 +/* check whether commit_some_atoms() can commit @atom. Locking is up to the
71374 + * caller */
71375 +static int atom_is_committable(txn_atom * atom)
71376 +{
71377 + return
71378 + atom->stage < ASTAGE_PRE_COMMIT &&
71379 + atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
71380 +}
71381 +
71382 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
71383 + * lock at exit */
71384 +int commit_some_atoms(txn_mgr * mgr)
71385 +{
71386 + int ret = 0;
71387 + txn_atom *atom;
71388 + txn_handle *txnh;
71389 + reiser4_context *ctx;
71390 + struct list_head *pos, *tmp;
71391 +
71392 + ctx = get_current_context();
71393 + assert("nikita-2444", ctx != NULL);
71394 +
71395 + txnh = ctx->trans;
71396 + spin_lock_txnmgr(mgr);
71397 +
71398 + /*
71399 + * this is to avoid gcc complain that atom might be used
71400 + * uninitialized
71401 + */
71402 + atom = NULL;
71403 +
71404 + /* look for atom to commit */
71405 + list_for_each_safe(pos, tmp, &mgr->atoms_list) {
71406 + atom = list_entry(pos, txn_atom, atom_link);
71407 + /*
71408 + * first test without taking atom spin lock, whether it is
71409 + * eligible for committing at all
71410 + */
71411 + if (atom_is_committable(atom)) {
71412 + /* now, take spin lock and re-check */
71413 + spin_lock_atom(atom);
71414 + if (atom_is_committable(atom))
71415 + break;
71416 + spin_unlock_atom(atom);
71417 + }
71418 + }
71419 +
71420 + ret = (&mgr->atoms_list == pos);
71421 + spin_unlock_txnmgr(mgr);
71422 +
71423 + if (ret) {
71424 + /* nothing found */
71425 + spin_unlock(&mgr->daemon->guard);
71426 + return 0;
71427 + }
71428 +
71429 + spin_lock_txnh(txnh);
71430 +
71431 + BUG_ON(atom == NULL);
71432 + /* Set the atom to force committing */
71433 + atom->flags |= ATOM_FORCE_COMMIT;
71434 +
71435 + /* Add force-context txnh */
71436 + capture_assign_txnh_nolock(atom, txnh);
71437 +
71438 + spin_unlock_txnh(txnh);
71439 + spin_unlock_atom(atom);
71440 +
71441 + /* we are about to release daemon spin lock, notify daemon it
71442 + has to rescan atoms */
71443 + mgr->daemon->rescan = 1;
71444 + spin_unlock(&mgr->daemon->guard);
71445 + reiser4_txn_restart_current();
71446 + return 0;
71447 +}
71448 +
71449 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
71450 +{
71451 + int atom_stage;
71452 + txn_atom *atom_2;
71453 + int repeat;
71454 +
71455 + assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
71456 +
71457 + atom_stage = atom->stage;
71458 + repeat = 0;
71459 +
71460 + if (!spin_trylock_txnmgr(tmgr)) {
71461 + atomic_inc(&atom->refcount);
71462 + spin_unlock_atom(atom);
71463 + spin_lock_txnmgr(tmgr);
71464 + spin_lock_atom(atom);
71465 + repeat = 1;
71466 + if (atom->stage != atom_stage) {
71467 + spin_unlock_txnmgr(tmgr);
71468 + atom_dec_and_unlock(atom);
71469 + return -E_REPEAT;
71470 + }
71471 + atomic_dec(&atom->refcount);
71472 + }
71473 +
71474 + list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
71475 + if (atom == atom_2)
71476 + continue;
71477 + /*
71478 + * if trylock does not succeed we just do not fuse with that
71479 + * atom.
71480 + */
71481 + if (spin_trylock_atom(atom_2)) {
71482 + if (atom_2->stage < ASTAGE_PRE_COMMIT) {
71483 + spin_unlock_txnmgr(tmgr);
71484 + capture_fuse_into(atom_2, atom);
71485 + /* all locks are lost we can only repeat here */
71486 + return -E_REPEAT;
71487 + }
71488 + spin_unlock_atom(atom_2);
71489 + }
71490 + }
71491 + atom->flags |= ATOM_CANCEL_FUSION;
71492 + spin_unlock_txnmgr(tmgr);
71493 + if (repeat) {
71494 + spin_unlock_atom(atom);
71495 + return -E_REPEAT;
71496 + }
71497 + return 0;
71498 +}
71499 +
71500 +/* Calls jnode_flush for current atom if it exists; if not, just take another
71501 + atom and call jnode_flush() for him. If current transaction handle has
71502 + already assigned atom (current atom) we have to close current transaction
71503 + prior to switch to another atom or do something with current atom. This
71504 + code tries to flush current atom.
71505 +
71506 + flush_some_atom() is called as part of memory clearing process. It is
71507 + invoked from balance_dirty_pages(), pdflushd, and entd.
71508 +
71509 + If we can flush no nodes, atom is committed, because this frees memory.
71510 +
71511 + If atom is too large or too old it is committed also.
71512 +*/
71513 +int
71514 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
71515 + int flags)
71516 +{
71517 + reiser4_context *ctx = get_current_context();
71518 + txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
71519 + txn_handle *txnh = ctx->trans;
71520 + txn_atom *atom;
71521 + int ret;
71522 +
71523 + BUG_ON(wbc->nr_to_write == 0);
71524 + BUG_ON(*nr_submitted != 0);
71525 + assert("zam-1042", txnh != NULL);
71526 + repeat:
71527 + if (txnh->atom == NULL) {
71528 + /* current atom is not available, take first from txnmgr */
71529 + spin_lock_txnmgr(tmgr);
71530 +
71531 + /* traverse the list of all atoms */
71532 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71533 + /* lock atom before checking its state */
71534 + spin_lock_atom(atom);
71535 +
71536 + /*
71537 + * we need an atom which is not being committed and
71538 + * which has no flushers (jnode_flush() add one flusher
71539 + * at the beginning and subtract one at the end).
71540 + */
71541 + if (atom->stage < ASTAGE_PRE_COMMIT &&
71542 + atom->nr_flushers == 0) {
71543 + spin_lock_txnh(txnh);
71544 + capture_assign_txnh_nolock(atom, txnh);
71545 + spin_unlock_txnh(txnh);
71546 +
71547 + goto found;
71548 + }
71549 +
71550 + spin_unlock_atom(atom);
71551 + }
71552 +
71553 + /*
71554 + * Write throttling is case of no one atom can be
71555 + * flushed/committed.
71556 + */
71557 + if (!current_is_pdflush() && !wbc->nonblocking) {
71558 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71559 + spin_lock_atom(atom);
71560 + /* Repeat the check from the above. */
71561 + if (atom->stage < ASTAGE_PRE_COMMIT
71562 + && atom->nr_flushers == 0) {
71563 + spin_lock_txnh(txnh);
71564 + capture_assign_txnh_nolock(atom, txnh);
71565 + spin_unlock_txnh(txnh);
71566 +
71567 + goto found;
71568 + }
71569 + if (atom->stage <= ASTAGE_POST_COMMIT) {
71570 + spin_unlock_txnmgr(tmgr);
71571 + /*
71572 + * we just wait until atom's flusher
71573 + * makes a progress in flushing or
71574 + * committing the atom
71575 + */
71576 + reiser4_atom_wait_event(atom);
71577 + goto repeat;
71578 + }
71579 + spin_unlock_atom(atom);
71580 + }
71581 + }
71582 + spin_unlock_txnmgr(tmgr);
71583 + return 0;
71584 + found:
71585 + spin_unlock_txnmgr(tmgr);
71586 + } else
71587 + atom = get_current_atom_locked();
71588 +
71589 + BUG_ON(atom->super != ctx->super);
71590 + assert("vs-35", atom->super == ctx->super);
71591 + if (start) {
71592 + spin_lock_jnode(start);
71593 + ret = (atom == start->atom) ? 1 : 0;
71594 + spin_unlock_jnode(start);
71595 + if (ret == 0)
71596 + start = NULL;
71597 + }
71598 + ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
71599 + if (ret == 0) {
71600 + /* flush_current_atom returns 0 only if it submitted for write
71601 + nothing */
71602 + BUG_ON(*nr_submitted != 0);
71603 + if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
71604 + if (atom->capture_count < tmgr->atom_min_size &&
71605 + !(atom->flags & ATOM_CANCEL_FUSION)) {
71606 + ret = txn_try_to_fuse_small_atom(tmgr, atom);
71607 + if (ret == -E_REPEAT) {
71608 + reiser4_preempt_point();
71609 + goto repeat;
71610 + }
71611 + }
71612 + /* if early flushing could not make more nodes clean,
71613 + * or atom is too old/large,
71614 + * we force current atom to commit */
71615 + /* wait for commit completion but only if this
71616 + * wouldn't stall pdflushd and ent thread. */
71617 + if (!wbc->nonblocking && !ctx->entd)
71618 + txnh->flags |= TXNH_WAIT_COMMIT;
71619 + atom->flags |= ATOM_FORCE_COMMIT;
71620 + }
71621 + spin_unlock_atom(atom);
71622 + } else if (ret == -E_REPEAT) {
71623 + if (*nr_submitted == 0) {
71624 + /* let others who hampers flushing (hold longterm locks,
71625 + for instance) to free the way for flush */
71626 + reiser4_preempt_point();
71627 + goto repeat;
71628 + }
71629 + ret = 0;
71630 + }
71631 +/*
71632 + if (*nr_submitted > wbc->nr_to_write)
71633 + warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
71634 +*/
71635 + reiser4_txn_restart(ctx);
71636 +
71637 + return ret;
71638 +}
71639 +
71640 +/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */
71641 +void reiser4_invalidate_list(struct list_head *head)
71642 +{
71643 + while (!list_empty(head)) {
71644 + jnode *node;
71645 +
71646 + node = list_entry(head->next, jnode, capture_link);
71647 + spin_lock_jnode(node);
71648 + reiser4_uncapture_block(node);
71649 + jput(node);
71650 + }
71651 +}
71652 +
71653 +static void init_wlinks(txn_wait_links * wlinks)
71654 +{
71655 + wlinks->_lock_stack = get_current_lock_stack();
71656 + INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
71657 + INIT_LIST_HEAD(&wlinks->_fwaiting_link);
71658 + wlinks->waitfor_cb = NULL;
71659 + wlinks->waiting_cb = NULL;
71660 +}
71661 +
71662 +/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */
71663 +void reiser4_atom_wait_event(txn_atom * atom)
71664 +{
71665 + txn_wait_links _wlinks;
71666 +
71667 + assert_spin_locked(&(atom->alock));
71668 + assert("nikita-3156",
71669 + lock_stack_isclean(get_current_lock_stack()) ||
71670 + atom->nr_running_queues > 0);
71671 +
71672 + init_wlinks(&_wlinks);
71673 + list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
71674 + atomic_inc(&atom->refcount);
71675 + spin_unlock_atom(atom);
71676 +
71677 + reiser4_prepare_to_sleep(_wlinks._lock_stack);
71678 + reiser4_go_to_sleep(_wlinks._lock_stack);
71679 +
71680 + spin_lock_atom(atom);
71681 + list_del(&_wlinks._fwaitfor_link);
71682 + atom_dec_and_unlock(atom);
71683 +}
71684 +
71685 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
71686 +{
71687 + assert("nikita-3535", atom != NULL);
71688 + assert_spin_locked(&(atom->alock));
71689 + assert("nikita-3536", stage <= ASTAGE_INVALID);
71690 + /* Excelsior! */
71691 + assert("nikita-3537", stage >= atom->stage);
71692 + if (atom->stage != stage) {
71693 + atom->stage = stage;
71694 + reiser4_atom_send_event(atom);
71695 + }
71696 +}
71697 +
71698 +/* wake all threads which wait for an event */
71699 +void reiser4_atom_send_event(txn_atom * atom)
71700 +{
71701 + assert_spin_locked(&(atom->alock));
71702 + wakeup_atom_waitfor_list(atom);
71703 +}
71704 +
71705 +/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for
71706 + example, because it does fsync(2)) */
71707 +static int should_wait_commit(txn_handle * h)
71708 +{
71709 + return h->flags & TXNH_WAIT_COMMIT;
71710 +}
71711 +
71712 +typedef struct commit_data {
71713 + txn_atom *atom;
71714 + txn_handle *txnh;
71715 + long nr_written;
71716 + /* as an optimization we start committing atom by first trying to
71717 + * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
71718 + * allows to reduce stalls due to other threads waiting for atom in
71719 + * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
71720 + * preliminary flushes. */
71721 + int preflush;
71722 + /* have we waited on atom. */
71723 + int wait;
71724 + int failed;
71725 + int wake_ktxnmgrd_up;
71726 +} commit_data;
71727 +
71728 +/*
71729 + * Called from commit_txnh() repeatedly, until either error happens, or atom
71730 + * commits successfully.
71731 + */
71732 +static int try_commit_txnh(commit_data * cd)
71733 +{
71734 + int result;
71735 +
71736 + assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
71737 +
71738 + /* Get the atom and txnh locked. */
71739 + cd->atom = txnh_get_atom(cd->txnh);
71740 + assert("jmacd-309", cd->atom != NULL);
71741 + spin_unlock_txnh(cd->txnh);
71742 +
71743 + if (cd->wait) {
71744 + cd->atom->nr_waiters--;
71745 + cd->wait = 0;
71746 + }
71747 +
71748 + if (cd->atom->stage == ASTAGE_DONE)
71749 + return 0;
71750 +
71751 + if (cd->failed)
71752 + return 0;
71753 +
71754 + if (atom_should_commit(cd->atom)) {
71755 + /* if atom is _very_ large schedule it for commit as soon as
71756 + * possible. */
71757 + if (atom_should_commit_asap(cd->atom)) {
71758 + /*
71759 + * When atom is in PRE_COMMIT or later stage following
71760 + * invariant (encoded in atom_can_be_committed())
71761 + * holds: there is exactly one non-waiter transaction
71762 + * handle opened on this atom. When thread wants to
71763 + * wait until atom commits (for example sync()) it
71764 + * waits on atom event after increasing
71765 + * atom->nr_waiters (see below in this function). It
71766 + * cannot be guaranteed that atom is already committed
71767 + * after receiving event, so loop has to be
71768 + * re-started. But if atom switched into PRE_COMMIT
71769 + * stage and became too large, we cannot change its
71770 + * state back to CAPTURE_WAIT (atom stage can only
71771 + * increase monotonically), hence this check.
71772 + */
71773 + if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
71774 + reiser4_atom_set_stage(cd->atom,
71775 + ASTAGE_CAPTURE_WAIT);
71776 + cd->atom->flags |= ATOM_FORCE_COMMIT;
71777 + }
71778 + if (cd->txnh->flags & TXNH_DONT_COMMIT) {
71779 + /*
71780 + * this thread (transaction handle that is) doesn't
71781 + * want to commit atom. Notify waiters that handle is
71782 + * closed. This can happen, for example, when we are
71783 + * under VFS directory lock and don't want to commit
71784 + * atom right now to avoid stalling other threads
71785 + * working in the same directory.
71786 + */
71787 +
71788 + /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
71789 + * commit this atom: no atom waiters and only one
71790 + * (our) open transaction handle. */
71791 + cd->wake_ktxnmgrd_up =
71792 + cd->atom->txnh_count == 1 &&
71793 + cd->atom->nr_waiters == 0;
71794 + reiser4_atom_send_event(cd->atom);
71795 + result = 0;
71796 + } else if (!atom_can_be_committed(cd->atom)) {
71797 + if (should_wait_commit(cd->txnh)) {
71798 + /* sync(): wait for commit */
71799 + cd->atom->nr_waiters++;
71800 + cd->wait = 1;
71801 + reiser4_atom_wait_event(cd->atom);
71802 + result = RETERR(-E_REPEAT);
71803 + } else {
71804 + result = 0;
71805 + }
71806 + } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
71807 + /*
71808 + * optimization: flush atom without switching it into
71809 + * ASTAGE_CAPTURE_WAIT.
71810 + *
71811 + * But don't do this for ktxnmgrd, because ktxnmgrd
71812 + * should never block on atom fusion.
71813 + */
71814 + result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
71815 + LONG_MAX, &cd->nr_written,
71816 + &cd->atom, NULL);
71817 + if (result == 0) {
71818 + spin_unlock_atom(cd->atom);
71819 + cd->preflush = 0;
71820 + result = RETERR(-E_REPEAT);
71821 + } else /* Atom wasn't flushed
71822 + * completely. Rinse. Repeat. */
71823 + --cd->preflush;
71824 + } else {
71825 + /* We change atom state to ASTAGE_CAPTURE_WAIT to
71826 + prevent atom fusion and count ourself as an active
71827 + flusher */
71828 + reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
71829 + cd->atom->flags |= ATOM_FORCE_COMMIT;
71830 +
71831 + result =
71832 + commit_current_atom(&cd->nr_written, &cd->atom);
71833 + if (result != 0 && result != -E_REPEAT)
71834 + cd->failed = 1;
71835 + }
71836 + } else
71837 + result = 0;
71838 +
71839 +#if REISER4_DEBUG
71840 + if (result == 0)
71841 + assert_spin_locked(&(cd->atom->alock));
71842 +#endif
71843 +
71844 + /* perfectly valid assertion, except that when atom/txnh is not locked
71845 + * fusion can take place, and cd->atom points nowhere. */
71846 + /*
71847 + assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
71848 + */
71849 + return result;
71850 +}
71851 +
71852 +/* Called to commit a transaction handle. This decrements the atom's number of open
71853 + handles and if it is the last handle to commit and the atom should commit, initiates
71854 + atom commit. if commit does not fail, return number of written blocks */
71855 +static int commit_txnh(txn_handle * txnh)
71856 +{
71857 + commit_data cd;
71858 + assert("umka-192", txnh != NULL);
71859 +
71860 + memset(&cd, 0, sizeof cd);
71861 + cd.txnh = txnh;
71862 + cd.preflush = 10;
71863 +
71864 + /* calls try_commit_txnh() until either atom commits, or error
71865 + * happens */
71866 + while (try_commit_txnh(&cd) != 0)
71867 + reiser4_preempt_point();
71868 +
71869 + spin_lock_txnh(txnh);
71870 +
71871 + cd.atom->txnh_count -= 1;
71872 + txnh->atom = NULL;
71873 + /* remove transaction handle from atom's list of transaction handles */
71874 + list_del_init(&txnh->txnh_link);
71875 +
71876 + spin_unlock_txnh(txnh);
71877 + atom_dec_and_unlock(cd.atom);
71878 + /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
71879 + * because it takes time) by current thread, we do that work
71880 + * asynchronously by ktxnmgrd daemon. */
71881 + if (cd.wake_ktxnmgrd_up)
71882 + ktxnmgrd_kick(&get_current_super_private()->tmgr);
71883 +
71884 + return 0;
71885 +}
71886 +
71887 +/* TRY_CAPTURE */
71888 +
71889 +/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
71890 + condition indicates that the request should be retried, and it may block if the
71891 + txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
71892 +
71893 + This routine encodes the basic logic of block capturing described by:
71894 +
71895 + http://namesys.com/v4/v4.html
71896 +
71897 + Our goal here is to ensure that any two blocks that contain dependent modifications
71898 + should commit at the same time. This function enforces this discipline by initiating
71899 + fusion whenever a transaction handle belonging to one atom requests to read or write a
71900 + block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
71901 +
71902 + In addition, this routine handles the initial assignment of atoms to blocks and
71903 + transaction handles. These are possible outcomes of this function:
71904 +
71905 + 1. The block and handle are already part of the same atom: return immediate success
71906 +
71907 + 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
71908 + the handle to the block's atom.
71909 +
71910 + 3. The handle is assigned but the block is not: call capture_assign_block to assign
71911 + the block to the handle's atom.
71912 +
71913 + 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
71914 + to fuse atoms.
71915 +
71916 + 5. Neither block nor handle are assigned: create a new atom and assign them both.
71917 +
71918 + 6. A read request for a non-captured block: return immediate success.
71919 +
71920 + This function acquires and releases the handle's spinlock. This function is called
71921 + under the jnode lock and if the return value is 0, it returns with the jnode lock still
71922 + held. If the return is -E_REPEAT or some other error condition, the jnode lock is
71923 + released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
71924 + lock in the failure case.
71925 +*/
71926 +static int try_capture_block(
71927 + txn_handle * txnh, jnode * node, txn_capture mode,
71928 + txn_atom ** atom_alloc)
71929 +{
71930 + txn_atom *block_atom;
71931 + txn_atom *txnh_atom;
71932 +
71933 + /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
71934 + assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
71935 +
71936 + /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
71937 + * node->tree somewhere. */
71938 + assert("umka-194", txnh != NULL);
71939 + assert("umka-195", node != NULL);
71940 +
71941 + /* The jnode is already locked! Being called from reiser4_try_capture(). */
71942 + assert_spin_locked(&(node->guard));
71943 + block_atom = node->atom;
71944 +
71945 + /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
71946 + let us touch the atoms themselves. */
71947 + spin_lock_txnh(txnh);
71948 + txnh_atom = txnh->atom;
71949 + /* Process of capturing continues into one of four branches depends on
71950 + which atoms from (block atom (node->atom), current atom (txnh->atom))
71951 + exist. */
71952 + if (txnh_atom == NULL) {
71953 + if (block_atom == NULL) {
71954 + spin_unlock_txnh(txnh);
71955 + spin_unlock_jnode(node);
71956 + /* assign empty atom to the txnh and repeat */
71957 + return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
71958 + } else {
71959 + atomic_inc(&block_atom->refcount);
71960 + /* node spin-lock isn't needed anymore */
71961 + spin_unlock_jnode(node);
71962 + if (!spin_trylock_atom(block_atom)) {
71963 + spin_unlock_txnh(txnh);
71964 + spin_lock_atom(block_atom);
71965 + spin_lock_txnh(txnh);
71966 + }
71967 + /* re-check state after getting txnh and the node
71968 + * atom spin-locked */
71969 + if (node->atom != block_atom || txnh->atom != NULL) {
71970 + spin_unlock_txnh(txnh);
71971 + atom_dec_and_unlock(block_atom);
71972 + return RETERR(-E_REPEAT);
71973 + }
71974 + atomic_dec(&block_atom->refcount);
71975 + if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
71976 + (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
71977 + block_atom->txnh_count != 0))
71978 + return capture_fuse_wait(txnh, block_atom, NULL, mode);
71979 + capture_assign_txnh_nolock(block_atom, txnh);
71980 + spin_unlock_txnh(txnh);
71981 + spin_unlock_atom(block_atom);
71982 + return RETERR(-E_REPEAT);
71983 + }
71984 + } else {
71985 + /* It is time to perform deadlock prevention check over the
71986 + node we want to capture. It is possible this node was locked
71987 + for read without capturing it. The optimization which allows
71988 + to do it helps us in keeping atoms independent as long as
71989 + possible but it may cause lock/fuse deadlock problems.
71990 +
71991 + A number of similar deadlock situations with locked but not
71992 + captured nodes were found. In each situation there are two
71993 + or more threads: one of them does flushing while another one
71994 + does routine balancing or tree lookup. The flushing thread
71995 + (F) sleeps in long term locking request for node (N), another
71996 + thread (A) sleeps in trying to capture some node already
71997 + belonging the atom F, F has a state which prevents
71998 + immediate fusion.
71999 +
72000 + Deadlocks of this kind cannot happen if node N was properly
72001 + captured by thread A. The F thread fuses atoms before locking
72002 + therefore current atom of thread F and current atom of thread
72003 + A became the same atom and thread A may proceed. This does
72004 + not work if node N was not captured because the fusion of
72005 + atom does not happen.
72006 +
72007 + The following scheme solves the deadlock: If
72008 + longterm_lock_znode locks and does not capture a znode, that
72009 + znode is marked as MISSED_IN_CAPTURE. A node marked this way
72010 + is processed by the code below which restores the missed
72011 + capture and fuses current atoms of all the node lock owners
72012 + by calling the fuse_not_fused_lock_owners() function. */
72013 + if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
72014 + JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
72015 + if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
72016 + spin_unlock_txnh(txnh);
72017 + spin_unlock_jnode(node);
72018 + fuse_not_fused_lock_owners(txnh, JZNODE(node));
72019 + return RETERR(-E_REPEAT);
72020 + }
72021 + }
72022 + if (block_atom == NULL) {
72023 + atomic_inc(&txnh_atom->refcount);
72024 + spin_unlock_txnh(txnh);
72025 + if (!spin_trylock_atom(txnh_atom)) {
72026 + spin_unlock_jnode(node);
72027 + spin_lock_atom(txnh_atom);
72028 + spin_lock_jnode(node);
72029 + }
72030 + if (txnh->atom != txnh_atom || node->atom != NULL
72031 + || JF_ISSET(node, JNODE_IS_DYING)) {
72032 + spin_unlock_jnode(node);
72033 + atom_dec_and_unlock(txnh_atom);
72034 + return RETERR(-E_REPEAT);
72035 + }
72036 + atomic_dec(&txnh_atom->refcount);
72037 + capture_assign_block_nolock(txnh_atom, node);
72038 + spin_unlock_atom(txnh_atom);
72039 + } else {
72040 + if (txnh_atom != block_atom) {
72041 + if (mode & TXN_CAPTURE_DONT_FUSE) {
72042 + spin_unlock_txnh(txnh);
72043 + spin_unlock_jnode(node);
72044 + /* we are in a "no-fusion" mode and @node is
72045 + * already part of transaction. */
72046 + return RETERR(-E_NO_NEIGHBOR);
72047 + }
72048 + return capture_init_fusion(node, txnh, mode);
72049 + }
72050 + spin_unlock_txnh(txnh);
72051 + }
72052 + }
72053 + return 0;
72054 +}
72055 +
72056 +static txn_capture
72057 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
72058 +{
72059 + txn_capture cap_mode;
72060 +
72061 + assert_spin_locked(&(node->guard));
72062 +
72063 + /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
72064 +
72065 + if (lock_mode == ZNODE_WRITE_LOCK) {
72066 + cap_mode = TXN_CAPTURE_WRITE;
72067 + } else if (node->atom != NULL) {
72068 + cap_mode = TXN_CAPTURE_WRITE;
72069 + } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
72070 + jnode_get_level(node) == LEAF_LEVEL) {
72071 + /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
72072 + /* We only need a READ_FUSING capture at the leaf level. This
72073 + is because the internal levels of the tree (twigs included)
72074 + are redundant from the point of the user that asked for a
72075 + read-fusing transcrash. The user only wants to read-fuse
72076 + atoms due to reading uncommitted data that another user has
72077 + written. It is the file system that reads/writes the
72078 + internal tree levels, the user only reads/writes leaves. */
72079 + cap_mode = TXN_CAPTURE_READ_ATOMIC;
72080 + } else {
72081 + /* In this case (read lock at a non-leaf) there's no reason to
72082 + * capture. */
72083 + /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
72084 + return 0;
72085 + }
72086 +
72087 + cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
72088 + assert("nikita-3186", cap_mode != 0);
72089 + return cap_mode;
72090 +}
72091 +
72092 +/* This is an external interface to try_capture_block(), it calls
72093 + try_capture_block() repeatedly as long as -E_REPEAT is returned.
72094 +
72095 + @node: node to capture,
72096 + @lock_mode: read or write lock is used in capture mode calculation,
72097 + @flags: see txn_capture flags enumeration,
72098 + @can_coc : can copy-on-capture
72099 +
72100 + @return: 0 - node was successfully captured, -E_REPEAT - capture request
72101 + cannot be processed immediately as it was requested in flags,
72102 + < 0 - other errors.
72103 +*/
72104 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
72105 + txn_capture flags)
72106 +{
72107 + txn_atom *atom_alloc = NULL;
72108 + txn_capture cap_mode;
72109 + txn_handle *txnh = get_current_context()->trans;
72110 + int ret;
72111 +
72112 + assert_spin_locked(&(node->guard));
72113 +
72114 + repeat:
72115 + if (JF_ISSET(node, JNODE_IS_DYING))
72116 + return RETERR(-EINVAL);
72117 + if (node->atom != NULL && txnh->atom == node->atom)
72118 + return 0;
72119 + cap_mode = build_capture_mode(node, lock_mode, flags);
72120 + if (cap_mode == 0 ||
72121 + (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
72122 + /* Mark this node as "MISSED". It helps in further deadlock
72123 + * analysis */
72124 + if (jnode_is_znode(node))
72125 + JF_SET(node, JNODE_MISSED_IN_CAPTURE);
72126 + return 0;
72127 + }
72128 + /* Repeat try_capture as long as -E_REPEAT is returned. */
72129 + ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
72130 + /* Regardless of non_blocking:
72131 +
72132 + If ret == 0 then jnode is still locked.
72133 + If ret != 0 then jnode is unlocked.
72134 + */
72135 +#if REISER4_DEBUG
72136 + if (ret == 0)
72137 + assert_spin_locked(&(node->guard));
72138 + else
72139 + assert_spin_not_locked(&(node->guard));
72140 +#endif
72141 + assert_spin_not_locked(&(txnh->guard));
72142 +
72143 + if (ret == -E_REPEAT) {
72144 + /* E_REPEAT implies all locks were released, therefore we need
72145 + to take the jnode's lock again. */
72146 + spin_lock_jnode(node);
72147 +
72148 + /* Although this may appear to be a busy loop, it is not.
72149 + There are several conditions that cause E_REPEAT to be
72150 + returned by the call to try_capture_block, all cases
72151 + indicating some kind of state change that means you should
72152 + retry the request and will get a different result. In some
72153 + cases this could be avoided with some extra code, but
72154 + generally it is done because the necessary locks were
72155 + released as a result of the operation and repeating is the
72156 + simplest thing to do (less bug potential). The cases are:
72157 + atom fusion returns E_REPEAT after it completes (jnode and
72158 + txnh were unlocked); race conditions in assign_block,
72159 + assign_txnh, and init_fusion return E_REPEAT (trylock
72160 + failure); after going to sleep in capture_fuse_wait
72161 + (request was blocked but may now succeed). I'm not quite
72162 + sure how capture_copy works yet, but it may also return
72163 + E_REPEAT. When the request is legitimately blocked, the
72164 + requestor goes to sleep in fuse_wait, so this is not a busy
72165 + loop. */
72166 + /* NOTE-NIKITA: still don't understand:
72167 +
72168 + try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
72169 +
72170 + looks like busy loop?
72171 + */
72172 + goto repeat;
72173 + }
72174 +
72175 + /* free extra atom object that was possibly allocated by
72176 + try_capture_block().
72177 +
72178 + Do this before acquiring jnode spin lock to
72179 + minimize time spent under lock. --nikita */
72180 + if (atom_alloc != NULL) {
72181 + kmem_cache_free(_atom_slab, atom_alloc);
72182 + }
72183 +
72184 + if (ret != 0) {
72185 + if (ret == -E_BLOCK) {
72186 + assert("nikita-3360",
72187 + cap_mode & TXN_CAPTURE_NONBLOCKING);
72188 + ret = -E_REPEAT;
72189 + }
72190 +
72191 + /* Failure means jnode is not locked. FIXME_LATER_JMACD May
72192 + want to fix the above code to avoid releasing the lock and
72193 + re-acquiring it, but there are cases where failure occurs
72194 + when the lock is not held, and those cases would need to be
72195 + modified to re-take the lock. */
72196 + spin_lock_jnode(node);
72197 + }
72198 +
72199 + /* Jnode is still locked. */
72200 + assert_spin_locked(&(node->guard));
72201 + return ret;
72202 +}
72203 +
72204 +static void release_two_atoms(txn_atom *one, txn_atom *two)
72205 +{
72206 + spin_unlock_atom(one);
72207 + atom_dec_and_unlock(two);
72208 + spin_lock_atom(one);
72209 + atom_dec_and_unlock(one);
72210 +}
72211 +
72212 +/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
72213 + returned by that routine. The txn_capture request mode is computed here depending on
72214 + the transaction handle's type and the lock request. This is called from the depths of
72215 + the lock manager with the jnode lock held and it always returns with the jnode lock
72216 + held.
72217 +*/
72218 +
72219 +/* fuse all 'active' atoms of lock owners of given node. */
72220 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
72221 +{
72222 + lock_handle *lh;
72223 + int repeat;
72224 + txn_atom *atomh, *atomf;
72225 + reiser4_context *me = get_current_context();
72226 + reiser4_context *ctx = NULL;
72227 +
72228 + assert_spin_not_locked(&(ZJNODE(node)->guard));
72229 + assert_spin_not_locked(&(txnh->hlock));
72230 +
72231 + repeat:
72232 + repeat = 0;
72233 + atomh = txnh_get_atom(txnh);
72234 + spin_unlock_txnh(txnh);
72235 + assert("zam-692", atomh != NULL);
72236 +
72237 + spin_lock_zlock(&node->lock);
72238 + /* inspect list of lock owners */
72239 + list_for_each_entry(lh, &node->lock.owners, owners_link) {
72240 + ctx = get_context_by_lock_stack(lh->owner);
72241 + if (ctx == me)
72242 + continue;
72243 + /* below we use two assumptions to avoid additional spin-locks
72244 + for checking the condition :
72245 +
72246 + 1) if the lock stack has lock, the transaction should be
72247 + opened, i.e. ctx->trans != NULL;
72248 +
72249 + 2) reading of well-aligned ctx->trans->atom is atomic, if it
72250 + equals to the address of spin-locked atomh, we take that
72251 + the atoms are the same, nothing has to be captured. */
72252 + if (atomh != ctx->trans->atom) {
72253 + reiser4_wake_up(lh->owner);
72254 + repeat = 1;
72255 + break;
72256 + }
72257 + }
72258 + if (repeat) {
72259 + if (!spin_trylock_txnh(ctx->trans)) {
72260 + spin_unlock_zlock(&node->lock);
72261 + spin_unlock_atom(atomh);
72262 + goto repeat;
72263 + }
72264 + atomf = ctx->trans->atom;
72265 + if (atomf == NULL) {
72266 + capture_assign_txnh_nolock(atomh, ctx->trans);
72267 + /* release zlock lock _after_ assigning the atom to the
72268 + * transaction handle, otherwise the lock owner thread
72269 + * may unlock all znodes, exit kernel context and here
72270 + * we would access an invalid transaction handle. */
72271 + spin_unlock_zlock(&node->lock);
72272 + spin_unlock_atom(atomh);
72273 + spin_unlock_txnh(ctx->trans);
72274 + goto repeat;
72275 + }
72276 + assert("zam-1059", atomf != atomh);
72277 + spin_unlock_zlock(&node->lock);
72278 + atomic_inc(&atomh->refcount);
72279 + atomic_inc(&atomf->refcount);
72280 + spin_unlock_txnh(ctx->trans);
72281 + if (atomf > atomh) {
72282 + spin_lock_atom_nested(atomf);
72283 + } else {
72284 + spin_unlock_atom(atomh);
72285 + spin_lock_atom(atomf);
72286 + spin_lock_atom_nested(atomh);
72287 + }
72288 + if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
72289 + release_two_atoms(atomf, atomh);
72290 + goto repeat;
72291 + }
72292 + atomic_dec(&atomh->refcount);
72293 + atomic_dec(&atomf->refcount);
72294 + capture_fuse_into(atomf, atomh);
72295 + goto repeat;
72296 + }
72297 + spin_unlock_zlock(&node->lock);
72298 + spin_unlock_atom(atomh);
72299 +}
72300 +
72301 +/* This is the interface to capture unformatted nodes via their struct page
72302 + reference. Currently it is only used in reiser4_invalidatepage */
72303 +int try_capture_page_to_invalidate(struct page *pg)
72304 +{
72305 + int ret;
72306 + jnode *node;
72307 +
72308 + assert("umka-292", pg != NULL);
72309 + assert("nikita-2597", PageLocked(pg));
72310 +
72311 + if (IS_ERR(node = jnode_of_page(pg))) {
72312 + return PTR_ERR(node);
72313 + }
72314 +
72315 + spin_lock_jnode(node);
72316 + unlock_page(pg);
72317 +
72318 + ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
72319 + spin_unlock_jnode(node);
72320 + jput(node);
72321 + lock_page(pg);
72322 + return ret;
72323 +}
72324 +
72325 +/* This informs the transaction manager when a node is deleted. Add the block to the
72326 + atom's delete set and uncapture the block.
72327 +
72328 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
72329 +explanations. find all the functions that use it, and unless there is some very
72330 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
72331 +move the loop to inside the function.
72332 +
72333 +VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
72334 + */
72335 +void reiser4_uncapture_page(struct page *pg)
72336 +{
72337 + jnode *node;
72338 + txn_atom *atom;
72339 +
72340 + assert("umka-199", pg != NULL);
72341 + assert("nikita-3155", PageLocked(pg));
72342 +
72343 + clear_page_dirty_for_io(pg);
72344 +
72345 + reiser4_wait_page_writeback(pg);
72346 +
72347 + node = jprivate(pg);
72348 + BUG_ON(node == NULL);
72349 +
72350 + spin_lock_jnode(node);
72351 +
72352 + atom = jnode_get_atom(node);
72353 + if (atom == NULL) {
72354 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72355 + spin_unlock_jnode(node);
72356 + return;
72357 + }
72358 +
72359 + /* We can remove jnode from transaction even if it is on flush queue
72360 + * prepped list, we only need to be sure that flush queue is not being
72361 + * written by reiser4_write_fq(). reiser4_write_fq() does not use atom
72362 + * spin lock for protection of the prepped nodes list, instead
72363 + * write_fq() increments atom's nr_running_queues counters for the time
72364 + * when prepped list is not protected by spin lock. Here we check this
72365 + * counter if we want to remove jnode from flush queue and, if the
72366 + * counter is not zero, wait all reiser4_write_fq() for this atom to
72367 + * complete. This is not significant overhead. */
72368 + while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
72369 + spin_unlock_jnode(node);
72370 + /*
72371 + * at this moment we want to wait for "atom event", viz. wait
72372 + * until @node can be removed from flush queue. But
72373 + * reiser4_atom_wait_event() cannot be called with page locked,
72374 + * because it deadlocks with jnode_extent_write(). Unlock page,
72375 + * after making sure (through page_cache_get()) that it cannot
72376 + * be released from memory.
72377 + */
72378 + page_cache_get(pg);
72379 + unlock_page(pg);
72380 + reiser4_atom_wait_event(atom);
72381 + lock_page(pg);
72382 + /*
72383 + * page may have been detached by ->writepage()->releasepage().
72384 + */
72385 + reiser4_wait_page_writeback(pg);
72386 + spin_lock_jnode(node);
72387 + page_cache_release(pg);
72388 + atom = jnode_get_atom(node);
72389 +/* VS-FIXME-HANS: improve the commenting in this function */
72390 + if (atom == NULL) {
72391 + spin_unlock_jnode(node);
72392 + return;
72393 + }
72394 + }
72395 + reiser4_uncapture_block(node);
72396 + spin_unlock_atom(atom);
72397 + jput(node);
72398 +}
72399 +
72400 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
72401 + * inode's tree of jnodes */
72402 +void reiser4_uncapture_jnode(jnode * node)
72403 +{
72404 + txn_atom *atom;
72405 +
72406 + assert_spin_locked(&(node->guard));
72407 + assert("", node->pg == 0);
72408 +
72409 + atom = jnode_get_atom(node);
72410 + if (atom == NULL) {
72411 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72412 + spin_unlock_jnode(node);
72413 + return;
72414 + }
72415 +
72416 + reiser4_uncapture_block(node);
72417 + spin_unlock_atom(atom);
72418 + jput(node);
72419 +}
72420 +
72421 +/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
72422 + increases atom refcount and txnh_count, adds to txnh_list. */
72423 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
72424 +{
72425 + assert("umka-200", atom != NULL);
72426 + assert("umka-201", txnh != NULL);
72427 +
72428 + assert_spin_locked(&(txnh->hlock));
72429 + assert_spin_locked(&(atom->alock));
72430 + assert("jmacd-824", txnh->atom == NULL);
72431 + assert("nikita-3540", atom_isopen(atom));
72432 + BUG_ON(txnh->atom != NULL);
72433 +
72434 + atomic_inc(&atom->refcount);
72435 + txnh->atom = atom;
72436 + reiser4_ctx_gfp_mask_set();
72437 + list_add_tail(&txnh->txnh_link, &atom->txnh_list);
72438 + atom->txnh_count += 1;
72439 +}
72440 +
72441 +/* No-locking version of assign_block. Sets the block's atom pointer, references the
72442 + block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
72443 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
72444 +{
72445 + assert("umka-202", atom != NULL);
72446 + assert("umka-203", node != NULL);
72447 + assert_spin_locked(&(node->guard));
72448 + assert_spin_locked(&(atom->alock));
72449 + assert("jmacd-323", node->atom == NULL);
72450 + BUG_ON(!list_empty_careful(&node->capture_link));
72451 + assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
72452 +
72453 + /* Pointer from jnode to atom is not counted in atom->refcount. */
72454 + node->atom = atom;
72455 +
72456 + list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
72457 + atom->capture_count += 1;
72458 + /* reference to jnode is acquired by atom. */
72459 + jref(node);
72460 +
72461 + ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
72462 +
72463 + LOCK_CNT_INC(t_refs);
72464 +}
72465 +
72466 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
72467 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
72468 +{
72469 + assert_spin_locked(&(node->guard));
72470 + assert_spin_locked(&(atom->alock));
72471 + assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
72472 +
72473 + JF_SET(node, JNODE_DIRTY);
72474 +
72475 + get_current_context()->nr_marked_dirty++;
72476 +
72477 + /* We grab2flush_reserve one additional block only if node was
72478 + not CREATED and jnode_flush did not sort it into neither
72479 + relocate set nor overwrite one. If node is in overwrite or
72480 + relocate set we assume that atom's flush reserved counter was
72481 + already adjusted. */
72482 + if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
72483 + && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
72484 + && !jnode_is_cluster_page(node)) {
72485 + assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
72486 + assert("vs-1506", *jnode_get_block(node) != 0);
72487 + grabbed2flush_reserved_nolock(atom, (__u64) 1);
72488 + JF_SET(node, JNODE_FLUSH_RESERVED);
72489 + }
72490 +
72491 + if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
72492 + /* If the atom is not set yet, it will be added to the appropriate list in
72493 + capture_assign_block_nolock. */
72494 + /* Sometimes a node is set dirty before being captured -- the case for new
72495 + jnodes. In that case the jnode will be added to the appropriate list
72496 + in capture_assign_block_nolock. Another reason not to re-link jnode is
72497 + that jnode is on a flush queue (see flush.c for details) */
72498 +
72499 + int level = jnode_get_level(node);
72500 +
72501 + assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
72502 + assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
72503 + assert("nikita-2607", 0 <= level);
72504 + assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
72505 +
72506 + /* move node to atom's dirty list */
72507 + list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
72508 + ON_DEBUG(count_jnode
72509 + (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
72510 + }
72511 +}
72512 +
72513 +/* Set the dirty status for this (spin locked) jnode. */
72514 +void jnode_make_dirty_locked(jnode * node)
72515 +{
72516 + assert("umka-204", node != NULL);
72517 + assert_spin_locked(&(node->guard));
72518 +
72519 + if (REISER4_DEBUG && rofs_jnode(node)) {
72520 + warning("nikita-3365", "Dirtying jnode on rofs");
72521 + dump_stack();
72522 + }
72523 +
72524 + /* Fast check for already dirty node */
72525 + if (!JF_ISSET(node, JNODE_DIRTY)) {
72526 + txn_atom *atom;
72527 +
72528 + atom = jnode_get_atom(node);
72529 + assert("vs-1094", atom);
72530 + /* Check jnode dirty status again because node spin lock might
72531 + * be released inside jnode_get_atom(). */
72532 + if (likely(!JF_ISSET(node, JNODE_DIRTY)))
72533 + do_jnode_make_dirty(node, atom);
72534 + spin_unlock_atom(atom);
72535 + }
72536 +}
72537 +
72538 +/* Set the dirty status for this znode. */
72539 +void znode_make_dirty(znode * z)
72540 +{
72541 + jnode *node;
72542 + struct page *page;
72543 +
72544 + assert("umka-204", z != NULL);
72545 + assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
72546 + assert("nikita-3560", znode_is_write_locked(z));
72547 +
72548 + node = ZJNODE(z);
72549 + /* znode is longterm locked, we can check dirty bit without spinlock */
72550 + if (JF_ISSET(node, JNODE_DIRTY)) {
72551 + /* znode is dirty already. All we have to do is to change znode version */
72552 + z->version = znode_build_version(jnode_get_tree(node));
72553 + return;
72554 + }
72555 +
72556 + spin_lock_jnode(node);
72557 + jnode_make_dirty_locked(node);
72558 + page = jnode_page(node);
72559 + if (page != NULL) {
72560 + /* this is useful assertion (allows one to check that no
72561 + * modifications are lost due to update of in-flight page),
72562 + * but it requires locking on page to check PG_writeback
72563 + * bit. */
72564 + /* assert("nikita-3292",
72565 + !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
72566 + page_cache_get(page);
72567 +
72568 + /* jnode lock is not needed for the rest of
72569 + * znode_set_dirty(). */
72570 + spin_unlock_jnode(node);
72571 + /* reiser4 file write code calls set_page_dirty for
72572 + * unformatted nodes, for formatted nodes we do it here. */
72573 + reiser4_set_page_dirty_internal(page);
72574 + page_cache_release(page);
72575 + /* bump version counter in znode */
72576 + z->version = znode_build_version(jnode_get_tree(node));
72577 + } else {
72578 + assert("zam-596", znode_above_root(JZNODE(node)));
72579 + spin_unlock_jnode(node);
72580 + }
72581 +
72582 + assert("nikita-1900", znode_is_write_locked(z));
72583 + assert("jmacd-9777", node->atom != NULL);
72584 +}
72585 +
72586 +int reiser4_sync_atom(txn_atom * atom)
72587 +{
72588 + int result;
72589 + txn_handle *txnh;
72590 +
72591 + txnh = get_current_context()->trans;
72592 +
72593 + result = 0;
72594 + if (atom != NULL) {
72595 + if (atom->stage < ASTAGE_PRE_COMMIT) {
72596 + spin_lock_txnh(txnh);
72597 + capture_assign_txnh_nolock(atom, txnh);
72598 + result = force_commit_atom(txnh);
72599 + } else if (atom->stage < ASTAGE_POST_COMMIT) {
72600 + /* wait atom commit */
72601 + reiser4_atom_wait_event(atom);
72602 + /* try once more */
72603 + result = RETERR(-E_REPEAT);
72604 + } else
72605 + spin_unlock_atom(atom);
72606 + }
72607 + return result;
72608 +}
72609 +
72610 +#if REISER4_DEBUG
72611 +
72612 +/* move jnode form one list to another
72613 + call this after atom->capture_count is updated */
72614 +void
72615 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
72616 + atom_list new_list, int check_lists)
72617 +{
72618 + struct list_head *pos;
72619 +
72620 + assert("zam-1018", atom_is_protected(atom));
72621 + assert_spin_locked(&(node->guard));
72622 + assert("", NODE_LIST(node) == old_list);
72623 +
72624 + switch (NODE_LIST(node)) {
72625 + case NOT_CAPTURED:
72626 + break;
72627 + case DIRTY_LIST:
72628 + assert("", atom->dirty > 0);
72629 + atom->dirty--;
72630 + break;
72631 + case CLEAN_LIST:
72632 + assert("", atom->clean > 0);
72633 + atom->clean--;
72634 + break;
72635 + case FQ_LIST:
72636 + assert("", atom->fq > 0);
72637 + atom->fq--;
72638 + break;
72639 + case WB_LIST:
72640 + assert("", atom->wb > 0);
72641 + atom->wb--;
72642 + break;
72643 + case OVRWR_LIST:
72644 + assert("", atom->ovrwr > 0);
72645 + atom->ovrwr--;
72646 + break;
72647 + default:
72648 + impossible("", "");
72649 + }
72650 +
72651 + switch (new_list) {
72652 + case NOT_CAPTURED:
72653 + break;
72654 + case DIRTY_LIST:
72655 + atom->dirty++;
72656 + break;
72657 + case CLEAN_LIST:
72658 + atom->clean++;
72659 + break;
72660 + case FQ_LIST:
72661 + atom->fq++;
72662 + break;
72663 + case WB_LIST:
72664 + atom->wb++;
72665 + break;
72666 + case OVRWR_LIST:
72667 + atom->ovrwr++;
72668 + break;
72669 + default:
72670 + impossible("", "");
72671 + }
72672 + ASSIGN_NODE_LIST(node, new_list);
72673 + if (0 && check_lists) {
72674 + int count;
72675 + tree_level level;
72676 +
72677 + count = 0;
72678 +
72679 + /* flush queue list */
72680 + /* reiser4_check_fq(atom); */
72681 +
72682 + /* dirty list */
72683 + count = 0;
72684 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72685 + list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
72686 + count++;
72687 + }
72688 + if (count != atom->dirty)
72689 + warning("", "dirty counter %d, real %d\n", atom->dirty,
72690 + count);
72691 +
72692 + /* clean list */
72693 + count = 0;
72694 + list_for_each(pos, ATOM_CLEAN_LIST(atom))
72695 + count++;
72696 + if (count != atom->clean)
72697 + warning("", "clean counter %d, real %d\n", atom->clean,
72698 + count);
72699 +
72700 + /* wb list */
72701 + count = 0;
72702 + list_for_each(pos, ATOM_WB_LIST(atom))
72703 + count++;
72704 + if (count != atom->wb)
72705 + warning("", "wb counter %d, real %d\n", atom->wb,
72706 + count);
72707 +
72708 + /* overwrite list */
72709 + count = 0;
72710 + list_for_each(pos, ATOM_OVRWR_LIST(atom))
72711 + count++;
72712 +
72713 + if (count != atom->ovrwr)
72714 + warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
72715 + count);
72716 + }
72717 + assert("vs-1624", atom->num_queued == atom->fq);
72718 + if (atom->capture_count !=
72719 + atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
72720 + printk
72721 + ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
72722 + atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
72723 + atom->wb, atom->fq);
72724 + assert("vs-1622",
72725 + atom->capture_count ==
72726 + atom->dirty + atom->clean + atom->ovrwr + atom->wb +
72727 + atom->fq);
72728 + }
72729 +}
72730 +
72731 +#endif
72732 +
72733 +/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
72734 + * lock should be taken before calling this function. */
72735 +void jnode_make_wander_nolock(jnode * node)
72736 +{
72737 + txn_atom *atom;
72738 +
72739 + assert("nikita-2431", node != NULL);
72740 + assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
72741 + assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
72742 + assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72743 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72744 +
72745 + atom = node->atom;
72746 +
72747 + assert("zam-895", atom != NULL);
72748 + assert("zam-894", atom_is_protected(atom));
72749 +
72750 + JF_SET(node, JNODE_OVRWR);
72751 + /* move node to atom's overwrite list */
72752 + list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
72753 + ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
72754 +}
72755 +
72756 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
72757 + * this function. */
72758 +void jnode_make_wander(jnode * node)
72759 +{
72760 + txn_atom *atom;
72761 +
72762 + spin_lock_jnode(node);
72763 + atom = jnode_get_atom(node);
72764 + assert("zam-913", atom != NULL);
72765 + assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
72766 +
72767 + jnode_make_wander_nolock(node);
72768 + spin_unlock_atom(atom);
72769 + spin_unlock_jnode(node);
72770 +}
72771 +
72772 +/* this just sets RELOC bit */
72773 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
72774 +{
72775 + assert_spin_locked(&(node->guard));
72776 + assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
72777 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
72778 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
72779 + assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72780 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72781 + jnode_set_reloc(node);
72782 +}
72783 +
72784 +/* Make znode RELOC and put it on flush queue */
72785 +void znode_make_reloc(znode * z, flush_queue_t * fq)
72786 +{
72787 + jnode *node;
72788 + txn_atom *atom;
72789 +
72790 + node = ZJNODE(z);
72791 + spin_lock_jnode(node);
72792 +
72793 + atom = jnode_get_atom(node);
72794 + assert("zam-919", atom != NULL);
72795 +
72796 + jnode_make_reloc_nolock(fq, node);
72797 + queue_jnode(fq, node);
72798 +
72799 + spin_unlock_atom(atom);
72800 + spin_unlock_jnode(node);
72801 +
72802 +}
72803 +
72804 +/* Make unformatted node RELOC and put it on flush queue */
72805 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
72806 +{
72807 + assert("vs-1479", jnode_is_unformatted(node));
72808 +
72809 + jnode_make_reloc_nolock(fq, node);
72810 + queue_jnode(fq, node);
72811 +}
72812 +
72813 +int reiser4_capture_super_block(struct super_block *s)
72814 +{
72815 + int result;
72816 + znode *uber;
72817 + lock_handle lh;
72818 +
72819 + init_lh(&lh);
72820 + result = get_uber_znode(reiser4_get_tree(s),
72821 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
72822 + if (result)
72823 + return result;
72824 +
72825 + uber = lh.node;
72826 + /* Grabbing one block for superblock */
72827 + result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
72828 + if (result != 0)
72829 + return result;
72830 +
72831 + znode_make_dirty(uber);
72832 +
72833 + done_lh(&lh);
72834 + return 0;
72835 +}
72836 +
72837 +/* Wakeup every handle on the atom's WAITFOR list */
72838 +static void wakeup_atom_waitfor_list(txn_atom * atom)
72839 +{
72840 + txn_wait_links *wlinks;
72841 +
72842 + assert("umka-210", atom != NULL);
72843 +
72844 + /* atom is locked */
72845 + list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
72846 + if (wlinks->waitfor_cb == NULL ||
72847 + wlinks->waitfor_cb(atom, wlinks))
72848 + /* Wake up. */
72849 + reiser4_wake_up(wlinks->_lock_stack);
72850 + }
72851 +}
72852 +
72853 +/* Wakeup every handle on the atom's WAITING list */
72854 +static void wakeup_atom_waiting_list(txn_atom * atom)
72855 +{
72856 + txn_wait_links *wlinks;
72857 +
72858 + assert("umka-211", atom != NULL);
72859 +
72860 + /* atom is locked */
72861 + list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
72862 + if (wlinks->waiting_cb == NULL ||
72863 + wlinks->waiting_cb(atom, wlinks))
72864 + /* Wake up. */
72865 + reiser4_wake_up(wlinks->_lock_stack);
72866 + }
72867 +}
72868 +
72869 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
72870 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
72871 +{
72872 + assert("nikita-3330", atom != NULL);
72873 + assert_spin_locked(&(atom->alock));
72874 +
72875 + /* atom->txnh_count == 1 is for waking waiters up if we are releasing
72876 + * last transaction handle. */
72877 + return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
72878 +}
72879 +
72880 +/* The general purpose of this function is to wait on the first of two possible events.
72881 + The situation is that a handle (and its atom atomh) is blocked trying to capture a
72882 + block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
72883 + handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
72884 + another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
72885 + needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
72886 + proceed and fuse the two atoms in the CAPTURE_WAIT state.
72887 +
72888 + In other words, if either atomh or atomf change state, the handle will be awakened,
72889 + thus there are two lists per atom: WAITING and WAITFOR.
72890 +
72891 + This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
72892 + close but it is not assigned to an atom of its own.
72893 +
72894 + Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
72895 + BOTH_ATOM_LOCKS. Result: all four locks are released.
72896 +*/
72897 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
72898 + txn_atom * atomh, txn_capture mode)
72899 +{
72900 + int ret;
72901 + txn_wait_links wlinks;
72902 +
72903 + assert("umka-213", txnh != NULL);
72904 + assert("umka-214", atomf != NULL);
72905 +
72906 + if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
72907 + spin_unlock_txnh(txnh);
72908 + spin_unlock_atom(atomf);
72909 +
72910 + if (atomh) {
72911 + spin_unlock_atom(atomh);
72912 + }
72913 +
72914 + return RETERR(-E_BLOCK);
72915 + }
72916 +
72917 + /* Initialize the waiting list links. */
72918 + init_wlinks(&wlinks);
72919 +
72920 + /* Add txnh to atomf's waitfor list, unlock atomf. */
72921 + list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
72922 + wlinks.waitfor_cb = wait_for_fusion;
72923 + atomic_inc(&atomf->refcount);
72924 + spin_unlock_atom(atomf);
72925 +
72926 + if (atomh) {
72927 + /* Add txnh to atomh's waiting list, unlock atomh. */
72928 + list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
72929 + atomic_inc(&atomh->refcount);
72930 + spin_unlock_atom(atomh);
72931 + }
72932 +
72933 + /* Go to sleep. */
72934 + spin_unlock_txnh(txnh);
72935 +
72936 + ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
72937 + if (ret == 0) {
72938 + reiser4_go_to_sleep(wlinks._lock_stack);
72939 + ret = RETERR(-E_REPEAT);
72940 + }
72941 +
72942 + /* Remove from the waitfor list. */
72943 + spin_lock_atom(atomf);
72944 +
72945 + list_del(&wlinks._fwaitfor_link);
72946 + atom_dec_and_unlock(atomf);
72947 +
72948 + if (atomh) {
72949 + /* Remove from the waiting list. */
72950 + spin_lock_atom(atomh);
72951 + list_del(&wlinks._fwaiting_link);
72952 + atom_dec_and_unlock(atomh);
72953 + }
72954 + return ret;
72955 +}
72956 +
72957 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
72958 +{
72959 + assert("zam-1067", one != two);
72960 +
72961 + /* lock the atom with lesser address first */
72962 + if (one < two) {
72963 + spin_lock_atom(one);
72964 + spin_lock_atom_nested(two);
72965 + } else {
72966 + spin_lock_atom(two);
72967 + spin_lock_atom_nested(one);
72968 + }
72969 +}
72970 +
72971 +/* Perform the necessary work to prepare for fusing two atoms, which involves
72972 + * acquiring two atom locks in the proper order. If one of the node's atom is
72973 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
72974 + * atom is not then the handle's request is put to sleep. If the node's atom
72975 + * is committing, then the node can be copy-on-captured. Otherwise, pick the
72976 + * atom with fewer pointers to be fused into the atom with more pointer and
72977 + * call capture_fuse_into.
72978 + */
72979 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
72980 +{
72981 + txn_atom * txnh_atom = txnh->atom;
72982 + txn_atom * block_atom = node->atom;
72983 +
72984 + atomic_inc(&txnh_atom->refcount);
72985 + atomic_inc(&block_atom->refcount);
72986 +
72987 + spin_unlock_txnh(txnh);
72988 + spin_unlock_jnode(node);
72989 +
72990 + lock_two_atoms(txnh_atom, block_atom);
72991 +
72992 + if (txnh->atom != txnh_atom || node->atom != block_atom ) {
72993 + release_two_atoms(txnh_atom, block_atom);
72994 + return RETERR(-E_REPEAT);
72995 + }
72996 +
72997 + atomic_dec(&txnh_atom->refcount);
72998 + atomic_dec(&block_atom->refcount);
72999 +
73000 + assert ("zam-1066", atom_isopen(txnh_atom));
73001 +
73002 + if (txnh_atom->stage >= block_atom->stage ||
73003 + (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
73004 + capture_fuse_into(txnh_atom, block_atom);
73005 + return RETERR(-E_REPEAT);
73006 + }
73007 + spin_lock_txnh(txnh);
73008 + return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
73009 +}
73010 +
73011 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
73012 + the small list to point to the large atom. Returns the length of the list. */
73013 +static int
73014 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
73015 + struct list_head *small_head)
73016 +{
73017 + int count = 0;
73018 + jnode *node;
73019 +
73020 + assert("umka-218", large != NULL);
73021 + assert("umka-219", large_head != NULL);
73022 + assert("umka-220", small_head != NULL);
73023 + /* small atom should be locked also. */
73024 + assert_spin_locked(&(large->alock));
73025 +
73026 + /* For every jnode on small's capture list... */
73027 + list_for_each_entry(node, small_head, capture_link) {
73028 + count += 1;
73029 +
73030 + /* With the jnode lock held, update atom pointer. */
73031 + spin_lock_jnode(node);
73032 + node->atom = large;
73033 + spin_unlock_jnode(node);
73034 + }
73035 +
73036 + /* Splice the lists. */
73037 + list_splice_init(small_head, large_head->prev);
73038 +
73039 + return count;
73040 +}
73041 +
73042 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
73043 + the small list to point to the large atom. Returns the length of the list. */
73044 +static int
73045 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
73046 + struct list_head *small_head)
73047 +{
73048 + int count = 0;
73049 + txn_handle *txnh;
73050 +
73051 + assert("umka-221", large != NULL);
73052 + assert("umka-222", large_head != NULL);
73053 + assert("umka-223", small_head != NULL);
73054 +
73055 + /* Adjust every txnh to the new atom. */
73056 + list_for_each_entry(txnh, small_head, txnh_link) {
73057 + count += 1;
73058 +
73059 + /* With the txnh lock held, update atom pointer. */
73060 + spin_lock_txnh(txnh);
73061 + txnh->atom = large;
73062 + spin_unlock_txnh(txnh);
73063 + }
73064 +
73065 + /* Splice the txn_handle list. */
73066 + list_splice_init(small_head, large_head->prev);
73067 +
73068 + return count;
73069 +}
73070 +
73071 +/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
73072 + added to LARGE and their ->atom pointers are all updated. The associated counts are
73073 + updated as well, and any waiting handles belonging to either are awakened. Finally the
73074 + smaller atom's refcount is decremented.
73075 +*/
73076 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
73077 +{
73078 + int level;
73079 + unsigned zcount = 0;
73080 + unsigned tcount = 0;
73081 +
73082 + assert("umka-224", small != NULL);
73083 + assert("umka-225", small != NULL);
73084 +
73085 + assert_spin_locked(&(large->alock));
73086 + assert_spin_locked(&(small->alock));
73087 +
73088 + assert("jmacd-201", atom_isopen(small));
73089 + assert("jmacd-202", atom_isopen(large));
73090 +
73091 + /* Splice and update the per-level dirty jnode lists */
73092 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73093 + zcount +=
73094 + capture_fuse_jnode_lists(large,
73095 + ATOM_DIRTY_LIST(large, level),
73096 + ATOM_DIRTY_LIST(small, level));
73097 + }
73098 +
73099 + /* Splice and update the [clean,dirty] jnode and txnh lists */
73100 + zcount +=
73101 + capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
73102 + ATOM_CLEAN_LIST(small));
73103 + zcount +=
73104 + capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
73105 + ATOM_OVRWR_LIST(small));
73106 + zcount +=
73107 + capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
73108 + ATOM_WB_LIST(small));
73109 + zcount +=
73110 + capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
73111 + tcount +=
73112 + capture_fuse_txnh_lists(large, &large->txnh_list,
73113 + &small->txnh_list);
73114 +
73115 + /* Check our accounting. */
73116 + assert("jmacd-1063",
73117 + zcount + small->num_queued == small->capture_count);
73118 + assert("jmacd-1065", tcount == small->txnh_count);
73119 +
73120 + /* sum numbers of waiters threads */
73121 + large->nr_waiters += small->nr_waiters;
73122 + small->nr_waiters = 0;
73123 +
73124 + /* splice flush queues */
73125 + reiser4_fuse_fq(large, small);
73126 +
73127 + /* update counter of jnode on every atom' list */
73128 + ON_DEBUG(large->dirty += small->dirty;
73129 + small->dirty = 0;
73130 + large->clean += small->clean;
73131 + small->clean = 0;
73132 + large->ovrwr += small->ovrwr;
73133 + small->ovrwr = 0;
73134 + large->wb += small->wb;
73135 + small->wb = 0;
73136 + large->fq += small->fq;
73137 + small->fq = 0;);
73138 +
73139 + /* count flushers in result atom */
73140 + large->nr_flushers += small->nr_flushers;
73141 + small->nr_flushers = 0;
73142 +
73143 + /* update counts of flushed nodes */
73144 + large->flushed += small->flushed;
73145 + small->flushed = 0;
73146 +
73147 + /* Transfer list counts to large. */
73148 + large->txnh_count += small->txnh_count;
73149 + large->capture_count += small->capture_count;
73150 +
73151 + /* Add all txnh references to large. */
73152 + atomic_add(small->txnh_count, &large->refcount);
73153 + atomic_sub(small->txnh_count, &small->refcount);
73154 +
73155 + /* Reset small counts */
73156 + small->txnh_count = 0;
73157 + small->capture_count = 0;
73158 +
73159 + /* Assign the oldest start_time, merge flags. */
73160 + large->start_time = min(large->start_time, small->start_time);
73161 + large->flags |= small->flags;
73162 +
73163 + /* Merge blocknr sets. */
73164 + blocknr_set_merge(&small->delete_set, &large->delete_set);
73165 + blocknr_set_merge(&small->wandered_map, &large->wandered_map);
73166 +
73167 + /* Merge allocated/deleted file counts */
73168 + large->nr_objects_deleted += small->nr_objects_deleted;
73169 + large->nr_objects_created += small->nr_objects_created;
73170 +
73171 + small->nr_objects_deleted = 0;
73172 + small->nr_objects_created = 0;
73173 +
73174 + /* Merge allocated blocks counts */
73175 + large->nr_blocks_allocated += small->nr_blocks_allocated;
73176 +
73177 + large->nr_running_queues += small->nr_running_queues;
73178 + small->nr_running_queues = 0;
73179 +
73180 + /* Merge blocks reserved for overwrite set. */
73181 + large->flush_reserved += small->flush_reserved;
73182 + small->flush_reserved = 0;
73183 +
73184 + if (large->stage < small->stage) {
73185 + /* Large only needs to notify if it has changed state. */
73186 + reiser4_atom_set_stage(large, small->stage);
73187 + wakeup_atom_waiting_list(large);
73188 + }
73189 +
73190 + reiser4_atom_set_stage(small, ASTAGE_INVALID);
73191 +
73192 + /* Notify any waiters--small needs to unload its wait lists. Waiters
73193 + actually remove themselves from the list before returning from the
73194 + fuse_wait function. */
73195 + wakeup_atom_waiting_list(small);
73196 +
73197 + /* Unlock atoms */
73198 + spin_unlock_atom(large);
73199 + atom_dec_and_unlock(small);
73200 +}
73201 +
73202 +/* TXNMGR STUFF */
73203 +
73204 +/* Release a block from the atom, reversing the effects of being captured,
73205 + do not release atom's reference to jnode due to holding spin-locks.
73206 + Currently this is only called when the atom commits.
73207 +
73208 + NOTE: this function does not release a (journal) reference to jnode
73209 + due to locking optimizations, you should call jput() somewhere after
73210 + calling reiser4_uncapture_block(). */
73211 +void reiser4_uncapture_block(jnode * node)
73212 +{
73213 + txn_atom *atom;
73214 +
73215 + assert("umka-226", node != NULL);
73216 + atom = node->atom;
73217 + assert("umka-228", atom != NULL);
73218 +
73219 + assert("jmacd-1021", node->atom == atom);
73220 + assert_spin_locked(&(node->guard));
73221 + assert("jmacd-1023", atom_is_protected(atom));
73222 +
73223 + JF_CLR(node, JNODE_DIRTY);
73224 + JF_CLR(node, JNODE_RELOC);
73225 + JF_CLR(node, JNODE_OVRWR);
73226 + JF_CLR(node, JNODE_CREATED);
73227 + JF_CLR(node, JNODE_WRITEBACK);
73228 + JF_CLR(node, JNODE_REPACK);
73229 +
73230 + list_del_init(&node->capture_link);
73231 + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73232 + assert("zam-925", atom_isopen(atom));
73233 + assert("vs-1623", NODE_LIST(node) == FQ_LIST);
73234 + ON_DEBUG(atom->num_queued--);
73235 + JF_CLR(node, JNODE_FLUSH_QUEUED);
73236 + }
73237 + atom->capture_count -= 1;
73238 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
73239 + node->atom = NULL;
73240 +
73241 + spin_unlock_jnode(node);
73242 + LOCK_CNT_DEC(t_refs);
73243 +}
73244 +
73245 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
73246 + bitmap-based allocator code for adding modified bitmap blocks the
73247 + transaction. @atom and @node are spin locked */
73248 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
73249 +{
73250 + assert("zam-538", atom_is_protected(atom));
73251 + assert_spin_locked(&(node->guard));
73252 + assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
73253 + assert("zam-543", node->atom == NULL);
73254 + assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
73255 +
73256 + list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
73257 + jref(node);
73258 + node->atom = atom;
73259 + atom->capture_count++;
73260 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
73261 +}
73262 +
73263 +static int count_deleted_blocks_actor(txn_atom * atom,
73264 + const reiser4_block_nr * a,
73265 + const reiser4_block_nr * b, void *data)
73266 +{
73267 + reiser4_block_nr *counter = data;
73268 +
73269 + assert("zam-995", data != NULL);
73270 + assert("zam-996", a != NULL);
73271 + if (b == NULL)
73272 + *counter += 1;
73273 + else
73274 + *counter += *b;
73275 + return 0;
73276 +}
73277 +
73278 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
73279 +{
73280 + reiser4_block_nr result;
73281 + txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73282 + txn_atom *atom;
73283 +
73284 + result = 0;
73285 +
73286 + spin_lock_txnmgr(tmgr);
73287 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73288 + spin_lock_atom(atom);
73289 + if (atom_isopen(atom))
73290 + blocknr_set_iterator(
73291 + atom, &atom->delete_set,
73292 + count_deleted_blocks_actor, &result, 0);
73293 + spin_unlock_atom(atom);
73294 + }
73295 + spin_unlock_txnmgr(tmgr);
73296 +
73297 + return result;
73298 +}
73299 +
73300 +/*
73301 + * Local variables:
73302 + * c-indentation-style: "K&R"
73303 + * mode-name: "LC"
73304 + * c-basic-offset: 8
73305 + * tab-width: 8
73306 + * fill-column: 79
73307 + * End:
73308 + */
73309 diff -urN linux-2.6.23.orig/fs/reiser4/txnmgr.h linux-2.6.23/fs/reiser4/txnmgr.h
73310 --- linux-2.6.23.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
73311 +++ linux-2.6.23/fs/reiser4/txnmgr.h 2007-12-04 16:49:30.000000000 +0300
73312 @@ -0,0 +1,701 @@
73313 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73314 + * reiser4/README */
73315 +
73316 +/* data-types and function declarations for transaction manager. See txnmgr.c
73317 + * for details. */
73318 +
73319 +#ifndef __REISER4_TXNMGR_H__
73320 +#define __REISER4_TXNMGR_H__
73321 +
73322 +#include "forward.h"
73323 +#include "dformat.h"
73324 +
73325 +#include <linux/fs.h>
73326 +#include <linux/mm.h>
73327 +#include <linux/types.h>
73328 +#include <linux/spinlock.h>
73329 +#include <asm/atomic.h>
73330 +#include <linux/wait.h>
73331 +
73332 +/* TYPE DECLARATIONS */
73333 +
73334 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
73335 + A capture request dynamically assigns a block to the calling thread's transaction
73336 + handle. */
73337 +typedef enum {
73338 + /* A READ_ATOMIC request indicates that a block will be read and that the caller's
73339 + atom should fuse in order to ensure that the block commits atomically with the
73340 + caller. */
73341 + TXN_CAPTURE_READ_ATOMIC = (1 << 0),
73342 +
73343 + /* A READ_NONCOM request indicates that a block will be read and that the caller is
73344 + willing to read a non-committed block without causing atoms to fuse. */
73345 + TXN_CAPTURE_READ_NONCOM = (1 << 1),
73346 +
73347 + /* A READ_MODIFY request indicates that a block will be read but that the caller
73348 + wishes for the block to be captured as it will be written. This capture request
73349 + mode is not currently used, but eventually it will be useful for preventing
73350 + deadlock in read-modify-write cycles. */
73351 + TXN_CAPTURE_READ_MODIFY = (1 << 2),
73352 +
73353 + /* A WRITE capture request indicates that a block will be modified and that atoms
73354 + should fuse to make the commit atomic. */
73355 + TXN_CAPTURE_WRITE = (1 << 3),
73356 +
73357 + /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
73358 + exclusive type designation from extra bits that may be supplied -- see
73359 + below. */
73360 + TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
73361 + TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
73362 + TXN_CAPTURE_WRITE),
73363 +
73364 + /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
73365 + indicate modification will occur. */
73366 + TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
73367 +
73368 + /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
73369 + prefer not to sleep waiting for an aging atom to commit. */
73370 + TXN_CAPTURE_NONBLOCKING = (1 << 4),
73371 +
73372 + /* An option to reiser4_try_capture to prevent atom fusion, just simple
73373 + capturing is allowed */
73374 + TXN_CAPTURE_DONT_FUSE = (1 << 5)
73375 +
73376 + /* This macro selects only the exclusive capture request types, stripping out any
73377 + options that were supplied (i.e., NONBLOCKING). */
73378 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
73379 +} txn_capture;
73380 +
73381 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
73382 + difference is in the handling of read requests. A WRITE_FUSING transaction handle
73383 + defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSIONG
73384 + transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
73385 +typedef enum {
73386 + TXN_WRITE_FUSING = (1 << 0),
73387 + TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
73388 +} txn_mode;
73389 +
73390 +/* Every atom has a stage, which is one of these exclusive values: */
73391 +typedef enum {
73392 + /* Initially an atom is free. */
73393 + ASTAGE_FREE = 0,
73394 +
73395 + /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
73396 + blocks and fuse with other atoms. */
73397 + ASTAGE_CAPTURE_FUSE = 1,
73398 +
73399 + /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
73400 +
73401 + /* When an atom reaches a certain age it must do all it can to commit. An atom in
73402 + the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
73403 + atoms in the CAPTURE_FUSE stage. */
73404 + ASTAGE_CAPTURE_WAIT = 2,
73405 +
73406 + /* Waiting for I/O before commit. Copy-on-capture (see
73407 + http://namesys.com/v4/v4.html). */
73408 + ASTAGE_PRE_COMMIT = 3,
73409 +
73410 + /* Post-commit overwrite I/O. Steal-on-capture. */
73411 + ASTAGE_POST_COMMIT = 4,
73412 +
73413 + /* Atom which waits for the removal of the last reference to (it? ) to
73414 + * be deleted from memory */
73415 + ASTAGE_DONE = 5,
73416 +
73417 + /* invalid atom. */
73418 + ASTAGE_INVALID = 6,
73419 +
73420 +} txn_stage;
73421 +
73422 +/* Certain flags may be set in the txn_atom->flags field. */
73423 +typedef enum {
73424 + /* Indicates that the atom should commit as soon as possible. */
73425 + ATOM_FORCE_COMMIT = (1 << 0),
73426 + /* to avoid endless loop, mark the atom (which was considered as too
73427 + * small) after failed attempt to fuse it. */
73428 + ATOM_CANCEL_FUSION = (1 << 1)
73429 +} txn_flags;
73430 +
73431 +/* Flags for controlling commit_txnh */
73432 +typedef enum {
73433 + /* Wait commit atom completion in commit_txnh */
73434 + TXNH_WAIT_COMMIT = 0x2,
73435 + /* Don't commit atom when this handle is closed */
73436 + TXNH_DONT_COMMIT = 0x4
73437 +} txn_handle_flags_t;
73438 +
73439 +/* TYPE DEFINITIONS */
73440 +
73441 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
73442 + fields, so typically an operation on the atom through either of these objects must (1)
73443 + lock the object, (2) read the atom pointer, (3) lock the atom.
73444 +
73445 + During atom fusion, the process holds locks on both atoms at once. Then, it iterates
73446 + through the list of handles and pages held by the smaller of the two atoms. For each
73447 + handle and page referencing the smaller atom, the fusing process must: (1) lock the
73448 + object, and (2) update the atom pointer.
73449 +
73450 + You can see that there is a conflict of lock ordering here, so the more-complex
73451 + procedure should have priority, i.e., the fusing process has priority so that it is
73452 + guaranteed to make progress and to avoid restarts.
73453 +
73454 + This decision, however, means additional complexity for acquiring the atom lock in the
73455 + first place.
73456 +
73457 + The general original procedure followed in the code was:
73458 +
73459 + TXN_OBJECT *obj = ...;
73460 + TXN_ATOM *atom;
73461 +
73462 + spin_lock (& obj->_lock);
73463 +
73464 + atom = obj->_atom;
73465 +
73466 + if (! spin_trylock_atom (atom))
73467 + {
73468 + spin_unlock (& obj->_lock);
73469 + RESTART OPERATION, THERE WAS A RACE;
73470 + }
73471 +
73472 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73473 +
73474 + It has however been found that this wastes CPU a lot in a manner that is
73475 + hard to profile. So, proper refcounting was added to atoms, and new
73476 + standard locking sequence is like following:
73477 +
73478 + TXN_OBJECT *obj = ...;
73479 + TXN_ATOM *atom;
73480 +
73481 + spin_lock (& obj->_lock);
73482 +
73483 + atom = obj->_atom;
73484 +
73485 + if (! spin_trylock_atom (atom))
73486 + {
73487 + atomic_inc (& atom->refcount);
73488 + spin_unlock (& obj->_lock);
73489 + spin_lock (&atom->_lock);
73490 + atomic_dec (& atom->refcount);
73491 + // HERE atom is locked
73492 + spin_unlock (&atom->_lock);
73493 + RESTART OPERATION, THERE WAS A RACE;
73494 + }
73495 +
73496 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73497 +
73498 + (core of this is implemented in trylock_throttle() function)
73499 +
73500 + See the jnode_get_atom() function for a common case.
73501 +
73502 + As an additional (and important) optimization allowing to avoid restarts,
73503 + it is possible to re-check required pre-conditions at the HERE point in
73504 + code above and proceed without restarting if they are still satisfied.
73505 +*/
73506 +
73507 +/* An atomic transaction: this is the underlying system representation
73508 + of a transaction, not the one seen by clients.
73509 +
73510 + Invariants involving this data-type:
73511 +
73512 + [sb-fake-allocated]
73513 +*/
73514 +struct txn_atom {
73515 + /* The spinlock protecting the atom, held during fusion and various other state
73516 + changes. */
73517 + spinlock_t alock;
73518 +
73519 + /* The atom's reference counter, increasing (in case of a duplication
73520 + of an existing reference or when we are sure that some other
73521 + reference exists) may be done without taking spinlock, decrementing
73522 + of the ref. counter requires a spinlock to be held.
73523 +
73524 + Each transaction handle counts in ->refcount. All jnodes count as
73525 + one reference acquired in atom_begin_andlock(), released in
73526 + commit_current_atom().
73527 + */
73528 + atomic_t refcount;
73529 +
73530 + /* The atom_id identifies the atom in persistent records such as the log. */
73531 + __u32 atom_id;
73532 +
73533 + /* Flags holding any of the txn_flags enumerated values (e.g.,
73534 + ATOM_FORCE_COMMIT). */
73535 + __u32 flags;
73536 +
73537 + /* Number of open handles. */
73538 + __u32 txnh_count;
73539 +
73540 + /* The number of znodes captured by this atom. Equal to the sum of lengths of the
73541 + dirty_nodes[level] and clean_nodes lists. */
73542 + __u32 capture_count;
73543 +
73544 +#if REISER4_DEBUG
73545 + int clean;
73546 + int dirty;
73547 + int ovrwr;
73548 + int wb;
73549 + int fq;
73550 +#endif
73551 +
73552 + __u32 flushed;
73553 +
73554 + /* Current transaction stage. */
73555 + txn_stage stage;
73556 +
73557 + /* Start time. */
73558 + unsigned long start_time;
73559 +
73560 + /* The atom's delete set. It collects block numbers of the nodes
73561 + which were deleted during the transaction. */
73562 + struct list_head delete_set;
73563 +
73564 + /* The atom's wandered_block mapping. */
73565 + struct list_head wandered_map;
73566 +
73567 + /* The transaction's list of dirty captured nodes--per level. Index
73568 + by (level). dirty_nodes[0] is for znode-above-root */
73569 + struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
73570 +
73571 + /* The transaction's list of clean captured nodes. */
73572 + struct list_head clean_nodes;
73573 +
73574 + /* The atom's overwrite set */
73575 + struct list_head ovrwr_nodes;
73576 +
73577 + /* nodes which are being written to disk */
73578 + struct list_head writeback_nodes;
73579 +
73580 + /* list of inodes */
73581 + struct list_head inodes;
73582 +
73583 + /* List of handles associated with this atom. */
73584 + struct list_head txnh_list;
73585 +
73586 + /* Transaction list link: list of atoms in the transaction manager. */
73587 + struct list_head atom_link;
73588 +
73589 + /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
73590 + struct list_head fwaitfor_list;
73591 +
73592 + /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
73593 + struct list_head fwaiting_list;
73594 +
73595 + /* Numbers of objects which were deleted/created in this transaction
73596 + thereby numbers of objects IDs which were released/deallocated. */
73597 + int nr_objects_deleted;
73598 + int nr_objects_created;
73599 + /* number of blocks allocated during the transaction */
73600 + __u64 nr_blocks_allocated;
73601 + /* All atom's flush queue objects are on this list */
73602 + struct list_head flush_queues;
73603 +#if REISER4_DEBUG
73604 + /* number of flush queues for this atom. */
73605 + int nr_flush_queues;
73606 + /* Number of jnodes which were removed from atom's lists and put
73607 + on flush_queue */
73608 + int num_queued;
73609 +#endif
73610 + /* number of threads who wait for this atom to complete commit */
73611 + int nr_waiters;
73612 + /* number of threads which do jnode_flush() over this atom */
73613 + int nr_flushers;
73614 + /* number of flush queues which are IN_USE and jnodes from fq->prepped
73615 + are submitted to disk by the reiser4_write_fq() routine. */
73616 + int nr_running_queues;
73617 + /* A counter of grabbed unformatted nodes, see a description of the
73618 + * reiser4 space reservation scheme at block_alloc.c */
73619 + reiser4_block_nr flush_reserved;
73620 +#if REISER4_DEBUG
73621 + void *committer;
73622 +#endif
73623 + struct super_block *super;
73624 +};
73625 +
73626 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
73627 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
73628 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
73629 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
73630 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
73631 +
73632 +#define NODE_LIST(node) (node)->list
73633 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
73634 +ON_DEBUG(void
73635 + count_jnode(txn_atom *, jnode *, atom_list old_list,
73636 + atom_list new_list, int check_lists));
73637 +
73638 +/* A transaction handle: the client obtains and commits this handle which is assigned by
73639 + the system to a txn_atom. */
73640 +struct txn_handle {
73641 + /* Spinlock protecting ->atom pointer */
73642 + spinlock_t hlock;
73643 +
73644 + /* Flags for controlling commit_txnh() behavior */
73645 + /* from txn_handle_flags_t */
73646 + txn_handle_flags_t flags;
73647 +
73648 + /* Whether it is READ_FUSING or WRITE_FUSING. */
73649 + txn_mode mode;
73650 +
73651 + /* If assigned, the atom it is part of. */
73652 + txn_atom *atom;
73653 +
73654 + /* Transaction list link. Head is in txn_atom. */
73655 + struct list_head txnh_link;
73656 +};
73657 +
73658 +/* The transaction manager: one is contained in the reiser4_super_info_data */
73659 +struct txn_mgr {
73660 + /* A spinlock protecting the atom list, id_count, flush_control */
73661 + spinlock_t tmgr_lock;
73662 +
73663 + /* List of atoms. */
73664 + struct list_head atoms_list;
73665 +
73666 + /* Number of atoms. */
73667 + int atom_count;
73668 +
73669 + /* A counter used to assign atom->atom_id values. */
73670 + __u32 id_count;
73671 +
73672 + /* a mutex object for commit serialization */
73673 + struct mutex commit_mutex;
73674 +
73675 + /* a list of all txnmrgs served by particular daemon. */
73676 + struct list_head linkage;
73677 +
73678 + /* description of daemon for this txnmgr */
73679 + ktxnmgrd_context *daemon;
73680 +
73681 + /* parameters. Adjustable through mount options. */
73682 + unsigned int atom_max_size;
73683 + unsigned int atom_max_age;
73684 + unsigned int atom_min_size;
73685 + /* max number of concurrent flushers for one atom, 0 - unlimited. */
73686 + unsigned int atom_max_flushers;
73687 + struct dentry *debugfs_atom_count;
73688 + struct dentry *debugfs_id_count;
73689 +};
73690 +
73691 +/* FUNCTION DECLARATIONS */
73692 +
73693 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
73694 + are prefixed with "txn_". For comments, see txnmgr.c. */
73695 +
73696 +extern int init_txnmgr_static(void);
73697 +extern void done_txnmgr_static(void);
73698 +
73699 +extern void reiser4_init_txnmgr(txn_mgr *);
73700 +extern void reiser4_done_txnmgr(txn_mgr *);
73701 +
73702 +extern int reiser4_txn_reserve(int reserved);
73703 +
73704 +extern void reiser4_txn_begin(reiser4_context * context);
73705 +extern int reiser4_txn_end(reiser4_context * context);
73706 +
73707 +extern void reiser4_txn_restart(reiser4_context * context);
73708 +extern void reiser4_txn_restart_current(void);
73709 +
73710 +extern int txnmgr_force_commit_all(struct super_block *, int);
73711 +extern int current_atom_should_commit(void);
73712 +
73713 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
73714 +
73715 +extern int commit_some_atoms(txn_mgr *);
73716 +extern int force_commit_atom(txn_handle *);
73717 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
73718 +
73719 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
73720 +
73721 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
73722 +
73723 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
73724 + int alloc_value);
73725 +extern void atom_dec_and_unlock(txn_atom * atom);
73726 +
73727 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
73728 +extern int try_capture_page_to_invalidate(struct page *pg);
73729 +
73730 +extern void reiser4_uncapture_page(struct page *pg);
73731 +extern void reiser4_uncapture_block(jnode *);
73732 +extern void reiser4_uncapture_jnode(jnode *);
73733 +
73734 +extern int reiser4_capture_inode(struct inode *);
73735 +extern int reiser4_uncapture_inode(struct inode *);
73736 +
73737 +extern txn_atom *get_current_atom_locked_nocheck(void);
73738 +
73739 +#if REISER4_DEBUG
73740 +
73741 +/**
73742 + * atom_is_protected - make sure that nobody but us can do anything with atom
73743 + * @atom: atom to be checked
73744 + *
73745 + * This is used to assert that atom either entered commit stages or is spin
73746 + * locked.
73747 + */
73748 +static inline int atom_is_protected(txn_atom *atom)
73749 +{
73750 + if (atom->stage >= ASTAGE_PRE_COMMIT)
73751 + return 1;
73752 + assert_spin_locked(&(atom->alock));
73753 + return 1;
73754 +}
73755 +
73756 +#endif
73757 +
73758 +/* Get the current atom and spinlock it if current atom present. May not return NULL */
73759 +static inline txn_atom *get_current_atom_locked(void)
73760 +{
73761 + txn_atom *atom;
73762 +
73763 + atom = get_current_atom_locked_nocheck();
73764 + assert("zam-761", atom != NULL);
73765 +
73766 + return atom;
73767 +}
73768 +
73769 +extern txn_atom *jnode_get_atom(jnode *);
73770 +
73771 +extern void reiser4_atom_wait_event(txn_atom *);
73772 +extern void reiser4_atom_send_event(txn_atom *);
73773 +
73774 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
73775 +extern int reiser4_capture_super_block(struct super_block *s);
73776 +int capture_bulk(jnode **, int count);
73777 +
73778 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
73779 + calling convention of these three routines. */
73780 +extern void blocknr_set_init(struct list_head * bset);
73781 +extern void blocknr_set_destroy(struct list_head * bset);
73782 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
73783 +extern int blocknr_set_add_extent(txn_atom * atom,
73784 + struct list_head * bset,
73785 + blocknr_set_entry ** new_bsep,
73786 + const reiser4_block_nr * start,
73787 + const reiser4_block_nr * len);
73788 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
73789 + blocknr_set_entry ** new_bsep,
73790 + const reiser4_block_nr * a,
73791 + const reiser4_block_nr * b);
73792 +
73793 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
73794 + const reiser4_block_nr *, void *);
73795 +
73796 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
73797 + blocknr_set_actor_f actor, void *data,
73798 + int delete);
73799 +
73800 +/* flush code takes care about how to fuse flush queues */
73801 +extern void flush_init_atom(txn_atom * atom);
73802 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
73803 +
73804 +static inline void spin_lock_atom(txn_atom *atom)
73805 +{
73806 + /* check that spinlocks of lower priorities are not held */
73807 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73808 + LOCK_CNT_NIL(spin_locked_atom) &&
73809 + LOCK_CNT_NIL(spin_locked_jnode) &&
73810 + LOCK_CNT_NIL(spin_locked_zlock) &&
73811 + LOCK_CNT_NIL(rw_locked_dk) &&
73812 + LOCK_CNT_NIL(rw_locked_tree)));
73813 +
73814 + spin_lock(&(atom->alock));
73815 +
73816 + LOCK_CNT_INC(spin_locked_atom);
73817 + LOCK_CNT_INC(spin_locked);
73818 +}
73819 +
73820 +static inline void spin_lock_atom_nested(txn_atom *atom)
73821 +{
73822 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73823 + LOCK_CNT_NIL(spin_locked_jnode) &&
73824 + LOCK_CNT_NIL(spin_locked_zlock) &&
73825 + LOCK_CNT_NIL(rw_locked_dk) &&
73826 + LOCK_CNT_NIL(rw_locked_tree)));
73827 +
73828 + spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
73829 +
73830 + LOCK_CNT_INC(spin_locked_atom);
73831 + LOCK_CNT_INC(spin_locked);
73832 +}
73833 +
73834 +static inline int spin_trylock_atom(txn_atom *atom)
73835 +{
73836 + if (spin_trylock(&(atom->alock))) {
73837 + LOCK_CNT_INC(spin_locked_atom);
73838 + LOCK_CNT_INC(spin_locked);
73839 + return 1;
73840 + }
73841 + return 0;
73842 +}
73843 +
73844 +static inline void spin_unlock_atom(txn_atom *atom)
73845 +{
73846 + assert_spin_locked(&(atom->alock));
73847 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
73848 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73849 +
73850 + LOCK_CNT_DEC(spin_locked_atom);
73851 + LOCK_CNT_DEC(spin_locked);
73852 +
73853 + spin_unlock(&(atom->alock));
73854 +}
73855 +
73856 +static inline void spin_lock_txnh(txn_handle *txnh)
73857 +{
73858 + /* check that spinlocks of lower priorities are not held */
73859 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
73860 + LOCK_CNT_NIL(spin_locked_zlock) &&
73861 + LOCK_CNT_NIL(rw_locked_tree)));
73862 +
73863 + spin_lock(&(txnh->hlock));
73864 +
73865 + LOCK_CNT_INC(spin_locked_txnh);
73866 + LOCK_CNT_INC(spin_locked);
73867 +}
73868 +
73869 +static inline int spin_trylock_txnh(txn_handle *txnh)
73870 +{
73871 + if (spin_trylock(&(txnh->hlock))) {
73872 + LOCK_CNT_INC(spin_locked_txnh);
73873 + LOCK_CNT_INC(spin_locked);
73874 + return 1;
73875 + }
73876 + return 0;
73877 +}
73878 +
73879 +static inline void spin_unlock_txnh(txn_handle *txnh)
73880 +{
73881 + assert_spin_locked(&(txnh->hlock));
73882 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
73883 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73884 +
73885 + LOCK_CNT_DEC(spin_locked_txnh);
73886 + LOCK_CNT_DEC(spin_locked);
73887 +
73888 + spin_unlock(&(txnh->hlock));
73889 +}
73890 +
73891 +#define spin_ordering_pred_txnmgr(tmgr) \
73892 + ( LOCK_CNT_NIL(spin_locked_atom) && \
73893 + LOCK_CNT_NIL(spin_locked_txnh) && \
73894 + LOCK_CNT_NIL(spin_locked_jnode) && \
73895 + LOCK_CNT_NIL(rw_locked_zlock) && \
73896 + LOCK_CNT_NIL(rw_locked_dk) && \
73897 + LOCK_CNT_NIL(rw_locked_tree) )
73898 +
73899 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
73900 +{
73901 + /* check that spinlocks of lower priorities are not held */
73902 + assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
73903 + LOCK_CNT_NIL(spin_locked_txnh) &&
73904 + LOCK_CNT_NIL(spin_locked_jnode) &&
73905 + LOCK_CNT_NIL(spin_locked_zlock) &&
73906 + LOCK_CNT_NIL(rw_locked_dk) &&
73907 + LOCK_CNT_NIL(rw_locked_tree)));
73908 +
73909 + spin_lock(&(mgr->tmgr_lock));
73910 +
73911 + LOCK_CNT_INC(spin_locked_txnmgr);
73912 + LOCK_CNT_INC(spin_locked);
73913 +}
73914 +
73915 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
73916 +{
73917 + if (spin_trylock(&(mgr->tmgr_lock))) {
73918 + LOCK_CNT_INC(spin_locked_txnmgr);
73919 + LOCK_CNT_INC(spin_locked);
73920 + return 1;
73921 + }
73922 + return 0;
73923 +}
73924 +
73925 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
73926 +{
73927 + assert_spin_locked(&(mgr->tmgr_lock));
73928 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
73929 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73930 +
73931 + LOCK_CNT_DEC(spin_locked_txnmgr);
73932 + LOCK_CNT_DEC(spin_locked);
73933 +
73934 + spin_unlock(&(mgr->tmgr_lock));
73935 +}
73936 +
73937 +typedef enum {
73938 + FQ_IN_USE = 0x1
73939 +} flush_queue_state_t;
73940 +
73941 +typedef struct flush_queue flush_queue_t;
73942 +
73943 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
73944 + is filled by the jnode_flush() routine, and written to disk under memory
73945 + pressure or at atom commit time. */
73946 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
73947 + field and fq->prepped list can be modified if atom is spin-locked and fq
73948 + object is "in-use" state. For read-only traversal of the fq->prepped list
73949 + and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
73950 + only have atom spin-locked. */
73951 +struct flush_queue {
73952 + /* linkage element is the first in this structure to make debugging
73953 + easier. See field in atom struct for description of list. */
73954 + struct list_head alink;
73955 + /* A spinlock to protect changes of fq state and fq->atom pointer */
73956 + spinlock_t guard;
73957 + /* flush_queue state: [in_use | ready] */
73958 + flush_queue_state_t state;
73959 + /* A list which contains queued nodes, queued nodes are removed from any
73960 + * atom's list and put on this ->prepped one. */
73961 + struct list_head prepped;
73962 + /* number of submitted i/o requests */
73963 + atomic_t nr_submitted;
73964 + /* number of i/o errors */
73965 + atomic_t nr_errors;
73966 + /* An atom this flush queue is attached to */
73967 + txn_atom *atom;
73968 + /* A wait queue head to wait on i/o completion */
73969 + wait_queue_head_t wait;
73970 +#if REISER4_DEBUG
73971 + /* A thread which took this fq in exclusive use, NULL if fq is free,
73972 + * used for debugging. */
73973 + struct task_struct *owner;
73974 +#endif
73975 +};
73976 +
73977 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
73978 +extern void reiser4_fq_put_nolock(flush_queue_t *);
73979 +extern void reiser4_fq_put(flush_queue_t *);
73980 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
73981 +extern void queue_jnode(flush_queue_t *, jnode *);
73982 +
73983 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
73984 +extern int current_atom_finish_all_fq(void);
73985 +extern void init_atom_fq_parts(txn_atom *);
73986 +
73987 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
73988 +
73989 +extern void znode_make_dirty(znode * node);
73990 +extern void jnode_make_dirty_locked(jnode * node);
73991 +
73992 +extern int reiser4_sync_atom(txn_atom * atom);
73993 +
73994 +#if REISER4_DEBUG
73995 +extern int atom_fq_parts_are_clean(txn_atom *);
73996 +#endif
73997 +
73998 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
73999 +extern flush_queue_t *get_fq_for_current_atom(void);
74000 +
74001 +void reiser4_invalidate_list(struct list_head * head);
74002 +
74003 +# endif /* __REISER4_TXNMGR_H__ */
74004 +
74005 +/* Make Linus happy.
74006 + Local variables:
74007 + c-indentation-style: "K&R"
74008 + mode-name: "LC"
74009 + c-basic-offset: 8
74010 + tab-width: 8
74011 + fill-column: 120
74012 + End:
74013 +*/
74014 diff -urN linux-2.6.23.orig/fs/reiser4/type_safe_hash.h linux-2.6.23/fs/reiser4/type_safe_hash.h
74015 --- linux-2.6.23.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
74016 +++ linux-2.6.23/fs/reiser4/type_safe_hash.h 2007-12-04 16:49:30.000000000 +0300
74017 @@ -0,0 +1,320 @@
74018 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74019 + * reiser4/README */
74020 +
74021 +/* A hash table class that uses hash chains (singly-linked) and is
74022 + parametrized to provide type safety. */
74023 +
74024 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
74025 +#define __REISER4_TYPE_SAFE_HASH_H__
74026 +
74027 +#include "debug.h"
74028 +
74029 +#include <asm/errno.h>
74030 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
74031 + based on the object type. You need to declare the item type before
74032 + this definition, define it after this definition. */
74033 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
74034 + \
74035 +typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
74036 +typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
74037 + \
74038 +struct PREFIX##_hash_table_ \
74039 +{ \
74040 + ITEM_TYPE **_table; \
74041 + __u32 _buckets; \
74042 +}; \
74043 + \
74044 +struct PREFIX##_hash_link_ \
74045 +{ \
74046 + ITEM_TYPE *_next; \
74047 +}
74048 +
74049 +/* Step 2: Define the object type of the hash: give it field of type
74050 + PREFIX_hash_link. */
74051 +
74052 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
74053 + the type and field name used in step 3. The arguments are:
74054 +
74055 + ITEM_TYPE The item type being hashed
74056 + KEY_TYPE The type of key being hashed
74057 + KEY_NAME The name of the key field within the item
74058 + LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link)
74059 + HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
74060 + EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
74061 +
74062 + It implements these functions:
74063 +
74064 + prefix_hash_init Initialize the table given its size.
74065 + prefix_hash_insert Insert an item
74066 + prefix_hash_insert_index Insert an item w/ precomputed hash_index
74067 + prefix_hash_find Find an item by key
74068 + prefix_hash_find_index Find an item w/ precomputed hash_index
74069 + prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
74070 + prefix_hash_remove_index Remove an item w/ precomputed hash_index
74071 +
74072 + If you'd like something to be done differently, feel free to ask me
74073 + for modifications. Additional features that could be added but
74074 + have not been:
74075 +
74076 + prefix_hash_remove_key Find and remove an item by key
74077 + prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
74078 +
74079 + The hash_function currently receives only the key as an argument,
74080 + meaning it must somehow know the number of buckets. If this is a
74081 + problem let me know.
74082 +
74083 + This hash table uses a single-linked hash chain. This means
74084 + insertion is fast but deletion requires searching the chain.
74085 +
74086 + There is also the doubly-linked hash chain approach, under which
74087 + deletion requires no search but the code is longer and it takes two
74088 + pointers per item.
74089 +
74090 + The circularly-linked approach has the shortest code but requires
74091 + two pointers per bucket, doubling the size of the bucket array (in
74092 + addition to two pointers per item).
74093 +*/
74094 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
74095 + \
74096 +static __inline__ void \
74097 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
74098 + __u32 hash UNUSED_ARG) \
74099 +{ \
74100 + assert("nikita-2780", hash < table->_buckets); \
74101 +} \
74102 + \
74103 +static __inline__ int \
74104 +PREFIX##_hash_init (PREFIX##_hash_table *hash, \
74105 + __u32 buckets) \
74106 +{ \
74107 + hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
74108 + hash->_buckets = buckets; \
74109 + if (hash->_table == NULL) \
74110 + { \
74111 + return RETERR(-ENOMEM); \
74112 + } \
74113 + memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
74114 + ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
74115 + return 0; \
74116 +} \
74117 + \
74118 +static __inline__ void \
74119 +PREFIX##_hash_done (PREFIX##_hash_table *hash) \
74120 +{ \
74121 + if (REISER4_DEBUG && hash->_table != NULL) { \
74122 + __u32 i; \
74123 + for (i = 0 ; i < hash->_buckets ; ++ i) \
74124 + assert("nikita-2905", hash->_table[i] == NULL); \
74125 + } \
74126 + if (hash->_table != NULL) \
74127 + KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
74128 + hash->_table = NULL; \
74129 +} \
74130 + \
74131 +static __inline__ void \
74132 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
74133 +{ \
74134 + prefetch(item->LINK_NAME._next); \
74135 +} \
74136 + \
74137 +static __inline__ void \
74138 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
74139 + __u32 index) \
74140 +{ \
74141 + prefetch(hash->_table[index]); \
74142 +} \
74143 + \
74144 +static __inline__ ITEM_TYPE* \
74145 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
74146 + __u32 hash_index, \
74147 + KEY_TYPE const *find_key) \
74148 +{ \
74149 + ITEM_TYPE *item; \
74150 + \
74151 + PREFIX##_check_hash(hash, hash_index); \
74152 + \
74153 + for (item = hash->_table[hash_index]; \
74154 + item != NULL; \
74155 + item = item->LINK_NAME._next) \
74156 + { \
74157 + prefetch(item->LINK_NAME._next); \
74158 + prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
74159 + if (EQ_FUNC (& item->KEY_NAME, find_key)) \
74160 + { \
74161 + return item; \
74162 + } \
74163 + } \
74164 + \
74165 + return NULL; \
74166 +} \
74167 + \
74168 +static __inline__ ITEM_TYPE* \
74169 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
74170 + __u32 hash_index, \
74171 + KEY_TYPE const *find_key) \
74172 +{ \
74173 + ITEM_TYPE ** item = &hash->_table[hash_index]; \
74174 + \
74175 + PREFIX##_check_hash(hash, hash_index); \
74176 + \
74177 + while (*item != NULL) { \
74178 + prefetch(&(*item)->LINK_NAME._next); \
74179 + if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
74180 + ITEM_TYPE *found; \
74181 + \
74182 + found = *item; \
74183 + *item = found->LINK_NAME._next; \
74184 + found->LINK_NAME._next = hash->_table[hash_index]; \
74185 + hash->_table[hash_index] = found; \
74186 + return found; \
74187 + } \
74188 + item = &(*item)->LINK_NAME._next; \
74189 + } \
74190 + return NULL; \
74191 +} \
74192 + \
74193 +static __inline__ int \
74194 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
74195 + __u32 hash_index, \
74196 + ITEM_TYPE *del_item) \
74197 +{ \
74198 + ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
74199 + \
74200 + PREFIX##_check_hash(hash, hash_index); \
74201 + \
74202 + while (*hash_item_p != NULL) { \
74203 + prefetch(&(*hash_item_p)->LINK_NAME._next); \
74204 + if (*hash_item_p == del_item) { \
74205 + *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
74206 + return 1; \
74207 + } \
74208 + hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
74209 + } \
74210 + return 0; \
74211 +} \
74212 + \
74213 +static __inline__ void \
74214 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
74215 + __u32 hash_index, \
74216 + ITEM_TYPE *ins_item) \
74217 +{ \
74218 + PREFIX##_check_hash(hash, hash_index); \
74219 + \
74220 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74221 + hash->_table[hash_index] = ins_item; \
74222 +} \
74223 + \
74224 +static __inline__ void \
74225 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
74226 + __u32 hash_index, \
74227 + ITEM_TYPE *ins_item) \
74228 +{ \
74229 + PREFIX##_check_hash(hash, hash_index); \
74230 + \
74231 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
74232 + smp_wmb(); \
74233 + hash->_table[hash_index] = ins_item; \
74234 +} \
74235 + \
74236 +static __inline__ ITEM_TYPE* \
74237 +PREFIX##_hash_find (PREFIX##_hash_table *hash, \
74238 + KEY_TYPE const *find_key) \
74239 +{ \
74240 + return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
74241 +} \
74242 + \
74243 +static __inline__ ITEM_TYPE* \
74244 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
74245 + KEY_TYPE const *find_key) \
74246 +{ \
74247 + return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
74248 +} \
74249 + \
74250 +static __inline__ int \
74251 +PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
74252 + ITEM_TYPE *del_item) \
74253 +{ \
74254 + return PREFIX##_hash_remove_index (hash, \
74255 + HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
74256 +} \
74257 + \
74258 +static __inline__ int \
74259 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
74260 + ITEM_TYPE *del_item) \
74261 +{ \
74262 + return PREFIX##_hash_remove (hash, del_item); \
74263 +} \
74264 + \
74265 +static __inline__ void \
74266 +PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
74267 + ITEM_TYPE *ins_item) \
74268 +{ \
74269 + return PREFIX##_hash_insert_index (hash, \
74270 + HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
74271 +} \
74272 + \
74273 +static __inline__ void \
74274 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
74275 + ITEM_TYPE *ins_item) \
74276 +{ \
74277 + return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
74278 + ins_item); \
74279 +} \
74280 + \
74281 +static __inline__ ITEM_TYPE * \
74282 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
74283 +{ \
74284 + ITEM_TYPE *first; \
74285 + \
74286 + for (first = NULL; ind < hash->_buckets; ++ ind) { \
74287 + first = hash->_table[ind]; \
74288 + if (first != NULL) \
74289 + break; \
74290 + } \
74291 + return first; \
74292 +} \
74293 + \
74294 +static __inline__ ITEM_TYPE * \
74295 +PREFIX##_hash_next (PREFIX##_hash_table *hash, \
74296 + ITEM_TYPE *item) \
74297 +{ \
74298 + ITEM_TYPE *next; \
74299 + \
74300 + if (item == NULL) \
74301 + return NULL; \
74302 + next = item->LINK_NAME._next; \
74303 + if (next == NULL) \
74304 + next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
74305 + return next; \
74306 +} \
74307 + \
74308 +typedef struct {} PREFIX##_hash_dummy
74309 +
74310 +#define for_all_ht_buckets(table, head) \
74311 +for ((head) = &(table) -> _table[ 0 ] ; \
74312 + (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
74313 +
74314 +#define for_all_in_bucket(bucket, item, next, field) \
74315 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
74316 + (item) != NULL ; \
74317 + (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
74318 +
74319 +#define for_all_in_htable(table, prefix, item, next) \
74320 +for ((item) = prefix ## _hash_first ((table), 0), \
74321 + (next) = prefix ## _hash_next ((table), (item)) ; \
74322 + (item) != NULL ; \
74323 + (item) = (next), \
74324 + (next) = prefix ## _hash_next ((table), (item)))
74325 +
74326 +/* __REISER4_TYPE_SAFE_HASH_H__ */
74327 +#endif
74328 +
74329 +/* Make Linus happy.
74330 + Local variables:
74331 + c-indentation-style: "K&R"
74332 + mode-name: "LC"
74333 + c-basic-offset: 8
74334 + tab-width: 8
74335 + fill-column: 120
74336 + End:
74337 +*/
74338 diff -urN linux-2.6.23.orig/fs/reiser4/vfs_ops.c linux-2.6.23/fs/reiser4/vfs_ops.c
74339 --- linux-2.6.23.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
74340 +++ linux-2.6.23/fs/reiser4/vfs_ops.c 2007-12-04 16:49:30.000000000 +0300
74341 @@ -0,0 +1,259 @@
74342 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74343 + * reiser4/README */
74344 +
74345 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
74346 + here. */
74347 +
74348 +#include "forward.h"
74349 +#include "debug.h"
74350 +#include "dformat.h"
74351 +#include "coord.h"
74352 +#include "plugin/item/item.h"
74353 +#include "plugin/file/file.h"
74354 +#include "plugin/security/perm.h"
74355 +#include "plugin/disk_format/disk_format.h"
74356 +#include "plugin/plugin.h"
74357 +#include "plugin/plugin_set.h"
74358 +#include "plugin/object.h"
74359 +#include "txnmgr.h"
74360 +#include "jnode.h"
74361 +#include "znode.h"
74362 +#include "block_alloc.h"
74363 +#include "tree.h"
74364 +#include "vfs_ops.h"
74365 +#include "inode.h"
74366 +#include "page_cache.h"
74367 +#include "ktxnmgrd.h"
74368 +#include "super.h"
74369 +#include "reiser4.h"
74370 +#include "entd.h"
74371 +#include "status_flags.h"
74372 +#include "flush.h"
74373 +#include "dscale.h"
74374 +
74375 +#include <linux/profile.h>
74376 +#include <linux/types.h>
74377 +#include <linux/mount.h>
74378 +#include <linux/vfs.h>
74379 +#include <linux/mm.h>
74380 +#include <linux/buffer_head.h>
74381 +#include <linux/dcache.h>
74382 +#include <linux/list.h>
74383 +#include <linux/pagemap.h>
74384 +#include <linux/slab.h>
74385 +#include <linux/seq_file.h>
74386 +#include <linux/init.h>
74387 +#include <linux/module.h>
74388 +#include <linux/writeback.h>
74389 +#include <linux/blkdev.h>
74390 +#include <linux/quotaops.h>
74391 +#include <linux/security.h>
74392 +#include <linux/reboot.h>
74393 +#include <linux/rcupdate.h>
74394 +
74395 +/* update inode stat-data by calling plugin */
74396 +int reiser4_update_sd(struct inode *object)
74397 +{
74398 + file_plugin *fplug;
74399 +
74400 + assert("nikita-2338", object != NULL);
74401 + /* check for read-only file system. */
74402 + if (IS_RDONLY(object))
74403 + return 0;
74404 +
74405 + fplug = inode_file_plugin(object);
74406 + assert("nikita-2339", fplug != NULL);
74407 + return fplug->write_sd_by_inode(object);
74408 +}
74409 +
74410 +/* helper function: increase inode nlink count and call plugin method to save
74411 + updated stat-data.
74412 +
74413 + Used by link/create and during creation of dot and dotdot in mkdir
74414 +*/
74415 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
74416 + struct inode *parent /* parent where new entry will be */
74417 + ,
74418 + int write_sd_p /* true if stat-data has to be
74419 + * updated */ )
74420 +{
74421 + file_plugin *fplug;
74422 + int result;
74423 +
74424 + assert("nikita-1351", object != NULL);
74425 +
74426 + fplug = inode_file_plugin(object);
74427 + assert("nikita-1445", fplug != NULL);
74428 +
74429 + /* ask plugin whether it can add yet another link to this
74430 + object */
74431 + if (!fplug->can_add_link(object))
74432 + return RETERR(-EMLINK);
74433 +
74434 + assert("nikita-2211", fplug->add_link != NULL);
74435 + /* call plugin to do actual addition of link */
74436 + result = fplug->add_link(object, parent);
74437 +
74438 + /* optionally update stat data */
74439 + if (result == 0 && write_sd_p)
74440 + result = fplug->write_sd_by_inode(object);
74441 + return result;
74442 +}
74443 +
74444 +/* helper function: decrease inode nlink count and call plugin method to save
74445 + updated stat-data.
74446 +
74447 + Used by unlink/create
74448 +*/
74449 +int reiser4_del_nlink(struct inode *object /* object from which link is
74450 + * removed */ ,
74451 + struct inode *parent /* parent where entry was */ ,
74452 + int write_sd_p /* true is stat-data has to be
74453 + * updated */ )
74454 +{
74455 + file_plugin *fplug;
74456 + int result;
74457 +
74458 + assert("nikita-1349", object != NULL);
74459 +
74460 + fplug = inode_file_plugin(object);
74461 + assert("nikita-1350", fplug != NULL);
74462 + assert("nikita-1446", object->i_nlink > 0);
74463 + assert("nikita-2210", fplug->rem_link != NULL);
74464 +
74465 + /* call plugin to do actual deletion of link */
74466 + result = fplug->rem_link(object, parent);
74467 +
74468 + /* optionally update stat data */
74469 + if (result == 0 && write_sd_p)
74470 + result = fplug->write_sd_by_inode(object);
74471 + return result;
74472 +}
74473 +
74474 +/* Release reiser4 dentry. This is d_op->d_release() method. */
74475 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
74476 +{
74477 + reiser4_free_dentry_fsdata(dentry);
74478 +}
74479 +
74480 +/*
74481 + * Called by reiser4_sync_inodes(), during speculative write-back (through
74482 + * pdflush, or balance_dirty_pages()).
74483 + */
74484 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
74485 +{
74486 + long written = 0;
74487 + int repeats = 0;
74488 + int result;
74489 + struct address_space *mapping;
74490 +
74491 + /*
74492 + * Performs early flushing, trying to free some memory. If there is
74493 + * nothing to flush, commits some atoms.
74494 + */
74495 +
74496 + /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
74497 + sys_fsync(). */
74498 + if (wbc->sync_mode != WB_SYNC_NONE) {
74499 + txnmgr_force_commit_all(sb, 0);
74500 + return;
74501 + }
74502 +
74503 + BUG_ON(reiser4_get_super_fake(sb) == NULL);
74504 + mapping = reiser4_get_super_fake(sb)->i_mapping;
74505 + do {
74506 + long nr_submitted = 0;
74507 + jnode *node = NULL;
74508 +
74509 + /* do not put more requests to overload write queue */
74510 + if (wbc->nonblocking &&
74511 + bdi_write_congested(mapping->backing_dev_info)) {
74512 + blk_run_address_space(mapping);
74513 + wbc->encountered_congestion = 1;
74514 + break;
74515 + }
74516 + repeats++;
74517 + BUG_ON(wbc->nr_to_write <= 0);
74518 +
74519 + if (get_current_context()->entd) {
74520 + entd_context *ent = get_entd_context(sb);
74521 +
74522 + if (ent->cur_request->node)
74523 + /*
74524 + * this is ent thread and it managed to capture
74525 + * requested page itself - start flush from
74526 + * that page
74527 + */
74528 + node = jref(ent->cur_request->node);
74529 + }
74530 +
74531 + result = flush_some_atom(node, &nr_submitted, wbc,
74532 + JNODE_FLUSH_WRITE_BLOCKS);
74533 + if (result != 0)
74534 + warning("nikita-31001", "Flush failed: %i", result);
74535 + if (node)
74536 + jput(node);
74537 + if (!nr_submitted)
74538 + break;
74539 +
74540 + wbc->nr_to_write -= nr_submitted;
74541 + written += nr_submitted;
74542 + } while (wbc->nr_to_write > 0);
74543 +}
74544 +
74545 +void reiser4_throttle_write(struct inode *inode)
74546 +{
74547 + reiser4_txn_restart_current();
74548 + balance_dirty_pages_ratelimited(inode->i_mapping);
74549 +}
74550 +
74551 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
74552 +const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
74553 + * beginning of device */
74554 +
74555 +/*
74556 + * Reiser4 initialization/shutdown.
74557 + *
74558 + * Code below performs global reiser4 initialization that is done either as
74559 + * part of kernel initialization (when reiser4 is statically built-in), or
74560 + * during reiser4 module load (when compiled as module).
74561 + */
74562 +
74563 +void reiser4_handle_error(void)
74564 +{
74565 + struct super_block *sb = reiser4_get_current_sb();
74566 +
74567 + if (!sb)
74568 + return;
74569 + reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
74570 + "Filesystem error occured");
74571 + switch (get_super_private(sb)->onerror) {
74572 + case 0:
74573 + reiser4_panic("foobar-42", "Filesystem error occured\n");
74574 + case 1:
74575 + default:
74576 + if (sb->s_flags & MS_RDONLY)
74577 + return;
74578 + sb->s_flags |= MS_RDONLY;
74579 + break;
74580 + }
74581 +}
74582 +
74583 +struct dentry_operations reiser4_dentry_operations = {
74584 + .d_revalidate = NULL,
74585 + .d_hash = NULL,
74586 + .d_compare = NULL,
74587 + .d_delete = NULL,
74588 + .d_release = reiser4_d_release,
74589 + .d_iput = NULL,
74590 +};
74591 +
74592 +/* Make Linus happy.
74593 + Local variables:
74594 + c-indentation-style: "K&R"
74595 + mode-name: "LC"
74596 + c-basic-offset: 8
74597 + tab-width: 8
74598 + fill-column: 120
74599 + End:
74600 +*/
74601 diff -urN linux-2.6.23.orig/fs/reiser4/vfs_ops.h linux-2.6.23/fs/reiser4/vfs_ops.h
74602 --- linux-2.6.23.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
74603 +++ linux-2.6.23/fs/reiser4/vfs_ops.h 2007-12-04 16:49:30.000000000 +0300
74604 @@ -0,0 +1,53 @@
74605 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74606 + * reiser4/README */
74607 +
74608 +/* vfs_ops.c's exported symbols */
74609 +
74610 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
74611 +#define __FS_REISER4_VFS_OPS_H__
74612 +
74613 +#include "forward.h"
74614 +#include "coord.h"
74615 +#include "seal.h"
74616 +#include "plugin/file/file.h"
74617 +#include "super.h"
74618 +#include "readahead.h"
74619 +
74620 +#include <linux/types.h> /* for loff_t */
74621 +#include <linux/fs.h> /* for struct address_space */
74622 +#include <linux/dcache.h> /* for struct dentry */
74623 +#include <linux/mm.h>
74624 +#include <linux/backing-dev.h>
74625 +
74626 +/* address space operations */
74627 +int reiser4_writepage(struct page *, struct writeback_control *);
74628 +int reiser4_set_page_dirty(struct page *);
74629 +void reiser4_invalidatepage(struct page *, unsigned long offset);
74630 +int reiser4_releasepage(struct page *, gfp_t);
74631 +
74632 +extern int reiser4_update_sd(struct inode *);
74633 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
74634 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
74635 +
74636 +extern int reiser4_start_up_io(struct page *page);
74637 +extern void reiser4_throttle_write(struct inode *);
74638 +extern int jnode_is_releasable(jnode *);
74639 +
74640 +#define CAPTURE_APAGE_BURST (1024l)
74641 +void reiser4_writeout(struct super_block *, struct writeback_control *);
74642 +
74643 +extern void reiser4_handle_error(void);
74644 +
74645 +/* __FS_REISER4_VFS_OPS_H__ */
74646 +#endif
74647 +
74648 +/* Make Linus happy.
74649 + Local variables:
74650 + c-indentation-style: "K&R"
74651 + mode-name: "LC"
74652 + c-basic-offset: 8
74653 + tab-width: 8
74654 + fill-column: 120
74655 + scroll-step: 1
74656 + End:
74657 +*/
74658 diff -urN linux-2.6.23.orig/fs/reiser4/wander.c linux-2.6.23/fs/reiser4/wander.c
74659 --- linux-2.6.23.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
74660 +++ linux-2.6.23/fs/reiser4/wander.c 2007-12-04 16:49:30.000000000 +0300
74661 @@ -0,0 +1,1797 @@
74662 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74663 + * reiser4/README */
74664 +
74665 +/* Reiser4 Wandering Log */
74666 +
74667 +/* You should read http://www.namesys.com/txn-doc.html
74668 +
74669 + That describes how filesystem operations are performed as atomic
74670 + transactions, and how we try to arrange it so that we can write most of the
74671 + data only once while performing the operation atomically.
74672 +
74673 + For the purposes of this code, it is enough for it to understand that it
74674 + has been told a given block should be written either once, or twice (if
74675 + twice then once to the wandered location and once to the real location).
74676 +
74677 + This code guarantees that those blocks that are defined to be part of an
74678 + atom either all take effect or none of them take effect.
74679 +
74680 + Relocate set nodes are submitted to write by the jnode_flush() routine, and
74681 + the overwrite set is submitted by reiser4_write_log(). This is because with
74682 + the overwrite set we seek to optimize writes, and with the relocate set we
74683 + seek to cause disk order to correlate with the parent first pre-order.
74684 +
74685 + reiser4_write_log() allocates and writes wandered blocks and maintains
74686 + additional on-disk structures of the atom as wander records (each wander
74687 + record occupies one block) for storing of the "wandered map" (a table which
74688 + contains a relation between wandered and real block numbers) and other
74689 + information which might be needed at transaction recovery time.
74690 +
74691 + The wander records are unidirectionally linked into a circle: each wander
74692 + record contains a block number of the next wander record, the last wander
74693 + record points to the first one.
74694 +
74695 + One wander record (named "tx head" in this file) has a format which is
74696 + different from the other wander records. The "tx head" has a reference to the
74697 + "tx head" block of the previously committed atom. Also, "tx head" contains
74698 + fs information (the free blocks counter, and the oid allocator state) which
74699 + is logged in a special way .
74700 +
74701 + There are two journal control blocks, named journal header and journal
74702 + footer which have fixed on-disk locations. The journal header has a
74703 + reference to the "tx head" block of the last committed atom. The journal
74704 + footer points to the "tx head" of the last flushed atom. The atom is
74705 + "played" when all blocks from its overwrite set are written to disk the
74706 + second time (i.e. written to their real locations).
74707 +
74708 + NOTE: People who know reiserfs internals and its journal structure might be
74709 + confused with these terms journal footer and journal header. There is a table
74710 + with terms of similar semantics in reiserfs (reiser3) and reiser4:
74711 +
74712 + REISER3 TERM | REISER4 TERM | DESCRIPTION
74713 + --------------------+-----------------------+----------------------------
74714 + commit record | journal header | atomic write of this record
74715 + | | ends transaction commit
74716 + --------------------+-----------------------+----------------------------
74717 + journal header | journal footer | atomic write of this record
74718 + | | ends post-commit writes.
74719 + | | After successful
74720 + | | writing of this journal
74721 + | | blocks (in reiser3) or
74722 + | | wandered blocks/records are
74723 + | | free for re-use.
74724 + --------------------+-----------------------+----------------------------
74725 +
74726 + The atom commit process is the following:
74727 +
74728 + 1. The overwrite set is taken from atom's clean list, and its size is
74729 + counted.
74730 +
74731 + 2. The number of necessary wander records (including tx head) is calculated,
74732 + and the wander record blocks are allocated.
74733 +
74734 + 3. Allocate wandered blocks and populate wander records by wandered map.
74735 +
74736 + 4. submit write requests for wander records and wandered blocks.
74737 +
74738 + 5. wait until submitted write requests complete.
74739 +
74740 + 6. update journal header: change the pointer to the block number of just
74741 + written tx head, submit an i/o for modified journal header block and wait
74742 + for i/o completion.
74743 +
74744 + NOTE: The special logging for bitmap blocks and some reiser4 super block
74745 + fields makes processes of atom commit, flush and recovering a bit more
74746 + complex (see comments in the source code for details).
74747 +
74748 + The atom playing process is the following:
74749 +
74750 + 1. Write atom's overwrite set in-place.
74751 +
74752 + 2. Wait on i/o.
74753 +
74754 + 3. Update journal footer: change the pointer to block number of tx head
74755 + block of the atom we currently flushing, submit an i/o, wait on i/o
74756 + completion.
74757 +
74758 + 4. Free disk space which was used for wandered blocks and wander records.
74759 +
74760 + After the freeing of wandered blocks and wander records we have that journal
74761 + footer points to the on-disk structure which might be overwritten soon.
74762 + Neither the log writer nor the journal recovery procedure use that pointer
74763 + for accessing the data. When the journal recovery procedure finds the oldest
74764 + transaction it compares the journal footer pointer value with the "prev_tx"
74765 + pointer value in tx head, if values are equal the oldest not flushed
74766 + transaction is found.
74767 +
74768 + NOTE on disk space leakage: the information about of what blocks and how many
74769 + blocks are allocated for wandered blocks, wandered records is not written to
74770 + the disk because of special logging for bitmaps and some super blocks
74771 + counters. After a system crash we the reiser4 does not remember those
74772 + objects allocation, thus we have no such a kind of disk space leakage.
74773 +*/
74774 +
74775 +/* Special logging of reiser4 super block fields. */
74776 +
74777 +/* There are some reiser4 super block fields (free block count and OID allocator
74778 + state (number of files and next free OID) which are logged separately from
74779 + super block to avoid unnecessary atom fusion.
74780 +
74781 + So, the reiser4 super block can be not captured by a transaction with
74782 + allocates/deallocates disk blocks or create/delete file objects. Moreover,
74783 + the reiser4 on-disk super block is not touched when such a transaction is
74784 + committed and flushed. Those "counters logged specially" are logged in "tx
74785 + head" blocks and in the journal footer block.
74786 +
74787 + A step-by-step description of special logging:
74788 +
74789 + 0. The per-atom information about deleted or created files and allocated or
74790 + freed blocks is collected during the transaction. The atom's
74791 + ->nr_objects_created and ->nr_objects_deleted are for object
74792 + deletion/creation tracking, the numbers of allocated and freed blocks are
74793 + calculated using atom's delete set and atom's capture list -- all new and
74794 + relocated nodes should be on atom's clean list and should have JNODE_RELOC
74795 + bit set.
74796 +
74797 + 1. The "logged specially" reiser4 super block fields have their "committed"
74798 + versions in the reiser4 in-memory super block. They get modified only at
74799 + atom commit time. The atom's commit thread has an exclusive access to those
74800 + "committed" fields because the log writer implementation supports only one
74801 + atom commit a time (there is a per-fs "commit" mutex). At
74802 + that time "committed" counters are modified using per-atom information
74803 + collected during the transaction. These counters are stored on disk as a
74804 + part of tx head block when atom is committed.
74805 +
74806 + 2. When the atom is flushed the value of the free block counter and the OID
74807 + allocator state get written to the journal footer block. A special journal
74808 + procedure (journal_recover_sb_data()) takes those values from the journal
74809 + footer and updates the reiser4 in-memory super block.
74810 +
74811 + NOTE: That means free block count and OID allocator state are logged
74812 + separately from the reiser4 super block regardless of the fact that the
74813 + reiser4 super block has fields to store both the free block counter and the
74814 + OID allocator.
74815 +
74816 + Writing the whole super block at commit time requires knowing true values of
74817 + all its fields without changes made by not yet committed transactions. It is
74818 + possible by having their "committed" version of the super block like the
74819 + reiser4 bitmap blocks have "committed" and "working" versions. However,
74820 + another scheme was implemented which stores special logged values in the
74821 + unused free space inside transaction head block. In my opinion it has an
74822 + advantage of not writing whole super block when only part of it was
74823 + modified. */
74824 +
74825 +#include "debug.h"
74826 +#include "dformat.h"
74827 +#include "txnmgr.h"
74828 +#include "jnode.h"
74829 +#include "znode.h"
74830 +#include "block_alloc.h"
74831 +#include "page_cache.h"
74832 +#include "wander.h"
74833 +#include "reiser4.h"
74834 +#include "super.h"
74835 +#include "vfs_ops.h"
74836 +#include "writeout.h"
74837 +#include "inode.h"
74838 +#include "entd.h"
74839 +
74840 +#include <linux/types.h>
74841 +#include <linux/fs.h> /* for struct super_block */
74842 +#include <linux/mm.h> /* for struct page */
74843 +#include <linux/pagemap.h>
74844 +#include <linux/bio.h> /* for struct bio */
74845 +#include <linux/blkdev.h>
74846 +
74847 +static int write_jnodes_to_disk_extent(
74848 + jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
74849 +
74850 +/* The commit_handle is a container for objects needed at atom commit time */
74851 +struct commit_handle {
74852 + /* A pointer to atom's list of OVRWR nodes */
74853 + struct list_head *overwrite_set;
74854 + /* atom's overwrite set size */
74855 + int overwrite_set_size;
74856 + /* jnodes for wander record blocks */
74857 + struct list_head tx_list;
74858 + /* number of wander records */
74859 + __u32 tx_size;
74860 + /* 'committed' sb counters are saved here until atom is completely
74861 + flushed */
74862 + __u64 free_blocks;
74863 + __u64 nr_files;
74864 + __u64 next_oid;
74865 + /* A pointer to the atom which is being committed */
74866 + txn_atom *atom;
74867 + /* A pointer to current super block */
74868 + struct super_block *super;
74869 + /* The counter of modified bitmaps */
74870 + reiser4_block_nr nr_bitmap;
74871 +};
74872 +
74873 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
74874 +{
74875 + memset(ch, 0, sizeof(struct commit_handle));
74876 + INIT_LIST_HEAD(&ch->tx_list);
74877 +
74878 + ch->atom = atom;
74879 + ch->super = reiser4_get_current_sb();
74880 +}
74881 +
74882 +static void done_commit_handle(struct commit_handle *ch)
74883 +{
74884 + assert("zam-690", list_empty(&ch->tx_list));
74885 +}
74886 +
74887 +static inline int reiser4_use_write_barrier(struct super_block * s)
74888 +{
74889 + return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
74890 +}
74891 +
74892 +static void disable_write_barrier(struct super_block * s)
74893 +{
74894 + notice("zam-1055", "%s does not support write barriers,"
74895 + " using synchronous write instead.", s->s_id);
74896 + set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
74897 +}
74898 +
74899 +/* fill journal header block data */
74900 +static void format_journal_header(struct commit_handle *ch)
74901 +{
74902 + struct reiser4_super_info_data *sbinfo;
74903 + struct journal_header *header;
74904 + jnode *txhead;
74905 +
74906 + sbinfo = get_super_private(ch->super);
74907 + assert("zam-479", sbinfo != NULL);
74908 + assert("zam-480", sbinfo->journal_header != NULL);
74909 +
74910 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
74911 +
74912 + jload(sbinfo->journal_header);
74913 +
74914 + header = (struct journal_header *)jdata(sbinfo->journal_header);
74915 + assert("zam-484", header != NULL);
74916 +
74917 + put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
74918 + &header->last_committed_tx);
74919 +
74920 + jrelse(sbinfo->journal_header);
74921 +}
74922 +
74923 +/* fill journal footer block data */
74924 +static void format_journal_footer(struct commit_handle *ch)
74925 +{
74926 + struct reiser4_super_info_data *sbinfo;
74927 + struct journal_footer *footer;
74928 + jnode *tx_head;
74929 +
74930 + sbinfo = get_super_private(ch->super);
74931 +
74932 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
74933 +
74934 + assert("zam-493", sbinfo != NULL);
74935 + assert("zam-494", sbinfo->journal_header != NULL);
74936 +
74937 + check_me("zam-691", jload(sbinfo->journal_footer) == 0);
74938 +
74939 + footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
74940 + assert("zam-495", footer != NULL);
74941 +
74942 + put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
74943 + &footer->last_flushed_tx);
74944 + put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
74945 +
74946 + put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
74947 + put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
74948 +
74949 + jrelse(sbinfo->journal_footer);
74950 +}
74951 +
74952 +/* wander record capacity depends on current block size */
74953 +static int wander_record_capacity(const struct super_block *super)
74954 +{
74955 + return (super->s_blocksize -
74956 + sizeof(struct wander_record_header)) /
74957 + sizeof(struct wander_entry);
74958 +}
74959 +
74960 +/* Fill first wander record (tx head) in accordance with supplied given data */
74961 +static void format_tx_head(struct commit_handle *ch)
74962 +{
74963 + jnode *tx_head;
74964 + jnode *next;
74965 + struct tx_header *header;
74966 +
74967 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
74968 + assert("zam-692", &ch->tx_list != &tx_head->capture_link);
74969 +
74970 + next = list_entry(tx_head->capture_link.next, jnode, capture_link);
74971 + if (&ch->tx_list == &next->capture_link)
74972 + next = tx_head;
74973 +
74974 + header = (struct tx_header *)jdata(tx_head);
74975 +
74976 + assert("zam-460", header != NULL);
74977 + assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
74978 +
74979 + memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
74980 + memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
74981 +
74982 + put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
74983 + put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
74984 + &header->prev_tx);
74985 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
74986 + put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
74987 + put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
74988 + put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
74989 +}
74990 +
74991 +/* prepare ordinary wander record block (fill all service fields) */
74992 +static void
74993 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
74994 +{
74995 + struct wander_record_header *LRH;
74996 + jnode *next;
74997 +
74998 + assert("zam-464", node != NULL);
74999 +
75000 + LRH = (struct wander_record_header *)jdata(node);
75001 + next = list_entry(node->capture_link.next, jnode, capture_link);
75002 +
75003 + if (&ch->tx_list == &next->capture_link)
75004 + next = list_entry(ch->tx_list.next, jnode, capture_link);
75005 +
75006 + assert("zam-465", LRH != NULL);
75007 + assert("zam-463",
75008 + ch->super->s_blocksize > sizeof(struct wander_record_header));
75009 +
75010 + memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
75011 + memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
75012 +
75013 + put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
75014 + put_unaligned(cpu_to_le32(serial), &LRH->serial);
75015 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
75016 +}
75017 +
75018 +/* add one wandered map entry to formatted wander record */
75019 +static void
75020 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
75021 + const reiser4_block_nr * b)
75022 +{
75023 + char *data;
75024 + struct wander_entry *pairs;
75025 +
75026 + data = jdata(node);
75027 + assert("zam-451", data != NULL);
75028 +
75029 + pairs =
75030 + (struct wander_entry *)(data + sizeof(struct wander_record_header));
75031 +
75032 + put_unaligned(cpu_to_le64(*a), &pairs[index].original);
75033 + put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
75034 +}
75035 +
75036 +/* currently, wander records contains contain only wandered map, which depend on
75037 + overwrite set size */
75038 +static void get_tx_size(struct commit_handle *ch)
75039 +{
75040 + assert("zam-440", ch->overwrite_set_size != 0);
75041 + assert("zam-695", ch->tx_size == 0);
75042 +
75043 + /* count all ordinary wander records
75044 + (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
75045 + for tx head block */
75046 + ch->tx_size =
75047 + (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
75048 + 2;
75049 +}
75050 +
75051 +/* A special structure for using in store_wmap_actor() for saving its state
75052 + between calls */
75053 +struct store_wmap_params {
75054 + jnode *cur; /* jnode of current wander record to fill */
75055 + int idx; /* free element index in wander record */
75056 + int capacity; /* capacity */
75057 +
75058 +#if REISER4_DEBUG
75059 + struct list_head *tx_list;
75060 +#endif
75061 +};
75062 +
75063 +/* an actor for use in blocknr_set_iterator routine which populates the list
75064 + of pre-formatted wander records by wandered map info */
75065 +static int
75066 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
75067 + const reiser4_block_nr * b, void *data)
75068 +{
75069 + struct store_wmap_params *params = data;
75070 +
75071 + if (params->idx >= params->capacity) {
75072 + /* a new wander record should be taken from the tx_list */
75073 + params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
75074 + assert("zam-454",
75075 + params->tx_list != &params->cur->capture_link);
75076 +
75077 + params->idx = 0;
75078 + }
75079 +
75080 + store_entry(params->cur, params->idx, a, b);
75081 + params->idx++;
75082 +
75083 + return 0;
75084 +}
75085 +
75086 +/* This function is called after Relocate set gets written to disk, Overwrite
75087 + set is written to wandered locations and all wander records are written
75088 + also. Updated journal header blocks contains a pointer (block number) to
75089 + first wander record of the just written transaction */
75090 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
75091 +{
75092 + struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75093 + jnode *jh = sbinfo->journal_header;
75094 + jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
75095 + int ret;
75096 +
75097 + format_journal_header(ch);
75098 +
75099 + ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
75100 + use_barrier ? WRITEOUT_BARRIER : 0);
75101 + if (ret)
75102 + return ret;
75103 +
75104 + // blk_run_address_space(sbinfo->fake->i_mapping);
75105 + /*blk_run_queues(); */
75106 +
75107 + ret = jwait_io(jh, WRITE);
75108 +
75109 + if (ret)
75110 + return ret;
75111 +
75112 + sbinfo->last_committed_tx = *jnode_get_block(head);
75113 +
75114 + return 0;
75115 +}
75116 +
75117 +/* This function is called after write-back is finished. We update journal
75118 + footer block and free blocks which were occupied by wandered blocks and
75119 + transaction wander records */
75120 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
75121 +{
75122 + reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75123 +
75124 + jnode *jf = sbinfo->journal_footer;
75125 +
75126 + int ret;
75127 +
75128 + format_journal_footer(ch);
75129 +
75130 + ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
75131 + use_barrier ? WRITEOUT_BARRIER : 0);
75132 + if (ret)
75133 + return ret;
75134 +
75135 + // blk_run_address_space(sbinfo->fake->i_mapping);
75136 + /*blk_run_queue(); */
75137 +
75138 + ret = jwait_io(jf, WRITE);
75139 + if (ret)
75140 + return ret;
75141 +
75142 + return 0;
75143 +}
75144 +
75145 +/* free block numbers of wander records of already written in place transaction */
75146 +static void dealloc_tx_list(struct commit_handle *ch)
75147 +{
75148 + while (!list_empty(&ch->tx_list)) {
75149 + jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
75150 + list_del(&cur->capture_link);
75151 + ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
75152 + reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
75153 + BA_FORMATTED);
75154 +
75155 + unpin_jnode_data(cur);
75156 + reiser4_drop_io_head(cur);
75157 + }
75158 +}
75159 +
75160 +/* An actor for use in block_nr_iterator() routine which frees wandered blocks
75161 + from atom's overwrite set. */
75162 +static int
75163 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
75164 + const reiser4_block_nr * a UNUSED_ARG,
75165 + const reiser4_block_nr * b, void *data UNUSED_ARG)
75166 +{
75167 +
75168 + assert("zam-499", b != NULL);
75169 + assert("zam-500", *b != 0);
75170 + assert("zam-501", !reiser4_blocknr_is_fake(b));
75171 +
75172 + reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
75173 + return 0;
75174 +}
75175 +
75176 +/* free wandered block locations of already written in place transaction */
75177 +static void dealloc_wmap(struct commit_handle *ch)
75178 +{
75179 + assert("zam-696", ch->atom != NULL);
75180 +
75181 + blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
75182 + dealloc_wmap_actor, NULL, 1);
75183 +}
75184 +
75185 +/* helper function for alloc wandered blocks, which refill set of block
75186 + numbers needed for wandered blocks */
75187 +static int
75188 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
75189 +{
75190 + reiser4_blocknr_hint hint;
75191 + int ret;
75192 +
75193 + reiser4_block_nr wide_len = count;
75194 +
75195 + /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
75196 + ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
75197 + reserved allocation area so as to get the best qualities of fixed
75198 + journals? */
75199 + reiser4_blocknr_hint_init(&hint);
75200 + hint.block_stage = BLOCK_GRABBED;
75201 +
75202 + ret = reiser4_alloc_blocks(&hint, start, &wide_len,
75203 + BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
75204 + *len = (int)wide_len;
75205 +
75206 + return ret;
75207 +}
75208 +
75209 +/*
75210 + * roll back changes made before issuing BIO in the case of IO error.
75211 + */
75212 +static void undo_bio(struct bio *bio)
75213 +{
75214 + int i;
75215 +
75216 + for (i = 0; i < bio->bi_vcnt; ++i) {
75217 + struct page *pg;
75218 + jnode *node;
75219 +
75220 + pg = bio->bi_io_vec[i].bv_page;
75221 + end_page_writeback(pg);
75222 + node = jprivate(pg);
75223 + spin_lock_jnode(node);
75224 + JF_CLR(node, JNODE_WRITEBACK);
75225 + JF_SET(node, JNODE_DIRTY);
75226 + spin_unlock_jnode(node);
75227 + }
75228 + bio_put(bio);
75229 +}
75230 +
75231 +/* put overwrite set back to atom's clean list */
75232 +static void put_overwrite_set(struct commit_handle *ch)
75233 +{
75234 + jnode *cur;
75235 +
75236 + list_for_each_entry(cur, ch->overwrite_set, capture_link)
75237 + jrelse_tail(cur);
75238 +}
75239 +
75240 +/* Count overwrite set size, grab disk space for wandered blocks allocation.
75241 + Since we have a separate list for atom's overwrite set we just scan the list,
75242 + count bitmap and other not leaf nodes which wandered blocks allocation we
75243 + have to grab space for. */
75244 +static int get_overwrite_set(struct commit_handle *ch)
75245 +{
75246 + int ret;
75247 + jnode *cur;
75248 + __u64 nr_not_leaves = 0;
75249 +#if REISER4_DEBUG
75250 + __u64 nr_formatted_leaves = 0;
75251 + __u64 nr_unformatted_leaves = 0;
75252 +#endif
75253 +
75254 + assert("zam-697", ch->overwrite_set_size == 0);
75255 +
75256 + ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
75257 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75258 +
75259 + while (ch->overwrite_set != &cur->capture_link) {
75260 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
75261 +
75262 + /* Count bitmap locks for getting correct statistics what number
75263 + * of blocks were cleared by the transaction commit. */
75264 + if (jnode_get_type(cur) == JNODE_BITMAP)
75265 + ch->nr_bitmap++;
75266 +
75267 + assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
75268 + || jnode_get_type(cur) == JNODE_BITMAP);
75269 +
75270 + if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
75271 + /* we replace fake znode by another (real)
75272 + znode which is suggested by disk_layout
75273 + plugin */
75274 +
75275 + /* FIXME: it looks like fake znode should be
75276 + replaced by jnode supplied by
75277 + disk_layout. */
75278 +
75279 + struct super_block *s = reiser4_get_current_sb();
75280 + reiser4_super_info_data *sbinfo =
75281 + get_current_super_private();
75282 +
75283 + if (sbinfo->df_plug->log_super) {
75284 + jnode *sj = sbinfo->df_plug->log_super(s);
75285 +
75286 + assert("zam-593", sj != NULL);
75287 +
75288 + if (IS_ERR(sj))
75289 + return PTR_ERR(sj);
75290 +
75291 + spin_lock_jnode(sj);
75292 + JF_SET(sj, JNODE_OVRWR);
75293 + insert_into_atom_ovrwr_list(ch->atom, sj);
75294 + spin_unlock_jnode(sj);
75295 +
75296 + /* jload it as the rest of overwrite set */
75297 + jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
75298 +
75299 + ch->overwrite_set_size++;
75300 + }
75301 + spin_lock_jnode(cur);
75302 + reiser4_uncapture_block(cur);
75303 + jput(cur);
75304 +
75305 + } else {
75306 + int ret;
75307 + ch->overwrite_set_size++;
75308 + ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
75309 + if (ret)
75310 + reiser4_panic("zam-783",
75311 + "cannot load e-flushed jnode back (ret = %d)\n",
75312 + ret);
75313 + }
75314 +
75315 + /* Count not leaves here because we have to grab disk space
75316 + * for wandered blocks. They were not counted as "flush
75317 + * reserved". Counting should be done _after_ nodes are pinned
75318 + * into memory by jload(). */
75319 + if (!jnode_is_leaf(cur))
75320 + nr_not_leaves++;
75321 + else {
75322 +#if REISER4_DEBUG
75323 + /* at this point @cur either has JNODE_FLUSH_RESERVED
75324 + * or is eflushed. Locking is not strong enough to
75325 + * write an assertion checking for this. */
75326 + if (jnode_is_znode(cur))
75327 + nr_formatted_leaves++;
75328 + else
75329 + nr_unformatted_leaves++;
75330 +#endif
75331 + JF_CLR(cur, JNODE_FLUSH_RESERVED);
75332 + }
75333 +
75334 + cur = next;
75335 + }
75336 +
75337 + /* Grab space for writing (wandered blocks) of not leaves found in
75338 + * overwrite set. */
75339 + ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
75340 + if (ret)
75341 + return ret;
75342 +
75343 + /* Disk space for allocation of wandered blocks of leaf nodes already
75344 + * reserved as "flush reserved", move it to grabbed space counter. */
75345 + spin_lock_atom(ch->atom);
75346 + assert("zam-940",
75347 + nr_formatted_leaves + nr_unformatted_leaves <=
75348 + ch->atom->flush_reserved);
75349 + flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
75350 + spin_unlock_atom(ch->atom);
75351 +
75352 + return ch->overwrite_set_size;
75353 +}
75354 +
75355 +/**
75356 + * write_jnodes_to_disk_extent - submit write request
75357 + * @head:
75358 + * @first: first jnode of the list
75359 + * @nr: number of jnodes on the list
75360 + * @block_p:
75361 + * @fq:
75362 + * @flags: used to decide whether page is to get PG_reclaim flag
75363 + *
75364 + * Submits a write request for @nr jnodes beginning from the @first, other
75365 + * jnodes are after the @first on the double-linked "capture" list. All jnodes
75366 + * will be written to the disk region of @nr blocks starting with @block_p block
75367 + * number. If @fq is not NULL it means that waiting for i/o completion will be
75368 + * done more efficiently by using flush_queue_t objects.
75369 + * This function is the one which writes list of jnodes in batch mode. It does
75370 + * all low-level things as bio construction and page states manipulation.
75371 + *
75372 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
75373 + * aggregated in this function instead of being left to the layers below
75374 + *
75375 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
75376 + * Why that layer needed? Why BIOs cannot be constructed here?
75377 + */
75378 +static int write_jnodes_to_disk_extent(
75379 + jnode *first, int nr, const reiser4_block_nr *block_p,
75380 + flush_queue_t *fq, int flags)
75381 +{
75382 + struct super_block *super = reiser4_get_current_sb();
75383 + int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
75384 + int max_blocks;
75385 + jnode *cur = first;
75386 + reiser4_block_nr block;
75387 +
75388 + assert("zam-571", first != NULL);
75389 + assert("zam-572", block_p != NULL);
75390 + assert("zam-570", nr > 0);
75391 +
75392 + block = *block_p;
75393 + max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
75394 +
75395 + while (nr > 0) {
75396 + struct bio *bio;
75397 + int nr_blocks = min(nr, max_blocks);
75398 + int i;
75399 + int nr_used;
75400 +
75401 + bio = bio_alloc(GFP_NOIO, nr_blocks);
75402 + if (!bio)
75403 + return RETERR(-ENOMEM);
75404 +
75405 + bio->bi_bdev = super->s_bdev;
75406 + bio->bi_sector = block * (super->s_blocksize >> 9);
75407 + for (nr_used = 0, i = 0; i < nr_blocks; i++) {
75408 + struct page *pg;
75409 +
75410 + pg = jnode_page(cur);
75411 + assert("zam-573", pg != NULL);
75412 +
75413 + page_cache_get(pg);
75414 +
75415 + lock_and_wait_page_writeback(pg);
75416 +
75417 + if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
75418 + /*
75419 + * underlying device is satiated. Stop adding
75420 + * pages to the bio.
75421 + */
75422 + unlock_page(pg);
75423 + page_cache_release(pg);
75424 + break;
75425 + }
75426 +
75427 + spin_lock_jnode(cur);
75428 + assert("nikita-3166",
75429 + pg->mapping == jnode_get_mapping(cur));
75430 + assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
75431 +#if REISER4_DEBUG
75432 + spin_lock(&cur->load);
75433 + assert("nikita-3165", !jnode_is_releasable(cur));
75434 + spin_unlock(&cur->load);
75435 +#endif
75436 + JF_SET(cur, JNODE_WRITEBACK);
75437 + JF_CLR(cur, JNODE_DIRTY);
75438 + ON_DEBUG(cur->written++);
75439 + spin_unlock_jnode(cur);
75440 +
75441 + ClearPageError(pg);
75442 + set_page_writeback(pg);
75443 +
75444 + if (get_current_context()->entd) {
75445 + /* this is ent thread */
75446 + entd_context *ent = get_entd_context(super);
75447 + struct wbq *rq, *next;
75448 +
75449 + spin_lock(&ent->guard);
75450 +
75451 + if (pg == ent->cur_request->page) {
75452 + /*
75453 + * entd is called for this page. This
75454 + * request is not in th etodo list
75455 + */
75456 + ent->cur_request->written = 1;
75457 + } else {
75458 + /*
75459 + * if we have written a page for which writepage
75460 + * is called for - move request to another list.
75461 + */
75462 + list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
75463 + assert("", rq->magic == WBQ_MAGIC);
75464 + if (pg == rq->page) {
75465 + /*
75466 + * remove request from
75467 + * entd's queue, but do
75468 + * not wake up a thread
75469 + * which put this
75470 + * request
75471 + */
75472 + list_del_init(&rq->link);
75473 + ent->nr_todo_reqs --;
75474 + list_add_tail(&rq->link, &ent->done_list);
75475 + ent->nr_done_reqs ++;
75476 + rq->written = 1;
75477 + break;
75478 + }
75479 + }
75480 + }
75481 + spin_unlock(&ent->guard);
75482 + }
75483 +
75484 + clear_page_dirty_for_io(pg);
75485 +
75486 + unlock_page(pg);
75487 +
75488 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75489 + nr_used++;
75490 + }
75491 + if (nr_used > 0) {
75492 + assert("nikita-3453",
75493 + bio->bi_size == super->s_blocksize * nr_used);
75494 + assert("nikita-3454", bio->bi_vcnt == nr_used);
75495 +
75496 + /* Check if we are allowed to write at all */
75497 + if (super->s_flags & MS_RDONLY)
75498 + undo_bio(bio);
75499 + else {
75500 + int not_supported;
75501 +
75502 + add_fq_to_bio(fq, bio);
75503 + bio_get(bio);
75504 + reiser4_submit_bio(write_op, bio);
75505 + not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
75506 + bio_put(bio);
75507 + if (not_supported)
75508 + return -EOPNOTSUPP;
75509 + }
75510 +
75511 + block += nr_used - 1;
75512 + update_blocknr_hint_default(super, &block);
75513 + block += 1;
75514 + } else {
75515 + bio_put(bio);
75516 + }
75517 + nr -= nr_used;
75518 + }
75519 +
75520 + return 0;
75521 +}
75522 +
75523 +/* This is a procedure which recovers a contiguous sequences of disk block
75524 + numbers in the given list of j-nodes and submits write requests on this
75525 + per-sequence basis */
75526 +int
75527 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
75528 + long *nr_submitted, int flags)
75529 +{
75530 + int ret;
75531 + jnode *beg = list_entry(head->next, jnode, capture_link);
75532 +
75533 + while (head != &beg->capture_link) {
75534 + int nr = 1;
75535 + jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
75536 +
75537 + while (head != &cur->capture_link) {
75538 + if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
75539 + break;
75540 + ++nr;
75541 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75542 + }
75543 +
75544 + ret = write_jnodes_to_disk_extent(
75545 + beg, nr, jnode_get_block(beg), fq, flags);
75546 + if (ret)
75547 + return ret;
75548 +
75549 + if (nr_submitted)
75550 + *nr_submitted += nr;
75551 +
75552 + beg = cur;
75553 + }
75554 +
75555 + return 0;
75556 +}
75557 +
75558 +/* add given wandered mapping to atom's wandered map */
75559 +static int
75560 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
75561 +{
75562 + int ret;
75563 + blocknr_set_entry *new_bsep = NULL;
75564 + reiser4_block_nr block;
75565 +
75566 + txn_atom *atom;
75567 +
75568 + assert("zam-568", block_p != NULL);
75569 + block = *block_p;
75570 + assert("zam-569", len > 0);
75571 +
75572 + while ((len--) > 0) {
75573 + do {
75574 + atom = get_current_atom_locked();
75575 + assert("zam-536",
75576 + !reiser4_blocknr_is_fake(jnode_get_block(cur)));
75577 + ret =
75578 + blocknr_set_add_pair(atom, &atom->wandered_map,
75579 + &new_bsep,
75580 + jnode_get_block(cur), &block);
75581 + } while (ret == -E_REPEAT);
75582 +
75583 + if (ret) {
75584 + /* deallocate blocks which were not added to wandered
75585 + map */
75586 + reiser4_block_nr wide_len = len;
75587 +
75588 + reiser4_dealloc_blocks(&block, &wide_len,
75589 + BLOCK_NOT_COUNTED,
75590 + BA_FORMATTED
75591 + /* formatted, without defer */ );
75592 +
75593 + return ret;
75594 + }
75595 +
75596 + spin_unlock_atom(atom);
75597 +
75598 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75599 + ++block;
75600 + }
75601 +
75602 + return 0;
75603 +}
75604 +
75605 +/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
75606 + submit IO for allocated blocks. We assume that current atom is in a stage
75607 + when any atom fusion is impossible and atom is unlocked and it is safe. */
75608 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
75609 +{
75610 + reiser4_block_nr block;
75611 +
75612 + int rest;
75613 + int len;
75614 + int ret;
75615 +
75616 + jnode *cur;
75617 +
75618 + assert("zam-534", ch->overwrite_set_size > 0);
75619 +
75620 + rest = ch->overwrite_set_size;
75621 +
75622 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75623 + while (ch->overwrite_set != &cur->capture_link) {
75624 + assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
75625 +
75626 + ret = get_more_wandered_blocks(rest, &block, &len);
75627 + if (ret)
75628 + return ret;
75629 +
75630 + rest -= len;
75631 +
75632 + ret = add_region_to_wmap(cur, len, &block);
75633 + if (ret)
75634 + return ret;
75635 +
75636 + ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
75637 + if (ret)
75638 + return ret;
75639 +
75640 + while ((len--) > 0) {
75641 + assert("zam-604",
75642 + ch->overwrite_set != &cur->capture_link);
75643 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75644 + }
75645 + }
75646 +
75647 + return 0;
75648 +}
75649 +
75650 +/* allocate given number of nodes over the journal area and link them into a
75651 + list, return pointer to the first jnode in the list */
75652 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
75653 +{
75654 + reiser4_blocknr_hint hint;
75655 + reiser4_block_nr allocated = 0;
75656 + reiser4_block_nr first, len;
75657 + jnode *cur;
75658 + jnode *txhead;
75659 + int ret;
75660 + reiser4_context *ctx;
75661 + reiser4_super_info_data *sbinfo;
75662 +
75663 + assert("zam-698", ch->tx_size > 0);
75664 + assert("zam-699", list_empty_careful(&ch->tx_list));
75665 +
75666 + ctx = get_current_context();
75667 + sbinfo = get_super_private(ctx->super);
75668 +
75669 + while (allocated < (unsigned)ch->tx_size) {
75670 + len = (ch->tx_size - allocated);
75671 +
75672 + reiser4_blocknr_hint_init(&hint);
75673 +
75674 + hint.block_stage = BLOCK_GRABBED;
75675 +
75676 + /* FIXME: there should be some block allocation policy for
75677 + nodes which contain wander records */
75678 +
75679 + /* We assume that disk space for wandered record blocks can be
75680 + * taken from reserved area. */
75681 + ret = reiser4_alloc_blocks(&hint, &first, &len,
75682 + BA_FORMATTED | BA_RESERVED |
75683 + BA_USE_DEFAULT_SEARCH_START);
75684 + reiser4_blocknr_hint_done(&hint);
75685 +
75686 + if (ret)
75687 + return ret;
75688 +
75689 + allocated += len;
75690 +
75691 + /* create jnodes for all wander records */
75692 + while (len--) {
75693 + cur = reiser4_alloc_io_head(&first);
75694 +
75695 + if (cur == NULL) {
75696 + ret = RETERR(-ENOMEM);
75697 + goto free_not_assigned;
75698 + }
75699 +
75700 + ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
75701 +
75702 + if (ret != 0) {
75703 + jfree(cur);
75704 + goto free_not_assigned;
75705 + }
75706 +
75707 + pin_jnode_data(cur);
75708 +
75709 + list_add_tail(&cur->capture_link, &ch->tx_list);
75710 +
75711 + first++;
75712 + }
75713 + }
75714 +
75715 + { /* format a on-disk linked list of wander records */
75716 + int serial = 1;
75717 +
75718 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
75719 + format_tx_head(ch);
75720 +
75721 + cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75722 + while (&ch->tx_list != &cur->capture_link) {
75723 + format_wander_record(ch, cur, serial++);
75724 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75725 + }
75726 + }
75727 +
75728 + { /* Fill wander records with Wandered Set */
75729 + struct store_wmap_params params;
75730 + txn_atom *atom;
75731 +
75732 + params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75733 +
75734 + params.idx = 0;
75735 + params.capacity =
75736 + wander_record_capacity(reiser4_get_current_sb());
75737 +
75738 + atom = get_current_atom_locked();
75739 + blocknr_set_iterator(atom, &atom->wandered_map,
75740 + &store_wmap_actor, &params, 0);
75741 + spin_unlock_atom(atom);
75742 + }
75743 +
75744 + { /* relse all jnodes from tx_list */
75745 + cur = list_entry(ch->tx_list.next, jnode, capture_link);
75746 + while (&ch->tx_list != &cur->capture_link) {
75747 + jrelse(cur);
75748 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
75749 + }
75750 + }
75751 +
75752 + ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
75753 +
75754 + return ret;
75755 +
75756 + free_not_assigned:
75757 + /* We deallocate blocks not yet assigned to jnodes on tx_list. The
75758 + caller takes care about invalidating of tx list */
75759 + reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
75760 +
75761 + return ret;
75762 +}
75763 +
75764 +static int commit_tx(struct commit_handle *ch)
75765 +{
75766 + flush_queue_t *fq;
75767 + int barrier;
75768 + int ret;
75769 +
75770 + /* Grab more space for wandered records. */
75771 + ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
75772 + if (ret)
75773 + return ret;
75774 +
75775 + fq = get_fq_for_current_atom();
75776 + if (IS_ERR(fq))
75777 + return PTR_ERR(fq);
75778 +
75779 + spin_unlock_atom(fq->atom);
75780 + do {
75781 + ret = alloc_wandered_blocks(ch, fq);
75782 + if (ret)
75783 + break;
75784 + ret = alloc_tx(ch, fq);
75785 + if (ret)
75786 + break;
75787 + } while (0);
75788 +
75789 + reiser4_fq_put(fq);
75790 + if (ret)
75791 + return ret;
75792 + repeat_wo_barrier:
75793 + barrier = reiser4_use_write_barrier(ch->super);
75794 + if (!barrier) {
75795 + ret = current_atom_finish_all_fq();
75796 + if (ret)
75797 + return ret;
75798 + }
75799 + ret = update_journal_header(ch, barrier);
75800 + if (barrier) {
75801 + if (ret) {
75802 + if (ret == -EOPNOTSUPP) {
75803 + disable_write_barrier(ch->super);
75804 + goto repeat_wo_barrier;
75805 + }
75806 + return ret;
75807 + }
75808 + ret = current_atom_finish_all_fq();
75809 + }
75810 + return ret;
75811 +}
75812 +
75813 +static int write_tx_back(struct commit_handle * ch)
75814 +{
75815 + flush_queue_t *fq;
75816 + int ret;
75817 + int barrier;
75818 +
75819 + reiser4_post_commit_hook();
75820 + fq = get_fq_for_current_atom();
75821 + if (IS_ERR(fq))
75822 + return PTR_ERR(fq);
75823 + spin_unlock_atom(fq->atom);
75824 + ret = write_jnode_list(
75825 + ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
75826 + reiser4_fq_put(fq);
75827 + if (ret)
75828 + return ret;
75829 + repeat_wo_barrier:
75830 + barrier = reiser4_use_write_barrier(ch->super);
75831 + if (!barrier) {
75832 + ret = current_atom_finish_all_fq();
75833 + if (ret)
75834 + return ret;
75835 + }
75836 + ret = update_journal_footer(ch, barrier);
75837 + if (barrier) {
75838 + if (ret) {
75839 + if (ret == -EOPNOTSUPP) {
75840 + disable_write_barrier(ch->super);
75841 + goto repeat_wo_barrier;
75842 + }
75843 + return ret;
75844 + }
75845 + ret = current_atom_finish_all_fq();
75846 + }
75847 + if (ret)
75848 + return ret;
75849 + reiser4_post_write_back_hook();
75850 + return 0;
75851 +}
75852 +
75853 +/* We assume that at this moment all captured blocks are marked as RELOC or
75854 + WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set
75855 + are submitted to write.
75856 +*/
75857 +
75858 +int reiser4_write_logs(long *nr_submitted)
75859 +{
75860 + txn_atom *atom;
75861 + struct super_block *super = reiser4_get_current_sb();
75862 + reiser4_super_info_data *sbinfo = get_super_private(super);
75863 + struct commit_handle ch;
75864 + int ret;
75865 +
75866 + writeout_mode_enable();
75867 +
75868 + /* block allocator may add j-nodes to the clean_list */
75869 + ret = reiser4_pre_commit_hook();
75870 + if (ret)
75871 + return ret;
75872 +
75873 + /* No locks are required if we take atom which stage >=
75874 + * ASTAGE_PRE_COMMIT */
75875 + atom = get_current_context()->trans->atom;
75876 + assert("zam-965", atom != NULL);
75877 +
75878 + /* relocate set is on the atom->clean_nodes list after
75879 + * current_atom_complete_writes() finishes. It can be safely
75880 + * uncaptured after commit_mutex is locked, because any atom that
75881 + * captures these nodes is guaranteed to commit after current one.
75882 + *
75883 + * This can only be done after reiser4_pre_commit_hook(), because it is where
75884 + * early flushed jnodes with CREATED bit are transferred to the
75885 + * overwrite list. */
75886 + reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
75887 + spin_lock_atom(atom);
75888 + /* There might be waiters for the relocate nodes which we have
75889 + * released, wake them up. */
75890 + reiser4_atom_send_event(atom);
75891 + spin_unlock_atom(atom);
75892 +
75893 + if (REISER4_DEBUG) {
75894 + int level;
75895 +
75896 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
75897 + assert("nikita-3352",
75898 + list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
75899 + }
75900 +
75901 + sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
75902 + sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
75903 +
75904 + init_commit_handle(&ch, atom);
75905 +
75906 + ch.free_blocks = sbinfo->blocks_free_committed;
75907 + ch.nr_files = sbinfo->nr_files_committed;
75908 + /* ZAM-FIXME-HANS: email me what the contention level is for the super
75909 + * lock. */
75910 + ch.next_oid = oid_next(super);
75911 +
75912 + /* count overwrite set and place it in a separate list */
75913 + ret = get_overwrite_set(&ch);
75914 +
75915 + if (ret <= 0) {
75916 + /* It is possible that overwrite set is empty here, it means
75917 + all captured nodes are clean */
75918 + goto up_and_ret;
75919 + }
75920 +
75921 + /* Inform the caller about what number of dirty pages will be
75922 + * submitted to disk. */
75923 + *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
75924 +
75925 + /* count all records needed for storing of the wandered set */
75926 + get_tx_size(&ch);
75927 +
75928 + ret = commit_tx(&ch);
75929 + if (ret)
75930 + goto up_and_ret;
75931 +
75932 + spin_lock_atom(atom);
75933 + reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
75934 + spin_unlock_atom(atom);
75935 +
75936 + ret = write_tx_back(&ch);
75937 + reiser4_post_write_back_hook();
75938 +
75939 + up_and_ret:
75940 + if (ret) {
75941 + /* there could be fq attached to current atom; the only way to
75942 + remove them is: */
75943 + current_atom_finish_all_fq();
75944 + }
75945 +
75946 + /* free blocks of flushed transaction */
75947 + dealloc_tx_list(&ch);
75948 + dealloc_wmap(&ch);
75949 +
75950 + put_overwrite_set(&ch);
75951 +
75952 + done_commit_handle(&ch);
75953 +
75954 + writeout_mode_disable();
75955 +
75956 + return ret;
75957 +}
75958 +
75959 +/* consistency checks for journal data/control blocks: header, footer, log
75960 + records, transactions head blocks. All functions return zero on success. */
75961 +
75962 +static int check_journal_header(const jnode * node UNUSED_ARG)
75963 +{
75964 + /* FIXME: journal header has no magic field yet. */
75965 + return 0;
75966 +}
75967 +
75968 +/* wait for write completion for all jnodes from given list */
75969 +static int wait_on_jnode_list(struct list_head *head)
75970 +{
75971 + jnode *scan;
75972 + int ret = 0;
75973 +
75974 + list_for_each_entry(scan, head, capture_link) {
75975 + struct page *pg = jnode_page(scan);
75976 +
75977 + if (pg) {
75978 + if (PageWriteback(pg))
75979 + wait_on_page_writeback(pg);
75980 +
75981 + if (PageError(pg))
75982 + ret++;
75983 + }
75984 + }
75985 +
75986 + return ret;
75987 +}
75988 +
75989 +static int check_journal_footer(const jnode * node UNUSED_ARG)
75990 +{
75991 + /* FIXME: journal footer has no magic field yet. */
75992 + return 0;
75993 +}
75994 +
75995 +static int check_tx_head(const jnode * node)
75996 +{
75997 + struct tx_header *header = (struct tx_header *)jdata(node);
75998 +
75999 + if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
76000 + warning("zam-627", "tx head at block %s corrupted\n",
76001 + sprint_address(jnode_get_block(node)));
76002 + return RETERR(-EIO);
76003 + }
76004 +
76005 + return 0;
76006 +}
76007 +
76008 +static int check_wander_record(const jnode * node)
76009 +{
76010 + struct wander_record_header *RH =
76011 + (struct wander_record_header *)jdata(node);
76012 +
76013 + if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
76014 + 0) {
76015 + warning("zam-628", "wander record at block %s corrupted\n",
76016 + sprint_address(jnode_get_block(node)));
76017 + return RETERR(-EIO);
76018 + }
76019 +
76020 + return 0;
76021 +}
76022 +
76023 +/* fill commit_handler structure by everything what is needed for update_journal_footer */
76024 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
76025 +{
76026 + struct tx_header *TXH;
76027 + int ret;
76028 +
76029 + ret = jload(tx_head);
76030 + if (ret)
76031 + return ret;
76032 +
76033 + TXH = (struct tx_header *)jdata(tx_head);
76034 +
76035 + ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
76036 + ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
76037 + ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
76038 +
76039 + jrelse(tx_head);
76040 +
76041 + list_add(&tx_head->capture_link, &ch->tx_list);
76042 +
76043 + return 0;
76044 +}
76045 +
76046 +/* replay one transaction: restore and write overwrite set in place */
76047 +static int replay_transaction(const struct super_block *s,
76048 + jnode * tx_head,
76049 + const reiser4_block_nr * log_rec_block_p,
76050 + const reiser4_block_nr * end_block,
76051 + unsigned int nr_wander_records)
76052 +{
76053 + reiser4_block_nr log_rec_block = *log_rec_block_p;
76054 + struct commit_handle ch;
76055 + LIST_HEAD(overwrite_set);
76056 + jnode *log;
76057 + int ret;
76058 +
76059 + init_commit_handle(&ch, NULL);
76060 + ch.overwrite_set = &overwrite_set;
76061 +
76062 + restore_commit_handle(&ch, tx_head);
76063 +
76064 + while (log_rec_block != *end_block) {
76065 + struct wander_record_header *header;
76066 + struct wander_entry *entry;
76067 +
76068 + int i;
76069 +
76070 + if (nr_wander_records == 0) {
76071 + warning("zam-631",
76072 + "number of wander records in the linked list"
76073 + " greater than number stored in tx head.\n");
76074 + ret = RETERR(-EIO);
76075 + goto free_ow_set;
76076 + }
76077 +
76078 + log = reiser4_alloc_io_head(&log_rec_block);
76079 + if (log == NULL)
76080 + return RETERR(-ENOMEM);
76081 +
76082 + ret = jload(log);
76083 + if (ret < 0) {
76084 + reiser4_drop_io_head(log);
76085 + return ret;
76086 + }
76087 +
76088 + ret = check_wander_record(log);
76089 + if (ret) {
76090 + jrelse(log);
76091 + reiser4_drop_io_head(log);
76092 + return ret;
76093 + }
76094 +
76095 + header = (struct wander_record_header *)jdata(log);
76096 + log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
76097 +
76098 + entry = (struct wander_entry *)(header + 1);
76099 +
76100 + /* restore overwrite set from wander record content */
76101 + for (i = 0; i < wander_record_capacity(s); i++) {
76102 + reiser4_block_nr block;
76103 + jnode *node;
76104 +
76105 + block = le64_to_cpu(get_unaligned(&entry->wandered));
76106 + if (block == 0)
76107 + break;
76108 +
76109 + node = reiser4_alloc_io_head(&block);
76110 + if (node == NULL) {
76111 + ret = RETERR(-ENOMEM);
76112 + /*
76113 + * FIXME-VS:???
76114 + */
76115 + jrelse(log);
76116 + reiser4_drop_io_head(log);
76117 + goto free_ow_set;
76118 + }
76119 +
76120 + ret = jload(node);
76121 +
76122 + if (ret < 0) {
76123 + reiser4_drop_io_head(node);
76124 + /*
76125 + * FIXME-VS:???
76126 + */
76127 + jrelse(log);
76128 + reiser4_drop_io_head(log);
76129 + goto free_ow_set;
76130 + }
76131 +
76132 + block = le64_to_cpu(get_unaligned(&entry->original));
76133 +
76134 + assert("zam-603", block != 0);
76135 +
76136 + jnode_set_block(node, &block);
76137 +
76138 + list_add_tail(&node->capture_link, ch.overwrite_set);
76139 +
76140 + ++entry;
76141 + }
76142 +
76143 + jrelse(log);
76144 + reiser4_drop_io_head(log);
76145 +
76146 + --nr_wander_records;
76147 + }
76148 +
76149 + if (nr_wander_records != 0) {
76150 + warning("zam-632", "number of wander records in the linked list"
76151 + " less than number stored in tx head.\n");
76152 + ret = RETERR(-EIO);
76153 + goto free_ow_set;
76154 + }
76155 +
76156 + { /* write wandered set in place */
76157 + write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
76158 + ret = wait_on_jnode_list(ch.overwrite_set);
76159 +
76160 + if (ret) {
76161 + ret = RETERR(-EIO);
76162 + goto free_ow_set;
76163 + }
76164 + }
76165 +
76166 + ret = update_journal_footer(&ch, 0);
76167 +
76168 + free_ow_set:
76169 +
76170 + while (!list_empty(ch.overwrite_set)) {
76171 + jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
76172 + list_del_init(&cur->capture_link);
76173 + jrelse(cur);
76174 + reiser4_drop_io_head(cur);
76175 + }
76176 +
76177 + list_del_init(&tx_head->capture_link);
76178 +
76179 + done_commit_handle(&ch);
76180 +
76181 + return ret;
76182 +}
76183 +
76184 +/* find oldest committed and not played transaction and play it. The transaction
76185 + * was committed and journal header block was updated but the blocks from the
76186 + * process of writing the atom's overwrite set in-place and updating of journal
76187 + * footer block were not completed. This function completes the process by
76188 + * recovering the atom's overwrite set from their wandered locations and writes
76189 + * them in-place and updating the journal footer. */
76190 +static int replay_oldest_transaction(struct super_block *s)
76191 +{
76192 + reiser4_super_info_data *sbinfo = get_super_private(s);
76193 + jnode *jf = sbinfo->journal_footer;
76194 + unsigned int total;
76195 + struct journal_footer *F;
76196 + struct tx_header *T;
76197 +
76198 + reiser4_block_nr prev_tx;
76199 + reiser4_block_nr last_flushed_tx;
76200 + reiser4_block_nr log_rec_block = 0;
76201 +
76202 + jnode *tx_head;
76203 +
76204 + int ret;
76205 +
76206 + if ((ret = jload(jf)) < 0)
76207 + return ret;
76208 +
76209 + F = (struct journal_footer *)jdata(jf);
76210 +
76211 + last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
76212 +
76213 + jrelse(jf);
76214 +
76215 + if (sbinfo->last_committed_tx == last_flushed_tx) {
76216 + /* all transactions are replayed */
76217 + return 0;
76218 + }
76219 +
76220 + prev_tx = sbinfo->last_committed_tx;
76221 +
76222 + /* searching for oldest not flushed transaction */
76223 + while (1) {
76224 + tx_head = reiser4_alloc_io_head(&prev_tx);
76225 + if (!tx_head)
76226 + return RETERR(-ENOMEM);
76227 +
76228 + ret = jload(tx_head);
76229 + if (ret < 0) {
76230 + reiser4_drop_io_head(tx_head);
76231 + return ret;
76232 + }
76233 +
76234 + ret = check_tx_head(tx_head);
76235 + if (ret) {
76236 + jrelse(tx_head);
76237 + reiser4_drop_io_head(tx_head);
76238 + return ret;
76239 + }
76240 +
76241 + T = (struct tx_header *)jdata(tx_head);
76242 +
76243 + prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
76244 +
76245 + if (prev_tx == last_flushed_tx)
76246 + break;
76247 +
76248 + jrelse(tx_head);
76249 + reiser4_drop_io_head(tx_head);
76250 + }
76251 +
76252 + total = le32_to_cpu(get_unaligned(&T->total));
76253 + log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
76254 +
76255 + pin_jnode_data(tx_head);
76256 + jrelse(tx_head);
76257 +
76258 + ret =
76259 + replay_transaction(s, tx_head, &log_rec_block,
76260 + jnode_get_block(tx_head), total - 1);
76261 +
76262 + unpin_jnode_data(tx_head);
76263 + reiser4_drop_io_head(tx_head);
76264 +
76265 + if (ret)
76266 + return ret;
76267 + return -E_REPEAT;
76268 +}
76269 +
76270 +/* The reiser4 journal current implementation was optimized to not to capture
76271 + super block if certain super blocks fields are modified. Currently, the set
76272 + is (<free block count>, <OID allocator>). These fields are logged by
76273 + special way which includes storing them in each transaction head block at
76274 + atom commit time and writing that information to journal footer block at
76275 + atom flush time. For getting info from journal footer block to the
76276 + in-memory super block there is a special function
76277 + reiser4_journal_recover_sb_data() which should be called after disk format
76278 + plugin re-reads super block after journal replaying.
76279 +*/
76280 +
76281 +/* get the information from journal footer in-memory super block */
76282 +int reiser4_journal_recover_sb_data(struct super_block *s)
76283 +{
76284 + reiser4_super_info_data *sbinfo = get_super_private(s);
76285 + struct journal_footer *jf;
76286 + int ret;
76287 +
76288 + assert("zam-673", sbinfo->journal_footer != NULL);
76289 +
76290 + ret = jload(sbinfo->journal_footer);
76291 + if (ret != 0)
76292 + return ret;
76293 +
76294 + ret = check_journal_footer(sbinfo->journal_footer);
76295 + if (ret != 0)
76296 + goto out;
76297 +
76298 + jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
76299 +
76300 + /* was there at least one flushed transaction? */
76301 + if (jf->last_flushed_tx) {
76302 +
76303 + /* restore free block counter logged in this transaction */
76304 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
76305 +
76306 + /* restore oid allocator state */
76307 + oid_init_allocator(s,
76308 + le64_to_cpu(get_unaligned(&jf->nr_files)),
76309 + le64_to_cpu(get_unaligned(&jf->next_oid)));
76310 + }
76311 + out:
76312 + jrelse(sbinfo->journal_footer);
76313 + return ret;
76314 +}
76315 +
76316 +/* reiser4 replay journal procedure */
76317 +int reiser4_journal_replay(struct super_block *s)
76318 +{
76319 + reiser4_super_info_data *sbinfo = get_super_private(s);
76320 + jnode *jh, *jf;
76321 + struct journal_header *header;
76322 + int nr_tx_replayed = 0;
76323 + int ret;
76324 +
76325 + assert("zam-582", sbinfo != NULL);
76326 +
76327 + jh = sbinfo->journal_header;
76328 + jf = sbinfo->journal_footer;
76329 +
76330 + if (!jh || !jf) {
76331 + /* it is possible that disk layout does not support journal
76332 + structures, we just warn about this */
76333 + warning("zam-583",
76334 + "journal control blocks were not loaded by disk layout plugin. "
76335 + "journal replaying is not possible.\n");
76336 + return 0;
76337 + }
76338 +
76339 + /* Take free block count from journal footer block. The free block
76340 + counter value corresponds the last flushed transaction state */
76341 + ret = jload(jf);
76342 + if (ret < 0)
76343 + return ret;
76344 +
76345 + ret = check_journal_footer(jf);
76346 + if (ret) {
76347 + jrelse(jf);
76348 + return ret;
76349 + }
76350 +
76351 + jrelse(jf);
76352 +
76353 + /* store last committed transaction info in reiser4 in-memory super
76354 + block */
76355 + ret = jload(jh);
76356 + if (ret < 0)
76357 + return ret;
76358 +
76359 + ret = check_journal_header(jh);
76360 + if (ret) {
76361 + jrelse(jh);
76362 + return ret;
76363 + }
76364 +
76365 + header = (struct journal_header *)jdata(jh);
76366 + sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
76367 +
76368 + jrelse(jh);
76369 +
76370 + /* replay committed transactions */
76371 + while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
76372 + nr_tx_replayed++;
76373 +
76374 + return ret;
76375 +}
76376 +
76377 +/* load journal control block (either journal header or journal footer block) */
76378 +static int
76379 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
76380 +{
76381 + int ret;
76382 +
76383 + *node = reiser4_alloc_io_head(block);
76384 + if (!(*node))
76385 + return RETERR(-ENOMEM);
76386 +
76387 + ret = jload(*node);
76388 +
76389 + if (ret) {
76390 + reiser4_drop_io_head(*node);
76391 + *node = NULL;
76392 + return ret;
76393 + }
76394 +
76395 + pin_jnode_data(*node);
76396 + jrelse(*node);
76397 +
76398 + return 0;
76399 +}
76400 +
76401 +/* unload journal header or footer and free jnode */
76402 +static void unload_journal_control_block(jnode ** node)
76403 +{
76404 + if (*node) {
76405 + unpin_jnode_data(*node);
76406 + reiser4_drop_io_head(*node);
76407 + *node = NULL;
76408 + }
76409 +}
76410 +
76411 +/* release journal control blocks */
76412 +void reiser4_done_journal_info(struct super_block *s)
76413 +{
76414 + reiser4_super_info_data *sbinfo = get_super_private(s);
76415 +
76416 + assert("zam-476", sbinfo != NULL);
76417 +
76418 + unload_journal_control_block(&sbinfo->journal_header);
76419 + unload_journal_control_block(&sbinfo->journal_footer);
76420 + rcu_barrier();
76421 +}
76422 +
76423 +/* load journal control blocks */
76424 +int reiser4_init_journal_info(struct super_block *s)
76425 +{
76426 + reiser4_super_info_data *sbinfo = get_super_private(s);
76427 + journal_location *loc;
76428 + int ret;
76429 +
76430 + loc = &sbinfo->jloc;
76431 +
76432 + assert("zam-651", loc != NULL);
76433 + assert("zam-652", loc->header != 0);
76434 + assert("zam-653", loc->footer != 0);
76435 +
76436 + ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
76437 +
76438 + if (ret)
76439 + return ret;
76440 +
76441 + ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
76442 +
76443 + if (ret) {
76444 + unload_journal_control_block(&sbinfo->journal_header);
76445 + }
76446 +
76447 + return ret;
76448 +}
76449 +
76450 +/* Make Linus happy.
76451 + Local variables:
76452 + c-indentation-style: "K&R"
76453 + mode-name: "LC"
76454 + c-basic-offset: 8
76455 + tab-width: 8
76456 + fill-column: 80
76457 + End:
76458 +*/
76459 diff -urN linux-2.6.23.orig/fs/reiser4/wander.h linux-2.6.23/fs/reiser4/wander.h
76460 --- linux-2.6.23.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
76461 +++ linux-2.6.23/fs/reiser4/wander.h 2007-12-04 16:49:30.000000000 +0300
76462 @@ -0,0 +1,135 @@
76463 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
76464 +
76465 +#if !defined (__FS_REISER4_WANDER_H__)
76466 +#define __FS_REISER4_WANDER_H__
76467 +
76468 +#include "dformat.h"
76469 +
76470 +#include <linux/fs.h> /* for struct super_block */
76471 +
76472 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
76473 +
76474 +#define TX_HEADER_MAGIC "TxMagic4"
76475 +#define WANDER_RECORD_MAGIC "LogMagc4"
76476 +
76477 +#define TX_HEADER_MAGIC_SIZE (8)
76478 +#define WANDER_RECORD_MAGIC_SIZE (8)
76479 +
76480 +/* journal header block format */
76481 +struct journal_header {
76482 + /* last written transaction head location */
76483 + d64 last_committed_tx;
76484 +};
76485 +
76486 +typedef struct journal_location {
76487 + reiser4_block_nr footer;
76488 + reiser4_block_nr header;
76489 +} journal_location;
76490 +
76491 +/* The wander.c head comment describes usage and semantic of all these structures */
76492 +/* journal footer block format */
76493 +struct journal_footer {
76494 + /* last flushed transaction location. */
76495 + /* This block number is no more valid after the transaction it points
76496 + to gets flushed, this number is used only at journal replaying time
76497 + for detection of the end of on-disk list of committed transactions
76498 + which were not flushed completely */
76499 + d64 last_flushed_tx;
76500 +
76501 + /* free block counter is written in journal footer at transaction
76502 + flushing , not in super block because free blocks counter is logged
76503 + by another way than super block fields (root pointer, for
76504 + example). */
76505 + d64 free_blocks;
76506 +
76507 + /* number of used OIDs and maximal used OID are logged separately from
76508 + super block */
76509 + d64 nr_files;
76510 + d64 next_oid;
76511 +};
76512 +
76513 +/* Each wander record (except the first one) has unified format with wander
76514 + record header followed by an array of log entries */
76515 +struct wander_record_header {
76516 + /* when there is no predefined location for wander records, this magic
76517 + string should help reiser4fsck. */
76518 + char magic[WANDER_RECORD_MAGIC_SIZE];
76519 +
76520 + /* transaction id */
76521 + d64 id;
76522 +
76523 + /* total number of wander records in current transaction */
76524 + d32 total;
76525 +
76526 + /* this block number in transaction */
76527 + d32 serial;
76528 +
76529 + /* number of previous block in commit */
76530 + d64 next_block;
76531 +};
76532 +
76533 +/* The first wander record (transaction head) of written transaction has the
76534 + special format */
76535 +struct tx_header {
76536 + /* magic string makes first block in transaction different from other
76537 + logged blocks, it should help fsck. */
76538 + char magic[TX_HEADER_MAGIC_SIZE];
76539 +
76540 + /* transaction id */
76541 + d64 id;
76542 +
76543 + /* total number of records (including this first tx head) in the
76544 + transaction */
76545 + d32 total;
76546 +
76547 + /* align next field to 8-byte boundary; this field always is zero */
76548 + d32 padding;
76549 +
76550 + /* block number of previous transaction head */
76551 + d64 prev_tx;
76552 +
76553 + /* next wander record location */
76554 + d64 next_block;
76555 +
76556 + /* committed versions of free blocks counter */
76557 + d64 free_blocks;
76558 +
76559 + /* number of used OIDs (nr_files) and maximal used OID are logged
76560 + separately from super block */
76561 + d64 nr_files;
76562 + d64 next_oid;
76563 +};
76564 +
76565 +/* A transaction gets written to disk as a set of wander records (each wander
76566 + record size is fs block) */
76567 +
76568 +/* As it was told above a wander The rest of wander record is filled by these log entries, unused space filled
76569 + by zeroes */
76570 +struct wander_entry {
76571 + d64 original; /* block original location */
76572 + d64 wandered; /* block wandered location */
76573 +};
76574 +
76575 +/* REISER4 JOURNAL WRITER FUNCTIONS */
76576 +
76577 +extern int reiser4_write_logs(long *);
76578 +extern int reiser4_journal_replay(struct super_block *);
76579 +extern int reiser4_journal_recover_sb_data(struct super_block *);
76580 +
76581 +extern int reiser4_init_journal_info(struct super_block *);
76582 +extern void reiser4_done_journal_info(struct super_block *);
76583 +
76584 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
76585 +
76586 +#endif /* __FS_REISER4_WANDER_H__ */
76587 +
76588 +/* Make Linus happy.
76589 + Local variables:
76590 + c-indentation-style: "K&R"
76591 + mode-name: "LC"
76592 + c-basic-offset: 8
76593 + tab-width: 8
76594 + fill-column: 80
76595 + scroll-step: 1
76596 + End:
76597 +*/
76598 diff -urN linux-2.6.23.orig/fs/reiser4/writeout.h linux-2.6.23/fs/reiser4/writeout.h
76599 --- linux-2.6.23.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
76600 +++ linux-2.6.23/fs/reiser4/writeout.h 2007-12-04 16:49:30.000000000 +0300
76601 @@ -0,0 +1,21 @@
76602 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
76603 +
76604 +#if !defined (__FS_REISER4_WRITEOUT_H__)
76605 +
76606 +#define WRITEOUT_SINGLE_STREAM (0x1)
76607 +#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
76608 +#define WRITEOUT_BARRIER (0x4)
76609 +
76610 +extern int reiser4_get_writeout_flags(void);
76611 +
76612 +#endif /* __FS_REISER4_WRITEOUT_H__ */
76613 +
76614 +/* Make Linus happy.
76615 + Local variables:
76616 + c-indentation-style: "K&R"
76617 + mode-name: "LC"
76618 + c-basic-offset: 8
76619 + tab-width: 8
76620 + fill-column: 80
76621 + End:
76622 +*/
76623 diff -urN linux-2.6.23.orig/fs/reiser4/znode.c linux-2.6.23/fs/reiser4/znode.c
76624 --- linux-2.6.23.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
76625 +++ linux-2.6.23/fs/reiser4/znode.c 2007-12-04 16:49:30.000000000 +0300
76626 @@ -0,0 +1,1029 @@
76627 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76628 + * reiser4/README */
76629 +/* Znode manipulation functions. */
76630 +/* Znode is the in-memory header for a tree node. It is stored
76631 + separately from the node itself so that it does not get written to
76632 + disk. In this respect znode is like buffer head or page head. We
76633 + also use znodes for additional reiser4 specific purposes:
76634 +
76635 + . they are organized into tree structure which is a part of whole
76636 + reiser4 tree.
76637 + . they are used to implement node grained locking
76638 + . they are used to keep additional state associated with a
76639 + node
76640 + . they contain links to lists used by the transaction manager
76641 +
76642 + Znode is attached to some variable "block number" which is instance of
76643 + fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
76644 + appropriate node being actually loaded in memory. Existence of znode itself
76645 + is regulated by reference count (->x_count) in it. Each time thread
76646 + acquires reference to znode through call to zget(), ->x_count is
76647 + incremented and decremented on call to zput(). Data (content of node) are
76648 + brought in memory through call to zload(), which also increments ->d_count
76649 + reference counter. zload can block waiting on IO. Call to zrelse()
76650 + decreases this counter. Also, ->c_count keeps track of number of child
76651 + znodes and prevents parent znode from being recycled until all of its
76652 + children are. ->c_count is decremented whenever child goes out of existence
76653 + (being actually recycled in zdestroy()) which can be some time after last
76654 + reference to this child dies if we support some form of LRU cache for
76655 + znodes.
76656 +
76657 +*/
76658 +/* EVERY ZNODE'S STORY
76659 +
76660 + 1. His infancy.
76661 +
76662 + Once upon a time, the znode was born deep inside of zget() by call to
76663 + zalloc(). At the return from zget() znode had:
76664 +
76665 + . reference counter (x_count) of 1
76666 + . assigned block number, marked as used in bitmap
76667 + . pointer to parent znode. Root znode parent pointer points
76668 + to its father: "fake" znode. This, in turn, has NULL parent pointer.
76669 + . hash table linkage
76670 + . no data loaded from disk
76671 + . no node plugin
76672 + . no sibling linkage
76673 +
76674 + 2. His childhood
76675 +
76676 + Each node is either brought into memory as a result of tree traversal, or
76677 + created afresh, creation of the root being a special case of the latter. In
76678 + either case it's inserted into sibling list. This will typically require
76679 + some ancillary tree traversing, but ultimately both sibling pointers will
76680 + exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
76681 + zjnode.state.
76682 +
76683 + 3. His youth.
76684 +
76685 + If znode is bound to already existing node in a tree, its content is read
76686 + from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
76687 + in zjnode.state and zdata() function starts to return non null for this
76688 + znode. zload() further calls zparse() that determines which node layout
76689 + this node is rendered in, and sets ->nplug on success.
76690 +
76691 + If znode is for new node just created, memory for it is allocated and
76692 + zinit_new() function is called to initialise data, according to selected
76693 + node layout.
76694 +
76695 + 4. His maturity.
76696 +
76697 + After this point, znode lingers in memory for some time. Threads can
76698 + acquire references to znode either by blocknr through call to zget(), or by
76699 + following a pointer to unallocated znode from internal item. Each time
76700 + reference to znode is obtained, x_count is increased. Thread can read/write
76701 + lock znode. Znode data can be loaded through calls to zload(), d_count will
76702 + be increased appropriately. If all references to znode are released
76703 + (x_count drops to 0), znode is not recycled immediately. Rather, it is
76704 + still cached in the hash table in the hope that it will be accessed
76705 + shortly.
76706 +
76707 + There are two ways in which znode existence can be terminated:
76708 +
76709 + . sudden death: node bound to this znode is removed from the tree
76710 + . overpopulation: znode is purged out of memory due to memory pressure
76711 +
76712 + 5. His death.
76713 +
76714 + Death is complex process.
76715 +
76716 + When we irrevocably commit ourselves to decision to remove node from the
76717 + tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
76718 + znode. This is done either in ->kill_hook() of internal item or in
76719 + reiser4_kill_root() function when tree root is removed.
76720 +
76721 + At this moment znode still has:
76722 +
76723 + . locks held on it, necessary write ones
76724 + . references to it
76725 + . disk block assigned to it
76726 + . data loaded from the disk
76727 + . pending requests for lock
76728 +
76729 + But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
76730 + deletion. Node deletion includes two phases. First all ways to get
76731 + references to that znode (sibling and parent links and hash lookup using
76732 + block number stored in parent node) should be deleted -- it is done through
76733 + sibling_list_remove(), also we assume that nobody uses down link from
76734 + parent node due to its nonexistence or proper parent node locking and
76735 + nobody uses parent pointers from children due to absence of them. Second we
76736 + invalidate all pending lock requests which still are on znode's lock
76737 + request queue, this is done by reiser4_invalidate_lock(). Another
76738 + JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
76739 + Once it set all requesters are forced to return -EINVAL from
76740 + longterm_lock_znode(). Future locking attempts are not possible because all
76741 + ways to get references to that znode are removed already. Last, node is
76742 + uncaptured from transaction.
76743 +
76744 + When last reference to the dying znode is just about to be released,
76745 + block number for this lock is released and znode is removed from the
76746 + hash table.
76747 +
76748 + Now znode can be recycled.
76749 +
76750 + [it's possible to free bitmap block and remove znode from the hash
76751 + table when last lock is released. This will result in having
76752 + referenced but completely orphaned znode]
76753 +
76754 + 6. Limbo
76755 +
76756 + As have been mentioned above znodes with reference counter 0 are
76757 + still cached in a hash table. Once memory pressure increases they are
76758 + purged out of there [this requires something like LRU list for
76759 + efficient implementation. LRU list would also greatly simplify
76760 + implementation of coord cache that would in this case morph to just
76761 + scanning some initial segment of LRU list]. Data loaded into
76762 + unreferenced znode are flushed back to the durable storage if
76763 + necessary and memory is freed. Znodes themselves can be recycled at
76764 + this point too.
76765 +
76766 +*/
76767 +
76768 +#include "debug.h"
76769 +#include "dformat.h"
76770 +#include "key.h"
76771 +#include "coord.h"
76772 +#include "plugin/plugin_header.h"
76773 +#include "plugin/node/node.h"
76774 +#include "plugin/plugin.h"
76775 +#include "txnmgr.h"
76776 +#include "jnode.h"
76777 +#include "znode.h"
76778 +#include "block_alloc.h"
76779 +#include "tree.h"
76780 +#include "tree_walk.h"
76781 +#include "super.h"
76782 +#include "reiser4.h"
76783 +
76784 +#include <linux/pagemap.h>
76785 +#include <linux/spinlock.h>
76786 +#include <linux/slab.h>
76787 +#include <linux/err.h>
76788 +
76789 +static z_hash_table *get_htable(reiser4_tree *,
76790 + const reiser4_block_nr * const blocknr);
76791 +static z_hash_table *znode_get_htable(const znode *);
76792 +static void zdrop(znode *);
76793 +
76794 +/* hash table support */
76795 +
76796 +/* compare two block numbers for equality. Used by hash-table macros */
76797 +static inline int
76798 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
76799 +{
76800 + assert("nikita-534", b1 != NULL);
76801 + assert("nikita-535", b2 != NULL);
76802 +
76803 + return *b1 == *b2;
76804 +}
76805 +
76806 +/* Hash znode by block number. Used by hash-table macros */
76807 +/* Audited by: umka (2002.06.11) */
76808 +static inline __u32
76809 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
76810 +{
76811 + assert("nikita-536", b != NULL);
76812 +
76813 + return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
76814 +}
76815 +
76816 +/* The hash table definition */
76817 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
76818 +#define KFREE(ptr, size) kfree(ptr)
76819 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
76820 + blknrhashfn, blknreq);
76821 +#undef KFREE
76822 +#undef KMALLOC
76823 +
76824 +/* slab for znodes */
76825 +static struct kmem_cache *znode_cache;
76826 +
76827 +int znode_shift_order;
76828 +
76829 +/**
76830 + * init_znodes - create znode cache
76831 + *
76832 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
76833 + */
76834 +int init_znodes(void)
76835 +{
76836 + znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
76837 + SLAB_HWCACHE_ALIGN |
76838 + SLAB_RECLAIM_ACCOUNT, NULL);
76839 + if (znode_cache == NULL)
76840 + return RETERR(-ENOMEM);
76841 +
76842 + for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
76843 + ++znode_shift_order);
76844 + --znode_shift_order;
76845 + return 0;
76846 +}
76847 +
76848 +/**
76849 + * done_znodes - delete znode cache
76850 + *
76851 + * This is called on reiser4 module unloading or system shutdown.
76852 + */
76853 +void done_znodes(void)
76854 +{
76855 + destroy_reiser4_cache(&znode_cache);
76856 +}
76857 +
76858 +/* call this to initialise tree of znodes */
76859 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
76860 +{
76861 + int result;
76862 + assert("umka-050", tree != NULL);
76863 +
76864 + rwlock_init(&tree->dk_lock);
76865 +
76866 + result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76867 + if (result != 0)
76868 + return result;
76869 + result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76870 + return result;
76871 +}
76872 +
76873 +/* free this znode */
76874 +void zfree(znode * node /* znode to free */ )
76875 +{
76876 + assert("nikita-465", node != NULL);
76877 + assert("nikita-2120", znode_page(node) == NULL);
76878 + assert("nikita-2301", list_empty_careful(&node->lock.owners));
76879 + assert("nikita-2302", list_empty_careful(&node->lock.requestors));
76880 + assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
76881 + NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
76882 + assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
76883 + assert("nikita-3293", !znode_is_right_connected(node));
76884 + assert("nikita-3294", !znode_is_left_connected(node));
76885 + assert("nikita-3295", node->left == NULL);
76886 + assert("nikita-3296", node->right == NULL);
76887 +
76888 + /* not yet phash_jnode_destroy(ZJNODE(node)); */
76889 +
76890 + kmem_cache_free(znode_cache, node);
76891 +}
76892 +
76893 +/* call this to free tree of znodes */
76894 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
76895 +{
76896 + znode *node;
76897 + znode *next;
76898 + z_hash_table *ztable;
76899 +
76900 + /* scan znode hash-tables and kill all znodes, then free hash tables
76901 + * themselves. */
76902 +
76903 + assert("nikita-795", tree != NULL);
76904 +
76905 + ztable = &tree->zhash_table;
76906 +
76907 + if (ztable->_table != NULL) {
76908 + for_all_in_htable(ztable, z, node, next) {
76909 + node->c_count = 0;
76910 + node->in_parent.node = NULL;
76911 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76912 + zdrop(node);
76913 + }
76914 +
76915 + z_hash_done(&tree->zhash_table);
76916 + }
76917 +
76918 + ztable = &tree->zfake_table;
76919 +
76920 + if (ztable->_table != NULL) {
76921 + for_all_in_htable(ztable, z, node, next) {
76922 + node->c_count = 0;
76923 + node->in_parent.node = NULL;
76924 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76925 + zdrop(node);
76926 + }
76927 +
76928 + z_hash_done(&tree->zfake_table);
76929 + }
76930 +}
76931 +
76932 +/* ZNODE STRUCTURES */
76933 +
76934 +/* allocate fresh znode */
76935 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
76936 +{
76937 + znode *node;
76938 +
76939 + node = kmem_cache_alloc(znode_cache, gfp_flag);
76940 + return node;
76941 +}
76942 +
76943 +/* Initialize fields of znode
76944 + @node: znode to initialize;
76945 + @parent: parent znode;
76946 + @tree: tree we are in. */
76947 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
76948 +{
76949 + assert("nikita-466", node != NULL);
76950 + assert("umka-268", current_tree != NULL);
76951 +
76952 + memset(node, 0, sizeof *node);
76953 +
76954 + assert("umka-051", tree != NULL);
76955 +
76956 + jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
76957 + reiser4_init_lock(&node->lock);
76958 + init_parent_coord(&node->in_parent, parent);
76959 +}
76960 +
76961 +/*
76962 + * remove znode from indices. This is called jput() when last reference on
76963 + * znode is released.
76964 + */
76965 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
76966 +{
76967 + assert("nikita-2108", node != NULL);
76968 + assert("nikita-470", node->c_count == 0);
76969 + assert_rw_write_locked(&(tree->tree_lock));
76970 +
76971 + /* remove reference to this znode from cbk cache */
76972 + cbk_cache_invalidate(node, tree);
76973 +
76974 + /* update c_count of parent */
76975 + if (znode_parent(node) != NULL) {
76976 + assert("nikita-472", znode_parent(node)->c_count > 0);
76977 + /* father, onto your hands I forward my spirit... */
76978 + znode_parent(node)->c_count--;
76979 + node->in_parent.node = NULL;
76980 + } else {
76981 + /* orphaned znode?! Root? */
76982 + }
76983 +
76984 + /* remove znode from hash-table */
76985 + z_hash_remove_rcu(znode_get_htable(node), node);
76986 +}
76987 +
76988 +/* zdrop() -- Remove znode from the tree.
76989 +
76990 + This is called when znode is removed from the memory. */
76991 +static void zdrop(znode * node /* znode to finish with */ )
76992 +{
76993 + jdrop(ZJNODE(node));
76994 +}
76995 +
76996 +/*
76997 + * put znode into right place in the hash table. This is called by relocate
76998 + * code.
76999 + */
77000 +int znode_rehash(znode * node /* node to rehash */ ,
77001 + const reiser4_block_nr * new_block_nr /* new block number */ )
77002 +{
77003 + z_hash_table *oldtable;
77004 + z_hash_table *newtable;
77005 + reiser4_tree *tree;
77006 +
77007 + assert("nikita-2018", node != NULL);
77008 +
77009 + tree = znode_get_tree(node);
77010 + oldtable = znode_get_htable(node);
77011 + newtable = get_htable(tree, new_block_nr);
77012 +
77013 + write_lock_tree(tree);
77014 + /* remove znode from hash-table */
77015 + z_hash_remove_rcu(oldtable, node);
77016 +
77017 + /* assertion no longer valid due to RCU */
77018 + /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
77019 +
77020 + /* update blocknr */
77021 + znode_set_block(node, new_block_nr);
77022 + node->zjnode.key.z = *new_block_nr;
77023 +
77024 + /* insert it into hash */
77025 + z_hash_insert_rcu(newtable, node);
77026 + write_unlock_tree(tree);
77027 + return 0;
77028 +}
77029 +
77030 +/* ZNODE LOOKUP, GET, PUT */
77031 +
77032 +/* zlook() - get znode with given block_nr in a hash table or return NULL
77033 +
77034 + If result is non-NULL then the znode's x_count is incremented. Internal version
77035 + accepts pre-computed hash index. The hash table is accessed under caller's
77036 + tree->hash_lock.
77037 +*/
77038 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
77039 +{
77040 + znode *result;
77041 + __u32 hash;
77042 + z_hash_table *htable;
77043 +
77044 + assert("jmacd-506", tree != NULL);
77045 + assert("jmacd-507", blocknr != NULL);
77046 +
77047 + htable = get_htable(tree, blocknr);
77048 + hash = blknrhashfn(htable, blocknr);
77049 +
77050 + rcu_read_lock();
77051 + result = z_hash_find_index(htable, hash, blocknr);
77052 +
77053 + if (result != NULL) {
77054 + add_x_ref(ZJNODE(result));
77055 + result = znode_rip_check(tree, result);
77056 + }
77057 + rcu_read_unlock();
77058 +
77059 + return result;
77060 +}
77061 +
77062 +/* return hash table where znode with block @blocknr is (or should be)
77063 + * stored */
77064 +static z_hash_table *get_htable(reiser4_tree * tree,
77065 + const reiser4_block_nr * const blocknr)
77066 +{
77067 + z_hash_table *table;
77068 + if (is_disk_addr_unallocated(blocknr))
77069 + table = &tree->zfake_table;
77070 + else
77071 + table = &tree->zhash_table;
77072 + return table;
77073 +}
77074 +
77075 +/* return hash table where znode @node is (or should be) stored */
77076 +static z_hash_table *znode_get_htable(const znode * node)
77077 +{
77078 + return get_htable(znode_get_tree(node), znode_get_block(node));
77079 +}
77080 +
77081 +/* zget() - get znode from hash table, allocating it if necessary.
77082 +
77083 + First a call to zlook, locating a x-referenced znode if one
77084 + exists. If znode is not found, allocate new one and return. Result
77085 + is returned with x_count reference increased.
77086 +
77087 + LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
77088 + LOCK ORDERING: NONE
77089 +*/
77090 +znode *zget(reiser4_tree * tree,
77091 + const reiser4_block_nr * const blocknr,
77092 + znode * parent, tree_level level, gfp_t gfp_flag)
77093 +{
77094 + znode *result;
77095 + __u32 hashi;
77096 +
77097 + z_hash_table *zth;
77098 +
77099 + assert("jmacd-512", tree != NULL);
77100 + assert("jmacd-513", blocknr != NULL);
77101 + assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
77102 +
77103 + zth = get_htable(tree, blocknr);
77104 + hashi = blknrhashfn(zth, blocknr);
77105 +
77106 + /* NOTE-NIKITA address-as-unallocated-blocknr still is not
77107 + implemented. */
77108 +
77109 + z_hash_prefetch_bucket(zth, hashi);
77110 +
77111 + rcu_read_lock();
77112 + /* Find a matching BLOCKNR in the hash table. If the znode is found,
77113 + we obtain an reference (x_count) but the znode remains unlocked.
77114 + Have to worry about race conditions later. */
77115 + result = z_hash_find_index(zth, hashi, blocknr);
77116 + /* According to the current design, the hash table lock protects new
77117 + znode references. */
77118 + if (result != NULL) {
77119 + add_x_ref(ZJNODE(result));
77120 + /* NOTE-NIKITA it should be so, but special case during
77121 + creation of new root makes such assertion highly
77122 + complicated. */
77123 + assert("nikita-2131", 1 || znode_parent(result) == parent ||
77124 + (ZF_ISSET(result, JNODE_ORPHAN)
77125 + && (znode_parent(result) == NULL)));
77126 + result = znode_rip_check(tree, result);
77127 + }
77128 +
77129 + rcu_read_unlock();
77130 +
77131 + if (!result) {
77132 + znode *shadow;
77133 +
77134 + result = zalloc(gfp_flag);
77135 + if (!result) {
77136 + return ERR_PTR(RETERR(-ENOMEM));
77137 + }
77138 +
77139 + zinit(result, parent, tree);
77140 + ZJNODE(result)->blocknr = *blocknr;
77141 + ZJNODE(result)->key.z = *blocknr;
77142 + result->level = level;
77143 +
77144 + write_lock_tree(tree);
77145 +
77146 + shadow = z_hash_find_index(zth, hashi, blocknr);
77147 + if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
77148 + jnode_list_remove(ZJNODE(result));
77149 + zfree(result);
77150 + result = shadow;
77151 + } else {
77152 + result->version = znode_build_version(tree);
77153 + z_hash_insert_index_rcu(zth, hashi, result);
77154 +
77155 + if (parent != NULL)
77156 + ++parent->c_count;
77157 + }
77158 +
77159 + add_x_ref(ZJNODE(result));
77160 +
77161 + write_unlock_tree(tree);
77162 + }
77163 +#if REISER4_DEBUG
77164 + if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
77165 + reiser4_check_block(blocknr, 1);
77166 +#endif
77167 + /* Check for invalid tree level, return -EIO */
77168 + if (unlikely(znode_get_level(result) != level)) {
77169 + warning("jmacd-504",
77170 + "Wrong level for cached block %llu: %i expecting %i",
77171 + (unsigned long long)(*blocknr), znode_get_level(result),
77172 + level);
77173 + zput(result);
77174 + return ERR_PTR(RETERR(-EIO));
77175 + }
77176 +
77177 + assert("nikita-1227", znode_invariant(result));
77178 +
77179 + return result;
77180 +}
77181 +
77182 +/* ZNODE PLUGINS/DATA */
77183 +
77184 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
77185 + stored at the fixed offset from the beginning of the node. */
77186 +static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
77187 + * plugin of */ )
77188 +{
77189 + reiser4_tree *tree;
77190 +
77191 + assert("nikita-1053", node != NULL);
77192 + assert("nikita-1055", zdata(node) != NULL);
77193 +
77194 + tree = znode_get_tree(node);
77195 + assert("umka-053", tree != NULL);
77196 +
77197 + if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
77198 + return tree->nplug;
77199 + } else {
77200 + return node_plugin_by_disk_id
77201 + (tree, &((common_node_header *) zdata(node))->plugin_id);
77202 +#ifdef GUESS_EXISTS
77203 + reiser4_plugin *plugin;
77204 +
77205 + /* NOTE-NIKITA add locking here when dynamic plugins will be
77206 + * implemented */
77207 + for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
77208 + if ((plugin->u.node.guess != NULL)
77209 + && plugin->u.node.guess(node))
77210 + return plugin;
77211 + }
77212 + warning("nikita-1057", "Cannot guess node plugin");
77213 + print_znode("node", node);
77214 + return NULL;
77215 +#endif
77216 + }
77217 +}
77218 +
77219 +/* parse node header and install ->node_plugin */
77220 +int zparse(znode * node /* znode to parse */ )
77221 +{
77222 + int result;
77223 +
77224 + assert("nikita-1233", node != NULL);
77225 + assert("nikita-2370", zdata(node) != NULL);
77226 +
77227 + if (node->nplug == NULL) {
77228 + node_plugin *nplug;
77229 +
77230 + nplug = znode_guess_plugin(node);
77231 + if (likely(nplug != NULL)) {
77232 + result = nplug->parse(node);
77233 + if (likely(result == 0))
77234 + node->nplug = nplug;
77235 + } else {
77236 + result = RETERR(-EIO);
77237 + }
77238 + } else
77239 + result = 0;
77240 + return result;
77241 +}
77242 +
77243 +/* zload with readahead */
77244 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
77245 +{
77246 + int result;
77247 +
77248 + assert("nikita-484", node != NULL);
77249 + assert("nikita-1377", znode_invariant(node));
77250 + assert("jmacd-7771", !znode_above_root(node));
77251 + assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
77252 + assert("nikita-3016", reiser4_schedulable());
77253 +
77254 + if (info)
77255 + formatted_readahead(node, info);
77256 +
77257 + result = jload(ZJNODE(node));
77258 + assert("nikita-1378", znode_invariant(node));
77259 + return result;
77260 +}
77261 +
77262 +/* load content of node into memory */
77263 +int zload(znode * node)
77264 +{
77265 + return zload_ra(node, NULL);
77266 +}
77267 +
77268 +/* call node plugin to initialise newly allocated node. */
77269 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
77270 +{
77271 + return jinit_new(ZJNODE(node), gfp_flags);
77272 +}
77273 +
77274 +/* drop reference to node data. When last reference is dropped, data are
77275 + unloaded. */
77276 +void zrelse(znode * node /* znode to release references to */ )
77277 +{
77278 + assert("nikita-1381", znode_invariant(node));
77279 +
77280 + jrelse(ZJNODE(node));
77281 +}
77282 +
77283 +/* returns free space in node */
77284 +unsigned znode_free_space(znode * node /* znode to query */ )
77285 +{
77286 + assert("nikita-852", node != NULL);
77287 + return node_plugin_by_node(node)->free_space(node);
77288 +}
77289 +
77290 +/* left delimiting key of znode */
77291 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
77292 +{
77293 + assert("nikita-958", node != NULL);
77294 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77295 + assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
77296 + assert("nikita-30671", node->rd_key_version != 0);
77297 + return &node->rd_key;
77298 +}
77299 +
77300 +/* right delimiting key of znode */
77301 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
77302 +{
77303 + assert("nikita-974", node != NULL);
77304 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77305 + assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
77306 + assert("nikita-30681", node->ld_key_version != 0);
77307 + return &node->ld_key;
77308 +}
77309 +
77310 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
77311 + )
77312 +
77313 +/* update right-delimiting key of @node */
77314 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
77315 +{
77316 + assert("nikita-2937", node != NULL);
77317 + assert("nikita-2939", key != NULL);
77318 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77319 + assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
77320 + assert("nikita-2944",
77321 + znode_is_any_locked(node) ||
77322 + znode_get_level(node) != LEAF_LEVEL ||
77323 + keyge(key, &node->rd_key) ||
77324 + keyeq(&node->rd_key, reiser4_min_key()) ||
77325 + ZF_ISSET(node, JNODE_HEARD_BANSHEE));
77326 +
77327 + node->rd_key = *key;
77328 + ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
77329 + return &node->rd_key;
77330 +}
77331 +
77332 +/* update left-delimiting key of @node */
77333 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
77334 +{
77335 + assert("nikita-2940", node != NULL);
77336 + assert("nikita-2941", key != NULL);
77337 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77338 + assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
77339 + assert("nikita-2943",
77340 + znode_is_any_locked(node) || keyeq(&node->ld_key,
77341 + reiser4_min_key()));
77342 +
77343 + node->ld_key = *key;
77344 + ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
77345 + return &node->ld_key;
77346 +}
77347 +
77348 +/* true if @key is inside key range for @node */
77349 +int znode_contains_key(znode * node /* znode to look in */ ,
77350 + const reiser4_key * key /* key to look for */ )
77351 +{
77352 + assert("nikita-1237", node != NULL);
77353 + assert("nikita-1238", key != NULL);
77354 +
77355 + /* left_delimiting_key <= key <= right_delimiting_key */
77356 + return keyle(znode_get_ld_key(node), key)
77357 + && keyle(key, znode_get_rd_key(node));
77358 +}
77359 +
77360 +/* same as znode_contains_key(), but lock dk lock */
77361 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
77362 + const reiser4_key * key /* key to look for */ )
77363 +{
77364 + int result;
77365 +
77366 + assert("umka-056", node != NULL);
77367 + assert("umka-057", key != NULL);
77368 +
77369 + read_lock_dk(znode_get_tree(node));
77370 + result = znode_contains_key(node, key);
77371 + read_unlock_dk(znode_get_tree(node));
77372 + return result;
77373 +}
77374 +
77375 +/* get parent pointer, assuming tree is not locked */
77376 +znode *znode_parent_nolock(const znode * node /* child znode */ )
77377 +{
77378 + assert("nikita-1444", node != NULL);
77379 + return node->in_parent.node;
77380 +}
77381 +
77382 +/* get parent pointer of znode */
77383 +znode *znode_parent(const znode * node /* child znode */ )
77384 +{
77385 + assert("nikita-1226", node != NULL);
77386 + assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
77387 + return znode_parent_nolock(node);
77388 +}
77389 +
77390 +/* detect uber znode used to protect in-superblock tree root pointer */
77391 +int znode_above_root(const znode * node /* znode to query */ )
77392 +{
77393 + assert("umka-059", node != NULL);
77394 +
77395 + return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
77396 +}
77397 +
77398 +/* check that @node is root---that its block number is recorder in the tree as
77399 + that of root node */
77400 +#if REISER4_DEBUG
77401 +static int znode_is_true_root(const znode * node /* znode to query */ )
77402 +{
77403 + assert("umka-060", node != NULL);
77404 + assert("umka-061", current_tree != NULL);
77405 +
77406 + return disk_addr_eq(znode_get_block(node),
77407 + &znode_get_tree(node)->root_block);
77408 +}
77409 +#endif
77410 +
77411 +/* check that @node is root */
77412 +int znode_is_root(const znode * node /* znode to query */ )
77413 +{
77414 + assert("nikita-1206", node != NULL);
77415 +
77416 + return znode_get_level(node) == znode_get_tree(node)->height;
77417 +}
77418 +
77419 +/* Returns true is @node was just created by zget() and wasn't ever loaded
77420 + into memory. */
77421 +/* NIKITA-HANS: yes */
77422 +int znode_just_created(const znode * node)
77423 +{
77424 + assert("nikita-2188", node != NULL);
77425 + return (znode_page(node) == NULL);
77426 +}
77427 +
77428 +/* obtain updated ->znode_epoch. See seal.c for description. */
77429 +__u64 znode_build_version(reiser4_tree * tree)
77430 +{
77431 + __u64 result;
77432 +
77433 + spin_lock(&tree->epoch_lock);
77434 + result = ++tree->znode_epoch;
77435 + spin_unlock(&tree->epoch_lock);
77436 + return result;
77437 +}
77438 +
77439 +void init_load_count(load_count * dh)
77440 +{
77441 + assert("nikita-2105", dh != NULL);
77442 + memset(dh, 0, sizeof *dh);
77443 +}
77444 +
77445 +void done_load_count(load_count * dh)
77446 +{
77447 + assert("nikita-2106", dh != NULL);
77448 + if (dh->node != NULL) {
77449 + for (; dh->d_ref > 0; --dh->d_ref)
77450 + zrelse(dh->node);
77451 + dh->node = NULL;
77452 + }
77453 +}
77454 +
77455 +static int incr_load_count(load_count * dh)
77456 +{
77457 + int result;
77458 +
77459 + assert("nikita-2110", dh != NULL);
77460 + assert("nikita-2111", dh->node != NULL);
77461 +
77462 + result = zload(dh->node);
77463 + if (result == 0)
77464 + ++dh->d_ref;
77465 + return result;
77466 +}
77467 +
77468 +int incr_load_count_znode(load_count * dh, znode * node)
77469 +{
77470 + assert("nikita-2107", dh != NULL);
77471 + assert("nikita-2158", node != NULL);
77472 + assert("nikita-2109",
77473 + ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
77474 +
77475 + dh->node = node;
77476 + return incr_load_count(dh);
77477 +}
77478 +
77479 +int incr_load_count_jnode(load_count * dh, jnode * node)
77480 +{
77481 + if (jnode_is_znode(node)) {
77482 + return incr_load_count_znode(dh, JZNODE(node));
77483 + }
77484 + return 0;
77485 +}
77486 +
77487 +void copy_load_count(load_count * new, load_count * old)
77488 +{
77489 + int ret = 0;
77490 + done_load_count(new);
77491 + new->node = old->node;
77492 + new->d_ref = 0;
77493 +
77494 + while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
77495 + }
77496 +
77497 + assert("jmacd-87589", ret == 0);
77498 +}
77499 +
77500 +void move_load_count(load_count * new, load_count * old)
77501 +{
77502 + done_load_count(new);
77503 + new->node = old->node;
77504 + new->d_ref = old->d_ref;
77505 + old->node = NULL;
77506 + old->d_ref = 0;
77507 +}
77508 +
77509 +/* convert parent pointer into coord */
77510 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
77511 +{
77512 + assert("nikita-3204", pcoord != NULL);
77513 + assert("nikita-3205", coord != NULL);
77514 +
77515 + coord_init_first_unit_nocheck(coord, pcoord->node);
77516 + coord_set_item_pos(coord, pcoord->item_pos);
77517 + coord->between = AT_UNIT;
77518 +}
77519 +
77520 +/* pack coord into parent_coord_t */
77521 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
77522 +{
77523 + assert("nikita-3206", pcoord != NULL);
77524 + assert("nikita-3207", coord != NULL);
77525 +
77526 + pcoord->node = coord->node;
77527 + pcoord->item_pos = coord->item_pos;
77528 +}
77529 +
77530 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
77531 + look for comments there) */
77532 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
77533 +{
77534 + pcoord->node = (znode *) node;
77535 + pcoord->item_pos = (unsigned short)~0;
77536 +}
77537 +
77538 +#if REISER4_DEBUG
77539 +
77540 +/* debugging aid: znode invariant */
77541 +static int znode_invariant_f(const znode * node /* znode to check */ ,
77542 + char const **msg /* where to store error
77543 + * message, if any */ )
77544 +{
77545 +#define _ergo(ant, con) \
77546 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
77547 +
77548 +#define _equi(e1, e2) \
77549 + ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
77550 +
77551 +#define _check(exp) ((*msg) = #exp, (exp))
77552 +
77553 + return jnode_invariant_f(ZJNODE(node), msg) &&
77554 + /* [znode-fake] invariant */
77555 + /* fake znode doesn't have a parent, and */
77556 + _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
77557 + /* there is another way to express this very check, and */
77558 + _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
77559 + /* it has special block number, and */
77560 + _ergo(znode_get_level(node) == 0,
77561 + disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77562 + /* it is the only znode with such block number, and */
77563 + _ergo(!znode_above_root(node) && znode_is_loaded(node),
77564 + !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77565 + /* it is parent of the tree root node */
77566 + _ergo(znode_is_true_root(node),
77567 + znode_above_root(znode_parent(node))) &&
77568 + /* [znode-level] invariant */
77569 + /* level of parent znode is one larger than that of child,
77570 + except for the fake znode, and */
77571 + _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
77572 + znode_get_level(znode_parent(node)) ==
77573 + znode_get_level(node) + 1) &&
77574 + /* left neighbor is at the same level, and */
77575 + _ergo(znode_is_left_connected(node) && node->left != NULL,
77576 + znode_get_level(node) == znode_get_level(node->left)) &&
77577 + /* right neighbor is at the same level */
77578 + _ergo(znode_is_right_connected(node) && node->right != NULL,
77579 + znode_get_level(node) == znode_get_level(node->right)) &&
77580 + /* [znode-connected] invariant */
77581 + _ergo(node->left != NULL, znode_is_left_connected(node)) &&
77582 + _ergo(node->right != NULL, znode_is_right_connected(node)) &&
77583 + _ergo(!znode_is_root(node) && node->left != NULL,
77584 + znode_is_right_connected(node->left) &&
77585 + node->left->right == node) &&
77586 + _ergo(!znode_is_root(node) && node->right != NULL,
77587 + znode_is_left_connected(node->right) &&
77588 + node->right->left == node) &&
77589 + /* [znode-c_count] invariant */
77590 + /* for any znode, c_count of its parent is greater than 0 */
77591 + _ergo(znode_parent(node) != NULL &&
77592 + !znode_above_root(znode_parent(node)),
77593 + znode_parent(node)->c_count > 0) &&
77594 + /* leaves don't have children */
77595 + _ergo(znode_get_level(node) == LEAF_LEVEL,
77596 + node->c_count == 0) &&
77597 + _check(node->zjnode.jnodes.prev != NULL) &&
77598 + _check(node->zjnode.jnodes.next != NULL) &&
77599 + /* orphan doesn't have a parent */
77600 + _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
77601 + /* [znode-modify] invariant */
77602 + /* if znode is not write-locked, its checksum remains
77603 + * invariant */
77604 + /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
77605 + * cannot check this. */
77606 + /* [znode-refs] invariant */
77607 + /* only referenced znode can be long-term locked */
77608 + _ergo(znode_is_locked(node),
77609 + atomic_read(&ZJNODE(node)->x_count) != 0);
77610 +}
77611 +
77612 +/* debugging aid: check znode invariant and panic if it doesn't hold */
77613 +int znode_invariant(znode * node /* znode to check */ )
77614 +{
77615 + char const *failed_msg;
77616 + int result;
77617 +
77618 + assert("umka-063", node != NULL);
77619 + assert("umka-064", current_tree != NULL);
77620 +
77621 + spin_lock_znode(node);
77622 + read_lock_tree(znode_get_tree(node));
77623 + result = znode_invariant_f(node, &failed_msg);
77624 + if (!result) {
77625 + /* print_znode("corrupted node", node); */
77626 + warning("jmacd-555", "Condition %s failed", failed_msg);
77627 + }
77628 + read_unlock_tree(znode_get_tree(node));
77629 + spin_unlock_znode(node);
77630 + return result;
77631 +}
77632 +
77633 +/* return non-0 iff data are loaded into znode */
77634 +int znode_is_loaded(const znode * node /* znode to query */ )
77635 +{
77636 + assert("nikita-497", node != NULL);
77637 + return jnode_is_loaded(ZJNODE(node));
77638 +}
77639 +
77640 +unsigned long znode_times_locked(const znode * z)
77641 +{
77642 + return z->times_locked;
77643 +}
77644 +
77645 +#endif /* REISER4_DEBUG */
77646 +
77647 +/* Make Linus happy.
77648 + Local variables:
77649 + c-indentation-style: "K&R"
77650 + mode-name: "LC"
77651 + c-basic-offset: 8
77652 + tab-width: 8
77653 + fill-column: 120
77654 + End:
77655 +*/
77656 diff -urN linux-2.6.23.orig/fs/reiser4/znode.h linux-2.6.23/fs/reiser4/znode.h
77657 --- linux-2.6.23.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
77658 +++ linux-2.6.23/fs/reiser4/znode.h 2007-12-04 16:49:30.000000000 +0300
77659 @@ -0,0 +1,434 @@
77660 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
77661 + * reiser4/README */
77662 +
77663 +/* Declaration of znode (Zam's node). See znode.c for more details. */
77664 +
77665 +#ifndef __ZNODE_H__
77666 +#define __ZNODE_H__
77667 +
77668 +#include "forward.h"
77669 +#include "debug.h"
77670 +#include "dformat.h"
77671 +#include "key.h"
77672 +#include "coord.h"
77673 +#include "plugin/node/node.h"
77674 +#include "jnode.h"
77675 +#include "lock.h"
77676 +#include "readahead.h"
77677 +
77678 +#include <linux/types.h>
77679 +#include <linux/spinlock.h>
77680 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
77681 +#include <asm/atomic.h>
77682 +#include <asm/semaphore.h>
77683 +
77684 +/* znode tracks its position within parent (internal item in a parent node,
77685 + * that contains znode's block number). */
77686 +typedef struct parent_coord {
77687 + znode *node;
77688 + pos_in_node_t item_pos;
77689 +} parent_coord_t;
77690 +
77691 +/* &znode - node in a reiser4 tree.
77692 +
77693 + NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
77694 + cacheline pressure.
77695 +
77696 + Locking:
77697 +
77698 + Long term: data in a disk node attached to this znode are protected
77699 + by long term, deadlock aware lock ->lock;
77700 +
77701 + Spin lock: the following fields are protected by the spin lock:
77702 +
77703 + ->lock
77704 +
77705 + Following fields are protected by the global tree lock:
77706 +
77707 + ->left
77708 + ->right
77709 + ->in_parent
77710 + ->c_count
77711 +
77712 + Following fields are protected by the global delimiting key lock (dk_lock):
77713 +
77714 + ->ld_key (to update ->ld_key long-term lock on the node is also required)
77715 + ->rd_key
77716 +
77717 + Following fields are protected by the long term lock:
77718 +
77719 + ->nr_items
77720 +
77721 + ->node_plugin is never changed once set. This means that after code made
77722 + itself sure that field is valid it can be accessed without any additional
77723 + locking.
77724 +
77725 + ->level is immutable.
77726 +
77727 + Invariants involving this data-type:
77728 +
77729 + [znode-fake]
77730 + [znode-level]
77731 + [znode-connected]
77732 + [znode-c_count]
77733 + [znode-refs]
77734 + [jnode-refs]
77735 + [jnode-queued]
77736 + [znode-modify]
77737 +
77738 + For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
77739 + Suggestions for how to do that are desired.*/
77740 +struct znode {
77741 + /* Embedded jnode. */
77742 + jnode zjnode;
77743 +
77744 + /* contains three subfields, node, pos_in_node, and pos_in_unit.
77745 +
77746 + pos_in_node and pos_in_unit are only hints that are cached to
77747 + speed up lookups during balancing. They are not required to be up to
77748 + date. Synched in find_child_ptr().
77749 +
77750 + This value allows us to avoid expensive binary searches.
77751 +
77752 + in_parent->node points to the parent of this node, and is NOT a
77753 + hint.
77754 + */
77755 + parent_coord_t in_parent;
77756 +
77757 + /*
77758 + * sibling list pointers
77759 + */
77760 +
77761 + /* left-neighbor */
77762 + znode *left;
77763 + /* right-neighbor */
77764 + znode *right;
77765 +
77766 + /* long term lock on node content. This lock supports deadlock
77767 + detection. See lock.c
77768 + */
77769 + zlock lock;
77770 +
77771 + /* You cannot remove from memory a node that has children in
77772 + memory. This is because we rely on the fact that parent of given
77773 + node can always be reached without blocking for io. When reading a
77774 + node into memory you must increase the c_count of its parent, when
77775 + removing it from memory you must decrease the c_count. This makes
77776 + the code simpler, and the cases where it is suboptimal are truly
77777 + obscure.
77778 + */
77779 + int c_count;
77780 +
77781 + /* plugin of node attached to this znode. NULL if znode is not
77782 + loaded. */
77783 + node_plugin *nplug;
77784 +
77785 + /* version of znode data. This is increased on each modification. This
77786 + * is necessary to implement seals (see seal.[ch]) efficiently. */
77787 + __u64 version;
77788 +
77789 + /* left delimiting key. Necessary to efficiently perform
77790 + balancing with node-level locking. Kept in memory only. */
77791 + reiser4_key ld_key;
77792 + /* right delimiting key. */
77793 + reiser4_key rd_key;
77794 +
77795 + /* znode's tree level */
77796 + __u16 level;
77797 + /* number of items in this node. This field is modified by node
77798 + * plugin. */
77799 + __u16 nr_items;
77800 +
77801 +#if REISER4_DEBUG
77802 + void *creator;
77803 + reiser4_key first_key;
77804 + unsigned long times_locked;
77805 + int left_version; /* when node->left was updated */
77806 + int right_version; /* when node->right was updated */
77807 + int ld_key_version; /* when node->ld_key was updated */
77808 + int rd_key_version; /* when node->rd_key was updated */
77809 +#endif
77810 +
77811 +} __attribute__ ((aligned(16)));
77812 +
77813 +ON_DEBUG(extern atomic_t delim_key_version;
77814 + )
77815 +
77816 +/* In general I think these macros should not be exposed. */
77817 +#define znode_is_locked(node) (lock_is_locked(&node->lock))
77818 +#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
77819 +#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
77820 +#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
77821 +#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
77822 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
77823 +/* Macros for accessing the znode state. */
77824 +#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
77825 +#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
77826 +#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
77827 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
77828 + znode * parent, tree_level level, gfp_t gfp_flag);
77829 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
77830 +extern int zload(znode * node);
77831 +extern int zload_ra(znode * node, ra_info_t * info);
77832 +extern int zinit_new(znode * node, gfp_t gfp_flags);
77833 +extern void zrelse(znode * node);
77834 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
77835 +
77836 +/* size of data in znode */
77837 +static inline unsigned
77838 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
77839 +{
77840 + assert("nikita-1416", node != NULL);
77841 + return PAGE_CACHE_SIZE;
77842 +}
77843 +
77844 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
77845 + coord_t * coord);
77846 +extern void coord_to_parent_coord(const coord_t * coord,
77847 + parent_coord_t * pcoord);
77848 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
77849 +
77850 +extern unsigned znode_free_space(znode * node);
77851 +
77852 +extern reiser4_key *znode_get_rd_key(znode * node);
77853 +extern reiser4_key *znode_get_ld_key(znode * node);
77854 +
77855 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
77856 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
77857 +
77858 +/* `connected' state checks */
77859 +static inline int znode_is_right_connected(const znode * node)
77860 +{
77861 + return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
77862 +}
77863 +
77864 +static inline int znode_is_left_connected(const znode * node)
77865 +{
77866 + return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
77867 +}
77868 +
77869 +static inline int znode_is_connected(const znode * node)
77870 +{
77871 + return znode_is_right_connected(node) && znode_is_left_connected(node);
77872 +}
77873 +
77874 +extern int znode_shift_order;
77875 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
77876 +extern void znode_remove(znode *, reiser4_tree *);
77877 +extern znode *znode_parent(const znode * node);
77878 +extern znode *znode_parent_nolock(const znode * node);
77879 +extern int znode_above_root(const znode * node);
77880 +extern int init_znodes(void);
77881 +extern void done_znodes(void);
77882 +extern int znodes_tree_init(reiser4_tree * ztree);
77883 +extern void znodes_tree_done(reiser4_tree * ztree);
77884 +extern int znode_contains_key(znode * node, const reiser4_key * key);
77885 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
77886 +extern unsigned znode_save_free_space(znode * node);
77887 +extern unsigned znode_recover_free_space(znode * node);
77888 +extern znode *zalloc(gfp_t gfp_flag);
77889 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
77890 +extern int zparse(znode * node);
77891 +
77892 +extern int znode_just_created(const znode * node);
77893 +
77894 +extern void zfree(znode * node);
77895 +
77896 +#if REISER4_DEBUG
77897 +extern void print_znode(const char *prefix, const znode * node);
77898 +#else
77899 +#define print_znode( p, n ) noop
77900 +#endif
77901 +
77902 +/* Make it look like various znode functions exist instead of treating znodes as
77903 + jnodes in znode-specific code. */
77904 +#define znode_page(x) jnode_page ( ZJNODE(x) )
77905 +#define zdata(x) jdata ( ZJNODE(x) )
77906 +#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
77907 +#define znode_created(x) jnode_created ( ZJNODE(x) )
77908 +#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
77909 +#define znode_convertible(x) jnode_convertible (ZJNODE(x))
77910 +#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
77911 +
77912 +#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
77913 +#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
77914 +#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
77915 +#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
77916 +
77917 +#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
77918 +#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
77919 +#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
77920 +#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
77921 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
77922 +
77923 +#if REISER4_DEBUG
77924 +extern int znode_x_count_is_protected(const znode * node);
77925 +extern int znode_invariant(znode * node);
77926 +#endif
77927 +
77928 +/* acquire reference to @node */
77929 +static inline znode *zref(znode * node)
77930 +{
77931 + /* change of x_count from 0 to 1 is protected by tree spin-lock */
77932 + return JZNODE(jref(ZJNODE(node)));
77933 +}
77934 +
77935 +/* release reference to @node */
77936 +static inline void zput(znode * node)
77937 +{
77938 + assert("nikita-3564", znode_invariant(node));
77939 + jput(ZJNODE(node));
77940 +}
77941 +
77942 +/* get the level field for a znode */
77943 +static inline tree_level znode_get_level(const znode * node)
77944 +{
77945 + return node->level;
77946 +}
77947 +
77948 +/* get the level field for a jnode */
77949 +static inline tree_level jnode_get_level(const jnode * node)
77950 +{
77951 + if (jnode_is_znode(node))
77952 + return znode_get_level(JZNODE(node));
77953 + else
77954 + /* unformatted nodes are all at the LEAF_LEVEL and for
77955 + "semi-formatted" nodes like bitmaps, level doesn't matter. */
77956 + return LEAF_LEVEL;
77957 +}
77958 +
77959 +/* true if jnode is on leaf level */
77960 +static inline int jnode_is_leaf(const jnode * node)
77961 +{
77962 + if (jnode_is_znode(node))
77963 + return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
77964 + if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
77965 + return 1;
77966 + return 0;
77967 +}
77968 +
77969 +/* return znode's tree */
77970 +static inline reiser4_tree *znode_get_tree(const znode * node)
77971 +{
77972 + assert("nikita-2692", node != NULL);
77973 + return jnode_get_tree(ZJNODE(node));
77974 +}
77975 +
77976 +/* resolve race with zput */
77977 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
77978 +{
77979 + jnode *j;
77980 +
77981 + j = jnode_rip_sync(tree, ZJNODE(node));
77982 + if (likely(j != NULL))
77983 + node = JZNODE(j);
77984 + else
77985 + node = NULL;
77986 + return node;
77987 +}
77988 +
77989 +#if defined(REISER4_DEBUG)
77990 +int znode_is_loaded(const znode * node /* znode to query */ );
77991 +#endif
77992 +
77993 +extern __u64 znode_build_version(reiser4_tree * tree);
77994 +
77995 +/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
77996 + must load the data for a node in many places. We could do this by simply calling
77997 + zload() everywhere, the difficulty arises when we must release the loaded data by
77998 + calling zrelse. In a function with many possible error/return paths, it requires extra
77999 + work to figure out which exit paths must call zrelse and those which do not. The data
78000 + handle automatically calls zrelse for every zload that it is responsible for. In that
78001 + sense, it acts much like a lock_handle.
78002 +*/
78003 +typedef struct load_count {
78004 + znode *node;
78005 + int d_ref;
78006 +} load_count;
78007 +
78008 +extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
78009 +extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
78010 +extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
78011 +extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
78012 + * incr_load_count_znode, otherwise do nothing (unformatted nodes
78013 + * don't require zload/zrelse treatment). */
78014 +extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
78015 +extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
78016 +
78017 +/* Variable initializers for load_count. */
78018 +#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 }
78019 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
78020 +/* A convenience macro for use in assertions or debug-only code, where loaded
78021 + data is only required to perform the debugging check. This macro
78022 + encapsulates an expression inside a pair of calls to zload()/zrelse(). */
78023 +#define WITH_DATA( node, exp ) \
78024 +({ \
78025 + long __with_dh_result; \
78026 + znode *__with_dh_node; \
78027 + \
78028 + __with_dh_node = ( node ); \
78029 + __with_dh_result = zload( __with_dh_node ); \
78030 + if( __with_dh_result == 0 ) { \
78031 + __with_dh_result = ( long )( exp ); \
78032 + zrelse( __with_dh_node ); \
78033 + } \
78034 + __with_dh_result; \
78035 +})
78036 +
78037 +/* Same as above, but accepts a return value in case zload fails. */
78038 +#define WITH_DATA_RET( node, ret, exp ) \
78039 +({ \
78040 + int __with_dh_result; \
78041 + znode *__with_dh_node; \
78042 + \
78043 + __with_dh_node = ( node ); \
78044 + __with_dh_result = zload( __with_dh_node ); \
78045 + if( __with_dh_result == 0 ) { \
78046 + __with_dh_result = ( int )( exp ); \
78047 + zrelse( __with_dh_node ); \
78048 + } else \
78049 + __with_dh_result = ( ret ); \
78050 + __with_dh_result; \
78051 +})
78052 +
78053 +#define WITH_COORD(coord, exp) \
78054 +({ \
78055 + coord_t *__coord; \
78056 + \
78057 + __coord = (coord); \
78058 + coord_clear_iplug(__coord); \
78059 + WITH_DATA(__coord->node, exp); \
78060 +})
78061 +
78062 +#if REISER4_DEBUG
78063 +#define STORE_COUNTERS \
78064 + reiser4_lock_cnt_info __entry_counters = \
78065 + *reiser4_lock_counters()
78066 +#define CHECK_COUNTERS \
78067 +ON_DEBUG_CONTEXT( \
78068 +({ \
78069 + __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
78070 + __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
78071 + __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
78072 + assert("nikita-2159", \
78073 + !memcmp(&__entry_counters, reiser4_lock_counters(), \
78074 + sizeof __entry_counters)); \
78075 +}) )
78076 +
78077 +#else
78078 +#define STORE_COUNTERS
78079 +#define CHECK_COUNTERS noop
78080 +#endif
78081 +
78082 +/* __ZNODE_H__ */
78083 +#endif
78084 +
78085 +/* Make Linus happy.
78086 + Local variables:
78087 + c-indentation-style: "K&R"
78088 + mode-name: "LC"
78089 + c-basic-offset: 8
78090 + tab-width: 8
78091 + fill-column: 120
78092 + End:
78093 +*/
78094 diff -urN linux-2.6.23.orig/include/linux/fs.h linux-2.6.23/include/linux/fs.h
78095 --- linux-2.6.23.orig/include/linux/fs.h 2007-10-10 00:31:38.000000000 +0400
78096 +++ linux-2.6.23/include/linux/fs.h 2007-12-04 20:02:08.277902069 +0300
78097 @@ -1198,6 +1198,8 @@
78098 void (*clear_inode) (struct inode *);
78099 void (*umount_begin) (struct vfsmount *, int);
78100
78101 + void (*sync_inodes) (struct super_block *sb,
78102 + struct writeback_control *wbc);
78103 int (*show_options)(struct seq_file *, struct vfsmount *);
78104 int (*show_stats)(struct seq_file *, struct vfsmount *);
78105 #ifdef CONFIG_QUOTA
78106 @@ -1539,6 +1541,7 @@
78107 extern int invalidate_inode_pages2_range(struct address_space *mapping,
78108 pgoff_t start, pgoff_t end);
78109 extern int write_inode_now(struct inode *, int);
78110 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
78111 extern int filemap_fdatawrite(struct address_space *);
78112 extern int filemap_flush(struct address_space *);
78113 extern int filemap_fdatawait(struct address_space *);
78114 diff -urN linux-2.6.23.orig/mm/filemap.c linux-2.6.23/mm/filemap.c
78115 --- linux-2.6.23.orig/mm/filemap.c 2007-10-10 00:31:38.000000000 +0400
78116 +++ linux-2.6.23/mm/filemap.c 2007-12-04 20:02:09.534225346 +0300
78117 @@ -122,6 +122,7 @@
78118 __dec_zone_page_state(page, NR_FILE_PAGES);
78119 BUG_ON(page_mapped(page));
78120 }
78121 +EXPORT_SYMBOL(__remove_from_page_cache);
78122
78123 void remove_from_page_cache(struct page *page)
78124 {
78125 @@ -133,6 +134,7 @@
78126 __remove_from_page_cache(page);
78127 write_unlock_irq(&mapping->tree_lock);
78128 }
78129 +EXPORT_SYMBOL(remove_from_page_cache);
78130
78131 static int sync_page(void *word)
78132 {
78133 @@ -720,6 +722,7 @@
78134 read_unlock_irq(&mapping->tree_lock);
78135 return ret;
78136 }
78137 +EXPORT_SYMBOL(add_to_page_cache_lru);
78138
78139 /**
78140 * find_get_pages_contig - gang contiguous pagecache lookup
78141 @@ -839,6 +842,7 @@
78142
78143 ra->ra_pages /= 4;
78144 }
78145 +EXPORT_SYMBOL(find_get_pages);
78146
78147 /**
78148 * do_generic_mapping_read - generic file read routine