src/patches/reiser4-for-2.6.20.patch
1diff -urN linux-2.6.20.orig/arch/i386/lib/usercopy.c linux-2.6.20/arch/i386/lib/usercopy.c
2--- linux-2.6.20.orig/arch/i386/lib/usercopy.c 2006-11-30 00:57:37.000000000 +0300
3+++ linux-2.6.20/arch/i386/lib/usercopy.c 2007-05-06 14:50:43.658963226 +0400
4@@ -812,6 +812,7 @@
5 #endif
6 return n;
7 }
8+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
9
10 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
11 unsigned long n)
12@@ -827,6 +828,7 @@
13 #endif
14 return n;
15 }
16+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
17
18 /**
19 * copy_to_user: - Copy a block of data into user space.
20diff -urN linux-2.6.20.orig/Documentation/Changes linux-2.6.20/Documentation/Changes
21--- linux-2.6.20.orig/Documentation/Changes 2007-05-06 15:04:34.226399593 +0400
22+++ linux-2.6.20/Documentation/Changes 2007-05-06 14:50:43.658963226 +0400
23@@ -36,6 +36,7 @@
24 o e2fsprogs 1.29 # tune2fs
25 o jfsutils 1.1.3 # fsck.jfs -V
26 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
27+o reiser4progs 1.0.0 # fsck.reiser4 -V
28 o xfsprogs 2.6.0 # xfs_db -V
29 o pcmciautils 004 # pccardctl -V
30 o quota-tools 3.09 # quota -V
31@@ -144,6 +145,13 @@
32 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
33 reiserfsck. These utils work on both i386 and alpha platforms.
34
35+Reiser4progs
36+------------
37+
38+The reiser4progs package contains utilities for the reiser4 file system.
39+Detailed instructions are provided in the README file located at:
40+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
41+
42 Xfsprogs
43 --------
44
45@@ -322,6 +330,10 @@
46 -------------
47 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
48
49+Reiser4progs
50+------------
51+o <ftp://ftp.namesys.com/pub/reiser4progs/>
52+
53 Xfsprogs
54 --------
55 o <ftp://oss.sgi.com/projects/xfs/download/>
56diff -urN linux-2.6.20.orig/Documentation/filesystems/reiser4.txt linux-2.6.20/Documentation/filesystems/reiser4.txt
57--- linux-2.6.20.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300
58+++ linux-2.6.20/Documentation/filesystems/reiser4.txt 2007-05-06 14:50:43.658963226 +0400
59@@ -0,0 +1,75 @@
60+Reiser4 filesystem
61+==================
62+Reiser4 is a file system based on dancing tree algorithms, and is
63+described at http://www.namesys.com
64+
65+
66+References
67+==========
68+web page http://namesys.com/v4/v4.html
69+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
70+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
71+install page http://www.namesys.com/install_v4.html
72+
73+Compile options
74+===============
75+Enable reiser4 debug mode
76+ This checks everything imaginable while reiser4
77+ runs
78+
79+Mount options
80+=============
81+tmgr.atom_max_size=N
82+ Atoms containing more than N blocks will be forced to commit.
83+ N is decimal.
84+ Default is nr_free_pagecache_pages() / 2 at mount time.
85+
86+tmgr.atom_max_age=N
87+ Atoms older than N seconds will be forced to commit. N is decimal.
88+ Default is 600.
89+
90+tmgr.atom_max_flushers=N
91+ Limit of concurrent flushers for one atom. 0 means no limit.
92+ Default is 0.
93+
94+tree.cbk_cache.nr_slots=N
95+ Number of slots in the cbk cache.
96+
97+flush.relocate_threshold=N
98+ If flush finds more than N adjacent dirty leaf-level blocks it
99+ will force them to be relocated.
100+ Default is 64.
101+
102+flush.relocate_distance=N
103+ If flush can find a block allocation within at most
104+ N blocks of the preceder, it will relocate to that position.
105+ Default is 64.
106+
107+flush.scan_maxnodes=N
108+ The maximum number of nodes to scan left on a level during
109+ flush.
110+ Default is 10000.
111+
112+optimal_io_size=N
113+ Preferred IO size. This value is used to set st_blksize of
114+ struct stat.
115+ Default is 65536.
116+
117+bsdgroups
118+ Turn on BSD-style gid assignment.
119+
120+32bittimes
121+ By default, files in reiser4 have 64-bit timestamps. Files
122+ created while the filesystem is mounted with the 32bittimes
123+ mount option will get 32-bit timestamps.
124+
125+mtflush
126+ Turn off concurrent flushing.
127+
128+nopseudo
129+ Disable pseudo files support. See
130+ http://namesys.com/v4/pseudo.html for more about pseudo files.
131+
132+dont_load_bitmap
133+ Don't load all bitmap blocks at mount time; this is useful
134+ for machines with tiny RAM and large disks.
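A hypothetical invocation combining several of the options above (the device, mount point and values are illustrative only, not recommendations):

  mount -t reiser4 -o tmgr.atom_max_age=300,flush.scan_maxnodes=5000,dont_load_bitmap /dev/sdb1 /mnt/reiser4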
135diff -urN linux-2.6.20.orig/fs/fs-writeback.c linux-2.6.20/fs/fs-writeback.c
136--- linux-2.6.20.orig/fs/fs-writeback.c 2007-05-06 15:04:39.848155607 +0400
137+++ linux-2.6.20/fs/fs-writeback.c 2007-05-06 14:50:43.662964476 +0400
138@@ -296,8 +296,6 @@
139 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
140 * that it can be located for waiting on in __writeback_single_inode().
141 *
142- * Called under inode_lock.
143- *
144 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
145 * This function assumes that the blockdev superblock's inodes are backed by
146 * a variety of queues, so all inodes are searched. For other superblocks,
147@@ -313,11 +311,13 @@
148 * on the writer throttling path, and we get decent balancing between many
149 * throttled threads: we don't want them all piling up on __wait_on_inode.
150 */
151-static void
152-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
153+void
154+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
155 {
156 const unsigned long start = jiffies; /* livelock avoidance */
157
158+ spin_lock(&inode_lock);
159+
160 if (!wbc->for_kupdate || list_empty(&sb->s_io))
161 list_splice_init(&sb->s_dirty, &sb->s_io);
162
163@@ -397,8 +397,19 @@
164 if (wbc->nr_to_write <= 0)
165 break;
166 }
167+ spin_unlock(&inode_lock);
168 return; /* Leave any unwritten inodes on s_io */
169 }
170+EXPORT_SYMBOL(generic_sync_sb_inodes);
171+
172+static void
173+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
174+{
175+ if (sb->s_op->sync_inodes)
176+ sb->s_op->sync_inodes(sb, wbc);
177+ else
178+ generic_sync_sb_inodes(sb, wbc);
179+}
180
181 /*
182 * Start writeback of dirty pagecache data against all unlocked inodes.
183@@ -439,11 +450,8 @@
184 * be unmounted by the time it is released.
185 */
186 if (down_read_trylock(&sb->s_umount)) {
187- if (sb->s_root) {
188- spin_lock(&inode_lock);
189+ if (sb->s_root)
190 sync_sb_inodes(sb, wbc);
191- spin_unlock(&inode_lock);
192- }
193 up_read(&sb->s_umount);
194 }
195 spin_lock(&sb_lock);
196@@ -481,9 +489,7 @@
197 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
198 nr_dirty + nr_unstable;
199 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
200- spin_lock(&inode_lock);
201 sync_sb_inodes(sb, &wbc);
202- spin_unlock(&inode_lock);
203 }
204
205 /*
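The fs-writeback.c hunks above move inode_lock handling inside the now-exported generic_sync_sb_inodes() and make sync_sb_inodes() dispatch through a ->sync_inodes super-operation when one is provided (reiser4 presumably supplies its own implementation elsewhere in this patch). A minimal sketch, assuming the ->sync_inodes member this patch adds to struct super_operations; the my_fs_* names are hypothetical:

  /* hypothetical filesystem wiring up the new ->sync_inodes hook */
  static void my_fs_sync_inodes(struct super_block *sb,
                                struct writeback_control *wbc)
  {
          /* filesystem-specific preparation could go here */
          generic_sync_sb_inodes(sb, wbc);        /* generic inode walk */
  }

  static struct super_operations my_fs_super_ops = {
          .sync_inodes = my_fs_sync_inodes,
          /* ... other operations ... */
  };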
206diff -urN linux-2.6.20.orig/fs/Kconfig linux-2.6.20/fs/Kconfig
207--- linux-2.6.20.orig/fs/Kconfig 2007-05-06 15:04:39.668099364 +0400
208+++ linux-2.6.20/fs/Kconfig 2007-05-06 14:50:43.662964476 +0400
209@@ -272,6 +272,8 @@
210 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
211 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
212
213+source "fs/reiser4/Kconfig"
214+
215 config REISERFS_FS
216 tristate "Reiserfs support"
217 help
218diff -urN linux-2.6.20.orig/fs/Makefile linux-2.6.20/fs/Makefile
219--- linux-2.6.20.orig/fs/Makefile 2007-05-06 15:04:39.668099364 +0400
220+++ linux-2.6.20/fs/Makefile 2007-05-06 14:50:43.666965726 +0400
221@@ -62,6 +62,7 @@
222
223 # Do not add any filesystems before this line
224 obj-$(CONFIG_REISERFS_FS) += reiserfs/
225+obj-$(CONFIG_REISER4_FS) += reiser4/
226 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
227 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
228 obj-$(CONFIG_JBD) += jbd/
229diff -urN linux-2.6.20.orig/fs/reiser4/as_ops.c linux-2.6.20/fs/reiser4/as_ops.c
230--- linux-2.6.20.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
231+++ linux-2.6.20/fs/reiser4/as_ops.c 2007-05-06 14:50:43.666965726 +0400
232@@ -0,0 +1,337 @@
233+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
234+
235+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
236+
237+#include "forward.h"
238+#include "debug.h"
239+#include "dformat.h"
240+#include "coord.h"
241+#include "plugin/item/item.h"
242+#include "plugin/file/file.h"
243+#include "plugin/security/perm.h"
244+#include "plugin/disk_format/disk_format.h"
245+#include "plugin/plugin.h"
246+#include "plugin/plugin_set.h"
247+#include "plugin/object.h"
248+#include "txnmgr.h"
249+#include "jnode.h"
250+#include "znode.h"
251+#include "block_alloc.h"
252+#include "tree.h"
253+#include "vfs_ops.h"
254+#include "inode.h"
255+#include "page_cache.h"
256+#include "ktxnmgrd.h"
257+#include "super.h"
258+#include "reiser4.h"
259+#include "entd.h"
260+
261+#include <linux/profile.h>
262+#include <linux/types.h>
263+#include <linux/mount.h>
264+#include <linux/vfs.h>
265+#include <linux/mm.h>
266+#include <linux/buffer_head.h>
267+#include <linux/dcache.h>
268+#include <linux/list.h>
269+#include <linux/pagemap.h>
270+#include <linux/slab.h>
271+#include <linux/seq_file.h>
272+#include <linux/init.h>
273+#include <linux/module.h>
274+#include <linux/writeback.h>
275+#include <linux/backing-dev.h>
276+#include <linux/quotaops.h>
277+#include <linux/security.h>
278+
279+/* address space operations */
280+
281+/**
282+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
283+ * @page: page to be dirtied
284+ *
285+ * Operation of struct address_space_operations. This implementation is used by
286+ * unix and cryptcompress file plugins.
287+ *
288+ * This is called when a reiser4 page gets dirtied outside of reiser4, for
289+ * example, when the dirty bit is moved from the pte to the physical page.
290+ *
291+ * Tags the page in the mapping's page tree with a special tag so that all
292+ * the reiser4-specific work on dirty pages (jnode creation, capturing by
293+ * an atom) can be done later, because it cannot be done in the contexts
294+ * where set_page_dirty is called.
295+ */
296+int reiser4_set_page_dirty(struct page *page)
297+{
298+ /* this page can be unformatted only */
299+ assert("vs-1734", (page->mapping &&
300+ page->mapping->host &&
301+ reiser4_get_super_fake(page->mapping->host->i_sb) !=
302+ page->mapping->host
303+ && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
304+ page->mapping->host
305+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
306+ page->mapping->host));
307+
308+ if (!TestSetPageDirty(page)) {
309+ struct address_space *mapping = page->mapping;
310+
311+ if (mapping) {
312+ write_lock_irq(&mapping->tree_lock);
313+
314+ /* check for race with truncate */
315+ if (page->mapping) {
316+ assert("vs-1652", page->mapping == mapping);
317+ if (mapping_cap_account_dirty(mapping))
318+ inc_zone_page_state(page,
319+ NR_FILE_DIRTY);
320+ radix_tree_tag_set(&mapping->page_tree,
321+ page->index,
322+ PAGECACHE_TAG_REISER4_MOVED);
323+ }
324+ write_unlock_irq(&mapping->tree_lock);
325+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
326+ }
327+ }
328+ return 0;
329+}
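A hedged illustration of how pages tagged PAGECACHE_TAG_REISER4_MOVED could later be collected for the deferred jnode work. radix_tree_gang_lookup_tag() is the stock 2.6.20 radix-tree API; the surrounding function is a sketch, not code from this patch:

  static void scan_moved_pages(struct address_space *mapping)
  {
          struct page *pages[16];
          unsigned int found;

          read_lock_irq(&mapping->tree_lock);
          found = radix_tree_gang_lookup_tag(&mapping->page_tree,
                                             (void **)pages, 0, 16,
                                             PAGECACHE_TAG_REISER4_MOVED);
          read_unlock_irq(&mapping->tree_lock);
          /* create jnodes for / capture the 'found' pages here */
  }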
330+
331+/* ->invalidatepage method for reiser4 */
332+
333+/*
334+ * this is called for each truncated page from
335+ * truncate_inode_pages()->truncate_{complete,partial}_page().
336+ *
337+ * At the moment of call, page is under lock, and outstanding io (if any) has
338+ * completed.
339+ */
340+
341+/**
342+ * reiser4_invalidatepage
343+ * @page: page to invalidate
344+ * @offset: starting offset for partial invalidation
345+ *
346+ */
347+void reiser4_invalidatepage(struct page *page, unsigned long offset)
348+{
349+ int ret = 0;
350+ reiser4_context *ctx;
351+ struct inode *inode;
352+ jnode *node;
353+
354+ /*
355+ * This is called to truncate file's page.
356+ *
357+ * Originally, reiser4 implemented truncate in a standard way
358+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
359+ * first, then file system ->truncate() call-back is invoked).
360+ *
361+ * This led to the problem where ->invalidatepage() was called on a
362+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
363+ * process. That is, truncate was bypassing transactions. To avoid
364+ * this, try_capture_page_to_invalidate() call was added here.
365+ *
366+ * After many troubles with vmtruncate() based truncate (including
367+ * races with flush, tail conversion, etc.) it was re-written in the
368+ * top-to-bottom style: items are killed in reiser4_cut_tree_object()
369+ * and pages belonging to extent are invalidated in kill_hook_extent().
370+ * So probably now additional call to capture is not needed here.
371+ */
372+
373+ assert("nikita-3137", PageLocked(page));
374+ assert("nikita-3138", !PageWriteback(page));
375+ inode = page->mapping->host;
376+
377+ /*
378+ * ->invalidatepage() should only be called for the unformatted
379+ * jnodes. Destruction of all other types of jnodes is performed
380+ * separately. But, during some corner cases (like handling errors
381+ * during mount) it is simpler to let ->invalidatepage to be called on
382+ * them. Check for this, and do nothing.
383+ */
384+ if (reiser4_get_super_fake(inode->i_sb) == inode)
385+ return;
386+ if (reiser4_get_cc_fake(inode->i_sb) == inode)
387+ return;
388+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
389+ return;
390+ assert("vs-1426", PagePrivate(page));
391+ assert("vs-1427",
392+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
393+ assert("", jprivate(page) != NULL);
394+ assert("", ergo(inode_file_plugin(inode) !=
395+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
396+ offset == 0));
397+
398+ ctx = reiser4_init_context(inode->i_sb);
399+ if (IS_ERR(ctx))
400+ return;
401+
402+ node = jprivate(page);
403+ spin_lock_jnode(node);
404+ if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
405+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
406+ /* there is no need to capture */
407+ jref(node);
408+ JF_SET(node, JNODE_HEARD_BANSHEE);
409+ page_clear_jnode(page, node);
410+ reiser4_uncapture_jnode(node);
411+ unhash_unformatted_jnode(node);
412+ jput(node);
413+ reiser4_exit_context(ctx);
414+ return;
415+ }
416+ spin_unlock_jnode(node);
417+
418+ /* capture page being truncated. */
419+ ret = try_capture_page_to_invalidate(page);
420+ if (ret != 0)
421+ warning("nikita-3141", "Cannot capture: %i", ret);
422+
423+ if (offset == 0) {
424+ /* remove jnode from transaction and detach it from page. */
425+ jref(node);
426+ JF_SET(node, JNODE_HEARD_BANSHEE);
427+ /* page cannot be detached from jnode concurrently, because it
428+ * is locked */
429+ reiser4_uncapture_page(page);
430+
431+ /* this detaches page from jnode, so that jdelete will not try
432+ * to lock page which is already locked */
433+ spin_lock_jnode(node);
434+ page_clear_jnode(page, node);
435+ spin_unlock_jnode(node);
436+ unhash_unformatted_jnode(node);
437+
438+ jput(node);
439+ }
440+
441+ reiser4_exit_context(ctx);
442+}
443+
444+/* helper function called from reiser4_releasepage(). It returns true if the
445+ * jnode can be detached from its page and the page released. */
446+int jnode_is_releasable(jnode * node /* node to check */ )
447+{
448+ assert("nikita-2781", node != NULL);
449+ assert_spin_locked(&(node->guard));
450+ assert_spin_locked(&(node->load));
451+
452+ /* if some thread is currently using the jnode page, the latter
453+ * cannot be detached */
454+ if (atomic_read(&node->d_count) != 0) {
455+ return 0;
456+ }
457+
458+ assert("vs-1214", !jnode_is_loaded(node));
459+
460+ /*
461+ * can only release the page if a real block number is assigned to it.
462+ * A simple check for ->atom wouldn't do, because it is possible for a
463+ * node to be clean, not in an atom yet, and still have a fake block
464+ * number. For example, a node just created in jinit_new().
465+ */
466+ if (reiser4_blocknr_is_fake(jnode_get_block(node)))
467+ return 0;
468+
469+ /*
470+ * pages prepared for write cannot be released anyway, so avoid
471+ * detaching jnode from the page
472+ */
473+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
474+ return 0;
475+
476+ /*
477+ * dirty jnode cannot be released. It can however be submitted to disk
478+ * as part of early flushing, but only after getting flush-prepped.
479+ */
480+ if (JF_ISSET(node, JNODE_DIRTY))
481+ return 0;
482+
483+ /* overwrite set is only written by log writer. */
484+ if (JF_ISSET(node, JNODE_OVRWR))
485+ return 0;
486+
487+ /* jnode is already under writeback */
488+ if (JF_ISSET(node, JNODE_WRITEBACK))
489+ return 0;
490+
491+ /* don't flush bitmaps or journal records */
492+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
493+ return 0;
494+
495+ return 1;
496+}
497+
498+/*
499+ * ->releasepage method for reiser4
500+ *
501+ * This is called by the VM scanner when it comes across a clean page. What
502+ * we have to do here is to check whether the page can really be released
503+ * (that is, freed) and, if so, detach the jnode from it and remove the page
504+ * from the page cache.
505+ *
506+ * The check for releasability is done by jnode_is_releasable().
506+ */
507+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
508+{
509+ jnode *node;
510+
511+ assert("nikita-2257", PagePrivate(page));
512+ assert("nikita-2259", PageLocked(page));
513+ assert("nikita-2892", !PageWriteback(page));
514+ assert("nikita-3019", reiser4_schedulable());
515+
516+ /* NOTE-NIKITA: this can be called in the context of a reiser4 call. It
517+ is not clear what to do in this case. A lot of deadlocks seem
518+ possible. */
519+
520+ node = jnode_by_page(page);
521+ assert("nikita-2258", node != NULL);
522+ assert("reiser4-4", page->mapping != NULL);
523+ assert("reiser4-5", page->mapping->host != NULL);
524+
525+ if (PageDirty(page))
526+ return 0;
527+
528+ /* extra page reference is used by reiser4 to protect
529+ * jnode<->page link from this ->releasepage(). */
530+ if (page_count(page) > 3)
531+ return 0;
532+
533+ /* releasable() needs jnode lock, because it looks at the jnode fields
534+ * and we need jload_lock here to avoid races with jload(). */
535+ spin_lock_jnode(node);
536+ spin_lock(&(node->load));
537+ if (jnode_is_releasable(node)) {
538+ struct address_space *mapping;
539+
540+ mapping = page->mapping;
541+ jref(node);
542+ /* there is no need to synchronize against
543+ * jnode_extent_write() here, because pages seen by
544+ * jnode_extent_write() are !releasable(). */
545+ page_clear_jnode(page, node);
546+ spin_unlock(&(node->load));
547+ spin_unlock_jnode(node);
548+
549+ /* we are under memory pressure so release jnode also. */
550+ jput(node);
551+
552+ return 1;
553+ } else {
554+ spin_unlock(&(node->load));
555+ spin_unlock_jnode(node);
556+ assert("nikita-3020", reiser4_schedulable());
557+ return 0;
558+ }
559+}
560+
561+/* Make Linus happy.
562+ Local variables:
563+ c-indentation-style: "K&R"
564+ mode-name: "LC"
565+ c-basic-offset: 8
566+ tab-width: 8
567+ fill-column: 120
568+ End:
569+*/
570diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.c linux-2.6.20/fs/reiser4/block_alloc.c
571--- linux-2.6.20.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300
572+++ linux-2.6.20/fs/reiser4/block_alloc.c 2007-05-06 14:50:43.682970725 +0400
573@@ -0,0 +1,1137 @@
574+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
575+
576+#include "debug.h"
577+#include "dformat.h"
578+#include "plugin/plugin.h"
579+#include "txnmgr.h"
580+#include "znode.h"
581+#include "block_alloc.h"
582+#include "tree.h"
583+#include "super.h"
584+
585+#include <linux/types.h> /* for __u?? */
586+#include <linux/fs.h> /* for struct super_block */
587+#include <linux/spinlock.h>
588+
589+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
590+
591+/* We need to be able to reserve enough disk space to ensure that an atomic
592+ operation will have enough disk space to flush (see flush.c and
593+ http://namesys.com/v4/v4.html) and commit it once it is started.
594+
595+ In our design a call to reserve disk space may fail, but an actual
596+ block allocation never does.
597+
598+ All free blocks, already allocated blocks, and all kinds of reserved blocks
599+ are counted in different per-fs block counters.
600+
601+ A reiser4 super block's set of block counters currently is:
602+
603+ free -- free blocks,
604+ used -- already allocated blocks,
605+
606+ grabbed -- initially reserved for performing an fs operation; those blocks
607+ are taken from free blocks, then grabbed disk space leaks from the
608+ grabbed blocks counter to other counters like "fake allocated", "flush
609+ reserved" and "used"; the rest of the unused grabbed space is returned
610+ to free space at the end of the fs operation;
611+
612+ fake allocated -- counts all nodes without real disk block numbers assigned,
613+ we have separate accounting for formatted and unformatted
614+ nodes (for easier debugging);
615+
616+ flush reserved -- disk space needed for flushing and committing an atom.
617+ Each dirty already allocated block could be written as a
618+ part of atom's overwrite set or as a part of atom's
619+ relocate set. In both cases one additional block is needed;
620+ it is used as a wandered block if we do overwrite or as a
621+ new location for a relocated block.
622+
623+ In addition, blocks in some states are counted on per-thread and per-atom
624+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
625+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
626+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
627+ blocks, which are reserved for flush processing and atom commit. */
628+
629+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
630+ number of blocks to grab for most expensive case of balancing when the leaf
631+ node we insert new item to gets split and new leaf node is allocated.
632+
633+ So, we need to grab blocks for
634+
635+ 1) one block for possible dirtying the node we insert an item to. That block
636+ would be used for node relocation at flush time or for allocating of a
637+ wandered one, it depends what will be a result (what set, relocate or
638+ overwrite the node gets assigned to) of the node processing by the flush
639+ algorithm.
640+
641+ 2) one block for either allocating a new node or dirtying the right or left
642+ clean neighbor; only one of these cases may happen.
643+
644+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
645+ node, and creation of new node. have I forgotten something? email me.
646+
647+ These grabbed blocks are counted in both reiser4 context "grabbed blocks"
648+ counter and in the fs-wide one (both ctx->grabbed_blocks and
649+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
650+ decremented by 2.
651+
652+ Suppose both blocks were spent: one for dirtying an already allocated clean
653+ node (that block went from "grabbed" to "flush reserved") and one for a new
654+ block allocation (that block went from "grabbed" to "fake allocated formatted").
655+
656+ Inserting a child pointer into the parent node caused the parent node to be
657+ split; the balancing code takes care of this by immediately grabbing the
658+ necessary space, calling reiser4_grab with the BA_RESERVED flag set, which
659+ means "can use the 5% reserved disk space".
660+
661+ At this moment insertion completes and grabbed blocks (if they were not used)
662+ should be returned to the free space counter.
663+
664+ However the atom life-cycle is not completed. The atom had one "flush
665+ reserved" block added by our insertion and the new fake allocated node is
666+ counted as a "fake allocated formatted" one. The atom has to be fully
667+ processed by flush before commit. Suppose that the flush moved the first,
668+ already allocated node to the atom's overwrite list, the new fake allocated
669+ node, obviously, went into the atom relocate set. The reiser4 flush
670+ allocates the new node using one unit from "fake allocated formatted"
671+ counter, the log writer uses one from "flush reserved" for wandered block
672+ allocation.
673+
674+ And that is not the end. When the wandered block is deallocated after the
675+ atom gets fully played (see wander.c for term description), the disk space
676+ occupied for it is returned to free blocks. */
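Since every block is in exactly one of these states, the counters must always sum to the device size; that is the invariant reiser4_check_block_counters() below verifies. A toy userspace model (not reiser4 code) of two of the transitions:

  #include <assert.h>

  struct counters {
          unsigned long long free, used, grabbed, fake_formatted,
                  fake_unformatted, flush_reserved, clustered, total;
  };

  /* free -> grabbed: space is reserved before an operation starts */
  static void grab(struct counters *c, unsigned long long n)
  {
          assert(c->free >= n);
          c->free -= n;
          c->grabbed += n;
  }

  /* grabbed -> used: a real on-disk block gets allocated */
  static void grabbed_to_used(struct counters *c, unsigned long long n)
  {
          assert(c->grabbed >= n);
          c->grabbed -= n;
          c->used += n;
  }

  /* the invariant: the states partition the device */
  static int counters_ok(const struct counters *c)
  {
          return c->free + c->used + c->grabbed + c->fake_formatted +
                 c->fake_unformatted + c->flush_reserved +
                 c->clustered == c->total;
  }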
677+
678+/* BLOCK NUMBERS */
679+
680+/* Any reiser4 node has a block number assigned to it. We use these numbers for
681+ indexing in hash tables, so if a block has not yet been assigned a location
682+ on disk we need to give it a temporary fake block number.
683+
684+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
685+ use the highest bit in a 64-bit block number to distinguish fake and real
686+ block numbers. So, only 63 bits may be used for addressing real device
687+ blocks. The "fake" block number space is divided into subspaces of fake
688+ block numbers for data blocks and for shadow (working) bitmap blocks.
689+
690+ Fake block numbers for data blocks are generated by a cyclic counter, which
691+ gets incremented after each real block allocation. We assume that it is
692+ impossible to overload this counter during one transaction life. */
693+
694+/* Initialize a blocknr hint. */
695+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
696+{
697+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
698+}
699+
700+/* Release any resources of a blocknr hint. */
701+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
702+{
703+ /* No resources should be freed in current blocknr_hint implementation. */
704+}
705+
706+/* see above for explanation of fake block number. */
707+/* Audited by: green(2002.06.11) */
708+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
709+{
710+ /* The reason for not simply returning the result of the '&' operation
711+ is that while the return value is a (possibly 32-bit) int, the
712+ reiser4_block_nr is at least 64 bits long, and the high bit (the only
713+ possible non-zero bit after the masking) would be stripped off */
714+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
715+}
716+
717+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
718+ arithmetic. Mostly, they are isolated so that the same assertions are not
719+ coded in several places. */
720+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
721+{
722+ BUG_ON(ctx->grabbed_blocks < count);
723+ assert("zam-527", ctx->grabbed_blocks >= count);
724+ ctx->grabbed_blocks -= count;
725+}
726+
727+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
728+{
729+ ctx->grabbed_blocks += count;
730+}
731+
732+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
733+{
734+ assert("zam-525", sbinfo->blocks_grabbed >= count);
735+ sbinfo->blocks_grabbed -= count;
736+}
737+
738+/* Decrease the counter of block reserved for flush in super block. */
739+static void
740+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
741+{
742+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
743+ sbinfo->blocks_flush_reserved -= count;
744+}
745+
746+static void
747+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
748+ reiser4_ba_flags_t flags)
749+{
750+ if (flags & BA_FORMATTED) {
751+ assert("zam-806", sbinfo->blocks_fake_allocated >= count);
752+ sbinfo->blocks_fake_allocated -= count;
753+ } else {
754+ assert("zam-528",
755+ sbinfo->blocks_fake_allocated_unformatted >= count);
756+ sbinfo->blocks_fake_allocated_unformatted -= count;
757+ }
758+}
759+
760+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
761+{
762+ assert("zam-530",
763+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
764+ sbinfo->blocks_used -= count;
765+}
766+
767+static void
768+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
769+{
770+ assert("edward-501", sbinfo->blocks_clustered >= count);
771+ sbinfo->blocks_clustered -= count;
772+}
773+
774+/* Increase the counter of block reserved for flush in atom. */
775+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
776+{
777+ assert("zam-772", atom != NULL);
778+ assert_spin_locked(&(atom->alock));
779+ atom->flush_reserved += count;
780+}
781+
782+/* Decrease the counter of block reserved for flush in atom. */
783+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
784+{
785+ assert("zam-774", atom != NULL);
786+ assert_spin_locked(&(atom->alock));
787+ assert("nikita-2790", atom->flush_reserved >= count);
788+ atom->flush_reserved -= count;
789+}
790+
791+/* the super block has several counters: free, used, grabbed, fake allocated
792+ (formatted and unformatted), flush reserved and clustered. Their sum must
793+ be the number of blocks on the device. This function checks this. */
794+int reiser4_check_block_counters(const struct super_block *super)
795+{
796+ __u64 sum;
797+
798+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
799+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
800+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
801+ reiser4_clustered_blocks(super);
802+ if (reiser4_block_count(super) != sum) {
803+ printk("super block counters: "
804+ "used %llu, free %llu, "
805+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
806+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
807+ (unsigned long long)reiser4_data_blocks(super),
808+ (unsigned long long)reiser4_free_blocks(super),
809+ (unsigned long long)reiser4_grabbed_blocks(super),
810+ (unsigned long long)reiser4_fake_allocated(super),
811+ (unsigned long long)
812+ reiser4_fake_allocated_unformatted(super),
813+ (unsigned long long)reiser4_flush_reserved(super),
814+ (unsigned long long)reiser4_clustered_blocks(super),
815+ (unsigned long long)sum,
816+ (unsigned long long)reiser4_block_count(super));
817+ return 0;
818+ }
819+ return 1;
820+}
821+
822+/* Adjust "working" free blocks counter for number of blocks we are going to
823+ allocate. Record number of grabbed blocks in fs-wide and per-thread
824+ counters. This function should be called before bitmap scanning or
825+ allocating fake block numbers.
826+
827+ @super -- pointer to reiser4 super block;
828+ @count -- number of blocks we reserve;
829+
830+ @return -- 0 on success; -ENOSPC if all
831+ free blocks are reserved or already allocated.
832+*/
833+
834+static int
835+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
836+{
837+ __u64 free_blocks;
838+ int ret = 0, use_reserved = flags & BA_RESERVED;
839+ reiser4_super_info_data *sbinfo;
840+
841+ assert("vs-1276", ctx == get_current_context());
842+
843+ /* Do not grab anything on ro-mounted fs. */
844+ if (rofs_super(ctx->super)) {
845+ ctx->grab_enabled = 0;
846+ return 0;
847+ }
848+
849+ sbinfo = get_super_private(ctx->super);
850+
851+ spin_lock_reiser4_super(sbinfo);
852+
853+ free_blocks = sbinfo->blocks_free;
854+
855+ if ((use_reserved && free_blocks < count) ||
856+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
857+ ret = RETERR(-ENOSPC);
858+ goto unlock_and_ret;
859+ }
860+
861+ add_to_ctx_grabbed(ctx, count);
862+
863+ sbinfo->blocks_grabbed += count;
864+ sbinfo->blocks_free -= count;
865+
866+#if REISER4_DEBUG
867+ if (ctx->grabbed_initially == 0)
868+ ctx->grabbed_initially = count;
869+#endif
870+
871+ assert("nikita-2986", reiser4_check_block_counters(ctx->super));
872+
873+ /* disable grab space in current context */
874+ ctx->grab_enabled = 0;
875+
876+ unlock_and_ret:
877+ spin_unlock_reiser4_super(sbinfo);
878+
879+ return ret;
880+}
881+
882+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
883+{
884+ int ret;
885+ reiser4_context *ctx;
886+
887+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
888+ lock_stack_isclean(get_current_lock_stack
889+ ())));
890+ ctx = get_current_context();
891+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
892+ return 0;
893+ }
894+
895+ ret = reiser4_grab(ctx, count, flags);
896+ if (ret == -ENOSPC) {
897+
898+ /* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
899+ if (flags & BA_CAN_COMMIT) {
900+ txnmgr_force_commit_all(ctx->super, 0);
901+ ctx->grab_enabled = 1;
902+ ret = reiser4_grab(ctx, count, flags);
903+ }
904+ }
905+ /*
906+ * allocation from the reserved pool cannot fail; failing here is a severe error.
907+ */
908+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
909+ return ret;
910+}
911+
912+/*
913+ * SPACE RESERVED FOR UNLINK/TRUNCATE
914+ *
915+ * Unlink and truncate require space in transaction (to update stat data, at
916+ * least). But we don't want rm(1) to fail with "No space on device" error.
917+ *
918+ * Solution is to reserve 5% of disk space for truncates and
919+ * unlinks. Specifically, normal space grabbing requests don't grab space from
920+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
921+ * drain it. Per super block delete mutex is used to allow only one
922+ * thread at a time to grab from reserved area.
923+ *
924+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
925+ * flag.
926+ *
927+ */
928+
929+int reiser4_grab_reserved(struct super_block *super,
930+ __u64 count, reiser4_ba_flags_t flags)
931+{
932+ reiser4_super_info_data *sbinfo = get_super_private(super);
933+
934+ assert("nikita-3175", flags & BA_CAN_COMMIT);
935+
936+ /* Check whether the delete mutex is already taken by us; we assume
937+ * that reading a machine word is atomic. */
938+ if (sbinfo->delete_mutex_owner == current) {
939+ if (reiser4_grab_space
940+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
941+ warning("zam-1003",
942+ "nested call of grab_reserved fails count=(%llu)",
943+ (unsigned long long)count);
944+ reiser4_release_reserved(super);
945+ return RETERR(-ENOSPC);
946+ }
947+ return 0;
948+ }
949+
950+ if (reiser4_grab_space(count, flags)) {
951+ mutex_lock(&sbinfo->delete_mutex);
952+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
953+ sbinfo->delete_mutex_owner = current;
954+
955+ if (reiser4_grab_space(count, flags | BA_RESERVED)) {
956+ warning("zam-833",
957+ "reserved space is not enough (%llu)",
958+ (unsigned long long)count);
959+ reiser4_release_reserved(super);
960+ return RETERR(-ENOSPC);
961+ }
962+ }
963+ return 0;
964+}
965+
966+void reiser4_release_reserved(struct super_block *super)
967+{
968+ reiser4_super_info_data *info;
969+
970+ info = get_super_private(super);
971+ if (info->delete_mutex_owner == current) {
972+ info->delete_mutex_owner = NULL;
973+ mutex_unlock(&info->delete_mutex);
974+ }
975+}
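A hedged sketch of the intended call pattern around the reserved pool; delete_object_example is a hypothetical caller and error handling is abbreviated:

  static int delete_object_example(struct super_block *super, __u64 needed)
  {
          int ret;

          /* falls back to the 5% reserve, taking the delete mutex */
          ret = reiser4_grab_reserved(super, needed, BA_CAN_COMMIT);
          if (ret)
                  return ret;     /* -ENOSPC: even the reserve is exhausted */

          /* ... cut tree items, update stat data, etc. ... */

          /* drops the delete mutex if this thread took it */
          reiser4_release_reserved(super);
          return 0;
  }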
976+
977+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
978+{
979+ reiser4_context *ctx;
980+ reiser4_super_info_data *sbinfo;
981+
982+ ctx = get_current_context();
983+ sub_from_ctx_grabbed(ctx, count);
984+
985+ sbinfo = get_super_private(ctx->super);
986+ spin_lock_reiser4_super(sbinfo);
987+
988+ sub_from_sb_grabbed(sbinfo, count);
989+ /* return sbinfo locked */
990+ return sbinfo;
991+}
992+
993+/* is called after @count fake block numbers are allocated and pointers to
994+ those blocks are inserted into the tree. */
995+static void grabbed2fake_allocated_formatted(void)
996+{
997+ reiser4_super_info_data *sbinfo;
998+
999+ sbinfo = grabbed2fake_allocated_head(1);
1000+ sbinfo->blocks_fake_allocated++;
1001+
1002+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1003+
1004+ spin_unlock_reiser4_super(sbinfo);
1005+}
1006+
1007+/**
1008+ * grabbed2fake_allocated_unformatted
1009+ * @count: number of blocks to move from "grabbed" to "fake allocated"
1010+ *
1011+ */
1012+static void grabbed2fake_allocated_unformatted(int count)
1013+{
1014+ reiser4_super_info_data *sbinfo;
1015+
1016+ sbinfo = grabbed2fake_allocated_head(count);
1017+ sbinfo->blocks_fake_allocated_unformatted += count;
1018+
1019+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1020+
1021+ spin_unlock_reiser4_super(sbinfo);
1022+}
1023+
1024+void grabbed2cluster_reserved(int count)
1025+{
1026+ reiser4_context *ctx;
1027+ reiser4_super_info_data *sbinfo;
1028+
1029+ ctx = get_current_context();
1030+ sub_from_ctx_grabbed(ctx, count);
1031+
1032+ sbinfo = get_super_private(ctx->super);
1033+ spin_lock_reiser4_super(sbinfo);
1034+
1035+ sub_from_sb_grabbed(sbinfo, count);
1036+ sbinfo->blocks_clustered += count;
1037+
1038+ assert("edward-504", reiser4_check_block_counters(ctx->super));
1039+
1040+ spin_unlock_reiser4_super(sbinfo);
1041+}
1042+
1043+void cluster_reserved2grabbed(int count)
1044+{
1045+ reiser4_context *ctx;
1046+ reiser4_super_info_data *sbinfo;
1047+
1048+ ctx = get_current_context();
1049+
1050+ sbinfo = get_super_private(ctx->super);
1051+ spin_lock_reiser4_super(sbinfo);
1052+
1053+ sub_from_cluster_reserved(sbinfo, count);
1054+ sbinfo->blocks_grabbed += count;
1055+
1056+ assert("edward-505", reiser4_check_block_counters(ctx->super));
1057+
1058+ spin_unlock_reiser4_super(sbinfo);
1059+ add_to_ctx_grabbed(ctx, count);
1060+}
1061+
1062+void cluster_reserved2free(int count)
1063+{
1064+ reiser4_context *ctx;
1065+ reiser4_super_info_data *sbinfo;
1066+
1067+ ctx = get_current_context();
1068+ sbinfo = get_super_private(ctx->super);
1069+
1070+ cluster_reserved2grabbed(count);
1071+ grabbed2free(ctx, sbinfo, count);
1072+}
1073+
1074+static DEFINE_SPINLOCK(fake_lock);
1075+static reiser4_block_nr fake_gen = 0;
1076+
1077+/**
1078+ * assign_fake_blocknr
1079+ * @blocknr: where the first fake block number is stored
1080+ * @count: number of consecutive fake block numbers to reserve
1081+ *
1082+ * Obtain a fake block number for new node which will be used to refer to
1083+ * this newly allocated node until real allocation is done.
1084+ */
1085+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1086+{
1087+ spin_lock(&fake_lock);
1088+ *blocknr = fake_gen;
1089+ fake_gen += count;
1090+ spin_unlock(&fake_lock);
1091+
1092+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1093+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1094+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1095+ assert("zam-394", zlook(current_tree, blocknr) == NULL);
1096+}
1097+
1098+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1099+{
1100+ assign_fake_blocknr(blocknr, 1);
1101+ grabbed2fake_allocated_formatted();
1102+ return 0;
1103+}
1104+
1105+/**
1106+ * fake_blocknr_unformatted
1107+ * @count: number of fake numbers to get
1108+ *
1109+ * Allocates @count fake block numbers which will be assigned to jnodes
1110+ */
1111+reiser4_block_nr fake_blocknr_unformatted(int count)
1112+{
1113+ reiser4_block_nr blocknr;
1114+
1115+ assign_fake_blocknr(&blocknr, count);
1116+ grabbed2fake_allocated_unformatted(count);
1117+
1118+ return blocknr;
1119+}
1120+
1121+/* adjust sb block counters, if real (on-disk) block allocation immediately
1122+ follows grabbing of free disk space. */
1123+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1124+ __u64 count)
1125+{
1126+ sub_from_ctx_grabbed(ctx, count);
1127+
1128+ spin_lock_reiser4_super(sbinfo);
1129+
1130+ sub_from_sb_grabbed(sbinfo, count);
1131+ sbinfo->blocks_used += count;
1132+
1133+ assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1134+
1135+ spin_unlock_reiser4_super(sbinfo);
1136+}
1137+
1138+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1139+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1140+ reiser4_ba_flags_t flags)
1141+{
1142+ spin_lock_reiser4_super(sbinfo);
1143+
1144+ sub_from_sb_fake_allocated(sbinfo, count, flags);
1145+ sbinfo->blocks_used += count;
1146+
1147+ assert("nikita-2680",
1148+ reiser4_check_block_counters(reiser4_get_current_sb()));
1149+
1150+ spin_unlock_reiser4_super(sbinfo);
1151+}
1152+
1153+static void flush_reserved2used(txn_atom * atom, __u64 count)
1154+{
1155+ reiser4_super_info_data *sbinfo;
1156+
1157+ assert("zam-787", atom != NULL);
1158+ assert_spin_locked(&(atom->alock));
1159+
1160+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1161+
1162+ sbinfo = get_current_super_private();
1163+ spin_lock_reiser4_super(sbinfo);
1164+
1165+ sub_from_sb_flush_reserved(sbinfo, count);
1166+ sbinfo->blocks_used += count;
1167+
1168+ assert("zam-789",
1169+ reiser4_check_block_counters(reiser4_get_current_sb()));
1170+
1171+ spin_unlock_reiser4_super(sbinfo);
1172+}
1173+
1174+/* update the per fs blocknr hint default value. */
1175+void
1176+update_blocknr_hint_default(const struct super_block *s,
1177+ const reiser4_block_nr * block)
1178+{
1179+ reiser4_super_info_data *sbinfo = get_super_private(s);
1180+
1181+ assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1182+
1183+ spin_lock_reiser4_super(sbinfo);
1184+ if (*block < sbinfo->block_count) {
1185+ sbinfo->blocknr_hint_default = *block;
1186+ } else {
1187+ warning("zam-676",
1188+ "block number %llu is too large to be used in a blocknr hint\n",
1189+ (unsigned long long)*block);
1190+ dump_stack();
1191+ DEBUGON(1);
1192+ }
1193+ spin_unlock_reiser4_super(sbinfo);
1194+}
1195+
1196+/* get current value of the default blocknr hint. */
1197+void get_blocknr_hint_default(reiser4_block_nr * result)
1198+{
1199+ reiser4_super_info_data *sbinfo = get_current_super_private();
1200+
1201+ spin_lock_reiser4_super(sbinfo);
1202+ *result = sbinfo->blocknr_hint_default;
1203+ assert("zam-677", *result < sbinfo->block_count);
1204+ spin_unlock_reiser4_super(sbinfo);
1205+}
1206+
1207+/* Allocate "real" disk blocks by calling a proper space allocation plugin
1208+ * method. Blocks are allocated in one contiguous disk region. The plugin
1209+ * independent part accounts for blocks by subtracting the allocated amount
1210+ * from the grabbed or fake block counter and adding the same amount to the
1211+ * counter of allocated blocks.
1212+ *
1213+ * @hint -- a reiser4 blocknr hint object which contains further block
1214+ * allocation hints and parameters (search start, a stage of block
1215+ * which will be mapped to disk, etc.),
1216+ * @blk -- an out parameter for the beginning of the allocated region,
1217+ * @len -- in/out parameter, it should contain the maximum number of allocated
1218+ * blocks, after block allocation completes, it contains the length of
1219+ * allocated disk region.
1220+ * @flags -- see reiser4_ba_flags_t description.
1221+ *
1222+ * @return -- 0 if success, error code otherwise.
1223+ */
1224+int
1225+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1226+ reiser4_block_nr * len, reiser4_ba_flags_t flags)
1227+{
1228+ __u64 needed = *len;
1229+ reiser4_context *ctx;
1230+ reiser4_super_info_data *sbinfo;
1231+ int ret;
1232+
1233+ assert("zam-986", hint != NULL);
1234+
1235+ ctx = get_current_context();
1236+ sbinfo = get_super_private(ctx->super);
1237+
1238+ /* For write-optimized data we use default search start value, which is
1239+ * close to last write location. */
1240+ if (flags & BA_USE_DEFAULT_SEARCH_START) {
1241+ get_blocknr_hint_default(&hint->blk);
1242+ }
1243+
1244+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1245+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1246+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
1247+ ret = reiser4_grab_space_force(*len, flags);
1248+ if (ret != 0)
1249+ return ret;
1250+ }
1251+
1252+ ret =
1253+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1254+ hint, (int)needed, blk, len);
1255+
1256+ if (!ret) {
1257+ assert("zam-680", *blk < reiser4_block_count(ctx->super));
1258+ assert("zam-681",
1259+ *blk + *len <= reiser4_block_count(ctx->super));
1260+
1261+ if (flags & BA_PERMANENT) {
1262+ /* we assume that current atom exists at this moment */
1263+ txn_atom *atom = get_current_atom_locked();
1264+ atom->nr_blocks_allocated += *len;
1265+ spin_unlock_atom(atom);
1266+ }
1267+
1268+ switch (hint->block_stage) {
1269+ case BLOCK_NOT_COUNTED:
1270+ case BLOCK_GRABBED:
1271+ grabbed2used(ctx, sbinfo, *len);
1272+ break;
1273+ case BLOCK_UNALLOCATED:
1274+ fake_allocated2used(sbinfo, *len, flags);
1275+ break;
1276+ case BLOCK_FLUSH_RESERVED:
1277+ {
1278+ txn_atom *atom = get_current_atom_locked();
1279+ flush_reserved2used(atom, *len);
1280+ spin_unlock_atom(atom);
1281+ }
1282+ break;
1283+ default:
1284+ impossible("zam-531", "wrong block stage");
1285+ }
1286+ } else {
1287+ assert("zam-821",
1288+ ergo(hint->max_dist == 0
1289+ && !hint->backward, ret != -ENOSPC));
1290+ if (hint->block_stage == BLOCK_NOT_COUNTED)
1291+ grabbed2free(ctx, sbinfo, needed);
1292+ }
1293+
1294+ return ret;
1295+}
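A hedged sketch of a caller allocating up to 16 already-grabbed blocks near the per-fs default search start; the flag combination is illustrative:

  static int alloc_data_blocks_example(void)
  {
          reiser4_blocknr_hint hint;
          reiser4_block_nr blk, len = 16;
          int ret;

          reiser4_blocknr_hint_init(&hint);
          hint.block_stage = BLOCK_GRABBED;       /* space grabbed earlier */
          ret = reiser4_alloc_blocks(&hint, &blk, &len,
                                     BA_USE_DEFAULT_SEARCH_START);
          /* on success, [blk, blk + len) is allocated; len may be < 16 */
          reiser4_blocknr_hint_done(&hint);
          return ret;
  }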
1296+
1297+/* used -> fake_allocated -> grabbed -> free */
1298+
1299+/* adjust sb block counters when @count unallocated blocks get unmapped from
1300+ disk */
1301+static void
1302+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1303+ int formatted)
1304+{
1305+ spin_lock_reiser4_super(sbinfo);
1306+
1307+ if (formatted)
1308+ sbinfo->blocks_fake_allocated += count;
1309+ else
1310+ sbinfo->blocks_fake_allocated_unformatted += count;
1311+
1312+ sub_from_sb_used(sbinfo, count);
1313+
1314+ assert("nikita-2681",
1315+ reiser4_check_block_counters(reiser4_get_current_sb()));
1316+
1317+ spin_unlock_reiser4_super(sbinfo);
1318+}
1319+
1320+static void
1321+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1322+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1323+{
1324+ assert("nikita-2791", atom != NULL);
1325+ assert_spin_locked(&(atom->alock));
1326+
1327+ add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1328+
1329+ spin_lock_reiser4_super(sbinfo);
1330+
1331+ sbinfo->blocks_flush_reserved += count;
1332+ /*add_to_sb_flush_reserved(sbinfo, count); */
1333+ sub_from_sb_used(sbinfo, count);
1334+
1335+ assert("nikita-2681",
1336+ reiser4_check_block_counters(reiser4_get_current_sb()));
1337+
1338+ spin_unlock_reiser4_super(sbinfo);
1339+}
1340+
1341+/* disk space virtually used by fake block numbers is counted as "grabbed" again */
1342+static void
1343+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1344+ __u64 count, reiser4_ba_flags_t flags)
1345+{
1346+ add_to_ctx_grabbed(ctx, count);
1347+
1348+ spin_lock_reiser4_super(sbinfo);
1349+
1350+ assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1351+
1352+ sbinfo->blocks_grabbed += count;
1353+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1354+
1355+ assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1356+
1357+ spin_unlock_reiser4_super(sbinfo);
1358+}
1359+
1360+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1361+{
1362+ reiser4_context *ctx;
1363+ reiser4_super_info_data *sbinfo;
1364+
1365+ ctx = get_current_context();
1366+ sbinfo = get_super_private(ctx->super);
1367+
1368+ fake_allocated2grabbed(ctx, sbinfo, count, flags);
1369+ grabbed2free(ctx, sbinfo, count);
1370+}
1371+
1372+void grabbed2free_mark(__u64 mark)
1373+{
1374+ reiser4_context *ctx;
1375+ reiser4_super_info_data *sbinfo;
1376+
1377+ ctx = get_current_context();
1378+ sbinfo = get_super_private(ctx->super);
1379+
1380+ assert("nikita-3007", (__s64) mark >= 0);
1381+ assert("nikita-3006", ctx->grabbed_blocks >= mark);
1382+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1383+}
1384+
1385+/**
1386+ * grabbed2free - adjust grabbed and free block counters
1387+ * @ctx: context to update grabbed block counter of
1388+ * @sbinfo: super block to update grabbed and free block counters of
1389+ * @count: number of blocks to adjust counters by
1390+ *
1391+ * Decreases context's and per filesystem's counters of grabbed
1392+ * blocks. Increases per filesystem's counter of free blocks.
1393+ */
1394+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1395+ __u64 count)
1396+{
1397+ sub_from_ctx_grabbed(ctx, count);
1398+
1399+ spin_lock_reiser4_super(sbinfo);
1400+
1401+ sub_from_sb_grabbed(sbinfo, count);
1402+ sbinfo->blocks_free += count;
1403+ assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1404+
1405+ spin_unlock_reiser4_super(sbinfo);
1406+}
1407+
1408+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1409+{
1410+ reiser4_context *ctx;
1411+ reiser4_super_info_data *sbinfo;
1412+
1413+ assert("vs-1095", atom);
1414+
1415+ ctx = get_current_context();
1416+ sbinfo = get_super_private(ctx->super);
1417+
1418+ sub_from_ctx_grabbed(ctx, count);
1419+
1420+ add_to_atom_flush_reserved_nolock(atom, count);
1421+
1422+ spin_lock_reiser4_super(sbinfo);
1423+
1424+ sbinfo->blocks_flush_reserved += count;
1425+ sub_from_sb_grabbed(sbinfo, count);
1426+
1427+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
1428+
1429+ spin_unlock_reiser4_super(sbinfo);
1430+}
1431+
1432+void grabbed2flush_reserved(__u64 count)
1433+{
1434+ txn_atom *atom = get_current_atom_locked();
1435+
1436+ grabbed2flush_reserved_nolock(atom, count);
1437+
1438+ spin_unlock_atom(atom);
1439+}
1440+
1441+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1442+{
1443+ reiser4_context *ctx;
1444+ reiser4_super_info_data *sbinfo;
1445+
1446+ assert("nikita-2788", atom != NULL);
1447+ assert_spin_locked(&(atom->alock));
1448+
1449+ ctx = get_current_context();
1450+ sbinfo = get_super_private(ctx->super);
1451+
1452+ add_to_ctx_grabbed(ctx, count);
1453+
1454+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1455+
1456+ spin_lock_reiser4_super(sbinfo);
1457+
1458+ sbinfo->blocks_grabbed += count;
1459+ sub_from_sb_flush_reserved(sbinfo, count);
1460+
1461+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
1462+
1463+ spin_unlock_reiser4_super(sbinfo);
1464+}
1465+
1466+/**
1467+ * all_grabbed2free - releases all blocks grabbed in context
1468+ *
1469+ * Decreases context's and super block's grabbed block counters by number of
1470+ * blocks grabbed by current context and increases super block's free block
1471+ * counter correspondingly.
1472+ */
1473+void all_grabbed2free(void)
1474+{
1475+ reiser4_context *ctx = get_current_context();
1476+
1477+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1478+}
1479+
1480+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1481+ after freeing, @count blocks become "grabbed". */
1482+static void
1483+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1484+ __u64 count)
1485+{
1486+ add_to_ctx_grabbed(ctx, count);
1487+
1488+ spin_lock_reiser4_super(sbinfo);
1489+
1490+ sbinfo->blocks_grabbed += count;
1491+ sub_from_sb_used(sbinfo, count);
1492+
1493+ assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1494+
1495+ spin_unlock_reiser4_super(sbinfo);
1496+}
1497+
1498+/* this used to be done through used2grabbed and grabbed2free*/
1499+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1500+{
1501+ spin_lock_reiser4_super(sbinfo);
1502+
1503+ sbinfo->blocks_free += count;
1504+ sub_from_sb_used(sbinfo, count);
1505+
1506+ assert("nikita-2685",
1507+ reiser4_check_block_counters(reiser4_get_current_sb()));
1508+
1509+ spin_unlock_reiser4_super(sbinfo);
1510+}
1511+
1512+#if REISER4_DEBUG
1513+
1514+/* check "allocated" state of given block range */
1515+static void
1516+reiser4_check_blocks(const reiser4_block_nr * start,
1517+ const reiser4_block_nr * len, int desired)
1518+{
1519+ sa_check_blocks(start, len, desired);
1520+}
1521+
1522+/* check "allocated" state of given block */
1523+void reiser4_check_block(const reiser4_block_nr * block, int desired)
1524+{
1525+ const reiser4_block_nr one = 1;
1526+
1527+ reiser4_check_blocks(block, &one, desired);
1528+}
1529+
1530+#endif
1531+
1532+/* The block deallocation function may do an actual deallocation through the
1533+ space plugin or store deleted block numbers in the atom's delete_set data
1534+ structure, depending on the BA_DEFER flag. */
1535+
1536+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
1537+ will be deleted from WORKING bitmap. They might be just unmapped from disk, or
1538+ freed but disk space is still grabbed by current thread, or these blocks must
1539+ not be counted in any reiser4 sb block counters, see block_stage_t comment */
1540+
1541+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
1542+ distinguish blocks allocated for unformatted and formatted nodes */
1543+
1544+int
1545+reiser4_dealloc_blocks(const reiser4_block_nr * start,
1546+ const reiser4_block_nr * len,
1547+ block_stage_t target_stage, reiser4_ba_flags_t flags)
1548+{
1549+ txn_atom *atom = NULL;
1550+ int ret;
1551+ reiser4_context *ctx;
1552+ reiser4_super_info_data *sbinfo;
1553+
1554+ ctx = get_current_context();
1555+ sbinfo = get_super_private(ctx->super);
1556+
1557+ if (REISER4_DEBUG) {
1558+ assert("zam-431", *len != 0);
1559+ assert("zam-432", *start != 0);
1560+ assert("zam-558", !reiser4_blocknr_is_fake(start));
1561+
1562+ spin_lock_reiser4_super(sbinfo);
1563+ assert("zam-562", *start < sbinfo->block_count);
1564+ spin_unlock_reiser4_super(sbinfo);
1565+ }
1566+
1567+ if (flags & BA_DEFER) {
1568+ blocknr_set_entry *bsep = NULL;
1569+
1570+ /* store deleted block numbers in a blocknr set
1571+ data structure for further actual deletion */
1572+ do {
1573+ atom = get_current_atom_locked();
1574+ assert("zam-430", atom != NULL);
1575+
1576+ ret =
1577+ blocknr_set_add_extent(atom, &atom->delete_set,
1578+ &bsep, start, len);
1579+
1580+ if (ret == -ENOMEM)
1581+ return ret;
1582+
1583+ /* This loop might spin at most two times */
1584+ } while (ret == -E_REPEAT);
1585+
1586+ assert("zam-477", ret == 0);
1587+ assert("zam-433", atom != NULL);
1588+
1589+ spin_unlock_atom(atom);
1590+
1591+ } else {
1592+ assert("zam-425", get_current_super_private() != NULL);
1593+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1594+ *start, *len);
1595+
1596+ if (flags & BA_PERMANENT) {
1597+ /* These blocks were counted as allocated; we have to revert
1598+ * that if the allocation is discarded. */
1599+ txn_atom *atom = get_current_atom_locked();
1600+ atom->nr_blocks_allocated -= *len;
1601+ spin_unlock_atom(atom);
1602+ }
1603+
1604+ switch (target_stage) {
1605+ case BLOCK_NOT_COUNTED:
1606+ assert("vs-960", flags & BA_FORMATTED);
1607+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1608+ used2free(sbinfo, *len);
1609+ break;
1610+
1611+ case BLOCK_GRABBED:
1612+ used2grabbed(ctx, sbinfo, *len);
1613+ break;
1614+
1615+ case BLOCK_UNALLOCATED:
1616+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1617+ break;
1618+
1619+ case BLOCK_FLUSH_RESERVED:{
1620+ txn_atom *atom;
1621+
1622+ atom = get_current_atom_locked();
1623+ used2flush_reserved(sbinfo, atom, *len,
1624+ flags & BA_FORMATTED);
1625+ spin_unlock_atom(atom);
1626+ break;
1627+ }
1628+ default:
1629+ impossible("zam-532", "wrong block stage");
1630+ }
1631+ }
1632+
1633+ return 0;
1634+}
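To make the two paths concrete, a hedged pair of calls (block numbers are illustrative, and a transaction context is assumed):

  static void dealloc_example(void)
  {
          reiser4_block_nr start = 1000, len = 8;

          /* deferred: queued in atom->delete_set; the working bitmap is
           * only updated after commit, by apply_dset() below */
          reiser4_dealloc_blocks(&start, &len, BLOCK_NOT_COUNTED, BA_DEFER);

          /* immediate: the space allocator frees the blocks now and they
           * are accounted back as "grabbed" by the current thread */
          reiser4_dealloc_blocks(&start, &len, BLOCK_GRABBED, BA_FORMATTED);
  }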
1635+
1636+/* wrappers for block allocator plugin methods */
1637+int reiser4_pre_commit_hook(void)
1638+{
1639+ assert("zam-502", get_current_super_private() != NULL);
1640+ sa_pre_commit_hook();
1641+ return 0;
1642+}
1643+
1644+/* an actor which applies delete set to block allocator data */
1645+static int
1646+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1647+ const reiser4_block_nr * b, void *data UNUSED_ARG)
1648+{
1649+ reiser4_context *ctx;
1650+ reiser4_super_info_data *sbinfo;
1651+
1652+ __u64 len = 1;
1653+
1654+ ctx = get_current_context();
1655+ sbinfo = get_super_private(ctx->super);
1656+
1657+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1658+ assert("zam-552", sbinfo != NULL);
1659+
1660+ if (b != NULL)
1661+ len = *b;
1662+
1663+ if (REISER4_DEBUG) {
1664+ spin_lock_reiser4_super(sbinfo);
1665+
1666+ assert("zam-554", *a < reiser4_block_count(ctx->super));
1667+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1668+
1669+ spin_unlock_reiser4_super(sbinfo);
1670+ }
1671+
1672+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1673+ /* adjust sb block counters */
1674+ used2free(sbinfo, len);
1675+ return 0;
1676+}
1677+
1678+void reiser4_post_commit_hook(void)
1679+{
1680+ txn_atom *atom;
1681+
1682+ atom = get_current_atom_locked();
1683+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1684+ spin_unlock_atom(atom);
1685+
1686+ /* do the block deallocation which was deferred
1687+ until commit is done */
1688+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1689+
1690+ assert("zam-504", get_current_super_private() != NULL);
1691+ sa_post_commit_hook();
1692+}
1693+
1694+void reiser4_post_write_back_hook(void)
1695+{
1696+ assert("zam-504", get_current_super_private() != NULL);
1697+
1698+ sa_post_commit_hook();
1699+}
1700+
1701+/*
1702+ Local variables:
1703+ c-indentation-style: "K&R"
1704+ mode-name: "LC"
1705+ c-basic-offset: 8
1706+ tab-width: 8
1707+ fill-column: 120
1708+ scroll-step: 1
1709+ End:
1710+*/
1711diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.h linux-2.6.20/fs/reiser4/block_alloc.h
1712--- linux-2.6.20.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1713+++ linux-2.6.20/fs/reiser4/block_alloc.h 2007-05-06 14:50:43.682970725 +0400
1714@@ -0,0 +1,175 @@
1715+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1716+
1717+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1718+#define __FS_REISER4_BLOCK_ALLOC_H__
1719+
1720+#include "dformat.h"
1721+#include "forward.h"
1722+
1723+#include <linux/types.h> /* for __u?? */
1724+#include <linux/fs.h>
1725+
1726+/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1727+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1728+/* Mask which isolates the type of object this fake block number was assigned to */
1729+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1730+
1731+/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1732+ against these two values to tell whether the object is an unallocated node
1733+ or a bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
1734+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1735+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
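+
+/* A minimal illustrative sketch (the helper name is hypothetical, not part
+   of this patch) of how the masks above classify a block number: keep only
+   the two status bits and compare them against the known values. */
+static inline int blocknr_is_unallocated_example(reiser4_block_nr blk)
+{
+	return (blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+		REISER4_UNALLOCATED_STATUS_VALUE;
+}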
1736+
1737+/* specification how block allocation was counted in sb block counters */
1738+typedef enum {
1739+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1740+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1741+ of this block */
1742+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1743+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1744+ ( unallocated formatted or unformatted
1745+ node) */
1746+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1747+ number assigned */
1748+} block_stage_t;
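+
+/* For example, when blocks are deallocated, block_alloc.c moves them from
+   BLOCK_ALLOCATED back to the target stage with used2free(), used2grabbed(),
+   used2fake_allocated() or used2flush_reserved(); see
+   reiser4_dealloc_blocks() there. */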
1749+
1750+/* a hint for block allocator */
1751+struct reiser4_blocknr_hint {
1752+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1753+ is to prevent jnode_flush() calls from interleaving allocations on the same
1754+ bitmap, once a hint is established. */
1755+
1756+ /* search start hint */
1757+ reiser4_block_nr blk;
1758+	/* if not zero, it is the size of the region we search for free blocks in */
1759+ reiser4_block_nr max_dist;
1760+	/* level for allocation; it may be useful to have branch-level and
1761+	   higher write-optimized. */
1762+ tree_level level;
1763+	/* the block allocator assumes that blocks which will be mapped to
1764+	   disk are in the specified block_stage */
1765+ block_stage_t block_stage;
1766+	/* If set, allocate blocks in the backward direction, from the end
1767+	 * of the disk toward its beginning. */
1768+ unsigned int backward:1;
1769+
1770+};
1771+
1772+/* These flags control block allocation/deallocation behavior */
1773+enum reiser4_ba_flags {
1774+	/* do allocations from the reserved (5%) area */
1775+ BA_RESERVED = (1 << 0),
1776+
1777+	/* block allocator may force a commit, trying to recover free space */
1778+ BA_CAN_COMMIT = (1 << 1),
1779+
1780+	/* the operation will be applied to a formatted block */
1781+ BA_FORMATTED = (1 << 2),
1782+
1783+ /* defer actual block freeing until transaction commit */
1784+ BA_DEFER = (1 << 3),
1785+
1786+	/* allocate blocks for permanent fs objects (formatted or unformatted),
1787+	   not wandered or log blocks */
1788+ BA_PERMANENT = (1 << 4),
1789+
1790+	/* grab space even if grabbing was disabled */
1791+ BA_FORCE = (1 << 5),
1792+
1793+ /* use default start value for free blocks search. */
1794+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1795+};
1796+
1797+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1798+
1799+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1800+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1801+extern void update_blocknr_hint_default(const struct super_block *,
1802+ const reiser4_block_nr *);
1803+extern void get_blocknr_hint_default(reiser4_block_nr *);
1804+
1805+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1806+
1807+int assign_fake_blocknr_formatted(reiser4_block_nr *);
1808+reiser4_block_nr fake_blocknr_unformatted(int);
1809+
1810+/* free -> grabbed -> fake_allocated -> used */
1811+
1812+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1813+void all_grabbed2free(void);
1814+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1815+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1816+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1817+void grabbed2flush_reserved(__u64 count);
1818+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1819+ reiser4_block_nr * start,
1820+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
1821+int reiser4_dealloc_blocks(const reiser4_block_nr *,
1822+ const reiser4_block_nr *,
1823+ block_stage_t, reiser4_ba_flags_t flags);
1824+
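+/* A minimal usage sketch of the stage sequence named above; illustrative
+   only, the wrapper name is hypothetical and error handling is trimmed: */
+static inline int reiser4_alloc_one_block_sketch(reiser4_block_nr *blk)
+{
+	reiser4_blocknr_hint hint;
+	reiser4_block_nr len = 1;
+	int ret;
+
+	/* free -> grabbed: reserve an estimate of the space needed */
+	ret = reiser4_grab_space(1, BA_CAN_COMMIT);
+	if (ret != 0)
+		return ret;
+	reiser4_blocknr_hint_init(&hint);
+	/* tell the allocator the block being mapped was counted as grabbed */
+	hint.block_stage = BLOCK_GRABBED;
+	/* grabbed -> used: assign a real disk address (flags illustrative) */
+	ret = reiser4_alloc_blocks(&hint, blk, &len, BA_PERMANENT);
+	reiser4_blocknr_hint_done(&hint);
+	return ret;
+}
+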
1825+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1826+ reiser4_block_nr * start,
1827+ reiser4_ba_flags_t flags)
1828+{
1829+ reiser4_block_nr one = 1;
1830+ return reiser4_alloc_blocks(hint, start, &one, flags);
1831+}
1832+
1833+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1834+ block_stage_t stage,
1835+ reiser4_ba_flags_t flags)
1836+{
1837+ const reiser4_block_nr one = 1;
1838+ return reiser4_dealloc_blocks(block, &one, stage, flags);
1839+}
1840+
1841+#define reiser4_grab_space_force(count, flags) \
1842+ reiser4_grab_space(count, flags | BA_FORCE)
1843+
1844+extern void grabbed2free_mark(__u64 mark);
1845+extern int reiser4_grab_reserved(struct super_block *,
1846+ __u64, reiser4_ba_flags_t);
1847+extern void reiser4_release_reserved(struct super_block *super);
1848+
1849+/* grabbed -> fake_allocated */
1850+
1851+/* fake_allocated -> used */
1852+
1853+/* used -> fake_allocated -> grabbed -> free */
1854+
1855+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1856+
1857+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1858+
1859+extern void grabbed2cluster_reserved(int count);
1860+extern void cluster_reserved2grabbed(int count);
1861+extern void cluster_reserved2free(int count);
1862+
1863+extern int reiser4_check_block_counters(const struct super_block *);
1864+
1865+#if REISER4_DEBUG
1866+
1867+extern void reiser4_check_block(const reiser4_block_nr *, int);
1868+
1869+#else
1870+
1871+# define reiser4_check_block(beg, val) noop
1872+
1873+#endif
1874+
1875+extern int reiser4_pre_commit_hook(void);
1876+extern void reiser4_post_commit_hook(void);
1877+extern void reiser4_post_write_back_hook(void);
1878+
1879+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1880+
1881+/* Make Linus happy.
1882+ Local variables:
1883+ c-indentation-style: "K&R"
1884+ mode-name: "LC"
1885+ c-basic-offset: 8
1886+ tab-width: 8
1887+ fill-column: 120
1888+ End:
1889+*/
1890diff -urN linux-2.6.20.orig/fs/reiser4/blocknrset.c linux-2.6.20/fs/reiser4/blocknrset.c
1891--- linux-2.6.20.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1892+++ linux-2.6.20/fs/reiser4/blocknrset.c 2007-05-06 14:50:43.686971975 +0400
1893@@ -0,0 +1,368 @@
1894+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1895+
1896+/* This file contains code for various block number sets used by the atom to
1897+ track the deleted set and wandered block mappings. */
1898+
1899+#include "debug.h"
1900+#include "dformat.h"
1901+#include "txnmgr.h"
1902+#include "context.h"
1903+
1904+#include <linux/slab.h>
1905+
1906+/* The data structure for storing unordered block number sets is a list of
1907+ elements, each of which contains an array of block numbers and/or an array
1908+ of block number pairs. Each element, called a blocknr_set_entry, stores
1909+ single block numbers from the beginning and pairs (extents) from the end
1910+ of its entries[] array. The ->nr_singles and ->nr_pairs fields count the
1911+ numbers of single blocks and pairs.
1912+
1913+ +----------------- blocknr_set_entry->entries ------------------+
1914+ |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1915+ +----------------------------------------------------------------+
1916+
1917+ When the current blocknr_set_entry is full, allocate a new one. */
1918+
1919+/* Usage examples: blocknr sets are used in reiser4 for storing an atom's
1920+ * delete set (single blocks and block extents); there a blocknr pair
1921+ * represents an extent. An atom's wandered map is also stored as a blocknr
1922+ * set; its pairs represent a (real block) -> (wandered block) mapping. */
1923+
1924+/* Protection: blocknr sets belong to reiser4 atom, and
1925+ * their modifications are performed with the atom lock held */
1926+
1927+typedef struct blocknr_pair blocknr_pair;
1928+
1929+/* The total size of a blocknr_set_entry. */
1930+#define BLOCKNR_SET_ENTRY_SIZE 128
1931+
1932+/* The number of blocks that can fit in the blocknr data area. */
1933+#define BLOCKNR_SET_ENTRIES_NUMBER \
1934+ ((BLOCKNR_SET_ENTRY_SIZE - \
1935+ 2 * sizeof (unsigned) - \
1936+ sizeof(struct list_head)) / \
1937+ sizeof(reiser4_block_nr))
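+/* With the 128-byte entry size above, a 4-byte unsigned and a 16-byte
+   struct list_head (assuming a typical 64-bit build), this works out to
+   (128 - 2*4 - 16) / 8 = 13 block number slots per entry. */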
1938+
1939+/* An entry of the blocknr_set */
1940+struct blocknr_set_entry {
1941+ unsigned nr_singles;
1942+ unsigned nr_pairs;
1943+ struct list_head link;
1944+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1945+};
1946+
1947+/* A pair of blocks as recorded in the blocknr_set_entry data. */
1948+struct blocknr_pair {
1949+ reiser4_block_nr a;
1950+ reiser4_block_nr b;
1951+};
1952+
1953+/* Return the number of blocknr slots available in a blocknr_set_entry. */
1954+/* Audited by: green(2002.06.11) */
1955+static unsigned bse_avail(blocknr_set_entry * bse)
1956+{
1957+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1958+
1959+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1960+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1961+
1962+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
1963+}
1964+
1965+/* Initialize a blocknr_set_entry. */
1966+static void bse_init(blocknr_set_entry *bse)
1967+{
1968+ bse->nr_singles = 0;
1969+ bse->nr_pairs = 0;
1970+ INIT_LIST_HEAD(&bse->link);
1971+}
1972+
1973+/* Allocate and initialize a blocknr_set_entry. */
1974+/* Audited by: green(2002.06.11) */
1975+static blocknr_set_entry *bse_alloc(void)
1976+{
1977+ blocknr_set_entry *e;
1978+
1979+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1980+ reiser4_ctx_gfp_mask_get())) == NULL)
1981+ return NULL;
1982+
1983+ bse_init(e);
1984+
1985+ return e;
1986+}
1987+
1988+/* Free a blocknr_set_entry. */
1989+/* Audited by: green(2002.06.11) */
1990+static void bse_free(blocknr_set_entry * bse)
1991+{
1992+ kfree(bse);
1993+}
1994+
1995+/* Add a block number to a blocknr_set_entry */
1996+/* Audited by: green(2002.06.11) */
1997+static void
1998+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
1999+{
2000+ assert("jmacd-5099", bse_avail(bse) >= 1);
2001+
2002+ bse->entries[bse->nr_singles++] = *block;
2003+}
2004+
2005+/* Get a pair of block numbers */
2006+/* Audited by: green(2002.06.11) */
2007+static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2008+{
2009+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2010+
2011+ return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2012+ 2 * (pno + 1));
2013+}
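+
+/* For example, with 13 slots per entry (see above), pair 0 occupies
+   entries[11] and entries[12], pair 1 occupies entries[9] and entries[10],
+   and so on: pairs grow from the end of the array toward the singles. */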
2014+
2015+/* Add a pair of block numbers to a blocknr_set_entry */
2016+/* Audited by: green(2002.06.11) */
2017+static void
2018+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2019+ const reiser4_block_nr * b)
2020+{
2021+ blocknr_pair *pair;
2022+
2023+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2024+
2025+ pair = bse_get_pair(bse, bse->nr_pairs++);
2026+
2027+ pair->a = *a;
2028+ pair->b = *b;
2029+}
2030+
2031+/* Add either a block or pair of blocks to the block number set. The first
2032+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2033+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2034+ the call is made with the atom lock held. There may not be enough space in
2035+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2036+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2037+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2038+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2039+ returned with the atom unlocked for the operation to be tried again. If
2040+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2041+ used during the call, it will be freed automatically. */
2042+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2043+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2044+ const reiser4_block_nr *b)
2045+{
2046+ blocknr_set_entry *bse;
2047+ unsigned entries_needed;
2048+
2049+ assert("jmacd-5101", a != NULL);
2050+
2051+ entries_needed = (b == NULL) ? 1 : 2;
2052+ if (list_empty(bset) ||
2053+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2054+ /* See if a bse was previously allocated. */
2055+ if (*new_bsep == NULL) {
2056+ spin_unlock_atom(atom);
2057+ *new_bsep = bse_alloc();
2058+ return (*new_bsep != NULL) ? -E_REPEAT :
2059+ RETERR(-ENOMEM);
2060+ }
2061+
2062+ /* Put it on the head of the list. */
2063+ list_add(&((*new_bsep)->link), bset);
2064+
2065+ *new_bsep = NULL;
2066+ }
2067+
2068+ /* Add the single or pair. */
2069+ bse = list_entry(bset->next, blocknr_set_entry, link);
2070+ if (b == NULL) {
2071+ bse_put_single(bse, a);
2072+ } else {
2073+ bse_put_pair(bse, a, b);
2074+ }
2075+
2076+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2077+ if (*new_bsep != NULL) {
2078+ bse_free(*new_bsep);
2079+ *new_bsep = NULL;
2080+ }
2081+
2082+ return 0;
2083+}
2084+
2085+/* Add an extent to the block set. If the length is 1, it is treated as a
2086+ single block (e.g., reiser4_set_add_block). */
2087+/* Audited by: green(2002.06.11) */
2088+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2089+ kmalloc might schedule. The only exception is atom spinlock, which is
2090+ properly freed. */
2091+int
2092+blocknr_set_add_extent(txn_atom * atom,
2093+ struct list_head * bset,
2094+ blocknr_set_entry ** new_bsep,
2095+ const reiser4_block_nr * start,
2096+ const reiser4_block_nr * len)
2097+{
2098+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2099+ return blocknr_set_add(atom, bset, new_bsep, start,
2100+ *len == 1 ? NULL : len);
2101+}
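+
+/* A sketch of the typical caller pattern (it mirrors the BA_DEFER path in
+   block_alloc.c; the variable declarations are assumed): retry while adding
+   an entry forces the atom lock to be dropped:
+
+	blocknr_set_entry *new_bsep = NULL;
+	do {
+		atom = get_current_atom_locked();
+		ret = blocknr_set_add_extent(atom, &atom->delete_set,
+					     &new_bsep, &start, &len);
+	} while (ret == -E_REPEAT);
+
+   The loop runs at most twice, since -E_REPEAT is returned only when a new
+   blocknr_set_entry had to be allocated with the atom unlocked. On success
+   the atom spinlock is still held and must be released by the caller. */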
2102+
2103+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2104+ * by an assertion that both arguments are not null.*/
2105+/* Audited by: green(2002.06.11) */
2106+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2107+ kmalloc might schedule. The only exception is atom spinlock, which is
2108+ properly freed. */
2109+int
2110+blocknr_set_add_pair(txn_atom * atom,
2111+ struct list_head * bset,
2112+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2113+ const reiser4_block_nr * b)
2114+{
2115+ assert("jmacd-5103", a != NULL && b != NULL);
2116+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2117+}
2118+
2119+/* Initialize a blocknr_set. */
2120+void blocknr_set_init(struct list_head *bset)
2121+{
2122+ INIT_LIST_HEAD(bset);
2123+}
2124+
2125+/* Release the entries of a blocknr_set. */
2126+void blocknr_set_destroy(struct list_head *bset)
2127+{
2128+ blocknr_set_entry *bse;
2129+
2130+ while (!list_empty(bset)) {
2131+ bse = list_entry(bset->next, blocknr_set_entry, link);
2132+ list_del_init(&bse->link);
2133+ bse_free(bse);
2134+ }
2135+}
2136+
2137+/* Merge blocknr_set entries out of @from into @into. */
2138+/* Audited by: green(2002.06.11) */
2139+/* Auditor comments: This merge does not know if merged sets contain
2140+ block pairs (as for wandered sets) or extents, so it cannot really merge
2141+ overlapping ranges if there are any. So I believe it may lead to
2142+ some blocks appearing several times in one blocknr_set. To help
2143+ debug such problems it might help to check for duplicate entries on
2144+ actual processing of this set. Testing this kind of thing right here is
2145+ also complicated by the fact that these sets are not sorted, and going
2146+ through the whole set on each element addition would be a CPU-heavy task */
2147+void blocknr_set_merge(struct list_head * from, struct list_head * into)
2148+{
2149+ blocknr_set_entry *bse_into = NULL;
2150+
2151+ /* If @from is empty, no work to perform. */
2152+ if (list_empty(from))
2153+ return;
2154+ /* If @into is not empty, try merging partial-entries. */
2155+ if (!list_empty(into)) {
2156+
2157+		/* Neither set is empty; pop the front two members and try to combine them. */
2158+ blocknr_set_entry *bse_from;
2159+ unsigned into_avail;
2160+
2161+ bse_into = list_entry(into->next, blocknr_set_entry, link);
2162+ list_del_init(&bse_into->link);
2163+ bse_from = list_entry(from->next, blocknr_set_entry, link);
2164+ list_del_init(&bse_from->link);
2165+
2166+ /* Combine singles. */
2167+ for (into_avail = bse_avail(bse_into);
2168+ into_avail != 0 && bse_from->nr_singles != 0;
2169+ into_avail -= 1) {
2170+ bse_put_single(bse_into,
2171+ &bse_from->entries[--bse_from->
2172+ nr_singles]);
2173+ }
2174+
2175+ /* Combine pairs. */
2176+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2177+ into_avail -= 2) {
2178+ blocknr_pair *pair =
2179+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2180+ bse_put_pair(bse_into, &pair->a, &pair->b);
2181+ }
2182+
2183+ /* If bse_from is empty, delete it now. */
2184+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2185+ bse_free(bse_from);
2186+ } else {
2187+			/* Otherwise, bse_into is full or nearly full (e.g.,
2188+			   it could have one slot available and bse_from has one
2189+ pair left). Push it back onto the list. bse_from
2190+ becomes bse_into, which will be the new partial. */
2191+ list_add(&bse_into->link, into);
2192+ bse_into = bse_from;
2193+ }
2194+ }
2195+
2196+ /* Splice lists together. */
2197+ list_splice_init(from, into->prev);
2198+
2199+ /* Add the partial entry back to the head of the list. */
2200+ if (bse_into != NULL)
2201+ list_add(&bse_into->link, into);
2202+}
2203+
2204+/* Iterate over all blocknr set elements. */
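+/* Note: with @delete set, entries are freed as they are processed; this is
+   how reiser4_post_commit_hook() in block_alloc.c applies and then discards
+   the atom's delete set. */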
2205+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2206+ blocknr_set_actor_f actor, void *data, int delete)
2207+{
2208+
2209+ blocknr_set_entry *entry;
2210+
2211+ assert("zam-429", atom != NULL);
2212+ assert("zam-430", atom_is_protected(atom));
2213+ assert("zam-431", bset != 0);
2214+ assert("zam-432", actor != NULL);
2215+
2216+ entry = list_entry(bset->next, blocknr_set_entry, link);
2217+ while (bset != &entry->link) {
2218+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2219+ unsigned int i;
2220+ int ret;
2221+
2222+ for (i = 0; i < entry->nr_singles; i++) {
2223+ ret = actor(atom, &entry->entries[i], NULL, data);
2224+
2225+			/* We can't break out of the loop if the delete flag is set. */
2226+ if (ret != 0 && !delete)
2227+ return ret;
2228+ }
2229+
2230+ for (i = 0; i < entry->nr_pairs; i++) {
2231+ struct blocknr_pair *ab;
2232+
2233+ ab = bse_get_pair(entry, i);
2234+
2235+ ret = actor(atom, &ab->a, &ab->b, data);
2236+
2237+ if (ret != 0 && !delete)
2238+ return ret;
2239+ }
2240+
2241+ if (delete) {
2242+ list_del(&entry->link);
2243+ bse_free(entry);
2244+ }
2245+
2246+ entry = tmp;
2247+ }
2248+
2249+ return 0;
2250+}
2251+
2252+/*
2253+ * Local variables:
2254+ * c-indentation-style: "K&R"
2255+ * mode-name: "LC"
2256+ * c-basic-offset: 8
2257+ * tab-width: 8
2258+ * fill-column: 79
2259+ * scroll-step: 1
2260+ * End:
2261+ */
2262diff -urN linux-2.6.20.orig/fs/reiser4/carry.c linux-2.6.20/fs/reiser4/carry.c
2263--- linux-2.6.20.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2264+++ linux-2.6.20/fs/reiser4/carry.c 2007-05-06 14:50:43.686971975 +0400
2265@@ -0,0 +1,1391 @@
2266+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2267+/* Functions to "carry" tree modification(s) upward. */
2268+/* Tree is modified one level at a time. As we modify a level we accumulate a
2269+ set of changes that need to be propagated to the next level. We manage
2270+ node locking such that any searches that collide with carrying are
2271+ restarted, from the root if necessary.
2272+
2273+ Insertion of a new item may result in items being moved among nodes and
2274+ this requires the delimiting key to be updated at the least common parent
2275+ of the nodes modified to preserve search tree invariants. Also, insertion
2276+ may require allocation of a new node. A pointer to the new node has to be
2277+ inserted into some node on the parent level, etc.
2278+
2279+ Tree carrying is meant to be analogous to arithmetic carrying.
2280+
2281+ A carry operation is always associated with some node (&carry_node).
2282+
2283+ Carry process starts with some initial set of operations to be performed
2284+ and an initial set of already locked nodes. Operations are performed one
2285+ by one. Performing each single operation has the following possible effects:
2286+
2287+ - content of carry node associated with operation is modified
2288+ - new carry nodes are locked and involved in the carry process on this level
2289+ - new carry operations are posted to the next level
2290+
2291+ After all carry operations on this level are done, the process is repeated
2292+ for the accumulated sequence of carry operations for the next level. This
2293+ starts by trying to lock (in left to right order) all carry nodes
2294+ associated with carry operations on the parent level. After this, we decide
2295+ whether more nodes are required on the left of the already locked set. If so,
2296+ all locks taken on the parent level are released, new carry nodes are
2297+ added, and the locking process repeats.
2298+
2299+ It may happen that the balancing process fails owing to an unrecoverable
2300+ error on some of the upper levels of a tree (possible causes are an io error,
2301+ failure to allocate a new node, etc.). In this case we should unmount the filesystem,
2302+ rebooting if it is the root, and possibly advise the use of fsck.
2303+
2304+ USAGE:
2305+
2306+ int some_tree_operation( znode *node, ... )
2307+ {
2308+ // Allocate on a stack pool of carry objects: operations and nodes.
2309+ // Most carry processes will only take objects from here, without
2310+ // dynamic allocation.
2311+
2312+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2313+
2314+ carry_pool pool;
2315+ carry_level lowest_level;
2316+ carry_op *op;
2317+
2318+ init_carry_pool( &pool );
2319+ init_carry_level( &lowest_level, &pool );
2320+
2321+ // operation may be one of:
2322+ // COP_INSERT --- insert new item into node
2323+ // COP_CUT --- remove part of or whole node
2324+ // COP_PASTE --- increase size of item
2325+ // COP_DELETE --- delete pointer from parent node
2326+ // COP_UPDATE --- update delimiting key in least
2327+ // common ancestor of two
2328+
2329+ op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2330+ if( IS_ERR( op ) || ( op == NULL ) ) {
2331+ handle error
2332+ } else {
2333+ // fill in remaining fields in @op, according to carry.h:carry_op
2334+ result = carry( &lowest_level, NULL );
2335+ }
2336+ done_carry_pool( &pool );
2337+ }
2338+
2339+ When you are implementing node plugin method that participates in carry
2340+ (shifting, insertion, deletion, etc.), do the following:
2341+
2342+ int foo_node_method( znode *node, ..., carry_level *todo )
2343+ {
2344+ carry_op *op;
2345+
2346+ ....
2347+
2348+	// note that the last argument to node_post_carry() is non-zero
2349+	// here, because @op is to be applied to the parent of @node, rather
2350+	// than to @node itself as in the previous case.
2351+
2352+ op = node_post_carry( todo, operation, node, 1 );
2353+ // fill in remaining fields in @op, according to carry.h:carry_op
2354+
2355+ ....
2356+
2357+ }
2358+
2359+ BATCHING:
2360+
2361+ One of the main advantages of level-by-level balancing implemented here is
2362+ the ability to batch updates on a parent level and to perform them more
2363+ efficiently as a result.
2364+
2365+ Description To Be Done (TBD).
2366+
2367+ DIFFICULTIES AND SUBTLE POINTS:
2368+
2369+ 1. complex plumbing is required, because:
2370+
2371+ a. effective allocation through pools is needed
2372+
2373+ b. target of operation is not exactly known when operation is
2374+ posted. This is worked around through bitfields in &carry_node and
2375+ logic in lock_carry_node()
2376+
2377+ c. of interaction with locking code: node should be added into sibling
2378+ list when pointer to it is inserted into its parent, which is some time
2379+ after node was created. Between these moments, node is somewhat in
2380+ suspended state and is only registered in the carry lists
2381+
2382+ 2. whole balancing logic is implemented here, in particular, insertion
2383+ logic is coded in make_space().
2384+
2385+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2386+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2387+ (insert_paste()) have to be handled.
2388+
2389+ 4. there is non-trivial interdependency between allocation of new nodes
2390+ and almost everything else. This is mainly due to the (1.c) above. I shall
2391+ write about this later.
2392+
2393+*/
2394+
2395+#include "forward.h"
2396+#include "debug.h"
2397+#include "key.h"
2398+#include "coord.h"
2399+#include "plugin/item/item.h"
2400+#include "plugin/item/extent.h"
2401+#include "plugin/node/node.h"
2402+#include "jnode.h"
2403+#include "znode.h"
2404+#include "tree_mod.h"
2405+#include "tree_walk.h"
2406+#include "block_alloc.h"
2407+#include "pool.h"
2408+#include "tree.h"
2409+#include "carry.h"
2410+#include "carry_ops.h"
2411+#include "super.h"
2412+#include "reiser4.h"
2413+
2414+#include <linux/types.h>
2415+
2416+/* level locking/unlocking */
2417+static int lock_carry_level(carry_level * level);
2418+static void unlock_carry_level(carry_level * level, int failure);
2419+static void done_carry_level(carry_level * level);
2420+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2421+
2422+int lock_carry_node(carry_level * level, carry_node * node);
2423+int lock_carry_node_tail(carry_node * node);
2424+
2425+/* carry processing proper */
2426+static int carry_on_level(carry_level * doing, carry_level * todo);
2427+
2428+static carry_op *add_op(carry_level * level, pool_ordering order,
2429+ carry_op * reference);
2430+
2431+/* handlers for carry operations. */
2432+
2433+static void fatal_carry_error(carry_level * doing, int ecode);
2434+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2435+
2436+static void print_level(const char *prefix, carry_level * level);
2437+
2438+#if REISER4_DEBUG
2439+typedef enum {
2440+ CARRY_TODO,
2441+ CARRY_DOING
2442+} carry_queue_state;
2443+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2444+#endif
2445+
2446+/* main entry point for tree balancing.
2447+
2448+ Tree carry performs operations from @doing and while doing so accumulates
2449+ information about operations to be performed on the next level ("carried"
2450+ to the parent level). Carried operations are performed, causing possibly
2451+ more operations to be carried upward etc. carry() takes care of
2452+ locking and pinning znodes while operating on them.
2453+
2454+ For usage, see comment at the top of fs/reiser4/carry.c
2455+
2456+*/
2457+int reiser4_carry(carry_level * doing /* set of carry operations to be
2458+ * performed */ ,
2459+ carry_level * done /* set of nodes, already performed
2460+ * at the previous level.
2461+ * NULL in most cases */)
2462+{
2463+ int result = 0;
2464+ /* queue of new requests */
2465+ carry_level *todo;
2466+ ON_DEBUG(STORE_COUNTERS);
2467+
2468+ assert("nikita-888", doing != NULL);
2469+ BUG_ON(done != NULL);
2470+
2471+ todo = doing + 1;
2472+ init_carry_level(todo, doing->pool);
2473+
2474+	/* queue of requests performed on the previous level */
2475+ done = todo + 1;
2476+ init_carry_level(done, doing->pool);
2477+
2478+ /* iterate until there is nothing more to do */
2479+ while (result == 0 && doing->ops_num > 0) {
2480+ carry_level *tmp;
2481+
2482+ /* at this point @done is locked. */
2483+ /* repeat lock/do/unlock while
2484+
2485+ (1) lock_carry_level() fails due to deadlock avoidance, or
2486+
2487+ (2) carry_on_level() decides that more nodes have to
2488+ be involved.
2489+
2490+ (3) some unexpected error occurred while balancing on the
2491+ upper levels. In this case all changes are rolled back.
2492+
2493+ */
2494+ while (1) {
2495+ result = lock_carry_level(doing);
2496+ if (result == 0) {
2497+ /* perform operations from @doing and
2498+ accumulate new requests in @todo */
2499+ result = carry_on_level(doing, todo);
2500+ if (result == 0)
2501+ break;
2502+ else if (result != -E_REPEAT ||
2503+ !doing->restartable) {
2504+ warning("nikita-1043",
2505+ "Fatal error during carry: %i",
2506+ result);
2507+ print_level("done", done);
2508+ print_level("doing", doing);
2509+ print_level("todo", todo);
2510+				/* do some rough stuff like aborting
2511+					all pending transcrashes and thus
2512+					pushing the tree back to a consistent
2513+					state. Alternatively, just panic.
2514+ */
2515+ fatal_carry_error(doing, result);
2516+ return result;
2517+ }
2518+ } else if (result != -E_REPEAT) {
2519+ fatal_carry_error(doing, result);
2520+ return result;
2521+ }
2522+ unlock_carry_level(doing, 1);
2523+ }
2524+ /* at this point @done can be safely unlocked */
2525+ done_carry_level(done);
2526+
2527+ /* cyclically shift queues */
2528+ tmp = done;
2529+ done = doing;
2530+ doing = todo;
2531+ todo = tmp;
2532+ init_carry_level(todo, doing->pool);
2533+
2534+ /* give other threads chance to run */
2535+ reiser4_preempt_point();
2536+ }
2537+ done_carry_level(done);
2538+
2539+	/* all counters but x_refs should remain the same; x_refs can change
2540+	   owing to the transaction manager */
2541+ ON_DEBUG(CHECK_COUNTERS);
2542+ return result;
2543+}
2544+
2545+/* perform carry operations on given level.
2546+
2547+ Optimizations proposed by pooh:
2548+
2549+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2550+ required;
2551+
2552+ (2) unlock node if there are no more operations to be performed upon it and
2553+   node didn't add any operation to @todo. This can be implemented by
2554+   attaching to each node two counters: a counter of operations working on
2555+   this node and a counter of operations carried upward from this node.
2556+
2557+*/
2558+static int carry_on_level(carry_level * doing /* queue of carry operations to
2559+ * do on this level */ ,
2560+ carry_level * todo /* queue where new carry
2561+ * operations to be performed on
2562+						 * the parent level are
2563+ * accumulated during @doing
2564+ * processing. */ )
2565+{
2566+ int result;
2567+ int (*f) (carry_op *, carry_level *, carry_level *);
2568+ carry_op *op;
2569+ carry_op *tmp_op;
2570+
2571+ assert("nikita-1034", doing != NULL);
2572+ assert("nikita-1035", todo != NULL);
2573+
2574+ /* @doing->nodes are locked. */
2575+
2576+ /* This function can be split into two phases: analysis and modification.
2577+
2578+ Analysis calculates precisely what items should be moved between
2579+ nodes. This information is gathered in some structures attached to
2580+ each carry_node in a @doing queue. Analysis also determines whether
2581+ new nodes are to be allocated etc.
2582+
2583+ After analysis is completed, actual modification is performed. Here
2584+ we can take advantage of "batch modification": if there are several
2585+ operations acting on the same node, modifications can be performed
2586+ more efficiently when batched together.
2587+
2588+ Above is an optimization left for the future.
2589+ */
2590+ /* Important, but delayed optimization: it's possible to batch
2591+ operations together and perform them more efficiently as a
2592+ result. For example, deletion of several neighboring items from a
2593+ node can be converted to a single ->cut() operation.
2594+
2595+ Before processing queue, it should be scanned and "mergeable"
2596+ operations merged.
2597+ */
2598+ result = 0;
2599+ for_all_ops(doing, op, tmp_op) {
2600+ carry_opcode opcode;
2601+
2602+ assert("nikita-1041", op != NULL);
2603+ opcode = op->op;
2604+ assert("nikita-1042", op->op < COP_LAST_OP);
2605+ f = op_dispatch_table[op->op].handler;
2606+ result = f(op, doing, todo);
2607+ /* locking can fail with -E_REPEAT. Any different error is fatal
2608+ and will be handled by fatal_carry_error() sledgehammer.
2609+ */
2610+ if (result != 0)
2611+ break;
2612+ }
2613+ if (result == 0) {
2614+ carry_plugin_info info;
2615+ carry_node *scan;
2616+ carry_node *tmp_scan;
2617+
2618+ info.doing = doing;
2619+ info.todo = todo;
2620+
2621+ assert("nikita-3002",
2622+ carry_level_invariant(doing, CARRY_DOING));
2623+ for_all_nodes(doing, scan, tmp_scan) {
2624+ znode *node;
2625+
2626+ node = reiser4_carry_real(scan);
2627+ assert("nikita-2547", node != NULL);
2628+ if (node_is_empty(node)) {
2629+ result =
2630+ node_plugin_by_node(node)->
2631+ prepare_removal(node, &info);
2632+ if (result != 0)
2633+ break;
2634+ }
2635+ }
2636+ }
2637+ return result;
2638+}
2639+
2640+/* post carry operation
2641+
2642+ This is main function used by external carry clients: node layout plugins
2643+ and tree operations to create new carry operation to be performed on some
2644+ level.
2645+
2646+ The new operation will be included in the @level queue. To actually perform
2647+ it, call carry( level, ... ). @node must already be write-locked by the
2648+ caller; carry manages all of its own locks itself, don't worry about this.
2649+
2650+ This function adds the operation and node at the end of the queue. It is up
2651+ to the caller to guarantee proper ordering of the node queue.
2652+
2653+*/
2654+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2655+ * is to be posted at */ ,
2656+ carry_opcode op /* opcode of operation */ ,
2657+ znode * node /* node on which this operation
2658+ * will operate */ ,
2659+ int apply_to_parent_p /* whether operation will
2660+ * operate directly on @node
2661+						     * or on its parent. */)
2662+{
2663+ carry_op *result;
2664+ carry_node *child;
2665+
2666+ assert("nikita-1046", level != NULL);
2667+ assert("nikita-1788", znode_is_write_locked(node));
2668+
2669+ result = add_op(level, POOLO_LAST, NULL);
2670+ if (IS_ERR(result))
2671+ return result;
2672+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
2673+ if (IS_ERR(child)) {
2674+ reiser4_pool_free(&level->pool->op_pool, &result->header);
2675+ return (carry_op *) child;
2676+ }
2677+ result->node = child;
2678+ result->op = op;
2679+ child->parent = apply_to_parent_p;
2680+ if (ZF_ISSET(node, JNODE_ORPHAN))
2681+ child->left_before = 1;
2682+ child->node = node;
2683+ return result;
2684+}
2685+
2686+/* initialize carry queue */
2687+void init_carry_level(carry_level * level /* level to initialize */ ,
2688+ carry_pool * pool /* pool @level will allocate objects
2689+ * from */ )
2690+{
2691+ assert("nikita-1045", level != NULL);
2692+ assert("nikita-967", pool != NULL);
2693+
2694+ memset(level, 0, sizeof *level);
2695+ level->pool = pool;
2696+
2697+ INIT_LIST_HEAD(&level->nodes);
2698+ INIT_LIST_HEAD(&level->ops);
2699+}
2700+
2701+/* allocate carry pool and initialize pools within queue */
2702+carry_pool *init_carry_pool(int size)
2703+{
2704+ carry_pool *pool;
2705+
2706+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2707+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2708+ if (pool == NULL)
2709+ return ERR_PTR(RETERR(-ENOMEM));
2710+
2711+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2712+ (char *)pool->op);
2713+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2714+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2715+ return pool;
2716+}
2717+
2718+/* finish with queue pools */
2719+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2720+{
2721+ reiser4_done_pool(&pool->op_pool);
2722+ reiser4_done_pool(&pool->node_pool);
2723+ kfree(pool);
2724+}
2725+
2726+/* add new carry node to the @level.
2727+
2728+   Returns a pointer to the new carry node allocated from the pool. It's up to
2729+   callers to maintain proper order in the @level. The assumption is that if
2730+   carry nodes on one level are already sorted and modifications are performed
2731+   from left to right, carry nodes added on the parent level will be ordered
2732+ automatically. To control ordering use @order and @reference parameters.
2733+
2734+*/
2735+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2736+ * node to */ ,
2737+ pool_ordering order /* where to insert:
2738+ * at the beginning of
2739+ * @level,
2740+ * before @reference,
2741+ * after @reference,
2742+ * at the end of @level
2743+ */ ,
2744+ carry_node * reference/* reference node for
2745+ * insertion */)
2746+{
2747+ ON_DEBUG(carry_node * orig_ref = reference);
2748+
2749+ if (order == POOLO_BEFORE) {
2750+ reference = find_left_carry(reference, level);
2751+ if (reference == NULL)
2752+ reference = list_entry(level->nodes.next, carry_node,
2753+ header.level_linkage);
2754+ else
2755+ reference = list_entry(reference->header.level_linkage.next,
2756+ carry_node, header.level_linkage);
2757+ } else if (order == POOLO_AFTER) {
2758+ reference = find_right_carry(reference, level);
2759+ if (reference == NULL)
2760+ reference = list_entry(level->nodes.prev, carry_node,
2761+ header.level_linkage);
2762+ else
2763+ reference = list_entry(reference->header.level_linkage.prev,
2764+ carry_node, header.level_linkage);
2765+ }
2766+ assert("nikita-2209",
2767+ ergo(orig_ref != NULL,
2768+ reiser4_carry_real(reference) ==
2769+ reiser4_carry_real(orig_ref)));
2770+ return reiser4_add_carry(level, order, reference);
2771+}
2772+
2773+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2774+ * to */ ,
2775+ pool_ordering order /* where to insert: at the
2776+ * beginning of @level, before
2777+ * @reference, after @reference,
2778+ * at the end of @level */ ,
2779+ carry_node * reference /* reference node for
2780+ * insertion */ )
2781+{
2782+ carry_node *result;
2783+
2784+ result =
2785+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2786+ &level->nodes,
2787+ order, &reference->header);
2788+ if (!IS_ERR(result) && (result != NULL))
2789+ ++level->nodes_num;
2790+ return result;
2791+}
2792+
2793+/* add new carry operation to the @level.
2794+
2795+ Returns a pointer to the new carry operation allocated from the pool. It's up to
2796+ callers to maintain proper order in the @level. To control ordering use
2797+ @order and @reference parameters.
2798+
2799+*/
2800+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2801+ pool_ordering order /* where to insert: at the beginning of
2802+ * @level, before @reference, after
2803+ * @reference, at the end of @level */ ,
2804+ carry_op *
2805+ reference /* reference node for insertion */ )
2806+{
2807+ carry_op *result;
2808+
2809+ result =
2810+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2811+ order, &reference->header);
2812+ if (!IS_ERR(result) && (result != NULL))
2813+ ++level->ops_num;
2814+ return result;
2815+}
2816+
2817+/* Return node on the right of which @node was created.
2818+
2819+ Each node is created on the right of some existing node (or it is new root,
2820+ which is special case not handled here).
2821+
2822+ @node is new node created on some level, but not yet inserted into its
2823+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2824+
2825+*/
2826+static carry_node *find_begetting_brother(carry_node * node /* node to start search
2827+ * from */ ,
2828+ carry_level * kin UNUSED_ARG /* level to
2829+ * scan */ )
2830+{
2831+ carry_node *scan;
2832+
2833+ assert("nikita-1614", node != NULL);
2834+ assert("nikita-1615", kin != NULL);
2835+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2836+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2837+ ZF_ISSET(reiser4_carry_real(node),
2838+ JNODE_ORPHAN)));
2839+ for (scan = node;;
2840+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
2841+ header.level_linkage)) {
2842+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2843+ if ((scan->node != node->node) &&
2844+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2845+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2846+ break;
2847+ }
2848+ }
2849+ return scan;
2850+}
2851+
2852+static cmp_t
2853+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2854+{
2855+ assert("nikita-2199", n1 != NULL);
2856+ assert("nikita-2200", n2 != NULL);
2857+
2858+ if (n1 == n2)
2859+ return EQUAL_TO;
2860+ while (1) {
2861+ n1 = carry_node_next(n1);
2862+ if (carry_node_end(level, n1))
2863+ return GREATER_THAN;
2864+ if (n1 == n2)
2865+ return LESS_THAN;
2866+ }
2867+ impossible("nikita-2201", "End of level reached");
2868+}
2869+
2870+carry_node *find_carry_node(carry_level * level, const znode * node)
2871+{
2872+ carry_node *scan;
2873+ carry_node *tmp_scan;
2874+
2875+ assert("nikita-2202", level != NULL);
2876+ assert("nikita-2203", node != NULL);
2877+
2878+ for_all_nodes(level, scan, tmp_scan) {
2879+ if (reiser4_carry_real(scan) == node)
2880+ return scan;
2881+ }
2882+ return NULL;
2883+}
2884+
2885+znode *reiser4_carry_real(const carry_node * node)
2886+{
2887+ assert("nikita-3061", node != NULL);
2888+
2889+ return node->lock_handle.node;
2890+}
2891+
2892+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2893+ const znode * node)
2894+{
2895+ carry_node *base;
2896+ carry_node *scan;
2897+ carry_node *tmp_scan;
2898+ carry_node *proj;
2899+
2900+ base = find_carry_node(doing, node);
2901+ assert("nikita-2204", base != NULL);
2902+
2903+ for_all_nodes(todo, scan, tmp_scan) {
2904+ proj = find_carry_node(doing, scan->node);
2905+ assert("nikita-2205", proj != NULL);
2906+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2907+ break;
2908+ }
2909+ return scan;
2910+}
2911+
2912+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2913+ znode * node)
2914+{
2915+ carry_node *reference;
2916+
2917+ assert("nikita-2994", doing != NULL);
2918+ assert("nikita-2995", todo != NULL);
2919+ assert("nikita-2996", node != NULL);
2920+
2921+ reference = insert_carry_node(doing, todo, node);
2922+ assert("nikita-2997", reference != NULL);
2923+
2924+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2925+}
2926+
2927+/* like reiser4_post_carry(), but designed to be called from node plugin
2928+   methods. This function is different from reiser4_post_carry() in that it
2929+   finds the proper place to insert the node in the queue. */
2930+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2931+ * passed down to node
2932+ * plugin */ ,
2933+ carry_opcode op /* opcode of operation */ ,
2934+ znode * node /* node on which this
2935+ * operation will operate */ ,
2936+ int apply_to_parent_p /* whether operation will
2937+ * operate directly on @node
2938+						 * or on its parent. */ )
2939+{
2940+ carry_op *result;
2941+ carry_node *child;
2942+
2943+ assert("nikita-2207", info != NULL);
2944+ assert("nikita-2208", info->todo != NULL);
2945+
2946+ if (info->doing == NULL)
2947+ return reiser4_post_carry(info->todo, op, node,
2948+ apply_to_parent_p);
2949+
2950+ result = add_op(info->todo, POOLO_LAST, NULL);
2951+ if (IS_ERR(result))
2952+ return result;
2953+ child = add_carry_atplace(info->doing, info->todo, node);
2954+ if (IS_ERR(child)) {
2955+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2956+ return (carry_op *) child;
2957+ }
2958+ result->node = child;
2959+ result->op = op;
2960+ child->parent = apply_to_parent_p;
2961+ if (ZF_ISSET(node, JNODE_ORPHAN))
2962+ child->left_before = 1;
2963+ child->node = node;
2964+ return result;
2965+}
2966+
2967+/* lock all carry nodes in @level */
2968+static int lock_carry_level(carry_level * level /* level to lock */ )
2969+{
2970+ int result;
2971+ carry_node *node;
2972+ carry_node *tmp_node;
2973+
2974+ assert("nikita-881", level != NULL);
2975+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
2976+
2977+ /* lock nodes from left to right */
2978+ result = 0;
2979+ for_all_nodes(level, node, tmp_node) {
2980+ result = lock_carry_node(level, node);
2981+ if (result != 0)
2982+ break;
2983+ }
2984+ return result;
2985+}
2986+
2987+/* Synchronize delimiting keys between @node and its left neighbor.
2988+
2989+ To reduce contention on the dk lock and simplify carry code, we synchronize
2990+ delimiting keys only when carry ultimately leaves a tree level (carrying
2991+ changes upward) and unlocks nodes at this level.
2992+
2993+ This function first finds the left neighbor of @node and then updates the
2994+ left neighbor's right delimiting key to coincide with the least key in @node.
2995+
2996+*/
2997+
2998+ON_DEBUG(extern atomic_t delim_key_version;
2999+ )
3000+
3001+static void sync_dkeys(znode * spot /* node to update */ )
3002+{
3003+ reiser4_key pivot;
3004+ reiser4_tree *tree;
3005+
3006+ assert("nikita-1610", spot != NULL);
3007+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3008+
3009+ tree = znode_get_tree(spot);
3010+ read_lock_tree(tree);
3011+ write_lock_dk(tree);
3012+
3013+ assert("nikita-2192", znode_is_loaded(spot));
3014+
3015+ /* sync left delimiting key of @spot with key in its leftmost item */
3016+ if (node_is_empty(spot))
3017+ pivot = *znode_get_rd_key(spot);
3018+ else
3019+ leftmost_key_in_node(spot, &pivot);
3020+
3021+ znode_set_ld_key(spot, &pivot);
3022+
3023+	/* there can be a sequence of empty nodes pending removal on the left of
3024+ @spot. Scan them and update their left and right delimiting keys to
3025+ match left delimiting key of @spot. Also, update right delimiting
3026+ key of first non-empty left neighbor.
3027+ */
3028+ while (1) {
3029+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3030+ break;
3031+
3032+ spot = spot->left;
3033+ if (spot == NULL)
3034+ break;
3035+
3036+ znode_set_rd_key(spot, &pivot);
3037+ /* don't sink into the domain of another balancing */
3038+ if (!znode_is_write_locked(spot))
3039+ break;
3040+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3041+ znode_set_ld_key(spot, &pivot);
3042+ else
3043+ break;
3044+ }
3045+
3046+ write_unlock_dk(tree);
3047+ read_unlock_tree(tree);
3048+}
3049+
3050+/* unlock all carry nodes in @level */
3051+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3052+ int failure /* true if unlocking owing to
3053+ * failure */ )
3054+{
3055+ carry_node *node;
3056+ carry_node *tmp_node;
3057+
3058+ assert("nikita-889", level != NULL);
3059+
3060+ if (!failure) {
3061+ znode *spot;
3062+
3063+ spot = NULL;
3064+ /* update delimiting keys */
3065+ for_all_nodes(level, node, tmp_node) {
3066+ if (reiser4_carry_real(node) != spot) {
3067+ spot = reiser4_carry_real(node);
3068+ sync_dkeys(spot);
3069+ }
3070+ }
3071+ }
3072+
3073+ /* nodes can be unlocked in arbitrary order. In preemptible
3074+ environment it's better to unlock in reverse order of locking,
3075+ though.
3076+ */
3077+ for_all_nodes_back(level, node, tmp_node) {
3078+ /* all allocated nodes should be already linked to their
3079+ parents at this moment. */
3080+ assert("nikita-1631",
3081+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3082+ JNODE_ORPHAN)));
3083+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3084+ unlock_carry_node(level, node, failure);
3085+ }
3086+ level->new_root = NULL;
3087+}
3088+
3089+/* finish with @level
3090+
3091+ Unlock nodes and release all allocated resources */
3092+static void done_carry_level(carry_level * level /* level to finish */ )
3093+{
3094+ carry_node *node;
3095+ carry_node *tmp_node;
3096+ carry_op *op;
3097+ carry_op *tmp_op;
3098+
3099+ assert("nikita-1076", level != NULL);
3100+
3101+ unlock_carry_level(level, 0);
3102+ for_all_nodes(level, node, tmp_node) {
3103+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3104+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3105+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3106+ }
3107+ for_all_ops(level, op, tmp_op)
3108+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3109+}
3110+
3111+/* helper function to complete locking of carry node
3112+
3113+   Finish locking of a carry node. There are several ways in which a new carry
3114+   node can be added into a carry level and locked. The normal one is through
3115+   lock_carry_node(), but also from find_{left|right}_neighbor(). This
3116+   function factors out the common final part of all locking scenarios. It
3117+   assumes that @node->lock_handle is the lock handle for the lock just taken
3118+   and fills ->real_node from this lock handle.
3119+
3120+*/
3121+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3122+{
3123+ assert("nikita-1052", node != NULL);
3124+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
3125+ assert("nikita-1188", !node->unlock);
3126+
3127+ node->unlock = 1;
3128+ /* Load node content into memory and install node plugin by
3129+ looking at the node header.
3130+
3131+ Most of the time this call is cheap because the node is
3132+ already in memory.
3133+
3134+ Corresponding zrelse() is in unlock_carry_node()
3135+ */
3136+ return zload(reiser4_carry_real(node));
3137+}
3138+
3139+/* lock carry node
3140+
3141+ "Resolve" node to real znode, lock it and mark as locked.
3142+ This requires recursive locking of znodes.
3143+
3144+   When an operation is posted to the parent level, the node it will be
3145+   applied to is not yet known. For example, when shifting data between two
3146+   nodes, delimiting keys have to be updated in the parent or parents of the
3147+   nodes involved. But their parents are not yet locked and, moreover, said
3148+   nodes can be reparented by concurrent balancing.
3149+
3150+ To work around this, carry operation is applied to special "carry node"
3151+ rather than to the znode itself. Carry node consists of some "base" or
3152+ "reference" znode and flags indicating how to get to the target of carry
3153+ operation (->real_node field of carry_node) from base.
3154+
3155+*/
3156+int lock_carry_node(carry_level * level /* level @node is in */ ,
3157+ carry_node * node /* node to lock */ )
3158+{
3159+ int result;
3160+ znode *reference_point;
3161+ lock_handle lh;
3162+ lock_handle tmp_lh;
3163+ reiser4_tree *tree;
3164+
3165+ assert("nikita-887", level != NULL);
3166+ assert("nikita-882", node != NULL);
3167+
3168+ result = 0;
3169+ reference_point = node->node;
3170+ init_lh(&lh);
3171+ init_lh(&tmp_lh);
3172+ if (node->left_before) {
3173+ /* handling of new nodes, allocated on the previous level:
3174+
3175+		   some carry ops were probably posted from the new node, but
3176+		   this node neither has its parent pointer set nor is
3177+		   connected. This will be done in ->create_hook() for the
3178+		   internal item.
3179+
3180+		   Nonetheless, the parent of the new node has to be locked.
3181+		   To do this, first go to the "left" in the carry order. This
3182+		   depends on the decision to always allocate a new node on
3183+		   the right of an existing one.
3184+
3185+		   The loop handles the case when multiple nodes, all orphans,
3186+		   were inserted.
3187+
3188+ Strictly speaking, taking tree lock is not necessary here,
3189+ because all nodes scanned by loop in
3190+ find_begetting_brother() are write-locked by this thread,
3191+ and thus, their sibling linkage cannot change.
3192+
3193+ */
3194+ tree = znode_get_tree(reference_point);
3195+ read_lock_tree(tree);
3196+ reference_point = find_begetting_brother(node, level)->node;
3197+ read_unlock_tree(tree);
3198+ assert("nikita-1186", reference_point != NULL);
3199+ }
3200+ if (node->parent && (result == 0)) {
3201+ result =
3202+ reiser4_get_parent(&tmp_lh, reference_point,
3203+ ZNODE_WRITE_LOCK);
3204+ if (result != 0) {
3205+ ; /* nothing */
3206+ } else if (znode_get_level(tmp_lh.node) == 0) {
3207+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3208+ result = add_new_root(level, node, tmp_lh.node);
3209+ if (result == 0) {
3210+ reference_point = level->new_root;
3211+ move_lh(&lh, &node->lock_handle);
3212+ }
3213+ } else if ((level->new_root != NULL)
3214+ && (level->new_root !=
3215+ znode_parent_nolock(reference_point))) {
3216+				/* parent of node exists, but this level already
3217+				   created a different new root, so */
3218+ warning("nikita-1109",
3219+ /* it should be "radicis", but tradition is
3220+ tradition. do banshees read latin? */
3221+ "hodie natus est radici frater");
3222+ result = -EIO;
3223+ } else {
3224+ move_lh(&lh, &tmp_lh);
3225+ reference_point = lh.node;
3226+ }
3227+ }
3228+ if (node->left && (result == 0)) {
3229+ assert("nikita-1183", node->parent);
3230+ assert("nikita-883", reference_point != NULL);
3231+ result =
3232+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3233+ ZNODE_WRITE_LOCK,
3234+ GN_CAN_USE_UPPER_LEVELS);
3235+ if (result == 0) {
3236+ done_lh(&lh);
3237+ move_lh(&lh, &tmp_lh);
3238+ reference_point = lh.node;
3239+ }
3240+ }
3241+ if (!node->parent && !node->left && !node->left_before) {
3242+ result =
3243+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3244+ ZNODE_LOCK_HIPRI);
3245+ }
3246+ if (result == 0) {
3247+ move_lh(&node->lock_handle, &lh);
3248+ result = lock_carry_node_tail(node);
3249+ }
3250+ done_lh(&tmp_lh);
3251+ done_lh(&lh);
3252+ return result;
3253+}
3254+
3255+/* release a lock on &carry_node.
3256+
3257+   Release, if necessary, the lock on @node. This operation is the pair of
3258+   lock_carry_node() and is idempotent: you can call it more than once on the
3259+   same node.
3260+
3261+*/
3262+static void
3263+unlock_carry_node(carry_level * level,
3264+ carry_node * node /* node to be released */ ,
3265+		  int failure	/* true if node is unlocked due
3266+				 * to some error */ )
3267+{
3268+ znode *real_node;
3269+
3270+ assert("nikita-884", node != NULL);
3271+
3272+ real_node = reiser4_carry_real(node);
3273+ /* pair to zload() in lock_carry_node_tail() */
3274+ zrelse(real_node);
3275+ if (node->unlock && (real_node != NULL)) {
3276+ assert("nikita-899", real_node == node->lock_handle.node);
3277+ longterm_unlock_znode(&node->lock_handle);
3278+ }
3279+ if (failure) {
3280+ if (node->deallocate && (real_node != NULL)) {
3281+ /* free node in bitmap
3282+
3283+ Prepare node for removal. Last zput() will finish
3284+ with it.
3285+ */
3286+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3287+ }
3288+ if (node->free) {
3289+ assert("nikita-2177",
3290+ list_empty_careful(&node->lock_handle.locks_link));
3291+ assert("nikita-2112",
3292+ list_empty_careful(&node->lock_handle.owners_link));
3293+ reiser4_pool_free(&level->pool->node_pool,
3294+ &node->header);
3295+ }
3296+ }
3297+}
3298+
3299+/* fatal_carry_error() - all-catching error handling function
3300+
3301+   It is possible that carry faces an unrecoverable error, like an inability
3302+   to insert a pointer at the internal level. Our simple solution is just to
3303+   panic in this situation. More sophisticated things, like an attempt to
3304+   remount the file system read-only, can be implemented without much difficulty.
3305+
3306+ It is believed, that:
3307+
3308+   1. instead of panicking, all current transactions can be aborted, rolling
3309+   the system back to a consistent state.
3310+
3311+Umm, if you simply panic without doing anything more at all, then all current
3312+transactions are aborted and the system is rolled back to a consistent state,
3313+by virtue of the design of the transactional mechanism. Well, wait, let's be
3314+precise. If an internal node is corrupted on disk due to hardware failure,
3315+then there may be no consistent state that can be rolled back to, so instead
3316+we should say that it will roll back the transactions, which, barring other
3317+factors, means rolling back to a consistent state.
3318+
3319+# Nikita: there is a subtle difference between panic and aborting
3320+# transactions: machine doesn't reboot. Processes aren't killed. Processes
3321+# not using reiser4 (not that we care about such processes), or using other
3322+# reiser4 mounts (about which we do care), will simply continue to run. With
3323+# some luck, even an application using the aborted file system can survive: it
3324+# will get some error, like EBADF, from each file descriptor on the failed
3325+# file system, but applications that do care about tolerance will cope with
3326+# this (squid will).
3327+
3328+It would be a nice feature though to support rollback without rebooting
3329+followed by remount, but this can wait for later versions.
3330+
3331+ 2. once isolated transactions are implemented, it will be possible to
3332+ roll back the offending transaction.
3333+
3334+2. adds additional code complexity of questionable value (it implies that a broken tree should be kept in operation), so we must think about
3335+it more before deciding whether it should be done. -Hans
3336+
3337+*/
3338+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3339+ * where
3340+ * unrecoverable
3341+ * error
3342+ * occurred */ ,
3343+ int ecode /* error code */ )
3344+{
3345+ assert("nikita-1230", doing != NULL);
3346+ assert("nikita-1231", ecode < 0);
3347+
3348+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3349+}
3350+
3351+/* add new root to the tree
3352+
3353+ This function itself only manages changes in carry structures and delegates
3354+ all hard work (allocation of the znode for the new root, changes of parent
3355+ and sibling pointers) to reiser4_add_tree_root().
3356+
3357+ Locking: old tree root is locked by carry at this point. Fake znode is also
3358+ locked.
3359+
3360+*/
3361+static int add_new_root(carry_level * level /* carry level in context of which
3362+ * operation is performed */ ,
3363+ carry_node * node /* carry node for existing root */ ,
3364+ znode * fake /* "fake" znode already locked by
3365+ * us */ )
3366+{
3367+ int result;
3368+
3369+ assert("nikita-1104", level != NULL);
3370+ assert("nikita-1105", node != NULL);
3371+
3372+ assert("nikita-1403", znode_is_write_locked(node->node));
3373+ assert("nikita-1404", znode_is_write_locked(fake));
3374+
3375+ /* trying to create new root. */
3376+ /* @node is root and it's already locked by us. This
3377+ means that nobody else can be trying to add/remove
3378+ tree root right now.
3379+ */
3380+ if (level->new_root == NULL)
3381+ level->new_root = reiser4_add_tree_root(node->node, fake);
3382+ if (!IS_ERR(level->new_root)) {
3383+ assert("nikita-1210", znode_is_root(level->new_root));
3384+ node->deallocate = 1;
3385+ result =
3386+ longterm_lock_znode(&node->lock_handle, level->new_root,
3387+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3388+ if (result == 0)
3389+ zput(level->new_root);
3390+ } else {
3391+ result = PTR_ERR(level->new_root);
3392+ level->new_root = NULL;
3393+ }
3394+ return result;
3395+}
3396+
3397+/* allocate a new znode and post, to the @todo level, the operation
3398+ that inserts a pointer to it into the parent node
3399+
3400+ Allocate a new znode, add it into the carry queue and post into the @todo
3401+ queue a request to add a pointer to the new node into its parent.
3402+
3403+ This is a carry-related routine that calls reiser4_new_node() to allocate a
3404+ new node.
3405+*/
3406+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3407+ * node */ ,
3408+ carry_node * ref /* carry node after which new
3409+ * carry node is to be inserted
3410+ * into queue. This affects
3411+ * locking. */ ,
3412+ carry_level * doing /* carry queue where new node is
3413+ * to be added */ ,
3414+ carry_level * todo /* carry queue where COP_INSERT
3415+ * operation to add pointer to
3416+ * new node will be added */ )
3417+{
3418+ carry_node *fresh;
3419+ znode *new_znode;
3420+ carry_op *add_pointer;
3421+ carry_plugin_info info;
3422+
3423+ assert("nikita-1048", brother != NULL);
3424+ assert("nikita-1049", todo != NULL);
3425+
3426+ /* There are a lot of possible variations here: to what parent the
3427+ new node will be attached and where. For simplicity, always
3428+ do the following:
3429+
3430+ (1) new node and @brother will have the same parent.
3431+
3432+ (2) new node is added on the right of @brother
3433+
3434+ */
3435+
3436+ fresh = reiser4_add_carry_skip(doing,
3437+ ref ? POOLO_AFTER : POOLO_LAST, ref);
3438+ if (IS_ERR(fresh))
3439+ return fresh;
3440+
3441+ fresh->deallocate = 1;
3442+ fresh->free = 1;
3443+
3444+ new_znode = reiser4_new_node(brother, znode_get_level(brother));
3445+ if (IS_ERR(new_znode))
3446+ /* @fresh will be deallocated automatically by error
3447+ handling code in the caller. */
3448+ return (carry_node *) new_znode;
3449+
3450+ /* reiser4_new_node() returned a znode with x_count of 1. The caller
3451+ has to decrease it; make_space() does. */
3452+
3453+ ZF_SET(new_znode, JNODE_ORPHAN);
3454+ fresh->node = new_znode;
3455+
3456+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3457+ ref = carry_node_prev(ref);
3458+ assert("nikita-1606", !carry_node_end(doing, ref));
3459+ }
3460+
3461+ info.todo = todo;
3462+ info.doing = doing;
3463+ add_pointer = node_post_carry(&info, COP_INSERT,
3464+ reiser4_carry_real(ref), 1);
3465+ if (IS_ERR(add_pointer)) {
3466+ /* no need to deallocate @new_znode here: it will be
3467+ deallocated during carry error handling. */
3468+ return (carry_node *) add_pointer;
3469+ }
3470+
3471+ add_pointer->u.insert.type = COPT_CHILD;
3472+ add_pointer->u.insert.child = fresh;
3473+ add_pointer->u.insert.brother = brother;
3474+ /* initially the new node spans an empty key range */
3475+ write_lock_dk(znode_get_tree(brother));
3476+ znode_set_ld_key(new_znode,
3477+ znode_set_rd_key(new_znode,
3478+ znode_get_rd_key(brother)));
3479+ write_unlock_dk(znode_get_tree(brother));
3480+ return fresh;
3481+}
3482+
3483+/* DEBUGGING FUNCTIONS.
3484+
3485+ Probably we should also leave them enabled even when
3486+ debugging is turned off, to print dumps at errors.
3487+*/
3488+#if REISER4_DEBUG
3489+static int carry_level_invariant(carry_level * level, carry_queue_state state)
3490+{
3491+ carry_node *node;
3492+ carry_node *tmp_node;
3493+
3494+ if (level == NULL)
3495+ return 0;
3496+
3497+ if (level->track_type != 0 &&
3498+ level->track_type != CARRY_TRACK_NODE &&
3499+ level->track_type != CARRY_TRACK_CHANGE)
3500+ return 0;
3501+
3502+ /* check that nodes are in ascending order */
3503+ for_all_nodes(level, node, tmp_node) {
3504+ znode *left;
3505+ znode *right;
3506+
3507+ reiser4_key lkey;
3508+ reiser4_key rkey;
3509+
3510+ if (node != carry_node_front(level)) {
3511+ if (state == CARRY_TODO) {
3512+ right = node->node;
3513+ left = carry_node_prev(node)->node;
3514+ } else {
3515+ right = reiser4_carry_real(node);
3516+ left = reiser4_carry_real(carry_node_prev(node));
3517+ }
3518+ if (right == NULL || left == NULL)
3519+ continue;
3520+ if (node_is_empty(right) || node_is_empty(left))
3521+ continue;
3522+ if (!keyle(leftmost_key_in_node(left, &lkey),
3523+ leftmost_key_in_node(right, &rkey))) {
3524+ warning("", "wrong key order");
3525+ return 0;
3526+ }
3527+ }
3528+ }
3529+ return 1;
3530+}
3531+#endif
3532+
3533+/* get symbolic name for boolean */
3534+static const char *tf(int boolean /* truth value */ )
3535+{
3536+ return boolean ? "t" : "f";
3537+}
3538+
3539+/* symbolic name for carry operation */
3540+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3541+{
3542+ switch (op) {
3543+ case COP_INSERT:
3544+ return "COP_INSERT";
3545+ case COP_DELETE:
3546+ return "COP_DELETE";
3547+ case COP_CUT:
3548+ return "COP_CUT";
3549+ case COP_PASTE:
3550+ return "COP_PASTE";
3551+ case COP_UPDATE:
3552+ return "COP_UPDATE";
3553+ case COP_EXTENT:
3554+ return "COP_EXTENT";
3555+ case COP_INSERT_FLOW:
3556+ return "COP_INSERT_FLOW";
3557+ default:{
3558+ /* not mt safe, but who cares? */
3559+ static char buf[20];
3560+
3561+ sprintf(buf, "unknown op: %x", op);
3562+ return buf;
3563+ }
3564+ }
3565+}
3566+
3567+/* dump information about carry node */
3568+static void print_carry(const char *prefix /* prefix to print */ ,
3569+ carry_node * node /* node to print */ )
3570+{
3571+ if (node == NULL) {
3572+ printk("%s: null\n", prefix);
3573+ return;
3574+ }
3575+ printk
3576+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3577+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3578+ tf(node->free), tf(node->deallocate));
3579+}
3580+
3581+/* dump information about carry operation */
3582+static void print_op(const char *prefix /* prefix to print */ ,
3583+ carry_op * op /* operation to print */ )
3584+{
3585+ if (op == NULL) {
3586+ printk("%s: null\n", prefix);
3587+ return;
3588+ }
3589+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3590+ print_carry("\tnode", op->node);
3591+ switch (op->op) {
3592+ case COP_INSERT:
3593+ case COP_PASTE:
3594+ print_coord("\tcoord",
3595+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3596+ reiser4_print_key("\tkey",
3597+ op->u.insert.d ? op->u.insert.d->key : NULL);
3598+ print_carry("\tchild", op->u.insert.child);
3599+ break;
3600+ case COP_DELETE:
3601+ print_carry("\tchild", op->u.delete.child);
3602+ break;
3603+ case COP_CUT:
3604+ if (op->u.cut_or_kill.is_cut) {
3605+ print_coord("\tfrom",
3606+ op->u.cut_or_kill.u.kill->params.from, 0);
3607+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3608+ 0);
3609+ } else {
3610+ print_coord("\tfrom",
3611+ op->u.cut_or_kill.u.cut->params.from, 0);
3612+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3613+ 0);
3614+ }
3615+ break;
3616+ case COP_UPDATE:
3617+ print_carry("\tleft", op->u.update.left);
3618+ break;
3619+ default:
3620+ /* do nothing */
3621+ break;
3622+ }
3623+}
3624+
3625+/* dump information about all nodes and operations in a @level */
3626+static void print_level(const char *prefix /* prefix to print */ ,
3627+ carry_level * level /* level to print */ )
3628+{
3629+ carry_node *node;
3630+ carry_node *tmp_node;
3631+ carry_op *op;
3632+ carry_op *tmp_op;
3633+
3634+ if (level == NULL) {
3635+ printk("%s: null\n", prefix);
3636+ return;
3637+ }
3638+ printk("%s: %p, restartable: %s\n",
3639+ prefix, level, tf(level->restartable));
3640+
3641+ for_all_nodes(level, node, tmp_node)
3642+ print_carry("\tcarry node", node);
3643+ for_all_ops(level, op, tmp_op)
3644+ print_op("\tcarry op", op);
3645+}
3646+
3647+/* Make Linus happy.
3648+ Local variables:
3649+ c-indentation-style: "K&R"
3650+ mode-name: "LC"
3651+ c-basic-offset: 8
3652+ tab-width: 8
3653+ fill-column: 120
3654+ scroll-step: 1
3655+ End:
3656+*/
3657diff -urN linux-2.6.20.orig/fs/reiser4/carry.h linux-2.6.20/fs/reiser4/carry.h
3658--- linux-2.6.20.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3659+++ linux-2.6.20/fs/reiser4/carry.h 2007-05-06 14:50:43.690973225 +0400
3660@@ -0,0 +1,442 @@
3661+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3662+
3663+/* Functions and data types to "carry" tree modification(s) upward.
3664+ See fs/reiser4/carry.c for details. */
3665+
3666+#if !defined( __FS_REISER4_CARRY_H__ )
3667+#define __FS_REISER4_CARRY_H__
3668+
3669+#include "forward.h"
3670+#include "debug.h"
3671+#include "pool.h"
3672+#include "znode.h"
3673+
3674+#include <linux/types.h>
3675+
3676+/* &carry_node - "location" of carry node.
3677+
3678+ "location" of node that is involved or going to be involved into
3679+ carry process. Node where operation will be carried to on the
3680+ parent level cannot be recorded explicitly. Operation will be carried
3681+ usually to the parent of some node (where changes are performed at
3682+ the current level) or, to the left neighbor of its parent. But while
3683+ modifications are performed at the current level, parent may
3684+ change. So, we have to allow some indirection (or, positevly,
3685+ flexibility) in locating carry nodes.
3686+
3687+*/
3688+typedef struct carry_node {
3689+ /* pool linkage */
3690+ reiser4_pool_header header;
3691+
3692+ /* base node from which real_node is calculated. See
3693+ fs/reiser4/carry.c:lock_carry_node(). */
3694+ znode *node;
3695+
3696+ /* how to get ->real_node */
3697+ /* to get ->real_node obtain parent of ->node */
3698+ __u32 parent:1;
3699+ /* to get ->real_node obtain left neighbor of parent of
3700+ ->node */
3701+ __u32 left:1;
3702+ __u32 left_before:1;
3703+
3704+ /* locking */
3705+
3706+ /* this node was locked by carry process and should be
3707+ unlocked when carry leaves a level */
3708+ __u32 unlock:1;
3709+
3710+ /* disk block for this node was allocated by carry process and
3711+ should be deallocated when carry leaves a level */
3712+ __u32 deallocate:1;
3713+ /* this carry node was allocated by carry process and should be
3714+ freed when carry leaves a level */
3715+ __u32 free:1;
3716+
3717+ /* lock handle for the lock taken on this node */
3718+ lock_handle lock_handle;
3719+} carry_node;
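
/* Editor's illustrative sketch, not part of the patch: how the flag bits
 * above are meant to combine when carry resolves the "real" node to operate
 * on. In the patch this resolution (together with the actual locking) is
 * done by fs/reiser4/carry.c:lock_carry_node(); parent_znode() and
 * left_neighbor_znode() below are hypothetical accessors standing in for
 * the locked tree traversal performed there. */
static znode *carry_real_node_sketch(carry_node * cn)
{
	znode *real = cn->node;

	if (cn->parent)		/* operate on the parent of ->node */
		real = parent_znode(real);
	if (cn->left)		/* ... or rather on the left neighbor
				 * of that parent */
		real = left_neighbor_znode(real);
	return real;
}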
3720+
3721+/* &carry_opcode - elementary operations that can be carried upward
3722+
3723+ Operations that carry() can handle. This list is supposed to be
3724+ expanded.
3725+
3726+ Each carry operation (cop) is handled by appropriate function defined
3727+ in fs/reiser4/carry.c. For example COP_INSERT is handled by
3728+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3729+ call plugins of nodes affected by operation to modify nodes' content
3730+ and to gather operations to be performed on the next level.
3731+
3732+*/
3733+typedef enum {
3734+ /* insert new item into node. */
3735+ COP_INSERT,
3736+ /* delete pointer from parent node */
3737+ COP_DELETE,
3738+ /* remove part of or whole node. */
3739+ COP_CUT,
3740+ /* increase size of item. */
3741+ COP_PASTE,
3742+ /* insert extent (that is sequence of unformatted nodes). */
3743+ COP_EXTENT,
3744+ /* update delimiting key in least common ancestor of two
3745+ nodes. This is performed when items are moved between two
3746+ nodes.
3747+ */
3748+ COP_UPDATE,
3749+ /* insert flow */
3750+ COP_INSERT_FLOW,
3751+ COP_LAST_OP,
3752+} carry_opcode;
3753+
3754+#define CARRY_FLOW_NEW_NODES_LIMIT 20
3755+
3756+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3757+ item is determined. */
3758+typedef enum {
3759+ /* target item is one containing pointer to the ->child node */
3760+ COPT_CHILD,
3761+ /* target item is given explicitly by @coord */
3762+ COPT_ITEM_DATA,
3763+ /* target item is given by key */
3764+ COPT_KEY,
3765+ /* see insert_paste_common() for more comments on this. */
3766+ COPT_PASTE_RESTARTED,
3767+} cop_insert_pos_type;
3768+
3769+/* flags to cut and delete */
3770+typedef enum {
3771+ /* don't kill node even if it becomes completely empty as a result of
3772+ * cut. This is needed for eottl handling. See carry_extent() for
3773+ * details. */
3774+ DELETE_RETAIN_EMPTY = (1 << 0)
3775+} cop_delete_flag;
3776+
3777+/*
3778+ * carry() implements "lock handle tracking" feature.
3779+ *
3780+ * Callers supply carry with the node where to perform the initial operation
3781+ * and a lock handle on this node. Trying to optimize node utilization, carry
3782+ * may actually move the insertion point to a different node. Callers expect
3783+ * that the lock handle will be transferred to the new node also.
3784+ *
3785+ */
3786+typedef enum {
3787+ /* transfer lock handle along with insertion point */
3788+ CARRY_TRACK_CHANGE = 1,
3789+ /* acquire new lock handle to the node where insertion point is. This
3790+ * is used when carry() client doesn't initially possess lock handle
3791+ * on the insertion point node, for example, by extent insertion
3792+ * code. See carry_extent(). */
3793+ CARRY_TRACK_NODE = 2
3794+} carry_track_type;
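
/* Editor's illustrative sketch, not part of the patch: how a caller is
 * expected to request lock handle tracking, using the track_type and
 * tracked fields of struct carry_level defined below. The function name
 * and parameter names are hypothetical caller-side names. */
static void request_tracking_sketch(carry_level * lowest_level,
				    lock_handle * lh)
{
	/* after this, carry() keeps @lh pointing at whatever node ends
	 * up holding the insertion point */
	lowest_level->track_type = CARRY_TRACK_CHANGE;
	lowest_level->tracked = lh;
}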
3795+
3796+/* data supplied to COP_{INSERT|PASTE} by callers */
3797+typedef struct carry_insert_data {
3798+ /* position where new item is to be inserted */
3799+ coord_t *coord;
3800+ /* new item description */
3801+ reiser4_item_data *data;
3802+ /* key of new item */
3803+ const reiser4_key *key;
3804+} carry_insert_data;
3805+
3806+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3807+struct cut_kill_params {
3808+ /* coord where cut starts (inclusive) */
3809+ coord_t *from;
3810+ /* coord where cut stops (inclusive, this item/unit will also be
3811+ * cut) */
3812+ coord_t *to;
3813+ /* starting key. This is necessary when item and unit pos don't
3814+ * uniquely identify what portion of the tree to remove. For example, this
3815+ * indicates what portion of extent unit will be affected. */
3816+ const reiser4_key *from_key;
3817+ /* exclusive stop key */
3818+ const reiser4_key *to_key;
3819+ /* if this is not NULL, smallest actually removed key is stored
3820+ * here. */
3821+ reiser4_key *smallest_removed;
3822+ /* kill_node_content() is called for file truncate */
3823+ int truncate;
3824+};
3825+
3826+struct carry_cut_data {
3827+ struct cut_kill_params params;
3828+};
3829+
3830+struct carry_kill_data {
3831+ struct cut_kill_params params;
3832+ /* parameter to be passed to the ->kill_hook() method of item
3833+ * plugin */
3834+ /*void *iplug_params; *//* FIXME: unused currently */
3835+ /* if not NULL---inode whose items are being removed. This is needed
3836+ * for ->kill_hook() of extent item to update VM structures when
3837+ * removing pages. */
3838+ struct inode *inode;
3839+ /* sibling list maintenance is complicated by existence of eottl. When
3840+ * eottl whose left and right neighbors are formatted leaves is
3841+ * removed, one has to connect said leaves in the sibling list. This
3842+ * cannot be done when extent removal is just started as locking rules
3843+ * require sibling list update to happen atomically with removal of
3844+ * extent item. Therefore: 1. pointers to left and right neighbors
3845+ * have to be passed down to the ->kill_hook() of extent item, and
3846+ * 2. said neighbors have to be locked. */
3847+ lock_handle *left;
3848+ lock_handle *right;
3849+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3850+ unsigned flags;
3851+ char *buf;
3852+};
3853+
3854+/* &carry_tree_op - operation to "carry" upward.
3855+
3856+ Description of an operation we want to "carry" to the upper level of
3857+ a tree: e.g, when we insert something and there is not enough space
3858+ we allocate a new node and "carry" the operation of inserting a
3859+ pointer to the new node to the upper level, on removal of empty node,
3860+ we carry up operation of removing appropriate entry from parent.
3861+
3862+ There are two types of carry ops: when adding or deleting a node, the
3863+ node at the parent level where the appropriate modification has to be
3864+ performed is known in advance. When shifting items between nodes
3865+ (split, merge), the delimiting key should be changed in the least
3866+ common parent of the nodes involved, which is not known in advance.
3867+
3868+ For operations of the first type we store in &carry_op a pointer to
3869+ the &carry_node at the parent level. For operations of the second
3870+ type we store &carry_nodes for the parents of the left and right nodes
3871+ modified and keep track of them upward until they coincide.
3872+
3873+*/
3874+typedef struct carry_op {
3875+ /* pool linkage */
3876+ reiser4_pool_header header;
3877+ carry_opcode op;
3878+ /* node on which operation is to be performed:
3879+
3880+ for insert, paste: node where new item is to be inserted
3881+
3882+ for delete: node where pointer is to be deleted
3883+
3884+ for cut: node to cut from
3885+
3886+ for update: node where delimiting key is to be modified
3887+
3888+ for modify: parent of modified node
3889+
3890+ */
3891+ carry_node *node;
3892+ union {
3893+ struct {
3894+ /* (sub-)type of insertion/paste. Taken from
3895+ cop_insert_pos_type. */
3896+ __u8 type;
3897+ /* various operation flags. Taken from
3898+ cop_insert_flag. */
3899+ __u8 flags;
3900+ carry_insert_data *d;
3901+ carry_node *child;
3902+ znode *brother;
3903+ } insert, paste, extent;
3904+
3905+ struct {
3906+ int is_cut;
3907+ union {
3908+ carry_kill_data *kill;
3909+ carry_cut_data *cut;
3910+ } u;
3911+ } cut_or_kill;
3912+
3913+ struct {
3914+ carry_node *left;
3915+ } update;
3916+ struct {
3917+ /* changed child */
3918+ carry_node *child;
3919+ /* bitmask of changes. See &cop_modify_flag */
3920+ __u32 flag;
3921+ } modify;
3922+ struct {
3923+ /* flags to deletion operation. Are taken from
3924+ cop_delete_flag */
3925+ __u32 flags;
3926+ /* child to delete from parent. If this is
3927+ NULL, delete op->node. */
3928+ carry_node *child;
3929+ } delete;
3930+ struct {
3931+ /* various operation flags. Taken from
3932+ cop_insert_flag. */
3933+ __u32 flags;
3934+ flow_t *flow;
3935+ coord_t *insert_point;
3936+ reiser4_item_data *data;
3937+ /* flow insertion is limited by the number of new blocks
3938+ added in that operation which hold nothing but part of
3939+ the flow. This limit is set by the macro
3940+ CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number
3941+ of nodes added so far during one carry_flow */
3942+ int new_nodes;
3943+ } insert_flow;
3944+ } u;
3945+} carry_op;
3946+
3947+/* &carry_op_pool - preallocated pool of carry operations and nodes */
3948+typedef struct carry_pool {
3949+ carry_op op[CARRIES_POOL_SIZE];
3950+ reiser4_pool op_pool;
3951+ carry_node node[NODES_LOCKED_POOL_SIZE];
3952+ reiser4_pool node_pool;
3953+} carry_pool;
3954+
3955+/* &carry_tree_level - carry process on given level
3956+
3957+ Description of balancing process on the given level.
3958+
3959+ No need for locking here, as carry_tree_level is essentially a
3960+ per-thread thing (for now).
3961+
3962+*/
3963+struct carry_level {
3964+ /* this level may be restarted */
3965+ __u32 restartable:1;
3966+ /* list of carry nodes on this level, ordered by key order */
3967+ struct list_head nodes;
3968+ struct list_head ops;
3969+ /* pool where new objects are allocated from */
3970+ carry_pool *pool;
3971+ int ops_num;
3972+ int nodes_num;
3973+ /* new root created on this level, if any */
3974+ znode *new_root;
3975+ /* This is set by callers (insert_by_key(), reiser4_resize_item(), etc.)
3976+ when they want ->tracked to automagically wander to the node where
3977+ insertion point moved after insert or paste.
3978+ */
3979+ carry_track_type track_type;
3980+ /* lock handle supplied by user that we are tracking. See
3981+ above. */
3982+ lock_handle *tracked;
3983+};
3984+
3985+/* information carry passes to plugin methods that may add new operations to
3986+ the @todo queue */
3987+struct carry_plugin_info {
3988+ carry_level *doing;
3989+ carry_level *todo;
3990+};
3991+
3992+int reiser4_carry(carry_level * doing, carry_level * done);
3993+
3994+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
3995+ carry_node * reference);
3996+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
3997+ carry_node * reference);
3998+
3999+extern carry_node *insert_carry_node(carry_level * doing,
4000+ carry_level * todo, const znode * node);
4001+
4002+extern carry_pool *init_carry_pool(int);
4003+extern void done_carry_pool(carry_pool * pool);
4004+
4005+extern void init_carry_level(carry_level * level, carry_pool * pool);
4006+
4007+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4008+ znode * node, int apply_to_parent);
4009+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4010+ znode * node, int apply_to_parent_p);
4011+
4012+carry_node *add_new_znode(znode * brother, carry_node * reference,
4013+ carry_level * doing, carry_level * todo);
4014+
4015+carry_node *find_carry_node(carry_level * level, const znode * node);
4016+
4017+extern znode *reiser4_carry_real(const carry_node * node);
4018+
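/* Editor's illustrative sketch, not part of the patch: the calling sequence
 * that the declarations above suggest for a single carry invocation. Error
 * handling is reduced to a minimum, and the pool-size expression and the
 * placement of the level right after the pool are assumptions; compare the
 * real callers in fs/reiser4/tree.c. */
static int carry_one_insert_sketch(coord_t * coord)
{
	carry_pool *pool;
	carry_level *lowest_level;
	carry_op *op;
	int result;

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	/* assumption: levels live in the extra space after the pool */
	lowest_level = (carry_level *) (pool + 1);
	init_carry_level(lowest_level, pool);

	op = reiser4_post_carry(lowest_level, COP_INSERT, coord->node, 0);
	if (IS_ERR(op)) {
		done_carry_pool(pool);
		return PTR_ERR(op);
	}
	/* ... fill op->u.insert with coord, key and item data here ... */
	result = reiser4_carry(lowest_level, NULL);
	done_carry_pool(pool);
	return result;
}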
4019+/* helper macros to iterate over carry queues */
4020+
4021+#define carry_node_next( node ) \
4022+ list_entry((node)->header.level_linkage.next, carry_node, \
4023+ header.level_linkage)
4024+
4025+#define carry_node_prev( node ) \
4026+ list_entry((node)->header.level_linkage.prev, carry_node, \
4027+ header.level_linkage)
4028+
4029+#define carry_node_front( level ) \
4030+ list_entry((level)->nodes.next, carry_node, header.level_linkage)
4031+
4032+#define carry_node_back( level ) \
4033+ list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4034+
4035+#define carry_node_end( level, node ) \
4036+ (&(level)->nodes == &(node)->header.level_linkage)
4037+
4038+/* macro to iterate over all operations in a @level */
4039+#define for_all_ops( level /* carry level (of type carry_level *) */, \
4040+ op /* pointer to carry operation, modified by loop (of \
4041+ * type carry_op *) */, \
4042+ tmp /* pointer to carry operation (of type carry_op *), \
4043+ * used to make iterator stable in the face of \
4044+ * deletions from the level */ ) \
4045+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4046+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4047+ &op->header.level_linkage != &level->ops; \
4048+ op = tmp, \
4049+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4050+
4051+#if 0
4052+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4053+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4054+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4055+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4056+#endif
4057+
4058+/* macro to iterate over all nodes in a @level */
4059+#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4060+ node /* pointer to carry node, modified by loop (of \
4061+ * type carry_node *) */, \
4062+ tmp /* pointer to carry node (of type carry_node *), \
4063+ * used to make iterator stable in the face of \
4064+ * deletions from the level */ ) \
4065+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4066+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4067+ &node->header.level_linkage != &level->nodes; \
4068+ node = tmp, \
4069+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4070+
4071+#if 0
4072+for( node = carry_node_front( level ), \
4073+ tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4074+ node = tmp, tmp = carry_node_next( node ) )
4075+#endif
4076+
4077+/* macro to iterate over all nodes in a @level in reverse order
4078+
4079+ This is used because nodes are unlocked in the reverse order of locking */
4080+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4081+ node /* pointer to carry node, modified by loop \
4082+ * (of type carry_node *) */, \
4083+ tmp /* pointer to carry node (of type carry_node \
4084+ * *), used to make iterator stable in the \
4085+ * face of deletions from the level */ ) \
4086+for( node = carry_node_back( level ), \
4087+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4088+ node = tmp, tmp = carry_node_prev( node ) )
4089+
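/* Editor's illustrative sketch, not part of the patch: typical use of the
 * reverse iterator above -- dropping the locks of a whole level in the
 * reverse order of their acquisition; compare unlock_carry_level() and
 * unlock_carry_node() in fs/reiser4/carry.c. */
static void unlock_level_sketch(carry_level * level, int failure)
{
	carry_node *node;
	carry_node *tmp;

	for_all_nodes_back(level, node, tmp)
		unlock_carry_node(level, node, failure);
}
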
4090+/* __FS_REISER4_CARRY_H__ */
4091+#endif
4092+
4093+/* Make Linus happy.
4094+ Local variables:
4095+ c-indentation-style: "K&R"
4096+ mode-name: "LC"
4097+ c-basic-offset: 8
4098+ tab-width: 8
4099+ fill-column: 120
4100+ scroll-step: 1
4101+ End:
4102+*/
4103diff -urN linux-2.6.20.orig/fs/reiser4/carry_ops.c linux-2.6.20/fs/reiser4/carry_ops.c
4104--- linux-2.6.20.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4105+++ linux-2.6.20/fs/reiser4/carry_ops.c 2007-05-06 14:50:43.694974475 +0400
4106@@ -0,0 +1,2131 @@
4107+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4108+
4109+/* implementation of carry operations */
4110+
4111+#include "forward.h"
4112+#include "debug.h"
4113+#include "key.h"
4114+#include "coord.h"
4115+#include "plugin/item/item.h"
4116+#include "plugin/node/node.h"
4117+#include "jnode.h"
4118+#include "znode.h"
4119+#include "block_alloc.h"
4120+#include "tree_walk.h"
4121+#include "pool.h"
4122+#include "tree_mod.h"
4123+#include "carry.h"
4124+#include "carry_ops.h"
4125+#include "tree.h"
4126+#include "super.h"
4127+#include "reiser4.h"
4128+
4129+#include <linux/types.h>
4130+#include <linux/err.h>
4131+
4132+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4133+ carry_level * doing, carry_level * todo,
4134+ unsigned int including_insert_coord_p);
4135+
4136+extern int lock_carry_node(carry_level * level, carry_node * node);
4137+extern int lock_carry_node_tail(carry_node * node);
4138+
4139+/* find left neighbor of a carry node
4140+
4141+ Look for left neighbor of @node and add it to the @doing queue. See
4142+ comments in the body.
4143+
4144+*/
4145+static carry_node *find_left_neighbor(carry_op * op /* operation whose node's
4146+ * left neighbor is sought */ ,
4147+ carry_level * doing /* level to scan */ )
4148+{
4149+ int result;
4150+ carry_node *node;
4151+ carry_node *left;
4152+ int flags;
4153+ reiser4_tree *tree;
4154+
4155+ node = op->node;
4156+
4157+ tree = current_tree;
4158+ read_lock_tree(tree);
4159+ /* first, check whether left neighbor is already in a @doing queue */
4160+ if (reiser4_carry_real(node)->left != NULL) {
4161+ /* NOTE: there is locking subtlety here. Look into
4162+ * find_right_neighbor() for more info */
4163+ if (find_carry_node(doing,
4164+ reiser4_carry_real(node)->left) != NULL) {
4165+ read_unlock_tree(tree);
4166+ left = node;
4167+ do {
4168+ left = list_entry(left->header.level_linkage.prev,
4169+ carry_node, header.level_linkage);
4170+ assert("nikita-3408", !carry_node_end(doing,
4171+ left));
4172+ } while (reiser4_carry_real(left) ==
4173+ reiser4_carry_real(node));
4174+ return left;
4175+ }
4176+ }
4177+ read_unlock_tree(tree);
4178+
4179+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4180+ if (IS_ERR(left))
4181+ return left;
4182+
4183+ left->node = node->node;
4184+ left->free = 1;
4185+
4186+ flags = GN_TRY_LOCK;
4187+ if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4188+ flags |= GN_NO_ALLOC;
4189+
4190+ /* then, feeling lucky, peek left neighbor in the cache. */
4191+ result = reiser4_get_left_neighbor(&left->lock_handle,
4192+ reiser4_carry_real(node),
4193+ ZNODE_WRITE_LOCK, flags);
4194+ if (result == 0) {
4195+ /* ok, node found and locked. */
4196+ result = lock_carry_node_tail(left);
4197+ if (result != 0)
4198+ left = ERR_PTR(result);
4199+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4200+ /* node is leftmost node in a tree, or neighbor wasn't in
4201+ cache, or there is an extent on the left. */
4202+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4203+ left = NULL;
4204+ } else if (doing->restartable) {
4205+ /* if left neighbor is locked, and level is restartable, add
4206+ new node to @doing and restart. */
4207+ assert("nikita-913", node->parent != 0);
4208+ assert("nikita-914", node->node != NULL);
4209+ left->left = 1;
4210+ left->free = 0;
4211+ left = ERR_PTR(-E_REPEAT);
4212+ } else {
4213+ /* left neighbor is locked, level cannot be restarted. Just
4214+ ignore left neighbor. */
4215+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4216+ left = NULL;
4217+ }
4218+ return left;
4219+}
4220+
4221+/* find right neighbor of a carry node
4222+
4223+ Look for right neighbor of @node and add it to the @doing queue. See
4224+ comments in the body.
4225+
4226+*/
4227+static carry_node *find_right_neighbor(carry_op * op /* operation whose node's
4228+ * right neighbor is sought */ ,
4229+ carry_level * doing /* level to scan */ )
4230+{
4231+ int result;
4232+ carry_node *node;
4233+ carry_node *right;
4234+ lock_handle lh;
4235+ int flags;
4236+ reiser4_tree *tree;
4237+
4238+ init_lh(&lh);
4239+
4240+ node = op->node;
4241+
4242+ tree = current_tree;
4243+ read_lock_tree(tree);
4244+ /* first, check whether right neighbor is already in a @doing queue */
4245+ if (reiser4_carry_real(node)->right != NULL) {
4246+ /*
4247+ * Tree lock is taken here anyway, because, even if _outcome_
4248+ * of (find_carry_node() != NULL) doesn't depend on
4249+ * concurrent updates to ->right, find_carry_node() cannot
4250+ * work with second argument NULL. Hence, the following comment
4251+ * is of historic importance only.
4252+ *
4253+ * Subtle:
4254+ *
4255+ * Q: why don't we need tree lock here, looking for the right
4256+ * neighbor?
4257+ *
4258+ * A: even if value of node->real_node->right were changed
4259+ * during find_carry_node() execution, outcome of execution
4260+ * wouldn't change, because (in short) other thread cannot add
4261+ * elements to the @doing, and if node->real_node->right
4262+ * already was in @doing, value of node->real_node->right
4263+ * couldn't change, because node cannot be inserted between
4264+ * locked neighbors.
4265+ */
4266+ if (find_carry_node(doing,
4267+ reiser4_carry_real(node)->right) != NULL) {
4268+ read_unlock_tree(tree);
4269+ /*
4270+ * What we are doing here (this is also applicable to
4271+ * the find_left_neighbor()).
4272+ *
4273+ * tree_walk.c code requires that insertion of a
4274+ * pointer to a child, modification of parent pointer
4275+ * in the child, and insertion of the child into
4276+ * sibling list are atomic (see
4277+ * plugin/item/internal.c:create_hook_internal()).
4278+ *
4279+ * carry allocates new node long before pointer to it
4280+ * is inserted into parent and, actually, long before
4281+ * parent is even known. Such allocated-but-orphaned
4282+ * nodes are only trackable through carry level lists.
4283+ *
4284+ * Situation that is handled here is following: @node
4285+ * has valid ->right pointer, but there is
4286+ * allocated-but-orphaned node in the carry queue that
4287+ * is logically between @node and @node->right. Here
4288+ * we are searching for it. Critical point is that
4289+ * this is only possible if @node->right is also in
4290+ * the carry queue (this is checked above), because
4291+ * this is the only way new orphaned node could be
4292+ * inserted between them (before inserting new node,
4293+ * make_space() first tries to shift to the right, so,
4294+ * right neighbor will be locked and queued).
4295+ *
4296+ */
4297+ right = node;
4298+ do {
4299+ right = list_entry(right->header.level_linkage.next,
4300+ carry_node, header.level_linkage);
4301+ assert("nikita-3408", !carry_node_end(doing,
4302+ right));
4303+ } while (reiser4_carry_real(right) ==
4304+ reiser4_carry_real(node));
4305+ return right;
4306+ }
4307+ }
4308+ read_unlock_tree(tree);
4309+
4310+ flags = GN_CAN_USE_UPPER_LEVELS;
4311+ if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4312+ flags = GN_NO_ALLOC;
4313+
4314+ /* then, try to lock right neighbor */
4315+ init_lh(&lh);
4316+ result = reiser4_get_right_neighbor(&lh,
4317+ reiser4_carry_real(node),
4318+ ZNODE_WRITE_LOCK, flags);
4319+ if (result == 0) {
4320+ /* ok, node found and locked. */
4321+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4322+ if (!IS_ERR(right)) {
4323+ right->node = lh.node;
4324+ move_lh(&right->lock_handle, &lh);
4325+ right->free = 1;
4326+ result = lock_carry_node_tail(right);
4327+ if (result != 0)
4328+ right = ERR_PTR(result);
4329+ }
4330+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4331+ /* node is rightmost node in a tree, or neighbor wasn't in
4332+ cache, or there is an extent on the right. */
4333+ right = NULL;
4334+ } else
4335+ right = ERR_PTR(result);
4336+ done_lh(&lh);
4337+ return right;
4338+}
4339+
4340+/* how much free space in a @node is needed for @op
4341+
4342+ How much space in @node is required for completion of @op, where @op is
4343+ insert or paste operation.
4344+*/
4345+static unsigned int space_needed_for_op(znode * node /* znode data are
4346+ * inserted or
4347+ * pasted in */ ,
4348+ carry_op * op /* carry
4349+ operation */ )
4350+{
4351+ assert("nikita-919", op != NULL);
4352+
4353+ switch (op->op) {
4354+ default:
4355+ impossible("nikita-1701", "Wrong opcode");
4356+ case COP_INSERT:
4357+ return space_needed(node, NULL, op->u.insert.d->data, 1);
4358+ case COP_PASTE:
4359+ return space_needed(node, op->u.insert.d->coord,
4360+ op->u.insert.d->data, 0);
4361+ }
4362+}
4363+
4364+/* how much space in @node is required to insert or paste @data at
4365+ @coord. */
4366+unsigned int space_needed(const znode * node /* node data are inserted or
4367+ * pasted in */ ,
4368+ const coord_t * coord /* coord where data are
4369+ * inserted or pasted
4370+ * at */ ,
4371+ const reiser4_item_data * data /* data to insert or
4372+ * paste */ ,
4373+ int insertion /* non-0 if inserting, 0 if pasting */ )
4374+{
4375+ int result;
4376+ item_plugin *iplug;
4377+
4378+ assert("nikita-917", node != NULL);
4379+ assert("nikita-918", node_plugin_by_node(node) != NULL);
4380+ assert("vs-230", !insertion || (coord == NULL));
4381+
4382+ result = 0;
4383+ iplug = data->iplug;
4384+ if (iplug->b.estimate != NULL) {
4385+ /* ask item plugin how much space is needed to insert this
4386+ item */
4387+ result += iplug->b.estimate(insertion ? NULL : coord, data);
4388+ } else {
4389+ /* reasonable default */
4390+ result += data->length;
4391+ }
4392+ if (insertion) {
4393+ node_plugin *nplug;
4394+
4395+ nplug = node->nplug;
4396+ /* and add node overhead */
4397+ if (nplug->item_overhead != NULL) {
4398+ result += nplug->item_overhead(node, NULL);
4399+ }
4400+ }
4401+ return result;
4402+}
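
/* Editor's note, not part of the patch: a worked example of the arithmetic
 * above, with hypothetical figures. If the item plugin estimates 100 bytes
 * for the new data and the node format charges a 16-byte per-item overhead,
 * then an insertion needs 100 + 16 = 116 bytes of free space, while pasting
 * the same 100 bytes into an existing item needs only 100, since no new
 * item header is created. */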
4403+
4404+/* find &coord in parent where pointer to new child is to be stored. */
4405+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4406+ * insert pointer to new
4407+ * child */ )
4408+{
4409+ int result;
4410+ znode *node;
4411+ znode *child;
4412+
4413+ assert("nikita-941", op != NULL);
4414+ assert("nikita-942", op->op == COP_INSERT);
4415+
4416+ node = reiser4_carry_real(op->node);
4417+ assert("nikita-943", node != NULL);
4418+ assert("nikita-944", node_plugin_by_node(node) != NULL);
4419+
4420+ child = reiser4_carry_real(op->u.insert.child);
4421+ result =
4422+ find_new_child_ptr(node, child, op->u.insert.brother,
4423+ op->u.insert.d->coord);
4424+
4425+ build_child_ptr_data(child, op->u.insert.d->data);
4426+ return result;
4427+}
4428+
4429+/* additional amount of free space in @node required to complete @op */
4430+static int free_space_shortage(znode * node /* node to check */ ,
4431+ carry_op * op /* operation being performed */ )
4432+{
4433+ assert("nikita-1061", node != NULL);
4434+ assert("nikita-1062", op != NULL);
4435+
4436+ switch (op->op) {
4437+ default:
4438+ impossible("nikita-1702", "Wrong opcode");
4439+ case COP_INSERT:
4440+ case COP_PASTE:
4441+ return space_needed_for_op(node, op) - znode_free_space(node);
4442+ case COP_EXTENT:
4443+ /* when inserting extent shift data around until insertion
4444+ point is utmost in the node. */
4445+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4446+ return +1;
4447+ else
4448+ return -1;
4449+ }
4450+}
4451+
4452+/* helper function: update node pointer in operation after insertion
4453+ point was probably shifted into @target. */
4454+static znode *sync_op(carry_op * op, carry_node * target)
4455+{
4456+ znode *insertion_node;
4457+
4458+ /* reget node from coord: shift might move insertion coord to
4459+ the neighbor */
4460+ insertion_node = op->u.insert.d->coord->node;
4461+ /* if insertion point was actually moved into new node,
4462+ update carry node pointer in operation. */
4463+ if (insertion_node != reiser4_carry_real(op->node)) {
4464+ op->node = target;
4465+ assert("nikita-2540",
4466+ reiser4_carry_real(target) == insertion_node);
4467+ }
4468+ assert("nikita-2541",
4469+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4470+ return insertion_node;
4471+}
4472+
4473+/*
4474+ * complete make_space() call: update tracked lock handle if necessary. See
4475+ * comments for fs/reiser4/carry.h:carry_track_type
4476+ */
4477+static int
4478+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4479+{
4480+ int result;
4481+ carry_track_type tracking;
4482+ znode *node;
4483+
4484+ tracking = doing->track_type;
4485+ node = op->u.insert.d->coord->node;
4486+
4487+ if (tracking == CARRY_TRACK_NODE ||
4488+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4489+ /* inserting or pasting into node different from
4490+ original. Update lock handle supplied by caller. */
4491+ assert("nikita-1417", doing->tracked != NULL);
4492+ done_lh(doing->tracked);
4493+ init_lh(doing->tracked);
4494+ result = longterm_lock_znode(doing->tracked, node,
4495+ ZNODE_WRITE_LOCK,
4496+ ZNODE_LOCK_HIPRI);
4497+ } else
4498+ result = 0;
4499+ return result;
4500+}
4501+
4502+/* This is the insertion policy function. It shifts data to the left and right
4503+ neighbors of insertion coord and allocates new nodes until there is enough
4504+ free space to complete @op.
4505+
4506+ See comments in the body.
4507+
4508+ Assumes that the node format favors insertions at the right end of the node
4509+ as node40 does.
4510+
4511+ See carry_flow() for details about flow insertion
4512+*/
4513+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4514+ carry_level * doing /* current carry queue */ ,
4515+ carry_level * todo /* carry queue on the parent level */ )
4516+{
4517+ znode *node;
4518+ int result;
4519+ int not_enough_space;
4520+ int blk_alloc;
4521+ znode *orig_node;
4522+ __u32 flags;
4523+
4524+ coord_t *coord;
4525+
4526+ assert("nikita-890", op != NULL);
4527+ assert("nikita-891", todo != NULL);
4528+ assert("nikita-892",
4529+ op->op == COP_INSERT ||
4530+ op->op == COP_PASTE || op->op == COP_EXTENT);
4531+ assert("nikita-1607",
4532+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4533+
4534+ flags = op->u.insert.flags;
4535+
4536+ /* NOTE: a new node can only be allocated after checking the left
4537+ * and right neighbors. This is necessary for proper work of
4538+ * find_{left,right}_neighbor(). */
4539+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4540+ flags & COPI_DONT_SHIFT_LEFT));
4541+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4542+ flags & COPI_DONT_SHIFT_RIGHT));
4543+
4544+ coord = op->u.insert.d->coord;
4545+ orig_node = node = coord->node;
4546+
4547+ assert("nikita-908", node != NULL);
4548+ assert("nikita-909", node_plugin_by_node(node) != NULL);
4549+
4550+ result = 0;
4551+ /* If there is not enough space in a node, try to shift something to
4552+ the left neighbor. This is a bit tricky, as locking to the left is
4553+ low priority. This is handled by restart logic in carry().
4554+ */
4555+ not_enough_space = free_space_shortage(node, op);
4556+ if (not_enough_space <= 0)
4557+ /* it is possible that carry was called when there actually
4558+ was enough space in the node. For example, when inserting
4559+ leftmost item so that delimiting keys have to be updated.
4560+ */
4561+ return make_space_tail(op, doing, orig_node);
4562+ if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4563+ carry_node *left;
4564+ /* make note in statistics of an attempt to move
4565+ something into the left neighbor */
4566+ left = find_left_neighbor(op, doing);
4567+ if (unlikely(IS_ERR(left))) {
4568+ if (PTR_ERR(left) == -E_REPEAT)
4569+ return -E_REPEAT;
4570+ else {
4571+ /* some error other than restart request
4572+ occurred. This shouldn't happen. Issue a
4573+ warning and continue as if left neighbor
4574+ weren't existing.
4575+ */
4576+ warning("nikita-924",
4577+ "Error accessing left neighbor: %li",
4578+ PTR_ERR(left));
4579+ }
4580+ } else if (left != NULL) {
4581+
4582+ /* shift everything possible on the left of and
4583+ including insertion coord into the left neighbor */
4584+ result = carry_shift_data(LEFT_SIDE, coord,
4585+ reiser4_carry_real(left),
4586+ doing, todo,
4587+ flags & COPI_GO_LEFT);
4588+
4589+ /* reget node from coord: shift_left() might move
4590+ insertion coord to the left neighbor */
4591+ node = sync_op(op, left);
4592+
4593+ not_enough_space = free_space_shortage(node, op);
4594+ /* There is not enough free space in @node, but
4595+ may be, there is enough free space in
4596+ @left. Various balancing decisions are valid here.
4597+ The same for the shifiting to the right.
4598+ */
4599+ }
4600+ }
4601+ /* If there still is not enough space, shift to the right */
4602+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4603+ carry_node *right;
4604+
4605+ right = find_right_neighbor(op, doing);
4606+ if (IS_ERR(right)) {
4607+ warning("nikita-1065",
4608+ "Error accessing right neighbor: %li",
4609+ PTR_ERR(right));
4610+ } else if (right != NULL) {
4611+ /* node containing insertion point, and its right
4612+ neighbor node are write locked by now.
4613+
4614+ shift everything possible on the right of but
4615+ excluding insertion coord into the right neighbor
4616+ */
4617+ result = carry_shift_data(RIGHT_SIDE, coord,
4618+ reiser4_carry_real(right),
4619+ doing, todo,
4620+ flags & COPI_GO_RIGHT);
4621+ /* reget node from coord: shift_right() might move
4622+ insertion coord to the right neighbor */
4623+ node = sync_op(op, right);
4624+ not_enough_space = free_space_shortage(node, op);
4625+ }
4626+ }
4627+ /* If there is still not enough space, allocate new node(s).
4628+
4629+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4630+ the carry operation flags (currently this is needed during flush
4631+ only).
4632+ */
4633+ for (blk_alloc = 0;
4634+ not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4635+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4636+ carry_node *fresh; /* new node we are allocating */
4637+ coord_t coord_shadow; /* remembered insertion point before
4638+ * shifting data into new node */
4639+ carry_node *node_shadow; /* remembered insertion node before
4640+ * shifting */
4641+ unsigned int gointo; /* whether insertion point should move
4642+ * into newly allocated node */
4643+
4644+ /* allocate new node on the right of @node. Znode and disk
4645+ fake block number for new node are allocated.
4646+
4647+ add_new_znode() posts carry operation COP_INSERT with
4648+ COPT_CHILD option to the parent level to add
4649+ pointer to newly created node to its parent.
4650+
4651+ Subtle point: if several new nodes are required to complete
4652+ insertion operation at this level, they will be inserted
4653+ into their parents in the order of creation, which means
4654+ that @node will be valid "cookie" at the time of insertion.
4655+
4656+ */
4657+ fresh = add_new_znode(node, op->node, doing, todo);
4658+ if (IS_ERR(fresh))
4659+ return PTR_ERR(fresh);
4660+
4661+ /* Try to shift into new node. */
4662+ result = lock_carry_node(doing, fresh);
4663+ zput(reiser4_carry_real(fresh));
4664+ if (result != 0) {
4665+ warning("nikita-947",
4666+ "Cannot lock new node: %i", result);
4667+ return result;
4668+ }
4669+
4670+ /* both nodes are write locked by now.
4671+
4672+ shift everything possible on the right of and
4673+ including insertion coord into the right neighbor.
4674+ */
4675+ coord_dup(&coord_shadow, op->u.insert.d->coord);
4676+ node_shadow = op->node;
4677+ /* move insertion point into newly created node if:
4678+
4679+ . insertion point is rightmost in the source node, or
4680+ . this is not the first node we are allocating in a row.
4681+ */
4682+ gointo =
4683+ (blk_alloc > 0) ||
4684+ coord_is_after_rightmost(op->u.insert.d->coord);
4685+
4686+ if (gointo &&
4687+ op->op == COP_PASTE &&
4688+ coord_is_existing_item(op->u.insert.d->coord) &&
4689+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4690+ /* paste into solid (atomic) item, which can contain
4691+ only one unit, so we need to shift it right, to where
4692+ the insertion point is supposed to be */
4693+
4694+ assert("edward-1444", op->u.insert.d->data->iplug ==
4695+ item_plugin_by_id(STATIC_STAT_DATA_ID));
4696+ assert("edward-1445",
4697+ op->u.insert.d->data->length >
4698+ node_plugin_by_node(coord->node)->free_space
4699+ (coord->node));
4700+
4701+ op->u.insert.d->coord->between = BEFORE_UNIT;
4702+ }
4703+
4704+ result = carry_shift_data(RIGHT_SIDE, coord,
4705+ reiser4_carry_real(fresh),
4706+ doing, todo, gointo);
4707+ /* if insertion point was actually moved into new node,
4708+ update carry node pointer in operation. */
4709+ node = sync_op(op, fresh);
4710+ not_enough_space = free_space_shortage(node, op);
4711+ if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4712+ /* there is not enough free space in the new node. Shift
4713+ the insertion point back to @node_shadow so that the
4714+ next new node would be inserted between
4715+ @node_shadow and @fresh.
4716+ */
4717+ coord_normalize(&coord_shadow);
4718+ coord_dup(coord, &coord_shadow);
4719+ node = coord->node;
4720+ op->node = node_shadow;
4721+ if (1 || (flags & COPI_STEP_BACK)) {
4722+ /* still not enough space?! Maybe there is
4723+ enough space in the source node (i.e., node
4724+ data are moved from) now.
4725+ */
4726+ not_enough_space =
4727+ free_space_shortage(node, op);
4728+ }
4729+ }
4730+ }
4731+ if (not_enough_space > 0) {
4732+ if (!(flags & COPI_DONT_ALLOCATE))
4733+ warning("nikita-948", "Cannot insert new item");
4734+ result = -E_NODE_FULL;
4735+ }
4736+ assert("nikita-1622", ergo(result == 0,
4737+ reiser4_carry_real(op->node) == coord->node));
4738+ assert("nikita-2616", coord == op->u.insert.d->coord);
4739+ if (result == 0)
4740+ result = make_space_tail(op, doing, orig_node);
4741+ return result;
4742+}
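
/* Editor's summary, not part of the patch, of the policy implemented by
 * make_space() above:
 *
 *	1. if the node is short of space, shift data into the left
 *	   neighbor (unless COPI_DONT_SHIFT_LEFT is set);
 *	2. if still short, shift data into the right neighbor (unless
 *	   COPI_DONT_SHIFT_RIGHT is set);
 *	3. if still short, allocate up to two new nodes on the right
 *	   (unless COPI_DONT_ALLOCATE is set);
 *	4. if space is still short, fail with -E_NODE_FULL.
 */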
4743+
4744+/* insert_paste_common() - common part of insert and paste operations
4745+
4746+ This function performs common part of COP_INSERT and COP_PASTE.
4747+
4748+ There are three ways in which insertion/paste can be requested:
4749+
4750+ . by directly supplying reiser4_item_data. In this case, op ->
4751+ u.insert.type is set to COPT_ITEM_DATA.
4752+
4753+ . by supplying a pointer to the child which is to be inserted into the
4754+ parent. In this case op -> u.insert.type == COPT_CHILD.
4755+
4756+ . by supplying the key of the new item/unit. This is currently only used
4757+ during extent insertion.
4758+
4759+ This is required, because when new node is allocated we don't know at what
4760+ position pointer to it is to be stored in the parent. Actually, we don't
4761+ even know what its parent will be, because parent can be re-balanced
4762+ concurrently and new node re-parented, and because parent can be full and
4763+ pointer to the new node will go into some other node.
4764+
4765+ insert_paste_common() resolves pointer to child node into position in the
4766+ parent by calling find_new_child_coord(), that fills
4767+ reiser4_item_data. After this, insertion/paste proceeds uniformly.
4768+
4769+ Another complication is with finding free space during pasting. It may
4770+ happen that while shifting items to the neighbors and newly allocated
4771+ nodes, the insertion coord may no longer be in the item we wanted to paste
4772+ into. At this point, paste becomes (morphs) into insert. Moreover, free
4773+ space analysis has to be repeated, because the amount of space required for
4774+ insertion is different from that of paste (item header overhead, etc).
4775+
4776+ This function "unifies" different insertion modes (by resolving child
4777+ pointer or key into insertion coord), and then calls make_space() to free
4778+ enough space in the node by shifting data to the left and right and by
4779+ allocating new nodes if necessary. Carry operation knows amount of space
4780+ required for its completion. After enough free space is obtained, caller of
4781+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4782+ by calling item plugin method.
4783+
4784+*/
4785+static int insert_paste_common(carry_op * op /* carry operation being
4786+ * performed */ ,
4787+ carry_level * doing /* current carry level */ ,
4788+ carry_level * todo /* next carry level */ ,
4789+ carry_insert_data * cdata /* pointer to
4790+ * cdata */ ,
4791+ coord_t * coord /* insertion/paste coord */ ,
4792+ reiser4_item_data * data /* data to be
4793+ * inserted/pasted */ )
4794+{
4795+ assert("nikita-981", op != NULL);
4796+ assert("nikita-980", todo != NULL);
4797+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4798+ || (op->op == COP_EXTENT));
4799+
4800+ if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4801+ /* nothing to do. Fall through to make_space(). */
4802+ ;
4803+ } else if (op->u.insert.type == COPT_KEY) {
4804+ node_search_result intra_node;
4805+ znode *node;
4806+ /* The problem with doing batching at the lowest level is that
4807+ operations here are given by coords where modification is
4808+ to be performed, and one modification can invalidate coords
4809+ of all following operations.
4810+
4811+ So, we are implementing yet another type for operation that
4812+ will use (the only) "locator" stable across shifting of
4813+ data between nodes, etc.: key (COPT_KEY).
4814+
4815+ This clause resolves key to the coord in the node.
4816+
4817+ But node can change also. Probably some pieces have to be
4818+ added to the lock_carry_node(), to lock node by its key.
4819+
4820+ */
4821+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4822+ if you need something else. */
4823+ op->u.insert.d->coord = coord;
4824+ node = reiser4_carry_real(op->node);
4825+ intra_node = node_plugin_by_node(node)->lookup
4826+ (node, op->u.insert.d->key, FIND_EXACT,
4827+ op->u.insert.d->coord);
4828+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4829+ warning("nikita-1715", "Intra node lookup failure: %i",
4830+ intra_node);
4831+ return intra_node;
4832+ }
4833+ } else if (op->u.insert.type == COPT_CHILD) {
4834+ /* if we are asked to insert pointer to the child into
4835+ internal node, first convert pointer to the child into
4836+ coord within parent node.
4837+ */
4838+ znode *child;
4839+ int result;
4840+
4841+ op->u.insert.d = cdata;
4842+ op->u.insert.d->coord = coord;
4843+ op->u.insert.d->data = data;
4844+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4845+ result = find_new_child_coord(op);
4846+ child = reiser4_carry_real(op->u.insert.child);
4847+ if (result != NS_NOT_FOUND) {
4848+ warning("nikita-993",
4849+ "Cannot find a place for child pointer: %i",
4850+ result);
4851+ return result;
4852+ }
4853+ /* This only happens when we did multiple insertions at
4854+ the previous level, trying to insert a single item, and
4855+ it so happened that insertion of pointers to all the new
4856+ nodes before this one already caused the parent node to
4857+ split (maybe several times).
4858+
4859+ I am going to come up with better solution.
4860+
4861+ You are not expected to understand this.
4862+ -- v6root/usr/sys/ken/slp.c
4863+
4864+ Basically, what happens here is the following: carry came
4865+ to the parent level and is about to insert internal item
4866+ pointing to the child node that it just inserted in the
4867+ level below. Position where internal item is to be inserted
4868+ was found by find_new_child_coord() above, but node of the
4869+ current carry operation (that is, parent node of child
4870+ inserted on the previous level), was determined earlier in
4871+ the lock_carry_level/lock_carry_node. It could so happen
4872+ that other carry operations already performed on the parent
4873+ level split the parent node, so that the insertion point
4874+ moved into another node. Handle this by creating new carry
4875+ node for insertion point if necessary.
4876+ */
4877+ if (reiser4_carry_real(op->node) !=
4878+ op->u.insert.d->coord->node) {
4879+ pool_ordering direction;
4880+ znode *z1;
4881+ znode *z2;
4882+ reiser4_key k1;
4883+ reiser4_key k2;
4884+
4885+ /*
4886+ * determine in what direction insertion point
4887+ * moved. Do this by comparing delimiting keys.
4888+ */
4889+ z1 = op->u.insert.d->coord->node;
4890+ z2 = reiser4_carry_real(op->node);
4891+ if (keyle(leftmost_key_in_node(z1, &k1),
4892+ leftmost_key_in_node(z2, &k2)))
4893+ /* insertion point moved to the left */
4894+ direction = POOLO_BEFORE;
4895+ else
4896+ /* insertion point moved to the right */
4897+ direction = POOLO_AFTER;
4898+
4899+ op->node = reiser4_add_carry_skip(doing,
4900+ direction, op->node);
4901+ if (IS_ERR(op->node))
4902+ return PTR_ERR(op->node);
4903+ op->node->node = op->u.insert.d->coord->node;
4904+ op->node->free = 1;
4905+ result = lock_carry_node(doing, op->node);
4906+ if (result != 0)
4907+ return result;
4908+ }
4909+
4910+ /*
4911+ * set up key of an item being inserted: we are inserting
4912+ * an internal item and its key is (by the very definition of
4913+ * a search tree) the leftmost key in the child node.
4914+ */
4915+ write_lock_dk(znode_get_tree(child));
4916+ op->u.insert.d->key = leftmost_key_in_node(child,
4917+ znode_get_ld_key(child));
4918+ write_unlock_dk(znode_get_tree(child));
4919+ op->u.insert.d->data->arg = op->u.insert.brother;
4920+ } else {
4921+ assert("vs-243", op->u.insert.d->coord != NULL);
4922+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4923+ }
4924+
4925+ /* find free space. */
4926+ return make_space(op, doing, todo);
4927+}
4928+
4929+/* handle carry COP_INSERT operation.
4930+
4931+   Insert a new item into a node. The new item can be given in one of two ways:
4932+
4933+   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4934+   only applicable at the leaf/twig level.
4935+
4936+   - by passing a child node, a pointer to which is to be inserted by this
4937+   operation.
4938+
4939+*/
4940+static int carry_insert(carry_op * op /* operation to perform */ ,
4941+ carry_level * doing /* queue of operations @op
4942+ * is part of */ ,
4943+ carry_level * todo /* queue where new operations
4944+ * are accumulated */ )
4945+{
4946+ znode *node;
4947+ carry_insert_data cdata;
4948+ coord_t coord;
4949+ reiser4_item_data data;
4950+ carry_plugin_info info;
4951+ int result;
4952+
4953+ assert("nikita-1036", op != NULL);
4954+ assert("nikita-1037", todo != NULL);
4955+ assert("nikita-1038", op->op == COP_INSERT);
4956+
4957+ coord_init_zero(&coord);
4958+
4959+ /* perform common functionality of insert and paste. */
4960+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
4961+ if (result != 0)
4962+ return result;
4963+
4964+ node = op->u.insert.d->coord->node;
4965+ assert("nikita-1039", node != NULL);
4966+ assert("nikita-1040", node_plugin_by_node(node) != NULL);
4967+
4968+ assert("nikita-949",
4969+ space_needed_for_op(node, op) <= znode_free_space(node));
4970+
4971+ /* ask node layout to create new item. */
4972+ info.doing = doing;
4973+ info.todo = todo;
4974+ result = node_plugin_by_node(node)->create_item
4975+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
4976+ &info);
4977+ doing->restartable = 0;
4978+ znode_make_dirty(node);
4979+
4980+ return result;
4981+}
4982+
4983+/*
4984+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
4985+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree
4986+ * by slicing into multiple items.
4987+ */
4988+
4989+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
4990+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
4991+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
4992+
4993+static size_t item_data_overhead(carry_op * op)
4994+{
4995+ if (flow_insert_data(op)->iplug->b.estimate == NULL)
4996+ return 0;
4997+ return (flow_insert_data(op)->iplug->b.
4998+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
4999+ flow_insert_data(op)->length);
5000+}
5001+
5002+/* FIXME-VS: this is called several times during one make_flow_for_insertion
5003+ and it will always return the same result. Some optimization could be made
5004+ by calculating this value once at the beginning and passing it around. That
5005+ would reduce some flexibility in future changes
5006+*/
5007+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5008+static size_t flow_insertion_overhead(carry_op * op)
5009+{
5010+ znode *node;
5011+ size_t insertion_overhead;
5012+
5013+ node = flow_insert_point(op)->node;
5014+ insertion_overhead = 0;
5015+ if (node->nplug->item_overhead &&
5016+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5017+ flow_insert_data(op)))
5018+ insertion_overhead =
5019+ node->nplug->item_overhead(node, NULL) +
5020+ item_data_overhead(op);
5021+ return insertion_overhead;
5022+}
5023+
5024+/* how many bytes of the flow fit into the node */
5025+static int what_can_fit_into_node(carry_op * op)
5026+{
5027+ size_t free, overhead;
5028+
5029+ overhead = flow_insertion_overhead(op);
5030+ free = znode_free_space(flow_insert_point(op)->node);
5031+ if (free <= overhead)
5032+ return 0;
5033+ free -= overhead;
5034+	/* FIXME: flow->length is loff_t only to avoid overflow in case of an expanding truncate */
5035+ if (free < op->u.insert_flow.flow->length)
5036+ return free;
5037+ return (int)op->u.insert_flow.flow->length;
5038+}
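/* Worked example (illustrative numbers, not taken from the source): if
   znode_free_space() reports 4096 bytes, flow_insertion_overhead() is 24
   bytes and flow->length is 10000, then 4096 - 24 = 4072 bytes of the flow
   fit and what_can_fit_into_node() returns 4072; with flow->length == 100
   it would return 100, i.e. the whole remaining flow. */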
5039+
5040+/* in make_space_for_flow_insertion we need to check either whether the whole
5041+   flow fits into a node or whether a minimal fraction of the flow does */
5042+static int enough_space_for_whole_flow(carry_op * op)
5043+{
5044+ return (unsigned)what_can_fit_into_node(op) ==
5045+ op->u.insert_flow.flow->length;
5046+}
5047+
5048+#define MIN_FLOW_FRACTION 1
5049+static int enough_space_for_min_flow_fraction(carry_op * op)
5050+{
5051+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5052+
5053+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5054+}
5055+
5056+/* this returns 0 if the left neighbor was obtained successfully, everything
5057+   up to and including the insertion point was shifted into it, and the left
5058+   neighbor still has free space to hold a minimal fraction of the flow */
5059+static int
5060+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5061+{
5062+ carry_node *left;
5063+ znode *orig;
5064+
5065+ left = find_left_neighbor(op, doing);
5066+ if (unlikely(IS_ERR(left))) {
5067+ warning("vs-899",
5068+ "make_space_by_shift_left: "
5069+ "error accessing left neighbor: %li", PTR_ERR(left));
5070+ return 1;
5071+ }
5072+ if (left == NULL)
5073+ /* left neighbor either does not exist or is unformatted
5074+ node */
5075+ return 1;
5076+
5077+ orig = flow_insert_point(op)->node;
5078+	/* try to shift the content of node @orig, from its head up to and
5079+	   including the insertion point, into the left neighbor */
5080+ carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5081+ reiser4_carry_real(left), doing, todo,
5082+ 1 /* including insert point */);
5083+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5084+ /* insertion point did not move */
5085+ return 1;
5086+ }
5087+
5088+ /* insertion point is set after last item in the node */
5089+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5090+
5091+ if (!enough_space_for_min_flow_fraction(op)) {
5092+ /* insertion point node does not have enough free space to put
5093+ even minimal portion of flow into it, therefore, move
5094+ insertion point back to orig node (before first item) */
5095+ coord_init_before_first_item(flow_insert_point(op), orig);
5096+ return 1;
5097+ }
5098+
5099+ /* part of flow is to be written to the end of node */
5100+ op->node = left;
5101+ return 0;
5102+}
5103+
5104+/* this returns 0 if the right neighbor was obtained successfully, everything
5105+   to the right of the insertion point was shifted to it, and the node got
5106+   enough free space to hold a minimal fraction of the flow */
5107+static int
5108+make_space_by_shift_right(carry_op * op, carry_level * doing,
5109+ carry_level * todo)
5110+{
5111+ carry_node *right;
5112+
5113+ right = find_right_neighbor(op, doing);
5114+ if (unlikely(IS_ERR(right))) {
5115+ warning("nikita-1065", "shift_right_excluding_insert_point: "
5116+ "error accessing right neighbor: %li", PTR_ERR(right));
5117+ return 1;
5118+ }
5119+ if (right) {
5120+		/* shift everything possible to the right of, but excluding,
5121+		   the insertion coord into the right neighbor */
5122+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5123+ reiser4_carry_real(right), doing, todo,
5124+ 0 /* not including insert point */);
5125+ } else {
5126+ /* right neighbor either does not exist or is unformatted
5127+ node */
5128+ ;
5129+ }
5130+ if (coord_is_after_rightmost(flow_insert_point(op))) {
5131+ if (enough_space_for_min_flow_fraction(op)) {
5132+ /* part of flow is to be written to the end of node */
5133+ return 0;
5134+ }
5135+ }
5136+
5137+ /* new node is to be added if insert point node did not get enough
5138+ space for whole flow */
5139+ return 1;
5140+}
5141+
5142+/* this returns 0 when the insert coord is set at the node end and a fraction
5143+   of the flow fits into that node */
5144+static int
5145+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5146+{
5147+ int result;
5148+ znode *node;
5149+ carry_node *new;
5150+
5151+ node = flow_insert_point(op)->node;
5152+
5153+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5154+ return RETERR(-E_NODE_FULL);
5155+ /* add new node after insert point node */
5156+ new = add_new_znode(node, op->node, doing, todo);
5157+ if (unlikely(IS_ERR(new))) {
5158+ return PTR_ERR(new);
5159+ }
5160+ result = lock_carry_node(doing, new);
5161+ zput(reiser4_carry_real(new));
5162+ if (unlikely(result)) {
5163+ return result;
5164+ }
5165+ op->u.insert_flow.new_nodes++;
5166+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
5167+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5168+ reiser4_carry_real(new), doing, todo,
5169+ 0 /* not including insert point */);
5170+ assert("vs-901",
5171+ coord_is_after_rightmost(flow_insert_point(op)));
5172+
5173+ if (enough_space_for_min_flow_fraction(op)) {
5174+ return 0;
5175+ }
5176+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5177+ return RETERR(-E_NODE_FULL);
5178+
5179+ /* add one more new node */
5180+ new = add_new_znode(node, op->node, doing, todo);
5181+ if (unlikely(IS_ERR(new))) {
5182+ return PTR_ERR(new);
5183+ }
5184+ result = lock_carry_node(doing, new);
5185+ zput(reiser4_carry_real(new));
5186+ if (unlikely(result)) {
5187+ return result;
5188+ }
5189+ op->u.insert_flow.new_nodes++;
5190+ }
5191+
5192+ /* move insertion point to new node */
5193+ coord_init_before_first_item(flow_insert_point(op),
5194+ reiser4_carry_real(new));
5195+ op->node = new;
5196+ return 0;
5197+}
5198+
5199+static int
5200+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5201+ carry_level * todo)
5202+{
5203+ __u32 flags = op->u.insert_flow.flags;
5204+
5205+ if (enough_space_for_whole_flow(op)) {
5206+ /* whole flow fits into insert point node */
5207+ return 0;
5208+ }
5209+
5210+ if (!(flags & COPI_DONT_SHIFT_LEFT)
5211+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
5212+		/* insert point is shifted to the left neighbor of the original
5213+		   insert point node and is set after the last unit in that node.
5214+		   It has enough space to fit at least a minimal fraction of the flow. */
5215+ return 0;
5216+ }
5217+
5218+ if (enough_space_for_whole_flow(op)) {
5219+ /* whole flow fits into insert point node */
5220+ return 0;
5221+ }
5222+
5223+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
5224+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
5225+ /* insert point is still set to the same node, but there is
5226+ nothing to the right of insert point. */
5227+ return 0;
5228+ }
5229+
5230+ if (enough_space_for_whole_flow(op)) {
5231+ /* whole flow fits into insert point node */
5232+ return 0;
5233+ }
5234+
5235+ return make_space_by_new_nodes(op, doing, todo);
5236+}
5237+
5238+/* implements COP_INSERT_FLOW operation */
5239+static int
5240+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5241+{
5242+ int result;
5243+ flow_t *f;
5244+ coord_t *insert_point;
5245+ node_plugin *nplug;
5246+ carry_plugin_info info;
5247+ znode *orig_node;
5248+ lock_handle *orig_lh;
5249+
5250+ f = op->u.insert_flow.flow;
5251+ result = 0;
5252+
5253+ /* carry system needs this to work */
5254+ info.doing = doing;
5255+ info.todo = todo;
5256+
5257+ orig_node = flow_insert_point(op)->node;
5258+ orig_lh = doing->tracked;
5259+
5260+ while (f->length) {
5261+ result = make_space_for_flow_insertion(op, doing, todo);
5262+ if (result)
5263+ break;
5264+
5265+ insert_point = flow_insert_point(op);
5266+ nplug = node_plugin_by_node(insert_point->node);
5267+
5268+ /* compose item data for insertion/pasting */
5269+ flow_insert_data(op)->data = f->data;
5270+ flow_insert_data(op)->length = what_can_fit_into_node(op);
5271+
5272+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5273+			/* insert point is set to an item of the file we are writing to and we have to append to it */
5274+ assert("vs-903", insert_point->between == AFTER_UNIT);
5275+ nplug->change_item_size(insert_point,
5276+ flow_insert_data(op)->length);
5277+ flow_insert_data(op)->iplug->b.paste(insert_point,
5278+ flow_insert_data
5279+ (op), &info);
5280+ } else {
5281+ /* new item must be inserted */
5282+ pos_in_node_t new_pos;
5283+ flow_insert_data(op)->length += item_data_overhead(op);
5284+
5285+ /* FIXME-VS: this is because node40_create_item changes
5286+ insert_point for obscure reasons */
5287+ switch (insert_point->between) {
5288+ case AFTER_ITEM:
5289+ new_pos = insert_point->item_pos + 1;
5290+ break;
5291+ case EMPTY_NODE:
5292+ new_pos = 0;
5293+ break;
5294+ case BEFORE_ITEM:
5295+ assert("vs-905", insert_point->item_pos == 0);
5296+ new_pos = 0;
5297+ break;
5298+ default:
5299+ impossible("vs-906",
5300+ "carry_insert_flow: invalid coord");
5301+ new_pos = 0;
5302+ break;
5303+ }
5304+
5305+ nplug->create_item(insert_point, &f->key,
5306+ flow_insert_data(op), &info);
5307+ coord_set_item_pos(insert_point, new_pos);
5308+ }
5309+ coord_init_after_item_end(insert_point);
5310+ doing->restartable = 0;
5311+ znode_make_dirty(insert_point->node);
5312+
5313+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5314+ }
5315+
5316+ if (orig_node != flow_insert_point(op)->node) {
5317+ /* move lock to new insert point */
5318+ done_lh(orig_lh);
5319+ init_lh(orig_lh);
5320+ result =
5321+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5322+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5323+ }
5324+
5325+ return result;
5326+}
5327+
5328+/* implements COP_DELETE operation
5329+
5330+   Remove the pointer to @op -> u.delete.child from its parent.
5331+
5332+   This function also handles killing of the tree root if the last pointer
5333+   was removed from it. This is complicated by our handling of the "twig"
5334+   level: a root at the twig level is never killed.
5335+
5336+*/
5337+static int carry_delete(carry_op * op /* operation to be performed */ ,
5338+ carry_level * doing UNUSED_ARG /* current carry
5339+ * level */ ,
5340+ carry_level * todo /* next carry level */ )
5341+{
5342+ int result;
5343+ coord_t coord;
5344+ coord_t coord2;
5345+ znode *parent;
5346+ znode *child;
5347+ carry_plugin_info info;
5348+ reiser4_tree *tree;
5349+
5350+ /*
5351+ * This operation is called to delete internal item pointing to the
5352+ * child node that was removed by carry from the tree on the previous
5353+ * tree level.
5354+ */
5355+
5356+ assert("nikita-893", op != NULL);
5357+ assert("nikita-894", todo != NULL);
5358+ assert("nikita-895", op->op == COP_DELETE);
5359+
5360+ coord_init_zero(&coord);
5361+ coord_init_zero(&coord2);
5362+
5363+ parent = reiser4_carry_real(op->node);
5364+ child = op->u.delete.child ?
5365+ reiser4_carry_real(op->u.delete.child) : op->node->node;
5366+ tree = znode_get_tree(child);
5367+ read_lock_tree(tree);
5368+
5369+ /*
5370+ * @parent was determined when carry entered parent level
5371+	 * (lock_carry_level/lock_carry_node). Since then, the actual parent of
5372+	 * the @child node could have changed due to other carry operations
5373+	 * performed on the parent level. Check for this.
5374+ */
5375+
5376+ if (znode_parent(child) != parent) {
5377+ /* NOTE-NIKITA add stat counter for this. */
5378+ parent = znode_parent(child);
5379+ assert("nikita-2581", find_carry_node(doing, parent));
5380+ }
5381+ read_unlock_tree(tree);
5382+
5383+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5384+
5385+	/* Twig level horrors: tree should be of height at least 2. So, the
5386+	   last pointer from the root at the twig level is preserved even if
5387+	   the child is empty. This is ugly, but that is how it was architected.
5388+ */
5389+
5390+ if (znode_is_root(parent) &&
5391+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5392+ node_num_items(parent) == 1) {
5393+ /* Delimiting key manipulations. */
5394+ write_lock_dk(tree);
5395+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5396+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5397+ ZF_SET(child, JNODE_DKSET);
5398+ write_unlock_dk(tree);
5399+
5400+ /* @child escaped imminent death! */
5401+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
5402+ return 0;
5403+ }
5404+
5405+ /* convert child pointer to the coord_t */
5406+ result = find_child_ptr(parent, child, &coord);
5407+ if (result != NS_FOUND) {
5408+ warning("nikita-994", "Cannot find child pointer: %i", result);
5409+ print_coord_content("coord", &coord);
5410+ return result;
5411+ }
5412+
5413+ coord_dup(&coord2, &coord);
5414+ info.doing = doing;
5415+ info.todo = todo;
5416+ {
5417+ /*
5418+ * Actually kill internal item: prepare structure with
5419+ * arguments for ->cut_and_kill() method...
5420+ */
5421+
5422+ struct carry_kill_data kdata;
5423+ kdata.params.from = &coord;
5424+ kdata.params.to = &coord2;
5425+ kdata.params.from_key = NULL;
5426+ kdata.params.to_key = NULL;
5427+ kdata.params.smallest_removed = NULL;
5428+ kdata.params.truncate = 1;
5429+ kdata.flags = op->u.delete.flags;
5430+ kdata.inode = NULL;
5431+ kdata.left = NULL;
5432+ kdata.right = NULL;
5433+ kdata.buf = NULL;
5434+ /* ... and call it. */
5435+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5436+ &info);
5437+ }
5438+ doing->restartable = 0;
5439+
5440+ /* check whether root should be killed violently */
5441+ if (znode_is_root(parent) &&
5442+ /* don't kill roots at and lower than twig level */
5443+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5444+ node_num_items(parent) == 1) {
5445+ result = reiser4_kill_tree_root(coord.node);
5446+ }
5447+
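	/* note on the GNU "?:" shorthand below: the expression evaluates to
	   the value of (result < 0) itself, i.e. to 1 when @result is
	   negative and to 0 otherwise; callers presumably only test this
	   return value for being nonzero */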
5448+ return result < 0 ? : 0;
5449+}
5450+
5451+/* implements COP_CUT operation
5452+
5453+   Cuts part or the whole content of a node.
5454+
5455+*/
5456+static int carry_cut(carry_op * op /* operation to be performed */ ,
5457+ carry_level * doing /* current carry level */ ,
5458+ carry_level * todo /* next carry level */ )
5459+{
5460+ int result;
5461+ carry_plugin_info info;
5462+ node_plugin *nplug;
5463+
5464+ assert("nikita-896", op != NULL);
5465+ assert("nikita-897", todo != NULL);
5466+ assert("nikita-898", op->op == COP_CUT);
5467+
5468+ info.doing = doing;
5469+ info.todo = todo;
5470+
5471+ nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5472+ if (op->u.cut_or_kill.is_cut)
5473+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5474+ else
5475+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5476+
5477+ doing->restartable = 0;
5478+ return result < 0 ? : 0;
5479+}
5480+
5481+/* helper function for carry_paste(): returns true if @op can be continued as
5482+ paste */
5483+static int
5484+can_paste(coord_t * icoord, const reiser4_key * key,
5485+ const reiser4_item_data * data)
5486+{
5487+ coord_t circa;
5488+ item_plugin *new_iplug;
5489+ item_plugin *old_iplug;
5490+ int result = 0; /* to keep gcc shut */
5491+
5492+ assert("", icoord->between != AT_UNIT);
5493+
5494+ /* obviously, one cannot paste when node is empty---there is nothing
5495+ to paste into. */
5496+ if (node_is_empty(icoord->node))
5497+ return 0;
5498+ /* if insertion point is at the middle of the item, then paste */
5499+ if (!coord_is_between_items(icoord))
5500+ return 1;
5501+ coord_dup(&circa, icoord);
5502+ circa.between = AT_UNIT;
5503+
5504+ old_iplug = item_plugin_by_coord(&circa);
5505+ new_iplug = data->iplug;
5506+
5507+ /* check whether we can paste to the item @icoord is "at" when we
5508+ ignore ->between field */
5509+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5510+ result = 1;
5511+ } else if (icoord->between == BEFORE_UNIT
5512+ || icoord->between == BEFORE_ITEM) {
5513+ /* otherwise, try to glue to the item at the left, if any */
5514+ coord_dup(&circa, icoord);
5515+ if (coord_set_to_left(&circa)) {
5516+ result = 0;
5517+ coord_init_before_item(icoord);
5518+ } else {
5519+ old_iplug = item_plugin_by_coord(&circa);
5520+ result = (old_iplug == new_iplug)
5521+ && item_can_contain_key(icoord, key, data);
5522+ if (result) {
5523+ coord_dup(icoord, &circa);
5524+ icoord->between = AFTER_UNIT;
5525+ }
5526+ }
5527+ } else if (icoord->between == AFTER_UNIT
5528+ || icoord->between == AFTER_ITEM) {
5529+ coord_dup(&circa, icoord);
5530+ /* otherwise, try to glue to the item at the right, if any */
5531+ if (coord_set_to_right(&circa)) {
5532+ result = 0;
5533+ coord_init_after_item(icoord);
5534+ } else {
5535+ int (*cck) (const coord_t *, const reiser4_key *,
5536+ const reiser4_item_data *);
5537+
5538+ old_iplug = item_plugin_by_coord(&circa);
5539+
5540+ cck = old_iplug->b.can_contain_key;
5541+ if (cck == NULL)
5542+ /* item doesn't define ->can_contain_key
5543+ method? So it is not expandable. */
5544+ result = 0;
5545+ else {
5546+ result = (old_iplug == new_iplug)
5547+ && cck(&circa /*icoord */ , key, data);
5548+ if (result) {
5549+ coord_dup(icoord, &circa);
5550+ icoord->between = BEFORE_UNIT;
5551+ }
5552+ }
5553+ }
5554+ } else
5555+ impossible("nikita-2513", "Nothing works");
5556+ if (result) {
5557+ if (icoord->between == BEFORE_ITEM) {
5558+ assert("vs-912", icoord->unit_pos == 0);
5559+ icoord->between = BEFORE_UNIT;
5560+ } else if (icoord->between == AFTER_ITEM) {
5561+ coord_init_after_item_end(icoord);
5562+ }
5563+ }
5564+ return result;
5565+}
5566+
5567+/* implements COP_PASTE operation
5568+
5569+   Paste data into an existing item. This is complicated by the fact that
5570+   after we shifted something to the left or right neighbors trying to free
5571+   some space, the item we were supposed to paste into can be in a different
5572+   node than the insertion coord. If so, we are no longer pasting but inserting. See
5573+ comments in insert_paste_common().
5574+
5575+*/
5576+static int carry_paste(carry_op * op /* operation to be performed */ ,
5577+ carry_level * doing UNUSED_ARG /* current carry
5578+ * level */ ,
5579+ carry_level * todo /* next carry level */ )
5580+{
5581+ znode *node;
5582+ carry_insert_data cdata;
5583+ coord_t dcoord;
5584+ reiser4_item_data data;
5585+ int result;
5586+ int real_size;
5587+ item_plugin *iplug;
5588+ carry_plugin_info info;
5589+ coord_t *coord;
5590+
5591+ assert("nikita-982", op != NULL);
5592+ assert("nikita-983", todo != NULL);
5593+ assert("nikita-984", op->op == COP_PASTE);
5594+
5595+ coord_init_zero(&dcoord);
5596+
5597+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5598+ if (result != 0)
5599+ return result;
5600+
5601+ coord = op->u.insert.d->coord;
5602+
5603+	/* handle the case when op -> u.insert.coord doesn't point to an item
5604+	   of the required type. Restart as insert. */
5605+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5606+ op->op = COP_INSERT;
5607+ op->u.insert.type = COPT_PASTE_RESTARTED;
5608+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5609+
5610+ return result;
5611+ }
5612+
5613+ node = coord->node;
5614+ iplug = item_plugin_by_coord(coord);
5615+ assert("nikita-992", iplug != NULL);
5616+
5617+ assert("nikita-985", node != NULL);
5618+ assert("nikita-986", node_plugin_by_node(node) != NULL);
5619+
5620+ assert("nikita-987",
5621+ space_needed_for_op(node, op) <= znode_free_space(node));
5622+
5623+ assert("nikita-1286", coord_is_existing_item(coord));
5624+
5625+ /*
5626+	 * if the item is expanded as a result of this operation, we should
5627+	 * first change the item size, then call the ->b.paste item method. If
5628+	 * the item is shrunk, it should be done the other way around: first
5629+	 * call the ->b.paste method, then reduce the item size.
5630+ */
5631+
5632+ real_size = space_needed_for_op(node, op);
5633+ if (real_size > 0)
5634+ node->nplug->change_item_size(coord, real_size);
5635+
5636+ doing->restartable = 0;
5637+ info.doing = doing;
5638+ info.todo = todo;
5639+
5640+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5641+
5642+ if (real_size < 0)
5643+ node->nplug->change_item_size(coord, real_size);
5644+
5645+ /* if we pasted at the beginning of the item, update item's key. */
5646+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5647+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5648+
5649+ znode_make_dirty(node);
5650+ return result;
5651+}
5652+
5653+/* handle carry COP_EXTENT operation. */
5654+static int carry_extent(carry_op * op /* operation to perform */ ,
5655+ carry_level * doing /* queue of operations @op
5656+ * is part of */ ,
5657+ carry_level * todo /* queue where new operations
5658+ * are accumulated */ )
5659+{
5660+ znode *node;
5661+ carry_insert_data cdata;
5662+ coord_t coord;
5663+ reiser4_item_data data;
5664+ carry_op *delete_dummy;
5665+ carry_op *insert_extent;
5666+ int result;
5667+ carry_plugin_info info;
5668+
5669+ assert("nikita-1751", op != NULL);
5670+ assert("nikita-1752", todo != NULL);
5671+ assert("nikita-1753", op->op == COP_EXTENT);
5672+
5673+ /* extent insertion overview:
5674+
5675+ extents live on the TWIG LEVEL, which is level one above the leaf
5676+	   one. This complicates extent insertion logic somewhat: it may
5677+	   happen (and is going to happen all the time) that in logical key
5678+	   ordering an extent has to be placed between items I1 and I2, located
5679+	   at the leaf level, but I1 and I2 are in the same formatted leaf
5680+	   node N1. To insert an extent one has to
5681+
5682+	   (1) reach node N1 and shift data between N1, its neighbors and
5683+	   possibly newly allocated nodes until I1 and I2 fall into different
5684+	   nodes. Since I1 and I2 are still neighboring items in logical key
5685+	   order, they will necessarily be the utmost items in their respective
5686+	   nodes.
5687+
5688+	   (2) After this, the new extent item is inserted into a node on the
5689+	   twig level.
5690+
5691+	   Fortunately this process can reuse almost all code from the standard
5692+	   insertion procedure (viz. make_space() and insert_paste_common()),
5693+	   due to the following observation: make_space() only shifts data up
5694+	   to (excluding or including) the insertion point. It never
5695+	   "over-moves" through the insertion point. Thus, one can use
5696+	   make_space() to perform step (1). All that is required is to
5697+	   instruct free_space_shortage() to keep make_space() shifting data
5698+	   until the insertion point is at the node border.
5699+
5700+ */
5701+
5702+ /* perform common functionality of insert and paste. */
5703+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5704+ if (result != 0)
5705+ return result;
5706+
5707+ node = op->u.extent.d->coord->node;
5708+ assert("nikita-1754", node != NULL);
5709+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
5710+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5711+
5712+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5713+ extent fits between items. */
5714+
5715+ info.doing = doing;
5716+ info.todo = todo;
5717+
5718+	/* there is another complication due to placement of extents on the
5719+	   twig level: extents are "rigid" in the sense that the key-range
5720+	   occupied by an extent cannot grow indefinitely to the right as it
5721+	   can for formatted leaf nodes. Because of this, when search finds two
5722+	   adjacent extents on the twig level, it has to "drill" down to the
5723+	   leaf level, creating a new node. Here we are removing this node.
5724+ */
5725+ if (node_is_empty(node)) {
5726+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5727+ if (IS_ERR(delete_dummy))
5728+ return PTR_ERR(delete_dummy);
5729+ delete_dummy->u.delete.child = NULL;
5730+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5731+ ZF_SET(node, JNODE_HEARD_BANSHEE);
5732+ }
5733+
5734+ /* proceed with inserting extent item into parent. We are definitely
5735+ inserting rather than pasting if we get that far. */
5736+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5737+ if (IS_ERR(insert_extent))
5738+ /* @delete_dummy will be automatically destroyed on the level
5739+ exiting */
5740+ return PTR_ERR(insert_extent);
5741+	/* NOTE-NIKITA insertion by key is the simplest option here. Another
5742+	   possibility is to insert on the left or right of an already existing
5743+	   item.
5744+ */
5745+ insert_extent->u.insert.type = COPT_KEY;
5746+ insert_extent->u.insert.d = op->u.extent.d;
5747+ assert("nikita-1719", op->u.extent.d->key != NULL);
5748+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5749+ insert_extent->u.insert.flags =
5750+ znode_get_tree(node)->carry.new_extent_flags;
5751+
5752+ /*
5753+	 * if carry was asked to track a lock handle we should actually track
5754+	 * the lock handle on the twig node rather than on the leaf where the
5755+	 * operation started. Transfer the tracked lock handle.
5756+ */
5757+ if (doing->track_type) {
5758+ assert("nikita-3242", doing->tracked != NULL);
5759+ assert("nikita-3244", todo->tracked == NULL);
5760+ todo->tracked = doing->tracked;
5761+ todo->track_type = CARRY_TRACK_NODE;
5762+ doing->tracked = NULL;
5763+ doing->track_type = 0;
5764+ }
5765+
5766+ return 0;
5767+}
5768+
5769+/* update key in @parent between pointers to @left and @right.
5770+
5771+   Find coords of @left and @right and update the delimiting key between them.
5772+   This is a helper function called by carry_update(). Finds the position of
5773+   the internal item involved. Updates the item key. Updates the delimiting
5774+   keys of the child nodes involved.
5775+*/
5776+static int update_delimiting_key(znode * parent /* node key is updated
5777+ * in */ ,
5778+ znode * left /* child of @parent */ ,
5779+ znode * right /* child of @parent */ ,
5780+ carry_level * doing /* current carry
5781+ * level */ ,
5782+ carry_level * todo /* parent carry
5783+ * level */ ,
5784+ const char **error_msg /* place to
5785+ * store error
5786+ * message */ )
5787+{
5788+ coord_t left_pos;
5789+ coord_t right_pos;
5790+ int result;
5791+ reiser4_key ldkey;
5792+ carry_plugin_info info;
5793+
5794+ assert("nikita-1177", right != NULL);
5795+	/* find position of the right child in the parent */
5796+ result = find_child_ptr(parent, right, &right_pos);
5797+ if (result != NS_FOUND) {
5798+ *error_msg = "Cannot find position of right child";
5799+ return result;
5800+ }
5801+
5802+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5803+ /* find position of the left child in a parent */
5804+ result = find_child_ptr(parent, left, &left_pos);
5805+ if (result != NS_FOUND) {
5806+ *error_msg = "Cannot find position of left child";
5807+ return result;
5808+ }
5809+ assert("nikita-1355", left_pos.node != NULL);
5810+ } else
5811+ left_pos.node = NULL;
5812+
5813+ /* check that they are separated by exactly one key and are basically
5814+ sane */
5815+ if (REISER4_DEBUG) {
5816+ if ((left_pos.node != NULL)
5817+ && !coord_is_existing_unit(&left_pos)) {
5818+ *error_msg = "Left child is bastard";
5819+ return RETERR(-EIO);
5820+ }
5821+ if (!coord_is_existing_unit(&right_pos)) {
5822+ *error_msg = "Right child is bastard";
5823+ return RETERR(-EIO);
5824+ }
5825+ if (left_pos.node != NULL &&
5826+ !coord_are_neighbors(&left_pos, &right_pos)) {
5827+ *error_msg = "Children are not direct siblings";
5828+ return RETERR(-EIO);
5829+ }
5830+ }
5831+ *error_msg = NULL;
5832+
5833+ info.doing = doing;
5834+ info.todo = todo;
5835+
5836+ /*
5837+	 * If the child node is not empty, the new key of the internal item is
5838+	 * the key of the leftmost item in the child node. If the child is
5839+	 * empty, take its right delimiting key as the new key of the internal
5840+	 * item. The precise key in the latter case is not important per se,
5841+	 * because the child (and the internal item) are going to be killed
5842+	 * shortly anyway, but we have to preserve correct key order in the parent node.
5843+ */
5844+
5845+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5846+ leftmost_key_in_node(right, &ldkey);
5847+ else {
5848+ read_lock_dk(znode_get_tree(parent));
5849+ ldkey = *znode_get_rd_key(right);
5850+ read_unlock_dk(znode_get_tree(parent));
5851+ }
5852+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5853+ doing->restartable = 0;
5854+ znode_make_dirty(parent);
5855+ return 0;
5856+}
5857+
5858+/* implements COP_UPDATE operation
5859+
5860+ Update delimiting keys.
5861+
5862+*/
5863+static int carry_update(carry_op * op /* operation to be performed */ ,
5864+ carry_level * doing /* current carry level */ ,
5865+ carry_level * todo /* next carry level */ )
5866+{
5867+ int result;
5868+ carry_node *missing UNUSED_ARG;
5869+ znode *left;
5870+ znode *right;
5871+ carry_node *lchild;
5872+ carry_node *rchild;
5873+ const char *error_msg;
5874+ reiser4_tree *tree;
5875+
5876+ /*
5877+	 * This operation is called to update the key of an internal item. This
5878+	 * is necessary when carry shifted or cut data on the child
5879+	 * level. Arguments of this operation are:
5880+	 *
5881+	 * @right --- child node. The operation should update the key of the
5882+	 * internal item pointing to @right.
5883+ *
5884+ * @left --- left neighbor of @right. This parameter is optional.
5885+ */
5886+
5887+ assert("nikita-902", op != NULL);
5888+ assert("nikita-903", todo != NULL);
5889+ assert("nikita-904", op->op == COP_UPDATE);
5890+
5891+ lchild = op->u.update.left;
5892+ rchild = op->node;
5893+
5894+ if (lchild != NULL) {
5895+ assert("nikita-1001", lchild->parent);
5896+ assert("nikita-1003", !lchild->left);
5897+ left = reiser4_carry_real(lchild);
5898+ } else
5899+ left = NULL;
5900+
5901+ tree = znode_get_tree(rchild->node);
5902+ read_lock_tree(tree);
5903+ right = znode_parent(rchild->node);
5904+ read_unlock_tree(tree);
5905+
5906+ if (right != NULL) {
5907+ result = update_delimiting_key(right,
5908+ lchild ? lchild->node : NULL,
5909+ rchild->node,
5910+ doing, todo, &error_msg);
5911+ } else {
5912+ error_msg = "Cannot find node to update key in";
5913+ result = RETERR(-EIO);
5914+ }
5915+ /* operation will be reposted to the next level by the
5916+ ->update_item_key() method of node plugin, if necessary. */
5917+
5918+ if (result != 0) {
5919+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
5920+ error_msg ? : "", result);
5921+ }
5922+ return result;
5923+}
5924+
5925+/* move items into @node during carry */
5926+static int carry_shift_data(sideof side /* in what direction to move data */ ,
5927+			    coord_t * insert_coord /* coord where new item
5928+						    * is to be inserted */ ,
5929+			    znode * node /* node which data are moved to */ ,
5930+ carry_level * doing /* active carry queue */ ,
5931+ carry_level * todo /* carry queue where new
5932+ * operations are to be put
5933+ * in */ ,
5934+			    unsigned int including_insert_coord_p /* true if
5935+								    * @insert_coord
5936+								    * can be moved */ )
5937+{
5938+ int result;
5939+ znode *source;
5940+ carry_plugin_info info;
5941+ node_plugin *nplug;
5942+
5943+ source = insert_coord->node;
5944+
5945+ info.doing = doing;
5946+ info.todo = todo;
5947+
5948+ nplug = node_plugin_by_node(node);
5949+ result = nplug->shift(insert_coord, node,
5950+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5951+ (int)including_insert_coord_p, &info);
5952+ /* the only error ->shift() method of node plugin can return is
5953+ -ENOMEM due to carry node/operation allocation. */
5954+ assert("nikita-915", result >= 0 || result == -ENOMEM);
5955+ if (result > 0) {
5956+ /*
5957+ * if some number of bytes was actually shifted, mark nodes
5958+ * dirty, and carry level as non-restartable.
5959+ */
5960+ doing->restartable = 0;
5961+ znode_make_dirty(source);
5962+ znode_make_dirty(node);
5963+ }
5964+
5965+ assert("nikita-2077", coord_check(insert_coord));
5966+ return 0;
5967+}
5968+
5969+typedef carry_node *(*carry_iterator) (carry_node * node);
5970+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
5971+ carry_iterator iterator);
5972+
5973+static carry_node *pool_level_list_prev(carry_node *node)
5974+{
5975+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
5976+}
5977+
5978+/* look for the left neighbor of given carry node in a carry queue.
5979+
5980+ This is used by find_left_neighbor(), but I am not sure that this
5981+ really gives any advantage. More statistics required.
5982+
5983+*/
5984+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
5985+ * of */ ,
5986+ carry_level * level /* level to scan */ )
5987+{
5988+ return find_dir_carry(node, level,
5989+ (carry_iterator) pool_level_list_prev);
5990+}
5991+
5992+static carry_node *pool_level_list_next(carry_node *node)
5993+{
5994+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
5995+}
5996+
5997+/* look for the right neighbor of given carry node in a
5998+ carry queue.
5999+
6000+ This is used by find_right_neighbor(), but I am not sure that this
6001+ really gives any advantage. More statistics required.
6002+
6003+*/
6004+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6005+ * of */ ,
6006+ carry_level * level /* level to scan */ )
6007+{
6008+ return find_dir_carry(node, level,
6009+ (carry_iterator) pool_level_list_next);
6010+}
6011+
6012+/* look for the left or right neighbor of given carry node in a carry
6013+ queue.
6014+
6015+ Helper function used by find_{left|right}_carry().
6016+*/
6017+static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6018+ * from */ ,
6019+ carry_level * level /* level to scan */ ,
6020+ carry_iterator iterator /* operation to
6021+ * move to the next
6022+ * node */ )
6023+{
6024+ carry_node *neighbor;
6025+
6026+ assert("nikita-1059", node != NULL);
6027+ assert("nikita-1060", level != NULL);
6028+
6029+	/* scan the list of carry nodes on this level dir-ward, skipping all
6030+	   carry nodes referencing the same znode. */
6031+ neighbor = node;
6032+ while (1) {
6033+ neighbor = iterator(neighbor);
6034+ if (carry_node_end(level, neighbor))
6035+ /* list head is reached */
6036+ return NULL;
6037+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6038+ return neighbor;
6039+ }
6040+}
6041+
6042+/*
6043+ * Memory reservation estimation.
6044+ *
6045+ * The carry process proceeds upwards through tree levels. Carry assumes that
6046+ * it takes the tree in a consistent state (e.g., that search tree invariants
6047+ * hold), and leaves the tree consistent when it finishes. This means that when some
6048+ * error occurs carry cannot simply return if there are pending carry
6049+ * operations. Generic solution for this problem is carry-undo either as
6050+ * transaction manager feature (requiring checkpoints and isolation), or
6051+ * through some carry specific mechanism.
6052+ *
6053+ * Our current approach is to panic if carry hits an error while tree is
6054+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6055+ * this "memory reservation" mechanism was added.
6056+ *
6057+ * Memory reservation is implemented by perthread-pages.diff patch from
6058+ * core-patches. Its API is defined in <linux/gfp.h>
6059+ *
6060+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6061+ * void perthread_pages_release(int nrpages);
6062+ * int perthread_pages_count(void);
6063+ *
6064+ * carry estimates its worst case memory requirements at entry, reserves
6065+ * enough memory, and releases unused pages before returning.
6066+ *
6067+ * The code below estimates worst case memory requirements for a given carry
6068+ * queue. This is done by summing worst case memory requirements for each
6069+ * operation in the queue.
6070+ *
6071+ */
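/*
 * A minimal sketch (not part of this file) of how the reservation cycle
 * described above could look at a carry entry point, assuming a hypothetical
 * estimate_queue() helper that sums the per-operation ->estimate() results:
 *
 *	int nrpages = estimate_queue(doing);
 *	if (perthread_pages_reserve(nrpages, GFP_KERNEL) != 0)
 *		return RETERR(-ENOMEM);
 *	result = process_queue(doing, todo);	(hypothetical worker)
 *	perthread_pages_release(perthread_pages_count());
 *
 * releasing perthread_pages_count() pages at the end returns whatever was
 * reserved but never consumed.
 */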
6072+
6073+/*
6074+ * Memory requirements of many operations depend on the tree height. For
6075+ * example, item insertion requires a new node to be inserted at each tree
6076+ * level in the worst case. What tree height should be used for
6077+ * estimation? The current tree height is wrong, because the tree height can
6078+ * change between the time when estimation was done and the time when the
6079+ * operation is actually performed. The maximal possible tree height
6080+ * (REISER4_MAX_ZTREE_HEIGHT) is also undesirable, because it would lead to
6081+ * huge over-estimation all the time. A plausible solution is "capped tree
6082+ * height": if the current tree height is less than some TREE_HEIGHT_CAP
6083+ * constant, the capped tree height is TREE_HEIGHT_CAP, otherwise it's the
6084+ * current tree height. The idea behind this is that if the tree height is
6085+ * TREE_HEIGHT_CAP or larger, it's extremely unlikely to grow further soon.
6086+ */
6087+#define TREE_HEIGHT_CAP (5)
6088+
6089+/* return capped tree height for the @tree. See comment above. */
6090+static int cap_tree_height(reiser4_tree * tree)
6091+{
6092+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
6093+}
6094+
6095+/* return capped tree height for the current tree. */
6096+static int capped_height(void)
6097+{
6098+ return cap_tree_height(current_tree);
6099+}
6100+
6101+/* return number of pages required to store given number of bytes */
6102+static int bytes_to_pages(int bytes)
6103+{
6104+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6105+}
6106+
6107+/* how many pages are required to allocate znodes during item insertion. */
6108+static int carry_estimate_znodes(void)
6109+{
6110+ /*
6111+	 * Note that we have a problem here: there is no way to
6112+ * reserve pages specifically for the given slab. This means that
6113+ * these pages can be hijacked for some other end.
6114+ */
6115+
6116+	/* in the worst case we need 3 new znodes on each tree level */
6117+ return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6118+}
6119+
6120+/*
6121+ * how many pages are required to load bitmaps. One bitmap per level.
6122+ */
6123+static int carry_estimate_bitmaps(void)
6124+{
6125+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6126+ int bytes;
6127+
6128+		bytes = capped_height() * (0 + /* bnode should be added, but it is private to
6129+ * bitmap.c, skip for now. */
6130+ 2 * sizeof(jnode)); /* working and commit jnodes */
6131+ return bytes_to_pages(bytes) + 2; /* and their contents */
6132+ } else
6133+ /* bitmaps were pre-loaded during mount */
6134+ return 0;
6135+}
6136+
6137+/* worst case item insertion memory requirements */
6138+static int carry_estimate_insert(carry_op * op, carry_level * level)
6139+{
6140+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6141+ capped_height() + /* new block on each level */
6142+ 1 + /* and possibly extra new block at the leaf level */
6143+ 3; /* loading of leaves into memory */
6144+}
6145+
6146+/* worst case item deletion memory requirements */
6147+static int carry_estimate_delete(carry_op * op, carry_level * level)
6148+{
6149+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6150+ 3; /* loading of leaves into memory */
6151+}
6152+
6153+/* worst case tree cut memory requirements */
6154+static int carry_estimate_cut(carry_op * op, carry_level * level)
6155+{
6156+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6157+ 3; /* loading of leaves into memory */
6158+}
6159+
6160+/* worst case memory requirements of pasting into item */
6161+static int carry_estimate_paste(carry_op * op, carry_level * level)
6162+{
6163+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6164+ capped_height() + /* new block on each level */
6165+ 1 + /* and possibly extra new block at the leaf level */
6166+ 3; /* loading of leaves into memory */
6167+}
6168+
6169+/* worst case memory requirements of extent insertion */
6170+static int carry_estimate_extent(carry_op * op, carry_level * level)
6171+{
6172+ return carry_estimate_insert(op, level) + /* insert extent */
6173+ carry_estimate_delete(op, level); /* kill leaf */
6174+}
6175+
6176+/* worst case memory requirements of key update */
6177+static int carry_estimate_update(carry_op * op, carry_level * level)
6178+{
6179+ return 0;
6180+}
6181+
6182+/* worst case memory requirements of flow insertion */
6183+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6184+{
6185+ int newnodes;
6186+
6187+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6188+ CARRY_FLOW_NEW_NODES_LIMIT);
6189+ /*
6190+ * roughly estimate insert_flow as a sequence of insertions.
6191+ */
6192+ return newnodes * carry_estimate_insert(op, level);
6193+}
6194+
6195+/* This is the dispatch table for carry operations. It can be trivially
6196+   abstracted into a useful plugin: a tunable balancing policy is a good
6197+   thing. */
6198+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6199+ [COP_INSERT] = {
6200+ .handler = carry_insert,
6201+ .estimate = carry_estimate_insert}
6202+ ,
6203+ [COP_DELETE] = {
6204+ .handler = carry_delete,
6205+ .estimate = carry_estimate_delete}
6206+ ,
6207+ [COP_CUT] = {
6208+ .handler = carry_cut,
6209+ .estimate = carry_estimate_cut}
6210+ ,
6211+ [COP_PASTE] = {
6212+ .handler = carry_paste,
6213+ .estimate = carry_estimate_paste}
6214+ ,
6215+ [COP_EXTENT] = {
6216+ .handler = carry_extent,
6217+ .estimate = carry_estimate_extent}
6218+ ,
6219+ [COP_UPDATE] = {
6220+ .handler = carry_update,
6221+ .estimate = carry_estimate_update}
6222+ ,
6223+ [COP_INSERT_FLOW] = {
6224+ .handler = carry_insert_flow,
6225+ .estimate = carry_estimate_insert_flow}
6226+};
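/*
 * Sketch of intended use (the actual call sites live in carry.c, outside
 * this hunk): the carry core presumably dispatches each queued operation as
 *
 *	result = op_dispatch_table[op->op].handler(op, doing, todo);
 *
 * and sums op_dispatch_table[op->op].estimate(op, level) over the queue when
 * reserving memory.
 */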
6227+
6228+/* Make Linus happy.
6229+ Local variables:
6230+ c-indentation-style: "K&R"
6231+ mode-name: "LC"
6232+ c-basic-offset: 8
6233+ tab-width: 8
6234+ fill-column: 120
6235+ scroll-step: 1
6236+ End:
6237+*/
6238diff -urN linux-2.6.20.orig/fs/reiser4/carry_ops.h linux-2.6.20/fs/reiser4/carry_ops.h
6239--- linux-2.6.20.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6240+++ linux-2.6.20/fs/reiser4/carry_ops.h 2007-05-06 14:50:43.694974475 +0400
6241@@ -0,0 +1,42 @@
6242+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6243+
6244+/* implementation of carry operations. See carry_ops.c for details. */
6245+
6246+#if !defined( __CARRY_OPS_H__ )
6247+#define __CARRY_OPS_H__
6248+
6249+#include "forward.h"
6250+#include "znode.h"
6251+#include "carry.h"
6252+
6253+/* carry operation handlers */
6254+typedef struct carry_op_handler {
6255+ /* perform operation */
6256+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6257+ /* estimate memory requirements for @op */
6258+ int (*estimate) (carry_op * op, carry_level * level);
6259+} carry_op_handler;
6260+
6261+/* This is the dispatch table for carry operations. It can be trivially
6262+   abstracted into a useful plugin: a tunable balancing policy is a good
6263+   thing. */
6264+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6265+
6266+unsigned int space_needed(const znode * node, const coord_t * coord,
6267+ const reiser4_item_data * data, int inserting);
6268+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6269+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6270+
6271+/* __CARRY_OPS_H__ */
6272+#endif
6273+
6274+/* Make Linus happy.
6275+ Local variables:
6276+ c-indentation-style: "K&R"
6277+ mode-name: "LC"
6278+ c-basic-offset: 8
6279+ tab-width: 8
6280+ fill-column: 120
6281+ scroll-step: 1
6282+ End:
6283+*/
6284diff -urN linux-2.6.20.orig/fs/reiser4/context.c linux-2.6.20/fs/reiser4/context.c
6285--- linux-2.6.20.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6286+++ linux-2.6.20/fs/reiser4/context.c 2007-05-06 14:50:43.694974475 +0400
6287@@ -0,0 +1,288 @@
6288+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6289+
6290+/* Manipulation of reiser4_context */
6291+
6292+/*
6293+ * global context used during system call. Variable of this type is allocated
6294+ * on the stack at the beginning of the reiser4 part of the system call and
6295+ * pointer to it is stored in the current->fs_context. This allows us to avoid
6296+ * passing pointer to current transaction and current lockstack (both in
6297+ * one-to-one mapping with threads) all over the call chain.
6298+ *
6299+ * It's kind of like those global variables the prof used to tell you not to
6300+ * use in CS1, except thread specific. ;-) Nikita, this was a good idea.
6301+ *
6302+ * In some situations it is desirable to have the ability to enter reiser4_context
6303+ * more than once for the same thread (nested contexts). For example, there
6304+ * are some functions that can be called either directly from VFS/VM or from
6305+ * already active reiser4 context (->writepage, for example).
6306+ *
6307+ * In such situations the "child" context acts like a dummy: all activity is
6308+ * actually performed in the top level context, and get_current_context()
6309+ * always returns top level context.
6310+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6311+ * nested any way.
6312+ *
6313+ * Note that there is an important difference between the way reiser4 uses
6314+ * ->fs_context and the way other file systems use it. Other file systems
6315+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6316+ * (this is why ->fs_context was initially called ->journal_info). This means,
6317+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6318+ * to the file system, they assume that some transaction is already underway,
6319+ * and usually bail out, because starting nested transaction would most likely
6320+ * lead to the deadlock. This gives false positives with reiser4, because we
6321+ * set ->fs_context before starting transaction.
6322+ */
6323+
6324+#include "debug.h"
6325+#include "super.h"
6326+#include "context.h"
6327+
6328+#include <linux/writeback.h> /* balance_dirty_pages() */
6329+#include <linux/hardirq.h>
6330+
6331+static void _reiser4_init_context(reiser4_context * context,
6332+ struct super_block *super)
6333+{
6334+ memset(context, 0, sizeof(*context));
6335+
6336+ context->super = super;
6337+ context->magic = context_magic;
6338+ context->outer = current->journal_info;
6339+ current->journal_info = (void *)context;
6340+ context->nr_children = 0;
6341+ context->gfp_mask = GFP_KERNEL;
6342+
6343+ init_lock_stack(&context->stack);
6344+
6345+ reiser4_txn_begin(context);
6346+
6347+ /* initialize head of tap list */
6348+ INIT_LIST_HEAD(&context->taps);
6349+#if REISER4_DEBUG
6350+ context->task = current;
6351+#endif
6352+ grab_space_enable();
6353+}
6354+
6355+/* initialize context and bind it to the current thread
6356+
6357+ This function should be called at the beginning of reiser4 part of
6358+ syscall.
6359+*/
6360+reiser4_context * reiser4_init_context(struct super_block * super)
6361+{
6362+ reiser4_context *context;
6363+
6364+ assert("nikita-2662", !in_interrupt() && !in_irq());
6365+ assert("nikita-3357", super != NULL);
6366+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6367+
6368+ context = get_current_context_check();
6369+ if (context && context->super == super) {
6370+ context = (reiser4_context *) current->journal_info;
6371+ context->nr_children++;
6372+ return context;
6373+ }
6374+
6375+ context = kmalloc(sizeof(*context), GFP_KERNEL);
6376+ if (context == NULL)
6377+ return ERR_PTR(RETERR(-ENOMEM));
6378+
6379+ _reiser4_init_context(context, super);
6380+ return context;
6381+}
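/*
 * Typical usage at a reiser4 entry point, sketched from the functions in this
 * file (error handling abbreviated):
 *
 *	reiser4_context *ctx = reiser4_init_context(super);
 *	if (IS_ERR(ctx))
 *		return PTR_ERR(ctx);
 *	... reiser4 work: take locks, use the transaction handle ...
 *	reiser4_exit_context(ctx);
 */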
6382+
6383+/* this is used in scan_mgr, which is called with a spinlock held, and in
6384+   the reiser4_fill_super magic */
6385+void init_stack_context(reiser4_context *context, struct super_block *super)
6386+{
6387+ assert("nikita-2662", !in_interrupt() && !in_irq());
6388+ assert("nikita-3357", super != NULL);
6389+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6390+ assert("vs-12", !is_in_reiser4_context());
6391+
6392+ _reiser4_init_context(context, super);
6393+ context->on_stack = 1;
6394+ return;
6395+}
6396+
6397+/* cast lock stack embedded into reiser4 context up to its container */
6398+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6399+{
6400+ return container_of(owner, reiser4_context, stack);
6401+}
6402+
6403+/* true if there is already _any_ reiser4 context for the current thread */
6404+int is_in_reiser4_context(void)
6405+{
6406+ reiser4_context *ctx;
6407+
6408+ ctx = current->journal_info;
6409+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6410+}
6411+
6412+/*
6413+ * call balance dirty pages for the current context.
6414+ *
6415+ * The file system is expected to call balance_dirty_pages_ratelimited()
6416+ * whenever it dirties a page. reiser4 does this for unformatted nodes (that is,
6417+ * during write---this covers the vast majority of all dirty traffic), but we cannot do
6418+ * this immediately when formatted node is dirtied, because long term lock is
6419+ * usually held at that time. To work around this, dirtying of formatted node
6420+ * simply increases ->nr_marked_dirty counter in the current reiser4
6421+ * context. When we are about to leave this context,
6422+ * balance_dirty_pages_ratelimited() is called, if necessary.
6423+ *
6424+ * This introduces another problem: sometimes we do not want to run
6425+ * balance_dirty_pages_ratelimited() when leaving a context, for example
6426+ * because some important lock (like ->i_mutex on the parent directory) is
6427+ * held. To achieve this, ->nobalance flag can be set in the current context.
6428+ */
6429+static void balance_dirty_pages_at(reiser4_context *context)
6430+{
6431+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
6432+
6433+ /*
6434+ * call balance_dirty_pages_ratelimited() to process formatted nodes
6435+ * dirtied during this system call. Do that only if we are not in mount
6436+ * and there were nodes dirtied in this context and we are not in
6437+ * writepage (to avoid deadlock) and not in pdflush
6438+ */
6439+ if (sbinfo != NULL && sbinfo->fake != NULL &&
6440+ context->nr_marked_dirty != 0 &&
6441+ !(current->flags & PF_MEMALLOC) &&
6442+ !current_is_pdflush())
6443+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6444+}
6445+
6446+/* release resources associated with context.
6447+
6448+ This function should be called at the end of "session" with reiser4,
6449+ typically just before leaving reiser4 driver back to VFS.
6450+
6451+   This is a good place to put some debugging consistency checks, like that
6452+   the thread released all locks and closed its transcrash, etc.
6453+
6454+*/
6455+static void reiser4_done_context(reiser4_context * context /* context being released */ )
6456+{
6457+ assert("nikita-860", context != NULL);
6458+ assert("nikita-859", context->magic == context_magic);
6459+ assert("vs-646", (reiser4_context *) current->journal_info == context);
6460+ assert("zam-686", !in_interrupt() && !in_irq());
6461+
6462+ /* only do anything when leaving top-level reiser4 context. All nested
6463+ * contexts are just dummies. */
6464+ if (context->nr_children == 0) {
6465+ assert("jmacd-673", context->trans == NULL);
6466+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
6467+ assert("nikita-1936", reiser4_no_counters_are_held());
6468+ assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6469+ assert("zam-1004", ergo(get_super_private(context->super),
6470+ get_super_private(context->super)->delete_mutex_owner !=
6471+ current));
6472+
6473+ /* release all grabbed but as yet unused blocks */
6474+ if (context->grabbed_blocks != 0)
6475+ all_grabbed2free();
6476+
6477+ /*
6478+ * synchronize against longterm_unlock_znode():
6479+ * wake_up_requestor() wakes up requestors without holding
6480+ * zlock (otherwise they will immediately bump into that lock
6481+ * after wake up on another CPU). To work around (rare)
6482+ * situation where requestor has been woken up asynchronously
6483+ * and managed to run until completion (and destroy its
6484+ * context and lock stack) before wake_up_requestor() called
6485+		 * wake_up() on it, wake_up_requestor() synchronizes on the lock
6486+ * stack spin lock. It has actually been observed that spin
6487+ * lock _was_ locked at this point, because
6488+ * wake_up_requestor() took interrupt.
6489+ */
6490+ spin_lock_stack(&context->stack);
6491+ spin_unlock_stack(&context->stack);
6492+
6493+ assert("zam-684", context->nr_children == 0);
6494+ /* restore original ->fs_context value */
6495+ current->journal_info = context->outer;
6496+ if (context->on_stack == 0)
6497+ kfree(context);
6498+ } else {
6499+ context->nr_children--;
6500+#if REISER4_DEBUG
6501+ assert("zam-685", context->nr_children >= 0);
6502+#endif
6503+ }
6504+}
6505+
6506+/*
6507+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6508+ * transaction. Call done_context() to do context related book-keeping.
6509+ */
6510+void reiser4_exit_context(reiser4_context * context)
6511+{
6512+ assert("nikita-3021", reiser4_schedulable());
6513+
6514+ if (context->nr_children == 0) {
6515+ if (!context->nobalance) {
6516+ reiser4_txn_restart(context);
6517+ balance_dirty_pages_at(context);
6518+ }
6519+
6520+		/* if the filesystem is mounted with -o sync or -o dirsync, commit the
6521+		 transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6522+		 committing on exit_context when the inode semaphore is held, and
6523+		 to have ktxnmgrd do the commit instead for better
6524+		 concurrent filesystem access. But when one mounts with -o
6525+		 sync, one cares more about reliability than about
6526+		 performance. So, for now we have this simple mount -o sync
6527+		 support. */
6528+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6529+ txn_atom *atom;
6530+
6531+ atom = get_current_atom_locked_nocheck();
6532+ if (atom) {
6533+ atom->flags |= ATOM_FORCE_COMMIT;
6534+ context->trans->flags &= ~TXNH_DONT_COMMIT;
6535+ spin_unlock_atom(atom);
6536+ }
6537+ }
6538+ reiser4_txn_end(context);
6539+ }
6540+ reiser4_done_context(context);
6541+}
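+
+/*
+ * Editorial sketch, not part of the original patch: the expected pairing of
+ * reiser4_init_context() and reiser4_exit_context() around a reiser4 entry
+ * point. example_entry() is hypothetical; the IS_ERR/PTR_ERR handling
+ * assumes reiser4_init_context() returns an ERR_PTR on failure.
+ */
+#if 0
+static int example_entry(struct super_block *sb)
+{
+	reiser4_context *ctx;
+
+	ctx = reiser4_init_context(sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	/* ... tree operations run under this context ... */
+	reiser4_exit_context(ctx);
+	return 0;
+}
+#endif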
6542+
6543+void reiser4_ctx_gfp_mask_set(void)
6544+{
6545+ reiser4_context *ctx;
6546+
6547+ ctx = get_current_context();
6548+ if (ctx->entd == 0 &&
6549+ list_empty(&ctx->stack.locks) &&
6550+ ctx->trans->atom == NULL)
6551+ ctx->gfp_mask = GFP_KERNEL;
6552+ else
6553+ ctx->gfp_mask = GFP_NOFS;
6554+}
6555+
6556+void reiser4_ctx_gfp_mask_force (gfp_t mask)
6557+{
6558+ reiser4_context *ctx;
6559+ ctx = get_current_context();
6560+
6561+ assert("edward-1454", ctx != NULL);
6562+
6563+ ctx->gfp_mask = mask;
6564+}
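+
+/*
+ * Editorial sketch, not part of the original patch: allocations made inside
+ * a reiser4 context are expected to use the per-context mask, so that
+ * GFP_NOFS is honored once the thread holds locks or has an atom open.
+ * example_alloc() is hypothetical.
+ */
+#if 0
+static void *example_alloc(size_t size)
+{
+	return kmalloc(size, reiser4_ctx_gfp_mask_get());
+}
+#endif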
6565+
6566+/*
6567+ * Local variables:
6568+ * c-indentation-style: "K&R"
6569+ * mode-name: "LC"
6570+ * c-basic-offset: 8
6571+ * tab-width: 8
6572+ * fill-column: 120
6573+ * scroll-step: 1
6574+ * End:
6575+ */
6576diff -urN linux-2.6.20.orig/fs/reiser4/context.h linux-2.6.20/fs/reiser4/context.h
6577--- linux-2.6.20.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6578+++ linux-2.6.20/fs/reiser4/context.h 2007-05-06 14:50:43.698975725 +0400
6579@@ -0,0 +1,228 @@
6580+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6581+ * reiser4/README */
6582+
6583+/* Reiser4 context. See context.c for details. */
6584+
6585+#if !defined( __REISER4_CONTEXT_H__ )
6586+#define __REISER4_CONTEXT_H__
6587+
6588+#include "forward.h"
6589+#include "debug.h"
6590+#include "dformat.h"
6591+#include "tap.h"
6592+#include "lock.h"
6593+
6594+#include <linux/types.h> /* for __u?? */
6595+#include <linux/fs.h> /* for struct super_block */
6596+#include <linux/spinlock.h>
6597+#include <linux/sched.h> /* for struct task_struct */
6598+
6599+/* reiser4 per-thread context */
6600+struct reiser4_context {
6601+ /* magic constant. For identification of reiser4 contexts. */
6602+ __u32 magic;
6603+
6604+ /* current lock stack. See lock.[ch]. This is where list of all
6605+ locks taken by current thread is kept. This is also used in
6606+ deadlock detection. */
6607+ lock_stack stack;
6608+
6609+ /* current transcrash. */
6610+ txn_handle *trans;
6611+ /* transaction handle embedded into reiser4_context. ->trans points
6612+ * here by default. */
6613+ txn_handle trans_in_ctx;
6614+
6615+ /* super block we are working with. To get the current tree
6616+ use &get_super_private (reiser4_get_current_sb ())->tree. */
6617+ struct super_block *super;
6618+
6619+ /* parent fs activation */
6620+ struct fs_activation *outer;
6621+
6622+ /* per-thread grabbed (for further allocation) blocks counter */
6623+ reiser4_block_nr grabbed_blocks;
6624+
6625+ /* list of taps currently monitored. See tap.c */
6626+ struct list_head taps;
6627+
6628+ /* grabbing space is enabled */
6629+ unsigned int grab_enabled:1;
6630+	/* should be set when we are writing dirty nodes to disk in jnode_flush or
6631+ * reiser4_write_logs() */
6632+ unsigned int writeout_mode:1;
6633+ /* true, if current thread is an ent thread */
6634+ unsigned int entd:1;
6635+ /* true, if balance_dirty_pages() should not be run when leaving this
6636+	 * context. This is used to avoid a lengthy balance_dirty_pages()
6637+ * operation when holding some important resource, like directory
6638+ * ->i_mutex */
6639+ unsigned int nobalance:1;
6640+
6641+	/* this bit is used in reiser4_done_context to decide whether the context
6642+	   was kmalloc-ed and has to be kfree-ed */
6643+ unsigned int on_stack:1;
6644+
6645+ /* count non-trivial jnode_set_dirty() calls */
6646+ unsigned long nr_marked_dirty;
6647+
6648+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6649+	 * reiser4_writepages for each dirty inode. reiser4_writepages
6650+	 * captures pages. When the number of pages captured in one
6651+	 * reiser4_sync_inodes call reaches some threshold, some atoms get
6652+	 * flushed */
6653+ int nr_captured;
6654+ int nr_children; /* number of child contexts */
6655+#if REISER4_DEBUG
6656+ /* debugging information about reiser4 locks held by the current
6657+ * thread */
6658+ reiser4_lock_counters_info locks;
6659+ struct task_struct *task; /* so we can easily find owner of the stack */
6660+
6661+ /*
6662+ * disk space grabbing debugging support
6663+ */
6664+ /* how many disk blocks were grabbed by the first call to
6665+ * reiser4_grab_space() in this context */
6666+ reiser4_block_nr grabbed_initially;
6667+
6668+ /* list of all threads doing flush currently */
6669+ struct list_head flushers_link;
6670+ /* information about last error encountered by reiser4 */
6671+ err_site err;
6672+#endif
6673+ void *vp;
6674+ gfp_t gfp_mask;
6675+};
6676+
6677+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6678+
6679+/* Debugging helpers. */
6680+#if REISER4_DEBUG
6681+extern void print_contexts(void);
6682+#endif
6683+
6684+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6685+#define current_blocksize reiser4_get_current_sb()->s_blocksize
6686+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6687+
6688+extern reiser4_context *reiser4_init_context(struct super_block *);
6689+extern void init_stack_context(reiser4_context *, struct super_block *);
6690+extern void reiser4_exit_context(reiser4_context *);
6691+
6692+/* magic constant we store in a reiser4_context allocated on the stack. Used to
6693+   catch accesses to stale or uninitialized contexts. */
6694+#define context_magic ((__u32) 0x4b1b5d0b)
6695+
6696+extern int is_in_reiser4_context(void);
6697+
6698+/*
6699+ * return reiser4_context for the thread @tsk
6700+ */
6701+static inline reiser4_context *get_context(const struct task_struct *tsk)
6702+{
6703+ assert("vs-1682",
6704+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6705+ return (reiser4_context *) tsk->journal_info;
6706+}
6707+
6708+/*
6709+ * return reiser4 context of the current thread, or NULL if there is none.
6710+ */
6711+static inline reiser4_context *get_current_context_check(void)
6712+{
6713+ if (is_in_reiser4_context())
6714+ return get_context(current);
6715+ else
6716+ return NULL;
6717+}
6718+
6719+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6720+
6721+/* return context associated with current thread */
6722+static inline reiser4_context *get_current_context(void)
6723+{
6724+ return get_context(current);
6725+}
6726+
6727+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6728+{
6729+ reiser4_context *ctx;
6730+
6731+ ctx = get_current_context_check();
6732+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6733+}
6734+
6735+void reiser4_ctx_gfp_mask_set(void);
6736+void reiser4_ctx_gfp_mask_force (gfp_t mask);
6737+
6738+/*
6739+ * true if current thread is in the write-out mode. Thread enters write-out
6740+ * mode during jnode_flush and reiser4_write_logs().
6741+ */
6742+static inline int is_writeout_mode(void)
6743+{
6744+ return get_current_context()->writeout_mode;
6745+}
6746+
6747+/*
6748+ * enter write-out mode
6749+ */
6750+static inline void writeout_mode_enable(void)
6751+{
6752+ assert("zam-941", !get_current_context()->writeout_mode);
6753+ get_current_context()->writeout_mode = 1;
6754+}
6755+
6756+/*
6757+ * leave write-out mode
6758+ */
6759+static inline void writeout_mode_disable(void)
6760+{
6761+ assert("zam-942", get_current_context()->writeout_mode);
6762+ get_current_context()->writeout_mode = 0;
6763+}
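+
+/*
+ * Editorial sketch, not part of the original patch: write-out mode is meant
+ * to bracket the actual writing of dirty nodes, as in jnode_flush() and
+ * reiser4_write_logs(). A hypothetical caller:
+ *
+ *	writeout_mode_enable();
+ *	... submit dirty nodes to disk ...
+ *	writeout_mode_disable();
+ */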
6764+
6765+static inline void grab_space_enable(void)
6766+{
6767+ get_current_context()->grab_enabled = 1;
6768+}
6769+
6770+static inline void grab_space_disable(void)
6771+{
6772+ get_current_context()->grab_enabled = 0;
6773+}
6774+
6775+static inline void grab_space_set_enabled(int enabled)
6776+{
6777+ get_current_context()->grab_enabled = enabled;
6778+}
6779+
6780+static inline int is_grab_enabled(reiser4_context * ctx)
6781+{
6782+ return ctx->grab_enabled;
6783+}
6784+
6785+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6786+ * flush would be performed when it is closed. This is necessary when handle
6787+ * has to be closed under some coarse semaphore, like i_mutex of
6788+ * directory. Commit will be performed by ktxnmgrd. */
6789+static inline void context_set_commit_async(reiser4_context * context)
6790+{
6791+ context->nobalance = 1;
6792+ context->trans->flags |= TXNH_DONT_COMMIT;
6793+}
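+
+/*
+ * Editorial sketch, not part of the original patch: a hypothetical caller
+ * that must close its transaction while holding a directory ->i_mutex
+ * defers the commit to ktxnmgrd:
+ *
+ *	context_set_commit_async(ctx);
+ *	reiser4_exit_context(ctx);
+ *
+ * No commit or flush happens on exit; ktxnmgrd commits the atom later.
+ */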
6794+
6795+/* __REISER4_CONTEXT_H__ */
6796+#endif
6797+
6798+/* Make Linus happy.
6799+ Local variables:
6800+ c-indentation-style: "K&R"
6801+ mode-name: "LC"
6802+ c-basic-offset: 8
6803+ tab-width: 8
6804+ fill-column: 120
6805+ scroll-step: 1
6806+ End:
6807+*/
6808diff -urN linux-2.6.20.orig/fs/reiser4/coord.c linux-2.6.20/fs/reiser4/coord.c
6809--- linux-2.6.20.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6810+++ linux-2.6.20/fs/reiser4/coord.c 2007-05-06 14:50:43.698975725 +0400
6811@@ -0,0 +1,935 @@
6812+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6813+
6814+#include "forward.h"
6815+#include "debug.h"
6816+#include "dformat.h"
6817+#include "tree.h"
6818+#include "plugin/item/item.h"
6819+#include "znode.h"
6820+#include "coord.h"
6821+
6822+/* Internal constructor. */
6823+static inline void
6824+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6825+ pos_in_node_t unit_pos, between_enum between)
6826+{
6827+ coord->node = (znode *) node;
6828+ coord_set_item_pos(coord, item_pos);
6829+ coord->unit_pos = unit_pos;
6830+ coord->between = between;
6831+ ON_DEBUG(coord->plug_v = 0);
6832+ ON_DEBUG(coord->body_v = 0);
6833+
6834+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6835+}
6836+
6837+/* after node content has been shifted, a coord that was previously set
6838+   properly may become invalid; try to "normalize" it. */
6839+void coord_normalize(coord_t * coord)
6840+{
6841+ znode *node;
6842+
6843+ node = coord->node;
6844+ assert("vs-683", node);
6845+
6846+ coord_clear_iplug(coord);
6847+
6848+ if (node_is_empty(node)) {
6849+ coord_init_first_unit(coord, node);
6850+ } else if ((coord->between == AFTER_ITEM)
6851+ || (coord->between == AFTER_UNIT)) {
6852+ return;
6853+ } else if (coord->item_pos == coord_num_items(coord)
6854+ && coord->between == BEFORE_ITEM) {
6855+ coord_dec_item_pos(coord);
6856+ coord->between = AFTER_ITEM;
6857+ } else if (coord->unit_pos == coord_num_units(coord)
6858+ && coord->between == BEFORE_UNIT) {
6859+ coord->unit_pos--;
6860+ coord->between = AFTER_UNIT;
6861+ } else if (coord->item_pos == coord_num_items(coord)
6862+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6863+ coord_dec_item_pos(coord);
6864+ coord->unit_pos = 0;
6865+ coord->between = AFTER_ITEM;
6866+ }
6867+}
6868+
6869+/* Copy a coordinate. */
6870+void coord_dup(coord_t * coord, const coord_t * old_coord)
6871+{
6872+ assert("jmacd-9800", coord_check(old_coord));
6873+ coord_dup_nocheck(coord, old_coord);
6874+}
6875+
6876+/* Copy a coordinate without check. Useful when old_coord->node is not
6877+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6878+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6879+{
6880+ coord->node = old_coord->node;
6881+ coord_set_item_pos(coord, old_coord->item_pos);
6882+ coord->unit_pos = old_coord->unit_pos;
6883+ coord->between = old_coord->between;
6884+ coord->iplugid = old_coord->iplugid;
6885+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
6886+ ON_DEBUG(coord->body_v = old_coord->body_v);
6887+}
6888+
6889+/* Initialize an invalid coordinate. */
6890+void coord_init_invalid(coord_t * coord, const znode * node)
6891+{
6892+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
6893+}
6894+
6895+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6896+{
6897+ coord_init_values(coord, node, 0, 0, AT_UNIT);
6898+}
6899+
6900+/* Initialize a coordinate to point at the first unit of the first item. If the node is
6901+ empty, it is positioned at the EMPTY_NODE. */
6902+void coord_init_first_unit(coord_t * coord, const znode * node)
6903+{
6904+ int is_empty = node_is_empty(node);
6905+
6906+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6907+
6908+ assert("jmacd-9801", coord_check(coord));
6909+}
6910+
6911+/* Initialize a coordinate to point at the last unit of the last item. If the node is
6912+ empty, it is positioned at the EMPTY_NODE. */
6913+void coord_init_last_unit(coord_t * coord, const znode * node)
6914+{
6915+ int is_empty = node_is_empty(node);
6916+
6917+ coord_init_values(coord, node,
6918+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6919+ (is_empty ? EMPTY_NODE : AT_UNIT));
6920+ if (!is_empty)
6921+ coord->unit_pos = coord_last_unit_pos(coord);
6922+ assert("jmacd-9802", coord_check(coord));
6923+}
6924+
6925+/* Initialize a coordinate to before the first item. If the node is empty, it is
6926+ positioned at the EMPTY_NODE. */
6927+void coord_init_before_first_item(coord_t * coord, const znode * node)
6928+{
6929+ int is_empty = node_is_empty(node);
6930+
6931+ coord_init_values(coord, node, 0, 0,
6932+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6933+
6934+ assert("jmacd-9803", coord_check(coord));
6935+}
6936+
6937+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6938+ at the EMPTY_NODE. */
6939+void coord_init_after_last_item(coord_t * coord, const znode * node)
6940+{
6941+ int is_empty = node_is_empty(node);
6942+
6943+ coord_init_values(coord, node,
6944+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6945+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
6946+
6947+ assert("jmacd-9804", coord_check(coord));
6948+}
6949+
6950+/* Initialize a coordinate to after the last unit in the item. Coord must
6951+   already be set to an existing item */
6952+void coord_init_after_item_end(coord_t * coord)
6953+{
6954+ coord->between = AFTER_UNIT;
6955+ coord->unit_pos = coord_last_unit_pos(coord);
6956+}
6957+
6958+/* Initialize a coordinate to before the item. Coord must already be set to an existing item */
6959+void coord_init_before_item(coord_t * coord)
6960+{
6961+ coord->unit_pos = 0;
6962+ coord->between = BEFORE_ITEM;
6963+}
6964+
6965+/* Initialize a coordinate to after the item. Coord must already be set to an existing item */
6966+void coord_init_after_item(coord_t * coord)
6967+{
6968+ coord->unit_pos = 0;
6969+ coord->between = AFTER_ITEM;
6970+}
6971+
6972+/* Initialize a coordinate with zeros. Used in places where init_coord was
6973+   used and it was not clear how the coordinate should actually be set */
6974+void coord_init_zero(coord_t * coord)
6975+{
6976+ memset(coord, 0, sizeof(*coord));
6977+}
6978+
6979+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
6980+unsigned coord_num_units(const coord_t * coord)
6981+{
6982+ assert("jmacd-9806", coord_is_existing_item(coord));
6983+
6984+ return item_plugin_by_coord(coord)->b.nr_units(coord);
6985+}
6986+
6987+/* Returns true if the coord was initialized by coord_init_invalid(). */
6988+/* Audited by: green(2002.06.15) */
6989+int coord_is_invalid(const coord_t * coord)
6990+{
6991+ return coord->between == INVALID_COORD;
6992+}
6993+
6994+/* Returns true if the coordinate is positioned at an existing item, not before or after
6995+ an item. It may be placed at, before, or after any unit within the item, whether
6996+ existing or not. */
6997+int coord_is_existing_item(const coord_t * coord)
6998+{
6999+ switch (coord->between) {
7000+ case EMPTY_NODE:
7001+ case BEFORE_ITEM:
7002+ case AFTER_ITEM:
7003+ case INVALID_COORD:
7004+ return 0;
7005+
7006+ case BEFORE_UNIT:
7007+ case AT_UNIT:
7008+ case AFTER_UNIT:
7009+ return coord->item_pos < coord_num_items(coord);
7010+ }
7011+
7012+ impossible("jmacd-9900", "unreachable coord: %p", coord);
7013+ return 0;
7014+}
7015+
7016+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7017+ unit. */
7018+/* Audited by: green(2002.06.15) */
7019+int coord_is_existing_unit(const coord_t * coord)
7020+{
7021+ switch (coord->between) {
7022+ case EMPTY_NODE:
7023+ case BEFORE_UNIT:
7024+ case AFTER_UNIT:
7025+ case BEFORE_ITEM:
7026+ case AFTER_ITEM:
7027+ case INVALID_COORD:
7028+ return 0;
7029+
7030+ case AT_UNIT:
7031+ return (coord->item_pos < coord_num_items(coord)
7032+ && coord->unit_pos < coord_num_units(coord));
7033+ }
7034+
7035+ impossible("jmacd-9902", "unreachable");
7036+ return 0;
7037+}
7038+
7039+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7040+ true for empty nodes nor coordinates positioned before the first item. */
7041+/* Audited by: green(2002.06.15) */
7042+int coord_is_leftmost_unit(const coord_t * coord)
7043+{
7044+ return (coord->between == AT_UNIT && coord->item_pos == 0
7045+ && coord->unit_pos == 0);
7046+}
7047+
7048+#if REISER4_DEBUG
7049+/* For assertions only, checks for a valid coordinate. */
7050+int coord_check(const coord_t * coord)
7051+{
7052+ if (coord->node == NULL) {
7053+ return 0;
7054+ }
7055+ if (znode_above_root(coord->node))
7056+ return 1;
7057+
7058+ switch (coord->between) {
7059+ default:
7060+ case INVALID_COORD:
7061+ return 0;
7062+ case EMPTY_NODE:
7063+ if (!node_is_empty(coord->node)) {
7064+ return 0;
7065+ }
7066+ return coord->item_pos == 0 && coord->unit_pos == 0;
7067+
7068+ case BEFORE_UNIT:
7069+ case AFTER_UNIT:
7070+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
7071+ && (coord->unit_pos == 0))
7072+ return 1;
7073+ case AT_UNIT:
7074+ break;
7075+ case AFTER_ITEM:
7076+ case BEFORE_ITEM:
7077+ /* before/after item should not set unit_pos. */
7078+ if (coord->unit_pos != 0) {
7079+ return 0;
7080+ }
7081+ break;
7082+ }
7083+
7084+ if (coord->item_pos >= node_num_items(coord->node)) {
7085+ return 0;
7086+ }
7087+
7088+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7089+	   between is set to either AFTER_ITEM or BEFORE_ITEM */
7090+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7091+ return 1;
7092+
7093+ if (coord_is_iplug_set(coord) &&
7094+ coord->unit_pos >
7095+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7096+ return 0;
7097+ }
7098+ return 1;
7099+}
7100+#endif
7101+
7102+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7103+   Returns 1 if the new position does not exist. */
7104+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7105+{
7106+ /* If the node is invalid, leave it. */
7107+ if (coord->between == INVALID_COORD) {
7108+ return 1;
7109+ }
7110+
7111+ /* If the node is empty, set it appropriately. */
7112+ if (items == 0) {
7113+ coord->between = EMPTY_NODE;
7114+ coord_set_item_pos(coord, 0);
7115+ coord->unit_pos = 0;
7116+ return 1;
7117+ }
7118+
7119+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7120+ if (coord->between == EMPTY_NODE) {
7121+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7122+ coord_set_item_pos(coord, 0);
7123+ coord->unit_pos = 0;
7124+ return 0;
7125+ }
7126+
7127+	/* If the item_pos is out of range, set it appropriately. */
7128+ if (coord->item_pos >= items) {
7129+ coord->between = AFTER_ITEM;
7130+ coord_set_item_pos(coord, items - 1);
7131+ coord->unit_pos = 0;
7132+ /* If is_next, return 1 (can't go any further). */
7133+ return is_next;
7134+ }
7135+
7136+ return 0;
7137+}
7138+
7139+/* Advances the coordinate by one unit to the right. If empty, no change. If
7140+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7141+ existing unit. */
7142+int coord_next_unit(coord_t * coord)
7143+{
7144+ unsigned items = coord_num_items(coord);
7145+
7146+ if (coord_adjust_items(coord, items, 1) == 1) {
7147+ return 1;
7148+ }
7149+
7150+ switch (coord->between) {
7151+ case BEFORE_UNIT:
7152+ /* Now it is positioned at the same unit. */
7153+ coord->between = AT_UNIT;
7154+ return 0;
7155+
7156+ case AFTER_UNIT:
7157+ case AT_UNIT:
7158+ /* If it was at or after a unit and there are more units in this item,
7159+ advance to the next one. */
7160+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7161+ coord->unit_pos += 1;
7162+ coord->between = AT_UNIT;
7163+ return 0;
7164+ }
7165+
7166+ /* Otherwise, it is crossing an item boundary and treated as if it was
7167+ after the current item. */
7168+ coord->between = AFTER_ITEM;
7169+ coord->unit_pos = 0;
7170+ /* FALLTHROUGH */
7171+
7172+ case AFTER_ITEM:
7173+ /* Check for end-of-node. */
7174+ if (coord->item_pos == items - 1) {
7175+ return 1;
7176+ }
7177+
7178+ coord_inc_item_pos(coord);
7179+ coord->unit_pos = 0;
7180+ coord->between = AT_UNIT;
7181+ return 0;
7182+
7183+ case BEFORE_ITEM:
7184+ /* The adjust_items checks ensure that we are valid here. */
7185+ coord->unit_pos = 0;
7186+ coord->between = AT_UNIT;
7187+ return 0;
7188+
7189+ case INVALID_COORD:
7190+ case EMPTY_NODE:
7191+ /* Handled in coord_adjust_items(). */
7192+ break;
7193+ }
7194+
7195+ impossible("jmacd-9902", "unreachable");
7196+ return 0;
7197+}
7198+
7199+/* Advances the coordinate by one item to the right. If empty, no change. If
7200+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7201+ an existing item. */
7202+int coord_next_item(coord_t * coord)
7203+{
7204+ unsigned items = coord_num_items(coord);
7205+
7206+ if (coord_adjust_items(coord, items, 1) == 1) {
7207+ return 1;
7208+ }
7209+
7210+ switch (coord->between) {
7211+ case AFTER_UNIT:
7212+ case AT_UNIT:
7213+ case BEFORE_UNIT:
7214+ case AFTER_ITEM:
7215+ /* Check for end-of-node. */
7216+ if (coord->item_pos == items - 1) {
7217+ coord->between = AFTER_ITEM;
7218+ coord->unit_pos = 0;
7219+ coord_clear_iplug(coord);
7220+ return 1;
7221+ }
7222+
7223+ /* Anywhere in an item, go to the next one. */
7224+ coord->between = AT_UNIT;
7225+ coord_inc_item_pos(coord);
7226+ coord->unit_pos = 0;
7227+ return 0;
7228+
7229+ case BEFORE_ITEM:
7230+ /* The out-of-range check ensures that we are valid here. */
7231+ coord->unit_pos = 0;
7232+ coord->between = AT_UNIT;
7233+ return 0;
7234+ case INVALID_COORD:
7235+ case EMPTY_NODE:
7236+ /* Handled in coord_adjust_items(). */
7237+ break;
7238+ }
7239+
7240+ impossible("jmacd-9903", "unreachable");
7241+ return 0;
7242+}
7243+
7244+/* Advances the coordinate by one unit to the left. If empty, no change. If
7245+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7246+ is an existing unit. */
7247+int coord_prev_unit(coord_t * coord)
7248+{
7249+ unsigned items = coord_num_items(coord);
7250+
7251+ if (coord_adjust_items(coord, items, 0) == 1) {
7252+ return 1;
7253+ }
7254+
7255+ switch (coord->between) {
7256+ case AT_UNIT:
7257+ case BEFORE_UNIT:
7258+ if (coord->unit_pos > 0) {
7259+ coord->unit_pos -= 1;
7260+ coord->between = AT_UNIT;
7261+ return 0;
7262+ }
7263+
7264+ if (coord->item_pos == 0) {
7265+ coord->between = BEFORE_ITEM;
7266+ return 1;
7267+ }
7268+
7269+ coord_dec_item_pos(coord);
7270+ coord->unit_pos = coord_last_unit_pos(coord);
7271+ coord->between = AT_UNIT;
7272+ return 0;
7273+
7274+ case AFTER_UNIT:
7275+ /* What if unit_pos is out-of-range? */
7276+ assert("jmacd-5442",
7277+ coord->unit_pos <= coord_last_unit_pos(coord));
7278+ coord->between = AT_UNIT;
7279+ return 0;
7280+
7281+ case BEFORE_ITEM:
7282+ if (coord->item_pos == 0) {
7283+ return 1;
7284+ }
7285+
7286+ coord_dec_item_pos(coord);
7287+ /* FALLTHROUGH */
7288+
7289+ case AFTER_ITEM:
7290+ coord->between = AT_UNIT;
7291+ coord->unit_pos = coord_last_unit_pos(coord);
7292+ return 0;
7293+
7294+ case INVALID_COORD:
7295+ case EMPTY_NODE:
7296+ break;
7297+ }
7298+
7299+ impossible("jmacd-9904", "unreachable");
7300+ return 0;
7301+}
7302+
7303+/* Advances the coordinate by one item to the left. If empty, no change. If
7304+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7305+ is an existing item. */
7306+int coord_prev_item(coord_t * coord)
7307+{
7308+ unsigned items = coord_num_items(coord);
7309+
7310+ if (coord_adjust_items(coord, items, 0) == 1) {
7311+ return 1;
7312+ }
7313+
7314+ switch (coord->between) {
7315+ case AT_UNIT:
7316+ case AFTER_UNIT:
7317+ case BEFORE_UNIT:
7318+ case BEFORE_ITEM:
7319+
7320+ if (coord->item_pos == 0) {
7321+ coord->between = BEFORE_ITEM;
7322+ coord->unit_pos = 0;
7323+ return 1;
7324+ }
7325+
7326+ coord_dec_item_pos(coord);
7327+ coord->unit_pos = 0;
7328+ coord->between = AT_UNIT;
7329+ return 0;
7330+
7331+ case AFTER_ITEM:
7332+ coord->between = AT_UNIT;
7333+ coord->unit_pos = 0;
7334+ return 0;
7335+
7336+ case INVALID_COORD:
7337+ case EMPTY_NODE:
7338+ break;
7339+ }
7340+
7341+ impossible("jmacd-9905", "unreachable");
7342+ return 0;
7343+}
7344+
7345+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7346+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7347+{
7348+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7349+ if (dir == LEFT_SIDE) {
7350+ coord_init_first_unit(coord, node);
7351+ } else {
7352+ coord_init_last_unit(coord, node);
7353+ }
7354+}
7355+
7356+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7357+ argument. */
7358+/* Audited by: green(2002.06.15) */
7359+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7360+{
7361+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7362+ if (dir == LEFT_SIDE) {
7363+ return coord_is_before_leftmost(coord);
7364+ } else {
7365+ return coord_is_after_rightmost(coord);
7366+ }
7367+}
7368+
7369+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7370+/* Audited by: green(2002.06.15) */
7371+int coord_sideof_unit(coord_t * coord, sideof dir)
7372+{
7373+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7374+ if (dir == LEFT_SIDE) {
7375+ return coord_prev_unit(coord);
7376+ } else {
7377+ return coord_next_unit(coord);
7378+ }
7379+}
7380+
7381+#if REISER4_DEBUG
7382+int coords_equal(const coord_t * c1, const coord_t * c2)
7383+{
7384+ assert("nikita-2840", c1 != NULL);
7385+ assert("nikita-2841", c2 != NULL);
7386+
7387+ return
7388+ c1->node == c2->node &&
7389+ c1->item_pos == c2->item_pos &&
7390+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7391+}
7392+#endif /* REISER4_DEBUG */
7393+
7394+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7395+   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7396+/* Audited by: green(2002.06.15) */
7397+coord_wrt_node coord_wrt(const coord_t * coord)
7398+{
7399+ if (coord_is_before_leftmost(coord)) {
7400+ return COORD_ON_THE_LEFT;
7401+ }
7402+
7403+ if (coord_is_after_rightmost(coord)) {
7404+ return COORD_ON_THE_RIGHT;
7405+ }
7406+
7407+ return COORD_INSIDE;
7408+}
7409+
7410+/* Returns true if the coordinate is positioned after the last item or after the last unit
7411+ of the last item or it is an empty node. */
7412+/* Audited by: green(2002.06.15) */
7413+int coord_is_after_rightmost(const coord_t * coord)
7414+{
7415+ assert("jmacd-7313", coord_check(coord));
7416+
7417+ switch (coord->between) {
7418+ case INVALID_COORD:
7419+ case AT_UNIT:
7420+ case BEFORE_UNIT:
7421+ case BEFORE_ITEM:
7422+ return 0;
7423+
7424+ case EMPTY_NODE:
7425+ return 1;
7426+
7427+ case AFTER_ITEM:
7428+ return (coord->item_pos == node_num_items(coord->node) - 1);
7429+
7430+ case AFTER_UNIT:
7431+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7432+ coord->unit_pos == coord_last_unit_pos(coord));
7433+ }
7434+
7435+ impossible("jmacd-9908", "unreachable");
7436+ return 0;
7437+}
7438+
7439+/* Returns true if the coordinate is positioned before the first item or it is an empty
7440+ node. */
7441+int coord_is_before_leftmost(const coord_t * coord)
7442+{
7443+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7444+ necessary to check if coord is set before leftmost
7445+ assert ("jmacd-7313", coord_check (coord)); */
7446+ switch (coord->between) {
7447+ case INVALID_COORD:
7448+ case AT_UNIT:
7449+ case AFTER_ITEM:
7450+ case AFTER_UNIT:
7451+ return 0;
7452+
7453+ case EMPTY_NODE:
7454+ return 1;
7455+
7456+ case BEFORE_ITEM:
7457+ case BEFORE_UNIT:
7458+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
7459+ }
7460+
7461+ impossible("jmacd-9908", "unreachable");
7462+ return 0;
7463+}
7464+
7465+/* Returns true if the coordinate is positioned after an item, before an item, after the
7466+ last unit of an item, before the first unit of an item, or at an empty node. */
7467+/* Audited by: green(2002.06.15) */
7468+int coord_is_between_items(const coord_t * coord)
7469+{
7470+ assert("jmacd-7313", coord_check(coord));
7471+
7472+ switch (coord->between) {
7473+ case INVALID_COORD:
7474+ case AT_UNIT:
7475+ return 0;
7476+
7477+ case AFTER_ITEM:
7478+ case BEFORE_ITEM:
7479+ case EMPTY_NODE:
7480+ return 1;
7481+
7482+ case BEFORE_UNIT:
7483+ return coord->unit_pos == 0;
7484+
7485+ case AFTER_UNIT:
7486+ return coord->unit_pos == coord_last_unit_pos(coord);
7487+ }
7488+
7489+ impossible("jmacd-9908", "unreachable");
7490+ return 0;
7491+}
7492+
7493+#if REISER4_DEBUG
7494+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7495+ before-after or item boundaries. */
7496+int coord_are_neighbors(coord_t * c1, coord_t * c2)
7497+{
7498+ coord_t *left;
7499+ coord_t *right;
7500+
7501+ assert("nikita-1241", c1 != NULL);
7502+ assert("nikita-1242", c2 != NULL);
7503+ assert("nikita-1243", c1->node == c2->node);
7504+ assert("nikita-1244", coord_is_existing_unit(c1));
7505+ assert("nikita-1245", coord_is_existing_unit(c2));
7506+
7507+ left = right = NULL;
7508+ switch (coord_compare(c1, c2)) {
7509+ case COORD_CMP_ON_LEFT:
7510+ left = c1;
7511+ right = c2;
7512+ break;
7513+ case COORD_CMP_ON_RIGHT:
7514+ left = c2;
7515+ right = c1;
7516+ break;
7517+ case COORD_CMP_SAME:
7518+ return 0;
7519+ default:
7520+ wrong_return_value("nikita-1246", "compare_coords()");
7521+ }
7522+ assert("vs-731", left && right);
7523+ if (left->item_pos == right->item_pos) {
7524+ return left->unit_pos + 1 == right->unit_pos;
7525+ } else if (left->item_pos + 1 == right->item_pos) {
7526+ return (left->unit_pos == coord_last_unit_pos(left))
7527+ && (right->unit_pos == 0);
7528+ } else {
7529+ return 0;
7530+ }
7531+}
7532+#endif /* REISER4_DEBUG */
7533+
7534+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7535+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7536+/* Audited by: green(2002.06.15) */
7537+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7538+{
7539+ assert("vs-209", c1->node == c2->node);
7540+ assert("vs-194", coord_is_existing_unit(c1)
7541+ && coord_is_existing_unit(c2));
7542+
7543+ if (c1->item_pos > c2->item_pos)
7544+ return COORD_CMP_ON_RIGHT;
7545+ if (c1->item_pos < c2->item_pos)
7546+ return COORD_CMP_ON_LEFT;
7547+ if (c1->unit_pos > c2->unit_pos)
7548+ return COORD_CMP_ON_RIGHT;
7549+ if (c1->unit_pos < c2->unit_pos)
7550+ return COORD_CMP_ON_LEFT;
7551+ return COORD_CMP_SAME;
7552+}
7553+
7554+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7555+ non-zero if there is no position to the right. */
7556+int coord_set_to_right(coord_t * coord)
7557+{
7558+ unsigned items = coord_num_items(coord);
7559+
7560+ if (coord_adjust_items(coord, items, 1) == 1) {
7561+ return 1;
7562+ }
7563+
7564+ switch (coord->between) {
7565+ case AT_UNIT:
7566+ return 0;
7567+
7568+ case BEFORE_ITEM:
7569+ case BEFORE_UNIT:
7570+ coord->between = AT_UNIT;
7571+ return 0;
7572+
7573+ case AFTER_UNIT:
7574+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7575+ coord->unit_pos += 1;
7576+ coord->between = AT_UNIT;
7577+ return 0;
7578+ } else {
7579+
7580+ coord->unit_pos = 0;
7581+
7582+ if (coord->item_pos == items - 1) {
7583+ coord->between = AFTER_ITEM;
7584+ return 1;
7585+ }
7586+
7587+ coord_inc_item_pos(coord);
7588+ coord->between = AT_UNIT;
7589+ return 0;
7590+ }
7591+
7592+ case AFTER_ITEM:
7593+ if (coord->item_pos == items - 1) {
7594+ return 1;
7595+ }
7596+
7597+ coord_inc_item_pos(coord);
7598+ coord->unit_pos = 0;
7599+ coord->between = AT_UNIT;
7600+ return 0;
7601+
7602+ case EMPTY_NODE:
7603+ return 1;
7604+
7605+ case INVALID_COORD:
7606+ break;
7607+ }
7608+
7609+ impossible("jmacd-9920", "unreachable");
7610+ return 0;
7611+}
7612+
7613+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7614+ non-zero if there is no position to the left. */
7615+int coord_set_to_left(coord_t * coord)
7616+{
7617+ unsigned items = coord_num_items(coord);
7618+
7619+ if (coord_adjust_items(coord, items, 0) == 1) {
7620+ return 1;
7621+ }
7622+
7623+ switch (coord->between) {
7624+ case AT_UNIT:
7625+ return 0;
7626+
7627+ case AFTER_UNIT:
7628+ coord->between = AT_UNIT;
7629+ return 0;
7630+
7631+ case AFTER_ITEM:
7632+ coord->between = AT_UNIT;
7633+ coord->unit_pos = coord_last_unit_pos(coord);
7634+ return 0;
7635+
7636+ case BEFORE_UNIT:
7637+ if (coord->unit_pos > 0) {
7638+ coord->unit_pos -= 1;
7639+ coord->between = AT_UNIT;
7640+ return 0;
7641+ } else {
7642+
7643+ if (coord->item_pos == 0) {
7644+ coord->between = BEFORE_ITEM;
7645+ return 1;
7646+ }
7647+
7648+ coord->unit_pos = coord_last_unit_pos(coord);
7649+ coord_dec_item_pos(coord);
7650+ coord->between = AT_UNIT;
7651+ return 0;
7652+ }
7653+
7654+ case BEFORE_ITEM:
7655+ if (coord->item_pos == 0) {
7656+ return 1;
7657+ }
7658+
7659+ coord_dec_item_pos(coord);
7660+ coord->unit_pos = coord_last_unit_pos(coord);
7661+ coord->between = AT_UNIT;
7662+ return 0;
7663+
7664+ case EMPTY_NODE:
7665+ return 1;
7666+
7667+ case INVALID_COORD:
7668+ break;
7669+ }
7670+
7671+ impossible("jmacd-9920", "unreachable");
7672+ return 0;
7673+}
7674+
7675+static const char *coord_tween_tostring(between_enum n)
7676+{
7677+ switch (n) {
7678+ case BEFORE_UNIT:
7679+ return "before unit";
7680+ case BEFORE_ITEM:
7681+ return "before item";
7682+ case AT_UNIT:
7683+ return "at unit";
7684+ case AFTER_UNIT:
7685+ return "after unit";
7686+ case AFTER_ITEM:
7687+ return "after item";
7688+ case EMPTY_NODE:
7689+ return "empty node";
7690+ case INVALID_COORD:
7691+ return "invalid";
7692+ default:
7693+ {
7694+ static char buf[30];
7695+
7696+ sprintf(buf, "unknown: %i", n);
7697+ return buf;
7698+ }
7699+ }
7700+}
7701+
7702+void print_coord(const char *mes, const coord_t * coord, int node)
7703+{
7704+ if (coord == NULL) {
7705+ printk("%s: null\n", mes);
7706+ return;
7707+ }
7708+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7709+ mes, coord->item_pos, coord->unit_pos,
7710+ coord_tween_tostring(coord->between), coord->iplugid);
7711+}
7712+
7713+int
7714+item_utmost_child_real_block(const coord_t * coord, sideof side,
7715+ reiser4_block_nr * blk)
7716+{
7717+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7718+ side,
7719+ blk);
7720+}
7721+
7722+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7723+{
7724+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7725+}
7726+
7727+/* @count bytes of flow @f got written, update correspondingly f->length,
7728+ f->data and f->key */
7729+void move_flow_forward(flow_t * f, unsigned count)
7730+{
7731+ if (f->data)
7732+ f->data += count;
7733+ f->length -= count;
7734+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
7735+}
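+
+/*
+ * Editorial sketch, not part of the original patch: a write loop is expected
+ * to advance its flow after each partial write, so that f->data, f->length
+ * and f->key always describe the unwritten tail. write_some_units() is a
+ * hypothetical helper returning the number of bytes it consumed:
+ *
+ *	while (f->length > 0) {
+ *		written = write_some_units(f);
+ *		move_flow_forward(f, written);
+ *	}
+ */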
7736+
7737+/*
7738+ Local variables:
7739+ c-indentation-style: "K&R"
7740+ mode-name: "LC"
7741+ c-basic-offset: 8
7742+ tab-width: 8
7743+ fill-column: 120
7744+ scroll-step: 1
7745+ End:
7746+*/
7747diff -urN linux-2.6.20.orig/fs/reiser4/coord.h linux-2.6.20/fs/reiser4/coord.h
7748--- linux-2.6.20.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7749+++ linux-2.6.20/fs/reiser4/coord.h 2007-05-06 14:50:43.698975725 +0400
7750@@ -0,0 +1,389 @@
7751+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7752+
7753+/* Coords */
7754+
7755+#if !defined( __REISER4_COORD_H__ )
7756+#define __REISER4_COORD_H__
7757+
7758+#include "forward.h"
7759+#include "debug.h"
7760+#include "dformat.h"
7761+#include "key.h"
7762+
7763+/* insertions happen between coords in the tree, so we need some means
7764+ of specifying the sense of betweenness. */
7765+typedef enum {
7766+	BEFORE_UNIT, /* Note: coord_init_zero()/init_coord depend on this value being zero. */
7767+ AT_UNIT,
7768+ AFTER_UNIT,
7769+ BEFORE_ITEM,
7770+ AFTER_ITEM,
7771+ INVALID_COORD,
7772+ EMPTY_NODE,
7773+} between_enum;
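+
+/*
+ * Editorial illustration, not part of the original patch: for a node with a
+ * single three-unit item, {item_pos = 0, unit_pos = 0, BEFORE_UNIT} names
+ * the insertion point in front of the first unit, {item_pos = 0,
+ * unit_pos = 2, AFTER_UNIT} names the point past the last unit, and AT_UNIT
+ * coordinates name existing units themselves.
+ */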
7774+
7775+/* location of coord w.r.t. its node */
7776+typedef enum {
7777+ COORD_ON_THE_LEFT = -1,
7778+ COORD_ON_THE_RIGHT = +1,
7779+ COORD_INSIDE = 0
7780+} coord_wrt_node;
7781+
7782+typedef enum {
7783+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7784+} coord_cmp;
7785+
7786+struct coord {
7787+ /* node in a tree */
7788+ /* 0 */ znode *node;
7789+
7790+ /* position of item within node */
7791+ /* 4 */ pos_in_node_t item_pos;
7792+ /* position of unit within item */
7793+ /* 6 */ pos_in_node_t unit_pos;
7794+ /* optimization: plugin of item is stored in coord_t. Until this was
7795+	   implemented, item_plugin_by_coord() was a major CPU consumer. ->iplugid
7796+ is invalidated (set to 0xff) on each modification of ->item_pos,
7797+ and all such modifications are funneled through coord_*_item_pos()
7798+ functions below.
7799+ */
7800+ /* 8 */ char iplugid;
7801+ /* position of coord w.r.t. to neighboring items and/or units.
7802+ Values are taken from &between_enum above.
7803+ */
7804+ /* 9 */ char between;
7805+ /* padding. It will be added by the compiler anyway to conform to the
7806+ * C language alignment requirements. We keep it here to be on the
7807+ * safe side and to have a clear picture of the memory layout of this
7808+ * structure. */
7809+ /* 10 */ __u16 pad;
7810+ /* 12 */ int offset;
7811+#if REISER4_DEBUG
7812+ unsigned long plug_v;
7813+ unsigned long body_v;
7814+#endif
7815+};
7816+
7817+#define INVALID_PLUGID ((char)((1 << 8) - 1))
7818+#define INVALID_OFFSET -1
7819+
7820+static inline void coord_clear_iplug(coord_t * coord)
7821+{
7822+ assert("nikita-2835", coord != NULL);
7823+ coord->iplugid = INVALID_PLUGID;
7824+ coord->offset = INVALID_OFFSET;
7825+}
7826+
7827+static inline int coord_is_iplug_set(const coord_t * coord)
7828+{
7829+ assert("nikita-2836", coord != NULL);
7830+ return coord->iplugid != INVALID_PLUGID;
7831+}
7832+
7833+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7834+{
7835+ assert("nikita-2478", coord != NULL);
7836+ coord->item_pos = pos;
7837+ coord_clear_iplug(coord);
7838+}
7839+
7840+static inline void coord_dec_item_pos(coord_t * coord)
7841+{
7842+ assert("nikita-2480", coord != NULL);
7843+ --coord->item_pos;
7844+ coord_clear_iplug(coord);
7845+}
7846+
7847+static inline void coord_inc_item_pos(coord_t * coord)
7848+{
7849+ assert("nikita-2481", coord != NULL);
7850+ ++coord->item_pos;
7851+ coord_clear_iplug(coord);
7852+}
7853+
7854+static inline void coord_add_item_pos(coord_t * coord, int delta)
7855+{
7856+ assert("nikita-2482", coord != NULL);
7857+ coord->item_pos += delta;
7858+ coord_clear_iplug(coord);
7859+}
7860+
7861+static inline void coord_invalid_item_pos(coord_t * coord)
7862+{
7863+ assert("nikita-2832", coord != NULL);
7864+ coord->item_pos = (unsigned short)~0;
7865+ coord_clear_iplug(coord);
7866+}
7867+
7868+/* Reverse a direction. */
7869+static inline sideof sideof_reverse(sideof side)
7870+{
7871+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7872+}
7873+
7874+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7875+
7876+ "first" and "last"
7877+ "next" and "prev"
7878+ "before" and "after"
7879+ "leftmost" and "rightmost"
7880+
7881+ But I think the chosen names are decent the way they are.
7882+*/
7883+
7884+/* COORD INITIALIZERS */
7885+
7886+/* Initialize an invalid coordinate. */
7887+extern void coord_init_invalid(coord_t * coord, const znode * node);
7888+
7889+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7890+
7891+/* Initialize a coordinate to point at the first unit of the first item. If the node is
7892+ empty, it is positioned at the EMPTY_NODE. */
7893+extern void coord_init_first_unit(coord_t * coord, const znode * node);
7894+
7895+/* Initialize a coordinate to point at the last unit of the last item. If the node is
7896+ empty, it is positioned at the EMPTY_NODE. */
7897+extern void coord_init_last_unit(coord_t * coord, const znode * node);
7898+
7899+/* Initialize a coordinate to before the first item. If the node is empty, it is
7900+ positioned at the EMPTY_NODE. */
7901+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7902+
7903+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7904+ at the EMPTY_NODE. */
7905+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7906+
7907+/* Initialize a coordinate to after the last unit in the item. Coord must
7908+   already be set to an existing item */
7909+void coord_init_after_item_end(coord_t * coord);
7910+
7911+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7912+void coord_init_before_item(coord_t *);
7913+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7914+void coord_init_after_item(coord_t *);
7915+
7916+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7917+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7918+ sideof dir);
7919+
7920+/* Initialize a coordinate with zeros. Used in places where init_coord was
7921+   used and it was not clear how the coordinate should actually be set
7922+ FIXME-VS: added by vs (2002, june, 8) */
7923+extern void coord_init_zero(coord_t * coord);
7924+
7925+/* COORD METHODS */
7926+
7927+/* after node content has been shifted, a coord that was previously set
7928+   properly may become invalid; try to "normalize" it. */
7929+void coord_normalize(coord_t * coord);
7930+
7931+/* Copy a coordinate. */
7932+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7933+
7934+/* Copy a coordinate without check. */
7935+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7936+
7937+unsigned coord_num_units(const coord_t * coord);
7938+
7939+/* Return the last valid unit number at the present item (i.e.,
7940+ coord_num_units() - 1). */
7941+static inline unsigned coord_last_unit_pos(const coord_t * coord)
7942+{
7943+ return coord_num_units(coord) - 1;
7944+}
7945+
7946+#if REISER4_DEBUG
7947+/* For assertions only, checks for a valid coordinate. */
7948+extern int coord_check(const coord_t * coord);
7949+
7950+extern unsigned long znode_times_locked(const znode * z);
7951+
7952+static inline void coord_update_v(coord_t * coord)
7953+{
7954+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7955+}
7956+#endif
7957+
7958+extern int coords_equal(const coord_t * c1, const coord_t * c2);
7959+
7960+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
7961+
7962+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7963+   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7964+extern coord_wrt_node coord_wrt(const coord_t * coord);
7965+
7966+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7967+ before-after or item boundaries. */
7968+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
7969+
7970+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7971+   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7972+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
7973+
7974+/* COORD PREDICATES */
7975+
7976+/* Returns true if the coord was initialized by coord_init_invalid(). */
7977+extern int coord_is_invalid(const coord_t * coord);
7978+
7979+/* Returns true if the coordinate is positioned at an existing item, not before or after
7980+ an item. It may be placed at, before, or after any unit within the item, whether
7981+ existing or not. If this is true you can call methods of the item plugin. */
7982+extern int coord_is_existing_item(const coord_t * coord);
7983+
7984+/* Returns true if the coordinate is positioned after a item, before a item, after the
7985+ last unit of an item, before the first unit of an item, or at an empty node. */
7986+extern int coord_is_between_items(const coord_t * coord);
7987+
7988+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7989+ unit. */
7990+extern int coord_is_existing_unit(const coord_t * coord);
7991+
7992+/* Returns true if the coordinate is positioned at an empty node. */
7993+extern int coord_is_empty(const coord_t * coord);
7994+
7995+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7996+ true for empty nodes nor coordinates positioned before the first item. */
7997+extern int coord_is_leftmost_unit(const coord_t * coord);
7998+
7999+/* Returns true if the coordinate is positioned after the last item or after the last unit
8000+ of the last item or it is an empty node. */
8001+extern int coord_is_after_rightmost(const coord_t * coord);
8002+
8003+/* Returns true if the coordinate is positioned before the first item or it is an empty
8004+ node. */
8005+extern int coord_is_before_leftmost(const coord_t * coord);
8006+
8007+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8008+ argument. */
8009+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8010+
8011+/* COORD MODIFIERS */
8012+
8013+/* Advances the coordinate by one unit to the right. If empty, no change. If
8014+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8015+ an existing unit. */
8016+extern int coord_next_unit(coord_t * coord);
8017+
8018+/* Advances the coordinate by one item to the right. If empty, no change. If
8019+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8020+ an existing item. */
8021+extern int coord_next_item(coord_t * coord);
8022+
8023+/* Advances the coordinate by one unit to the left. If empty, no change. If
8024+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8025+ is an existing unit. */
8026+extern int coord_prev_unit(coord_t * coord);
8027+
8028+/* Advances the coordinate by one item to the left. If empty, no change. If
8029+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8030+ is an existing item. */
8031+extern int coord_prev_item(coord_t * coord);
8032+
8033+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8034+ non-zero if there is no position to the right. */
8035+extern int coord_set_to_right(coord_t * coord);
8036+
8037+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8038+ non-zero if there is no position to the left. */
8039+extern int coord_set_to_left(coord_t * coord);
8040+
8041+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8042+ and non-zero if the unit did not exist. */
8043+extern int coord_set_after_unit(coord_t * coord);
8044+
8045+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8046+extern int coord_sideof_unit(coord_t * coord, sideof dir);
8047+
8048+/* iterate over all units in @node */
8049+#define for_all_units( coord, node ) \
8050+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8051+ coord_next_unit( coord ) == 0 ; )
8052+
8053+/* iterate over all items in @node */
8054+#define for_all_items( coord, node ) \
8055+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8056+ coord_next_item( coord ) == 0 ; )
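+
+/*
+ * Editorial sketch, not part of the original patch: typical use of the
+ * iterators above. example_scan() is hypothetical; node is assumed to be a
+ * loaded znode.
+ */
+#if 0
+static void example_scan(znode * node)
+{
+	coord_t coord;
+
+	for_all_units(&coord, node) {
+		/* coord now addresses an existing unit of node */
+	}
+}
+#endif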
8057+
8058+/* COORD/ITEM METHODS */
8059+
8060+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8061+ reiser4_block_nr * blk);
8062+extern int item_utmost_child(const coord_t * coord, sideof side,
8063+ jnode ** child);
8064+
8065+/* a flow is a sequence of bytes being written to or read from the tree. The
8066+ tree will slice the flow into items while storing it into nodes, but all of
8067+ that is hidden from anything outside the tree. */
8068+
8069+struct flow {
8070+ reiser4_key key; /* key of start of flow's sequence of bytes */
8071+ loff_t length; /* length of flow's sequence of bytes */
8072+ char *data; /* start of flow's sequence of bytes */
8073+ int user; /* if 1 data is user space, 0 - kernel space */
8074+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8075+};
8076+
8077+void move_flow_forward(flow_t * f, unsigned count);
8078+
8079+/* &reiser4_item_data - description of data to be inserted or pasted
8080+
8081+ Q: articulate the reasons for the difference between this and flow.
8082+
8083+   A: Besides flows, we insert other things into the tree: stat data, directory
8084+   entries, etc. To insert them into the tree one has to provide this structure.
8085+   If one is going to insert a flow, insert_flow can be used instead, and this
8086+   structure does not have to be created.
8087+*/
8088+struct reiser4_item_data {
8089+ /* actual data to be inserted. If NULL, ->create_item() will not
8090+ do xmemcpy itself, leaving this up to the caller. This can
8091+ save some amount of unnecessary memory copying, for example,
8092+ during insertion of stat data.
8093+
8094+ */
8095+ char *data;
8096+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
8097+ kernel space */
8098+ int user;
8099+ /* amount of data we are going to insert or paste */
8100+ int length;
8101+ /* "Arg" is opaque data that is passed down to the
8102+ ->create_item() method of node layout, which in turn
8103+ hands it to the ->create_hook() of item being created. This
8104+ arg is currently used by:
8105+
8106+ . ->create_hook() of internal item
8107+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8108+ . ->paste() method of directory item.
8109+ . ->create_hook() of extent item
8110+
8111+	   For an internal item, this is the left "brother" of the new node being
8112+	   inserted; it is used to add the new node into the sibling list
8113+	   after the pointer to it was just inserted into the parent.
8114+
8115+	   While ->arg does look like a somewhat unnecessary complication,
8116+	   it actually saves a lot of headache in many places, because
8117+ all data necessary to insert or paste new data into tree are
8118+ collected in one place, and this eliminates a lot of extra
8119+ argument passing and storing everywhere.
8120+
8121+ */
8122+ void *arg;
8123+ /* plugin of item we are inserting */
8124+ item_plugin *iplug;
8125+};
8126+
8127+/* __REISER4_COORD_H__ */
8128+#endif
8129+
8130+/* Make Linus happy.
8131+ Local variables:
8132+ c-indentation-style: "K&R"
8133+ mode-name: "LC"
8134+ c-basic-offset: 8
8135+ tab-width: 8
8136+ fill-column: 120
8137+ scroll-step: 1
8138+ End:
8139+*/
8140diff -urN linux-2.6.20.orig/fs/reiser4/debug.c linux-2.6.20/fs/reiser4/debug.c
8141--- linux-2.6.20.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8142+++ linux-2.6.20/fs/reiser4/debug.c 2007-05-06 14:50:43.702976975 +0400
8143@@ -0,0 +1,308 @@
8144+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8145+ * reiser4/README */
8146+
8147+/* Debugging facilities. */
8148+
8149+/*
8150+ * This file contains generic debugging functions used by reiser4. Roughly
8151+ * the following:
8152+ *
8153+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
8154+ *
8155+ * locking:
8156+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8157+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8158+ *
8159+ * error code monitoring (see comment before RETERR macro):
8160+ * reiser4_return_err(), reiser4_report_err().
8161+ *
8162+ * stack back-tracing: fill_backtrace()
8163+ *
8164+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8165+ * reiser4_debugtrap().
8166+ *
8167+ */
8168+
8169+#include "reiser4.h"
8170+#include "context.h"
8171+#include "super.h"
8172+#include "txnmgr.h"
8173+#include "znode.h"
8174+
8175+#include <linux/sysfs.h>
8176+#include <linux/slab.h>
8177+#include <linux/types.h>
8178+#include <linux/fs.h>
8179+#include <linux/spinlock.h>
8180+#include <linux/kallsyms.h>
8181+#include <linux/vmalloc.h>
8182+#include <linux/ctype.h>
8183+#include <linux/sysctl.h>
8184+#include <linux/hardirq.h>
8185+
8186+#if 0
8187+#if REISER4_DEBUG
8188+static void reiser4_report_err(void);
8189+#else
8190+#define reiser4_report_err() noop
8191+#endif
8192+#endif /* 0 */
8193+
8194+/*
8195+ * global buffer where message given to reiser4_panic is formatted.
8196+ */
8197+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8198+
8199+/*
8200+ * lock protecting consistency of panic_buf under concurrent panics
8201+ */
8202+static DEFINE_SPINLOCK(panic_guard);
8203+
8204+/* Your best friend. Call it on each occasion. This is called by
8205+ fs/reiser4/debug.h:reiser4_panic(). */
8206+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8207+{
8208+ static int in_panic = 0;
8209+ va_list args;
8210+
8211+ /*
8212+ * check for recursive panic.
8213+ */
8214+ if (in_panic == 0) {
8215+ in_panic = 1;
8216+
8217+ spin_lock(&panic_guard);
8218+ va_start(args, format);
8219+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8220+ va_end(args);
8221+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8222+ spin_unlock(&panic_guard);
8223+
8224+ /*
8225+ * if kernel debugger is configured---drop in. Early dropping
8226+ * into kgdb is not always convenient, because panic message
8227+		 * is not yet printed most of the time. But:
8228+ *
8229+ * (1) message can be extracted from printk_buf[]
8230+ * (declared static inside of printk()), and
8231+ *
8232+ * (2) sometimes serial/kgdb combo dies while printing
8233+ * long panic message, so it's more prudent to break into
8234+ * debugger earlier.
8235+ *
8236+ */
8237+ DEBUGON(1);
8238+ }
8239+ /* to make gcc happy about noreturn attribute */
8240+ panic("%s", panic_buf);
8241+}
8242+
8243+#if 0
8244+void
8245+reiser4_print_prefix(const char *level, int reperr, const char *mid,
8246+ const char *function, const char *file, int lineno)
8247+{
8248+ const char *comm;
8249+ int pid;
8250+
8251+ if (unlikely(in_interrupt() || in_irq())) {
8252+ comm = "interrupt";
8253+ pid = 0;
8254+ } else {
8255+ comm = current->comm;
8256+ pid = current->pid;
8257+ }
8258+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8259+ level, comm, pid, function, file, lineno, mid);
8260+ if (reperr)
8261+ reiser4_report_err();
8262+}
8263+#endif /* 0 */
8264+
8265+/* Preemption point: this should be called periodically during long-running
8266+   operations (carry, allocate, and squeeze are the best examples) */
8267+int reiser4_preempt_point(void)
8268+{
8269+ assert("nikita-3008", reiser4_schedulable());
8270+ cond_resched();
8271+ return signal_pending(current);
8272+}
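+
+/*
+ * Editorial sketch, not part of the original patch: a long-running operation
+ * is expected to poll the preemption point and back out on pending signals.
+ * more_work_to_do() and do_one_step() are hypothetical, and the RETERR(-EINTR)
+ * handling is an assumption about the caller:
+ *
+ *	while (more_work_to_do()) {
+ *		do_one_step();
+ *		if (reiser4_preempt_point())
+ *			return RETERR(-EINTR);
+ *	}
+ */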
8273+
8274+#if REISER4_DEBUG
8275+/* Debugging aid: return struct where information about locks taken by current
8276+ thread is accumulated. This can be used to formulate lock ordering
8277+ constraints and various assertions.
8278+
8279+*/
8280+reiser4_lock_counters_info *reiser4_lock_counters(void)
8281+{
8282+ reiser4_context *ctx = get_current_context();
8283+ assert("jmacd-1123", ctx != NULL);
8284+ return &ctx->locks;
8285+}
8286+
8287+/*
8288+ * print human readable information about locks held by the reiser4 context.
8289+ */
8290+static void print_lock_counters(const char *prefix,
8291+ const reiser4_lock_counters_info * info)
8292+{
8293+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8294+ "jload: %i, "
8295+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8296+ "ktxnmgrd: %i, fq: %i\n"
8297+ "inode: %i, "
8298+ "cbk_cache: %i (r:%i,w:%i), "
8299+ "eflush: %i, "
8300+ "zlock: %i,\n"
8301+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8302+ "d: %i, x: %i, t: %i\n", prefix,
8303+ info->spin_locked_jnode,
8304+ info->rw_locked_tree, info->read_locked_tree,
8305+ info->write_locked_tree,
8306+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8307+ info->spin_locked_jload,
8308+ info->spin_locked_txnh,
8309+ info->spin_locked_atom, info->spin_locked_stack,
8310+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8311+ info->spin_locked_fq,
8312+ info->spin_locked_inode,
8313+ info->rw_locked_cbk_cache,
8314+ info->read_locked_cbk_cache,
8315+ info->write_locked_cbk_cache,
8316+ info->spin_locked_super_eflush,
8317+ info->spin_locked_zlock,
8318+ info->spin_locked,
8319+ info->long_term_locked_znode,
8320+ info->inode_sem_r, info->inode_sem_w,
8321+ info->d_refs, info->x_refs, info->t_refs);
8322+}
8323+
8324+/* check that no spinlocks are held */
8325+int reiser4_schedulable(void)
8326+{
8327+ if (get_current_context_check() != NULL) {
8328+ if (!LOCK_CNT_NIL(spin_locked)) {
8329+ print_lock_counters("in atomic", reiser4_lock_counters());
8330+ return 0;
8331+ }
8332+ }
8333+ might_sleep();
8334+ return 1;
8335+}
8336+/*
8337+ * return true iff no locks are held.
8338+ */
8339+int reiser4_no_counters_are_held(void)
8340+{
8341+ reiser4_lock_counters_info *counters;
8342+
8343+ counters = reiser4_lock_counters();
8344+ return
8345+ (counters->spin_locked_zlock == 0) &&
8346+ (counters->spin_locked_jnode == 0) &&
8347+ (counters->rw_locked_tree == 0) &&
8348+ (counters->read_locked_tree == 0) &&
8349+ (counters->write_locked_tree == 0) &&
8350+ (counters->rw_locked_dk == 0) &&
8351+ (counters->read_locked_dk == 0) &&
8352+ (counters->write_locked_dk == 0) &&
8353+ (counters->spin_locked_txnh == 0) &&
8354+ (counters->spin_locked_atom == 0) &&
8355+ (counters->spin_locked_stack == 0) &&
8356+ (counters->spin_locked_txnmgr == 0) &&
8357+ (counters->spin_locked_inode == 0) &&
8358+ (counters->spin_locked == 0) &&
8359+ (counters->long_term_locked_znode == 0) &&
8360+ (counters->inode_sem_r == 0) &&
8361+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8362+}
8363+
8364+/*
8365+ * return true iff transaction commit can be done under locks held by the
8366+ * current thread.
8367+ */
8368+int reiser4_commit_check_locks(void)
8369+{
8370+ reiser4_lock_counters_info *counters;
8371+ int inode_sem_r;
8372+ int inode_sem_w;
8373+ int result;
8374+
8375+ /*
8376+ * inode's read/write semaphore is the only reiser4 lock that can be
8377+ * held during commit.
8378+ */
8379+
8380+ counters = reiser4_lock_counters();
8381+ inode_sem_r = counters->inode_sem_r;
8382+ inode_sem_w = counters->inode_sem_w;
8383+
8384+ counters->inode_sem_r = counters->inode_sem_w = 0;
8385+ result = reiser4_no_counters_are_held();
8386+ counters->inode_sem_r = inode_sem_r;
8387+ counters->inode_sem_w = inode_sem_w;
8388+ return result;
8389+}
8390+
8391+/*
8392+ * fill "error site" in the current reiser4 context. See comment before RETERR
8393+ * macro for more details.
8394+ */
8395+void reiser4_return_err(int code, const char *file, int line)
8396+{
8397+ if (code < 0 && is_in_reiser4_context()) {
8398+ reiser4_context *ctx = get_current_context();
8399+
8400+ if (ctx != NULL) {
8401+ ctx->err.code = code;
8402+ ctx->err.file = file;
8403+ ctx->err.line = line;
8404+ }
8405+ }
8406+}
8407+
8408+#if 0
8409+/*
8410+ * report error information recorded by reiser4_return_err().
8411+ */
8412+static void reiser4_report_err(void)
8413+{
8414+ reiser4_context *ctx = get_current_context_check();
8415+
8416+ if (ctx != NULL) {
8417+ if (ctx->err.code != 0) {
8418+ printk("code: %i at %s:%i\n",
8419+ ctx->err.code, ctx->err.file, ctx->err.line);
8420+ }
8421+ }
8422+}
8423+#endif /* 0 */
8424+
8425+#endif /* REISER4_DEBUG */
8426+
8427+#if KERNEL_DEBUGGER
8428+
8429+/*
8430+ * this function just drops into the kernel debugger. It is a convenient
8431+ * place to put a breakpoint in.
8432+ */
8433+void reiser4_debugtrap(void)
8434+{
8435+ /* do nothing. Put break point here. */
8436+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8437+ extern void breakpoint(void);
8438+ breakpoint();
8439+#endif
8440+}
8441+#endif
8442+
8443+/* Make Linus happy.
8444+ Local variables:
8445+ c-indentation-style: "K&R"
8446+ mode-name: "LC"
8447+ c-basic-offset: 8
8448+ tab-width: 8
8449+ fill-column: 120
8450+ End:
8451+*/
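The in_panic/panic_guard pair above is a reusable pattern: a static flag keeps a panic raised while formatting a panic message from re-entering the formatting path, while a lock keeps concurrent panickers from interleaving writes to the shared buffer. A minimal user-space sketch of the same idea, assuming POSIX threads; the demo_* names are illustrative, not part of the patch:

#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char demo_buf[256];
static pthread_mutex_t demo_guard = PTHREAD_MUTEX_INITIALIZER;

static void demo_panic(const char *format, ...)
{
	static int in_panic;
	va_list args;

	if (in_panic == 0) {		/* guard against recursive entry */
		in_panic = 1;
		pthread_mutex_lock(&demo_guard);
		va_start(args, format);
		vsnprintf(demo_buf, sizeof(demo_buf), format, args);
		va_end(args);
		fprintf(stderr, "panicked: %s\n", demo_buf);
		pthread_mutex_unlock(&demo_guard);
	}
	abort();			/* never returns, like panic() */
}

int main(void)
{
	demo_panic("demo failure: %d", 42);
	return 0;			/* not reached */
}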
8452diff -urN linux-2.6.20.orig/fs/reiser4/debug.h linux-2.6.20/fs/reiser4/debug.h
8453--- linux-2.6.20.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8454+++ linux-2.6.20/fs/reiser4/debug.h 2007-05-06 14:50:43.702976975 +0400
8455@@ -0,0 +1,350 @@
8456+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8457+
8458+/* Declarations of debug macros. */
8459+
8460+#if !defined( __FS_REISER4_DEBUG_H__ )
8461+#define __FS_REISER4_DEBUG_H__
8462+
8463+#include "forward.h"
8464+#include "reiser4.h"
8465+
8466+/* generic function to produce formatted output, decorating it with
8467+ whatever standard prefixes/postfixes we want. "Fun" is a function
8468+ that will be actually called, can be printk, panic etc.
8469+ This is for use by other debugging macros, not by users. */
8470+#define DCALL(lev, fun, reperr, label, format, ...) \
8471+({ \
8472+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8473+ current->comm, current->pid, __FUNCTION__, \
8474+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
8475+})
8476+
8477+/*
8478+ * cause kernel to crash
8479+ */
8480+#define reiser4_panic(mid, format, ...) \
8481+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8482+
8483+/* print message with indication of current process, file, line and
8484+ function */
8485+#define reiser4_log(label, format, ...) \
8486+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8487+
8488+/* Assertion checked during compilation.
8489+ If "cond" is false (0) we get duplicate case label in switch.
8490+ Use this to check something like famous
8491+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8492+ in 3.x journal.c. If the cassertion fails you get a compiler error,
8493+ so no "maintainer-id".
8494+*/
8495+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
8496+
8497+#define noop do {;} while(0)
8498+
8499+#if REISER4_DEBUG
8500+/* version of info that only actually prints anything when _d_ebugging
8501+ is on */
8502+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8503+/* macro to catch logical errors. Put it into `default' clause of
8504+ switch() statement. */
8505+#define impossible(label, format, ...) \
8506+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8507+/* assert assures that @cond is true. If it is not, reiser4_panic() is
8508+ called. Use this for checking logical consistency and _never_ call
8509+ this to check correctness of external data: disk blocks and user input. */
8510+#define assert(label, cond) \
8511+({ \
8512+ /* call_on_each_assert(); */ \
8513+ if (cond) { \
8514+ /* put negated check to avoid using !(cond) that would lose \
8515+ * warnings for things like assert(a = b); */ \
8516+ ; \
8517+ } else { \
8518+ DEBUGON(1); \
8519+ reiser4_panic(label, "assertion failed: %s", #cond); \
8520+ } \
8521+})
8522+
8523+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8524+#define check_me( label, expr ) assert( label, ( expr ) )
8525+
8526+#define ON_DEBUG( exp ) exp
8527+
8528+extern int reiser4_schedulable(void);
8529+extern void call_on_each_assert(void);
8530+
8531+#else
8532+
8533+#define dinfo( format, args... ) noop
8534+#define impossible( label, format, args... ) noop
8535+#define assert( label, cond ) noop
8536+#define check_me( label, expr ) ( ( void ) ( expr ) )
8537+#define ON_DEBUG( exp )
8538+#define reiser4_schedulable() might_sleep()
8539+
8540+/* REISER4_DEBUG */
8541+#endif
8542+
8543+#if REISER4_DEBUG
8544+/* per-thread information about lock acquired by this thread. Used by lock
8545+ * ordering checking in spin_macros.h */
8546+typedef struct reiser4_lock_counters_info {
8547+ int rw_locked_tree;
8548+ int read_locked_tree;
8549+ int write_locked_tree;
8550+
8551+ int rw_locked_dk;
8552+ int read_locked_dk;
8553+ int write_locked_dk;
8554+
8555+ int rw_locked_cbk_cache;
8556+ int read_locked_cbk_cache;
8557+ int write_locked_cbk_cache;
8558+
8559+ int spin_locked_zlock;
8560+ int spin_locked_jnode;
8561+ int spin_locked_jload;
8562+ int spin_locked_txnh;
8563+ int spin_locked_atom;
8564+ int spin_locked_stack;
8565+ int spin_locked_txnmgr;
8566+ int spin_locked_ktxnmgrd;
8567+ int spin_locked_fq;
8568+ int spin_locked_inode;
8569+ int spin_locked_super_eflush;
8570+ int spin_locked;
8571+ int long_term_locked_znode;
8572+
8573+ int inode_sem_r;
8574+ int inode_sem_w;
8575+
8576+ int d_refs;
8577+ int x_refs;
8578+ int t_refs;
8579+} reiser4_lock_counters_info;
8580+
8581+extern reiser4_lock_counters_info *reiser4_lock_counters(void);
8582+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8583+
8584+/* increment lock-counter @counter, if present */
8585+#define LOCK_CNT_INC(counter) \
8586+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8587+
8588+/* decrement lock-counter @counter, if present */
8589+#define LOCK_CNT_DEC(counter) \
8590+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8591+
8592+/* check that lock-counter is zero. This is for use in assertions */
8593+#define LOCK_CNT_NIL(counter) \
8594+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8595+
8596+/* check that lock-counter is greater than zero. This is for use in
8597+ * assertions */
8598+#define LOCK_CNT_GTZ(counter) \
8599+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8600+#define LOCK_CNT_LT(counter,n) \
8601+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
8602+
8603+#else /* REISER4_DEBUG */
8604+
8605+/* no-op versions on the above */
8606+
8607+typedef struct reiser4_lock_counters_info {
8608+} reiser4_lock_counters_info;
8609+
8610+#define reiser4_lock_counters() ((reiser4_lock_counters_info *)NULL)
8611+#define LOCK_CNT_INC(counter) noop
8612+#define LOCK_CNT_DEC(counter) noop
8613+#define LOCK_CNT_NIL(counter) (1)
8614+#define LOCK_CNT_GTZ(counter) (1)
8615+#define LOCK_CNT_LT(counter,n) (1)
8616+
8617+#endif /* REISER4_DEBUG */
8618+
8619+#define assert_spin_not_locked(lock) BUG_ON(0)
8620+#define assert_rw_write_locked(lock) BUG_ON(0)
8621+#define assert_rw_read_locked(lock) BUG_ON(0)
8622+#define assert_rw_locked(lock) BUG_ON(0)
8623+#define assert_rw_not_write_locked(lock) BUG_ON(0)
8624+#define assert_rw_not_read_locked(lock) BUG_ON(0)
8625+#define assert_rw_not_locked(lock) BUG_ON(0)
8626+
8627+/* flags controlling debugging behavior. Are set through debug_flags=N mount
8628+ option. */
8629+typedef enum {
8630+ /* print a lot of information during panic. When this is on all jnodes
8631+ * are listed. This can be *very* large output. Usually you don't want
8632+ * this. Especially over serial line. */
8633+ REISER4_VERBOSE_PANIC = 0x00000001,
8634+ /* print a lot of information during umount */
8635+ REISER4_VERBOSE_UMOUNT = 0x00000002,
8636+ /* print gathered statistics on umount */
8637+ REISER4_STATS_ON_UMOUNT = 0x00000004,
8638+ /* check node consistency */
8639+ REISER4_CHECK_NODE = 0x00000008
8640+} reiser4_debug_flags;
8641+
8642+extern int is_in_reiser4_context(void);
8643+
8644+/*
8645+ * evaluate expression @e only if within reiser4 context
8646+ */
8647+#define ON_CONTEXT(e) do { \
8648+ if(is_in_reiser4_context()) { \
8649+ e; \
8650+ } } while(0)
8651+
8652+/*
8653+ * evaluate expression @e only when within reiser4_context and debugging is
8654+ * on.
8655+ */
8656+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8657+
8658+/*
8659+ * complain about unexpected function result and crash. Used in "default"
8660+ * branches of switch statements and alike to assert that invalid results are
8661+ * not silently ignored.
8662+ */
8663+#define wrong_return_value( label, function ) \
8664+ impossible( label, "wrong return value from " function )
8665+
8666+/* Issue different types of reiser4 messages to the console */
8667+#define warning( label, format, ... ) \
8668+ DCALL( KERN_WARNING, \
8669+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8670+#define notice( label, format, ... ) \
8671+ DCALL( KERN_NOTICE, \
8672+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8673+
8674+/* mark not yet implemented functionality */
8675+#define not_yet( label, format, ... ) \
8676+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8677+
8678+extern void reiser4_do_panic(const char *format, ...)
8679+ __attribute__ ((noreturn, format(printf, 1, 2)));
8680+
8681+extern int reiser4_preempt_point(void);
8682+extern void reiser4_print_stats(void);
8683+
8684+#if REISER4_DEBUG
8685+extern int reiser4_no_counters_are_held(void);
8686+extern int reiser4_commit_check_locks(void);
8687+#else
8688+#define reiser4_no_counters_are_held() (1)
8689+#define reiser4_commit_check_locks() (1)
8690+#endif
8691+
8692+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8693+#define IS_POW(i) \
8694+({ \
8695+ typeof(i) __i; \
8696+ \
8697+ __i = (i); \
8698+ !(__i & (__i - 1)); \
8699+})
8700+
8701+#define KERNEL_DEBUGGER (1)
8702+
8703+#if KERNEL_DEBUGGER
8704+
8705+extern void reiser4_debugtrap(void);
8706+
8707+/*
8708+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8709+ * kgdb is not compiled in, do nothing.
8710+ */
8711+#define DEBUGON(cond) \
8712+({ \
8713+ if (unlikely(cond)) \
8714+ reiser4_debugtrap(); \
8715+})
8716+#else
8717+#define DEBUGON(cond) noop
8718+#endif
8719+
8720+/*
8721+ * Error code tracing facility. (Idea is borrowed from XFS code.)
8722+ *
8723+ * Suppose some strange and/or unexpected error code is returned from some
8724+ * function (for example, write(2) returns -EEXIST). It is possible to place a
8725+ * breakpoint in reiser4_write(), but by then it is too late. How do we find
8726+ * out where -EEXIST was generated first?
8727+ *
8728+ * In reiser4 all places where actual error codes are produced (that is,
8729+ * statements of the form
8730+ *
8731+ * return -EFOO; // (1), or
8732+ *
8733+ * result = -EFOO; // (2)
8734+ *
8735+ * are replaced with
8736+ *
8737+ * return RETERR(-EFOO); // (1a), and
8738+ *
8739+ * result = RETERR(-EFOO); // (2a) respectively
8740+ *
8741+ * The RETERR() macro records the error site in reiser4_context. This site is
8742+ * printed in error and warning messages. Moreover, it's possible to put a
8743+ * conditional breakpoint in reiser4_return_err (low-level function called
8744+ * by RETERR() to do the actual work) to break into debugger immediately
8745+ * when particular error happens.
8746+ *
8747+ */
8748+
8749+#if REISER4_DEBUG
8750+
8751+/*
8752+ * data-type to store information about where error happened ("error site").
8753+ */
8754+typedef struct err_site {
8755+ int code; /* error code */
8756+ const char *file; /* source file, filled by __FILE__ */
8757+ int line; /* source file line, filled by __LINE__ */
8758+} err_site;
8759+
8760+extern void reiser4_return_err(int code, const char *file, int line);
8761+
8762+/*
8763+ * fill &get_current_context()->err_site with error information.
8764+ */
8765+#define RETERR(code) \
8766+({ \
8767+ typeof(code) __code; \
8768+ \
8769+ __code = (code); \
8770+ reiser4_return_err(__code, __FILE__, __LINE__); \
8771+ __code; \
8772+})
8773+
8774+#else
8775+
8776+/*
8777+ * no-op versions of the above
8778+ */
8779+
8780+typedef struct err_site {
8781+} err_site;
8782+#define RETERR(code) code
8783+#endif
8784+
8785+#if REISER4_LARGE_KEY
8786+/*
8787+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8788+ */
8789+#define ON_LARGE_KEY(...) __VA_ARGS__
8790+#else
8791+#define ON_LARGE_KEY(...)
8792+#endif
8793+
8794+/* __FS_REISER4_DEBUG_H__ */
8795+#endif
8796+
8797+/* Make Linus happy.
8798+ Local variables:
8799+ c-indentation-style: "K&R"
8800+ mode-name: "LC"
8801+ c-basic-offset: 8
8802+ tab-width: 8
8803+ fill-column: 120
8804+ End:
8805+*/
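The cassert() macro above is worth a standalone illustration: a false compile-time condition produces two identical case labels, which the compiler rejects, so broken size assumptions never survive to run time. A minimal sketch, assuming a GCC-compatible compiler (the macro uses a statement expression); the demo code is illustrative:

#include <stdio.h>

#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })

int main(void)
{
	cassert(sizeof(long long) == 8);	/* true: case 1, case 0 - ok */
	/* cassert(sizeof(char) == 2); */	/* false: duplicate case 0,
						   compilation would fail */
	printf("compile-time checks passed\n");
	return 0;
}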
8806diff -urN linux-2.6.20.orig/fs/reiser4/dformat.h linux-2.6.20/fs/reiser4/dformat.h
8807--- linux-2.6.20.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8808+++ linux-2.6.20/fs/reiser4/dformat.h 2007-05-06 14:50:43.702976975 +0400
8809@@ -0,0 +1,70 @@
8810+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8811+
8812+/* Formats of on-disk data and conversion functions. */
8813+
8814+/* put all item formats in the files describing the particular items,
8815+ our model is, everything you need to do to add an item to reiser4,
8816+ (excepting the changes to the plugin that uses the item which go
8817+ into the file defining that plugin), you put into one file. */
8818+/* Data on disk are stored in little-endian format.
8819+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
8820+ d??tocpu() and cputod??() to convert. */
8821+
8822+#if !defined( __FS_REISER4_DFORMAT_H__ )
8823+#define __FS_REISER4_DFORMAT_H__
8824+
8825+#include <asm/byteorder.h>
8826+#include <asm/unaligned.h>
8827+#include <linux/types.h>
8828+
8829+typedef __u8 d8;
8830+typedef __le16 d16;
8831+typedef __le32 d32;
8832+typedef __le64 d64;
8833+
8834+#define PACKED __attribute__((packed))
8835+
8836+/* data-type for block number */
8837+typedef __u64 reiser4_block_nr;
8838+
8839+/* data-type for block number on disk, disk format */
8840+typedef __le64 reiser4_dblock_nr;
8841+
8842+/**
8843+ * disk_addr_eq - compare disk addresses
8844+ * @b1: pointer to block number ot compare
8845+ * @b2: pointer to block number ot compare
8846+ *
8847+ * Returns true if if disk addresses are the same
8848+ */
8849+static inline int disk_addr_eq(const reiser4_block_nr *b1,
8850+ const reiser4_block_nr * b2)
8851+{
8852+ assert("nikita-1033", b1 != NULL);
8853+ assert("nikita-1266", b2 != NULL);
8854+
8855+ return !memcmp(b1, b2, sizeof *b1);
8856+}
8857+
8858+/* structure of master reiser4 super block */
8859+typedef struct reiser4_master_sb {
8860+ char magic[16]; /* "ReIsEr4" */
8861+ __le16 disk_plugin_id; /* id of disk layout plugin */
8862+ __le16 blocksize;
8863+ char uuid[16]; /* unique id */
8864+ char label[16]; /* filesystem label */
8865+ __le64 diskmap; /* location of the diskmap. 0 if not present */
8866+} reiser4_master_sb;
8867+
8868+/* __FS_REISER4_DFORMAT_H__ */
8869+#endif
8870+
8871+/*
8872+ * Local variables:
8873+ * c-indentation-style: "K&R"
8874+ * mode-name: "LC"
8875+ * c-basic-offset: 8
8876+ * tab-width: 8
8877+ * fill-column: 79
8878+ * End:
8879+ */
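Because the d16/d32/d64 fields above are little-endian on disk, every access goes through an endianness conversion regardless of the host CPU. A user-space sketch of decoding two reiser4_master_sb fields, assuming glibc's <endian.h>; master_sb_demo is a deliberately truncated, illustrative version of the structure:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <endian.h>

struct master_sb_demo {
	char magic[16];			/* "ReIsEr4" */
	uint16_t disk_plugin_id;	/* little-endian on disk */
	uint16_t blocksize;		/* little-endian on disk */
} __attribute__((packed));

int main(void)
{
	/* 20 raw bytes as they would appear on disk */
	unsigned char raw[20] = "ReIsEr4";
	struct master_sb_demo sb;

	raw[16] = 0x01; raw[17] = 0x00;	/* plugin id 1 */
	raw[18] = 0x00; raw[19] = 0x10;	/* blocksize 0x1000 == 4096 */

	memcpy(&sb, raw, sizeof sb);
	printf("magic %.7s, plugin %u, blocksize %u\n", sb.magic,
	       (unsigned)le16toh(sb.disk_plugin_id),
	       (unsigned)le16toh(sb.blocksize));
	return 0;
}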
8880diff -urN linux-2.6.20.orig/fs/reiser4/dscale.c linux-2.6.20/fs/reiser4/dscale.c
8881--- linux-2.6.20.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8882+++ linux-2.6.20/fs/reiser4/dscale.c 2007-05-06 14:50:43.702976975 +0400
8883@@ -0,0 +1,174 @@
8884+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8885+ * reiser4/README */
8886+
8887+/* Scalable on-disk integers */
8888+
8889+/*
8890+ * Various on-disk structures contain integer-like structures. Stat-data
8891+ * contain [yes, "data" is plural, check the dictionary] file size, link
8892+ * count; extent unit contains extent width etc. To accommodate the general
8893+ * case, enough space is reserved to keep the largest possible value: 64 bits
8894+ * in all cases above. But in the overwhelming majority of cases numbers actually
8895+ * stored in these fields will be comparatively small and reserving 8 bytes is
8896+ * a waste of precious disk bandwidth.
8897+ *
8898+ * Scalable integers are one way to solve this problem. dscale_write()
8899+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8900+ * depending on the magnitude of the value supplied. dscale_read() reads value
8901+ * previously stored by dscale_write().
8902+ *
8903+ * dscale_write() produces a format not completely unlike UTF: the two highest
8904+ * bits of the first byte are used to store "tag". One of 4 possible tag
8905+ * values is chosen depending on the number being encoded:
8906+ *
8907+ * 0 ... 0x3f => 0 [table 1]
8908+ * 0x40 ... 0x3fff => 1
8909+ * 0x4000 ... 0x3fffffff => 2
8910+ * 0x40000000 ... 0xffffffffffffffff => 3
8911+ *
8912+ * (see dscale_range() function)
8913+ *
8914+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8915+ * to be stored, so in this case there is no place in the first byte to store
8916+ * tag. For such values tag is stored in an extra 9th byte.
8917+ *
8918+ * As _highest_ bits are used for the test (which is natural) scaled integers
8919+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8920+ * uses LITTLE-ENDIAN.
8921+ *
8922+ */
8923+
8924+#include "debug.h"
8925+#include "dscale.h"
8926+
8927+/* return tag of scaled integer stored at @address */
8928+static int gettag(const unsigned char *address)
8929+{
8930+ /* tag is stored in two highest bits */
8931+ return (*address) >> 6;
8932+}
8933+
8934+/* clear the tag embedded into @value */
8935+static void cleartag(__u64 * value, int tag)
8936+{
8937+ /*
8938+ * W-w-what ?!
8939+ *
8940+ * Actually, this is rather simple: @value passed here was read by
8941+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8942+ * zeroes. Tag is still stored in the highest (arithmetically)
8943+ * non-zero bits of @value, but relative position of tag within __u64
8944+ * depends on @tag.
8945+ *
8946+ * For example, if @tag is 0, it's stored in the 2 highest bits of the
8947+ * lowest byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8948+ *
8949+ * If tag is 1, it's stored in the two highest bits of the 2nd lowest byte,
8950+ * and its offset is (2 * 8) - 2 == 14 bits.
8951+ *
8952+ * See table 1 above for details.
8953+ *
8954+ * All these cases are captured by the formula:
8955+ */
8956+ *value &= ~(3 << (((1 << tag) << 3) - 2));
8957+ /*
8958+ * That is, clear two (3 == 0t11) bits at the offset
8959+ *
8960+ * 8 * (2 ^ tag) - 2,
8961+ *
8962+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
8963+ */
8964+}
8965+
8966+/* return tag for @value. See table 1 above for details. */
8967+static int dscale_range(__u64 value)
8968+{
8969+ if (value > 0x3fffffff)
8970+ return 3;
8971+ if (value > 0x3fff)
8972+ return 2;
8973+ if (value > 0x3f)
8974+ return 1;
8975+ return 0;
8976+}
8977+
8978+/* restore value stored at @address by dscale_write() and return number of
8979+ * bytes consumed */
8980+int dscale_read(unsigned char *address, __u64 * value)
8981+{
8982+ int tag;
8983+
8984+ /* read tag */
8985+ tag = gettag(address);
8986+ switch (tag) {
8987+ case 3:
8988+ /* In this case tag is stored in an extra byte, skip this byte
8989+ * and decode value stored in the next 8 bytes.*/
8990+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
8991+ /* worst case: 8 bytes for value itself plus one byte for
8992+ * tag. */
8993+ return 9;
8994+ case 0:
8995+ *value = get_unaligned(address);
8996+ break;
8997+ case 1:
8998+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
8999+ break;
9000+ case 2:
9001+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9002+ break;
9003+ default:
9004+ return RETERR(-EIO);
9005+ }
9006+ /* clear tag embedded into @value */
9007+ cleartag(value, tag);
9008+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
9009+ return 1 << tag;
9010+}
9011+
9012+/* store @value at @address and return number of bytes consumed */
9013+int dscale_write(unsigned char *address, __u64 value)
9014+{
9015+ int tag;
9016+ int shift;
9017+ __be64 v;
9018+ unsigned char *valarr;
9019+
9020+ tag = dscale_range(value);
9021+ v = __cpu_to_be64(value);
9022+ valarr = (unsigned char *)&v;
9023+ shift = (tag == 3) ? 1 : 0;
9024+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9025+ *address |= (tag << 6);
9026+ return shift + (1 << tag);
9027+}
9028+
9029+/* number of bytes required to store @value */
9030+int dscale_bytes(__u64 value)
9031+{
9032+ int bytes;
9033+
9034+ bytes = 1 << dscale_range(value);
9035+ if (bytes == 8)
9036+ ++bytes;
9037+ return bytes;
9038+}
9039+
9040+/* returns true if @value and @other require the same number of bytes to be
9041+ * stored. Used to detect when a data structure (like stat-data) has to be
9042+ * expanded or contracted. */
9043+int dscale_fit(__u64 value, __u64 other)
9044+{
9045+ return dscale_range(value) == dscale_range(other);
9046+}
9047+
9048+/* Make Linus happy.
9049+ Local variables:
9050+ c-indentation-style: "K&R"
9051+ mode-name: "LC"
9052+ c-basic-offset: 8
9053+ tab-width: 8
9054+ fill-column: 120
9055+ scroll-step: 1
9056+ End:
9057+*/
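A user-space round-trip makes the tag arithmetic above concrete. Encoding 0x1234 selects tag 1 (2 bytes), stores the value big-endian, and ORs the tag into the two highest bits, giving 0x52 0x34; decoding masks the tag back out. This sketch assumes glibc's <endian.h>, and all demo_* names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <endian.h>
#include <assert.h>

static int demo_range(uint64_t v)	/* table 1 above */
{
	return v > 0x3fffffff ? 3 : v > 0x3fff ? 2 : v > 0x3f ? 1 : 0;
}

static int demo_write(unsigned char *a, uint64_t v)
{
	int tag = demo_range(v);
	uint64_t be = htobe64(v);
	int shift = (tag == 3) ? 1 : 0;

	if (shift)
		a[0] = 0;		/* tag 3: value sits in bytes 1..8 */
	memcpy(a + shift, (unsigned char *)&be + 8 - (1 << tag), 1 << tag);
	a[0] |= tag << 6;		/* tag in the two highest bits */
	return shift + (1 << tag);
}

static int demo_read(const unsigned char *a, uint64_t *v)
{
	int tag = a[0] >> 6;
	uint64_t be = 0;

	if (tag == 3) {			/* skip the extra tag byte */
		memcpy(&be, a + 1, 8);
		*v = be64toh(be);
		return 9;
	}
	memcpy((unsigned char *)&be + 8 - (1 << tag), a, 1 << tag);
	*v = be64toh(be) & ~(3ULL << (((1 << tag) << 3) - 2));
	return 1 << tag;
}

int main(void)
{
	unsigned char buf[9];
	uint64_t out;
	int n = demo_write(buf, 0x1234);

	assert(n == 2 && buf[0] == 0x52 && buf[1] == 0x34);
	assert(demo_read(buf, &out) == 2 && out == 0x1234);
	printf("0x1234 -> %d bytes on disk and back\n", n);
	return 0;
}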
9058diff -urN linux-2.6.20.orig/fs/reiser4/dscale.h linux-2.6.20/fs/reiser4/dscale.h
9059--- linux-2.6.20.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9060+++ linux-2.6.20/fs/reiser4/dscale.h 2007-05-06 14:50:43.702976975 +0400
9061@@ -0,0 +1,27 @@
9062+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9063+ * reiser4/README */
9064+
9065+/* Scalable on-disk integers. See dscale.c for details. */
9066+
9067+#if !defined( __FS_REISER4_DSCALE_H__ )
9068+#define __FS_REISER4_DSCALE_H__
9069+
9070+#include "dformat.h"
9071+
9072+extern int dscale_read(unsigned char *address, __u64 * value);
9073+extern int dscale_write(unsigned char *address, __u64 value);
9074+extern int dscale_bytes(__u64 value);
9075+extern int dscale_fit(__u64 value, __u64 other);
9076+
9077+/* __FS_REISER4_DSCALE_H__ */
9078+#endif
9079+
9080+/* Make Linus happy.
9081+ Local variables:
9082+ c-indentation-style: "K&R"
9083+ mode-name: "LC"
9084+ c-basic-offset: 8
9085+ tab-width: 8
9086+ fill-column: 120
9087+ End:
9088+*/
9089diff -urN linux-2.6.20.orig/fs/reiser4/entd.c linux-2.6.20/fs/reiser4/entd.c
9090--- linux-2.6.20.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9091+++ linux-2.6.20/fs/reiser4/entd.c 2007-05-06 14:50:43.702976975 +0400
9092@@ -0,0 +1,335 @@
9093+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9094+ * reiser4/README */
9095+
9096+/* Ent daemon. */
9097+
9098+#include "debug.h"
9099+#include "txnmgr.h"
9100+#include "tree.h"
9101+#include "entd.h"
9102+#include "super.h"
9103+#include "context.h"
9104+#include "reiser4.h"
9105+#include "vfs_ops.h"
9106+#include "page_cache.h"
9107+#include "inode.h"
9108+
9109+#include <linux/sched.h> /* struct task_struct */
9110+#include <linux/suspend.h>
9111+#include <linux/kernel.h>
9112+#include <linux/writeback.h>
9113+#include <linux/time.h> /* INITIAL_JIFFIES */
9114+#include <linux/backing-dev.h> /* bdi_write_congested */
9115+#include <linux/wait.h>
9116+#include <linux/kthread.h>
9117+#include <linux/freezer.h>
9118+
9119+#define DEF_PRIORITY 12
9120+#define MAX_ENTD_ITERS 10
9121+
9122+static void entd_flush(struct super_block *, struct wbq *);
9123+static int entd(void *arg);
9124+
9125+/*
9126+ * set ->comm field of ent thread to make its state visible to the user level
9127+ */
9128+#define entd_set_comm(state) \
9129+ snprintf(current->comm, sizeof(current->comm), \
9130+ "ent:%s%s", super->s_id, (state))
9131+
9132+/**
9133+ * reiser4_init_entd - initialize entd context and start kernel daemon
9134+ * @super: super block to start ent thread for
9135+ *
9136+ * Creates entd context, starts the kernel thread and waits until it
9137+ * initializes.
9138+ */
9139+int reiser4_init_entd(struct super_block *super)
9140+{
9141+ entd_context *ctx;
9142+
9143+ assert("nikita-3104", super != NULL);
9144+
9145+ ctx = get_entd_context(super);
9146+
9147+ memset(ctx, 0, sizeof *ctx);
9148+ spin_lock_init(&ctx->guard);
9149+ init_waitqueue_head(&ctx->wait);
9150+#if REISER4_DEBUG
9151+ INIT_LIST_HEAD(&ctx->flushers_list);
9152+#endif
9153+ /* lists of writepage requests */
9154+ INIT_LIST_HEAD(&ctx->todo_list);
9155+ INIT_LIST_HEAD(&ctx->done_list);
9156+ /* start entd */
9157+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9158+ if (IS_ERR(ctx->tsk))
9159+ return PTR_ERR(ctx->tsk);
9160+ return 0;
9161+}
9162+
9163+static void put_wbq(struct wbq *rq)
9164+{
9165+ iput(rq->mapping->host);
9166+ complete(&rq->completion);
9167+}
9168+
9169+/* ent should be locked */
9170+static struct wbq *__get_wbq(entd_context * ent)
9171+{
9172+ struct wbq *wbq;
9173+
9174+ if (list_empty(&ent->todo_list))
9175+ return NULL;
9176+
9177+ ent->nr_todo_reqs --;
9178+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
9179+ list_del_init(&wbq->link);
9180+ return wbq;
9181+}
9182+
9183+/* ent thread function */
9184+static int entd(void *arg)
9185+{
9186+ struct super_block *super;
9187+ entd_context *ent;
9188+ int done = 0;
9189+
9190+ super = arg;
9191+ /* do_fork() just copies task_struct into the new
9192+ thread. ->fs_context shouldn't be copied of course. This shouldn't
9193+ be a problem for the rest of the code though.
9194+ */
9195+ current->journal_info = NULL;
9196+
9197+ ent = get_entd_context(super);
9198+
9199+ while (!done) {
9200+ try_to_freeze();
9201+
9202+ spin_lock(&ent->guard);
9203+ while (ent->nr_todo_reqs != 0) {
9204+ struct wbq *rq;
9205+
9206+ assert("", list_empty(&ent->done_list));
9207+
9208+ /* take request from the queue head */
9209+ rq = __get_wbq(ent);
9210+ assert("", rq != NULL);
9211+ ent->cur_request = rq;
9212+ spin_unlock(&ent->guard);
9213+
9214+ entd_set_comm("!");
9215+ entd_flush(super, rq);
9216+
9217+ put_wbq(rq);
9218+
9219+ /*
9220+ * wakeup all requestors and iput their inodes
9221+ */
9222+ spin_lock(&ent->guard);
9223+ while (!list_empty(&ent->done_list)) {
9224+ rq = list_entry(ent->done_list.next, struct wbq, link);
9225+ list_del_init(&rq->link);
9226+ ent->nr_done_reqs --;
9227+ spin_unlock(&ent->guard);
9228+ assert("", rq->written == 1);
9229+ put_wbq(rq);
9230+ spin_lock(&ent->guard);
9231+ }
9232+ }
9233+ spin_unlock(&ent->guard);
9234+
9235+ entd_set_comm(".");
9236+
9237+ {
9238+ DEFINE_WAIT(__wait);
9239+
9240+ do {
9241+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9242+ if (kthread_should_stop()) {
9243+ done = 1;
9244+ break;
9245+ }
9246+ if (ent->nr_todo_reqs != 0)
9247+ break;
9248+ schedule();
9249+ } while (0);
9250+ finish_wait(&ent->wait, &__wait);
9251+ }
9252+ }
9253+ BUG_ON(ent->nr_todo_reqs != 0);
9254+ return 0;
9255+}
9256+
9257+/**
9258+ * reiser4_done_entd - stop entd kernel thread
9259+ * @super: super block to stop ent thread for
9260+ *
9261+ * It is called on umount. Sends a stop signal to entd and waits until it
9262+ * handles it.
9263+ */
9264+void reiser4_done_entd(struct super_block *super)
9265+{
9266+ entd_context *ent;
9267+
9268+ assert("nikita-3103", super != NULL);
9269+
9270+ ent = get_entd_context(super);
9271+ assert("zam-1055", ent->tsk != NULL);
9272+ kthread_stop(ent->tsk);
9273+}
9274+
9275+/* called at the beginning of jnode_flush to register flusher thread with ent
9276+ * daemon */
9277+void reiser4_enter_flush(struct super_block *super)
9278+{
9279+ entd_context *ent;
9280+
9281+ assert("zam-1029", super != NULL);
9282+ ent = get_entd_context(super);
9283+
9284+ assert("zam-1030", ent != NULL);
9285+
9286+ spin_lock(&ent->guard);
9287+ ent->flushers++;
9288+#if REISER4_DEBUG
9289+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9290+#endif
9291+ spin_unlock(&ent->guard);
9292+}
9293+
9294+/* called at the end of jnode_flush */
9295+void reiser4_leave_flush(struct super_block *super)
9296+{
9297+ entd_context *ent;
9298+ int wake_up_ent;
9299+
9300+ assert("zam-1027", super != NULL);
9301+ ent = get_entd_context(super);
9302+
9303+ assert("zam-1028", ent != NULL);
9304+
9305+ spin_lock(&ent->guard);
9306+ ent->flushers--;
9307+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9308+#if REISER4_DEBUG
9309+ list_del_init(&get_current_context()->flushers_link);
9310+#endif
9311+ spin_unlock(&ent->guard);
9312+ if (wake_up_ent)
9313+ wake_up(&ent->wait);
9314+}
9315+
9316+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9317+
9318+static void entd_flush(struct super_block *super, struct wbq *rq)
9319+{
9320+ reiser4_context ctx;
9321+ int tmp;
9322+
9323+ init_stack_context(&ctx, super);
9324+ ctx.entd = 1;
9325+ ctx.gfp_mask = GFP_NOFS;
9326+
9327+ rq->wbc->range_start = page_offset(rq->page);
9328+ rq->wbc->range_end = rq->wbc->range_start +
9329+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9330+ tmp = rq->wbc->nr_to_write;
9331+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9332+
9333+ if (rq->wbc->nr_to_write > 0) {
9334+ rq->wbc->range_start = 0;
9335+ rq->wbc->range_end = LLONG_MAX;
9336+ generic_sync_sb_inodes(super, rq->wbc);
9337+ }
9338+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9339+ reiser4_writeout(super, rq->wbc);
9340+
9341+ context_set_commit_async(&ctx);
9342+ reiser4_exit_context(&ctx);
9343+}
9344+
9345+/**
9346+ * write_page_by_ent - ask entd thread to flush this page as part of slum
9347+ * @page: page to be written
9348+ * @wbc: writeback control passed to reiser4_writepage
9349+ *
9350+ * Creates a request, puts it on entd's list of requests, wakes up entd if
9351+ * necessary, and waits until entd completes the request.
9352+ */
9353+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9354+{
9355+ struct super_block *sb;
9356+ struct inode *inode;
9357+ entd_context *ent;
9358+ struct wbq rq;
9359+
9360+ assert("", PageLocked(page));
9361+ assert("", page->mapping != NULL);
9362+
9363+ sb = page->mapping->host->i_sb;
9364+ ent = get_entd_context(sb);
9365+ assert("", ent && ent->done == 0);
9366+
9367+ /*
9368+ * we are going to unlock page and ask ent thread to write the
9369+ * page. Re-dirty page before unlocking so that if ent thread fails to
9370+ * write it - it will remain dirty
9371+ */
9372+ reiser4_set_page_dirty_internal(page);
9373+
9374+ /*
9375+ * pin inode in memory, unlock page, entd_flush will iput. We cannot
9376+ * iput here because we cannot allow delete_inode to be called here
9377+ */
9378+ inode = igrab(page->mapping->host);
9379+ unlock_page(page);
9380+ if (inode == NULL)
9381+ /* inode is getting freed */
9382+ return 0;
9383+
9384+ /* init wbq */
9385+ INIT_LIST_HEAD(&rq.link);
9386+ rq.magic = WBQ_MAGIC;
9387+ rq.wbc = wbc;
9388+ rq.page = page;
9389+ rq.mapping = inode->i_mapping;
9390+ rq.node = NULL;
9391+ rq.written = 0;
9392+ init_completion(&rq.completion);
9393+
9394+ /* add request to entd's list of writepage requests */
9395+ spin_lock(&ent->guard);
9396+ ent->nr_todo_reqs++;
9397+ list_add_tail(&rq.link, &ent->todo_list);
9398+ if (ent->nr_todo_reqs == 1)
9399+ wake_up(&ent->wait);
9400+
9401+ spin_unlock(&ent->guard);
9402+
9403+ /* wait until entd finishes */
9404+ wait_for_completion(&rq.completion);
9405+
9406+ if (rq.written)
9407+ /* Eventually ENTD has written the page to disk. */
9408+ return 0;
9409+ return 0;
9410+}
9411+
9412+int wbq_available(void)
9413+{
9414+ struct super_block *sb = reiser4_get_current_sb();
9415+ entd_context *ent = get_entd_context(sb);
9416+ return ent->nr_todo_reqs;
9417+}
9418+
9419+/*
9420+ * Local variables:
9421+ * c-indentation-style: "K&R"
9422+ * mode-name: "LC"
9423+ * c-basic-offset: 8
9424+ * tab-width: 8
9425+ * fill-column: 79
9426+ * End:
9427+ */
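The entd loop above is a single-consumer work queue: requesters append a struct wbq under ent->guard, wake the thread, and block on a per-request completion until the flush is done. A minimal user-space analog of that handshake, assuming POSIX threads; all names are illustrative and the actual flush work is elided:

#include <pthread.h>
#include <stdio.h>

struct demo_req {
	struct demo_req *next;
	int done;
	pthread_cond_t completion;
};

static struct demo_req *todo;		/* analog of ent->todo_list */
static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static int stop;

static void *worker(void *arg)		/* analog of entd() */
{
	pthread_mutex_lock(&guard);
	for (;;) {
		while (todo) {		/* drain the request list */
			struct demo_req *rq = todo;
			todo = rq->next;
			/* the flush would run here */
			rq->done = 1;
			pthread_cond_signal(&rq->completion);
		}
		if (stop)
			break;
		pthread_cond_wait(&waitq, &guard);
	}
	pthread_mutex_unlock(&guard);
	return NULL;
}

int main(void)				/* analog of write_page_by_ent() */
{
	pthread_t tsk;
	struct demo_req rq = { NULL, 0, PTHREAD_COND_INITIALIZER };

	pthread_create(&tsk, NULL, worker, NULL);

	pthread_mutex_lock(&guard);
	rq.next = todo;			/* enqueue and wake the worker */
	todo = &rq;
	pthread_cond_signal(&waitq);
	while (!rq.done)		/* wait_for_completion() */
		pthread_cond_wait(&rq.completion, &guard);
	stop = 1;
	pthread_cond_signal(&waitq);
	pthread_mutex_unlock(&guard);

	pthread_join(tsk, NULL);
	printf("request completed\n");
	return 0;
}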
9428diff -urN linux-2.6.20.orig/fs/reiser4/entd.h linux-2.6.20/fs/reiser4/entd.h
9429--- linux-2.6.20.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9430+++ linux-2.6.20/fs/reiser4/entd.h 2007-05-06 14:50:43.706978224 +0400
9431@@ -0,0 +1,90 @@
9432+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9433+
9434+/* Ent daemon. */
9435+
9436+#ifndef __ENTD_H__
9437+#define __ENTD_H__
9438+
9439+#include "context.h"
9440+
9441+#include <linux/fs.h>
9442+#include <linux/completion.h>
9443+#include <linux/wait.h>
9444+#include <linux/spinlock.h>
9445+#include <linux/sched.h> /* for struct task_struct */
9446+
9447+#define WBQ_MAGIC 0x7876dc76
9448+
9449+/* write-back request. */
9450+struct wbq {
9451+ int magic;
9452+ struct list_head link; /* list head of this list is in entd context */
9453+ struct writeback_control *wbc;
9454+ struct page *page;
9455+ struct address_space *mapping;
9456+ struct completion completion;
9457+ jnode *node; /* set if ent thread captured requested page */
9458+ int written; /* set if ent thread wrote requested page */
9459+};
9460+
9461+/* ent-thread context. This is used to synchronize starting/stopping ent
9462+ * threads. */
9463+typedef struct entd_context {
9464+ /* wait queue that ent thread waits on for more work. It's
9465+ * signaled by write_page_by_ent(). */
9466+ wait_queue_head_t wait;
9467+ /* spinlock protecting other fields */
9468+ spinlock_t guard;
9469+ /* ent thread */
9470+ struct task_struct *tsk;
9471+ /* set to indicate that ent thread should leave. */
9472+ int done;
9473+ /* counter of active flushers */
9474+ int flushers;
9475+ /*
9476+ * when reiser4_writepage asks entd to write a page - it adds struct
9477+ * wbq to this list
9478+ */
9479+ struct list_head todo_list;
9480+ /* number of elements on the above list */
9481+ int nr_todo_reqs;
9482+
9483+ struct wbq *cur_request;
9484+ /*
9485+ * when entd writes a page it moves write-back request from todo_list
9486+ * to done_list. This list is used at the end of entd iteration to
9487+ * wakeup requestors and iput inodes.
9488+ */
9489+ struct list_head done_list;
9490+ /* number of elements on the above list */
9491+ int nr_done_reqs;
9492+
9493+#if REISER4_DEBUG
9494+ /* list of all active flushers */
9495+ struct list_head flushers_list;
9496+#endif
9497+} entd_context;
9498+
9499+extern int reiser4_init_entd(struct super_block *);
9500+extern void reiser4_done_entd(struct super_block *);
9501+
9502+extern void reiser4_enter_flush(struct super_block *);
9503+extern void reiser4_leave_flush(struct super_block *);
9504+
9505+extern int write_page_by_ent(struct page *, struct writeback_control *);
9506+extern int wbq_available(void);
9507+extern void ent_writes_page(struct super_block *, struct page *);
9508+
9509+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9510+/* __ENTD_H__ */
9511+#endif
9512+
9513+/* Make Linus happy.
9514+ Local variables:
9515+ c-indentation-style: "K&R"
9516+ mode-name: "LC"
9517+ c-basic-offset: 8
9518+ tab-width: 8
9519+ fill-column: 120
9520+ End:
9521+*/
9522diff -urN linux-2.6.20.orig/fs/reiser4/eottl.c linux-2.6.20/fs/reiser4/eottl.c
9523--- linux-2.6.20.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9524+++ linux-2.6.20/fs/reiser4/eottl.c 2007-05-06 14:50:43.706978224 +0400
9525@@ -0,0 +1,509 @@
9526+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9527+
9528+#include "forward.h"
9529+#include "debug.h"
9530+#include "key.h"
9531+#include "coord.h"
9532+#include "plugin/item/item.h"
9533+#include "plugin/node/node.h"
9534+#include "znode.h"
9535+#include "block_alloc.h"
9536+#include "tree_walk.h"
9537+#include "tree_mod.h"
9538+#include "carry.h"
9539+#include "tree.h"
9540+#include "super.h"
9541+
9542+#include <linux/types.h> /* for __u?? */
9543+
9544+/*
9545+ * Extents on the twig level (EOTTL) handling.
9546+ *
9547+ * EOTTL poses some problems to the tree traversal, that are better explained
9548+ * by example.
9549+ *
9550+ * Suppose we have block B1 on the twig level with the following items:
9551+ *
9552+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9553+ * offset)
9554+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9555+ * 2. internal item I2 with key (10:0:0:0)
9556+ *
9557+ * We are trying to insert an item with key (5:0:0:0). Lookup finds node B1,
9558+ * and then intra-node lookup is done. This lookup finishes on E1, because the
9559+ * key we are looking for is larger than the key of E1 and is smaller than the
9560+ * key of I2.
9561+ *
9562+ * Here search is stuck.
9563+ *
9564+ * After some thought it is clear what is wrong here: extents on the twig level
9565+ * break a basic property of the *search* tree (on the pretext that they
9566+ * restore the property of the balanced tree).
9567+ *
9568+ * Said property is the following: if in the internal node of the search tree
9569+ * we have [ ... Key1 Pointer Key2 ... ] then all data that are or will be
9570+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9571+ * through the Pointer.
9572+ *
9573+ * This is not true when Pointer is an Extent-Pointer, simply because an extent
9574+ * cannot expand indefinitely to the right to include any item with
9575+ *
9576+ * Key1 <= Key <= Key2.
9577+ *
9578+ * For example, our E1 extent is only responsible for the data with keys
9579+ *
9580+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9581+ *
9582+ * so, key range
9583+ *
9584+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9585+ *
9586+ * is orphaned: there is no way to get there from the tree root.
9587+ *
9588+ * In other words, extent pointers are different from normal child pointers as
9589+ * far as search tree is concerned, and this creates such problems.
9590+ *
9591+ * A possible solution for this problem is to insert our item into the node
9592+ * pointed to by I2. There are some problems though:
9593+ *
9594+ * (1) I2 can be in a different node.
9595+ * (2) E1 can be immediately followed by another extent E2.
9596+ *
9597+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9598+ * for locks/coords as necessary.
9599+ *
9600+ * (2) is more complex. The solution here is to insert a new empty leaf node
9601+ * and an internal item between E1 and E2 pointing to said leaf node. This is
9602+ * further complicated by the possibility that E2 is in a different node, etc.
9603+ *
9604+ * Problems:
9605+ *
9606+ * (1) if there was an internal item I2 immediately on the right of an extent
9607+ * E1 and we decided to insert a new item S1 into node N2 pointed to by I2,
9608+ * then the key of S1 will be less than the smallest key in N2. Normally, the
9609+ * search checks that the key we are looking for is in the range of keys
9610+ * covered by the node it is being looked up in. To work around this situation
9611+ * while preserving the useful consistency check, a new flag CBK_TRUST_DK was
9612+ * added to the cbk flags bitmask. This flag is automatically set on entrance
9613+ * to coord_by_key() and is only cleared when we are about to enter the
9614+ * situation described above.
9615+ *
9616+ * (2) If extent E1 is immediately followed by another extent E2 and we are
9617+ * searching for a key that is between E1 and E2, we only have to insert a new
9618+ * empty leaf node when coord_by_key was called for insertion, rather than just
9619+ * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added
9620+ * to the cbk flags bitmask. This flag is automatically set by coord_by_key
9621+ * calls performed by insert_by_key() and friends.
9622+ *
9623+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9624+ * case it requires modification of node content which is only possible under
9625+ * write lock. It may well happen that we only have read lock on the node where
9626+ * new internal pointer is to be inserted (common case: lookup of non-existent
9627+ * stat-data that fells between two extents). If only read lock is held, tree
9628+ * stat-data that falls between two extents). If only a read lock is held, tree
9629+ * this problem, write lock will be held. Once we have write lock, balancing
9630+ * will be performed.
9631+ */
9632+
9633+/**
9634+ * is_next_item_internal - check whether next item is internal
9635+ * @coord: coordinate of extent item in twig node
9636+ * @key: search key
9637+ * @lh: twig node lock handle
9638+ *
9639+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9640+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9641+ * to that node, @coord is set to its first unit. If next item is not internal
9642+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9643+ * is returned if search restart has to be done.
9644+ */
9645+static int
9646+is_next_item_internal(coord_t *coord, const reiser4_key *key,
9647+ lock_handle *lh)
9648+{
9649+ coord_t next;
9650+ lock_handle rn;
9651+ int result;
9652+
9653+ coord_dup(&next, coord);
9654+ if (coord_next_unit(&next) == 0) {
9655+ /* next unit is in this node */
9656+ if (item_is_internal(&next)) {
9657+ coord_dup(coord, &next);
9658+ return 1;
9659+ }
9660+ assert("vs-3", item_is_extent(&next));
9661+ return 0;
9662+ }
9663+
9664+ /*
9665+ * next unit either does not exist or is in right neighbor. If it is in
9666+ * right neighbor we have to check right delimiting key because
9667+ * right neighbor we have to check the right delimiting key because a
9668+ * concurrent thread could get there first and insert an item with a key
9669+ */
9670+ read_lock_dk(current_tree);
9671+ result = keycmp(key, znode_get_rd_key(coord->node));
9672+ read_unlock_dk(current_tree);
9673+ assert("vs-6", result != EQUAL_TO);
9674+ if (result == GREATER_THAN)
9675+ return 2;
9676+
9677+ /* lock right neighbor */
9678+ init_lh(&rn);
9679+ result = reiser4_get_right_neighbor(&rn, coord->node,
9680+ znode_is_wlocked(coord->node) ?
9681+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9682+ GN_CAN_USE_UPPER_LEVELS);
9683+ if (result == -E_NO_NEIGHBOR) {
9684+ /* we are on the rightmost edge of the tree */
9685+ done_lh(&rn);
9686+ return 0;
9687+ }
9688+
9689+ if (result) {
9690+ assert("vs-4", result < 0);
9691+ done_lh(&rn);
9692+ return result;
9693+ }
9694+
9695+ /*
9696+ * check whether a concurrent thread managed to insert an item with a key
9697+ * smaller than @key
9698+ */
9699+ read_lock_dk(current_tree);
9700+ result = keycmp(key, znode_get_ld_key(rn.node));
9701+ read_unlock_dk(current_tree);
9702+ assert("vs-6", result != EQUAL_TO);
9703+ if (result == GREATER_THAN) {
9704+ done_lh(&rn);
9705+ return 2;
9706+ }
9707+
9708+ result = zload(rn.node);
9709+ if (result) {
9710+ assert("vs-5", result < 0);
9711+ done_lh(&rn);
9712+ return result;
9713+ }
9714+
9715+ coord_init_first_unit(&next, rn.node);
9716+ if (item_is_internal(&next)) {
9717+ /*
9718+ * next unit is in right neighbor and it is a unit of an internal
9719+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
9720+ * is set to the first unit of right neighbor.
9721+ */
9722+ coord_dup(coord, &next);
9723+ zrelse(rn.node);
9724+ done_lh(lh);
9725+ move_lh(lh, &rn);
9726+ return 1;
9727+ }
9728+
9729+ /*
9730+ * next unit is a unit of an extent item. Return without changing @lh and
9731+ * @coord.
9732+ */
9733+ assert("vs-6", item_is_extent(&next));
9734+ zrelse(rn.node);
9735+ done_lh(&rn);
9736+ return 0;
9737+}
9738+
9739+/**
9740+ * rd_key - calculate key of an item next to the given one
9741+ * @coord: position in a node
9742+ * @key: storage for result key
9743+ *
9744+ * @coord is set between items or after the last item in a node. Calculate key
9745+ * of item to the right of @coord.
9746+ */
9747+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9748+{
9749+ coord_t dup;
9750+
9751+ assert("nikita-2281", coord_is_between_items(coord));
9752+ coord_dup(&dup, coord);
9753+
9754+ if (coord_set_to_right(&dup) == 0)
9755+ /* next item is in this node. Return its key. */
9756+ unit_key_by_coord(&dup, key);
9757+ else {
9758+ /*
9759+ * next item either does not exist or is in right
9760+ * neighbor. Return znode's right delimiting key.
9761+ */
9762+ read_lock_dk(current_tree);
9763+ *key = *znode_get_rd_key(coord->node);
9764+ read_unlock_dk(current_tree);
9765+ }
9766+ return key;
9767+}
9768+
9769+/**
9770+ * add_empty_leaf - insert empty leaf between two extents
9771+ * @insert_coord: position in twig node between two extents
9772+ * @lh: twig node lock handle
9773+ * @key: left delimiting key of new node
9774+ * @rdkey: right delimiting key of new node
9775+ *
9776+ * Inserts empty leaf node between two extent items. It is necessary when we
9777+ * have to insert an item on leaf level between two extents (items on the twig
9778+ * level).
9779+ */
9780+static int
9781+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9782+ const reiser4_key *key, const reiser4_key *rdkey)
9783+{
9784+ int result;
9785+ carry_pool *pool;
9786+ carry_level *todo;
9787+ reiser4_item_data *item;
9788+ carry_insert_data *cdata;
9789+ carry_op *op;
9790+ znode *node;
9791+ reiser4_tree *tree;
9792+
9793+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9794+ tree = znode_get_tree(insert_coord->node);
9795+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9796+ if (IS_ERR(node))
9797+ return PTR_ERR(node);
9798+
9799+ /* setup delimiting keys for node being inserted */
9800+ write_lock_dk(tree);
9801+ znode_set_ld_key(node, key);
9802+ znode_set_rd_key(node, rdkey);
9803+ ON_DEBUG(node->creator = current);
9804+ ON_DEBUG(node->first_key = *key);
9805+ write_unlock_dk(tree);
9806+
9807+ ZF_SET(node, JNODE_ORPHAN);
9808+
9809+ /*
9810+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9811+ * carry_insert_data
9812+ */
9813+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9814+ sizeof(*item) + sizeof(*cdata));
9815+ if (IS_ERR(pool))
9816+ return PTR_ERR(pool);
9817+ todo = (carry_level *) (pool + 1);
9818+ init_carry_level(todo, pool);
9819+
9820+ item = (reiser4_item_data *) (todo + 3);
9821+ cdata = (carry_insert_data *) (item + 1);
9822+
9823+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9824+ if (!IS_ERR(op)) {
9825+ cdata->coord = insert_coord;
9826+ cdata->key = key;
9827+ cdata->data = item;
9828+ op->u.insert.d = cdata;
9829+ op->u.insert.type = COPT_ITEM_DATA;
9830+ build_child_ptr_data(node, item);
9831+ item->arg = NULL;
9832+ /* have @insert_coord to be set at inserted item after
9833+ insertion is done */
9834+ todo->track_type = CARRY_TRACK_CHANGE;
9835+ todo->tracked = lh;
9836+
9837+ result = reiser4_carry(todo, NULL);
9838+ if (result == 0) {
9839+ /*
9840+ * pin node in memory. This is necessary for
9841+ * znode_make_dirty() below.
9842+ */
9843+ result = zload(node);
9844+ if (result == 0) {
9845+ lock_handle local_lh;
9846+
9847+ /*
9848+ * if we inserted new child into tree we have
9849+ * to mark it dirty so that flush will be able
9850+ * to process it.
9851+ */
9852+ init_lh(&local_lh);
9853+ result = longterm_lock_znode(&local_lh, node,
9854+ ZNODE_WRITE_LOCK,
9855+ ZNODE_LOCK_LOPRI);
9856+ if (result == 0) {
9857+ znode_make_dirty(node);
9858+
9859+ /*
9860+ * when internal item pointing to @node
9861+ * was inserted into twig node
9862+ * create_hook_internal did not connect
9863+ * it properly because its right
9864+ * neighbor was not known. Do it
9865+ * here
9866+ */
9867+ write_lock_tree(tree);
9868+ assert("nikita-3312",
9869+ znode_is_right_connected(node));
9870+ assert("nikita-2984",
9871+ node->right == NULL);
9872+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9873+ write_unlock_tree(tree);
9874+ result =
9875+ connect_znode(insert_coord, node);
9876+ ON_DEBUG(if (result == 0) check_dkeys(node););
9877+
9878+ done_lh(lh);
9879+ move_lh(lh, &local_lh);
9880+ assert("vs-1676", node_is_empty(node));
9881+ coord_init_first_unit(insert_coord,
9882+ node);
9883+ } else {
9884+ warning("nikita-3136",
9885+ "Cannot lock child");
9886+ }
9887+ done_lh(&local_lh);
9888+ zrelse(node);
9889+ }
9890+ }
9891+ } else
9892+ result = PTR_ERR(op);
9893+ zput(node);
9894+ done_carry_pool(pool);
9895+ return result;
9896+}
9897+
9898+/**
9899+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9900+ * @h: search handle
9901+ * @outcome: flag saying whether search has to restart or is done
9902+ *
9903+ * Handles search on twig level. If this function completes search itself then
9904+ * it returns 1. If search has to go one level down then 0 is returned. If
9905+ * an error happens then LOOKUP_DONE is returned via @outcome and the error code is saved
9906+ * in @h->result.
9907+ */
9908+int handle_eottl(cbk_handle *h, int *outcome)
9909+{
9910+ int result;
9911+ reiser4_key key;
9912+ coord_t *coord;
9913+
9914+ coord = h->coord;
9915+
9916+ if (h->level != TWIG_LEVEL ||
9917+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
9918+ /* Continue to traverse tree downward. */
9919+ return 0;
9920+ }
9921+
9922+ /*
9923+ * make sure that @h->coord is set to twig node and that it is either
9924+ * set to extent item or after extent item
9925+ */
9926+ assert("vs-356", h->level == TWIG_LEVEL);
9927+ assert("vs-357", ( {
9928+ coord_t lcoord;
9929+ coord_dup(&lcoord, coord);
9930+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9931+ item_is_extent(&lcoord);
9932+ }
9933+ ));
9934+
9935+ if (*outcome == NS_FOUND) {
9936+ /* we have found desired key on twig level in extent item */
9937+ h->result = CBK_COORD_FOUND;
9938+ *outcome = LOOKUP_DONE;
9939+ return 1;
9940+ }
9941+
9942+ if (!(h->flags & CBK_FOR_INSERT)) {
9943+ /* tree traversal is not for insertion. Just return
9944+ CBK_COORD_NOTFOUND. */
9945+ h->result = CBK_COORD_NOTFOUND;
9946+ *outcome = LOOKUP_DONE;
9947+ return 1;
9948+ }
9949+
9950+ /* take a look at the item to the right of h->coord */
9951+ result = is_next_item_internal(coord, h->key, h->active_lh);
9952+ if (unlikely(result < 0)) {
9953+ h->error = "get_right_neighbor failed";
9954+ h->result = result;
9955+ *outcome = LOOKUP_DONE;
9956+ return 1;
9957+ }
9958+ if (result == 0) {
9959+ /*
9960+ * item to the right is also an extent one. Allocate a new node
9961+ * and insert a pointer to it after item h->coord.
9962+ *
9963+ * This is a result of extents being located at the twig
9964+ * level. For explanation, see comment just above
9965+ * is_next_item_internal().
9966+ */
9967+ znode *loaded;
9968+
9969+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
9970+ /*
9971+ * we got node read locked, restart coord_by_key to
9972+ * have write lock on twig level
9973+ */
9974+ h->lock_level = TWIG_LEVEL;
9975+ h->lock_mode = ZNODE_WRITE_LOCK;
9976+ *outcome = LOOKUP_REST;
9977+ return 1;
9978+ }
9979+
9980+ loaded = coord->node;
9981+ result =
9982+ add_empty_leaf(coord, h->active_lh, h->key,
9983+ rd_key(coord, &key));
9984+ if (result) {
9985+ h->error = "could not add empty leaf";
9986+ h->result = result;
9987+ *outcome = LOOKUP_DONE;
9988+ return 1;
9989+ }
9990+ /* added empty leaf is locked (h->active_lh), its parent node
9991+ is unlocked, h->coord is set as EMPTY */
9992+ assert("vs-13", coord->between == EMPTY_NODE);
9993+ assert("vs-14", znode_is_write_locked(coord->node));
9994+ assert("vs-15",
9995+ WITH_DATA(coord->node, node_is_empty(coord->node)));
9996+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
9997+ assert("vs-17", coord->node == h->active_lh->node);
9998+ *outcome = LOOKUP_DONE;
9999+ h->result = CBK_COORD_NOTFOUND;
10000+ return 1;
10001+ } else if (result == 1) {
10002+ /*
10003+ * this is special case mentioned in the comment on
10004+ * tree.h:cbk_flags. We have found internal item immediately on
10005+ * the right of extent, and we are going to insert new item
10006+ * there. Key of item we are going to insert is smaller than
10007+ * leftmost key in the node pointed to by said internal item
10008+ * (otherwise search wouldn't come to the extent in the first
10009+ * place).
10010+ *
10011+ * This is a result of extents being located at the twig
10012+ * level. For explanation, see comment just above
10013+ * is_next_item_internal().
10014+ */
10015+ h->flags &= ~CBK_TRUST_DK;
10016+ } else {
10017+ assert("vs-8", result == 2);
10018+ *outcome = LOOKUP_REST;
10019+ return 1;
10020+ }
10021+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10022+ return 0;
10023+}
10024+
10025+/*
10026+ * Local variables:
10027+ * c-indentation-style: "K&R"
10028+ * mode-name: "LC"
10029+ * c-basic-offset: 8
10030+ * tab-width: 8
10031+ * fill-column: 120
10032+ * scroll-step: 1
10033+ * End:
10034+ */
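To make the orphaned key range above concrete, here is a toy model using single-integer keys instead of reiser4's four-part keys; the entries and names are illustrative only. A bounded extent entry leaves a gap between the last key it can cover and the first key of the next item:

#include <stdio.h>

struct entry {
	unsigned long first_key;	/* key of the item */
	unsigned long last_key;		/* last key the item can cover */
	int internal;			/* 1: internal pointer, 0: extent */
};

int main(void)
{
	/* twig node B1 from the example above: I0, E1, I2 */
	struct entry twig[] = {
		{ 0,  ~0UL, 1 },	/* I0: covers up to the next key */
		{ 1,  4,    0 },	/* E1: extent over keys 1..4 only */
		{ 10, ~0UL, 1 },	/* I2 */
	};
	unsigned long key = 5;		/* the stuck lookup from above */
	int i, pos = 0;

	/* intra-node lookup: last entry with first_key <= key */
	for (i = 0; i < 3; i++)
		if (twig[i].first_key <= key)
			pos = i;

	if (!twig[pos].internal && key > twig[pos].last_key)
		printf("key %lu lands past extent entry %d: orphaned range\n",
		       key, pos);
	return 0;
}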
10035diff -urN linux-2.6.20.orig/fs/reiser4/estimate.c linux-2.6.20/fs/reiser4/estimate.c
10036--- linux-2.6.20.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10037+++ linux-2.6.20/fs/reiser4/estimate.c 2007-05-06 14:50:43.706978224 +0400
10038@@ -0,0 +1,111 @@
10039+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10040+
10041+#include "debug.h"
10042+#include "dformat.h"
10043+#include "tree.h"
10044+#include "carry.h"
10045+#include "inode.h"
10046+#include "plugin/cluster.h"
10047+#include "plugin/item/ctail.h"
10048+
10049+/* this returns how many nodes might get dirty and added if @children nodes are dirtied
10050+
10051+ The number of internal nodes which will get dirty or get allocated we estimate as about 10% of the children + 1
10052+ balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes +
10053+ the current (or 1 neighbour and 1 new and the current) on the twig level, 2 neighbour nodes on upper levels and 1
10054+ for a new root. So 5 for the leaf level, 3 for the twig level, 2 on upper levels + 1 for the root.
10055+
10056+ Do not count the current node of the lowest level here - this is overhead only.
10057+
10058+ @children is almost always 1 here. The exception is flow insertion.
10059+*/
10060+static reiser4_block_nr
10061+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10062+{
10063+ reiser4_block_nr ten_percent;
10064+
10065+	ten_percent = ((103 * children) >> 10);
10066+
10067+	/* If too many balancings happen at the same time, the tree height can rise
10068+	   by more than 1. Assume that a tree of height up to 5 can rise by 1 only. */
10069+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10070+}
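+
+/* A worked example (illustrative, not from the original source): for a tree of
+   height 4 and a single dirtied child, ten_percent = (103 * 1) >> 10 = 0, the
+   height is padded up to 5, so the overhead is 5 * 2 + (4 + 0) = 14 blocks, and
+   calc_estimate_one_insert() below yields 1 + 14 = 15 blocks. */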
10071+
10072+/* this returns the maximal possible number of nodes which can be modified, plus the number of new nodes which may be
10073+   required, to perform insertion of one item into the tree */
10074+/* it is only called when tree height changes, or gets initialized */
10075+reiser4_block_nr calc_estimate_one_insert(tree_level height)
10076+{
10077+ return 1 + max_balance_overhead(1, height);
10078+}
10079+
10080+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10081+{
10082+ return tree->estimate_one_insert;
10083+}
10084+
10085+/* this returns the maximal possible number of nodes which can be modified, plus the number of new nodes which may be
10086+   required, to perform insertion of one unit into an item in the tree */
10087+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10088+{
10089+ /* estimate insert into item just like item insertion */
10090+ return tree->estimate_one_insert;
10091+}
10092+
10093+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10094+{
10095+	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the
10096+	   leaf level */
10097+ return tree->estimate_one_insert;
10098+}
10099+
10100+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10101+   both its neighbors). max_balance_overhead() estimates the number of blocks which may change or get added on internal
10102+ levels */
10103+reiser4_block_nr estimate_insert_flow(tree_level height)
10104+{
10105+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10106+ CARRY_FLOW_NEW_NODES_LIMIT,
10107+ height);
10108+}
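+
+/* A hedged example: CARRY_FLOW_NEW_NODES_LIMIT is defined in carry.h, not in
+   this file; if it were 20 and the tree height 5, insert_flow could touch
+   3 + 20 = 23 leaf-level nodes, plus max_balance_overhead(23, 5) =
+   5 * 2 + (4 + ((103 * 23) >> 10)) = 16 internal blocks, i.e. 39 in total. */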
10109+
10110+/* returns the max number of nodes that can be occupied by a disk cluster */
10111+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10112+{
10113+ int per_cluster;
10114+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10115+ return 3 + per_cluster +
10116+ max_balance_overhead(3 + per_cluster,
10117+ REISER4_MAX_ZTREE_HEIGHT);
10118+}
10119+
10120+/* how many nodes might get dirty and added
10121+ during insertion of a disk cluster */
10122+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10123+{
10124+ return estimate_cluster(inode, 1); /* 24 */
10125+}
10126+
10127+/* how many nodes might get dirty and added
10128+ during update of a (prepped or unprepped) disk cluster */
10129+reiser4_block_nr estimate_update_cluster(struct inode * inode)
10130+{
10131+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10132+}
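+
+/* A sketch of the arithmetic behind the "44, for 64K-cluster" figure above,
+   assuming 4K pages and REISER4_MAX_ZTREE_HEIGHT == 10 (both defined
+   elsewhere): cluster_nrpages() == 16, so 3 + 16 = 19 nodes, plus
+   max_balance_overhead(19, 10) = 10 * 2 + (4 + ((103 * 19) >> 10)) = 25,
+   giving 19 + 25 = 44. */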
10133+
10134+/* how many nodes occupied by a disk cluster might get dirty */
10135+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10136+{
10137+ return cluster_nrpages(inode) + 4;
10138+}
10139+
10140+/* Make Linus happy.
10141+ Local variables:
10142+ c-indentation-style: "K&R"
10143+ mode-name: "LC"
10144+ c-basic-offset: 8
10145+ tab-width: 8
10146+ fill-column: 120
10147+ scroll-step: 1
10148+ End:
10149+*/
10150diff -urN linux-2.6.20.orig/fs/reiser4/export_ops.c linux-2.6.20/fs/reiser4/export_ops.c
10151--- linux-2.6.20.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10152+++ linux-2.6.20/fs/reiser4/export_ops.c 2007-05-06 14:50:43.706978224 +0400
10153@@ -0,0 +1,295 @@
10154+/* Copyright 2005 by Hans Reiser, licensing governed by
10155+ * reiser4/README */
10156+
10157+#include "inode.h"
10158+#include "plugin/plugin.h"
10159+
10160+/*
10161+ * Supported file-handle types
10162+ */
10163+typedef enum {
10164+ FH_WITH_PARENT = 0x10, /* file handle with parent */
10165+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10166+} reiser4_fhtype;
10167+
10168+#define NFSERROR (255)
10169+
10170+/* initialize place-holder for object */
10171+static void object_on_wire_init(reiser4_object_on_wire *o)
10172+{
10173+ o->plugin = NULL;
10174+}
10175+
10176+/* finish with @o */
10177+static void object_on_wire_done(reiser4_object_on_wire *o)
10178+{
10179+ if (o->plugin != NULL)
10180+ o->plugin->wire.done(o);
10181+}
10182+
10183+/*
10184+ * read serialized object identity from @addr and store information about
10185+ * object in @obj. This is dual to encode_inode().
10186+ */
10187+static char *decode_inode(struct super_block *s, char *addr,
10188+ reiser4_object_on_wire * obj)
10189+{
10190+ file_plugin *fplug;
10191+
10192+ /* identifier of object plugin is stored in the first two bytes,
10193+ * followed by... */
10194+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10195+ if (fplug != NULL) {
10196+ addr += sizeof(d16);
10197+ obj->plugin = fplug;
10198+ assert("nikita-3520", fplug->wire.read != NULL);
10199+ /* plugin specific encoding of object identity. */
10200+ addr = fplug->wire.read(addr, obj);
10201+ } else
10202+ addr = ERR_PTR(RETERR(-EINVAL));
10203+ return addr;
10204+}
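+
+/*
+ * For illustration (layout inferred from decode_inode()/encode_inode() in
+ * this file, not a definitive on-disk specification), a serialized identity
+ * looks like:
+ *
+ *   +-----------------------+------------------------------------------+
+ *   | d16 plugin id (2 b.)  | plugin-specific data (wire.write/.read)  |
+ *   +-----------------------+------------------------------------------+
+ *
+ * With FH_WITH_PARENT handles a second such record, for the parent, follows
+ * immediately after the first.
+ */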
10205+
10206+/**
10207+ * reiser4_decode_fh - decode_fh of export operations
10208+ * @super: super block
10209+ * @fh: nfsd file handle
10210+ * @len: length of file handle
10211+ * @fhtype: type of file handle
10212+ * @acceptable: acceptability testing function
10213+ * @context: argument for @acceptable
10214+ *
10215+ * Returns dentry referring to the same file as @fh.
10216+ */
10217+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10218+ int len, int fhtype,
10219+ int (*acceptable) (void *context,
10220+ struct dentry *de),
10221+ void *context)
10222+{
10223+ reiser4_context *ctx;
10224+ reiser4_object_on_wire object;
10225+ reiser4_object_on_wire parent;
10226+ char *addr;
10227+ int with_parent;
10228+
10229+ ctx = reiser4_init_context(super);
10230+ if (IS_ERR(ctx))
10231+ return (struct dentry *)ctx;
10232+
10233+ assert("vs-1482",
10234+ fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10235+
10236+ with_parent = (fhtype == FH_WITH_PARENT);
10237+
10238+ addr = (char *)fh;
10239+
10240+ object_on_wire_init(&object);
10241+ object_on_wire_init(&parent);
10242+
10243+ addr = decode_inode(super, addr, &object);
10244+ if (!IS_ERR(addr)) {
10245+ if (with_parent)
10246+ addr = decode_inode(super, addr, &parent);
10247+ if (!IS_ERR(addr)) {
10248+ struct dentry *d;
10249+ typeof(super->s_export_op->find_exported_dentry) fn;
10250+
10251+ fn = super->s_export_op->find_exported_dentry;
10252+ assert("nikita-3521", fn != NULL);
10253+ d = fn(super, &object, with_parent ? &parent : NULL,
10254+ acceptable, context);
10255+ if (d != NULL && !IS_ERR(d))
10256+ /* FIXME check for -ENOMEM */
10257+ reiser4_get_dentry_fsdata(d)->stateless = 1;
10258+ addr = (char *)d;
10259+ }
10260+ }
10261+
10262+ object_on_wire_done(&object);
10263+ object_on_wire_done(&parent);
10264+
10265+ reiser4_exit_context(ctx);
10266+ return (void *)addr;
10267+}
10268+
10269+/*
10270+ * Object serialization support.
10271+ *
10272+ * To support knfsd, the file system provides export_operations that are used to
10273+ * construct and interpret NFS file handles. As a generalization of this,
10274+ * reiser4 object plugins have serialization support: they provide methods to
10275+ * create an on-wire representation of the identity of a reiser4 object, and to
10276+ * re-create/locate an object given its on-wire identity.
10277+ *
10278+ */
10279+
10280+/*
10281+ * return the number of bytes that the on-wire representation of @inode's
10282+ * identity consumes.
10283+ */
10284+static int encode_inode_size(struct inode *inode)
10285+{
10286+ assert("nikita-3514", inode != NULL);
10287+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
10288+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10289+
10290+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10291+}
10292+
10293+/*
10294+ * store on-wire representation of @inode's identity at the area beginning at
10295+ * @start.
10296+ */
10297+static char *encode_inode(struct inode *inode, char *start)
10298+{
10299+ assert("nikita-3517", inode != NULL);
10300+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
10301+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10302+
10303+ /*
10304+ * first, store two-byte identifier of object plugin, then
10305+ */
10306+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10307+ (d16 *) start);
10308+ start += sizeof(d16);
10309+ /*
10310+ * call plugin to serialize object's identity
10311+ */
10312+ return inode_file_plugin(inode)->wire.write(inode, start);
10313+}
10314+
10315+/* this returns the file handle type and stores in @lenp the number of 32-bit
10316+ * words the handle occupies. NFSERROR (255) is returned if the file handle cannot be stored */
10317+/**
10318+ * reiser4_encode_fh - encode_fh of export operations
10319+ * @dentry: dentry of the object to serialize
10320+ * @fh: buffer the file handle is written to
10321+ * @lenp: on entry, capacity of @fh in 32-bit words; on exit, words used
10322+ * @need_parent: if non-zero, the parent's identity is encoded as well
10323+ *
10324+ */
10325+static int
10326+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10327+ int need_parent)
10328+{
10329+ struct inode *inode;
10330+ struct inode *parent;
10331+ char *addr;
10332+ int need;
10333+ int delta;
10334+ int result;
10335+ reiser4_context *ctx;
10336+
10337+ /*
10338+	 * knfsd asks us to serialize the object in @dentry and, optionally, its
10339+ * parent (if need_parent != 0).
10340+ *
10341+	 * encode_inode() and encode_inode_size() are used to build the
10342+	 * representation of the object and its parent. All hard work is done by
10343+ * object plugins.
10344+ */
10345+ inode = dentry->d_inode;
10346+ parent = dentry->d_parent->d_inode;
10347+
10348+ addr = (char *)fh;
10349+
10350+ need = encode_inode_size(inode);
10351+ if (need < 0)
10352+ return NFSERROR;
10353+ if (need_parent) {
10354+ delta = encode_inode_size(parent);
10355+ if (delta < 0)
10356+ return NFSERROR;
10357+ need += delta;
10358+ }
10359+
10360+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
10361+ if (IS_ERR(ctx))
10362+ return PTR_ERR(ctx);
10363+
10364+ if (need <= sizeof(__u32) * (*lenp)) {
10365+ addr = encode_inode(inode, addr);
10366+ if (need_parent)
10367+ addr = encode_inode(parent, addr);
10368+
10369+ /* store in lenp number of 32bit words required for file
10370+ * handle. */
10371+ *lenp = (need + sizeof(__u32) - 1) >> 2;
10372+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10373+ } else
10374+		/* not enough space in the file handle */
10375+ result = NFSERROR;
10376+ reiser4_exit_context(ctx);
10377+ return result;
10378+}
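+
+/*
+ * A worked example of the length math above (illustrative only): if the
+ * object's identity needs 18 bytes and the parent is not requested, the
+ * handle fits once the caller offers *lenp >= 5 words, and on success
+ * *lenp is set to (18 + 4 - 1) >> 2 = 5 32-bit words.
+ */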
10379+
10380+/**
10381+ * reiser4_get_dentry_parent - get_parent of export operations
10382+ * @child:
10383+ *
10384+ */
10385+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10386+{
10387+ struct inode *dir;
10388+ dir_plugin *dplug;
10389+
10390+ assert("nikita-3527", child != NULL);
10391+ /* see comment in reiser4_get_dentry() about following assertion */
10392+ assert("nikita-3528", is_in_reiser4_context());
10393+
10394+ dir = child->d_inode;
10395+ assert("nikita-3529", dir != NULL);
10396+ dplug = inode_dir_plugin(dir);
10397+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10398+ if (dplug != NULL)
10399+ return dplug->get_parent(dir);
10400+ else
10401+ return ERR_PTR(RETERR(-ENOTDIR));
10402+}
10403+
10404+/**
10405+ * reiser4_get_dentry - get_dentry of export operations
10406+ * @super:
10407+ * @data:
10408+ *
10409+ *
10410+ */
10411+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10412+{
10413+ reiser4_object_on_wire *o;
10414+
10415+ assert("nikita-3522", super != NULL);
10416+ assert("nikita-3523", data != NULL);
10417+ /*
10418+ * this is only supposed to be called by
10419+ *
10420+ * reiser4_decode_fh->find_exported_dentry
10421+ *
10422+ * so, reiser4_context should be here already.
10423+ */
10424+ assert("nikita-3526", is_in_reiser4_context());
10425+
10426+ o = (reiser4_object_on_wire *)data;
10427+ assert("nikita-3524", o->plugin != NULL);
10428+ assert("nikita-3525", o->plugin->wire.get != NULL);
10429+
10430+ return o->plugin->wire.get(super, o);
10431+}
10432+
10433+struct export_operations reiser4_export_operations = {
10434+ .encode_fh = reiser4_encode_fh,
10435+ .decode_fh = reiser4_decode_fh,
10436+ .get_parent = reiser4_get_dentry_parent,
10437+ .get_dentry = reiser4_get_dentry
10438+};
10439+
10440+/*
10441+ * Local variables:
10442+ * c-indentation-style: "K&R"
10443+ * mode-name: "LC"
10444+ * c-basic-offset: 8
10445+ * tab-width: 8
10446+ * fill-column: 79
10447+ * End:
10448+ */
10449diff -urN linux-2.6.20.orig/fs/reiser4/flush.c linux-2.6.20/fs/reiser4/flush.c
10450--- linux-2.6.20.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10451+++ linux-2.6.20/fs/reiser4/flush.c 2007-05-06 14:50:43.000000000 +0400
10452@@ -0,0 +1,3622 @@
10453+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10454+
10455+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10456+
10457+#include "forward.h"
10458+#include "debug.h"
10459+#include "dformat.h"
10460+#include "key.h"
10461+#include "coord.h"
10462+#include "plugin/item/item.h"
10463+#include "plugin/plugin.h"
10464+#include "plugin/object.h"
10465+#include "txnmgr.h"
10466+#include "jnode.h"
10467+#include "znode.h"
10468+#include "block_alloc.h"
10469+#include "tree_walk.h"
10470+#include "carry.h"
10471+#include "tree.h"
10472+#include "vfs_ops.h"
10473+#include "inode.h"
10474+#include "page_cache.h"
10475+#include "wander.h"
10476+#include "super.h"
10477+#include "entd.h"
10478+#include "reiser4.h"
10479+#include "flush.h"
10480+#include "writeout.h"
10481+
10482+#include <asm/atomic.h>
10483+#include <linux/fs.h> /* for struct super_block */
10484+#include <linux/mm.h> /* for struct page */
10485+#include <linux/bio.h> /* for struct bio */
10486+#include <linux/pagemap.h>
10487+#include <linux/blkdev.h>
10488+
10489+/* IMPLEMENTATION NOTES */
10490+
10491+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10492+ order to the nodes of the tree in which the parent is placed before its children, which
10493+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10494+ describes the node that "came before in forward parent-first order". When we speak of a
10495+ "parent-first follower", it describes the node that "comes next in parent-first
10496+ order" (alternatively the node that "came before in reverse parent-first order").
10497+
10498+ The following pseudo-code prints the nodes of a tree in forward parent-first order:
10499+
10500+ void parent_first (node)
10501+ {
10502+ print_node (node);
10503+ if (node->level > leaf) {
10504+ for (i = 0; i < num_children; i += 1) {
10505+ parent_first (node->child[i]);
10506+ }
10507+ }
10508+ }
10509+*/
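+
+/* A small concrete illustration (not in the original text): for a root R with
+   children A and B, where A has leaf children a1 and a2, forward parent-first
+   order is: R, A, a1, a2, B. The parent-first preceder of a1 is A; the
+   parent-first follower of a2 is B. */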
10510+
10511+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10512+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10513+ can be accomplished with sequential reads, which results in reading nodes in their
10514+ parent-first order. This is a read-optimization aspect of the flush algorithm, and
10515+ there is also a write-optimization aspect, which is that we wish to make large
10516+ sequential writes to the disk by allocating or reallocating blocks so that they can be
10517+ written in sequence. Sometimes the read-optimization and write-optimization goals
10518+ conflict with each other, as we discuss in more detail below.
10519+*/
10520+
10521+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10522+   the relevant jnode->state bits and their relevance to flush:
10523+
10524+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10525+ must be allocated first. In order to be considered allocated, the jnode must have
10526+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10527+ all dirtied jnodes eventually have one of these bits set during each transaction.
10528+
10529+ JNODE_CREATED: The node was freshly created in its transaction and has no previous
10530+ block address, so it is unconditionally assigned to be relocated, although this is
10531+ mainly for code-convenience. It is not being 'relocated' from anything, but in
10532+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10533+ remains set even after JNODE_RELOC is set, so the actual relocate can be
10534+ distinguished from the created-and-allocated set easily: relocate-set members
10535+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10536+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10537+
10538+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10539+ decision to maintain the pre-existing location for this node and it will be written
10540+ to the wandered-log.
10541+
10542+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10543+ not created, see note above). A block with JNODE_RELOC set is eligible for
10544+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10545+ bit is set on a znode, the parent node's internal item is modified and the znode is
10546+ rehashed.
10547+
10548+   JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10549+   and calls the plugin->f.squeeze() method for its items. By this technique we update the
10550+   disk clusters of cryptcompress objects. Also, if the leftmost point found by the flush
10551+   scan has this flag (it races with write(); a rare case), the flush algorithm decides
10552+   to pass it to squalloc() in spite of its flushprepped status, for squeezing, not for
10553+   repeated allocation.
10554+
10555+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10556+ flush queue. This means the jnode is not on any clean or dirty list, instead it is
10557+ moved to one of the flush queue (see flush_queue.h) object private list. This
10558+ prevents multiple concurrent flushes from attempting to start flushing from the
10559+ same node.
10560+
10561+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10562+ squeeze-and-allocate on a node while its children are actively being squeezed and
10563+ allocated. This flag was created to avoid submitting a write request for a node
10564+ while its children are still being allocated and squeezed. Then flush queue was
10565+   re-implemented to allow an unlimited number of nodes to be queued. This flag support was
10566+ commented out in source code because we decided that there was no reason to submit
10567+ queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10568+ during a slum traversal and may submit "busy nodes" to disk. Probably we can
10569+ re-enable the JNODE_FLUSH_BUSY bit support in future.
10570+
10571+ With these state bits, we describe a test used frequently in the code below,
10572+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10573+ test for "flushprepped" returns true if any of the following are true:
10574+
10575+ - The node is not dirty
10576+ - The node has JNODE_RELOC set
10577+ - The node has JNODE_OVRWR set
10578+
10579+ If either the node is not dirty or it has already been processed by flush (and assigned
10580+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10581+   false then flush still has work to do on that node.
10582+*/
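+
+/* In other words, the flushprepped test amounts to the following (a sketch
+   paraphrasing jnode_is_flushprepped(), shown here only for clarity):
+
+   static inline int is_flushprepped_sketch(jnode * node)
+   {
+	return !JF_ISSET(node, JNODE_DIRTY) ||
+	       JF_ISSET(node, JNODE_RELOC) ||
+	       JF_ISSET(node, JNODE_OVRWR);
+   }
+*/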
10583+
10584+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10585+ flushprepped twice (unless an explicit call to flush_unprep is made as described in
10586+ detail below). For example a node is dirtied, allocated, and then early-flushed to
10587+ disk and set clean. Before the transaction commits, the page is dirtied again and, due
10588+ to memory pressure, the node is flushed again. The flush algorithm will not relocate
10589+   the node to a new disk location; it will simply write it to the same, previously
10590+ relocated position again.
10591+*/
10592+
10593+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10594+ start at a leaf node and allocate in parent-first order by iterating to the right. At
10595+ each step of the iteration, we check for the right neighbor. Before advancing to the
10596+ right neighbor, we check if the current position and the right neighbor share the same
10597+ parent. If they do not share the same parent, the parent is allocated before the right
10598+ neighbor.
10599+
10600+   This process goes recursively up the tree and squeezes nodes level by level as long as
10601+ the right neighbor and the current position have different parents, then it allocates
10602+ the right-neighbors-with-different-parents on the way back down. This process is
10603+ described in more detail in flush_squalloc_changed_ancestor and the recursive function
10604+   squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
10605+   specifics of the bottom-up approach as to contrast the bottom-up and top-down
10606+ approaches.
10607+
10608+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10609+ approach, we find a starting point by scanning left along each level past dirty nodes,
10610+ then going up and repeating the process until the left node and the parent node are
10611+ clean. We then perform a parent-first traversal from the starting point, which makes
10612+ allocating in parent-first order trivial. After one subtree has been allocated in this
10613+ manner, we move to the right, try moving upward, then repeat the parent-first
10614+ traversal.
10615+
10616+ Both approaches have problems that need to be addressed. Both are approximately the
10617+ same amount of code, but the bottom-up approach has advantages in the order it acquires
10618+ locks which, at the very least, make it the better approach. At first glance each one
10619+ makes the other one look simpler, so it is important to remember a few of the problems
10620+ with each one.
10621+
10622+ Main problem with the top-down approach: When you encounter a clean child during the
10623+ parent-first traversal, what do you do? You would like to avoid searching through a
10624+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10625+ obvious solution. One of the advantages of the top-down approach is that during the
10626+ parent-first traversal you check every child of a parent to see if it is dirty. In
10627+ this way, the top-down approach easily handles the main problem of the bottom-up
10628+ approach: unallocated children.
10629+
10630+ The unallocated children problem is that before writing a node to disk we must make
10631+   sure that all of its children are allocated. Otherwise, writing the node means
10632+ extra I/O because the node will have to be written again when the child is finally
10633+ allocated.
10634+
10635+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10636+   should not cause any file system corruption; it only degrades I/O performance because a
10637+ node may be written when it is sure to be written at least one more time in the same
10638+ transaction when the remaining children are allocated. What follows is a description
10639+ of how we will solve the problem.
10640+*/
10641+
10642+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10643+ proceeding in parent first order, allocate some of its left-children, then encounter a
10644+ clean child in the middle of the parent. We do not allocate the clean child, but there
10645+ may remain unallocated (dirty) children to the right of the clean child. If we were to
10646+ stop flushing at this moment and write everything to disk, the parent might still
10647+ contain unallocated children.
10648+
10649+   We could try to allocate all the descendants of every node that we allocate, but this
10650+ is not necessary. Doing so could result in allocating the entire tree: if the root
10651+ node is allocated then every unallocated node would have to be allocated before
10652+ flushing. Actually, we do not have to write a node just because we allocate it. It is
10653+ possible to allocate but not write a node during flush, when it still has unallocated
10654+ children. However, this approach is probably not optimal for the following reason.
10655+
10656+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10657+ to optimize reads that occur in the same order. Thus we are read-optimizing for a
10658+ left-to-right scan through all the leaves in the system, and we are hoping to
10659+ write-optimize at the same time because those nodes will be written together in batch.
10660+ What happens, however, if we assign a block number to a node in its read-optimized
10661+ order but then avoid writing it because it has unallocated children? In that
10662+ situation, we lose out on the write-optimization aspect because a node will have to be
10663+   written again to its location on the device, later, which likely means seeking back
10664+ to that location.
10665+
10666+ So there are tradeoffs. We can choose either:
10667+
10668+ A. Allocate all unallocated children to preserve both write-optimization and
10669+ read-optimization, but this is not always desirable because it may mean having to
10670+ allocate and flush very many nodes at once.
10671+
10672+ B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10673+ but sacrifice write-optimization because those nodes will be written again.
10674+
10675+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10676+ locations. Instead, choose to write-optimize them later, when they are written. To
10677+ facilitate this, we "undo" the read-optimized allocation that was given to the node so
10678+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10679+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10680+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10681+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10682+ location, and set the JNODE_CREATED bit, effectively setting the node back to an
10683+ unallocated state.
10684+
10685+ We will take the following approach in v4.0: for twig nodes we will always finish
10686+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10687+ writing and choose write-optimization (C).
10688+
10689+ To summarize, there are several parts to a solution that avoids the problem with
10690+ unallocated children:
10691+
10692+   FIXME-ZAM: None of these approaches has been implemented yet to eliminate the
10693+   "UNALLOCATED CHILDREN" problem, because an experiment showed that we have only 1-2
10694+   nodes with unallocated children for thousands of written nodes. The experiment was
10695+   simple, e.g. copying / deleting the linux kernel sources. However, the problem can
10696+   arise in more complex tests. I think we can use jnode_io_hook to insert a check for
10697+   unallocated children and see what kind of problem we have.
10698+
10699+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10700+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10701+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10702+ comments in that function.
10703+
10704+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10705+ have unallocated children. If the twig level has unallocated children it is an
10706+ assertion failure. If a higher-level node has unallocated children, then it should be
10707+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10708+ should be simple.
10709+
10710+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10711+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10712+ this somewhat in the case where large sub-trees are flushed. The following observation
10713+ helps: if both the left- and right-neighbor of a node are processed by the flush
10714+ algorithm then the node itself is guaranteed to have all of its children allocated.
10715+ However, the cost of this check may not be so expensive after all: it is not needed for
10716+ leaves and flush can guarantee this property for twigs. That leaves only (level >
10717+ TWIG) nodes that have to be checked, so this optimization only helps if at least three
10718+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10719+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10720+ then the number of blocks being written will be very large, so the savings may be
10721+ insignificant. That said, the idea is to maintain both the left and right edges of
10722+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10723+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10724+ edge, the slow check is necessary, but if it is in the interior then it can be assumed
10725+ to have all of its children allocated. FIXME: medium complexity to implement, but
10726+ simple to verify given that we must have a slow check anyway.
10727+
10728+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10729+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10730+ left-scan operation to take unallocated children into account. Normally, the left-scan
10731+ operation goes left as long as adjacent nodes are dirty up until some large maximum
10732+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10733+ may stop at a position where there are unallocated children to the left with the same
10734+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10735+   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10736+ with a rapid scan. The rapid scan skips all the interior children of a node--if the
10737+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10738+ twig to the left). If the left neighbor of the leftmost child is also dirty, then
10739+ continue the scan at the left twig and repeat. This option will cause flush to
10740+ allocate more twigs in a single pass, but it also has the potential to write many more
10741+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10742+ was partially implemented, code removed August 12, 2002 by JMACD.
10743+*/
10744+
10745+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10746+ starting point for flush is a leaf node, but actually the flush code cares very little
10747+ about whether or not this is true. It is possible that all the leaf nodes are flushed
10748+ and dirty parent nodes still remain, in which case jnode_flush() is called on a
10749+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10750+ leaf, even when it is not. This is a simple approach, and there may be a more optimal
10751+ policy but until a problem with this approach is discovered, simplest is probably best.
10752+
10753+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10754+ the leaves. This is done as a matter of simplicity and there is only one (shaky)
10755+ justification. When an atom commits, it flushes all leaf level nodes first, followed
10756+ by twigs, and so on. With flushing done in this order, if flush is eventually called
10757+ on a non-leaf node it means that (somehow) we reached a point where all leaves are
10758+   clean and only internal nodes need to be flushed. If that is the case, then it means
10759+ there were no leaves that were the parent-first preceder/follower of the parent. This
10760+ is expected to be a rare case, which is why we do nothing special about it. However,
10761+ memory pressure may pass an internal node to flush when there are still dirty leaf
10762+ nodes that need to be flushed, which could prove our original assumptions
10763+ "inoperative". If this needs to be fixed, then scan_left/right should have
10764+ special checks for the non-leaf levels. For example, instead of passing from a node to
10765+ the left neighbor, it should pass from the node to the left neighbor's rightmost
10766+ descendent (if dirty).
10767+
10768+*/
10769+
10770+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10771+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10772+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10773+ device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10774+ device becomes sorted such that tree order and block number order fully correlate.
10775+
10776+ Resizing is done by shifting everything either all the way to the left or all the way
10777+ to the right, and then reporting the last block.
10778+*/
10779+
10780+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10781+   describes the policy from the highest level:
10782+
10783+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10784+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10785+ leaf nodes.
10786+
10787+ Otherwise, there are two contexts in which we make a decision to relocate:
10788+
10789+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10790+ During the initial stages of flush, after scan-right completes, we want to ask the
10791+ question: should we relocate this leaf node and thus dirty the parent node. Then if
10792+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
10793+ the question at the next level up, and so on. In these cases we are moving in the
10794+ reverse-parent first direction.
10795+
10796+ There is another case which is considered the reverse direction, which comes at the end
10797+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10798+ reach a point where there is a clean twig to the right with a dirty leftmost child. In
10799+ this case, we may wish to relocate the child by testing if it should be relocated
10800+ relative to its parent.
10801+
10802+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10803+ allocate_znode. What distinguishes the forward parent-first case from the
10804+ reverse-parent first case is that the preceder has already been allocated in the
10805+ forward case, whereas in the reverse case we don't know what the preceder is until we
10806+ finish "going in reverse". That simplifies the forward case considerably, and there we
10807+ actually use the block allocator to determine whether, e.g., a block closer to the
10808+ preceder is available.
10809+*/
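+
+/* The threshold part of this policy, as it appears (simplified) in
+   jnode_flush() below:
+
+   pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
+	(left_scan->count + right_scan->count >=
+	 sbinfo->flush.relocate_threshold);
+*/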
10810+
10811+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10812+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10813+ squeeze the parent's left neighbor and the parent. This may change the
10814+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10815+ is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10816+ Note that we cannot allocate extents during this or they will be out of parent-first
10817+   order. There are also some difficult coordinate maintenance issues. We can't do a tree
10818+ search to find coordinates again (because we hold locks), we have to determine them
10819+ from the two nodes being squeezed. Looks difficult, but has potential to increase
10820+ space utilization. */
10821+
10822+/* Flush-scan helper functions. */
10823+static void scan_init(flush_scan * scan);
10824+static void scan_done(flush_scan * scan);
10825+
10826+/* Flush-scan algorithm. */
10827+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10828+ unsigned limit);
10829+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10830+static int scan_common(flush_scan * scan, flush_scan * other);
10831+static int scan_formatted(flush_scan * scan);
10832+static int scan_unformatted(flush_scan * scan, flush_scan * other);
10833+static int scan_by_coord(flush_scan * scan);
10834+
10835+/* Initial flush-point ancestor allocation. */
10836+static int alloc_pos_and_ancestors(flush_pos_t * pos);
10837+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10838+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10839+
10840+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10841+static int squalloc(flush_pos_t * pos);
10842+
10843+/* Flush squeeze implementation. */
10844+static int squeeze_right_non_twig(znode * left, znode * right);
10845+static int shift_one_internal_unit(znode * left, znode * right);
10846+
10847+/* Flush reverse parent-first relocation routines. */
10848+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10849+ const reiser4_block_nr * nblk);
10850+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10851+ flush_pos_t * pos);
10852+static int reverse_relocate_check_dirty_parent(jnode * node,
10853+ const coord_t * parent_coord,
10854+ flush_pos_t * pos);
10855+
10856+/* Flush allocate write-queueing functions: */
10857+static int allocate_znode(znode * node, const coord_t * parent_coord,
10858+ flush_pos_t * pos);
10859+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10860+ flush_pos_t * pos);
10861+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10862+
10863+/* Flush helper functions: */
10864+static int jnode_lock_parent_coord(jnode * node,
10865+ coord_t * coord,
10866+ lock_handle * parent_lh,
10867+ load_count * parent_zh,
10868+ znode_lock_mode mode, int try);
10869+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10870+ znode_lock_mode mode, int check_dirty);
10871+static int znode_same_parents(znode * a, znode * b);
10872+
10873+static int znode_check_flushprepped(znode * node)
10874+{
10875+ return jnode_check_flushprepped(ZJNODE(node));
10876+}
10877+
10878+/* Flush position functions */
10879+static void pos_init(flush_pos_t * pos);
10880+static int pos_valid(flush_pos_t * pos);
10881+static void pos_done(flush_pos_t * pos);
10882+static int pos_stop(flush_pos_t * pos);
10883+
10884+/* check that @org is first jnode extent unit, if extent is unallocated,
10885+ * because all jnodes of unallocated extent are dirty and of the same atom. */
10886+#define checkchild(scan) \
10887+assert("nikita-3435", \
10888+ ergo(scan->direction == LEFT_SIDE && \
10889+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
10890+ jnode_is_unformatted(scan->node) && \
10891+ extent_is_unallocated(&scan->parent_coord), \
10892+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10893+
10894+/* This flush_cnt variable is used to track the number of concurrent flush operations,
10895+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10896+ no static initializer function...) */
10897+ON_DEBUG(atomic_t flush_cnt;
10898+ )
10899+
10900+/* check fs backing device for write congestion */
10901+static int check_write_congestion(void)
10902+{
10903+ struct super_block *sb;
10904+ struct backing_dev_info *bdi;
10905+
10906+ sb = reiser4_get_current_sb();
10907+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
10908+ return bdi_write_congested(bdi);
10909+}
10910+
10911+/* conditionally write flush queue */
10912+static int write_prepped_nodes(flush_pos_t * pos)
10913+{
10914+ int ret;
10915+
10916+ assert("zam-831", pos);
10917+ assert("zam-832", pos->fq);
10918+
10919+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10920+ return 0;
10921+
10922+ if (check_write_congestion())
10923+ return 0;
10924+
10925+ ret = reiser4_write_fq(pos->fq, pos->nr_written,
10926+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10927+ return ret;
10928+}
10929+
10930+/* Properly release all flush position resources, then move the flush position
10931+   to the new locked node */
10932+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10933+ load_count * new_load, const coord_t * new_coord)
10934+{
10935+ assert("zam-857", new_lock->node == new_load->node);
10936+
10937+ if (new_coord) {
10938+ assert("zam-858", new_coord->node == new_lock->node);
10939+ coord_dup(&pos->coord, new_coord);
10940+ } else {
10941+ coord_init_first_unit(&pos->coord, new_lock->node);
10942+ }
10943+
10944+ if (pos->child) {
10945+ jput(pos->child);
10946+ pos->child = NULL;
10947+ }
10948+
10949+ move_load_count(&pos->load, new_load);
10950+ done_lh(&pos->lock);
10951+ move_lh(&pos->lock, new_lock);
10952+}
10953+
10954+/* delete an empty node whose link from the parent still exists. */
10955+static int delete_empty_node(znode * node)
10956+{
10957+ reiser4_key smallest_removed;
10958+
10959+ assert("zam-1019", node != NULL);
10960+ assert("zam-1020", node_is_empty(node));
10961+ assert("zam-1023", znode_is_wlocked(node));
10962+
10963+ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
10964+}
10965+
10966+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
10967+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
10968+{
10969+ int ret;
10970+ load_count load;
10971+ lock_handle lock;
10972+
10973+ init_lh(&lock);
10974+ init_load_count(&load);
10975+
10976+ if (jnode_is_znode(org)) {
10977+ ret = longterm_lock_znode(&lock, JZNODE(org),
10978+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
10979+ if (ret)
10980+ return ret;
10981+
10982+ ret = incr_load_count_znode(&load, JZNODE(org));
10983+ if (ret)
10984+ return ret;
10985+
10986+ pos->state =
10987+ (jnode_get_level(org) ==
10988+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
10989+ move_flush_pos(pos, &lock, &load, NULL);
10990+ } else {
10991+ coord_t parent_coord;
10992+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
10993+ &load, ZNODE_WRITE_LOCK, 0);
10994+ if (ret)
10995+ goto done;
10996+ if (!item_is_extent(&parent_coord)) {
10997+			/* the file was converted to tail format, @org became HB
10998+			   (heard banshee), and we found an internal item */
10999+ ret = -EAGAIN;
11000+ goto done;
11001+ }
11002+
11003+ pos->state = POS_ON_EPOINT;
11004+ move_flush_pos(pos, &lock, &load, &parent_coord);
11005+ pos->child = jref(org);
11006+ if (extent_is_unallocated(&parent_coord)
11007+ && extent_unit_index(&parent_coord) != index_jnode(org)) {
11008+ /* @org is not first child of its parent unit. This may happen
11009+			   because the long-term lock of its parent node was released between
11010+			   scan_left and scan_right. For now, work around this by having flush repeat */
11011+ ret = -EAGAIN;
11012+ }
11013+ }
11014+
11015+ done:
11016+ done_load_count(&load);
11017+ done_lh(&lock);
11018+ return ret;
11019+}
11020+
11021+/* TODO LIST (no particular order): */
11022+/* I have labelled most of the legitimate FIXME comments in this file with letters to
11023+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11024+ specific names mentioned instead that need to be inspected/resolved. */
11025+/* B. There is an issue described in reverse_relocate_test having to do with an
11026+ imprecise is_preceder? check having to do with partially-dirty extents. The code that
11027+ sets preceder hints and computes the preceder is basically untested. Careful testing
11028+ needs to be done that preceder calculations are done correctly, since if it doesn't
11029+ affect correctness we will not catch this stuff during regular testing. */
11030+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11031+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11032+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11033+ Many of the calls that may produce one of these return values (i.e.,
11034+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11035+ values themselves and, for instance, stop flushing instead of resulting in a restart.
11036+ If any of these results are true error conditions then flush will go into a busy-loop,
11037+ as we noticed during testing when a corrupt tree caused find_child_ptr to return
11038+ ENOENT. It needs careful thought and testing of corner conditions.
11039+*/
11040+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11041+ block is assigned a block number then early-flushed to disk. It is dirtied again and
11042+ flush is called again. Concurrently, that block is deleted, and the de-allocation of
11043+ its block number does not need to be deferred, since it is not part of the preserve set
11044+ (i.e., it didn't exist before the transaction). I think there may be a race condition
11045+ where flush writes the dirty, created block after the non-deferred deallocated block
11046+ number is re-allocated, making it possible to write deleted data on top of non-deleted
11047+   data. It's just a theory, but it needs to be thought out. */
11048+/* F. bio_alloc() failure is not handled gracefully. */
11049+/* G. Unallocated children. */
11050+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11051+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11052+
11053+/* JNODE_FLUSH: MAIN ENTRY POINT */
11054+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11055+ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
11056+   blocks to disk; it happens when the Linux VM decides to reduce the number of
11057+   dirty pages, or as a part of transaction commit.
11058+
11059+ Our objective here is to prep and flush the slum the jnode belongs to. We want to
11060+ squish the slum together, and allocate the nodes in it as we squish because allocation
11061+ of children affects squishing of parents.
11062+
11063+ The "argument" @node tells flush where to start. From there, flush finds the left edge
11064+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11065+ "better place" to start squalloc first we perform a flush_scan.
11066+
11067+ Flush-scanning may be performed in both left and right directions, but for different
11068+ purposes. When scanning to the left, we are searching for a node that precedes a
11069+ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11070+ During flush-scanning, we also take the opportunity to count the number of consecutive
11071+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11072+ make a decision to reallocate leaf nodes (thus favoring write-optimization).
11073+
11074+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11075+ also be dirty nodes to the right of the argument. If the scan-left operation does not
11076+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11077+ operation to see whether there is, in fact, enough nodes to meet the relocate
11078+ threshold. Each right- and left-scan operation uses a single flush_scan object.
11079+
11080+ After left-scan and possibly right-scan, we prepare a flush_position object with the
11081+ starting flush point or parent coordinate, which was determined using scan-left.
11082+
11083+ Next we call the main flush routine, squalloc, which iterates along the
11084+ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11085+
11086+ After squalloc returns we take extra steps to ensure that all the children
11087+ of the final twig node are allocated--this involves repeating squalloc
11088+ until we finish at a twig with no unallocated children.
11089+
11090+ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11091+ any above-twig nodes during flush_empty_queue that still have unallocated children, we
11092+ flush_unprep them.
11093+
11094+ Flush treats several "failure" cases as non-failures, essentially causing them to start
11095+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11096+ probably be handled properly rather than restarting, but there are a bunch of cases to
11097+ audit.
11098+*/
11099+
11100+static int
11101+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11102+ flush_queue_t * fq, int flags)
11103+{
11104+ long ret = 0;
11105+ flush_scan *right_scan;
11106+ flush_scan *left_scan;
11107+ flush_pos_t *flush_pos;
11108+ int todo;
11109+ struct super_block *sb;
11110+ reiser4_super_info_data *sbinfo;
11111+ jnode *leftmost_in_slum = NULL;
11112+
11113+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11114+ assert("nikita-3022", reiser4_schedulable());
11115+
11116+ assert("nikita-3185",
11117+ get_current_super_private()->delete_mutex_owner != current);
11118+
11119+ /* allocate right_scan, left_scan and flush_pos */
11120+ right_scan =
11121+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11122+ reiser4_ctx_gfp_mask_get());
11123+ if (right_scan == NULL)
11124+ return RETERR(-ENOMEM);
11125+ left_scan = right_scan + 1;
11126+ flush_pos = (flush_pos_t *) (left_scan + 1);
11127+
11128+ sb = reiser4_get_current_sb();
11129+ sbinfo = get_super_private(sb);
11130+
11131+ /* Flush-concurrency debug code */
11132+#if REISER4_DEBUG
11133+ atomic_inc(&flush_cnt);
11134+#endif
11135+
11136+ reiser4_enter_flush(sb);
11137+
11138+ /* Initialize a flush position. */
11139+ pos_init(flush_pos);
11140+
11141+ flush_pos->nr_written = nr_written;
11142+ flush_pos->fq = fq;
11143+ flush_pos->flags = flags;
11144+ flush_pos->nr_to_write = nr_to_write;
11145+
11146+ scan_init(right_scan);
11147+ scan_init(left_scan);
11148+
11149+ /* First scan left and remember the leftmost scan position. If the leftmost
11150+	   position is unformatted we remember its parent_coord. We scan until we have
11151+	   counted FLUSH_SCAN_MAXNODES nodes.
11152+
11153+ If starting @node is unformatted, at the beginning of left scan its
11154+ parent (twig level node, containing extent item) will be long term
11155+ locked and lock handle will be stored in the
11156+ @right_scan->parent_lock. This lock is used to start the rightward
11157+ scan without redoing the tree traversal (necessary to find parent)
11158+ and, hence, is kept during leftward scan. As a result, we have to
11159+ use try-lock when taking long term locks during the leftward scan.
11160+ */
11161+ ret = scan_left(left_scan, right_scan,
11162+ node, sbinfo->flush.scan_maxnodes);
11163+ if (ret != 0)
11164+ goto failed;
11165+
11166+ leftmost_in_slum = jref(left_scan->node);
11167+ scan_done(left_scan);
11168+
11169+ /* Then possibly go right to decide if we will use a policy of relocating leaves.
11170+ This is only done if we did not scan past (and count) enough nodes during the
11171+ leftward scan. If we do scan right, we only care to go far enough to establish
11172+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11173+ scan limit is the difference between left_scan.count and the threshold. */
11174+
11175+ todo = sbinfo->flush.relocate_threshold - left_scan->count;
11176+ /* scan right is inherently deadlock prone, because we are
11177+ * (potentially) holding a lock on the twig node at this moment.
11178+	 * FIXME: this comment is incorrect: the lock is not held */
11179+ if (todo > 0) {
11180+ ret = scan_right(right_scan, node, (unsigned)todo);
11181+ if (ret != 0)
11182+ goto failed;
11183+ }
11184+
11185+ /* Only the right-scan count is needed, release any rightward locks right away. */
11186+ scan_done(right_scan);
11187+
11188+ /* ... and the answer is: we should relocate leaf nodes if at least
11189+ FLUSH_RELOCATE_THRESHOLD nodes were found. */
11190+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11191+ (left_scan->count + right_scan->count >=
11192+ sbinfo->flush.relocate_threshold);
11193+
11194+	/* Funny business here.  We set the 'point' in the flush_position prior to
11195+ starting squalloc regardless of whether the first point is
11196+ formatted or unformatted. Without this there would be an invariant, in the
11197+ rest of the code, that if the flush_position is unformatted then
11198+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11199+ and if the flush_position is formatted then flush_position->point is non-NULL
11200+ and no parent info is set.
11201+
11202+ This seems lazy, but it makes the initial calls to reverse_relocate_test
11203+	   (which asks "is pos->point the leftmost child of its parent?") much easier
11204+ because we know the first child already. Nothing is broken by this, but the
11205+ reasoning is subtle. Holding an extra reference on a jnode during flush can
11206+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11207+ removed from sibling lists until they have zero reference count. Flush would
11208+ never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11209+ deleted to the right. So if nothing is broken, why fix it?
11210+
11211+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11212+ point and in any moment, because of the concurrent file system
11213+ activity (for example, truncate). */
11214+
11215+ /* Check jnode state after flush_scan completed. Having a lock on this
11216+ node or its parent (in case of unformatted) helps us in case of
11217+ concurrent flushing. */
11218+ if (jnode_check_flushprepped(leftmost_in_slum)
11219+ && !jnode_convertible(leftmost_in_slum)) {
11220+ ret = 0;
11221+ goto failed;
11222+ }
11223+
11224+ /* Now setup flush_pos using scan_left's endpoint. */
11225+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11226+ if (ret)
11227+ goto failed;
11228+
11229+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11230+ && node_is_empty(flush_pos->coord.node)) {
11231+ znode *empty = flush_pos->coord.node;
11232+
11233+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11234+ ret = delete_empty_node(empty);
11235+ goto failed;
11236+ }
11237+
11238+ if (jnode_check_flushprepped(leftmost_in_slum)
11239+ && !jnode_convertible(leftmost_in_slum)) {
11240+ ret = 0;
11241+ goto failed;
11242+ }
11243+
11244+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11245+ ret = alloc_pos_and_ancestors(flush_pos);
11246+ if (ret)
11247+ goto failed;
11248+
11249+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
11250+ ret = squalloc(flush_pos);
11251+ pos_stop(flush_pos);
11252+ if (ret)
11253+ goto failed;
11254+
11255+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11256+ First, the pos_stop() and pos_valid() routines should be modified
11257+ so that pos_stop() sets a flush_position->stop flag to 1 without
11258+ releasing the current position immediately--instead release it in
11259+ pos_done(). This is a better implementation than the current one anyway.
11260+
11261+ It is not clear whether all fields of the flush_position should be released,
11262+ but at the very least the parent_lock, parent_coord, and parent_load should
11263+ remain held because they hold the last twig when pos_stop() is
11264+ called.
11265+
11266+ When we reach this point in the code, if the parent_coord is set to after the
11267+ last item then we know that flush reached the end of a twig (and according to
11268+ the new flush queueing design, we will return now). If parent_coord is not
11269+ past the last item, we should check if the current twig has any unallocated
11270+ children to the right (we are not concerned with unallocated children to the
11271+ left--in that case the twig itself should not have been allocated). If the
11272+ twig has unallocated children to the right, set the parent_coord to that
11273+ position and then repeat the call to squalloc.
11274+
11275+ Testing for unallocated children may be defined in two ways: if any internal
11276+ item has a fake block number, it is unallocated; if any extent item is
11277+ unallocated then all of its children are unallocated. But there is a more
11278+ aggressive approach: if there are any dirty children of the twig to the right
11279+ of the current position, we may wish to relocate those nodes now. Checking for
11280+ potential relocation is more expensive as it requires knowing whether there are
11281+ any dirty children that are not unallocated. The extent_needs_allocation
11282+ should be used after setting the correct preceder.
11283+
11284+ When we reach the end of a twig at this point in the code, if the flush can
11285+ continue (when the queue is ready) it will need some information on the future
11286+ starting point. That should be stored away in the flush_handle using a seal, I
11287+ believe. Holding a jref() on the future starting point may break other code
11288+ that deletes that node.
11289+ */
11290+
11291+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11292+ above the twig level. If the VM calls flush above the twig level, do nothing
11293+ and return (but figure out why this happens). The txnmgr should be modified to
11294+ only flush its leaf-level dirty list. This will do all the necessary squeeze
11295+ and allocate steps but leave unallocated branches and possibly unallocated
11296+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11297+ level, the remaining unallocated nodes should be given write-optimized
11298+ locations. (Possibly, the remaining unallocated twigs should be allocated just
11299+ before their leftmost child.)
11300+ */
11301+
11302+ /* Any failure reaches this point. */
11303+ failed:
11304+
11305+ switch (ret) {
11306+ case -E_REPEAT:
11307+ case -EINVAL:
11308+ case -E_DEADLOCK:
11309+ case -E_NO_NEIGHBOR:
11310+ case -ENOENT:
11311+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11312+ in each case. They already are handled in many cases. */
11313+ /* Something bad happened, but difficult to avoid... Try again! */
11314+ ret = 0;
11315+ }
11316+
11317+ if (leftmost_in_slum)
11318+ jput(leftmost_in_slum);
11319+
11320+ pos_done(flush_pos);
11321+ scan_done(left_scan);
11322+ scan_done(right_scan);
11323+ kfree(right_scan);
11324+
11325+ ON_DEBUG(atomic_dec(&flush_cnt));
11326+
11327+ reiser4_leave_flush(sb);
11328+
11329+ return ret;
11330+}
11331+
11332+/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
11333+ * means that the flusher should submit all prepped nodes immediately, without
11334+ * keeping them in flush queues for a long time. The reason for rapid flush
11335+ * mode is to free memory as fast as possible. */
11336+
11337+#if REISER4_USE_RAPID_FLUSH
11338+
11339+/**
11340+ * submit all prepped nodes if rapid flush mode is set,
11341+ * turn rapid flush mode off.
11342+ */
11343+
11344+static int rapid_flush(flush_pos_t * pos)
11345+{
11346+ if (!wbq_available())
11347+ return 0;
11348+
11349+ return write_prepped_nodes(pos);
11350+}
11351+
11352+#else
11353+
11354+#define rapid_flush(pos) (0)
11355+
11356+#endif /* REISER4_USE_RAPID_FLUSH */
11357+
11358+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11359+ flush_queue_t *fq, int *nr_queued,
11360+ int flags)
11361+{
11362+ jnode * node;
11363+
11364+ if (start != NULL) {
11365+ spin_lock_jnode(start);
11366+ if (!jnode_is_flushprepped(start)) {
11367+ assert("zam-1056", start->atom == atom);
11368+ node = start;
11369+ goto enter;
11370+ }
11371+ spin_unlock_jnode(start);
11372+ }
11373+ /*
11374+ * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11375+ * and then dirtied again. The atom spin lock is not released until all dirty nodes
11376+ * have been processed or a not-yet-prepped node is found in the atom's dirty lists.
11377+ */
11378+ while ((node = find_first_dirty_jnode(atom, flags))) {
11379+ spin_lock_jnode(node);
11380+ enter:
11381+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11382+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11383+
11384+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
11385+ /* move node to the end of atom's writeback list */
11386+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11387+
11388+ /*
11389+ * the jnode is not necessarily on the dirty list: if it was dirtied while
11390+ * it was on the flush queue, it does not get moved to the dirty list
11391+ */
11392+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11393+ WB_LIST, 1));
11394+
11395+ } else if (jnode_is_znode(node)
11396+ && znode_above_root(JZNODE(node))) {
11397+ /*
11398+ * A special case for znode-above-root. The above-root (fake)
11399+ * znode is captured and dirtied when the tree height changes or
11400+ * when the root node is relocated. This causes atoms to fuse so
11401+ * that changes at the root are serialized. However, this node is
11402+ * never flushed. This special case used to be in lock.c to
11403+ * prevent the above-root node from ever being captured, but now
11404+ * that it is captured we simply prevent it from flushing. The
11405+ * log-writer code relies on this to properly log superblock
11406+ * modifications of the tree height.
11407+ */
11408+ jnode_make_wander_nolock(node);
11409+ } else if (JF_ISSET(node, JNODE_RELOC)) {
11410+ queue_jnode(fq, node);
11411+ ++(*nr_queued);
11412+ } else
11413+ break;
11414+
11415+ spin_unlock_jnode(node);
11416+ }
11417+ return node;
11418+}
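+
+/* To summarize the dispositions made by find_flush_start_jnode() above:
+ *
+ *	JNODE_WRITEBACK set  -> node moved to the tail of the atom's WB list;
+ *	znode above root     -> made wander (never flushed, see comment above);
+ *	JNODE_RELOC set      -> queued to @fq and *nr_queued incremented;
+ *	anything else        -> returned spin-locked as the flush starting point.
+ *
+ * A NULL return means the atom's dirty lists contained no such starting point.
+ */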
11419+
11420+/* Flush some nodes of the current atom, usually the slum. Return -E_REPEAT if there are
11421+ * more nodes to flush; return 0 (keeping the current atom locked) if the atom's dirty
11422+ * lists are empty; return other errors as they are. */
11423+int
11424+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11425+ txn_atom ** atom, jnode *start)
11426+{
11427+ reiser4_super_info_data *sinfo = get_current_super_private();
11428+ flush_queue_t *fq = NULL;
11429+ jnode *node;
11430+ int nr_queued;
11431+ int ret;
11432+
11433+ assert("zam-889", atom != NULL && *atom != NULL);
11434+ assert_spin_locked(&((*atom)->alock));
11435+ assert("zam-892", get_current_context()->trans->atom == *atom);
11436+
11437+ nr_to_write = LONG_MAX;
11438+ while (1) {
11439+ ret = reiser4_fq_by_atom(*atom, &fq);
11440+ if (ret != -E_REPEAT)
11441+ break;
11442+ *atom = get_current_atom_locked();
11443+ }
11444+ if (ret)
11445+ return ret;
11446+
11447+ assert_spin_locked(&((*atom)->alock));
11448+
11449+ /* parallel flushers limit */
11450+ if (sinfo->tmgr.atom_max_flushers != 0) {
11451+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11452+ /* A reiser4_atom_send_event() call is inside
11453+ reiser4_fq_put_nolock() which is called when flush is
11454+ finished and nr_flushers is decremented. */
11455+ reiser4_atom_wait_event(*atom);
11456+ *atom = get_current_atom_locked();
11457+ }
11458+ }
11459+
11460+ /* count ourself as a flusher */
11461+ (*atom)->nr_flushers++;
11462+
11463+ writeout_mode_enable();
11464+
11465+ nr_queued = 0;
11466+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11467+
11468+ if (node == NULL) {
11469+ if (nr_queued == 0) {
11470+ (*atom)->nr_flushers--;
11471+ reiser4_fq_put_nolock(fq);
11472+ reiser4_atom_send_event(*atom);
11473+ /* current atom remains locked */
11474+ writeout_mode_disable();
11475+ return 0;
11476+ }
11477+ spin_unlock_atom(*atom);
11478+ } else {
11479+ jref(node);
11480+ BUG_ON((*atom)->super != node->tree->super);
11481+ spin_unlock_atom(*atom);
11482+ spin_unlock_jnode(node);
11483+ BUG_ON(nr_to_write == 0);
11484+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11485+ jput(node);
11486+ }
11487+
11488+ ret =
11489+ reiser4_write_fq(fq, nr_submitted,
11490+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11491+
11492+ *atom = get_current_atom_locked();
11493+ (*atom)->nr_flushers--;
11494+ reiser4_fq_put_nolock(fq);
11495+ reiser4_atom_send_event(*atom);
11496+ spin_unlock_atom(*atom);
11497+
11498+ writeout_mode_disable();
11499+
11500+ if (ret == 0)
11501+ ret = -E_REPEAT;
11502+
11503+ return ret;
11504+}
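+
+/* A minimal usage sketch (hypothetical caller, not part of this patch): drive
+ * flush_current_atom() until the atom's dirty lists drain, following the
+ * -E_REPEAT convention documented above. It assumes the calling thread is in a
+ * reiser4 context whose transaction is attached to the atom; the flags value
+ * is illustrative. Note that on -E_REPEAT the atom comes back unlocked, while
+ * on a 0 return it is still spin-locked and the caller must unlock it.
+ *
+ *	long submitted = 0;
+ *	txn_atom *atom;
+ *	int ret;
+ *
+ *	do {
+ *		atom = get_current_atom_locked();
+ *		ret = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS, LONG_MAX,
+ *					 &submitted, &atom, NULL);
+ *	} while (ret == -E_REPEAT);
+ *	if (ret == 0)
+ *		spin_unlock_atom(atom);
+ */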
11505+
11506+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11507+
11508+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11509+ reverse parent-first relocate context. Here all we know is the preceder and the block
11510+ number. Since we are going in reverse, the preceder may still be relocated as well, so
11511+ we can't ask the block allocator "is there a closer block available to relocate?" here.
11512+ In the _forward_ parent-first relocate context (not here) we actually call the block
11513+ allocator to try and find a closer location. */
11514+static int
11515+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11516+ const reiser4_block_nr * nblk)
11517+{
11518+ reiser4_block_nr dist;
11519+
11520+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11521+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11522+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11523+
11524+ /* Distance is the absolute value. */
11525+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11526+
11527+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11528+ block, do not relocate. */
11529+ if (dist <= get_current_super_private()->flush.relocate_distance) {
11530+ return 0;
11531+ }
11532+
11533+ return 1;
11534+}
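+
+/* A worked example of the distance test above, assuming
+ * flush.relocate_distance is 64 (the FLUSH_RELOCATE_DISTANCE default; the
+ * block numbers are made up for illustration): a preceder at block 10060 and
+ * a node at block 10100 give dist = 40 <= 64, so we return 0 and leave the
+ * node in place; a node at block 10200 gives dist = 140 > 64, so we return 1
+ * and vote to relocate. */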
11535+
11536+/* This function is a predicate that tests for relocation. Always called in the
11537+ reverse-parent-first context, when we are asking whether the current node should be
11538+ relocated in order to expand the flush by dirtying the parent level (and thus
11539+ proceeding to flush that level). When traversing in the forward parent-first direction
11540+ (not here), relocation decisions are handled in two places: allocate_znode() and
11541+ extent_needs_allocation(). */
11542+static int
11543+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11544+ flush_pos_t * pos)
11545+{
11546+ reiser4_block_nr pblk = 0;
11547+ reiser4_block_nr nblk = 0;
11548+
11549+ assert("jmacd-8989", !jnode_is_root(node));
11550+
11551+ /*
11552+	 * This function is called only from
11553+	 * reverse_relocate_check_dirty_parent() and only if the parent
11554+	 * node is clean. This implies that the parent has a real (i.e., not
11555+	 * fake) block number, and so does the child, because otherwise the
11556+ * parent would be dirty.
11557+ */
11558+
11559+ /* New nodes are treated as if they are being relocated. */
11560+ if (JF_ISSET (node, JNODE_CREATED) ||
11561+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11562+ return 1;
11563+ }
11564+
11565+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11566+ existing node, the coord may be leftmost even though the child is not the
11567+ parent-first preceder of the parent. If the first dirty node appears somewhere
11568+ in the middle of the first extent unit, this preceder calculation is wrong.
11569+ Needs more logic in here. */
11570+ if (coord_is_leftmost_unit(parent_coord)) {
11571+ pblk = *znode_get_block(parent_coord->node);
11572+ } else {
11573+ pblk = pos->preceder.blk;
11574+ }
11575+ check_preceder(pblk);
11576+
11577+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11578+ if (pblk == 0) {
11579+ return 1;
11580+ }
11581+
11582+ nblk = *jnode_get_block(node);
11583+
11584+ if (reiser4_blocknr_is_fake(&nblk))
11585+ /* child is unallocated, mark parent dirty */
11586+ return 1;
11587+
11588+ return reverse_relocate_if_close_enough(&pblk, &nblk);
11589+}
11590+
11591+/* This function calls reverse_relocate_test to make a reverse-parent-first
11592+ relocation decision and then, if yes, it marks the parent dirty. */
11593+static int
11594+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11595+ flush_pos_t * pos)
11596+{
11597+ int ret;
11598+
11599+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11600+
11601+ ret = reverse_relocate_test(node, parent_coord, pos);
11602+ if (ret < 0) {
11603+ return ret;
11604+ }
11605+
11606+ /* FIXME-ZAM
11607+ if parent is already relocated - we do not want to grab space, right? */
11608+ if (ret == 1) {
11609+ int grabbed;
11610+
11611+ grabbed = get_current_context()->grabbed_blocks;
11612+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11613+ 0)
11614+ reiser4_panic("umka-1250",
11615+ "No space left during flush.");
11616+
11617+ assert("jmacd-18923",
11618+ znode_is_write_locked(parent_coord->node));
11619+ znode_make_dirty(parent_coord->node);
11620+ grabbed2free_mark(grabbed);
11621+ }
11622+ }
11623+
11624+ return 0;
11625+}
11626+
11627+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11628+ PARENT-FIRST LOOP BEGINS) */
11629+
11630+/* Get the leftmost child for given coord. */
11631+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11632+{
11633+ int ret;
11634+
11635+ ret = item_utmost_child(coord, LEFT_SIDE, child);
11636+
11637+ if (ret)
11638+ return ret;
11639+
11640+ if (IS_ERR(*child))
11641+ return PTR_ERR(*child);
11642+
11643+ return 0;
11644+}
11645+
11646+/* This step occurs after the left- and right-scans are completed, before starting the
11647+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11648+ flush point, which means continuing in the reverse parent-first direction to the
11649+ parent, grandparent, and so on (as long as the child is a leftmost child). This
11650+ routine calls a recursive process, alloc_one_ancestor, which does the real work,
11651+ except there is special-case handling here for the first ancestor, which may be a twig.
11652+ At each level (here and alloc_one_ancestor), we check for relocation and then, if
11653+ the child is a leftmost child, repeat at the next level. On the way back down (the
11654+ recursion), we allocate the ancestors in parent-first order. */
11655+static int alloc_pos_and_ancestors(flush_pos_t * pos)
11656+{
11657+ int ret = 0;
11658+ lock_handle plock;
11659+ load_count pload;
11660+ coord_t pcoord;
11661+
11662+ if (znode_check_flushprepped(pos->lock.node))
11663+ return 0;
11664+
11665+ coord_init_invalid(&pcoord, NULL);
11666+ init_lh(&plock);
11667+ init_load_count(&pload);
11668+
11669+ if (pos->state == POS_ON_EPOINT) {
11670+ /* a special case for pos on twig level, where we already have
11671+ a lock on parent node. */
11672+ /* The parent may not be dirty, in which case we should decide
11673+ whether to relocate the child now. If the decision is made to
11674+ relocate the child, the parent is marked dirty. */
11675+ ret =
11676+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11677+ pos);
11678+ if (ret)
11679+ goto exit;
11680+
11681+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11682+ is leftmost) and the leaf/child, so recursion is not needed.
11683+ Levels above the twig will be allocated for
11684+ write-optimization before the transaction commits. */
11685+
11686+ /* Do the recursive step, allocating zero or more of our
11687+ * ancestors. */
11688+ ret = alloc_one_ancestor(&pos->coord, pos);
11689+
11690+ } else {
11691+ if (!znode_is_root(pos->lock.node)) {
11692+ /* all formatted nodes except tree root */
11693+ ret =
11694+ reiser4_get_parent(&plock, pos->lock.node,
11695+ ZNODE_WRITE_LOCK);
11696+ if (ret)
11697+ goto exit;
11698+
11699+ ret = incr_load_count_znode(&pload, plock.node);
11700+ if (ret)
11701+ goto exit;
11702+
11703+ ret =
11704+ find_child_ptr(plock.node, pos->lock.node, &pcoord);
11705+ if (ret)
11706+ goto exit;
11707+
11708+ ret =
11709+ reverse_relocate_check_dirty_parent(ZJNODE
11710+ (pos->lock.
11711+ node), &pcoord,
11712+ pos);
11713+ if (ret)
11714+ goto exit;
11715+
11716+ ret = alloc_one_ancestor(&pcoord, pos);
11717+ if (ret)
11718+ goto exit;
11719+ }
11720+
11721+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
11722+ }
11723+ exit:
11724+ done_load_count(&pload);
11725+ done_lh(&plock);
11726+ return ret;
11727+}
11728+
11729+/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11730+ call to set_preceder, which is the next function described, this checks if the
11731+ child is a leftmost child and returns if it is not. If the child is a leftmost child
11732+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11733+ step. */
11734+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11735+{
11736+ int ret = 0;
11737+ lock_handle alock;
11738+ load_count aload;
11739+ coord_t acoord;
11740+
11741+ /* As we ascend at the left-edge of the region to flush, take this opportunity at
11742+ the twig level to find our parent-first preceder unless we have already set
11743+ it. */
11744+ if (pos->preceder.blk == 0) {
11745+ ret = set_preceder(coord, pos);
11746+ if (ret != 0)
11747+ return ret;
11748+ }
11749+
11750+ /* If the ancestor is clean or already allocated, or if the child is not a
11751+ leftmost child, stop going up, even leaving coord->node not flushprepped. */
11752+ if (znode_check_flushprepped(coord->node)
11753+ || !coord_is_leftmost_unit(coord))
11754+ return 0;
11755+
11756+ init_lh(&alock);
11757+ init_load_count(&aload);
11758+ coord_init_invalid(&acoord, NULL);
11759+
11760+ /* Only ascend to the next level if it is a leftmost child, but write-lock the
11761+ parent in case we will relocate the child. */
11762+ if (!znode_is_root(coord->node)) {
11763+
11764+ ret =
11765+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11766+ &alock, &aload, ZNODE_WRITE_LOCK,
11767+ 0);
11768+ if (ret != 0) {
11769+ /* FIXME(C): check EINVAL, E_DEADLOCK */
11770+ goto exit;
11771+ }
11772+
11773+ ret =
11774+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11775+ &acoord, pos);
11776+ if (ret != 0) {
11777+ goto exit;
11778+ }
11779+
11780+ /* Recursive call. */
11781+ if (!znode_check_flushprepped(acoord.node)) {
11782+ ret = alloc_one_ancestor(&acoord, pos);
11783+ if (ret)
11784+ goto exit;
11785+ }
11786+ }
11787+
11788+ /* Note: we call allocate with the parent write-locked (except at the root) in
11789+ case we relocate the child, in which case it will modify the parent during this
11790+ call. */
11791+ ret = allocate_znode(coord->node, &acoord, pos);
11792+
11793+ exit:
11794+ done_load_count(&aload);
11795+ done_lh(&alock);
11796+ return ret;
11797+}
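+
+/* To illustrate the recursion above: suppose the flush point sits in leaf L,
+ * L is the leftmost child of twig T, T is the leftmost child of branch B, and
+ * B is not a leftmost child. The ascent stops at B, and as the recursion
+ * unwinds allocate_znode() runs for B, then for T, and finally the caller
+ * handles L -- i.e., the ancestors are allocated in parent-first order, as
+ * described before alloc_pos_and_ancestors(). */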
11798+
11799+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11800+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11801+ should this node be relocated (in reverse parent-first context)? We repeat this
11802+ process as long as the child is the leftmost child, eventually reaching an ancestor of
11803+ the flush point that is not a leftmost child. The preceder of that ancestor, which is
11804+ not a leftmost child, is actually on the leaf level: it is the left neighbor of the
11805+ flush point, which in turn is the rightmost child of the twig on the left. So, when
11806+ alloc_pos_and_ancestors passes upward through the twig
11807+ level, it stops momentarily to remember the block of the rightmost child of the twig on
11808+ the left and sets it to the flush_position's preceder_hint.
11809+
11810+ There is one other place where we may set the flush_position's preceder hint, which is
11811+ during scan-left.
11812+*/
11813+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11814+{
11815+ int ret;
11816+ coord_t coord;
11817+ lock_handle left_lock;
11818+ load_count left_load;
11819+
11820+ coord_dup(&coord, coord_in);
11821+
11822+ init_lh(&left_lock);
11823+ init_load_count(&left_load);
11824+
11825+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11826+ coord_is_leftmost_unit is not the right test if the unformatted child is in the
11827+ middle of the first extent unit. */
11828+ if (!coord_is_leftmost_unit(&coord)) {
11829+ coord_prev_unit(&coord);
11830+ } else {
11831+ ret =
11832+ reiser4_get_left_neighbor(&left_lock, coord.node,
11833+ ZNODE_READ_LOCK, GN_SAME_ATOM);
11834+ if (ret) {
11835+ /* If we fail for any reason it doesn't matter because the
11836+ preceder is only a hint. We are low-priority at this point, so
11837+ this must be the case. */
11838+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11839+ ret == -ENOENT || ret == -EINVAL
11840+ || ret == -E_DEADLOCK) {
11841+ ret = 0;
11842+ }
11843+ goto exit;
11844+ }
11845+
11846+ ret = incr_load_count_znode(&left_load, left_lock.node);
11847+ if (ret)
11848+ goto exit;
11849+
11850+ coord_init_last_unit(&coord, left_lock.node);
11851+ }
11852+
11853+ ret =
11854+ item_utmost_child_real_block(&coord, RIGHT_SIDE,
11855+ &pos->preceder.blk);
11856+ exit:
11857+ check_preceder(pos->preceder.blk);
11858+ done_load_count(&left_load);
11859+ done_lh(&left_lock);
11860+ return ret;
11861+}
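+
+/* A small picture of what set_preceder() computes (illustrative):
+ *
+ *	  twig A            twig B
+ *	[ ... | R ]       [ P | ... ]
+ *	         \          |
+ *	          r         p        <- leaf level
+ *
+ * If the flush point p is the leftmost unit of twig B, we look at twig A on
+ * the left and take the block of r, the rightmost child of A's last unit, as
+ * the parent-first preceder hint; otherwise the previous unit within the same
+ * twig supplies it. */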
11862+
11863+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11864+
11865+/* This procedure implements the outer loop of the flush algorithm. To put this in
11866+ context, here is the general list of steps taken by the flush routine as a whole:
11867+
11868+ 1. Scan-left
11869+ 2. Scan-right (maybe)
11870+ 3. Allocate initial flush position and its ancestors
11871+ 4. <handle extents>
11872+ 5. <squeeze and allocate the next position and its ancestors to-the-right,
11873+ then update the position to-the-right>
11874+ 6. <repeat from #4 until flush is stopped>
11875+
11876+ This procedure implements the loop in steps 4 through 6 in the above listing.
11877+
11878+ Step 4: if the current flush position is an extent item (position on the twig level),
11879+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11880+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11881+ If the next coordinate is an internal item, we descend back to the leaf level,
11882+ otherwise we repeat step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11883+ brings us past the end of the twig level, then we call
11884+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11885+ step #5 which moves to the right.
11886+
11887+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11888+ tree to allocate any ancestors of the next-right flush position that are not also
11889+ ancestors of the current position. Those ancestors (in top-down order) are the next in
11890+ parent-first order. We squeeze adjacent nodes on the way up until the right node and
11891+ current node share the same parent, then allocate on the way back down. Finally, this
11892+ step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11893+*/
11894+
11895+/* SQUEEZE CODE */
11896+
11897+/* squalloc_right_twig helper function: cut a range of extent items from
11898+ node to->node, from the beginning up to coord @to. */
11899+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11900+ znode * left)
11901+{
11902+ coord_t from;
11903+ reiser4_key from_key;
11904+
11905+ coord_init_first_unit(&from, to->node);
11906+ item_key_by_coord(&from, &from_key);
11907+
11908+ return cut_node_content(&from, to, &from_key, to_key, NULL);
11909+}
11910+
11911+/* Copy as many of the leading extents as possible from @right to @left, allocating
11912+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
11913+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
11914+ internal item it calls shift_one_internal_unit and may then return
11915+ SUBTREE_MOVED. */
11916+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11917+{
11918+ int ret = SUBTREE_MOVED;
11919+ coord_t coord; /* used to iterate over items */
11920+ reiser4_key stop_key;
11921+
11922+ assert("jmacd-2008", !node_is_empty(right));
11923+ coord_init_first_unit(&coord, right);
11924+
11925+ /* FIXME: can be optimized to cut once */
11926+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11927+ ON_DEBUG(void *vp);
11928+
11929+ assert("vs-1468", coord_is_leftmost_unit(&coord));
11930+ ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11931+
11932+ /* stop_key is used to find what was copied and what to cut */
11933+ stop_key = *reiser4_min_key();
11934+ ret = squalloc_extent(left, &coord, pos, &stop_key);
11935+ if (ret != SQUEEZE_CONTINUE) {
11936+ ON_DEBUG(kfree(vp));
11937+ break;
11938+ }
11939+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11940+
11941+ /* Helper function to do the cutting. */
11942+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11943+ check_me("vs-1466",
11944+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11945+
11946+ ON_DEBUG(shift_check(vp, left, coord.node));
11947+ }
11948+
11949+ if (node_is_empty(coord.node))
11950+ ret = SQUEEZE_SOURCE_EMPTY;
11951+
11952+ if (ret == SQUEEZE_TARGET_FULL) {
11953+ goto out;
11954+ }
11955+
11956+ if (node_is_empty(right)) {
11957+ /* The whole right node was copied into @left. */
11958+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
11959+ goto out;
11960+ }
11961+
11962+ coord_init_first_unit(&coord, right);
11963+
11964+ if (!item_is_internal(&coord)) {
11965+ /* we do not want to squeeze anything else to the left neighbor because the
11966+ "slum" is over */
11967+ ret = SQUEEZE_TARGET_FULL;
11968+ goto out;
11969+ }
11970+ assert("jmacd-433", item_is_internal(&coord));
11971+
11972+ /* Shift an internal unit. The child must be allocated before shifting any more
11973+ extents, so we stop here. */
11974+ ret = shift_one_internal_unit(left, right);
11975+
11976+ out:
11977+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
11978+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
11979+
11980+ if (ret == SQUEEZE_TARGET_FULL) {
11981+ /* We submit prepped nodes here and expect that this @left twig
11982+ * will not be modified again during this jnode_flush() call. */
11983+ int ret1;
11984+
11985+ /* NOTE: seems like io is done under long term locks. */
11986+ ret1 = write_prepped_nodes(pos);
11987+ if (ret1 < 0)
11988+ return ret1;
11989+ }
11990+
11991+ return ret;
11992+}
11993+
11994+#if REISER4_DEBUG
11995+static void item_convert_invariant(flush_pos_t * pos)
11996+{
11997+ assert("edward-1225", coord_is_existing_item(&pos->coord));
11998+ if (chaining_data_present(pos)) {
11999+ item_plugin *iplug = item_convert_plug(pos);
12000+
12001+ assert("edward-1000",
12002+ iplug == item_plugin_by_coord(&pos->coord));
12003+ assert("edward-1001", iplug->f.convert != NULL);
12004+ } else
12005+ assert("edward-1226", pos->child == NULL);
12006+}
12007+#else
12008+
12009+#define item_convert_invariant(pos) noop
12010+
12011+#endif
12012+
12013+/* Scan node items starting from the first one and apply to each
12014+ item its flush ->convert() method (if any). This method may
12015+ resize/kill the item, so the tree may be changed.
12016+*/
12017+static int convert_node(flush_pos_t * pos, znode * node)
12018+{
12019+ int ret = 0;
12020+ item_plugin *iplug;
12021+
12022+ assert("edward-304", pos != NULL);
12023+ assert("edward-305", pos->child == NULL);
12024+ assert("edward-475", znode_convertible(node));
12025+ assert("edward-669", znode_is_wlocked(node));
12026+ assert("edward-1210", !node_is_empty(node));
12027+
12028+ if (znode_get_level(node) != LEAF_LEVEL)
12029+ /* unsupported */
12030+ goto exit;
12031+
12032+ coord_init_first_unit(&pos->coord, node);
12033+
12034+ while (1) {
12035+ ret = 0;
12036+ coord_set_to_left(&pos->coord);
12037+ item_convert_invariant(pos);
12038+
12039+ iplug = item_plugin_by_coord(&pos->coord);
12040+ assert("edward-844", iplug != NULL);
12041+
12042+ if (iplug->f.convert) {
12043+ ret = iplug->f.convert(pos);
12044+ if (ret)
12045+ goto exit;
12046+ }
12047+ assert("edward-307", pos->child == NULL);
12048+
12049+ if (coord_next_item(&pos->coord)) {
12050+ /* node is over */
12051+
12052+ if (!chaining_data_present(pos))
12053+ /* finished this node */
12054+ break;
12055+ if (should_chain_next_node(pos)) {
12056+ /* go to next node */
12057+ move_chaining_data(pos, 0 /* to next node */ );
12058+ break;
12059+ }
12060+ /* repeat this node */
12061+ move_chaining_data(pos, 1 /* this node */ );
12062+ continue;
12063+ }
12064+ /* Node is not over.
12065+ Check if there is attached convert data.
12066+ If so, roll back one item position and repeat
12067+ on this node.
12068+ */
12069+ if (chaining_data_present(pos)) {
12070+
12071+ if (iplug != item_plugin_by_coord(&pos->coord))
12072+ set_item_convert_count(pos, 0);
12073+
12074+ ret = coord_prev_item(&pos->coord);
12075+ assert("edward-1003", !ret);
12076+
12077+ move_chaining_data(pos, 1 /* this node */ );
12078+ }
12079+ }
12080+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12081+ znode_make_dirty(node);
12082+ exit:
12083+ assert("edward-1004", !ret);
12084+ return ret;
12085+}
12086+
12087+/* Squeeze and allocate the right neighbor. This is called after @left and
12088+ its current children have been squeezed and allocated already. This
12089+ procedure's job is to squeeze items from @right to @left.
12090+
12091+ If at the leaf level, use the shift_everything_left memcpy-optimized
12092+ version of shifting (squeeze_right_leaf).
12093+
12094+ If at the twig level, extents are allocated as they are shifted from @right
12095+ to @left (squalloc_right_twig).
12096+
12097+ At any other level, shift one internal item and return to the caller
12098+ (squalloc_parent_first) so that the shifted-subtree can be processed in
12099+ parent-first order.
12100+
12101+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12102+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12103+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12104+ is returned.
12105+*/
12106+
12107+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12108+ znode * right)
12109+{
12110+ int ret;
12111+
12112+ /* FIXME: it is possible to see an empty hasn't-heard-banshee node in a
12113+ * tree owing to an error (for example, ENOSPC) in write */
12114+ /* assert("jmacd-9321", !node_is_empty(left)); */
12115+ assert("jmacd-9322", !node_is_empty(right));
12116+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12117+
12118+ switch (znode_get_level(left)) {
12119+ case TWIG_LEVEL:
12120+ /* Shift with extent allocating until either an internal item
12121+ is encountered or everything is shifted or no free space
12122+ left in @left */
12123+ ret = squeeze_right_twig(left, right, pos);
12124+ break;
12125+
12126+ default:
12127+ /* All other levels can use shift_everything until we implement per-item
12128+ flush plugins. */
12129+ ret = squeeze_right_non_twig(left, right);
12130+ break;
12131+ }
12132+
12133+ assert("jmacd-2011", (ret < 0 ||
12134+ ret == SQUEEZE_SOURCE_EMPTY
12135+ || ret == SQUEEZE_TARGET_FULL
12136+ || ret == SUBTREE_MOVED));
12137+ return ret;
12138+}
12139+
12140+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12141+ znode * right)
12142+{
12143+ int ret;
12144+
12145+ ret = squeeze_right_twig(pos->lock.node, right, pos);
12146+ if (ret < 0)
12147+ return ret;
12148+ if (ret > 0) {
12149+ coord_init_after_last_item(&pos->coord, pos->lock.node);
12150+ return ret;
12151+ }
12152+
12153+ coord_init_last_unit(&pos->coord, pos->lock.node);
12154+ return 0;
12155+}
12156+
12157+/* forward declaration */
12158+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12159+
12160+/* do a fast check for "same parents" condition before calling
12161+ * squalloc_upper_levels() */
12162+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12163+ znode * left,
12164+ znode * right)
12165+{
12166+ if (znode_same_parents(left, right))
12167+ return 0;
12168+
12169+ return squalloc_upper_levels(pos, left, right);
12170+}
12171+
12172+/* Check whether the parent of the given @right node needs to be processed
12173+ ((re)allocated) prior to processing of the child. If @left and @right do not
12174+ share the same parent, then the parent of @right comes after @left but before
12175+ @right in parent-first order, and we have to (re)allocate it before @right
12176+ gets (re)allocated. */
12177+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12178+{
12179+ int ret;
12180+
12181+ lock_handle left_parent_lock;
12182+ lock_handle right_parent_lock;
12183+
12184+ load_count left_parent_load;
12185+ load_count right_parent_load;
12186+
12187+ init_lh(&left_parent_lock);
12188+ init_lh(&right_parent_lock);
12189+
12190+ init_load_count(&left_parent_load);
12191+ init_load_count(&right_parent_load);
12192+
12193+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12194+ if (ret)
12195+ goto out;
12196+
12197+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12198+ if (ret)
12199+ goto out;
12200+
12201+ /* Check for same parents */
12202+ if (left_parent_lock.node == right_parent_lock.node)
12203+ goto out;
12204+
12205+ if (znode_check_flushprepped(right_parent_lock.node)) {
12206+ /* Keep parent-first order. In that order, the right parent node stands
12207+ before the @right node. If it is already allocated, we set the
12208+ preceder (next block search start point) to its block number; the @right
12209+ node should be allocated after it.
12210+
12211+ However, the preceder is set only if the right parent is on the twig
12212+ level. The explanation is the following: new branch nodes are allocated
12213+ over already allocated children while the tree grows; it is difficult to
12214+ keep the tree ordered, so we assume that only leaves and twigs are
12215+ correctly allocated. Thus, only twigs are used as a preceder for
12216+ allocating the rest of the slum. */
12217+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12218+ pos->preceder.blk =
12219+ *znode_get_block(right_parent_lock.node);
12220+ check_preceder(pos->preceder.blk);
12221+ }
12222+ goto out;
12223+ }
12224+
12225+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12226+ if (ret)
12227+ goto out;
12228+
12229+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12230+ if (ret)
12231+ goto out;
12232+
12233+ ret =
12234+ squeeze_right_neighbor(pos, left_parent_lock.node,
12235+ right_parent_lock.node);
12236+	/* We stop on error. We also stop if some items/units were shifted (ret == 0)
12237+	 * and thus @right changed its parent. It means we have not processed the
12238+	 * right_parent node prior to processing @right. Positive return
12239+	 * values say that no shifting happened because of the "empty
12240+	 * source" or "target full" conditions. */
12241+ if (ret <= 0)
12242+ goto out;
12243+
12244+	/* parent(@left) and parent(@right) may themselves have different parents. We
12245+	 * do a recursive call to check that. */
12246+ ret =
12247+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12248+ right_parent_lock.node);
12249+ if (ret)
12250+ goto out;
12251+
12252+ /* allocate znode when going down */
12253+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12254+
12255+ out:
12256+ done_load_count(&left_parent_load);
12257+ done_load_count(&right_parent_load);
12258+
12259+ done_lh(&left_parent_lock);
12260+ done_lh(&right_parent_lock);
12261+
12262+ return ret;
12263+}
12264+
12265+/* Check the leftmost child's "flushprepped" status; also returns true if the
12266+ * child node was not found in cache. */
12267+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12268+{
12269+ int ret;
12270+ int prepped;
12271+
12272+ jnode *child;
12273+
12274+ ret = get_leftmost_child_of_unit(coord, &child);
12275+
12276+ if (ret)
12277+ return ret;
12278+
12279+ if (child) {
12280+ prepped = jnode_check_flushprepped(child);
12281+ jput(child);
12282+ } else {
12283+		/* We treat a non-existing child as a node to which slum
12284+		   processing should not continue. A node that is not cached is
12285+		   clean, so it is flushprepped. */
12286+ prepped = 1;
12287+ }
12288+
12289+ return prepped;
12290+}
12291+
12292+/* (re)allocate a znode, automatically getting its parent node */
12293+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12294+{
12295+ int ret;
12296+ lock_handle parent_lock;
12297+ load_count parent_load;
12298+ coord_t pcoord;
12299+
12300+ assert("zam-851", znode_is_write_locked(node));
12301+
12302+ init_lh(&parent_lock);
12303+ init_load_count(&parent_load);
12304+
12305+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12306+ if (ret)
12307+ goto out;
12308+
12309+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12310+ if (ret)
12311+ goto out;
12312+
12313+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
12314+ if (ret)
12315+ goto out;
12316+
12317+ ret = allocate_znode(node, &pcoord, pos);
12318+
12319+ out:
12320+ done_load_count(&parent_load);
12321+ done_lh(&parent_lock);
12322+ return ret;
12323+}
12324+
12325+/* Process nodes on the leaf level until an unformatted node or the rightmost
12326+ * node in the slum is reached. */
12327+static int handle_pos_on_formatted(flush_pos_t * pos)
12328+{
12329+ int ret;
12330+ lock_handle right_lock;
12331+ load_count right_load;
12332+
12333+ init_lh(&right_lock);
12334+ init_load_count(&right_load);
12335+
12336+ if (should_convert_node(pos, pos->lock.node)) {
12337+ ret = convert_node(pos, pos->lock.node);
12338+ if (ret)
12339+ return ret;
12340+ }
12341+
12342+ while (1) {
12343+ ret =
12344+ neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12345+ ZNODE_WRITE_LOCK,
12346+ !should_convert_next_node(pos,
12347+ right_lock.
12348+ node));
12349+ if (ret)
12350+ break;
12351+
12352+		/* we don't prep (allocate) nodes for flushing twice. This can be suboptimal, or it
12353+ * can be optimal. For now we choose to live with the risk that it will
12354+ * be suboptimal because it would be quite complex to code it to be
12355+ * smarter. */
12356+ if (znode_check_flushprepped(right_lock.node)
12357+ && !znode_convertible(right_lock.node)) {
12358+ assert("edward-1005",
12359+ !should_convert_next_node(pos, right_lock.node));
12360+ pos_stop(pos);
12361+ break;
12362+ }
12363+
12364+ ret = incr_load_count_znode(&right_load, right_lock.node);
12365+ if (ret)
12366+ break;
12367+
12368+ if (should_convert_node(pos, right_lock.node)) {
12369+ ret = convert_node(pos, right_lock.node);
12370+ if (ret)
12371+ break;
12372+ if (node_is_empty(right_lock.node)) {
12373+ /* node became empty after converting, repeat */
12374+ done_load_count(&right_load);
12375+ done_lh(&right_lock);
12376+ continue;
12377+ }
12378+ }
12379+
12380+ /* squeeze _before_ going upward. */
12381+ ret =
12382+ squeeze_right_neighbor(pos, pos->lock.node,
12383+ right_lock.node);
12384+ if (ret < 0)
12385+ break;
12386+
12387+ if (znode_check_flushprepped(right_lock.node)) {
12388+ if (should_convert_next_node(pos, right_lock.node)) {
12389+ /* in spite of flushprepped status of the node,
12390+ its right slum neighbor should be converted */
12391+ assert("edward-953", convert_data(pos));
12392+ assert("edward-954", item_convert_data(pos));
12393+
12394+ if (node_is_empty(right_lock.node)) {
12395+ done_load_count(&right_load);
12396+ done_lh(&right_lock);
12397+ } else
12398+ move_flush_pos(pos, &right_lock,
12399+ &right_load, NULL);
12400+ continue;
12401+ }
12402+ pos_stop(pos);
12403+ break;
12404+ }
12405+
12406+ if (node_is_empty(right_lock.node)) {
12407+ /* repeat if right node was squeezed completely */
12408+ done_load_count(&right_load);
12409+ done_lh(&right_lock);
12410+ continue;
12411+ }
12412+
12413+ /* parent(right_lock.node) has to be processed before
12414+ * (right_lock.node) due to "parent-first" allocation order. */
12415+ ret =
12416+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12417+ right_lock.node);
12418+ if (ret)
12419+ break;
12420+ /* (re)allocate _after_ going upward */
12421+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12422+ if (ret)
12423+ break;
12424+
12425+ if (should_terminate_squalloc(pos)) {
12426+ set_item_convert_count(pos, 0);
12427+ break;
12428+ }
12429+
12430+ /* advance the flush position to the right neighbor */
12431+ move_flush_pos(pos, &right_lock, &right_load, NULL);
12432+
12433+ ret = rapid_flush(pos);
12434+ if (ret)
12435+ break;
12436+ }
12437+
12438+ assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12439+
12440+ done_load_count(&right_load);
12441+ done_lh(&right_lock);
12442+
12443+	/* This function indicates via pos whether to stop, go to the twig level, or
12444+	 * continue on the current level. */
12445+ return ret;
12446+
12447+}
12448+
12449+/* Process nodes on the leaf level until an unformatted node or the rightmost
12450+ * node in the slum is reached. */
12451+static int handle_pos_on_leaf(flush_pos_t * pos)
12452+{
12453+ int ret;
12454+
12455+ assert("zam-845", pos->state == POS_ON_LEAF);
12456+
12457+ ret = handle_pos_on_formatted(pos);
12458+
12459+ if (ret == -E_NO_NEIGHBOR) {
12460+ /* cannot get right neighbor, go process extents. */
12461+ pos->state = POS_TO_TWIG;
12462+ return 0;
12463+ }
12464+
12465+ return ret;
12466+}
12467+
12468+/* Process slum on level > 1 */
12469+static int handle_pos_on_internal(flush_pos_t * pos)
12470+{
12471+ assert("zam-850", pos->state == POS_ON_INTERNAL);
12472+ return handle_pos_on_formatted(pos);
12473+}
12474+
12475+/* check whether squalloc should stop before processing given extent */
12476+static int squalloc_extent_should_stop(flush_pos_t * pos)
12477+{
12478+ assert("zam-869", item_is_extent(&pos->coord));
12479+
12480+	/* pos->child is the jnode handle_pos_on_extent() should start with,
12481+	 * instead of the first child of the first extent unit. */
12482+ if (pos->child) {
12483+ int prepped;
12484+
12485+ assert("vs-1383", jnode_is_unformatted(pos->child));
12486+ prepped = jnode_check_flushprepped(pos->child);
12487+ pos->pos_in_unit =
12488+ jnode_get_index(pos->child) -
12489+ extent_unit_index(&pos->coord);
12490+ assert("vs-1470",
12491+ pos->pos_in_unit < extent_unit_width(&pos->coord));
12492+ assert("nikita-3434",
12493+ ergo(extent_is_unallocated(&pos->coord),
12494+ pos->pos_in_unit == 0));
12495+ jput(pos->child);
12496+ pos->child = NULL;
12497+
12498+ return prepped;
12499+ }
12500+
12501+ pos->pos_in_unit = 0;
12502+ if (extent_is_unallocated(&pos->coord))
12503+ return 0;
12504+
12505+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12506+}
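+
+/* An illustrative example of the pos_in_unit arithmetic above (the numbers
+ * are made up): if the current extent unit maps file indices starting at 100
+ * (extent_unit_index) and pos->child is the unformatted jnode for index 103
+ * (jnode_get_index), then pos_in_unit = 103 - 100 = 3, so extent processing
+ * resumes at the fourth block of the unit instead of at the unit's first
+ * child. */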
12507+
12508+/* Handle the case when the regular reiser4 tree (znodes connected to their
12509+ * neighbors by sibling pointers) is interrupted on the leaf level by one or more
12510+ * unformatted nodes. By holding a lock on the twig level and using extent code
12511+ * routines to process the unformatted nodes we swim around the irregular part of
12512+ * the reiser4 tree. */
12513+static int handle_pos_on_twig(flush_pos_t * pos)
12514+{
12515+ int ret;
12516+
12517+ assert("zam-844", pos->state == POS_ON_EPOINT);
12518+ assert("zam-843", item_is_extent(&pos->coord));
12519+
12520+	/* We decide whether to continue slum processing with the current extent
12521+	   unit: if the leftmost child of the current extent unit is flushprepped
12522+	   (i.e. clean or already processed by flush) we stop squalloc(). There
12523+	   is a fast check for unallocated extents, which we assume contain only
12524+	   not-flushprepped nodes. */
12525+	/* FIXME: Here we implement a simple check; we only look at the
12526+	   leftmost child. */
12527+ ret = squalloc_extent_should_stop(pos);
12528+ if (ret != 0) {
12529+ pos_stop(pos);
12530+ return ret;
12531+ }
12532+
12533+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12534+ && item_is_extent(&pos->coord)) {
12535+ ret = reiser4_alloc_extent(pos);
12536+ if (ret) {
12537+ break;
12538+ }
12539+ coord_next_unit(&pos->coord);
12540+ }
12541+
12542+ if (coord_is_after_rightmost(&pos->coord)) {
12543+ pos->state = POS_END_OF_TWIG;
12544+ return 0;
12545+ }
12546+ if (item_is_internal(&pos->coord)) {
12547+ pos->state = POS_TO_LEAF;
12548+ return 0;
12549+ }
12550+
12551+ assert("zam-860", item_is_extent(&pos->coord));
12552+
12553+ /* "slum" is over */
12554+ pos->state = POS_INVALID;
12555+ return 0;
12556+}
12557+
12558+/* When we are about to return the flush position from the twig to the leaf level
12559+ * we can either process the right twig node or move the position to the leaf. This
12560+ * processes the right twig if possible and jumps to the leaf level if not. */
12561+static int handle_pos_end_of_twig(flush_pos_t * pos)
12562+{
12563+ int ret;
12564+ lock_handle right_lock;
12565+ load_count right_load;
12566+ coord_t at_right;
12567+ jnode *child = NULL;
12568+
12569+ assert("zam-848", pos->state == POS_END_OF_TWIG);
12570+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
12571+
12572+ init_lh(&right_lock);
12573+ init_load_count(&right_load);
12574+
12575+	/* We take a lock on the right twig node even if it is not dirty, because
12576+	 * the slum continues or discontinues on the leaf level, not on the next twig.
12577+	 * This lock on the right twig is needed for getting its leftmost child. */
12578+ ret =
12579+ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12580+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12581+ if (ret)
12582+ goto out;
12583+
12584+ ret = incr_load_count_znode(&right_load, right_lock.node);
12585+ if (ret)
12586+ goto out;
12587+
12588+	/* the right twig may not be dirty */
12589+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12590+ /* If right twig node is dirty we always attempt to squeeze it
12591+ * content to the left... */
12592+ became_dirty:
12593+ ret =
12594+ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12595+ if (ret <= 0) {
12596+ /* pos->coord is on internal item, go to leaf level, or
12597+ * we have an error which will be caught in squalloc() */
12598+ pos->state = POS_TO_LEAF;
12599+ goto out;
12600+ }
12601+
12602+		/* If the right twig was squeezed completely we have to re-lock the
12603+		 * right twig; now it is done through the top-level squalloc
12604+		 * routine. */
12605+ if (node_is_empty(right_lock.node))
12606+ goto out;
12607+
12608+ /* ... and prep it if it is not yet prepped */
12609+ if (!znode_check_flushprepped(right_lock.node)) {
12610+ /* As usual, process parent before ... */
12611+ ret =
12612+ check_parents_and_squalloc_upper_levels(pos,
12613+ pos->lock.
12614+ node,
12615+ right_lock.
12616+ node);
12617+ if (ret)
12618+ goto out;
12619+
12620+ /* ... processing the child */
12621+ ret =
12622+ lock_parent_and_allocate_znode(right_lock.node,
12623+ pos);
12624+ if (ret)
12625+ goto out;
12626+ }
12627+ } else {
12628+ coord_init_first_unit(&at_right, right_lock.node);
12629+
12630+		/* check the first child of the next twig: should we continue there? */
12631+ ret = get_leftmost_child_of_unit(&at_right, &child);
12632+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
12633+ pos_stop(pos);
12634+ goto out;
12635+ }
12636+
12637+ /* check clean twig for possible relocation */
12638+ if (!znode_check_flushprepped(right_lock.node)) {
12639+ ret =
12640+ reverse_relocate_check_dirty_parent(child,
12641+ &at_right, pos);
12642+ if (ret)
12643+ goto out;
12644+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12645+ goto became_dirty;
12646+ }
12647+ }
12648+
12649+ assert("zam-875", znode_check_flushprepped(right_lock.node));
12650+
12651+	/* Update the preceder with the block number of the just-processed right twig
12652+	 * node. The code above could miss the preceder update because
12653+	 * allocate_znode() might not have been called for this node. */
12654+ pos->preceder.blk = *znode_get_block(right_lock.node);
12655+ check_preceder(pos->preceder.blk);
12656+
12657+ coord_init_first_unit(&at_right, right_lock.node);
12658+ assert("zam-868", coord_is_existing_unit(&at_right));
12659+
12660+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12661+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
12662+
12663+ out:
12664+ done_load_count(&right_load);
12665+ done_lh(&right_lock);
12666+
12667+ if (child)
12668+ jput(child);
12669+
12670+ return ret;
12671+}
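+
+/* Recapping the branches above: a dirty right twig is squeezed into the
+ * current node and, if it survives non-empty, prepped via the usual
+ * parent-then-child calls; a clean right twig is kept only when its leftmost
+ * child still needs flushing, and reverse_relocate_check_dirty_parent() may
+ * dirty it, sending us back to the dirty-twig path (became_dirty). In all
+ * continuing cases the preceder is updated to the processed twig's block
+ * before the flush position moves there. */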
12672+
12673+/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
12674+ * we should continue there. */
12675+static int handle_pos_to_leaf(flush_pos_t * pos)
12676+{
12677+ int ret;
12678+ lock_handle child_lock;
12679+ load_count child_load;
12680+ jnode *child;
12681+
12682+ assert("zam-846", pos->state == POS_TO_LEAF);
12683+ assert("zam-847", item_is_internal(&pos->coord));
12684+
12685+ init_lh(&child_lock);
12686+ init_load_count(&child_load);
12687+
12688+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
12689+ if (ret)
12690+ return ret;
12691+ if (child == NULL) {
12692+ pos_stop(pos);
12693+ return 0;
12694+ }
12695+
12696+ if (jnode_check_flushprepped(child)) {
12697+ pos->state = POS_INVALID;
12698+ goto out;
12699+ }
12700+
12701+ ret =
12702+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12703+ ZNODE_LOCK_LOPRI);
12704+ if (ret)
12705+ goto out;
12706+
12707+ ret = incr_load_count_znode(&child_load, JZNODE(child));
12708+ if (ret)
12709+ goto out;
12710+
12711+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12712+ if (ret)
12713+ goto out;
12714+
12715+ /* move flush position to leaf level */
12716+ pos->state = POS_ON_LEAF;
12717+ move_flush_pos(pos, &child_lock, &child_load, NULL);
12718+
12719+ if (node_is_empty(JZNODE(child))) {
12720+ ret = delete_empty_node(JZNODE(child));
12721+ pos->state = POS_INVALID;
12722+ }
12723+ out:
12724+ done_load_count(&child_load);
12725+ done_lh(&child_lock);
12726+ jput(child);
12727+
12728+ return ret;
12729+}
12730+
12731+/* move pos from leaf to twig, and move lock from leaf to twig. */
12732+/* Move pos->lock to upper (twig) level */
12733+static int handle_pos_to_twig(flush_pos_t * pos)
12734+{
12735+ int ret;
12736+
12737+ lock_handle parent_lock;
12738+ load_count parent_load;
12739+ coord_t pcoord;
12740+
12741+ assert("zam-852", pos->state == POS_TO_TWIG);
12742+
12743+ init_lh(&parent_lock);
12744+ init_load_count(&parent_load);
12745+
12746+ ret =
12747+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12748+ if (ret)
12749+ goto out;
12750+
12751+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12752+ if (ret)
12753+ goto out;
12754+
12755+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12756+ if (ret)
12757+ goto out;
12758+
12759+ assert("zam-870", item_is_internal(&pcoord));
12760+ coord_next_item(&pcoord);
12761+
12762+ if (coord_is_after_rightmost(&pcoord))
12763+ pos->state = POS_END_OF_TWIG;
12764+ else if (item_is_extent(&pcoord))
12765+ pos->state = POS_ON_EPOINT;
12766+ else {
12767+		/* Here we understand that getting -E_NO_NEIGHBOR in
12768+		 * handle_pos_on_leaf() was simply because we reached the edge of
12769+		 * the slum */
12770+ pos_stop(pos);
12771+ goto out;
12772+ }
12773+
12774+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12775+
12776+ out:
12777+ done_load_count(&parent_load);
12778+ done_lh(&parent_lock);
12779+
12780+ return ret;
12781+}
12782+
12783+typedef int (*pos_state_handle_t) (flush_pos_t *);
12784+static pos_state_handle_t flush_pos_handlers[] = {
12785+ /* process formatted nodes on leaf level, keep lock on a leaf node */
12786+ [POS_ON_LEAF] = handle_pos_on_leaf,
12787+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12788+ * being processed */
12789+ [POS_ON_EPOINT] = handle_pos_on_twig,
12790+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12791+ [POS_TO_TWIG] = handle_pos_to_twig,
12792+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
12793+ * pos->coord points to the leaf node we jump to */
12794+ [POS_TO_LEAF] = handle_pos_to_leaf,
12795+	/* after processing the last extent in the twig node, attempt to shift items from the twig's
12796+	 * right neighbor and process them while shifting */
12797+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12798+ /* process formatted nodes on internal level, keep lock on an internal node */
12799+ [POS_ON_INTERNAL] = handle_pos_on_internal
12800+};
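+
+/* The transitions implemented by the handlers above, as driven by squalloc()
+ * below (pos_stop() from any state ends the loop):
+ *
+ *	POS_ON_LEAF     -- -E_NO_NEIGHBOR --> POS_TO_TWIG
+ *	POS_TO_TWIG     --> POS_ON_EPOINT or POS_END_OF_TWIG
+ *	POS_ON_EPOINT   --> POS_END_OF_TWIG, POS_TO_LEAF or POS_INVALID
+ *	POS_END_OF_TWIG --> POS_ON_EPOINT or POS_TO_LEAF
+ *	POS_TO_LEAF     --> POS_ON_LEAF or POS_INVALID
+ *
+ * POS_ON_INTERNAL runs the same formatted-node loop as POS_ON_LEAF;
+ * -E_NO_NEIGHBOR there simply ends squalloc(). */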
12801+
12802+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12803+ * encrypt) nodes and their ancestors in "parent-first" order */
12804+static int squalloc(flush_pos_t * pos)
12805+{
12806+ int ret = 0;
12807+
12808+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12809+ * greater CPU efficiency? Measure and see.... -Hans */
12810+ while (pos_valid(pos)) {
12811+ ret = flush_pos_handlers[pos->state] (pos);
12812+ if (ret < 0)
12813+ break;
12814+
12815+ ret = rapid_flush(pos);
12816+ if (ret)
12817+ break;
12818+ }
12819+
12820+	/* any positive value or -E_NO_NEIGHBOR is a legal return code for the handle_pos*
12821+	   routines; -E_NO_NEIGHBOR means that the slum edge was reached */
12822+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
12823+ ret = 0;
12824+
12825+ return ret;
12826+}
12827+
12828+static void update_ldkey(znode * node)
12829+{
12830+ reiser4_key ldkey;
12831+
12832+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12833+ if (node_is_empty(node))
12834+ return;
12835+
12836+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12837+}
12838+
12839+/* This is to be called after calling the node's shift method to shift data from @right to
12840+ @left. It sets the left delimiting keys of @left and @right to the keys of the first items
12841+ of @left and @right correspondingly, and sets the right delimiting key of @left to the first key of @right */
12842+static void update_znode_dkeys(znode * left, znode * right)
12843+{
12844+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12845+ assert("vs-1629", (znode_is_write_locked(left) &&
12846+ znode_is_write_locked(right)));
12847+
12848+	/* we need to update the left delimiting key of @left if it was empty before the shift */
12849+ update_ldkey(left);
12850+ update_ldkey(right);
12851+ if (node_is_empty(right))
12852+ znode_set_rd_key(left, znode_get_rd_key(right));
12853+ else
12854+ znode_set_rd_key(left, znode_get_ld_key(right));
12855+}
12856+
12857+/* try to shift everything from @right to @left. If everything was shifted -
12858+ @right is removed from the tree. Result is the number of bytes shifted. */
12859+static int
12860+shift_everything_left(znode * right, znode * left, carry_level * todo)
12861+{
12862+ coord_t from;
12863+ node_plugin *nplug;
12864+ carry_plugin_info info;
12865+
12866+ coord_init_after_last_item(&from, right);
12867+
12868+ nplug = node_plugin_by_node(right);
12869+ info.doing = NULL;
12870+ info.todo = todo;
12871+ return nplug->shift(&from, left, SHIFT_LEFT,
12872+ 1 /* delete @right if it becomes empty */ ,
12873+ 1
12874+ /* move coord @from to node @left if everything will be shifted */
12875+ ,
12876+ &info);
12877+}
12878+
12879+/* Shift as much as possible from @right to @left using the memcpy-optimized
12880+ shift_everything_left. @left and @right are formatted neighboring nodes on
12881+ leaf level. */
12882+static int squeeze_right_non_twig(znode * left, znode * right)
12883+{
12884+ int ret;
12885+ carry_pool *pool;
12886+ carry_level *todo;
12887+
12888+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12889+
12890+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12891+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12892+ return SQUEEZE_TARGET_FULL;
12893+
12894+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12895+ if (IS_ERR(pool))
12896+ return PTR_ERR(pool);
12897+ todo = (carry_level *) (pool + 1);
12898+ init_carry_level(todo, pool);
12899+
12900+ ret = shift_everything_left(right, left, todo);
12901+ if (ret > 0) {
12902+ /* something was shifted */
12903+ reiser4_tree *tree;
12904+ __u64 grabbed;
12905+
12906+ znode_make_dirty(left);
12907+ znode_make_dirty(right);
12908+
12909+ /* update the delimiting keys of the nodes which participated in the
12910+ shift. FIXME: it would be better to have this in the node plugin's
12911+ shift operation, but it cannot be done there. Nobody
12912+ remembers why, though */
12913+ tree = znode_get_tree(left);
12914+ write_lock_dk(tree);
12915+ update_znode_dkeys(left, right);
12916+ write_unlock_dk(tree);
12917+
12918+ /* Carry is called to update the delimiting key and, possibly, to remove the
12919+ empty node. */
12920+ grabbed = get_current_context()->grabbed_blocks;
12921+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12922+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
12923+ ret = reiser4_carry(todo, NULL /* previous level */ );
12924+ grabbed2free_mark(grabbed);
12925+ } else {
12926+ /* Shifting was impossible; return the appropriate result code */
12927+ ret = node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12928+ SQUEEZE_TARGET_FULL;
12930+ }
12931+
12932+ done_carry_pool(pool);
12933+
12934+ return ret;
12935+}
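A sketch of how a caller might consume the SQUEEZE_SOURCE_EMPTY / SQUEEZE_TARGET_FULL results returned above: drain right nodes into the target until the target fills. This is a userspace model with made-up item/room counters, not the actual squalloc caller.

#include <stdio.h>

enum { SQ_SOURCE_EMPTY, SQ_TARGET_FULL };

/* squeeze one (source, target) pair; report which side ended the pass */
static int squeeze_pair(int *src_items, int *dst_room)
{
	while (*src_items > 0 && *dst_room > 0) {
		(*src_items)--;
		(*dst_room)--;
	}
	return (*src_items == 0) ? SQ_SOURCE_EMPTY : SQ_TARGET_FULL;
}

int main(void)
{
	int nodes[] = { 3, 5, 2 };	/* items per source node */
	int room = 6;			/* free units in the target */
	unsigned i = 0;

	while (i < sizeof(nodes) / sizeof(nodes[0])) {
		if (squeeze_pair(&nodes[i], &room) == SQ_SOURCE_EMPTY)
			i++;	/* source drained: move to the next right node */
		else
			break;	/* target full: stop squeezing into it */
	}
	printf("drained %u nodes, %d units of room left\n", i, room);
	return 0;
}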
12936+
12937+#if REISER4_DEBUG
12938+static int sibling_link_is_ok(const znode *left, const znode *right)
12939+{
12940+ int result;
12941+
12942+ read_lock_tree(znode_get_tree(left));
12943+ result = (left->right == right && left == right->left);
12944+ read_unlock_tree(znode_get_tree(left));
12945+ return result;
12946+}
12947+#endif
12948+
12949+/* Shift the first unit of the first item if it is an internal one.  Return
12950+ SQUEEZE_TARGET_FULL if it fails to shift an item; otherwise return
12951+ SUBTREE_MOVED. */
12952+static int shift_one_internal_unit(znode * left, znode * right)
12953+{
12954+ int ret;
12955+ carry_pool *pool;
12956+ carry_level *todo;
12957+ coord_t *coord;
12958+ carry_plugin_info *info;
12959+ int size, moved;
12960+
12961+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
12962+ assert("nikita-2435", znode_is_write_locked(left));
12963+ assert("nikita-2436", znode_is_write_locked(right));
12964+ assert("nikita-2434", sibling_link_is_ok(left, right));
12965+
12966+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
12967+ sizeof(*coord) + sizeof(*info)
12968+#if REISER4_DEBUG
12969+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
12970+#endif
12971+ );
12972+ if (IS_ERR(pool))
12973+ return PTR_ERR(pool);
12974+ todo = (carry_level *) (pool + 1);
12975+ init_carry_level(todo, pool);
12976+
12977+ coord = (coord_t *) (todo + 3);
12978+ coord_init_first_unit(coord, right);
12979+ info = (carry_plugin_info *) (coord + 1);
12980+
12981+#if REISER4_DEBUG
12982+ if (!node_is_empty(left)) {
12983+ coord_t *last;
12984+ reiser4_key *right_key;
12985+ reiser4_key *left_key;
12986+
12987+ last = (coord_t *) (info + 1);
12988+ right_key = (reiser4_key *) (last + 1);
12989+ left_key = right_key + 1;
12990+ coord_init_last_unit(last, left);
12991+
12992+ assert("nikita-2463",
12993+ keyle(item_key_by_coord(last, left_key),
12994+ item_key_by_coord(coord, right_key)));
12995+ }
12996+#endif
12997+
12998+ assert("jmacd-2007", item_is_internal(coord));
12999+
13000+ size = item_length_by_coord(coord);
13001+ info->todo = todo;
13002+ info->doing = NULL;
13003+
13004+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13005+ 1 /* delete @right if it becomes empty */,
13006+ 0 /* do not move coord @coord to node @left */,
13007+ info);
13012+
13013+ /* If shift returns positive, then we shifted the item. */
13014+ assert("vs-423", ret <= 0 || size == ret);
13015+ moved = (ret > 0);
13016+
13017+ if (moved) {
13018+ /* something was moved */
13019+ reiser4_tree *tree;
13020+ int grabbed;
13021+
13022+ znode_make_dirty(left);
13023+ znode_make_dirty(right);
13024+ tree = znode_get_tree(left);
13025+ write_lock_dk(tree);
13026+ update_znode_dkeys(left, right);
13027+ write_unlock_dk(tree);
13028+
13029+ /* reserve space for delimiting keys after shifting */
13030+ grabbed = get_current_context()->grabbed_blocks;
13031+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13032+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13033+
13034+ ret = reiser4_carry(todo, NULL /* previous level */ );
13035+ grabbed2free_mark(grabbed);
13036+ }
13037+
13038+ done_carry_pool(pool);
13039+
13040+ if (ret != 0) {
13041+ /* Shift or carry operation failed. */
13042+ assert("jmacd-7325", ret < 0);
13043+ return ret;
13044+ }
13045+
13046+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13047+}
13048+
13049+/* Make the final relocate/wander decision during forward parent-first squalloc for a
13050+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13051+static int
13052+allocate_znode_loaded(znode * node,
13053+ const coord_t * parent_coord, flush_pos_t * pos)
13054+{
13055+ int ret;
13056+ reiser4_super_info_data *sbinfo = get_current_super_private();
13057+ /* FIXME(D): We have the node write-locked and should have checked for
13058+ !allocated() somewhere before reaching this point, but there can be a race, so
13059+ this assertion is bogus. */
13060+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13061+ assert("jmacd-7988", znode_is_write_locked(node));
13062+ assert("jmacd-7989", coord_is_invalid(parent_coord)
13063+ || znode_is_write_locked(parent_coord->node));
13064+
13065+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13066+ znode_is_root(node) ||
13067+ /* We have enough nodes to relocate no matter what. */
13068+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13069+ /* No need to decide for new nodes; they are treated the same as
13070+ relocate. If the root node is dirty, relocate. */
13071+ if (pos->preceder.blk == 0) {
13072+ /* the preceder is unknown and we have decided to relocate the node --
13073+ using the default value as the search start is better than searching
13074+ from block #0. */
13075+ get_blocknr_hint_default(&pos->preceder.blk);
13076+ check_preceder(pos->preceder.blk);
13077+ }
13078+
13079+ goto best_reloc;
13080+
13081+ } else if (pos->preceder.blk == 0) {
13082+ /* If we don't know the preceder, leave it where it is. */
13083+ jnode_make_wander(ZJNODE(node));
13084+ } else {
13085+ /* Make a decision based on block distance. */
13086+ reiser4_block_nr dist;
13087+ reiser4_block_nr nblk = *znode_get_block(node);
13088+
13089+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13090+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13091+ assert("jmacd-6174", pos->preceder.blk != 0);
13092+
13093+ if (pos->preceder.blk == nblk - 1) {
13094+ /* Ideal. */
13095+ jnode_make_wander(ZJNODE(node));
13096+ } else {
13097+
13098+ dist = (nblk < pos->preceder.blk) ?
13099+ (pos->preceder.blk - nblk) :
13100+ (nblk - pos->preceder.blk);
13103+
13104+ /* See if we can find a closer block (forward direction only). */
13105+ pos->preceder.max_dist =
13106+ min((reiser4_block_nr) sbinfo->flush.relocate_distance, dist);
13108+ pos->preceder.level = znode_get_level(node);
13109+
13110+ ret = allocate_znode_update(node, parent_coord, pos);
13111+
13112+ pos->preceder.max_dist = 0;
13113+
13114+ if (ret && (ret != -ENOSPC))
13115+ return ret;
13116+
13117+ if (ret == 0) {
13118+ /* Got a better allocation. */
13119+ znode_make_reloc(node, pos->fq);
13120+ } else if (dist < sbinfo->flush.relocate_distance) {
13121+ /* The present allocation is good enough. */
13122+ jnode_make_wander(ZJNODE(node));
13123+ } else {
13124+ /* Otherwise, try to relocate to the best position. */
13125+ best_reloc:
13126+ ret = allocate_znode_update(node, parent_coord, pos);
13129+ if (ret != 0)
13130+ return ret;
13131+
13132+ /* set JNODE_RELOC bit _after_ node gets allocated */
13133+ znode_make_reloc(node, pos->fq);
13134+ }
13135+ }
13136+ }
13137+
13138+ /* This is the new preceder. */
13139+ pos->preceder.blk = *znode_get_block(node);
13140+ check_preceder(pos->preceder.blk);
13141+ pos->alloc_cnt += 1;
13142+
13143+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13144+
13145+ return 0;
13146+}
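The relocate/wander policy above, reduced to its arithmetic. This sketch deliberately ignores the intermediate attempt to allocate a closer block (allocate_znode_update) and uses illustrative names only; relocate_distance stands in for sbinfo->flush.relocate_distance.

#include <stdio.h>

enum { WANDER, RELOCATE };

static int decide(unsigned long preceder, unsigned long nblk,
		  unsigned long relocate_distance)
{
	unsigned long dist;

	if (preceder == 0)
		return WANDER;	/* preceder unknown: leave the node in place */
	if (preceder == nblk - 1)
		return WANDER;	/* already immediately after the preceder */
	dist = nblk < preceder ? preceder - nblk : nblk - preceder;
	/* close enough to the preceder: keep the present allocation */
	return dist < relocate_distance ? WANDER : RELOCATE;
}

int main(void)
{
	printf("%d %d %d\n",
	       decide(100, 101, 64),	/* 0: ideal placement, wander */
	       decide(100, 120, 64),	/* 0: within distance, wander */
	       decide(100, 500, 64));	/* 1: too far away, relocate */
	return 0;
}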
13147+
13148+static int
13149+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13150+{
13151+ /*
13152+ * perform znode allocation with znode pinned in memory to avoid races
13153+ * with asynchronous emergency flush (which plays with
13154+ * JNODE_FLUSH_RESERVED bit).
13155+ */
13156+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13157+}
13158+
13159+/* A subroutine of allocate_znode, this is called first to see if there is a close
13160+ position to relocate to. It may return -ENOSPC if there is no close enough
13161+ position, in which case the node is not relocated. On success it updates the
13162+ parent node with the relocated block address. */
13163+static int
13164+allocate_znode_update(znode * node, const coord_t * parent_coord,
13165+ flush_pos_t * pos)
13166+{
13167+ int ret;
13168+ reiser4_block_nr blk;
13169+ lock_handle uber_lock;
13170+ int flush_reserved_used = 0;
13171+ int grabbed;
13172+ reiser4_context *ctx;
13173+ reiser4_super_info_data *sbinfo;
13174+
13175+ init_lh(&uber_lock);
13176+
13177+ ctx = get_current_context();
13178+ sbinfo = get_super_private(ctx->super);
13179+
13180+ grabbed = ctx->grabbed_blocks;
13181+
13182+ /* discard e-flush allocation */
13183+ ret = zload(node);
13184+ if (ret)
13185+ return ret;
13186+
13187+ if (ZF_ISSET(node, JNODE_CREATED)) {
13188+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13189+ pos->preceder.block_stage = BLOCK_UNALLOCATED;
13190+ } else {
13191+ pos->preceder.block_stage = BLOCK_GRABBED;
13192+
13193+ /* The disk space for relocating @node is already reserved in the "flush reserved"
13194+ * counter if @node is a leaf; otherwise we grab space using BA_RESERVED (meaning
13195+ * we may grab space from the whole disk, not from only 95% of it). */
13196+ if (znode_get_level(node) == LEAF_LEVEL) {
13197+ /*
13198+ * earlier (during do_jnode_make_dirty()) we decided
13199+ * that @node can possibly go into overwrite set and
13200+ * reserved block for its wandering location.
13201+ */
13202+ txn_atom *atom = get_current_atom_locked();
13203+ assert("nikita-3449",
13204+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13205+ flush_reserved2grabbed(atom, (__u64) 1);
13206+ spin_unlock_atom(atom);
13207+ /*
13208+ * we are trying to move node into relocate
13209+ * set. Allocation of relocated position "uses"
13210+ * reserved block.
13211+ */
13212+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
13213+ flush_reserved_used = 1;
13214+ } else {
13215+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13216+ if (ret != 0)
13217+ goto exit;
13218+ }
13219+ }
13220+
13221+ /* We may end up not using the 5% of reserved disk space here, in which case flush will not pack tightly. */
13222+ ret = reiser4_alloc_block(&pos->preceder, &blk,
13223+ BA_FORMATTED | BA_PERMANENT);
13224+ if (ret)
13225+ goto exit;
13226+
13227+ if (!ZF_ISSET(node, JNODE_CREATED) &&
13228+ (ret = reiser4_dealloc_block(znode_get_block(node), 0,
13229+ BA_DEFER | BA_FORMATTED)))
13231+ goto exit;
13232+
13233+ if (likely(!znode_is_root(node))) {
13234+ item_plugin *iplug;
13235+
13236+ iplug = item_plugin_by_coord(parent_coord);
13237+ assert("nikita-2954", iplug->f.update != NULL);
13238+ iplug->f.update(parent_coord, &blk);
13239+
13240+ znode_make_dirty(parent_coord->node);
13241+
13242+ } else {
13243+ reiser4_tree *tree = znode_get_tree(node);
13244+ znode *uber;
13245+
13246+ /* We take a longterm lock on the fake node in order to change
13247+ the root block number. This may cause atom fusion. */
13248+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13249+ &uber_lock);
13250+ /* The fake node cannot be deleted, and we must have priority
13251+ here, and may not be confused with ENOSPC. */
13252+ assert("jmacd-74412",
13253+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13254+
13255+ if (ret)
13256+ goto exit;
13257+
13258+ uber = uber_lock.node;
13259+
13260+ write_lock_tree(tree);
13261+ tree->root_block = blk;
13262+ write_unlock_tree(tree);
13263+
13264+ znode_make_dirty(uber);
13265+ }
13266+
13267+ ret = znode_rehash(node, &blk);
13268+ exit:
13269+ if (ret) {
13270+ /* Get the flush reserved block back if something fails, because
13271+ * callers assume that on error the block wasn't relocated and its
13272+ * flush reserved block wasn't used. */
13273+ if (flush_reserved_used) {
13274+ /*
13275+ * ok, we failed to move node into relocate
13276+ * set. Restore status quo.
13277+ */
13278+ grabbed2flush_reserved((__u64) 1);
13279+ ZF_SET(node, JNODE_FLUSH_RESERVED);
13280+ }
13281+ }
13282+ zrelse(node);
13283+ done_lh(&uber_lock);
13284+ grabbed2free_mark(grabbed);
13285+ return ret;
13286+}
13287+
13288+/* JNODE INTERFACE */
13289+
13290+/* Lock a node (if formatted) and then get its parent locked, set the child's
13291+ coordinate in the parent. If the child is the root node, the above_root
13292+ znode is returned but the coord is not set. This function may cause atom
13293+ fusion, but it is only used for read locks (at this point) and therefore
13294+ fusion only occurs when the parent is already dirty. */
13295+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13296+ pointer in jnodes. */
13297+static int
13298+jnode_lock_parent_coord(jnode * node,
13299+ coord_t * coord,
13300+ lock_handle * parent_lh,
13301+ load_count * parent_zh,
13302+ znode_lock_mode parent_mode, int try)
13303+{
13304+ int ret;
13305+
13306+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13307+ assert("edward-54", jnode_is_unformatted(node)
13308+ || znode_is_any_locked(JZNODE(node)));
13309+
13310+ if (!jnode_is_znode(node)) {
13311+ reiser4_key key;
13312+ tree_level stop_level = TWIG_LEVEL;
13313+ lookup_bias bias = FIND_EXACT;
13314+
13315+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13316+
13317+ /* The case when the node is not a znode but can have a parent coord
13318+ (an unformatted node, a node which represents a cluster page,
13319+ etc.). Generate a key for the appropriate entry and search
13320+ the tree using coord_by_key, which handles locking for
13321+ us. */
13322+
13323+ /*
13324+ * Nothing is locked at this moment, so nothing prevents a
13325+ * concurrent truncate from removing the jnode from its inode. To
13326+ * prevent this, spin-lock the jnode. The jnode can still be truncated
13327+ * just after the call to jnode_build_key(), but this is ok,
13328+ * because coord_by_key() will simply fail to find the corresponding
13329+ * extent.
13330+ */
13331+ spin_lock_jnode(node);
13332+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13333+ jnode_build_key(node, &key);
13334+ ret = 0;
13335+ } else
13336+ ret = RETERR(-ENOENT);
13337+ spin_unlock_jnode(node);
13338+
13339+ if (ret != 0)
13340+ return ret;
13341+
13342+ if (jnode_is_cluster_page(node))
13343+ stop_level = LEAF_LEVEL;
13344+
13345+ assert("jmacd-1812", coord != NULL);
13346+
13347+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13348+ parent_mode, bias, stop_level, stop_level,
13349+ CBK_UNIQUE, NULL /*ra_info */ );
13350+ switch (ret) {
13351+ case CBK_COORD_NOTFOUND:
13352+ assert("edward-1038",
13353+ ergo(jnode_is_cluster_page(node),
13354+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13355+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13356+ warning("nikita-3177", "Parent not found");
13357+ return ret;
13358+ case CBK_COORD_FOUND:
13359+ if (coord->between != AT_UNIT) {
13360+ /* FIXME: comment needed */
13361+ done_lh(parent_lh);
13362+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13363+ warning("nikita-3178",
13364+ "Found but not happy: %i",
13365+ coord->between);
13366+ }
13367+ return RETERR(-ENOENT);
13368+ }
13369+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13370+ if (ret != 0)
13371+ return ret;
13372+ /* if (jnode_is_cluster_page(node)) {
13373+ races with write() are possible
13374+ check_child_cluster (parent_lh->node);
13375+ }
13376+ */
13377+ break;
13378+ default:
13379+ return ret;
13380+ }
13381+
13382+ } else {
13383+ int flags;
13384+ znode *z;
13385+
13386+ z = JZNODE(node);
13387+ /* Formatted node case: */
13388+ assert("jmacd-2061", !znode_is_root(z));
13389+
13390+ flags = GN_ALLOW_NOT_CONNECTED;
13391+ if (try)
13392+ flags |= GN_TRY_LOCK;
13393+
13394+ ret = reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13396+ if (ret != 0)
13397+ /* -E_REPEAT is ok here, it is handled by the caller. */
13398+ return ret;
13399+
13400+ /* Make the child's position "hint" up-to-date. (Unless above
13401+ root, which caller must check.) */
13402+ if (coord != NULL) {
13403+
13404+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13405+ if (ret != 0) {
13406+ warning("jmacd-976812386",
13407+ "incr_load_count_znode failed: %d",
13408+ ret);
13409+ return ret;
13410+ }
13411+
13412+ ret = find_child_ptr(parent_lh->node, z, coord);
13413+ if (ret != 0) {
13414+ warning("jmacd-976812",
13415+ "find_child_ptr failed: %d", ret);
13416+ return ret;
13417+ }
13418+ }
13419+ }
13420+
13421+ return 0;
13422+}
13423+
13424+/* Get the (locked) next neighbor of a znode, provided it is dirty and a member of
13425+ the same atom. If there is no next neighbor, or the neighbor is not in memory, or
13426+ there is a neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is
13427+ returned. In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0 */
13428+static int neighbor_in_slum(znode * node, /* starting point */
13429+ lock_handle * lock, /* lock on starting point */
13430+ sideof side, /* left or right direction we seek the next node in */
13431+ znode_lock_mode mode, /* kind of lock we want */
13432+ int check_dirty /* true if the neighbor should be dirty */)
13433+{
13434+ int ret;
13435+
13436+ assert("jmacd-6334", znode_is_connected(node));
13437+
13438+ ret = reiser4_get_neighbor(lock, node, mode,
13439+ GN_SAME_ATOM |
13440+ (side == LEFT_SIDE ? GN_GO_LEFT : 0));
13442+
13443+ if (ret) {
13444+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
13445+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13446+ if (ret == -ENOENT) {
13447+ ret = RETERR(-E_NO_NEIGHBOR);
13448+ }
13449+
13450+ return ret;
13451+ }
13452+ if (!check_dirty)
13453+ return 0;
13454+ /* Check dirty bit of locked znode, no races here */
13455+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13456+ return 0;
13457+
13458+ done_lh(lock);
13459+ return RETERR(-E_NO_NEIGHBOR);
13460+}
13461+
13462+/* Return true if two znodes have the same parent. This is called with both nodes
13463+ write-locked (for squeezing) so no tree lock is needed. */
13464+static int znode_same_parents(znode * a, znode * b)
13465+{
13466+ int result;
13467+
13468+ assert("jmacd-7011", znode_is_write_locked(a));
13469+ assert("jmacd-7012", znode_is_write_locked(b));
13470+
13471+ /* We lock the whole tree for this check.... I really don't like whole tree
13472+ * locks... -Hans */
13473+ read_lock_tree(znode_get_tree(a));
13474+ result = (znode_parent(a) == znode_parent(b));
13475+ read_unlock_tree(znode_get_tree(a));
13476+ return result;
13477+}
13478+
13479+/* FLUSH SCAN */
13480+
13481+/* Initialize the flush_scan data structure. */
13482+static void scan_init(flush_scan * scan)
13483+{
13484+ memset(scan, 0, sizeof(*scan));
13485+ init_lh(&scan->node_lock);
13486+ init_lh(&scan->parent_lock);
13487+ init_load_count(&scan->parent_load);
13488+ init_load_count(&scan->node_load);
13489+ coord_init_invalid(&scan->parent_coord, NULL);
13490+}
13491+
13492+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13493+static void scan_done(flush_scan * scan)
13494+{
13495+ done_load_count(&scan->node_load);
13496+ if (scan->node != NULL) {
13497+ jput(scan->node);
13498+ scan->node = NULL;
13499+ }
13500+ done_load_count(&scan->parent_load);
13501+ done_lh(&scan->parent_lock);
13502+ done_lh(&scan->node_lock);
13503+}
13504+
13505+/* Returns true if flush scanning is finished. */
13506+int reiser4_scan_finished(flush_scan * scan)
13507+{
13508+ return scan->stop || (scan->direction == RIGHT_SIDE &&
13509+ scan->count >= scan->max_count);
13510+}
13511+
13512+/* Return true if the scan should continue to @tonode, i.e. if the node meets the
13513+ same_slum_check condition. If not, drop the reference to @tonode and stop the scan. */
13514+int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13515+{
13516+ int go = same_slum_check(scan->node, tonode, 1, 0);
13517+
13518+ if (!go) {
13519+ scan->stop = 1;
13520+ jput(tonode);
13521+ }
13522+
13523+ return go;
13524+}
13525+
13526+/* Set the current scan->node, take a reference to it, increment the count by
13527+ @add_count (the number of nodes to account for, e.g., skipped unallocated nodes),
13528+ drop the reference to the previous current node, and copy the current parent coordinate. */
13529+int
13530+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13531+ const coord_t * parent)
13532+{
13533+ /* Release the old references, take the new reference. */
13534+ done_load_count(&scan->node_load);
13535+
13536+ if (scan->node != NULL) {
13537+ jput(scan->node);
13538+ }
13539+ scan->node = node;
13540+ scan->count += add_count;
13541+
13542+ /* This next statement is somewhat inefficient.  The reiser4_scan_extent() code
13543+ could delay this update step until it finishes and update the parent_coord only
13544+ once. It did that before, but there was a bug and this was the easiest way to
13545+ make it correct. */
13546+ if (parent != NULL) {
13547+ coord_dup(&scan->parent_coord, parent);
13548+ }
13549+
13550+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13551+ is safely taken. */
13552+ return incr_load_count_jnode(&scan->node_load, node);
13553+}
13554+
13555+/* Return true if scanning in the leftward direction. */
13556+int reiser4_scanning_left(flush_scan * scan)
13557+{
13558+ return scan->direction == LEFT_SIDE;
13559+}
13560+
13561+/* Performs leftward scanning starting from either kind of node. Counts the starting
13562+ node. The right-scan object is passed in for the left-scan in order to copy the parent
13563+ of an unformatted starting position. This way we avoid searching for the unformatted
13564+ node's parent when scanning in each direction: the parent is searched for once
13565+ and set in both scan objects. The limit parameter tells flush-scan when to stop.
13566+
13567+ Rapid scanning is used only during scan_left, where we are interested in finding the
13568+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13569+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13570+ problem is finding a way to flush only those nodes without unallocated children, and it
13571+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13572+ problem can be solved by scanning left at every level as we go upward, but this would
13573+ basically bring us back to using a top-down allocation strategy, which we already tried
13574+ (see BK history from May 2002), and has a different set of problems. The top-down
13575+ strategy makes avoiding unallocated children easier, but makes it difficult to
13576+ properly flush dirty children with clean parents that would otherwise stop the
13577+ top-down flush, only later to dirty the parent once the children are flushed. So we
13578+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13579+ only.
13580+
13581+ The first step in solving the problem is this rapid leftward scan. After we determine
13582+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13583+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD,
13584+ we are no longer interested in the exact count; we are only interested in finding the
13585+ best place to start the flush.  We could choose one of two possibilities:
13586+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13587+ This requires checking one leaf per rapid-scan twig
13588+ This requires checking one leaf per rapid-scan twig.
13589+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13590+ to the left. This requires checking possibly all of the in-memory children of each
13591+ twig during the rapid scan.
13592+
13593+ For now we implement the first policy.
13594+*/
13595+static int
13596+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13597+{
13598+ int ret = 0;
13599+
13600+ scan->max_count = limit;
13601+ scan->direction = LEFT_SIDE;
13602+
13603+ ret = scan_set_current(scan, jref(node), 1, NULL);
13604+ if (ret != 0) {
13605+ return ret;
13606+ }
13607+
13608+ ret = scan_common(scan, right);
13609+ if (ret != 0) {
13610+ return ret;
13611+ }
13612+
13613+ /* Before rapid scanning, we need a lock on scan->node so that we can get its
13614+ parent -- but only if the node is formatted. */
13615+ if (jnode_is_znode(scan->node)) {
13616+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13617+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13618+ }
13619+
13620+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13621+ return ret;
13622+}
13623+
13624+/* Performs rightward scanning... Does not count the starting node. The limit parameter
13625+ is described in scan_left. If the starting node is unformatted then the
13626+ parent_coord was already set during scan_left. The rapid_after parameter is not used
13627+ during right-scanning.
13628+
13629+ scan_right is only called if the scan_left operation does not count at least
13630+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13631+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13632+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13633+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13634+{
13635+ int ret;
13636+
13637+ scan->max_count = limit;
13638+ scan->direction = RIGHT_SIDE;
13639+
13640+ ret = scan_set_current(scan, jref(node), 0, NULL);
13641+ if (ret != 0) {
13642+ return ret;
13643+ }
13644+
13645+ return scan_common(scan, NULL);
13646+}
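The scan-budget relation described above, as a one-function sketch. THRESHOLD stands in for FLUSH_RELOCATE_THRESHOLD and right_scan_limit is a hypothetical helper, not a function in this patch: a zero limit corresponds to skipping the right scan entirely.

#include <stdio.h>

#define THRESHOLD 64	/* stands in for FLUSH_RELOCATE_THRESHOLD */

static unsigned right_scan_limit(unsigned left_count)
{
	return left_count >= THRESHOLD ? 0 : THRESHOLD - left_count;
}

int main(void)
{
	printf("%u\n", right_scan_limit(10));	/* 54: right scan runs */
	printf("%u\n", right_scan_limit(80));	/* 0: right scan skipped */
	return 0;
}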
13647+
13648+/* Common code to perform left or right scanning. */
13649+static int scan_common(flush_scan * scan, flush_scan * other)
13650+{
13651+ int ret;
13652+
13653+ assert("nikita-2376", scan->node != NULL);
13654+ assert("edward-54", jnode_is_unformatted(scan->node)
13655+ || jnode_is_znode(scan->node));
13656+
13657+ /* Special case for starting at an unformatted node. Optimization: we only want
13658+ to search for the parent (which requires a tree traversal) once. Obviously, we
13659+ shouldn't have to call it once for the left scan and once for the right scan.
13660+ For this reason, if we search for the parent during scan-left we then duplicate
13661+ the coord/lock/load into the scan-right object. */
13662+ if (jnode_is_unformatted(scan->node)) {
13663+ ret = scan_unformatted(scan, other);
13664+ if (ret != 0)
13665+ return ret;
13666+ }
13667+ /* This loop expects to start at a formatted position and performs chaining of
13668+ formatted regions */
13669+ while (!reiser4_scan_finished(scan)) {
13670+
13671+ ret = scan_formatted(scan);
13672+ if (ret != 0) {
13673+ return ret;
13674+ }
13675+ }
13676+
13677+ return 0;
13678+}
13679+
13680+static int scan_unformatted(flush_scan * scan, flush_scan * other)
13681+{
13682+ int ret = 0;
13683+ int try = 0;
13684+
13685+ if (!coord_is_invalid(&scan->parent_coord))
13686+ goto scan;
13687+
13688+ /* set the parent coord from the current (formatted or unformatted) position: */
13689+ if (!jnode_is_unformatted(scan->node)) {
13690+ /* formatted position */
13691+
13692+ lock_handle lock;
13693+ assert("edward-301", jnode_is_znode(scan->node));
13694+ init_lh(&lock);
13695+
13696+ /*
13697+ * When flush starts from an unformatted node, the first thing it
13698+ * does is a tree traversal to find the formatted parent of the
13699+ * starting node. This parent is then kept locked across the scans
13700+ * to the left and to the right. This means that during the scan to
13701+ * the left we cannot take a left-ward lock, because this is
13702+ * deadlock prone. So, if we are scanning to the left and there is
13703+ * already a lock held by this thread,
13704+ * jnode_lock_parent_coord() should use a try-lock.
13705+ */
13706+ try = reiser4_scanning_left(scan)
13707+ && !lock_stack_isclean(get_current_lock_stack());
13708+ /* We need the node locked to get the parent lock. We have to
13709+ take a write lock since there is at least one call path
13710+ where this znode is already write-locked by us. */
13711+ ret = longterm_lock_znode(&lock, JZNODE(scan->node),
13712+ ZNODE_WRITE_LOCK,
13713+ reiser4_scanning_left(scan) ?
13714+ ZNODE_LOCK_LOPRI : ZNODE_LOCK_HIPRI);
13717+ if (ret != 0)
13718+ /* EINVAL or E_DEADLOCK here mean... try again!  At this point we've
13719+ scanned too far and can't back out; just start over. */
13720+ return ret;
13721+
13722+ ret = jnode_lock_parent_coord(scan->node,
13723+ &scan->parent_coord,
13724+ &scan->parent_lock,
13725+ &scan->parent_load,
13726+ ZNODE_WRITE_LOCK, try);
13727+
13728+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13729+ done_lh(&lock);
13730+ if (ret == -E_REPEAT) {
13731+ scan->stop = 1;
13732+ return 0;
13733+ }
13734+ if (ret)
13735+ return ret;
13736+
13737+ } else {
13738+ /* unformatted position */
13739+
13740+ ret = jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13741+ &scan->parent_lock,
13742+ &scan->parent_load,
13743+ ZNODE_WRITE_LOCK, try);
13745+
13746+ if (IS_CBKERR(ret))
13747+ return ret;
13748+
13749+ if (ret == CBK_COORD_NOTFOUND)
13750+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13751+ return ret;
13752+
13753+ /* parent was found */
13754+ assert("jmacd-8661", other != NULL);
13755+ /* Duplicate the reference into the other flush_scan. */
13756+ coord_dup(&other->parent_coord, &scan->parent_coord);
13757+ copy_lh(&other->parent_lock, &scan->parent_lock);
13758+ copy_load_count(&other->parent_load, &scan->parent_load);
13759+ }
13760+ scan:
13761+ return scan_by_coord(scan);
13762+}
13763+
13764+/* Performs left- or rightward scanning starting from a formatted node. Follow left
13765+ pointers under tree lock as long as:
13766+
13767+ - node->left/right is non-NULL
13768+ - node->left/right is connected, dirty
13769+ - node->left/right belongs to the same atom
13770+ - scan has not reached maximum count
13771+*/
13772+static int scan_formatted(flush_scan * scan)
13773+{
13774+ int ret;
13775+ znode *neighbor = NULL;
13776+
13777+ assert("jmacd-1401", !reiser4_scan_finished(scan));
13778+
13779+ do {
13780+ znode *node = JZNODE(scan->node);
13781+
13782+ /* Node should be connected; if not, stop the scan. */
13783+ if (!znode_is_connected(node)) {
13784+ scan->stop = 1;
13785+ break;
13786+ }
13787+
13788+ /* Lock the tree, check-for and reference the next sibling. */
13789+ read_lock_tree(znode_get_tree(node));
13790+
13791+ /* It may be that a node is inserted or removed between a node and its
13792+ left sibling while the tree lock is released, but the flush-scan count
13793+ does not need to be precise. Thus, we release the tree lock as soon as
13794+ we get the neighboring node. */
13795+ neighbor =
13796+ reiser4_scanning_left(scan) ? node->left : node->right;
13797+ if (neighbor != NULL) {
13798+ zref(neighbor);
13799+ }
13800+
13801+ read_unlock_tree(znode_get_tree(node));
13802+
13803+ /* If neighbor is NULL at the leaf level, we need to check for an unformatted
13804+ sibling using the parent -- break in any case. */
13805+ if (neighbor == NULL) {
13806+ break;
13807+ }
13808+
13809+ /* Check the condition for going left; break if it is not met. This also
13810+ releases (jputs) the neighbor if the check fails. */
13811+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13812+ break;
13813+ }
13814+
13815+ /* Advance the flush_scan state to the left, repeat. */
13816+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13817+ if (ret != 0) {
13818+ return ret;
13819+ }
13820+
13821+ } while (!reiser4_scan_finished(scan));
13822+
13823+ /* If neighbor is NULL then we have reached the end of a formatted region, or
13824+ else the sibling is out of memory; now check for an extent to the left (but
13825+ only at LEAF_LEVEL). */
13826+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13827+ || reiser4_scan_finished(scan)) {
13828+ scan->stop = 1;
13829+ return 0;
13830+ }
13831+ /* Otherwise, call scan_by_coord for the right(left)most item of the
13832+ left(right) neighbor on the parent level, then possibly continue. */
13833+
13834+ coord_init_invalid(&scan->parent_coord, NULL);
13835+ return scan_unformatted(scan, NULL);
13836+}
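The "pin under the lock, then drop the lock" walk used by scan_formatted() above, modeled on a plain linked list with one global mutex and per-node reference counts. All names are hypothetical; the demo is single-threaded, so the unlocked refcount updates are safe here in a way they would not be in the kernel.

#include <pthread.h>
#include <stdio.h>

struct node { struct node *left; int refs; int dirty; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* count the starting node plus its chain of dirty left siblings,
 * pinning each neighbor before the lock is released */
static int count_dirty_leftward(struct node *n)
{
	int count = 1;

	n->refs++;
	for (;;) {
		struct node *prev;

		pthread_mutex_lock(&tree_lock);
		prev = n->left;
		if (prev)
			prev->refs++;	/* pin before dropping the lock */
		pthread_mutex_unlock(&tree_lock);

		n->refs--;
		if (!prev || !prev->dirty) {
			if (prev)
				prev->refs--;
			return count;
		}
		n = prev;
		count++;
	}
}

int main(void)
{
	struct node a = { NULL, 0, 0 };
	struct node b = { &a,   0, 1 };
	struct node c = { &b,   0, 1 };

	printf("%d\n", count_dirty_leftward(&c));	/* 2: c and b */
	return 0;
}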
13837+
13838+/* NOTE-EDWARD:
13839+ This scans adjacent items of the same type and calls the item plugin's flush
13840+ scan method for each one. It performs left(right)ward scanning starting from a
13841+ (possibly) unformatted node. If we start from an unformatted node, then we
13842+ continue only if the next neighbor is also unformatted. When called from
13843+ scan_formatted, we skip the first iteration (to make sure that the right(left)most
13844+ item of the left(right) neighbor on the parent level is of the same type, and to set the appropriate coord). */
13845+static int scan_by_coord(flush_scan * scan)
13846+{
13847+ int ret = 0;
13848+ int scan_this_coord;
13849+ lock_handle next_lock;
13850+ load_count next_load;
13851+ coord_t next_coord;
13852+ jnode *child;
13853+ item_plugin *iplug;
13854+
13855+ init_lh(&next_lock);
13856+ init_load_count(&next_load);
13857+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13858+
13859+ /* set initial item id */
13860+ iplug = item_plugin_by_coord(&scan->parent_coord);
13861+
13862+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13863+ if (scan_this_coord) {
13864+ /* Here we expect the unit to be scannable; it might not be, due to a
13865+ * race with extent->tail conversion. */
13866+ if (iplug->f.scan == NULL) {
13867+ scan->stop = 1;
13868+ ret = -E_REPEAT;
13869+ /* skip the check at the end. */
13870+ goto race;
13871+ }
13872+
13873+ ret = iplug->f.scan(scan);
13874+ if (ret != 0)
13875+ goto exit;
13876+
13877+ if (reiser4_scan_finished(scan)) {
13878+ checkchild(scan);
13879+ break;
13880+ }
13881+ } else {
13882+ /* the same race against truncate as above is possible
13883+ * here, it seems */
13884+
13885+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13886+ the first coordinate. */
13887+ assert("jmacd-1231",
13888+ item_is_internal(&scan->parent_coord));
13889+ }
13890+
13891+ if (iplug->f.utmost_child == NULL
13892+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13893+ /* stop at this coord and continue on the parent level */
13894+ ret = scan_set_current(scan,
13895+ ZJNODE(zref(scan->parent_coord.node)),
13896+ 1, NULL);
13899+ if (ret != 0)
13900+ goto exit;
13901+ break;
13902+ }
13903+
13904+ /* Either way, the invariant is that scan->parent_coord is set to the
13905+ parent of scan->node. Now get the next unit. */
13906+ coord_dup(&next_coord, &scan->parent_coord);
13907+ coord_sideof_unit(&next_coord, scan->direction);
13908+
13909+ /* If off-the-end of the twig, try the next twig. */
13910+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13911+ /* We take the write lock because we may start flushing from this
13912+ * coordinate. */
13913+ ret =
13914+ neighbor_in_slum(next_coord.node, &next_lock,
13915+ scan->direction, ZNODE_WRITE_LOCK,
13916+ 1 /* check dirty */ );
13917+ if (ret == -E_NO_NEIGHBOR) {
13918+ scan->stop = 1;
13919+ ret = 0;
13920+ break;
13921+ }
13922+
13923+ if (ret != 0) {
13924+ goto exit;
13925+ }
13926+
13927+ ret = incr_load_count_znode(&next_load, next_lock.node);
13928+ if (ret != 0) {
13929+ goto exit;
13930+ }
13931+
13932+ coord_init_sideof_unit(&next_coord, next_lock.node,
13933+ sideof_reverse(scan->direction));
13934+ }
13935+
13936+ iplug = item_plugin_by_coord(&next_coord);
13937+
13938+ /* Get the next child. */
13939+ ret =
13940+ iplug->f.utmost_child(&next_coord,
13941+ sideof_reverse(scan->direction),
13942+ &child);
13943+ if (ret != 0)
13944+ goto exit;
13945+ /* If the next child is not in memory, or item_utmost_child failed (most
13946+ probably due to a race with unlink), stop here. */
13948+ if (child == NULL || IS_ERR(child)) {
13949+ scan->stop = 1;
13950+ checkchild(scan);
13951+ break;
13952+ }
13953+
13954+ assert("nikita-2374", jnode_is_unformatted(child)
13955+ || jnode_is_znode(child));
13956+
13957+ /* See if it is dirty, part of the same atom. */
13958+ if (!reiser4_scan_goto(scan, child)) {
13959+ checkchild(scan);
13960+ break;
13961+ }
13962+
13963+ /* If so, make this child current. */
13964+ ret = scan_set_current(scan, child, 1, &next_coord);
13965+ if (ret != 0)
13966+ goto exit;
13967+
13968+ /* Now continue. If the child is formatted we break out; the parent lock is
13969+ released on exit and scanning proceeds at the formatted level. */
13970+ if (jnode_is_znode(child))
13971+ break;
13972+
13973+ /* Otherwise, repeat the above loop with next_coord. */
13974+ if (next_load.node != NULL) {
13975+ done_lh(&scan->parent_lock);
13976+ move_lh(&scan->parent_lock, &next_lock);
13977+ move_load_count(&scan->parent_load, &next_load);
13978+ }
13979+ }
13980+
13981+ assert("jmacd-6233",
13982+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
13983+ exit:
13984+ checkchild(scan);
13985+ race: /* skip the above check */
13986+ if (jnode_is_znode(scan->node)) {
13987+ done_lh(&scan->parent_lock);
13988+ done_load_count(&scan->parent_load);
13989+ }
13990+
13991+ done_load_count(&next_load);
13992+ done_lh(&next_lock);
13993+ return ret;
13994+}
13995+
13996+/* FLUSH POS HELPERS */
13997+
13998+/* Initialize the fields of a flush_position. */
13999+static void pos_init(flush_pos_t * pos)
14000+{
14001+ memset(pos, 0, sizeof *pos);
14002+
14003+ pos->state = POS_INVALID;
14004+ coord_init_invalid(&pos->coord, NULL);
14005+ init_lh(&pos->lock);
14006+ init_load_count(&pos->load);
14007+
14008+ reiser4_blocknr_hint_init(&pos->preceder);
14009+}
14010+
14011+/* The flush loop inside squalloc periodically checks pos_valid to
14012+ determine when "enough flushing" has been performed. This will return true until one
14013+ of the following conditions is met:
14014+
14015+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14016+ parameter, meaning we have flushed as many blocks as the kernel requested. When
14017+ flushing to commit, this parameter is NULL.
14018+
14019+ 2. pos_stop() is called because squalloc discovers that the "next" node in the
14020+ flush order is either non-existent, not dirty, or not in the same atom.
14021+*/
14022+
14023+static int pos_valid(flush_pos_t * pos)
14024+{
14025+ return pos->state != POS_INVALID;
14026+}
14027+
14028+/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14029+static void pos_done(flush_pos_t * pos)
14030+{
14031+ pos_stop(pos);
14032+ reiser4_blocknr_hint_done(&pos->preceder);
14033+ if (convert_data(pos))
14034+ free_convert_data(pos);
14035+}
14036+
14037+/* Reset the point and parent. Called during flush subroutines to terminate the
14038+ squalloc loop. */
14039+static int pos_stop(flush_pos_t * pos)
14040+{
14041+ pos->state = POS_INVALID;
14042+ done_lh(&pos->lock);
14043+ done_load_count(&pos->load);
14044+ coord_init_invalid(&pos->coord, NULL);
14045+
14046+ if (pos->child) {
14047+ jput(pos->child);
14048+ pos->child = NULL;
14049+ }
14050+
14051+ return 0;
14052+}
14053+
14054+/* Return the flush_position's block allocator hint. */
14055+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14056+{
14057+ return &pos->preceder;
14058+}
14059+
14060+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14061+{
14062+ return pos->fq;
14063+}
14064+
14065+/* Make Linus happy.
14066+ Local variables:
14067+ c-indentation-style: "K&R"
14068+ mode-name: "LC"
14069+ c-basic-offset: 8
14070+ tab-width: 8
14071+ fill-column: 90
14072+ LocalWords: preceder
14073+ End:
14074+*/
14075diff -urN linux-2.6.20.orig/fs/reiser4/flush.h linux-2.6.20/fs/reiser4/flush.h
14076--- linux-2.6.20.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14077+++ linux-2.6.20/fs/reiser4/flush.h 2007-05-06 14:50:43.718981974 +0400
14078@@ -0,0 +1,274 @@
14079+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14080+
14081+/* DECLARATIONS: */
14082+
14083+#if !defined(__REISER4_FLUSH_H__)
14084+#define __REISER4_FLUSH_H__
14085+
14086+#include "plugin/cluster.h"
14087+
14088+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14089+ single level of the tree. A flush-scan is used for counting the number of adjacent
14090+ nodes to flush, which is used to determine whether we should relocate, and it is also
14091+ used to find a starting point for flush. A flush-scan object can scan in both right
14092+ and left directions via the scan_left() and scan_right() interfaces. The
14093+ right- and left-variations are similar but perform different functions. When scanning
14094+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14095+ When scanning right we are simply counting the number of adjacent, dirty nodes. */
14096+struct flush_scan {
14097+
14098+ /* The current number of nodes scanned on this level. */
14099+ unsigned count;
14100+
14101+ /* There may be a maximum number of nodes for a scan on any single level. When
14102+ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14103+ unsigned max_count;
14104+
14105+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14106+ sideof direction;
14107+
14108+ /* Initially @stop is set to false; it is set to true once some condition stops
14109+ the search (e.g., we found a clean node before reaching max_count, or we found
14110+ a node belonging to another atom). */
14111+ int stop;
14112+
14113+ /* The current scan position. If @node is non-NULL then its reference count has
14114+ been incremented to reflect this reference. */
14115+ jnode *node;
14116+
14117+ /* A handle for zload/zrelse of current scan position node. */
14118+ load_count node_load;
14119+
14120+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14121+ node is locked using this lock handle. The endpoint needs to be locked for
14122+ transfer to the flush_position object after scanning finishes. */
14123+ lock_handle node_lock;
14124+
14125+ /* When the position is unformatted, its parent, coordinate, and parent
14126+ zload/zrelse handle. */
14127+ lock_handle parent_lock;
14128+ coord_t parent_coord;
14129+ load_count parent_load;
14130+
14131+ /* The block allocator preceder hint. Sometimes flush_scan determines what the
14132+ preceder is and if so it sets it here, after which it is copied into the
14133+ flush_position. Otherwise, the preceder is computed later. */
14134+ reiser4_block_nr preceder_blk;
14135+};
14136+
14137+typedef struct convert_item_info {
14138+ dc_item_stat d_cur; /* disk cluster state of the current item */
14139+ dc_item_stat d_next; /* disk cluster state of the next slum item */
14140+ struct inode *inode;
14141+ flow_t flow;
14142+} convert_item_info_t;
14143+
14144+typedef struct convert_info {
14145+ int count; /* for terminating squalloc */
14146+ reiser4_cluster_t clust; /* transform cluster */
14147+ item_plugin *iplug; /* current item plugin */
14148+ convert_item_info_t *itm; /* current item info */
14149+} convert_info_t;
14150+
14151+typedef enum flush_position_state {
14152+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
14153+ * processing */
14154+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14155+ * leaf level */
14156+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14157+ * to traverse unformatted nodes */
14158+ POS_TO_LEAF, /* pos is being moved to leaf level */
14159+ POS_TO_TWIG, /* pos is being moved to twig level */
14160+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14161+ * rightmost unit of the current twig */
14162+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14163+} flushpos_state_t;
14164+
14165+/* An encapsulation of the current flush point and all the parameters that are passed
14166+ through the entire squeeze-and-allocate stage of the flush routine. A single
14167+ flush_position object is constructed after left- and right-scanning finishes. */
14168+struct flush_position {
14169+ flushpos_state_t state;
14170+
14171+ coord_t coord; /* coord to traverse unformatted nodes */
14172+ lock_handle lock; /* current lock we hold */
14173+ load_count load; /* load status for current locked formatted node */
14174+
14175+ jnode *child; /* for passing a reference to unformatted child
14176+ * across pos state changes */
14177+
14178+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14179+ int leaf_relocate; /* True if enough leaf-level nodes were
14180+ * found to suggest a relocate policy. */
14181+ int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14182+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14183+ flush_queue_t *fq;
14184+ long *nr_written; /* number of nodes submitted to disk */
14185+ int flags; /* a copy of jnode_flush flags argument */
14186+
14187+ znode *prev_twig; /* previous parent pointer value, used to catch
14188+ * processing of new twig node */
14189+ convert_info_t *sq; /* convert info */
14190+
14191+ unsigned long pos_in_unit; /* for extents only. Position
14192+ within an extent unit of first
14193+ jnode of slum */
14194+ long nr_to_write; /* number of unformatted nodes to handle on flush */
14195+};
14196+
14197+static inline int item_convert_count(flush_pos_t * pos)
14198+{
14199+ return pos->sq->count;
14200+}
14201+static inline void inc_item_convert_count(flush_pos_t * pos)
14202+{
14203+ pos->sq->count++;
14204+}
14205+static inline void set_item_convert_count(flush_pos_t * pos, int count)
14206+{
14207+ pos->sq->count = count;
14208+}
14209+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14210+{
14211+ return pos->sq->iplug;
14212+}
14213+
14214+static inline convert_info_t *convert_data(flush_pos_t * pos)
14215+{
14216+ return pos->sq;
14217+}
14218+
14219+static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14220+{
14221+ assert("edward-955", convert_data(pos));
14222+ return pos->sq->itm;
14223+}
14224+
14225+static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14226+{
14227+ return &pos->sq->clust.tc;
14228+}
14229+
14230+static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14231+{
14232+ assert("edward-854", pos->sq != NULL);
14233+ return tfm_stream(tfm_cluster_sq(pos), id);
14234+}
14235+
14236+static inline int chaining_data_present(flush_pos_t * pos)
14237+{
14238+ return convert_data(pos) && item_convert_data(pos);
14239+}
14240+
14241+/* Returns true if the next node contains the next item of the disk cluster,
14242+ in which case the item convert data should be moved to the right slum neighbor.
14243+*/
14244+static inline int should_chain_next_node(flush_pos_t * pos)
14245+{
14246+ int result = 0;
14247+
14248+ assert("edward-1007", chaining_data_present(pos));
14249+
14250+ switch (item_convert_data(pos)->d_next) {
14251+ case DC_CHAINED_ITEM:
14252+ result = 1;
14253+ break;
14254+ case DC_AFTER_CLUSTER:
14255+ break;
14256+ default:
14257+ impossible("edward-1009", "bad state of next slum item");
14258+ }
14259+ return result;
14260+}
14261+
14262+/* update item state in a disk cluster to assign conversion mode */
14263+static inline void
14264+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14265+{
14266+
14267+ assert("edward-1010", chaining_data_present(pos));
14268+
14269+ if (this_node == 0) {
14270+ /* next item is on the right neighbor */
14271+ assert("edward-1011",
14272+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14273+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14274+ assert("edward-1012",
14275+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14276+
14277+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14278+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14279+ } else {
14280+ /* next item is on the same node */
14281+ assert("edward-1013",
14282+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14283+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14284+ assert("edward-1227",
14285+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14286+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
14287+
14288+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14289+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14290+ }
14291+}
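The d_cur/d_next transitions enforced by the asserts above, condensed into a pure function. This is a standalone model with abbreviated state names, not the dc_item_stat type itself.

#include <stdio.h>

enum dc { DC_INVALID_M, DC_FIRST_M, DC_CHAINED_M, DC_AFTER_M };

struct chaining { enum dc d_cur, d_next; };

static void move_chain(struct chaining *c, int next_on_same_node)
{
	/* next item on the right neighbor -> current item becomes chained;
	 * next item on the same node -> the cluster ends within this node */
	c->d_cur = next_on_same_node ? DC_AFTER_M : DC_CHAINED_M;
	c->d_next = DC_INVALID_M;
}

int main(void)
{
	struct chaining c = { DC_FIRST_M, DC_CHAINED_M };

	move_chain(&c, 0);	/* next item is on the right neighbor */
	printf("d_cur=%d d_next=%d\n", c.d_cur, c.d_next);	/* 2 0 */
	return 0;
}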
14292+
14293+static inline int should_convert_node(flush_pos_t * pos, znode * node)
14294+{
14295+ return znode_convertible(node);
14296+}
14297+
14298+/* true if there is attached convert item info */
14299+static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14300+{
14301+ return convert_data(pos) && item_convert_data(pos);
14302+}
14303+
14304+#define SQUALLOC_THRESHOLD 256
14305+
14306+static inline int should_terminate_squalloc(flush_pos_t * pos)
14307+{
14308+ return convert_data(pos) &&
14309+ !item_convert_data(pos) &&
14310+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14311+}
14312+
14313+void free_convert_data(flush_pos_t * pos);
14314+/* used in extent.c */
14315+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14316+ const coord_t * parent);
14317+int reiser4_scan_finished(flush_scan * scan);
14318+int reiser4_scanning_left(flush_scan * scan);
14319+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14320+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14321+int reiser4_alloc_extent(flush_pos_t *flush_pos);
14322+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14323+ reiser4_key *stop_key);
14324+extern int reiser4_init_fqs(void);
14325+extern void reiser4_done_fqs(void);
14326+
14327+#if REISER4_DEBUG
14328+
14329+extern void reiser4_check_fq(const txn_atom *atom);
14330+extern atomic_t flush_cnt;
14331+
14332+#define check_preceder(blk) \
14333+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14334+extern void check_pos(flush_pos_t * pos);
14335+#else
14336+#define check_preceder(b) noop
14337+#define check_pos(pos) noop
14338+#endif
14339+
14340+/* __REISER4_FLUSH_H__ */
14341+#endif
14342+
14343+/* Make Linus happy.
14344+ Local variables:
14345+ c-indentation-style: "K&R"
14346+ mode-name: "LC"
14347+ c-basic-offset: 8
14348+ tab-width: 8
14349+ fill-column: 90
14350+ LocalWords: preceder
14351+ End:
14352+*/
14353diff -urN linux-2.6.20.orig/fs/reiser4/flush_queue.c linux-2.6.20/fs/reiser4/flush_queue.c
14354--- linux-2.6.20.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14355+++ linux-2.6.20/fs/reiser4/flush_queue.c 2007-05-06 14:50:43.718981974 +0400
14356@@ -0,0 +1,680 @@
14357+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14358+
14359+#include "debug.h"
14360+#include "super.h"
14361+#include "txnmgr.h"
14362+#include "jnode.h"
14363+#include "znode.h"
14364+#include "page_cache.h"
14365+#include "wander.h"
14366+#include "vfs_ops.h"
14367+#include "writeout.h"
14368+#include "flush.h"
14369+
14370+#include <linux/bio.h>
14371+#include <linux/mm.h>
14372+#include <linux/pagemap.h>
14373+#include <linux/blkdev.h>
14374+#include <linux/writeback.h>
14375+
14376+/* A flush queue object is an accumulator for keeping jnodes prepared
14377+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14378+ kept on the flush queue until memory pressure or atom commit asks
14379+ flush queues to write some or all of their jnodes. */
14380+
14381+/*
14382+ LOCKING:
14383+
14384+ The fq->guard spin lock protects the fq->atom pointer and nothing else. The
14385+ fq->prepped list is protected by the atom spin lock and uses the following
14386+ locking:
14387+
14388+ two ways to protect fq->prepped list for read-only list traversal:
14389+
14390+ 1. atom spin-lock atom.
14391+ 2. fq is IN_USE, atom->nr_running_queues increased.
14392+
14393+ and one for list modification:
14394+
14395+ 1. atom is spin-locked and one condition is true: fq is IN_USE or
14396+ atom->nr_running_queues == 0.
14397+
14398+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
14399+ lock flush queue, then lock jnode.
14400+*/
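The deadlock-safe order stated above, as a compilable pthread model. Struct names are illustrative and the real code uses spinlocks, but the discipline is the same: every path that needs more than one of these locks takes them in one fixed order.

#include <pthread.h>

struct atom_m  { pthread_mutex_t alock; };
struct fq_m    { pthread_mutex_t guard; };
struct jnode_m { pthread_mutex_t guard; };

/* fixed order: atom, then flush queue, then jnode */
static void with_all_locks(struct atom_m *a, struct fq_m *f,
			   struct jnode_m *j, void (*op)(void))
{
	pthread_mutex_lock(&a->alock);
	pthread_mutex_lock(&f->guard);
	pthread_mutex_lock(&j->guard);

	op();	/* e.g., move a node onto the prepped list */

	pthread_mutex_unlock(&j->guard);
	pthread_mutex_unlock(&f->guard);
	pthread_mutex_unlock(&a->alock);
}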
14401+
14402+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14403+#define fq_ready(fq) (!fq_in_use(fq))
14404+
14405+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14406+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14407+
14408+/* get lock on atom from locked flush queue object */
14409+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14410+{
14411+ /* This code is similar to jnode_get_atom(); look at it for the
14412+ * explanation. */
14413+ txn_atom *atom;
14414+
14415+ assert_spin_locked(&(fq->guard));
14416+
14417+ while (1) {
14418+ atom = fq->atom;
14419+ if (atom == NULL)
14420+ break;
14421+
14422+ if (spin_trylock_atom(atom))
14423+ break;
14424+
14425+ atomic_inc(&atom->refcount);
14426+ spin_unlock(&(fq->guard));
14427+ spin_lock_atom(atom);
14428+ spin_lock(&(fq->guard));
14429+
14430+ if (fq->atom == atom) {
14431+ atomic_dec(&atom->refcount);
14432+ break;
14433+ }
14434+
14435+ spin_unlock(&(fq->guard));
14436+ atom_dec_and_unlock(atom);
14437+ spin_lock(&(fq->guard));
14438+ }
14439+
14440+ return atom;
14441+}
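The retry pattern above, restated as a self-contained pthread model: we hold the fq guard but need the atom lock, which the documented order puts first. So we trylock against the order; on failure we pin the atom, drop our lock, retake both in the correct order, and re-validate fq->atom. Names and the simplified refcount handling are assumptions of the sketch -- the real atom_dec_and_unlock may also free the atom.

#include <pthread.h>
#include <stdatomic.h>

struct atom { pthread_mutex_t alock; atomic_int refcount; };
struct fq { pthread_mutex_t guard; struct atom *atom; };

/* returns with fq->guard still held and, if non-NULL, atom->alock held */
static struct atom *lock_atom_of_fq(struct fq *fq)
{
	struct atom *atom;

	for (;;) {
		atom = fq->atom;
		if (atom == NULL)
			return NULL;
		if (pthread_mutex_trylock(&atom->alock) == 0)
			return atom;	/* wrong-order trylock succeeded */

		/* pin the atom so it cannot vanish while we drop our lock */
		atomic_fetch_add(&atom->refcount, 1);
		pthread_mutex_unlock(&fq->guard);
		pthread_mutex_lock(&atom->alock);	/* correct order */
		pthread_mutex_lock(&fq->guard);

		if (fq->atom == atom) {
			atomic_fetch_sub(&atom->refcount, 1);
			return atom;
		}
		/* fq moved to another atom meanwhile: undo and retry */
		pthread_mutex_unlock(&fq->guard);
		pthread_mutex_unlock(&atom->alock);
		atomic_fetch_sub(&atom->refcount, 1);
		pthread_mutex_lock(&fq->guard);
	}
}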
14442+
14443+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14444+{
14445+ txn_atom *atom;
14446+
14447+ spin_lock(&(fq->guard));
14448+ atom = atom_locked_by_fq_nolock(fq);
14449+ spin_unlock(&(fq->guard));
14450+ return atom;
14451+}
14452+
14453+static void init_fq(flush_queue_t * fq)
14454+{
14455+ memset(fq, 0, sizeof *fq);
14456+
14457+ atomic_set(&fq->nr_submitted, 0);
14458+
14459+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14460+
14461+ init_waitqueue_head(&fq->wait);
14462+ spin_lock_init(&fq->guard);
14463+}
14464+
14465+/* slab for flush queues */
14466+static struct kmem_cache *fq_slab;
14467+
14468+/**
14469+ * reiser4_init_fqs - create flush queue cache
14470+ *
14471+ * Initializes slab cache of flush queues. It is part of reiser4 module
14472+ * initialization.
14473+ */
14474+int reiser4_init_fqs(void)
14475+{
14476+ fq_slab = kmem_cache_create("fq",
14477+ sizeof(flush_queue_t),
14478+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14479+ if (fq_slab == NULL)
14480+ return RETERR(-ENOMEM);
14481+ return 0;
14482+}
14483+
14484+/**
14485+ * reiser4_done_fqs - delete flush queue cache
14486+ *
14487+ * This is called on reiser4 module unloading or system shutdown.
14488+ */
14489+void reiser4_done_fqs(void)
14490+{
14491+ destroy_reiser4_cache(&fq_slab);
14492+}
14493+
14494+/* create new flush queue object */
14495+static flush_queue_t *create_fq(gfp_t gfp)
14496+{
14497+ flush_queue_t *fq;
14498+
14499+ fq = kmem_cache_alloc(fq_slab, gfp);
14500+ if (fq)
14501+ init_fq(fq);
14502+
14503+ return fq;
14504+}
14505+
14506+/* adjust atom's and flush queue's counters of queued nodes */
14507+static void count_enqueued_node(flush_queue_t * fq)
14508+{
14509+ ON_DEBUG(fq->atom->num_queued++);
14510+}
14511+
14512+static void count_dequeued_node(flush_queue_t * fq)
14513+{
14514+ assert("zam-993", fq->atom->num_queued > 0);
14515+ ON_DEBUG(fq->atom->num_queued--);
14516+}
14517+
14518+/* attach flush queue object to the atom */
14519+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14520+{
14521+ assert_spin_locked(&(atom->alock));
14522+ list_add(&fq->alink, &atom->flush_queues);
14523+ fq->atom = atom;
14524+ ON_DEBUG(atom->nr_flush_queues++);
14525+}
14526+
14527+static void detach_fq(flush_queue_t * fq)
14528+{
14529+ assert_spin_locked(&(fq->atom->alock));
14530+
14531+ spin_lock(&(fq->guard));
14532+ list_del_init(&fq->alink);
14533+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
14534+ ON_DEBUG(fq->atom->nr_flush_queues--);
14535+ fq->atom = NULL;
14536+ spin_unlock(&(fq->guard));
14537+}
14538+
14539+/* destroy flush queue object */
14540+static void done_fq(flush_queue_t * fq)
14541+{
14542+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14543+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14544+
14545+ kmem_cache_free(fq_slab, fq);
14546+}
14547+
14548+/* mark the jnode queued and account for it in the atom's counters */
14549+static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14550+{
14551+ JF_SET(node, JNODE_FLUSH_QUEUED);
14552+ count_enqueued_node(fq);
14553+}
14554+
14555+/* Putting jnode into the flush queue. Both atom and jnode should be
14556+ spin-locked. */
14557+void queue_jnode(flush_queue_t * fq, jnode * node)
14558+{
14559+ assert_spin_locked(&(node->guard));
14560+ assert("zam-713", node->atom != NULL);
14561+ assert_spin_locked(&(node->atom->alock));
14562+ assert("zam-716", fq->atom != NULL);
14563+ assert("zam-717", fq->atom == node->atom);
14564+ assert("zam-907", fq_in_use(fq));
14565+
14566+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14567+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14568+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14569+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14570+
14571+ mark_jnode_queued(fq, node);
14572+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14573+
14574+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14575+ FQ_LIST, 1));
14576+}
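/*
 * Illustration, not part of the patch: the locking protocol demanded by
 * the asserts above. A caller locks the jnode, obtains the locked atom
 * via jnode_get_atom() (see the comment at the top of this file), and
 * only then enqueues. The wrapper name is hypothetical; it assumes
 * @node is captured, so its atom is non-NULL.
 */
static void example_enqueue(flush_queue_t *fq, jnode *node)
{
	txn_atom *atom;

	spin_lock_jnode(node);
	atom = jnode_get_atom(node);	/* returns with atom->alock held */
	queue_jnode(fq, node);
	spin_unlock_atom(atom);
	spin_unlock_jnode(node);
}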
14577+
14578+/* repeatable process for waiting for i/o completion on a flush queue object */
14579+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14580+{
14581+ assert("zam-738", fq->atom != NULL);
14582+ assert_spin_locked(&(fq->atom->alock));
14583+ assert("zam-736", fq_in_use(fq));
14584+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14585+
14586+ if (atomic_read(&fq->nr_submitted) != 0) {
14587+ struct super_block *super;
14588+
14589+ spin_unlock_atom(fq->atom);
14590+
14591+ assert("nikita-3013", reiser4_schedulable());
14592+
14593+ super = reiser4_get_current_sb();
14594+
14595+ /* FIXME: this is instead of blk_run_queues() */
14596+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14597+
14598+ if (!(super->s_flags & MS_RDONLY))
14599+ wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14600+
14601+ /* Ask the caller to re-acquire the locks and call this
14602+ function again. Note: this technique is commonly used in
14603+ the txnmgr code. */
14604+ return -E_REPEAT;
14605+ }
14606+
14607+ *nr_io_errors += atomic_read(&fq->nr_errors);
14608+ return 0;
14609+}
14610+
14611+/* wait on I/O completion, re-submit dirty nodes to write */
14612+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14613+{
14614+ int ret;
14615+ txn_atom *atom = fq->atom;
14616+
14617+ assert("zam-801", atom != NULL);
14618+ assert_spin_locked(&(atom->alock));
14619+ assert("zam-762", fq_in_use(fq));
14620+
14621+ ret = wait_io(fq, nr_io_errors);
14622+ if (ret)
14623+ return ret;
14624+
14625+ detach_fq(fq);
14626+ done_fq(fq);
14627+
14628+ reiser4_atom_send_event(atom);
14629+
14630+ return 0;
14631+}
14632+
14633+/* wait for all i/o for given atom to be completed; actually do one iteration
14634+ of that and return -E_REPEAT if more iterations are needed */
14635+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14636+{
14637+ flush_queue_t *fq;
14638+
14639+ assert_spin_locked(&(atom->alock));
14640+
14641+ if (list_empty_careful(&atom->flush_queues))
14642+ return 0;
14643+
14644+ list_for_each_entry(fq, &atom->flush_queues, alink) {
14645+ if (fq_ready(fq)) {
14646+ int ret;
14647+
14648+ mark_fq_in_use(fq);
14649+ assert("vs-1247", fq->owner == NULL);
14650+ ON_DEBUG(fq->owner = current);
14651+ ret = finish_fq(fq, nr_io_errors);
14652+
14653+ if (*nr_io_errors)
14654+ reiser4_handle_error();
14655+
14656+ if (ret) {
14657+ reiser4_fq_put(fq);
14658+ return ret;
14659+ }
14660+
14661+ spin_unlock_atom(atom);
14662+
14663+ return -E_REPEAT;
14664+ }
14665+ }
14666+
14667+ /* All flush queues are in use; atom remains locked */
14668+ return -EBUSY;
14669+}
14670+
14671+/* wait for all i/o for the current atom */
14672+int current_atom_finish_all_fq(void)
14673+{
14674+ txn_atom *atom;
14675+ int nr_io_errors = 0;
14676+ int ret = 0;
14677+
14678+ do {
14679+ while (1) {
14680+ atom = get_current_atom_locked();
14681+ ret = finish_all_fq(atom, &nr_io_errors);
14682+ if (ret != -EBUSY)
14683+ break;
14684+ reiser4_atom_wait_event(atom);
14685+ }
14686+ } while (ret == -E_REPEAT);
14687+
14688+	/* we do not need the atom locked after this function finishes; SUCCESS
14689+	   and -EBUSY are the two return codes for which the atom remains locked
14690+	   after finish_all_fq */
14691+ if (!ret)
14692+ spin_unlock_atom(atom);
14693+
14694+ assert_spin_not_locked(&(atom->alock));
14695+
14696+ if (ret)
14697+ return ret;
14698+
14699+ if (nr_io_errors)
14700+ return RETERR(-EIO);
14701+
14702+ return 0;
14703+}
14704+
14705+/* change the node->atom field for all jnodes on the given list */
14706+static void
14707+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14708+{
14709+ jnode *cur;
14710+
14711+ list_for_each_entry(cur, list, capture_link) {
14712+ spin_lock_jnode(cur);
14713+ cur->atom = atom;
14714+ spin_unlock_jnode(cur);
14715+ }
14716+}
14717+
14718+/* support for atom fusion operation */
14719+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14720+{
14721+ flush_queue_t *fq;
14722+
14723+ assert_spin_locked(&(to->alock));
14724+ assert_spin_locked(&(from->alock));
14725+
14726+ list_for_each_entry(fq, &from->flush_queues, alink) {
14727+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14728+ spin_lock(&(fq->guard));
14729+ fq->atom = to;
14730+ spin_unlock(&(fq->guard));
14731+ }
14732+
14733+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
14734+
14735+#if REISER4_DEBUG
14736+ to->num_queued += from->num_queued;
14737+ to->nr_flush_queues += from->nr_flush_queues;
14738+ from->nr_flush_queues = 0;
14739+#endif
14740+}
14741+
14742+#if REISER4_DEBUG
14743+int atom_fq_parts_are_clean(txn_atom * atom)
14744+{
14745+ assert("zam-915", atom != NULL);
14746+ return list_empty_careful(&atom->flush_queues);
14747+}
14748+#endif
14749+/* Bio i/o completion routine for reiser4 write operations. */
14750+static int
14751+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
14752+ int err)
14753+{
14754+ int i;
14755+ int nr_errors = 0;
14756+ flush_queue_t *fq;
14757+
14758+ assert("zam-958", bio->bi_rw & WRITE);
14759+
14760+ /* i/o op. is not fully completed */
14761+ if (bio->bi_size != 0)
14762+ return 1;
14763+
14764+ if (err == -EOPNOTSUPP)
14765+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14766+
14767+	/* we expect that bio->bi_private is set to NULL or to an fq object which
14768+	 * is used for synchronization and error counting. */
14769+ fq = bio->bi_private;
14770+ /* Check all elements of io_vec for correct write completion. */
14771+ for (i = 0; i < bio->bi_vcnt; i += 1) {
14772+ struct page *pg = bio->bi_io_vec[i].bv_page;
14773+
14774+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14775+ SetPageError(pg);
14776+ nr_errors++;
14777+ }
14778+
14779+ {
14780+ /* jnode WRITEBACK ("write is in progress bit") is
14781+ * atomically cleared here. */
14782+ jnode *node;
14783+
14784+ assert("zam-736", pg != NULL);
14785+ assert("zam-736", PagePrivate(pg));
14786+ node = jprivate(pg);
14787+
14788+ JF_CLR(node, JNODE_WRITEBACK);
14789+ }
14790+
14791+ end_page_writeback(pg);
14792+ page_cache_release(pg);
14793+ }
14794+
14795+ if (fq) {
14796+ /* count i/o error in fq object */
14797+ atomic_add(nr_errors, &fq->nr_errors);
14798+
14799+ /* If all write requests registered in this "fq" are done we up
14800+ * the waiter. */
14801+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14802+ wake_up(&fq->wait);
14803+ }
14804+
14805+ bio_put(bio);
14806+ return 0;
14807+}
14808+
14809+/* Count I/O requests which will be submitted by @bio in the given flush
14810+   queue @fq */
14811+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14812+{
14813+ bio->bi_private = fq;
14814+ bio->bi_end_io = end_io_handler;
14815+
14816+ if (fq)
14817+ atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14818+}
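/*
 * Sketch, not part of the patch: how submission ties the pieces
 * together. add_fq_to_bio() installs end_io_handler() and charges
 * bi_vcnt pages to fq->nr_submitted; submit_bio() is the stock 2.6
 * block-layer entry point. Construction of the bio itself is elided.
 */
static void example_submit_fq_bio(flush_queue_t *fq, struct bio *bio)
{
	add_fq_to_bio(fq, bio);
	submit_bio(WRITE, bio);	/* end_io_handler() runs on completion */
}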
14819+
14820+/* Move all queued nodes out from @fq->prepped list. */
14821+static void release_prepped_list(flush_queue_t * fq)
14822+{
14823+ txn_atom *atom;
14824+
14825+ assert("zam-904", fq_in_use(fq));
14826+ atom = atom_locked_by_fq(fq);
14827+
14828+ while (!list_empty(ATOM_FQ_LIST(fq))) {
14829+ jnode *cur;
14830+
14831+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14832+ list_del_init(&cur->capture_link);
14833+
14834+ count_dequeued_node(fq);
14835+ spin_lock_jnode(cur);
14836+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14837+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14838+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14839+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
14840+
14841+ if (JF_ISSET(cur, JNODE_DIRTY)) {
14842+ list_add_tail(&cur->capture_link,
14843+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14844+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14845+ DIRTY_LIST, 1));
14846+ } else {
14847+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14848+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14849+ CLEAN_LIST, 1));
14850+ }
14851+
14852+ spin_unlock_jnode(cur);
14853+ }
14854+
14855+ if (--atom->nr_running_queues == 0)
14856+ reiser4_atom_send_event(atom);
14857+
14858+ spin_unlock_atom(atom);
14859+}
14860+
14861+/* Submit write requests for nodes on the already filled flush queue @fq.
14862+
14863+ @fq: flush queue object which contains jnodes we can (and will) write.
14864+ @return: number of submitted blocks (>=0) on success, otherwise an error
14865+ code (<0). */
14866+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14867+{
14868+ int ret;
14869+ txn_atom *atom;
14870+
14871+ while (1) {
14872+ atom = atom_locked_by_fq(fq);
14873+ assert("zam-924", atom);
14874+ /* do not write fq in parallel. */
14875+ if (atom->nr_running_queues == 0
14876+ || !(flags & WRITEOUT_SINGLE_STREAM))
14877+ break;
14878+ reiser4_atom_wait_event(atom);
14879+ }
14880+
14881+ atom->nr_running_queues++;
14882+ spin_unlock_atom(atom);
14883+
14884+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14885+ release_prepped_list(fq);
14886+
14887+ return ret;
14888+}
14889+
14890+/* Get a flush queue object for exclusive use by one thread. May require
14891+ several iterations, which is indicated by the -E_REPEAT return code.
14892+
14893+ This function does not contain code for obtaining an atom lock because an
14894+ atom lock is obtained in different ways in different parts of reiser4;
14895+ usually it is the current atom, but we also need the ability to get an fq
14896+ for the atom of a given jnode. */
14897+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14898+{
14899+ flush_queue_t *fq;
14900+
14901+ assert_spin_locked(&(atom->alock));
14902+
14903+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14904+ while (&atom->flush_queues != &fq->alink) {
14905+ spin_lock(&(fq->guard));
14906+
14907+ if (fq_ready(fq)) {
14908+ mark_fq_in_use(fq);
14909+ assert("vs-1246", fq->owner == NULL);
14910+ ON_DEBUG(fq->owner = current);
14911+ spin_unlock(&(fq->guard));
14912+
14913+ if (*new_fq)
14914+ done_fq(*new_fq);
14915+
14916+ *new_fq = fq;
14917+
14918+ return 0;
14919+ }
14920+
14921+ spin_unlock(&(fq->guard));
14922+
14923+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
14924+ }
14925+
14926+ /* Use previously allocated fq object */
14927+ if (*new_fq) {
14928+ mark_fq_in_use(*new_fq);
14929+ assert("vs-1248", (*new_fq)->owner == 0);
14930+ ON_DEBUG((*new_fq)->owner = current);
14931+ attach_fq(atom, *new_fq);
14932+
14933+ return 0;
14934+ }
14935+
14936+ spin_unlock_atom(atom);
14937+
14938+ *new_fq = create_fq(gfp);
14939+
14940+ if (*new_fq == NULL)
14941+ return RETERR(-ENOMEM);
14942+
14943+ return RETERR(-E_REPEAT);
14944+}
14945+
14946+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
14947+{
14948+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
14949+}
14950+
14951+/* A wrapper around reiser4_fq_by_atom for getting a flush queue
14952+   object for the current atom; on success fq->atom remains locked. */
14953+flush_queue_t *get_fq_for_current_atom(void)
14954+{
14955+ flush_queue_t *fq = NULL;
14956+ txn_atom *atom;
14957+ int ret;
14958+
14959+ do {
14960+ atom = get_current_atom_locked();
14961+ ret = reiser4_fq_by_atom(atom, &fq);
14962+ } while (ret == -E_REPEAT);
14963+
14964+ if (ret)
14965+ return ERR_PTR(ret);
14966+ return fq;
14967+}
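/*
 * Usage sketch, not part of the patch: exclusive use of a flush queue
 * for the current atom. get_fq_for_current_atom() returns with
 * fq->atom spin-locked, so this (hypothetical) caller drops that lock
 * before reiser4_write_fq(), which re-takes it internally and empties
 * the prepped list, making the final reiser4_fq_put() legal.
 */
static int example_write_current_atom(void)
{
	flush_queue_t *fq;
	long nr_submitted = 0;
	int ret;

	fq = get_fq_for_current_atom();
	if (IS_ERR(fq))
		return PTR_ERR(fq);
	spin_unlock_atom(fq->atom);
	ret = reiser4_write_fq(fq, &nr_submitted, WRITEOUT_SINGLE_STREAM);
	reiser4_fq_put(fq);
	return ret;
}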
14968+
14969+/* Releasing flush queue object after exclusive use */
14970+void reiser4_fq_put_nolock(flush_queue_t *fq)
14971+{
14972+ assert("zam-747", fq->atom != NULL);
14973+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
14974+ mark_fq_ready(fq);
14975+ assert("vs-1245", fq->owner == current);
14976+ ON_DEBUG(fq->owner = NULL);
14977+}
14978+
14979+void reiser4_fq_put(flush_queue_t * fq)
14980+{
14981+ txn_atom *atom;
14982+
14983+ spin_lock(&(fq->guard));
14984+ atom = atom_locked_by_fq_nolock(fq);
14985+
14986+ assert("zam-746", atom != NULL);
14987+
14988+ reiser4_fq_put_nolock(fq);
14989+ reiser4_atom_send_event(atom);
14990+
14991+ spin_unlock(&(fq->guard));
14992+ spin_unlock_atom(atom);
14993+}
14994+
14995+/* A part of atom object initialization related to the embedded flush queue
14996+ list head */
14997+
14998+void init_atom_fq_parts(txn_atom *atom)
14999+{
15000+ INIT_LIST_HEAD(&atom->flush_queues);
15001+}
15002+
15003+#if REISER4_DEBUG
15004+
15005+void reiser4_check_fq(const txn_atom *atom)
15006+{
15007+ /* check number of nodes on all atom's flush queues */
15008+ flush_queue_t *fq;
15009+ int count;
15010+ struct list_head *pos;
15011+
15012+ count = 0;
15013+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15014+ spin_lock(&(fq->guard));
15015+		/* calculate number of jnodes on fq's list of prepped jnodes */
15016+ list_for_each(pos, ATOM_FQ_LIST(fq))
15017+ count++;
15018+ spin_unlock(&(fq->guard));
15019+ }
15020+ if (count != atom->fq)
15021+ warning("", "fq counter %d, real %d\n", atom->fq, count);
15022+}
15024+
15025+#endif
15026+
15027+/*
15028+ * Local variables:
15029+ * c-indentation-style: "K&R"
15030+ * mode-name: "LC"
15031+ * c-basic-offset: 8
15032+ * tab-width: 8
15033+ * fill-column: 79
15034+ * scroll-step: 1
15035+ * End:
15036+ */
15037diff -urN linux-2.6.20.orig/fs/reiser4/forward.h linux-2.6.20/fs/reiser4/forward.h
15038--- linux-2.6.20.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15039+++ linux-2.6.20/fs/reiser4/forward.h 2007-05-06 14:50:43.718981974 +0400
15040@@ -0,0 +1,256 @@
15041+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15042+
15043+/* Forward declarations. Thank you Kernighan. */
15044+
15045+#if !defined( __REISER4_FORWARD_H__ )
15046+#define __REISER4_FORWARD_H__
15047+
15048+#include <asm/errno.h>
15049+#include <linux/types.h>
15050+
15051+typedef struct zlock zlock;
15052+typedef struct lock_stack lock_stack;
15053+typedef struct lock_handle lock_handle;
15054+typedef struct znode znode;
15055+typedef struct flow flow_t;
15056+typedef struct coord coord_t;
15057+typedef struct tree_access_pointer tap_t;
15058+typedef struct item_coord item_coord;
15059+typedef struct shift_params shift_params;
15060+typedef struct reiser4_object_create_data reiser4_object_create_data;
15061+typedef union reiser4_plugin reiser4_plugin;
15062+typedef __u16 reiser4_plugin_id;
15063+typedef __u64 reiser4_plugin_groups;
15064+typedef struct item_plugin item_plugin;
15065+typedef struct jnode_plugin jnode_plugin;
15066+typedef struct reiser4_item_data reiser4_item_data;
15067+typedef union reiser4_key reiser4_key;
15068+typedef struct reiser4_tree reiser4_tree;
15069+typedef struct carry_cut_data carry_cut_data;
15070+typedef struct carry_kill_data carry_kill_data;
15071+typedef struct carry_tree_op carry_tree_op;
15072+typedef struct carry_tree_node carry_tree_node;
15073+typedef struct carry_plugin_info carry_plugin_info;
15074+typedef struct reiser4_journal reiser4_journal;
15075+typedef struct txn_atom txn_atom;
15076+typedef struct txn_handle txn_handle;
15077+typedef struct txn_mgr txn_mgr;
15078+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15079+typedef struct reiser4_context reiser4_context;
15080+typedef struct carry_level carry_level;
15081+typedef struct blocknr_set_entry blocknr_set_entry;
15082+/* super_block->s_fs_info points to this */
15083+typedef struct reiser4_super_info_data reiser4_super_info_data;
15084+/* next two objects are fields of reiser4_super_info_data */
15085+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15086+typedef struct reiser4_space_allocator reiser4_space_allocator;
15087+
15088+typedef struct flush_scan flush_scan;
15089+typedef struct flush_position flush_pos_t;
15090+
15091+typedef unsigned short pos_in_node_t;
15092+#define MAX_POS_IN_NODE 65535
15093+
15094+typedef struct jnode jnode;
15095+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15096+
15097+typedef struct uf_coord uf_coord_t;
15098+typedef struct hint hint_t;
15099+
15100+typedef struct ktxnmgrd_context ktxnmgrd_context;
15101+
15102+typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15103+
15104+struct inode;
15105+struct page;
15106+struct file;
15107+struct dentry;
15108+struct super_block;
15109+
15110+/* return values of coord_by_key(). cbk == coord_by_key */
15111+typedef enum {
15112+ CBK_COORD_FOUND = 0,
15113+ CBK_COORD_NOTFOUND = -ENOENT,
15114+} lookup_result;
15115+
15116+/* results of lookup with directory file */
15117+typedef enum {
15118+ FILE_NAME_FOUND = 0,
15119+ FILE_NAME_NOTFOUND = -ENOENT,
15120+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15121+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15122+} file_lookup_result;
15123+
15124+/* behaviors of lookup. If the coord we are looking for is actually in the
15125+   tree, both biases coincide. */
15126+typedef enum {
15127+ /* search exactly for the coord with key given */
15128+ FIND_EXACT,
15129+ /* search for coord with the maximal key not greater than one
15130+ given */
15131+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15132+} lookup_bias;
15133+
15134+typedef enum {
15135+	/* number of the leaf level of the tree.
15136+	   The fake root has (tree_level=0). */
15137+ LEAF_LEVEL = 1,
15138+
15139+	/* number of the level one above the leaf level of the tree.
15140+
15141+ It is supposed that internal tree used by reiser4 to store file
15142+ system data and meta data will have height 2 initially (when
15143+ created by mkfs).
15144+ */
15145+ TWIG_LEVEL = 2,
15146+} tree_level;
15147+
15148+/* The "real" maximum ztree height is the 0-origin size of any per-level
15149+ array, since the zero'th level is not used. */
15150+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15151+
15152+/* enumeration of possible mutual position of item and coord. This enum is
15153+ return type of ->is_in_item() item plugin method which see. */
15154+typedef enum {
15155+ /* coord is on the left of an item */
15156+ IP_ON_THE_LEFT,
15157+ /* coord is inside item */
15158+ IP_INSIDE,
15159+ /* coord is inside item, but to the right of the rightmost unit of
15160+ this item */
15161+ IP_RIGHT_EDGE,
15162+ /* coord is on the right of an item */
15163+ IP_ON_THE_RIGHT
15164+} interposition;
15165+
15166+/* type of lock to acquire on znode before returning it to caller */
15167+typedef enum {
15168+ ZNODE_NO_LOCK = 0,
15169+ ZNODE_READ_LOCK = 1,
15170+ ZNODE_WRITE_LOCK = 2,
15171+} znode_lock_mode;
15172+
15173+/* type of lock request */
15174+typedef enum {
15175+ ZNODE_LOCK_LOPRI = 0,
15176+ ZNODE_LOCK_HIPRI = (1 << 0),
15177+
15178+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15179+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15180+ return the value -E_REPEAT. */
15181+ ZNODE_LOCK_NONBLOCK = (1 << 1),
15182+ /* An option for longterm_lock_znode which prevents atom fusion */
15183+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
15184+} znode_lock_request;
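/*
 * Illustration, not part of the patch: a non-blocking lock attempt
 * using the flags above. longterm_lock_znode() is the primitive the
 * comments refer to; it is declared elsewhere, and the exact call
 * shown here is an assumption of this sketch.
 *
 *	ret = longterm_lock_znode(lh, node, ZNODE_WRITE_LOCK,
 *				  ZNODE_LOCK_HIPRI | ZNODE_LOCK_NONBLOCK);
 *	if (ret == -E_REPEAT) {
 *		// lock unavailable right now, not a hard failure;
 *		// release own locks and retry in blocking mode
 *	}
 */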
15185+
15186+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15187+
15188+/* used to specify direction of shift. These must be -1 and 1 */
15189+typedef enum {
15190+ SHIFT_LEFT = 1,
15191+ SHIFT_RIGHT = -1
15192+} shift_direction;
15193+
15194+typedef enum {
15195+ LEFT_SIDE,
15196+ RIGHT_SIDE
15197+} sideof;
15198+
15199+#define round_up( value, order ) \
15200+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15201+ ~( ( order ) - 1 ) ) )
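/* Example: with a power-of-two order the macro rounds up to the next
   multiple, e.g. round_up(10, 8) == 16, round_up(16, 8) == 16 and
   round_up(0, 8) == 0. A non-power-of-two order would give wrong
   results, since the mask trick relies on order being 2^k. */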
15202+
15203+/* values returned by squalloc_right_neighbor and its auxiliary functions */
15204+typedef enum {
15205+ /* unit of internal item is moved */
15206+ SUBTREE_MOVED = 0,
15207+ /* nothing else can be squeezed into left neighbor */
15208+ SQUEEZE_TARGET_FULL = 1,
15209+ /* all content of node is squeezed into its left neighbor */
15210+ SQUEEZE_SOURCE_EMPTY = 2,
15211+	/* one more item is copied (this is only returned by
15212+	   allocate_and_copy_extent to squalloc_twig) */
15213+ SQUEEZE_CONTINUE = 3
15214+} squeeze_result;
15215+
15216+/* Do not change items ids. If you do - there will be format change */
15217+typedef enum {
15218+ STATIC_STAT_DATA_ID = 0x0,
15219+ SIMPLE_DIR_ENTRY_ID = 0x1,
15220+ COMPOUND_DIR_ID = 0x2,
15221+ NODE_POINTER_ID = 0x3,
15222+ EXTENT_POINTER_ID = 0x5,
15223+ FORMATTING_ID = 0x6,
15224+ CTAIL_ID = 0x7,
15225+ BLACK_BOX_ID = 0x8,
15226+ LAST_ITEM_ID = 0x9
15227+} item_id;
15228+
15229+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15230+ whether commit() was called or VM memory pressure was applied. */
15231+typedef enum {
15232+ /* submit flush queue to disk at jnode_flush completion */
15233+ JNODE_FLUSH_WRITE_BLOCKS = 1,
15234+
15235+ /* flush is called for commit */
15236+ JNODE_FLUSH_COMMIT = 2,
15237+ /* not implemented */
15238+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
15239+
15240+ /* not implemented */
15241+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15242+} jnode_flush_flags;
15243+
15244+/* Flags to insert/paste carry operations. Currently they only used in
15245+ flushing code, but in future, they can be used to optimize for repetitive
15246+ accesses. */
15247+typedef enum {
15248+ /* carry is not allowed to shift data to the left when trying to find
15249+ free space */
15250+ COPI_DONT_SHIFT_LEFT = (1 << 0),
15251+ /* carry is not allowed to shift data to the right when trying to find
15252+ free space */
15253+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
15254+ /* carry is not allowed to allocate new node(s) when trying to find
15255+ free space */
15256+ COPI_DONT_ALLOCATE = (1 << 2),
15257+	/* try to load left neighbor if it is not in the cache */
15258+	COPI_LOAD_LEFT = (1 << 3),
15259+	/* try to load right neighbor if it is not in the cache */
15260+ COPI_LOAD_RIGHT = (1 << 4),
15261+ /* shift insertion point to the left neighbor */
15262+ COPI_GO_LEFT = (1 << 5),
15263+ /* shift insertion point to the right neighbor */
15264+ COPI_GO_RIGHT = (1 << 6),
15265+ /* try to step back into original node if insertion into new node
15266+ fails after shifting data there. */
15267+ COPI_STEP_BACK = (1 << 7)
15268+} cop_insert_flag;
15269+
15270+typedef enum {
15271+ SAFE_UNLINK, /* safe-link for unlink */
15272+ SAFE_TRUNCATE /* safe-link for truncate */
15273+} reiser4_safe_link_t;
15274+
15275+/* this is to show on which list of atom jnode is */
15276+typedef enum {
15277+ NOT_CAPTURED,
15278+ DIRTY_LIST,
15279+ CLEAN_LIST,
15280+ FQ_LIST,
15281+ WB_LIST,
15282+ OVRWR_LIST
15283+} atom_list;
15284+
15285+/* __REISER4_FORWARD_H__ */
15286+#endif
15287+
15288+/* Make Linus happy.
15289+ Local variables:
15290+ c-indentation-style: "K&R"
15291+ mode-name: "LC"
15292+ c-basic-offset: 8
15293+ tab-width: 8
15294+ fill-column: 120
15295+ End:
15296+*/
15297diff -urN linux-2.6.20.orig/fs/reiser4/fsdata.c linux-2.6.20/fs/reiser4/fsdata.c
15298--- linux-2.6.20.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15299+++ linux-2.6.20/fs/reiser4/fsdata.c 2007-05-06 14:50:43.722983224 +0400
15300@@ -0,0 +1,803 @@
15301+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15302+ * reiser4/README */
15303+
15304+#include "fsdata.h"
15305+#include "inode.h"
15306+
15307+/* cache of dir_cursors */
15308+static struct kmem_cache *d_cursor_cache;
15309+static struct shrinker *d_cursor_shrinker;
15310+
15311+/* list of unused cursors */
15312+static LIST_HEAD(cursor_cache);
15313+
15314+/* number of cursors in list of unused cursors */
15315+static unsigned long d_cursor_unused = 0;
15316+
15317+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15318+DEFINE_SPINLOCK(d_lock);
15319+
15320+static reiser4_file_fsdata *create_fsdata(struct file *file);
15321+static int file_is_stateless(struct file *file);
15322+static void free_fsdata(reiser4_file_fsdata *fsdata);
15323+static void kill_cursor(dir_cursor *);
15324+
15325+/**
15326+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15327+ * @nr: number of objects to free
15328+ * @mask: GFP mask
15329+ *
15330+ * Shrinks d_cursor_cache. Scans the LRU list of unused cursors, freeing the
15331+ * requested number. Returns the number of cursors that are still freeable.
15332+ */
15333+static int d_cursor_shrink(int nr, gfp_t mask)
15334+{
15335+ if (nr != 0) {
15336+ dir_cursor *scan;
15337+ int killed;
15338+
15339+ killed = 0;
15340+ spin_lock(&d_lock);
15341+ while (!list_empty(&cursor_cache)) {
15342+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
15343+ assert("nikita-3567", scan->ref == 0);
15344+ kill_cursor(scan);
15345+ ++killed;
15346+ --nr;
15347+ if (nr == 0)
15348+ break;
15349+ }
15350+ spin_unlock(&d_lock);
15351+ }
15352+ return d_cursor_unused;
15353+}
15354+
15355+/**
15356+ * reiser4_init_d_cursor - create d_cursor cache
15357+ *
15358+ * Initializes slab cache of d_cursors. It is part of reiser4 module
15359+ * initialization.
15360+ */
15361+int reiser4_init_d_cursor(void)
15362+{
15363+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15364+ SLAB_HWCACHE_ALIGN, NULL, NULL);
15365+ if (d_cursor_cache == NULL)
15366+ return RETERR(-ENOMEM);
15367+
15368+ /*
15369+ * actually, d_cursors are "priceless", because there is no way to
15370+ * recover information stored in them. On the other hand, we don't
15371+ * want to consume all kernel memory by them. As a compromise, just
15372+ * assign higher "seeks" value to d_cursor cache, so that it will be
15373+ * shrunk only if system is really tight on memory.
15374+ */
15375+ d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15376+ d_cursor_shrink);
15377+ if (d_cursor_shrinker == NULL) {
15378+ destroy_reiser4_cache(&d_cursor_cache);
15379+ d_cursor_cache = NULL;
15380+ return RETERR(-ENOMEM);
15381+ }
15382+ return 0;
15383+}
15384+
15385+/**
15386+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15387+ *
15388+ * This is called on reiser4 module unloading or system shutdown.
15389+ */
15390+void reiser4_done_d_cursor(void)
15391+{
15392+ BUG_ON(d_cursor_shrinker == NULL);
15393+ remove_shrinker(d_cursor_shrinker);
15394+ d_cursor_shrinker = NULL;
15395+
15396+ destroy_reiser4_cache(&d_cursor_cache);
15397+}
15398+
15399+#define D_CURSOR_TABLE_SIZE (256)
15400+
15401+static inline unsigned long
15402+d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15403+{
15404+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15405+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15406+}
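/*
 * Worked example (illustrative values): with D_CURSOR_TABLE_SIZE == 256,
 * a cursor whose key has oid == 0x1234 and cid == 7 lands in bucket
 * (0x1234 + 7) & 0xff == 0x3b.
 */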
15407+
15408+static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15409+{
15410+ return k1->cid == k2->cid && k1->oid == k2->oid;
15411+}
15412+
15413+/*
15414+ * define functions to manipulate reiser4 super block's hash table of
15415+ * dir_cursors
15416+ */
15417+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15418+#define KFREE(ptr, size) kfree(ptr)
15419+TYPE_SAFE_HASH_DEFINE(d_cursor,
15420+ dir_cursor,
15421+ d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15422+#undef KFREE
15423+#undef KMALLOC
15424+
15425+/**
15426+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15427+ * @super: super block to initialize
15428+ *
15429+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15430+ * of mount.
15431+ */
15432+int reiser4_init_super_d_info(struct super_block *super)
15433+{
15434+ d_cursor_info *p;
15435+
15436+ p = &get_super_private(super)->d_info;
15437+
15438+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15439+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15440+}
15441+
15442+/**
15443+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
15444+ * @super: super block being umounted
15445+ *
15446+ * It is called on umount. Kills all directory cursors attached to super block.
15447+ */
15448+void reiser4_done_super_d_info(struct super_block *super)
15449+{
15450+ d_cursor_info *d_info;
15451+ dir_cursor *cursor, *next;
15452+
15453+ d_info = &get_super_private(super)->d_info;
15454+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15455+ kill_cursor(cursor);
15456+
15457+ BUG_ON(d_info->tree.rnode != NULL);
15458+ d_cursor_hash_done(&d_info->table);
15459+}
15460+
15461+/**
15462+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15463+ * @cursor: cursor to free
15464+ *
15465+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15466+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
15467+ * indices, hash table, list of unused cursors and frees it.
15468+ */
15469+static void kill_cursor(dir_cursor *cursor)
15470+{
15471+ unsigned long index;
15472+
15473+ assert("nikita-3566", cursor->ref == 0);
15474+ assert("nikita-3572", cursor->fsdata != NULL);
15475+
15476+ index = (unsigned long)cursor->key.oid;
15477+ list_del_init(&cursor->fsdata->dir.linkage);
15478+ free_fsdata(cursor->fsdata);
15479+ cursor->fsdata = NULL;
15480+
15481+ if (list_empty_careful(&cursor->list))
15482+ /* this is last cursor for a file. Kill radix-tree entry */
15483+ radix_tree_delete(&cursor->info->tree, index);
15484+ else {
15485+ void **slot;
15486+
15487+ /*
15488+ * there are other cursors for the same oid.
15489+ */
15490+
15491+ /*
15492+ * if radix tree point to the cursor being removed, re-target
15493+ * radix tree slot to the next cursor in the (non-empty as was
15494+ * checked above) element of the circular list of all cursors
15495+ * for this oid.
15496+ */
15497+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15498+ assert("nikita-3571", *slot != NULL);
15499+ if (*slot == cursor)
15500+ *slot = list_entry(cursor->list.next, dir_cursor, list);
15501+ /* remove cursor from circular list */
15502+ list_del_init(&cursor->list);
15503+ }
15504+ /* remove cursor from the list of unused cursors */
15505+ list_del_init(&cursor->alist);
15506+ /* remove cursor from the hash table */
15507+ d_cursor_hash_remove(&cursor->info->table, cursor);
15508+ /* and free it */
15509+ kmem_cache_free(d_cursor_cache, cursor);
15510+ --d_cursor_unused;
15511+}
15512+
15513+/* possible actions that can be performed on all cursors for the given file */
15514+enum cursor_action {
15515+ /*
15516+ * load all detached state: this is called when stat-data is loaded
15517+ * from the disk to recover information about all pending readdirs
15518+ */
15519+ CURSOR_LOAD,
15520+ /*
15521+ * detach all state from inode, leaving it in the cache. This is called
15522+ * when inode is removed form the memory by memory pressure
15523+ */
15524+ CURSOR_DISPOSE,
15525+ /*
15526+ * detach cursors from the inode, and free them. This is called when
15527+ * inode is destroyed
15528+ */
15529+ CURSOR_KILL
15530+};
15531+
15532+/*
15533+ * return d_cursor data for the file system @inode is in.
15534+ */
15535+static inline d_cursor_info *d_info(struct inode *inode)
15536+{
15537+ return &get_super_private(inode->i_sb)->d_info;
15538+}
15539+
15540+/*
15541+ * lookup d_cursor in the per-super-block radix tree.
15542+ */
15543+static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
15544+{
15545+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15546+}
15547+
15548+/*
15549+ * attach @cursor to the radix tree. There may be multiple cursors for the
15550+ * same oid; they are chained into a circular list.
15551+ */
15552+static void bind_cursor(dir_cursor * cursor, unsigned long index)
15553+{
15554+ dir_cursor *head;
15555+
15556+ head = lookup(cursor->info, index);
15557+ if (head == NULL) {
15558+ /* this is the first cursor for this index */
15559+ INIT_LIST_HEAD(&cursor->list);
15560+ radix_tree_insert(&cursor->info->tree, index, cursor);
15561+ } else {
15562+ /* some cursor already exists. Chain ours */
15563+ list_add(&cursor->list, &head->list);
15564+ }
15565+}
15566+
15567+/*
15568+ * detach fsdata (if detachable) from file descriptor, and put cursor on the
15569+ * "unused" list. Called when file descriptor is not longer in active use.
15570+ */
15571+static void clean_fsdata(struct file *file)
15572+{
15573+ dir_cursor *cursor;
15574+ reiser4_file_fsdata *fsdata;
15575+
15576+ assert("nikita-3570", file_is_stateless(file));
15577+
15578+ fsdata = (reiser4_file_fsdata *) file->private_data;
15579+ if (fsdata != NULL) {
15580+ cursor = fsdata->cursor;
15581+ if (cursor != NULL) {
15582+ spin_lock(&d_lock);
15583+ --cursor->ref;
15584+ if (cursor->ref == 0) {
15585+ list_add_tail(&cursor->alist, &cursor_cache);
15586+ ++d_cursor_unused;
15587+ }
15588+ spin_unlock(&d_lock);
15589+ file->private_data = NULL;
15590+ }
15591+ }
15592+}
15593+
15594+/*
15595+ * global counter used to generate "client ids". These ids are encoded into
15596+ * high bits of fpos.
15597+ */
15598+static __u32 cid_counter = 0;
15599+#define CID_SHIFT (20)
15600+#define CID_MASK (0xfffffull)
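/*
 * Worked example (illustrative values): for cid == 3 and directory
 * offset 5 the cookie is (3 << CID_SHIFT) | 5 == 0x300005. Masking
 * with CID_MASK recovers the offset 5; shifting right by CID_SHIFT
 * recovers the client id 3, as reiser4_attach_fsdata() does below.
 */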
15601+
15602+static void free_file_fsdata_nolock(struct file *);
15603+
15604+/**
15605+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15606+ * @cursor: preallocated cursor to fill in and insert
15607+ * @file: directory file descriptor the cursor will serve
15608+ * @inode: inode of the directory being read
15609+ *
15610+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15611+ * reiser4 super block's hash table and radix tree, and installs the
15612+ * resulting detachable readdir state as @file's private_data.
15614+ */
15615+static int insert_cursor(dir_cursor *cursor, struct file *file,
15616+ struct inode *inode)
15617+{
15618+ int result;
15619+ reiser4_file_fsdata *fsdata;
15620+
15621+ memset(cursor, 0, sizeof *cursor);
15622+
15623+ /* this is either first call to readdir, or rewind. Anyway, create new
15624+ * cursor. */
15625+ fsdata = create_fsdata(NULL);
15626+ if (fsdata != NULL) {
15627+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15628+ if (result == 0) {
15629+ d_cursor_info *info;
15630+ oid_t oid;
15631+
15632+ info = d_info(inode);
15633+ oid = get_inode_oid(inode);
15634+ /* cid occupies higher 12 bits of f->f_pos. Don't
15635+ * allow it to become negative: this confuses
15636+ * nfsd_readdir() */
15637+ cursor->key.cid = (++cid_counter) & 0x7ff;
15638+ cursor->key.oid = oid;
15639+ cursor->fsdata = fsdata;
15640+ cursor->info = info;
15641+ cursor->ref = 1;
15642+
15643+ spin_lock_inode(inode);
15644+ /* install cursor as @f's private_data, discarding old
15645+ * one if necessary */
15646+#if REISER4_DEBUG
15647+ if (file->private_data)
15648+ warning("", "file has fsdata already");
15649+#endif
15650+ clean_fsdata(file);
15651+ free_file_fsdata_nolock(file);
15652+ file->private_data = fsdata;
15653+ fsdata->cursor = cursor;
15654+ spin_unlock_inode(inode);
15655+ spin_lock(&d_lock);
15656+ /* insert cursor into hash table */
15657+ d_cursor_hash_insert(&info->table, cursor);
15658+ /* and chain it into radix-tree */
15659+ bind_cursor(cursor, (unsigned long)oid);
15660+ spin_unlock(&d_lock);
15661+ radix_tree_preload_end();
15662+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15663+ }
15664+ } else
15665+ result = RETERR(-ENOMEM);
15666+ return result;
15667+}
15668+
15669+/**
15670+ * process_cursors - do action on each cursor attached to inode
15671+ * @inode:
15672+ * @act: action to do
15673+ *
15674+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15675+ * and performs action specified by @act on each of cursors.
15676+ */
15677+static void process_cursors(struct inode *inode, enum cursor_action act)
15678+{
15679+ oid_t oid;
15680+ dir_cursor *start;
15681+ struct list_head *head;
15682+ reiser4_context *ctx;
15683+ d_cursor_info *info;
15684+
15685+ /* this can be called by
15686+ *
15687+ * kswapd->...->prune_icache->..reiser4_destroy_inode
15688+ *
15689+ * without reiser4_context
15690+ */
15691+ ctx = reiser4_init_context(inode->i_sb);
15692+ if (IS_ERR(ctx)) {
15693+ warning("vs-23", "failed to init context");
15694+ return;
15695+ }
15696+
15697+ assert("nikita-3558", inode != NULL);
15698+
15699+ info = d_info(inode);
15700+ oid = get_inode_oid(inode);
15701+ spin_lock_inode(inode);
15702+ head = get_readdir_list(inode);
15703+ spin_lock(&d_lock);
15704+	/* find any cursor for this oid: a reference to it is hanging off the
15705+	 * radix tree */
15706+ start = lookup(info, (unsigned long)oid);
15707+ if (start != NULL) {
15708+ dir_cursor *scan;
15709+ reiser4_file_fsdata *fsdata;
15710+
15711+ /* process circular list of cursors for this oid */
15712+ scan = start;
15713+ do {
15714+ dir_cursor *next;
15715+
15716+ next = list_entry(scan->list.next, dir_cursor, list);
15717+ fsdata = scan->fsdata;
15718+ assert("nikita-3557", fsdata != NULL);
15719+ if (scan->key.oid == oid) {
15720+ switch (act) {
15721+ case CURSOR_DISPOSE:
15722+ list_del_init(&fsdata->dir.linkage);
15723+ break;
15724+ case CURSOR_LOAD:
15725+ list_add(&fsdata->dir.linkage, head);
15726+ break;
15727+ case CURSOR_KILL:
15728+ kill_cursor(scan);
15729+ break;
15730+ }
15731+ }
15732+ if (scan == next)
15733+ /* last cursor was just killed */
15734+ break;
15735+ scan = next;
15736+ } while (scan != start);
15737+ }
15738+ spin_unlock(&d_lock);
15739+ /* check that we killed 'em all */
15740+ assert("nikita-3568",
15741+ ergo(act == CURSOR_KILL,
15742+ list_empty_careful(get_readdir_list(inode))));
15743+ assert("nikita-3569",
15744+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15745+ spin_unlock_inode(inode);
15746+ reiser4_exit_context(ctx);
15747+}
15748+
15749+/**
15750+ * reiser4_dispose_cursors - removes cursors from inode's list
15751+ * @inode: inode to dispose cursors of
15752+ *
15753+ * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
15754+ * attached to cursor from inode's readdir list. This is called when inode is
15755+ * removed from the memory by memory pressure.
15756+ */
15757+void reiser4_dispose_cursors(struct inode *inode)
15758+{
15759+ process_cursors(inode, CURSOR_DISPOSE);
15760+}
15761+
15762+/**
15763+ * reiser4_load_cursors - attach cursors to inode
15764+ * @inode: inode to load cursors to
15765+ *
15766+ * For each cursor corresponding to @inode, puts the reiser4_file_fsdata
15767+ * attached to that cursor onto the inode's readdir list. This is done when
15768+ * loaded into memory.
15769+ */
15770+void reiser4_load_cursors(struct inode *inode)
15771+{
15772+ process_cursors(inode, CURSOR_LOAD);
15773+}
15774+
15775+/**
15776+ * reiser4_kill_cursors - kill all inode cursors
15777+ * @inode: inode to kill cursors of
15778+ *
15779+ * Frees all cursors for this inode. This is called when inode is destroyed.
15780+ */
15781+void reiser4_kill_cursors(struct inode *inode)
15782+{
15783+ process_cursors(inode, CURSOR_KILL);
15784+}
15785+
15786+/**
15787+ * file_is_stateless - check whether @file carries detached readdir state
15788+ * @file: file descriptor to check
15789+ *
15790+ * Returns true if file descriptor @file was created by the NFS server on
15791+ * demand to serve one file system operation. This means that there may be
15792+ * "detached state" for the underlying inode.
15793+ */
15794+static int file_is_stateless(struct file *file)
15795+{
15796+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15797+}
15798+
15799+/**
15800+ * reiser4_get_dir_fpos - get logical position within a directory
15801+ * @dir: directory file descriptor
15802+ *
15803+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
15804+ * in the case of stateless directory operation (readdir-over-nfs), the client
15805+ * id was encoded in the high bits of the cookie and should be masked off.
15806+ */
15807+loff_t reiser4_get_dir_fpos(struct file *dir)
15808+{
15809+ if (file_is_stateless(dir))
15810+ return dir->f_pos & CID_MASK;
15811+ else
15812+ return dir->f_pos;
15813+}
15814+
15815+/**
15816+ * reiser4_attach_fsdata - try to attach fsdata
15817+ * @file:
15818+ * @inode:
15819+ *
15820+ * Finds or creates cursor for readdir-over-nfs.
15821+ */
15822+int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15823+{
15824+ loff_t pos;
15825+ int result;
15826+ dir_cursor *cursor;
15827+
15828+ /*
15829+ * we are serialized by inode->i_mutex
15830+ */
15831+ if (!file_is_stateless(file))
15832+ return 0;
15833+
15834+ pos = file->f_pos;
15835+ result = 0;
15836+ if (pos == 0) {
15837+ /*
15838+ * first call to readdir (or rewind to the beginning of
15839+ * directory)
15840+ */
15841+ cursor = kmem_cache_alloc(d_cursor_cache,
15842+ reiser4_ctx_gfp_mask_get());
15843+ if (cursor != NULL)
15844+ result = insert_cursor(cursor, file, inode);
15845+ else
15846+ result = RETERR(-ENOMEM);
15847+ } else {
15848+ /* try to find existing cursor */
15849+ d_cursor_key key;
15850+
15851+ key.cid = pos >> CID_SHIFT;
15852+ key.oid = get_inode_oid(inode);
15853+ spin_lock(&d_lock);
15854+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15855+ if (cursor != NULL) {
15856+ /* cursor was found */
15857+ if (cursor->ref == 0) {
15858+ /* move it from unused list */
15859+ list_del_init(&cursor->alist);
15860+ --d_cursor_unused;
15861+ }
15862+ ++cursor->ref;
15863+ }
15864+ spin_unlock(&d_lock);
15865+ if (cursor != NULL) {
15866+ spin_lock_inode(inode);
15867+ assert("nikita-3556", cursor->fsdata->back == NULL);
15868+ clean_fsdata(file);
15869+ free_file_fsdata_nolock(file);
15870+ file->private_data = cursor->fsdata;
15871+ spin_unlock_inode(inode);
15872+ }
15873+ }
15874+ return result;
15875+}
15876+
15877+/**
15878+ * reiser4_detach_fsdata - drop detached readdir state, if any
15879+ * @file: file to detach fsdata from
15880+ *
15881+ * Detaches fsdata, if necessary.
15882+ */
15883+void reiser4_detach_fsdata(struct file *file)
15884+{
15885+ struct inode *inode;
15886+
15887+ if (!file_is_stateless(file))
15888+ return;
15889+
15890+ inode = file->f_dentry->d_inode;
15891+ spin_lock_inode(inode);
15892+ clean_fsdata(file);
15893+ spin_unlock_inode(inode);
15894+}
15895+
15896+/* slab for reiser4_dentry_fsdata */
15897+static struct kmem_cache *dentry_fsdata_cache;
15898+
15899+/**
15900+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15901+ *
15902+ * Initializes slab cache of structures attached to dentry->d_fsdata. It is
15903+ * part of reiser4 module initialization.
15904+ */
15905+int reiser4_init_dentry_fsdata(void)
15906+{
15907+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15908+ sizeof(reiser4_dentry_fsdata),
15909+ 0,
15910+ SLAB_HWCACHE_ALIGN |
15911+ SLAB_RECLAIM_ACCOUNT, NULL,
15912+ NULL);
15913+ if (dentry_fsdata_cache == NULL)
15914+ return RETERR(-ENOMEM);
15915+ return 0;
15916+}
15917+
15918+/**
15919+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15920+ *
15921+ * This is called on reiser4 module unloading or system shutdown.
15922+ */
15923+void reiser4_done_dentry_fsdata(void)
15924+{
15925+ destroy_reiser4_cache(&dentry_fsdata_cache);
15926+}
15927+
15928+/**
15929+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
15930+ * @dentry: queried dentry
15931+ *
15932+ * Allocates if necessary and returns per-dentry data that we attach to each
15933+ * dentry.
15934+ */
15935+reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
15936+{
15937+ assert("nikita-1365", dentry != NULL);
15938+
15939+ if (dentry->d_fsdata == NULL) {
15940+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
15941+ reiser4_ctx_gfp_mask_get());
15942+ if (dentry->d_fsdata == NULL)
15943+ return ERR_PTR(RETERR(-ENOMEM));
15944+ memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
15945+ }
15946+ return dentry->d_fsdata;
15947+}
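/*
 * Usage sketch, not part of the patch: allocation can fail, so callers
 * must check the result with IS_ERR() before use. The wrapper name is
 * hypothetical; setting ->stateless is what an NFS-decode path might do.
 */
static inline int example_mark_stateless(struct dentry *dentry)
{
	reiser4_dentry_fsdata *fsdata;

	fsdata = reiser4_get_dentry_fsdata(dentry);
	if (IS_ERR(fsdata))
		return PTR_ERR(fsdata);
	fsdata->stateless = 1;
	return 0;
}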
15948+
15949+/**
15950+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
15951+ * @dentry: dentry to free fsdata of
15952+ *
15953+ * Detaches and frees fs-specific dentry data
15954+ */
15955+void reiser4_free_dentry_fsdata(struct dentry *dentry)
15956+{
15957+ if (dentry->d_fsdata != NULL) {
15958+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
15959+ dentry->d_fsdata = NULL;
15960+ }
15961+}
15962+
15963+/* slab for reiser4_file_fsdata */
15964+static struct kmem_cache *file_fsdata_cache;
15965+
15966+/**
15967+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
15968+ *
15969+ * Initializes slab cache of structures attached to file->private_data. It is
15970+ * part of reiser4 module initialization.
15971+ */
15972+int reiser4_init_file_fsdata(void)
15973+{
15974+ file_fsdata_cache = kmem_cache_create("file_fsdata",
15975+ sizeof(reiser4_file_fsdata),
15976+ 0,
15977+ SLAB_HWCACHE_ALIGN |
15978+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
15979+ if (file_fsdata_cache == NULL)
15980+ return RETERR(-ENOMEM);
15981+ return 0;
15982+}
15983+
15984+/**
15985+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
15986+ *
15987+ * This is called on reiser4 module unloading or system shutdown.
15988+ */
15989+void reiser4_done_file_fsdata(void)
15990+{
15991+ destroy_reiser4_cache(&file_fsdata_cache);
15992+}
15993+
15994+/**
15995+ * create_fsdata - allocate and initialize reiser4_file_fsdata
15996+ * @file: what to create file_fsdata for, may be NULL
15997+ *
15998+ * Allocates and initializes reiser4_file_fsdata structure.
15999+ */
16000+static reiser4_file_fsdata *create_fsdata(struct file *file)
16001+{
16002+ reiser4_file_fsdata *fsdata;
16003+
16004+ fsdata = kmem_cache_alloc(file_fsdata_cache,
16005+ reiser4_ctx_gfp_mask_get());
16006+ if (fsdata != NULL) {
16007+ memset(fsdata, 0, sizeof *fsdata);
16008+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16009+ fsdata->back = file;
16010+ INIT_LIST_HEAD(&fsdata->dir.linkage);
16011+ }
16012+ return fsdata;
16013+}
16014+
16015+/**
16016+ * free_fsdata - free reiser4_file_fsdata
16017+ * @fsdata: object to free
16018+ *
16019+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
16020+ */
16021+static void free_fsdata(reiser4_file_fsdata *fsdata)
16022+{
16023+ BUG_ON(fsdata == NULL);
16024+ kmem_cache_free(file_fsdata_cache, fsdata);
16025+}
16026+
16027+/**
16028+ * reiser4_get_file_fsdata - get fs-specific file data
16029+ * @file: queried file
16030+ *
16031+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16032+ * to @file.
16033+ */
16034+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16035+{
16036+ assert("nikita-1603", file != NULL);
16037+
16038+ if (file->private_data == NULL) {
16039+ reiser4_file_fsdata *fsdata;
16040+ struct inode *inode;
16041+
16042+ fsdata = create_fsdata(file);
16043+ if (fsdata == NULL)
16044+ return ERR_PTR(RETERR(-ENOMEM));
16045+
16046+ inode = file->f_dentry->d_inode;
16047+ spin_lock_inode(inode);
16048+ if (file->private_data == NULL) {
16049+ file->private_data = fsdata;
16050+ fsdata = NULL;
16051+ }
16052+ spin_unlock_inode(inode);
16053+ if (fsdata != NULL)
16054+ /* other thread initialized ->fsdata */
16055+ kmem_cache_free(file_fsdata_cache, fsdata);
16056+ }
16057+ assert("nikita-2665", file->private_data != NULL);
16058+ return file->private_data;
16059+}
16060+
16061+/**
16062+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16063+ * @file:
16064+ *
16065+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16066+ * readdir list, and frees it if it is not linked to a d_cursor object.
16067+ */
16068+static void free_file_fsdata_nolock(struct file *file)
16069+{
16070+ reiser4_file_fsdata *fsdata;
16071+
16072+ assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16073+ fsdata = file->private_data;
16074+ if (fsdata != NULL) {
16075+ list_del_init(&fsdata->dir.linkage);
16076+ if (fsdata->cursor == NULL)
16077+ free_fsdata(fsdata);
16078+ }
16079+ file->private_data = NULL;
16080+}
16081+
16082+/**
16083+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16084+ * @file:
16085+ *
16086+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16087+ */
16088+void reiser4_free_file_fsdata(struct file *file)
16089+{
16090+ spin_lock_inode(file->f_dentry->d_inode);
16091+ free_file_fsdata_nolock(file);
16092+ spin_unlock_inode(file->f_dentry->d_inode);
16093+}
16094+
16095+/*
16096+ * Local variables:
16097+ * c-indentation-style: "K&R"
16098+ * mode-name: "LC"
16099+ * c-basic-offset: 8
16100+ * tab-width: 8
16101+ * fill-column: 79
16102+ * End:
16103+ */
16104diff -urN linux-2.6.20.orig/fs/reiser4/fsdata.h linux-2.6.20/fs/reiser4/fsdata.h
16105--- linux-2.6.20.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16106+++ linux-2.6.20/fs/reiser4/fsdata.h 2007-05-06 14:50:43.722983224 +0400
16107@@ -0,0 +1,207 @@
16108+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16109+ * reiser4/README */
16110+
16111+#if !defined( __REISER4_FSDATA_H__ )
16112+#define __REISER4_FSDATA_H__
16113+
16114+#include "debug.h"
16115+#include "kassign.h"
16116+#include "seal.h"
16117+#include "type_safe_hash.h"
16118+#include "plugin/file/file.h"
16119+#include "readahead.h"
16120+
16121+/*
16122+ * reiser4 attaches private state to VFS objects: reiser4_dentry_fsdata
16123+ * hangs off dentry->d_fsdata and reiser4_file_fsdata off
16124+ * file->private_data; both structures are defined below.
16125+ */
16126+
16127+/*
16128+ * locking: fields of per file descriptor readdir_pos and ->f_pos are
16129+ * protected by ->i_mutex on inode. Under this lock the following invariant
16130+ * holds:
16131+ *
16132+ * the file descriptor is "looking" at the entry_no-th directory entry from
16133+ * the beginning of directory. This entry has key dir_entry_key and is the
16134+ * pos-th entry within its duplicate-key sequence.
16135+ *
16136+ */
16137+
16138+/* logical position within directory */
16139+typedef struct {
16140+ /* key of directory entry (actually, part of a key sufficient to
16141+ identify directory entry) */
16142+ de_id dir_entry_key;
16143+ /* ordinal number of directory entry among all entries with the same
16144+ key. (Starting from 0.) */
16145+ unsigned pos;
16146+} dir_pos;
16147+
16148+typedef struct {
16149+ /* f_pos corresponding to this readdir position */
16150+ __u64 fpos;
16151+ /* logical position within directory */
16152+ dir_pos position;
16153+ /* logical number of directory entry within
16154+ directory */
16155+ __u64 entry_no;
16156+} readdir_pos;
16157+
16158+/*
16159+ * this is used to speed up lookups for directory entry: on initial call to
16160+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
16161+ * in struct dentry and reused later to avoid tree traversals.
16162+ */
16163+typedef struct de_location {
16164+ /* seal covering directory entry */
16165+ seal_t entry_seal;
16166+ /* coord of directory entry */
16167+ coord_t entry_coord;
16168+ /* ordinal number of directory entry among all entries with the same
16169+ key. (Starting from 0.) */
16170+ int pos;
16171+} de_location;
16172+
16173+/**
16174+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16175+ *
16176+ * This is allocated dynamically and released in d_op->d_release()
16177+ *
16178+ * Currently it only contains cached location (hint) of directory entry, but
16179+ * it is expected that other information will be accumulated here.
16180+ */
16181+typedef struct reiser4_dentry_fsdata {
16182+ /*
16183+ * here will go fields filled by ->lookup() to speedup next
16184+ * create/unlink, like blocknr of znode with stat-data, or key of
16185+ * stat-data.
16186+ */
16187+ de_location dec;
16188+ int stateless; /* created through reiser4_decode_fh, needs special
16189+ * treatment in readdir. */
16190+} reiser4_dentry_fsdata;
16191+
16192+extern int reiser4_init_dentry_fsdata(void);
16193+extern void reiser4_done_dentry_fsdata(void);
16194+extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16195+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16196+
16197+/**
16198+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16199+ *
16200+ * This is allocated dynamically and released in inode->i_fop->release
16201+ */
16202+typedef struct reiser4_file_fsdata {
16203+ /*
16204+ * pointer back to the struct file which this reiser4_file_fsdata is
16205+ * part of
16206+ */
16207+ struct file *back;
16208+ /* detached cursor for stateless readdir. */
16209+ struct dir_cursor *cursor;
16210+ /*
16211+ * We need both directory and regular file parts here, because there
16212+	 * are file system objects that are both files and directories.
16213+ */
16214+ struct {
16215+ /*
16216+ * position in directory. It is updated each time directory is
16217+ * modified
16218+ */
16219+ readdir_pos readdir;
16220+ /* head of this list is reiser4_inode->lists.readdir_list */
16221+ struct list_head linkage;
16222+ } dir;
16223+ /* hints to speed up operations with regular files: read and write. */
16224+ struct {
16225+ hint_t hint;
16226+ } reg;
16227+ struct reiser4_file_ra_state ra1;
16228+
16229+} reiser4_file_fsdata;
16230+
16231+extern int reiser4_init_file_fsdata(void);
16232+extern void reiser4_done_file_fsdata(void);
16233+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16234+extern void reiser4_free_file_fsdata(struct file *);
16235+
16236+/*
16237+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16238+ * used to address a problem reiser4 has with readdir accesses via NFS. See
16239+ * plugin/file_ops_readdir.c for more details.
16240+ */
16241+typedef struct {
16242+ __u16 cid;
16243+ __u64 oid;
16244+} d_cursor_key;
16245+
16246+/*
16247+ * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16248+ * maintain hash table of dir_cursor-s in reiser4's super block
16249+ */
16250+typedef struct dir_cursor dir_cursor;
16251+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16252+
16253+typedef struct d_cursor_info d_cursor_info;
16254+
16255+struct dir_cursor {
16256+ int ref;
16257+ reiser4_file_fsdata *fsdata;
16258+
16259+ /* link to reiser4 super block hash table of cursors */
16260+ d_cursor_hash_link hash;
16261+
16262+ /*
16263+ * this is to link cursors to reiser4 super block's radix tree of
16264+ * cursors if there are more than one cursor of the same objectid
16265+ */
16266+ struct list_head list;
16267+ d_cursor_key key;
16268+ d_cursor_info *info;
16269+ /* list of unused cursors */
16270+ struct list_head alist;
16271+};
16272+
16273+extern int reiser4_init_d_cursor(void);
16274+extern void reiser4_done_d_cursor(void);
16275+
16276+extern int reiser4_init_super_d_info(struct super_block *);
16277+extern void reiser4_done_super_d_info(struct super_block *);
16278+
16279+extern loff_t reiser4_get_dir_fpos(struct file *);
16280+extern int reiser4_attach_fsdata(struct file *, struct inode *);
16281+extern void reiser4_detach_fsdata(struct file *);
16282+
16283+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16284+ more details */
16285+void reiser4_dispose_cursors(struct inode *inode);
16286+void reiser4_load_cursors(struct inode *inode);
16287+void reiser4_kill_cursors(struct inode *inode);
16288+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16289+ int offset, int adj);
16290+
16291+/*
16292+ * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16293+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16294+ */
16295+struct d_cursor_info {
16296+ d_cursor_hash_table table;
16297+ struct radix_tree_root tree;
16298+};
16299+
16300+/* spinlock protecting readdir cursors */
16301+extern spinlock_t d_lock;
16302+
16303+/* __REISER4_FSDATA_H__ */
16304+#endif
16305+
16306+/*
16307+ * Local variables:
16308+ * c-indentation-style: "K&R"
16309+ * mode-name: "LC"
16310+ * c-basic-offset: 8
16311+ * tab-width: 8
16312+ * fill-column: 120
16313+ * End:
16314+ */
16315diff -urN linux-2.6.20.orig/fs/reiser4/init_super.c linux-2.6.20/fs/reiser4/init_super.c
16316--- linux-2.6.20.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16317+++ linux-2.6.20/fs/reiser4/init_super.c 2007-05-06 14:50:43.722983224 +0400
16318@@ -0,0 +1,750 @@
16319+/* Copyright by Hans Reiser, 2003 */
16320+
16321+#include "super.h"
16322+#include "inode.h"
16323+#include "plugin/plugin_set.h"
16324+
16325+#include <linux/swap.h>
16326+
16327+/**
16328+ * init_fs_info - allocate reiser4 specific super block
16329+ * @super: super block of filesystem
16330+ *
16331+ * Allocates and initializes reiser4_super_info_data, attaches it to
16332+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
16333+ */
16334+int reiser4_init_fs_info(struct super_block *super)
16335+{
16336+ reiser4_super_info_data *sbinfo;
16337+
16338+ sbinfo = kmalloc(sizeof(reiser4_super_info_data),
16339+ reiser4_ctx_gfp_mask_get());
16340+ if (!sbinfo)
16341+ return RETERR(-ENOMEM);
16342+
16343+ super->s_fs_info = sbinfo;
16344+ super->s_op = NULL;
16345+ memset(sbinfo, 0, sizeof(*sbinfo));
16346+
16347+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16348+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16349+
16350+ mutex_init(&sbinfo->delete_mutex);
16351+ spin_lock_init(&(sbinfo->guard));
16352+
16353+ /* initialize per-super-block d_cursor resources */
16354+ reiser4_init_super_d_info(super);
16355+
16356+ return 0;
16357+}
16358+
16359+/**
16360+ * reiser4_done_fs_info - free reiser4 specific super block
16361+ * @super: super block of filesystem
16362+ *
16363+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
16364+ * frees reiser4_super_info_data.
16365+ */
16366+void reiser4_done_fs_info(struct super_block *super)
16367+{
16368+ assert("zam-990", super->s_fs_info != NULL);
16369+
16370+ /* release per-super-block d_cursor resources */
16371+ reiser4_done_super_d_info(super);
16372+
16373+ /* make sure that there are no jnodes left */
16374+ assert("", list_empty(&get_super_private(super)->all_jnodes));
16375+ assert("", get_current_context()->trans->atom == NULL);
16376+ reiser4_check_block_counters(super);
16377+ kfree(super->s_fs_info);
16378+ super->s_fs_info = NULL;
16379+}
16380+
16381+/* type of option parseable by parse_option() */
16382+typedef enum {
16383+ /* value of option is arbitrary string */
16384+ OPT_STRING,
16385+
16386+ /*
16387+ * option specifies a bit in a bitmask. When the option is set, a bit in
16388+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16389+ * dont_load_bitmap, atomic_write.
16390+ */
16391+ OPT_BIT,
16392+
16393+ /*
16394+ * value of option should conform to sscanf() format. Examples are
16395+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16396+ */
16397+ OPT_FORMAT,
16398+
16399+ /*
16400+ * option can take one of predefined values. Example is onerror=panic or
16401+ * onerror=remount-ro
16402+ */
16403+ OPT_ONEOF,
16404+} opt_type_t;
16405+
16406+typedef struct opt_bitmask_bit {
16407+ const char *bit_name;
16408+ int bit_nr;
16409+} opt_bitmask_bit;
16410+
16411+/* description of option parseable by parse_option() */
16412+typedef struct opt_desc {
16413+ /* option name.
16414+
16415+ parsed portion of string has the form "name=value".
16416+ */
16417+ const char *name;
16418+ /* type of option */
16419+ opt_type_t type;
16420+ union {
16421+ /* where to store value of string option (type == OPT_STRING) */
16422+ char **string;
16423+ /* description of bits for bit option (type == OPT_BIT) */
16424+ struct {
16425+ int nr;
16426+ void *addr;
16427+ } bit;
16428+ /* description of format and targets for format option (type
16429+ == OPT_FORMAT) */
16430+ struct {
16431+ const char *format;
16432+ int nr_args;
16433+ void *arg1;
16434+ void *arg2;
16435+ void *arg3;
16436+ void *arg4;
16437+ } f;
16438+ struct {
16439+ int *result;
16440+ const char *list[10];
16441+ } oneof;
16442+ struct {
16443+ void *addr;
16444+ int nr_bits;
16445+ opt_bitmask_bit *bits;
16446+ } bitmask;
16447+ } u;
16448+} opt_desc_t;
16449+
16450+/**
16451+ * parse_option - parse one option
16452+ * @opt_string: starting point of parsing
16453+ * @opt: option description
16454+ *
16455+ * foo=bar,
16456+ * ^ ^ ^
16457+ * | | +-- replaced to '\0'
16458+ * | +-- val_start
16459+ * +-- opt_string
16460+ * Figures out the option type and handles the option accordingly.
16461+ */
16462+static int parse_option(char *opt_string, opt_desc_t *opt)
16463+{
16464+ char *val_start;
16465+ int result;
16466+ const char *err_msg;
16467+
16468+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16469+
16470+ val_start = strchr(opt_string, '=');
16471+ if (val_start != NULL) {
16472+ *val_start = '\0';
16473+ ++val_start;
16474+ }
16475+
16476+ err_msg = NULL;
16477+ result = 0;
16478+ switch (opt->type) {
16479+ case OPT_STRING:
16480+ if (val_start == NULL) {
16481+ err_msg = "String arg missing";
16482+ result = RETERR(-EINVAL);
16483+ } else
16484+ *opt->u.string = val_start;
16485+ break;
16486+ case OPT_BIT:
16487+ if (val_start != NULL)
16488+ err_msg = "Value ignored";
16489+ else
16490+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
16491+ break;
16492+ case OPT_FORMAT:
16493+ if (val_start == NULL) {
16494+ err_msg = "Formatted arg missing";
16495+ result = RETERR(-EINVAL);
16496+ break;
16497+ }
16498+ if (sscanf(val_start, opt->u.f.format,
16499+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16500+ opt->u.f.arg4) != opt->u.f.nr_args) {
16501+ err_msg = "Wrong conversion";
16502+ result = RETERR(-EINVAL);
16503+ }
16504+ break;
16505+ case OPT_ONEOF:
16506+ {
16507+ int i = 0;
16508+
16509+ if (val_start == NULL) {
16510+ err_msg = "Value is missing";
16511+ result = RETERR(-EINVAL);
16512+ break;
16513+ }
16514+ err_msg = "Wrong option value";
16515+ result = RETERR(-EINVAL);
16516+ while (opt->u.oneof.list[i]) {
16517+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
16518+ result = 0;
16519+ err_msg = NULL;
16520+ *opt->u.oneof.result = i;
16521+ break;
16522+ }
16523+ i++;
16524+ }
16525+ break;
16526+ }
16527+ default:
16528+ wrong_return_value("nikita-2100", "opt -> type");
16529+ break;
16530+ }
16531+ if (err_msg != NULL) {
16532+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16533+ err_msg, opt->name, val_start ? "=" : "",
16534+ val_start ? : "");
16535+ }
16536+ return result;
16537+}
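The "name=value" split performed above is easy to see in isolation. A minimal
user-space sketch of the same tokenization (hypothetical input; reiser4 types
and error handling omitted; illustration only, not part of the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char opt[] = "onerror=remount-ro";	/* hypothetical option */
	char *val_start = strchr(opt, '=');

	if (val_start != NULL) {
		*val_start = '\0';		/* terminate the name */
		++val_start;			/* value begins here */
	}
	printf("name=\"%s\" value=\"%s\"\n", opt,
	       val_start ? val_start : "(none)");
	return 0;
}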
16538+
16539+/**
16540+ * parse_options - parse reiser4 mount options
16541+ * @opt_string: starting point
16542+ * @opts: array of option description
16543+ * @nr_opts: number of elements in @opts
16544+ *
16545+ * Parses comma separated list of reiser4 mount options.
16546+ */
16547+static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
16548+{
16549+ int result;
16550+
16551+ result = 0;
16552+ while ((result == 0) && opt_string && *opt_string) {
16553+ int j;
16554+ char *next;
16555+
16556+ next = strchr(opt_string, ',');
16557+ if (next != NULL) {
16558+ *next = '\0';
16559+ ++next;
16560+ }
16561+ for (j = 0; j < nr_opts; ++j) {
16562+ if (!strncmp(opt_string, opts[j].name,
16563+ strlen(opts[j].name))) {
16564+ result = parse_option(opt_string, &opts[j]);
16565+ break;
16566+ }
16567+ }
16568+ if (j == nr_opts) {
16569+ warning("nikita-2307", "Unrecognized option: \"%s\"",
16570+ opt_string);
16571+ /* traditionally, -EINVAL is returned on wrong mount
16572+ option */
16573+ result = RETERR(-EINVAL);
16574+ }
16575+ opt_string = next;
16576+ }
16577+ return result;
16578+}
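The outer loop above tokenizes the comma-separated list handed over from
mount -o. A user-space sketch of just the splitting, with the per-option
dispatch replaced by a printf (the option string is hypothetical; illustration
only, not part of the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* corresponds to: mount -o tmgr.atom_max_age=900,dont_load_bitmap */
	char opts[] = "tmgr.atom_max_age=900,dont_load_bitmap";
	char *cur = opts;

	while (cur && *cur) {
		char *next = strchr(cur, ',');

		if (next != NULL)
			*next++ = '\0';
		printf("token: \"%s\"\n", cur);	/* would go to parse_option() */
		cur = next;
	}
	return 0;
}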
16579+
16580+#define NUM_OPT( label, fmt, addr ) \
16581+ { \
16582+ .name = ( label ), \
16583+ .type = OPT_FORMAT, \
16584+ .u = { \
16585+ .f = { \
16586+ .format = ( fmt ), \
16587+ .nr_args = 1, \
16588+ .arg1 = ( addr ), \
16589+ .arg2 = NULL, \
16590+ .arg3 = NULL, \
16591+ .arg4 = NULL \
16592+ } \
16593+ } \
16594+ }
16595+
16596+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
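SB_FIELD_OPT stringifies the field path for the option name and takes the
field's address as the sscanf() target. For example,
SB_FIELD_OPT(tmgr.atom_max_age, "%u") expands (roughly) to the initializer
below, shown for illustration only:

{
	.name = "tmgr.atom_max_age",
	.type = OPT_FORMAT,
	.u = {
		.f = {
			.format = "%u",
			.nr_args = 1,
			.arg1 = &sbinfo->tmgr.atom_max_age,
			.arg2 = NULL,
			.arg3 = NULL,
			.arg4 = NULL
		}
	}
}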
16597+
16598+#define BIT_OPT(label, bitnr) \
16599+ { \
16600+ .name = label, \
16601+ .type = OPT_BIT, \
16602+ .u = { \
16603+ .bit = { \
16604+ .nr = bitnr, \
16605+ .addr = &sbinfo->fs_flags \
16606+ } \
16607+ } \
16608+ }
16609+
16610+#define MAX_NR_OPTIONS (30)
16611+
16612+/**
16613+ * reiser4_init_super_data - initialize reiser4 private super block
16614+ * @super: super block to initialize
16615+ * @opt_string: list of reiser4 mount options
16616+ *
16617+ * Sets various reiser4 parameters to default values. Parses mount options and
16618+ * overwrites default settings.
16619+ */
16620+int reiser4_init_super_data(struct super_block *super, char *opt_string)
16621+{
16622+ int result;
16623+ opt_desc_t *opts, *p;
16624+ reiser4_super_info_data *sbinfo = get_super_private(super);
16625+
16626+ /* initialize super, export, dentry operations */
16627+ sbinfo->ops.super = reiser4_super_operations;
16628+ sbinfo->ops.export = reiser4_export_operations;
16629+ sbinfo->ops.dentry = reiser4_dentry_operations;
16630+ super->s_op = &sbinfo->ops.super;
16631+ super->s_export_op = &sbinfo->ops.export;
16632+
16633+ /* initialize transaction manager parameters to default values */
16634+ sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16635+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16636+ sbinfo->tmgr.atom_min_size = 256;
16637+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16638+
16639+ /* initialize cbk cache parameter */
16640+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16641+
16642+ /* initialize flush parameters */
16643+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16644+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16645+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16646+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16647+
16648+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16649+
16650+ /* preliminary tree initializations */
16651+ sbinfo->tree.super = super;
16652+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16653+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16654+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16655+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16656+ rwlock_init(&(sbinfo->tree.tree_lock));
16657+ spin_lock_init(&(sbinfo->tree.epoch_lock));
16658+
16659+ /* initialize default readahead params */
16660+ sbinfo->ra_params.max = num_physpages / 4;
16661+ sbinfo->ra_params.flags = 0;
16662+
16663+ /* allocate memory for structure describing reiser4 mount options */
16664+ opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS,
16665+ reiser4_ctx_gfp_mask_get());
16666+ if (opts == NULL)
16667+ return RETERR(-ENOMEM);
16668+
16669+ /* initialize structure describing reiser4 mount options */
16670+ p = opts;
16671+
16672+#if REISER4_DEBUG
16673+# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
16674+ warning ("zam-1046", "opt array is overloaded"); break; \
16675+ }
16676+#else
16677+# define OPT_ARRAY_CHECK noop
16678+#endif
16679+
16680+#define PUSH_OPT(...) \
16681+do { \
16682+ opt_desc_t o = __VA_ARGS__; \
16683+ OPT_ARRAY_CHECK; \
16684+ *p ++ = o; \
16685+} while (0)
16686+
16687+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16688+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16689+
16690+ /*
16691+ * tmgr.atom_max_size=N
16692+ * Atoms containing more than N blocks will be forced to commit. N is
16693+ * decimal.
16694+ */
16695+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16696+ /*
16697+ * tmgr.atom_max_age=N
16698+ * Atoms older than N seconds will be forced to commit. N is decimal.
16699+ */
16700+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16701+ /*
16702+ * tmgr.atom_min_size=N
16703+	 * When committing an atom to free dirty pages, force an atom smaller
16704+	 * than N blocks to fuse with another one.
16705+ */
16706+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16707+ /*
16708+ * tmgr.atom_max_flushers=N
16709+ * limit of concurrent flushers for one atom. 0 means no limit.
16710+ */
16711+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16712+ /*
16713+	 * tree.cbk_cache.nr_slots=N
16714+ * Number of slots in the cbk cache.
16715+ */
16716+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16717+ /*
16718+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16719+ * leaf-level blocks it will force them to be relocated.
16720+ */
16721+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16722+ /*
16723+	 * If flush can find a block allocation no farther than
16724+	 * FLUSH_RELOCATE_DISTANCE from the preceder, it will relocate to that
16725+ * position.
16726+ */
16727+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16728+ /*
16729+	 * If we have written this many blocks or more before encountering a
16730+	 * busy jnode in the flush list, abort flushing in the hope that next
16731+	 * time we are called this jnode will already be clean, and we will
16732+	 * save some seeks.
16733+ */
16734+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16735+ /* The maximum number of nodes to scan left on a level during flush. */
16736+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16737+ /* preferred IO size */
16738+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16739+ /* carry flags used for insertion of new nodes */
16740+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16741+ /* carry flags used for insertion of new extents */
16742+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16743+ /* carry flags used for paste operations */
16744+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16745+ /* carry flags used for insert operations */
16746+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16747+
16748+#ifdef CONFIG_REISER4_BADBLOCKS
16749+ /*
16750+	 * Alternative master superblock location in case its original
16751+	 * location is not writable/accessible. This is an offset in BYTES.
16752+ */
16753+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
16754+#endif
16755+
16756+ /* turn on BSD-style gid assignment */
16757+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16758+ /* turn on 32 bit times */
16759+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16760+ /*
16761+	 * Don't load all bitmap blocks at mount time; this is useful for
16762+ * machines with tiny RAM and large disks.
16763+ */
16764+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16765+ /* disable transaction commits during write() */
16766+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16767+ /* disable use of write barriers in the reiser4 log writer. */
16768+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16769+
16770+ PUSH_OPT(
16771+ {
16772+ /*
16773+ * tree traversal readahead parameters:
16774+		 * -o readahead=MAXNUM:FLAGS
16775+		 * MAXNUM - max number of nodes to request readahead for: -1UL
16776+ * will set it to max_sane_readahead()
16777+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16778+ * CONTINUE_ON_PRESENT
16779+ */
16780+ .name = "readahead",
16781+ .type = OPT_FORMAT,
16782+ .u = {
16783+ .f = {
16784+ .format = "%u:%u",
16785+ .nr_args = 2,
16786+ .arg1 = &sbinfo->ra_params.max,
16787+ .arg2 = &sbinfo->ra_params.flags,
16788+ .arg3 = NULL,
16789+ .arg4 = NULL
16790+ }
16791+ }
16792+ }
16793+ );
16794+
16795+ /* What to do in case of fs error */
16796+ PUSH_OPT(
16797+ {
16798+ .name = "onerror",
16799+ .type = OPT_ONEOF,
16800+ .u = {
16801+ .oneof = {
16802+ .result = &sbinfo->onerror,
16803+ .list = {
16804+ "panic", "remount-ro", NULL
16805+ },
16806+ }
16807+ }
16808+ }
16809+ );
16810+
16811+ /* modify default settings to values set by mount options */
16812+ result = parse_options(opt_string, opts, p - opts);
16813+ kfree(opts);
16814+ if (result != 0)
16815+ return result;
16816+
16817+	/* correct settings to sane values */
16818+ sbinfo->tmgr.atom_max_age *= HZ;
16819+ if (sbinfo->tmgr.atom_max_age <= 0)
16820+ /* overflow */
16821+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16822+
16823+	/* truncate optimal io size down to a multiple of 512 bytes */
16824+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16825+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16826+ if (sbinfo->optimal_io_size == 0) {
16827+ warning("nikita-2497", "optimal_io_size is too small");
16828+ return RETERR(-EINVAL);
16829+ }
16830+ return result;
16831+}
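The shift pair above truncates optimal_io_size to a multiple of the VFS block
size. A standalone illustration, assuming VFS_BLKSIZE_BITS is 9 (512-byte
blocks); the input value is hypothetical:

#include <stdio.h>

#define VFS_BLKSIZE_BITS 9	/* assumed: 512-byte VFS blocks */

int main(void)
{
	unsigned int io = 65000;	/* hypothetical mount value */

	io >>= VFS_BLKSIZE_BITS;	/* drop the sub-block remainder */
	io <<= VFS_BLKSIZE_BITS;
	printf("%u\n", io);		/* prints 64512 == 126 * 512 */
	return 0;
}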
16832+
16833+/**
16834+ * reiser4_init_read_super - read reiser4 master super block
16835+ * @super: super block to fill
16836+ * @silent: if 0 - print warnings
16837+ *
16838+ * Reads reiser4 master super block either from predefined location or from
16839+ * location specified by altsuper mount option, initializes disk format plugin.
16840+ */
16841+int reiser4_init_read_super(struct super_block *super, int silent)
16842+{
16843+ struct buffer_head *super_bh;
16844+ struct reiser4_master_sb *master_sb;
16845+ reiser4_super_info_data *sbinfo = get_super_private(super);
16846+ unsigned long blocksize;
16847+
16848+ read_super_block:
16849+#ifdef CONFIG_REISER4_BADBLOCKS
16850+ if (sbinfo->altsuper)
16851+ /*
16852+ * read reiser4 master super block at position specified by
16853+ * mount option
16854+ */
16855+ super_bh = sb_bread(super,
16856+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
16857+ else
16858+#endif
16859+		/* read reiser4 master super block at the 16th 4096-byte block */
16860+ super_bh = sb_bread(super,
16861+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16862+ if (!super_bh)
16863+ return RETERR(-EIO);
16864+
16865+ master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16866+ /* check reiser4 magic string */
16867+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16868+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
16869+ /* reiser4 master super block contains filesystem blocksize */
16870+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16871+
16872+ if (blocksize != PAGE_CACHE_SIZE) {
16873+ /*
16874+			 * currently reiser4's blocksize must be equal to
16875+ * pagesize
16876+ */
16877+ if (!silent)
16878+ warning("nikita-2609",
16879+ "%s: wrong block size %ld\n", super->s_id,
16880+ blocksize);
16881+ brelse(super_bh);
16882+ return RETERR(-EINVAL);
16883+ }
16884+ if (blocksize != super->s_blocksize) {
16885+ /*
16886+			 * filesystem uses a different blocksize. Reread the
16887+			 * master super block with the correct blocksize
16888+ */
16889+ brelse(super_bh);
16890+ if (!sb_set_blocksize(super, (int)blocksize))
16891+ return RETERR(-EINVAL);
16892+ goto read_super_block;
16893+ }
16894+
16895+ sbinfo->df_plug =
16896+ disk_format_plugin_by_id(
16897+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16898+ if (sbinfo->df_plug == NULL) {
16899+ if (!silent)
16900+ warning("nikita-26091",
16901+ "%s: unknown disk format plugin %d\n",
16902+ super->s_id,
16903+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16904+ brelse(super_bh);
16905+ return RETERR(-EINVAL);
16906+ }
16907+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16908+ brelse(super_bh);
16909+ return 0;
16910+ }
16911+
16912+ /* there is no reiser4 on the device */
16913+ if (!silent)
16914+ warning("nikita-2608",
16915+ "%s: wrong master super block magic", super->s_id);
16916+ brelse(super_bh);
16917+ return RETERR(-EINVAL);
16918+}
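Why the goto re-read works: the master super block sits at a fixed byte
offset, so the block number handed to sb_bread() depends on the current
s_blocksize. A sketch of the arithmetic, assuming REISER4_MAGIC_OFFSET is
16 * 4096 as the comment above suggests (illustration only):

#include <stdio.h>

#define REISER4_MAGIC_OFFSET (16 * 4096)	/* assumed byte offset */

int main(void)
{
	unsigned long bs[] = { 1024, 4096 };	/* probe size, native size */
	int i;

	for (i = 0; i < 2; i++)
		printf("blocksize %4lu -> master super block at block %lu\n",
		       bs[i], REISER4_MAGIC_OFFSET / bs[i]);
	return 0;	/* prints block 64, then block 16 */
}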
16919+
16920+static struct {
16921+ reiser4_plugin_type type;
16922+ reiser4_plugin_id id;
16923+} default_plugins[PSET_LAST] = {
16924+ [PSET_FILE] = {
16925+ .type = REISER4_FILE_PLUGIN_TYPE,
16926+ .id = UNIX_FILE_PLUGIN_ID
16927+ },
16928+ [PSET_DIR] = {
16929+ .type = REISER4_DIR_PLUGIN_TYPE,
16930+ .id = HASHED_DIR_PLUGIN_ID
16931+ },
16932+ [PSET_HASH] = {
16933+ .type = REISER4_HASH_PLUGIN_TYPE,
16934+ .id = R5_HASH_ID
16935+ },
16936+ [PSET_FIBRATION] = {
16937+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
16938+ .id = FIBRATION_DOT_O
16939+ },
16940+ [PSET_PERM] = {
16941+ .type = REISER4_PERM_PLUGIN_TYPE,
16942+ .id = NULL_PERM_ID
16943+ },
16944+ [PSET_FORMATTING] = {
16945+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
16946+ .id = SMALL_FILE_FORMATTING_ID
16947+ },
16948+ [PSET_SD] = {
16949+ .type = REISER4_ITEM_PLUGIN_TYPE,
16950+ .id = STATIC_STAT_DATA_ID
16951+ },
16952+ [PSET_DIR_ITEM] = {
16953+ .type = REISER4_ITEM_PLUGIN_TYPE,
16954+ .id = COMPOUND_DIR_ID
16955+ },
16956+ [PSET_CIPHER] = {
16957+ .type = REISER4_CIPHER_PLUGIN_TYPE,
16958+ .id = NONE_CIPHER_ID
16959+ },
16960+ [PSET_DIGEST] = {
16961+ .type = REISER4_DIGEST_PLUGIN_TYPE,
16962+ .id = SHA256_32_DIGEST_ID
16963+ },
16964+ [PSET_COMPRESSION] = {
16965+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
16966+ .id = LZO1_COMPRESSION_ID
16967+ },
16968+ [PSET_COMPRESSION_MODE] = {
16969+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
16970+ .id = CONVX_COMPRESSION_MODE_ID
16971+ },
16972+ [PSET_CLUSTER] = {
16973+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
16974+ .id = CLUSTER_64K_ID
16975+ },
16976+ [PSET_CREATE] = {
16977+ .type = REISER4_FILE_PLUGIN_TYPE,
16978+ .id = UNIX_FILE_PLUGIN_ID
16979+ }
16980+};
16981+
16982+/* access to default plugin table */
16983+reiser4_plugin *get_default_plugin(pset_member memb)
16984+{
16985+ return plugin_by_id(default_plugins[memb].type,
16986+ default_plugins[memb].id);
16987+}
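A sketch of how the table is consumed (in-kernel context, illustration only):
resolving one pset member yields the concrete plugin via plugin_by_id().

/* Sketch, not part of the patch: the default hash plugin is resolved
 * through the table above, equivalent to
 * plugin_by_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID). */
static reiser4_plugin *example_default_hash(void)
{
	return get_default_plugin(PSET_HASH);
}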
16988+
16989+/**
16990+ * reiser4_init_root_inode - obtain inode of root directory
16991+ * @super: super block of filesystem
16992+ *
16993+ * Obtains the inode of the root directory (reading it from disk) and
16994+ * initializes the plugin set if it was not initialized.
16995+ */
16996+int reiser4_init_root_inode(struct super_block *super)
16997+{
16998+ reiser4_super_info_data *sbinfo = get_super_private(super);
16999+ struct inode *inode;
17000+ int result = 0;
17001+
17002+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17003+ if (IS_ERR(inode))
17004+ return RETERR(PTR_ERR(inode));
17005+
17006+ super->s_root = d_alloc_root(inode);
17007+ if (!super->s_root) {
17008+ iput(inode);
17009+ return RETERR(-ENOMEM);
17010+ }
17011+
17012+ super->s_root->d_op = &sbinfo->ops.dentry;
17013+
17014+ if (!is_inode_loaded(inode)) {
17015+ pset_member memb;
17016+ plugin_set *pset;
17017+
17018+ pset = reiser4_inode_data(inode)->pset;
17019+ for (memb = 0; memb < PSET_LAST; ++memb) {
17020+
17021+ if (aset_get(pset, memb) != NULL)
17022+ continue;
17023+
17024+ result = grab_plugin_pset(inode, NULL, memb);
17025+ if (result != 0)
17026+ break;
17027+
17028+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17029+ }
17030+
17031+ if (result == 0) {
17032+ if (REISER4_DEBUG) {
17033+ for (memb = 0; memb < PSET_LAST; ++memb)
17034+ assert("nikita-3500",
17035+ aset_get(pset, memb) != NULL);
17036+ }
17037+ } else
17038+ warning("nikita-3448", "Cannot set plugins of root: %i",
17039+ result);
17040+ reiser4_iget_complete(inode);
17041+
17042+		/* As the default pset kept in the root dir may have been changed
17043+ (length is unknown), call update_sd. */
17044+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17045+ result = reiser4_grab_space(
17046+ inode_file_plugin(inode)->estimate.update(inode),
17047+ BA_CAN_COMMIT);
17048+
17049+ if (result == 0)
17050+ result = reiser4_update_sd(inode);
17051+
17052+ all_grabbed2free();
17053+ }
17054+ }
17055+
17056+ super->s_maxbytes = MAX_LFS_FILESIZE;
17057+ return result;
17058+}
17059+
17060+/*
17061+ * Local variables:
17062+ * c-indentation-style: "K&R"
17063+ * mode-name: "LC"
17064+ * c-basic-offset: 8
17065+ * tab-width: 8
17066+ * fill-column: 79
17067+ * End:
17068+ */
17069diff -urN linux-2.6.20.orig/fs/reiser4/inode.c linux-2.6.20/fs/reiser4/inode.c
17070--- linux-2.6.20.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17071+++ linux-2.6.20/fs/reiser4/inode.c 2007-05-06 14:50:43.726984474 +0400
17072@@ -0,0 +1,709 @@
17073+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17074+
17075+/* Inode specific operations. */
17076+
17077+#include "forward.h"
17078+#include "debug.h"
17079+#include "key.h"
17080+#include "kassign.h"
17081+#include "coord.h"
17082+#include "seal.h"
17083+#include "dscale.h"
17084+#include "plugin/item/item.h"
17085+#include "plugin/security/perm.h"
17086+#include "plugin/plugin.h"
17087+#include "plugin/object.h"
17088+#include "znode.h"
17089+#include "vfs_ops.h"
17090+#include "inode.h"
17091+#include "super.h"
17092+#include "reiser4.h"
17093+
17094+#include <linux/fs.h> /* for struct super_block, address_space */
17095+
17096+/* return reiser4 internal tree which inode belongs to */
17097+/* Audited by: green(2002.06.17) */
17098+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17099+{
17100+ assert("nikita-256", inode != NULL);
17101+ assert("nikita-257", inode->i_sb != NULL);
17102+ return reiser4_get_tree(inode->i_sb);
17103+}
17104+
17105+/* return reiser4-specific inode flags */
17106+static inline unsigned long *inode_flags(const struct inode *const inode)
17107+{
17108+ assert("nikita-2842", inode != NULL);
17109+ return &reiser4_inode_data(inode)->flags;
17110+}
17111+
17112+/* set reiser4-specific flag @f in @inode */
17113+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17114+{
17115+ assert("nikita-2248", inode != NULL);
17116+ set_bit((int)f, inode_flags(inode));
17117+}
17118+
17119+/* clear reiser4-specific flag @f in @inode */
17120+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17121+{
17122+ assert("nikita-2250", inode != NULL);
17123+ clear_bit((int)f, inode_flags(inode));
17124+}
17125+
17126+/* true if reiser4-specific flag @f is set in @inode */
17127+int reiser4_inode_get_flag(const struct inode *inode,
17128+ reiser4_file_plugin_flags f)
17129+{
17130+ assert("nikita-2251", inode != NULL);
17131+ return test_bit((int)f, inode_flags(inode));
17132+}
17133+
17134+/* convert oid to inode number */
17135+ino_t oid_to_ino(oid_t oid)
17136+{
17137+ return (ino_t) oid;
17138+}
17139+
17140+/* convert oid to user visible inode number */
17141+ino_t oid_to_uino(oid_t oid)
17142+{
17143+	/* a reiser4 object is uniquely identified by its oid, which is a 64 bit
17144+	   quantity. The kernel in-memory inode is indexed (in the hash table) by
17145+	   the 32 bit i_ino field, but this is not a problem, because there is a
17146+	   way to further distinguish inodes with identical inode numbers
17147+	   (find_actor supplied to iget()).
17148+
17149+	   But user space expects a unique 32 bit inode number. Obviously this
17150+	   is impossible. The work-around is to hash the oid into a user visible
17151+	   inode number.
17152+ */
17153+ oid_t max_ino = (ino_t) ~ 0;
17154+
17155+ if (REISER4_INO_IS_OID || (oid <= max_ino))
17156+ return oid;
17157+ else
17158+		/* this is remotely similar to the algorithm used to find the
17159+		   next pid for a process: after wrap-around, start from some
17160+		   offset rather than from 0. The idea is that there are some
17161+		   long-living objects with which we don't want to collide.
17162+ */
17163+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17164+}
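A user-space rendering of the wrap-around branch for a 32-bit ino_t;
REISER4_UINO_SHIFT's real value is defined elsewhere in reiser4, the one
below is purely illustrative:

#include <stdio.h>

typedef unsigned long long oid_t;	/* stand-in for the reiser4 type */

#define REISER4_UINO_SHIFT 0x10000ULL	/* hypothetical value */

static oid_t uino32(oid_t oid)
{
	oid_t max_ino = 0xffffffffULL;	/* (ino_t)~0 with 32-bit ino_t */

	if (oid <= max_ino)
		return oid;
	return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
}

int main(void)
{
	/* 0x100000005 - 0xffffffff = 6; masked with 0x7fffffff it stays 6 */
	printf("0x%llx\n", uino32(0x100000005ULL));	/* SHIFT + 6 */
	return 0;
}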
17165+
17166+/* check that "inode" is on reiser4 file-system */
17167+int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17168+{
17169+ return inode != NULL && is_reiser4_super(inode->i_sb);
17170+}
17171+
17172+/* Maximal length of a name that can be stored in directory @inode.
17173+
17174+   This is used in checks during file creation and lookup. */
17175+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17176+{
17177+ assert("nikita-287", is_reiser4_inode(inode));
17178+ assert("nikita-1710", inode_dir_item_plugin(inode));
17179+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17180+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17181+ else
17182+ return 255;
17183+}
17184+
17185+#if REISER4_USE_COLLISION_LIMIT
17186+/* Maximal number of hash collisions for this directory. */
17187+int max_hash_collisions(const struct inode *dir /* inode queried */ )
17188+{
17189+ assert("nikita-1711", dir != NULL);
17190+ return reiser4_inode_data(dir)->plugin.max_collisions;
17191+}
17192+#endif /* REISER4_USE_COLLISION_LIMIT */
17193+
17194+/* Install file, inode, and address_space operation on @inode, depending on
17195+ its mode. */
17196+int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17197+ reiser4_object_create_data * data /* parameters to create
17198+ * object */ )
17199+{
17200+ reiser4_super_info_data *sinfo;
17201+ file_plugin *fplug;
17202+ dir_plugin *dplug;
17203+
17204+ fplug = inode_file_plugin(inode);
17205+ dplug = inode_dir_plugin(inode);
17206+
17207+ sinfo = get_super_private(inode->i_sb);
17208+
17209+ switch (inode->i_mode & S_IFMT) {
17210+ case S_IFSOCK:
17211+ case S_IFBLK:
17212+ case S_IFCHR:
17213+ case S_IFIFO:
17214+ {
17215+ dev_t rdev; /* to keep gcc happy */
17216+
17217+ assert("vs-46", fplug != NULL);
17218+ /* ugly hack with rdev */
17219+ if (data == NULL) {
17220+ rdev = inode->i_rdev;
17221+ inode->i_rdev = 0;
17222+ } else
17223+ rdev = data->rdev;
17224+ inode->i_blocks = 0;
17225+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17226+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17227+ /* initialize inode->i_fop and inode->i_rdev for block and char
17228+ devices */
17229+ init_special_inode(inode, inode->i_mode, rdev);
17230+ /* all address space operations are null */
17231+ inode->i_mapping->a_ops =
17232+ &file_plugins[fplug->h.id].as_ops;
17233+ break;
17234+ }
17235+ case S_IFLNK:
17236+ assert("vs-46", fplug != NULL);
17237+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17238+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17239+ inode->i_fop = NULL;
17240+ /* all address space operations are null */
17241+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17242+ break;
17243+ case S_IFDIR:
17244+ assert("vs-46", dplug != NULL);
17245+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17246+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17247+ inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17248+ inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17249+ inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17250+ break;
17251+ case S_IFREG:
17252+ assert("vs-46", fplug != NULL);
17253+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17254+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17255+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17256+ inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17257+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17258+ break;
17259+ default:
17260+ warning("nikita-291", "wrong file mode: %o for %llu",
17261+ inode->i_mode,
17262+ (unsigned long long)get_inode_oid(inode));
17263+ reiser4_make_bad_inode(inode);
17264+ return RETERR(-EINVAL);
17265+ }
17266+ return 0;
17267+}
17268+
17269+/* Initialize inode from disk data. Called with inode locked.
17270+ Return inode locked. */
17271+static int init_inode(struct inode *inode /* inode to initialise */ ,
17272+ coord_t * coord /* coord of stat data */ )
17273+{
17274+ int result;
17275+ item_plugin *iplug;
17276+ void *body;
17277+ int length;
17278+ reiser4_inode *state;
17279+
17280+ assert("nikita-292", coord != NULL);
17281+ assert("nikita-293", inode != NULL);
17282+
17283+ coord_clear_iplug(coord);
17284+ result = zload(coord->node);
17285+ if (result)
17286+ return result;
17287+ iplug = item_plugin_by_coord(coord);
17288+ body = item_body_by_coord(coord);
17289+ length = item_length_by_coord(coord);
17290+
17291+ assert("nikita-295", iplug != NULL);
17292+ assert("nikita-296", body != NULL);
17293+ assert("nikita-297", length > 0);
17294+
17295+ /* inode is under I_LOCK now */
17296+
17297+ state = reiser4_inode_data(inode);
17298+ /* call stat-data plugin method to load sd content into inode */
17299+ result = iplug->s.sd.init_inode(inode, body, length);
17300+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17301+ if (result == 0) {
17302+ result = setup_inode_ops(inode, NULL);
17303+ if (result == 0 && inode->i_sb->s_root &&
17304+ inode->i_sb->s_root->d_inode)
17305+ result = finish_pset(inode);
17306+ }
17307+ zrelse(coord->node);
17308+ return result;
17309+}
17310+
17311+/* read `inode' from the disk. This is what was previously in
17312+ reiserfs_read_inode2().
17313+
17314+ Must be called with inode locked. Return inode still locked.
17315+*/
17316+static int read_inode(struct inode *inode /* inode to read from disk */ ,
17317+ const reiser4_key * key /* key of stat data */ ,
17318+ int silent)
17319+{
17320+ int result;
17321+ lock_handle lh;
17322+ reiser4_inode *info;
17323+ coord_t coord;
17324+
17325+ assert("nikita-298", inode != NULL);
17326+ assert("nikita-1945", !is_inode_loaded(inode));
17327+
17328+ info = reiser4_inode_data(inode);
17329+ assert("nikita-300", info->locality_id != 0);
17330+
17331+ coord_init_zero(&coord);
17332+ init_lh(&lh);
17333+ /* locate stat-data in a tree and return znode locked */
17334+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17335+ assert("nikita-301", !is_inode_loaded(inode));
17336+ if (result == 0) {
17337+ /* use stat-data plugin to load sd into inode. */
17338+ result = init_inode(inode, &coord);
17339+ if (result == 0) {
17340+ /* initialize stat-data seal */
17341+ spin_lock_inode(inode);
17342+ reiser4_seal_init(&info->sd_seal, &coord, key);
17343+ info->sd_coord = coord;
17344+ spin_unlock_inode(inode);
17345+
17346+ /* call file plugin's method to initialize plugin
17347+ * specific part of inode */
17348+ if (inode_file_plugin(inode)->init_inode_data)
17349+ inode_file_plugin(inode)->init_inode_data(inode,
17350+ NULL,
17351+ 0);
17352+ /* load detached directory cursors for stateless
17353+ * directory readers (NFS). */
17354+ reiser4_load_cursors(inode);
17355+
17356+ /* Check the opened inode for consistency. */
17357+ result =
17358+ get_super_private(inode->i_sb)->df_plug->
17359+ check_open(inode);
17360+ }
17361+ }
17362+ /* lookup_sd() doesn't release coord because we want znode
17363+ stay read-locked while stat-data fields are accessed in
17364+ init_inode() */
17365+ done_lh(&lh);
17366+
17367+ if (result != 0)
17368+ reiser4_make_bad_inode(inode);
17369+ return result;
17370+}
17371+
17372+/* initialise new reiser4 inode being inserted into hash table. */
17373+static int init_locked_inode(struct inode *inode /* new inode */ ,
17374+ void *opaque /* key of stat data passed to the
17375+ * iget5_locked as cookie */ )
17376+{
17377+ reiser4_key *key;
17378+
17379+ assert("nikita-1995", inode != NULL);
17380+ assert("nikita-1996", opaque != NULL);
17381+ key = opaque;
17382+ set_inode_oid(inode, get_key_objectid(key));
17383+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17384+ return 0;
17385+}
17386+
17387+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17388+
17389+ This function is called by iget5_locked() to distinguish reiser4 inodes
17390+ having the same inode numbers. Such inodes can only exist due to some error
17391+ condition. One of them should be bad. Inodes with identical inode numbers
17392+ (objectids) are distinguished by their packing locality.
17393+
17394+*/
17395+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17396+ * check */ ,
17397+ void *opaque /* "cookie" passed to
17398+ * iget5_locked(). This is stat data
17399+ * key */ )
17400+{
17401+ reiser4_key *key;
17402+
17403+ key = opaque;
17404+ return
17405+ /* oid is unique, so first term is enough, actually. */
17406+ get_inode_oid(inode) == get_key_objectid(key) &&
17407+ /*
17408+ * also, locality should be checked, but locality is stored in
17409+ * the reiser4-specific part of the inode, and actor can be
17410+ * called against arbitrary inode that happened to be in this
17411+ * hash chain. Hence we first have to check that this is
17412+ * reiser4 inode at least. is_reiser4_inode() is probably too
17413+ * early to call, as inode may have ->i_op not yet
17414+ * initialised.
17415+ */
17416+ is_reiser4_super(inode->i_sb) &&
17417+ /*
17418+	     * usually the objectid is unique, but pseudo files use a counter to
17419+	     * generate objectids. All pseudo files are placed into a special
17420+ * (otherwise unused) locality.
17421+ */
17422+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17423+}
17424+
17425+/* hook for kmem_cache_create */
17426+void loading_init_once(reiser4_inode * info)
17427+{
17428+ mutex_init(&info->loading);
17429+}
17430+
17431+/* for reiser4_alloc_inode */
17432+void loading_alloc(reiser4_inode * info)
17433+{
17434+ assert("vs-1717", !mutex_is_locked(&info->loading));
17435+}
17436+
17437+/* for reiser4_destroy */
17438+void loading_destroy(reiser4_inode * info)
17439+{
17440+ assert("vs-1717a", !mutex_is_locked(&info->loading));
17441+}
17442+
17443+static void loading_begin(reiser4_inode * info)
17444+{
17445+ mutex_lock(&info->loading);
17446+}
17447+
17448+static void loading_end(reiser4_inode * info)
17449+{
17450+ mutex_unlock(&info->loading);
17451+}
17452+
17453+/**
17454+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17455+ * @super: super block of filesystem
17456+ * @key: key of inode's stat-data
17457+ * @silent: if 0 - print warnings
17458+ *
17459+ * This is our helper function a la iget(). It is called by
17460+ * lookup_common() and reiser4_read_super(). Returns the inode locked, or the
17461+ * error encountered.
17462+ */
17463+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17464+ int silent)
17465+{
17466+ struct inode *inode;
17467+ int result;
17468+ reiser4_inode *info;
17469+
17470+ assert("nikita-302", super != NULL);
17471+ assert("nikita-303", key != NULL);
17472+
17473+ result = 0;
17474+
17475+ /* call iget(). Our ->read_inode() is dummy, so this will either
17476+	   find the inode in the cache or return an uninitialised inode */
17477+ inode = iget5_locked(super,
17478+ (unsigned long)get_key_objectid(key),
17479+ reiser4_inode_find_actor,
17480+ init_locked_inode, (reiser4_key *) key);
17481+ if (inode == NULL)
17482+ return ERR_PTR(RETERR(-ENOMEM));
17483+ if (is_bad_inode(inode)) {
17484+ warning("nikita-304", "Bad inode found");
17485+ reiser4_print_key("key", key);
17486+ iput(inode);
17487+ return ERR_PTR(RETERR(-EIO));
17488+ }
17489+
17490+ info = reiser4_inode_data(inode);
17491+
17492+	/* The reiser4 inode state bit REISER4_LOADED distinguishes a fully
17493+	   loaded and initialized inode from a just allocated inode. If the
17494+	   REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17495+	   info->loading. The place in reiser4 which uses a not yet initialized
17496+	   inode is the reiser4 repacker, see repacker-related functions in
17497+ plugin/item/extent.c */
17498+ if (!is_inode_loaded(inode)) {
17499+ loading_begin(info);
17500+ if (!is_inode_loaded(inode)) {
17501+ /* locking: iget5_locked returns locked inode */
17502+ assert("nikita-1941", !is_inode_loaded(inode));
17503+ assert("nikita-1949",
17504+ reiser4_inode_find_actor(inode,
17505+ (reiser4_key *) key));
17506+ /* now, inode has objectid as ->i_ino and locality in
17507+ reiser4-specific part. This is enough for
17508+ read_inode() to read stat data from the disk */
17509+ result = read_inode(inode, key, silent);
17510+ } else
17511+ loading_end(info);
17512+ }
17513+
17514+ if (inode->i_state & I_NEW)
17515+ unlock_new_inode(inode);
17516+
17517+ if (is_bad_inode(inode)) {
17518+ assert("vs-1717", result != 0);
17519+ loading_end(info);
17520+ iput(inode);
17521+ inode = ERR_PTR(result);
17522+ } else if (REISER4_DEBUG) {
17523+ reiser4_key found_key;
17524+
17525+ assert("vs-1717", result == 0);
17526+ build_sd_key(inode, &found_key);
17527+ if (!keyeq(&found_key, key)) {
17528+ warning("nikita-305", "Wrong key in sd");
17529+ reiser4_print_key("sought for", key);
17530+ reiser4_print_key("found", &found_key);
17531+ }
17532+ if (inode->i_nlink == 0) {
17533+ warning("nikita-3559", "Unlinked inode found: %llu\n",
17534+ (unsigned long long)get_inode_oid(inode));
17535+ }
17536+ }
17537+ return inode;
17538+}
17539+
17540+/* reiser4_iget() may return a not fully initialized inode; this function
17541+ * should be called once reiser4 inode initialization is complete. */
17542+void reiser4_iget_complete(struct inode *inode)
17543+{
17544+ assert("zam-988", is_reiser4_inode(inode));
17545+
17546+ if (!is_inode_loaded(inode)) {
17547+ reiser4_inode_set_flag(inode, REISER4_LOADED);
17548+ loading_end(reiser4_inode_data(inode));
17549+ }
17550+}
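The calling convention around the loading mutex, mirrored from
reiser4_init_root_inode() in init_super.c (sketch only, error handling
abbreviated):

/* Sketch: reiser4_iget() may return with info->loading held for a not yet
 * loaded inode; reiser4_iget_complete() sets REISER4_LOADED and drops it. */
static struct inode *iget_example(struct super_block *super,
				  const reiser4_key *key)
{
	struct inode *inode;

	inode = reiser4_iget(super, key, 0 /* not silent */);
	if (IS_ERR(inode))
		return inode;
	/* ... any extra per-inode setup goes here ... */
	reiser4_iget_complete(inode);
	return inode;
}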
17551+
17552+void reiser4_make_bad_inode(struct inode *inode)
17553+{
17554+ assert("nikita-1934", inode != NULL);
17555+
17556+ /* clear LOADED bit */
17557+ reiser4_inode_clr_flag(inode, REISER4_LOADED);
17558+ make_bad_inode(inode);
17559+ return;
17560+}
17561+
17562+file_plugin *inode_file_plugin(const struct inode * inode)
17563+{
17564+ assert("nikita-1997", inode != NULL);
17565+ return reiser4_inode_data(inode)->pset->file;
17566+}
17567+
17568+dir_plugin *inode_dir_plugin(const struct inode * inode)
17569+{
17570+ assert("nikita-1998", inode != NULL);
17571+ return reiser4_inode_data(inode)->pset->dir;
17572+}
17573+
17574+formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17575+{
17576+ assert("nikita-2000", inode != NULL);
17577+ return reiser4_inode_data(inode)->pset->formatting;
17578+}
17579+
17580+hash_plugin *inode_hash_plugin(const struct inode * inode)
17581+{
17582+ assert("nikita-2001", inode != NULL);
17583+ return reiser4_inode_data(inode)->pset->hash;
17584+}
17585+
17586+fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17587+{
17588+ assert("nikita-2001", inode != NULL);
17589+ return reiser4_inode_data(inode)->pset->fibration;
17590+}
17591+
17592+cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17593+{
17594+ assert("edward-36", inode != NULL);
17595+ return reiser4_inode_data(inode)->pset->cipher;
17596+}
17597+
17598+compression_plugin *inode_compression_plugin(const struct inode * inode)
17599+{
17600+ assert("edward-37", inode != NULL);
17601+ return reiser4_inode_data(inode)->pset->compression;
17602+}
17603+
17604+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17605+ inode)
17606+{
17607+ assert("edward-1330", inode != NULL);
17608+ return reiser4_inode_data(inode)->pset->compression_mode;
17609+}
17610+
17611+cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17612+{
17613+ assert("edward-1328", inode != NULL);
17614+ return reiser4_inode_data(inode)->pset->cluster;
17615+}
17616+
17617+file_plugin *inode_create_plugin(const struct inode * inode)
17618+{
17619+ assert("edward-1329", inode != NULL);
17620+ return reiser4_inode_data(inode)->pset->create;
17621+}
17622+
17623+digest_plugin *inode_digest_plugin(const struct inode * inode)
17624+{
17625+ assert("edward-86", inode != NULL);
17626+ return reiser4_inode_data(inode)->pset->digest;
17627+}
17628+
17629+item_plugin *inode_sd_plugin(const struct inode * inode)
17630+{
17631+ assert("vs-534", inode != NULL);
17632+ return reiser4_inode_data(inode)->pset->sd;
17633+}
17634+
17635+item_plugin *inode_dir_item_plugin(const struct inode * inode)
17636+{
17637+ assert("vs-534", inode != NULL);
17638+ return reiser4_inode_data(inode)->pset->dir_item;
17639+}
17640+
17641+file_plugin *child_create_plugin(const struct inode * inode)
17642+{
17643+ assert("edward-1329", inode != NULL);
17644+ return reiser4_inode_data(inode)->hset->create;
17645+}
17646+
17647+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17648+{
17649+ reiser4_inode *state;
17650+
17651+ assert("nikita-2716", inode != NULL);
17652+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
17653+ assert("nikita-3491", spin_inode_is_locked(inode));
17654+
17655+ state = reiser4_inode_data(inode);
17656+ state->extmask |= 1 << ext;
17657+ /* force re-calculation of stat-data length on next call to
17658+ update_sd(). */
17659+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17660+}
17661+
17662+void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17663+{
17664+ reiser4_inode *state;
17665+
17666+ assert("vpf-1926", inode != NULL);
17667+ assert("vpf-1927", ext < LAST_SD_EXTENSION);
17668+ assert("vpf-1928", spin_inode_is_locked(inode));
17669+
17670+ state = reiser4_inode_data(inode);
17671+ state->extmask &= ~(1 << ext);
17672+ /* force re-calculation of stat-data length on next call to
17673+ update_sd(). */
17674+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17675+}
17676+
17677+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17678+{
17679+ assert("edward-1287", inode != NULL);
17680+ if (!dscale_fit(old, new))
17681+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17682+ return;
17683+}
17684+
17685+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17686+{
17687+ assert("nikita-2875", inode != NULL);
17688+ spin_lock_inode(inode);
17689+ inode_check_scale_nolock(inode, old, new);
17690+ spin_unlock_inode(inode);
17691+}
17692+
17693+/*
17694+ * initialize the ->ordering field of the inode. This field defines how file
17695+ * stat-data and body are ordered within the tree with respect to other
17696+ * objects in the same parent directory.
17697+ */
17698+void
17699+init_inode_ordering(struct inode *inode,
17700+ reiser4_object_create_data * crd, int create)
17701+{
17702+ reiser4_key key;
17703+
17704+ if (create) {
17705+ struct inode *parent;
17706+
17707+ parent = crd->parent;
17708+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17709+ inode_dir_plugin(parent)->build_entry_key(parent,
17710+ &crd->dentry->d_name,
17711+ &key);
17712+ } else {
17713+ coord_t *coord;
17714+
17715+ coord = &reiser4_inode_data(inode)->sd_coord;
17716+ coord_clear_iplug(coord);
17717+ /* safe to use ->sd_coord, because node is under long term
17718+ * lock */
17719+ WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17720+ }
17721+
17722+ set_inode_ordering(inode, get_key_ordering(&key));
17723+}
17724+
17725+znode *inode_get_vroot(struct inode *inode)
17726+{
17727+ reiser4_block_nr blk;
17728+ znode *result;
17729+
17730+ spin_lock_inode(inode);
17731+ blk = reiser4_inode_data(inode)->vroot;
17732+ spin_unlock_inode(inode);
17733+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17734+ result = zlook(reiser4_tree_by_inode(inode), &blk);
17735+ else
17736+ result = NULL;
17737+ return result;
17738+}
17739+
17740+void inode_set_vroot(struct inode *inode, znode *vroot)
17741+{
17742+ spin_lock_inode(inode);
17743+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17744+ spin_unlock_inode(inode);
17745+}
17746+
17747+#if REISER4_DEBUG
17748+
17749+void reiser4_inode_invariant(const struct inode *inode)
17750+{
17751+ assert("nikita-3077", spin_inode_is_locked(inode));
17752+}
17753+
17754+int inode_has_no_jnodes(reiser4_inode * r4_inode)
17755+{
17756+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17757+ r4_inode->nr_jnodes == 0;
17758+}
17759+
17760+#endif
17761+
17762+/* 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise */
17763+/* FIXME: shouldn't it be dir plugin method? */
17764+int is_dir_empty(const struct inode *dir)
17765+{
17766+ assert("nikita-1976", dir != NULL);
17767+
17768+ /* rely on our method to maintain directory i_size being equal to the
17769+ number of entries. */
17770+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17771+}
17772+
17773+/* Make Linus happy.
17774+ Local variables:
17775+ c-indentation-style: "K&R"
17776+ mode-name: "LC"
17777+ c-basic-offset: 8
17778+ tab-width: 8
17779+ fill-column: 120
17780+ End:
17781+*/
17782diff -urN linux-2.6.20.orig/fs/reiser4/inode.h linux-2.6.20/fs/reiser4/inode.h
17783--- linux-2.6.20.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17784+++ linux-2.6.20/fs/reiser4/inode.h 2007-05-06 14:50:43.726984474 +0400
17785@@ -0,0 +1,438 @@
17786+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17787+
17788+/* Inode functions. */
17789+
17790+#if !defined( __REISER4_INODE_H__ )
17791+#define __REISER4_INODE_H__
17792+
17793+#include "forward.h"
17794+#include "debug.h"
17795+#include "key.h"
17796+#include "seal.h"
17797+#include "plugin/plugin.h"
17798+#include "plugin/file/cryptcompress.h"
17799+#include "plugin/file/file.h"
17800+#include "plugin/dir/dir.h"
17801+#include "plugin/plugin_set.h"
17802+#include "plugin/security/perm.h"
17803+#include "vfs_ops.h"
17804+#include "jnode.h"
17805+#include "fsdata.h"
17806+
17807+#include <linux/types.h> /* for __u?? , ino_t */
17808+#include <linux/fs.h> /* for struct super_block, struct
17809+ * rw_semaphore, etc */
17810+#include <linux/spinlock.h>
17811+#include <asm/types.h>
17812+
17813+/* reiser4-specific inode flags. They are "transient" and are not
17814+   supposed to be stored on disk. Used to track the "state" of the
17815+   inode.
17816+*/
17817+typedef enum {
17818+ /* this is light-weight inode, inheriting some state from its
17819+ parent */
17820+ REISER4_LIGHT_WEIGHT = 0,
17821+ /* stat data wasn't yet created */
17822+ REISER4_NO_SD = 1,
17823+ /* internal immutable flag. Currently is only used
17824+ to avoid race condition during file creation.
17825+ See comment in create_object(). */
17826+ REISER4_IMMUTABLE = 2,
17827+ /* inode was read from storage */
17828+ REISER4_LOADED = 3,
17829+ /* this bit is set for symlinks. inode->i_private points to target
17830+ name of symlink. */
17831+ REISER4_GENERIC_PTR_USED = 4,
17832+ /* set if size of stat-data item for this inode is known. If this is
17833+ * set we can avoid recalculating size of stat-data on each update. */
17834+ REISER4_SDLEN_KNOWN = 5,
17835+ /* reiser4_inode->crypt points to the crypto stat */
17836+ REISER4_CRYPTO_STAT_LOADED = 6,
17837+ /* cryptcompress_inode_data points to the secret key */
17838+ REISER4_SECRET_KEY_INSTALLED = 7,
17839+	/* File (possibly) has pages corresponding to the tail items that
17840+ * were created by ->readpage. It is set by mmap_unix_file() and
17841+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
17842+ * kill-hook of tail items. It is never cleared once set. This bit is
17843+ * modified and inspected under i_mutex. */
17844+ REISER4_HAS_MMAP = 8,
17845+ REISER4_PART_MIXED = 9,
17846+ REISER4_PART_IN_CONV = 10,
17847+ /* This flag indicates that file plugin conversion is in progress */
17848+ REISER4_FILE_CONV_IN_PROGRESS = 11
17849+} reiser4_file_plugin_flags;
17850+
17851+/* state associated with each inode.
17852+ reiser4 inode.
17853+
17854+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17855+ be of the same size. File-system allocates inodes by itself through
17856+ s_op->allocate_inode() method. So, it is possible to adjust size of inode
17857+ at the time of its creation.
17858+
17859+ Invariants involving parts of this data-type:
17860+
17861+ [inode->eflushed]
17862+
17863+*/
17864+
17865+typedef struct reiser4_inode reiser4_inode;
17866+/* return pointer to reiser4-specific part of inode */
17867+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17868+ /* inode queried */ );
17869+
17870+#if BITS_PER_LONG == 64
17871+
17872+#define REISER4_INO_IS_OID (1)
17873+typedef struct {;
17874+} oid_hi_t;
17875+
17876+/* BITS_PER_LONG == 64 */
17877+#else
17878+
17879+#define REISER4_INO_IS_OID (0)
17880+typedef __u32 oid_hi_t;
17881+
17882+/* BITS_PER_LONG == 64 */
17883+#endif
17884+
17885+struct reiser4_inode {
17886+ /* spin lock protecting fields of this structure. */
17887+ spinlock_t guard;
17888+ /* main plugin set that control the file
17889+ (see comments in plugin/plugin_set.c) */
17890+ plugin_set *pset;
17891+ /* plugin set for inheritance
17892+ (see comments in plugin/plugin_set.c) */
17893+ plugin_set *hset;
17894+ /* high 32 bits of object id */
17895+ oid_hi_t oid_hi;
17896+ /* seal for stat-data */
17897+ seal_t sd_seal;
17898+ /* locality id for this file */
17899+ oid_t locality_id;
17900+#if REISER4_LARGE_KEY
17901+ __u64 ordering;
17902+#endif
17903+ /* coord of stat-data in sealed node */
17904+ coord_t sd_coord;
17905+	/* bit-mask of stat-data extensions used by this file */
17906+ __u64 extmask;
17907+ /* bitmask of non-default plugins for this inode */
17908+ __u16 plugin_mask;
17909+ /* bitmask of set heir plugins for this inode. */
17910+ __u16 heir_mask;
17911+ union {
17912+ struct list_head readdir_list;
17913+ struct list_head not_used;
17914+ } lists;
17915+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17916+ unsigned long flags;
17917+ union {
17918+ /* fields specific to unix_file plugin */
17919+ unix_file_info_t unix_file_info;
17920+ /* fields specific to cryptcompress plugin */
17921+ cryptcompress_info_t cryptcompress_info;
17922+ } file_plugin_data;
17923+
17924+ /* this semaphore is to serialize readers and writers of @pset->file
17925+ * when file plugin conversion is enabled
17926+ */
17927+ struct rw_semaphore conv_sem;
17928+
17929+	/* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
17930+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
17931+ struct radix_tree_root jnodes_tree;
17932+#if REISER4_DEBUG
17933+ /* number of unformatted node jnodes of this file in jnode hash table */
17934+ unsigned long nr_jnodes;
17935+#endif
17936+
17937+ /* block number of virtual root for this object. See comment above
17938+ * fs/reiser4/search.c:handle_vroot() */
17939+ reiser4_block_nr vroot;
17940+ struct mutex loading;
17941+};
17942+
17943+void loading_init_once(reiser4_inode *);
17944+void loading_alloc(reiser4_inode *);
17945+void loading_destroy(reiser4_inode *);
17946+
17947+typedef struct reiser4_inode_object {
17948+ /* private part */
17949+ reiser4_inode p;
17950+ /* generic fields not specific to reiser4, but used by VFS */
17951+ struct inode vfs_inode;
17952+} reiser4_inode_object;
17953+
17954+/* return pointer to the reiser4 specific portion of @inode */
17955+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17956+ /* inode queried */ )
17957+{
17958+ assert("nikita-254", inode != NULL);
17959+ return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
17960+}
17961+
17962+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
17963+ r4_inode /* inode queried */
17964+ )
17965+{
17966+ return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
17967+}
17968+
17969+/*
17970+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
17971+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
17972+ * bits.
17973+ *
17974+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
17975+ * of inode, otherwise whole oid is stored in i_ino.
17976+ *
17977+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
17978+ */
17979+
17980+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
17981+
17982+#if REISER4_INO_IS_OID
17983+
17984+static inline oid_t get_inode_oid(const struct inode *inode)
17985+{
17986+ return inode->i_ino;
17987+}
17988+
17989+static inline void set_inode_oid(struct inode *inode, oid_t oid)
17990+{
17991+ inode->i_ino = oid;
17992+}
17993+
17994+/* REISER4_INO_IS_OID */
17995+#else
17996+
17997+static inline oid_t get_inode_oid(const struct inode *inode)
17998+{
17999+ return
18000+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18001+ inode->i_ino;
18002+}
18003+
18004+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18005+{
18006+ assert("nikita-2519", inode != NULL);
18007+ inode->i_ino = (ino_t) (oid);
18008+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18009+ assert("nikita-2521", get_inode_oid(inode) == (oid));
18010+}
18011+
18012+/* REISER4_INO_IS_OID */
18013+#endif
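A user-space check of the split performed by the 32-bit variant above
(OID_HI_SHIFT is 32 when ino_t is 32 bits; the oid value is hypothetical;
illustration only):

#include <stdio.h>

typedef unsigned int ino32_t;		/* stand-in for 32-bit ino_t */
typedef unsigned long long oid_t;

#define OID_HI_SHIFT 32			/* sizeof(ino_t) * 8 */

int main(void)
{
	oid_t oid = 0x0000000500000001ULL;	/* hypothetical oid */
	ino32_t i_ino = (ino32_t)oid;		/* low 32 bits -> i_ino */
	unsigned int oid_hi = (unsigned int)(oid >> OID_HI_SHIFT);
	oid_t back = ((oid_t)oid_hi << OID_HI_SHIFT) | i_ino;

	printf("i_ino=0x%x oid_hi=0x%x roundtrip %s\n", i_ino, oid_hi,
	       back == oid ? "ok" : "broken");
	return 0;
}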
18014+
18015+static inline oid_t get_inode_locality(const struct inode *inode)
18016+{
18017+ return reiser4_inode_data(inode)->locality_id;
18018+}
18019+
18020+#if REISER4_LARGE_KEY
18021+static inline __u64 get_inode_ordering(const struct inode *inode)
18022+{
18023+ return reiser4_inode_data(inode)->ordering;
18024+}
18025+
18026+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18027+{
18028+ reiser4_inode_data(inode)->ordering = ordering;
18029+}
18030+
18031+#else
18032+
18033+#define get_inode_ordering(inode) (0)
18034+#define set_inode_ordering(inode, val) noop
18035+
18036+#endif
18037+
18038+/* return inode in which @uf_info is embedded */
18039+static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18040+ uf_info)
18041+{
18042+ return &container_of(uf_info, reiser4_inode_object,
18043+ p.file_plugin_data.unix_file_info)->vfs_inode;
18044+}
18045+
18046+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18047+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18048+
18049+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18050+
18051+#if REISER4_DEBUG
18052+extern void reiser4_inode_invariant(const struct inode *inode);
18053+extern int inode_has_no_jnodes(reiser4_inode *);
18054+#else
18055+#define reiser4_inode_invariant(inode) noop
18056+#endif
18057+
18058+static inline int spin_inode_is_locked(const struct inode *inode)
18059+{
18060+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18061+ return 1;
18062+}
18063+
18064+/**
18065+ * spin_lock_inode - lock reiser4_inode's embedded spinlock
18066+ * @inode: inode to lock
18067+ *
18068+ * In debug mode it checks that lower priority locks are not held and
18069+ * increments reiser4_context's lock counters on which lock ordering checking
18070+ * is based.
18071+ */
18072+static inline void spin_lock_inode(struct inode *inode)
18073+{
18074+ assert("", LOCK_CNT_NIL(spin_locked));
18075+ /* check lock ordering */
18076+ assert_spin_not_locked(&d_lock);
18077+
18078+ spin_lock(&reiser4_inode_data(inode)->guard);
18079+
18080+ LOCK_CNT_INC(spin_locked_inode);
18081+ LOCK_CNT_INC(spin_locked);
18082+
18083+ reiser4_inode_invariant(inode);
18084+}
18085+
18086+/**
18087+ * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18088+ * @inode: inode to unlock
18089+ *
18090+ * In debug mode it checks that spinlock is held and decrements
18091+ * reiser4_context's lock counters on which lock ordering checking is based.
18092+ */
18093+static inline void spin_unlock_inode(struct inode *inode)
18094+{
18095+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18096+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18097+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18098+
18099+ reiser4_inode_invariant(inode);
18100+
18101+ LOCK_CNT_DEC(spin_locked_inode);
18102+ LOCK_CNT_DEC(spin_locked);
18103+
18104+ spin_unlock(&reiser4_inode_data(inode)->guard);
18105+}
18106+
18107+extern znode *inode_get_vroot(struct inode *inode);
18108+extern void inode_set_vroot(struct inode *inode, znode * vroot);
18109+
18110+extern int reiser4_max_filename_len(const struct inode *inode);
18111+extern int max_hash_collisions(const struct inode *dir);
18112+extern void reiser4_unlock_inode(struct inode *inode);
18113+extern int is_reiser4_inode(const struct inode *inode);
18114+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18115+extern struct inode *reiser4_iget(struct super_block *super,
18116+ const reiser4_key * key, int silent);
18117+extern void reiser4_iget_complete(struct inode *inode);
18118+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18119+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18120+extern int reiser4_inode_get_flag(const struct inode *inode,
18121+ reiser4_file_plugin_flags f);
18122+
18123+/* has inode been initialized? */
18124+static inline int
18125+is_inode_loaded(const struct inode *inode /* inode queried */ )
18126+{
18127+ assert("nikita-1120", inode != NULL);
18128+ return reiser4_inode_get_flag(inode, REISER4_LOADED);
18129+}
18130+
18131+extern file_plugin *inode_file_plugin(const struct inode *inode);
18132+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18133+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18134+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18135+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18136+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18137+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18138+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18139+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18140+ *inode);
18141+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18142+extern file_plugin *inode_create_plugin(const struct inode *inode);
18143+extern item_plugin *inode_sd_plugin(const struct inode *inode);
18144+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18145+extern file_plugin *child_create_plugin(const struct inode *inode);
18146+
18147+extern void reiser4_make_bad_inode(struct inode *inode);
18148+
18149+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18150+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18151+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18152+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18153+
18154+/*
18155+ * update field @field in inode @i to contain value @value.
18156+ */
18157+#define INODE_SET_FIELD(i, field, value) \
18158+({ \
18159+ struct inode *__i; \
18160+ typeof(value) __v; \
18161+ \
18162+ __i = (i); \
18163+ __v = (value); \
18164+ inode_check_scale(__i, __i->field, __v); \
18165+ __i->field = __v; \
18166+})
18167+
18168+#define INODE_INC_FIELD(i, field) \
18169+({ \
18170+ struct inode *__i; \
18171+ \
18172+ __i = (i); \
18173+ inode_check_scale(__i, __i->field, __i->field + 1); \
18174+ ++ __i->field; \
18175+})
18176+
18177+#define INODE_DEC_FIELD(i, field) \
18178+({ \
18179+ struct inode *__i; \
18180+ \
18181+ __i = (i); \
18182+ inode_check_scale(__i, __i->field, __i->field - 1); \
18183+ -- __i->field; \
18184+})
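+
+/*
+ * Usage sketch (a hypothetical caller, not from the original patch): these
+ * macros funnel inode field updates through inode_check_scale(), e.g.
+ *
+ * INODE_SET_FIELD(inode, i_size, new_size);
+ * INODE_INC_FIELD(inode, i_nlink);
+ */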
18185+
18186+/* See comment before reiser4_readdir_common() for description. */
18187+static inline struct list_head *get_readdir_list(const struct inode *inode)
18188+{
18189+ return &reiser4_inode_data(inode)->lists.readdir_list;
18190+}
18191+
18192+extern void init_inode_ordering(struct inode *inode,
18193+ reiser4_object_create_data * crd, int create);
18194+
18195+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18196+{
18197+ return &reiser4_inode_data(inode)->jnodes_tree;
18198+}
18199+
18200+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18201+ * r4_inode)
18202+{
18203+ return &r4_inode->jnodes_tree;
18204+}
18205+
18206+#if REISER4_DEBUG
18207+extern void print_inode(const char *prefix, const struct inode *i);
18208+#endif
18209+
18210+int is_dir_empty(const struct inode *);
18211+
18212+/* __REISER4_INODE_H__ */
18213+#endif
18214+
18215+/* Make Linus happy.
18216+ Local variables:
18217+ c-indentation-style: "K&R"
18218+ mode-name: "LC"
18219+ c-basic-offset: 8
18220+ tab-width: 8
18221+ fill-column: 120
18222+ End:
18223+*/
18224diff -urN linux-2.6.20.orig/fs/reiser4/ioctl.h linux-2.6.20/fs/reiser4/ioctl.h
18225--- linux-2.6.20.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18226+++ linux-2.6.20/fs/reiser4/ioctl.h 2007-05-06 14:50:43.726984474 +0400
18227@@ -0,0 +1,41 @@
18228+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18229+ * reiser4/README */
18230+
18231+#if !defined( __REISER4_IOCTL_H__ )
18232+#define __REISER4_IOCTL_H__
18233+
18234+#include <linux/fs.h>
18235+
18236+/*
18237+ * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18238+ * extents and fix it in this state. This is used by applications that rely on
18239+ *
18240+ * . files being block aligned, and
18241+ *
18242+ * . files never migrating on disk
18243+ *
18244+ * for example, boot loaders (LILO) need this.
18245+ *
18246+ * This ioctl should be used as
18247+ *
18248+ * result = ioctl(fd, REISER4_IOC_UNPACK);
18249+ *
18250+ * The file behind the fd descriptor will be converted to extents (if
18251+ * necessary), and its stat-data will be updated so that it will never be
18252+ * converted back into tails again.
18253+ */
18254+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
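+
+/*
+ * Minimal user-space sketch (an illustration, not from the original patch;
+ * error handling is elided and "/boot/vmlinuz" is a hypothetical path):
+ *
+ * int fd = open("/boot/vmlinuz", O_RDWR);
+ * if (fd >= 0 && ioctl(fd, REISER4_IOC_UNPACK) != 0)
+ * perror("REISER4_IOC_UNPACK");
+ */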
18255+
18256+/* __REISER4_IOCTL_H__ */
18257+#endif
18258+
18259+/* Make Linus happy.
18260+ Local variables:
18261+ c-indentation-style: "K&R"
18262+ mode-name: "LC"
18263+ c-basic-offset: 8
18264+ tab-width: 8
18265+ fill-column: 120
18266+ scroll-step: 1
18267+ End:
18268+*/
18269diff -urN linux-2.6.20.orig/fs/reiser4/jnode.c linux-2.6.20/fs/reiser4/jnode.c
18270--- linux-2.6.20.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18271+++ linux-2.6.20/fs/reiser4/jnode.c 2007-05-06 14:50:43.730985723 +0400
18272@@ -0,0 +1,1925 @@
18273+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18274+ * reiser4/README */
18275+/* Jnode manipulation functions. */
18276+/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
18277+
18278+ In particular, jnodes are used to track transactional information
18279+ associated with each block. Each znode contains a jnode as its ->zjnode field.
18280+
18281+ Jnode stands for either Josh or Journal node.
18282+*/
18283+
18284+/*
18285+ * Taxonomy.
18286+ *
18287+ * Jnode represents block containing data or meta-data. There are jnodes
18288+ * for:
18289+ *
18290+ * unformatted blocks (jnodes proper). There are plans, however, to
18291+ * have a handle per extent unit rather than one for each unformatted
18292+ * block, because there are so many of them.
18293+ *
18294+ * For bitmaps. Each bitmap is actually represented by two jnodes--one
18295+ * for working data and another for "commit" data, together forming a bnode.
18296+ *
18297+ * For io-heads. These are used by log writer.
18298+ *
18299+ * For formatted nodes (znode). See comment at the top of znode.c for
18300+ * details specific to the formatted nodes (znodes).
18301+ *
18302+ * Node data.
18303+ *
18304+ * Jnode provides access to the data of the node it represents. Data are
18305+ * stored in a page, and the page is kept in the page cache. This means that
18306+ * jnodes are highly interconnected with page cache and VM internals.
18307+ *
18308+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
18309+ * themselves is cached in ->data field to avoid frequent calls to
18310+ * page_address().
18311+ *
18312+ * jnode and page are attached to each other by jnode_attach_page(). This
18313+ * function places pointer to jnode in set_page_private(), sets PG_private
18314+ * flag and increments page counter.
18315+ *
18316+ * Opposite operation is performed by page_clear_jnode().
18317+ *
18318+ * jnode->pg is protected by jnode spin lock, and page->private is
18319+ * protected by page lock. See comment at the top of page_cache.c for
18320+ * more.
18321+ *
18322+ * page can be detached from jnode for two reasons:
18323+ *
18324+ * . jnode is removed from a tree (file is truncated, or a formatted
18325+ * node is removed by balancing).
18326+ *
18327+ * . during memory pressure, VM calls ->releasepage() method
18328+ * (reiser4_releasepage()) to evict page from memory.
18329+ *
18330+ * (there is, of course, also umount, but this is a special case we are not
18331+ * concerned with here).
18332+ *
18333+ * To protect jnode page from eviction, one calls jload() function that
18334+ * "pins" page in memory (loading it if necessary), increments
18335+ * jnode->d_count, and kmap()s page. Page is unpinned through call to
18336+ * jrelse().
18337+ *
18338+ * Jnode life cycle.
18339+ *
18340+ * jnode is created, placed in hash table, and, optionally, in per-inode
18341+ * radix tree. Page can be attached to jnode, pinned, released, etc.
18342+ *
18343+ * When jnode is captured into atom its reference counter is
18344+ * increased. While being part of an atom, jnode can be "early
18345+ * flushed". This means that as part of flush procedure, jnode is placed
18346+ * into "relocate set", and its page is submitted to the disk. After io
18347+ * completes, page can be detached, then loaded again, re-dirtied, etc.
18348+ *
18349+ * A thread acquires a reference to a jnode by calling jref() and releases it
18350+ * by jput(). When the last reference is removed, jnode is still retained in
18351+ * memory (cached) if it has page attached, _unless_ it is scheduled for
18352+ * destruction (has JNODE_HEARD_BANSHEE bit set).
18353+ *
18354+ * Tree read-write lock was used as "existential" lock for jnodes. That is,
18355+ * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18356+ * that is, tree lock protected unreferenced jnodes stored in the hash
18357+ * table, from recycling.
18358+ *
18359+ * This resulted in high contention on the tree lock, because jref()/jput()
18360+ * is a frequent operation. To ameliorate this problem, RCU is used: when
18361+ * jput() is just about to release the last reference on a jnode, it sets
18362+ * the JNODE_RIP bit on it and then proceeds with destruction (removing the jnode from the hash
18363+ * table, cbk_cache, detaching page, etc.). All places that change jnode
18364+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18365+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18366+ * jnode_rip_check() function), and pretend that nothing was found in hash
18367+ * table if bit is set.
18368+ *
18369+ * jput defers the actual return of the jnode to the slab cache to some later
18370+ * time (via call_rcu()); this guarantees that other threads can safely
18371+ * continue working with a JNODE_RIP-ped jnode.
18372+ *
18373+ */
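+
+/*
+ * A condensed sketch of the lookup side of this scheme (an editor's
+ * summary; jlookup() below implements it for real):
+ *
+ * rcu_read_lock();
+ * node = j_hash_find(table, &key);
+ * if (node != NULL) {
+ * jref(node); (the 0 -> 1 transition)
+ * node = jnode_rip_check(tree, node); (NULL if JNODE_RIP is set)
+ * }
+ * rcu_read_unlock();
+ */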
18374+
18375+#include "reiser4.h"
18376+#include "debug.h"
18377+#include "dformat.h"
18378+#include "jnode.h"
18379+#include "plugin/plugin_header.h"
18380+#include "plugin/plugin.h"
18381+#include "txnmgr.h"
18383+#include "znode.h"
18384+#include "tree.h"
18385+#include "tree_walk.h"
18386+#include "super.h"
18387+#include "inode.h"
18388+#include "page_cache.h"
18389+
18390+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18391+#include <linux/types.h>
18392+#include <linux/slab.h>
18393+#include <linux/pagemap.h>
18394+#include <linux/swap.h>
18395+#include <linux/fs.h> /* for struct address_space */
18396+#include <linux/writeback.h> /* for inode_lock */
18397+
18398+static struct kmem_cache *_jnode_slab = NULL;
18399+
18400+static void jnode_set_type(jnode * node, jnode_type type);
18401+static int jdelete(jnode * node);
18402+static int jnode_try_drop(jnode * node);
18403+
18404+#if REISER4_DEBUG
18405+static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18406+#endif
18407+
18408+/* true if valid page is attached to jnode */
18409+static inline int jnode_is_parsed(jnode * node)
18410+{
18411+ return JF_ISSET(node, JNODE_PARSED);
18412+}
18413+
18414+/* hash table support */
18415+
18416+/* compare two jnode keys for equality. Used by hash-table macros */
18417+static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
18418+{
18419+ assert("nikita-2350", k1 != NULL);
18420+ assert("nikita-2351", k2 != NULL);
18421+
18422+ return (k1->index == k2->index && k1->objectid == k2->objectid);
18423+}
18424+
18425+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18426+static inline __u32
18427+jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
18428+{
18429+ assert("nikita-2352", key != NULL);
18430+ assert("nikita-3346", IS_POW(table->_buckets));
18431+
18432+ /* yes, this is a remarkably simple (if not stupid) hash function. */
18433+ return (key->objectid + key->index) & (table->_buckets - 1);
18434+}
18435+
18436+/* The hash table definition */
18437+#define KMALLOC(size) reiser4_vmalloc(size)
18438+#define KFREE(ptr, size) vfree(ptr)
18439+TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
18440+ jnode_key_eq);
18441+#undef KFREE
18442+#undef KMALLOC
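+
+/*
+ * TYPE_SAFE_HASH_DEFINE() above generates the j_hash_* API used in the
+ * rest of this file, roughly (signatures are a sketch inferred from the
+ * call sites below, not quoted from the hash-table header):
+ *
+ * int j_hash_init(j_hash_table *table, __u32 buckets);
+ * void j_hash_done(j_hash_table *table);
+ * jnode *j_hash_find(j_hash_table *table, const jnode_key_t *key);
+ * void j_hash_insert_rcu(j_hash_table *table, jnode *node);
+ * void j_hash_remove_rcu(j_hash_table *table, jnode *node);
+ */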
18443+
18444+/* call this to initialise jnode hash table */
18445+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18446+{
18447+ assert("nikita-2359", tree != NULL);
18448+ return j_hash_init(&tree->jhash_table, 16384);
18449+}
18450+
18451+/* call this to destroy jnode hash table. This is called during umount. */
18452+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18453+{
18454+ j_hash_table *jtable;
18455+ jnode *node;
18456+ jnode *next;
18457+
18458+ assert("nikita-2360", tree != NULL);
18459+
18460+ /*
18461+ * Scan hash table and free all jnodes.
18462+ */
18463+ jtable = &tree->jhash_table;
18464+ if (jtable->_table) {
18465+ for_all_in_htable(jtable, j, node, next) {
18466+ assert("nikita-2361", !atomic_read(&node->x_count));
18467+ jdrop(node);
18468+ }
18469+
18470+ j_hash_done(&tree->jhash_table);
18471+ }
18472+ return 0;
18473+}
18474+
18475+/**
18476+ * init_jnodes - create jnode cache
18477+ *
18478+ * Initializes slab cache jnodes. It is part of reiser4 module initialization.
18479+ */
18480+int init_jnodes(void)
18481+{
18482+ assert("umka-168", _jnode_slab == NULL);
18483+
18484+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18485+ SLAB_HWCACHE_ALIGN |
18486+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
18487+ if (_jnode_slab == NULL)
18488+ return RETERR(-ENOMEM);
18489+
18490+ return 0;
18491+}
18492+
18493+/**
18494+ * done_jnodes - delete jnode cache
18495+ *
18496+ * This is called on reiser4 module unloading or system shutdown.
18497+ */
18498+void done_jnodes(void)
18499+{
18500+ destroy_reiser4_cache(&_jnode_slab);
18501+}
18502+
18503+/* Initialize a jnode. */
18504+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18505+{
18506+ assert("umka-175", node != NULL);
18507+
18508+ memset(node, 0, sizeof(jnode));
18509+ ON_DEBUG(node->magic = JMAGIC);
18510+ jnode_set_type(node, type);
18511+ atomic_set(&node->d_count, 0);
18512+ atomic_set(&node->x_count, 0);
18513+ spin_lock_init(&node->guard);
18514+ spin_lock_init(&node->load);
18515+ node->atom = NULL;
18516+ node->tree = tree;
18517+ INIT_LIST_HEAD(&node->capture_link);
18518+
18519+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18520+
18521+ INIT_RCU_HEAD(&node->rcu);
18522+
18523+#if REISER4_DEBUG
18524+ {
18525+ reiser4_super_info_data *sbinfo;
18526+
18527+ sbinfo = get_super_private(tree->super);
18528+ spin_lock_irq(&sbinfo->all_guard);
18529+ list_add(&node->jnodes, &sbinfo->all_jnodes);
18530+ spin_unlock_irq(&sbinfo->all_guard);
18531+ }
18532+#endif
18533+}
18534+
18535+#if REISER4_DEBUG
18536+/*
18537+ * Remove jnode from ->all_jnodes list.
18538+ */
18539+static void jnode_done(jnode * node, reiser4_tree * tree)
18540+{
18541+ reiser4_super_info_data *sbinfo;
18542+
18543+ sbinfo = get_super_private(tree->super);
18544+
18545+ spin_lock_irq(&sbinfo->all_guard);
18546+ assert("nikita-2422", !list_empty(&node->jnodes));
18547+ list_del_init(&node->jnodes);
18548+ spin_unlock_irq(&sbinfo->all_guard);
18549+}
18550+#endif
18551+
18552+/* return already existing jnode of page */
18553+jnode *jnode_by_page(struct page *pg)
18554+{
18555+ assert("nikita-2066", pg != NULL);
18556+ assert("nikita-2400", PageLocked(pg));
18557+ assert("nikita-2068", PagePrivate(pg));
18558+ assert("nikita-2067", jprivate(pg) != NULL);
18559+ return jprivate(pg);
18560+}
18561+
18562+/* exported functions to allocate/free jnode objects outside this file */
18563+jnode *jalloc(void)
18564+{
18565+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18566+ return jal;
18567+}
18568+
18569+/* return jnode back to the slab allocator */
18570+inline void jfree(jnode * node)
18571+{
18572+ assert("zam-449", node != NULL);
18573+
18574+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18575+ NODE_LIST(node) == NOT_CAPTURED));
18576+ assert("nikita-3222", list_empty(&node->jnodes));
18577+ assert("nikita-3221", jnode_page(node) == NULL);
18578+
18579+ /* not yet phash_jnode_destroy(node); */
18580+
18581+ kmem_cache_free(_jnode_slab, node);
18582+}
18583+
18584+/*
18585+ * This function is supplied as an RCU callback. It actually frees the jnode
18586+ * when the last reference to it is gone.
18587+ */
18588+static void jnode_free_actor(struct rcu_head *head)
18589+{
18590+ jnode *node;
18591+ jnode_type jtype;
18592+
18593+ node = container_of(head, jnode, rcu);
18594+ jtype = jnode_get_type(node);
18595+
18596+ ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18597+
18598+ switch (jtype) {
18599+ case JNODE_IO_HEAD:
18600+ case JNODE_BITMAP:
18601+ case JNODE_UNFORMATTED_BLOCK:
18602+ jfree(node);
18603+ break;
18604+ case JNODE_FORMATTED_BLOCK:
18605+ zfree(JZNODE(node));
18606+ break;
18607+ case JNODE_INODE:
18608+ default:
18609+ wrong_return_value("nikita-3197", "Wrong jnode type");
18610+ }
18611+}
18612+
18613+/*
18614+ * Free a jnode. Post a callback to be executed later through RCU when all
18615+ * references to @node are released.
18616+ */
18617+static inline void jnode_free(jnode * node, jnode_type jtype)
18618+{
18619+ if (jtype != JNODE_INODE) {
18620+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18621+ call_rcu(&node->rcu, jnode_free_actor);
18622+ } else
18623+ jnode_list_remove(node);
18624+}
18625+
18626+/* allocate new unformatted jnode */
18627+static jnode *jnew_unformatted(void)
18628+{
18629+ jnode *jal;
18630+
18631+ jal = jalloc();
18632+ if (jal == NULL)
18633+ return NULL;
18634+
18635+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18636+ jal->key.j.mapping = NULL;
18637+ jal->key.j.index = (unsigned long)-1;
18638+ jal->key.j.objectid = 0;
18639+ return jal;
18640+}
18641+
18642+/* look for jnode with given mapping and offset within hash table */
18643+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18644+{
18645+ jnode_key_t jkey;
18646+ jnode *node;
18647+
18648+ assert("nikita-2353", tree != NULL);
18649+
18650+ jkey.objectid = objectid;
18651+ jkey.index = index;
18652+
18653+ /*
18654+ * hash table is _not_ protected by any lock during lookups. All we
18655+ * have to do is to disable preemption to keep RCU happy.
18656+ */
18657+
18658+ rcu_read_lock();
18659+ node = j_hash_find(&tree->jhash_table, &jkey);
18660+ if (node != NULL) {
18661+ /* protect @node from recycling */
18662+ jref(node);
18663+ assert("nikita-2955", jnode_invariant(node, 0, 0));
18664+ node = jnode_rip_check(tree, node);
18665+ }
18666+ rcu_read_unlock();
18667+ return node;
18668+}
18669+
18670+/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18671+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18672+{
18673+ assert("vs-1694", mapping->host != NULL);
18674+
18675+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18676+}
18677+
18678+jnode *jfind(struct address_space * mapping, unsigned long index)
18679+{
18680+ reiser4_tree *tree;
18681+ jnode *node;
18682+
18683+ assert("vs-1694", mapping->host != NULL);
18684+ tree = reiser4_tree_by_inode(mapping->host);
18685+
18686+ read_lock_tree(tree);
18687+ node = jfind_nolock(mapping, index);
18688+ if (node != NULL)
18689+ jref(node);
18690+ read_unlock_tree(tree);
18691+ return node;
18692+}
18693+
18694+static void inode_attach_jnode(jnode * node)
18695+{
18696+ struct inode *inode;
18697+ reiser4_inode *info;
18698+ struct radix_tree_root *rtree;
18699+
18700+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18701+ assert("zam-1043", node->key.j.mapping != NULL);
18702+ inode = node->key.j.mapping->host;
18703+ info = reiser4_inode_data(inode);
18704+ rtree = jnode_tree_by_reiser4_inode(info);
18705+ if (rtree->rnode == NULL) {
18706+ /* prevent inode from being pruned when it has jnodes attached
18707+ to it */
18708+ write_lock_irq(&inode->i_data.tree_lock);
18709+ inode->i_data.nrpages++;
18710+ write_unlock_irq(&inode->i_data.tree_lock);
18711+ }
18712+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18713+ check_me("zam-1045",
18714+ !radix_tree_insert(rtree, node->key.j.index, node));
18715+ ON_DEBUG(info->nr_jnodes++);
18716+}
18717+
18718+static void inode_detach_jnode(jnode * node)
18719+{
18720+ struct inode *inode;
18721+ reiser4_inode *info;
18722+ struct radix_tree_root *rtree;
18723+
18724+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18725+ assert("zam-1044", node->key.j.mapping != NULL);
18726+ inode = node->key.j.mapping->host;
18727+ info = reiser4_inode_data(inode);
18728+ rtree = jnode_tree_by_reiser4_inode(info);
18729+
18730+ assert("zam-1051", info->nr_jnodes != 0);
18731+ assert("zam-1052", rtree->rnode != NULL);
18732+ ON_DEBUG(info->nr_jnodes--);
18733+
18734+ /* delete jnode from inode's radix tree of jnodes */
18735+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18736+ if (rtree->rnode == NULL) {
18737+ /* inode can be pruned now */
18738+ write_lock_irq(&inode->i_data.tree_lock);
18739+ inode->i_data.nrpages--;
18740+ write_unlock_irq(&inode->i_data.tree_lock);
18741+ }
18742+}
18743+
18744+/* put jnode into the hash table (where it can be found by flush, which does
18745+ not know the mapping) and into the inode's tree of jnodes (where it can be
18746+ found, hopefully faster, in places where the mapping is known). Currently
18747+ this is used by fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode
18748+ when a new jnode is created */
18749+static void
18750+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18751+ unsigned long index)
18752+{
18753+ j_hash_table *jtable;
18754+
18755+ assert("vs-1446", jnode_is_unformatted(node));
18756+ assert("vs-1442", node->key.j.mapping == 0);
18757+ assert("vs-1443", node->key.j.objectid == 0);
18758+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
18759+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18760+
18761+ node->key.j.mapping = mapping;
18762+ node->key.j.objectid = get_inode_oid(mapping->host);
18763+ node->key.j.index = index;
18764+
18765+ jtable = &jnode_get_tree(node)->jhash_table;
18766+
18767+ /* race with some other thread inserting jnode into the hash table is
18768+ * impossible, because we keep the page lock. */
18769+ /*
18770+ * following assertion no longer holds because of RCU: it is possible
18771+ * jnode is in the hash table, but with JNODE_RIP bit set.
18772+ */
18773+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18774+ j_hash_insert_rcu(jtable, node);
18775+ inode_attach_jnode(node);
18776+}
18777+
18778+static void unhash_unformatted_node_nolock(jnode * node)
18779+{
18780+ assert("vs-1683", node->key.j.mapping != NULL);
18781+ assert("vs-1684",
18782+ node->key.j.objectid ==
18783+ get_inode_oid(node->key.j.mapping->host));
18784+
18785+ /* remove jnode from hash-table */
18786+ j_hash_remove_rcu(&node->tree->jhash_table, node);
18787+ inode_detach_jnode(node);
18788+ node->key.j.mapping = NULL;
18789+ node->key.j.index = (unsigned long)-1;
18790+ node->key.j.objectid = 0;
18791+
18792+}
18793+
18794+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18795+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18796+ reiser4_uncapture_jnode */
18797+void unhash_unformatted_jnode(jnode * node)
18798+{
18799+ assert("vs-1445", jnode_is_unformatted(node));
18800+
18801+ write_lock_tree(node->tree);
18802+ unhash_unformatted_node_nolock(node);
18803+ write_unlock_tree(node->tree);
18804+}
18805+
18806+/*
18807+ * search hash table for a jnode with given oid and index. If not found,
18808+ * allocate new jnode, insert it, and also insert into radix tree for the
18809+ * given inode/mapping.
18810+ */
18811+static jnode *find_get_jnode(reiser4_tree * tree,
18812+ struct address_space *mapping,
18813+ oid_t oid, unsigned long index)
18814+{
18815+ jnode *result;
18816+ jnode *shadow;
18817+ int preload;
18818+
18819+ result = jnew_unformatted();
18820+
18821+ if (unlikely(result == NULL))
18822+ return ERR_PTR(RETERR(-ENOMEM));
18823+
18824+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18825+ if (preload != 0) {
+ /* don't leak the jnode allocated above */
+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18826+ return ERR_PTR(preload);
+ }
18827+
18828+ write_lock_tree(tree);
18829+ shadow = jfind_nolock(mapping, index);
18830+ if (likely(shadow == NULL)) {
18831+ /* add new jnode to hash table and inode's radix tree of jnodes */
18832+ jref(result);
18833+ hash_unformatted_jnode(result, mapping, index);
18834+ } else {
18835+ /* jnode is found in inode's radix tree of jnodes */
18836+ jref(shadow);
18837+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18838+ assert("vs-1498", shadow->key.j.mapping == mapping);
18839+ result = shadow;
18840+ }
18841+ write_unlock_tree(tree);
18842+
18843+ assert("nikita-2955",
18844+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
18845+ radix_tree_preload_end();
18846+ return result;
18847+}
18848+
18849+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18850+ creates) jnode corresponding to page @pg. jnode is attached to page and
18851+ inserted into jnode hash-table. */
18852+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18853+{
18854+ /*
18855+ * There are two ways to create a jnode: starting with a pre-existing page,
18856+ * or without a page.
18857+ *
18858+ * When page already exists, jnode is created
18859+ * (jnode_of_page()->do_jget()) under page lock. This is done in
18860+ * ->writepage(), or when capturing anonymous page dirtied through
18861+ * mmap.
18862+ *
18863+ * Jnode without page is created by index_extent_jnode().
18864+ *
18865+ */
18866+
18867+ jnode *result;
18868+ oid_t oid = get_inode_oid(pg->mapping->host);
18869+
18870+ assert("umka-176", pg != NULL);
18871+ assert("nikita-2394", PageLocked(pg));
18872+
18873+ result = jprivate(pg);
18874+ if (likely(result != NULL))
18875+ return jref(result);
18876+
18877+ tree = reiser4_tree_by_page(pg);
18878+
18879+ /* check hash-table first */
18880+ result = jfind(pg->mapping, pg->index);
18881+ if (unlikely(result != NULL)) {
18882+ spin_lock_jnode(result);
18883+ jnode_attach_page(result, pg);
18884+ spin_unlock_jnode(result);
18885+ result->key.j.mapping = pg->mapping;
18886+ return result;
18887+ }
18888+
18889+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18890+ reiser4_ctx_gfp_mask_force(GFP_NOFS);
18891+ result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18892+ if (unlikely(IS_ERR(result)))
18893+ return result;
18894+ /* attach jnode to page */
18895+ spin_lock_jnode(result);
18896+ jnode_attach_page(result, pg);
18897+ spin_unlock_jnode(result);
18898+ return result;
18899+}
18900+
18901+/*
18902+ * return jnode for @pg, creating it if necessary.
18903+ */
18904+jnode *jnode_of_page(struct page * pg)
18905+{
18906+ jnode *result;
18907+
18908+ assert("umka-176", pg != NULL);
18909+ assert("nikita-2394", PageLocked(pg));
18910+
18911+ result = do_jget(reiser4_tree_by_page(pg), pg);
18912+
18913+ if (REISER4_DEBUG && !IS_ERR(result)) {
18914+ assert("nikita-3210", result == jprivate(pg));
18915+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18916+ if (jnode_is_unformatted(jprivate(pg))) {
18917+ assert("nikita-2364",
18918+ jprivate(pg)->key.j.index == pg->index);
18919+ assert("nikita-2367",
18920+ jprivate(pg)->key.j.mapping == pg->mapping);
18921+ assert("nikita-2365",
18922+ jprivate(pg)->key.j.objectid ==
18923+ get_inode_oid(pg->mapping->host));
18924+ assert("vs-1200",
18925+ jprivate(pg)->key.j.objectid ==
18926+ pg->mapping->host->i_ino);
18927+ assert("nikita-2356",
18928+ jnode_is_unformatted(jnode_by_page(pg)));
18929+ }
18930+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
18931+ }
18932+ return result;
18933+}
18934+
18935+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
18936+ * page.*/
18937+void jnode_attach_page(jnode * node, struct page *pg)
18938+{
18939+ assert("nikita-2060", node != NULL);
18940+ assert("nikita-2061", pg != NULL);
18941+
18942+ assert("nikita-2050", jprivate(pg) == 0ul);
18943+ assert("nikita-2393", !PagePrivate(pg));
18944+ assert("vs-1741", node->pg == NULL);
18945+
18946+ assert("nikita-2396", PageLocked(pg));
18947+ assert_spin_locked(&(node->guard));
18948+
18949+ page_cache_get(pg);
18950+ set_page_private(pg, (unsigned long)node);
18951+ node->pg = pg;
18952+ SetPagePrivate(pg);
18953+}
18954+
18955+/* Dual to jnode_attach_page: break a binding between page and jnode */
18956+void page_clear_jnode(struct page *page, jnode * node)
18957+{
18958+ assert("nikita-2424", page != NULL);
18959+ assert("nikita-2425", PageLocked(page));
18960+ assert("nikita-2426", node != NULL);
18961+ assert_spin_locked(&(node->guard));
18962+ assert("nikita-2428", PagePrivate(page));
18963+
18964+ assert("nikita-3551", !PageWriteback(page));
18965+
18966+ JF_CLR(node, JNODE_PARSED);
18967+ set_page_private(page, 0ul);
18968+ ClearPagePrivate(page);
18969+ node->pg = NULL;
18970+ page_cache_release(page);
18971+}
18972+
18973+#if 0
18974+/* it is only used in one place to handle error */
18975+void
18976+page_detach_jnode(struct page *page, struct address_space *mapping,
18977+ unsigned long index)
18978+{
18979+ assert("nikita-2395", page != NULL);
18980+
18981+ lock_page(page);
18982+ if ((page->mapping == mapping) && (page->index == index)
18983+ && PagePrivate(page)) {
18984+ jnode *node;
18985+
18986+ node = jprivate(page);
18987+ spin_lock_jnode(node);
18988+ page_clear_jnode(page, node);
18989+ spin_unlock_jnode(node);
18990+ }
18991+ unlock_page(page);
18992+}
18993+#endif /* 0 */
18994+
18995+/* return @node page locked.
18996+
18997+ Lock ordering requires that one first takes the page lock and afterwards the
18998+ spin lock on the node attached to this page. Sometimes it is necessary to go in
18999+ the opposite direction. This is done through standard trylock-and-release
19000+ loop.
19001+*/
19002+static struct page *jnode_lock_page(jnode * node)
19003+{
19004+ struct page *page;
19005+
19006+ assert("nikita-2052", node != NULL);
19007+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19008+
19009+ while (1) {
19010+
19011+ spin_lock_jnode(node);
19012+ page = jnode_page(node);
19013+ if (page == NULL) {
19014+ break;
19015+ }
19016+
19017+ /* no need to page_cache_get( page ) here, because page cannot
19018+ be evicted from memory without detaching it from jnode and
19019+ this requires spin lock on jnode that we already hold.
19020+ */
19021+ if (!TestSetPageLocked(page)) {
19022+ /* We won a lock on jnode page, proceed. */
19023+ break;
19024+ }
19025+
19026+ /* Page is locked by someone else. */
19027+ page_cache_get(page);
19028+ spin_unlock_jnode(node);
19029+ wait_on_page_locked(page);
19030+ /* it is possible that page was detached from jnode and
19031+ returned to the free pool, or re-assigned while we were
19032+ waiting on locked bit. This will be rechecked on the next
19033+ loop iteration.
19034+ */
19035+ page_cache_release(page);
19036+
19037+ /* try again */
19038+ }
19039+ return page;
19040+}
19041+
19042+/*
19043+ * if the JNODE_PARSED bit is not set, call the ->parse() method of the
19044+ * jnode to verify the validity of the jnode content.
19045+ */
19046+static inline int jparse(jnode * node)
19047+{
19048+ int result;
19049+
19050+ assert("nikita-2466", node != NULL);
19051+
19052+ spin_lock_jnode(node);
19053+ if (likely(!jnode_is_parsed(node))) {
19054+ result = jnode_ops(node)->parse(node);
19055+ if (likely(result == 0))
19056+ JF_SET(node, JNODE_PARSED);
19057+ } else
19058+ result = 0;
19059+ spin_unlock_jnode(node);
19060+ return result;
19061+}
19062+
19063+/* Lock the page attached to a jnode; create and attach a page to the jnode
19064+ * if it had none. */
19065+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19066+{
19067+ struct page *page;
19068+
19069+ spin_lock_jnode(node);
19070+ page = jnode_page(node);
19071+
19072+ if (page == NULL) {
19073+ spin_unlock_jnode(node);
19074+ page = find_or_create_page(jnode_get_mapping(node),
19075+ jnode_get_index(node), gfp_flags);
19076+ if (page == NULL)
19077+ return ERR_PTR(RETERR(-ENOMEM));
19078+ } else {
19079+ if (!TestSetPageLocked(page)) {
19080+ spin_unlock_jnode(node);
19081+ return page;
19082+ }
19083+ page_cache_get(page);
19084+ spin_unlock_jnode(node);
19085+ lock_page(page);
19086+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19087+ }
19088+
19089+ spin_lock_jnode(node);
19090+ if (!jnode_page(node))
19091+ jnode_attach_page(node, page);
19092+ spin_unlock_jnode(node);
19093+
19094+ page_cache_release(page);
19095+ assert("zam-894", jnode_page(node) == page);
19096+ return page;
19097+}
19098+
19099+/* Start read operation for jnode's page if page is not up-to-date. */
19100+static int jnode_start_read(jnode * node, struct page *page)
19101+{
19102+ assert("zam-893", PageLocked(page));
19103+
19104+ if (PageUptodate(page)) {
19105+ unlock_page(page);
19106+ return 0;
19107+ }
19108+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19109+}
19110+
19111+#if REISER4_DEBUG
19112+static void check_jload(jnode * node, struct page *page)
19113+{
19114+ if (jnode_is_znode(node)) {
19115+ node40_header *nh;
19116+ znode *z;
19117+
19118+ z = JZNODE(node);
19119+ if (znode_is_any_locked(z)) {
19120+ nh = (node40_header *) kmap(page);
19121+ /* this only works for node40-only file systems. For
19122+ * debugging. */
19123+ assert("nikita-3253",
19124+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19125+ kunmap(page);
19126+ }
19127+ assert("nikita-3565", znode_invariant(z));
19128+ }
19129+}
19130+#else
19131+#define check_jload(node, page) noop
19132+#endif
19133+
19134+/* prefetch jnode to speed up next call to jload. Call this when you are going
19135+ * to call jload() shortly. This will bring appropriate portion of jnode into
19136+ * CPU cache. */
19137+void jload_prefetch(jnode * node)
19138+{
19139+ prefetchw(&node->x_count);
19140+}
19141+
19142+/* load jnode's data into memory */
19143+int jload_gfp(jnode * node /* node to load */ ,
19144+ gfp_t gfp_flags /* allocation flags */ ,
19145+ int do_kmap /* true if page should be kmapped */ )
19146+{
19147+ struct page *page;
19148+ int result = 0;
19149+ int parsed;
19150+
19151+ assert("nikita-3010", reiser4_schedulable());
19152+
19153+ prefetchw(&node->pg);
19154+
19155+ /* taking d-reference implies taking x-reference. */
19156+ jref(node);
19157+
19158+ /*
19159+ * acquiring a d-reference to @jnode and checking the JNODE_PARSED bit
19160+ * should be atomic; otherwise there is a race against
19161+ * reiser4_releasepage().
19162+ */
19163+ spin_lock(&(node->load));
19164+ add_d_ref(node);
19165+ parsed = jnode_is_parsed(node);
19166+ spin_unlock(&(node->load));
19167+
19168+ if (unlikely(!parsed)) {
19169+ page = jnode_get_page_locked(node, gfp_flags);
19170+ if (unlikely(IS_ERR(page))) {
19171+ result = PTR_ERR(page);
19172+ goto failed;
19173+ }
19174+
19175+ result = jnode_start_read(node, page);
19176+ if (unlikely(result != 0))
19177+ goto failed;
19178+
19179+ wait_on_page_locked(page);
19180+ if (unlikely(!PageUptodate(page))) {
19181+ result = RETERR(-EIO);
19182+ goto failed;
19183+ }
19184+
19185+ if (do_kmap)
19186+ node->data = kmap(page);
19187+
19188+ result = jparse(node);
19189+ if (unlikely(result != 0)) {
19190+ if (do_kmap)
19191+ kunmap(page);
19192+ goto failed;
19193+ }
19194+ check_jload(node, page);
19195+ } else {
19196+ page = jnode_page(node);
19197+ check_jload(node, page);
19198+ if (do_kmap)
19199+ node->data = kmap(page);
19200+ }
19201+
19202+ if (!is_writeout_mode())
19203+ /* We do not mark pages active if jload is called as a part of
19204+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19205+ * and write_logs() add no value to cached data; there is no
19206+ * sense in marking pages as active when they go to disk, as it
19207+ * just confuses VM scanning routines, because a clean page could
19208+ * be moved off the inactive list as a result of this
19209+ * mark_page_accessed() call. */
19210+ mark_page_accessed(page);
19211+
19212+ return 0;
19213+
19214+ failed:
19215+ jrelse_tail(node);
19216+ return result;
19217+
19218+}
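+
+/*
+ * Typical caller pattern (an illustrative sketch, not from the original
+ * patch; use_data() is a hypothetical consumer of node->data):
+ *
+ * if (jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1) == 0) {
+ * use_data(node->data);
+ * jrelse(node);
+ * }
+ */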
19219+
19220+/* start asynchronous reading for given jnode's page. */
19221+int jstartio(jnode * node)
19222+{
19223+ struct page *page;
19224+
19225+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19226+ if (IS_ERR(page))
19227+ return PTR_ERR(page);
19228+
19229+ return jnode_start_read(node, page);
19230+}
19231+
19232+/* Initialize a node by calling appropriate plugin instead of reading
19233+ * node from disk as in jload(). */
19234+int jinit_new(jnode * node, gfp_t gfp_flags)
19235+{
19236+ struct page *page;
19237+ int result;
19238+
19239+ jref(node);
19240+ add_d_ref(node);
19241+
19242+ page = jnode_get_page_locked(node, gfp_flags);
19243+ if (IS_ERR(page)) {
19244+ result = PTR_ERR(page);
19245+ goto failed;
19246+ }
19247+
19248+ SetPageUptodate(page);
19249+ unlock_page(page);
19250+
19251+ node->data = kmap(page);
19252+
19253+ if (!jnode_is_parsed(node)) {
19254+ jnode_plugin *jplug = jnode_ops(node);
19255+ spin_lock_jnode(node);
19256+ result = jplug->init(node);
19257+ spin_unlock_jnode(node);
19258+ if (result) {
19259+ kunmap(page);
19260+ goto failed;
19261+ }
19262+ JF_SET(node, JNODE_PARSED);
19263+ }
19264+
19265+ return 0;
19266+
19267+ failed:
19268+ jrelse(node);
19269+ return result;
19270+}
19271+
19272+/* release a reference to jnode acquired by jload(), decrement ->d_count */
19273+void jrelse_tail(jnode * node /* jnode to release references to */ )
19274+{
19275+ assert("nikita-489", atomic_read(&node->d_count) > 0);
19276+ atomic_dec(&node->d_count);
19277+ /* release reference acquired in jload_gfp() or jinit_new() */
19278+ jput(node);
19279+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
19280+ LOCK_CNT_DEC(d_refs);
19281+}
19282+
19283+/* drop reference to node data. When last reference is dropped, data are
19284+ unloaded. */
19285+void jrelse(jnode * node /* jnode to release references to */ )
19286+{
19287+ struct page *page;
19288+
19289+ assert("nikita-487", node != NULL);
19290+ assert_spin_not_locked(&(node->guard));
19291+
19292+ page = jnode_page(node);
19293+ if (likely(page != NULL)) {
19294+ /*
19295+ * it is safe not to lock jnode here, because at this point
19296+ * @node->d_count is greater than zero (if jrelse() is used
19297+ * correctly, that is). JNODE_PARSED may be not set yet, if,
19298+ * for example, we got here as a result of error handling path
19299+ * in jload(). Anyway, page cannot be detached by
19300+ * reiser4_releasepage(). truncate will invalidate page
19301+ * regardless, but this should not be a problem.
19302+ */
19303+ kunmap(page);
19304+ }
19305+ jrelse_tail(node);
19306+}
19307+
19308+/* called from jput() to wait for io completion */
19309+static void jnode_finish_io(jnode * node)
19310+{
19311+ struct page *page;
19312+
19313+ assert("nikita-2922", node != NULL);
19314+
19315+ spin_lock_jnode(node);
19316+ page = jnode_page(node);
19317+ if (page != NULL) {
19318+ page_cache_get(page);
19319+ spin_unlock_jnode(node);
19320+ wait_on_page_writeback(page);
19321+ page_cache_release(page);
19322+ } else
19323+ spin_unlock_jnode(node);
19324+}
19325+
19326+/*
19327+ * This is called by jput() when last reference to jnode is released. This is
19328+ * separate function, because we want fast path of jput() to be inline and,
19329+ * therefore, small.
19330+ */
19331+void jput_final(jnode * node)
19332+{
19333+ int r_i_p;
19334+
19335+ /* A fast check for keeping node in cache. We always keep node in cache
19336+ * if its page is present and node was not marked for deletion */
19337+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19338+ rcu_read_unlock();
19339+ return;
19340+ }
19341+ assert("edward-1432", node->page_count == 0);
19342+
19343+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19344+ /*
19345+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19346+ * this case it is safe to access node after unlock.
19347+ */
19348+ rcu_read_unlock();
19349+ if (r_i_p) {
19350+ jnode_finish_io(node);
19351+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19352+ /* node is removed from the tree. */
19353+ jdelete(node);
19354+ else
19355+ jnode_try_drop(node);
19356+ }
19357+ /* if !r_i_p some other thread is already killing it */
19358+}
19359+
19360+int jwait_io(jnode * node, int rw)
19361+{
19362+ struct page *page;
19363+ int result;
19364+
19365+ assert("zam-447", node != NULL);
19366+ assert("zam-448", jnode_page(node) != NULL);
19367+
19368+ page = jnode_page(node);
19369+
19370+ result = 0;
19371+ if (rw == READ) {
19372+ wait_on_page_locked(page);
19373+ } else {
19374+ assert("nikita-2227", rw == WRITE);
19375+ wait_on_page_writeback(page);
19376+ }
19377+ if (PageError(page))
19378+ result = RETERR(-EIO);
19379+
19380+ return result;
19381+}
19382+
19383+/*
19384+ * jnode types and plugins.
19385+ *
19386+ * jnode by itself is a "base type". There are several different jnode
19387+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19388+ * has to do different things based on jnode type. In the standard reiser4 way
19389+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19390+ *
19391+ * Functions below deal with jnode types and define methods of jnode plugin.
19392+ *
19393+ */
19394+
19395+/* set jnode type. This is done during jnode initialization. */
19396+static void jnode_set_type(jnode * node, jnode_type type)
19397+{
19398+ static unsigned long type_to_mask[] = {
19399+ [JNODE_UNFORMATTED_BLOCK] = 1,
19400+ [JNODE_FORMATTED_BLOCK] = 0,
19401+ [JNODE_BITMAP] = 2,
19402+ [JNODE_IO_HEAD] = 6,
19403+ [JNODE_INODE] = 4
19404+ };
19405+
19406+ assert("zam-647", type < LAST_JNODE_TYPE);
19407+ assert("nikita-2815", !jnode_is_loaded(node));
19408+ assert("nikita-3386", node->state == 0);
19409+
19410+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19411+}
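+
+/*
+ * Example of the encoding above (an editor's note): JNODE_IO_HEAD has mask
+ * 6 (binary 110), so jnode_set_type() ORs those three bits, shifted to the
+ * JNODE_TYPE_1 position, into ->state; jnode_get_type() is assumed to read
+ * the same bits back.
+ */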
19412+
19413+/* ->init() method of jnode plugin for jnodes that don't require plugin
19414+ * specific initialization. */
19415+static int init_noinit(jnode * node UNUSED_ARG)
19416+{
19417+ return 0;
19418+}
19419+
19420+/* ->parse() method of jnode plugin for jnodes that don't require plugin
19421+ * specific parsing. */
19422+static int parse_noparse(jnode * node UNUSED_ARG)
19423+{
19424+ return 0;
19425+}
19426+
19427+/* ->mapping() method for unformatted jnode */
19428+struct address_space *mapping_jnode(const jnode * node)
19429+{
19430+ struct address_space *map;
19431+
19432+ assert("nikita-2713", node != NULL);
19433+
19434+ /* mapping is stored in jnode */
19435+
19436+ map = node->key.j.mapping;
19437+ assert("nikita-2714", map != NULL);
19438+ assert("nikita-2897", is_reiser4_inode(map->host));
19439+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19440+ return map;
19441+}
19442+
19443+/* ->index() method for unformatted jnodes */
19444+unsigned long index_jnode(const jnode * node)
19445+{
19446+ /* index is stored in jnode */
19447+ return node->key.j.index;
19448+}
19449+
19450+/* ->remove() method for unformatted jnodes */
19451+static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19452+{
19453+ /* remove jnode from hash table and radix tree */
19454+ if (node->key.j.mapping)
19455+ unhash_unformatted_node_nolock(node);
19456+}
19457+
19458+/* ->mapping() method for znodes */
19459+static struct address_space *mapping_znode(const jnode * node)
19460+{
19461+ /* all znodes belong to fake inode */
19462+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19463+}
19464+
19465+/* ->index() method for znodes */
19466+static unsigned long index_znode(const jnode * node)
19467+{
19468+ unsigned long addr;
19469+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19470+
19471+ /* index of znode is just its address (shifted) */
19472+ addr = (unsigned long)node;
19473+ return (addr - PAGE_OFFSET) >> znode_shift_order;
19474+}
19475+
19476+/* ->mapping() method for bitmap jnode */
19477+static struct address_space *mapping_bitmap(const jnode * node)
19478+{
19479+ /* all bitmap blocks belong to special bitmap inode */
19480+ return get_super_private(jnode_get_tree(node)->super)->bitmap->
19481+ i_mapping;
19482+}
19483+
19484+/* ->index() method for jnodes that are indexed by address */
19485+static unsigned long index_is_address(const jnode * node)
19486+{
19487+ unsigned long ind;
19488+
19489+ ind = (unsigned long)node;
19490+ return ind - PAGE_OFFSET;
19491+}
19492+
19493+/* resolve race with jput */
19494+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19495+{
19496+ /*
19497+ * This is used as part of RCU-based jnode handling.
19498+ *
19499+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19500+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19501+ * not protected during this, so concurrent thread may execute
19502+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19503+ * freed in jput_final(). To avoid such races, jput_final() sets
19504+ * JNODE_RIP on jnode (under tree lock). All places that work with
19505+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19506+ * (first without taking the tree lock), and if this bit is set, releases
19507+ * the reference acquired by the current thread and returns NULL.
19508+ *
19509+ * As a result, if jnode is being concurrently freed, NULL is returned
19510+ * and caller should pretend that jnode wasn't found in the first
19511+ * place.
19512+ *
19513+ * Otherwise it's safe to release "rcu-read-lock" and continue with
19514+ * jnode.
19515+ */
19516+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19517+ read_lock_tree(tree);
19518+ if (JF_ISSET(node, JNODE_RIP)) {
19519+ dec_x_ref(node);
19520+ node = NULL;
19521+ }
19522+ read_unlock_tree(tree);
19523+ }
19524+ return node;
19525+}
19526+
19527+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19528+{
19529+ struct inode *inode;
19530+ item_plugin *iplug;
19531+ loff_t off;
19532+
19533+ assert("nikita-3092", node != NULL);
19534+ assert("nikita-3093", key != NULL);
19535+ assert("nikita-3094", jnode_is_unformatted(node));
19536+
19537+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19538+ inode = mapping_jnode(node)->host;
19539+
19540+ if (node->parent_item_id != 0)
19541+ iplug = item_plugin_by_id(node->parent_item_id);
19542+ else
19543+ iplug = NULL;
19544+
19545+ if (iplug != NULL && iplug->f.key_by_offset)
19546+ iplug->f.key_by_offset(inode, off, key);
19547+ else {
19548+ file_plugin *fplug;
19549+
19550+ fplug = inode_file_plugin(inode);
19551+ assert("zam-1007", fplug != NULL);
19552+ assert("zam-1008", fplug->key_by_inode != NULL);
19553+
19554+ fplug->key_by_inode(inode, off, key);
19555+ }
19556+
19557+ return key;
19558+}
19559+
19560+/* ->parse() method for formatted nodes */
19561+static int parse_znode(jnode * node)
19562+{
19563+ return zparse(JZNODE(node));
19564+}
19565+
19566+/* ->delete() method for formatted nodes */
19567+static void delete_znode(jnode * node, reiser4_tree * tree)
19568+{
19569+ znode *z;
19570+
19571+ assert_rw_write_locked(&(tree->tree_lock));
19572+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19573+
19574+ z = JZNODE(node);
19575+ assert("vs-899", z->c_count == 0);
19576+
19577+ /* delete znode from sibling list. */
19578+ sibling_list_remove(z);
19579+
19580+ znode_remove(z, tree);
19581+}
19582+
19583+/* ->remove() method for formatted nodes */
19584+static int remove_znode(jnode * node, reiser4_tree * tree)
19585+{
19586+ znode *z;
19587+
19588+ assert_rw_write_locked(&(tree->tree_lock));
19589+ z = JZNODE(node);
19590+
19591+ if (z->c_count == 0) {
19592+ /* detach znode from sibling list. */
19593+ sibling_list_drop(z);
19594+ /* this is called with tree spin-lock held, so call
19595+ znode_remove() directly (rather than znode_lock_remove()). */
19596+ znode_remove(z, tree);
19597+ return 0;
19598+ }
19599+ return RETERR(-EBUSY);
19600+}
19601+
19602+/* ->init() method for formatted nodes */
19603+static int init_znode(jnode * node)
19604+{
19605+ znode *z;
19606+
19607+ z = JZNODE(node);
19608+ /* call node plugin to do actual initialization */
19609+ return z->nplug->init(z);
19610+}
19611+
19612+/* ->clone() method for formatted nodes */
19613+static jnode *clone_formatted(jnode * node)
19614+{
19615+ znode *clone;
19616+
19617+ assert("vs-1430", jnode_is_znode(node));
19618+ clone = zalloc(reiser4_ctx_gfp_mask_get());
19619+ if (clone == NULL)
19620+ return ERR_PTR(RETERR(-ENOMEM));
19621+ zinit(clone, NULL, current_tree);
19622+ jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19623+ /* ZJNODE(clone)->key.z is not initialized */
19624+ clone->level = JZNODE(node)->level;
19625+
19626+ return ZJNODE(clone);
19627+}
19628+
19629+/* jplug->clone for unformatted nodes */
19630+static jnode *clone_unformatted(jnode * node)
19631+{
19632+ jnode *clone;
19633+
19634+ assert("vs-1431", jnode_is_unformatted(node));
19635+ clone = jalloc();
19636+ if (clone == NULL)
19637+ return ERR_PTR(RETERR(-ENOMEM));
19638+
19639+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19640+ jnode_set_block(clone, jnode_get_block(node));
19641+
19642+ return clone;
19643+
19644+}
19645+
19646+/*
19647+ * Setup jnode plugin methods for various jnode types.
19648+ */
19649+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19650+ [JNODE_UNFORMATTED_BLOCK] = {
19651+ .h = {
19652+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19653+ .id = JNODE_UNFORMATTED_BLOCK,
19654+ .pops = NULL,
19655+ .label = "unformatted",
19656+ .desc = "unformatted node",
19657+ .linkage = {NULL, NULL}
19658+ },
19659+ .init = init_noinit,
19660+ .parse = parse_noparse,
19661+ .mapping = mapping_jnode,
19662+ .index = index_jnode,
19663+ .clone = clone_unformatted
19664+ },
19665+ [JNODE_FORMATTED_BLOCK] = {
19666+ .h = {
19667+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19668+ .id = JNODE_FORMATTED_BLOCK,
19669+ .pops = NULL,
19670+ .label = "formatted",
19671+ .desc = "formatted tree node",
19672+ .linkage = {NULL, NULL}
19673+ },
19674+ .init = init_znode,
19675+ .parse = parse_znode,
19676+ .mapping = mapping_znode,
19677+ .index = index_znode,
19678+ .clone = clone_formatted
19679+ },
19680+ [JNODE_BITMAP] = {
19681+ .h = {
19682+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19683+ .id = JNODE_BITMAP,
19684+ .pops = NULL,
19685+ .label = "bitmap",
19686+ .desc = "bitmap node",
19687+ .linkage = {NULL, NULL}
19688+ },
19689+ .init = init_noinit,
19690+ .parse = parse_noparse,
19691+ .mapping = mapping_bitmap,
19692+ .index = index_is_address,
19693+ .clone = NULL
19694+ },
19695+ [JNODE_IO_HEAD] = {
19696+ .h = {
19697+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19698+ .id = JNODE_IO_HEAD,
19699+ .pops = NULL,
19700+ .label = "io head",
19701+ .desc = "io head",
19702+ .linkage = {NULL, NULL}
19703+ },
19704+ .init = init_noinit,
19705+ .parse = parse_noparse,
19706+ .mapping = mapping_bitmap,
19707+ .index = index_is_address,
19708+ .clone = NULL
19709+ },
19710+ [JNODE_INODE] = {
19711+ .h = {
19712+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19713+ .id = JNODE_INODE,
19714+ .pops = NULL,
19715+ .label = "inode",
19716+ .desc = "inode's builtin jnode",
19717+ .linkage = {NULL, NULL}
19718+ },
19719+ .init = NULL,
19720+ .parse = NULL,
19721+ .mapping = NULL,
19722+ .index = NULL,
19723+ .clone = NULL
19724+ }
19725+};
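+
+/*
+ * Dispatch through this table is by jnode type, as in jparse() above
+ * (a sketch; jnode_ops() indexing into jnode_plugins[] is inferred from
+ * its uses in this file):
+ *
+ * jnode_plugin *jplug = jnode_ops(node);
+ * struct address_space *map = jplug->mapping(node);
+ * unsigned long index = jplug->index(node);
+ */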
19726+
19727+/*
19728+ * jnode destruction.
19729+ *
19730+ * A thread may use a jnode after it has acquired a reference to it. References are
19731+ * counted in ->x_count field. Reference protects jnode from being
19732+ * recycled. This is different from protecting jnode data (that are stored in
19733+ * jnode page) from being evicted from memory. Data are protected by jload()
19734+ * and released by jrelse().
19735+ *
19736+ * If thread already possesses a reference to the jnode it can acquire another
19737+ * one through jref(). Initial reference is obtained (usually) by locating
19738+ * jnode in some indexing structure that depends on jnode type: formatted
19739+ * nodes are kept in global hash table, where they are indexed by block
19740+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19741+ * table, which is indexed by oid and offset within file, and in per-inode
19742+ * radix tree.
19743+ *
19744+ * Reference to jnode is released by jput(). If last reference is released,
19745+ * jput_final() is called. This function determines whether jnode has to be
19746+ * deleted (this happens when corresponding node is removed from the file
19747+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19748+ * should be just "removed" (deleted from memory).
19749+ *
19750+ * Jnode destruction is a signally delicate dance because of locking and RCU.
19751+ */
19752+
19753+/*
19754+ * Returns true if jnode cannot be removed right now. This check is called
19755+ * under tree lock. If it returns true, jnode is irrevocably committed to be
19756+ * deleted/removed.
19757+ */
19758+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19759+{
19760+ /* if other thread managed to acquire a reference to this jnode, don't
19761+ * free it. */
19762+ if (atomic_read(&node->x_count) > 0)
19763+ return 1;
19764+ /* also, don't free znode that has children in memory */
19765+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19766+ return 1;
19767+ return 0;
19768+}
19769+
19770+/*
19771+ * this is called as part of removing jnode. Based on jnode type, call
19772+ * corresponding function that removes jnode from indices and returns it back
19773+ * to the appropriate slab (through RCU).
19774+ */
19775+static inline void
19776+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19777+{
19778+ switch (jtype) {
19779+ case JNODE_UNFORMATTED_BLOCK:
19780+ remove_jnode(node, tree);
19781+ break;
19782+ case JNODE_IO_HEAD:
19783+ case JNODE_BITMAP:
19784+ break;
19785+ case JNODE_INODE:
19786+ break;
19787+ case JNODE_FORMATTED_BLOCK:
19788+ remove_znode(node, tree);
19789+ break;
19790+ default:
19791+ wrong_return_value("nikita-3196", "Wrong jnode type");
19792+ }
19793+}
19794+
19795+/*
19796+ * this is called as part of deleting jnode. Based on jnode type, call
19797+ * corresponding function that removes jnode from indices and returns it back
19798+ * to the appropriate slab (through RCU).
19799+ *
19800+ * This differs from jnode_remove() only for formatted nodes---for them
19801+ * sibling list handling is different for removal and deletion.
19802+ */
19803+static inline void
19804+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
19805+{
19806+ switch (jtype) {
19807+ case JNODE_UNFORMATTED_BLOCK:
19808+ remove_jnode(node, tree);
19809+ break;
19810+ case JNODE_IO_HEAD:
19811+ case JNODE_BITMAP:
19812+ break;
19813+ case JNODE_FORMATTED_BLOCK:
19814+ delete_znode(node, tree);
19815+ break;
19816+ case JNODE_INODE:
19817+ default:
19818+ wrong_return_value("nikita-3195", "Wrong jnode type");
19819+ }
19820+}
19821+
19822+#if REISER4_DEBUG
19823+/*
19824+ * remove jnode from the debugging list of all jnodes hanging off super-block.
19825+ */
19826+void jnode_list_remove(jnode * node)
19827+{
19828+ reiser4_super_info_data *sbinfo;
19829+
19830+ sbinfo = get_super_private(jnode_get_tree(node)->super);
19831+
19832+ spin_lock_irq(&sbinfo->all_guard);
19833+ assert("nikita-2422", !list_empty(&node->jnodes));
19834+ list_del_init(&node->jnodes);
19835+ spin_unlock_irq(&sbinfo->all_guard);
19836+}
19837+#endif
19838+
19839+/*
19840+ * this is called by jput_final() to remove jnode when last reference to it is
19841+ * released.
19842+ */
19843+static int jnode_try_drop(jnode * node)
19844+{
19845+ int result;
19846+ reiser4_tree *tree;
19847+ jnode_type jtype;
19848+
19849+ assert("nikita-2491", node != NULL);
19850+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19851+
19852+ tree = jnode_get_tree(node);
19853+ jtype = jnode_get_type(node);
19854+
19855+ spin_lock_jnode(node);
19856+ write_lock_tree(tree);
19857+ /*
19858+ * if jnode has a page---leave it alone. Memory pressure will
19859+ * eventually kill page and jnode.
19860+ */
19861+ if (jnode_page(node) != NULL) {
19862+ write_unlock_tree(tree);
19863+ spin_unlock_jnode(node);
19864+ JF_CLR(node, JNODE_RIP);
19865+ return RETERR(-EBUSY);
19866+ }
19867+
19868+ /* re-check ->x_count under tree lock. */
19869+ result = jnode_is_busy(node, jtype);
19870+ if (result == 0) {
19871+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19872+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19873+
19874+ spin_unlock_jnode(node);
19875+ /* no page and no references---despatch him. */
19876+ jnode_remove(node, jtype, tree);
19877+ write_unlock_tree(tree);
19878+ jnode_free(node, jtype);
19879+ } else {
19880+ /* busy check failed: reference was acquired by concurrent
19881+ * thread. */
19882+ write_unlock_tree(tree);
19883+ spin_unlock_jnode(node);
19884+ JF_CLR(node, JNODE_RIP);
19885+ }
19886+ return result;
19887+}
19888+
19889+/* jdelete() -- Delete jnode from the tree and file system */
19890+static int jdelete(jnode * node /* jnode to finish with */ )
19891+{
19892+ struct page *page;
19893+ int result;
19894+ reiser4_tree *tree;
19895+ jnode_type jtype;
19896+
19897+ assert("nikita-467", node != NULL);
19898+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19899+
19900+ jtype = jnode_get_type(node);
19901+
19902+ page = jnode_lock_page(node);
19903+ assert_spin_locked(&(node->guard));
19904+
19905+ tree = jnode_get_tree(node);
19906+
19907+ write_lock_tree(tree);
19908+ /* re-check ->x_count under tree lock. */
19909+ result = jnode_is_busy(node, jtype);
19910+ if (likely(!result)) {
19911+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19912+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
19913+
19914+ /* detach page */
19915+ if (page != NULL) {
19916+ /*
19917+ * FIXME this is racy against jnode_extent_write().
19918+ */
19919+ page_clear_jnode(page, node);
19920+ }
19921+ spin_unlock_jnode(node);
19922+ /* goodbye */
19923+ jnode_delete(node, jtype, tree);
19924+ write_unlock_tree(tree);
19925+ jnode_free(node, jtype);
19926+ /* @node is no longer valid pointer */
19927+ if (page != NULL)
19928+ reiser4_drop_page(page);
19929+ } else {
19930+ /* busy check failed: reference was acquired by concurrent
19931+ * thread. */
19932+ JF_CLR(node, JNODE_RIP);
19933+ write_unlock_tree(tree);
19934+ spin_unlock_jnode(node);
19935+ if (page != NULL)
19936+ unlock_page(page);
19937+ }
19938+ return result;
19939+}
19940+
19941+/* drop jnode on the floor.
19942+
19943+ Return value:
19944+
19945+ -EBUSY: failed to drop jnode, because there are still references to it
19946+
19947+ 0: successfully dropped jnode
19948+
19949+*/
19950+static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
19951+{
19952+ struct page *page;
19953+ jnode_type jtype;
19954+ int result;
19955+
19956+ assert("zam-602", node != NULL);
19957+ assert_rw_not_read_locked(&(tree->tree_lock));
19958+ assert_rw_not_write_locked(&(tree->tree_lock));
19959+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19960+
19961+ jtype = jnode_get_type(node);
19962+
19963+ page = jnode_lock_page(node);
19964+ assert_spin_locked(&(node->guard));
19965+
19966+ write_lock_tree(tree);
19967+
19968+ /* re-check ->x_count under tree lock. */
19969+ result = jnode_is_busy(node, jtype);
19970+ if (!result) {
19971+ assert("nikita-2488", page == jnode_page(node));
19972+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
19973+ if (page != NULL) {
19974+ assert("nikita-2126", !PageDirty(page));
19975+ assert("nikita-2127", PageUptodate(page));
19976+ assert("nikita-2181", PageLocked(page));
19977+ page_clear_jnode(page, node);
19978+ }
19979+ spin_unlock_jnode(node);
19980+ jnode_remove(node, jtype, tree);
19981+ write_unlock_tree(tree);
19982+ jnode_free(node, jtype);
19983+ if (page != NULL) {
19984+ reiser4_drop_page(page);
19985+ }
19986+ } else {
19987+ /* busy check failed: reference was acquired by concurrent
19988+ * thread. */
19989+ JF_CLR(node, JNODE_RIP);
19990+ write_unlock_tree(tree);
19991+ spin_unlock_jnode(node);
19992+ if (page != NULL)
19993+ unlock_page(page);
19994+ }
19995+ return result;
19996+}
19997+
19998+/* This function frees jnode "if possible". In particular, [dcx]_count has to
19999+ be 0 (where applicable). */
20000+void jdrop(jnode * node)
20001+{
20002+ jdrop_in_tree(node, jnode_get_tree(node));
20003+}
20004+
20005+/* IO head jnode implementation. IO heads are simple jnodes with limited
20006+   functionality (they are not kept in any hash table), used just for
20007+   reading from and writing to disk. */
20008+
20009+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20010+{
20011+ jnode *jal = jalloc();
20012+
20013+	if (jal != NULL) {
20014+		jnode_init(jal, current_tree, JNODE_IO_HEAD);
20015+		jnode_set_block(jal, block);
20016+		/* take the initial reference only on success; jref(NULL) would oops */
20017+		jref(jal);
20018+	}
20019+
20020+ return jal;
20021+}
20022+
20023+void reiser4_drop_io_head(jnode * node)
20024+{
20025+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20026+
20027+ jput(node);
20028+ jdrop(node);
20029+}
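
A hedged usage sketch for this alloc/drop pair (the block number and the I/O step in the comment are hypothetical):

	reiser4_block_nr blk = 1024;	/* hypothetical block number */
	jnode *io;

	io = reiser4_alloc_io_head(&blk);
	if (io != NULL) {
		/* ... e.g. jstartio(io) followed by jwait_io(io, READ) ... */
		reiser4_drop_io_head(io);	/* pairs jput() with jdrop() */
	}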
20030+
20031+/* protect jnode data from being freed by reiser4_releasepage() */
20032+void pin_jnode_data(jnode * node)
20033+{
20034+ assert("zam-671", jnode_page(node) != NULL);
20035+ page_cache_get(jnode_page(node));
20036+}
20037+
20038+/* make jnode data free-able again */
20039+void unpin_jnode_data(jnode * node)
20040+{
20041+ assert("zam-672", jnode_page(node) != NULL);
20042+ page_cache_release(jnode_page(node));
20043+}
20044+
20045+struct address_space *jnode_get_mapping(const jnode * node)
20046+{
20047+ assert("nikita-3162", node != NULL);
20048+ return jnode_ops(node)->mapping(node);
20049+}
20050+
20051+#if REISER4_DEBUG
20052+/* debugging aid: jnode invariant */
20053+int jnode_invariant_f(const jnode * node, char const **msg)
20054+{
20055+#define _ergo(ant, con) \
20056+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20057+#define _check(exp) ((*msg) = #exp, (exp))
20058+
20059+ return _check(node != NULL) &&
20060+ /* [jnode-queued] */
20061+ /* only relocated node can be queued, except that when znode
20062+ * is being deleted, its JNODE_RELOC bit is cleared */
20063+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20064+ JF_ISSET(node, JNODE_RELOC) ||
20065+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20066+ _check(node->jnodes.prev != NULL) &&
20067+ _check(node->jnodes.next != NULL) &&
20068+ /* [jnode-dirty] invariant */
20069+	       /* dirty jnode is part of an atom */
20070+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20071+ /* [jnode-oid] invariant */
20072+ /* for unformatted node ->objectid and ->mapping fields are
20073+ * consistent */
20074+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20075+ node->key.j.objectid ==
20076+ get_inode_oid(node->key.j.mapping->host)) &&
20077+ /* [jnode-atom-valid] invariant */
20078+ /* node atom has valid state */
20079+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20080+ /* [jnode-page-binding] invariant */
20081+ /* if node points to page, it points back to node */
20082+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20083+ /* [jnode-refs] invariant */
20084+ /* only referenced jnode can be loaded */
20085+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20086+
20087+}
20088+
20089+static const char *jnode_type_name(jnode_type type)
20090+{
20091+ switch (type) {
20092+ case JNODE_UNFORMATTED_BLOCK:
20093+ return "unformatted";
20094+ case JNODE_FORMATTED_BLOCK:
20095+ return "formatted";
20096+ case JNODE_BITMAP:
20097+ return "bitmap";
20098+ case JNODE_IO_HEAD:
20099+ return "io head";
20100+ case JNODE_INODE:
20101+ return "inode";
20102+ case LAST_JNODE_TYPE:
20103+ return "last";
20104+ default:{
20105+ static char unknown[30];
20106+
20107+ sprintf(unknown, "unknown %i", type);
20108+ return unknown;
20109+ }
20110+ }
20111+}
20112+
20113+#define jnode_state_name( node, flag ) \
20114+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20115+
20116+/* debugging aid: output human readable information about @node */
20117+static void info_jnode(const char *prefix /* prefix to print */ ,
20118+ const jnode * node /* node to print */ )
20119+{
20120+ assert("umka-068", prefix != NULL);
20121+
20122+ if (node == NULL) {
20123+ printk("%s: null\n", prefix);
20124+ return;
20125+ }
20126+
20127+ printk
20128+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20129+ " block: %s, d_count: %d, x_count: %d, "
20130+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20131+ node->state,
20132+ jnode_state_name(node, JNODE_PARSED),
20133+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
20134+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
20135+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20136+ jnode_state_name(node, JNODE_ORPHAN),
20137+ jnode_state_name(node, JNODE_CREATED),
20138+ jnode_state_name(node, JNODE_RELOC),
20139+ jnode_state_name(node, JNODE_OVRWR),
20140+ jnode_state_name(node, JNODE_DIRTY),
20141+ jnode_state_name(node, JNODE_IS_DYING),
20142+ jnode_state_name(node, JNODE_RIP),
20143+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20144+ jnode_state_name(node, JNODE_WRITEBACK),
20145+ jnode_state_name(node, JNODE_NEW),
20146+ jnode_state_name(node, JNODE_DKSET),
20147+ jnode_state_name(node, JNODE_REPACK),
20148+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
20149+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
20150+ atomic_read(&node->d_count), atomic_read(&node->x_count),
20151+ jnode_page(node), node->atom, 0, 0,
20152+ jnode_type_name(jnode_get_type(node)));
20153+ if (jnode_is_unformatted(node)) {
20154+ printk("inode: %llu, index: %lu, ",
20155+ node->key.j.objectid, node->key.j.index);
20156+ }
20157+}
20158+
20159+/* debugging aid: check jnode invariant and warn if it doesn't hold */
20160+static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20161+{
20162+ char const *failed_msg;
20163+ int result;
20164+ reiser4_tree *tree;
20165+
20166+ tree = jnode_get_tree(node);
20167+
20168+ assert("umka-063312", node != NULL);
20169+ assert("umka-064321", tree != NULL);
20170+
20171+ if (!jlocked && !tlocked)
20172+ spin_lock_jnode((jnode *) node);
20173+ if (!tlocked)
20174+ read_lock_tree(jnode_get_tree(node));
20175+ result = jnode_invariant_f(node, &failed_msg);
20176+ if (!result) {
20177+ info_jnode("corrupted node", node);
20178+ warning("jmacd-555", "Condition %s failed", failed_msg);
20179+ }
20180+ if (!tlocked)
20181+ read_unlock_tree(jnode_get_tree(node));
20182+ if (!jlocked && !tlocked)
20183+ spin_unlock_jnode((jnode *) node);
20184+ return result;
20185+}
20186+
20187+#endif /* REISER4_DEBUG */
20188+
20189+/* Make Linus happy.
20190+ Local variables:
20191+ c-indentation-style: "K&R"
20192+ mode-name: "LC"
20193+ c-basic-offset: 8
20194+ tab-width: 8
20195+ fill-column: 80
20196+ End:
20197+*/
20198diff -urN linux-2.6.20.orig/fs/reiser4/jnode.h linux-2.6.20/fs/reiser4/jnode.h
20199--- linux-2.6.20.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20200+++ linux-2.6.20/fs/reiser4/jnode.h 2007-05-06 14:50:43.734986973 +0400
20201@@ -0,0 +1,705 @@
20202+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20203+ * reiser4/README */
20204+
20205+/* Declaration of jnode. See jnode.c for details. */
20206+
20207+#ifndef __JNODE_H__
20208+#define __JNODE_H__
20209+
20210+#include "forward.h"
20211+#include "type_safe_hash.h"
20212+#include "txnmgr.h"
20213+#include "key.h"
20214+#include "debug.h"
20215+#include "dformat.h"
20216+#include "page_cache.h"
20217+#include "context.h"
20218+
20219+#include "plugin/plugin.h"
20220+
20221+#include <linux/fs.h>
20222+#include <linux/mm.h>
20223+#include <linux/spinlock.h>
20224+#include <asm/atomic.h>
20225+#include <asm/bitops.h>
20226+#include <linux/list.h>
20227+#include <linux/rcupdate.h>
20228+
20229+/* declare hash table of jnodes (jnodes proper, that is, unformatted
20230+ nodes) */
20231+TYPE_SAFE_HASH_DECLARE(j, jnode);
20232+
20233+/* declare hash table of znodes */
20234+TYPE_SAFE_HASH_DECLARE(z, znode);
20235+
20236+typedef struct {
20237+ __u64 objectid;
20238+ unsigned long index;
20239+ struct address_space *mapping;
20240+} jnode_key_t;
20241+
20242+/*
20243+ Jnode is the "base class" of other nodes in reiser4. It also happens to
20244+ be exactly the node we use for unformatted tree nodes.
20245+
20246+ Jnode provides the following basic functionality:
20247+
20248+ . reference counting and indexing.
20249+
20250+ . integration with page cache. Jnode has ->pg reference to which page can
20251+ be attached.
20252+
20253+ . interface to transaction manager. It is jnode that is kept in transaction
20254+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20255+ means, there should be special type of jnode for inode.)
20256+
20257+ Locking:
20258+
20259+ Spin lock: the following fields are protected by the per-jnode spin lock:
20260+
20261+ ->state
20262+ ->atom
20263+ ->capture_link
20264+
20265+ The following fields are protected by the global tree lock:
20266+
20267+ ->link
20268+ ->key.z (content of ->key.z is only changed in znode_rehash())
20269+ ->key.j
20270+
20271+ Atomic counters
20272+
20273+ ->x_count
20274+ ->d_count
20275+
20276+ ->pg, and ->data are protected by spin lock for unused jnode and are
20277+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20278+ is false).
20279+
20280+ ->tree is immutable after creation
20281+
20282+ Unclear
20283+
20284+ ->blocknr: should be under jnode spin-lock, but current interface is based
20285+ on passing of block address.
20286+
20287+ If you ever need to spin lock two nodes at once, do this in "natural"
20288+ memory order: lock znode with lower address first. (See lock_two_nodes().)
20289+
20290+ Invariants involving this data-type:
20291+
20292+ [jnode-dirty]
20293+ [jnode-refs]
20294+ [jnode-oid]
20295+ [jnode-queued]
20296+ [jnode-atom-valid]
20297+ [jnode-page-binding]
20298+*/
20299+
20300+struct jnode {
20301+#if REISER4_DEBUG
20302+#define JMAGIC 0x52654973 /* "ReIs" */
20303+ int magic;
20304+#endif
20305+ /* FIRST CACHE LINE (16 bytes): data used by jload */
20306+
20307+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20308+ /* 0 */ unsigned long state;
20309+
20310+ /* lock, protecting jnode's fields. */
20311+ /* 4 */ spinlock_t load;
20312+
20313+ /* counter of references to jnode itself. Increased on jref().
20314+ Decreased on jput().
20315+ */
20316+ /* 8 */ atomic_t x_count;
20317+
20318+ /* counter of references to jnode's data. Pin data page(s) in
20319+ memory while this is greater than 0. Increased on jload().
20320+ Decreased on jrelse().
20321+ */
20322+ /* 12 */ atomic_t d_count;
20323+
20324+ /* SECOND CACHE LINE: data used by hash table lookups */
20325+
20326+ /* 16 */ union {
20327+ /* znodes are hashed by block number */
20328+ reiser4_block_nr z;
20329+ /* unformatted nodes are hashed by mapping plus offset */
20330+ jnode_key_t j;
20331+ } key;
20332+
20333+ /* THIRD CACHE LINE */
20334+
20335+ /* 32 */ union {
20336+ /* pointers to maintain hash-table */
20337+ z_hash_link z;
20338+ j_hash_link j;
20339+ } link;
20340+
20341+ /* pointer to jnode page. */
20342+ /* 36 */ struct page *pg;
20343+ /* pointer to node itself. This is page_address(node->pg) when page is
20344+ attached to the jnode
20345+ */
20346+ /* 40 */ void *data;
20347+
20348+ /* 44 */ reiser4_tree *tree;
20349+
20350+ /* FOURTH CACHE LINE: atom related fields */
20351+
20352+ /* 48 */ spinlock_t guard;
20353+
20354+ /* atom the block is in, if any */
20355+ /* 52 */ txn_atom *atom;
20356+
20357+ /* capture list */
20358+ /* 56 */ struct list_head capture_link;
20359+
20360+ /* FIFTH CACHE LINE */
20361+
20362+ /* 64 */ struct rcu_head rcu;
20363+ /* crosses cache line */
20364+
20365+ /* SIXTH CACHE LINE */
20366+
20367+ /* the real blocknr (where io is going to/from) */
20368+ /* 80 */ reiser4_block_nr blocknr;
20369+ /* Parent item type, unformatted and CRC need it for offset => key conversion. */
20370+ /* NOTE: this parent_item_id looks like jnode type. */
20371+ /* 88 */ reiser4_plugin_id parent_item_id;
20372+ /* 92 */
20373+#if REISER4_DEBUG
20374+	/* number of pages referenced by the jnode (meaningful while capturing
20375+	   page clusters) */
20376+ int page_count;
20377+ /* list of all jnodes for debugging purposes. */
20378+ struct list_head jnodes;
20379+ /* how many times this jnode was written in one transaction */
20380+ int written;
20381+ /* this indicates which atom's list the jnode is on */
20382+ atom_list list;
20383+#endif
20384+} __attribute__ ((aligned(16)));
20385+
20386+/*
20387+ * jnode types. Enumeration of existing jnode types.
20388+ */
20389+typedef enum {
20390+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20391+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20392+ JNODE_BITMAP, /* bitmap */
20393+ JNODE_IO_HEAD, /* jnode representing a block in the
20394+ * wandering log */
20395+ JNODE_INODE, /* jnode embedded into inode */
20396+ LAST_JNODE_TYPE
20397+} jnode_type;
20398+
20399+/* jnode states */
20400+typedef enum {
20401+ /* jnode's page is loaded and data checked */
20402+ JNODE_PARSED = 0,
20403+ /* node was deleted, not all locks on it were released. This
20404+ node is empty and is going to be removed from the tree
20405+ shortly. */
20406+ JNODE_HEARD_BANSHEE = 1,
20407+ /* left sibling pointer is valid */
20408+ JNODE_LEFT_CONNECTED = 2,
20409+ /* right sibling pointer is valid */
20410+ JNODE_RIGHT_CONNECTED = 3,
20411+
20412+ /* znode was just created and doesn't yet have a pointer from
20413+ its parent */
20414+ JNODE_ORPHAN = 4,
20415+
20416+ /* this node was created by its transaction and has not been assigned
20417+ a block address. */
20418+ JNODE_CREATED = 5,
20419+
20420+ /* this node is currently relocated */
20421+ JNODE_RELOC = 6,
20422+ /* this node is currently wandered */
20423+ JNODE_OVRWR = 7,
20424+
20425+ /* this znode has been modified */
20426+ JNODE_DIRTY = 8,
20427+
20428+ /* znode lock is being invalidated */
20429+ JNODE_IS_DYING = 9,
20430+
20431+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20432+
20433+ /* jnode is queued for flushing. */
20434+ JNODE_FLUSH_QUEUED = 12,
20435+
20436+ /* In the following bits jnode type is encoded. */
20437+ JNODE_TYPE_1 = 13,
20438+ JNODE_TYPE_2 = 14,
20439+ JNODE_TYPE_3 = 15,
20440+
20441+ /* jnode is being destroyed */
20442+ JNODE_RIP = 16,
20443+
20444+	/* znode was not captured during locking (it might be so because
20445+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20446+ JNODE_MISSED_IN_CAPTURE = 17,
20447+
20448+ /* write is in progress */
20449+ JNODE_WRITEBACK = 18,
20450+
20451+ /* FIXME: now it is used by crypto-compress plugin only */
20452+ JNODE_NEW = 19,
20453+
20454+ /* delimiting keys are already set for this znode. */
20455+ JNODE_DKSET = 20,
20456+
20457+	/* when this bit is set, page and jnode cannot be disconnected */
20458+ JNODE_WRITE_PREPARED = 21,
20459+
20460+ JNODE_CLUSTER_PAGE = 22,
20461+	/* Jnode is marked for repacking, which means the reiser4 flush and the
20462+	 * block allocator should treat this node in a special way */
20463+ JNODE_REPACK = 23,
20464+ /* node should be converted by flush in squalloc phase */
20465+ JNODE_CONVERTIBLE = 24,
20466+ /*
20467+ * When jnode is dirtied for the first time in given transaction,
20468+	 * do_jnode_make_dirty() checks whether this jnode could possibly become
20469+	 * a member of the overwrite set. If so, this bit is set, and one block is
20470+ * reserved in the ->flush_reserved space of atom.
20471+ *
20472+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20473+ *
20474+ * (1) flush decides that we want this block to go into relocate
20475+ * set after all.
20476+ *
20477+ * (2) wandering log is allocated (by log writer)
20478+ *
20479+ * (3) extent is allocated
20480+ *
20481+ */
20482+ JNODE_FLUSH_RESERVED = 29
20483+} reiser4_jnode_state;
20484+
20485+/* Macros for accessing the jnode state. */
20486+
20487+static inline void JF_CLR(jnode * j, int f)
20488+{
20489+ assert("unknown-1", j->magic == JMAGIC);
20490+ clear_bit(f, &j->state);
20491+}
20492+static inline int JF_ISSET(const jnode * j, int f)
20493+{
20494+ assert("unknown-2", j->magic == JMAGIC);
20495+ return test_bit(f, &((jnode *) j)->state);
20496+}
20497+static inline void JF_SET(jnode * j, int f)
20498+{
20499+ assert("unknown-3", j->magic == JMAGIC);
20500+ set_bit(f, &j->state);
20501+}
20502+
20503+static inline int JF_TEST_AND_SET(jnode * j, int f)
20504+{
20505+ assert("unknown-4", j->magic == JMAGIC);
20506+ return test_and_set_bit(f, &j->state);
20507+}
20508+
20509+static inline void spin_lock_jnode(jnode *node)
20510+{
20511+ /* check that spinlocks of lower priorities are not held */
20512+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20513+ LOCK_CNT_NIL(spin_locked_txnh) &&
20514+ LOCK_CNT_NIL(spin_locked_zlock) &&
20515+ LOCK_CNT_NIL(rw_locked_dk) &&
20516+ LOCK_CNT_LT(spin_locked_jnode, 2)));
20517+
20518+ spin_lock(&(node->guard));
20519+
20520+ LOCK_CNT_INC(spin_locked_jnode);
20521+ LOCK_CNT_INC(spin_locked);
20522+}
20523+
20524+static inline void spin_unlock_jnode(jnode *node)
20525+{
20526+ assert_spin_locked(&(node->guard));
20527+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20528+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20529+
20530+ LOCK_CNT_DEC(spin_locked_jnode);
20531+ LOCK_CNT_DEC(spin_locked);
20532+
20533+ spin_unlock(&(node->guard));
20534+}
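
The "natural memory order" rule from the locking comment earlier in this header could be realized as in the following sketch (the helper name is hypothetical; the real lock_two_nodes() mentioned in that comment lives elsewhere in the reiser4 sources):

	static void spin_lock_jnode_pair(jnode * a, jnode * b)
	{
		/* always take the lock at the lower address first,
		 * so concurrent lockers cannot deadlock ABBA-style */
		if (a < b) {
			spin_lock_jnode(a);
			spin_lock_jnode(b);
		} else {
			spin_lock_jnode(b);
			spin_lock_jnode(a);
		}
	}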
20535+
20536+static inline int jnode_is_in_deleteset(const jnode * node)
20537+{
20538+ return JF_ISSET(node, JNODE_RELOC);
20539+}
20540+
20541+extern int init_jnodes(void);
20542+extern void done_jnodes(void);
20543+
20544+/* Jnode routines */
20545+extern jnode *jalloc(void);
20546+extern void jfree(jnode * node) NONNULL;
20547+extern jnode *jclone(jnode *);
20548+extern jnode *jlookup(reiser4_tree * tree,
20549+ oid_t objectid, unsigned long ind) NONNULL;
20550+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20551+extern jnode *jnode_by_page(struct page *pg) NONNULL;
20552+extern jnode *jnode_of_page(struct page *pg) NONNULL;
20553+void jnode_attach_page(jnode * node, struct page *pg);
20554+
20555+void unhash_unformatted_jnode(jnode *);
20556+extern jnode *page_next_jnode(jnode * node) NONNULL;
20557+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20558+extern void jnode_make_dirty(jnode * node) NONNULL;
20559+extern void jnode_make_clean(jnode * node) NONNULL;
20560+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20561+extern void jnode_make_wander(jnode *) NONNULL;
20562+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20563+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20564+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20565+
20566+/**
20567+ * jnode_get_block
20568+ * @node: jnode to query
20569+ *
20570+ */
20571+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20572+{
20573+ assert("nikita-528", node != NULL);
20574+
20575+ return &node->blocknr;
20576+}
20577+
20578+/**
20579+ * jnode_set_block
20580+ * @node: jnode to update
20581+ * @blocknr: new block nr
20582+ */
20583+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20584+{
20585+ assert("nikita-2020", node != NULL);
20586+ assert("umka-055", blocknr != NULL);
20587+ node->blocknr = *blocknr;
20588+}
20589+
20590+
20591+/* block number for IO. Usually this is the same as jnode_get_block(), unless
20592+ * jnode was emergency flushed---then block number chosen by eflush is
20593+ * used. */
20594+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20595+{
20596+ assert("nikita-2768", node != NULL);
20597+ assert_spin_locked(&(node->guard));
20598+
20599+ return jnode_get_block(node);
20600+}
20601+
20602+/* Jnode flush interface. */
20603+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20604+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20605+
20606+/* FIXME-VS: these are used in plugin/item/extent.c */
20607+
20608+/* does extent_get_block have to be called */
20609+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20610+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20611+
20612+/* the node should be converted during flush squalloc phase */
20613+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20614+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20615+
20616+/* Macros to convert from jnode to znode, znode to jnode. These are macros
20617+ because C doesn't allow overloading of const prototypes. */
20618+#define ZJNODE(x) (& (x) -> zjnode)
20619+#define JZNODE(x) \
20620+({ \
20621+ typeof (x) __tmp_x; \
20622+ \
20623+ __tmp_x = (x); \
20624+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20625+ (znode*) __tmp_x; \
20626+})
20627+
20628+extern int jnodes_tree_init(reiser4_tree * tree);
20629+extern int jnodes_tree_done(reiser4_tree * tree);
20630+
20631+#if REISER4_DEBUG
20632+
20633+extern int znode_is_any_locked(const znode * node);
20634+extern void jnode_list_remove(jnode * node);
20635+
20636+#else
20637+
20638+#define jnode_list_remove(node) noop
20639+
20640+#endif
20641+
20642+int znode_is_root(const znode * node) NONNULL;
20643+
20644+/* bump reference counter on @node */
20645+static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20646+{
20647+ assert("nikita-1911", node != NULL);
20648+
20649+ atomic_inc(&node->x_count);
20650+ LOCK_CNT_INC(x_refs);
20651+}
20652+
20653+static inline void dec_x_ref(jnode * node)
20654+{
20655+ assert("nikita-3215", node != NULL);
20656+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
20657+
20658+ atomic_dec(&node->x_count);
20659+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20660+ LOCK_CNT_DEC(x_refs);
20661+}
20662+
20663+/* jref() - increase counter of references to jnode/znode (x_count) */
20664+static inline jnode *jref(jnode * node)
20665+{
20666+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20667+ add_x_ref(node);
20668+ return node;
20669+}
20670+
20671+/* get the page of jnode */
20672+static inline struct page *jnode_page(const jnode * node)
20673+{
20674+ return node->pg;
20675+}
20676+
20677+/* return pointer to jnode data */
20678+static inline char *jdata(const jnode * node)
20679+{
20680+ assert("nikita-1415", node != NULL);
20681+ assert("nikita-3198", jnode_page(node) != NULL);
20682+ return node->data;
20683+}
20684+
20685+static inline int jnode_is_loaded(const jnode * node)
20686+{
20687+ assert("zam-506", node != NULL);
20688+ return atomic_read(&node->d_count) > 0;
20689+}
20690+
20691+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20692+
20693+static inline void jnode_set_reloc(jnode * node)
20694+{
20695+ assert("nikita-2431", node != NULL);
20696+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20697+ JF_SET(node, JNODE_RELOC);
20698+}
20699+
20700+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20701+
20702+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20703+
20704+static inline int jload(jnode *node)
20705+{
20706+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20707+}
20708+
20709+extern int jinit_new(jnode *, gfp_t) NONNULL;
20710+extern int jstartio(jnode *) NONNULL;
20711+
20712+extern void jdrop(jnode *) NONNULL;
20713+extern int jwait_io(jnode *, int rw) NONNULL;
20714+
20715+void jload_prefetch(jnode *);
20716+
20717+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20718+extern void reiser4_drop_io_head(jnode * node) NONNULL;
20719+
20720+static inline reiser4_tree *jnode_get_tree(const jnode * node)
20721+{
20722+ assert("nikita-2691", node != NULL);
20723+ return node->tree;
20724+}
20725+
20726+extern void pin_jnode_data(jnode *);
20727+extern void unpin_jnode_data(jnode *);
20728+
20729+static inline jnode_type jnode_get_type(const jnode * node)
20730+{
20731+ static const unsigned long state_mask =
20732+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20733+
20734+ static jnode_type mask_to_type[] = {
20735+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20736+
20737+ /* 000 */
20738+ [0] = JNODE_FORMATTED_BLOCK,
20739+ /* 001 */
20740+ [1] = JNODE_UNFORMATTED_BLOCK,
20741+ /* 010 */
20742+ [2] = JNODE_BITMAP,
20743+ /* 011 */
20744+ [3] = LAST_JNODE_TYPE, /*invalid */
20745+ /* 100 */
20746+ [4] = JNODE_INODE,
20747+ /* 101 */
20748+ [5] = LAST_JNODE_TYPE,
20749+ /* 110 */
20750+ [6] = JNODE_IO_HEAD,
20751+ /* 111 */
20752+ [7] = LAST_JNODE_TYPE, /* invalid */
20753+ };
20754+
20755+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20756+}
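
Reading the table above: a bitmap jnode, for example, has only the JNODE_TYPE_2 bit set, so (state & state_mask) >> JNODE_TYPE_1 yields 2 and mask_to_type[2] == JNODE_BITMAP; a znode leaves all three type bits clear and decodes through mask_to_type[0] to JNODE_FORMATTED_BLOCK.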
20757+
20758+/* returns true if node is a znode */
20759+static inline int jnode_is_znode(const jnode * node)
20760+{
20761+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20762+}
20763+
20764+static inline int jnode_is_flushprepped(jnode * node)
20765+{
20766+ assert("jmacd-78212", node != NULL);
20767+ assert_spin_locked(&(node->guard));
20768+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20769+ JF_ISSET(node, JNODE_OVRWR);
20770+}
20771+
20772+/* Return true if @node has already been processed by the squeeze and allocate
20773+ process. This implies the block address has been finalized for the
20774+ duration of this atom (or it is clean and will remain in place). If this
20775+ returns true you may use the block number as a hint. */
20776+static inline int jnode_check_flushprepped(jnode * node)
20777+{
20778+ int result;
20779+
20780+ /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20781+ spin_lock_jnode(node);
20782+ result = jnode_is_flushprepped(node);
20783+ spin_unlock_jnode(node);
20784+ return result;
20785+}
20786+
20787+/* returns true if node is unformatted */
20788+static inline int jnode_is_unformatted(const jnode * node)
20789+{
20790+ assert("jmacd-0123", node != NULL);
20791+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20792+}
20793+
20794+/* returns true if node represents a cluster cache page */
20795+static inline int jnode_is_cluster_page(const jnode * node)
20796+{
20797+ assert("edward-50", node != NULL);
20798+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20799+}
20800+
20801+/* returns true if node is an inode's built-in jnode */
20802+static inline int jnode_is_inode(const jnode * node)
20803+{
20804+ assert("vs-1240", node != NULL);
20805+ return jnode_get_type(node) == JNODE_INODE;
20806+}
20807+
20808+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20809+{
20810+ assert("nikita-2367", type < LAST_JNODE_TYPE);
20811+ return jnode_plugin_by_id((reiser4_plugin_id) type);
20812+}
20813+
20814+static inline jnode_plugin *jnode_ops(const jnode * node)
20815+{
20816+ assert("nikita-2366", node != NULL);
20817+
20818+ return jnode_ops_of(jnode_get_type(node));
20819+}
20820+
20821+/* Get the index of a block. */
20822+static inline unsigned long jnode_get_index(jnode * node)
20823+{
20824+ return jnode_ops(node)->index(node);
20825+}
20826+
20827+/* return true if "node" is the root */
20828+static inline int jnode_is_root(const jnode * node)
20829+{
20830+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20831+}
20832+
20833+extern struct address_space *mapping_jnode(const jnode * node);
20834+extern unsigned long index_jnode(const jnode * node);
20835+
20836+static inline void jput(jnode * node);
20837+extern void jput_final(jnode * node);
20838+
20839+/* bump data counter on @node */
20840+static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20841+{
20842+ assert("nikita-1962", node != NULL);
20843+
20844+ atomic_inc(&node->d_count);
20845+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
20846+ LOCK_CNT_INC(d_refs);
20847+}
20848+
20849+/* jput() - decrement x_count reference counter on jnode/znode.
20850+
20851+ Count may drop to 0, jnode stays in cache until memory pressure causes the
20852+ eviction of its page. The c_count variable also ensures that children are
20853+ pressured out of memory before the parent. The jnode remains hashed as
20854+ long as the VM allows its page to stay in memory.
20855+*/
20856+static inline void jput(jnode * node)
20857+{
20858+ assert("jmacd-509", node != NULL);
20859+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
20860+ assert("zam-926", reiser4_schedulable());
20861+ LOCK_CNT_DEC(x_refs);
20862+
20863+ rcu_read_lock();
20864+ /*
20865+ * we don't need any kind of lock here--jput_final() uses RCU.
20866+ */
20867+ if (unlikely(atomic_dec_and_test(&node->x_count))) {
20868+ jput_final(node);
20869+ } else
20870+ rcu_read_unlock();
20871+ assert("nikita-3473", reiser4_schedulable());
20872+}
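
A short sketch of temporary pinning with jref()/jput() (the helper is hypothetical; per the comment on jref() above, the caller must already hold a reference on @node):

	jnode *pinned = jref(node);

	process_node(pinned);	/* hypothetical helper */
	jput(pinned);		/* may end in jput_final() under RCU */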
20873+
20874+extern void jrelse(jnode * node);
20875+extern void jrelse_tail(jnode * node);
20876+
20877+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20878+
20879+/* resolve race with jput */
20880+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20881+{
20882+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
20883+ node = jnode_rip_sync(tree, node);
20884+ return node;
20885+}
20886+
20887+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20888+
20889+#if REISER4_DEBUG
20890+extern int jnode_invariant_f(const jnode *node, char const **msg);
20891+#endif
20892+
20893+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20894+
20895+/* __JNODE_H__ */
20896+#endif
20897+
20898+/* Make Linus happy.
20899+ Local variables:
20900+ c-indentation-style: "K&R"
20901+ mode-name: "LC"
20902+ c-basic-offset: 8
20903+ tab-width: 8
20904+ fill-column: 120
20905+ End:
20906+*/
20907diff -urN linux-2.6.20.orig/fs/reiser4/kassign.c linux-2.6.20/fs/reiser4/kassign.c
20908--- linux-2.6.20.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
20909+++ linux-2.6.20/fs/reiser4/kassign.c 2007-05-06 14:50:43.734986973 +0400
20910@@ -0,0 +1,661 @@
20911+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20912+ * reiser4/README */
20913+
20914+/* Key assignment policy implementation */
20915+
20916+/*
20917+ * In reiser4 every piece of file system data and meta-data has a key. Keys
20918+ * are used to store information in and retrieve it from reiser4 internal
20919+ * tree. In addition to this, keys define _ordering_ of all file system
20920+ * information: things having close keys are placed into the same or
20921+ * neighboring (in the tree order) nodes of the tree. As our block allocator
20922+ * tries to respect tree order (see flush.c), keys also define order in which
20923+ * things are laid out on the disk, and hence, affect performance directly.
20924+ *
20925+ * Obviously, assignment of keys to data and meta-data should be consistent
20926+ * across the whole file system. The algorithm that calculates a key for a
20927+ * given piece of data or meta-data is referred to as "key assignment".
20928+ *
20929+ * Key assignment is too expensive to be implemented as a plugin (that is,
20930+ * with an ability to support different key assignment schemas in the same
20931+ * compiled kernel image). As a compromise, all key-assignment functions and
20932+ * data-structures are collected in this single file, so that modifications to
20933+ * key assignment algorithm can be localized. Additional changes may be
20934+ * required in key.[ch].
20935+ *
20936+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
20937+ * may guess, there is "Plan B" too.
20938+ *
20939+ */
20940+
20941+/*
20942+ * An additional complication in the key assignment implementation is the
20943+ * requirement to support different key lengths.
20944+ */
20945+
20946+/*
20947+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
20948+ *
20949+ * DIRECTORY ITEMS
20950+ *
20951+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
20952+ * +--------------+---+---+-+-------------+------------------+-----------------+
20953+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
20954+ * +--------------+---+---+-+-------------+------------------+-----------------+
20955+ * | | | | |
20956+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
20957+ *
20958+ * dirid objectid of directory this item is for
20959+ *
20960+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
20961+ *
20962+ * H 1 if last 8 bytes of the key contain hash,
20963+ * 0 if last 8 bytes of the key contain prefix-3
20964+ *
20965+ * prefix-1 first 7 characters of file name.
20966+ * Padded by zeroes if name is not long enough.
20967+ *
20968+ * prefix-2 next 8 characters of the file name.
20969+ *
20970+ * prefix-3 next 8 characters of the file name.
20971+ *
20972+ * hash hash of the rest of file name (i.e., portion of file
20973+ * name not included into prefix-1 and prefix-2).
20974+ *
20975+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
20976+ * in the key. Such file names are called "short". They are distinguished by
20977+ * the H bit set to 0 in the key.
20978+ *
20979+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
20980+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
20981+ * key. Last 8 bytes of the key are occupied by hash of the remaining
20982+ * characters of the name.
20983+ *
20984+ * This key assignment reaches following important goals:
20985+ *
20986+ * (1) directory entries are sorted in approximately lexicographical
20987+ * order.
20988+ *
20989+ * (2) collisions (when multiple directory items have the same key), while
20990+ * principally unavoidable in a tree with fixed length keys, are rare.
20991+ *
20992+ * STAT DATA
20993+ *
20994+ * | 60 | 4 | 64 | 4 | 60 | 64 |
20995+ * +--------------+---+-----------------+---+--------------+-----------------+
20996+ * | locality id | 1 | ordering | 0 | objectid | 0 |
20997+ * +--------------+---+-----------------+---+--------------+-----------------+
20998+ * | | | | |
20999+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21000+ *
21001+ * locality id object id of a directory where first name was created for
21002+ * the object
21003+ *
21004+ * ordering copy of second 8-byte portion of the key of directory
21005+ * entry for the first name of this object. Ordering has a form
21006+ * {
21007+ * fibration :7;
21008+ * h :1;
21009+ * prefix1 :56;
21010+ * }
21011+ * see description of key for directory entry above.
21012+ *
21013+ * objectid object id for this object
21014+ *
21015+ * This key assignment policy is designed to keep stat-data in the same order
21016+ * as corresponding directory items, thus speeding up readdir/stat types of
21017+ * workload.
21018+ *
21019+ * FILE BODY
21020+ *
21021+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21022+ * +--------------+---+-----------------+---+--------------+-----------------+
21023+ * | locality id | 4 | ordering | 0 | objectid | offset |
21024+ * +--------------+---+-----------------+---+--------------+-----------------+
21025+ * | | | | |
21026+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21027+ *
21028+ * locality id object id of a directory where first name was created for
21029+ * the object
21030+ *
21031+ * ordering the same as in the key of stat-data for this object
21032+ *
21033+ * objectid object id for this object
21034+ *
21035+ * offset logical offset from the beginning of this file.
21036+ * Measured in bytes.
21037+ *
21038+ *
21039+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21040+ *
21041+ * DIRECTORY ITEMS
21042+ *
21043+ * | 60 | 4 | 7 |1| 56 | 64 |
21044+ * +--------------+---+---+-+-------------+-----------------+
21045+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21046+ * +--------------+---+---+-+-------------+-----------------+
21047+ * | | | |
21048+ * | 8 bytes | 8 bytes | 8 bytes |
21049+ *
21050+ * dirid objectid of directory this item is for
21051+ *
21052+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21053+ *
21054+ * H 1 if last 8 bytes of the key contain hash,
21055+ * 0 if last 8 bytes of the key contain prefix-2
21056+ *
21057+ * prefix-1 first 7 characters of file name.
21058+ * Padded by zeroes if name is not long enough.
21059+ *
21060+ * prefix-2 next 8 characters of the file name.
21061+ *
21062+ * hash hash of the rest of file name (i.e., portion of file
21063+ * name not included into prefix-1).
21064+ *
21065+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in
21066+ * the key. Such file names are called "short". They are distinguished by the
21067+ * H bit set to 0 in the key.
21068+ *
21069+ * Other file names are "long". For a long name, the H bit is 1, and the first 7
21070+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21071+ * key are occupied by hash of the remaining characters of the name.
21072+ *
21073+ * STAT DATA
21074+ *
21075+ * | 60 | 4 | 4 | 60 | 64 |
21076+ * +--------------+---+---+--------------+-----------------+
21077+ * | locality id | 1 | 0 | objectid | 0 |
21078+ * +--------------+---+---+--------------+-----------------+
21079+ * | | | |
21080+ * | 8 bytes | 8 bytes | 8 bytes |
21081+ *
21082+ * locality id object id of a directory where first name was created for
21083+ * the object
21084+ *
21085+ * objectid object id for this object
21086+ *
21087+ * FILE BODY
21088+ *
21089+ * | 60 | 4 | 4 | 60 | 64 |
21090+ * +--------------+---+---+--------------+-----------------+
21091+ * | locality id | 4 | 0 | objectid | offset |
21092+ * +--------------+---+---+--------------+-----------------+
21093+ * | | | |
21094+ * | 8 bytes | 8 bytes | 8 bytes |
21095+ *
21096+ * locality id object id of a directory where first name was created for
21097+ * the object
21098+ *
21099+ * objectid object id for this object
21100+ *
21101+ * offset logical offset from the beginning of this file.
21102+ * Measured in bytes.
21103+ *
21104+ *
21105+ */
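
Two hand-worked examples of the Plan A large-key directory layout above (the file names are hypothetical; field names follow complete_entry_key() below):

	/* "linuxkernel" (11 chars, <= 23, so short, H = 0):
	 *	prefix-1 = "linuxke" -> ordering (below the F/H bits),
	 *	prefix-2 = "rnel" zero-padded -> objectid, prefix-3 = 0 -> offset.
	 *
	 * "a_rather_long_file_name.txt" (27 chars, > 23, so long, H = 1):
	 *	prefix-1 = "a_rathe" -> ordering, prefix-2 = "r_long_f" -> objectid,
	 *	offset = hash("ile_name.txt"), the hash of the remaining tail.
	 */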
21106+
21107+#include "debug.h"
21108+#include "key.h"
21109+#include "kassign.h"
21110+#include "vfs_ops.h"
21111+#include "inode.h"
21112+#include "super.h"
21113+#include "dscale.h"
21114+
21115+#include <linux/types.h> /* for __u?? */
21116+#include <linux/fs.h> /* for struct super_block, etc */
21117+
21118+/* bitmask for H bit (see comment at the beginning of this file) */
21119+static const __u64 longname_mark = 0x0100000000000000ull;
21120+/* bitmask for F and H portions of the key. */
21121+static const __u64 fibration_mask = 0xff00000000000000ull;
21122+
21123+/* return true if name is not completely encoded in @key */
21124+int is_longname_key(const reiser4_key * key)
21125+{
21126+ __u64 highpart;
21127+
21128+ assert("nikita-2863", key != NULL);
21129+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21130+ reiser4_print_key("oops", key);
21131+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21132+
21133+ if (REISER4_LARGE_KEY)
21134+ highpart = get_key_ordering(key);
21135+ else
21136+ highpart = get_key_objectid(key);
21137+
21138+ return (highpart & longname_mark) ? 1 : 0;
21139+}
21140+
21141+/* return true if @name is too long to be completely encoded in the key */
21142+int is_longname(const char *name UNUSED_ARG, int len)
21143+{
21144+ if (REISER4_LARGE_KEY)
21145+ return len > 23;
21146+ else
21147+ return len > 15;
21148+}
21149+
21150+/* encode ascii string into __u64.
21151+
21152+ Put characters of @name into result (@str) one after another starting
21153+ from @start_idx-th highest (arithmetically) byte. This produces
21154+ endian-safe encoding. memcpy(2) will not do.
21155+
21156+*/
21157+static __u64 pack_string(const char *name /* string to encode */ ,
21158+ int start_idx /* highest byte in result from
21159+ * which to start encoding */ )
21160+{
21161+ unsigned i;
21162+ __u64 str;
21163+
21164+ str = 0;
21165+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21166+ str <<= 8;
21167+ str |= (unsigned char)name[i];
21168+ }
21169+ str <<= (sizeof str - i - start_idx) << 3;
21170+ return str;
21171+}
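
Two hand-computed values for the routine above (worked from the loop and final shift; illustrative only):

	/*
	 * pack_string("abc", 1) == 0x0061626300000000ull:
	 *	'a' 'b' 'c' occupy bytes 6..4; the top byte stays zero,
	 *	leaving room for the F/H bits of an ordering field.
	 * pack_string("abc", 0) == 0x6162630000000000ull:
	 *	'a' 'b' 'c' occupy bytes 7..5 (the highest bytes).
	 */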
21172+
21173+/* opposite to pack_string(). Takes value produced by pack_string(), restores
21174+ * string encoded in it and stores result in @buf */
21175+char * reiser4_unpack_string(__u64 value, char *buf)
21176+{
21177+ do {
21178+ *buf = value >> (64 - 8);
21179+ if (*buf)
21180+ ++buf;
21181+ value <<= 8;
21182+ } while (value != 0);
21183+ *buf = 0;
21184+ return buf;
21185+}
21186+
21187+/* obtain name encoded in @key and store it in @buf */
21188+char *extract_name_from_key(const reiser4_key * key, char *buf)
21189+{
21190+ char *c;
21191+
21192+ assert("nikita-2868", !is_longname_key(key));
21193+
21194+ c = buf;
21195+ if (REISER4_LARGE_KEY) {
21196+ c = reiser4_unpack_string(get_key_ordering(key) &
21197+ ~fibration_mask, c);
21198+ c = reiser4_unpack_string(get_key_fulloid(key), c);
21199+ } else
21200+ c = reiser4_unpack_string(get_key_fulloid(key) &
21201+ ~fibration_mask, c);
21202+ reiser4_unpack_string(get_key_offset(key), c);
21203+ return buf;
21204+}
21205+
21206+/**
21207+ * complete_entry_key - calculate entry key by name
21208+ * @dir: directory where entry is (or will be) in
21209+ * @name: name to calculate key of
21210+ * @len: length of name
21211+ * @result: place to store result in
21212+ *
21213+ * Sets fields of entry key @result which depend on file name.
21214+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21215+ * objectid and offset. Otherwise, objectid and offset are set.
21216+ */
21217+void complete_entry_key(const struct inode *dir, const char *name,
21218+ int len, reiser4_key *result)
21219+{
21220+#if REISER4_LARGE_KEY
21221+ __u64 ordering;
21222+ __u64 objectid;
21223+ __u64 offset;
21224+
21225+ assert("nikita-1139", dir != NULL);
21226+ assert("nikita-1142", result != NULL);
21227+ assert("nikita-2867", strlen(name) == len);
21228+
21229+ /*
21230+ * key allocation algorithm for directory entries in case of large
21231+ * keys:
21232+ *
21233+	 * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21234+	 * characters into ordering field of key, next 8 characters (if any)
21235+	 * into objectid field of key and next 8 ones (if any) into offset
21236+	 * field of key.
21237+ *
21238+ * If file name is longer than 23 characters, put first 7 characters
21239+ * into key's ordering, next 8 to objectid and hash of remaining
21240+ * characters into offset field.
21241+ *
21242+	 * To distinguish the above cases, the latter sets an otherwise unused
21243+	 * high bit in the ordering field.
21244+ */
21245+
21246+ /* [0-6] characters to ordering */
21247+ ordering = pack_string(name, 1);
21248+ if (len > 7) {
21249+ /* [7-14] characters to objectid */
21250+ objectid = pack_string(name + 7, 0);
21251+ if (len > 15) {
21252+ if (len <= 23) {
21253+ /* [15-23] characters to offset */
21254+ offset = pack_string(name + 15, 0);
21255+ } else {
21256+ /* note in a key the fact that offset contains hash. */
21257+ ordering |= longname_mark;
21258+
21259+ /* offset is the hash of the file name's tail. */
21260+ offset = inode_hash_plugin(dir)->hash(name + 15,
21261+ len - 15);
21262+ }
21263+ } else {
21264+ offset = 0ull;
21265+ }
21266+ } else {
21267+ objectid = 0ull;
21268+ offset = 0ull;
21269+ }
21270+
21271+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21272+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21273+
21274+ set_key_ordering(result, ordering);
21275+ set_key_fulloid(result, objectid);
21276+ set_key_offset(result, offset);
21277+ return;
21278+
21279+#else
21280+ __u64 objectid;
21281+ __u64 offset;
21282+
21283+ assert("nikita-1139", dir != NULL);
21284+ assert("nikita-1142", result != NULL);
21285+ assert("nikita-2867", strlen(name) == len);
21286+
21287+ /*
21288+ * key allocation algorithm for directory entries in case of not large
21289+ * keys:
21290+ *
21291+	 * If name is not longer than 7 + 8 = 15 characters, put first 7
21292+	 * characters into objectid field of key, next 8 characters (if any)
21293+	 * into offset field of key.
21294+ *
21295+ * If file name is longer than 15 characters, put first 7 characters
21296+ * into key's objectid, and hash of remaining characters into offset
21297+ * field.
21298+ *
21299+	 * To distinguish the above cases, the latter sets an otherwise unused
21300+	 * high bit in the objectid field.
21301+ */
21302+
21303+ /* [0-6] characters to objectid */
21304+ objectid = pack_string(name, 1);
21305+ if (len > 7) {
21306+ if (len <= 15) {
21307+ /* [7-14] characters to offset */
21308+ offset = pack_string(name + 7, 0);
21309+ } else {
21310+ /* note in a key the fact that offset contains hash. */
21311+ objectid |= longname_mark;
21312+
21313+ /* offset is the hash of the file name. */
21314+ offset = inode_hash_plugin(dir)->hash(name + 7,
21315+ len - 7);
21316+ }
21317+ } else
21318+ offset = 0ull;
21319+
21320+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21321+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21322+
21323+ set_key_fulloid(result, objectid);
21324+ set_key_offset(result, offset);
21325+ return;
21326+#endif /* ! REISER4_LARGE_KEY */
21327+}
21328+
21329+/* true, if @key is the key of "." */
21330+int is_dot_key(const reiser4_key * key /* key to check */ )
21331+{
21332+ assert("nikita-1717", key != NULL);
21333+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21334+ return
21335+ (get_key_ordering(key) == 0ull) &&
21336+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21337+}
21338+
21339+/* build key for stat-data.
21340+
21341+   return key of stat-data of this object. This should become an sd plugin
21342+   method in the future. For now, let it be here.
21343+
21344+*/
21345+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21346+ reiser4_key * result /* resulting key of @target
21347+ stat-data */ )
21348+{
21349+ assert("nikita-261", result != NULL);
21350+
21351+ reiser4_key_init(result);
21352+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
21353+ set_key_ordering(result, get_inode_ordering(target));
21354+ set_key_objectid(result, get_inode_oid(target));
21355+ set_key_type(result, KEY_SD_MINOR);
21356+ set_key_offset(result, (__u64) 0);
21357+ return result;
21358+}
21359+
21360+/* encode part of key into &obj_key_id
21361+
21362+ This encodes into @id part of @key sufficient to restore @key later,
21363+ given that latter is key of object (key of stat-data).
21364+
21365+ See &obj_key_id
21366+*/
21367+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21368+ obj_key_id * id /* id where key is encoded in */ )
21369+{
21370+ assert("nikita-1151", key != NULL);
21371+ assert("nikita-1152", id != NULL);
21372+
21373+ memcpy(id, key, sizeof *id);
21374+ return 0;
21375+}
21376+
21377+/* encode reference to @obj in @id.
21378+
21379+ This is like build_obj_key_id() above, but takes inode as parameter. */
21380+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21381+ obj_key_id * id /* result */ )
21382+{
21383+ reiser4_key sdkey;
21384+
21385+ assert("nikita-1166", obj != NULL);
21386+ assert("nikita-1167", id != NULL);
21387+
21388+ build_sd_key(obj, &sdkey);
21389+ build_obj_key_id(&sdkey, id);
21390+ return 0;
21391+}
21392+
21393+/* decode @id back into @key
21394+
21395+ Restore key of object stat-data from @id. This is dual to
21396+ build_obj_key_id() above.
21397+*/
21398+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21399+ * from */ ,
21400+ reiser4_key * key /* result */ )
21401+{
21402+ assert("nikita-1153", id != NULL);
21403+ assert("nikita-1154", key != NULL);
21404+
21405+ reiser4_key_init(key);
21406+ memcpy(key, id, sizeof *id);
21407+ return 0;
21408+}
21409+
21410+/* extract objectid of directory from key of directory entry within said
21411+ directory.
21412+ */
21413+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21414+ * directory
21415+ * entry */ )
21416+{
21417+ assert("nikita-1314", de_key != NULL);
21418+ return get_key_locality(de_key);
21419+}
21420+
21421+/* encode into @id key of directory entry.
21422+
21423+ Encode into @id information sufficient to later distinguish directory
21424+ entries within the same directory. This is not whole key, because all
21425+ directory entries within directory item share locality which is equal
21426+ to objectid of their directory.
21427+
21428+*/
21429+int build_de_id(const struct inode *dir /* inode of directory */ ,
21430+ const struct qstr *name /* name to be given to @obj by
21431+ * directory entry being
21432+ * constructed */ ,
21433+ de_id * id /* short key of directory entry */ )
21434+{
21435+ reiser4_key key;
21436+
21437+ assert("nikita-1290", dir != NULL);
21438+ assert("nikita-1292", id != NULL);
21439+
21440+ /* NOTE-NIKITA this is suboptimal. */
21441+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21442+ return build_de_id_by_key(&key, id);
21443+}
21444+
21445+/* encode into @id key of directory entry.
21446+
21447+ Encode into @id information sufficient to later distinguish directory
21448+ entries within the same directory. This is not whole key, because all
21449+ directory entries within directory item share locality which is equal
21450+ to objectid of their directory.
21451+
21452+*/
21453+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21454+ * entry */ ,
21455+ de_id * id /* short key of directory entry */ )
21456+{
21457+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21458+ return 0;
21459+}
21460+
21461+/* restore from @id key of directory entry.
21462+
21463+ Function dual to build_de_id(): given @id and locality, build full
21464+ key of directory entry within directory item.
21465+
21466+*/
21467+int extract_key_from_de_id(const oid_t locality /* locality of directory
21468+ * entry */ ,
21469+ const de_id * id /* directory entry id */ ,
21470+ reiser4_key * key /* result */ )
21471+{
21472+ /* no need to initialise key here: all fields are overwritten */
21473+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
21474+ set_key_locality(key, locality);
21475+ set_key_type(key, KEY_FILE_NAME_MINOR);
21476+ return 0;
21477+}
21478+
21479+/* compare two &de_id's */
21480+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21481+ const de_id * id2 /* second &de_id to compare */ )
21482+{
21483+ /* NOTE-NIKITA ugly implementation */
21484+ reiser4_key k1;
21485+ reiser4_key k2;
21486+
21487+ extract_key_from_de_id((oid_t) 0, id1, &k1);
21488+ extract_key_from_de_id((oid_t) 0, id2, &k2);
21489+ return keycmp(&k1, &k2);
21490+}
21491+
21492+/* compare &de_id with key */
21493+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21494+ const reiser4_key * key /* key to compare */ )
21495+{
21496+ cmp_t result;
21497+ reiser4_key *k1;
21498+
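+ /* Editorial note: de_id lays out the same bytes as reiser4_key
+ elements 1..N (see the struct definitions in kassign.h), so
+ stepping one key element back from @id yields a pointer whose
+ elements line up with a full key and can be compared with
+ KEY_DIFF_EL(). */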
21499+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21500+ result = KEY_DIFF_EL(k1, key, 1);
21501+ if (result == EQUAL_TO) {
21502+ result = KEY_DIFF_EL(k1, key, 2);
21503+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21504+ result = KEY_DIFF_EL(k1, key, 3);
21505+ }
21506+ }
21507+ return result;
21508+}
21509+
21510+/*
21511+ * return number of bytes necessary to encode @inode identity.
21512+ */
21513+int inode_onwire_size(const struct inode *inode)
21514+{
21515+ int result;
21516+
21517+ result = dscale_bytes(get_inode_oid(inode));
21518+ result += dscale_bytes(get_inode_locality(inode));
21519+
21520+ /*
21521+ * ordering is large (it usually has highest bits set), so it makes
21522+ * little sense to dscale it.
21523+ */
21524+ if (REISER4_LARGE_KEY)
21525+ result += sizeof(get_inode_ordering(inode));
21526+ return result;
21527+}
21528+
21529+/*
21530+ * encode @inode identity at @start
21531+ */
21532+char *build_inode_onwire(const struct inode *inode, char *start)
21533+{
21534+ start += dscale_write(start, get_inode_locality(inode));
21535+ start += dscale_write(start, get_inode_oid(inode));
21536+
21537+ if (REISER4_LARGE_KEY) {
21538+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21539+ start += sizeof(get_inode_ordering(inode));
21540+ }
21541+ return start;
21542+}
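+
+/* A minimal usage sketch (editorial illustration): the buffer handed to
+ build_inode_onwire() is expected to be sized with inode_onwire_size()
+ first, e.g.
+
+ char *area = kmalloc(inode_onwire_size(inode), GFP_KERNEL);
+
+ if (area != NULL)
+ build_inode_onwire(inode, area);
+
+ and is decoded later with extract_obj_key_id_from_onwire(). */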
21543+
21544+/*
21545+ * extract the object key id that was previously encoded by build_inode_onwire() at @addr
21546+ */
21547+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21548+{
21549+ __u64 val;
21550+
21551+ addr += dscale_read(addr, &val);
21552+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21553+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21554+ addr += dscale_read(addr, &val);
21555+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21556+#if REISER4_LARGE_KEY
21557+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21558+ addr += sizeof key_id->ordering;
21559+#endif
21560+ return addr;
21561+}
21562+
21563+/* Make Linus happy.
21564+ Local variables:
21565+ c-indentation-style: "K&R"
21566+ mode-name: "LC"
21567+ c-basic-offset: 8
21568+ tab-width: 8
21569+ fill-column: 120
21570+ End:
21571+*/
21572diff -urN linux-2.6.20.orig/fs/reiser4/kassign.h linux-2.6.20/fs/reiser4/kassign.h
21573--- linux-2.6.20.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21574+++ linux-2.6.20/fs/reiser4/kassign.h 2007-05-06 14:50:43.734986973 +0400
21575@@ -0,0 +1,110 @@
21576+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21577+ * reiser4/README */
21578+
21579+/* Key assignment policy interface. See kassign.c for details. */
21580+
21581+#if !defined( __KASSIGN_H__ )
21582+#define __KASSIGN_H__
21583+
21584+#include "forward.h"
21585+#include "key.h"
21586+#include "dformat.h"
21587+
21588+#include <linux/types.h> /* for __u?? */
21589+#include <linux/fs.h> /* for struct super_block, etc */
21590+#include <linux/dcache.h> /* for struct qstr */
21591+
21592+/* key assignment functions */
21593+
21594+/* Information from which key of file stat-data can be uniquely
21595+ restored. This depends on key assignment policy for
21596+ stat-data. Currently it's enough to store object id and locality id
21597+ (60+60==120) bits, because minor packing locality and offset of
21598+ stat-data key are always known constants: KEY_SD_MINOR and 0
21599+ respectively. For simplicity 4 bits are wasted in each id, and just
21600+ two 64 bit integers are stored.
21601+
21602+ This field has to be byte-aligned, because we don't want to waste
21603+ space in directory entries. There is another side to this coin, of
21604+ course: we waste CPU and bus bandwidth instead, by copying data back
21605+ and forth.
21606+
21607+ Next optimization: &obj_key_id is mainly used to address stat data from
21608+ directory entries. Under the assumption that the majority of files have
21609+ only one name (one hard link) from *the* parent directory, it seems
21610+ reasonable to store only the objectid of the stat data and take its
21611+ locality from the key of the directory item.
21612+
21613+ This requires some flag to be added to the &obj_key_id to distinguish
21614+ between these two cases. The remaining bits in the flag byte could then
21615+ be used to store the file type.
21616+
21617+ This optimization requires changes in directory item handling code.
21618+
21619+*/
21620+typedef struct obj_key_id {
21621+ d8 locality[sizeof(__u64)];
21622+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21623+ )
21624+ d8 objectid[sizeof(__u64)];
21625+}
21626+obj_key_id;
21627+
21628+/* Information sufficient to uniquely identify directory entry within
21629+ compressed directory item.
21630+
21631+ For alignment issues see &obj_key_id above.
21632+*/
21633+typedef struct de_id {
21634+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21635+ d8 objectid[sizeof(__u64)];
21636+ d8 offset[sizeof(__u64)];
21637+}
21638+de_id;
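+
+/* Editorial note: the field order in both structures above deliberately
+ mirrors the element order of reiser4_key (obj_key_id starts at the
+ locality element, de_id at the element right after it), which is what
+ lets kassign.c encode and decode them with plain memcpy()s and compare
+ a de_id against a full key element by element. */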
21639+
21640+extern int inode_onwire_size(const struct inode *obj);
21641+extern char *build_inode_onwire(const struct inode *obj, char *area);
21642+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21643+
21644+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21645+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21646+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21647+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21648+extern int build_de_id(const struct inode *dir, const struct qstr *name,
21649+ de_id * id);
21650+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21651+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21652+ reiser4_key * key);
21653+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21654+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21655+
21656+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21657+extern void build_entry_key_common(const struct inode *dir,
21658+ const struct qstr *name,
21659+ reiser4_key * result);
21660+extern void build_entry_key_stable_entry(const struct inode *dir,
21661+ const struct qstr *name,
21662+ reiser4_key * result);
21663+extern int is_dot_key(const reiser4_key * key);
21664+extern reiser4_key *build_sd_key(const struct inode *target,
21665+ reiser4_key * result);
21666+
21667+extern int is_longname_key(const reiser4_key * key);
21668+extern int is_longname(const char *name, int len);
21669+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21670+extern char *reiser4_unpack_string(__u64 value, char *buf);
21671+extern void complete_entry_key(const struct inode *dir, const char *name,
21672+ int len, reiser4_key *result);
21673+
21674+/* __KASSIGN_H__ */
21675+#endif
21676+
21677+/* Make Linus happy.
21678+ Local variables:
21679+ c-indentation-style: "K&R"
21680+ mode-name: "LC"
21681+ c-basic-offset: 8
21682+ tab-width: 8
21683+ fill-column: 120
21684+ End:
21685+*/
21686diff -urN linux-2.6.20.orig/fs/reiser4/Kconfig linux-2.6.20/fs/reiser4/Kconfig
21687--- linux-2.6.20.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21688+++ linux-2.6.20/fs/reiser4/Kconfig 2007-05-06 14:50:43.734986973 +0400
21689@@ -0,0 +1,32 @@
21690+config REISER4_FS
21691+ tristate "Reiser4 (EXPERIMENTAL)"
21692+ depends on EXPERIMENTAL
21693+ select ZLIB_INFLATE
21694+ select ZLIB_DEFLATE
21695+ select CRYPTO
21696+ help
21697+ Reiser4 is a filesystem that performs all filesystem operations
21698+ as atomic transactions, which means that it either performs a
21699+ write, or it does not, and in the event of a crash it does not
21700+ partially perform it or corrupt it.
21701+
21702+ It stores files in dancing trees, which are like balanced trees but
21703+ faster. It packs small files together so that they share blocks
21704+ without wasting space. This means you can use it to store really
21705+ small files and that it saves you disk space. It avoids hassling
21706+ you with anachronisms like having a maximum number of inodes, and
21707+ wasting space if you use fewer than that number.
21708+
21709+ Reiser4 is a distinct filesystem type from reiserfs (V3).
21710+ It's therefore not possible to use reiserfs file systems
21711+ with reiser4.
21712+
21713+ To learn more about reiser4, go to http://www.namesys.com
21714+
21715+config REISER4_DEBUG
21716+ bool "Enable reiser4 debug mode"
21717+ depends on REISER4_FS
21718+ help
21719+ Don't use this unless you are debugging reiser4.
21720+
21721+ If unsure, say N.
21722diff -urN linux-2.6.20.orig/fs/reiser4/key.c linux-2.6.20/fs/reiser4/key.c
21723--- linux-2.6.20.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21724+++ linux-2.6.20/fs/reiser4/key.c 2007-05-06 14:50:43.734986973 +0400
21725@@ -0,0 +1,137 @@
21726+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21727+
21728+/* Key manipulations. */
21729+
21730+#include "debug.h"
21731+#include "key.h"
21732+#include "super.h"
21733+#include "reiser4.h"
21734+
21735+#include <linux/types.h> /* for __u?? */
21736+
21737+/* Minimal possible key: all components are zero. It is presumed that this is
21738+ independent of key scheme. */
21739+static const reiser4_key MINIMAL_KEY = {
21740+ .el = {
21741+ 0ull,
21742+ ON_LARGE_KEY(0ull,)
21743+ 0ull,
21744+ 0ull
21745+ }
21746+};
21747+
21748+/* Maximal possible key: all components are ~0. It is presumed that this is
21749+ independent of key scheme. */
21750+static const reiser4_key MAXIMAL_KEY = {
21751+ .el = {
21752+ __constant_cpu_to_le64(~0ull),
21753+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21754+ __constant_cpu_to_le64(~0ull),
21755+ __constant_cpu_to_le64(~0ull)
21756+ }
21757+};
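+
+/* Editorial note: both initializers above are byte-order independent,
+ since all-zero and all-one bit patterns read the same in any
+ endianness; the explicit le64 conversion on MAXIMAL_KEY is kept only
+ for clarity. */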
21758+
21759+/* Initialize key. */
21760+void reiser4_key_init(reiser4_key * key /* key to init */ )
21761+{
21762+ assert("nikita-1169", key != NULL);
21763+ memset(key, 0, sizeof *key);
21764+}
21765+
21766+/* minimal possible key in the tree. Return pointer to the static storage. */
21767+const reiser4_key *reiser4_min_key(void)
21768+{
21769+ return &MINIMAL_KEY;
21770+}
21771+
21772+/* maximum possible key in the tree. Return pointer to the static storage. */
21773+const reiser4_key *reiser4_max_key(void)
21774+{
21775+ return &MAXIMAL_KEY;
21776+}
21777+
21778+#if REISER4_DEBUG
21779+/* debugging aid: print symbolic name of key type */
21780+static const char *type_name(unsigned int key_type /* key type */ )
21781+{
21782+ switch (key_type) {
21783+ case KEY_FILE_NAME_MINOR:
21784+ return "file name";
21785+ case KEY_SD_MINOR:
21786+ return "stat data";
21787+ case KEY_ATTR_NAME_MINOR:
21788+ return "attr name";
21789+ case KEY_ATTR_BODY_MINOR:
21790+ return "attr body";
21791+ case KEY_BODY_MINOR:
21792+ return "file body";
21793+ default:
21794+ return "unknown";
21795+ }
21796+}
21797+
21798+/* debugging aid: print human readable information about key */
21799+void reiser4_print_key(const char *prefix /* prefix to print */ ,
21800+ const reiser4_key * key /* key to print */ )
21801+{
21802+ /* turn bold on */
21803+ /* printf ("\033[1m"); */
21804+ if (key == NULL)
21805+ printk("%s: null key\n", prefix);
21806+ else {
21807+ if (REISER4_LARGE_KEY)
21808+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21809+ get_key_locality(key),
21810+ get_key_type(key),
21811+ get_key_ordering(key),
21812+ get_key_band(key),
21813+ get_key_objectid(key), get_key_offset(key));
21814+ else
21815+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21816+ get_key_locality(key),
21817+ get_key_type(key),
21818+ get_key_band(key),
21819+ get_key_objectid(key), get_key_offset(key));
21820+ /*
21821+ * if this is a key of directory entry, try to decode part of
21822+ * a name stored in the key, and output it.
21823+ */
21824+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21825+ char buf[DE_NAME_BUF_LEN];
21826+ char *c;
21827+
21828+ c = buf;
21829+ c = reiser4_unpack_string(get_key_ordering(key), c);
21830+ reiser4_unpack_string(get_key_fulloid(key), c);
21831+ printk("[%s", buf);
21832+ if (is_longname_key(key))
21833+ /*
21834+ * only part of the name is stored in the key.
21835+ */
21836+ printk("...]\n");
21837+ else {
21838+ /*
21839+ * whole name is stored in the key.
21840+ */
21841+ reiser4_unpack_string(get_key_offset(key), buf);
21842+ printk("%s]\n", buf);
21843+ }
21844+ } else {
21845+ printk("[%s]\n", type_name(get_key_type(key)));
21846+ }
21847+ }
21848+ /* turn bold off */
21849+ /* printf ("\033[m\017"); */
21850+}
21851+
21852+#endif
21853+
21854+/* Make Linus happy.
21855+ Local variables:
21856+ c-indentation-style: "K&R"
21857+ mode-name: "LC"
21858+ c-basic-offset: 8
21859+ tab-width: 8
21860+ fill-column: 120
21861+ End:
21862+*/
21863diff -urN linux-2.6.20.orig/fs/reiser4/key.h linux-2.6.20/fs/reiser4/key.h
21864--- linux-2.6.20.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21865+++ linux-2.6.20/fs/reiser4/key.h 2007-05-06 14:50:43.738988223 +0400
21866@@ -0,0 +1,384 @@
21867+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21868+
21869+/* Declarations of key-related data-structures and operations on keys. */
21870+
21871+#if !defined( __REISER4_KEY_H__ )
21872+#define __REISER4_KEY_H__
21873+
21874+#include "dformat.h"
21875+#include "forward.h"
21876+#include "debug.h"
21877+
21878+#include <linux/types.h> /* for __u?? */
21879+
21880+/* Operations on keys in reiser4 tree */
21881+
21882+/* No access to any of these fields shall be done except via a
21883+ wrapping macro/function, and that wrapping macro/function shall
21884+ convert to little endian order. Key comparisons are done in cpu byte order. */
21885+
21886+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21887+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21888+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
21889+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21890+ right one. */
21891+
21892+/* possible values for minor packing locality (4 bits required) */
21893+typedef enum {
21894+ /* file name */
21895+ KEY_FILE_NAME_MINOR = 0,
21896+ /* stat-data */
21897+ KEY_SD_MINOR = 1,
21898+ /* file attribute name */
21899+ KEY_ATTR_NAME_MINOR = 2,
21900+ /* file attribute value */
21901+ KEY_ATTR_BODY_MINOR = 3,
21902+ /* file body (tail or extent) */
21903+ KEY_BODY_MINOR = 4,
21904+} key_minor_locality;
21905+
21906+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21907+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21908+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
21909+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21910+ block_alloc.c to check the node type when deciding where to allocate the node.
21911+
21912+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
21913+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
21914+ current implementation tails have a different minor packing locality from extents, and no files have both extents and
21915+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
21916+*/
21917+
21918+/* Arbitrary major packing localities can be assigned to objects using
21919+ the reiser4(filenameA/..packing<=some_number) system call.
21920+
21921+ In reiser4, the creat() syscall creates a directory
21922+
21923+ whose default flow (that which is referred to if the directory is
21924+ read as a file) is the traditional unix file body.
21925+
21926+ whose directory plugin is the 'filedir'
21927+
21928+ whose major packing locality is that of the parent of the object created.
21929+
21930+ The static_stat item is a particular commonly used directory
21931+ compression (the one for normal unix files).
21932+
21933+ The filedir plugin checks to see if the static_stat item exists.
21934+ There is a unique key for static_stat. If yes, then it uses the
21935+ static_stat item for all of the values that it contains. The
21936+ static_stat item contains a flag for each stat it contains which
21937+ indicates whether one should look outside the static_stat item for its
21938+ contents.
21939+*/
21940+
21941+/* offset of fields in reiser4_key. The value of each element of this enum
21942+ is the index within the key (thought of as an array of __u64's) where
21943+ this field is stored. */
21944+typedef enum {
21945+ /* major "locale", aka dirid. Sits in 1st element */
21946+ KEY_LOCALITY_INDEX = 0,
21947+ /* minor "locale", aka item type. Sits in 1st element */
21948+ KEY_TYPE_INDEX = 0,
21949+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
21950+ /* "object band". Sits in 2nd element */
21951+ KEY_BAND_INDEX,
21952+ /* objectid. Sits in 2nd element */
21953+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
21954+ /* full objectid. Sits in 2nd element */
21955+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
21956+ /* Offset. Sits in 3rd element */
21957+ KEY_OFFSET_INDEX,
21958+ /* Name hash. Sits in 3rd element */
21959+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
21960+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
21961+ KEY_LAST_INDEX
21962+} reiser4_key_field_index;
21963+
21964+/* key in reiser4 internal "balanced" tree. It is just an array of three
21965+ (four, with REISER4_LARGE_KEY) 64bit integers in disk byte order
21966+ (little-endian by default). This array is actually indexed by
21967+ reiser4_key_field. Each __u64 within this array is called an "element".
21968+ Logical key components encoded within elements are called "fields".
21969+
21970+ We declare this as a union with a dummy second component to suppress
21971+ inconvenient array<->pointer casts implied in C. */
21972+union reiser4_key {
21973+ __le64 el[KEY_LAST_INDEX];
21974+ int pad;
21975+};
21976+
21977+/* bitmasks showing where within reiser4_key particular key is stored. */
21978+/* major locality occupies higher 60 bits of the first element */
21979+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
21980+
21981+/* minor locality occupies lower 4 bits of the first element */
21982+#define KEY_TYPE_MASK 0xfull
21983+
21984+/* controversial band occupies higher 4 bits of the 2nd element */
21985+#define KEY_BAND_MASK 0xf000000000000000ull
21986+
21987+/* objectid occupies lower 60 bits of the 2nd element */
21988+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
21989+
21990+/* full 64bit objectid*/
21991+#define KEY_FULLOID_MASK 0xffffffffffffffffull
21992+
21993+/* offset is just the 3rd element itself */
21994+#define KEY_OFFSET_MASK 0xffffffffffffffffull
21995+
21996+/* ordering is whole second element */
21997+#define KEY_ORDERING_MASK 0xffffffffffffffffull
21998+
21999+/* how many bits a key element should be shifted to the left to get a particular field */
22000+typedef enum {
22001+ KEY_LOCALITY_SHIFT = 4,
22002+ KEY_TYPE_SHIFT = 0,
22003+ KEY_BAND_SHIFT = 60,
22004+ KEY_OBJECTID_SHIFT = 0,
22005+ KEY_FULLOID_SHIFT = 0,
22006+ KEY_OFFSET_SHIFT = 0,
22007+ KEY_ORDERING_SHIFT = 0,
22008+} reiser4_key_field_shift;
22009+
22010+static inline __u64
22011+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22012+{
22013+ assert("nikita-753", key != NULL);
22014+ assert("nikita-754", off < KEY_LAST_INDEX);
22015+ return le64_to_cpu(get_unaligned(&key->el[off]));
22016+}
22017+
22018+static inline void
22019+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22020+{
22021+ assert("nikita-755", key != NULL);
22022+ assert("nikita-756", off < KEY_LAST_INDEX);
22023+ put_unaligned(cpu_to_le64(value), &key->el[off]);
22024+}
22025+
22026+/* macro to define getter and setter functions for field L (with uppercase name U) of type T */
22027+#define DEFINE_KEY_FIELD( L, U, T ) \
22028+static inline T get_key_ ## L ( const reiser4_key *key ) \
22029+{ \
22030+ assert( "nikita-750", key != NULL ); \
22031+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22032+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22033+} \
22034+ \
22035+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22036+{ \
22037+ __u64 el; \
22038+ \
22039+ assert( "nikita-752", key != NULL ); \
22040+ \
22041+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22042+ /* clear field bits in the key */ \
22043+ el &= ~KEY_ ## U ## _MASK; \
22044+ /* actually it should be \
22045+ \
22046+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22047+ \
22048+ but we trust user to never pass values that wouldn't fit \
22049+ into field. Clearing extra bits is one operation, but this \
22050+ function is time-critical. \
22051+ But check this in assertion. */ \
22052+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22053+ ~KEY_ ## U ## _MASK ) == 0 ); \
22054+ el |= ( loc << KEY_ ## U ## _SHIFT ); \
22055+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22056+}
22057+
22058+typedef __u64 oid_t;
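+
+/* Editorial note: each DEFINE_KEY_FIELD() instantiation below expands to
+ an accessor pair, e.g. DEFINE_KEY_FIELD(band, BAND, __u64) yields
+
+ static inline __u64 get_key_band(const reiser4_key *key);
+ static inline void set_key_band(reiser4_key *key, __u64 loc);
+
+ where the getter masks the element with KEY_BAND_MASK and shifts right
+ by KEY_BAND_SHIFT, and the setter does the inverse. */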
22059+
22060+/* define get_key_locality(), set_key_locality() */
22061+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22062+/* define get_key_type(), set_key_type() */
22063+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22064+/* define get_key_band(), set_key_band() */
22065+DEFINE_KEY_FIELD(band, BAND, __u64);
22066+/* define get_key_objectid(), set_key_objectid() */
22067+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22068+/* define get_key_fulloid(), set_key_fulloid() */
22069+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22070+/* define get_key_offset(), set_key_offset() */
22071+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22072+#if (REISER4_LARGE_KEY)
22073+/* define get_key_ordering(), set_key_ordering() */
22074+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22075+#else
22076+static inline __u64 get_key_ordering(const reiser4_key * key)
22077+{
22078+ return 0;
22079+}
22080+
22081+static inline void set_key_ordering(reiser4_key * key, __u64 val)
22082+{
22083+}
22084+#endif
22085+
22086+/* key comparison result */
22087+typedef enum { LESS_THAN = -1, /* if first key is less than second */
22088+ EQUAL_TO = 0, /* if keys are equal */
22089+ GREATER_THAN = +1 /* if first key is greater than second */
22090+} cmp_t;
22091+
22092+void reiser4_key_init(reiser4_key * key);
22093+
22094+/* minimal possible key in the tree. Return pointer to the static storage. */
22095+extern const reiser4_key *reiser4_min_key(void);
22096+extern const reiser4_key *reiser4_max_key(void);
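+
+/* A hypothetical usage sketch (editorial, not part of the original
+ header): the generated accessors compose as follows when filling in a
+ stat-data key by hand. The function and parameter names here are
+ illustrative. */
+static inline void example_fill_sd_key(reiser4_key * key,
+ oid_t dirid, oid_t oid)
+{
+ reiser4_key_init(key);
+ set_key_locality(key, dirid);
+ set_key_type(key, KEY_SD_MINOR);
+ set_key_objectid(key, oid);
+ set_key_offset(key, 0ull);
+}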
22097+
22098+/* helper macro for keycmp() */
22099+#define KEY_DIFF(k1, k2, field) \
22100+({ \
22101+ typeof (get_key_ ## field (k1)) f1; \
22102+ typeof (get_key_ ## field (k2)) f2; \
22103+ \
22104+ f1 = get_key_ ## field (k1); \
22105+ f2 = get_key_ ## field (k2); \
22106+ \
22107+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22108+})
22109+
22110+/* helper macro for keycmp() */
22111+#define KEY_DIFF_EL(k1, k2, off) \
22112+({ \
22113+ __u64 e1; \
22114+ __u64 e2; \
22115+ \
22116+ e1 = get_key_el(k1, off); \
22117+ e2 = get_key_el(k2, off); \
22118+ \
22119+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22120+})
22121+
22122+/* compare `k1' and `k2'. This function is a heart of "key allocation
22123+ policy". All you need to implement new policy is to add yet another
22124+ clause here. */
22125+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22126+ const reiser4_key * k2 /* second key to compare */ )
22127+{
22128+ cmp_t result;
22129+
22130+ /*
22131+ * This function is the heart of reiser4 tree-routines. Key comparison
22132+ * is among most heavily used operations in the file system.
22133+ */
22134+
22135+ assert("nikita-439", k1 != NULL);
22136+ assert("nikita-440", k2 != NULL);
22137+
22138+ /* there is no actual branch here: condition is compile time constant
22139+ * and constant folding and propagation ensures that only one branch
22140+ * is actually compiled in. */
22141+
22142+ if (REISER4_PLANA_KEY_ALLOCATION) {
22143+ /* if physical order of fields in a key is identical
22144+ with logical order, we can implement key comparison
22145+ as three 64bit comparisons. */
22146+ /* logical order of fields in plan-a:
22147+ locality->type->objectid->offset. */
22148+ /* compare locality and type at once */
22149+ result = KEY_DIFF_EL(k1, k2, 0);
22150+ if (result == EQUAL_TO) {
22151+ /* compare objectid (and band if it's there) */
22152+ result = KEY_DIFF_EL(k1, k2, 1);
22153+ /* compare offset */
22154+ if (result == EQUAL_TO) {
22155+ result = KEY_DIFF_EL(k1, k2, 2);
22156+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22157+ result = KEY_DIFF_EL(k1, k2, 3);
22158+ }
22159+ }
22160+ }
22161+ } else if (REISER4_3_5_KEY_ALLOCATION) {
22162+ result = KEY_DIFF(k1, k2, locality);
22163+ if (result == EQUAL_TO) {
22164+ result = KEY_DIFF(k1, k2, objectid);
22165+ if (result == EQUAL_TO) {
22166+ result = KEY_DIFF(k1, k2, type);
22167+ if (result == EQUAL_TO)
22168+ result = KEY_DIFF(k1, k2, offset);
22169+ }
22170+ }
22171+ } else
22172+ impossible("nikita-441", "Unknown key allocation scheme!");
22173+ return result;
22174+}
22175+
22176+/* true if @k1 equals @k2 */
22177+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22178+ const reiser4_key * k2 /* second key to compare */ )
22179+{
22180+ assert("nikita-1879", k1 != NULL);
22181+ assert("nikita-1880", k2 != NULL);
22182+ return !memcmp(k1, k2, sizeof *k1);
22183+}
22184+
22185+/* true if @k1 is less than @k2 */
22186+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22187+ const reiser4_key * k2 /* second key to compare */ )
22188+{
22189+ assert("nikita-1952", k1 != NULL);
22190+ assert("nikita-1953", k2 != NULL);
22191+ return keycmp(k1, k2) == LESS_THAN;
22192+}
22193+
22194+/* true if @k1 is less than or equal to @k2 */
22195+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22196+ const reiser4_key * k2 /* second key to compare */ )
22197+{
22198+ assert("nikita-1954", k1 != NULL);
22199+ assert("nikita-1955", k2 != NULL);
22200+ return keycmp(k1, k2) != GREATER_THAN;
22201+}
22202+
22203+/* true if @k1 is greater than @k2 */
22204+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22205+ const reiser4_key * k2 /* second key to compare */ )
22206+{
22207+ assert("nikita-1959", k1 != NULL);
22208+ assert("nikita-1960", k2 != NULL);
22209+ return keycmp(k1, k2) == GREATER_THAN;
22210+}
22211+
22212+/* true if @k1 is greater than or equal to @k2 */
22213+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22214+ const reiser4_key * k2 /* second key to compare */ )
22215+{
22216+ assert("nikita-1956", k1 != NULL);
22217+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22218+ * November 3: Laika */
22219+ return keycmp(k1, k2) != LESS_THAN;
22220+}
22221+
22222+static inline void prefetchkey(reiser4_key * key)
22223+{
22224+ prefetch(key);
22225+ prefetch(&key->el[KEY_CACHELINE_END]);
22226+}
22227+
22228+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22229+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22230+/* size of a buffer suitable to hold human readable key representation */
22231+#define KEY_BUF_LEN (80)
22232+
22233+#if REISER4_DEBUG
22234+extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22235+#else
22236+#define reiser4_print_key(p,k) noop
22237+#endif
22238+
22239+/* __REISER4_KEY_H__ */
22240+#endif
22241+
22242+/* Make Linus happy.
22243+ Local variables:
22244+ c-indentation-style: "K&R"
22245+ mode-name: "LC"
22246+ c-basic-offset: 8
22247+ tab-width: 8
22248+ fill-column: 120
22249+ End:
22250+*/
22251diff -urN linux-2.6.20.orig/fs/reiser4/ktxnmgrd.c linux-2.6.20/fs/reiser4/ktxnmgrd.c
22252--- linux-2.6.20.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22253+++ linux-2.6.20/fs/reiser4/ktxnmgrd.c 2007-05-06 14:50:43.738988223 +0400
22254@@ -0,0 +1,215 @@
22255+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22256+/* Transaction manager daemon. */
22257+
22258+/*
22259+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22260+ * needed/important for the following reasons:
22261+ *
22262+ * 1. in reiser4 an atom is not committed immediately when the last
22263+ * transaction handle closes, unless the atom is either too old or too large
22264+ * (see atom_should_commit()). This is done to avoid committing too
22265+ * frequently.
22266+ *
22267+ * 2. sometimes we don't want to commit an atom when closing the last
22268+ * transaction handle, even if it is old and fat enough. For example, because
22269+ * we are at this point under a directory semaphore, and committing would
22270+ * stall all accesses to this directory.
22271+ *
22272+ * ktxnmgrd spends its time sleeping on a condition variable. When it awakes,
22273+ * either due to a (tunable) timeout or because it was explicitly woken up
22274+ * by a call to ktxnmgrd_kick(), it scans the list of all atoms and commits
22275+ * the eligible ones.
22276+ *
22277+ */
22278+
22279+#include "debug.h"
22280+#include "txnmgr.h"
22281+#include "tree.h"
22282+#include "ktxnmgrd.h"
22283+#include "super.h"
22284+#include "reiser4.h"
22285+
22286+#include <linux/sched.h> /* for struct task_struct */
22287+#include <linux/wait.h>
22288+#include <linux/suspend.h>
22289+#include <linux/kernel.h>
22290+#include <linux/writeback.h>
22291+#include <linux/kthread.h>
22292+#include <linux/freezer.h>
22293+
22294+static int scan_mgr(struct super_block *);
22295+
22296+/*
22297+ * change current->comm so that ps, top, and friends will see the changed
22298+ * state. This serves no useful purpose whatsoever, but also costs nothing.
22299+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22300+ */
22301+#define set_comm( state ) \
22302+ snprintf( current -> comm, sizeof( current -> comm ), \
22303+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22304+
22305+/**
22306+ * ktxnmgrd - kernel txnmgr daemon
22307+ * @arg: pointer to super block
22308+ *
22309+ * The background transaction manager daemon, started as a kernel thread during
22310+ * reiser4 initialization.
22311+ */
22312+static int ktxnmgrd(void *arg)
22313+{
22314+ struct super_block *super;
22315+ ktxnmgrd_context *ctx;
22316+ txn_mgr *mgr;
22317+ int done = 0;
22318+
22319+ super = arg;
22320+ mgr = &get_super_private(super)->tmgr;
22321+
22322+ /*
22323+ * do_fork() just copies task_struct into the new thread. ->fs_context
22324+ * shouldn't be copied of course. This shouldn't be a problem for the
22325+ * rest of the code though.
22326+ */
22327+ current->journal_info = NULL;
22328+ ctx = mgr->daemon;
22329+ while (1) {
22330+ try_to_freeze();
22331+ set_comm("wait");
22332+ {
22333+ DEFINE_WAIT(__wait);
22334+
22335+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22336+ if (kthread_should_stop()) {
22337+ done = 1;
22338+ } else
22339+ schedule_timeout(ctx->timeout);
22340+ finish_wait(&ctx->wait, &__wait);
22341+ }
22342+ if (done)
22343+ break;
22344+ set_comm("run");
22345+ spin_lock(&ctx->guard);
22346+ /*
22347+ * wait timed out or ktxnmgrd was woken up by explicit request
22348+ * to commit something. Scan list of atoms in txnmgr and look
22349+ * for too old atoms.
22350+ */
22351+ do {
22352+ ctx->rescan = 0;
+ spin_unlock(&ctx->guard);
22353+ scan_mgr(super);
22354+ spin_lock(&ctx->guard);
22355+ if (ctx->rescan) {
22356+ /*
22357+ * the list could be modified while the ctx
22358+ * spinlock was released, we have to repeat
22359+ * scanning from the beginning
22360+ */
22361+ continue;
22362+ }
22363+ } while (ctx->rescan);
22364+ spin_unlock(&ctx->guard);
22365+ }
22366+ return 0;
22367+}
22368+
22369+#undef set_comm
22370+
22371+/**
22372+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22373+ * @super: pointer to super block
22374+ *
22375+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22376+ * manager. Starts kernel txnmgr daemon. This is called on mount.
22377+ */
22378+int reiser4_init_ktxnmgrd(struct super_block *super)
22379+{
22380+ txn_mgr *mgr;
22381+ ktxnmgrd_context *ctx;
22382+
22383+ mgr = &get_super_private(super)->tmgr;
22384+
22385+ assert("zam-1014", mgr->daemon == NULL);
22386+
22387+ ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22388+ if (ctx == NULL)
22389+ return RETERR(-ENOMEM);
22390+
22391+ assert("nikita-2442", ctx != NULL);
22392+
22393+ memset(ctx, 0, sizeof *ctx);
22394+ init_waitqueue_head(&ctx->wait);
22395+
22396+ /*kcond_init(&ctx->startup);*/
22397+ spin_lock_init(&ctx->guard);
22398+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22399+ ctx->rescan = 1;
22400+ mgr->daemon = ctx;
22401+
22402+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22403+ if (IS_ERR(ctx->tsk)) {
22404+ int ret = PTR_ERR(ctx->tsk);
22405+ mgr->daemon = NULL;
22406+ kfree(ctx);
22407+ return RETERR(ret);
22408+ }
22409+ return 0;
22410+}
22411+
22412+void ktxnmgrd_kick(txn_mgr *mgr)
22413+{
22414+ assert("nikita-3234", mgr != NULL);
22415+ assert("nikita-3235", mgr->daemon != NULL);
22416+ wake_up(&mgr->daemon->wait);
22417+}
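+
+/* A minimal usage sketch (editorial illustration): txnmgr code that has
+ just made an atom eligible for commit can nudge the daemon instead of
+ committing synchronously, assuming it holds a pointer to the
+ transaction manager:
+
+ if (mgr->daemon != NULL)
+ ktxnmgrd_kick(mgr);
+*/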
22418+
22419+int is_current_ktxnmgrd(void)
22420+{
22421+ return (get_current_super_private()->tmgr.daemon->tsk == current);
22422+}
22423+
22424+/**
22425+ * scan_mgr - commit atoms which are to be committed
22426+ * @super: super block to commit atoms of
22427+ *
22428+ * Commits old atoms.
22429+ */
22430+static int scan_mgr(struct super_block *super)
22431+{
22432+ int ret;
22433+ reiser4_context ctx;
22434+
22435+ init_stack_context(&ctx, super);
22436+
22437+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
22438+
22439+ reiser4_exit_context(&ctx);
22440+ return ret;
22441+}
22442+
22443+/**
22444+ * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22445+ * @super: super block whose transaction manager daemon is to be stopped
22446+ *
22447+ * This is called on umount. Stops ktxnmgrd and frees its context.
22448+ */
22449+void reiser4_done_ktxnmgrd(struct super_block *super)
22450+{
22451+ txn_mgr *mgr;
22452+
22453+ mgr = &get_super_private(super)->tmgr;
22454+ assert("zam-1012", mgr->daemon != NULL);
22455+
22456+ kthread_stop(mgr->daemon->tsk);
22457+ kfree(mgr->daemon);
22458+ mgr->daemon = NULL;
22459+}
22460+
22461+/*
22462+ * Local variables:
22463+ * c-indentation-style: "K&R"
22464+ * mode-name: "LC"
22465+ * c-basic-offset: 8
22466+ * tab-width: 8
22467+ * fill-column: 120
22468+ * End:
22469+ */
22470diff -urN linux-2.6.20.orig/fs/reiser4/ktxnmgrd.h linux-2.6.20/fs/reiser4/ktxnmgrd.h
22471--- linux-2.6.20.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22472+++ linux-2.6.20/fs/reiser4/ktxnmgrd.h 2007-05-06 14:50:43.738988223 +0400
22473@@ -0,0 +1,52 @@
22474+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22475+ * reiser4/README */
22476+
22477+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22478+
22479+#ifndef __KTXNMGRD_H__
22480+#define __KTXNMGRD_H__
22481+
22482+#include "txnmgr.h"
22483+
22484+#include <linux/fs.h>
22485+#include <linux/wait.h>
22486+#include <linux/completion.h>
22487+#include <linux/spinlock.h>
22488+#include <asm/atomic.h>
22489+#include <linux/sched.h> /* for struct task_struct */
22490+
22491+/* in this structure all data necessary to start up, shut down and communicate
22492+ * with ktxnmgrd are kept. */
22493+struct ktxnmgrd_context {
22494+ /* wait queue head on which ktxnmgrd sleeps */
22495+ wait_queue_head_t wait;
22496+ /* spin lock protecting all fields of this structure */
22497+ spinlock_t guard;
22498+ /* timeout of sleeping on ->wait */
22499+ signed long timeout;
22500+ /* kernel thread running ktxnmgrd */
22501+ struct task_struct *tsk;
22502+ /* list of all file systems served by this ktxnmgrd */
22503+ struct list_head queue;
22504+ /* should ktxnmgrd repeat scanning of atoms? */
22505+ unsigned int rescan:1;
22506+};
22507+
22508+extern int reiser4_init_ktxnmgrd(struct super_block *);
22509+extern void reiser4_done_ktxnmgrd(struct super_block *);
22510+
22511+extern void ktxnmgrd_kick(txn_mgr * mgr);
22512+extern int is_current_ktxnmgrd(void);
22513+
22514+/* __KTXNMGRD_H__ */
22515+#endif
22516+
22517+/* Make Linus happy.
22518+ Local variables:
22519+ c-indentation-style: "K&R"
22520+ mode-name: "LC"
22521+ c-basic-offset: 8
22522+ tab-width: 8
22523+ fill-column: 120
22524+ End:
22525+*/
22526diff -urN linux-2.6.20.orig/fs/reiser4/lock.c linux-2.6.20/fs/reiser4/lock.c
22527--- linux-2.6.20.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22528+++ linux-2.6.20/fs/reiser4/lock.c 2007-05-06 14:50:43.742989473 +0400
22529@@ -0,0 +1,1232 @@
22530+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22531+ * reiser4/README */
22532+
22533+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22534+ order. V4 balances the tree from the bottom up, and searches the tree from
22535+ the top down, and that is really the way we want it, so tradition won't work
22536+ for us.
22537+
22538+ Instead we have two lock orderings, a high priority lock ordering, and a low
22539+ priority lock ordering. Each node in the tree has a lock in its znode.
22540+
22541+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22542+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
22543+ process may have a pending lock request to a node locked by another process.
22544+ Note: we lock and unlock, but do not transfer locks: it is possible
22545+ transferring locks instead would save some bus locking....
22546+
22547+ Deadlock occurs when we have a loop constructed from process locked sets and
22548+ lock request vectors.
22549+
22550+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22551+ memory is extended with "znodes" with which we connect nodes with their left
22552+ and right neighbors using sibling pointers stored in the znodes. When we
22553+ perform balancing operations we often go from left to right and from right to
22554+ left.
22555+
22556+ +-P1-+ +-P3-+
22557+ |+--+| V1 |+--+|
22558+ ||N1|| -------> ||N3||
22559+ |+--+| |+--+|
22560+ +----+ +----+
22561+ ^ |
22562+ |V2 |V3
22563+ | v
22564+ +---------P2---------+
22565+ |+--+ +--+|
22566+ ||N2| -------- |N4||
22567+ |+--+ +--+|
22568+ +--------------------+
22569+
22570+ We solve this by ensuring that only low priority processes lock in top to
22571+ bottom order and from right to left, and high priority processes lock from
22572+ bottom to top and left to right.
22573+
22574+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22575+ kill those damn busy loops.
22576+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22577+ stage) cannot be ordered that way. There are no rules what nodes can belong
22578+ to the atom and what nodes cannot. We cannot define what is right or left
22579+ direction, what is top or bottom. We can take immediate parent or side
22580+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
22581+ not a far right neighbor for other nodes from the same atom. It breaks
22582+ deadlock avoidance rules and hi-low priority locking cannot be applied for
22583+ atom locks.
22584+
22585+ How does it help to avoid deadlocks?
22586+
22587+ Suppose we have a deadlock with n processes. Processes from one priority
22588+ class never deadlock because they take locks in one consistent
22589+ order.
22590+
22591+ So, any possible deadlock loop must have low priority as well as high
22592+ priority processes. There are no other lock priority levels except low and
22593+ high. We know that any deadlock loop contains at least one node locked by a
22594+ low priority process and requested by a high priority process. If this
22595+ situation is caught and resolved it is sufficient to avoid deadlocks.
22596+
22597+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22598+
22599+ The deadlock prevention algorithm is based on comparing
22600+ priorities of node owners (processes which keep znode locked) and
22601+ requesters (processes which want to acquire a lock on znode). We
22602+ implement a scheme where low-priority owners yield locks to
22603+ high-priority requesters. We created a signal passing system that
22604+ is used to ask low-priority processes to yield one or more locked
22605+ znodes.
22606+
22607+ The condition when a znode needs to change its owners is described by the
22608+ following formula:
22609+
22610+ #############################################
22611+ # #
22612+ # (number of high-priority requesters) > 0 #
22613+ # AND #
22614+ # (numbers of high-priority owners) == 0 #
22615+ # #
22616+ #############################################
22617+
22618+ Note that a low-priority process delays node releasing if another
22619+ high-priority process owns this node. So, slightly more strictly speaking,
22620+ to have a deadlock capable cycle you must have a loop in which a high
22621+ priority process is waiting on a low priority process to yield a node, which
22622+ is slightly different from saying a high priority process is waiting on a
22623+ node owned by a low priority process.
22624+
22625+ It is enough to avoid deadlocks if we prevent any low-priority process from
22626+ falling asleep if its locked set contains a node which satisfies the
22627+ deadlock condition.
22628+
22629+ That condition is implicitly or explicitly checked in all places where new
22630+ high-priority requests may be added or removed from node request queue or
22631+ a high-priority process takes or releases a lock on a node. The main
22632+ goal of these checks is to never miss the moment when a node comes to
22633+ have "wrong owners", and to send "must-yield-this-lock" signals to its
22634+ low-pri owners at that time.
22635+
22636+ The information about received signals is stored in the per-process
22637+ structure (lock stack) and analyzed before a low-priority process goes to
22638+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22639+ sleeping process up and forces him to re-check lock status and received
22640+ signal info. If "must-yield-this-lock" signals were received the locking
22641+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22642+
22643+ V4 LOCKING DRAWBACKS
22644+
22645+ If we have already balanced on one level, and we are propagating our changes
22646+ upward to a higher level, it could be very messy to surrender all locks on
22647+ the lower level because we put so much computational work into it, and
22648+ reverting them to their state before they were locked might be very complex.
22649+ We also don't want to acquire all locks before performing balancing because
22650+ that would either be almost as much work as the balancing, or it would be
22651+ too conservative and lock too much. We want balancing to be done only at
22652+ high priority. Yet, we might want to go to the left one node and use some
22653+ of its empty space... So we make one attempt at getting the node to the left
22654+ using try_lock, and if it fails we do without it, because we didn't really
22655+ need it, it was only a nice to have.
22656+
22657+ LOCK STRUCTURES DESCRIPTION
22658+
22659+ The following data structures are used in the reiser4 locking
22660+ implementation:
22661+
22662+ All fields related to long-term locking are stored in znode->lock.
22663+
22664+ The lock stack is a per thread object. It owns all znodes locked by the
22665+ thread. One znode may be locked by several threads in case of read lock or
22666+ one znode may be write locked by one thread several times. The special link
22667+ objects (lock handles) support n<->m relation between znodes and lock
22668+ owners.
22669+
22670+ <Thread 1> <Thread 2>
22671+
22672+ +---------+ +---------+
22673+ | LS1 | | LS2 |
22674+ +---------+ +---------+
22675+ ^ ^
22676+ |---------------+ +----------+
22677+ v v v v
22678+ +---------+ +---------+ +---------+ +---------+
22679+ | LH1 | | LH2 | | LH3 | | LH4 |
22680+ +---------+ +---------+ +---------+ +---------+
22681+ ^ ^ ^ ^
22682+ | +------------+ |
22683+ v v v
22684+ +---------+ +---------+ +---------+
22685+ | Z1 | | Z2 | | Z3 |
22686+ +---------+ +---------+ +---------+
22687+
22688+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22689+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22690+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
22691+ Z1 is locked by only one thread, so the znode has only one lock handle LH1
22692+ on its list; the situation is similar for Z3, which is locked by thread 2
22693+ only. Z2 is locked (for read) twice by different threads, and two lock
22694+ handles are on its list. Each lock handle represents a single locking
22695+ relation between a znode and a thread. Locking a znode establishes such a
22696+ relation between the lock stack and the znode by adding a new lock handle
22697+ to the lock stack's list of lock handles. The lock stack links all lock
22698+ handles for all znodes locked by the lock stack. The znode list groups all
22699+ lock handles for all lock stacks which locked the znode.
22700+
22701+ Yet another relation may exist between a znode and lock owners. If the
22702+ lock procedure cannot immediately take a lock on an object, it adds the
22703+ lock owner to the special `requestors' list belonging to the znode. That
22704+ list represents a queue of pending lock requests. Because one lock owner
22705+ may request only one lock object at a time, it is a 1->n relation between
22706+ lock objects and lock owners, implemented as described above. Full
22707+ information (priority, pointers to lock and link objects) about each lock
22708+ request is stored in the lock owner structure, in the `request' field.
22709+
22710+ SHORT_TERM LOCKING
22711+
22712+ This is a list of primitive operations over lock stacks / lock handles /
22713+ znodes and locking descriptions for them.
22714+
22715+ 1. locking / unlocking, which is done by two list insertions/deletions: one
22716+ to/from znode's list of lock handles, another one is to/from lock stack's
22717+ list of lock handles. The first insertion is protected by
22718+ znode->lock.guard spinlock. The list owned by the lock stack can be
22719+ modified only by thread who owns the lock stack and nobody else can
22720+ modify/read it. There is nothing to be protected by a spinlock or
22721+ something else.
22722+
22723+ 2. adding/removing a lock request to/from znode requesters list. The rule is
22724+ that znode->lock.guard spinlock should be taken for this.
22725+
22726+ 3. we can traverse list of lock handles and use references to lock stacks who
22727+ locked given znode if znode->lock.guard spinlock is taken.
22728+
22729+ 4. If a lock stack is associated with a znode as a lock requestor or lock
22730+ owner, its existence is guaranteed by the znode->lock.guard spinlock. Some
22731+ of its (the lock stack's) fields should be protected from being accessed
22732+ in parallel by two or more threads. Please look at the lock_stack structure
22733+ definition for info on how those fields are protected. */
22734+
22735+/* Znode lock and capturing intertwining. */
22736+/* In the current implementation we capture formatted nodes before locking
22737+ them. Take a look at longterm_lock_znode(): the reiser4_try_capture()
22738+ request precedes the locking request. The longterm_lock_znode function
22739+ unconditionally captures the znode before even checking the locking conditions.
22740+
22741+ Another variant is to capture znode after locking it. It was not tested, but
22742+ at least one deadlock condition is supposed to be there. One thread has
22743+ locked a znode (Node-1) and calls reiser4_try_capture() for it.
22744+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22745+ Second thread is a flushing thread, its current atom is the atom Node-1
22746+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22747+ is locked by the first thread. The described situation is a deadlock. */
22748+
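+/* A minimal caller-side sketch (editorial illustration, assuming the
+ lock handle helpers init_lh()/done_lh() declared in lock.h): a
+ low-priority caller must be prepared for its request to fail with
+ -E_REPEAT or -E_DEADLOCK and to restart from scratch.
+
+ lock_handle lh;
+ int ret;
+
+ init_lh(&lh);
+ ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
+ ZNODE_LOCK_LOPRI);
+ if (ret == 0) {
+ ... use the node ...
+ done_lh(&lh);
+ } else {
+ ... release other locks and retry ...
+ }
+*/
+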
22749+#include "debug.h"
22750+#include "txnmgr.h"
22751+#include "znode.h"
22752+#include "jnode.h"
22753+#include "tree.h"
22754+#include "plugin/node/node.h"
22755+#include "super.h"
22756+
22757+#include <linux/spinlock.h>
22758+
22759+#if REISER4_DEBUG
22760+static int request_is_deadlock_safe(znode *, znode_lock_mode,
22761+ znode_lock_request);
22762+#endif
22763+
22764+/* Returns a lock owner associated with current thread */
22765+lock_stack *get_current_lock_stack(void)
22766+{
22767+ return &get_current_context()->stack;
22768+}
22769+
22770+/* Wakes up all low priority owners informing them about possible deadlock */
22771+static void wake_up_all_lopri_owners(znode * node)
22772+{
22773+ lock_handle *handle;
22774+
22775+ assert_spin_locked(&(node->lock.guard));
22776+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
22777+ assert("nikita-1832", handle->node == node);
22778+ /* count this signal in owner->nr_signaled */
22779+ if (!handle->signaled) {
22780+ handle->signaled = 1;
22781+ atomic_inc(&handle->owner->nr_signaled);
22782+ /* Wake up a single process */
22783+ reiser4_wake_up(handle->owner);
22784+ }
22785+ }
22786+}
22787+
22788+/* Adds a lock to a lock owner, which means creating a link to the lock and
22789+ putting the link into the two lists all links are on (the doubly linked list
22790+ that forms the lock_stack, and the doubly linked list of links attached
22791+ to a lock).
22792+*/
22793+static inline void
22794+link_object(lock_handle * handle, lock_stack * owner, znode * node)
22795+{
22796+ assert("jmacd-810", handle->owner == NULL);
22797+ assert_spin_locked(&(node->lock.guard));
22798+
22799+ handle->owner = owner;
22800+ handle->node = node;
22801+
22802+ assert("reiser4-4",
22803+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22804+
22805+ /* add lock handle to the end of lock_stack's list of locks */
22806+ list_add_tail(&handle->locks_link, &owner->locks);
22807+ ON_DEBUG(owner->nr_locks++);
22808+ reiser4_ctx_gfp_mask_set();
22809+
22810+ /* add lock handle to the head of znode's list of owners */
22811+ list_add(&handle->owners_link, &node->lock.owners);
22812+ handle->signaled = 0;
22813+}
22814+
22815+/* Breaks a relation between a lock and its owner */
22816+static inline void unlink_object(lock_handle * handle)
22817+{
22818+ assert("zam-354", handle->owner != NULL);
22819+ assert("nikita-1608", handle->node != NULL);
22820+ assert_spin_locked(&(handle->node->lock.guard));
22821+ assert("nikita-1829", handle->owner == get_current_lock_stack());
22822+ assert("reiser4-5", handle->owner->nr_locks > 0);
22823+
22824+ /* remove lock handle from lock_stack's list of locks */
22825+ list_del(&handle->locks_link);
22826+ ON_DEBUG(handle->owner->nr_locks--);
22827+ reiser4_ctx_gfp_mask_set();
22828+ assert("reiser4-6",
22829+ ergo(list_empty_careful(&handle->owner->locks),
22830+ handle->owner->nr_locks == 0));
22831+ /* remove lock handle from znode's list of owners */
22832+ list_del(&handle->owners_link);
22833+ /* indicates that lock handle is free now */
22834+ handle->node = NULL;
22835+#if REISER4_DEBUG
22836+ INIT_LIST_HEAD(&handle->locks_link);
22837+ INIT_LIST_HEAD(&handle->owners_link);
22838+ handle->owner = NULL;
22839+#endif
22840+}
22841+
22842+/* Actually locks an object knowing that we are able to do this */
22843+static void lock_object(lock_stack * owner)
22844+{
22845+ lock_request *request;
22846+ znode *node;
22847+
22848+ request = &owner->request;
22849+ node = request->node;
22850+ assert_spin_locked(&(node->lock.guard));
22851+ if (request->mode == ZNODE_READ_LOCK) {
22852+ node->lock.nr_readers++;
22853+ } else {
22854+ /* check that we didn't switch from a read to a write lock */
22855+ assert("nikita-1840", node->lock.nr_readers <= 0);
22856+ /* We allow recursive locking; a node can be locked several
22857+ times for write by the same process */
22858+ node->lock.nr_readers--;
22859+ }
22860+
22861+ link_object(request->handle, owner, node);
22862+
22863+ if (owner->curpri) {
22864+ node->lock.nr_hipri_owners++;
22865+ }
22866+}
22867+
22868+/* Check for recursive write locking */
22869+static int recursive(lock_stack * owner)
22870+{
22871+ int ret;
22872+ znode *node;
22873+ lock_handle *lh;
22874+
22875+ node = owner->request.node;
22876+
22877+ /* Owners list is not empty for a locked node */
22878+ assert("zam-314", !list_empty_careful(&node->lock.owners));
22879+ assert("nikita-1841", owner == get_current_lock_stack());
22880+ assert_spin_locked(&(node->lock.guard));
22881+
22882+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22883+ ret = (lh->owner == owner);
22884+
22885+ /* Recursive read locking should be done the usual way */
22886+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22887+ /* mixing of read/write locks is not allowed */
22888+ assert("zam-341", !ret || znode_is_wlocked(node));
22889+
22890+ return ret;
22891+}
22892+
22893+#if REISER4_DEBUG
22894+/* Returns true if the lock is held by the calling thread. */
22895+int znode_is_any_locked(const znode * node)
22896+{
22897+ lock_handle *handle;
22898+ lock_stack *stack;
22899+ int ret;
22900+
22901+ if (!znode_is_locked(node)) {
22902+ return 0;
22903+ }
22904+
22905+ stack = get_current_lock_stack();
22906+
22907+ spin_lock_stack(stack);
22908+
22909+ ret = 0;
22910+
22911+ list_for_each_entry(handle, &stack->locks, locks_link) {
22912+ if (handle->node == node) {
22913+ ret = 1;
22914+ break;
22915+ }
22916+ }
22917+
22918+ spin_unlock_stack(stack);
22919+
22920+ return ret;
22921+}
22922+
22923+#endif
22924+
22925+/* Returns true if a write lock is held by the calling thread. */
22926+int znode_is_write_locked(const znode * node)
22927+{
22928+ lock_stack *stack;
22929+ lock_handle *handle;
22930+
22931+ assert("jmacd-8765", node != NULL);
22932+
22933+ if (!znode_is_wlocked(node)) {
22934+ return 0;
22935+ }
22936+
22937+ stack = get_current_lock_stack();
22938+
22939+ /*
22940+ * When znode is write locked, all owner handles point to the same lock
22941+ * stack. Get pointer to lock stack from the first lock handle from
22942+ * znode's owner list
22943+ */
22944+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
22945+
22946+ return (handle->owner == stack);
22947+}
22948+
22949+/* This "deadlock" condition is the essential part of reiser4 locking
22950+ implementation. This condition is checked explicitly by calling
22951+ check_deadlock_condition() or implicitly in all places where znode lock
22952+ state (set of owners and request queue) is changed. The locking code is
22953+ designed to use this condition to trigger the procedure of passing an
22954+ object from low priority owner(s) to high priority one(s).
22955+
22956+ The procedure results in passing an event (setting the
22957+ lock_handle->signaled flag), counting this event in the nr_signaled
22958+ field of the owner's lock stack object, and waking up the owner's process.
22959+*/
22960+static inline int check_deadlock_condition(znode * node)
22961+{
22962+ assert_spin_locked(&(node->lock.guard));
22963+ return node->lock.nr_hipri_requests > 0
22964+ && node->lock.nr_hipri_owners == 0;
22965+}
22966+
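+/* Editorial note: this is the "livelock" guard referred to below. A
+ steady stream of new readers could otherwise starve a pending
+ high-priority write request forever, so can_lock_object() refuses new
+ read locks while such a request is queued. */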
22967+static int check_livelock_condition(znode * node, znode_lock_mode mode)
22968+{
22969+ zlock * lock = &node->lock;
22970+
22971+ return mode == ZNODE_READ_LOCK &&
22972+ lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
22973+}
22974+
22975+/* checks lock/request compatibility */
22976+static int can_lock_object(lock_stack * owner)
22977+{
22978+ znode *node = owner->request.node;
22979+
22980+ assert_spin_locked(&(node->lock.guard));
22981+
22982+ /* See if the node is disconnected. */
22983+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
22984+ return RETERR(-EINVAL);
22985+
22986+ /* Do not ever try to take a lock if we are going in the low priority
22987+ direction and the node has a high priority request without high
22988+ priority owners. */
22989+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
22990+ return RETERR(-E_REPEAT);
22991+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
22992+ return RETERR(-E_REPEAT);
22993+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
22994+ return RETERR(-E_REPEAT);
22995+ return 0;
22996+}
22997+
22998+/* Sets high priority for the process. It clears the "signaled" flags
22999+ because a znode locked by a high-priority process can't satisfy our
23000+ "deadlock condition". */
23001+static void set_high_priority(lock_stack * owner)
23002+{
23003+ assert("nikita-1846", owner == get_current_lock_stack());
23004+ /* Do nothing if current priority is already high */
23005+ if (!owner->curpri) {
23006+ /* We don't need locking for owner->locks list, because, this
23007+ * function is only called with the lock stack of the current
23008+ * thread, and no other thread can play with owner->locks list
23009+ * and/or change ->node pointers of lock handles in this list.
23010+ *
23011+ * (Interrupts also are not involved.)
23012+ */
23013+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23014+ while (&owner->locks != &item->locks_link) {
23015+ znode *node = item->node;
23016+
23017+ spin_lock_zlock(&node->lock);
23018+
23019+ node->lock.nr_hipri_owners++;
23020+
23021+			/* we can safely set signaled to zero, because
23022+			   the previous statement (nr_hipri_owners++) guarantees
23023+			   that signaled will never be set again. */
23024+ item->signaled = 0;
23025+ spin_unlock_zlock(&node->lock);
23026+
23027+ item = list_entry(item->locks_link.next, lock_handle, locks_link);
23028+ }
23029+ owner->curpri = 1;
23030+ atomic_set(&owner->nr_signaled, 0);
23031+ }
23032+}
23033+
23034+/* Sets low priority for the process. */
23035+static void set_low_priority(lock_stack * owner)
23036+{
23037+ assert("nikita-3075", owner == get_current_lock_stack());
23038+ /* Do nothing if current priority is already low */
23039+ if (owner->curpri) {
23040+		/* scan all locks (lock handles) held by @owner, which is
23041+		   actually the current thread, and check whether we create a
23042+		   deadlock possibility anywhere.
23043+		 */
23044+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23045+ while (&owner->locks != &handle->locks_link) {
23046+ znode *node = handle->node;
23047+ spin_lock_zlock(&node->lock);
23048+			/* this thread was just a hipri owner of @node, so
23049+			   nr_hipri_owners has to be greater than zero. */
23050+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23051+ node->lock.nr_hipri_owners--;
23052+			/* If we have a deadlock condition, adjust the
23053+			   nr_signaled field. It is enough to set the "signaled"
23054+			   flag only for the current process; other low-pri
23055+			   owners will be signaled and woken up after the current
23056+			   process unlocks this object and a high-priority
23057+			   requestor takes control. */
23058+ if (check_deadlock_condition(node)
23059+ && !handle->signaled) {
23060+ handle->signaled = 1;
23061+ atomic_inc(&owner->nr_signaled);
23062+ }
23063+ spin_unlock_zlock(&node->lock);
23064+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23065+ }
23066+ owner->curpri = 0;
23067+ }
23068+}
23069+
23070+static void remove_lock_request(lock_stack * requestor)
23071+{
23072+ zlock * lock = &requestor->request.node->lock;
23073+
23074+ if (requestor->curpri) {
23075+ assert("nikita-1838", lock->nr_hipri_requests > 0);
23076+ lock->nr_hipri_requests--;
23077+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
23078+			lock->nr_hipri_write_requests--;
23079+ }
23080+ list_del(&requestor->requestors_link);
23081+}
23082+
23083+static void invalidate_all_lock_requests(znode * node)
23084+{
23085+ lock_stack *requestor, *tmp;
23086+
23087+ assert_spin_locked(&(node->lock.guard));
23088+
23089+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23090+ remove_lock_request(requestor);
23091+ requestor->request.ret_code = -EINVAL;
23092+ reiser4_wake_up(requestor);
23093+ requestor->request.mode = ZNODE_NO_LOCK;
23094+ }
23095+}
23096+
23097+static void dispatch_lock_requests(znode * node)
23098+{
23099+ lock_stack *requestor, *tmp;
23100+
23101+ assert_spin_locked(&(node->lock.guard));
23102+
23103+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23104+ if (znode_is_write_locked(node))
23105+ break;
23106+ if (!can_lock_object(requestor)) {
23107+ lock_object(requestor);
23108+ remove_lock_request(requestor);
23109+ requestor->request.ret_code = 0;
23110+ reiser4_wake_up(requestor);
23111+ requestor->request.mode = ZNODE_NO_LOCK;
23112+ }
23113+ }
23114+}
23115+
23116+/* release long-term lock, acquired by longterm_lock_znode() */
23117+void longterm_unlock_znode(lock_handle * handle)
23118+{
23119+ znode *node = handle->node;
23120+ lock_stack *oldowner = handle->owner;
23121+ int hipri;
23122+ int readers;
23123+ int rdelta;
23124+ int youdie;
23125+
23126+ /*
23127+ * this is time-critical and highly optimized code. Modify carefully.
23128+ */
23129+
23130+ assert("jmacd-1021", handle != NULL);
23131+ assert("jmacd-1022", handle->owner != NULL);
23132+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23133+
23134+ assert("zam-130", oldowner == get_current_lock_stack());
23135+
23136+ LOCK_CNT_DEC(long_term_locked_znode);
23137+
23138+	/*
23139+	 * To minimize the number of operations performed under the lock,
23140+	 * pre-compute all variables used within the critical section. This
23141+	 * makes the code somewhat obscure.
23142+	 */
23143+
23144+ /* was this lock of hi or lo priority */
23145+ hipri = oldowner->curpri ? 1 : 0;
23146+ /* number of readers */
23147+ readers = node->lock.nr_readers;
23148+ /* +1 if write lock, -1 if read lock */
23149+ rdelta = (readers > 0) ? -1 : +1;
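+	/* e.g. readers == 2 (two read locks) gives rdelta == -1, while
+	   readers == -1 (a single write lock) gives rdelta == +1, bringing
+	   nr_readers back to 0 on release */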
23150+ /* true if node is to die and write lock is released */
23151+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23152+
23153+ spin_lock_zlock(&node->lock);
23154+
23155+ assert("zam-101", znode_is_locked(node));
23156+
23157+ /* Adjust a number of high priority owners of this lock */
23158+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23159+ node->lock.nr_hipri_owners -= hipri;
23160+
23161+ /* Handle znode deallocation on last write-lock release. */
23162+ if (znode_is_wlocked_once(node)) {
23163+ if (youdie) {
23164+ forget_znode(handle);
23165+ assert("nikita-2191", znode_invariant(node));
23166+ zput(node);
23167+ return;
23168+ }
23169+ }
23170+
23171+ if (handle->signaled)
23172+ atomic_dec(&oldowner->nr_signaled);
23173+
23174+ /* Unlocking means owner<->object link deletion */
23175+ unlink_object(handle);
23176+
23177+	/* This is enough to determine whether the object is completely
23178+	   unlocked. */
23179+ node->lock.nr_readers += rdelta;
23180+
23181+	/* If the node is locked it must have a non-empty owners list.
23182+	   Likewise, if the node is unlocked its owners list must be empty. */
23183+ assert("zam-319", equi(znode_is_locked(node),
23184+ !list_empty_careful(&node->lock.owners)));
23185+
23186+#if REISER4_DEBUG
23187+ if (!znode_is_locked(node))
23188+ ++node->times_locked;
23189+#endif
23190+
23191+ /* If there are pending lock requests we wake up a requestor */
23192+ if (!znode_is_wlocked(node))
23193+ dispatch_lock_requests(node);
23194+ if (check_deadlock_condition(node))
23195+ wake_up_all_lopri_owners(node);
23196+ spin_unlock_zlock(&node->lock);
23197+
23198+ /* minus one reference from handle->node */
23199+ assert("nikita-2190", znode_invariant(node));
23200+ ON_DEBUG(check_lock_data());
23201+ ON_DEBUG(check_lock_node_data(node));
23202+ zput(node);
23203+}
23204+
23205+/* final portion of longterm-lock */
23206+static int
23207+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23208+{
23209+ znode *node = owner->request.node;
23210+
23211+ assert_spin_locked(&(node->lock.guard));
23212+
23213+	/* If we got here with (ok == 0) it means we can_lock; now do it. */
23214+ if (ok == 0) {
23215+ lock_object(owner);
23216+ owner->request.mode = 0;
23217+ /* count a reference from lockhandle->node
23218+
23219+ znode was already referenced at the entry to this function,
23220+ hence taking spin-lock here is not necessary (see comment
23221+ in the zref()).
23222+ */
23223+ zref(node);
23224+
23225+ LOCK_CNT_INC(long_term_locked_znode);
23226+ }
23227+ spin_unlock_zlock(&node->lock);
23228+ ON_DEBUG(check_lock_data());
23229+ ON_DEBUG(check_lock_node_data(node));
23230+ return ok;
23231+}
23232+
23233+/*
23234+ * version of longterm_lock_znode() optimized for the most common case: read
23235+ * lock without any special flags. This is the kind of lock that any tree
23236+ * traversal takes on the root node of the tree, which is very frequent.
23237+ */
23238+static int longterm_lock_tryfast(lock_stack * owner)
23239+{
23240+ int result;
23241+ znode *node;
23242+ zlock *lock;
23243+
23244+ node = owner->request.node;
23245+ lock = &node->lock;
23246+
23247+ assert("nikita-3340", reiser4_schedulable());
23248+ assert("nikita-3341", request_is_deadlock_safe(node,
23249+ ZNODE_READ_LOCK,
23250+ ZNODE_LOCK_LOPRI));
23251+ spin_lock_zlock(lock);
23252+ result = can_lock_object(owner);
23253+ spin_unlock_zlock(lock);
23254+
23255+ if (likely(result != -EINVAL)) {
23256+ spin_lock_znode(node);
23257+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23258+ spin_unlock_znode(node);
23259+ spin_lock_zlock(lock);
23260+ if (unlikely(result != 0)) {
23261+ owner->request.mode = 0;
23262+ } else {
23263+ result = can_lock_object(owner);
23264+ if (unlikely(result == -E_REPEAT)) {
23265+ /* fall back to longterm_lock_znode() */
23266+ spin_unlock_zlock(lock);
23267+ return 1;
23268+ }
23269+ }
23270+ return lock_tail(owner, result, ZNODE_READ_LOCK);
23271+ } else
23272+ return 1;
23273+}
23274+
23275+/* locks given lock object */
23276+int longterm_lock_znode(
23277+ /* local link object (allocated by lock owner thread, usually on its own
23278+ * stack) */
23279+ lock_handle * handle,
23280+ /* znode we want to lock. */
23281+ znode * node,
23282+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23283+ znode_lock_mode mode,
23284+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23285+ znode_lock_request request) {
23286+ int ret;
23287+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23288+ int non_blocking = 0;
23289+ int has_atom;
23290+ txn_capture cap_flags;
23291+ zlock *lock;
23292+ txn_handle *txnh;
23293+ tree_level level;
23294+
23295+ /* Get current process context */
23296+ lock_stack *owner = get_current_lock_stack();
23297+
23298+ /* Check that the lock handle is initialized and isn't already being
23299+ * used. */
23300+ assert("jmacd-808", handle->owner == NULL);
23301+ assert("nikita-3026", reiser4_schedulable());
23302+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23303+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23304+ /* long term locks are not allowed in the VM contexts (->writepage(),
23305+ * prune_{d,i}cache()).
23306+ *
23307+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23308+ * bug caused by d_splice_alias() only working for directories.
23309+ */
23310+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23311+ assert ("zam-1055", mode != ZNODE_NO_LOCK);
23312+
23313+ cap_flags = 0;
23314+ if (request & ZNODE_LOCK_NONBLOCK) {
23315+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
23316+ non_blocking = 1;
23317+ }
23318+
23319+ if (request & ZNODE_LOCK_DONT_FUSE)
23320+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
23321+
23322+	/* If we are changing our process priority we must adjust the
23323+	   number of high priority owners for each znode that we already hold locked */
23324+ if (hipri) {
23325+ set_high_priority(owner);
23326+ } else {
23327+ set_low_priority(owner);
23328+ }
23329+
23330+ level = znode_get_level(node);
23331+
23332+ /* Fill request structure with our values. */
23333+ owner->request.mode = mode;
23334+ owner->request.handle = handle;
23335+ owner->request.node = node;
23336+
23337+ txnh = get_current_context()->trans;
23338+ lock = &node->lock;
23339+
23340+ if (mode == ZNODE_READ_LOCK && request == 0) {
23341+ ret = longterm_lock_tryfast(owner);
23342+ if (ret <= 0)
23343+ return ret;
23344+ }
23345+
23346+ has_atom = (txnh->atom != NULL);
23347+
23348+ /* Synchronize on node's zlock guard lock. */
23349+ spin_lock_zlock(lock);
23350+
23351+ if (znode_is_locked(node) &&
23352+ mode == ZNODE_WRITE_LOCK && recursive(owner))
23353+ return lock_tail(owner, 0, mode);
23354+
23355+ for (;;) {
23356+		/* Check the lock's availability: if it is unavailable we get
23357+		   -E_REPEAT, 0 indicates "can_lock", otherwise the node is
23358+		   invalid. */
23359+ ret = can_lock_object(owner);
23360+
23361+ if (unlikely(ret == -EINVAL)) {
23362+ /* @node is dying. Leave it alone. */
23363+ break;
23364+ }
23365+
23366+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
23367+			/* either locking of @node by the current thread would
23368+			 * lead to a deadlock, or the lock modes are
23369+			 * incompatible. */
23370+ break;
23371+ }
23372+
23373+ assert("nikita-1844", (ret == 0)
23374+ || ((ret == -E_REPEAT) && !non_blocking));
23375+ /* If we can get the lock... Try to capture first before
23376+ taking the lock. */
23377+
23378+ /* first handle commonest case where node and txnh are already
23379+ * in the same atom. */
23380+ /* safe to do without taking locks, because:
23381+ *
23382+ * 1. read of aligned word is atomic with respect to writes to
23383+ * this word
23384+ *
23385+ * 2. false negatives are handled in reiser4_try_capture().
23386+ *
23387+ * 3. false positives are impossible.
23388+ *
23389+ * PROOF: left as an exercise to the curious reader.
23390+ *
23391+ * Just kidding. Here is one:
23392+ *
23393+ * At the time T0 txnh->atom is stored in txnh_atom.
23394+ *
23395+ * At the time T1 node->atom is stored in node_atom.
23396+ *
23397+ * At the time T2 we observe that
23398+ *
23399+ * txnh_atom != NULL && node_atom == txnh_atom.
23400+ *
23401+ * Imagine that at this moment we acquire node and txnh spin
23402+ * lock in this order. Suppose that under spin lock we have
23403+ *
23404+ * node->atom != txnh->atom, (S1)
23405+ *
23406+ * at the time T3.
23407+ *
23408+ * txnh->atom != NULL still, because txnh is open by the
23409+ * current thread.
23410+ *
23411+ * Suppose node->atom == NULL, that is, node was un-captured
23412+ * between T1, and T3. But un-capturing of formatted node is
23413+ * always preceded by the call to reiser4_invalidate_lock(),
23414+ * which marks znode as JNODE_IS_DYING under zlock spin
23415+ * lock. Contradiction, because can_lock_object() above checks
23416+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23417+ *
23418+ * Suppose that node->atom != node_atom, that is, atom, node
23419+ * belongs to was fused into another atom: node_atom was fused
23420+ * into node->atom. Atom of txnh was equal to node_atom at T2,
23421+ * which means that under spin lock, txnh->atom == node->atom,
23422+ * because txnh->atom can only follow fusion
23423+ * chain. Contradicts S1.
23424+ *
23425+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
23426+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
23427+ * contradicts S1. Hence S1 is false. QED.
23428+ *
23429+ */
23430+
23431+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23432+ ;
23433+ } else {
23434+ /*
23435+ * unlock zlock spin lock here. It is possible for
23436+ * longterm_unlock_znode() to sneak in here, but there
23437+ * is no harm: reiser4_invalidate_lock() will mark znode
23438+ * as JNODE_IS_DYING and this will be noted by
23439+ * can_lock_object() below.
23440+ */
23441+ spin_unlock_zlock(lock);
23442+ spin_lock_znode(node);
23443+ ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23444+ spin_unlock_znode(node);
23445+ spin_lock_zlock(lock);
23446+ if (unlikely(ret != 0)) {
23447+			/* In the failure case, the txnmgr releases
23448+			   the znode's lock (or in some cases, it was
23449+			   released a while ago). There's no need to
23450+			   reacquire it, so we should return here and
23451+			   avoid releasing the lock. */
23452+ owner->request.mode = 0;
23453+ break;
23454+ }
23455+
23456+ /* Check the lock's availability again -- this is
23457+ because under some circumstances the capture code
23458+ has to release and reacquire the znode spinlock. */
23459+ ret = can_lock_object(owner);
23460+ }
23461+
23462+ /* This time, a return of (ret == 0) means we can lock, so we
23463+ should break out of the loop. */
23464+ if (likely(ret != -E_REPEAT || non_blocking))
23465+ break;
23466+
23467+ /* Lock is unavailable, we have to wait. */
23468+ ret = reiser4_prepare_to_sleep(owner);
23469+ if (unlikely(ret != 0))
23470+ break;
23471+
23472+ assert_spin_locked(&(node->lock.guard));
23473+ if (hipri) {
23474+ /* If we are going in high priority direction then
23475+ increase high priority requests counter for the
23476+ node */
23477+ lock->nr_hipri_requests++;
23478+ if (mode == ZNODE_WRITE_LOCK)
23479+				lock->nr_hipri_write_requests++;
23480+ /* If there are no high priority owners for a node,
23481+ then immediately wake up low priority owners, so
23482+ they can detect possible deadlock */
23483+ if (lock->nr_hipri_owners == 0)
23484+ wake_up_all_lopri_owners(node);
23485+ }
23486+ list_add_tail(&owner->requestors_link, &lock->requestors);
23487+
23488+ /* Ok, here we have prepared a lock request, so unlock
23489+ a znode ... */
23490+ spin_unlock_zlock(lock);
23491+ /* ... and sleep */
23492+ reiser4_go_to_sleep(owner);
23493+ if (owner->request.mode == ZNODE_NO_LOCK)
23494+ goto request_is_done;
23495+ spin_lock_zlock(lock);
23496+ if (owner->request.mode == ZNODE_NO_LOCK) {
23497+ spin_unlock_zlock(lock);
23498+ request_is_done:
23499+ if (owner->request.ret_code == 0) {
23500+ LOCK_CNT_INC(long_term_locked_znode);
23501+ zref(node);
23502+ }
23503+ return owner->request.ret_code;
23504+ }
23505+ remove_lock_request(owner);
23506+ }
23507+
23508+ return lock_tail(owner, ret, mode);
23509+}
23510+
23511+/* Lock object invalidation means changing the lock object state to `INVALID'
23512+ and waiting for all other processes to cancel their lock requests. */
23513+void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23514+ * owner and lock
23515+ * object is being
23516+ * invalidated. */ )
23517+{
23518+ znode *node = handle->node;
23519+ lock_stack *owner = handle->owner;
23520+
23521+ assert("zam-325", owner == get_current_lock_stack());
23522+ assert("zam-103", znode_is_write_locked(node));
23523+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23524+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23525+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23526+ assert("nikita-3097", znode_is_wlocked_once(node));
23527+ assert_spin_locked(&(node->lock.guard));
23528+
23529+ if (handle->signaled)
23530+ atomic_dec(&owner->nr_signaled);
23531+
23532+ ZF_SET(node, JNODE_IS_DYING);
23533+ unlink_object(handle);
23534+ node->lock.nr_readers = 0;
23535+
23536+ invalidate_all_lock_requests(node);
23537+ spin_unlock_zlock(&node->lock);
23538+}
23539+
23540+/* Initializes lock_stack. */
23541+void init_lock_stack(lock_stack * owner /* pointer to
23542+ * allocated
23543+ * structure. */ )
23544+{
23545+ INIT_LIST_HEAD(&owner->locks);
23546+ INIT_LIST_HEAD(&owner->requestors_link);
23547+ spin_lock_init(&owner->sguard);
23548+ owner->curpri = 1;
23549+ init_waitqueue_head(&owner->wait);
23550+}
23551+
23552+/* Initializes lock object. */
23553+void reiser4_init_lock(zlock * lock /* pointer on allocated
23554+ * uninitialized lock object
23555+ * structure. */ )
23556+{
23557+ memset(lock, 0, sizeof(zlock));
23558+ spin_lock_init(&lock->guard);
23559+ INIT_LIST_HEAD(&lock->requestors);
23560+ INIT_LIST_HEAD(&lock->owners);
23561+}
23562+
23563+/* Transfer a lock handle (presumably so that variables can be moved between stack and
23564+ heap locations). */
23565+static void
23566+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23567+{
23568+ znode *node = old->node;
23569+ lock_stack *owner = old->owner;
23570+ int signaled;
23571+
23572+	/* locks_list, modified by link_object(), is not protected by
23573+	   anything. This is valid because only the current thread ever
23574+	   modifies the locks_list of its lock_stack.
23575+	 */
23576+ assert("nikita-1827", owner == get_current_lock_stack());
23577+ assert("nikita-1831", new->owner == NULL);
23578+
23579+ spin_lock_zlock(&node->lock);
23580+
23581+ signaled = old->signaled;
23582+ if (unlink_old) {
23583+ unlink_object(old);
23584+ } else {
23585+ if (node->lock.nr_readers > 0) {
23586+ node->lock.nr_readers += 1;
23587+ } else {
23588+ node->lock.nr_readers -= 1;
23589+ }
23590+ if (signaled) {
23591+ atomic_inc(&owner->nr_signaled);
23592+ }
23593+ if (owner->curpri) {
23594+ node->lock.nr_hipri_owners += 1;
23595+ }
23596+ LOCK_CNT_INC(long_term_locked_znode);
23597+
23598+ zref(node);
23599+ }
23600+ link_object(new, owner, node);
23601+ new->signaled = signaled;
23602+
23603+ spin_unlock_zlock(&node->lock);
23604+}
23605+
23606+void move_lh(lock_handle * new, lock_handle * old)
23607+{
23608+ move_lh_internal(new, old, /*unlink_old */ 1);
23609+}
23610+
23611+void copy_lh(lock_handle * new, lock_handle * old)
23612+{
23613+ move_lh_internal(new, old, /*unlink_old */ 0);
23614+}
23615+
23616+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23617+int reiser4_check_deadlock(void)
23618+{
23619+ lock_stack *owner = get_current_lock_stack();
23620+ return atomic_read(&owner->nr_signaled) != 0;
23621+}
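+/* An illustrative recovery sketch (the handle name is hypothetical): after
+ longterm_lock_znode() fails with -E_DEADLOCK, the caller is expected to
+ release its long-term locks until no signals remain, then restart:
+
+	if (ret == -E_DEADLOCK) {
+		while (reiser4_check_deadlock())
+			done_lh(&some_held_handle);
+		goto restart;
+	}
+*/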
23622+
23623+/* Before going to sleep we re-check "release lock" requests which might have
23624+ come from high-priority threads. */
23625+int reiser4_prepare_to_sleep(lock_stack * owner)
23626+{
23627+ assert("nikita-1847", owner == get_current_lock_stack());
23628+
23629+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23630+ * counted in nr_signaled */
23631+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23632+ assert("zam-959", !owner->curpri);
23633+ return RETERR(-E_DEADLOCK);
23634+ }
23635+ return 0;
23636+}
23637+
23638+/* Wakes up a single thread */
23639+void __reiser4_wake_up(lock_stack * owner)
23640+{
23641+ atomic_set(&owner->wakeup, 1);
23642+ wake_up(&owner->wait);
23643+}
23644+
23645+/* Puts a thread to sleep */
23646+void reiser4_go_to_sleep(lock_stack * owner)
23647+{
23648+	/* Well, we might sleep here, so holding of any spinlocks is a no-no */
23649+ assert("nikita-3027", reiser4_schedulable());
23650+
23651+ wait_event(owner->wait, atomic_read(&owner->wakeup));
23652+ atomic_set(&owner->wakeup, 0);
23653+}
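+/* The two functions above are paired as in longterm_lock_znode(): the
+ requestor calls reiser4_prepare_to_sleep(), queues itself on
+ lock->requestors, drops the zlock and calls reiser4_go_to_sleep(); the
+ waker sets request.ret_code and calls reiser4_wake_up(). The atomic
+ ->wakeup flag makes a wakeup that arrives before the sleep harmless. */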
23654+
23655+int lock_stack_isclean(lock_stack * owner)
23656+{
23657+ if (list_empty_careful(&owner->locks)) {
23658+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23659+ return 1;
23660+ }
23661+
23662+ return 0;
23663+}
23664+
23665+#if REISER4_DEBUG
23666+
23667+/*
23668+ * debugging functions
23669+ */
23670+
23671+static void list_check(struct list_head *head)
23672+{
23673+ struct list_head *pos;
23674+
23675+ list_for_each(pos, head)
23676+ assert("", (pos->prev != NULL && pos->next != NULL &&
23677+ pos->prev->next == pos && pos->next->prev == pos));
23678+}
23679+
23680+/* check consistency of locking data-structures hanging off the @stack */
23681+static void check_lock_stack(lock_stack * stack)
23682+{
23683+ spin_lock_stack(stack);
23684+ /* check that stack->locks is not corrupted */
23685+ list_check(&stack->locks);
23686+ spin_unlock_stack(stack);
23687+}
23688+
23689+/* check consistency of locking data structures */
23690+void check_lock_data(void)
23691+{
23692+ check_lock_stack(&get_current_context()->stack);
23693+}
23694+
23695+/* check consistency of locking data structures for @node */
23696+void check_lock_node_data(znode * node)
23697+{
23698+ spin_lock_zlock(&node->lock);
23699+ list_check(&node->lock.owners);
23700+ list_check(&node->lock.requestors);
23701+ spin_unlock_zlock(&node->lock);
23702+}
23703+
23704+/* check that the given lock request is deadlock safe. This check is, of
23705+ * course, not exhaustive. */
23706+static int
23707+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23708+ znode_lock_request request)
23709+{
23710+ lock_stack *owner;
23711+
23712+ owner = get_current_lock_stack();
23713+ /*
23714+ * check that hipri lock request is not issued when there are locked
23715+ * nodes at the higher levels.
23716+ */
23717+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23718+ znode_get_level(node) != 0) {
23719+ lock_handle *item;
23720+
23721+ list_for_each_entry(item, &owner->locks, locks_link) {
23722+ znode *other;
23723+
23724+ other = item->node;
23725+
23726+ if (znode_get_level(other) == 0)
23727+ continue;
23728+ if (znode_get_level(other) > znode_get_level(node))
23729+ return 0;
23730+ }
23731+ }
23732+ return 1;
23733+}
23734+
23735+#endif
23736+
23737+/* return a pointer to static storage with the name of the lock mode. For
23738+ debugging */
23739+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23740+{
23741+ if (lock == ZNODE_READ_LOCK)
23742+ return "read";
23743+ else if (lock == ZNODE_WRITE_LOCK)
23744+ return "write";
23745+ else {
23746+ static char buf[30];
23747+
23748+ sprintf(buf, "unknown: %i", lock);
23749+ return buf;
23750+ }
23751+}
23752+
23753+/* Make Linus happy.
23754+ Local variables:
23755+ c-indentation-style: "K&R"
23756+ mode-name: "LC"
23757+ c-basic-offset: 8
23758+ tab-width: 8
23759+ fill-column: 79
23760+ End:
23761+*/
23762diff -urN linux-2.6.20.orig/fs/reiser4/lock.h linux-2.6.20/fs/reiser4/lock.h
23763--- linux-2.6.20.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23764+++ linux-2.6.20/fs/reiser4/lock.h 2007-05-06 14:50:43.742989473 +0400
23765@@ -0,0 +1,249 @@
23766+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23767+
23768+/* Long term locking data structures. See lock.c for details. */
23769+
23770+#ifndef __LOCK_H__
23771+#define __LOCK_H__
23772+
23773+#include "forward.h"
23774+#include "debug.h"
23775+#include "dformat.h"
23776+#include "key.h"
23777+#include "coord.h"
23778+#include "plugin/node/node.h"
23779+#include "txnmgr.h"
23780+#include "readahead.h"
23781+
23782+#include <linux/types.h>
23783+#include <linux/spinlock.h>
23784+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23785+#include <asm/atomic.h>
23786+#include <linux/wait.h>
23787+
23788+/* Per-znode lock object */
23789+struct zlock {
23790+ spinlock_t guard;
23791+ /* The number of readers if positive; the number of recursively taken
23792+ write locks if negative. Protected by zlock spin lock. */
23793+ int nr_readers;
23794+	/* The number of processes (lock_stacks) that have this object
23795+	   locked with high priority */
23796+	unsigned nr_hipri_owners;
23797+	/* The number of attempts to lock the znode in high priority direction */
23798+	unsigned nr_hipri_requests;
23799+	/* The number of pending high priority write lock requests */
23800+	unsigned nr_hipri_write_requests;
23801+	/* A linked list of lock_handle objects for all lock_stacks which have this lock object locked */
23802+	struct list_head owners;
23803+ /* A linked list of lock_stacks that wait for this lock */
23804+ struct list_head requestors;
23805+};
23806+
23807+static inline void spin_lock_zlock(zlock *lock)
23808+{
23809+ /* check that zlock is not locked */
23810+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
23811+ /* check that spinlocks of lower priorities are not held */
23812+ assert("", LOCK_CNT_NIL(spin_locked_stack));
23813+
23814+ spin_lock(&lock->guard);
23815+
23816+ LOCK_CNT_INC(spin_locked_zlock);
23817+ LOCK_CNT_INC(spin_locked);
23818+}
23819+
23820+static inline void spin_unlock_zlock(zlock *lock)
23821+{
23822+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23823+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23824+
23825+ LOCK_CNT_DEC(spin_locked_zlock);
23826+ LOCK_CNT_DEC(spin_locked);
23827+
23828+ spin_unlock(&lock->guard);
23829+}
23830+
23831+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23832+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23833+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23834+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23835+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
23836+#define lock_mode_compatible(lock, mode) \
23837+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23838+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
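+
+/* nr_readers encoding examples: 0 - unlocked; 3 - three read locks;
+ -1 - write locked once; -2 - a write lock taken recursively twice. */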
23839+
23840+/* Since we have R/W znode locks we need additional bidirectional `link'
23841+ objects to implement n<->m relationship between lock owners and lock
23842+ objects. We call them `lock handles'.
23843+
23844+ Locking: see lock.c/"SHORT-TERM LOCKING"
23845+*/
23846+struct lock_handle {
23847+	/* This flag indicates that a signal to yield the lock was passed to
23848+	   the lock owner and counted in owner->nr_signaled
23849+
23850+ Locking: this is accessed under spin lock on ->node.
23851+ */
23852+ int signaled;
23853+ /* A link to owner of a lock */
23854+ lock_stack *owner;
23855+ /* A link to znode locked */
23856+ znode *node;
23857+ /* A list of all locks for a process */
23858+ struct list_head locks_link;
23859+ /* A list of all owners for a znode */
23860+ struct list_head owners_link;
23861+};
23862+
23863+typedef struct lock_request {
23864+ /* A pointer to uninitialized link object */
23865+ lock_handle *handle;
23866+ /* A pointer to the object we want to lock */
23867+ znode *node;
23868+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23869+ znode_lock_mode mode;
23870+	/* field through which dispatch_lock_requests() returns the lock request result code */
23871+ int ret_code;
23872+} lock_request;
23873+
23874+/* A lock stack structure for accumulating locks owned by a process */
23875+struct lock_stack {
23876+ /* A guard lock protecting a lock stack */
23877+ spinlock_t sguard;
23878+	/* number of "give me the lock" signals received from high priority processes */
23879+ atomic_t nr_signaled;
23880+ /* Current priority of a process
23881+
23882+ This is only accessed by the current thread and thus requires no
23883+ locking.
23884+ */
23885+ int curpri;
23886+ /* A list of all locks owned by this process. Elements can be added to
23887+ * this list only by the current thread. ->node pointers in this list
23888+ * can be only changed by the current thread. */
23889+ struct list_head locks;
23890+	/* When a lock_stack waits for a lock, it puts itself on the doubly-linked
23891+	   requestors list of that lock */
23892+ struct list_head requestors_link;
23893+ /* Current lock request info.
23894+
23895+ This is only accessed by the current thread and thus requires no
23896+ locking.
23897+ */
23898+ lock_request request;
23899+ /* the following two fields are the lock stack's
23900+ * synchronization object to use with the standard linux/wait.h
23901+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23902+ * usage details. */
23903+ wait_queue_head_t wait;
23904+ atomic_t wakeup;
23905+#if REISER4_DEBUG
23906+ int nr_locks; /* number of lock handles in the above list */
23907+#endif
23908+};
23909+
23910+/*
23911+ User-visible znode locking functions
23912+*/
23913+
23914+extern int longterm_lock_znode(lock_handle * handle,
23915+ znode * node,
23916+ znode_lock_mode mode,
23917+ znode_lock_request request);
23918+
23919+extern void longterm_unlock_znode(lock_handle * handle);
23920+
23921+extern int reiser4_check_deadlock(void);
23922+
23923+extern lock_stack *get_current_lock_stack(void);
23924+
23925+extern void init_lock_stack(lock_stack * owner);
23926+extern void reiser4_init_lock(zlock * lock);
23927+
23928+static inline void init_lh(lock_handle *lh)
23929+{
23930+#if REISER4_DEBUG
23931+ memset(lh, 0, sizeof *lh);
23932+ INIT_LIST_HEAD(&lh->locks_link);
23933+ INIT_LIST_HEAD(&lh->owners_link);
23934+#else
23935+ lh->node = NULL;
23936+#endif
23937+}
23938+
23939+static inline void done_lh(lock_handle *lh)
23940+{
23941+ assert("zam-342", lh != NULL);
23942+ if (lh->node != NULL)
23943+ longterm_unlock_znode(lh);
23944+}
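+
+/* A minimal usage sketch (@node is assumed to be a znode the caller already
+ holds a reference to):
+
+	lock_handle lh;
+
+	init_lh(&lh);
+	if (longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
+				ZNODE_LOCK_LOPRI) == 0) {
+		... read the node ...
+		done_lh(&lh);
+	}
+*/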
23945+
23946+extern void move_lh(lock_handle * new, lock_handle * old);
23947+extern void copy_lh(lock_handle * new, lock_handle * old);
23948+
23949+extern int reiser4_prepare_to_sleep(lock_stack * owner);
23950+extern void reiser4_go_to_sleep(lock_stack * owner);
23951+extern void __reiser4_wake_up(lock_stack * owner);
23952+
23953+extern int lock_stack_isclean(lock_stack * owner);
23954+
23955+/* zlock object state check macros: only used in assertions. Both forms imply that the
23956+ lock is held by the current thread. */
23957+extern int znode_is_write_locked(const znode *);
23958+extern void reiser4_invalidate_lock(lock_handle *);
23959+
23960+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
23961+#define spin_ordering_pred_stack(stack) \
23962+ (LOCK_CNT_NIL(spin_locked_stack) && \
23963+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
23964+ LOCK_CNT_NIL(spin_locked_inode) && \
23965+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
23966+ LOCK_CNT_NIL(spin_locked_super_eflush) )
23967+
23968+static inline void spin_lock_stack(lock_stack *stack)
23969+{
23970+ assert("", spin_ordering_pred_stack(stack));
23971+ spin_lock(&(stack->sguard));
23972+ LOCK_CNT_INC(spin_locked_stack);
23973+ LOCK_CNT_INC(spin_locked);
23974+}
23975+
23976+static inline void spin_unlock_stack(lock_stack *stack)
23977+{
23978+ assert_spin_locked(&(stack->sguard));
23979+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
23980+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23981+ LOCK_CNT_DEC(spin_locked_stack);
23982+ LOCK_CNT_DEC(spin_locked);
23983+ spin_unlock(&(stack->sguard));
23984+}
23985+
23986+static inline void reiser4_wake_up(lock_stack * owner)
23987+{
23988+ spin_lock_stack(owner);
23989+ __reiser4_wake_up(owner);
23990+ spin_unlock_stack(owner);
23991+}
23992+
23993+const char *lock_mode_name(znode_lock_mode lock);
23994+
23995+#if REISER4_DEBUG
23996+extern void check_lock_data(void);
23997+extern void check_lock_node_data(znode * node);
23998+#else
23999+#define check_lock_data() noop
24000+#define check_lock_node_data(node) noop
24001+#endif
24002+
24003+/* __LOCK_H__ */
24004+#endif
24005+
24006+/* Make Linus happy.
24007+ Local variables:
24008+ c-indentation-style: "K&R"
24009+ mode-name: "LC"
24010+ c-basic-offset: 8
24011+ tab-width: 8
24012+ fill-column: 120
24013+ End:
24014+*/
24015diff -urN linux-2.6.20.orig/fs/reiser4/Makefile linux-2.6.20/fs/reiser4/Makefile
24016--- linux-2.6.20.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24017+++ linux-2.6.20/fs/reiser4/Makefile 2007-05-06 14:50:43.742989473 +0400
24018@@ -0,0 +1,99 @@
24019+#
24020+# reiser4/Makefile
24021+#
24022+
24023+obj-$(CONFIG_REISER4_FS) += reiser4.o
24024+
24025+reiser4-y := \
24026+ debug.o \
24027+ jnode.o \
24028+ znode.o \
24029+ key.o \
24030+ pool.o \
24031+ tree_mod.o \
24032+ estimate.o \
24033+ carry.o \
24034+ carry_ops.o \
24035+ lock.o \
24036+ tree.o \
24037+ context.o \
24038+ tap.o \
24039+ coord.o \
24040+ block_alloc.o \
24041+ txnmgr.o \
24042+ kassign.o \
24043+ flush.o \
24044+ wander.o \
24045+ eottl.o \
24046+ search.o \
24047+ page_cache.o \
24048+ seal.o \
24049+ dscale.o \
24050+ flush_queue.o \
24051+ ktxnmgrd.o \
24052+ blocknrset.o \
24053+ super.o \
24054+ super_ops.o \
24055+ fsdata.o \
24056+ export_ops.o \
24057+ oid.o \
24058+ tree_walk.o \
24059+ inode.o \
24060+ vfs_ops.o \
24061+ as_ops.o \
24062+ entd.o\
24063+ readahead.o \
24064+ status_flags.o \
24065+ init_super.o \
24066+ safe_link.o \
24067+ \
24068+ plugin/plugin.o \
24069+ plugin/plugin_set.o \
24070+ plugin/node/node.o \
24071+ plugin/object.o \
24072+ plugin/cluster.o \
24073+ plugin/inode_ops.o \
24074+ plugin/inode_ops_rename.o \
24075+ plugin/file_ops.o \
24076+ plugin/file_ops_readdir.o \
24077+ plugin/file_plugin_common.o \
24078+ plugin/file/file.o \
24079+ plugin/file/tail_conversion.o \
24080+ plugin/file/file_conversion.o \
24081+ plugin/file/symlink.o \
24082+ plugin/file/cryptcompress.o \
24083+ plugin/dir_plugin_common.o \
24084+ plugin/dir/hashed_dir.o \
24085+ plugin/dir/seekable_dir.o \
24086+ plugin/node/node40.o \
24087+ \
24088+ plugin/crypto/cipher.o \
24089+ plugin/crypto/digest.o \
24090+ \
24091+ plugin/compress/minilzo.o \
24092+ plugin/compress/compress.o \
24093+ plugin/compress/compress_mode.o \
24094+ \
24095+ plugin/item/static_stat.o \
24096+ plugin/item/sde.o \
24097+ plugin/item/cde.o \
24098+ plugin/item/blackbox.o \
24099+ plugin/item/internal.o \
24100+ plugin/item/tail.o \
24101+ plugin/item/ctail.o \
24102+ plugin/item/extent.o \
24103+ plugin/item/extent_item_ops.o \
24104+ plugin/item/extent_file_ops.o \
24105+ plugin/item/extent_flush_ops.o \
24106+ \
24107+ plugin/hash.o \
24108+ plugin/fibration.o \
24109+ plugin/tail_policy.o \
24110+ plugin/item/item.o \
24111+ \
24112+ plugin/security/perm.o \
24113+ plugin/space/bitmap.o \
24114+ \
24115+ plugin/disk_format/disk_format40.o \
24116+ plugin/disk_format/disk_format.o
24117+
24118diff -urN linux-2.6.20.orig/fs/reiser4/oid.c linux-2.6.20/fs/reiser4/oid.c
24119--- linux-2.6.20.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24120+++ linux-2.6.20/fs/reiser4/oid.c 2007-05-06 14:50:43.742989473 +0400
24121@@ -0,0 +1,141 @@
24122+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24123+
24124+#include "debug.h"
24125+#include "super.h"
24126+#include "txnmgr.h"
24127+
24128+/* We used to have an oid allocation plugin. It was removed because it
24129+ was recognized as providing an unneeded level of abstraction. If anyone
24130+ ever finds it useful, look at yet_unneeded_abstractions/oid
24131+*/
24132+
24133+/*
24134+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
24135+ * are provided by disk format plugin that reads them from the disk during
24136+ * mount.
24137+ */
24138+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24139+{
24140+ reiser4_super_info_data *sbinfo;
24141+
24142+ sbinfo = get_super_private(super);
24143+
24144+ sbinfo->next_to_use = next;
24145+ sbinfo->oids_in_use = nr_files;
24146+ return 0;
24147+}
24148+
24149+/*
24150+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24151+ * runs out of oids.
24152+ */
24153+oid_t oid_allocate(struct super_block * super)
24154+{
24155+ reiser4_super_info_data *sbinfo;
24156+ oid_t oid;
24157+
24158+ sbinfo = get_super_private(super);
24159+
24160+ spin_lock_reiser4_super(sbinfo);
24161+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24162+ oid = sbinfo->next_to_use++;
24163+ sbinfo->oids_in_use++;
24164+ } else
24165+ oid = ABSOLUTE_MAX_OID;
24166+ spin_unlock_reiser4_super(sbinfo);
24167+ return oid;
24168+}
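+/* A usage sketch for the create path (the error handling shown is
+ illustrative, not actual caller code):
+
+	oid = oid_allocate(super);
+	if (oid == ABSOLUTE_MAX_OID)
+		return an out-of-object-ids error;
+	... pass the point of no return in file creation ...
+	oid_count_allocated();
+*/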
24169+
24170+/*
24171+ * Tell oid allocator that @oid is now free.
24172+ */
24173+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24174+{
24175+ reiser4_super_info_data *sbinfo;
24176+
24177+ sbinfo = get_super_private(super);
24178+
24179+ spin_lock_reiser4_super(sbinfo);
24180+ sbinfo->oids_in_use--;
24181+ spin_unlock_reiser4_super(sbinfo);
24182+ return 0;
24183+}
24184+
24185+/*
24186+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
24187+ * without actually allocating it. This is used by disk format plugin to save
24188+ * oid allocator state on the disk.
24189+ */
24190+oid_t oid_next(const struct super_block * super)
24191+{
24192+ reiser4_super_info_data *sbinfo;
24193+ oid_t oid;
24194+
24195+ sbinfo = get_super_private(super);
24196+
24197+ spin_lock_reiser4_super(sbinfo);
24198+ oid = sbinfo->next_to_use;
24199+ spin_unlock_reiser4_super(sbinfo);
24200+ return oid;
24201+}
24202+
24203+/*
24204+ * returns number of currently used oids. This is used by statfs(2) to report
24205+ * number of "inodes" and by disk format plugin to save oid allocator state on
24206+ * the disk.
24207+ */
24208+long oids_used(const struct super_block *super)
24209+{
24210+ reiser4_super_info_data *sbinfo;
24211+ oid_t used;
24212+
24213+ sbinfo = get_super_private(super);
24214+
24215+ spin_lock_reiser4_super(sbinfo);
24216+ used = sbinfo->oids_in_use;
24217+ spin_unlock_reiser4_super(sbinfo);
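+	/* clamp: if the count does not fit in a signed long (it can exceed
+	   2^31 - 1 on a 32-bit host), report -1 instead of a truncated value */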
24218+ if (used < (__u64) ((long)~0) >> 1)
24219+ return (long)used;
24220+ else
24221+ return (long)-1;
24222+}
24223+
24224+/*
24225+ * Count oid as allocated in atom. This is done after call to oid_allocate()
24226+ * at the point when we are irrevocably committed to creation of the new file
24227+ * (i.e., when oid allocation cannot be any longer rolled back due to some
24228+ * error).
24229+ */
24230+void oid_count_allocated(void)
24231+{
24232+ txn_atom *atom;
24233+
24234+ atom = get_current_atom_locked();
24235+ atom->nr_objects_created++;
24236+ spin_unlock_atom(atom);
24237+}
24238+
24239+/*
24240+ * Count oid as free in atom. This is done after call to oid_release() at the
24241+ * point when we are irrevocably committed to the deletion of the file (i.e.,
24242+ * when oid release cannot be any longer rolled back due to some error).
24243+ */
24244+void oid_count_released(void)
24245+{
24246+ txn_atom *atom;
24247+
24248+ atom = get_current_atom_locked();
24249+ atom->nr_objects_deleted++;
24250+ spin_unlock_atom(atom);
24251+}
24252+
24253+/*
24254+ Local variables:
24255+ c-indentation-style: "K&R"
24256+ mode-name: "LC"
24257+ c-basic-offset: 8
24258+ tab-width: 8
24259+ fill-column: 120
24260+ scroll-step: 1
24261+ End:
24262+*/
24263diff -urN linux-2.6.20.orig/fs/reiser4/page_cache.c linux-2.6.20/fs/reiser4/page_cache.c
24264--- linux-2.6.20.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24265+++ linux-2.6.20/fs/reiser4/page_cache.c 2007-05-06 14:50:43.742989473 +0400
24266@@ -0,0 +1,736 @@
24267+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24268+ * reiser4/README */
24269+
24270+/* Memory pressure hooks. Fake inodes handling. */
24271+
24272+/* GLOSSARY
24273+
24274+ . Formatted and unformatted nodes.
24275+ Elements of reiser4 balanced tree to store data and metadata.
24276+ Unformatted nodes are pointed to by extent pointers. Such nodes
24277+ are used to store data of large objects. Unlike unformatted nodes,
24278+ formatted ones have associated format described by node4X plugin.
24279+
24280+ . Jnode (or journal node)
24281+ The in-memory header which is used to track formatted and unformatted
24282+ nodes, bitmap nodes, etc. In particular, jnodes are used to track
24283+ transactional information associated with each block(see reiser4/jnode.c
24284+ for details).
24285+
24286+ . Znode
24287+ The in-memory header which is used to track formatted nodes. Contains
24288+ embedded jnode (see reiser4/znode.c for details).
24289+*/
24290+
24291+/* We store all file system meta data (and data, of course) in the page cache.
24292+
24293+ What does this mean? Instead of using bread/brelse we create a special
24294+ "fake" inode (one per super block) and store the content of formatted nodes
24295+ in pages bound to this inode in the page cache. In newer kernels bread()
24296+ already uses inode attached to block device (bd_inode). Advantage of having
24297+ our own fake inode is that we can install appropriate methods in its
24298+ address_space operations. Such methods are called by VM on memory pressure
24299+ (or during background page flushing) and we can use them to react
24300+ appropriately.
24301+
24302+ In the initial version we only support one block per page. Support for
24303+ multiple blocks per page is complicated by relocation.
24304+
24305+ To each page, used by reiser4, jnode is attached. jnode is analogous to
24306+ buffer head. Difference is that jnode is bound to the page permanently:
24307+ jnode cannot be removed from memory until its backing page is.
24308+
24309+ A jnode contains a pointer to the page (->pg field) and the page contains
24310+ a pointer to the jnode in its ->private field. The pointer from jnode to
24311+ page is protected by the jnode's spinlock; the pointer from page to jnode
24312+ is protected by the page lock (PG_locked bit). Lock ordering is: first take
24313+ the page lock, then the jnode spin lock. To go in the reverse direction use
24314+ the jnode_lock_page() function, which uses the standard try-lock-and-release device.
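+ Sketch of this ordering (using helpers defined elsewhere in reiser4):
+
+	lock_page(page);         - page lock first
+	node = jprivate(page);
+	spin_lock_jnode(node);   - then jnode spin lock
+	...
+	spin_unlock_jnode(node);
+	unlock_page(page);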
24315+
24316+ Properties:
24317+
24318+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24319+ reference counter is increased.
24320+
24321+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page
24322+ reference counter is decreased.
24323+
24324+ 3. on jload() reference counter on jnode page is increased, page is
24325+ kmapped and `referenced'.
24326+
24327+ 4. on jrelse() inverse operations are performed.
24328+
24329+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24330+
24331+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24332+ historically.]
24333+
24334+ [In the following discussion, `lock' invariably means long term lock on
24335+ znode.] (What about page locks?)
24336+
24337+ There is a special class of deadlock possibilities related to memory
24338+ pressure. Locks acquired by other reiser4 threads are accounted for in the
24339+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24340+ invoked an additional hidden arc is added to the locking graph: the thread
24341+ that tries to allocate memory waits for ->vm_writeback() to finish. If this
24342+ thread holds a lock and ->vm_writeback() tries to acquire this lock,
24343+ deadlock prevention is useless.
24344+
24345+ Another related problem is the possibility for ->vm_writeback() to run out
24346+ of memory itself. This is not a problem for ext2 and friends, because their
24347+ ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
24348+ definitely able to allocate huge amounts of memory.
24349+
24350+ It seems that there is no reliable way to cope with the problems above. In
24351+ stead it was decided that ->vm_writeback() (as invoked in the kswapd
24352+ context) wouldn't perform any flushing itself, but rather should just wake
24353+ up some auxiliary thread dedicated for this purpose (or, the same thread
24354+ that does periodic commit of old atoms (ktxnmgrd.c)).
24355+
24356+ Details:
24357+
24358+ 1. A page is called `reclaimable' against a particular reiser4 mount F if
24359+ the page can be ultimately released by try_to_free_pages() under the
24360+ presumptions that:
24361+
24362+ a. ->vm_writeback() for F is a no-op, and
24363+
24364+ b. none of the threads accessing F are making any progress, and
24365+
24366+ c. other reiser4 mounts obey the same memory reservation protocol as F
24367+ (described below).
24368+
24369+ For example, a clean un-pinned page, or a page occupied by ext2 data, is
24370+ reclaimable against any reiser4 mount.
24371+
24372+ When there is more than one reiser4 mount in a system, condition (c) makes
24373+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24374+
24375+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24376+
24377+ The fake inode is used to bind formatted nodes, and each node is indexed
24378+ within the fake inode by its block number. If the block size is smaller
24379+ than the page size, it may happen that the block mapped to the page with a
24380+ formatted node is occupied by an unformatted node or is unallocated. This
24381+ leads to some complications, because flushing the whole page can lead to an
24382+ incorrect overwrite of an unformatted node which, moreover, can be cached
24383+ in some other place as part of the file body. To avoid this, buffers for
24384+ unformatted nodes are never marked dirty. Also, pages in the fake inode are
24385+ never marked dirty. This rules out the usage of ->writepage() as a memory
24386+ pressure hook. Instead ->releasepage() is used.
24387+
24388+ Josh is concerned that page->buffer is going to die. This should not pose a
24389+ significant problem though, because we need to add some data structures to
24390+ the page anyway (jnode) and all necessary bookkeeping can be put there.
24391+
24392+*/
24393+
24394+/* Life cycle of pages/nodes.
24395+
24396+ A jnode contains a reference to its page and the page contains a reference
24397+ back to the jnode. This reference is counted in the page's ->count. Thus,
24398+ a page bound to a jnode cannot be released back into the free pool.
24399+
24400+ 1. Formatted nodes.
24401+
24402+ 1. a formatted node is represented by a znode. When a new znode is created
24403+ its ->pg pointer is initially NULL.
24404+
24405+ 2. when node content is loaded into the znode (by a call to zload()) for
24406+ the first time, the following happens (in a call to ->read_node() or
24407+ ->allocate_node()):
24408+
24409+ 1. new page is added to the page cache.
24410+
24411+ 2. this page is attached to znode and its ->count is increased.
24412+
24413+ 3. page is kmapped.
24414+
24415+ 3. if more calls to zload() follow (without corresponding zrelses), the
24416+ page counter is left intact and instead ->d_count is increased in the znode.
24417+
24418+ 4. each call to zrelse() decreases ->d_count. When ->d_count drops to zero,
24419+ ->release_node() is called and the page is kunmapped as a result.
24420+
24421+ 5. at some moment node can be captured by a transaction. Its ->x_count
24422+ is then increased by transaction manager.
24423+
24424+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24425+ bit set) following will happen (also see comment at the top of znode.c):
24426+
24427+ 1. when the last lock is released, the node will be uncaptured from the
24428+ transaction. This releases the reference that the transaction manager
24429+ acquired at step 5.
24430+
24431+ 2. when last reference is released, zput() detects that node is
24432+ actually deleted and calls ->delete_node()
24433+ operation. page_cache_delete_node() implementation detaches jnode from
24434+ page and releases page.
24435+
24436+ 7. otherwise (the node wasn't removed from the tree), the last reference
24437+ to the znode will be released after the transaction manager has committed
24438+ the transaction the node was in. This implies squallocing of this node
24439+ (see flush.c). Nothing special happens at this point. The znode is still
24440+ in the hash table and the page is still attached to it.
24441+
24442+ 8. the znode is actually removed from memory because of memory
24443+ pressure, or during umount (znodes_tree_done()). Either way, the znode is
24444+ removed by a call to zdrop(). At this moment, the page is detached from
24445+ the znode and removed from the inode address space.
24446+
24447+*/
24448+
24449+#include "debug.h"
24450+#include "dformat.h"
24451+#include "key.h"
24452+#include "txnmgr.h"
24453+#include "jnode.h"
24454+#include "znode.h"
24455+#include "block_alloc.h"
24456+#include "tree.h"
24457+#include "vfs_ops.h"
24458+#include "inode.h"
24459+#include "super.h"
24460+#include "entd.h"
24461+#include "page_cache.h"
24462+#include "ktxnmgrd.h"
24463+
24464+#include <linux/types.h>
24465+#include <linux/fs.h>
24466+#include <linux/mm.h> /* for struct page */
24467+#include <linux/swap.h> /* for struct page */
24468+#include <linux/pagemap.h>
24469+#include <linux/bio.h>
24470+#include <linux/writeback.h>
24471+#include <linux/blkdev.h>
24472+
24473+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24474+
24475+static struct address_space_operations formatted_fake_as_ops;
24476+
24477+static const oid_t fake_ino = 0x1;
24478+static const oid_t bitmap_ino = 0x2;
24479+static const oid_t cc_ino = 0x3;
24480+
24481+static void
24482+init_fake_inode(struct super_block *super, struct inode *fake,
24483+ struct inode **pfake)
24484+{
24485+ assert("nikita-2168", fake->i_state & I_NEW);
24486+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
24487+ *pfake = fake;
24488+ /* NOTE-NIKITA something else? */
24489+ unlock_new_inode(fake);
24490+}
24491+
24492+/**
24493+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24494+ * @super: super block to init fake inode for
24495+ *
24496+ * Initializes fake inode to which formatted nodes are bound in the page cache
24497+ * and inode for bitmaps.
24498+ */
24499+int reiser4_init_formatted_fake(struct super_block *super)
24500+{
24501+ struct inode *fake;
24502+ struct inode *bitmap;
24503+ struct inode *cc;
24504+ reiser4_super_info_data *sinfo;
24505+
24506+ assert("nikita-1703", super != NULL);
24507+
24508+ sinfo = get_super_private_nocheck(super);
24509+ fake = iget_locked(super, oid_to_ino(fake_ino));
24510+
24511+ if (fake != NULL) {
24512+ init_fake_inode(super, fake, &sinfo->fake);
24513+
24514+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24515+ if (bitmap != NULL) {
24516+ init_fake_inode(super, bitmap, &sinfo->bitmap);
24517+
24518+ cc = iget_locked(super, oid_to_ino(cc_ino));
24519+ if (cc != NULL) {
24520+ init_fake_inode(super, cc, &sinfo->cc);
24521+ return 0;
24522+ } else {
24523+ iput(sinfo->fake);
24524+ iput(sinfo->bitmap);
24525+ sinfo->fake = NULL;
24526+ sinfo->bitmap = NULL;
24527+ }
24528+ } else {
24529+ iput(sinfo->fake);
24530+ sinfo->fake = NULL;
24531+ }
24532+ }
24533+ return RETERR(-ENOMEM);
24534+}
24535+
24536+/**
24537+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24538+ * @super: super block to init fake inode for
24539+ *
24540+ * Releases inodes which were used as address spaces of bitmap and formatted
24541+ * nodes.
24542+ */
24543+void reiser4_done_formatted_fake(struct super_block *super)
24544+{
24545+ reiser4_super_info_data *sinfo;
24546+
24547+ sinfo = get_super_private_nocheck(super);
24548+
24549+ if (sinfo->fake != NULL) {
24550+ iput(sinfo->fake);
24551+ sinfo->fake = NULL;
24552+ }
24553+
24554+ if (sinfo->bitmap != NULL) {
24555+ iput(sinfo->bitmap);
24556+ sinfo->bitmap = NULL;
24557+ }
24558+
24559+ if (sinfo->cc != NULL) {
24560+ iput(sinfo->cc);
24561+ sinfo->cc = NULL;
24562+ }
24563+ return;
24564+}
24565+
24566+void reiser4_wait_page_writeback(struct page *page)
24567+{
24568+ assert("zam-783", PageLocked(page));
24569+
24570+ do {
24571+ unlock_page(page);
24572+ wait_on_page_writeback(page);
24573+ lock_page(page);
24574+ } while (PageWriteback(page));
24575+}
24576+
24577+/* return tree @page is in */
24578+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24579+{
24580+ assert("nikita-2461", page != NULL);
24581+ return &get_super_private(page->mapping->host->i_sb)->tree;
24582+}
24583+
24584+/* completion handler for single page bio-based read.
24585+
24586+ mpage_end_io_read() would also do. But it's static.
24587+
24588+*/
24589+static int
24590+end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24591+ int err UNUSED_ARG)
24592+{
24593+ struct page *page;
24594+
24595+ if (bio->bi_size != 0) {
24596+ warning("nikita-3332", "Truncated single page read: %i",
24597+ bio->bi_size);
24598+ return 1;
24599+ }
24600+
24601+ page = bio->bi_io_vec[0].bv_page;
24602+
24603+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24604+ SetPageUptodate(page);
24605+ } else {
24606+ ClearPageUptodate(page);
24607+ SetPageError(page);
24608+ }
24609+ unlock_page(page);
24610+ bio_put(bio);
24611+ return 0;
24612+}
24613+
24614+/* completion handler for single page bio-based write.
24615+
24616+ mpage_end_io_write() would also do. But it's static.
24617+
24618+*/
24619+static int
24620+end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24621+ int err UNUSED_ARG)
24622+{
24623+ struct page *page;
24624+
24625+ if (bio->bi_size != 0) {
24626+ warning("nikita-3333", "Truncated single page write: %i",
24627+ bio->bi_size);
24628+ return 1;
24629+ }
24630+
24631+ page = bio->bi_io_vec[0].bv_page;
24632+
24633+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24634+ SetPageError(page);
24635+ end_page_writeback(page);
24636+ bio_put(bio);
24637+ return 0;
24638+}
24639+
24640+/* ->readpage() method for formatted nodes */
24641+static int formatted_readpage(struct file *f UNUSED_ARG,
24642+ struct page *page /* page to read */ )
24643+{
24644+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
24645+ return reiser4_page_io(page, jprivate(page), READ,
24646+ reiser4_ctx_gfp_mask_get());
24647+}
24648+
24649+/**
24650+ * reiser4_page_io - submit single-page bio request
24651+ * @page: page to perform io for
24652+ * @node: jnode of page
24653+ * @rw: read or write
24654+ * @gfp: gfp mask for bio allocation
24655+ *
24656+ * Submits single page read or write.
24657+ */
24658+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24659+{
24660+ struct bio *bio;
24661+ int result;
24662+
24663+ assert("nikita-2094", page != NULL);
24664+ assert("nikita-2226", PageLocked(page));
24665+ assert("nikita-2634", node != NULL);
24666+ assert("nikita-2893", rw == READ || rw == WRITE);
24667+
24668+	if (rw == WRITE) {
+		/* silently discard writes to a read-only mount */
24669+		if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24670+			unlock_page(page);
24671+			return 0;
24672+		}
24673+	}
24674+
24675+ bio = page_bio(page, node, rw, gfp);
24676+ if (!IS_ERR(bio)) {
24677+ if (rw == WRITE) {
24678+ SetPageWriteback(page);
24679+ unlock_page(page);
24680+ }
24681+ reiser4_submit_bio(rw, bio);
24682+ result = 0;
24683+ } else {
24684+ unlock_page(page);
24685+ result = PTR_ERR(bio);
24686+ }
24687+
24688+ return result;
24689+}
24690+
24691+/* helper function to construct bio for page */
24692+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24693+{
24694+ struct bio *bio;
24695+ assert("nikita-2092", page != NULL);
24696+ assert("nikita-2633", node != NULL);
24697+
24698+	/* Simple implementation under the assumption that blocksize == pagesize.
24699+
24700+	   We only have to submit one block, but submit_bh() would allocate a bio
24701+	   anyway, so let's use all the bells-and-whistles of the bio code.
24702+	 */
24703+
24704+ bio = bio_alloc(gfp, 1);
24705+ if (bio != NULL) {
24706+ int blksz;
24707+ struct super_block *super;
24708+ reiser4_block_nr blocknr;
24709+
24710+ super = page->mapping->host->i_sb;
24711+ assert("nikita-2029", super != NULL);
24712+ blksz = super->s_blocksize;
24713+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24714+
24715+ spin_lock_jnode(node);
24716+ blocknr = *jnode_get_io_block(node);
24717+ spin_unlock_jnode(node);
24718+
24719+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24720+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24721+
24722+ bio->bi_bdev = super->s_bdev;
24723+		/* fill bio->bi_sector before calling bio_add_page(), because
24724+		 * q->merge_bvec_fn may want to inspect it (see
24725+		 * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24726+ bio->bi_sector = blocknr * (blksz >> 9);
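+		/* e.g. with blksz == 4096 there are blksz >> 9 == 8 512-byte
+		 * sectors per block, so block 100 starts at sector 800 */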
24727+
24728+		if (!bio_add_page(bio, page, blksz, 0)) {
24729+			warning("nikita-3452",
24730+				"Single page bio cannot be constructed");
+			bio_put(bio);
24731+			return ERR_PTR(RETERR(-EINVAL));
24732+		}
24733+
24734+		/* bio->bi_idx is filled by bio_init() */
24735+ bio->bi_end_io = (rw == READ) ?
24736+ end_bio_single_page_read : end_bio_single_page_write;
24737+
24738+ return bio;
24739+ } else
24740+ return ERR_PTR(RETERR(-ENOMEM));
24741+}
24742+
24743+/* this function is internally called by jnode_make_dirty() */
24744+int reiser4_set_page_dirty_internal(struct page *page)
24745+{
24746+ struct address_space *mapping;
24747+
24748+ mapping = page->mapping;
24749+ BUG_ON(mapping == NULL);
24750+
24751+ if (!TestSetPageDirty(page)) {
24752+ if (mapping_cap_account_dirty(mapping))
24753+ inc_zone_page_state(page, NR_FILE_DIRTY);
24754+
24755+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24756+ }
24757+
24758+	/* for the fake inode (formatted nodes) the jnode must be dirty already */
24759+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24760+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24761+ return 0;
24762+}
24763+
24764+#if REISER4_DEBUG
24765+
24766+/**
24767+ * can_hit_entd
24768+ *
24769+ * Used in a debugging assertion in reiser4_writepage() to check that it is
+ * safe for the current context to hand a page over to the ent daemon: we
+ * must not be the ent daemon itself, must hold no long-term locks, and must
+ * have no atom open.
24770+ */
24771+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24772+{
24773+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24774+ return 1;
24775+ if (ctx->super != s)
24776+ return 1;
24777+ if (get_super_private(s)->entd.tsk == current)
24778+ return 0;
24779+ if (!lock_stack_isclean(&ctx->stack))
24780+ return 0;
24781+ if (ctx->trans->atom != NULL)
24782+ return 0;
24783+ return 1;
24784+}
24785+
24786+#endif
24787+
24788+/**
24789+ * reiser4_writepage - writepage of struct address_space_operations
24790+ * @page: page to write
24791+ * @wbc: writeback control
24792+ *
24793+ * Common memory pressure notification: delegates writeout of @page to the
24794+ * ent daemon.
24795+ */
24796+int reiser4_writepage(struct page *page,
24797+ struct writeback_control *wbc)
24798+{
24799+ struct super_block *s;
24800+ reiser4_context *ctx;
24801+
24802+ assert("vs-828", PageLocked(page));
24803+
24804+ s = page->mapping->host->i_sb;
24805+ ctx = get_current_context_check();
24806+
24807+ assert("", can_hit_entd(ctx, s));
24808+
24809+ return write_page_by_ent(page, wbc);
24810+}
24811+
24812+/* ->set_page_dirty() method of the formatted address_space. Formatted
+   nodes are dirtied via jnode_make_dirty() only, so this must never be
+   called; hence the BUG(). */
24813+static int formatted_set_page_dirty(struct page *page)
24814+{
24815+ assert("nikita-2173", page != NULL);
24816+ BUG();
24817+ return __set_page_dirty_nobuffers(page);
24818+}
24819+
24820+/* The ->writepages() method of reiser4 address space operations is used to
24821+   capture into transactions pages that were dirtied via mmap. Only regular
24822+   files can have such pages. The fake inode is used to access formatted
24823+   nodes via the page cache. As formatted nodes can never be mmapped, the
24824+   fake inode's ->writepages() has nothing to do */
24825+static int
24826+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24827+{
24828+ return 0;
24829+}
24830+
24831+/* address space operations for the fake inode */
24832+static struct address_space_operations formatted_fake_as_ops = {
24833+ /* Perform a writeback of a single page as a memory-freeing
24834+ * operation. */
24835+ .writepage = reiser4_writepage,
24836+ /* this is called to read formatted node */
24837+ .readpage = formatted_readpage,
24838+	/* ->sync_page() method of fake inode address space operations.
24839+
24840+	   This is a most annoyingly misnamed method: it is actually called
24841+	   from wait_on_page_bit() and lock_page(), and its purpose is to
24842+	   start io by jabbing device drivers.
24843+	*/
24844+	.sync_page = block_sync_page,
24845+	/* Write back some dirty pages from this mapping. Called during
24846+	   sync (pdflush) */
24848+ .writepages = writepages_fake,
24849+ /* Set a page dirty */
24850+ .set_page_dirty = formatted_set_page_dirty,
24851+ /* used for read-ahead. Not applicable */
24852+ .readpages = NULL,
24853+ .prepare_write = NULL,
24854+ .commit_write = NULL,
24855+ .bmap = NULL,
24856+ /* called just before page is being detached from inode mapping and
24857+ removed from memory. Called on truncate, cut/squeeze, and
24858+ umount. */
24859+ .invalidatepage = reiser4_invalidatepage,
24860+	/* this is called by shrink_cache() so that the file system can try
24861+	   to release objects (jnodes, buffers, journal heads) attached to the
24862+	   page and thereby, possibly, make the page itself freeable.
24863+	*/
24864+ .releasepage = reiser4_releasepage,
24865+ .direct_IO = NULL
24866+};
24867+
24868+/* called just before page is released (no longer used by reiser4). Callers:
24869+ jdelete() and extent2tail(). */
24870+void reiser4_drop_page(struct page *page)
24871+{
24872+ assert("nikita-2181", PageLocked(page));
24873+ clear_page_dirty_for_io(page);
24874+ ClearPageUptodate(page);
24875+#if defined(PG_skipped)
24876+ ClearPageSkipped(page);
24877+#endif
24878+ unlock_page(page);
24879+}
24880+
24881+#define JNODE_GANG_SIZE (16)
24882+
24883+/* find all jnodes from range specified and invalidate them */
24884+static int
24885+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24886+{
24887+ reiser4_inode *info;
24888+ int truncated_jnodes;
24889+ reiser4_tree *tree;
24890+ unsigned long index;
24891+ unsigned long end;
24892+
24893+ if (inode_file_plugin(inode) ==
24894+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24895+		/* No need to get rid of jnodes here: if the single jnode of
24896+		   the page cluster had no page, then it was already found and
24897+		   killed in
24898+		   truncate_page_cluster_cryptcompress()->jput()->jput_final();
24899+		   otherwise it will be dropped by reiser4_invalidatepage() */
24900+ return 0;
24901+ truncated_jnodes = 0;
24902+
24903+ info = reiser4_inode_data(inode);
24904+ tree = reiser4_tree_by_inode(inode);
24905+
24906+ index = from;
24907+ end = from + count;
24908+
24909+ while (1) {
24910+ jnode *gang[JNODE_GANG_SIZE];
24911+ int taken;
24912+ int i;
24913+ jnode *node;
24914+
24915+ assert("nikita-3466", index <= end);
24916+
24917+ read_lock_tree(tree);
24918+ taken =
24919+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24920+ (void **)gang, index,
24921+ JNODE_GANG_SIZE);
24922+ for (i = 0; i < taken; ++i) {
24923+ node = gang[i];
24924+ if (index_jnode(node) < end)
24925+ jref(node);
24926+ else
24927+ gang[i] = NULL;
24928+ }
24929+ read_unlock_tree(tree);
24930+
24931+ for (i = 0; i < taken; ++i) {
24932+ node = gang[i];
24933+ if (node != NULL) {
24934+ index = max(index, index_jnode(node));
24935+ spin_lock_jnode(node);
24936+ assert("edward-1457", node->pg == NULL);
24937+				/* this is always called after
24938+				   truncate_inode_pages_range(), so the jnode
24939+				   cannot have a page here. New pages cannot
24940+				   be created either, because
24941+				   truncate_jnodes_range() runs under
24942+				   exclusive access to the file, whereas page
24943+				   creation requires non-exclusive access */
24944+ JF_SET(node, JNODE_HEARD_BANSHEE);
24945+ reiser4_uncapture_jnode(node);
24946+ unhash_unformatted_jnode(node);
24947+ truncated_jnodes++;
24948+ jput(node);
24949+ } else
24950+ break;
24951+ }
24952+ if (i != taken || taken == 0)
24953+ break;
24954+ }
24955+ return truncated_jnodes;
24956+}
24957+
24958+/* Truncating files in reiser4: problems and solutions.
24959+
24960+ VFS calls fs's truncate after it has called truncate_inode_pages()
24961+ to get rid of pages corresponding to part of file being truncated.
24962+ In reiser4 this may leave unallocated extents which do not have
24963+ jnodes; the flush code does not expect that. The solution is
24964+ straightforward: as VFS truncate is implemented via the setattr
24965+ operation, it seems reasonable to have a ->setattr() that cuts the
24966+ file body. However, the flush code also does not expect dirty
24967+ pages without parent items, so it is impossible to first cut all
24968+ items and then truncate all pages in two separate steps. We resolve
24969+ this by cutting items one-by-one. Each such fine-grained step,
24970+ performed under a long-term znode lock, calls the ->kill_hook()
24971+ method of the killed item at the end to remove the pages and jnodes
24972+ bound to it.
24973+
24974+ The following function is the common part of the mentioned kill
24975+ hooks. It is also called before tail-to-extent conversion (to avoid
+ managing several copies of the data).
24976+*/
24977+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
24978+ unsigned long count, int even_cows)
24979+{
24980+ loff_t from_bytes, count_bytes;
24981+
24982+ if (count == 0)
24983+ return;
24984+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
24985+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
24986+
24987+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
24988+ truncate_inode_pages_range(mapping, from_bytes,
24989+ from_bytes + count_bytes - 1);
24990+ truncate_jnodes_range(mapping->host, from, count);
24991+}
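+/* For example, with 4K pages (PAGE_CACHE_SHIFT == 12) a call with from == 3
+ * and count == 2 unmaps and truncates bytes 12288..20479, i.e. pages 3 and
+ * 4, and then invalidates the jnodes of those pages. */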
24992+
24993+/*
24994+ * Local variables:
24995+ * c-indentation-style: "K&R"
24996+ * mode-name: "LC"
24997+ * c-basic-offset: 8
24998+ * tab-width: 8
24999+ * fill-column: 120
25000+ * scroll-step: 1
25001+ * End:
25002+ */
25003diff -urN linux-2.6.20.orig/fs/reiser4/page_cache.h linux-2.6.20/fs/reiser4/page_cache.h
25004--- linux-2.6.20.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25005+++ linux-2.6.20/fs/reiser4/page_cache.h 2007-05-06 14:50:43.746990723 +0400
25006@@ -0,0 +1,68 @@
25007+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25008+ * reiser4/README */
25009+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25010+
25011+#if !defined( __REISER4_PAGE_CACHE_H__ )
25012+#define __REISER4_PAGE_CACHE_H__
25013+
25014+#include "forward.h"
25015+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25016+
25017+#include <linux/fs.h> /* for struct super_block, address_space */
25018+#include <linux/mm.h> /* for struct page */
25019+#include <linux/pagemap.h> /* for lock_page() */
25020+#include <linux/vmalloc.h> /* for __vmalloc() */
25021+
25022+extern int reiser4_init_formatted_fake(struct super_block *);
25023+extern void reiser4_done_formatted_fake(struct super_block *);
25024+
25025+extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25026+
25027+extern int reiser4_set_page_dirty_internal(struct page *);
25028+
25029+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25030+
25031+extern void reiser4_wait_page_writeback(struct page *);
25032+static inline void lock_and_wait_page_writeback(struct page *page)
25033+{
25034+ lock_page(page);
25035+ if (unlikely(PageWriteback(page)))
25036+ reiser4_wait_page_writeback(page);
25037+}
25038+
25039+#define jprivate(page) ((jnode *)page_private(page))
25040+
25041+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25042+extern void reiser4_drop_page(struct page *);
25043+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25044+ unsigned long count, int even_cows);
25045+extern void capture_reiser4_inodes(struct super_block *,
25046+ struct writeback_control *);
25047+static inline void *reiser4_vmalloc(unsigned long size)
25048+{
25049+ return __vmalloc(size,
25050+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25051+ PAGE_KERNEL);
25052+}
25053+
25054+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25055+
25056+#if REISER4_DEBUG
25057+extern void print_page(const char *prefix, struct page *page);
25058+#else
25059+#define print_page(prf, p) noop
25060+#endif
25061+
25062+/* __REISER4_PAGE_CACHE_H__ */
25063+#endif
25064+
25065+/* Make Linus happy.
25066+ Local variables:
25067+ c-indentation-style: "K&R"
25068+ mode-name: "LC"
25069+ c-basic-offset: 8
25070+ tab-width: 8
25071+ fill-column: 120
25072+ scroll-step: 1
25073+ End:
25074+*/
25075diff -urN linux-2.6.20.orig/fs/reiser4/plugin/cluster.c linux-2.6.20/fs/reiser4/plugin/cluster.c
25076--- linux-2.6.20.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25077+++ linux-2.6.20/fs/reiser4/plugin/cluster.c 2007-05-06 14:50:43.746990723 +0400
25078@@ -0,0 +1,71 @@
25079+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25080+
25081+/* Contains reiser4 cluster plugins (see
25082+ http://www.namesys.com/cryptcompress_design.html
25083+ "Concepts of clustering" for details). */
25084+
25085+#include "plugin_header.h"
25086+#include "plugin.h"
25087+#include "../inode.h"
25088+
25089+static int change_cluster(struct inode *inode,
25090+ reiser4_plugin * plugin,
25091+ pset_member memb)
25092+{
25093+ assert("edward-1324", inode != NULL);
25094+ assert("edward-1325", plugin != NULL);
25095+ assert("edward-1326", is_reiser4_inode(inode));
25096+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25097+
25098+	/* The cluster plugin may only be changed on directories; it can't be
+	   changed for already existing regular files. */
25099+	if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25100+		return RETERR(-EINVAL);
25101+
25102+	/* If it matches the current one, there is nothing to change. */
25103+	if (inode_cluster_plugin(inode) != NULL &&
25104+	    inode_cluster_plugin(inode)->h.id == plugin->h.id)
25105+		return 0;
25106+
25107+	return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25108+			       PSET_CLUSTER, plugin);
25109+}
25110+
25111+static reiser4_plugin_ops cluster_plugin_ops = {
25112+ .init = NULL,
25113+ .load = NULL,
25114+ .save_len = NULL,
25115+ .save = NULL,
25116+ .change = &change_cluster
25117+};
25118+
25119+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25120+ [CLUSTER_ ## ID ## _ID] = { \
25121+ .h = { \
25122+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25123+ .id = CLUSTER_ ## ID ## _ID, \
25124+ .pops = &cluster_plugin_ops, \
25125+ .label = LABEL, \
25126+ .desc = DESC, \
25127+ .linkage = {NULL, NULL} \
25128+ }, \
25129+ .shift = SHIFT \
25130+ }
25131+
25132+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25133+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25134+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25135+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25136+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25137+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25138+};
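+/* For instance, SUPPORT_CLUSTER(16, 64K, "64K", "Large") above defines the
+ * entry with id CLUSTER_64K_ID and shift == 16, i.e. a logical cluster of
+ * 2^16 == 64K bytes. */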
25139+
25140+/*
25141+ Local variables:
25142+ c-indentation-style: "K&R"
25143+ mode-name: "LC"
25144+ c-basic-offset: 8
25145+ tab-width: 8
25146+ fill-column: 120
25147+ scroll-step: 1
25148+ End:
25149+*/
25150diff -urN linux-2.6.20.orig/fs/reiser4/plugin/cluster.h linux-2.6.20/fs/reiser4/plugin/cluster.h
25151--- linux-2.6.20.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25152+++ linux-2.6.20/fs/reiser4/plugin/cluster.h 2007-05-06 14:50:43.746990723 +0400
25153@@ -0,0 +1,343 @@
25154+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25155+
25156+/* This file contains page/cluster index translators and offset modulators
25157+ See http://www.namesys.com/cryptcompress_design.html for details */
25158+
25159+#if !defined( __FS_REISER4_CLUSTER_H__ )
25160+#define __FS_REISER4_CLUSTER_H__
25161+
25162+#include "../inode.h"
25163+
25164+static inline int inode_cluster_shift(struct inode *inode)
25165+{
25166+ assert("edward-92", inode != NULL);
25167+ assert("edward-93", reiser4_inode_data(inode) != NULL);
25168+
25169+ return inode_cluster_plugin(inode)->shift;
25170+}
25171+
25172+static inline unsigned cluster_nrpages_shift(struct inode *inode)
25173+{
25174+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25175+}
25176+
25177+/* cluster size in page units */
25178+static inline unsigned cluster_nrpages(struct inode *inode)
25179+{
25180+ return 1U << cluster_nrpages_shift(inode);
25181+}
25182+
25183+static inline size_t inode_cluster_size(struct inode *inode)
25184+{
25185+ assert("edward-96", inode != NULL);
25186+
25187+ return 1U << inode_cluster_shift(inode);
25188+}
25189+
25190+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25191+{
25192+ return idx >> cluster_nrpages_shift(inode);
25193+}
25194+
25195+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25196+{
25197+ return idx << cluster_nrpages_shift(inode);
25198+}
25199+
25200+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25201+{
25202+ return clust_to_pg(pg_to_clust(idx, inode), inode);
25203+}
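+/* Example of the translators above, assuming 4K pages (PAGE_CACHE_SHIFT ==
+ * 12) and the 64K cluster plugin (shift == 16): cluster_nrpages_shift() ==
+ * 4 and cluster_nrpages() == 16, so pg_to_clust(37) == 2, clust_to_pg(2) ==
+ * 32, and pg_to_clust_to_pg(37) == 32, the index of the first page of the
+ * cluster holding page 37. */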
25204+
25205+static inline pgoff_t off_to_pg(loff_t off)
25206+{
25207+ return (off >> PAGE_CACHE_SHIFT);
25208+}
25209+
25210+static inline loff_t pg_to_off(pgoff_t idx)
25211+{
25212+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25213+}
25214+
25215+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25216+{
25217+ return off >> inode_cluster_shift(inode);
25218+}
25219+
25220+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25221+{
25222+ return (loff_t) idx << inode_cluster_shift(inode);
25223+}
25224+
25225+static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25226+{
25227+ return (count + (1UL << shift) - 1) >> shift;
25228+}
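+/* count_to_nr() rounds up: e.g. count_to_nr(5000, 12) == (5000 + 4095) >>
+ * 12 == 2, i.e. 5000 bytes occupy two 4K pages. */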
25229+
25230+/* number of pages occupied by @count bytes */
25231+static inline pgoff_t count_to_nrpages(loff_t count)
25232+{
25233+ return count_to_nr(count, PAGE_CACHE_SHIFT);
25234+}
25235+
25236+/* number of clusters occupied by @count bytes */
25237+static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25238+{
25239+ return count_to_nr(count, inode_cluster_shift(inode));
25240+}
25241+
25242+/* number of clusters occupied by @count pages */
25243+static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25244+{
25245+ return count_to_nr(count, cluster_nrpages_shift(inode));
25246+}
25247+
25248+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25249+{
25250+ return clust_to_off(off_to_clust(off, inode), inode);
25251+}
25252+
25253+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25254+{
25255+ return clust_to_pg(off_to_clust(off, inode), inode);
25256+}
25257+
25258+static inline unsigned off_to_pgoff(loff_t off)
25259+{
25260+ return off & (PAGE_CACHE_SIZE - 1);
25261+}
25262+
25263+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25264+{
25265+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25266+}
25267+
25268+static inline unsigned
25269+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25270+{
25271+ return off_to_cloff(pg_to_off(idx), inode);
25272+}
25273+
25274+/* if @size != 0, returns index of the page
25275+ which contains the last byte of the file */
25276+static inline pgoff_t size_to_pg(loff_t size)
25277+{
25278+ return (size ? off_to_pg(size - 1) : 0);
25279+}
25280+
25281+/* minimal index of the page which doesn't contain
25282+ file data */
25283+static inline pgoff_t size_to_next_pg(loff_t size)
25284+{
25285+ return (size ? off_to_pg(size - 1) + 1 : 0);
25286+}
25287+
25288+/* how many bytes of file of size @cnt can be contained
25289+ in page of index @idx */
25290+static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25291+{
25292+ if (idx > off_to_pg(cnt))
25293+ return 0;
25294+ if (idx < off_to_pg(cnt))
25295+ return PAGE_CACHE_SIZE;
25296+ return off_to_pgoff(cnt);
25297+}
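+/* For instance, for a file of cnt == 10000 bytes with 4K pages:
+ * off_to_pg(10000) == 2, so pages 0 and 1 hold PAGE_CACHE_SIZE bytes each,
+ * page 2 holds off_to_pgoff(10000) == 1808 bytes, and later pages hold 0. */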
25298+
25299+/* how many bytes of file of size @cnt can be contained
25300+ in logical cluster of index @idx */
25301+static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25302+ struct inode *inode)
25303+{
25304+ if (idx > off_to_clust(cnt, inode))
25305+ return 0;
25306+ if (idx < off_to_clust(cnt, inode))
25307+ return inode_cluster_size(inode);
25308+ return off_to_cloff(cnt, inode);
25309+}
25310+
25311+static inline unsigned
25312+fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25313+{
25314+ assert("edward-288", clust != NULL);
25315+ assert("edward-289", inode != NULL);
25316+
25317+ return cnt_to_clcnt(inode->i_size, clust->index, inode);
25318+}
25319+
25320+static inline int
25321+cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25322+{
25323+ return clust->tc.lsize == inode_cluster_size(inode);
25324+}
25325+
25326+static inline void reiser4_slide_init(reiser4_slide_t * win)
25327+{
25328+ assert("edward-1084", win != NULL);
25329+ memset(win, 0, sizeof *win);
25330+}
25331+
25332+static inline tfm_action
25333+cluster_get_tfm_act(tfm_cluster_t * tc)
25334+{
25335+ assert("edward-1356", tc != NULL);
25336+ return tc->act;
25337+}
25338+
25339+static inline void
25340+cluster_set_tfm_act(tfm_cluster_t * tc, tfm_action act)
25341+{
25342+ assert("edward-1356", tc != NULL);
25343+ tc->act = act;
25344+}
25345+
25346+static inline void
25347+cluster_init_act(reiser4_cluster_t * clust, tfm_action act,
+		 reiser4_slide_t * window)
+{
25348+	assert("edward-84", clust != NULL);
25349+	memset(clust, 0, sizeof *clust);
25350+	cluster_set_tfm_act(&clust->tc, act);
25351+	clust->dstat = INVAL_DISK_CLUSTER;
25352+	clust->win = window;
25353+}
25354+
25355+static inline void
25356+cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25357+{
25358+	cluster_init_act(clust, TFMA_READ, window);
25359+}
25360+
25361+static inline void
25362+cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25363+{
25364+	cluster_init_act(clust, TFMA_WRITE, window);
25365+}
25366+
25367+static inline int dclust_get_extension_dsize(hint_t * hint)
25368+{
25369+ return hint->ext_coord.extension.ctail.dsize;
25370+}
25371+
25372+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25373+{
25374+ hint->ext_coord.extension.ctail.dsize = dsize;
25375+}
25376+
25377+static inline int dclust_get_extension_shift(hint_t * hint)
25378+{
25379+ return hint->ext_coord.extension.ctail.shift;
25380+}
25381+
25382+static inline int dclust_get_extension_ncount(hint_t * hint)
25383+{
25384+ return hint->ext_coord.extension.ctail.ncount;
25385+}
25386+
25387+static inline void dclust_inc_extension_ncount(hint_t * hint)
25388+{
25389+	hint->ext_coord.extension.ctail.ncount++;
25390+}
25391+
25392+static inline void dclust_init_extension(hint_t * hint)
25393+{
25394+ memset(&hint->ext_coord.extension.ctail, 0,
25395+ sizeof(hint->ext_coord.extension.ctail));
25396+}
25397+
25398+static inline int hint_is_unprepped_dclust(hint_t * hint)
25399+{
25400+ assert("edward-1451", hint_is_valid(hint));
25401+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25402+}
25403+
25404+static inline void coord_set_between_clusters(coord_t * coord)
25405+{
25406+#if REISER4_DEBUG
25407+ int result;
25408+ result = zload(coord->node);
25409+ assert("edward-1296", !result);
25410+#endif
25411+ if (!coord_is_between_items(coord)) {
25412+ coord->between = AFTER_ITEM;
25413+ coord->unit_pos = 0;
25414+ }
25415+#if REISER4_DEBUG
25416+ zrelse(coord->node);
25417+#endif
25418+}
25419+
25420+int reiser4_inflate_cluster(reiser4_cluster_t *, struct inode *);
25421+int find_disk_cluster(reiser4_cluster_t *, struct inode *, int read,
25422+ znode_lock_mode mode);
25423+int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25424+int reiser4_deflate_cluster(reiser4_cluster_t *, struct inode *);
25425+void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t start,
25426+ int even_cows);
25427+void invalidate_hint_cluster(reiser4_cluster_t * clust);
25428+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25429+ znode_lock_mode mode);
25430+int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25431+ znode_lock_mode lock_mode);
25432+void reset_cluster_params(reiser4_cluster_t * clust);
25433+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25434+ int count);
25435+int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25436+ int capture);
25437+void reiser4_release_cluster_pages(reiser4_cluster_t *);
25438+void put_cluster_handle(reiser4_cluster_t * clust);
25439+int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25440+int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25441+void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25442+void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25443+
25444+/* move cluster handle to the target position
25445+ specified by the page of index @pgidx
25446+*/
25447+static inline void move_cluster_forward(reiser4_cluster_t * clust,
25448+ struct inode *inode,
25449+ pgoff_t pgidx)
25450+{
25451+ assert("edward-1297", clust != NULL);
25452+ assert("edward-1298", inode != NULL);
25453+
25454+ reset_cluster_params(clust);
25455+ if (clust->index_valid &&
25456+	    /* Hole in the indices. The hint became invalid and cannot be
25457+	       used by find_cluster_item() even if the seal/node versions
25458+	       coincide */
25459+ pg_to_clust(pgidx, inode) != clust->index + 1) {
25460+ reiser4_unset_hint(clust->hint);
25461+ invalidate_hint_cluster(clust);
25462+ }
25463+ clust->index = pg_to_clust(pgidx, inode);
25464+ clust->index_valid = 1;
25465+}
25466+
25467+static inline int
25468+alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25469+{
25470+ assert("edward-791", clust != NULL);
25471+ assert("edward-792", inode != NULL);
25472+ clust->pages =
25473+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25474+ reiser4_ctx_gfp_mask_get());
25475+ if (!clust->pages)
25476+ return -ENOMEM;
25477+ return 0;
25478+}
25479+
25480+static inline void free_clust_pages(reiser4_cluster_t * clust)
25481+{
25482+ kfree(clust->pages);
25483+}
25484+
25485+#endif /* __FS_REISER4_CLUSTER_H__ */
25486+
25487+/* Make Linus happy.
25488+ Local variables:
25489+ c-indentation-style: "K&R"
25490+ mode-name: "LC"
25491+ c-basic-offset: 8
25492+ tab-width: 8
25493+ fill-column: 120
25494+ scroll-step: 1
25495+ End:
25496+*/
25497diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.20/fs/reiser4/plugin/compress/compress.c
25498--- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25499+++ linux-2.6.20/fs/reiser4/plugin/compress/compress.c 2007-05-06 14:50:43.746990723 +0400
25500@@ -0,0 +1,381 @@
25501+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25502+/* reiser4 compression transform plugins */
25503+
25504+#include "../../debug.h"
25505+#include "../../inode.h"
25506+#include "../plugin.h"
25507+#include "minilzo.h"
25508+
25509+#include <linux/zlib.h>
25510+#include <linux/types.h>
25511+#include <linux/hardirq.h>
25512+
25513+static int change_compression(struct inode *inode,
25514+ reiser4_plugin * plugin,
25515+ pset_member memb)
25516+{
25517+ assert("edward-1316", inode != NULL);
25518+ assert("edward-1317", plugin != NULL);
25519+ assert("edward-1318", is_reiser4_inode(inode));
25520+ assert("edward-1319",
25521+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25522+
25523+	/* The compression plugin may only be changed on directories; it can't
+	   be changed for an already existing regular object */
25524+	if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25525+		return RETERR(-EINVAL);
25526+
25527+	/* If it matches the current one, there is nothing to change. */
25528+	if (inode_compression_plugin(inode) != NULL &&
25529+	    inode_compression_plugin(inode)->h.id == plugin->h.id)
25530+		return 0;
25531+
25532+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25533+ PSET_COMPRESSION, plugin);
25534+}
25535+
25536+static reiser4_plugin_ops compression_plugin_ops = {
25537+ .init = NULL,
25538+ .load = NULL,
25539+ .save_len = NULL,
25540+ .save = NULL,
25541+ .change = &change_compression
25542+};
25543+
25544+/******************************************************************************/
25545+/* gzip1 compression */
25546+/******************************************************************************/
25547+
25548+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25549+#define GZIP1_DEF_WINBITS 15
25550+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25551+
25552+static int gzip1_init(void)
25553+{
25554+ int ret = -EINVAL;
25555+#if REISER4_ZLIB
25556+ ret = 0;
25557+#endif
25558+ if (ret == -EINVAL)
25559+ warning("edward-1337", "Zlib not compiled into kernel");
25560+ return ret;
25561+}
25562+
25563+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25564+{
25565+ return 0;
25566+}
25567+
25568+static coa_t gzip1_alloc(tfm_action act)
25569+{
25570+ coa_t coa = NULL;
25571+#if REISER4_ZLIB
25572+ int ret = 0;
25573+ switch (act) {
25574+ case TFMA_WRITE: /* compress */
25575+ coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25576+ if (!coa) {
25577+ ret = -ENOMEM;
25578+ break;
25579+ }
25580+ memset(coa, 0, zlib_deflate_workspacesize());
25581+ break;
25582+ case TFMA_READ: /* decompress */
25583+ coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25584+ if (!coa) {
25585+ ret = -ENOMEM;
25586+ break;
25587+ }
25588+ memset(coa, 0, zlib_inflate_workspacesize());
25589+ break;
25590+ default:
25591+ impossible("edward-767",
25592+ "trying to alloc workspace for unknown tfm action");
25593+ }
25594+ if (ret) {
25595+ warning("edward-768",
25596+ "alloc workspace for gzip1 (tfm action = %d) failed\n",
25597+ act);
25598+ return ERR_PTR(ret);
25599+ }
25600+#endif
25601+ return coa;
25602+}
25603+
25604+static void gzip1_free(coa_t coa, tfm_action act)
25605+{
25606+ assert("edward-769", coa != NULL);
25607+
25608+ switch (act) {
25609+ case TFMA_WRITE: /* compress */
25610+ vfree(coa);
25611+ break;
25612+ case TFMA_READ: /* decompress */
25613+ vfree(coa);
25614+ break;
25615+ default:
25616+ impossible("edward-770", "unknown tfm action");
25617+ }
25618+ return;
25619+}
25620+
25621+static int gzip1_min_size_deflate(void)
25622+{
25623+ return 64;
25624+}
25625+
25626+static void
25627+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25628+ __u8 * dst_first, unsigned *dst_len)
25629+{
25630+#if REISER4_ZLIB
25631+ int ret = 0;
25632+ struct z_stream_s stream;
25633+
25634+ memset(&stream, 0, sizeof(stream));
25635+
25636+ assert("edward-842", coa != NULL);
25637+ assert("edward-875", src_len != 0);
25638+
25639+ stream.workspace = coa;
25640+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25641+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25642+ Z_DEFAULT_STRATEGY);
25643+ if (ret != Z_OK) {
25644+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25645+ goto rollback;
25646+ }
25647+ ret = zlib_deflateReset(&stream);
25648+ if (ret != Z_OK) {
25649+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25650+ goto rollback;
25651+ }
25652+ stream.next_in = src_first;
25653+ stream.avail_in = src_len;
25654+ stream.next_out = dst_first;
25655+ stream.avail_out = *dst_len;
25656+
25657+ ret = zlib_deflate(&stream, Z_FINISH);
25658+ if (ret != Z_STREAM_END) {
25659+ if (ret != Z_OK)
25660+ warning("edward-773",
25661+ "zlib_deflate returned %d\n", ret);
25662+ goto rollback;
25663+ }
25664+ *dst_len = stream.total_out;
25665+ return;
25666+ rollback:
25667+ *dst_len = src_len;
25668+#endif
25669+ return;
25670+}
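+/* Note the rollback contract: on any failure *dst_len is reset to src_len,
+ * i.e. "no size reduction"; the cluster is then treated as incompressible
+ * and stored as is, so a failed deflate degrades to plain storage rather
+ * than an error. */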
25671+
25672+static void
25673+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25674+ __u8 * dst_first, unsigned *dst_len)
25675+{
25676+#if REISER4_ZLIB
25677+ int ret = 0;
25678+ struct z_stream_s stream;
25679+
25680+ memset(&stream, 0, sizeof(stream));
25681+
25682+ assert("edward-843", coa != NULL);
25683+ assert("edward-876", src_len != 0);
25684+
25685+ stream.workspace = coa;
25686+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25687+ if (ret != Z_OK) {
25688+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25689+ return;
25690+ }
25691+ ret = zlib_inflateReset(&stream);
25692+ if (ret != Z_OK) {
25693+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25694+ return;
25695+ }
25696+
25697+ stream.next_in = src_first;
25698+ stream.avail_in = src_len;
25699+ stream.next_out = dst_first;
25700+ stream.avail_out = *dst_len;
25701+
25702+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25703+ /*
25704+ * Work around a bug in zlib, which sometimes wants to taste an extra
25705+ * byte when being used in the (undocumented) raw deflate mode.
25706+ * (From USAGI).
25707+ */
25708+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25709+ u8 zerostuff = 0;
25710+ stream.next_in = &zerostuff;
25711+ stream.avail_in = 1;
25712+ ret = zlib_inflate(&stream, Z_FINISH);
25713+ }
25714+ if (ret != Z_STREAM_END) {
25715+ warning("edward-776", "zlib_inflate returned %d\n", ret);
25716+ return;
25717+ }
25718+ *dst_len = stream.total_out;
25719+#endif
25720+ return;
25721+}
25722+
25723+/******************************************************************************/
25724+/* lzo1 compression */
25725+/******************************************************************************/
25726+
25727+static int lzo1_init(void)
25728+{
25729+ int ret;
25730+ ret = lzo_init();
25731+ if (ret != LZO_E_OK)
25732+ warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
25733+ return ret;
25734+}
25735+
25736+static int lzo1_overrun(unsigned in_len)
25737+{
25738+ return in_len / 64 + 16 + 3;
25739+}
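+/* This is the extra output space reserved for possible expansion of
+ * hard-to-compress input: e.g. for a 64K input, 65536/64 + 16 + 3 == 1043
+ * spare bytes. */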
25740+
25741+#define LZO_HEAP_SIZE(size) \
25742+ sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
25743+
25744+static coa_t lzo1_alloc(tfm_action act)
25745+{
25746+ int ret = 0;
25747+ coa_t coa = NULL;
25748+
25749+ switch (act) {
25750+ case TFMA_WRITE: /* compress */
25751+ coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25752+ if (!coa) {
25753+ ret = -ENOMEM;
25754+ break;
25755+ }
25756+		memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
+		/* fall through */
25757+	case TFMA_READ:	/* decompress: needs no workspace */
25758+		break;
25759+ default:
25760+ impossible("edward-877",
25761+ "trying to alloc workspace for unknown tfm action");
25762+ }
25763+ if (ret) {
25764+ warning("edward-878",
25765+ "alloc workspace for lzo1 (tfm action = %d) failed\n",
25766+ act);
25767+ return ERR_PTR(ret);
25768+ }
25769+ return coa;
25770+}
25771+
25772+static void lzo1_free(coa_t coa, tfm_action act)
25773+{
25774+ assert("edward-879", coa != NULL);
25775+
25776+ switch (act) {
25777+ case TFMA_WRITE: /* compress */
25778+ vfree(coa);
25779+ break;
25780+ case TFMA_READ: /* decompress */
25781+		impossible("edward-1304",
25782+			   "trying to free non-allocated workspace");
+		break;
25783+	default:
25784+ impossible("edward-880", "unknown tfm action");
25785+ }
25786+ return;
25787+}
25788+
25789+static int lzo1_min_size_deflate(void)
25790+{
25791+ return 256;
25792+}
25793+
25794+static void
25795+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25796+ __u8 * dst_first, unsigned *dst_len)
25797+{
25798+ int result;
25799+
25800+ assert("edward-846", coa != NULL);
25801+ assert("edward-847", src_len != 0);
25802+
25803+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
25804+ if (result != LZO_E_OK) {
25805+ warning("edward-849", "lzo1x_1_compress failed\n");
25806+ goto out;
25807+ }
25808+	if (*dst_len >= src_len) {
25809+		/* incompressible data: report no size reduction */
25810+		goto out;
25811+	}
25812+ return;
25813+ out:
25814+ *dst_len = src_len;
25815+ return;
25816+}
25817+
25818+static void
25819+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25820+ __u8 * dst_first, unsigned *dst_len)
25821+{
25822+ int result;
25823+
25824+ assert("edward-851", coa == NULL);
25825+ assert("edward-852", src_len != 0);
25826+
25827+ result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
25828+ if (result != LZO_E_OK)
25829+ warning("edward-853", "lzo1x_1_decompress failed\n");
25830+ return;
25831+}
25832+
25833+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25834+ [LZO1_COMPRESSION_ID] = {
25835+ .h = {
25836+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25837+ .id = LZO1_COMPRESSION_ID,
25838+ .pops = &compression_plugin_ops,
25839+ .label = "lzo1",
25840+ .desc = "lzo1 compression transform",
25841+ .linkage = {NULL, NULL}
25842+ },
25843+ .init = lzo1_init,
25844+ .overrun = lzo1_overrun,
25845+ .alloc = lzo1_alloc,
25846+ .free = lzo1_free,
25847+ .min_size_deflate = lzo1_min_size_deflate,
25848+ .checksum = reiser4_adler32,
25849+ .compress = lzo1_compress,
25850+ .decompress = lzo1_decompress
25851+ },
25852+ [GZIP1_COMPRESSION_ID] = {
25853+ .h = {
25854+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25855+ .id = GZIP1_COMPRESSION_ID,
25856+ .pops = &compression_plugin_ops,
25857+ .label = "gzip1",
25858+ .desc = "gzip1 compression transform",
25859+ .linkage = {NULL, NULL}
25860+ },
25861+ .init = gzip1_init,
25862+ .overrun = gzip1_overrun,
25863+ .alloc = gzip1_alloc,
25864+ .free = gzip1_free,
25865+ .min_size_deflate = gzip1_min_size_deflate,
25866+ .checksum = reiser4_adler32,
25867+ .compress = gzip1_compress,
25868+ .decompress = gzip1_decompress
25869+ }
25870+};
25871+
25872+/*
25873+ Local variables:
25874+ c-indentation-style: "K&R"
25875+ mode-name: "LC"
25876+ c-basic-offset: 8
25877+ tab-width: 8
25878+ fill-column: 120
25879+ scroll-step: 1
25880+ End:
25881+*/
25882diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.20/fs/reiser4/plugin/compress/compress.h
25883--- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
25884+++ linux-2.6.20/fs/reiser4/plugin/compress/compress.h 2007-05-06 14:50:43.746990723 +0400
25885@@ -0,0 +1,38 @@
25886+#if !defined( __FS_REISER4_COMPRESS_H__ )
25887+#define __FS_REISER4_COMPRESS_H__
25888+
25889+#include <linux/types.h>
25890+#include <linux/string.h>
25891+
25892+typedef enum {
25893+ TFMA_READ,
25894+ TFMA_WRITE,
25895+ TFMA_LAST
25896+} tfm_action;
25897+
25898+/* builtin compression plugins */
25899+
25900+typedef enum {
25901+ LZO1_COMPRESSION_ID,
25902+ GZIP1_COMPRESSION_ID,
25903+ LAST_COMPRESSION_ID,
25904+} reiser4_compression_id;
25905+
25906+typedef unsigned long cloff_t;
25907+typedef void *coa_t;
25908+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
25909+
25910+__u32 reiser4_adler32(char *data, __u32 len);
25911+
25912+#endif /* __FS_REISER4_COMPRESS_H__ */
25913+
25914+/* Make Linus happy.
25915+ Local variables:
25916+ c-indentation-style: "K&R"
25917+ mode-name: "LC"
25918+ c-basic-offset: 8
25919+ tab-width: 8
25920+ fill-column: 120
25921+ scroll-step: 1
25922+ End:
25923+*/
25924diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.20/fs/reiser4/plugin/compress/compress_mode.c
25925--- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
25926+++ linux-2.6.20/fs/reiser4/plugin/compress/compress_mode.c 2007-05-06 14:50:43.750991972 +0400
25927@@ -0,0 +1,162 @@
25928+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25929+/* This file contains Reiser4 compression mode plugins.
25930+
25931+   A compression mode plugin is a set of handlers called by the
25932+   compressor at flush time. They implement heuristics, including ones
25933+   that avoid compressing incompressible data; see
25934+   http://www.namesys.com/cryptcompress_design.html for more details.
25935+*/
25936+#include "../../inode.h"
25937+#include "../plugin.h"
25938+
25939+static int should_deflate_none(struct inode * inode, cloff_t index)
25940+{
25941+ return 0;
25942+}
25943+
25944+static int should_deflate_common(struct inode * inode, cloff_t index)
25945+{
25946+ return compression_is_on(cryptcompress_inode_data(inode));
25947+}
25948+
25949+static int discard_hook_ultim(struct inode *inode, cloff_t index)
25950+{
25951+ turn_off_compression(cryptcompress_inode_data(inode));
25952+ return 0;
25953+}
25954+
25955+static int discard_hook_lattd(struct inode *inode, cloff_t index)
25956+{
25957+ cryptcompress_info_t * info = cryptcompress_inode_data(inode);
25958+
25959+ assert("edward-1462",
25960+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
25961+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
25962+
25963+ turn_off_compression(info);
25964+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
25965+ set_lattice_factor(info, get_lattice_factor(info) << 1);
25966+ return 0;
25967+}
25968+
25969+static int accept_hook_lattd(struct inode *inode, cloff_t index)
25970+{
25971+ turn_on_compression(cryptcompress_inode_data(inode));
25972+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
25973+ return 0;
25974+}
25975+
25976+/* Check-on-dynamic-lattice is the adaptive compression mode which
25977+   defines the following behavior:
25978+
25979+   Compression is on: try to compress everything and turn it off
25980+   whenever a cluster turns out to be incompressible.
25981+
25982+   Compression is off: try to compress clusters with indexes
25983+   k * FACTOR (k = 0, 1, 2, ...) and turn it back on if one of them
25984+   proves compressible. If incompressible, increase FACTOR */
25985+
25986+/* check if @index belongs to the one-dimensional lattice
25987+   of sparse factor @factor */
25988+static int is_on_lattice(cloff_t index, int factor)
25989+{
25990+	return (factor ? index % factor == 0 : index == 0);
25991+}
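+/* e.g. with factor == 4 the lattice consists of cluster indexes 0, 4, 8,
+ * ...; with factor == 0 only index 0 is probed. */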
25992+
25993+static int should_deflate_lattd(struct inode * inode, cloff_t index)
25994+{
25995+ return should_deflate_common(inode, index) ||
25996+ is_on_lattice(index,
25997+ get_lattice_factor
25998+ (cryptcompress_inode_data(inode)));
25999+}
26000+
26001+/* compression mode_plugins */
26002+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26003+ [NONE_COMPRESSION_MODE_ID] = {
26004+ .h = {
26005+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26006+ .id = NONE_COMPRESSION_MODE_ID,
26007+ .pops = NULL,
26008+ .label = "none",
26009+ .desc = "Compress nothing",
26010+ .linkage = {NULL, NULL}
26011+ },
26012+ .should_deflate = should_deflate_none,
26013+ .accept_hook = NULL,
26014+ .discard_hook = NULL
26015+ },
26016+ /* Check-on-dynamic-lattice adaptive compression mode */
26017+ [LATTD_COMPRESSION_MODE_ID] = {
26018+ .h = {
26019+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26020+ .id = LATTD_COMPRESSION_MODE_ID,
26021+ .pops = NULL,
26022+ .label = "lattd",
26023+ .desc = "Check on dynamic lattice",
26024+ .linkage = {NULL, NULL}
26025+ },
26026+ .should_deflate = should_deflate_lattd,
26027+ .accept_hook = accept_hook_lattd,
26028+ .discard_hook = discard_hook_lattd
26029+ },
26030+ /* Check-ultimately compression mode:
26031+ Turn off compression forever as soon as we meet
26032+ incompressible data */
26033+ [ULTIM_COMPRESSION_MODE_ID] = {
26034+ .h = {
26035+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26036+ .id = ULTIM_COMPRESSION_MODE_ID,
26037+ .pops = NULL,
26038+ .label = "ultim",
26039+ .desc = "Check ultimately",
26040+ .linkage = {NULL, NULL}
26041+ },
26042+ .should_deflate = should_deflate_common,
26043+ .accept_hook = NULL,
26044+ .discard_hook = discard_hook_ultim
26045+ },
26046+ /* Force-to-compress-everything compression mode */
26047+ [FORCE_COMPRESSION_MODE_ID] = {
26048+ .h = {
26049+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26050+ .id = FORCE_COMPRESSION_MODE_ID,
26051+ .pops = NULL,
26052+ .label = "force",
26053+ .desc = "Force to compress everything",
26054+ .linkage = {NULL, NULL}
26055+ },
26056+ .should_deflate = NULL,
26057+ .accept_hook = NULL,
26058+ .discard_hook = NULL
26059+ },
26060+ /* Convert-to-extent compression mode.
26061+ In this mode items will be converted to extents and management
26062+ will be passed to (classic) unix file plugin as soon as ->write()
26063+ detects that the first complete logical cluster (of index #0) is
26064+ incompressible. */
26065+ [CONVX_COMPRESSION_MODE_ID] = {
26066+ .h = {
26067+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26068+ .id = CONVX_COMPRESSION_MODE_ID,
26069+ .pops = NULL,
26070+ .label = "conv",
26071+ .desc = "Convert to extent",
26072+ .linkage = {NULL, NULL}
26073+ },
26074+ .should_deflate = should_deflate_common,
26075+ .accept_hook = NULL,
26076+ .discard_hook = NULL
26077+ }
26078+};
26079+
26080+/*
26081+ Local variables:
26082+ c-indentation-style: "K&R"
26083+ mode-name: "LC"
26084+ c-basic-offset: 8
26085+ tab-width: 8
26086+ fill-column: 120
26087+ scroll-step: 1
26088+ End:
26089+*/
26090diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/lzoconf.h linux-2.6.20/fs/reiser4/plugin/compress/lzoconf.h
26091--- linux-2.6.20.orig/fs/reiser4/plugin/compress/lzoconf.h 1970-01-01 03:00:00.000000000 +0300
26092+++ linux-2.6.20/fs/reiser4/plugin/compress/lzoconf.h 2007-05-06 14:50:43.750991972 +0400
26093@@ -0,0 +1,216 @@
26094+/* lzoconf.h -- configuration for the LZO real-time data compression library
26095+ adopted for reiser4 compression transform plugin.
26096+
26097+ This file is part of the LZO real-time data compression library
26098+ and not included in any proprietary licenses of reiser4.
26099+
26100+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26101+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26102+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26103+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26104+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26105+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26106+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26107+ All Rights Reserved.
26108+
26109+ The LZO library is free software; you can redistribute it and/or
26110+ modify it under the terms of the GNU General Public License as
26111+ published by the Free Software Foundation; either version 2 of
26112+ the License, or (at your option) any later version.
26113+
26114+ The LZO library is distributed in the hope that it will be useful,
26115+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26116+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26117+ GNU General Public License for more details.
26118+
26119+ You should have received a copy of the GNU General Public License
26120+ along with the LZO library; see the file COPYING.
26121+ If not, write to the Free Software Foundation, Inc.,
26122+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26123+
26124+ Markus F.X.J. Oberhumer
26125+ <markus@oberhumer.com>
26126+ http://www.oberhumer.com/opensource/lzo/
26127+ */
26128+
26129+#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26130+
26131+#ifndef __LZOCONF_H
26132+#define __LZOCONF_H
26133+
26134+#define LZO_VERSION 0x1080
26135+#define LZO_VERSION_STRING "1.08"
26136+#define LZO_VERSION_DATE "Jul 12 2002"
26137+
26138+/* internal Autoconf configuration file - only used when building LZO */
26139+
26140+/***********************************************************************
26141+// LZO requires a conforming <limits.h>
26142+************************************************************************/
26143+
26144+#define CHAR_BIT 8
26145+#define USHRT_MAX 0xffff
26146+
26147+/* workaround a cpp bug under hpux 10.20 */
26148+#define LZO_0xffffffffL 4294967295ul
26149+
26150+/***********************************************************************
26151+// architecture defines
26152+************************************************************************/
26153+
26154+#if !defined(__LZO_i386)
26155+# if defined(__i386__) || defined(__386__) || defined(_M_IX86)
26156+# define __LZO_i386
26157+# endif
26158+#endif
26159+
26160+/* memory checkers */
26161+#if !defined(__LZO_CHECKER)
26162+# if defined(__BOUNDS_CHECKING_ON)
26163+# define __LZO_CHECKER
26164+# elif defined(__CHECKER__)
26165+# define __LZO_CHECKER
26166+# elif defined(__INSURE__)
26167+# define __LZO_CHECKER
26168+# elif defined(__PURIFY__)
26169+# define __LZO_CHECKER
26170+# endif
26171+#endif
26172+
26173+/***********************************************************************
26174+// integral and pointer types
26175+************************************************************************/
26176+
26177+/* Integral types with 32 bits or more */
26178+#if !defined(LZO_UINT32_MAX)
26179+# if (UINT_MAX >= LZO_0xffffffffL)
26180+ typedef unsigned int lzo_uint32;
26181+ typedef int lzo_int32;
26182+# define LZO_UINT32_MAX UINT_MAX
26183+# define LZO_INT32_MAX INT_MAX
26184+# define LZO_INT32_MIN INT_MIN
26185+# elif (ULONG_MAX >= LZO_0xffffffffL)
26186+ typedef unsigned long lzo_uint32;
26187+ typedef long lzo_int32;
26188+# define LZO_UINT32_MAX ULONG_MAX
26189+# define LZO_INT32_MAX LONG_MAX
26190+# define LZO_INT32_MIN LONG_MIN
26191+# else
26192+# error "lzo_uint32"
26193+# endif
26194+#endif
26195+
26196+/* lzo_uint is used like size_t */
26197+#if !defined(LZO_UINT_MAX)
26198+# if (UINT_MAX >= LZO_0xffffffffL)
26199+ typedef unsigned int lzo_uint;
26200+ typedef int lzo_int;
26201+# define LZO_UINT_MAX UINT_MAX
26202+# define LZO_INT_MAX INT_MAX
26203+# define LZO_INT_MIN INT_MIN
26204+# elif (ULONG_MAX >= LZO_0xffffffffL)
26205+ typedef unsigned long lzo_uint;
26206+ typedef long lzo_int;
26207+# define LZO_UINT_MAX ULONG_MAX
26208+# define LZO_INT_MAX LONG_MAX
26209+# define LZO_INT_MIN LONG_MIN
26210+# else
26211+# error "lzo_uint"
26212+# endif
26213+#endif
26214+
26215+ typedef int lzo_bool;
26216+
26217+/***********************************************************************
26218+// memory models
26219+************************************************************************/
26220+
26221+/* Memory model that allows to access memory at offsets of lzo_uint. */
26222+#if !defined(__LZO_MMODEL)
26223+# if (LZO_UINT_MAX <= UINT_MAX)
26224+# define __LZO_MMODEL
26225+# else
26226+# error "__LZO_MMODEL"
26227+# endif
26228+#endif
26229+
26230+/* no typedef here because of const-pointer issues */
26231+#define lzo_byte unsigned char __LZO_MMODEL
26232+#define lzo_bytep unsigned char __LZO_MMODEL *
26233+#define lzo_charp char __LZO_MMODEL *
26234+#define lzo_voidp void __LZO_MMODEL *
26235+#define lzo_shortp short __LZO_MMODEL *
26236+#define lzo_ushortp unsigned short __LZO_MMODEL *
26237+#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26238+#define lzo_int32p lzo_int32 __LZO_MMODEL *
26239+#define lzo_uintp lzo_uint __LZO_MMODEL *
26240+#define lzo_intp lzo_int __LZO_MMODEL *
26241+#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26242+#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26243+
26244+#ifndef lzo_sizeof_dict_t
26245+# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26246+#endif
26247+
26248+typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26249+ lzo_byte * dst, lzo_uintp dst_len,
26250+ lzo_voidp wrkmem);
26251+
26252+
26253+/***********************************************************************
26254+// error codes and prototypes
26255+************************************************************************/
26256+
26257+/* Error codes for the compression/decompression functions. Negative
26258+ * values are errors, positive values will be used for special but
26259+ * normal events.
26260+ */
26261+#define LZO_E_OK 0
26262+#define LZO_E_ERROR (-1)
26263+#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26264+#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26265+#define LZO_E_INPUT_OVERRUN (-4)
26266+#define LZO_E_OUTPUT_OVERRUN (-5)
26267+#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26268+#define LZO_E_EOF_NOT_FOUND (-7)
26269+#define LZO_E_INPUT_NOT_CONSUMED (-8)
26270+
26271+/* lzo_init() should be the first function you call.
26272+ * Check the return code !
26273+ *
26274+ * lzo_init() is a macro to allow checking that the library and the
26275+ * compiler's view of various types are consistent.
26276+ */
26277+#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26278+ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26279+ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26280+ (int)sizeof(lzo_compress_t))
26281+ extern int __lzo_init2(unsigned, int, int, int, int, int, int,
26282+ int, int, int);
26283+
26284+/* checksum functions */
26285+extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
26286+ lzo_uint _len);
26287+/* misc. */
26288+ typedef union {
26289+ lzo_bytep p;
26290+ lzo_uint u;
26291+ } __lzo_pu_u;
26292+ typedef union {
26293+ lzo_bytep p;
26294+ lzo_uint32 u32;
26295+ } __lzo_pu32_u;
26296+ typedef union {
26297+ void *vp;
26298+ lzo_bytep bp;
26299+ lzo_uint32 u32;
26300+ long l;
26301+ } lzo_align_t;
26302+
26303+#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26304+ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26305+
26306+/* deprecated - only for backward compatibility */
26307+#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26308+
26309+#endif /* already included */
26310diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.20/fs/reiser4/plugin/compress/Makefile
26311--- linux-2.6.20.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26312+++ linux-2.6.20/fs/reiser4/plugin/compress/Makefile 2007-05-06 14:50:43.750991972 +0400
26313@@ -0,0 +1,6 @@
26314+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26315+
26316+compress_plugins-objs := \
26317+ compress.o \
26318+ minilzo.o \
26319+ compress_mode.o
26320diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.c linux-2.6.20/fs/reiser4/plugin/compress/minilzo.c
26321--- linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.c 1970-01-01 03:00:00.000000000 +0300
26322+++ linux-2.6.20/fs/reiser4/plugin/compress/minilzo.c 2007-05-06 14:50:43.754993222 +0400
26323@@ -0,0 +1,1967 @@
26324+/* minilzo.c -- mini subset of the LZO real-time data compression library
26325+ adopted for reiser4 compression transform plugin.
26326+
26327+ This file is part of the LZO real-time data compression library
26328+ and not included in any proprietary licenses of reiser4.
26329+
26330+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26331+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26332+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26333+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26334+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26335+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26336+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26337+ All Rights Reserved.
26338+
26339+ The LZO library is free software; you can redistribute it and/or
26340+ modify it under the terms of the GNU General Public License as
26341+ published by the Free Software Foundation; either version 2 of
26342+ the License, or (at your option) any later version.
26343+
26344+ The LZO library is distributed in the hope that it will be useful,
26345+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26346+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26347+ GNU General Public License for more details.
26348+
26349+ You should have received a copy of the GNU General Public License
26350+ along with the LZO library; see the file COPYING.
26351+ If not, write to the Free Software Foundation, Inc.,
26352+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26353+
26354+ Markus F.X.J. Oberhumer
26355+ <markus@oberhumer.com>
26356+ http://www.oberhumer.com/opensource/lzo/
26357+ */
26358+
26359+/*
26360+ * NOTE:
26361+ * the full LZO package can be found at
26362+ * http://www.oberhumer.com/opensource/lzo/
26363+ */
26364+
26365+#include "../../debug.h" /* for reiser4 assert macro -edward */
26366+
26367+#define __LZO_IN_MINILZO
26368+#define LZO_BUILD
26369+
26370+#include "minilzo.h"
26371+
26372+#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26373+# error "version mismatch in miniLZO source files"
26374+#endif
26375+
26376+#ifndef __LZO_CONF_H
26377+#define __LZO_CONF_H
26378+
26379+# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26380+# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26381+
26382+# define HAVE_MEMCMP
26383+# define HAVE_MEMCPY
26384+# define HAVE_MEMMOVE
26385+# define HAVE_MEMSET
26386+
26387+#undef NDEBUG
26388+#if !defined(LZO_DEBUG)
26389+# define NDEBUG
26390+#endif
26391+#if defined(LZO_DEBUG) || !defined(NDEBUG)
26392+# if !defined(NO_STDIO_H)
26393+# include <stdio.h>
26394+# endif
26395+#endif
26396+
26397+#if !defined(LZO_COMPILE_TIME_ASSERT)
26398+# define LZO_COMPILE_TIME_ASSERT(expr) \
26399+ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26400+#endif
26401+
26402+#if !defined(LZO_UNUSED)
26403+# if 1
26404+# define LZO_UNUSED(var) ((void)&var)
26405+# elif 0
26406+# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26407+# else
26408+# define LZO_UNUSED(parm) (parm = parm)
26409+# endif
26410+#endif
26411+
26412+#if defined(NO_MEMCMP)
26413+# undef HAVE_MEMCMP
26414+#endif
26415+
26416+#if !defined(HAVE_MEMSET)
26417+# undef memset
26418+# define memset lzo_memset
26419+#endif
26420+
26421+# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26422+
26423+#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26424+#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26425+#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26426+#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26427+
26428+#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26429+
26430+#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26431+
26432+#define LZO_SIZE(bits) (1u << (bits))
26433+#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26434+
26435+#define LZO_LSIZE(bits) (1ul << (bits))
26436+#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26437+
26438+#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26439+#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26440+
26441+#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26442+#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26443+
26444+#if !defined(SIZEOF_UNSIGNED)
26445+# if (UINT_MAX == 0xffff)
26446+# define SIZEOF_UNSIGNED 2
26447+# elif (UINT_MAX == LZO_0xffffffffL)
26448+# define SIZEOF_UNSIGNED 4
26449+# elif (UINT_MAX >= LZO_0xffffffffL)
26450+# define SIZEOF_UNSIGNED 8
26451+# else
26452+# error "SIZEOF_UNSIGNED"
26453+# endif
26454+#endif
26455+
26456+#if !defined(SIZEOF_UNSIGNED_LONG)
26457+# if (ULONG_MAX == LZO_0xffffffffL)
26458+# define SIZEOF_UNSIGNED_LONG 4
26459+# elif (ULONG_MAX >= LZO_0xffffffffL)
26460+# define SIZEOF_UNSIGNED_LONG 8
26461+# else
26462+# error "SIZEOF_UNSIGNED_LONG"
26463+# endif
26464+#endif
26465+
26466+#if !defined(SIZEOF_SIZE_T)
26467+# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26468+#endif
26469+#if !defined(SIZE_T_MAX)
26470+# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26471+#endif
26472+
26473+#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26474+# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26475+# define LZO_UNALIGNED_OK_2
26476+# endif
26477+# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26478+# define LZO_UNALIGNED_OK_4
26479+# endif
26480+#endif
26481+
26482+#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26483+# if !defined(LZO_UNALIGNED_OK)
26484+# define LZO_UNALIGNED_OK
26485+# endif
26486+#endif
26487+
26488+#if defined(__LZO_NO_UNALIGNED)
26489+# undef LZO_UNALIGNED_OK
26490+# undef LZO_UNALIGNED_OK_2
26491+# undef LZO_UNALIGNED_OK_4
26492+#endif
26493+
26494+#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26495+# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26496+#endif
26497+#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26498+# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26499+#endif
26500+
26501+#if defined(__LZO_NO_ALIGNED)
26502+# undef LZO_ALIGNED_OK_4
26503+#endif
26504+
26505+#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26506+# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26507+#endif
26508+
26509+#define LZO_LITTLE_ENDIAN 1234
26510+#define LZO_BIG_ENDIAN 4321
26511+#define LZO_PDP_ENDIAN 3412
26512+
26513+#if !defined(LZO_BYTE_ORDER)
26514+# if defined(MFX_BYTE_ORDER)
26515+# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26516+# elif defined(__LZO_i386)
26517+# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26518+# elif defined(BYTE_ORDER)
26519+# define LZO_BYTE_ORDER BYTE_ORDER
26520+# elif defined(__BYTE_ORDER)
26521+# define LZO_BYTE_ORDER __BYTE_ORDER
26522+# endif
26523+#endif
26524+
26525+#if defined(LZO_BYTE_ORDER)
26526+# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26527+ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26528+# error "invalid LZO_BYTE_ORDER"
26529+# endif
26530+#endif
26531+
26532+#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
26533+# error "LZO_BYTE_ORDER is not defined"
26534+#endif
26535+
26536+#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
26537+
26538+#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
26539+# if defined(__GNUC__) && defined(__i386__)
26540+# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
26541+# define LZO_OPTIMIZE_GNUC_i386
26542+# endif
26543+# endif
26544+#endif
26545+
26546+extern const lzo_uint32 _lzo_crc32_table[256];
26547+
26548+#define _LZO_STRINGIZE(x) #x
26549+#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
26550+
26551+#define _LZO_CONCAT2(a,b) a ## b
26552+#define _LZO_CONCAT3(a,b,c) a ## b ## c
26553+#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
26554+#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
26555+
26556+#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
26557+#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
26558+#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
26559+#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
26560+
26561+#ifndef __LZO_PTR_H
26562+#define __LZO_PTR_H
26563+
26564+#if !defined(lzo_ptrdiff_t)
26565+# if (UINT_MAX >= LZO_0xffffffffL)
26566+typedef ptrdiff_t lzo_ptrdiff_t;
26567+# else
26568+typedef long lzo_ptrdiff_t;
26569+# endif
26570+#endif
26571+
26572+#if !defined(__LZO_HAVE_PTR_T)
26573+# if defined(lzo_ptr_t)
26574+# define __LZO_HAVE_PTR_T
26575+# endif
26576+#endif
26577+#if !defined(__LZO_HAVE_PTR_T)
26578+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
26579+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
26580+typedef unsigned long lzo_ptr_t;
26581+typedef long lzo_sptr_t;
26582+# define __LZO_HAVE_PTR_T
26583+# endif
26584+# endif
26585+#endif
26586+#if !defined(__LZO_HAVE_PTR_T)
26587+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
26588+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
26589+typedef unsigned int lzo_ptr_t;
26590+typedef int lzo_sptr_t;
26591+# define __LZO_HAVE_PTR_T
26592+# endif
26593+# endif
26594+#endif
26595+#if !defined(__LZO_HAVE_PTR_T)
26596+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
26597+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
26598+typedef unsigned short lzo_ptr_t;
26599+typedef short lzo_sptr_t;
26600+# define __LZO_HAVE_PTR_T
26601+# endif
26602+# endif
26603+#endif
26604+#if !defined(__LZO_HAVE_PTR_T)
26605+# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
26606+# error "no suitable type for lzo_ptr_t"
26607+# else
26608+typedef unsigned long lzo_ptr_t;
26609+typedef long lzo_sptr_t;
26610+# define __LZO_HAVE_PTR_T
26611+# endif
26612+#endif
26613+
26614+#define PTR(a) ((lzo_ptr_t) (a))
26615+#define PTR_LINEAR(a) PTR(a)
26616+#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
26617+#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
26618+#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
26619+#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
26620+
26621+#define PTR_LT(a,b) (PTR(a) < PTR(b))
26622+#define PTR_GE(a,b) (PTR(a) >= PTR(b))
26623+#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
26624+#define pd(a,b) ((lzo_uint) ((a)-(b)))
26625+
26626+typedef union {
26627+ char a_char;
26628+ unsigned char a_uchar;
26629+ short a_short;
26630+ unsigned short a_ushort;
26631+ int a_int;
26632+ unsigned int a_uint;
26633+ long a_long;
26634+ unsigned long a_ulong;
26635+ lzo_int a_lzo_int;
26636+ lzo_uint a_lzo_uint;
26637+ lzo_int32 a_lzo_int32;
26638+ lzo_uint32 a_lzo_uint32;
26639+ ptrdiff_t a_ptrdiff_t;
26640+ lzo_ptrdiff_t a_lzo_ptrdiff_t;
26641+ lzo_ptr_t a_lzo_ptr_t;
26642+ lzo_voidp a_lzo_voidp;
26643+ void *a_void_p;
26644+ lzo_bytep a_lzo_bytep;
26645+ lzo_bytepp a_lzo_bytepp;
26646+ lzo_uintp a_lzo_uintp;
26647+ lzo_uint *a_lzo_uint_p;
26648+ lzo_uint32p a_lzo_uint32p;
26649+ lzo_uint32 *a_lzo_uint32_p;
26650+ unsigned char *a_uchar_p;
26651+ char *a_char_p;
26652+} lzo_full_align_t;
26653+
26654+#endif
26655+#define LZO_DETERMINISTIC
26656+#define LZO_DICT_USE_PTR
26657+# define lzo_dict_t const lzo_bytep
26658+# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
26659+#if !defined(lzo_moff_t)
26660+#define lzo_moff_t lzo_uint
26661+#endif
26662+#endif
26663+static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
26664+{
26665+ return PTR_LINEAR(ptr);
26666+}
26667+
26668+static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
26669+{
26670+ lzo_ptr_t p, s, n;
26671+
26672+ assert("lzo-01", size > 0);
26673+
26674+ p = __lzo_ptr_linear(ptr);
26675+ s = (lzo_ptr_t) (size - 1);
26676+ n = (((p + s) / size) * size) - p;
26677+
26678+ assert("lzo-02", (long)n >= 0);
26679+ assert("lzo-03", n <= s);
26680+
26681+ return (unsigned)n;
26682+}
26683+
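[Editorial note: a worked instance of the gap computation above. With ptr == 0x1003 and size == 4: p = 0x1003, s = 3, and n = ((0x1003 + 3) / 4) * 4 - 0x1003 = 1, so LZO_PTR_ALIGN_UP() advances such a pointer by one byte to the next 4-byte boundary.]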
26684+#ifndef __LZO_UTIL_H
26685+#define __LZO_UTIL_H
26686+
26687+#ifndef __LZO_CONF_H
26688+#endif
26689+
26690+#if 1 && defined(HAVE_MEMCPY)
26691+#define MEMCPY8_DS(dest,src,len) \
26692+ memcpy(dest,src,len); \
26693+ dest += len; \
26694+ src += len
26695+#endif
26696+
26697+#if !defined(MEMCPY8_DS)
26698+
26699+#define MEMCPY8_DS(dest,src,len) \
26700+ { register lzo_uint __l = (len) / 8; \
26701+ do { \
26702+ *dest++ = *src++; \
26703+ *dest++ = *src++; \
26704+ *dest++ = *src++; \
26705+ *dest++ = *src++; \
26706+ *dest++ = *src++; \
26707+ *dest++ = *src++; \
26708+ *dest++ = *src++; \
26709+ *dest++ = *src++; \
26710+ } while (--__l > 0); }
26711+
26712+#endif
26713+
26714+#define MEMCPY_DS(dest,src,len) \
26715+ do *dest++ = *src++; \
26716+ while (--len > 0)
26717+
26718+#define MEMMOVE_DS(dest,src,len) \
26719+ do *dest++ = *src++; \
26720+ while (--len > 0)
26721+
26722+#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
26723+
26724+#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
26725+
26726+#else
26727+
26728+#define BZERO8_PTR(s,l,n) \
26729+ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
26730+
26731+#endif
26732+#endif
26733+
26734+/* If you use the LZO library in a product, you *must* keep this
26735+ * copyright string in the executable of your product.
26736+ */
26737+
26738+static const lzo_byte __lzo_copyright[] =
26739+#if !defined(__LZO_IN_MINILZO)
26740+ LZO_VERSION_STRING;
26741+#else
26742+ "\n\n\n"
26743+ "LZO real-time data compression library.\n"
26744+ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
26745+ "<markus.oberhumer@jk.uni-linz.ac.at>\n"
26746+ "http://www.oberhumer.com/opensource/lzo/\n"
26747+ "\n"
26748+ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
26749+ "LZO build date: " __DATE__ " " __TIME__ "\n\n"
26750+ "LZO special compilation options:\n"
26751+#ifdef __cplusplus
26752+ " __cplusplus\n"
26753+#endif
26754+#if defined(__PIC__)
26755+ " __PIC__\n"
26756+#elif defined(__pic__)
26757+ " __pic__\n"
26758+#endif
26759+#if (UINT_MAX < LZO_0xffffffffL)
26760+ " 16BIT\n"
26761+#endif
26762+#if defined(__LZO_STRICT_16BIT)
26763+ " __LZO_STRICT_16BIT\n"
26764+#endif
26765+#if (UINT_MAX > LZO_0xffffffffL)
26766+ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
26767+#endif
26768+#if (ULONG_MAX > LZO_0xffffffffL)
26769+ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
26770+#endif
26771+#if defined(LZO_BYTE_ORDER)
26772+ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
26773+#endif
26774+#if defined(LZO_UNALIGNED_OK_2)
26775+ " LZO_UNALIGNED_OK_2\n"
26776+#endif
26777+#if defined(LZO_UNALIGNED_OK_4)
26778+ " LZO_UNALIGNED_OK_4\n"
26779+#endif
26780+#if defined(LZO_ALIGNED_OK_4)
26781+ " LZO_ALIGNED_OK_4\n"
26782+#endif
26783+#if defined(LZO_DICT_USE_PTR)
26784+ " LZO_DICT_USE_PTR\n"
26785+#endif
26786+#if defined(__LZO_QUERY_COMPRESS)
26787+ " __LZO_QUERY_COMPRESS\n"
26788+#endif
26789+#if defined(__LZO_QUERY_DECOMPRESS)
26790+ " __LZO_QUERY_DECOMPRESS\n"
26791+#endif
26792+#if defined(__LZO_IN_MINILZO)
26793+ " __LZO_IN_MINILZO\n"
26794+#endif
26795+ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
26796+#if defined(__GNUC__) && defined(__VERSION__)
26797+ " by gcc " __VERSION__
26798+#elif defined(__BORLANDC__)
26799+ " by Borland C " _LZO_MEXPAND(__BORLANDC__)
26800+#elif defined(_MSC_VER)
26801+ " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
26802+#elif defined(__PUREC__)
26803+ " by Pure C " _LZO_MEXPAND(__PUREC__)
26804+#elif defined(__SC__)
26805+ " by Symantec C " _LZO_MEXPAND(__SC__)
26806+#elif defined(__TURBOC__)
26807+ " by Turbo C " _LZO_MEXPAND(__TURBOC__)
26808+#elif defined(__WATCOMC__)
26809+ " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
26810+#endif
26811+ " $\n"
26812+ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
26813+#endif
26814+
26815+#define LZO_BASE 65521u
26816+#define LZO_NMAX 5552
26817+
26818+#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
26819+#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
26820+#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
26821+#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
26822+#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
26823+
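[Editorial note: the LZO_DO* accumulators above are the classic building blocks of an Adler-32 checksum -- LZO_BASE 65521 is the Adler modulus and LZO_NMAX the largest block that cannot overflow 32-bit sums. A minimal sketch of the conventional driver loop, assuming the usual Adler-32 structure rather than anything specific to this patch:

	static lzo_uint32 adler32_sketch(lzo_uint32 adler, const lzo_byte *buf,
					 lzo_uint len)
	{
		lzo_uint32 s1 = adler & 0xffff;		/* low word: byte sum */
		lzo_uint32 s2 = (adler >> 16) & 0xffff;	/* high word: sum of sums */

		while (len > 0) {
			unsigned k = len < LZO_NMAX ? (unsigned)len : LZO_NMAX;

			len -= k;
			while (k >= 16) {
				LZO_DO16(buf, 0);	/* 16 unrolled DO1 steps */
				buf += 16;
				k -= 16;
			}
			while (k-- > 0) {
				s1 += *buf++;
				s2 += s1;
			}
			s1 %= LZO_BASE;
			s2 %= LZO_BASE;
		}
		return (s2 << 16) | s1;
	}
]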
26824+# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
26825+# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
26826+
26827+#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
26828+
26829+static lzo_bool schedule_insns_bug(void);
26830+static lzo_bool strength_reduce_bug(int *);
26831+
26832+# define __lzo_assert(x) ((x) ? 1 : 0)
26833+
26834+#undef COMPILE_TIME_ASSERT
26835+
26836+# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
26837+
26838+static lzo_bool basic_integral_check(void)
26839+{
26840+ lzo_bool r = 1;
26841+
26842+ COMPILE_TIME_ASSERT(CHAR_BIT == 8);
26843+ COMPILE_TIME_ASSERT(sizeof(char) == 1);
26844+ COMPILE_TIME_ASSERT(sizeof(short) >= 2);
26845+ COMPILE_TIME_ASSERT(sizeof(long) >= 4);
26846+ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
26847+ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
26848+
26849+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
26850+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
26851+
26852+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
26853+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
26854+#if defined(__LZO_STRICT_16BIT)
26855+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
26856+#else
26857+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
26858+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
26859+#endif
26860+
26861+#if (USHRT_MAX == 65535u)
26862+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
26863+#elif (USHRT_MAX == LZO_0xffffffffL)
26864+ COMPILE_TIME_ASSERT(sizeof(short) == 4);
26865+#elif (USHRT_MAX >= LZO_0xffffffffL)
26866+ COMPILE_TIME_ASSERT(sizeof(short) > 4);
26867+#endif
26868+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
26869+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
26870+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
26871+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
26872+ COMPILE_TIME_ASSERT(IS_SIGNED(short));
26873+ COMPILE_TIME_ASSERT(IS_SIGNED(int));
26874+ COMPILE_TIME_ASSERT(IS_SIGNED(long));
26875+
26876+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
26877+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
26878+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
26879+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
26880+
26881+ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
26882+ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
26883+ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
26884+ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
26885+ COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
26886+ COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
26887+ LZO_UTYPE_MAX(sizeof(lzo_uint32)));
26888+ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
26889+
26890+ r &= __lzo_assert(LZO_BYTE(257) == 1);
26891+
26892+ return r;
26893+}
26894+
26895+static lzo_bool basic_ptr_check(void)
26896+{
26897+ lzo_bool r = 1;
26898+
26899+ COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
26900+ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
26901+
26902+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
26903+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
26904+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
26905+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
26906+
26907+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
26908+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
26909+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
26910+
26911+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
26912+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
26913+
26914+ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
26915+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
26916+
26917+#if defined(SIZEOF_CHAR_P)
26918+ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
26919+#endif
26920+#if defined(SIZEOF_PTRDIFF_T)
26921+ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
26922+#endif
26923+
26924+ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
26925+ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
26926+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
26927+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
26928+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
26929+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
26930+
26931+ return r;
26932+}
26933+
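[Editorial note: the self-test below exercises the pointer model at run time: it checks that LZO_PTR_ALIGN_UP() yields a suitably aligned work area, that an all-zero (or all-ones) bit pattern reads back as NULL (or the type maxima) through the lzo_full_align_t union, and that aligned 32-bit reads through the union see the expected nonzero bytes.]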
26934+static lzo_bool ptr_check(void)
26935+{
26936+ lzo_bool r = 1;
26937+ int i;
26938+ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
26939+ lzo_bytep wrkmem;
26940+ lzo_bytepp dict;
26941+ unsigned char x[4 * sizeof(lzo_full_align_t)];
26942+ long d;
26943+ lzo_full_align_t a;
26944+ lzo_full_align_t u;
26945+
26946+ for (i = 0; i < (int)sizeof(x); i++)
26947+ x[i] = LZO_BYTE(i);
26948+
26949+ wrkmem =
26950+ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
26951+
26952+ u.a_lzo_bytep = wrkmem;
26953+ dict = u.a_lzo_bytepp;
26954+
26955+ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
26956+ r &= __lzo_assert(d >= 0);
26957+ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
26958+
26959+ memset(&a, 0, sizeof(a));
26960+ r &= __lzo_assert(a.a_lzo_voidp == NULL);
26961+
26962+ memset(&a, 0xff, sizeof(a));
26963+ r &= __lzo_assert(a.a_ushort == USHRT_MAX);
26964+ r &= __lzo_assert(a.a_uint == UINT_MAX);
26965+ r &= __lzo_assert(a.a_ulong == ULONG_MAX);
26966+ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
26967+ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
26968+
26969+ if (r == 1) {
26970+ for (i = 0; i < 8; i++)
26971+ r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
26972+ (const
26973+ lzo_voidp)(&wrkmem[i *
26974+ sizeof(lzo_byte
26975+ *)]));
26976+ }
26977+
26978+ memset(&a, 0, sizeof(a));
26979+ r &= __lzo_assert(a.a_char_p == NULL);
26980+ r &= __lzo_assert(a.a_lzo_bytep == NULL);
26981+ r &= __lzo_assert(NULL == (void *)0);
26982+ if (r == 1) {
26983+ for (i = 0; i < 10; i++)
26984+ dict[i] = wrkmem;
26985+ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
26986+ r &= __lzo_assert(dict[0] == wrkmem);
26987+ for (i = 1; i < 9; i++)
26988+ r &= __lzo_assert(dict[i] == NULL);
26989+ r &= __lzo_assert(dict[9] == wrkmem);
26990+ }
26991+
26992+ if (r == 1) {
26993+ unsigned k = 1;
26994+ const unsigned n = (unsigned)sizeof(lzo_uint32);
26995+ lzo_byte *p0;
26996+ lzo_byte *p1;
26997+
26998+ k += __lzo_align_gap(&x[k], n);
26999+ p0 = (lzo_bytep) & x[k];
27000+#if defined(PTR_LINEAR)
27001+ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27002+#else
27003+ r &= __lzo_assert(n == 4);
27004+ r &= __lzo_assert(PTR_ALIGNED_4(p0));
27005+#endif
27006+
27007+ r &= __lzo_assert(k >= 1);
27008+ p1 = (lzo_bytep) & x[1];
27009+ r &= __lzo_assert(PTR_GE(p0, p1));
27010+
27011+ r &= __lzo_assert(k < 1 + n);
27012+ p1 = (lzo_bytep) & x[1 + n];
27013+ r &= __lzo_assert(PTR_LT(p0, p1));
27014+
27015+ if (r == 1) {
27016+ lzo_uint32 v0, v1;
27017+
27018+ u.a_uchar_p = &x[k];
27019+ v0 = *u.a_lzo_uint32_p;
27020+ u.a_uchar_p = &x[k + n];
27021+ v1 = *u.a_lzo_uint32_p;
27022+
27023+ r &= __lzo_assert(v0 > 0);
27024+ r &= __lzo_assert(v1 > 0);
27025+ }
27026+ }
27027+
27028+ return r;
27029+}
27030+
27031+static int _lzo_config_check(void)
27032+{
27033+ lzo_bool r = 1;
27034+ int i;
27035+ union {
27036+ lzo_uint32 a;
27037+ unsigned short b;
27038+ lzo_uint32 aa[4];
27039+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27040+ } u;
27041+
27042+ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27043+ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27044+ < 0);
27045+
27046+ r &= basic_integral_check();
27047+ r &= basic_ptr_check();
27048+ if (r != 1)
27049+ return LZO_E_ERROR;
27050+
27051+ u.a = 0;
27052+ u.b = 0;
27053+ for (i = 0; i < (int)sizeof(u.x); i++)
27054+ u.x[i] = LZO_BYTE(i);
27055+
27056+#if defined(LZO_BYTE_ORDER)
27057+ if (r == 1) {
27058+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27059+ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27060+ unsigned short b = (unsigned short)(u.b & 0xffff);
27061+ r &= __lzo_assert(a == 0x03020100L);
27062+ r &= __lzo_assert(b == 0x0100);
27063+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27064+ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27065+ unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27066+ r &= __lzo_assert(a == 0x00010203L);
27067+ r &= __lzo_assert(b == 0x0001);
27068+# else
27069+# error "invalid LZO_BYTE_ORDER"
27070+# endif
27071+ }
27072+#endif
27073+
27074+#if defined(LZO_UNALIGNED_OK_2)
27075+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
27076+ if (r == 1) {
27077+ unsigned short b[4];
27078+
27079+ for (i = 0; i < 4; i++)
27080+ b[i] = *(const unsigned short *)&u.x[i];
27081+
27082+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27083+ r &= __lzo_assert(b[0] == 0x0100);
27084+ r &= __lzo_assert(b[1] == 0x0201);
27085+ r &= __lzo_assert(b[2] == 0x0302);
27086+ r &= __lzo_assert(b[3] == 0x0403);
27087+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27088+ r &= __lzo_assert(b[0] == 0x0001);
27089+ r &= __lzo_assert(b[1] == 0x0102);
27090+ r &= __lzo_assert(b[2] == 0x0203);
27091+ r &= __lzo_assert(b[3] == 0x0304);
27092+# endif
27093+ }
27094+#endif
27095+
27096+#if defined(LZO_UNALIGNED_OK_4)
27097+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27098+ if (r == 1) {
27099+ lzo_uint32 a[4];
27100+
27101+ for (i = 0; i < 4; i++)
27102+ a[i] = *(const lzo_uint32 *)&u.x[i];
27103+
27104+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27105+ r &= __lzo_assert(a[0] == 0x03020100L);
27106+ r &= __lzo_assert(a[1] == 0x04030201L);
27107+ r &= __lzo_assert(a[2] == 0x05040302L);
27108+ r &= __lzo_assert(a[3] == 0x06050403L);
27109+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27110+ r &= __lzo_assert(a[0] == 0x00010203L);
27111+ r &= __lzo_assert(a[1] == 0x01020304L);
27112+ r &= __lzo_assert(a[2] == 0x02030405L);
27113+ r &= __lzo_assert(a[3] == 0x03040506L);
27114+# endif
27115+ }
27116+#endif
27117+
27118+#if defined(LZO_ALIGNED_OK_4)
27119+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27120+#endif
27121+
27122+ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27123+
27124+ if (r == 1) {
27125+ r &= __lzo_assert(!schedule_insns_bug());
27126+ }
27127+
27128+ if (r == 1) {
27129+ static int x[3];
27130+ static unsigned xn = 3;
27131+ register unsigned j;
27132+
27133+ for (j = 0; j < xn; j++)
27134+ x[j] = (int)j - 3;
27135+ r &= __lzo_assert(!strength_reduce_bug(x));
27136+ }
27137+
27138+ if (r == 1) {
27139+ r &= ptr_check();
27140+ }
27141+
27142+ return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27143+}
27144+
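[Editorial note: the two probes below appear to target historical gcc miscompilations: schedule_insns_bug() reads through a pointer into a local const array (instruction scheduling must not reorder the load past its initialization), and strength_reduce_bug() checks that a strength-reduced counting loop produced the expected values. _lzo_config_check() above fails if either misbehaves.]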
27145+static lzo_bool schedule_insns_bug(void)
27146+{
27147+#if defined(__LZO_CHECKER)
27148+ return 0;
27149+#else
27150+ const int clone[] = { 1, 2, 0 };
27151+ const int *q;
27152+ q = clone;
27153+ return (*q) ? 0 : 1;
27154+#endif
27155+}
27156+
27157+static lzo_bool strength_reduce_bug(int *x)
27158+{
27159+ return x[0] != -3 || x[1] != -2 || x[2] != -1;
27160+}
27161+
27162+#undef COMPILE_TIME_ASSERT
27163+
27164+int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27165+ int s6, int s7, int s8, int s9)
27166+{
27167+ int r;
27168+
27169+ if (v == 0)
27170+ return LZO_E_ERROR;
27171+
27172+ r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27173+ (s2 == -1 || s2 == (int)sizeof(int)) &&
27174+ (s3 == -1 || s3 == (int)sizeof(long)) &&
27175+ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27176+ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27177+ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27178+ (s7 == -1 || s7 == (int)sizeof(char *)) &&
27179+ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27180+ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27181+ if (!r)
27182+ return LZO_E_ERROR;
27183+
27184+ r = _lzo_config_check();
27185+ if (r != LZO_E_OK)
27186+ return r;
27187+
27188+ return r;
27189+}
27190+
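[Editorial sketch, not from the patch: lzo_init(), whose macro expansion ends at the top of this file, forwards the caller's compile-time type sizes to __lzo_init2(), which rejects a mismatched build before any compression is attempted. The helper name and the -EINVAL error choice below are assumptions:

	/* Hypothetical init path for a user of this library. */
	static int lzo_plugin_init_sketch(void)
	{
		if (lzo_init() != LZO_E_OK)
			return -EINVAL;	/* size/ABI mismatch with this build */
		return 0;
	}
]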
27191+#define do_compress _lzo1x_1_do_compress
27192+
27193+#define LZO_NEED_DICT_H
27194+#define D_BITS 14
27195+#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27196+#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27197+
27198+#ifndef __LZO_CONFIG1X_H
27199+#define __LZO_CONFIG1X_H
27200+
27201+#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27202+# define LZO1X
27203+#endif
27204+
27205+#define LZO_EOF_CODE
27206+#undef LZO_DETERMINISTIC
27207+
27208+#define M1_MAX_OFFSET 0x0400
27209+#ifndef M2_MAX_OFFSET
27210+#define M2_MAX_OFFSET 0x0800
27211+#endif
27212+#define M3_MAX_OFFSET 0x4000
27213+#define M4_MAX_OFFSET 0xbfff
27214+
27215+#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27216+
27217+#define M1_MIN_LEN 2
27218+#define M1_MAX_LEN 2
27219+#define M2_MIN_LEN 3
27220+#ifndef M2_MAX_LEN
27221+#define M2_MAX_LEN 8
27222+#endif
27223+#define M3_MIN_LEN 3
27224+#define M3_MAX_LEN 33
27225+#define M4_MIN_LEN 3
27226+#define M4_MAX_LEN 9
27227+
27228+#define M1_MARKER 0
27229+#define M2_MARKER 64
27230+#define M3_MARKER 32
27231+#define M4_MARKER 16
27232+
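[Editorial note: as used by the coder below, these markers select the LZO1X match classes: M2 (code bytes >= 64) encodes short matches of 3..M2_MAX_LEN bytes within M2_MAX_OFFSET, M3 encodes matches within M3_MAX_OFFSET, and M4 carries matches out to M4_MAX_OFFSET as well as the end-of-stream code; M1_MARKER is defined but not referenced in this file.]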
27233+#ifndef MIN_LOOKAHEAD
27234+#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27235+#endif
27236+
27237+#if defined(LZO_NEED_DICT_H)
27238+
27239+#ifndef LZO_HASH
27240+#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27241+#endif
27242+#define DL_MIN_LEN M2_MIN_LEN
27243+
27244+#ifndef __LZO_DICT_H
27245+#define __LZO_DICT_H
27246+
27247+#if !defined(D_BITS) && defined(DBITS)
27248+# define D_BITS DBITS
27249+#endif
27250+#if !defined(D_BITS)
27251+# error "D_BITS is not defined"
27252+#endif
27253+#if (D_BITS < 16)
27254+# define D_SIZE LZO_SIZE(D_BITS)
27255+# define D_MASK LZO_MASK(D_BITS)
27256+#else
27257+# define D_SIZE LZO_USIZE(D_BITS)
27258+# define D_MASK LZO_UMASK(D_BITS)
27259+#endif
27260+#define D_HIGH ((D_MASK >> 1) + 1)
27261+
27262+#if !defined(DD_BITS)
27263+# define DD_BITS 0
27264+#endif
27265+#define DD_SIZE LZO_SIZE(DD_BITS)
27266+#define DD_MASK LZO_MASK(DD_BITS)
27267+
27268+#if !defined(DL_BITS)
27269+# define DL_BITS (D_BITS - DD_BITS)
27270+#endif
27271+#if (DL_BITS < 16)
27272+# define DL_SIZE LZO_SIZE(DL_BITS)
27273+# define DL_MASK LZO_MASK(DL_BITS)
27274+#else
27275+# define DL_SIZE LZO_USIZE(DL_BITS)
27276+# define DL_MASK LZO_UMASK(DL_BITS)
27277+#endif
27278+
27279+#if (D_BITS != DL_BITS + DD_BITS)
27280+# error "D_BITS does not match"
27281+#endif
27282+#if (D_BITS < 8 || D_BITS > 18)
27283+# error "invalid D_BITS"
27284+#endif
27285+#if (DL_BITS < 8 || DL_BITS > 20)
27286+# error "invalid DL_BITS"
27287+#endif
27288+#if (DD_BITS < 0 || DD_BITS > 6)
27289+# error "invalid DD_BITS"
27290+#endif
27291+
27292+#if !defined(DL_MIN_LEN)
27293+# define DL_MIN_LEN 3
27294+#endif
27295+#if !defined(DL_SHIFT)
27296+# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27297+#endif
27298+
27299+#define LZO_HASH_GZIP 1
27300+#define LZO_HASH_GZIP_INCREMENTAL 2
27301+#define LZO_HASH_LZO_INCREMENTAL_A 3
27302+#define LZO_HASH_LZO_INCREMENTAL_B 4
27303+
27304+#if !defined(LZO_HASH)
27305+# error "choose a hashing strategy"
27306+#endif
27307+
27308+#if (DL_MIN_LEN == 3)
27309+# define _DV2_A(p,shift1,shift2) \
27310+ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27311+# define _DV2_B(p,shift1,shift2) \
27312+ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27313+# define _DV3_B(p,shift1,shift2,shift3) \
27314+ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27315+#elif (DL_MIN_LEN == 2)
27316+# define _DV2_A(p,shift1,shift2) \
27317+ (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27318+# define _DV2_B(p,shift1,shift2) \
27319+ (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27320+#else
27321+# error "invalid DL_MIN_LEN"
27322+#endif
27323+#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27324+#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27325+#define DA2(p,s1,s2) \
27326+ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27327+#define DS2(p,s1,s2) \
27328+ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27329+#define DX2(p,s1,s2) \
27330+ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27331+#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27332+#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27333+#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27334+#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27335+#define DM(v) DMS(v,0)
27336+
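[Editorial sketch, not from the patch: D_INDEX1/D_INDEX2 above, together with the DX3/DM helpers, turn the next four input bytes into a primary dictionary slot and a secondary probe derived from the first. A hand expansion of the primary index with D_BITS == 14 (so D_MASK == 0x3fff); the helper name is hypothetical:

	static unsigned dict_index1_sketch(const unsigned char *p)
	{
		/* DX2(p+1,5,6) */
		unsigned long dx2 =
		    ((((unsigned long)p[3] << 6) ^ p[2]) << 5) ^ p[1];
		/* DX3(p,5,5,6) */
		unsigned long dx3 = (dx2 << 5) ^ p[0];
		/* D_INDEX1: DM((0x21 * DX3) >> 5); DM masks with D_MASK */
		return (unsigned)(((0x21 * dx3) >> 5) & 0x3fff);
	}
]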
27337+#if (LZO_HASH == LZO_HASH_GZIP)
27338+# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27339+
27340+#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27341+# define __LZO_HASH_INCREMENTAL
27342+# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27343+# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27344+# define _DINDEX(dv,p) (dv)
27345+# define DVAL_LOOKAHEAD DL_MIN_LEN
27346+
27347+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27348+# define __LZO_HASH_INCREMENTAL
27349+# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27350+# define DVAL_NEXT(dv,p) \
27351+ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27352+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27353+# define DVAL_LOOKAHEAD DL_MIN_LEN
27354+
27355+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27356+# define __LZO_HASH_INCREMENTAL
27357+# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27358+# define DVAL_NEXT(dv,p) \
27359+ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27360+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27361+# define DVAL_LOOKAHEAD DL_MIN_LEN
27362+
27363+#else
27364+# error "choose a hashing strategy"
27365+#endif
27366+
27367+#ifndef DINDEX
27368+#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27369+#endif
27370+#if !defined(DINDEX1) && defined(D_INDEX1)
27371+#define DINDEX1 D_INDEX1
27372+#endif
27373+#if !defined(DINDEX2) && defined(D_INDEX2)
27374+#define DINDEX2 D_INDEX2
27375+#endif
27376+
27377+#if !defined(__LZO_HASH_INCREMENTAL)
27378+# define DVAL_FIRST(dv,p) ((void) 0)
27379+# define DVAL_NEXT(dv,p) ((void) 0)
27380+# define DVAL_LOOKAHEAD 0
27381+#endif
27382+
27383+#if !defined(DVAL_ASSERT)
27384+#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27385+static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
27386+{
27387+ lzo_uint32 df;
27388+ DVAL_FIRST(df, (p));
27389+	assert("lzo-22", DINDEX(dv, p) == DINDEX(df, p));
27390+}
27391+#else
27392+# define DVAL_ASSERT(dv,p) ((void) 0)
27393+#endif
27394+#endif
27395+
27396+# define DENTRY(p,in) (p)
27397+# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27398+
27399+#if (DD_BITS == 0)
27400+
27401+# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27402+# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27403+# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27404+
27405+#else
27406+
27407+# define UPDATE_D(dict,drun,dv,p,in) \
27408+ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27409+# define UPDATE_I(dict,drun,index,p,in) \
27410+ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27411+# define UPDATE_P(ptr,drun,p,in) \
27412+ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
27413+
27414+#endif
27415+
27416+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
27417+ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
27418+
27419+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
27420+ (BOUNDS_CHECKING_OFF_IN_EXPR( \
27421+ (PTR_LT(m_pos,in) || \
27422+ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
27423+ m_off > max_offset) ))
27424+
27425+#if defined(LZO_DETERMINISTIC)
27426+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
27427+#else
27428+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
27429+#endif
27430+#endif
27431+#endif
27432+#endif
27433+#define DO_COMPRESS lzo1x_1_compress
27434+static
27435+lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
27436+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27437+{
27438+ register const lzo_byte *ip;
27439+ lzo_byte *op;
27440+ const lzo_byte *const in_end = in + in_len;
27441+ const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
27442+ const lzo_byte *ii;
27443+ lzo_dict_p const dict = (lzo_dict_p) wrkmem;
27444+
27445+ op = out;
27446+ ip = in;
27447+ ii = ip;
27448+
27449+ ip += 4;
27450+ for (;;) {
27451+ register const lzo_byte *m_pos;
27452+
27453+ lzo_moff_t m_off;
27454+ lzo_uint m_len;
27455+ lzo_uint dindex;
27456+
27457+ DINDEX1(dindex, ip);
27458+ GINDEX(m_pos, m_off, dict, dindex, in);
27459+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27460+ goto literal;
27461+#if 1
27462+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27463+ goto try_match;
27464+ DINDEX2(dindex, ip);
27465+#endif
27466+ GINDEX(m_pos, m_off, dict, dindex, in);
27467+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27468+ goto literal;
27469+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27470+ goto try_match;
27471+ goto literal;
27472+
27473+ try_match:
27474+#if 1 && defined(LZO_UNALIGNED_OK_2)
27475+ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
27476+#else
27477+ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
27478+#endif
27479+ ;
27480+ } else {
27481+ if (m_pos[2] == ip[2]) {
27482+ goto match;
27483+ } else {
27484+ ;
27485+ }
27486+ }
27487+
27488+ literal:
27489+ UPDATE_I(dict, 0, dindex, ip, in);
27490+ ++ip;
27491+ if (ip >= ip_end)
27492+ break;
27493+ continue;
27494+
27495+ match:
27496+ UPDATE_I(dict, 0, dindex, ip, in);
27497+ if (pd(ip, ii) > 0) {
27498+ register lzo_uint t = pd(ip, ii);
27499+
27500+ if (t <= 3) {
27501+ assert("lzo-04", op - 2 > out);
27502+ op[-2] |= LZO_BYTE(t);
27503+ } else if (t <= 18)
27504+ *op++ = LZO_BYTE(t - 3);
27505+ else {
27506+ register lzo_uint tt = t - 18;
27507+
27508+ *op++ = 0;
27509+ while (tt > 255) {
27510+ tt -= 255;
27511+ *op++ = 0;
27512+ }
27513+ assert("lzo-05", tt > 0);
27514+ *op++ = LZO_BYTE(tt);
27515+ }
27516+ do
27517+ *op++ = *ii++;
27518+ while (--t > 0);
27519+ }
27520+
27521+ assert("lzo-06", ii == ip);
27522+ ip += 3;
27523+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
27524+ || m_pos[6] != *ip++ || m_pos[7] != *ip++
27525+ || m_pos[8] != *ip++
27526+#ifdef LZO1Y
27527+ || m_pos[9] != *ip++ || m_pos[10] != *ip++
27528+ || m_pos[11] != *ip++ || m_pos[12] != *ip++
27529+ || m_pos[13] != *ip++ || m_pos[14] != *ip++
27530+#endif
27531+ ) {
27532+ --ip;
27533+ m_len = ip - ii;
27534+ assert("lzo-07", m_len >= 3);
27535+ assert("lzo-08", m_len <= M2_MAX_LEN);
27536+
27537+ if (m_off <= M2_MAX_OFFSET) {
27538+ m_off -= 1;
27539+#if defined(LZO1X)
27540+ *op++ =
27541+ LZO_BYTE(((m_len -
27542+ 1) << 5) | ((m_off & 7) << 2));
27543+ *op++ = LZO_BYTE(m_off >> 3);
27544+#elif defined(LZO1Y)
27545+ *op++ =
27546+ LZO_BYTE(((m_len +
27547+ 1) << 4) | ((m_off & 3) << 2));
27548+ *op++ = LZO_BYTE(m_off >> 2);
27549+#endif
27550+ } else if (m_off <= M3_MAX_OFFSET) {
27551+ m_off -= 1;
27552+ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
27553+ goto m3_m4_offset;
27554+ } else
27555+#if defined(LZO1X)
27556+ {
27557+ m_off -= 0x4000;
27558+ assert("lzo-09", m_off > 0);
27559+ assert("lzo-10", m_off <= 0x7fff);
27560+ *op++ = LZO_BYTE(M4_MARKER |
27561+ ((m_off & 0x4000) >> 11) |
27562+ (m_len - 2));
27563+ goto m3_m4_offset;
27564+ }
27565+#elif defined(LZO1Y)
27566+ goto m4_match;
27567+#endif
27568+ } else {
27569+ {
27570+ const lzo_byte *end = in_end;
27571+ const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
27572+ while (ip < end && *m == *ip)
27573+ m++, ip++;
27574+ m_len = (ip - ii);
27575+ }
27576+ assert("lzo-11", m_len > M2_MAX_LEN);
27577+
27578+ if (m_off <= M3_MAX_OFFSET) {
27579+ m_off -= 1;
27580+ if (m_len <= 33)
27581+ *op++ =
27582+ LZO_BYTE(M3_MARKER | (m_len - 2));
27583+ else {
27584+ m_len -= 33;
27585+ *op++ = M3_MARKER | 0;
27586+ goto m3_m4_len;
27587+ }
27588+ } else {
27589+#if defined(LZO1Y)
27590+ m4_match:
27591+#endif
27592+ m_off -= 0x4000;
27593+ assert("lzo-12", m_off > 0);
27594+ assert("lzo-13", m_off <= 0x7fff);
27595+ if (m_len <= M4_MAX_LEN)
27596+ *op++ = LZO_BYTE(M4_MARKER |
27597+ ((m_off & 0x4000) >>
27598+ 11) | (m_len - 2));
27599+ else {
27600+ m_len -= M4_MAX_LEN;
27601+ *op++ =
27602+ LZO_BYTE(M4_MARKER |
27603+ ((m_off & 0x4000) >> 11));
27604+ m3_m4_len:
27605+ while (m_len > 255) {
27606+ m_len -= 255;
27607+ *op++ = 0;
27608+ }
27609+ assert("lzo-14", m_len > 0);
27610+ *op++ = LZO_BYTE(m_len);
27611+ }
27612+ }
27613+
27614+ m3_m4_offset:
27615+ *op++ = LZO_BYTE((m_off & 63) << 2);
27616+ *op++ = LZO_BYTE(m_off >> 6);
27617+ }
27618+
27619+ ii = ip;
27620+ if (ip >= ip_end)
27621+ break;
27622+ }
27623+
27624+ *out_len = op - out;
27625+ return pd(in_end, ii);
27626+}
27627+
27628+int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
27629+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27630+{
27631+ lzo_byte *op = out;
27632+ lzo_uint t;
27633+
27634+#if defined(__LZO_QUERY_COMPRESS)
27635+ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27636+ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
27637+ D_SIZE, lzo_sizeof(lzo_dict_t));
27638+#endif
27639+
27640+ if (in_len <= M2_MAX_LEN + 5)
27641+ t = in_len;
27642+ else {
27643+ t = do_compress(in, in_len, op, out_len, wrkmem);
27644+ op += *out_len;
27645+ }
27646+
27647+ if (t > 0) {
27648+ const lzo_byte *ii = in + in_len - t;
27649+
27650+ if (op == out && t <= 238)
27651+ *op++ = LZO_BYTE(17 + t);
27652+ else if (t <= 3)
27653+ op[-2] |= LZO_BYTE(t);
27654+ else if (t <= 18)
27655+ *op++ = LZO_BYTE(t - 3);
27656+ else {
27657+ lzo_uint tt = t - 18;
27658+
27659+ *op++ = 0;
27660+ while (tt > 255) {
27661+ tt -= 255;
27662+ *op++ = 0;
27663+ }
27664+ assert("lzo-15", tt > 0);
27665+ *op++ = LZO_BYTE(tt);
27666+ }
27667+ do
27668+ *op++ = *ii++;
27669+ while (--t > 0);
27670+ }
27671+
27672+ *op++ = M4_MARKER | 1;
27673+ *op++ = 0;
27674+ *op++ = 0;
27675+
27676+ *out_len = op - out;
27677+ return LZO_E_OK;
27678+}
27679+
27680+#undef do_compress
27681+#undef DO_COMPRESS
27682+#undef LZO_HASH
27683+
27684+#undef LZO_TEST_DECOMPRESS_OVERRUN
27685+#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
27686+#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
27687+#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27688+#undef DO_DECOMPRESS
27689+#define DO_DECOMPRESS lzo1x_decompress
27690+
27691+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
27692+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27693+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
27694+# endif
27695+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27696+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
27697+# endif
27698+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27699+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27700+# endif
27701+#endif
27702+
27703+#undef TEST_IP
27704+#undef TEST_OP
27705+#undef TEST_LOOKBEHIND
27706+#undef NEED_IP
27707+#undef NEED_OP
27708+#undef HAVE_TEST_IP
27709+#undef HAVE_TEST_OP
27710+#undef HAVE_NEED_IP
27711+#undef HAVE_NEED_OP
27712+#undef HAVE_ANY_IP
27713+#undef HAVE_ANY_OP
27714+
27715+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27716+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
27717+# define TEST_IP (ip < ip_end)
27718+# endif
27719+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
27720+# define NEED_IP(x) \
27721+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
27722+# endif
27723+#endif
27724+
27725+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27726+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
27727+# define TEST_OP (op <= op_end)
27728+# endif
27729+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
27730+# undef TEST_OP
27731+# define NEED_OP(x) \
27732+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
27733+# endif
27734+#endif
27735+
27736+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27737+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
27738+#else
27739+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
27740+#endif
27741+
27742+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
27743+# define TEST_IP (ip < ip_end)
27744+#endif
27745+
27746+#if defined(TEST_IP)
27747+# define HAVE_TEST_IP
27748+#else
27749+# define TEST_IP 1
27750+#endif
27751+#if defined(TEST_OP)
27752+# define HAVE_TEST_OP
27753+#else
27754+# define TEST_OP 1
27755+#endif
27756+
27757+#if defined(NEED_IP)
27758+# define HAVE_NEED_IP
27759+#else
27760+# define NEED_IP(x) ((void) 0)
27761+#endif
27762+#if defined(NEED_OP)
27763+# define HAVE_NEED_OP
27764+#else
27765+# define NEED_OP(x) ((void) 0)
27766+#endif
27767+
27768+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
27769+# define HAVE_ANY_IP
27770+#endif
27771+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
27772+# define HAVE_ANY_OP
27773+#endif
27774+
27775+#undef __COPY4
27776+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
27777+
27778+#undef COPY4
27779+#if defined(LZO_UNALIGNED_OK_4)
27780+# define COPY4(dst,src) __COPY4(dst,src)
27781+#elif defined(LZO_ALIGNED_OK_4)
27782+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
27783+#endif
27784+
27785+#if defined(DO_DECOMPRESS)
27786+int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
27787+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27788+#endif
27789+{
27790+ register lzo_byte *op;
27791+ register const lzo_byte *ip;
27792+ register lzo_uint t;
27793+#if defined(COPY_DICT)
27794+ lzo_uint m_off;
27795+ const lzo_byte *dict_end;
27796+#else
27797+ register const lzo_byte *m_pos;
27798+#endif
27799+
27800+ const lzo_byte *const ip_end = in + in_len;
27801+#if defined(HAVE_ANY_OP)
27802+ lzo_byte *const op_end = out + *out_len;
27803+#endif
27804+#if defined(LZO1Z)
27805+ lzo_uint last_m_off = 0;
27806+#endif
27807+
27808+ LZO_UNUSED(wrkmem);
27809+
27810+#if defined(__LZO_QUERY_DECOMPRESS)
27811+ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27812+ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
27813+ 0, 0);
27814+#endif
27815+
27816+#if defined(COPY_DICT)
27817+ if (dict) {
27818+ if (dict_len > M4_MAX_OFFSET) {
27819+ dict += dict_len - M4_MAX_OFFSET;
27820+ dict_len = M4_MAX_OFFSET;
27821+ }
27822+ dict_end = dict + dict_len;
27823+ } else {
27824+ dict_len = 0;
27825+ dict_end = NULL;
27826+ }
27827+#endif
27828+
27829+ *out_len = 0;
27830+
27831+ op = out;
27832+ ip = in;
27833+
27834+ if (*ip > 17) {
27835+ t = *ip++ - 17;
27836+ if (t < 4)
27837+ goto match_next;
27838+ assert("lzo-16", t > 0);
27839+ NEED_OP(t);
27840+ NEED_IP(t + 1);
27841+ do
27842+ *op++ = *ip++;
27843+ while (--t > 0);
27844+ goto first_literal_run;
27845+ }
27846+
27847+ while (TEST_IP && TEST_OP) {
27848+ t = *ip++;
27849+ if (t >= 16)
27850+ goto match;
27851+ if (t == 0) {
27852+ NEED_IP(1);
27853+ while (*ip == 0) {
27854+ t += 255;
27855+ ip++;
27856+ NEED_IP(1);
27857+ }
27858+ t += 15 + *ip++;
27859+ }
27860+ assert("lzo-17", t > 0);
27861+ NEED_OP(t + 3);
27862+ NEED_IP(t + 4);
27863+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
27864+#if !defined(LZO_UNALIGNED_OK_4)
27865+ if (PTR_ALIGNED2_4(op, ip)) {
27866+#endif
27867+ COPY4(op, ip);
27868+ op += 4;
27869+ ip += 4;
27870+ if (--t > 0) {
27871+ if (t >= 4) {
27872+ do {
27873+ COPY4(op, ip);
27874+ op += 4;
27875+ ip += 4;
27876+ t -= 4;
27877+ } while (t >= 4);
27878+ if (t > 0)
27879+ do
27880+ *op++ = *ip++;
27881+ while (--t > 0);
27882+ } else
27883+ do
27884+ *op++ = *ip++;
27885+ while (--t > 0);
27886+ }
27887+#if !defined(LZO_UNALIGNED_OK_4)
27888+ } else
27889+#endif
27890+#endif
27891+#if !defined(LZO_UNALIGNED_OK_4)
27892+ {
27893+ *op++ = *ip++;
27894+ *op++ = *ip++;
27895+ *op++ = *ip++;
27896+ do
27897+ *op++ = *ip++;
27898+ while (--t > 0);
27899+ }
27900+#endif
27901+
27902+ first_literal_run:
27903+
27904+ t = *ip++;
27905+ if (t >= 16)
27906+ goto match;
27907+#if defined(COPY_DICT)
27908+#if defined(LZO1Z)
27909+ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
27910+ last_m_off = m_off;
27911+#else
27912+ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
27913+#endif
27914+ NEED_OP(3);
27915+ t = 3;
27916+ COPY_DICT(t, m_off)
27917+#else
27918+#if defined(LZO1Z)
27919+ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
27920+ m_pos = op - t;
27921+ last_m_off = t;
27922+#else
27923+ m_pos = op - (1 + M2_MAX_OFFSET);
27924+ m_pos -= t >> 2;
27925+ m_pos -= *ip++ << 2;
27926+#endif
27927+ TEST_LOOKBEHIND(m_pos, out);
27928+ NEED_OP(3);
27929+ *op++ = *m_pos++;
27930+ *op++ = *m_pos++;
27931+ *op++ = *m_pos;
27932+#endif
27933+ goto match_done;
27934+
27935+ while (TEST_IP && TEST_OP) {
27936+ match:
27937+ if (t >= 64) {
27938+#if defined(COPY_DICT)
27939+#if defined(LZO1X)
27940+ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
27941+ t = (t >> 5) - 1;
27942+#elif defined(LZO1Y)
27943+ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
27944+ t = (t >> 4) - 3;
27945+#elif defined(LZO1Z)
27946+ m_off = t & 0x1f;
27947+ if (m_off >= 0x1c)
27948+ m_off = last_m_off;
27949+ else {
27950+ m_off = 1 + (m_off << 6) + (*ip++ >> 2);
27951+ last_m_off = m_off;
27952+ }
27953+ t = (t >> 5) - 1;
27954+#endif
27955+#else
27956+#if defined(LZO1X)
27957+ m_pos = op - 1;
27958+ m_pos -= (t >> 2) & 7;
27959+ m_pos -= *ip++ << 3;
27960+ t = (t >> 5) - 1;
27961+#elif defined(LZO1Y)
27962+ m_pos = op - 1;
27963+ m_pos -= (t >> 2) & 3;
27964+ m_pos -= *ip++ << 2;
27965+ t = (t >> 4) - 3;
27966+#elif defined(LZO1Z)
27967+ {
27968+ lzo_uint off = t & 0x1f;
27969+ m_pos = op;
27970+ if (off >= 0x1c) {
27971+					assert("lzo-23", last_m_off > 0);
27972+ m_pos -= last_m_off;
27973+ } else {
27974+ off =
27975+ 1 + (off << 6) +
27976+ (*ip++ >> 2);
27977+ m_pos -= off;
27978+ last_m_off = off;
27979+ }
27980+ }
27981+ t = (t >> 5) - 1;
27982+#endif
27983+ TEST_LOOKBEHIND(m_pos, out);
27984+ assert("lzo-18", t > 0);
27985+ NEED_OP(t + 3 - 1);
27986+ goto copy_match;
27987+#endif
27988+ } else if (t >= 32) {
27989+ t &= 31;
27990+ if (t == 0) {
27991+ NEED_IP(1);
27992+ while (*ip == 0) {
27993+ t += 255;
27994+ ip++;
27995+ NEED_IP(1);
27996+ }
27997+ t += 31 + *ip++;
27998+ }
27999+#if defined(COPY_DICT)
28000+#if defined(LZO1Z)
28001+ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28002+ last_m_off = m_off;
28003+#else
28004+ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28005+#endif
28006+#else
28007+#if defined(LZO1Z)
28008+ {
28009+ lzo_uint off =
28010+ 1 + (ip[0] << 6) + (ip[1] >> 2);
28011+ m_pos = op - off;
28012+ last_m_off = off;
28013+ }
28014+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28015+ m_pos = op - 1;
28016+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28017+#else
28018+ m_pos = op - 1;
28019+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28020+#endif
28021+#endif
28022+ ip += 2;
28023+ } else if (t >= 16) {
28024+#if defined(COPY_DICT)
28025+ m_off = (t & 8) << 11;
28026+#else
28027+ m_pos = op;
28028+ m_pos -= (t & 8) << 11;
28029+#endif
28030+ t &= 7;
28031+ if (t == 0) {
28032+ NEED_IP(1);
28033+ while (*ip == 0) {
28034+ t += 255;
28035+ ip++;
28036+ NEED_IP(1);
28037+ }
28038+ t += 7 + *ip++;
28039+ }
28040+#if defined(COPY_DICT)
28041+#if defined(LZO1Z)
28042+ m_off += (ip[0] << 6) + (ip[1] >> 2);
28043+#else
28044+ m_off += (ip[0] >> 2) + (ip[1] << 6);
28045+#endif
28046+ ip += 2;
28047+ if (m_off == 0)
28048+ goto eof_found;
28049+ m_off += 0x4000;
28050+#if defined(LZO1Z)
28051+ last_m_off = m_off;
28052+#endif
28053+#else
28054+#if defined(LZO1Z)
28055+ m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28056+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28057+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28058+#else
28059+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28060+#endif
28061+ ip += 2;
28062+ if (m_pos == op)
28063+ goto eof_found;
28064+ m_pos -= 0x4000;
28065+#if defined(LZO1Z)
28066+ last_m_off = op - m_pos;
28067+#endif
28068+#endif
28069+ } else {
28070+#if defined(COPY_DICT)
28071+#if defined(LZO1Z)
28072+ m_off = 1 + (t << 6) + (*ip++ >> 2);
28073+ last_m_off = m_off;
28074+#else
28075+ m_off = 1 + (t >> 2) + (*ip++ << 2);
28076+#endif
28077+ NEED_OP(2);
28078+ t = 2;
28079+ COPY_DICT(t, m_off)
28080+#else
28081+#if defined(LZO1Z)
28082+ t = 1 + (t << 6) + (*ip++ >> 2);
28083+ m_pos = op - t;
28084+ last_m_off = t;
28085+#else
28086+ m_pos = op - 1;
28087+ m_pos -= t >> 2;
28088+ m_pos -= *ip++ << 2;
28089+#endif
28090+ TEST_LOOKBEHIND(m_pos, out);
28091+ NEED_OP(2);
28092+ *op++ = *m_pos++;
28093+ *op++ = *m_pos;
28094+#endif
28095+ goto match_done;
28096+ }
28097+
28098+#if defined(COPY_DICT)
28099+
28100+ NEED_OP(t + 3 - 1);
28101+ t += 3 - 1;
28102+ COPY_DICT(t, m_off)
28103+#else
28104+
28105+ TEST_LOOKBEHIND(m_pos, out);
28106+ assert("lzo-19", t > 0);
28107+ NEED_OP(t + 3 - 1);
28108+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28109+#if !defined(LZO_UNALIGNED_OK_4)
28110+ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28111+		assert("lzo-24", (op - m_pos) >= 4);
28112+#else
28113+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28114+#endif
28115+ COPY4(op, m_pos);
28116+ op += 4;
28117+ m_pos += 4;
28118+ t -= 4 - (3 - 1);
28119+ do {
28120+ COPY4(op, m_pos);
28121+ op += 4;
28122+ m_pos += 4;
28123+ t -= 4;
28124+ } while (t >= 4);
28125+ if (t > 0)
28126+ do
28127+ *op++ = *m_pos++;
28128+ while (--t > 0);
28129+ } else
28130+#endif
28131+ {
28132+ copy_match:
28133+ *op++ = *m_pos++;
28134+ *op++ = *m_pos++;
28135+ do
28136+ *op++ = *m_pos++;
28137+ while (--t > 0);
28138+ }
28139+
28140+#endif
28141+
28142+ match_done:
28143+#if defined(LZO1Z)
28144+ t = ip[-1] & 3;
28145+#else
28146+ t = ip[-2] & 3;
28147+#endif
28148+ if (t == 0)
28149+ break;
28150+
28151+ match_next:
28152+ assert("lzo-20", t > 0);
28153+ NEED_OP(t);
28154+ NEED_IP(t + 1);
28155+ do
28156+ *op++ = *ip++;
28157+ while (--t > 0);
28158+ t = *ip++;
28159+ }
28160+ }
28161+
28162+#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28163+ *out_len = op - out;
28164+ return LZO_E_EOF_NOT_FOUND;
28165+#endif
28166+
28167+ eof_found:
28168+ assert("lzo-21", t == 1);
28169+ *out_len = op - out;
28170+ return (ip == ip_end ? LZO_E_OK :
28171+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28172+
28173+#if defined(HAVE_NEED_IP)
28174+ input_overrun:
28175+ *out_len = op - out;
28176+ return LZO_E_INPUT_OVERRUN;
28177+#endif
28178+
28179+#if defined(HAVE_NEED_OP)
28180+ output_overrun:
28181+ *out_len = op - out;
28182+ return LZO_E_OUTPUT_OVERRUN;
28183+#endif
28184+
28185+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28186+ lookbehind_overrun:
28187+ *out_len = op - out;
28188+ return LZO_E_LOOKBEHIND_OVERRUN;
28189+#endif
28190+}
28191+
28192+#define LZO_TEST_DECOMPRESS_OVERRUN
28193+#undef DO_DECOMPRESS
28194+#define DO_DECOMPRESS lzo1x_decompress_safe
28195+
28196+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28197+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28198+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28199+# endif
28200+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28201+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28202+# endif
28203+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28204+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28205+# endif
28206+#endif
28207+
28208+#undef TEST_IP
28209+#undef TEST_OP
28210+#undef TEST_LOOKBEHIND
28211+#undef NEED_IP
28212+#undef NEED_OP
28213+#undef HAVE_TEST_IP
28214+#undef HAVE_TEST_OP
28215+#undef HAVE_NEED_IP
28216+#undef HAVE_NEED_OP
28217+#undef HAVE_ANY_IP
28218+#undef HAVE_ANY_OP
28219+
28220+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28221+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28222+# define TEST_IP (ip < ip_end)
28223+# endif
28224+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28225+# define NEED_IP(x) \
28226+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28227+# endif
28228+#endif
28229+
28230+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28231+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28232+# define TEST_OP (op <= op_end)
28233+# endif
28234+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28235+# undef TEST_OP
28236+# define NEED_OP(x) \
28237+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28238+# endif
28239+#endif
28240+
28241+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28242+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28243+#else
28244+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28245+#endif
28246+
28247+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28248+# define TEST_IP (ip < ip_end)
28249+#endif
28250+
28251+#if defined(TEST_IP)
28252+# define HAVE_TEST_IP
28253+#else
28254+# define TEST_IP 1
28255+#endif
28256+#if defined(TEST_OP)
28257+# define HAVE_TEST_OP
28258+#else
28259+# define TEST_OP 1
28260+#endif
28261+
28262+#if defined(NEED_IP)
28263+# define HAVE_NEED_IP
28264+#else
28265+# define NEED_IP(x) ((void) 0)
28266+#endif
28267+#if defined(NEED_OP)
28268+# define HAVE_NEED_OP
28269+#else
28270+# define NEED_OP(x) ((void) 0)
28271+#endif
28272+
28273+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28274+# define HAVE_ANY_IP
28275+#endif
28276+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28277+# define HAVE_ANY_OP
28278+#endif
28279+
28280+#undef __COPY4
28281+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28282+
28283+#undef COPY4
28284+#if defined(LZO_UNALIGNED_OK_4)
28285+# define COPY4(dst,src) __COPY4(dst,src)
28286+#elif defined(LZO_ALIGNED_OK_4)
28287+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28288+#endif
28289+
28290+/***** End of minilzo.c *****/
28291diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.h linux-2.6.20/fs/reiser4/plugin/compress/minilzo.h
28292--- linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.h 1970-01-01 03:00:00.000000000 +0300
28293+++ linux-2.6.20/fs/reiser4/plugin/compress/minilzo.h 2007-05-06 14:50:43.754993222 +0400
28294@@ -0,0 +1,70 @@
28295+/* minilzo.h -- mini subset of the LZO real-time data compression library
28296+ adapted for the reiser4 compression transform plugin.
28297+
28298+ This file is part of the LZO real-time data compression library
28299+ and is not included in any proprietary licenses of reiser4.
28300+
28301+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28302+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28303+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28304+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28305+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28306+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28307+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28308+ All Rights Reserved.
28309+
28310+ The LZO library is free software; you can redistribute it and/or
28311+ modify it under the terms of the GNU General Public License as
28312+ published by the Free Software Foundation; either version 2 of
28313+ the License, or (at your option) any later version.
28314+
28315+ The LZO library is distributed in the hope that it will be useful,
28316+ but WITHOUT ANY WARRANTY; without even the implied warranty of
28317+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28318+ GNU General Public License for more details.
28319+
28320+ You should have received a copy of the GNU General Public License
28321+ along with the LZO library; see the file COPYING.
28322+ If not, write to the Free Software Foundation, Inc.,
28323+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28324+
28325+ Markus F.X.J. Oberhumer
28326+ <markus@oberhumer.com>
28327+ http://www.oberhumer.com/opensource/lzo/
28328+ */
28329+
28330+/*
28331+ * NOTE:
28332+ * the full LZO package can be found at
28333+ * http://www.oberhumer.com/opensource/lzo/
28334+ */
28335+
28336+#ifndef __MINILZO_H
28337+#define __MINILZO_H
28338+
28339+#define MINILZO_VERSION 0x1080
28340+
28341+#include "lzoconf.h"
28342+
28343+/* Memory required for the wrkmem parameter.
28344+ * When the required size is 0, you can also pass a NULL pointer.
28345+ */
28346+
28347+#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28348+#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28349+#define LZO1X_MEM_DECOMPRESS (0)
28350+
28351+/* compression */
28352+extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28353+ lzo_byte * dst, lzo_uintp dst_len,
28354+ lzo_voidp wrkmem);
28355+/* decompression */
28356+extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28357+ lzo_byte * dst, lzo_uintp dst_len,
28358+ lzo_voidp wrkmem /* NOT USED */);
28359+/* safe decompression with overrun testing */
28360+extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
28361+ lzo_byte * dst, lzo_uintp dst_len,
28362+ lzo_voidp wrkmem /* NOT USED */ );
28363+
28364+#endif /* already included */
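+
+/* Usage sketch (an editor's illustration, not part of the original header;
+ * buffer names are placeholders). Compression needs an
+ * LZO1X_1_MEM_COMPRESS-byte work buffer; decompression needs none, since
+ * LZO1X_MEM_DECOMPRESS is 0 and wrkmem may then be NULL:
+ *
+ *         lzo_uint dlen = dst_size;
+ *         void *wrkmem = vmalloc(LZO1X_1_MEM_COMPRESS);
+ *
+ *         if (wrkmem == NULL)
+ *                 return -ENOMEM;
+ *         ret = lzo1x_1_compress(src, slen, dst, &dlen, wrkmem);
+ *         vfree(wrkmem);
+ *
+ * and, as the mirror image with overrun checking, to inflate:
+ *
+ *         ret = lzo1x_decompress_safe(dst, dlen, out, &olen, NULL);
+ *
+ * Both return LZO_E_OK (0) on success.
+ */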
28365diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.20/fs/reiser4/plugin/crypto/cipher.c
28366--- linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
28367+++ linux-2.6.20/fs/reiser4/plugin/crypto/cipher.c 2007-05-06 14:50:43.754993222 +0400
28368@@ -0,0 +1,37 @@
28369+/* Copyright 2001, 2002, 2003 by Hans Reiser,
28370+ licensing governed by reiser4/README */
28371+/* Reiser4 cipher transform plugins */
28372+
28373+#include "../../debug.h"
28374+#include "../plugin.h"
28375+
28376+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
28377+ [NONE_CIPHER_ID] = {
28378+ .h = {
28379+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
28380+ .id = NONE_CIPHER_ID,
28381+ .pops = NULL,
28382+ .label = "none",
28383+ .desc = "no cipher transform",
28384+ .linkage = {NULL, NULL}
28385+ },
28386+ .alloc = NULL,
28387+ .free = NULL,
28388+ .scale = NULL,
28389+ .align_stream = NULL,
28390+ .setkey = NULL,
28391+ .encrypt = NULL,
28392+ .decrypt = NULL
28393+ }
28394+};
28395+
28396+/* Make Linus happy.
28397+ Local variables:
28398+ c-indentation-style: "K&R"
28399+ mode-name: "LC"
28400+ c-basic-offset: 8
28401+ tab-width: 8
28402+ fill-column: 120
28403+ scroll-step: 1
28404+ End:
28405+*/
28406diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.20/fs/reiser4/plugin/crypto/cipher.h
28407--- linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
28408+++ linux-2.6.20/fs/reiser4/plugin/crypto/cipher.h 2007-05-06 14:50:43.754993222 +0400
28409@@ -0,0 +1,55 @@
28410+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28411+/* This file contains definitions for the objects operated on
28412+   by the reiser4 key manager, which is something like a keyring
28413+   wrapped by an appropriate reiser4 plugin */
28414+
28415+#if !defined( __FS_REISER4_CRYPT_H__ )
28416+#define __FS_REISER4_CRYPT_H__
28417+
28418+#include <linux/crypto.h>
28419+
28420+/* key info imported from user space */
28421+typedef struct crypto_data {
28422+ int keysize; /* uninstantiated key size */
28423+ __u8 * key; /* uninstantiated key */
28424+ int keyid_size; /* size of passphrase */
28425+ __u8 * keyid; /* passphrase */
28426+} crypto_data_t;
28427+
28428+/* This object contains all the infrastructure needed to implement a
28429+   cipher transform. It is managed (allocated, inherited, validated,
28430+   bound to a host inode, etc.) by the reiser4 key manager.
28431+
28432+   This info can be allocated in two cases:
28433+   1. when importing a key from user space;
28434+   2. when reading an inode from disk */
28435+typedef struct crypto_stat {
28436+ struct inode * host;
28437+ struct crypto_hash * digest;
28438+ struct crypto_blkcipher * cipher;
28439+#if 0
28440+ cipher_key_plugin * kplug; /* key manager */
28441+#endif
28442+ __u8 * keyid; /* key fingerprint, created by digest plugin,
28443+ using uninstantiated key and passphrase.
28444+ supposed to be stored in disk stat-data */
28445+ int inst; /* this indicates if the cipher key is
28446+ instantiated (case 1 above) */
28447+ int keysize; /* uninstantiated key size (bytes), supposed
28448+ to be stored in disk stat-data */
28449+	int keyload_count; /* number of objects which have this
28450+			      crypto-stat attached */
28451+} crypto_stat_t;
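+
+/* A minimal sketch (editor's illustration; "cbc(aes)" is an example
+ * algorithm, not a stated reiser4 default) of how the two transform
+ * handles above are obtained from the 2.6.20 crypto API, where @info is
+ * a crypto_stat_t and @data the crypto_data_t imported from user space:
+ *
+ *         info->cipher = crypto_alloc_blkcipher("cbc(aes)", 0,
+ *                                               CRYPTO_ALG_ASYNC);
+ *         info->digest = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
+ *         if (IS_ERR(info->cipher) || IS_ERR(info->digest))
+ *                 goto error;
+ *         crypto_blkcipher_setkey(info->cipher, data->key, data->keysize);
+ */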
28452+
28453+#endif /* __FS_REISER4_CRYPT_H__ */
28454+
28455+/*
28456+ Local variables:
28457+ c-indentation-style: "K&R"
28458+ mode-name: "LC"
28459+ c-basic-offset: 8
28460+ tab-width: 8
28461+ fill-column: 120
28462+ scroll-step: 1
28463+ End:
28464+*/
28465diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.20/fs/reiser4/plugin/crypto/digest.c
28466--- linux-2.6.20.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
28467+++ linux-2.6.20/fs/reiser4/plugin/crypto/digest.c 2007-05-06 14:50:43.754993222 +0400
28468@@ -0,0 +1,58 @@
28469+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28470+
28471+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
28472+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
28473+#include "../../debug.h"
28474+#include "../plugin_header.h"
28475+#include "../plugin.h"
28476+#include "../file/cryptcompress.h"
28477+
28478+#include <linux/types.h>
28479+
28480+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
28481+
28482+static struct crypto_hash * alloc_sha256 (void)
28483+{
28484+#if REISER4_SHA256
28485+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
28486+#else
28487+ warning("edward-1418", "sha256 unsupported");
28488+ return ERR_PTR(-EINVAL);
28489+#endif
28490+}
28491+
28492+static void free_sha256 (struct crypto_hash * tfm)
28493+{
28494+#if REISER4_SHA256
28495+ crypto_free_hash(tfm);
28496+#endif
28497+ return;
28498+}
28499+
28500+/* digest plugins */
28501+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
28502+ [SHA256_32_DIGEST_ID] = {
28503+ .h = {
28504+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
28505+ .id = SHA256_32_DIGEST_ID,
28506+ .pops = NULL,
28507+ .label = "sha256_32",
28508+ .desc = "sha256_32 digest transform",
28509+ .linkage = {NULL, NULL}
28510+ },
28511+ .fipsize = sizeof(__u32),
28512+ .alloc = alloc_sha256,
28513+ .free = free_sha256
28514+ }
28515+};
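+
+/* A sketch (editor's illustration; buffer names are hypothetical and
+ * error handling is omitted) of computing a key fingerprint through this
+ * plugin with the 2.6.20 hash API; only the first .fipsize bytes of the
+ * digest are kept:
+ *
+ *         struct crypto_hash *tfm = digest_plugins[SHA256_32_DIGEST_ID].alloc();
+ *         struct hash_desc desc = { .tfm = tfm, .flags = 0 };
+ *         struct scatterlist sg;
+ *         u8 out[32];
+ *
+ *         sg_init_one(&sg, buf, len);
+ *         crypto_hash_digest(&desc, &sg, len, out);
+ *         memcpy(fip, out, digest_plugins[SHA256_32_DIGEST_ID].fipsize);
+ *         free_sha256(tfm);
+ */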
28516+
28517+/*
28518+ Local variables:
28519+ c-indentation-style: "K&R"
28520+ mode-name: "LC"
28521+ c-basic-offset: 8
28522+ tab-width: 8
28523+ fill-column: 120
28524+ scroll-step: 1
28525+ End:
28526+*/
28527diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.20/fs/reiser4/plugin/dir/dir.h
28528--- linux-2.6.20.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
28529+++ linux-2.6.20/fs/reiser4/plugin/dir/dir.h 2007-05-06 14:50:43.754993222 +0400
28530@@ -0,0 +1,36 @@
28531+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28532+ * reiser4/README */
28533+
28534+/* this file contains declarations of methods implementing directory plugins */
28535+
28536+#if !defined( __REISER4_DIR_H__ )
28537+#define __REISER4_DIR_H__
28538+
28539+/*#include "../../key.h"
28540+
28541+#include <linux/fs.h>*/
28542+
28543+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
28544+
28545+/* "hashed" directory methods of dir plugin */
28546+void build_entry_key_hashed(const struct inode *, const struct qstr *,
28547+ reiser4_key *);
28548+
28549+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
28550+
28551+/* "seekable" directory methods of dir plugin */
28552+void build_entry_key_seekable(const struct inode *, const struct qstr *,
28553+ reiser4_key *);
28554+
28555+/* __REISER4_DIR_H__ */
28556+#endif
28557+
28558+/*
28559+ Local variables:
28560+ c-indentation-style: "K&R"
28561+ mode-name: "LC"
28562+ c-basic-offset: 8
28563+ tab-width: 8
28564+ fill-column: 120
28565+ End:
28566+*/
28567diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.20/fs/reiser4/plugin/dir/hashed_dir.c
28568--- linux-2.6.20.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
28569+++ linux-2.6.20/fs/reiser4/plugin/dir/hashed_dir.c 2007-05-06 14:50:43.754993222 +0400
28570@@ -0,0 +1,81 @@
28571+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28572+ * reiser4/README */
28573+
28574+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
28575+ names to the files. */
28576+
28577+/*
28578+ * A hashed directory logically consists of persistent directory
28579+ * entries. A directory entry is a pair of a file name and the stat-data key
28580+ * of the file that has this name in the given directory.
28581+ *
28582+ * Directory entries are stored in the tree in the form of directory
28583+ * items. Directory item should implement dir_entry_ops portion of item plugin
28584+ * interface (see plugin/item/item.h). Hashed directory interacts with
28585+ * directory item plugin exclusively through dir_entry_ops operations.
28586+ *
28587+ * Currently there are two implementations of directory items: "simple
28588+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
28589+ * (plugin/item/cde.[ch]) with the latter being the default.
28590+ *
28591+ * There is, however, one delicate way in which directory code interacts with
28592+ * the item plugin: key assignment policy. A key for a directory item is chosen
28593+ * by the directory code and, as described in kassign.c, this key contains a
28594+ * portion of the file name. The directory item uses this knowledge to avoid
28595+ * storing that portion of the file name twice: in the key and in the item body.
28596+ *
28597+ */
28598+
28599+#include "../../inode.h"
28600+
28601+void complete_entry_key(const struct inode *, const char *name,
28602+ int len, reiser4_key * result);
28603+
28604+/* this is implementation of build_entry_key method of dir
28605+ plugin for HASHED_DIR_PLUGIN_ID
28606+ */
28607+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
28608+ * (or will be) in.*/
28609+ const struct qstr *qname, /* name of file referenced
28610+ * by this entry */
28611+ reiser4_key * result /* resulting key of directory
28612+ * entry */ )
28613+{
28614+ const char *name;
28615+ int len;
28616+
28617+ assert("nikita-1139", dir != NULL);
28618+ assert("nikita-1140", qname != NULL);
28619+ assert("nikita-1141", qname->name != NULL);
28620+ assert("nikita-1142", result != NULL);
28621+
28622+ name = qname->name;
28623+ len = qname->len;
28624+
28625+ assert("nikita-2867", strlen(name) == len);
28626+
28627+ reiser4_key_init(result);
28628+ /* locality of directory entry's key is objectid of parent
28629+ directory */
28630+ set_key_locality(result, get_inode_oid(dir));
28631+ /* minor packing locality is constant */
28632+ set_key_type(result, KEY_FILE_NAME_MINOR);
28633+	/* dot is a special case---we always want it to be the first entry
28634+	   in a directory. In fact, we just want it to have the smallest
28635+	   directory entry key.
28636+	 */
28637+ if (len == 1 && name[0] == '.')
28638+ return;
28639+
28640+ /* initialize part of entry key which depends on file name */
28641+ complete_entry_key(dir, name, len, result);
28642+}
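+
+/* For example (an illustration of the calling convention, nothing new):
+ * keying the entry "foo" in @dir goes as
+ *
+ *         reiser4_key key;
+ *         struct qstr qname = { .name = "foo", .len = 3 };
+ *
+ *         build_entry_key_hashed(dir, &qname, &key);
+ *
+ * which yields a key whose locality is get_inode_oid(dir), whose type is
+ * KEY_FILE_NAME_MINOR, and whose remaining fields are derived from the
+ * name by complete_entry_key(). All entries of one directory therefore
+ * share a key prefix and sort together in the tree.
+ */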
28643+
28644+/* Local variables:
28645+ c-indentation-style: "K&R"
28646+ mode-name: "LC"
28647+ c-basic-offset: 8
28648+ tab-width: 8
28649+ fill-column: 120
28650+ End:
28651+*/
28652diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.20/fs/reiser4/plugin/dir/Makefile
28653--- linux-2.6.20.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
28654+++ linux-2.6.20/fs/reiser4/plugin/dir/Makefile 2007-05-06 14:50:43.758994472 +0400
28655@@ -0,0 +1,5 @@
28656+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
28657+
28658+dir_plugins-objs := \
28659+ hashed_dir.o \
28660+ seekable_dir.o
28661diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.20/fs/reiser4/plugin/dir/seekable_dir.c
28662--- linux-2.6.20.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
28663+++ linux-2.6.20/fs/reiser4/plugin/dir/seekable_dir.c 2007-05-06 14:50:43.758994472 +0400
28664@@ -0,0 +1,46 @@
28665+/* Copyright 2005 by Hans Reiser, licensing governed by
28666+ * reiser4/README */
28667+
28668+#include "../../inode.h"
28669+
28670+/* this is implementation of build_entry_key method of dir
28671+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
28672+ This is for directories where we want repeatable and restartable readdir()
28673+   even in the case of a 32-bit user-level struct dirent (readdir(3)).
28674+*/
28675+void
28676+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
28677+ reiser4_key * result)
28678+{
28679+ oid_t objectid;
28680+
28681+ assert("nikita-2283", dir != NULL);
28682+ assert("nikita-2284", name != NULL);
28683+ assert("nikita-2285", name->name != NULL);
28684+ assert("nikita-2286", result != NULL);
28685+
28686+ reiser4_key_init(result);
28687+ /* locality of directory entry's key is objectid of parent
28688+ directory */
28689+ set_key_locality(result, get_inode_oid(dir));
28690+ /* minor packing locality is constant */
28691+ set_key_type(result, KEY_FILE_NAME_MINOR);
28692+ /* dot is special case---we always want it to be first entry in
28693+ a directory. Actually, we just want to have smallest
28694+ directory entry.
28695+ */
28696+ if ((name->len == 1) && (name->name[0] == '.'))
28697+ return;
28698+
28699+ /* objectid of key is 31 lowest bits of hash. */
28700+ objectid =
28701+ inode_hash_plugin(dir)->hash(name->name,
28702+ (int)name->len) & 0x7fffffff;
28703+
28704+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
28705+ set_key_objectid(result, objectid);
28706+
28707+ /* offset is always 0. */
28708+ set_key_offset(result, (__u64) 0);
28709+ return;
28710+}
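+
+/* Worked example (editor's illustration): a hash value of 0x9abcdef0
+ * masked with 0x7fffffff gives objectid 0x1abcdef0. Keeping only the low
+ * 31 bits guarantees that the objectid, which doubles as the readdir
+ * position, fits in a positive 32-bit offset, so 32-bit userland can
+ * telldir()/seekdir() reliably; the price is roughly double the
+ * collision rate of a full 32-bit hash.
+ */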
28711diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.20/fs/reiser4/plugin/dir_plugin_common.c
28712--- linux-2.6.20.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
28713+++ linux-2.6.20/fs/reiser4/plugin/dir_plugin_common.c 2007-05-06 14:50:43.758994472 +0400
28714@@ -0,0 +1,872 @@
28715+/* Copyright 2005 by Hans Reiser, licensing governed by
28716+ reiser4/README */
28717+
28718+/* this file contains typical implementations for most methods of the
28719+   directory plugin
28720+*/
28721+
28722+#include "../inode.h"
28723+
28724+int reiser4_find_entry(struct inode *dir, struct dentry *name,
28725+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
28726+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
28727+void check_light_weight(struct inode *inode, struct inode *parent);
28728+
28729+/* this is the common implementation of the get_parent method of the dir
28730+   plugin. It is used by the NFS kernel server to "climb" up the directory
28731+   tree to check permissions.
28732+ */
28733+struct dentry *get_parent_common(struct inode *child)
28734+{
28735+ struct super_block *s;
28736+ struct inode *parent;
28737+ struct dentry dotdot;
28738+ struct dentry *dentry;
28739+ reiser4_key key;
28740+ int result;
28741+
28742+ /*
28743+ * lookup dotdot entry.
28744+ */
28745+
28746+ s = child->i_sb;
28747+ memset(&dotdot, 0, sizeof(dotdot));
28748+ dotdot.d_name.name = "..";
28749+ dotdot.d_name.len = 2;
28750+ dotdot.d_op = &get_super_private(s)->ops.dentry;
28751+
28752+ result = reiser4_lookup_name(child, &dotdot, &key);
28753+ if (result != 0)
28754+ return ERR_PTR(result);
28755+
28756+ parent = reiser4_iget(s, &key, 1);
28757+ if (!IS_ERR(parent)) {
28758+ /*
28759+ * FIXME-NIKITA dubious: attributes are inherited from @child
28760+ * to @parent. But:
28761+ *
28762+		 * (*) this is the only thing we can do
28763+ *
28764+ * (*) attributes of light-weight object are inherited
28765+ * from a parent through which object was looked up first,
28766+ * so it is ambiguous anyway.
28767+ *
28768+ */
28769+ check_light_weight(parent, child);
28770+ reiser4_iget_complete(parent);
28771+ dentry = d_alloc_anon(parent);
28772+ if (dentry == NULL) {
28773+ iput(parent);
28774+ dentry = ERR_PTR(RETERR(-ENOMEM));
28775+ } else
28776+ dentry->d_op = &get_super_private(s)->ops.dentry;
28777+ } else if (PTR_ERR(parent) == -ENOENT)
28778+ dentry = ERR_PTR(RETERR(-ESTALE));
28779+ else
28780+ dentry = (void *)parent;
28781+ return dentry;
28782+}
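+
+/* An editor's note on the wiring (a sketch, not verbatim reiser4 code):
+ * nfsd reaches this through the dir plugin when reconnecting a
+ * disconnected dentry obtained from a file handle, roughly as
+ *
+ *         struct dentry *parent = inode_dir_plugin(child)->get_parent(child);
+ */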
28783+
28784+/* this is common implementation of is_name_acceptable method of dir
28785+ plugin
28786+ */
28787+int is_name_acceptable_common(const struct inode *inode, /* directory to check */
28788+ const char *name UNUSED_ARG, /* name to check */
28789+ int len /* @name's length */ )
28790+{
28791+ assert("nikita-733", inode != NULL);
28792+ assert("nikita-734", name != NULL);
28793+ assert("nikita-735", len > 0);
28794+
28795+ return len <= reiser4_max_filename_len(inode);
28796+}
28797+
28798+/* there is no common implementation of build_entry_key method of dir
28799+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
28800+   plugin/dir/seekable_dir.c:build_entry_key_seekable() for examples
28801+*/
28802+
28803+/* this is common implementation of build_readdir_key method of dir
28804+ plugin
28805+ see reiser4_readdir_common for more details
28806+*/
28807+int build_readdir_key_common(struct file *dir /* directory being read */ ,
28808+ reiser4_key * result /* where to store key */ )
28809+{
28810+ reiser4_file_fsdata *fdata;
28811+ struct inode *inode;
28812+
28813+ assert("nikita-1361", dir != NULL);
28814+ assert("nikita-1362", result != NULL);
28815+ assert("nikita-1363", dir->f_dentry != NULL);
28816+ inode = dir->f_dentry->d_inode;
28817+ assert("nikita-1373", inode != NULL);
28818+
28819+ fdata = reiser4_get_file_fsdata(dir);
28820+ if (IS_ERR(fdata))
28821+ return PTR_ERR(fdata);
28822+ assert("nikita-1364", fdata != NULL);
28823+ return extract_key_from_de_id(get_inode_oid(inode),
28824+ &fdata->dir.readdir.position.
28825+ dir_entry_key, result);
28826+
28827+}
28828+
28829+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
28830+ int adj);
28831+
28832+/* this is common implementation of add_entry method of dir plugin
28833+*/
28834+int reiser4_add_entry_common(struct inode *object, /* directory to add new name
28835+ * in */
28836+ struct dentry *where, /* new name */
28837+ reiser4_object_create_data * data, /* parameters of
28838+ * new object */
28839+ reiser4_dir_entry_desc * entry /* parameters of
28840+ * new directory
28841+ * entry */)
28842+{
28843+ int result;
28844+ coord_t *coord;
28845+ lock_handle lh;
28846+ reiser4_dentry_fsdata *fsdata;
28847+ reiser4_block_nr reserve;
28848+
28849+ assert("nikita-1114", object != NULL);
28850+ assert("nikita-1250", where != NULL);
28851+
28852+ fsdata = reiser4_get_dentry_fsdata(where);
28853+ if (unlikely(IS_ERR(fsdata)))
28854+ return PTR_ERR(fsdata);
28855+
28856+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
28857+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
28858+ return RETERR(-ENOSPC);
28859+
28860+ init_lh(&lh);
28861+ coord = &fsdata->dec.entry_coord;
28862+ coord_clear_iplug(coord);
28863+
28864+ /* check for this entry in a directory. This is plugin method. */
28865+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
28866+ entry);
28867+ if (likely(result == -ENOENT)) {
28868+ /* add new entry. Just pass control to the directory
28869+ item plugin. */
28870+ assert("nikita-1709", inode_dir_item_plugin(object));
28871+ assert("nikita-2230", coord->node == lh.node);
28872+ reiser4_seal_done(&fsdata->dec.entry_seal);
28873+ result =
28874+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
28875+ coord, &lh,
28876+ where,
28877+ entry);
28878+ if (result == 0) {
28879+ reiser4_adjust_dir_file(object, where,
28880+ fsdata->dec.pos + 1, +1);
28881+ INODE_INC_FIELD(object, i_size);
28882+ }
28883+ } else if (result == 0) {
28884+ assert("nikita-2232", coord->node == lh.node);
28885+ result = RETERR(-EEXIST);
28886+ }
28887+ done_lh(&lh);
28888+
28889+ return result;
28890+}
28891+
28892+/**
28893+ * rem_entry - remove entry from directory item
28894+ * @dir:
28895+ * @dentry:
28896+ * @entry:
28897+ * @coord:
28898+ * @lh:
28899+ *
28900+ * Checks that coordinate @coord is set properly and calls item plugin
28901+ * method to cut entry.
28902+ */
28903+static int
28904+rem_entry(struct inode *dir, struct dentry *dentry,
28905+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
28906+{
28907+ item_plugin *iplug;
28908+ struct inode *child;
28909+
28910+ iplug = inode_dir_item_plugin(dir);
28911+ child = dentry->d_inode;
28912+ assert("nikita-3399", child != NULL);
28913+
28914+ /* check that we are really destroying an entry for @child */
28915+ if (REISER4_DEBUG) {
28916+ int result;
28917+ reiser4_key key;
28918+
28919+ result = iplug->s.dir.extract_key(coord, &key);
28920+ if (result != 0)
28921+ return result;
28922+ if (get_key_objectid(&key) != get_inode_oid(child)) {
28923+ warning("nikita-3397",
28924+ "rem_entry: %#llx != %#llx\n",
28925+ get_key_objectid(&key),
28926+ (unsigned long long)get_inode_oid(child));
28927+ return RETERR(-EIO);
28928+ }
28929+ }
28930+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
28931+}
28932+
28933+/**
28934+ * reiser4_rem_entry_common - remove entry from a directory
28935+ * @dir: directory to remove entry from
28936+ * @where: name that is being removed
28937+ * @entry: description of entry being removed
28938+ *
28939+ * This is common implementation of rem_entry method of dir plugin.
28940+ */
28941+int reiser4_rem_entry_common(struct inode *dir,
28942+ struct dentry *dentry,
28943+ reiser4_dir_entry_desc *entry)
28944+{
28945+ int result;
28946+ coord_t *coord;
28947+ lock_handle lh;
28948+ reiser4_dentry_fsdata *fsdata;
28949+ __u64 tograb;
28950+
28951+ assert("nikita-1124", dir != NULL);
28952+ assert("nikita-1125", dentry != NULL);
28953+
28954+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
28955+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
28956+ if (result != 0)
28957+ return RETERR(-ENOSPC);
28958+
28959+ init_lh(&lh);
28960+
28961+ /* check for this entry in a directory. This is plugin method. */
28962+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
28963+ fsdata = reiser4_get_dentry_fsdata(dentry);
28964+ if (IS_ERR(fsdata)) {
28965+ done_lh(&lh);
28966+ return PTR_ERR(fsdata);
28967+ }
28968+
28969+ coord = &fsdata->dec.entry_coord;
28970+
28971+ assert("nikita-3404",
28972+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
28973+ dir->i_size <= 1);
28974+
28975+ coord_clear_iplug(coord);
28976+ if (result == 0) {
28977+ /* remove entry. Just pass control to the directory item
28978+ plugin. */
28979+ assert("vs-542", inode_dir_item_plugin(dir));
28980+ reiser4_seal_done(&fsdata->dec.entry_seal);
28981+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
28982+ result =
28983+ WITH_COORD(coord,
28984+ rem_entry(dir, dentry, entry, coord, &lh));
28985+ if (result == 0) {
28986+ if (dir->i_size >= 1)
28987+ INODE_DEC_FIELD(dir, i_size);
28988+ else {
28989+ warning("nikita-2509", "Dir %llu is runt",
28990+ (unsigned long long)
28991+ get_inode_oid(dir));
28992+ result = RETERR(-EIO);
28993+ }
28994+
28995+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
28996+ dentry->d_inode->i_size != 2 ||
28997+ inode_dir_plugin(dentry->d_inode) == NULL);
28998+ }
28999+ }
29000+ done_lh(&lh);
29001+
29002+ return result;
29003+}
29004+
29005+static reiser4_block_nr estimate_init(struct inode *parent,
29006+ struct inode *object);
29007+static int create_dot_dotdot(struct inode *object, struct inode *parent);
29008+
29009+/* this is common implementation of init method of dir plugin
29010+ create "." and ".." entries
29011+*/
29012+int reiser4_dir_init_common(struct inode *object, /* new directory */
29013+ struct inode *parent, /* parent directory */
29014+ reiser4_object_create_data * data /* info passed
29015+ * to us, this
29016+ * is filled by
29017+ * reiser4()
29018+ * syscall in
29019+ * particular */)
29020+{
29021+ reiser4_block_nr reserve;
29022+
29023+ assert("nikita-680", object != NULL);
29024+ assert("nikita-681", S_ISDIR(object->i_mode));
29025+ assert("nikita-682", parent != NULL);
29026+ assert("nikita-684", data != NULL);
29027+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29028+ assert("nikita-687", object->i_mode & S_IFDIR);
29029+
29030+ reserve = estimate_init(parent, object);
29031+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29032+ return RETERR(-ENOSPC);
29033+
29034+ return create_dot_dotdot(object, parent);
29035+}
29036+
29037+/* this is common implementation of done method of dir plugin
29038+ remove "." entry
29039+*/
29040+int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
29041+{
29042+ int result;
29043+ reiser4_block_nr reserve;
29044+ struct dentry goodby_dots;
29045+ reiser4_dir_entry_desc entry;
29046+
29047+ assert("nikita-1449", object != NULL);
29048+
29049+ if (reiser4_inode_get_flag(object, REISER4_NO_SD))
29050+ return 0;
29051+
29052+ /* of course, this can be rewritten to sweep everything in one
29053+ reiser4_cut_tree(). */
29054+ memset(&entry, 0, sizeof entry);
29055+
29056+	/* FIXME: this done method is called from reiser4_delete_dir_common,
29057+	 * which has already reserved space */
29058+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29059+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29060+ return RETERR(-ENOSPC);
29061+
29062+ memset(&goodby_dots, 0, sizeof goodby_dots);
29063+ entry.obj = goodby_dots.d_inode = object;
29064+ goodby_dots.d_name.name = ".";
29065+ goodby_dots.d_name.len = 1;
29066+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29067+ reiser4_free_dentry_fsdata(&goodby_dots);
29068+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29069+ /* only worth a warning
29070+
29071+ "values of \ eB\ f will give rise to dom!\n"
29072+ -- v6src/s2/mv.c:89
29073+ */
29074+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
29075+ (unsigned long long)get_inode_oid(object), result);
29076+ return 0;
29077+}
29078+
29079+/* this is common implementation of attach method of dir plugin
29080+*/
29081+int reiser4_attach_common(struct inode *child UNUSED_ARG,
29082+ struct inode *parent UNUSED_ARG)
29083+{
29084+ assert("nikita-2647", child != NULL);
29085+ assert("nikita-2648", parent != NULL);
29086+
29087+ return 0;
29088+}
29089+
29090+/* this is common implementation of detach method of dir plugin
29091+ remove "..", decrease nlink on parent
29092+*/
29093+int reiser4_detach_common(struct inode *object, struct inode *parent)
29094+{
29095+ int result;
29096+ struct dentry goodby_dots;
29097+ reiser4_dir_entry_desc entry;
29098+
29099+ assert("nikita-2885", object != NULL);
29100+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
29101+
29102+ memset(&entry, 0, sizeof entry);
29103+
29104+ /* NOTE-NIKITA this only works if @parent is -the- parent of
29105+ @object, viz. object whose key is stored in dotdot
29106+ entry. Wouldn't work with hard-links on directories. */
29107+ memset(&goodby_dots, 0, sizeof goodby_dots);
29108+ entry.obj = goodby_dots.d_inode = parent;
29109+ goodby_dots.d_name.name = "..";
29110+ goodby_dots.d_name.len = 2;
29111+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29112+ reiser4_free_dentry_fsdata(&goodby_dots);
29113+ if (result == 0) {
29114+ /* the dot should be the only entry remaining at this time... */
29115+ assert("nikita-3400",
29116+ object->i_size == 1 && object->i_nlink <= 2);
29117+#if 0
29118+		/* and, together with the only name a directory can have, they
29119+		 * provide for the last 2 remaining references. If we get
29120+		 * here as part of error handling during mkdir, @object
29121+		 * possibly has no name yet, so its nlink == 1. If we get here
29122+		 * from rename (targeting an empty directory), it has already
29123+		 * lost its name, so its nlink == 1. */
29124+ assert("nikita-3401",
29125+ object->i_nlink == 2 || object->i_nlink == 1);
29126+#endif
29127+
29128+ /* decrement nlink of directory removed ".." pointed
29129+ to */
29130+ reiser4_del_nlink(parent, NULL, 0);
29131+ }
29132+ return result;
29133+}
29134+
29135+/* this is common implementation of estimate.add_entry method of
29136+ dir plugin
29137+ estimation of adding entry which supposes that entry is inserting a
29138+ unit into item
29139+*/
29140+reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29141+{
29142+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
29143+}
29144+
29145+/* this is common implementation of estimate.rem_entry method of dir
29146+ plugin
29147+*/
29148+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29149+{
29150+ return estimate_one_item_removal(reiser4_tree_by_inode(inode));
29151+}
29152+
29153+/* this is common implementation of estimate.unlink method of dir
29154+ plugin
29155+*/
29156+reiser4_block_nr
29157+dir_estimate_unlink_common(const struct inode * parent,
29158+ const struct inode * object)
29159+{
29160+ reiser4_block_nr res;
29161+
29162+ /* hashed_rem_entry(object) */
29163+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
29164+ /* del_nlink(parent) */
29165+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29166+
29167+ return res;
29168+}
29169+
29170+/*
29171+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29172+ * methods: if @inode is a light-weight file, set up its credentials,
29173+ * which in this case are not stored in the stat-data
29174+ */
29175+void check_light_weight(struct inode *inode, struct inode *parent)
29176+{
29177+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29178+ inode->i_uid = parent->i_uid;
29179+ inode->i_gid = parent->i_gid;
29180+		/* clear the light-weight flag. If the inode is later read
29181+		   under any other name, the [ug]id won't change. */
29182+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29183+ }
29184+}
29185+
29186+/* looks for the name specified in @dentry in directory @parent; if the name
29187+   is found, the key of the object the entry points to is stored in @key */
29188+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
29189+ * name in */
29190+ struct dentry *dentry, /* name to look for */
29191+ reiser4_key * key /* place to store key */ )
29192+{
29193+ int result;
29194+ coord_t *coord;
29195+ lock_handle lh;
29196+ const char *name;
29197+ int len;
29198+ reiser4_dir_entry_desc entry;
29199+ reiser4_dentry_fsdata *fsdata;
29200+
29201+ assert("nikita-1247", parent != NULL);
29202+ assert("nikita-1248", dentry != NULL);
29203+ assert("nikita-1123", dentry->d_name.name != NULL);
29204+ assert("vs-1486",
29205+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29206+
29207+ name = dentry->d_name.name;
29208+ len = dentry->d_name.len;
29209+
29210+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29211+ /* some arbitrary error code to return */
29212+ return RETERR(-ENAMETOOLONG);
29213+
29214+ fsdata = reiser4_get_dentry_fsdata(dentry);
29215+ if (IS_ERR(fsdata))
29216+ return PTR_ERR(fsdata);
29217+
29218+ coord = &fsdata->dec.entry_coord;
29219+ coord_clear_iplug(coord);
29220+ init_lh(&lh);
29221+
29222+ /* find entry in a directory. This is plugin method. */
29223+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
29224+ &entry);
29225+ if (result == 0) {
29226+ /* entry was found, extract object key from it. */
29227+ result =
29228+ WITH_COORD(coord,
29229+ item_plugin_by_coord(coord)->s.dir.
29230+ extract_key(coord, key));
29231+ }
29232+ done_lh(&lh);
29233+ return result;
29234+
29235+}
29236+
29237+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
29238+static reiser4_block_nr
29239+estimate_init(struct inode *parent, struct inode *object)
29240+{
29241+ reiser4_block_nr res = 0;
29242+
29243+ assert("vpf-321", parent != NULL);
29244+ assert("vpf-322", object != NULL);
29245+
29246+ /* hashed_add_entry(object) */
29247+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29248+ /* reiser4_add_nlink(object) */
29249+ res += inode_file_plugin(object)->estimate.update(object);
29250+ /* hashed_add_entry(object) */
29251+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29252+ /* reiser4_add_nlink(parent) */
29253+ res += inode_file_plugin(parent)->estimate.update(parent);
29254+
29255+	return res;
29256+}
29257+
29258+/* helper function for reiser4_dir_init_common(). Create "." and ".." */
29259+static int create_dot_dotdot(struct inode *object /* object to create dot and
29260+ * dotdot for */ ,
29261+ struct inode *parent /* parent of @object */)
29262+{
29263+ int result;
29264+ struct dentry dots_entry;
29265+ reiser4_dir_entry_desc entry;
29266+
29267+ assert("nikita-688", object != NULL);
29268+ assert("nikita-689", S_ISDIR(object->i_mode));
29269+ assert("nikita-691", parent != NULL);
29270+
29271+ /* We store dot and dotdot as normal directory entries. This is
29272+ not necessary, because almost all information stored in them
29273+ is already in the stat-data of directory, the only thing
29274+ being missed is objectid of grand-parent directory that can
29275+ easily be added there as extension.
29276+
29277+ But it is done the way it is done, because not storing dot
29278+ and dotdot will lead to the following complications:
29279+
29280+ . special case handling in ->lookup().
29281+ . addition of another extension to the sd.
29282+ . dependency on key allocation policy for stat data.
29283+
29284+ */
29285+
29286+ memset(&entry, 0, sizeof entry);
29287+ memset(&dots_entry, 0, sizeof dots_entry);
29288+ entry.obj = dots_entry.d_inode = object;
29289+ dots_entry.d_name.name = ".";
29290+ dots_entry.d_name.len = 1;
29291+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
29292+ reiser4_free_dentry_fsdata(&dots_entry);
29293+
29294+ if (result == 0) {
29295+ result = reiser4_add_nlink(object, object, 0);
29296+ if (result == 0) {
29297+ entry.obj = dots_entry.d_inode = parent;
29298+ dots_entry.d_name.name = "..";
29299+ dots_entry.d_name.len = 2;
29300+ result = reiser4_add_entry_common(object,
29301+ &dots_entry, NULL, &entry);
29302+ reiser4_free_dentry_fsdata(&dots_entry);
29303+ /* if creation of ".." failed, iput() will delete
29304+ object with ".". */
29305+ if (result == 0) {
29306+ result = reiser4_add_nlink(parent, object, 0);
29307+ if (result != 0)
29308+ /*
29309+ * if we failed to bump i_nlink, try
29310+ * to remove ".."
29311+ */
29312+ reiser4_detach_common(object, parent);
29313+ }
29314+ }
29315+ }
29316+
29317+ if (result != 0) {
29318+ /*
29319+ * in the case of error, at least update stat-data so that,
29320+ * ->i_nlink updates are not lingering.
29321+ */
29322+ reiser4_update_sd(object);
29323+ reiser4_update_sd(parent);
29324+ }
29325+
29326+ return result;
29327+}
29328+
29329+/*
29330+ * return 0 iff @coord contains a directory entry for the file with the name
29331+ * @name.
29332+ */
29333+static int
29334+check_item(const struct inode *dir, const coord_t * coord, const char *name)
29335+{
29336+ item_plugin *iplug;
29337+ char buf[DE_NAME_BUF_LEN];
29338+
29339+ iplug = item_plugin_by_coord(coord);
29340+ if (iplug == NULL) {
29341+ warning("nikita-1135", "Cannot get item plugin");
29342+ print_coord("coord", coord, 1);
29343+ return RETERR(-EIO);
29344+ } else if (item_id_by_coord(coord) !=
29345+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
29346+		/* item id of the current item does not match the id of items
29347+		   the directory is built of */
29348+ warning("nikita-1136", "Wrong item plugin");
29349+ print_coord("coord", coord, 1);
29350+ return RETERR(-EIO);
29351+ }
29352+ assert("nikita-1137", iplug->s.dir.extract_name);
29353+
29354+ /* Compare name stored in this entry with name we are looking for.
29355+
29356+ NOTE-NIKITA Here should go code for support of something like
29357+ unicode, code tables, etc.
29358+ */
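+	/* strcmp() == 0 means the names match; !! normalizes the result to
+	   0 (entry found) / 1 (no match), matching the contract above */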
29359+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
29360+}
29361+
29362+static int
29363+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
29364+{
29365+ return WITH_COORD(coord, check_item(dir, coord, name->name));
29366+}
29367+
29368+/*
29369+ * argument package used by entry_actor to scan entries with identical keys.
29370+ */
29371+typedef struct entry_actor_args {
29372+ /* name we are looking for */
29373+ const char *name;
29374+ /* key of directory entry. entry_actor() scans through sequence of
29375+ * items/units having the same key */
29376+ reiser4_key *key;
29377+	/* how many entries with duplicate keys have been scanned so far. */
29378+ int non_uniq;
29379+#if REISER4_USE_COLLISION_LIMIT
29380+ /* scan limit */
29381+ int max_non_uniq;
29382+#endif
29383+ /* return parameter: set to true, if ->name wasn't found */
29384+ int not_found;
29385+ /* what type of lock to take when moving to the next node during
29386+ * scan */
29387+ znode_lock_mode mode;
29388+
29389+ /* last coord that was visited during scan */
29390+ coord_t last_coord;
29391+ /* last node locked during scan */
29392+ lock_handle last_lh;
29393+ /* inode of directory */
29394+ const struct inode *inode;
29395+} entry_actor_args;
29396+
29397+/* Function called by reiser4_find_entry() to look for given name
29398+ in the directory. */
29399+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
29400+ coord_t * coord /* current coord */ ,
29401+ lock_handle * lh /* current lock handle */ ,
29402+ void *entry_actor_arg /* argument to scan */ )
29403+{
29404+ reiser4_key unit_key;
29405+ entry_actor_args *args;
29406+
29407+ assert("nikita-1131", tree != NULL);
29408+ assert("nikita-1132", coord != NULL);
29409+ assert("nikita-1133", entry_actor_arg != NULL);
29410+
29411+ args = entry_actor_arg;
29412+ ++args->non_uniq;
29413+#if REISER4_USE_COLLISION_LIMIT
29414+ if (args->non_uniq > args->max_non_uniq) {
29415+ args->not_found = 1;
29416+ /* hash collision overflow. */
29417+ return RETERR(-EBUSY);
29418+ }
29419+#endif
29420+
29421+ /*
29422+ * did we just reach the end of the sequence of items/units with
29423+ * identical keys?
29424+ */
29425+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
29426+ assert("nikita-1791",
29427+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
29428+ args->not_found = 1;
29429+ args->last_coord.between = AFTER_UNIT;
29430+ return 0;
29431+ }
29432+
29433+ coord_dup(&args->last_coord, coord);
29434+ /*
29435+	 * did the scan just move to the next node?
29436+ */
29437+ if (args->last_lh.node != lh->node) {
29438+ int lock_result;
29439+
29440+ /*
29441+ * if so, lock new node with the mode requested by the caller
29442+ */
29443+ done_lh(&args->last_lh);
29444+ assert("nikita-1896", znode_is_any_locked(lh->node));
29445+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
29446+ args->mode, ZNODE_LOCK_HIPRI);
29447+ if (lock_result != 0)
29448+ return lock_result;
29449+ }
29450+ return check_item(args->inode, coord, args->name);
29451+}
29452+
29453+/* Look for given @name within directory @dir.
29454+
29455+ This is called during lookup, creation and removal of directory
29456+ entries and on reiser4_rename_common
29457+
29458+ First calculate key that directory entry for @name would have. Search
29459+ for this key in the tree. If such key is found, scan all items with
29460+ the same key, checking name in each directory entry along the way.
29461+*/
29462+int reiser4_find_entry(struct inode *dir, /* directory to scan */
29463+ struct dentry *de, /* name to search for */
29464+ lock_handle * lh, /* resulting lock handle */
29465+ znode_lock_mode mode, /* required lock mode */
29466+ reiser4_dir_entry_desc * entry /* parameters of found
29467+ directory entry */)
29468+{
29469+ const struct qstr *name;
29470+ seal_t *seal;
29471+ coord_t *coord;
29472+ int result;
29473+ __u32 flags;
29474+ de_location *dec;
29475+ reiser4_dentry_fsdata *fsdata;
29476+
29477+ assert("nikita-1130", lh != NULL);
29478+ assert("nikita-1128", dir != NULL);
29479+
29480+ name = &de->d_name;
29481+ assert("nikita-1129", name != NULL);
29482+
29483+	/* dentry private data doesn't require a lock, because dentry
29484+	   manipulations are protected by i_mutex on the parent.
29485+
29486+	   This is not so for inodes, because there is no single parent in
29487+	   the inode case.
29488+	*/
29489+ fsdata = reiser4_get_dentry_fsdata(de);
29490+ if (IS_ERR(fsdata))
29491+ return PTR_ERR(fsdata);
29492+ dec = &fsdata->dec;
29493+
29494+ coord = &dec->entry_coord;
29495+ coord_clear_iplug(coord);
29496+ seal = &dec->entry_seal;
29497+ /* compose key of directory entry for @name */
29498+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
29499+
29500+ if (reiser4_seal_is_set(seal)) {
29501+ /* check seal */
29502+ result = reiser4_seal_validate(seal, coord, &entry->key,
29503+ lh, mode, ZNODE_LOCK_LOPRI);
29504+ if (result == 0) {
29505+ /* key was found. Check that it is really item we are
29506+ looking for. */
29507+ result = check_entry(dir, coord, name);
29508+ if (result == 0)
29509+ return 0;
29510+ }
29511+ }
29512+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
29513+ /*
29514+ * find place in the tree where directory item should be located.
29515+ */
29516+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
29517+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
29518+ flags, NULL /*ra_info */ );
29519+ if (result == CBK_COORD_FOUND) {
29520+ entry_actor_args arg;
29521+
29522+ /* fast path: no hash collisions */
29523+ result = check_entry(dir, coord, name);
29524+ if (result == 0) {
29525+ reiser4_seal_init(seal, coord, &entry->key);
29526+ dec->pos = 0;
29527+ } else if (result > 0) {
29528+ /* Iterate through all units with the same keys. */
29529+ arg.name = name->name;
29530+ arg.key = &entry->key;
29531+ arg.not_found = 0;
29532+ arg.non_uniq = 0;
29533+#if REISER4_USE_COLLISION_LIMIT
29534+ arg.max_non_uniq = max_hash_collisions(dir);
29535+ assert("nikita-2851", arg.max_non_uniq > 1);
29536+#endif
29537+ arg.mode = mode;
29538+ arg.inode = dir;
29539+ coord_init_zero(&arg.last_coord);
29540+ init_lh(&arg.last_lh);
29541+
29542+ result = reiser4_iterate_tree
29543+ (reiser4_tree_by_inode(dir),
29544+ coord, lh,
29545+ entry_actor, &arg, mode, 1);
29546+			/* the end of the tree or of the extent was reached
29547+			   during scanning */
29548+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
29549+ /* step back */
29550+ done_lh(lh);
29551+
29552+ result = zload(arg.last_coord.node);
29553+ if (result == 0) {
29554+ coord_clear_iplug(&arg.last_coord);
29555+ coord_dup(coord, &arg.last_coord);
29556+ move_lh(lh, &arg.last_lh);
29557+ result = RETERR(-ENOENT);
29558+ zrelse(arg.last_coord.node);
29559+ --arg.non_uniq;
29560+ }
29561+ }
29562+
29563+ done_lh(&arg.last_lh);
29564+ if (result == 0)
29565+ reiser4_seal_init(seal, coord, &entry->key);
29566+
29567+ if (result == 0 || result == -ENOENT) {
29568+ assert("nikita-2580", arg.non_uniq > 0);
29569+ dec->pos = arg.non_uniq - 1;
29570+ }
29571+ }
29572+ } else
29573+ dec->pos = -1;
29574+ return result;
29575+}
29576+
29577+/*
29578+ Local variables:
29579+ c-indentation-style: "K&R"
29580+ mode-name: "LC"
29581+ c-basic-offset: 8
29582+ tab-width: 8
29583+ fill-column: 120
29584+ scroll-step: 1
29585+ End:
29586+*/
29587diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.c
29588--- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
29589+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.c 2007-05-06 14:50:43.762995722 +0400
29590@@ -0,0 +1,655 @@
29591+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29592+
29593+#include "../../debug.h"
29594+#include "../../dformat.h"
29595+#include "../../key.h"
29596+#include "../node/node.h"
29597+#include "../space/space_allocator.h"
29598+#include "disk_format40.h"
29599+#include "../plugin.h"
29600+#include "../../txnmgr.h"
29601+#include "../../jnode.h"
29602+#include "../../tree.h"
29603+#include "../../super.h"
29604+#include "../../wander.h"
29605+#include "../../inode.h"
29606+#include "../../ktxnmgrd.h"
29607+#include "../../status_flags.h"
29608+
29609+#include <linux/types.h> /* for __u?? */
29610+#include <linux/fs.h> /* for struct super_block */
29611+#include <linux/buffer_head.h>
29612+
29613+/* reiser 4.0 default disk layout */
29614+
29615+/* Amount of free blocks needed to perform release_format40 when fs gets
29616+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
29617+ & tx record. */
29618+#define RELEASE_RESERVED 4
29619+
29620+/* The greatest supported format40 version number */
29621+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
29622+
29623+/* This flag indicates that backup should be updated
29624+ (the update is performed by fsck) */
29625+#define FORMAT40_UPDATE_BACKUP (1 << 31)
29626+
29627+/* functions to access fields of format40_disk_super_block */
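+/* each accessor pairs get_unaligned() with leXX_to_cpu(): the on-disk
+   super block is little-endian and its fields are not guaranteed to be
+   naturally aligned within the block buffer */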
29628+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
29629+{
29630+ return le64_to_cpu(get_unaligned(&sb->block_count));
29631+}
29632+
29633+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
29634+{
29635+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
29636+}
29637+
29638+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
29639+{
29640+ return le64_to_cpu(get_unaligned(&sb->root_block));
29641+}
29642+
29643+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
29644+{
29645+ return le16_to_cpu(get_unaligned(&sb->tree_height));
29646+}
29647+
29648+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
29649+{
29650+ return le64_to_cpu(get_unaligned(&sb->file_count));
29651+}
29652+
29653+static __u64 get_format40_oid(const format40_disk_super_block * sb)
29654+{
29655+ return le64_to_cpu(get_unaligned(&sb->oid));
29656+}
29657+
29658+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
29659+{
29660+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
29661+}
29662+
29663+static __u64 get_format40_flags(const format40_disk_super_block * sb)
29664+{
29665+ return le64_to_cpu(get_unaligned(&sb->flags));
29666+}
29667+
29668+static __u32 get_format40_version(const format40_disk_super_block * sb)
29669+{
29670+ return le32_to_cpu(get_unaligned(&sb->version)) &
29671+ ~FORMAT40_UPDATE_BACKUP;
29672+}
29673+
29674+static int update_backup_version(const format40_disk_super_block * sb)
29675+{
29676+ return (le32_to_cpu(get_unaligned(&sb->version)) &
29677+ FORMAT40_UPDATE_BACKUP);
29678+}
29679+
29680+static int update_disk_version(const format40_disk_super_block * sb)
29681+{
29682+ return (get_format40_version(sb) < FORMAT40_VERSION);
29683+}
29684+
29685+static int incomplete_compatibility(const format40_disk_super_block * sb)
29686+{
29687+ return (get_format40_version(sb) > FORMAT40_VERSION);
29688+}
29689+
29690+static format40_super_info *get_sb_info(struct super_block *super)
29691+{
29692+	/* dot is a special case---we always want it to be the first entry
29693+	   in a directory. In fact, we just want it to have the smallest
29694+	   directory entry key.
29695+	   */
29696+{
29697+ format40_super_info *info;
29698+ journal_location *jloc;
29699+
29700+ info = get_sb_info(s);
29701+ jloc = &get_super_private(s)->jloc;
29702+ /* Default format-specific locations, if there is nothing in
29703+ * diskmap */
29704+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
29705+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
29706+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
29707+#ifdef CONFIG_REISER4_BADBLOCKS
29708+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
29709+ &jloc->footer);
29710+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
29711+ &jloc->header);
29712+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
29713+ &info->loc.super);
29714+#endif
29715+ return 0;
29716+}
29717+
29718+/* find any valid super block of disk_format40 (even if the first
29719+   super block is destroyed); will change the block numbers of the actual
29720+   journal header/footer (jh/jf) if needed */
29721+static struct buffer_head *find_a_disk_format40_super_block(struct super_block
29722+ *s)
29723+{
29724+ struct buffer_head *super_bh;
29725+ format40_disk_super_block *disk_sb;
29726+ format40_super_info *info;
29727+
29728+ assert("umka-487", s != NULL);
29729+
29730+ info = get_sb_info(s);
29731+
29732+ super_bh = sb_bread(s, info->loc.super);
29733+ if (super_bh == NULL)
29734+ return ERR_PTR(RETERR(-EIO));
29735+
29736+ disk_sb = (format40_disk_super_block *) super_bh->b_data;
29737+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
29738+ brelse(super_bh);
29739+ return ERR_PTR(RETERR(-EINVAL));
29740+ }
29741+
29742+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
29743+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
29744+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29745+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29746+
29747+ return super_bh;
29748+}
29749+
29750+/* find the most recent version of the super block. This is called after the
29751+   journal is replayed */
29752+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
29753+{
29754+ /* Here the most recent superblock copy has to be read. However, as
29755+	   journal replay isn't complete, we fall back to the
29756+	   find_a_disk_format40_super_block() function. */
29757+ return find_a_disk_format40_super_block(s);
29758+}
29759+
29760+static int get_super_jnode(struct super_block *s)
29761+{
29762+ reiser4_super_info_data *sbinfo = get_super_private(s);
29763+ jnode *sb_jnode;
29764+ int ret;
29765+
29766+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
29767+
29768+ ret = jload(sb_jnode);
29769+
29770+ if (ret) {
29771+ reiser4_drop_io_head(sb_jnode);
29772+ return ret;
29773+ }
29774+
29775+ pin_jnode_data(sb_jnode);
29776+ jrelse(sb_jnode);
29777+
29778+ sbinfo->u.format40.sb_jnode = sb_jnode;
29779+
29780+ return 0;
29781+}
29782+
29783+static void done_super_jnode(struct super_block *s)
29784+{
29785+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
29786+
29787+ if (sb_jnode) {
29788+ unpin_jnode_data(sb_jnode);
29789+ reiser4_drop_io_head(sb_jnode);
29790+ }
29791+}
29792+
29793+typedef enum format40_init_stage {
29794+ NONE_DONE = 0,
29795+ CONSULT_DISKMAP,
29796+ FIND_A_SUPER,
29797+ INIT_JOURNAL_INFO,
29798+ INIT_STATUS,
29799+ JOURNAL_REPLAY,
29800+ READ_SUPER,
29801+ KEY_CHECK,
29802+ INIT_OID,
29803+ INIT_TREE,
29804+ JOURNAL_RECOVER,
29805+ INIT_SA,
29806+ INIT_JNODE,
29807+ ALL_DONE
29808+} format40_init_stage;
29809+
29810+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
29811+{
29812+ format40_disk_super_block *sb_copy;
29813+
29814+ sb_copy = kmalloc(sizeof(format40_disk_super_block),
29815+ reiser4_ctx_gfp_mask_get());
29816+ if (sb_copy == NULL)
29817+ return ERR_PTR(RETERR(-ENOMEM));
29818+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
29819+ sizeof(format40_disk_super_block));
29820+ return sb_copy;
29821+}
29822+
29823+static int check_key_format(const format40_disk_super_block *sb_copy)
29824+{
29825+ if (!equi(REISER4_LARGE_KEY,
29826+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
29827+ warning("nikita-3228", "Key format mismatch. "
29828+ "Only %s keys are supported.",
29829+ REISER4_LARGE_KEY ? "large" : "small");
29830+ return RETERR(-EINVAL);
29831+ }
29832+ return 0;
29833+}
29834+
29835+/**
29836+ * try_init_format40
29837+ * @super:
29838+ * @stage:
29839+ *
29840+ */
29841+static int try_init_format40(struct super_block *super,
29842+ format40_init_stage *stage)
29843+{
29844+ int result;
29845+ struct buffer_head *super_bh;
29846+ reiser4_super_info_data *sbinfo;
29847+ format40_disk_super_block *sb_copy;
29848+ tree_level height;
29849+ reiser4_block_nr root_block;
29850+ node_plugin *nplug;
29851+
29852+ assert("vs-475", super != NULL);
29853+ assert("vs-474", get_super_private(super));
29854+
29855+ *stage = NONE_DONE;
29856+
29857+ result = consult_diskmap(super);
29858+ if (result)
29859+ return result;
29860+ *stage = CONSULT_DISKMAP;
29861+
29862+ super_bh = find_a_disk_format40_super_block(super);
29863+ if (IS_ERR(super_bh))
29864+ return PTR_ERR(super_bh);
29865+ brelse(super_bh);
29866+ *stage = FIND_A_SUPER;
29867+
29868+ /* ok, we are sure that filesystem format is a format40 format */
29869+
29870+ /* map jnodes for journal control blocks (header, footer) to disk */
29871+ result = reiser4_init_journal_info(super);
29872+ if (result)
29873+ return result;
29874+ *stage = INIT_JOURNAL_INFO;
29875+
29876+	/* the format is confirmed to be format40; now check the
29877+	   filesystem state */
29878+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
29879+ if (result != 0 && result != -EINVAL)
29880+ /* -EINVAL means there is no magic, so probably just old
29881+ * fs. */
29882+ return result;
29883+ *stage = INIT_STATUS;
29884+
29885+ result = reiser4_status_query(NULL, NULL);
29886+ if (result == REISER4_STATUS_MOUNT_WARN)
29887+ notice("vpf-1363", "Warning: mounting %s with errors.",
29888+ super->s_id);
29889+ if (result == REISER4_STATUS_MOUNT_RO)
29890+ notice("vpf-1364", "Warning: mounting %s with fatal errors,"
29891+ " forcing read-only mount.", super->s_id);
29892+ result = reiser4_journal_replay(super);
29893+ if (result)
29894+ return result;
29895+ *stage = JOURNAL_REPLAY;
29896+
29897+ super_bh = read_super_block(super);
29898+ if (IS_ERR(super_bh))
29899+ return PTR_ERR(super_bh);
29900+ *stage = READ_SUPER;
29901+
29902+ /* allocate and make a copy of format40_disk_super_block */
29903+ sb_copy = copy_sb(super_bh);
29904+ brelse(super_bh);
29905+
29906+ if (IS_ERR(sb_copy))
29907+ return PTR_ERR(sb_copy);
29908+ printk("reiser4: %s: found disk format 4.0.%u.\n",
29909+ super->s_id,
29910+ get_format40_version(sb_copy));
29911+ if (incomplete_compatibility(sb_copy))
29912+ printk("reiser4: Warning: The last completely supported "
29913+		       "version of disk format40 is %u. Some objects of "
29914+		       "the semantic tree may be inaccessible.\n",
29915+ FORMAT40_VERSION);
29916+ /* make sure that key format of kernel and filesystem match */
29917+ result = check_key_format(sb_copy);
29918+ if (result) {
29919+ kfree(sb_copy);
29920+ return result;
29921+ }
29922+ *stage = KEY_CHECK;
29923+
29924+ result = oid_init_allocator(super, get_format40_file_count(sb_copy),
29925+ get_format40_oid(sb_copy));
29926+ if (result) {
29927+ kfree(sb_copy);
29928+ return result;
29929+ }
29930+ *stage = INIT_OID;
29931+
29932+ /* get things necessary to init reiser4_tree */
29933+ root_block = get_format40_root_block(sb_copy);
29934+ height = get_format40_tree_height(sb_copy);
29935+ nplug = node_plugin_by_id(NODE40_ID);
29936+
29937+ /* initialize reiser4_super_info_data */
29938+ sbinfo = get_super_private(super);
29939+ assert("", sbinfo->tree.super == super);
29940+ /* init reiser4_tree for the filesystem */
29941+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
29942+ if (result) {
29943+ kfree(sb_copy);
29944+ return result;
29945+ }
29946+ *stage = INIT_TREE;
29947+
29948+ /*
29949+ * initialize reiser4_super_info_data with data from format40 super
29950+ * block
29951+ */
29952+ sbinfo->default_uid = 0;
29953+ sbinfo->default_gid = 0;
29954+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
29955+ /* number of blocks in filesystem and reserved space */
29956+ reiser4_set_block_count(super, get_format40_block_count(sb_copy));
29957+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
29958+ sbinfo->version = get_format40_version(sb_copy);
29959+	if (update_backup_version(sb_copy))
29960+		printk("reiser4: Warning: metadata backup is not updated. "
29961+		       "Please run 'fsck.reiser4 --fix' on %s.\n",
29962+		       super->s_id);
29963+	/* sb_copy may only be freed after the backup version check */
29964+	kfree(sb_copy);
29965+
29966+ sbinfo->fsuid = 0;
29967+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
29968+ * are not supported */
29969+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
29970+ * layout 40 are
29971+ * of one
29972+ * plugin */
29973+ /* sbinfo->tmgr is initialized already */
29974+
29975+ /* recover sb data which were logged separately from sb block */
29976+
29977+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
29978+ * oid_init_allocator() and reiser4_set_free_blocks() with new
29979+ * data. What's the reason to call them above? */
29980+ result = reiser4_journal_recover_sb_data(super);
29981+ if (result != 0)
29982+ return result;
29983+ *stage = JOURNAL_RECOVER;
29984+
29985+	/*
29986+	 * Set the number of used blocks. This number is stored neither in
29987+	 * the on-disk super block nor in the journal footer blocks. At this
29988+	 * point the actual values of the total and free block counters are
29989+	 * set in the in-memory reiser4 super block, so the number of used
29990+	 * blocks can be calculated from them.
29991+	 */
29992+ reiser4_set_data_blocks(super,
29993+ reiser4_block_count(super) -
29994+ reiser4_free_blocks(super));
29995+
29996+#if REISER4_DEBUG
29997+ sbinfo->min_blocks_used = 16 /* reserved area */ +
29998+ 2 /* super blocks */ +
29999+ 2 /* journal footer and header */ ;
30000+#endif
30001+
30002+ /* init disk space allocator */
30003+ result = sa_init_allocator(reiser4_get_space_allocator(super),
30004+ super, NULL);
30005+ if (result)
30006+ return result;
30007+ *stage = INIT_SA;
30008+
30009+ result = get_super_jnode(super);
30010+ if (result == 0)
30011+ *stage = ALL_DONE;
30012+ return result;
30013+}
30014+
30015+/* plugin->u.format.get_ready */
30016+int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30017+{
30018+ int result;
30019+ format40_init_stage stage;
30020+
30021+ result = try_init_format40(s, &stage);
30022+ switch (stage) {
30023+ case ALL_DONE:
30024+ assert("nikita-3458", result == 0);
30025+ break;
30026+	case INIT_JNODE:	/* undo stages in reverse; cases fall through */
30027+ done_super_jnode(s);
30028+ case INIT_SA:
30029+ sa_destroy_allocator(reiser4_get_space_allocator(s), s);
30030+ case JOURNAL_RECOVER:
30031+ case INIT_TREE:
30032+ reiser4_done_tree(&get_super_private(s)->tree);
30033+ case INIT_OID:
30034+ case KEY_CHECK:
30035+ case READ_SUPER:
30036+ case JOURNAL_REPLAY:
30037+ case INIT_STATUS:
30038+ reiser4_status_finish();
30039+ case INIT_JOURNAL_INFO:
30040+ reiser4_done_journal_info(s);
30041+ case FIND_A_SUPER:
30042+ case CONSULT_DISKMAP:
30043+ case NONE_DONE:
30044+ break;
30045+ default:
30046+ impossible("nikita-3457", "init stage: %i", stage);
30047+ }
30048+
30049+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30050+ return RETERR(-ENOSPC);
30051+
30052+ return result;
30053+}
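The split between try_init_format40() and init_format_format40() above is a staged-initialization pattern: the worker records how far it got in *stage, and the caller unwinds with a fall-through switch in reverse order of acquisition. A minimal userspace sketch of the same pattern (all names hypothetical):

	#include <stdio.h>

	typedef enum { NONE_DONE, GOT_A, GOT_B, ALL_DONE } init_stage;

	static int try_init(init_stage *stage)
	{
		*stage = NONE_DONE;
		/* acquire resource A ... */
		*stage = GOT_A;
		/* acquiring resource B fails in this sketch */
		return -1;
	}

	int main(void)
	{
		init_stage stage;
		int result = try_init(&stage);

		switch (stage) {	/* unwind; each case falls through */
		case ALL_DONE:
			break;
		case GOT_B:
			puts("release B");
		case GOT_A:
			puts("release A");
		case NONE_DONE:
			break;
		}
		return result ? 1 : 0;
	}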
30054+
30055+static void pack_format40_super(const struct super_block *s, char *data)
30056+{
30057+ format40_disk_super_block *super_data =
30058+ (format40_disk_super_block *) data;
30059+
30060+ reiser4_super_info_data *sbinfo = get_super_private(s);
30061+
30062+ assert("zam-591", data != NULL);
30063+
30064+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30065+ &super_data->free_blocks);
30066+
30067+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
30068+ &super_data->root_block);
30069+
30070+ put_unaligned(cpu_to_le64(oid_next(s)),
30071+ &super_data->oid);
30072+
30073+ put_unaligned(cpu_to_le64(oids_used(s)),
30074+ &super_data->file_count);
30075+
30076+ put_unaligned(cpu_to_le16(sbinfo->tree.height),
30077+ &super_data->tree_height);
30078+
30079+ if (update_disk_version(super_data)) {
30080+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
30081+
30082+ put_unaligned(cpu_to_le32(version), &super_data->version);
30083+ }
30084+}
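pack_format40_super() stores every field through put_unaligned()/cpu_to_le*(), so the on-disk image stays little-endian and safe on architectures that fault on unaligned access. The read direction is symmetric; an illustrative sketch of an unpack counterpart (not part of the patch, mirroring the get_format40_*() accessors):

	static void unpack_format40_super(const char *data,
					  __u64 *free_blocks, __u16 *tree_height)
	{
		const format40_disk_super_block *sb =
		    (const format40_disk_super_block *) data;

		*free_blocks = le64_to_cpu(get_unaligned(&sb->free_blocks));
		*tree_height = le16_to_cpu(get_unaligned(&sb->tree_height));
	}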
30085+
30086+/* plugin->u.format.log_super
30087+ return a jnode which should be added to transaction when the super block
30088+ gets logged */
30089+jnode *log_super_format40(struct super_block *s)
30090+{
30091+ jnode *sb_jnode;
30092+
30093+ sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30094+
30095+ jload(sb_jnode);
30096+
30097+ pack_format40_super(s, jdata(sb_jnode));
30098+
30099+ jrelse(sb_jnode);
30100+
30101+ return sb_jnode;
30102+}
30103+
30104+/* plugin->u.format.release */
30105+int release_format40(struct super_block *s)
30106+{
30107+ int ret;
30108+ reiser4_super_info_data *sbinfo;
30109+
30110+ sbinfo = get_super_private(s);
30111+ assert("zam-579", sbinfo != NULL);
30112+
30113+ if (!rofs_super(s)) {
30114+ ret = reiser4_capture_super_block(s);
30115+ if (ret != 0)
30116+ warning("vs-898",
30117+ "reiser4_capture_super_block failed: %d",
30118+ ret);
30119+
30120+ ret = txnmgr_force_commit_all(s, 1);
30121+ if (ret != 0)
30122+ warning("jmacd-74438", "txn_force failed: %d", ret);
30123+
30124+ all_grabbed2free();
30125+ }
30126+
30127+ sa_destroy_allocator(&sbinfo->space_allocator, s);
30128+ reiser4_done_journal_info(s);
30129+ done_super_jnode(s);
30130+
30131+ rcu_barrier();
30132+ reiser4_done_tree(&sbinfo->tree);
30133+	/* call rcu_barrier() again, because some znodes were "released" in
30134+	 * reiser4_done_tree(). */
30135+ rcu_barrier();
30136+
30137+ return 0;
30138+}
30139+
30140+#define FORMAT40_ROOT_LOCALITY 41
30141+#define FORMAT40_ROOT_OBJECTID 42
30142+
30143+/* plugin->u.format.root_dir_key */
30144+const reiser4_key *root_dir_key_format40(const struct super_block *super
30145+ UNUSED_ARG)
30146+{
30147+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30148+ .el = {
30149+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30150+#if REISER4_LARGE_KEY
30151+ ON_LARGE_KEY(0ull,)
30152+#endif
30153+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30154+ 0ull
30155+ }
30156+ };
30157+
30158+ return &FORMAT40_ROOT_DIR_KEY;
30159+}
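The static key above packs the root locality, shifted left by 4 bits, together with the stat-data minor into the first little-endian key element. A hedged illustration of the packing (constants symbolic, values not restated):

	/* illustrative: composition of the first root-key element */
	__u64 el0 = ((__u64)FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR;
	/* stored on disk as cpu_to_le64(el0) */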
30160+
30161+/* plugin->u.format.check_open.
30162+   Check the opened object for validity. For now it checks only that the
30163+   oid and locality are valid; this can be improved later, and its work
30164+   may depend on the mount options. */
30165+int check_open_format40(const struct inode *object)
30166+{
30167+ oid_t max, oid;
30168+
30169+ max = oid_next(object->i_sb) - 1;
30170+
30171+ /* Check the oid. */
30172+ oid = get_inode_oid(object);
30173+ if (oid > max) {
30174+		warning("vpf-1360", "Found an object with oid %llu "
30175+			"greater than the max used oid %llu.",
30176+			(unsigned long long)oid, (unsigned long long)max);
30177+
30178+ return RETERR(-EIO);
30179+ }
30180+
30181+ /* Check the locality. */
30182+ oid = reiser4_inode_data(object)->locality_id;
30183+ if (oid > max) {
30184+		warning("vpf-1361", "Found an object with locality %llu "
30185+			"greater than the max used oid %llu.",
30186+			(unsigned long long)oid, (unsigned long long)max);
30187+
30188+ return RETERR(-EIO);
30189+ }
30190+
30191+ return 0;
30192+}
30193+
30194+/* plugin->u.format.version_update.
30195+ Perform all version update operations from the on-disk
30196+ format40_disk_super_block.version on disk to FORMAT40_VERSION.
30197+ */
30198+int version_update_format40(struct super_block *super) {
30199+ txn_handle * trans;
30200+ lock_handle lh;
30201+ txn_atom *atom;
30202+ int ret;
30203+
30204+ /* Nothing to do if RO mount or the on-disk version is not less. */
30205+ if (super->s_flags & MS_RDONLY)
30206+ return 0;
30207+
30208+ if (get_super_private(super)->version >= FORMAT40_VERSION)
30209+ return 0;
30210+
30211+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
30212+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
30213+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
30214+
30215+ /* Mark the uber znode dirty to call log_super on write_logs. */
30216+ init_lh(&lh);
30217+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
30218+ ZNODE_LOCK_HIPRI, &lh);
30219+ if (ret != 0)
30220+ return ret;
30221+
30222+ znode_make_dirty(lh.node);
30223+ done_lh(&lh);
30224+
30225+ /* Update the backup blocks. */
30226+
30227+ /* Force write_logs immediately. */
30228+ trans = get_current_context()->trans;
30229+ atom = get_current_atom_locked();
30230+ assert("vpf-1906", atom != NULL);
30231+
30232+ spin_lock_txnh(trans);
30233+ return force_commit_atom(trans);
30234+}
30235+
30236+/* Make Linus happy.
30237+ Local variables:
30238+ c-indentation-style: "K&R"
30239+ mode-name: "LC"
30240+ c-basic-offset: 8
30241+ tab-width: 8
30242+ fill-column: 120
30243+ scroll-step: 1
30244+ End:
30245+*/
30246diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.h
30247--- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
30248+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.h 2007-05-06 14:50:43.762995722 +0400
30249@@ -0,0 +1,109 @@
30250+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30251+
30252+/* this file contains:
30253+   - definition of ondisk super block of standard disk layout for
30254+ reiser 4.0 (layout 40)
30255+ - definition of layout 40 specific portion of in-core super block
30256+ - declarations of functions implementing methods of layout plugin
30257+ for layout 40
30258+ - declarations of functions used to get/set fields in layout 40 super block
30259+*/
30260+
30261+#ifndef __DISK_FORMAT40_H__
30262+#define __DISK_FORMAT40_H__
30263+
30264+/* magic for default reiser4 layout */
30265+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30266+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30267+
30268+#include "../../dformat.h"
30269+
30270+#include <linux/fs.h> /* for struct super_block */
30271+
30272+typedef enum {
30273+ FORMAT40_LARGE_KEYS
30274+} format40_flags;
30275+
30276+/* ondisk super block for format 40. It is 512 bytes long */
30277+typedef struct format40_disk_super_block {
30278+ /* 0 */ d64 block_count;
30279+	/* number of blocks in the filesystem */
30280+ /* 8 */ d64 free_blocks;
30281+ /* number of free blocks */
30282+ /* 16 */ d64 root_block;
30283+ /* filesystem tree root block */
30284+ /* 24 */ d64 oid;
30285+ /* smallest free objectid */
30286+ /* 32 */ d64 file_count;
30287+ /* number of files in a filesystem */
30288+ /* 40 */ d64 flushes;
30289+	/* number of times the super block was
30290+	   flushed; needed if format 40 ever
30291+	   has several super blocks */
30292+ /* 48 */ d32 mkfs_id;
30293+ /* unique identifier of fs */
30294+ /* 52 */ char magic[16];
30295+ /* magic string ReIsEr40FoRmAt */
30296+ /* 68 */ d16 tree_height;
30297+ /* height of filesystem tree */
30298+ /* 70 */ d16 formatting_policy;
30299+ /* not used anymore */
30300+ /* 72 */ d64 flags;
30301+	/* 80 */ d32 version;
30302+	/* on-disk format version number,
30303+	   initially assigned by mkfs as the greatest format40
30304+	   version number supported by reiser4progs and updated
30305+	   at mount time to the greatest format40 version number
30306+	   supported by the kernel.
30307+	   Used by fsck to catch possible corruption and
30308+	   for various compatibility issues */
30309+ /* 84 */ char not_used[428];
30310+} format40_disk_super_block;
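The byte offsets in the comments above pin down the 512-byte on-disk layout. A compile-time cross-check is one way to keep such comments honest; a minimal userspace sketch, assuming C11 and plain 8/4/2-byte stand-ins for the little-endian d64/d32/d16 types:

	#include <stddef.h>

	typedef unsigned long long u64_t;	/* stand-in for d64 */
	typedef unsigned int u32_t;		/* stand-in for d32 */
	typedef unsigned short u16_t;		/* stand-in for d16 */

	struct sb_layout {
		u64_t block_count;		/*  0 */
		u64_t free_blocks;		/*  8 */
		u64_t root_block;		/* 16 */
		u64_t oid;			/* 24 */
		u64_t file_count;		/* 32 */
		u64_t flushes;			/* 40 */
		u32_t mkfs_id;			/* 48 */
		char magic[16];			/* 52 */
		u16_t tree_height;		/* 68 */
		u16_t formatting_policy;	/* 70 */
		u64_t flags;			/* 72 */
		u32_t version;			/* 80 */
		char not_used[428];		/* 84 */
	};

	_Static_assert(offsetof(struct sb_layout, magic) == 52, "layout drift");
	_Static_assert(offsetof(struct sb_layout, version) == 80, "layout drift");
	_Static_assert(sizeof(struct sb_layout) == 512, "must stay 512 bytes");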
30311+
30312+/* format 40 specific part of reiser4_super_info_data */
30313+typedef struct format40_super_info {
30314+/* format40_disk_super_block actual_sb; */
30315+ jnode *sb_jnode;
30316+ struct {
30317+ reiser4_block_nr super;
30318+ } loc;
30319+} format40_super_info;
30320+
30321+/* Defines for journal header and footer respectively. */
30322+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
30323+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
30324+
30325+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
30326+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
30327+
30328+#define FORMAT40_STATUS_BLOCKNR \
30329+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
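For concreteness: assuming the conventional 64KiB master offset (REISER4_MASTER_OFFSET == 65536) and 4KiB pages, the master super block sits in block 16, so the journal header, journal footer and status blocks land in blocks 19, 20 and 21 respectively.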
30330+
30331+/* Diskmap declarations */
30332+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
30333+#define FORMAT40_SUPER 1
30334+#define FORMAT40_JH 2
30335+#define FORMAT40_JF 3
30336+
30337+/* declarations of functions implementing methods of the layout plugin for
30338+   format 40. The functions themselves are in disk_format40.c */
30339+extern int init_format_format40(struct super_block *, void *data);
30340+extern const reiser4_key *root_dir_key_format40(const struct super_block *);
30341+extern int release_format40(struct super_block *s);
30342+extern jnode *log_super_format40(struct super_block *s);
30343+extern int check_open_format40(const struct inode *object);
30344+extern int version_update_format40(struct super_block *super);
30345+
30346+/* __DISK_FORMAT40_H__ */
30347+#endif
30348+
30349+/* Make Linus happy.
30350+ Local variables:
30351+ c-indentation-style: "K&R"
30352+ mode-name: "LC"
30353+ c-basic-offset: 8
30354+ tab-width: 8
30355+ fill-column: 120
30356+ scroll-step: 1
30357+ End:
30358+*/
30359diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.c
30360--- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
30361+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.c 2007-05-06 14:50:43.762995722 +0400
30362@@ -0,0 +1,38 @@
30363+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30364+
30365+#include "../../debug.h"
30366+#include "../plugin_header.h"
30367+#include "disk_format40.h"
30368+#include "disk_format.h"
30369+#include "../plugin.h"
30370+
30371+/* initialization of disk layout plugins */
30372+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30373+ [FORMAT40_ID] = {
30374+ .h = {
30375+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30376+ .id = FORMAT40_ID,
30377+ .pops = NULL,
30378+ .label = "reiser40",
30379+ .desc = "standard disk layout for reiser40",
30380+ .linkage = {NULL, NULL}
30381+ },
30382+ .init_format = init_format_format40,
30383+ .root_dir_key = root_dir_key_format40,
30384+ .release = release_format40,
30385+ .log_super = log_super_format40,
30386+ .check_open = check_open_format40,
30387+ .version_update = version_update_format40
30388+ }
30389+};
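Since the array is indexed by disk_format_id through designated initializers, plugin dispatch is a direct table lookup. A hedged usage sketch (@sb and the surrounding caller are hypothetical):

	/* pick the plugin for the standard layout and run its init hook */
	disk_format_plugin *fplug = &format_plugins[FORMAT40_ID];
	int ret = fplug->init_format(sb, NULL);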
30390+
30391+/* Make Linus happy.
30392+ Local variables:
30393+ c-indentation-style: "K&R"
30394+ mode-name: "LC"
30395+ c-basic-offset: 8
30396+ tab-width: 8
30397+ fill-column: 120
30398+ scroll-step: 1
30399+ End:
30400+*/
30401diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.h
30402--- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
30403+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.h 2007-05-06 14:50:43.762995722 +0400
30404@@ -0,0 +1,27 @@
30405+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30406+
30407+/* identifiers for disk layouts; they are also used as indexes into the
30408+   array of disk format plugins */
30409+
30410+#if !defined( __REISER4_DISK_FORMAT_H__ )
30411+#define __REISER4_DISK_FORMAT_H__
30412+
30413+typedef enum {
30414+ /* standard reiser4 disk layout plugin id */
30415+ FORMAT40_ID,
30416+ LAST_FORMAT_ID
30417+} disk_format_id;
30418+
30419+/* __REISER4_DISK_FORMAT_H__ */
30420+#endif
30421+
30422+/* Make Linus happy.
30423+ Local variables:
30424+ c-indentation-style: "K&R"
30425+ mode-name: "LC"
30426+ c-basic-offset: 8
30427+ tab-width: 8
30428+ fill-column: 120
30429+ scroll-step: 1
30430+ End:
30431+*/
30432diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.20/fs/reiser4/plugin/disk_format/Makefile
30433--- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
30434+++ linux-2.6.20/fs/reiser4/plugin/disk_format/Makefile 2007-05-06 14:50:43.762995722 +0400
30435@@ -0,0 +1,5 @@
30436+obj-$(CONFIG_REISER4_FS) += df_plugins.o
30437+
30438+df_plugins-objs := \
30439+ disk_format40.o \
30440+ disk_format.o
30441diff -urN linux-2.6.20.orig/fs/reiser4/plugin/fibration.c linux-2.6.20/fs/reiser4/plugin/fibration.c
30442--- linux-2.6.20.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
30443+++ linux-2.6.20/fs/reiser4/plugin/fibration.c 2007-05-06 14:50:43.762995722 +0400
30444@@ -0,0 +1,175 @@
30445+/* Copyright 2004 by Hans Reiser, licensing governed by
30446+ * reiser4/README */
30447+
30448+/* Directory fibrations */
30449+
30450+/*
30451+ * Suppose we have a directory tree with sources of some project. During
30452+ * compilation .o files are created within this tree. This makes access
30453+ * to the original source files less efficient, because source files are
30454+ * now "diluted" by object files: default directory plugin uses prefix
30455+ * now "diluted" by object files: the default directory plugin uses a
30456+ * prefix of the file name as part of the key for the directory entry
30457+ * (and this part is also inherited by the key of the file body). This
30458+ * means that foo.o will be located close to foo.c and foo.h in the tree.
30459+ *
30460+ * To avoid this effect the directory plugin fills the highest 7
30461+ * (originally unused) bits of the second component of the directory
30462+ * entry key with a bit-pattern depending on the file name (see
30463+ * "fibre". Fibre of the file name key is inherited by key of stat data
30464+ * and keys of file body (in the case of REISER4_LARGE_KEY).
30465+ *
30466+ * The fibre for a given file is chosen by the per-directory fibration
30467+ * plugin. Names within a given fibre are ordered lexicographically.
30468+ */
30469+
30470+#include "../debug.h"
30471+#include "plugin_header.h"
30472+#include "plugin.h"
30473+#include "../super.h"
30474+#include "../inode.h"
30475+
30476+#include <linux/types.h>
30477+
30478+static const int fibre_shift = 57;
30479+
30480+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
30481+
30482+/*
30483+ * Trivial fibration: all files of directory are just ordered
30484+ * lexicographically.
30485+ */
30486+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
30487+{
30488+ return FIBRE_NO(0);
30489+}
30490+
30491+/*
30492+ * dot-o fibration: place .o files after all others.
30493+ */
30494+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
30495+{
30496+ /* special treatment for .*\.o */
30497+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
30498+ return FIBRE_NO(1);
30499+ else
30500+ return FIBRE_NO(0);
30501+}
30502+
30503+/*
30504+ * ext.1 fibration: subdivide the directory into 128 fibres, one for
30505+ * each 7-bit extension character (file "foo.h" goes into fibre "h"),
30506+ * plus a default fibre for the rest.
30507+ */
30508+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
30509+{
30510+ if (len > 2 && name[len - 2] == '.')
30511+ return FIBRE_NO(name[len - 1]);
30512+ else
30513+ return FIBRE_NO(0);
30514+}
30515+
30516+/*
30517+ * ext.3 fibration: try to separate files with different 3-character
30518+ * extensions from each other.
30519+ */
30520+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
30521+{
30522+ if (len > 4 && name[len - 4] == '.')
30523+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
30524+ else
30525+ return FIBRE_NO(0);
30526+}
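To make the policies above concrete, here is a hedged standalone restatement (userspace, no kernel types; the 'dir' argument is dropped because none of these policies consult it):

	#include <stdio.h>
	#include <string.h>

	static unsigned long long fibre_no(unsigned n)
	{
		return (unsigned long long)n << 57;
	}

	static unsigned long long dot_o(const char *name)
	{
		size_t len = strlen(name);

		return (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
			? fibre_no(1) : fibre_no(0);
	}

	static unsigned long long ext_1(const char *name)
	{
		size_t len = strlen(name);

		return (len > 2 && name[len - 2] == '.')
			? fibre_no((unsigned char)name[len - 1]) : fibre_no(0);
	}

	int main(void)
	{
		/* dot-o: foo.o lands in fibre 1, after foo.c and foo.h (fibre 0) */
		printf("dot-o: foo.c=%llu foo.h=%llu foo.o=%llu\n",
		       dot_o("foo.c") >> 57, dot_o("foo.h") >> 57,
		       dot_o("foo.o") >> 57);
		/* ext-1: names sharing the last extension char share a fibre */
		printf("ext-1: foo.c=%llu bar.c=%llu foo.h=%llu\n",
		       ext_1("foo.c") >> 57, ext_1("bar.c") >> 57,
		       ext_1("foo.h") >> 57);
		return 0;
	}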
30527+
30528+static int change_fibration(struct inode *inode,
30529+ reiser4_plugin * plugin,
30530+ pset_member memb)
30531+{
30532+ int result;
30533+
30534+ assert("nikita-3503", inode != NULL);
30535+ assert("nikita-3504", plugin != NULL);
30536+
30537+ assert("nikita-3505", is_reiser4_inode(inode));
30538+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
30539+ assert("nikita-3507",
30540+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
30541+
30542+ result = 0;
30543+ if (inode_fibration_plugin(inode) == NULL ||
30544+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
30545+ if (is_dir_empty(inode) == 0)
30546+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
30547+ PSET_FIBRATION, plugin);
30548+ else
30549+ result = RETERR(-ENOTEMPTY);
30550+
30551+ }
30552+ return result;
30553+}
30554+
30555+static reiser4_plugin_ops fibration_plugin_ops = {
30556+ .init = NULL,
30557+ .load = NULL,
30558+ .save_len = NULL,
30559+ .save = NULL,
30560+ .change = change_fibration
30561+};
30562+
30563+/* fibration plugins */
30564+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
30565+ [FIBRATION_LEXICOGRAPHIC] = {
30566+ .h = {
30567+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30568+ .id = FIBRATION_LEXICOGRAPHIC,
30569+ .pops = &fibration_plugin_ops,
30570+ .label = "lexicographic",
30571+ .desc = "no fibration",
30572+ .linkage = {NULL, NULL}
30573+ },
30574+ .fibre = fibre_trivial
30575+ },
30576+ [FIBRATION_DOT_O] = {
30577+ .h = {
30578+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30579+ .id = FIBRATION_DOT_O,
30580+ .pops = &fibration_plugin_ops,
30581+ .label = "dot-o",
30582+ .desc = "fibrate .o files separately",
30583+ .linkage = {NULL, NULL}
30584+ },
30585+ .fibre = fibre_dot_o
30586+ },
30587+ [FIBRATION_EXT_1] = {
30588+ .h = {
30589+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30590+ .id = FIBRATION_EXT_1,
30591+ .pops = &fibration_plugin_ops,
30592+ .label = "ext-1",
30593+ .desc = "fibrate file by single character extension",
30594+ .linkage = {NULL, NULL}
30595+ },
30596+ .fibre = fibre_ext_1
30597+ },
30598+ [FIBRATION_EXT_3] = {
30599+ .h = {
30600+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30601+ .id = FIBRATION_EXT_3,
30602+ .pops = &fibration_plugin_ops,
30603+ .label = "ext-3",
30604+ .desc = "fibrate file by three character extension",
30605+ .linkage = {NULL, NULL}
30606+ },
30607+ .fibre = fibre_ext_3
30608+ }
30609+};
30610+
30611+/*
30612+ * Local variables:
30613+ * c-indentation-style: "K&R"
30614+ * mode-name: "LC"
30615+ * c-basic-offset: 8
30616+ * tab-width: 8
30617+ * fill-column: 79
30618+ * End:
30619+ */
30620diff -urN linux-2.6.20.orig/fs/reiser4/plugin/fibration.h linux-2.6.20/fs/reiser4/plugin/fibration.h
30621--- linux-2.6.20.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
30622+++ linux-2.6.20/fs/reiser4/plugin/fibration.h 2007-05-06 14:50:43.762995722 +0400
30623@@ -0,0 +1,37 @@
30624+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
30625+
30626+/* Fibration plugin used by the hashed directory plugin to segment the
30627+ * contents of a directory. See fs/reiser4/plugin/fibration.c for more. */
30628+
30629+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
30630+#define __FS_REISER4_PLUGIN_FIBRATION_H__
30631+
30632+#include "plugin_header.h"
30633+
30634+typedef struct fibration_plugin {
30635+ /* generic fields */
30636+ plugin_header h;
30637+
30638+ __u64(*fibre) (const struct inode * dir, const char *name, int len);
30639+} fibration_plugin;
30640+
30641+typedef enum {
30642+ FIBRATION_LEXICOGRAPHIC,
30643+ FIBRATION_DOT_O,
30644+ FIBRATION_EXT_1,
30645+ FIBRATION_EXT_3,
30646+ LAST_FIBRATION_ID
30647+} reiser4_fibration_id;
30648+
30649+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
30650+#endif
30651+
30652+/* Make Linus happy.
30653+ Local variables:
30654+ c-indentation-style: "K&R"
30655+ mode-name: "LC"
30656+ c-basic-offset: 8
30657+ tab-width: 8
30658+ fill-column: 120
30659+ End:
30660+*/
30661diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.c
30662--- linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
30663+++ linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.c 2007-05-06 14:50:43.770998222 +0400
30664@@ -0,0 +1,3760 @@
30665+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
30666+ reiser4/README */
30667+
30668+/* This file contains implementations of inode/file/address_space plugin
30669+ * operations specific to the cryptcompress file plugin, which manages files
30670+ * with compressed and encrypted bodies. A "cryptcompress file" is built of
30671+ * items of CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html).
30672+ */
30673+
30674+#include "../../inode.h"
30675+#include "../cluster.h"
30676+#include "../object.h"
30677+#include "../../tree_walk.h"
30678+#include "cryptcompress.h"
30679+
30680+#include <asm/scatterlist.h>
30681+#include <linux/pagevec.h>
30682+#include <asm/uaccess.h>
30683+#include <linux/swap.h>
30684+#include <linux/writeback.h>
30685+#include <linux/random.h>
30686+
30687+/* get cryptcompress specific portion of inode */
30688+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
30689+{
30690+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
30691+}
30692+
30693+/* plugin->u.file.init_inode_data */
30694+void init_inode_data_cryptcompress(struct inode *inode,
30695+ reiser4_object_create_data * crd,
30696+ int create)
30697+{
30698+ cryptcompress_info_t *data;
30699+
30700+ data = cryptcompress_inode_data(inode);
30701+ assert("edward-685", data != NULL);
30702+
30703+ memset(data, 0, sizeof(*data));
30704+
30705+ turn_on_compression(data);
30706+ set_lattice_factor(data, MIN_LATTICE_FACTOR);
30707+ init_inode_ordering(inode, crd, create);
30708+}
30709+
30710+#if REISER4_DEBUG
30711+int cryptcompress_inode_ok(struct inode *inode)
30712+{
30713+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
30714+ return 0;
30715+ if (!cluster_shift_ok(inode_cluster_shift(inode)))
30716+ return 0;
30717+ return 1;
30718+}
30719+#endif
30720+
30721+/* The following is part of the reiser4 cipher key manager,
30722+   which is called when opening/creating a cryptcompress file */
30723+
30724+/* get/set cipher key info */
30725+crypto_stat_t * inode_crypto_stat (struct inode * inode)
30726+{
30727+ assert("edward-90", inode != NULL);
30728+ assert("edward-91", reiser4_inode_data(inode) != NULL);
30729+ return cryptcompress_inode_data(inode)->crypt;
30730+}
30731+
30732+static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
30733+{
30734+ cryptcompress_inode_data(inode)->crypt = stat;
30735+}
30736+
30737+/* allocate a cipher key info */
30738+crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode)
30739+{
30740+ crypto_stat_t * info;
30741+ int fipsize;
30742+
30743+ info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
30744+ if (!info)
30745+ return ERR_PTR(-ENOMEM);
30746+ memset(info, 0, sizeof (*info));
30747+ fipsize = inode_digest_plugin(inode)->fipsize;
30748+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
30749+ if (!info->keyid) {
30750+ kfree(info);
30751+ return ERR_PTR(-ENOMEM);
30752+ }
30753+ info->host = inode;
30754+ return info;
30755+}
30756+
30757+#if 0
30758+/* allocate/free low-level info for cipher and digest
30759+ transforms */
30760+static int alloc_crypto_tfms(crypto_stat_t * info)
30761+{
30762+ struct crypto_blkcipher * ctfm = NULL;
30763+ struct crypto_hash * dtfm = NULL;
30764+ cipher_plugin * cplug = inode_cipher_plugin(info->host);
30765+ digest_plugin * dplug = inode_digest_plugin(info->host);
30766+
30767+ if (cplug->alloc) {
30768+ ctfm = cplug->alloc();
30769+ if (IS_ERR(ctfm)) {
30770+ warning("edward-1364",
30771+ "Can not allocate info for %s\n",
30772+ cplug->h.desc);
30773+ return RETERR(PTR_ERR(ctfm));
30774+ }
30775+ }
30776+ info_set_cipher(info, ctfm);
30777+ if (dplug->alloc) {
30778+ dtfm = dplug->alloc();
30779+ if (IS_ERR(dtfm)) {
30780+ warning("edward-1365",
30781+ "Can not allocate info for %s\n",
30782+ dplug->h.desc);
30783+ goto unhappy_with_digest;
30784+ }
30785+ }
30786+ info_set_digest(info, dtfm);
30787+ return 0;
30788+ unhappy_with_digest:
30789+ if (cplug->free) {
30790+ cplug->free(ctfm);
30791+ info_set_cipher(info, NULL);
30792+ }
30793+ return RETERR(PTR_ERR(dtfm));
30794+}
30795+#endif
30796+
30797+static void
30798+free_crypto_tfms(crypto_stat_t * info)
30799+{
30800+ assert("edward-1366", info != NULL);
30801+ if (!info_get_cipher(info)) {
30802+ assert("edward-1601", !info_get_digest(info));
30803+ return;
30804+ }
30805+ inode_cipher_plugin(info->host)->free(info_get_cipher(info));
30806+ info_set_cipher(info, NULL);
30807+ inode_digest_plugin(info->host)->free(info_get_digest(info));
30808+ info_set_digest(info, NULL);
30809+ return;
30810+}
30811+
30812+#if 0
30813+/* create a key fingerprint for disk stat-data */
30814+static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
30815+{
30816+ int ret = -ENOMEM;
30817+ size_t blk, pad;
30818+ __u8 * dmem;
30819+ __u8 * cmem;
30820+ struct hash_desc ddesc;
30821+ struct blkcipher_desc cdesc;
30822+ struct scatterlist sg;
30823+
30824+ assert("edward-1367", info != NULL);
30825+ assert("edward-1368", info->keyid != NULL);
30826+
30827+ ddesc.tfm = info_get_digest(info);
30828+ ddesc.flags = 0;
30829+ cdesc.tfm = info_get_cipher(info);
30830+ cdesc.flags = 0;
30831+
30832+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
30833+ reiser4_ctx_gfp_mask_get());
30834+ if (!dmem)
30835+ goto exit1;
30836+
30837+ blk = crypto_blkcipher_blocksize(cdesc.tfm);
30838+
30839+ pad = data->keyid_size % blk;
30840+ pad = (pad ? blk - pad : 0);
30841+
30842+ cmem = kmalloc((size_t)data->keyid_size + pad,
30843+ reiser4_ctx_gfp_mask_get());
30844+ if (!cmem)
30845+ goto exit2;
30846+ memcpy(cmem, data->keyid, data->keyid_size);
30847+ memset(cmem + data->keyid_size, 0, pad);
30848+
30849+ sg.page = virt_to_page(cmem);
30850+ sg.offset = offset_in_page(cmem);
30851+ sg.length = data->keyid_size + pad;
30852+
30853+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
30854+ data->keyid_size + pad);
30855+ if (ret) {
30856+ warning("edward-1369",
30857+ "encryption failed flags=%x\n", cdesc.flags);
30858+ goto exit3;
30859+ }
30860+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
30861+ if (ret) {
30862+ warning("edward-1602",
30863+ "digest failed flags=%x\n", ddesc.flags);
30864+ goto exit3;
30865+ }
30866+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
30867+ exit3:
30868+ kfree(cmem);
30869+ exit2:
30870+ kfree(dmem);
30871+ exit1:
30872+ return ret;
30873+}
30874+#endif
30875+
30876+static void destroy_keyid(crypto_stat_t * info)
30877+{
30878+ assert("edward-1370", info != NULL);
30879+ assert("edward-1371", info->keyid != NULL);
30880+ kfree(info->keyid);
30881+ return;
30882+}
30883+
30884+static void __free_crypto_stat (struct inode * inode)
30885+{
30886+ crypto_stat_t * info = inode_crypto_stat(inode);
30887+ assert("edward-1372", info != NULL);
30888+
30889+ free_crypto_tfms(info);
30890+ destroy_keyid(info);
30891+ kfree(info);
30892+}
30893+
30894+#if 0
30895+static void instantiate_crypto_stat(crypto_stat_t * info)
30896+{
30897+ assert("edward-1373", info != NULL);
30898+ assert("edward-1374", info->inst == 0);
30899+ info->inst = 1;
30900+}
30901+#endif
30902+
30903+static void uninstantiate_crypto_stat(crypto_stat_t * info)
30904+{
30905+ assert("edward-1375", info != NULL);
30906+ info->inst = 0;
30907+}
30908+
30909+static int crypto_stat_instantiated(crypto_stat_t * info)
30910+{
30911+ return info->inst;
30912+}
30913+
30914+static int inode_has_cipher_key(struct inode * inode)
30915+{
30916+ assert("edward-1376", inode != NULL);
30917+ return inode_crypto_stat(inode) &&
30918+ crypto_stat_instantiated(inode_crypto_stat(inode));
30919+}
30920+
30921+static void free_crypto_stat (struct inode * inode)
30922+{
30923+ uninstantiate_crypto_stat(inode_crypto_stat(inode));
30924+ __free_crypto_stat(inode);
30925+}
30926+
30927+static int need_cipher(struct inode * inode)
30928+{
30929+ return inode_cipher_plugin(inode) !=
30930+ cipher_plugin_by_id(NONE_CIPHER_ID);
30931+}
30932+
30933+/* Create a crypto-stat and attach result to the @object.
30934+ If success is returned, then low-level cipher info contains
30935+ an instantiated key */
30936+#if 0
30937+crypto_stat_t *
30938+create_crypto_stat(struct inode * object,
30939+ crypto_data_t * data /* this contains a (uninstantiated)
30940+ cipher key imported from user
30941+ space */)
30942+{
30943+ int ret;
30944+ crypto_stat_t * info;
30945+
30946+ assert("edward-1377", data != NULL);
30947+ assert("edward-1378", need_cipher(object));
30948+
30949+ if (inode_file_plugin(object) !=
30950+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
30951+ return ERR_PTR(-EINVAL);
30952+
30953+ info = reiser4_alloc_crypto_stat(object);
30954+ if (IS_ERR(info))
30955+ return info;
30956+ ret = alloc_crypto_tfms(info);
30957+ if (ret)
30958+ goto err;
30959+ /* instantiating a key */
30960+ ret = crypto_blkcipher_setkey(info_get_cipher(info),
30961+ data->key,
30962+ data->keysize);
30963+ if (ret) {
30964+ warning("edward-1379",
30965+ "setkey failed flags=%x\n",
30966+ crypto_blkcipher_get_flags(info_get_cipher(info)));
30967+ goto err;
30968+ }
30969+ info->keysize = data->keysize;
30970+ ret = create_keyid(info, data);
30971+ if (ret)
30972+ goto err;
30973+ instantiate_crypto_stat(info);
30974+ return info;
30975+ err:
30976+ __free_crypto_stat(object);
30977+ return ERR_PTR(ret);
30978+}
30979+#endif
30980+
30981+/* increment/decrement a load counter when
30982+ attaching/detaching the crypto-stat to any object */
30983+static void load_crypto_stat(crypto_stat_t * info)
30984+{
30985+ assert("edward-1380", info != NULL);
30986+ inc_keyload_count(info);
30987+}
30988+
30989+static void unload_crypto_stat(struct inode * inode)
30990+{
30991+ crypto_stat_t * info = inode_crypto_stat(inode);
30992+ assert("edward-1381", info->keyload_count > 0);
30993+
30994+ dec_keyload_count(inode_crypto_stat(inode));
30995+ if (info->keyload_count == 0)
30996+ /* final release */
30997+ free_crypto_stat(inode);
30998+}
30999+
31000+/* attach/detach an existing crypto-stat */
31001+void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31002+{
31003+ assert("edward-1382", inode != NULL);
31004+ assert("edward-1383", info != NULL);
31005+ assert("edward-1384", inode_crypto_stat(inode) == NULL);
31006+
31007+ set_inode_crypto_stat(inode, info);
31008+ load_crypto_stat(info);
31009+}
31010+
31011+/* returns true if a crypto-stat can be attached to the @host */
31012+#if REISER4_DEBUG
31013+static int host_allows_crypto_stat(struct inode * host)
31014+{
31015+ int ret;
31016+ file_plugin * fplug = inode_file_plugin(host);
31017+
31018+ switch (fplug->h.id) {
31019+ case CRYPTCOMPRESS_FILE_PLUGIN_ID:
31020+ ret = 1;
31021+ break;
31022+ default:
31023+ ret = 0;
31024+ }
31025+ return ret;
31026+}
31027+#endif /* REISER4_DEBUG */
31028+
31029+static void reiser4_detach_crypto_stat(struct inode * inode)
31030+{
31031+ assert("edward-1385", inode != NULL);
31032+ assert("edward-1386", host_allows_crypto_stat(inode));
31033+
31034+ if (inode_crypto_stat(inode))
31035+ unload_crypto_stat(inode);
31036+ set_inode_crypto_stat(inode, NULL);
31037+}
31038+
31039+#if 0
31040+
31041+/* compare fingerprints of @child and @parent */
31042+static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31043+{
31044+ return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31045+}
31046+
31047+/* check if a crypto-stat (which is bound to @parent) can be inherited */
31048+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
31049+{
31050+ if (!need_cipher(child))
31051+ return 0;
31052+ /* the child is created */
31053+ if (!inode_crypto_stat(child))
31054+ return 1;
31055+ /* the child is looked up */
31056+ if (!inode_crypto_stat(parent))
31057+ return 0;
31058+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31059+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31060+ inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
31061+ keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
31062+}
31063+#endif
31064+
31065+/* helper functions for ->create() method of the cryptcompress plugin */
31066+static int inode_set_crypto(struct inode * object)
31067+{
31068+ reiser4_inode * info;
31069+ if (!inode_crypto_stat(object)) {
31070+ if (need_cipher(object))
31071+ return RETERR(-EINVAL);
31072+ /* the file is not to be encrypted */
31073+ return 0;
31074+ }
31075+ info = reiser4_inode_data(object);
31076+ info->extmask |= (1 << CRYPTO_STAT);
31077+ return 0;
31078+}
31079+
31080+static int inode_init_compression(struct inode * object)
31081+{
31082+ int result = 0;
31083+ assert("edward-1461", object != NULL);
31084+ if (inode_compression_plugin(object)->init)
31085+ result = inode_compression_plugin(object)->init();
31086+ return result;
31087+}
31088+
31089+static int inode_check_cluster(struct inode * object)
31090+{
31091+ assert("edward-696", object != NULL);
31092+
31093+ if (inode_cluster_size(object) < PAGE_CACHE_SIZE) {
31094+		warning("edward-1320", "Cannot support '%s' "
31095+			"logical clusters (smaller than page size)",
31096+ inode_cluster_plugin(object)->h.label);
31097+ return RETERR(-EINVAL);
31098+ }
31099+ return 0;
31100+}
31101+
31102+/* ->destroy_inode() method of the cryptcompress plugin */
31103+void destroy_inode_cryptcompress(struct inode * inode)
31104+{
31105+ assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
31106+ reiser4_detach_crypto_stat(inode);
31107+ return;
31108+}
31109+
31110+/* ->create() method of the cryptcompress plugin
31111+
31112+. install plugins
31113+. attach crypto info if specified
31114+. attach compression info if specified
31115+. attach cluster info
31116+*/
31117+int
31118+create_cryptcompress(struct inode *object, struct inode *parent,
31119+ reiser4_object_create_data * data)
31120+{
31121+ int result;
31122+ reiser4_inode *info;
31123+
31124+ assert("edward-23", object != NULL);
31125+ assert("edward-24", parent != NULL);
31126+ assert("edward-30", data != NULL);
31127+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
31128+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
31129+
31130+ info = reiser4_inode_data(object);
31131+
31132+ assert("edward-29", info != NULL);
31133+
31134+ /* set file bit */
31135+ info->plugin_mask |= (1 << PSET_FILE);
31136+
31137+ /* set crypto */
31138+ result = inode_set_crypto(object);
31139+ if (result)
31140+ goto error;
31141+ /* set compression */
31142+ result = inode_init_compression(object);
31143+ if (result)
31144+ goto error;
31145+ /* set cluster */
31146+ result = inode_check_cluster(object);
31147+ if (result)
31148+ goto error;
31149+
31150+ /* save everything in disk stat-data */
31151+ result = write_sd_by_inode_common(object);
31152+ if (!result)
31153+ return 0;
31154+ error:
31155+ reiser4_detach_crypto_stat(object);
31156+ return result;
31157+}
31158+
31159+/* ->open() method of the cryptcompress plugin */
31160+int open_object_cryptcompress(struct inode * inode, struct file * file)
31161+{
31162+ int result;
31163+ struct inode * parent;
31164+
31165+ assert("edward-1394", inode != NULL);
31166+ assert("edward-1395", file != NULL);
31167+	assert("edward-1396", file->f_dentry != NULL);
31168+ assert("edward-1397", file->f_dentry->d_inode == inode);
31169+ assert("edward-1398", file->f_dentry->d_parent != NULL);
31170+ assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31171+ assert("edward-698",
31172+ inode_file_plugin(inode) ==
31173+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31174+ result = inode_check_cluster(inode);
31175+ if (result)
31176+ return result;
31177+ result = inode_init_compression(inode);
31178+ if (result)
31179+ return result;
31180+ if (!need_cipher(inode))
31181+ /* the file is not to be ciphered */
31182+ return 0;
31183+ parent = file->f_dentry->d_parent->d_inode;
31184+ if (!inode_has_cipher_key(inode))
31185+ return RETERR(-EINVAL);
31186+ return 0;
31187+}
31188+
31189+/* returns a blocksize, the attribute of a cipher algorithm */
31190+static unsigned int
31191+cipher_blocksize(struct inode * inode)
31192+{
31193+ assert("edward-758", need_cipher(inode));
31194+ assert("edward-1400", inode_crypto_stat(inode) != NULL);
31195+ return crypto_blkcipher_blocksize
31196+ (info_get_cipher(inode_crypto_stat(inode)));
31197+}
31198+
31199+/* returns offset translated by scale factor of the crypto-algorithm */
31200+static loff_t inode_scaled_offset (struct inode * inode,
31201+ const loff_t src_off /* input offset */)
31202+{
31203+ assert("edward-97", inode != NULL);
31204+
31205+ if (!need_cipher(inode) ||
31206+ src_off == get_key_offset(reiser4_min_key()) ||
31207+ src_off == get_key_offset(reiser4_max_key()))
31208+ return src_off;
31209+
31210+ return inode_cipher_plugin(inode)->scale(inode,
31211+ cipher_blocksize(inode),
31212+ src_off);
31213+}
31214+
31215+/* returns disk cluster size */
31216+size_t inode_scaled_cluster_size(struct inode * inode)
31217+{
31218+ assert("edward-110", inode != NULL);
31219+
31220+ return inode_scaled_offset(inode, inode_cluster_size(inode));
31221+}
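inode_scaled_offset() delegates the actual scaling to the cipher plugin's ->scale() method. One plausible policy for a block cipher is to round up to the cipher block size, since the padded ciphertext of an n-byte stream never extends past the next block boundary; a hedged sketch (not the actual plugin code, which may reserve more):

	/* illustrative ->scale(): round src_off up to a blocksize multiple */
	static loff_t scale_example(struct inode *inode /* unused */,
				    size_t blocksize, loff_t src_off)
	{
		return ((src_off + blocksize - 1) / blocksize) * blocksize;
	}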
31222+
31223+static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
31224+{
31225+ return (clust_to_off(clust->index, inode) >= inode->i_size);
31226+}
31227+
31228+/* set number of cluster pages */
31229+static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
31230+{
31231+ reiser4_slide_t *win;
31232+
31233+ assert("edward-180", clust != NULL);
31234+ assert("edward-1040", inode != NULL);
31235+
31236+ win = clust->win;
31237+ if (!win) {
31238+ /* NOTE-EDWARD: i_size should be protected */
31239+ clust->nr_pages =
31240+ count_to_nrpages(fsize_to_count(clust, inode));
31241+ return;
31242+ }
31243+ assert("edward-1176", clust->op != PCL_UNKNOWN);
31244+ assert("edward-1064", win->off + win->count + win->delta != 0);
31245+
31246+ if (win->stat == HOLE_WINDOW &&
31247+ win->off == 0 && win->count == inode_cluster_size(inode)) {
31248+ /* special case: we start write hole from fake cluster */
31249+ clust->nr_pages = 0;
31250+ return;
31251+ }
31252+ clust->nr_pages =
31253+ count_to_nrpages(max_count(win->off + win->count + win->delta,
31254+ fsize_to_count(clust, inode)));
31255+ return;
31256+}
31257+
31258+/* ->key_by_inode() method of the cryptcompress plugin */
31259+/* see plugin/plugin.h for details */
31260+int
31261+key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
31262+{
31263+ loff_t clust_off;
31264+
31265+ assert("edward-64", inode != 0);
31266+ // assert("edward-112", ergo(off != get_key_offset(reiser4_max_key()), !off_to_cloff(off, inode)));
31267+ /* don't come here with other offsets */
31268+
31269+ clust_off =
31270+ (off ==
31271+ get_key_offset(reiser4_max_key())? get_key_offset(reiser4_max_key()) :
31272+ off_to_clust_to_off(off, inode));
31273+
31274+ key_by_inode_and_offset_common(inode, 0, key);
31275+ set_key_offset(key,
31276+ (__u64) (!inode_crypto_stat(inode) ? clust_off :
31277+ inode_scaled_offset(inode, clust_off)));
31278+ return 0;
31279+}
31280+
31281+/* plugin->flow_by_inode */
31282+int
31283+flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
31284+ const char __user *buf /* user level buffer */ ,
31285+ int user /* 1 if @buf is of user space, 0 - if it is
31286+ kernel space */ ,
31287+ loff_t size /* buffer size */ ,
31288+ loff_t off /* offset to start io from */ ,
31289+ rw_op op /* READ or WRITE */ ,
31290+ flow_t * f /* resulting flow */ )
31291+{
31292+ assert("edward-436", f != NULL);
31293+ assert("edward-149", inode != NULL);
31294+ assert("edward-150", inode_file_plugin(inode) != NULL);
31295+
31296+ f->length = size;
31297+ memcpy(&f->data, &buf, sizeof(buf));
31298+ f->user = user;
31299+ f->op = op;
31300+
31301+ if (op == WRITE_OP && user == 1)
31302+ return 0;
31303+ return key_by_inode_cryptcompress(inode, off, &f->key);
31304+}
31305+
31306+static int
31307+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
31308+ znode_lock_mode lock_mode)
31309+{
31310+ coord_t *coord;
31311+
31312+ assert("edward-704", hint != NULL);
31313+ assert("edward-1089", !hint_is_valid(hint));
31314+ assert("edward-706", hint->lh.owner == NULL);
31315+
31316+ coord = &hint->ext_coord.coord;
31317+
31318+	if (!hint_is_set(hint) || hint->mode != lock_mode)
31319+ /* hint either not set or set by different operation */
31320+ return RETERR(-E_REPEAT);
31321+
31322+ if (get_key_offset(key) != hint->offset)
31323+ /* hint is set for different key */
31324+ return RETERR(-E_REPEAT);
31325+
31326+ assert("edward-707", reiser4_schedulable());
31327+
31328+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
31329+ key, &hint->lh, lock_mode,
31330+ ZNODE_LOCK_LOPRI);
31331+}
31332+
31333+/* reserve disk space when writing a logical cluster */
31334+static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
31335+{
31336+ int result = 0;
31337+
31338+ assert("edward-965", reiser4_schedulable());
31339+ assert("edward-439", inode != NULL);
31340+ assert("edward-440", clust != NULL);
31341+ assert("edward-441", clust->pages != NULL);
31342+
31343+ if (clust->nr_pages == 0) {
31344+ assert("edward-1152", clust->win != NULL);
31345+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
31346+		/* don't reserve space for a fake disk cluster */
31347+ return 0;
31348+ }
31349+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
31350+
31351+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
31352+ estimate_update_cluster(inode),
31353+ BA_CAN_COMMIT);
31354+ if (result)
31355+ return result;
31356+ clust->reserved = 1;
31357+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
31358+ estimate_update_cluster(inode));
31359+#if REISER4_DEBUG
31360+ clust->reserved_prepped = estimate_update_cluster(inode);
31361+ clust->reserved_unprepped = estimate_insert_cluster(inode);
31362+#endif
31363+ /* there can be space grabbed by txnmgr_force_commit_all */
31364+ return 0;
31365+}
31366+
31367+/* free reserved disk space if writing a logical cluster fails */
31368+static void
31369+free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
31370+{
31371+ assert("edward-967", clust->reserved == 1);
31372+
31373+ cluster_reserved2free(count);
31374+ clust->reserved = 0;
31375+}
31376+
31377+/* The core search procedure of the cryptcompress plugin.
31378+ If returned value is not cbk_errored, then current znode is locked */
31379+static int find_cluster_item(hint_t * hint,
31380+ const reiser4_key * key, /* key of the item we are
31381+ looking for */
31382+ znode_lock_mode lock_mode /* which lock */ ,
31383+ ra_info_t * ra_info, lookup_bias bias, __u32 flags)
31384+{
31385+ int result;
31386+ reiser4_key ikey;
31387+ int went_right = 0;
31388+ coord_t *coord = &hint->ext_coord.coord;
31389+ coord_t orig = *coord;
31390+
31391+ assert("edward-152", hint != NULL);
31392+
31393+ if (!hint_is_valid(hint)) {
31394+ result = cryptcompress_hint_validate(hint, key, lock_mode);
31395+ if (result == -E_REPEAT)
31396+ goto traverse_tree;
31397+ else if (result) {
31398+ assert("edward-1216", 0);
31399+ return result;
31400+ }
31401+ hint_set_valid(hint);
31402+ }
31403+ assert("edward-709", znode_is_any_locked(coord->node));
31404+
31405+	/* An in-place lookup is going on here; we just need to check
31406+	   whether the next item at @coord matches the hinted @key */
31407+
31408+ if (equal_to_rdk(coord->node, key)) {
31409+ result = goto_right_neighbor(coord, &hint->lh);
31410+ if (result == -E_NO_NEIGHBOR) {
31411+ assert("edward-1217", 0);
31412+ return RETERR(-EIO);
31413+ }
31414+ if (result)
31415+ return result;
31416+ assert("edward-1218", equal_to_ldk(coord->node, key));
31417+ went_right = 1;
31418+ } else {
31419+ coord->item_pos++;
31420+ coord->unit_pos = 0;
31421+ coord->between = AT_UNIT;
31422+ }
31423+ result = zload(coord->node);
31424+ if (result)
31425+ return result;
31426+ assert("edward-1219", !node_is_empty(coord->node));
31427+
31428+ if (!coord_is_existing_item(coord)) {
31429+ zrelse(coord->node);
31430+ goto not_found;
31431+ }
31432+ item_key_by_coord(coord, &ikey);
31433+ zrelse(coord->node);
31434+ if (!keyeq(key, &ikey))
31435+ goto not_found;
31436+ /* Ok, item is found, update node counts */
31437+ if (went_right)
31438+ dclust_inc_extension_ncount(hint);
31439+ return CBK_COORD_FOUND;
31440+
31441+ not_found:
31442+ assert("edward-1220", coord->item_pos > 0);
31443+ //coord->item_pos--;
31444+ /* roll back */
31445+ *coord = orig;
31446+ ON_DEBUG(coord_update_v(coord));
31447+ return CBK_COORD_NOTFOUND;
31448+
31449+ traverse_tree:
31450+ assert("edward-713", hint->lh.owner == NULL);
31451+ assert("edward-714", reiser4_schedulable());
31452+
31453+ reiser4_unset_hint(hint);
31454+ dclust_init_extension(hint);
31455+ coord_init_zero(coord);
31456+ result = coord_by_key(current_tree, key, coord, &hint->lh,
31457+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
31458+ CBK_UNIQUE | flags, ra_info);
31459+ if (cbk_errored(result))
31460+ return result;
31461+	if (result == CBK_COORD_FOUND)
31462+ dclust_inc_extension_ncount(hint);
31463+ hint_set_valid(hint);
31464+ return result;
31465+}
31466+
31467+/* This function is called by deflate[inflate] manager when
31468+ creating a transformed/plain stream to check if we should
31469+ create/cut some overhead. If this returns true, then @oh
31470+ contains the size of this overhead.
31471+ */
31472+static int
31473+need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
31474+ rw_op rw, int * oh)
31475+{
31476+ tfm_cluster_t * tc = &clust->tc;
31477+ switch (rw) {
31478+ case WRITE_OP: /* estimate align */
31479+ *oh = tc->len % cipher_blocksize(inode);
31480+ if (*oh != 0)
31481+ return 1;
31482+ break;
31483+ case READ_OP: /* estimate cut */
31484+ *oh = *(tfm_output_data(clust) + tc->len - 1);
31485+ break;
31486+ default:
31487+ impossible("edward-1401", "bad option");
31488+ }
31489+ return (tc->len != tc->lsize);
31490+}
31491+
31492+/* create/cut an overhead of transformed/plain stream */
31493+static void
31494+align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
31495+{
31496+ int oh;
31497+ cipher_plugin * cplug = inode_cipher_plugin(inode);
31498+
31499+ assert("edward-1402", need_cipher(inode));
31500+
31501+ if (!need_cut_or_align(inode, clust, rw, &oh))
31502+ return;
31503+ switch (rw) {
31504+ case WRITE_OP: /* do align */
31505+ clust->tc.len +=
31506+ cplug->align_stream(tfm_input_data(clust) +
31507+ clust->tc.len, clust->tc.len,
31508+ cipher_blocksize(inode));
31509+ *(tfm_input_data(clust) + clust->tc.len - 1) =
31510+ cipher_blocksize(inode) - oh;
31511+ break;
31512+ case READ_OP: /* do cut */
31513+ assert("edward-1403", oh <= cipher_blocksize(inode));
31514+ clust->tc.len -= oh;
31515+ break;
31516+ default:
31517+ impossible("edward-1404", "bad option");
31518+ }
31519+ return;
31520+}
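A worked example of the align/cut pair, assuming a 16-byte cipher block: a 1000-byte stream has oh = 1000 % 16 = 8, so ->align_stream() pads it up to 1008 bytes and the control byte (the last padded byte) is set to 16 - 8 = 8; on the read side that control byte is read back as the cut size and the same 8 bytes are trimmed, restoring the 1000-byte stream.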
31521+
31522+/* the following two functions are to evaluate results
31523+ of compression transform */
31524+static unsigned
31525+max_cipher_overhead(struct inode * inode)
31526+{
31527+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
31528+ return 0;
31529+ return cipher_blocksize(inode);
31530+}
31531+
31532+static int deflate_overhead(struct inode *inode)
31533+{
31534+ return (inode_compression_plugin(inode)->
31535+ checksum ? DC_CHECKSUM_SIZE : 0);
31536+}
31537+
31538+static unsigned deflate_overrun(struct inode * inode, int ilen)
31539+{
31540+ return coa_overrun(inode_compression_plugin(inode), ilen);
31541+}
31542+
31543+/* Estimate the compressibility of a logical cluster by the various
31544+   policies represented by the compression mode plugin.
31545+   If this returns false, then the compressor won't be called for
31546+   the cluster at index @index.
31547+*/
31548+static int should_compress(tfm_cluster_t * tc, cloff_t index,
31549+ struct inode *inode)
31550+{
31551+ compression_plugin *cplug = inode_compression_plugin(inode);
31552+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
31553+
31554+ assert("edward-1321", tc->len != 0);
31555+ assert("edward-1322", cplug != NULL);
31556+ assert("edward-1323", mplug != NULL);
31557+
31558+ return /* estimate by size */
31559+ (cplug->min_size_deflate ?
31560+ tc->len >= cplug->min_size_deflate() :
31561+ 1) &&
31562+ /* estimate by compression mode plugin */
31563+ (mplug->should_deflate ?
31564+ mplug->should_deflate(inode, index) :
31565+ 1);
31566+}
31567+
31568+/* Evaluate the results of the compression transform.
31569+   Returns true if these results should be accepted */
31570+static int
31571+save_compressed(int size_before, int size_after, struct inode * inode)
31572+{
31573+ return (size_after + deflate_overhead(inode) +
31574+ max_cipher_overhead(inode) < size_before);
31575+}
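In other words, compression is accepted only when it saves more than the overheads it would introduce. For example, with the 4-byte checksum and, say, a 16-byte cipher block, a 65536-byte logical cluster must deflate to at most 65515 bytes (65515 + 4 + 16 < 65536) for the compressed stream to be kept.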
31576+
31577+/* Guess result of the evaluation above */
31578+static int
31579+need_inflate(reiser4_cluster_t * clust, struct inode *inode,
31580+ int encrypted /* is cluster encrypted */ )
31581+{
31582+ tfm_cluster_t *tc = &clust->tc;
31583+
31584+ assert("edward-142", tc != 0);
31585+ assert("edward-143", inode != NULL);
31586+
31587+ return tc->len <
31588+ (encrypted ?
31589+ inode_scaled_offset(inode, tc->lsize) :
31590+ tc->lsize);
31591+}
31592+
31593+/* If results of compression were accepted, then we add
31594+ a checksum to catch possible disk cluster corruption.
31595+ The following is a format of the data stored in disk clusters:
31596+
31597+ data This is (transformed) logical cluster.
31598+ cipher_overhead This is created by ->align() method
31599+ of cipher plugin. May be absent.
31600+ checksum (4) This is created by ->checksum method
31601+ of compression plugin to check
31602+ integrity. May be absent.
31603+
31604+ Crypto overhead format:
31605+
31606+ data
31607+ control_byte (1) contains aligned overhead size:
31608+ 1 <= overhead <= cipher_blksize
31609+*/
31610+/* Append a checksum at the end of a transformed stream */
31611+static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
31612+{
31613+ __u32 checksum;
31614+
31615+ assert("edward-1309", tc != NULL);
31616+ assert("edward-1310", tc->len > 0);
31617+ assert("edward-1311", cplug->checksum != NULL);
31618+
31619+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
31620+ put_unaligned(cpu_to_le32(checksum),
31621+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
31622+ tc->len += (int)DC_CHECKSUM_SIZE;
31623+}
31624+
31625+/* Check a disk cluster checksum.
31626+ Returns 0 if checksum is correct, otherwise returns 1 */
31627+static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
31628+{
31629+ assert("edward-1312", tc != NULL);
31630+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
31631+ assert("edward-1314", cplug->checksum != NULL);
31632+
31633+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
31634+ tc->len - (int)DC_CHECKSUM_SIZE) !=
31635+ le32_to_cpu(get_unaligned((d32 *)
31636+ (tfm_stream_data(tc, INPUT_STREAM)
31637+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
31638+ warning("edward-156",
31639+			"Bad disk cluster checksum %d (should be %d). Fsck?\n",
31640+ (int)le32_to_cpu
31641+ (get_unaligned((d32 *)
31642+ (tfm_stream_data(tc, INPUT_STREAM) +
31643+ tc->len - (int)DC_CHECKSUM_SIZE))),
31644+ (int)cplug->checksum
31645+ (tfm_stream_data(tc, INPUT_STREAM),
31646+ tc->len - (int)DC_CHECKSUM_SIZE));
31647+ return 1;
31648+ }
31649+ tc->len -= (int)DC_CHECKSUM_SIZE;
31650+ return 0;
31651+}
31652+
31653+/* get input/output stream for some transform action */
31654+int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
31655+ tfm_stream_id id)
31656+{
31657+ size_t size = inode_scaled_cluster_size(inode);
31658+
31659+ assert("edward-901", tc != NULL);
31660+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
31661+
31662+ if (cluster_get_tfm_act(tc) == TFMA_WRITE)
31663+ size += deflate_overrun(inode, inode_cluster_size(inode));
31664+
31665+ if (!tfm_stream(tc, id) && id == INPUT_STREAM)
31666+ alternate_streams(tc);
31667+ if (!tfm_stream(tc, id))
31668+ return alloc_tfm_stream(tc, size, id);
31669+
31670+ assert("edward-902", tfm_stream_is_set(tc, id));
31671+
31672+ if (tfm_stream_size(tc, id) < size)
31673+ return realloc_tfm_stream(tc, size, id);
31674+ return 0;
31675+}
31676+
31677+/* Common deflate manager */
31678+int reiser4_deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
31679+{
31680+ int result = 0;
31681+ int compressed = 0;
31682+ int encrypted = 0;
31683+ tfm_cluster_t * tc = &clust->tc;
31684+ compression_plugin * coplug;
31685+
31686+ assert("edward-401", inode != NULL);
31687+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
31688+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
31689+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
31690+
31691+ coplug = inode_compression_plugin(inode);
31692+ if (should_compress(tc, clust->index, inode)) {
31693+ /* try to compress, discard bad results */
31694+ __u32 dst_len;
31695+ compression_mode_plugin * mplug =
31696+ inode_compression_mode_plugin(inode);
31697+ assert("edward-602", coplug != NULL);
31698+ assert("edward-1423", coplug->compress != NULL);
31699+
31700+ result = grab_coa(tc, coplug);
31701+ if (result) {
31702+ warning("edward-1424",
31703+ "alloc_coa failed with ret=%d, skipped compression",
31704+ result);
31705+ goto cipher;
31706+ }
31707+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31708+ if (result) {
31709+ warning("edward-1425",
31710+ "alloc stream failed with ret=%d, skipped compression",
31711+ result);
31712+ goto cipher;
31713+ }
31714+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
31715+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
31716+ tfm_input_data(clust), tc->len,
31717+ tfm_output_data(clust), &dst_len);
31718+ /* make sure we didn't overwrite extra bytes */
31719+ assert("edward-603",
31720+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
31721+
31722+ /* evaluate results of compression transform */
31723+ if (save_compressed(tc->len, dst_len, inode)) {
31724+ /* good result, accept */
31725+ tc->len = dst_len;
31726+ if (mplug->accept_hook != NULL) {
31727+ result = mplug->accept_hook(inode, clust->index);
31728+ if (result)
31729+ warning("edward-1426",
31730+ "accept_hook failed with ret=%d",
31731+ result);
31732+ }
31733+ compressed = 1;
31734+ } else {
31736+ /* bad result, discard */
31737+#if REISER4_DEBUG
31738+ if (cluster_is_complete(clust, inode))
31739+ warning("edward-1338",
31740+ "incompressible cluster %lu (inode %llu)",
31741+ clust->index,
31742+ (unsigned long long)get_inode_oid(inode));
31743+#endif
31744+ if (mplug->discard_hook != NULL &&
31745+ cluster_is_complete(clust, inode)) {
31746+ result = mplug->discard_hook(inode,
31747+ clust->index);
31748+ if (result)
31749+ warning("edward-1427",
31750+ "discard_hook failed with ret=%d",
31751+ result);
31752+ }
31753+ }
31754+ }
31755+ cipher:
31756+ if (need_cipher(inode)) {
31757+ cipher_plugin * ciplug;
31758+ struct blkcipher_desc desc;
31759+ struct scatterlist src;
31760+ struct scatterlist dst;
31761+
31762+ ciplug = inode_cipher_plugin(inode);
31763+ desc.tfm = info_get_cipher(inode_crypto_stat(inode));
31764+ desc.flags = 0;
31765+ if (compressed)
31766+ alternate_streams(tc);
31767+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31768+ if (result)
31769+ return result;
31770+
31771+ align_or_cut_overhead(inode, clust, WRITE_OP);
31772+ src.page = virt_to_page(tfm_input_data(clust));
31773+ src.offset = offset_in_page(tfm_input_data(clust));
31774+ src.length = tc->len;
31775+
31776+ dst.page = virt_to_page(tfm_output_data(clust));
31777+ dst.offset = offset_in_page(tfm_output_data(clust));
31778+ dst.length = tc->len;
31779+
31780+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
31781+ if (result) {
31782+ warning("edward-1405",
31783+ "encryption failed flags=%x\n", desc.flags);
31784+ return result;
31785+ }
31786+ encrypted = 1;
31787+ }
31788+ if (compressed && coplug->checksum != NULL)
31789+ dc_set_checksum(coplug, tc);
31790+ if (!compressed && !encrypted)
31791+ alternate_streams(tc);
31792+ return result;
31793+}
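+
+/* Summary of the write-side pipeline implemented above:
+   1) compress the logical cluster (results are discarded if they do
+      not save enough space, see save_compressed());
+   2) encrypt the (possibly compressed) stream after padding it via
+      align_or_cut_overhead();
+   3) append a checksum if the data was compressed and the compression
+      plugin provides ->checksum.
+   If neither transform fired, the streams are swapped so that the data
+   still ends up in the output stream. */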
31794+
31795+/* Common inflate manager. */
31796+int reiser4_inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
31797+{
31798+ int result = 0;
31799+ int transformed = 0;
31800+ tfm_cluster_t * tc = &clust->tc;
31801+ compression_plugin * coplug;
31802+
31803+ assert("edward-905", inode != NULL);
31804+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
31805+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
31806+ assert("edward-1349", tc->act == TFMA_READ);
31807+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
31808+
31809+ /* Handle a checksum (if any) */
31810+ coplug = inode_compression_plugin(inode);
31811+ if (need_inflate(clust, inode, need_cipher(inode)) &&
31812+ coplug->checksum != NULL) {
31813+ result = dc_check_checksum(coplug, tc);
31814+ if (unlikely(result)) {
31815+ warning("edward-1460",
31816+ "Inode %llu: disk cluster %lu looks corrupted",
31817+ (unsigned long long)get_inode_oid(inode),
31818+ clust->index);
31819+ return RETERR(-EIO);
31820+ }
31821+ }
31822+ if (need_cipher(inode)) {
31823+ cipher_plugin * ciplug;
31824+ struct blkcipher_desc desc;
31825+ struct scatterlist src;
31826+ struct scatterlist dst;
31827+
31828+ ciplug = inode_cipher_plugin(inode);
31829+ desc.tfm = info_get_cipher(inode_crypto_stat(inode));
31830+ desc.flags = 0;
31831+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31832+ if (result)
31833+ return result;
31834+ assert("edward-909", tfm_cluster_is_set(tc));
31835+
31836+ src.page = virt_to_page(tfm_input_data(clust));
31837+ src.offset = offset_in_page(tfm_input_data(clust));
31838+ src.length = tc->len;
31839+
31840+ dst.page = virt_to_page(tfm_output_data(clust));
31841+ dst.offset = offset_in_page(tfm_output_data(clust));
31842+ dst.length = tc->len;
31843+
31844+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
31845+ if (result) {
31846+ warning("edward-1600", "decrypt failed flags=%x\n",
31847+ desc.flags);
31848+ return result;
31849+ }
31850+ align_or_cut_overhead(inode, clust, READ_OP);
31851+ transformed = 1;
31852+ }
31853+ if (need_inflate(clust, inode, 0)) {
31854+ unsigned dst_len = inode_cluster_size(inode);
31855+ if (transformed)
31856+ alternate_streams(tc);
31857+
31858+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31859+ if (result)
31860+ return result;
31861+ assert("edward-1305", coplug->decompress != NULL);
31862+ assert("edward-910", tfm_cluster_is_set(tc));
31863+
31864+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
31865+ tfm_input_data(clust), tc->len,
31866+ tfm_output_data(clust), &dst_len);
31867+ /* set the decompressed length and check it against lsize */
31868+ tc->len = dst_len;
31869+ assert("edward-157", dst_len == tc->lsize);
31870+ transformed = 1;
31871+ }
31872+ if (!transformed)
31873+ alternate_streams(tc);
31874+ return result;
31875+}
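+
+/* The read-side pipeline above mirrors reiser4_deflate_cluster() in
+   reverse: first verify and strip the checksum, then decrypt and cut
+   the cipher overhead, then decompress; tc->lsize serves as the
+   expected size of the recovered logical cluster. */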
31876+
31877+/* This is the implementation of the readpage method of struct
31878+ address_space_operations for the cryptcompress plugin. */
31879+int readpage_cryptcompress(struct file *file, struct page *page)
31880+{
31881+ reiser4_context *ctx;
31882+ reiser4_cluster_t clust;
31883+ item_plugin *iplug;
31884+ int result;
31885+
31886+ assert("edward-88", PageLocked(page));
31887+ assert("vs-976", !PageUptodate(page));
31888+ assert("edward-89", page->mapping && page->mapping->host);
31889+
31890+ ctx = reiser4_init_context(page->mapping->host->i_sb);
31891+ if (IS_ERR(ctx)) {
31892+ unlock_page(page);
31893+ return PTR_ERR(ctx);
31894+ }
31895+ assert("edward-113",
31896+ ergo(file != NULL,
31897+ page->mapping == file->f_dentry->d_inode->i_mapping));
31898+
31899+ if (PageUptodate(page)) {
31900+ warning("edward-1338", "page is already uptodate\n");
31901+ unlock_page(page);
31902+ reiser4_exit_context(ctx);
31903+ return 0;
31904+ }
31905+ cluster_init_read(&clust, NULL);
31906+ clust.file = file;
31907+ iplug = item_plugin_by_id(CTAIL_ID);
31908+ if (!iplug->s.file.readpage) {
31909+ unlock_page(page);
31910+ put_cluster_handle(&clust);
31911+ reiser4_exit_context(ctx);
31912+ return -EINVAL;
31913+ }
31914+ result = iplug->s.file.readpage(&clust, page);
31915+
31916+ assert("edward-1459", !PageLocked(page));
31917+ assert("edward-64", ergo(result == 0, PageUptodate(page)));
31918+ put_cluster_handle(&clust);
31919+ reiser4_exit_context(ctx);
31920+ return result;
31921+}
31922+
31923+/* how many pages will be captured */
31924+static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
31925+{
31926+ switch (clust->op) {
31927+ case PCL_APPEND:
31928+ return clust->nr_pages;
31929+ case PCL_TRUNCATE:
31930+ assert("edward-1179", clust->win != NULL);
31931+ return count_to_nrpages(clust->win->off + clust->win->count);
31932+ default:
31933+ impossible("edward-1180", "bad page cluster option");
31934+ return 0;
31935+ }
31936+}
31937+
31938+static void set_cluster_pages_dirty(reiser4_cluster_t * clust)
31939+{
31940+ int i;
31941+ struct page *pg;
31942+ int nrpages = cluster_nrpages_to_capture(clust);
31943+
31944+ for (i = 0; i < nrpages; i++) {
31945+
31946+ pg = clust->pages[i];
31947+ assert("edward-968", pg != NULL);
31948+ lock_page(pg);
31949+ assert("edward-1065", PageUptodate(pg));
31950+ reiser4_set_page_dirty_internal(pg);
31951+ unlock_page(pg);
31952+ mark_page_accessed(pg);
31953+ }
31954+}
31955+
31956+static void clear_cluster_pages_dirty(reiser4_cluster_t * clust)
31957+{
31958+ int i;
31959+ assert("edward-1275", clust != NULL);
31960+
31961+ for (i = 0; i < clust->nr_pages; i++) {
31962+ assert("edward-1276", clust->pages[i] != NULL);
31963+
31964+ lock_page(clust->pages[i]);
31965+ if (PageDirty(clust->pages[i])) {
31966+ assert("edward-1277", PageUptodate(clust->pages[i]));
31967+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
31968+ }
31969+#if REISER4_DEBUG
31970+ else
31971+ /* Race between flush and write:
31972+ some pages became clean while write() (or another
31973+ process which modifies data) was capturing the cluster. */
31974+ warning("edward-985", "Page of index %lu (inode %llu)"
31975+ " is not dirty\n", clust->pages[i]->index,
31976+ (unsigned long long)get_inode_oid(clust->
31977+ pages[i]->
31978+ mapping->
31979+ host));
31980+#endif
31981+ unlock_page(clust->pages[i]);
31982+ }
31983+}
31984+
31985+/* update i_size by window */
31986+static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
31987+{
31988+ loff_t size;
31989+ reiser4_slide_t *win;
31990+
31991+ assert("edward-1181", clust != NULL);
31992+ assert("edward-1182", inode != NULL);
31993+
31994+ win = clust->win;
31995+ assert("edward-1183", win != NULL);
31996+ assert("edward-1183", win->count != 0);
31997+
31998+ size = clust_to_off(clust->index, inode) + win->off;
31999+
32000+ switch (clust->op) {
32001+ case PCL_APPEND:
32002+ if (size + win->count <= inode->i_size)
32003+ /* overwrite only */
32004+ return;
32005+ size += win->count;
32006+ break;
32007+ case PCL_TRUNCATE:
32008+ break;
32009+ default:
32010+ impossible("edward-1184", "bad page cluster option");
32011+ break;
32012+ }
32013+ inode_check_scale_nolock(inode, inode->i_size, size);
32014+ inode->i_size = size;
32015+ return;
32016+}
32017+
32018+/* Check in page cluster modifications:
32019+ . make the jnode dirty, if it wasn't already;
32020+ . reserve space for a disk cluster update by the flush algorithm, if needed;
32021+ . clean up old references (if any);
32022+ . put pages (grabbed in this thread) which will be truncated.
32023+*/
32024+static void
32025+make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32026+ loff_t * old_isize, struct inode *inode)
32027+{
32028+ int i;
32029+ int old_nrpages;
32030+ int new_nrpages = cluster_nrpages_to_capture(clust);
32031+
32032+ assert("edward-973", new_nrpages > 0);
32033+ assert("edward-221", node != NULL);
32034+ assert("edward-971", clust->reserved == 1);
32035+ assert_spin_locked(&(node->guard));
32036+ assert("edward-972", node->page_count <= cluster_nrpages(inode));
32037+ assert("edward-1263",
32038+ clust->reserved_prepped == estimate_update_cluster(inode));
32039+ assert("edward-1264", clust->reserved_unprepped == 0);
32040+
32041+ if (JF_ISSET(node, JNODE_DIRTY)) {
32042+ /* someone has modified this cluster, but
32043+ the modifications are not committed yet */
32044+ old_nrpages =
32045+ count_to_nrpages(cnt_to_clcnt(*old_isize,
32046+ clust->index, inode));
32047+ /* free space which is already reserved */
32048+ free_reserved4cluster(inode, clust,
32049+ estimate_update_cluster(inode));
32050+ /* put old references */
32051+ for (i = 0; i < old_nrpages; i++) {
32052+ assert("edward-975", clust->pages[i]);
32053+ assert("edward-1185", PageUptodate(clust->pages[i]));
32054+
32055+ page_cache_release(clust->pages[i]);
32056+#if REISER4_DEBUG
32057+ cryptcompress_inode_data(inode)->pgcount --;
32058+#endif
32059+ }
32060+ } else {
32061+ /* no captured pages */
32062+ assert("edward-1043", node->page_count == 0);
32063+ jnode_make_dirty_locked(node);
32064+ clust->reserved = 0;
32065+ }
32066+ /* put pages that will be truncated (if any) */
32067+ for (i = new_nrpages; i < clust->nr_pages; i++) {
32068+ assert("edward-1433", clust->pages[i]);
32069+ assert("edward-1434", PageUptodate(clust->pages[i]));
32070+ page_cache_release(clust->pages[i]);
32071+#if REISER4_DEBUG
32072+ cryptcompress_inode_data(inode)->pgcount --;
32073+#endif
32074+ }
32075+#if REISER4_DEBUG
32076+ clust->reserved_prepped -= estimate_update_cluster(inode);
32077+ node->page_count = new_nrpages;
32078+#endif
32079+ return;
32080+}
32081+
32082+/* This function spawns a transaction and
32083+ is called by any thread as a final step in page cluster modification.
32084+*/
32085+static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
32086+{
32087+ int result = 0;
32088+ loff_t old_size;
32089+ jnode *node;
32090+
32091+ assert("edward-1029", clust != NULL);
32092+ assert("edward-1030", clust->reserved == 1);
32093+ assert("edward-1031", clust->nr_pages != 0);
32094+ assert("edward-1032", clust->pages != NULL);
32095+ assert("edward-1033", clust->pages[0] != NULL);
32096+
32097+ node = jprivate(clust->pages[0]);
32098+ assert("edward-1035", node != NULL);
32099+ assert("edward-1446", jnode_is_cluster_page(node));
32100+
32101+ spin_lock_jnode(node);
32102+
32103+ old_size = inode->i_size;
32104+ if (clust->win)
32105+ inode_set_new_size(clust, inode);
32106+
32107+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32108+ if (result)
32109+ goto exit;
32110+ make_cluster_jnode_dirty_locked(clust, node, &old_size, inode);
32111+ exit:
32112+ spin_unlock_jnode(node);
32113+ jput(node);
32114+ return result;
32115+}
32116+
32117+/* Collect unlocked cluster pages for any modifications and attach a jnode.
32118+ We allocate only one jnode per cluster; it is bound to the first
32119+ page of the cluster, so we hold an extra reference that lives as long
32120+ as the jnode does. Other references will be cleaned up at flush time.
32121+*/
32122+static int
32123+grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
32124+{
32125+ int i;
32126+ int result = 0;
32127+ jnode *node = NULL;
32128+
32129+ assert("edward-182", clust != NULL);
32130+ assert("edward-183", clust->pages != NULL);
32131+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32132+
32133+ if (clust->nr_pages == 0)
32134+ return 0;
32135+
32136+ for (i = 0; i < clust->nr_pages; i++) {
32137+
32138+ assert("edward-1044", clust->pages[i] == NULL);
32139+
32140+ clust->pages[i] =
32141+ find_or_create_page(inode->i_mapping,
32142+ clust_to_pg(clust->index, inode) + i,
32143+ reiser4_ctx_gfp_mask_get());
32144+ if (!clust->pages[i]) {
32145+ result = RETERR(-ENOMEM);
32146+ break;
32147+ }
32148+ if (i == 0) {
32149+ node = jnode_of_page(clust->pages[i]);
32150+ if (IS_ERR(node)) {
32151+ result = PTR_ERR(node);
32152+ unlock_page(clust->pages[i]);
32153+ break;
32154+ }
32155+ JF_SET(node, JNODE_CLUSTER_PAGE);
32156+ unlock_page(clust->pages[i]);
32157+ assert("edward-919", node);
32158+ continue;
32159+ }
32160+ unlock_page(clust->pages[i]);
32161+ }
32162+ if (result) {
32163+ while (i)
32164+ page_cache_release(clust->pages[--i]);
32165+ if (node && !IS_ERR(node))
32166+ jput(node);
32167+ return result;
32168+ }
32169+ assert("edward-920", jprivate(clust->pages[0]));
32170+#if REISER4_DEBUG
32171+ cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
32172+#endif
32173+ return 0;
32174+}
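+
+/* The jnode attached above is anchored at the first page only; later
+   stages retrieve it from there (sketch, cf. try_capture_cluster()):
+
+	jnode *node = jprivate(clust->pages[0]);
+	spin_lock_jnode(node);
+	...
+
+   The JNODE_CLUSTER_PAGE flag set here marks it as a per-cluster,
+   rather than per-page, jnode. */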
32175+
32176+/* Collect unlocked cluster pages only for read (not to modify) */
32177+int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32178+{
32179+ int i;
32180+ int result = 0;
32181+
32182+ assert("edward-1428", inode != NULL);
32183+ assert("edward-1429", inode->i_mapping != NULL);
32184+ assert("edward-787", clust != NULL);
32185+ assert("edward-788", clust->pages != NULL);
32186+ assert("edward-789", clust->nr_pages != 0);
32187+ assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
32188+
32189+ for (i = 0; i < clust->nr_pages; i++) {
32190+ clust->pages[i] =
32191+ find_or_create_page(inode->i_mapping,
32192+ clust_to_pg(clust->index, inode) + i,
32193+ reiser4_ctx_gfp_mask_get());
32194+ if (!clust->pages[i]) {
32195+ result = RETERR(-ENOMEM);
32196+ break;
32197+ }
32198+ unlock_page(clust->pages[i]);
32199+ }
32200+ if (result)
32201+ while (i)
32202+ page_cache_release(clust->pages[--i]);
32203+ return result;
32204+}
32205+
32206+/* @node might be attached by reiser4_writepage(), not by
32207+ cryptcompress plugin code, but emergency flush should
32208+ understand that pages of cryptcompress files are not
32209+ flushable.
32210+*/
32211+#if 0
32212+int jnode_of_cluster(const jnode * node, struct page * page)
32213+{
32214+ assert("edward-1339", node != NULL);
32215+ assert("edward-1340", page != NULL);
32216+ assert("edward-1341", page->mapping != NULL);
32217+ assert("edward-1342", page->mapping->host != NULL);
32218+ assert("edward-1343",
32219+ ergo(jnode_is_unformatted(node),
32220+ get_inode_oid(page->mapping->host) ==
32221+ node->key.j.objectid));
32222+ if (inode_file_plugin(page->mapping->host) ==
32223+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) {
32224+#if REISER4_DEBUG
32225+ if (!jnode_is_cluster_page(node))
32226+ warning("edward-1345",
32227+ "inode %llu: cluster page of index %lu became private",
32228+ (unsigned long long)get_inode_oid(page->mapping->host),
32229+ page->index);
32230+#endif
32231+ return 1;
32232+ }
32233+ return 0;
32234+}
32235+#endif /* 0 */
32236+
32237+/* put cluster pages */
32238+void reiser4_release_cluster_pages(reiser4_cluster_t * clust)
32239+{
32240+ int i;
32241+
32242+ assert("edward-447", clust != NULL);
32243+ for (i = 0; i < clust->nr_pages; i++) {
32244+
32245+ assert("edward-449", clust->pages[i] != NULL);
32246+
32247+ page_cache_release(clust->pages[i]);
32248+ }
32249+}
32250+
32251+/* this is called when something has failed */
32252+static void reiser4_release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
32253+{
32254+ jnode *node;
32255+
32256+ assert("edward-445", clust != NULL);
32257+ assert("edward-922", clust->pages != NULL);
32258+ assert("edward-446", clust->pages[0] != NULL);
32259+
32260+ node = jprivate(clust->pages[0]);
32261+
32262+ assert("edward-447", node != NULL);
32263+
32264+ reiser4_release_cluster_pages(clust);
32265+ jput(node);
32266+}
32267+
32268+#if REISER4_DEBUG
32269+static int window_ok(reiser4_slide_t * win, struct inode *inode)
32270+{
32271+ assert("edward-1115", win != NULL);
32272+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32273+
32274+ return (win->off != inode_cluster_size(inode)) &&
32275+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
32276+}
32277+
32278+static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
32279+{
32280+ assert("edward-279", clust != NULL);
32281+
32282+ if (!clust->pages)
32283+ return 0;
32284+ return (clust->win ? window_ok(clust->win, inode) : 1);
32285+}
32286+#endif
32287+
32288+/* guess next window stat */
32289+static inline window_stat next_window_stat(reiser4_slide_t * win)
32290+{
32291+ assert("edward-1130", win != NULL);
32292+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32293+ HOLE_WINDOW : DATA_WINDOW);
32294+}
32295+
32296+/* guess next cluster index and window params */
32297+static void
32298+update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32299+ loff_t to_file)
32300+{
32301+ reiser4_slide_t *win;
32302+
32303+ assert("edward-185", clust != NULL);
32304+ assert("edward-438", clust->pages != NULL);
32305+ assert("edward-281", cluster_ok(clust, inode));
32306+
32307+ win = clust->win;
32308+ if (!win)
32309+ return;
32310+
32311+ switch (win->stat) {
32312+ case DATA_WINDOW:
32313+ /* increment window position */
32314+ clust->index++;
32315+ win->stat = DATA_WINDOW;
32316+ win->off = 0;
32317+ win->count = min_count(inode_cluster_size(inode), to_file);
32318+ break;
32319+ case HOLE_WINDOW:
32320+ switch (next_window_stat(win)) {
32321+ case HOLE_WINDOW:
32322+ /* set window to fit the offset we start write from */
32323+ clust->index = off_to_clust(file_off, inode);
32324+ win->stat = HOLE_WINDOW;
32325+ win->off = 0;
32326+ win->count = off_to_cloff(file_off, inode);
32327+ win->delta =
32328+ min_count(inode_cluster_size(inode) - win->count,
32329+ to_file);
32330+ break;
32331+ case DATA_WINDOW:
32332+ /* do not move the window, just change its state;
32333+ off + count + delta stays invariant */
32334+ win->stat = DATA_WINDOW;
32335+ win->off = win->off + win->count;
32336+ win->count = win->delta;
32337+ win->delta = 0;
32338+ break;
32339+ default:
32340+ impossible("edward-282", "wrong next window state");
32341+ }
32342+ break;
32343+ default:
32344+ impossible("edward-283", "wrong current window state");
32345+ }
32346+ assert("edward-1068", cluster_ok(clust, inode));
32347+}
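+
+/* A worked transition for the state machine above, assuming a 65536-byte
+   logical cluster size: suppose a hole window was set up for a write at
+   file offset 70000, i.e. clust->index == 1, win->off == 0,
+   win->count == 4464 (70000 % 65536) and win->delta > 0 (user data
+   follows the hole). next_window_stat() then yields DATA_WINDOW, and
+   update_cluster() keeps the index but sets win->off = 4464,
+   win->count = win->delta and win->delta = 0, preserving
+   off + count + delta. */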
32348+
32349+static int update_sd_cryptcompress(struct inode *inode)
32350+{
32351+ int result = 0;
32352+
32353+ assert("edward-978", reiser4_schedulable());
32354+
32355+ result = reiser4_grab_space_force( /* one for stat data update */
32356+ estimate_update_common(inode),
32357+ BA_CAN_COMMIT);
32358+ if (result)
32359+ return result;
32360+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
32361+ result = reiser4_update_sd(inode);
32362+
32363+ return result;
32364+}
32365+
32366+/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
32367+static void uncapture_cluster_jnode(jnode * node)
32368+{
32369+ txn_atom *atom;
32370+
32371+ assert_spin_locked(&(node->guard));
32372+
32373+ /*jnode_make_clean(node); */
32374+ atom = jnode_get_atom(node);
32375+ if (atom == NULL) {
32376+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
32377+ spin_unlock_jnode(node);
32378+ return;
32379+ }
32380+
32381+ reiser4_uncapture_block(node);
32382+ spin_unlock_atom(atom);
32383+ jput(node);
32384+}
32385+
32386+static void forget_cluster_pages(struct page **pages, int nr)
32387+{
32388+ int i;
32389+ for (i = 0; i < nr; i++) {
32390+
32391+ assert("edward-1045", pages[i] != NULL);
32392+ page_cache_release(pages[i]);
32393+ }
32394+}
32395+
32396+/* Check out last modifications we are about to commit,
32397+ and prepare input stream for transform operations.
32398+*/
32399+int
32400+flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
32401+ struct inode *inode)
32402+{
32403+ int result = 0;
32404+ int i;
32405+ int nr_pages = 0;
32406+ tfm_cluster_t *tc = &clust->tc;
32407+#if REISER4_DEBUG
32408+ int node_pgcount;
32409+#endif
32410+ assert("edward-980", node != NULL);
32411+ assert("edward-236", inode != NULL);
32412+ assert("edward-237", clust != NULL);
32413+ assert("edward-240", !clust->win);
32414+ assert("edward-241", reiser4_schedulable());
32415+ assert("edward-718", cryptcompress_inode_ok(inode));
32416+
32417+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
32418+ if (result) {
32419+ warning("edward-1430",
32420+ "alloc stream failed with ret=%d", result);
32421+ return result;
32422+ }
32423+ spin_lock_jnode(node);
32424+#if REISER4_DEBUG
32425+ node_pgcount = node->page_count;
32426+#endif
32427+ if (!JF_ISSET(node, JNODE_DIRTY)) {
32428+ /* race with another flush */
32429+#if REISER4_DEBUG
32430+ assert("edward-981", node_pgcount == 0);
32431+ warning("edward-982", "flush_cluster_pages: jnode is not dirty "
32432+ "clust %lu, inode %llu\n",
32433+ clust->index, (unsigned long long)get_inode_oid(inode));
32434+#endif
32435+ spin_unlock_jnode(node);
32436+ return RETERR(-E_REPEAT);
32437+ }
32438+ /* Check out the size of the logical cluster and
32439+ set the number of cluster pages to commit. */
32440+ tc->len = tc->lsize = fsize_to_count(clust, inode);
32441+ clust->nr_pages = count_to_nrpages(tc->len);
32442+
32443+#if REISER4_DEBUG
32444+ node->page_count = 0;
32445+#endif
32446+ cluster_reserved2grabbed(estimate_update_cluster(inode));
32447+ uncapture_cluster_jnode(node);
32448+
32449+ assert("edward-1224", reiser4_schedulable());
32450+ /* Check out page cluster for commit */
32451+ nr_pages =
32452+ find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
32453+ clust->nr_pages, clust->pages);
32454+ if (nr_pages != clust->nr_pages)
32455+ goto checkout_failed;
32456+
32457+ /* Try to construct input stream from the checked out pages */
32458+ for (i = 0; i < clust->nr_pages; i++) {
32459+ char *data;
32460+
32461+ assert("edward-242", clust->pages[i] != NULL);
32462+ if (clust->pages[i]->index !=
32463+ clust_to_pg(clust->index, inode) + i)
32464+ goto checkout_failed;
32465+ BUG_ON(!PageUptodate(clust->pages[i]));
32466+
32467+ /* flush the page into input transform stream */
32468+ lock_page(clust->pages[i]);
32469+ data = kmap(clust->pages[i]);
32470+
32471+ assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
32472+
32473+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
32474+ data, cnt_to_pgcnt(tc->len, i));
32475+ kunmap(clust->pages[i]);
32476+ unlock_page(clust->pages[i]);
32477+ }
32478+ /* page cluster flushed successfully */
32479+
32480+ clear_cluster_pages_dirty(clust);
32481+ reiser4_release_cluster_pages(clust);
32482+#if REISER4_DEBUG
32483+ cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
32484+#endif
32485+ goto out;
32486+ checkout_failed:
32487+#if REISER4_DEBUG
32488+ assert("edward-1282", node_pgcount == 0);
32489+ warning("edward-1435", "Inode %llu: checkout of page cluster"
32490+ " of index %lu failed\n",
32491+ (unsigned long long)get_inode_oid(inode), clust->index);
32492+#endif /* REISER4_DEBUG */
32493+ result = RETERR(-E_REPEAT);
32494+ out:
32495+ /* put pages that were found here */
32496+ forget_cluster_pages(clust->pages, nr_pages);
32497+ return result;
32498+}
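+
+/* Sketch of the input stream assembled above, assuming 4096-byte pages
+   and tc->len == 9000: the loop copies 4096 bytes from page 0, 4096
+   from page 1 and the remaining 808 from page 2 into INPUT_STREAM at
+   offsets pg_to_off(0..2), i.e. cnt_to_pgcnt(9000, i) yields 4096,
+   4096, 808. */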
32499+
32500+/* set hint for the cluster of the index @index */
32501+static void set_hint_cluster(struct inode *inode, hint_t * hint,
32502+ cloff_t index, znode_lock_mode mode)
32503+{
32504+ reiser4_key key;
32505+ assert("edward-722", cryptcompress_inode_ok(inode));
32506+ assert("edward-723",
32507+ inode_file_plugin(inode) ==
32508+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
32509+
32510+ inode_file_plugin(inode)->key_by_inode(inode,
32511+ clust_to_off(index, inode),
32512+ &key);
32513+
32514+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
32515+ hint->offset = get_key_offset(&key);
32516+ hint->mode = mode;
32517+}
32518+
32519+void invalidate_hint_cluster(reiser4_cluster_t * clust)
32520+{
32521+ assert("edward-1291", clust != NULL);
32522+ assert("edward-1292", clust->hint != NULL);
32523+
32524+ done_lh(&clust->hint->lh);
32525+ hint_clr_valid(clust->hint);
32526+}
32527+
32528+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
32529+ znode_lock_mode mode)
32530+{
32531+ assert("edward-1286", clust != NULL);
32532+ assert("edward-1287", clust->hint != NULL);
32533+
32534+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
32535+ invalidate_hint_cluster(clust);
32536+}
32537+
32538+static int
32539+balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
32540+ loff_t off, loff_t to_file)
32541+{
32542+ int result;
32543+
32544+ assert("edward-724", inode != NULL);
32545+ assert("edward-725", cryptcompress_inode_ok(inode));
32546+
32547+ /* set next window params */
32548+ update_cluster(inode, clust, off, to_file);
32549+
32550+ result = update_sd_cryptcompress(inode);
32551+ if (result)
32552+ return result;
32553+ assert("edward-726", clust->hint->lh.owner == NULL);
32554+
32555+ reiser4_throttle_write(inode);
32556+ return 0;
32557+}
32558+
32559+/* zero out the cluster, update it, and maybe try to capture its pages */
32560+static int
32561+write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32562+ loff_t to_file)
32563+{
32564+ char *data;
32565+ int result = 0;
32566+ unsigned cl_off, cl_count = 0;
32567+ unsigned to_pg, pg_off;
32568+ reiser4_slide_t *win;
32569+
32570+ assert("edward-190", clust != NULL);
32571+ assert("edward-1069", clust->win != NULL);
32572+ assert("edward-191", inode != NULL);
32573+ assert("edward-727", cryptcompress_inode_ok(inode));
32574+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
32575+ assert("edward-1154",
32576+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
32577+
32578+ win = clust->win;
32579+
32580+ assert("edward-1070", win != NULL);
32581+ assert("edward-201", win->stat == HOLE_WINDOW);
32582+ assert("edward-192", cluster_ok(clust, inode));
32583+
32584+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
32585+ /* the hole will be represented by fake disk cluster */
32586+ update_cluster(inode, clust, file_off, to_file);
32587+ return 0;
32588+ }
32589+ cl_count = win->count; /* number of zeroes to write */
32590+ cl_off = win->off;
32591+ pg_off = off_to_pgoff(win->off);
32592+
32593+ while (cl_count) {
32594+ struct page *page;
32595+ page = clust->pages[off_to_pg(cl_off)];
32596+
32597+ assert("edward-284", page != NULL);
32598+
32599+ to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
32600+ lock_page(page);
32601+ data = kmap_atomic(page, KM_USER0);
32602+ memset(data + pg_off, 0, to_pg);
32603+ flush_dcache_page(page);
32604+ kunmap_atomic(data, KM_USER0);
32605+ SetPageUptodate(page);
32606+ unlock_page(page);
32607+
32608+ cl_off += to_pg;
32609+ cl_count -= to_pg;
32610+ pg_off = 0;
32611+ }
32612+ if (!win->delta) {
32613+ /* only zeroes, try to capture */
32614+
32615+ set_cluster_pages_dirty(clust);
32616+ result = try_capture_cluster(clust, inode);
32617+ if (result)
32618+ return result;
32619+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
32620+ result =
32621+ balance_dirty_page_cluster(clust, inode, file_off, to_file);
32622+ } else
32623+ update_cluster(inode, clust, file_off, to_file);
32624+ return result;
32625+}
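+
+/* Worked arithmetic for the zeroing loop above, assuming 4096-byte
+   pages: with win->off == 1000 and win->count == 5000 the first pass
+   zeroes bytes [1000, 4096) of page 0 (to_pg == 3096) and the second
+   pass zeroes bytes [0, 1904) of page 1, i.e. 5000 bytes in total. */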
32626+
32627+/*
32628+ The main disk search procedure for cryptcompress plugins, which
32629+ . scans all items of disk cluster with the lock mode @mode
32630+ . maybe reads each one (if @read)
32631+ . maybe makes its znode dirty (if write lock mode was specified)
32632+
32633+ NOTE-EDWARD: Callers should handle the case when disk cluster
32634+ is incomplete (-EIO)
32635+*/
32636+int find_disk_cluster(reiser4_cluster_t * clust,
32637+ struct inode *inode, int read, znode_lock_mode mode)
32638+{
32639+ flow_t f;
32640+ hint_t *hint;
32641+ int result = 0;
32642+ unsigned long cl_idx;
32643+ ra_info_t ra_info;
32644+ file_plugin *fplug;
32645+ item_plugin *iplug;
32646+ tfm_cluster_t *tc;
32647+ int was_grabbed;
32648+
32649+ assert("edward-138", clust != NULL);
32650+ assert("edward-728", clust->hint != NULL);
32651+ assert("edward-226", reiser4_schedulable());
32652+ assert("edward-137", inode != NULL);
32653+ assert("edward-729", cryptcompress_inode_ok(inode));
32654+
32655+ hint = clust->hint;
32656+ cl_idx = clust->index;
32657+ fplug = inode_file_plugin(inode);
32658+ was_grabbed = get_current_context()->grabbed_blocks;
32659+ tc = &clust->tc;
32660+
32661+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
32662+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
32663+
32664+ dclust_init_extension(hint);
32665+
32666+ /* set key of the first disk cluster item */
32667+ fplug->flow_by_inode(inode,
32668+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
32669+ 0 /* kernel space */ ,
32670+ inode_scaled_cluster_size(inode),
32671+ clust_to_off(cl_idx, inode), READ_OP, &f);
32672+ if (mode == ZNODE_WRITE_LOCK) {
32673+ /* reserve for flush to make dirty all the leaf nodes
32674+ which contain disk cluster */
32675+ result =
32676+ reiser4_grab_space_force(estimate_dirty_cluster(inode),
32677+ BA_CAN_COMMIT);
32678+ if (result)
32679+ goto out;
32680+ }
32681+
32682+ ra_info.key_to_stop = f.key;
32683+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32684+
32685+ while (f.length) {
32686+ result = find_cluster_item(hint, &f.key, mode,
32687+ NULL, FIND_EXACT,
32688+ (mode == ZNODE_WRITE_LOCK ?
32689+ CBK_FOR_INSERT : 0));
32690+ switch (result) {
32691+ case CBK_COORD_NOTFOUND:
32692+ result = 0;
32693+ if (inode_scaled_offset
32694+ (inode,
32695+ clust_to_off(cl_idx,
32696+ inode)) == get_key_offset(&f.key)) {
32697+ /* first item not found: treat this
32698+ as an absent disk cluster */
32699+ clust->dstat = FAKE_DISK_CLUSTER;
32700+ goto out;
32701+ }
32702+ /* we are outside the cluster, stop search here */
32703+ assert("edward-146",
32704+ f.length != inode_scaled_cluster_size(inode));
32705+ goto ok;
32706+ case CBK_COORD_FOUND:
32707+ assert("edward-148",
32708+ hint->ext_coord.coord.between == AT_UNIT);
32709+ assert("edward-460",
32710+ hint->ext_coord.coord.unit_pos == 0);
32711+
32712+ coord_clear_iplug(&hint->ext_coord.coord);
32713+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
32714+ if (unlikely(result))
32715+ goto out;
32716+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
32717+ assert("edward-147",
32718+ item_id_by_coord(&hint->ext_coord.coord) ==
32719+ CTAIL_ID);
32720+
32721+ result = iplug->s.file.read(NULL, &f, hint);
32722+ if (result) {
32723+ zrelse(hint->ext_coord.coord.node);
32724+ goto out;
32725+ }
32726+ if (mode == ZNODE_WRITE_LOCK) {
32727+ /* Don't dirty more nodes than was
32728+ estimated (see comments before
32729+ estimate_dirty_cluster). Missed nodes will be
32730+ read in at flush time if they are evicted from
32731+ memory */
32732+ if (dclust_get_extension_ncount(hint) <=
32733+ estimate_dirty_cluster(inode))
32734+ znode_make_dirty(hint->ext_coord.coord.node);
32735+
32736+ znode_set_convertible(hint->ext_coord.coord.
32737+ node);
32738+ }
32739+ zrelse(hint->ext_coord.coord.node);
32740+ break;
32741+ default:
32742+ goto out;
32743+ }
32744+ }
32745+ ok:
32746+ /* at least one item was found */
32747+ /* NOTE-EDWARD: Callers should handle the case
32748+ when disk cluster is incomplete (-EIO) */
32749+ tc->len = inode_scaled_cluster_size(inode) - f.length;
32750+ tc->lsize = fsize_to_count(clust, inode);
32751+ assert("edward-1196", tc->len > 0);
32752+ assert("edward-1406", tc->lsize > 0);
32753+
32754+ if (hint_is_unprepped_dclust(clust->hint))
32755+ clust->dstat = UNPR_DISK_CLUSTER;
32756+ else {
32757+ dclust_set_extension_dsize(clust->hint, tc->len);
32758+ clust->dstat = PREP_DISK_CLUSTER;
32759+ }
32760+ out:
32761+ assert("edward-1339",
32762+ get_current_context()->grabbed_blocks >= was_grabbed);
32763+ grabbed2free(get_current_context(),
32764+ get_current_super_private(),
32765+ get_current_context()->grabbed_blocks - was_grabbed);
32766+ return result;
32767+}
32768+
32769+int
32770+get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
32771+ znode_lock_mode lock_mode)
32772+{
32773+ reiser4_key key;
32774+ ra_info_t ra_info;
32775+
32776+ assert("edward-730", reiser4_schedulable());
32777+ assert("edward-731", clust != NULL);
32778+ assert("edward-732", inode != NULL);
32779+
32780+ if (hint_is_valid(clust->hint)) {
32781+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
32782+ assert("edward-1294",
32783+ znode_is_write_locked(clust->hint->lh.node));
32784+ /* already have a valid locked position */
32785+ return (clust->dstat ==
32786+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
32787+ CBK_COORD_FOUND);
32788+ }
32789+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
32790+ &key);
32791+ ra_info.key_to_stop = key;
32792+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32793+
32794+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
32795+ CBK_FOR_INSERT);
32796+}
32797+
32798+/* Read the needed cluster pages before modifying.
32799+ On success, @clust->hint contains a locked position in the tree.
32800+ Also:
32801+ . find and set disk cluster state
32802+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
32803+*/
32804+static int
32805+read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32806+{
32807+ int i;
32808+ int result = 0;
32809+ item_plugin *iplug;
32810+ reiser4_slide_t *win = clust->win;
32811+ znode_lock_mode mode = ZNODE_WRITE_LOCK;
32812+
32813+ iplug = item_plugin_by_id(CTAIL_ID);
32814+
32815+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
32816+
32817+#if REISER4_DEBUG
32818+ if (clust->nr_pages == 0) {
32819+ /* start write hole from fake disk cluster */
32820+ assert("edward-1117", win != NULL);
32821+ assert("edward-1118", win->stat == HOLE_WINDOW);
32822+ assert("edward-1119", new_cluster(clust, inode));
32823+ }
32824+#endif
32825+ if (new_cluster(clust, inode)) {
32826+ /*
32827+ a new page cluster is about to be written, nothing to read
32828+ */
32829+ assert("edward-734", reiser4_schedulable());
32830+ assert("edward-735", clust->hint->lh.owner == NULL);
32831+
32832+ if (clust->nr_pages) {
32833+ int off;
32834+ char *data;
32835+ struct page * pg;
32836+ assert("edward-1419", clust->pages != NULL);
32837+ pg = clust->pages[clust->nr_pages - 1];
32838+ assert("edward-1420", pg != NULL);
32839+ off = off_to_pgoff(win->off+win->count+win->delta);
32840+ if (off) {
32841+ lock_page(pg);
32842+ data = kmap_atomic(pg, KM_USER0);
32843+ memset(data + off, 0, PAGE_CACHE_SIZE - off);
32844+ flush_dcache_page(pg);
32845+ kunmap_atomic(data, KM_USER0);
32846+ unlock_page(pg);
32847+ }
32848+ }
32849+ clust->dstat = FAKE_DISK_CLUSTER;
32850+ return 0;
32851+ }
32852+ /*
32853+ Here we should search for the disk cluster to figure out its real
32854+ state. There is one more important reason to do the disk search:
32855+ we need to make the disk cluster _dirty_ if it exists
32856+ */
32857+
32858+ /* if a window is specified, read only the pages
32859+ that will be modified partially */
32860+
32861+ for (i = 0; i < clust->nr_pages; i++) {
32862+ struct page *pg = clust->pages[i];
32863+
32864+ lock_page(pg);
32865+ if (PageUptodate(pg)) {
32866+ unlock_page(pg);
32867+ continue;
32868+ }
32869+ unlock_page(pg);
32870+
32871+ if (win &&
32872+ i >= count_to_nrpages(win->off) &&
32873+ i < off_to_pg(win->off + win->count + win->delta))
32874+ /* page will be completely overwritten */
32875+ continue;
32876+
32877+ if (win && (i == clust->nr_pages - 1) &&
32878+ /* the last page is
32879+ partially modified,
32880+ not uptodate .. */
32881+ (count_to_nrpages(inode->i_size) <= pg->index)) {
32882+ /* .. and appended,
32883+ so set zeroes to the rest */
32884+ char *data;
32885+ int offset;
32886+ lock_page(pg);
32887+ data = kmap_atomic(pg, KM_USER0);
32888+
32889+ assert("edward-1260",
32890+ count_to_nrpages(win->off + win->count +
32891+ win->delta) - 1 == i);
32892+
32893+ offset =
32894+ off_to_pgoff(win->off + win->count + win->delta);
32895+ memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
32896+ flush_dcache_page(pg);
32897+ kunmap_atomic(data, KM_USER0);
32898+ unlock_page(pg);
32899+ /* still not uptodate */
32900+ break;
32901+ }
32902+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
32903+ result = ctail_read_disk_cluster(clust, inode, mode);
32904+ if (result)
32905+ goto out;
32906+ assert("edward-925",
32907+ tfm_cluster_is_uptodate(&clust->tc));
32908+ }
32909+ lock_page(pg);
32910+ result = do_readpage_ctail(inode, clust, pg, mode);
32911+ unlock_page(pg);
32912+ if (result) {
32913+ impossible("edward-219",
32914+ "do_readpage_ctail returned crap");
32915+ goto out;
32916+ }
32917+ }
32918+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
32919+ /* the disk cluster is unclaimed, but we need to make its znodes
32920+ dirty so that flush will convert its content on update */
32921+ result = find_disk_cluster(clust, inode, 0 /* do not read items */,
32922+ mode);
32923+ }
32924+ out:
32925+ tfm_cluster_clr_uptodate(&clust->tc);
32926+ return result;
32927+}
32928+
32929+static int
32930+should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
32931+{
32932+ assert("edward-737", clust != NULL);
32933+
32934+ switch (clust->dstat) {
32935+ case PREP_DISK_CLUSTER:
32936+ case UNPR_DISK_CLUSTER:
32937+ return 0;
32938+ case FAKE_DISK_CLUSTER:
32939+ if (clust->win &&
32940+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
32941+ assert("edward-1172", new_cluster(clust, inode));
32942+ return 0;
32943+ }
32944+ return 1;
32945+ default:
32946+ impossible("edward-1173", "bad disk cluster state");
32947+ return 0;
32948+ }
32949+}
32950+
32951+static int
32952+cryptcompress_make_unprepped_cluster(reiser4_cluster_t * clust,
32953+ struct inode *inode)
32954+{
32955+ int result;
32956+
32957+ assert("edward-1123", reiser4_schedulable());
32958+ assert("edward-737", clust != NULL);
32959+ assert("edward-738", inode != NULL);
32960+ assert("edward-739", cryptcompress_inode_ok(inode));
32961+ assert("edward-1053", clust->hint != NULL);
32962+
32963+ if (!should_create_unprepped_cluster(clust, inode)) {
32964+ if (clust->reserved) {
32965+ cluster_reserved2free(estimate_insert_cluster(inode));
32966+#if REISER4_DEBUG
32967+ assert("edward-1267",
32968+ clust->reserved_unprepped ==
32969+ estimate_insert_cluster(inode));
32970+ clust->reserved_unprepped -=
32971+ estimate_insert_cluster(inode);
32972+#endif
32973+ }
32974+ return 0;
32975+ }
32976+ assert("edward-1268", clust->reserved);
32977+ cluster_reserved2grabbed(estimate_insert_cluster(inode));
32978+#if REISER4_DEBUG
32979+ assert("edward-1441",
32980+ clust->reserved_unprepped == estimate_insert_cluster(inode));
32981+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
32982+#endif
32983+ result = ctail_insert_unprepped_cluster(clust, inode);
32984+ if (result)
32985+ return result;
32986+
32987+ inode_add_bytes(inode, inode_cluster_size(inode));
32988+
32989+ assert("edward-743", cryptcompress_inode_ok(inode));
32990+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
32991+
32992+ clust->dstat = UNPR_DISK_CLUSTER;
32993+ return 0;
32994+}
32995+
32996+#if REISER4_DEBUG
32997+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
32998+{
32999+ jnode *node;
33000+ node =
33001+ jlookup(current_tree, get_inode_oid(inode),
33002+ clust_to_pg(index, inode));
33003+ if (likely(!node))
33004+ return 1;
33005+ /* someone got this jnode */
33006+ warning("edward-1315", "jnode %p is untruncated\n", node);
33007+ jput(node);
33008+ return (atomic_read(&node->x_count));
33009+}
33010+#endif
33011+
33012+/* Collect unlocked cluster pages and a jnode (the latter only in
33013+ the case when the page cluster will be modified and captured) */
33014+int
33015+prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33016+ int capture)
33017+{
33018+ assert("edward-177", inode != NULL);
33019+ assert("edward-741", cryptcompress_inode_ok(inode));
33020+ assert("edward-740", clust->pages != NULL);
33021+
33022+ set_cluster_nrpages(clust, inode);
33023+ reset_cluster_pgset(clust, cluster_nrpages(inode));
33024+ return (capture ?
33025+ grab_cluster_pages_jnode(inode, clust) :
33026+ grab_cluster_pages(inode, clust));
33027+}
33028+
33029+/* Truncate all pages of the cluster of index @index.
33030+ This is called by ->kill_hook() method of item plugin */
33031+void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t index,
33032+ int even_cows)
33033+{
33034+ int i;
33035+ int found = 0;
33036+ int nr_pages;
33037+ jnode *node;
33038+ struct page *pages[MAX_CLUSTER_NRPAGES];
33039+
33040+ node =
33041+ jlookup(current_tree, get_inode_oid(inode),
33042+ clust_to_pg(index, inode));
33043+ /* the jnode is absent; just drop the pages, which cannot
33044+ acquire a jnode because of exclusive access */
33045+ if (!node)
33046+ goto truncate;
33047+ /* jnode is present and may be dirty */
33048+ nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33049+
33050+ found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33051+ nr_pages, pages);
33052+ spin_lock_jnode(node);
33053+
33054+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
33055+ && index == 0)
33056+ /* converting to unix_file in progress */
33057+ JF_CLR(node, JNODE_CLUSTER_PAGE);
33058+ if (JF_ISSET(node, JNODE_DIRTY)) {
33059+ /* someone has done modifications which are not
33060+ yet committed, so we need to release some resources */
33061+
33062+ /* free disk space grabbed for disk cluster converting */
33063+ cluster_reserved2grabbed(estimate_update_cluster(inode));
33064+ grabbed2free(get_current_context(),
33065+ get_current_super_private(),
33066+ estimate_update_cluster(inode));
33067+
33068+ assert("edward-1198", found == nr_pages);
33069+ assert("edward-1199", node->page_count == nr_pages);
33070+#if REISER4_DEBUG
33071+ node->page_count = 0;
33072+#endif
33073+ /* This will clear dirty bit */
33074+ uncapture_cluster_jnode(node);
33075+
33076+ /* put pages grabbed for last uncommitted modifications */
33077+ for (i = 0; i < nr_pages; i++) {
33078+ assert("edward-1200", PageUptodate(pages[i]));
33079+ page_cache_release(pages[i]);
33080+#if REISER4_DEBUG
33081+ cryptcompress_inode_data(inode)->pgcount --;
33082+#endif
33083+ }
33084+ } else
33085+ spin_unlock_jnode(node);
33086+ /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
33087+
33088+ jput(node);
33089+ /* put pages found here */
33090+ forget_cluster_pages(pages, found);
33091+ truncate:
33092+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
33093+ index == 0)
33094+ return;
33095+ reiser4_invalidate_pages(inode->i_mapping,
33096+ clust_to_pg(index, inode),
33097+ cluster_nrpages(inode),
33098+ even_cows);
33099+ assert("edward-1201",
33100+ ergo(!reiser4_inode_get_flag(inode,
33101+ REISER4_FILE_CONV_IN_PROGRESS),
33102+ jnode_truncate_ok(inode, index)));
33103+ return;
33104+}
33105+
33106+/* Prepare a cluster handle before (or after) modifications
33107+ which are supposed to be committed:
33108+
33109+ . grab cluster pages;
33110+ . reserve disk space;
33111+ . maybe read pages from disk and make the disk cluster dirty;
33112+ . maybe write a hole;
33113+ . maybe create an 'unprepped' disk cluster if the current one is fake
33114+ (i.e. is not represented by any items)
33115+*/
33116+
33117+static int
33118+prepare_cluster(struct inode *inode,
33119+ loff_t file_off /* write position in the file */ ,
33120+ loff_t to_file, /* bytes of users data to write to the file */
33121+ reiser4_cluster_t * clust, page_cluster_op op)
33122+{
33123+ int result = 0;
33124+ reiser4_slide_t *win = clust->win;
33125+
33126+ reset_cluster_params(clust);
33127+ cluster_set_tfm_act(&clust->tc, TFMA_READ);
33128+#if REISER4_DEBUG
33129+ clust->ctx = get_current_context();
33130+#endif
33131+ assert("edward-1190", op != PCL_UNKNOWN);
33132+
33133+ clust->op = op;
33134+
33135+ result = prepare_page_cluster(inode, clust, 1);
33136+ if (result)
33137+ return result;
33138+ assert("edward-1447",
33139+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
33140+ assert("edward-1448",
33141+ ergo(clust->nr_pages != 0,
33142+ jnode_is_cluster_page(jprivate(clust->pages[0]))));
33143+
33144+ result = reserve4cluster(inode, clust);
33145+ if (result)
33146+ goto err1;
33147+ result = read_some_cluster_pages(inode, clust);
33148+ if (result) {
33149+ free_reserved4cluster(inode,
33150+ clust,
33151+ estimate_update_cluster(inode) +
33152+ estimate_insert_cluster(inode));
33153+ goto err1;
33154+ }
33155+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33156+
33157+ result = cryptcompress_make_unprepped_cluster(clust, inode);
33158+ if (result)
33159+ goto err2;
33160+ if (win && win->stat == HOLE_WINDOW) {
33161+ result = write_hole(inode, clust, file_off, to_file);
33162+ if (result)
33163+ goto err2;
33164+ }
33165+ return 0;
33166+ err2:
33167+ free_reserved4cluster(inode, clust,
33168+ estimate_update_cluster(inode));
33169+ err1:
33170+ reiser4_release_cluster_pages_and_jnode(clust);
33171+ assert("edward-1125", result == -ENOSPC);
33172+ return result;
33173+}
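+
+/* Reservation bookkeeping in prepare_cluster() above: on the err2 path
+   only the update reservation is returned, since the insert reservation
+   has already been consumed or released by
+   cryptcompress_make_unprepped_cluster(); on the err1 path taken after
+   read_some_cluster_pages() fails, both estimates are freed at once. */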
33174+
33175+/* set window by two offsets */
33176+static void
33177+set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
33178+ struct inode *inode, loff_t o1, loff_t o2)
33179+{
33180+ assert("edward-295", clust != NULL);
33181+ assert("edward-296", inode != NULL);
33182+ assert("edward-1071", win != NULL);
33183+ assert("edward-297", o1 <= o2);
33184+
33185+ clust->index = off_to_clust(o1, inode);
33186+
33187+ win->off = off_to_cloff(o1, inode);
33188+ win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);
33189+ win->delta = 0;
33190+
33191+ clust->win = win;
33192+}
33193+
33194+static int
33195+set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
33196+ reiser4_slide_t * win, flow_t * f, loff_t file_off)
33197+{
33198+ int result;
33199+
33200+ assert("edward-197", clust != NULL);
33201+ assert("edward-1072", win != NULL);
33202+ assert("edward-198", inode != NULL);
33203+
33204+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33205+ if (result)
33206+ return result;
33207+
33208+ if (file_off > inode->i_size) {
33209+ /* Uhmm, hole in cryptcompress file... */
33210+ loff_t hole_size;
33211+ hole_size = file_off - inode->i_size;
33212+
33213+ set_window(clust, win, inode, inode->i_size, file_off);
33214+ win->stat = HOLE_WINDOW;
33215+ if (win->off + hole_size < inode_cluster_size(inode))
33216+ /* there is also user's data to append to the hole */
33217+ win->delta =
33218+ min_count(inode_cluster_size(inode) -
33219+ (win->off + win->count), f->length);
33220+ return 0;
33221+ }
33222+ set_window(clust, win, inode, file_off, file_off + f->length);
33223+ win->stat = DATA_WINDOW;
33224+ return 0;
33225+}
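+
+/* Example of the hole case above, assuming a 65536-byte logical cluster
+   size: a write at file_off == 70000 with i_size == 0 gives
+   set_window(..., 0, 70000), i.e. clust->index == 0, win->off == 0 and
+   win->count == 65536; since win->off + hole_size (70000) is not less
+   than the cluster size, no user data fits in this window, win->delta
+   stays 0, and the whole first cluster is a pure hole. */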
33226+
33227+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
33228+ int count)
33229+{
33230+ int result = 0;
33231+ int (*setting_actor)(reiser4_cluster_t * clust, int count);
33232+
33233+ assert("edward-1358", clust != NULL);
33234+ assert("edward-1359", page != NULL);
33235+ assert("edward-1360", page->mapping != NULL);
33236+ assert("edward-1361", page->mapping->host != NULL);
33237+
33238+ setting_actor = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33239+ result = setting_actor(clust, count);
33240+ clust->index = pg_to_clust(page->index, page->mapping->host);
33241+ return result;
33242+}
33243+
33244+/* reset all the params that don't get updated */
33245+void reset_cluster_params(reiser4_cluster_t * clust)
33246+{
33247+ assert("edward-197", clust != NULL);
33248+
33249+ clust->dstat = INVAL_DISK_CLUSTER;
33250+ clust->tc.uptodate = 0;
33251+ clust->tc.len = 0;
33252+}
33253+
33254+/* Core write procedure of the cryptcompress plugin: it slices the
33255+ user's flow into logical clusters, maps them to the appropriate
33256+ page clusters, and tries to capture them.
33257+ If @buf != NULL, returns the number of successfully written bytes,
33258+ otherwise returns an error.
33259+*/
33260+static loff_t
33261+write_cryptcompress_flow(struct file *file, struct inode *inode,
33262+ const char __user *buf, size_t count, loff_t pos,
33263+ int *conv_occured)
33264+{
33265+ int i;
33266+ flow_t f;
33267+ hint_t *hint;
33268+ int result = 0;
33269+ size_t to_write = 0;
33270+ loff_t file_off;
33271+ reiser4_slide_t win;
33272+ reiser4_cluster_t clust;
33273+
33274+ assert("edward-161", reiser4_schedulable());
33275+ assert("edward-748", cryptcompress_inode_ok(inode));
33276+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33277+ assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33278+
33279+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33280+ if (hint == NULL)
33281+ return RETERR(-ENOMEM);
33282+
33283+ result = load_file_hint(file, hint);
33284+ if (result) {
33285+ kfree(hint);
33286+ return result;
33287+ }
33288+
33289+ result =
33290+ flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
33291+ count, pos, WRITE_OP, &f);
33292+ if (result)
33293+ goto out;
33294+ to_write = f.length;
33295+
33296+ /* current write position in file */
33297+ file_off = pos;
33298+ reiser4_slide_init(&win);
33299+ cluster_init_read(&clust, &win);
33300+ clust.hint = hint;
33301+
33302+ result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
33303+ if (result)
33304+ goto out;
33305+
33306+ if (next_window_stat(&win) == HOLE_WINDOW) {
33307+ result = write_conversion_hook(file, inode, pos, &clust, NULL);
33308+ if (result)
33309+ goto out;
33310+ result =
33311+ prepare_cluster(inode, file_off, f.length, &clust,
33312+ PCL_APPEND);
33313+ if (result)
33314+ goto out;
33315+ }
33316+ do {
33317+ char *src;
33318+ unsigned page_off, page_count;
33319+
33320+ assert("edward-750", reiser4_schedulable());
33321+
33322+ result = write_conversion_hook(file, inode, pos, &clust,
33323+ conv_occured);
33324+ if (result || *conv_occured)
33325+ goto out;
33326+ result =
33327+ prepare_cluster(inode, file_off, f.length, &clust,
33328+ PCL_APPEND);
33329+ if (result)
33330+ goto out;
33331+
33332+ assert("edward-751", cryptcompress_inode_ok(inode));
33333+ assert("edward-204", win.stat == DATA_WINDOW);
33334+ assert("edward-1288", hint_is_valid(clust.hint));
33335+ assert("edward-752",
33336+ znode_is_write_locked(hint->ext_coord.coord.node));
33337+
33338+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33339+
33340+ /* set write position in page */
33341+ page_off = off_to_pgoff(win.off);
33342+
33343+ /* copy user's data to cluster pages */
33344+ for (i = off_to_pg(win.off), src = f.data;
33345+ i < count_to_nrpages(win.off + win.count);
33346+ i++, src += page_count) {
33347+ page_count =
33348+ cnt_to_pgcnt(win.off + win.count, i) - page_off;
33349+
33350+ assert("edward-1039",
33351+ page_off + page_count <= PAGE_CACHE_SIZE);
33352+ assert("edward-287", clust.pages[i] != NULL);
33353+
33354+ lock_page(clust.pages[i]);
33355+ result =
33356+ __copy_from_user((char *)kmap(clust.pages[i]) +
33357+ page_off, (char __user *)src, page_count);
33358+ kunmap(clust.pages[i]);
33359+ if (unlikely(result)) {
33360+ unlock_page(clust.pages[i]);
33361+ result = -EFAULT;
33362+ goto err2;
33363+ }
33364+ SetPageUptodate(clust.pages[i]);
33365+ unlock_page(clust.pages[i]);
33366+ page_off = 0;
33367+ }
33368+ assert("edward-753", cryptcompress_inode_ok(inode));
33369+
33370+ set_cluster_pages_dirty(&clust);
33371+
33372+ result = try_capture_cluster(&clust, inode);
33373+ if (result)
33374+ goto err2;
33375+
33376+ assert("edward-998", f.user == 1);
33377+
33378+ move_flow_forward(&f, win.count);
33379+
33380+ /* the disk cluster may already be clean at this point */
33381+
33382+ /* . update cluster
33383+ . set hint for new offset
33384+ . unlock znode
33385+ . update inode
33386+ . balance dirty pages
33387+ */
33388+ result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
33389+ if (result)
33390+ goto err1;
33391+ assert("edward-755", hint->lh.owner == NULL);
33392+ reset_cluster_params(&clust);
33393+ continue;
33394+ err2:
33395+ reiser4_release_cluster_pages_and_jnode(&clust);
33396+ err1:
33397+ if (clust.reserved)
33398+ free_reserved4cluster(inode,
33399+ &clust,
33400+ estimate_update_cluster(inode));
33401+ break;
33402+ } while (f.length);
33403+ out:
33404+ done_lh(&hint->lh);
33405+ if (result == -EEXIST)
33406+ warning("edward-1407", "write returns EEXIST!\n");
33407+
33408+ put_cluster_handle(&clust);
33409+ save_file_hint(file, hint);
33410+ kfree(hint);
33411+ if (buf) {
33412+ /* if nothing was written, there must be an error */
33413+ assert("edward-195", ergo((to_write == f.length),
33414+ (result < 0 || *conv_occured)));
33415+ return (to_write - f.length) ? (to_write - f.length) : result;
33416+ }
33417+ return result;
33418+}
33419+
33420+/**
33421+ * write_cryptcompress - write of struct file_operations
33422+ * @file: file to write to
33423+ * @buf: address of user-space buffer
33424+ * @count: number of bytes to write
33425+ * @off: position in file to write to
33426+ *
33427+ * This is the implementation of the VFS write method of struct
33428+ * file_operations for the cryptcompress plugin.
33429+ */
33430+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
33431+ size_t count, loff_t *off, int *conv)
33432+{
33433+ ssize_t result;
33434+ struct inode *inode;
33435+ reiser4_context *ctx;
33436+ loff_t pos = *off;
33437+ cryptcompress_info_t *info;
33438+
33439+ assert("edward-1449", *conv == 0);
33440+
33441+ inode = file->f_dentry->d_inode;
33442+ assert("edward-196", cryptcompress_inode_ok(inode));
33443+
33444+ info = cryptcompress_inode_data(inode);
33445+
33446+ ctx = reiser4_init_context(inode->i_sb);
33447+ if (IS_ERR(ctx))
33448+ return PTR_ERR(ctx);
33449+
33450+ mutex_lock(&inode->i_mutex);
33451+
33452+ result = generic_write_checks(file, &pos, &count, 0);
33453+ if (unlikely(result != 0))
33454+ goto out;
33455+ if (unlikely(count == 0))
33456+ goto out;
33457+ result = remove_suid(file->f_dentry);
33458+ if (unlikely(result != 0))
33459+ goto out;
33460+ /* remove_suid might create a transaction */
33461+ reiser4_txn_restart(ctx);
33462+
33463+ result = write_cryptcompress_flow(file, inode, buf, count, pos, conv);
33464+
33465+ if (result < 0)
33466+ goto out;
33467+ /* update position in a file */
33468+ *off = pos + result;
33469+ out:
33470+ mutex_unlock(&inode->i_mutex);
33471+
33472+ context_set_commit_async(ctx);
33473+ reiser4_exit_context(ctx);
33474+ return result;
33475+}
33476+
33477+int readpages_cryptcompress(struct file *file, struct address_space *mapping,
33478+ struct list_head *pages, unsigned nr_pages)
33479+{
33480+ reiser4_context * ctx;
33481+ int ret;
33482+
33483+ ctx = reiser4_init_context(mapping->host->i_sb);
33484+ if (IS_ERR(ctx)) {
33485+ ret = PTR_ERR(ctx);
33486+ goto err;
33487+ }
33488+ /* crc files can be built of ctail items only */
33489+ ret = readpages_ctail(file, mapping, pages);
33490+ reiser4_exit_context(ctx);
33491+ if (ret) {
33492+err:
33493+ put_pages_list(pages);
33494+ }
33495+ return ret;
33496+}
33497+
33498+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
33499+{
33500+ /* reserve one block to update stat data item */
33501+ assert("edward-1193",
33502+ inode_file_plugin(inode)->estimate.update ==
33503+ estimate_update_common);
33504+ return estimate_update_common(inode);
33505+}
33506+
33507+/**
33508+ * read_cryptcompress - read of struct file_operations
33509+ * @file: file to read from
33510+ * @buf: address of user-space buffer
33511+ * @size: number of bytes to read
33512+ * @off: position in file to read from
33513+ *
33514+ * This is an implementation of the vfs's read method of struct
33515+ * file_operations for the cryptcompress plugin.
33516+ */
33517+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
33518+ loff_t * off)
33519+{
33520+ ssize_t result;
33521+ struct inode *inode;
33522+ reiser4_context *ctx;
33523+ cryptcompress_info_t *info;
33524+ reiser4_block_nr needed;
33525+
33526+ inode = file->f_dentry->d_inode;
33527+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
33528+
33529+ ctx = reiser4_init_context(inode->i_sb);
33530+ if (IS_ERR(ctx))
33531+ return PTR_ERR(ctx);
33532+
33533+ info = cryptcompress_inode_data(inode);
33534+ needed = cryptcompress_estimate_read(inode);
33535+
33536+ result = reiser4_grab_space(needed, BA_CAN_COMMIT);
33537+ if (result != 0) {
33538+ reiser4_exit_context(ctx);
33539+ return result;
33540+ }
33541+
33542+ LOCK_CNT_INC(inode_sem_r);
33543+
33544+ result = do_sync_read(file, buf, size, off);
33545+
33546+ LOCK_CNT_DEC(inode_sem_r);
33547+
33548+ context_set_commit_async(ctx);
33549+ reiser4_exit_context(ctx);
33550+
33551+ return result;
33552+}
33553+
33554+/* If @index > 0, find the real disk cluster of index (@index - 1);
33555+ if @index == 0, find the real disk cluster of the object of maximal index.
33556+ Keep the incremented index of the result in @found.
33557+ If success was returned:
33558+ (@index == 0 && @found == 0) means that the object doesn't have real disk
33559+ clusters;
33560+ (@index != 0 && @found == 0) means that the disk cluster of (@index - 1)
33561+ doesn't exist.
33562+*/
33563+static int
33564+find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
33565+{
33566+ int result;
33567+ reiser4_key key;
33568+ loff_t offset;
33569+ hint_t *hint;
33570+ lock_handle *lh;
33571+ lookup_bias bias;
33572+ coord_t *coord;
33573+ item_plugin *iplug;
33574+
33575+ assert("edward-1131", inode != NULL);
33576+ assert("edward-95", cryptcompress_inode_ok(inode));
33577+
33578+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33579+ if (hint == NULL)
33580+ return RETERR(-ENOMEM);
33581+ hint_init_zero(hint);
33582+ lh = &hint->lh;
33583+
33584+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
33585+ offset =
33586+ (index ? clust_to_off(index, inode) -
33587+ 1 : get_key_offset(reiser4_max_key()));
33588+
33589+ key_by_inode_cryptcompress(inode, offset, &key);
33590+
33591+ /* find the last item of this object */
33592+ result =
33593+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
33594+ bias, 0);
33595+ if (cbk_errored(result)) {
33596+ done_lh(lh);
33597+ kfree(hint);
33598+ return result;
33599+ }
33600+ if (result == CBK_COORD_NOTFOUND) {
33601+ /* no real disk clusters */
33602+ done_lh(lh);
33603+ kfree(hint);
33604+ *found = 0;
33605+ return 0;
33606+ }
33607+ /* disk cluster is found */
33608+ coord = &hint->ext_coord.coord;
33609+ coord_clear_iplug(coord);
33610+ result = zload(coord->node);
33611+ if (unlikely(result)) {
33612+ done_lh(lh);
33613+ kfree(hint);
33614+ return result;
33615+ }
33616+ iplug = item_plugin_by_coord(coord);
33617+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
33618+ assert("edward-1202", ctail_ok(coord));
33619+
33620+ item_key_by_coord(coord, &key);
33621+ *found = off_to_clust(get_key_offset(&key), inode) + 1;
33622+
33623+ assert("edward-1132", ergo(index, index == *found));
33624+
33625+ zrelse(coord->node);
33626+ done_lh(lh);
33627+ kfree(hint);
33628+ return 0;
33629+}
33630+
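[Editorial sketch of the (@index, @found) contract documented above; "err" and "idx"
are hypothetical locals:

	cloff_t found;
	int err;

	err = find_real_disk_cluster(inode, &found, 0);
	if (!err && found == 0)
		; /* the object has no real disk clusters at all */

	err = find_real_disk_cluster(inode, &found, idx);
	if (!err && found == 0)
		; /* disk cluster (idx - 1) does not exist (a hole) */
]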
33631+static int find_fake_appended(struct inode *inode, cloff_t * index)
33632+{
33633+ return find_real_disk_cluster(inode, index,
33634+ 0 /* find last real one */ );
33635+}
33636+
33637+/* Set the left coord when a unit is not found after node_lookup().
33638+ This takes into account that there can be holes in a sequence
33639+ of disk clusters */
33640+
33641+static void adjust_left_coord(coord_t * left_coord)
33642+{
33643+ switch (left_coord->between) {
33644+ case AFTER_UNIT:
33645+ left_coord->between = AFTER_ITEM;
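		/* fall through: an AFTER_UNIT coord degrades to AFTER_ITEM */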
33646+ case AFTER_ITEM:
33647+ case BEFORE_UNIT:
33648+ break;
33649+ default:
33650+ impossible("edward-1204", "bad left coord to cut");
33651+ }
33652+ return;
33653+}
33654+
33655+#define CRC_CUT_TREE_MIN_ITERATIONS 64
33656+int
33657+cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
33658+ const reiser4_key * to_key,
33659+ reiser4_key * smallest_removed,
33660+ struct inode *object, int truncate, int *progress)
33661+{
33662+ lock_handle next_node_lock;
33663+ coord_t left_coord;
33664+ int result;
33665+
33666+ assert("edward-1158", tap->coord->node != NULL);
33667+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
33668+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
33669+
33670+ *progress = 0;
33671+ init_lh(&next_node_lock);
33672+
33673+ while (1) {
33674+ znode *node; /* node from which items are cut */
33675+ node_plugin *nplug; /* node plugin for @node */
33676+
33677+ node = tap->coord->node;
33678+
33679+ /* Move next_node_lock to the next node on the left. */
33680+ result =
33681+ reiser4_get_left_neighbor(&next_node_lock, node,
33682+ ZNODE_WRITE_LOCK,
33683+ GN_CAN_USE_UPPER_LEVELS);
33684+ if (result != 0 && result != -E_NO_NEIGHBOR)
33685+ break;
33686+		/* FIXME-EDWARD: Check whether we can delete the node as a whole. */
33687+ result = reiser4_tap_load(tap);
33688+ if (result)
33689+ return result;
33690+
33691+ /* Prepare the second (right) point for cut_node() */
33692+ if (*progress)
33693+ coord_init_last_unit(tap->coord, node);
33694+
33695+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
33696+ /* set rightmost unit for the items without lookup method */
33697+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
33698+
33699+ nplug = node->nplug;
33700+
33701+ assert("edward-1161", nplug);
33702+ assert("edward-1162", nplug->lookup);
33703+
33704+ /* left_coord is leftmost unit cut from @node */
33705+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
33706+
33707+ if (IS_CBKERR(result))
33708+ break;
33709+
33710+ if (result == CBK_COORD_NOTFOUND)
33711+ adjust_left_coord(&left_coord);
33712+
33713+ /* adjust coordinates so that they are set to existing units */
33714+ if (coord_set_to_right(&left_coord)
33715+ || coord_set_to_left(tap->coord)) {
33716+ result = 0;
33717+ break;
33718+ }
33719+
33720+ if (coord_compare(&left_coord, tap->coord) ==
33721+ COORD_CMP_ON_RIGHT) {
33722+ /* keys from @from_key to @to_key are not in the tree */
33723+ result = 0;
33724+ break;
33725+ }
33726+
33727+ /* cut data from one node */
33728+ *smallest_removed = *reiser4_min_key();
33729+ result = kill_node_content(&left_coord,
33730+ tap->coord,
33731+ from_key,
33732+ to_key,
33733+ smallest_removed,
33734+ next_node_lock.node,
33735+ object, truncate);
33736+#if REISER4_DEBUG
33737+ /*node_check(node, ~0U); */
33738+#endif
33739+ reiser4_tap_relse(tap);
33740+
33741+ if (result)
33742+ break;
33743+
33744+ ++(*progress);
33745+
33746+ /* Check whether all items with keys >= from_key were removed
33747+ * from the tree. */
33748+ if (keyle(smallest_removed, from_key))
33749+ /* result = 0; */
33750+ break;
33751+
33752+ if (next_node_lock.node == NULL)
33753+ break;
33754+
33755+ result = reiser4_tap_move(tap, &next_node_lock);
33756+ done_lh(&next_node_lock);
33757+ if (result)
33758+ break;
33759+
33760+ /* Break long cut_tree operation (deletion of a large file) if
33761+ * atom requires commit. */
33762+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
33763+ && current_atom_should_commit()) {
33764+ result = -E_REPEAT;
33765+ break;
33766+ }
33767+ }
33768+ done_lh(&next_node_lock);
33769+ return result;
33770+}
33771+
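[Editorial note: the -E_REPEAT exit above is a cooperative-yield convention — a long
cut (e.g. deletion of a large file) is broken up so the current atom can commit. A
hedged sketch of the caller-side restart loop (locals are illustrative, error handling
trimmed):

	do {
		err = cut_tree_worker_cryptcompress(tap, from_key, to_key,
						    &smallest_removed, inode,
						    1 /* truncate */, &progress);
		if (err == -E_REPEAT)
			reiser4_txn_restart_current(); /* let the atom commit */
	} while (err == -E_REPEAT);
]
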
33772+/* Append or expand a hole in two steps (exclusive access should be acquired!)
33773+ 1) write zeroes to the current real cluster,
33774+ 2) expand hole via fake clusters (just increase i_size) */
33775+static int
33776+cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
33777+ loff_t new_size)
33778+{
33779+ int result = 0;
33780+ hint_t *hint;
33781+ lock_handle *lh;
33782+ loff_t hole_size;
33783+ int nr_zeroes;
33784+ reiser4_slide_t win;
33785+ reiser4_cluster_t clust;
33786+
33787+ assert("edward-1133", inode->i_size < new_size);
33788+ assert("edward-1134", reiser4_schedulable());
33789+ assert("edward-1135", cryptcompress_inode_ok(inode));
33790+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
33791+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
33792+
33793+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33794+ if (hint == NULL)
33795+ return RETERR(-ENOMEM);
33796+ hint_init_zero(hint);
33797+ lh = &hint->lh;
33798+
33799+ reiser4_slide_init(&win);
33800+ cluster_init_read(&clust, &win);
33801+ clust.hint = hint;
33802+
33803+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33804+ if (result)
33805+ goto out;
33806+ if (off_to_cloff(inode->i_size, inode) == 0)
33807+ goto fake_append;
33808+ hole_size = new_size - inode->i_size;
33809+ nr_zeroes =
33810+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
33811+ if (hole_size < nr_zeroes)
33812+ nr_zeroes = hole_size;
33813+ set_window(&clust, &win, inode, inode->i_size,
33814+ inode->i_size + nr_zeroes);
33815+ win.stat = HOLE_WINDOW;
33816+
33817+ assert("edward-1137",
33818+ clust.index == off_to_clust(inode->i_size, inode));
33819+
33820+ result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
33821+
33822+ assert("edward-1271", !result || result == -ENOSPC);
33823+ if (result)
33824+ goto out;
33825+ assert("edward-1139",
33826+ clust.dstat == PREP_DISK_CLUSTER ||
33827+ clust.dstat == UNPR_DISK_CLUSTER);
33828+
33829+ assert("edward-1431", hole_size >= nr_zeroes);
33830+ if (hole_size == nr_zeroes)
33831+ /* nothing to append anymore */
33832+ goto out;
33833+ fake_append:
33834+ INODE_SET_FIELD(inode, i_size, new_size);
33835+ out:
33836+ done_lh(lh);
33837+ kfree(hint);
33838+ put_cluster_handle(&clust);
33839+ return result;
33840+}
33841+
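[Editorial worked example for the zero-padding arithmetic in
cryptcompress_append_hole() above, assuming a 64K logical cluster:

	/*
	 * inode_cluster_size == 65536, i_size == 100000, new_size == 300000:
	 *   off_to_cloff(i_size) == 100000 % 65536 == 34464 (non-zero)
	 *   nr_zeroes == 65536 - 34464 == 31072 (pad the current cluster)
	 *   hole_size == 200000 > nr_zeroes, so the remainder of the hole is
	 *   represented by fake clusters: only i_size is bumped to new_size.
	 */
]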
33842+#if REISER4_DEBUG
33843+static int
33844+pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
33845+{
33846+ struct pagevec pvec;
33847+ int i;
33848+ int count;
33849+ int rest;
33850+
33851+ rest = count_to_nrpages(old_size) - start;
33852+
33853+ pagevec_init(&pvec, 0);
33854+ count = min_count(pagevec_space(&pvec), rest);
33855+
33856+ while (rest) {
33857+ count = min_count(pagevec_space(&pvec), rest);
33858+ pvec.nr = find_get_pages(inode->i_mapping, start,
33859+ count, pvec.pages);
33860+ for (i = 0; i < pagevec_count(&pvec); i++) {
33861+ if (PageUptodate(pvec.pages[i])) {
33862+ warning("edward-1205",
33863+ "truncated page of index %lu is uptodate",
33864+ pvec.pages[i]->index);
33865+ return 0;
33866+ }
33867+ }
33868+ start += count;
33869+ rest -= count;
33870+ pagevec_release(&pvec);
33871+ }
33872+ return 1;
33873+}
33874+
33875+static int body_truncate_ok(struct inode *inode, cloff_t aidx)
33876+{
33877+ int result;
33878+ cloff_t raidx;
33879+
33880+ result = find_fake_appended(inode, &raidx);
33881+ return !result && (aidx == raidx);
33882+}
33883+#endif
33884+
33885+static int
33886+update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
33887+{
33888+ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
33889+ ? 0 : reiser4_update_file_size(inode, key, update_sd));
33890+}
33891+
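[Reader's gloss on the masking in update_cryptcompress_size(), not from the patch
itself: cluster sizes are powers of two (see MIN_CLUSTER_SHIFT/MAX_CLUSTER_SHIFT in
cryptcompress.h), so

	get_key_offset(key) & (inode_cluster_size(inode) - 1)

is simply the offset modulo the cluster size: the file size is pushed to stat data
only when the key lies exactly on a logical-cluster boundary.]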
33892+/* prune cryptcompress file in two steps (exclusive access should be acquired!)
33893+ 1) cut all disk clusters except the last, partially truncated one,
33894+ 2) write zeroes to, and capture, the last partially truncated page cluster if
33895+ it exists; otherwise truncate via pruning fake clusters (just decrease i_size)
33896+*/
33897+static int
33898+prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
33899+ cloff_t aidx)
33900+{
33901+ int result = 0;
33902+ unsigned nr_zeroes;
33903+ loff_t to_prune;
33904+ loff_t old_size;
33905+ cloff_t ridx;
33906+
33907+ hint_t *hint;
33908+ lock_handle *lh;
33909+ reiser4_slide_t win;
33910+ reiser4_cluster_t clust;
33911+
33912+ assert("edward-1140", inode->i_size >= new_size);
33913+ assert("edward-1141", reiser4_schedulable());
33914+ assert("edward-1142", cryptcompress_inode_ok(inode));
33915+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
33916+
33917+ old_size = inode->i_size;
33918+
33919+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33920+ if (hint == NULL)
33921+ return RETERR(-ENOMEM);
33922+ hint_init_zero(hint);
33923+ lh = &hint->lh;
33924+
33925+ reiser4_slide_init(&win);
33926+ cluster_init_read(&clust, &win);
33927+ clust.hint = hint;
33928+
33929+ /* rightmost completely truncated cluster */
33930+ ridx = count_to_nrclust(new_size, inode);
33931+
33932+ assert("edward-1174", ridx <= aidx);
33933+ old_size = inode->i_size;
33934+ if (ridx != aidx) {
33935+ result = cut_file_items(inode,
33936+ clust_to_off(ridx, inode),
33937+ update_sd,
33938+ clust_to_off(aidx, inode),
33939+ update_cryptcompress_size);
33940+ if (result)
33941+ goto out;
33942+ }
33943+ if (!off_to_cloff(new_size, inode)) {
33944+ /* no partially truncated clusters */
33945+ assert("edward-1145", inode->i_size == new_size);
33946+ goto finish;
33947+ }
33948+ assert("edward-1146", new_size < inode->i_size);
33949+
33950+ to_prune = inode->i_size - new_size;
33951+
33952+ /* partial truncate of leftmost cluster,
33953+ first check if it is fake */
33954+ result = find_real_disk_cluster(inode, &aidx, ridx);
33955+ if (result)
33956+ goto out;
33957+ if (!aidx)
33958+		/* yup, this is a fake one */
33959+ goto finish;
33960+
33961+ assert("edward-1148", aidx == ridx);
33962+
33963+ /* do partial truncate of the leftmost page cluster,
33964+ then try to capture this one */
33965+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33966+ if (result)
33967+ goto out;
33968+ nr_zeroes = (off_to_pgoff(new_size) ?
33969+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
33970+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
33971+ win.stat = HOLE_WINDOW;
33972+
33973+ assert("edward-1149", clust.index == ridx - 1);
33974+
33975+ result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
33976+ if (result)
33977+ goto out;
33978+ assert("edward-1151",
33979+ clust.dstat == PREP_DISK_CLUSTER ||
33980+ clust.dstat == UNPR_DISK_CLUSTER);
33981+
33982+ assert("edward-1191", inode->i_size == new_size);
33983+ assert("edward-1206", body_truncate_ok(inode, ridx));
33984+ finish:
33985+ /* drop all the pages that don't have jnodes (i.e. pages
33986+ which can not be truncated by cut_file_items() because
33987+ of holes represented by fake disk clusters) including
33988+ the pages of partially truncated cluster which was
33989+ released by prepare_cluster() */
33990+ truncate_inode_pages(inode->i_mapping, new_size);
33991+ INODE_SET_FIELD(inode, i_size, new_size);
33992+ out:
33993+ assert("edward-1334", !result || result == -ENOSPC);
33994+ assert("edward-1209",
33995+ pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
33996+ done_lh(lh);
33997+ kfree(hint);
33998+ put_cluster_handle(&clust);
33999+ return result;
34000+}
34001+
34002+/* Prepare cryptcompress file for truncate:
34003+ prune or append rightmost fake logical clusters (if any)
34004+*/
34005+static int
34006+start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34007+ int update_sd)
34008+{
34009+ int result = 0;
34010+ int bytes;
34011+
34012+ if (new_size > inode->i_size) {
34013+ /* append */
34014+ if (inode->i_size < clust_to_off(aidx, inode))
34015+ /* no fake bytes */
34016+ return 0;
34017+ bytes = new_size - inode->i_size;
34018+ INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34019+ } else {
34020+ /* prune */
34021+ if (inode->i_size <= clust_to_off(aidx, inode))
34022+ /* no fake bytes */
34023+ return 0;
34024+ bytes =
34025+ inode->i_size - max_count(new_size,
34026+ clust_to_off(aidx, inode));
34027+ if (!bytes)
34028+ return 0;
34029+ INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34030+ /* In the case of fake prune we need to drop page cluster.
34031+		   There are only 2 cases for a partially truncated page:
34032+		   1. If it is dirty, then it is anonymous
34033+		   (it was dirtied via mmap) and will be captured
34034+		   later via ->capture().
34035+		   2. If it is clean, then it is filled with zeroes.
34036+		   In both cases we don't need to make it dirty and
34037+		   capture it here.
34038+ */
34039+ truncate_inode_pages(inode->i_mapping, inode->i_size);
34040+ }
34041+ if (update_sd)
34042+ result = update_sd_cryptcompress(inode);
34043+ return result;
34044+}
34045+
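[Editorial worked example for the fake-bytes arithmetic in start_truncate_fake()
above (64K clusters assumed): if aidx == 2 (the first fake cluster starts at offset
131072), i_size == 200000 and new_size == 150000, then

	bytes = 200000 - max_count(150000, 131072) = 50000

so i_size drops straight to 150000 and only fake bytes are pruned; no disk clusters
need to be touched.]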
34046+/* This is called in setattr_cryptcompress when it is used to truncate,
34047+   and in delete_object_cryptcompress */
34048+static int cryptcompress_truncate(struct inode *inode, /* contains old i_size */
34049+ loff_t new_size, /* new size */
34050+ int update_sd)
34051+{
34052+ int result;
34053+ cloff_t aidx;
34054+
34055+ result = find_fake_appended(inode, &aidx);
34056+ if (result)
34057+ return result;
34058+ assert("edward-1208",
34059+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34060+
34061+ result = start_truncate_fake(inode, aidx, new_size, update_sd);
34062+ if (result)
34063+ return result;
34064+ if (inode->i_size == new_size)
34065+ /* nothing to truncate anymore */
34066+ return 0;
34067+ result = (inode->i_size < new_size ?
34068+ cryptcompress_append_hole(inode, new_size) :
34069+ prune_cryptcompress(inode, new_size, update_sd, aidx));
34070+ if (!result && update_sd)
34071+ result = update_sd_cryptcompress(inode);
34072+ return result;
34073+}
34074+
34075+static void clear_moved_tag_cluster(struct address_space * mapping,
34076+ reiser4_cluster_t * clust)
34077+{
34078+ int i;
34079+ void * ret;
34080+ read_lock_irq(&mapping->tree_lock);
34081+ for (i = 0; i < clust->nr_pages; i++) {
34082+ assert("edward-1438", clust->pages[i] != NULL);
34083+ ret = radix_tree_tag_clear(&mapping->page_tree,
34084+ clust->pages[i]->index,
34085+ PAGECACHE_TAG_REISER4_MOVED);
34086+ assert("edward-1439", ret == clust->pages[i]);
34087+ }
34088+ read_unlock_irq(&mapping->tree_lock);
34089+}
34090+
34091+/* Capture an anonymous page cluster. (A page cluster is
34092+   anonymous if it contains at least one anonymous page.) */
34093+static int
34094+capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
34095+{
34096+ int result;
34097+
34098+ assert("edward-1073", clust != NULL);
34099+ assert("edward-1074", inode != NULL);
34100+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34101+
34102+ result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
34103+ if (result)
34104+ return result;
34105+ set_cluster_pages_dirty(clust);
34106+ clear_moved_tag_cluster(inode->i_mapping, clust);
34107+
34108+ result = try_capture_cluster(clust, inode);
34109+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34110+ if (unlikely(result)) {
34111+ /* set cleared tag back, so it will be
34112+ possible to capture it again later */
34113+ read_lock_irq(&inode->i_mapping->tree_lock);
34114+ radix_tree_tag_set(&inode->i_mapping->page_tree,
34115+ clust_to_pg(clust->index, inode),
34116+ PAGECACHE_TAG_REISER4_MOVED);
34117+ read_unlock_irq(&inode->i_mapping->tree_lock);
34118+
34119+ reiser4_release_cluster_pages_and_jnode(clust);
34120+ }
34121+ return result;
34122+}
34123+
34124+#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode))
34125+
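[A hedged reading of MAX_CLUSTERS_TO_CAPTURE above (the shift semantics are inferred
from the helper's name, not stated in the patch): it caps one capture pass at roughly
1024 pages' worth of clusters. For example, with 4K pages and 64K logical clusters,
cluster_nrpages_shift(inode) == 4, so at most 1024 >> 4 == 64 clusters are captured
per pass.]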
34126+/* read lock should be acquired */
34127+static int
34128+capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
34129+ int to_capture)
34130+{
34131+ int result = 0;
34132+ int found;
34133+ struct page *page = NULL;
34134+ hint_t *hint;
34135+ lock_handle *lh;
34136+ reiser4_cluster_t clust;
34137+
34138+ assert("edward-1127", mapping != NULL);
34139+ assert("edward-1128", mapping->host != NULL);
34140+ assert("edward-1440", mapping->host->i_mapping == mapping);
34141+
34142+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34143+ if (hint == NULL)
34144+ return RETERR(-ENOMEM);
34145+ hint_init_zero(hint);
34146+ lh = &hint->lh;
34147+
34148+ cluster_init_read(&clust, NULL);
34149+ clust.hint = hint;
34150+
34151+ result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
34152+ if (result)
34153+ goto out;
34154+
34155+ while (to_capture > 0) {
34156+ found =
34157+ find_get_pages_tag(mapping, index,
34158+ PAGECACHE_TAG_REISER4_MOVED, 1, &page);
34159+ if (!found) {
34160+ *index = (pgoff_t) - 1;
34161+ break;
34162+ }
34163+ assert("edward-1109", page != NULL);
34164+
34165+ move_cluster_forward(&clust, mapping->host, page->index);
34166+ result = capture_page_cluster(&clust, mapping->host);
34167+ page_cache_release(page);
34168+ if (result)
34169+ break;
34170+ to_capture -= clust.nr_pages;
34171+ }
34172+ if (result) {
34173+ warning("edward-1077",
34174+ "Cannot capture anon pages: result=%i (captured=%d)\n",
34175+ result,
34176+ ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
34177+ to_capture);
34178+ } else {
34179+ /* something had to be found */
34180+ assert("edward-1078",
34181+ to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
34182+ if (to_capture <= 0)
34183+			/* there may be more pages left */
34184+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
34185+ }
34186+ out:
34187+ done_lh(lh);
34188+ kfree(hint);
34189+ put_cluster_handle(&clust);
34190+ return result;
34191+}
34192+
34193+/* Check mapping for existence of non-captured dirty pages.
34194+   This returns !0 if the page tree contains pages tagged
34195+   PAGECACHE_TAG_REISER4_MOVED */
34196+static int cryptcompress_inode_has_anon_pages(struct inode *inode)
34197+{
34198+ return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
34199+}
34200+
34201+/* this is an implementation of the vfs's writepages method of struct
34202+   address_space_operations */
34203+int
34204+writepages_cryptcompress(struct address_space *mapping,
34205+ struct writeback_control *wbc)
34206+{
34207+ int result;
34208+ int to_capture;
34209+ pgoff_t nrpages;
34210+ pgoff_t index = 0;
34211+ cryptcompress_info_t *info;
34212+ struct inode *inode;
34213+
34214+ inode = mapping->host;
34215+ if (!cryptcompress_inode_has_anon_pages(inode)) {
34216+ result = 0;
34217+ goto end;
34218+ }
34219+
34220+ info = cryptcompress_inode_data(inode);
34221+ nrpages = count_to_nrpages(i_size_read(inode));
34222+
34223+ if (wbc->sync_mode != WB_SYNC_ALL)
34224+ to_capture =
34225+ min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
34226+ else
34227+ to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
34228+ do {
34229+ reiser4_context *ctx;
34230+
34231+ ctx = reiser4_init_context(inode->i_sb);
34232+ if (IS_ERR(ctx)) {
34233+ result = PTR_ERR(ctx);
34234+ break;
34235+ }
34236+ ctx->nobalance = 1;
34237+
34238+ assert("edward-1079",
34239+ lock_stack_isclean(get_current_lock_stack()));
34240+
34241+ LOCK_CNT_INC(inode_sem_r);
34242+
34243+ result =
34244+ capture_anonymous_clusters(inode->i_mapping, &index,
34245+ to_capture);
34246+
34247+ if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
34248+ reiser4_exit_context(ctx);
34249+ break;
34250+ }
34251+ result = txnmgr_force_commit_all(inode->i_sb, 0);
34252+ reiser4_exit_context(ctx);
34253+ } while (result == 0 && index < nrpages);
34254+
34255+ end:
34256+ if (is_in_reiser4_context()) {
34257+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34258+ /* there are already pages to flush, flush them out, do
34259+ not delay until end of reiser4_sync_inodes */
34260+ reiser4_writeout(inode->i_sb, wbc);
34261+ get_current_context()->nr_captured = 0;
34262+ }
34263+ }
34264+ return result;
34265+}
34266+
34267+/* plugin->u.file.mmap */
34268+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34269+{
34270+ int result;
34271+ struct inode *inode;
34272+ reiser4_context *ctx;
34273+
34274+ inode = file->f_dentry->d_inode;
34275+ ctx = reiser4_init_context(inode->i_sb);
34276+ if (IS_ERR(ctx))
34277+ return PTR_ERR(ctx);
34278+ /*
34279+ * generic_file_mmap will do update_atime. Grab space for stat data
34280+ * update.
34281+ */
34282+ result = reiser4_grab_space_force
34283+ (inode_file_plugin(inode)->estimate.update(inode),
34284+ BA_CAN_COMMIT);
34285+ if (result) {
34286+ reiser4_exit_context(ctx);
34287+ return result;
34288+ }
34289+ result = generic_file_mmap(file, vma);
34290+ reiser4_exit_context(ctx);
34291+ return result;
34292+}
34293+
34294+/* plugin->u.file.release */
34295+/* plugin->u.file.get_block */
34296+
34297+/* this is implementation of delete method of file plugin for
34298+ cryptcompress objects */
34299+int delete_object_cryptcompress(struct inode *inode)
34300+{
34301+ int result;
34302+
34303+ assert("edward-429", inode->i_nlink == 0);
34304+
34305+ reiser4_txn_restart_current();
34306+
34307+ result = cryptcompress_truncate(inode, 0, 0);
34308+ if (result) {
34309+ warning("edward-430",
34310+ "cannot truncate cryptcompress file %lli: %i",
34311+ (unsigned long long)get_inode_oid(inode),
34312+ result);
34313+ }
34314+ truncate_inode_pages(inode->i_mapping, 0);
34315+ /* and remove stat data */
34316+ return reiser4_delete_object_common(inode);
34317+}
34318+
34319+/* plugin->u.file.setattr method
34320+ This implements actual truncate (see comments in reiser4/page_cache.c) */
34321+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
34322+{
34323+ int result;
34324+ struct inode *inode;
34325+
34326+ inode = dentry->d_inode;
34327+ if (attr->ia_valid & ATTR_SIZE) {
34328+ if (inode->i_size != attr->ia_size) {
34329+ reiser4_context *ctx;
34330+ loff_t old_size;
34331+
34332+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
34333+ if (IS_ERR(ctx))
34334+ return PTR_ERR(ctx);
34335+
34336+ inode_check_scale(inode, inode->i_size, attr->ia_size);
34337+
34338+ old_size = inode->i_size;
34339+
34340+ result =
34341+ cryptcompress_truncate(inode, attr->ia_size,
34342+ 1 /* update stat data */ );
34343+ if (result) {
34344+ warning("edward-1192",
34345+ "truncate_cryptcompress failed: oid %lli, "
34346+ "old size %lld, new size %lld, retval %d",
34347+ (unsigned long long)
34348+ get_inode_oid(inode), old_size,
34349+ attr->ia_size, result);
34350+ }
34351+ context_set_commit_async(ctx);
34352+ reiser4_exit_context(ctx);
34353+ } else
34354+ result = 0;
34355+ } else
34356+ result = reiser4_setattr_common(dentry, attr);
34357+ return result;
34358+}
34359+
34360+/* sendfile_cryptcompress - sendfile of struct file_operations */
34361+ssize_t
34362+sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
34363+ read_actor_t actor, void *target)
34364+{
34365+ reiser4_context *ctx;
34366+ ssize_t result;
34367+ struct inode *inode;
34368+ cryptcompress_info_t *info;
34369+
34370+ inode = file->f_dentry->d_inode;
34371+ ctx = reiser4_init_context(inode->i_sb);
34372+ if (IS_ERR(ctx))
34373+ return PTR_ERR(ctx);
34374+ /*
34375+	 * generic_file_sendfile may want to call update_atime. Grab space for
34376+ * stat data update
34377+ */
34378+ result = reiser4_grab_space(estimate_update_common(inode),
34379+ BA_CAN_COMMIT);
34380+ if (result)
34381+ goto exit;
34382+ info = cryptcompress_inode_data(inode);
34383+
34384+ result = generic_file_sendfile(file, ppos, count, actor, target);
34385+ exit:
34386+ reiser4_exit_context(ctx);
34387+ return result;
34388+}
34389+
34390+/*
34391+ * release_cryptcompress - release of struct file_operations
34392+ * @inode: inode of released file
34393+ * @file: file to release
34394+ */
34395+int release_cryptcompress(struct inode *inode, struct file *file)
34396+{
34397+ reiser4_context *ctx = reiser4_init_context(inode->i_sb);
34398+
34399+ if (IS_ERR(ctx))
34400+ return PTR_ERR(ctx);
34401+ reiser4_free_file_fsdata(file);
34402+ reiser4_exit_context(ctx);
34403+ return 0;
34404+}
34405+
34406+#if 0
34407+int prepare_write_cryptcompress(struct file *file, struct page *page,
34408+ unsigned from, unsigned to)
34409+{
34410+ return prepare_write_common(file, page, from, to);
34411+}
34412+#endif /* 0 */
34413+
34414+
34415+/*
34416+ Local variables:
34417+ c-indentation-style: "K&R"
34418+ mode-name: "LC"
34419+ c-basic-offset: 8
34420+ tab-width: 8
34421+ fill-column: 80
34422+ scroll-step: 1
34423+ End:
34424+*/
34425diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.h
34426--- linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
34427+++ linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.h 2007-05-06 14:50:43.774999471 +0400
34428@@ -0,0 +1,554 @@
34429+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
34430+/* See http://www.namesys.com/cryptcompress_design.html */
34431+
34432+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
34433+#define __FS_REISER4_CRYPTCOMPRESS_H__
34434+
34435+#include "../../page_cache.h"
34436+#include "../compress/compress.h"
34437+#include "../crypto/cipher.h"
34438+
34439+#include <linux/pagemap.h>
34440+
34441+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
34442+#define MAX_CLUSTER_SHIFT 16
34443+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
34444+#define DC_CHECKSUM_SIZE 4
34445+
34446+#define MIN_LATTICE_FACTOR 1
34447+#define MAX_LATTICE_FACTOR 32
34448+
34449+/* this mask contains all non-standard plugins that might
34450+ be present in reiser4-specific part of inode managed by
34451+ cryptcompress file plugin */
34452+#define cryptcompress_mask \
34453+ ((1 << PSET_FILE) | \
34454+ (1 << PSET_CLUSTER) | \
34455+ (1 << PSET_CIPHER) | \
34456+ (1 << PSET_DIGEST) | \
34457+ (1 << PSET_COMPRESSION) | \
34458+ (1 << PSET_COMPRESSION_MODE))
34459+
34460+static inline loff_t min_count(loff_t a, loff_t b)
34461+{
34462+ return (a < b ? a : b);
34463+}
34464+
34465+static inline loff_t max_count(loff_t a, loff_t b)
34466+{
34467+ return (a > b ? a : b);
34468+}
34469+
34470+#if REISER4_DEBUG
34471+static inline int cluster_shift_ok(int shift)
34472+{
34473+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
34474+}
34475+#endif
34476+
34477+typedef struct tfm_stream {
34478+ __u8 *data;
34479+ size_t size;
34480+} tfm_stream_t;
34481+
34482+typedef enum {
34483+ INPUT_STREAM,
34484+ OUTPUT_STREAM,
34485+ LAST_STREAM
34486+} tfm_stream_id;
34487+
34488+typedef tfm_stream_t *tfm_unit[LAST_STREAM];
34489+
34490+static inline __u8 *ts_data(tfm_stream_t * stm)
34491+{
34492+ assert("edward-928", stm != NULL);
34493+ return stm->data;
34494+}
34495+
34496+static inline size_t ts_size(tfm_stream_t * stm)
34497+{
34498+ assert("edward-929", stm != NULL);
34499+ return stm->size;
34500+}
34501+
34502+static inline void set_ts_size(tfm_stream_t * stm, size_t size)
34503+{
34504+ assert("edward-930", stm != NULL);
34505+
34506+ stm->size = size;
34507+}
34508+
34509+static inline int alloc_ts(tfm_stream_t ** stm)
34510+{
34511+ assert("edward-931", stm);
34512+ assert("edward-932", *stm == NULL);
34513+
34514+ *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get());
34515+ if (*stm == NULL)
34516+ return -ENOMEM;
34517+ memset(*stm, 0, sizeof **stm);
34518+ return 0;
34519+}
34520+
34521+static inline void free_ts(tfm_stream_t * stm)
34522+{
34523+ assert("edward-933", !ts_data(stm));
34524+ assert("edward-934", !ts_size(stm));
34525+
34526+ kfree(stm);
34527+}
34528+
34529+static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
34530+{
34531+ assert("edward-935", !ts_data(stm));
34532+ assert("edward-936", !ts_size(stm));
34533+ assert("edward-937", size != 0);
34534+
34535+ stm->data = reiser4_vmalloc(size);
34536+ if (!stm->data)
34537+ return -ENOMEM;
34538+ set_ts_size(stm, size);
34539+ return 0;
34540+}
34541+
34542+static inline void free_ts_data(tfm_stream_t * stm)
34543+{
34544+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
34545+
34546+ if (ts_data(stm))
34547+ vfree(ts_data(stm));
34548+ memset(stm, 0, sizeof *stm);
34549+}
34550+
34551+/* Write modes for item conversion in flush convert phase */
34552+typedef enum {
34553+ CRC_APPEND_ITEM = 1,
34554+ CRC_OVERWRITE_ITEM = 2,
34555+ CRC_CUT_ITEM = 3
34556+} cryptcompress_write_mode_t;
34557+
34558+typedef enum {
34559+ PCL_UNKNOWN = 0, /* invalid option */
34560+ PCL_APPEND = 1, /* append and/or overwrite */
34561+ PCL_TRUNCATE = 2 /* truncate */
34562+} page_cluster_op;
34563+
34564+/* Reiser4 file write/read transforms a page cluster into a disk cluster (and
34565+   back) using crypto/compression transforms implemented by reiser4 transform
34566+   plugins. Before each transform we allocate a pair of streams (tfm_unit) and
34567+   assemble the page cluster into the input one. After the transform we split
34568+   the output stream into a set of items (disk cluster).
34569+*/
34570+typedef struct tfm_cluster {
34571+ coa_set coa;
34572+ tfm_unit tun;
34573+ tfm_action act;
34574+ int uptodate;
34575+ int lsize; /* size of the logical cluster */
34576+ int len; /* length of the transform stream */
34577+} tfm_cluster_t;
34578+
34579+static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
34580+{
34581+ return tc->coa[id][act];
34582+}
34583+
34584+static inline void
34585+set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
34586+{
34587+ tc->coa[id][act] = coa;
34588+}
34589+
34590+static inline int
34591+alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
34592+{
34593+ coa_t coa;
34594+
34595+ coa = cplug->alloc(tc->act);
34596+ if (IS_ERR(coa))
34597+ return PTR_ERR(coa);
34598+ set_coa(tc, cplug->h.id, tc->act, coa);
34599+ return 0;
34600+}
34601+
34602+static inline int
34603+grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
34604+{
34605+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
34606+ alloc_coa(tc, cplug) : 0);
34607+}
34608+
34609+static inline void free_coa_set(tfm_cluster_t * tc)
34610+{
34611+ tfm_action j;
34612+ reiser4_compression_id i;
34613+ compression_plugin *cplug;
34614+
34615+ assert("edward-810", tc != NULL);
34616+
34617+ for (j = 0; j < TFMA_LAST; j++)
34618+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
34619+ if (!get_coa(tc, i, j))
34620+ continue;
34621+ cplug = compression_plugin_by_id(i);
34622+ assert("edward-812", cplug->free != NULL);
34623+ cplug->free(get_coa(tc, i, j), j);
34624+ set_coa(tc, i, j, 0);
34625+ }
34626+ return;
34627+}
34628+
34629+static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
34630+{
34631+ return tc->tun[id];
34632+}
34633+
34634+static inline void
34635+set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
34636+{
34637+ tc->tun[id] = ts;
34638+}
34639+
34640+static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
34641+{
34642+ return ts_data(tfm_stream(tc, id));
34643+}
34644+
34645+static inline void
34646+set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
34647+{
34648+ tfm_stream(tc, id)->data = data;
34649+}
34650+
34651+static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
34652+{
34653+ return ts_size(tfm_stream(tc, id));
34654+}
34655+
34656+static inline void
34657+set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
34658+{
34659+ tfm_stream(tc, id)->size = size;
34660+}
34661+
34662+static inline int
34663+alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
34664+{
34665+ assert("edward-939", tc != NULL);
34666+ assert("edward-940", !tfm_stream(tc, id));
34667+
34668+ tc->tun[id] = kmalloc(sizeof(tfm_stream_t), reiser4_ctx_gfp_mask_get());
34669+ if (!tc->tun[id])
34670+ return -ENOMEM;
34671+ memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t));
34672+ return alloc_ts_data(tfm_stream(tc, id), size);
34673+}
34674+
34675+static inline int
34676+realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
34677+{
34678+ assert("edward-941", tfm_stream_size(tc, id) < size);
34679+ free_ts_data(tfm_stream(tc, id));
34680+ return alloc_ts_data(tfm_stream(tc, id), size);
34681+}
34682+
34683+static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
34684+{
34685+ free_ts_data(tfm_stream(tc, id));
34686+ free_ts(tfm_stream(tc, id));
34687+ set_tfm_stream(tc, id, 0);
34688+}
34689+
34690+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
34691+{
34692+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
34693+}
34694+
34695+static inline void free_tfm_unit(tfm_cluster_t * tc)
34696+{
34697+ tfm_stream_id id;
34698+ for (id = 0; id < LAST_STREAM; id++) {
34699+ if (!tfm_stream(tc, id))
34700+ continue;
34701+ free_tfm_stream(tc, id);
34702+ }
34703+}
34704+
34705+static inline void put_tfm_cluster(tfm_cluster_t * tc)
34706+{
34707+ assert("edward-942", tc != NULL);
34708+ free_coa_set(tc);
34709+ free_tfm_unit(tc);
34710+}
34711+
34712+static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
34713+{
34714+ assert("edward-943", tc != NULL);
34715+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
34716+ return (tc->uptodate == 1);
34717+}
34718+
34719+static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
34720+{
34721+ assert("edward-945", tc != NULL);
34722+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
34723+ tc->uptodate = 1;
34724+ return;
34725+}
34726+
34727+static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
34728+{
34729+ assert("edward-947", tc != NULL);
34730+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
34731+ tc->uptodate = 0;
34732+ return;
34733+}
34734+
34735+static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
34736+{
34737+ return (tfm_stream(tc, id) &&
34738+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
34739+}
34740+
34741+static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
34742+{
34743+ int i;
34744+ for (i = 0; i < LAST_STREAM; i++)
34745+ if (!tfm_stream_is_set(tc, i))
34746+ return 0;
34747+ return 1;
34748+}
34749+
34750+static inline void alternate_streams(tfm_cluster_t * tc)
34751+{
34752+ tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM);
34753+
34754+ set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM));
34755+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
34756+}
34757+
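[Editorial sketch of how the stream helpers above combine for one transform pass
(compression direction chosen for illustration; error handling and the actual plugin
invocation are elided, and "cplug" and "inode" are assumed from the caller's context):

	tfm_cluster_t *tc = &clust->tc;

	if (!tfm_stream(tc, INPUT_STREAM) &&
	    alloc_tfm_stream(tc, inode_cluster_size(inode), INPUT_STREAM))
		return -ENOMEM;
	/* ... assemble the page cluster into tfm_stream_data(tc, INPUT_STREAM) ... */
	if (!tfm_stream(tc, OUTPUT_STREAM) &&
	    alloc_tfm_stream(tc, inode_cluster_size(inode) +
			     coa_overrun(cplug, tc->lsize), OUTPUT_STREAM))
		return -ENOMEM;
	/* ... run the compression plugin: input stream -> output stream ... */
	alternate_streams(tc); /* the result becomes input for the next
				  transform (e.g. the cipher) */
]
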
34758+/* a kind of data that we can write to the window */
34759+typedef enum {
34760+	DATA_WINDOW,		/* the data we copy from user space */
34761+	HOLE_WINDOW		/* zeroes if we write a hole */
34762+} window_stat;
34763+
34764+/* Sliding window of cluster size which should be set to the appropriate position
34765+   (defined by cluster index) in a file before page cluster modification by
34766+   file_write. Then we translate file size, offset to write from, number of
34767+   bytes to write, etc. into the following configuration needed to estimate
34768+   the number of pages to read before writing, etc.
34769+*/
34770+typedef struct reiser4_slide {
34771+ unsigned off; /* offset we start to write/truncate from */
34772+ unsigned count; /* number of bytes (zeroes) to write/truncate */
34773+ unsigned delta; /* number of bytes to append to the hole */
34774+ window_stat stat; /* a kind of data to write to the window */
34775+} reiser4_slide_t;
34776+
34777+/* The following is a set of possible disk cluster states */
34778+typedef enum {
34779+ INVAL_DISK_CLUSTER, /* unknown state */
34780+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
34781+ at least 1 time */
34782+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
34783+ converted by flush */
34784+	FAKE_DISK_CLUSTER	/* disk cluster exists neither in memory
34785+				   nor on disk */
34786+} disk_cluster_stat;
34787+
34788+/*
34789+   While implementing all transforms (from page to disk cluster, and back),
34790+   the reiser4 cluster manager fills the following structure encapsulating
34791+   pointers to all the clusters for the same index, including the sliding window above
34792+*/
34793+typedef struct reiser4_cluster {
34794+ tfm_cluster_t tc; /* transform cluster */
34795+ int nr_pages; /* number of pages */
34796+ struct page **pages; /* page cluster */
34797+ page_cluster_op op; /* page cluster operation */
34798+ struct file *file;
34799+ hint_t *hint; /* disk cluster item for traversal */
34800+ disk_cluster_stat dstat; /* state of the current disk cluster */
34801+ cloff_t index; /* offset in the units of cluster size */
34802+ int index_valid; /* to validate the index above, if needed */
34803+ reiser4_slide_t *win; /* sliding window of cluster size */
34804+ int reserved; /* this indicates that space for disk
34805+ cluster modification is reserved */
34806+#if REISER4_DEBUG
34807+ reiser4_context *ctx;
34808+ int reserved_prepped;
34809+ int reserved_unprepped;
34810+#endif
34811+
34812+} reiser4_cluster_t;
34813+
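[Editorial sketch: the cluster handle follows a common lifecycle throughout
cryptcompress.c; this condenses the pattern visible in cryptcompress_append_hole()
and capture_anonymous_clusters() above, with error paths trimmed:

	reiser4_cluster_t clust;
	reiser4_slide_t win;
	int result;

	reiser4_slide_init(&win);
	cluster_init_read(&clust, &win); /* or NULL when no window is needed */
	clust.hint = hint;               /* a zero-initialized hint_t */
	result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
	if (result)
		goto out;
	/* ... set_window() / prepare_cluster() / capture ... */
 out:
	done_lh(&hint->lh);
	put_cluster_handle(&clust);      /* frees the pgset and tfm streams */
]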
34814+static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
34815+{
34816+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
34817+}
34818+
34819+static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
34820+{
34821+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
34822+}
34823+
34824+static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
34825+{
34826+ assert("edward-1057", clust->pages != NULL);
34827+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
34828+ return 0;
34829+}
34830+
34831+static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
34832+{
34833+ assert("edward-949", clust != NULL);
34834+ assert("edward-1362", clust->pages == NULL);
34835+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
34836+
34837+ clust->pages =
34838+ kmalloc(sizeof(*clust->pages) * nrpages,
34839+ reiser4_ctx_gfp_mask_get());
34840+ if (!clust->pages)
34841+ return RETERR(-ENOMEM);
34842+ reset_cluster_pgset(clust, nrpages);
34843+ return 0;
34844+}
34845+
34846+static inline void free_cluster_pgset(reiser4_cluster_t * clust)
34847+{
34848+ assert("edward-951", clust->pages != NULL);
34849+ kfree(clust->pages);
34850+ clust->pages = NULL;
34851+}
34852+
34853+static inline void put_cluster_handle(reiser4_cluster_t * clust)
34854+{
34855+ assert("edward-435", clust != NULL);
34856+
34857+ put_tfm_cluster(&clust->tc);
34858+ if (clust->pages)
34859+ free_cluster_pgset(clust);
34860+ memset(clust, 0, sizeof *clust);
34861+}
34862+
34863+static inline void inc_keyload_count(crypto_stat_t * data)
34864+{
34865+ assert("edward-1410", data != NULL);
34866+ data->keyload_count++;
34867+}
34868+
34869+static inline void dec_keyload_count(crypto_stat_t * data)
34870+{
34871+ assert("edward-1411", data != NULL);
34872+ assert("edward-1412", data->keyload_count > 0);
34873+ data->keyload_count--;
34874+}
34875+
34876+/* cryptcompress specific part of reiser4_inode */
34877+typedef struct cryptcompress_info {
34878+ crypto_stat_t *crypt;
34879+ /* the following 2 fields are controlled by compression mode plugin */
34880+ int compress_toggle; /* current status of compressibility */
34881+ int lattice_factor; /* factor of dynamic lattice. FIXME: Have a
34882+ compression_toggle to keep the factor */
34883+#if REISER4_DEBUG
34884+ int pgcount; /* number of captured pages */
34885+#endif
34886+} cryptcompress_info_t;
34887+
34888+static inline void set_compression_toggle (cryptcompress_info_t * info, int val)
34889+{
34890+ info->compress_toggle = val;
34891+}
34892+
34893+static inline int get_compression_toggle (cryptcompress_info_t * info)
34894+{
34895+ return info->compress_toggle;
34896+}
34897+
34898+static inline int compression_is_on(cryptcompress_info_t * info)
34899+{
34900+ return get_compression_toggle(info) == 1;
34901+}
34902+
34903+static inline void turn_on_compression(cryptcompress_info_t * info)
34904+{
34905+ set_compression_toggle(info, 1);
34906+}
34907+
34908+static inline void turn_off_compression(cryptcompress_info_t * info)
34909+{
34910+ set_compression_toggle(info, 0);
34911+}
34912+
34913+static inline void set_lattice_factor(cryptcompress_info_t * info, int val)
34914+{
34915+ info->lattice_factor = val;
34916+}
34917+
34918+static inline int get_lattice_factor(cryptcompress_info_t * info)
34919+{
34920+ return info->lattice_factor;
34921+}
34922+
34923+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
34924+int equal_to_rdk(znode *, const reiser4_key *);
34925+int goto_right_neighbor(coord_t *, lock_handle *);
34926+int cryptcompress_inode_ok(struct inode *inode);
34927+int coord_is_unprepped_ctail(const coord_t * coord);
34928+extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *,
34929+ znode_lock_mode mode);
34930+extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
34931+ struct page * page, znode_lock_mode mode);
34932+extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
34933+ struct inode * inode);
34934+extern int readpages_cryptcompress(struct file*, struct address_space*,
34935+ struct list_head*, unsigned);
34936+int bind_cryptcompress(struct inode *child, struct inode *parent);
34937+void destroy_inode_cryptcompress(struct inode * inode);
34938+int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust);
34939+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
34940+ reiser4_cluster_t * clust, int * progress);
34941+crypto_stat_t * inode_crypto_stat (struct inode * inode);
34942+void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
34943+ int (*can_inherit)(struct inode * child,
34944+ struct inode * parent));
34945+void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
34946+void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
34947+crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode);
34948+
34949+static inline struct crypto_blkcipher * info_get_cipher(crypto_stat_t * info)
34950+{
34951+ return info->cipher;
34952+}
34953+
34954+static inline void info_set_cipher(crypto_stat_t * info,
34955+ struct crypto_blkcipher * tfm)
34956+{
34957+ info->cipher = tfm;
34958+}
34959+
34960+static inline struct crypto_hash * info_get_digest(crypto_stat_t * info)
34961+{
34962+ return info->digest;
34963+}
34964+
34965+static inline void info_set_digest(crypto_stat_t * info,
34966+ struct crypto_hash * tfm)
34967+{
34968+ info->digest = tfm;
34969+}
34970+
34971+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
34972+
34973+/* Make Linus happy.
34974+ Local variables:
34975+ c-indentation-style: "K&R"
34976+ mode-name: "LC"
34977+ c-basic-offset: 8
34978+ tab-width: 8
34979+ fill-column: 120
34980+ scroll-step: 1
34981+ End:
34982+*/
34983diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file.c linux-2.6.20/fs/reiser4/plugin/file/file.c
34984--- linux-2.6.20.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
34985+++ linux-2.6.20/fs/reiser4/plugin/file/file.c 2007-05-06 14:50:43.779000721 +0400
34986@@ -0,0 +1,2821 @@
34987+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
34988+ * reiser4/README */
34989+
34990+/*
34991+ * this file contains implementations of inode/file/address_space/file plugin
34992+ * operations specific for "unix file plugin" (plugin id is
34993+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
34994+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
34995+ * no items but stat data)
34996+ */
34997+
34998+#include "../../inode.h"
34999+#include "../../super.h"
35000+#include "../../tree_walk.h"
35001+#include "../../carry.h"
35002+#include "../../page_cache.h"
35003+#include "../../ioctl.h"
35004+#include "../object.h"
35005+#include "../../safe_link.h"
35006+
35007+#include <linux/writeback.h>
35008+#include <linux/pagevec.h>
35009+#include <linux/syscalls.h>
35010+
35011+
35012+static int unpack(struct file *file, struct inode *inode, int forever);
35013+static void drop_access(unix_file_info_t *);
35014+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35015+ znode_lock_mode lock_mode);
35016+
35017+/* Get exclusive access and make sure that the file is not partially
35018+ * converted (it may happen that another process is doing tail
35019+ * conversion; if so, wait until it completes)
35020+ */
35021+static inline void get_exclusive_access_careful(unix_file_info_t * uf_info,
35022+ struct inode *inode)
35023+{
35024+ do {
35025+ get_exclusive_access(uf_info);
35026+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
35027+ break;
35028+ drop_exclusive_access(uf_info);
35029+ schedule();
35030+ } while (1);
35031+}
35032+
35033+/* get unix file plugin specific portion of inode */
35034+unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35035+{
35036+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35037+}
35038+
35039+/**
35040+ * equal_to_rdk - compare key and znode's right delimiting key
35041+ * @node: node whose right delimiting key to compare with @key
35042+ * @key: key to compare with @node's right delimiting key
35043+ *
35044+ * Returns true if @key is equal to right delimiting key of @node.
35045+ */
35046+int equal_to_rdk(znode *node, const reiser4_key *key)
35047+{
35048+ int result;
35049+
35050+ read_lock_dk(znode_get_tree(node));
35051+ result = keyeq(key, znode_get_rd_key(node));
35052+ read_unlock_dk(znode_get_tree(node));
35053+ return result;
35054+}
35055+
35056+#if REISER4_DEBUG
35057+
35058+/**
35059+ * equal_to_ldk - compare key and znode's left delimiting key
35060+ * @node: node whose left delimiting key to compare with @key
35061+ * @key: key to compare with @node's left delimiting key
35062+ *
35063+ * Returns true if @key is equal to left delimiting key of @node.
35064+ */
35065+int equal_to_ldk(znode *node, const reiser4_key *key)
35066+{
35067+ int result;
35068+
35069+ read_lock_dk(znode_get_tree(node));
35070+ result = keyeq(key, znode_get_ld_key(node));
35071+ read_unlock_dk(znode_get_tree(node));
35072+ return result;
35073+}
35074+
35075+/**
35076+ * check_coord - check whether coord corresponds to key
35077+ * @coord: coord to check
35078+ * @key: key @coord has to correspond to
35079+ *
35080+ * Returns true if @coord is set as if it was set as result of lookup with @key
35081+ * in coord->node.
35082+ */
35083+static int check_coord(const coord_t *coord, const reiser4_key *key)
35084+{
35085+ coord_t twin;
35086+
35087+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
35088+ FIND_MAX_NOT_MORE_THAN, &twin);
35089+ return coords_equal(coord, &twin);
35090+}
35091+
35092+#endif /* REISER4_DEBUG */
35093+
35094+/**
35095+ * init_uf_coord - initialize extended coord
35096+ * @uf_coord:
35097+ * @lh:
35098+ *
35099+ *
35100+ */
35101+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35102+{
35103+ coord_init_zero(&uf_coord->coord);
35104+ coord_clear_iplug(&uf_coord->coord);
35105+ uf_coord->lh = lh;
35106+ init_lh(lh);
35107+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35108+ uf_coord->valid = 0;
35109+}
35110+
35111+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35112+{
35113+ assert("vs-1333", uf_coord->valid == 0);
35114+
35115+ if (coord_is_between_items(&uf_coord->coord))
35116+ return;
35117+
35118+ assert("vs-1348",
35119+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35120+ init_coord_extension);
35121+
35122+ item_body_by_coord(&uf_coord->coord);
35123+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35124+ init_coord_extension(uf_coord, offset);
35125+}
35126+
35127+/**
35128+ * goto_right_neighbor - lock right neighbor, drop current node lock
35129+ * @coord:
35130+ * @lh:
35131+ *
35132+ * Obtain lock on right neighbor and drop lock on current node.
35133+ */
35134+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35135+{
35136+ int result;
35137+ lock_handle lh_right;
35138+
35139+ assert("vs-1100", znode_is_locked(coord->node));
35140+
35141+ init_lh(&lh_right);
35142+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
35143+ znode_is_wlocked(coord->node) ?
35144+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35145+ GN_CAN_USE_UPPER_LEVELS);
35146+ if (result) {
35147+ done_lh(&lh_right);
35148+ return result;
35149+ }
35150+
35151+ /*
35152+	 * we hold two long-term locks on neighboring nodes. Unlock the left
35153+	 * one
35154+ */
35155+ done_lh(lh);
35156+
35157+ coord_init_first_unit_nocheck(coord, lh_right.node);
35158+ move_lh(lh, &lh_right);
35159+
35160+ return 0;
35161+
35162+}
35163+
35164+/**
35165+ * set_file_state
35166+ * @uf_info:
35167+ * @cbk_result:
35168+ * @level:
35169+ *
35170+ * This is to be used by find_file_item and in find_file_state to
35171+ * determine real state of file
35172+ */
35173+static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
35174+ tree_level level)
35175+{
35176+ if (cbk_errored(cbk_result))
35177+ /* error happened in find_file_item */
35178+ return;
35179+
35180+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35181+
35182+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35183+ /*
35184+ * container is unknown, therefore conversion can not be in
35185+ * progress
35186+ */
35187+ assert("",
35188+ !reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35189+ REISER4_PART_IN_CONV));
35190+ if (cbk_result == CBK_COORD_NOTFOUND)
35191+ uf_info->container = UF_CONTAINER_EMPTY;
35192+ else if (level == LEAF_LEVEL)
35193+ uf_info->container = UF_CONTAINER_TAILS;
35194+ else
35195+ uf_info->container = UF_CONTAINER_EXTENTS;
35196+ } else {
35197+ /*
35198+ * file state is known, check whether it is set correctly if
35199+ * file is not being tail converted
35200+ */
35201+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35202+ REISER4_PART_IN_CONV)) {
35203+ assert("vs-1162",
35204+ ergo(level == LEAF_LEVEL &&
35205+ cbk_result == CBK_COORD_FOUND,
35206+ uf_info->container == UF_CONTAINER_TAILS));
35207+ assert("vs-1165",
35208+ ergo(level == TWIG_LEVEL &&
35209+ cbk_result == CBK_COORD_FOUND,
35210+ uf_info->container == UF_CONTAINER_EXTENTS));
35211+ }
35212+ }
35213+}
35214+
35215+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35216+ const reiser4_key *key, znode_lock_mode lock_mode,
35217+ struct inode *inode)
35218+{
35219+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
35220+ FIND_MAX_NOT_MORE_THAN,
35221+ TWIG_LEVEL, LEAF_LEVEL,
35222+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35223+ (CBK_UNIQUE | CBK_FOR_INSERT),
35224+ NULL /* ra_info */ );
35225+}
35226+
35227+/**
35228+ * find_file_item - look for file item in the tree
35229+ * @hint: provides coordinate, lock handle, seal
35230+ * @key: key for search
35231+ * @mode: mode of lock to put on returned node
35232+ * @ra_info:
35233+ * @inode:
35234+ *
35235+ * This finds position in the tree corresponding to @key. It first tries to use
35236+ * @hint's seal if it is set.
35237+ */
35238+int find_file_item(hint_t *hint, const reiser4_key *key,
35239+ znode_lock_mode lock_mode,
35240+ struct inode *inode)
35241+{
35242+ int result;
35243+ coord_t *coord;
35244+ lock_handle *lh;
35245+
35246+ assert("nikita-3030", reiser4_schedulable());
35247+ assert("vs-1707", hint != NULL);
35248+ assert("vs-47", inode != NULL);
35249+
35250+ coord = &hint->ext_coord.coord;
35251+ lh = hint->ext_coord.lh;
35252+ init_lh(lh);
35253+
35254+ result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35255+ if (!result) {
35256+ if (coord->between == AFTER_UNIT &&
35257+ equal_to_rdk(coord->node, key)) {
35258+ result = goto_right_neighbor(coord, lh);
35259+ if (result == -E_NO_NEIGHBOR)
35260+ return RETERR(-EIO);
35261+ if (result)
35262+ return result;
35263+ assert("vs-1152", equal_to_ldk(coord->node, key));
35264+ /*
35265+ * we moved to different node. Invalidate coord
35266+ * extension, zload is necessary to init it again
35267+ */
35268+ hint->ext_coord.valid = 0;
35269+ }
35270+
35271+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35272+ znode_get_level(coord->node));
35273+
35274+ return CBK_COORD_FOUND;
35275+ }
35276+
35277+ coord_init_zero(coord);
35278+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35279+ set_file_state(unix_file_inode_data(inode), result,
35280+ znode_get_level(coord->node));
35281+
35282+ /* FIXME: we might already have coord extension initialized */
35283+ hint->ext_coord.valid = 0;
35284+ return result;
35285+}
35286+
35287+/* plugin->u.file.write_flow = NULL
35288+ plugin->u.file.read_flow = NULL */
35289+
35290+void hint_init_zero(hint_t * hint)
35291+{
35292+ memset(hint, 0, sizeof(*hint));
35293+ init_lh(&hint->lh);
35294+ hint->ext_coord.lh = &hint->lh;
35295+}
35296+
35297+static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
35298+{
35299+ int result;
35300+ reiser4_key key;
35301+ coord_t coord;
35302+ lock_handle lh;
35303+
35304+ assert("vs-1628", ea_obtained(uf_info));
35305+
35306+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35307+ key_by_inode_and_offset_common(inode, 0, &key);
35308+ init_lh(&lh);
35309+ result = find_file_item_nohint(&coord, &lh, &key,
35310+ ZNODE_READ_LOCK, inode);
35311+ set_file_state(uf_info, result, znode_get_level(coord.node));
35312+ done_lh(&lh);
35313+ if (!cbk_errored(result))
35314+ result = 0;
35315+ } else
35316+ result = 0;
35317+ assert("vs-1074",
35318+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35319+ reiser4_txn_restart_current();
35320+ return result;
35321+}
35322+
35323+/* estimate and reserve space needed to truncate a page which gets partially truncated: one block for the page itself, a
35324+ stat data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
35325+ if the page corresponds to a hole extent and an unallocated one will have to be created */
35326+static int reserve_partial_page(reiser4_tree * tree)
35327+{
35328+ grab_space_enable();
35329+ return reiser4_grab_reserved(reiser4_get_current_sb(),
35330+ 1 +
35331+ 2 * estimate_one_insert_into_item(tree),
35332+ BA_CAN_COMMIT);
35333+}
35334+
35335+/* estimate and reserve space needed to cut one item and update one stat data */
35336+static int reserve_cut_iteration(reiser4_tree * tree)
35337+{
35338+ __u64 estimate = estimate_one_item_removal(tree)
35339+ + estimate_one_insert_into_item(tree);
35340+
35341+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
35342+
35343+ grab_space_enable();
35344+ /* We need to double our estimate now that we can delete more than one
35345+ node. */
35346+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
35347+ BA_CAN_COMMIT);
35348+}
35349+
35350+int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
35351+ int update_sd)
35352+{
35353+ int result = 0;
35354+
35355+ INODE_SET_FIELD(inode, i_size, get_key_offset(key));
35356+ if (update_sd) {
35357+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
35358+ result = reiser4_update_sd(inode);
35359+ }
35360+ return result;
35361+}
35362+
35363+/* cut file items one by one, starting from the last one, until the new file size (inode->i_size) is reached. Reserve
35364+ space and update the file's stat data on every single cut from the tree */
35365+int
35366+cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
35367+ loff_t cur_size, int (*update_actor) (struct inode *,
35368+ reiser4_key *, int))
35369+{
35370+ reiser4_key from_key, to_key;
35371+ reiser4_key smallest_removed;
35372+ file_plugin *fplug = inode_file_plugin(inode);
35373+ int result;
35374+ int progress = 0;
35375+
35376+ assert("vs-1248",
35377+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
35378+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35379+
35380+ fplug->key_by_inode(inode, new_size, &from_key);
35381+ to_key = from_key;
35382+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
35383+ /* this loop normally runs just once */
35384+ while (1) {
35385+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
35386+ if (result)
35387+ break;
35388+
35389+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
35390+ &smallest_removed, inode, 1,
35391+ &progress);
35392+ if (result == -E_REPEAT) {
35393+ /* -E_REPEAT is a signal to interrupt a long file truncation process */
35394+ if (progress) {
35395+ result =
35396+ update_actor(inode, &smallest_removed,
35397+ update_sd);
35398+ if (result)
35399+ break;
35400+ }
35401+
35402+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35403+ reiser4_release_reserved(inode->i_sb);
35404+
35405+ /* reiser4_cut_tree_object() was interrupted probably because
35406+ * current atom requires commit, we have to release
35407+ * transaction handle to allow atom commit. */
35408+ reiser4_txn_restart_current();
35409+ continue;
35410+ }
35411+ if (result
35412+ && !(result == CBK_COORD_NOTFOUND && new_size == 0
35413+ && inode->i_size == 0))
35414+ break;
35415+
35416+ set_key_offset(&smallest_removed, new_size);
35417+ /* Final sd update after the file gets its correct size */
35418+ result = update_actor(inode, &smallest_removed, update_sd);
35419+ break;
35420+ }
35421+
35422+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35423+ reiser4_release_reserved(inode->i_sb);
35424+
35425+ return result;
35426+}
35427+
35428+int find_or_create_extent(struct page *page);
35429+
35430+/* part of truncate_file_body: it is called when truncate is used to make file
35431+ shorter */
35432+static int shorten_file(struct inode *inode, loff_t new_size)
35433+{
35434+ int result;
35435+ struct page *page;
35436+ int padd_from;
35437+ unsigned long index;
35438+ char *kaddr;
35439+ unix_file_info_t *uf_info;
35440+
35441+ /*
35442+ * all items of ordinary reiser4 file are grouped together. That is why
35443+ * we can use reiser4_cut_tree. Plan B files (for instance) can not be
35444+ * truncated that simply
35445+ */
35446+ result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
35447+ get_key_offset(reiser4_max_key()),
35448+ reiser4_update_file_size);
35449+ if (result)
35450+ return result;
35451+
35452+ uf_info = unix_file_inode_data(inode);
35453+ assert("vs-1105", new_size == inode->i_size);
35454+ if (new_size == 0) {
35455+ uf_info->container = UF_CONTAINER_EMPTY;
35456+ return 0;
35457+ }
35458+
35459+ result = find_file_state(inode, uf_info);
35460+ if (result)
35461+ return result;
35462+ if (uf_info->container == UF_CONTAINER_TAILS)
35463+ /*
35464+ * No need to worry about zeroing last page after new file
35465+ * end
35466+ */
35467+ return 0;
35468+
35469+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
35470+ if (!padd_from)
35471+ /* file is truncated to page boundary */
35472+ return 0;
35473+
35474+ result = reserve_partial_page(reiser4_tree_by_inode(inode));
35475+ if (result) {
35476+ reiser4_release_reserved(inode->i_sb);
35477+ return result;
35478+ }
35479+
35480+ /* last page is partially truncated - zero its content */
35481+ index = (inode->i_size >> PAGE_CACHE_SHIFT);
35482+ page = read_mapping_page(inode->i_mapping, index, NULL);
35483+ if (IS_ERR(page)) {
35484+ /*
35485+ * the below does up(sbinfo->delete_mutex). Do not get
35486+ * confused
35487+ */
35488+ reiser4_release_reserved(inode->i_sb);
35489+ if (likely(PTR_ERR(page) == -EINVAL)) {
35490+ /* looks like file is built of tail items */
35491+ return 0;
35492+ }
35493+ return PTR_ERR(page);
35494+ }
35495+ wait_on_page_locked(page);
35496+ if (!PageUptodate(page)) {
35497+ page_cache_release(page);
35498+ /*
35499+ * the below does up(sbinfo->delete_mutex). Do not get
35500+ * confused
35501+ */
35502+ reiser4_release_reserved(inode->i_sb);
35503+ return RETERR(-EIO);
35504+ }
35505+
35506+ /*
35507+ * if page corresponds to hole extent unit - an unallocated one will be
35508+ * created here. This is not necessary
35509+ */
35510+ result = find_or_create_extent(page);
35511+
35512+ /*
35513+ * FIXME: cut_file_items has already updated inode. Probably it would
35514+ * be better to update it here when file is really truncated
35515+ */
35516+ if (result) {
35517+ page_cache_release(page);
35518+ /*
35519+ * the below does up(sbinfo->delete_mutex). Do not get
35520+ * confused
35521+ */
35522+ reiser4_release_reserved(inode->i_sb);
35523+ return result;
35524+ }
35525+
35526+ lock_page(page);
35527+ assert("vs-1066", PageLocked(page));
35528+ kaddr = kmap_atomic(page, KM_USER0);
35529+ memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
35530+ flush_dcache_page(page);
35531+ kunmap_atomic(kaddr, KM_USER0);
35532+ unlock_page(page);
35533+ page_cache_release(page);
35534+ /* the below does up(sbinfo->delete_mutex). Do not get confused */
35535+ reiser4_release_reserved(inode->i_sb);
35536+ return 0;
35537+}
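+
+/*
+ * A worked example of the partial-page arithmetic in shorten_file(),
+ * assuming PAGE_CACHE_SIZE == 4096: truncating to new_size == 5000 gives
+ * padd_from == (5000 & 4095) == 904 and index == (5000 >> 12) == 1, so
+ * bytes 904..4095 of page 1 are zeroed while page 0 is left intact.
+ */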
35538+
35539+/**
35540+ * should_have_notail
35541+ * @uf_info:
35542+ * @new_size:
35543+ *
35544+ * Calls formatting plugin to see whether file of size @new_size has to be
35545+ * stored in unformatted nodes or in tail items. 0 is returned for the latter case.
35546+ */
35547+static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
35548+{
35549+ if (!uf_info->tplug)
35550+ return 1;
35551+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
35552+ new_size);
35553+
35554+}
35555+
35556+/**
35557+ * truncate_file_body - change length of file
35558+ * @inode: inode of file
35559+ * @new_size: new file length
35560+ *
35561+ * Adjusts the items file @inode is built of to match @new_size. It may either cut
35562+ * items or add them to represent a hole at the end of file. The caller has to
35563+ * obtain exclusive access to the file.
35564+ */
35565+static int truncate_file_body(struct inode *inode, loff_t new_size)
35566+{
35567+ int result;
35568+
35569+ if (inode->i_size < new_size) {
35570+ /* expanding truncate */
35571+ struct dentry dentry;
35572+ struct file file;
35573+ unix_file_info_t *uf_info;
35574+
35575+ dentry.d_inode = inode;
35576+ file.f_dentry = &dentry;
35577+ file.private_data = NULL;
35578+ file.f_pos = new_size;
35580+ uf_info = unix_file_inode_data(inode);
35581+ result = find_file_state(inode, uf_info);
35582+ if (result)
35583+ return result;
35584+
35585+ if (should_have_notail(uf_info, new_size)) {
35586+ /*
35587+ * file of size @new_size has to be built of
35588+ * extents. If it is built of tails - convert to
35589+ * extents
35590+ */
35591+ if (uf_info->container == UF_CONTAINER_TAILS) {
35592+ /*
35593+ * if file is being converted by another process
35594+ * - wait until it completes
35595+ */
35596+ while (1) {
35597+ if (reiser4_inode_get_flag(inode,
35598+ REISER4_PART_IN_CONV)) {
35599+ drop_exclusive_access(uf_info);
35600+ schedule();
35601+ get_exclusive_access(uf_info);
35602+ continue;
35603+ }
35604+ break;
35605+ }
35606+
35607+ if (uf_info->container == UF_CONTAINER_TAILS) {
35608+ result = tail2extent(uf_info);
35609+ if (result)
35610+ return result;
35611+ }
35612+ }
35613+ result = reiser4_write_extent(&file, NULL, 0,
35614+ &new_size);
35615+ if (result)
35616+ return result;
35617+ uf_info->container = UF_CONTAINER_EXTENTS;
35618+ } else {
35619+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
35620+ result = reiser4_write_extent(&file, NULL, 0,
35621+ &new_size);
35622+ if (result)
35623+ return result;
35624+ } else {
35625+ result = reiser4_write_tail(&file, NULL, 0,
35626+ &new_size);
35627+ if (result)
35628+ return result;
35629+ uf_info->container = UF_CONTAINER_TAILS;
35630+ }
35631+ }
35632+ BUG_ON(result > 0);
35633+ INODE_SET_FIELD(inode, i_size, new_size);
35634+ file_update_time(&file);
35635+ result = reiser4_update_sd(inode);
35636+ BUG_ON(result != 0);
35637+ reiser4_free_file_fsdata(&file);
35638+ } else
35639+ result = shorten_file(inode, new_size);
35640+ return result;
35641+}
35642+
35643+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
35644+
35645+/**
35646+ * load_file_hint - copy hint from struct file to local variable
35647+ * @file: file to get hint from
35648+ * @hint: structure to fill
35649+ *
35650+ * The reiser4 specific portion of struct file may contain information (hint)
35651+ * stored on exit from a previous read or write. That information includes the
35652+ * seal of a znode and the coord within that znode where the previous read or
35653+ * write stopped. This function copies that information to @hint if it was
35654+ * stored, or initializes @hint with zeros otherwise.
35655+ */
35656+int load_file_hint(struct file *file, hint_t *hint)
35657+{
35658+ reiser4_file_fsdata *fsdata;
35659+
35660+ if (file) {
35661+ fsdata = reiser4_get_file_fsdata(file);
35662+ if (IS_ERR(fsdata))
35663+ return PTR_ERR(fsdata);
35664+
35665+ spin_lock_inode(file->f_dentry->d_inode);
35666+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
35667+ *hint = fsdata->reg.hint;
35668+ init_lh(&hint->lh);
35669+ hint->ext_coord.lh = &hint->lh;
35670+ spin_unlock_inode(file->f_dentry->d_inode);
35671+ /*
35672+ * force re-validation of the coord on the first
35673+ * iteration of the read/write loop.
35674+ */
35675+ hint->ext_coord.valid = 0;
35676+ assert("nikita-19892", coords_equal(&hint->seal.coord1,
35677+ &hint->ext_coord.
35678+ coord));
35679+ return 0;
35680+ }
35681+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
35682+ spin_unlock_inode(file->f_dentry->d_inode);
35683+ }
35684+ hint_init_zero(hint);
35685+ return 0;
35686+}
35687+
35688+/**
35689+ * save_file_hint - copy hint to reiser4 private struct file's part
35690+ * @file: file to save hint in
35691+ * @hint: hint to save
35692+ *
35693+ * This copies @hint to reiser4 private part of struct file. It can help
35694+ * speed up future accesses to the file.
35695+ */
35696+void save_file_hint(struct file *file, const hint_t *hint)
35697+{
35698+ reiser4_file_fsdata *fsdata;
35699+
35700+ assert("edward-1337", hint != NULL);
35701+
35702+ if (!file || !reiser4_seal_is_set(&hint->seal))
35703+ return;
35704+ fsdata = reiser4_get_file_fsdata(file);
35705+ assert("vs-965", !IS_ERR(fsdata));
35706+ assert("nikita-19891",
35707+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
35708+ assert("vs-30", hint->lh.owner == NULL);
35709+ spin_lock_inode(file->f_dentry->d_inode);
35710+ fsdata->reg.hint = *hint;
35711+ spin_unlock_inode(file->f_dentry->d_inode);
35712+ return;
35713+}
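+
+/*
+ * A simplified sketch of the hint lifecycle used by the read/write paths
+ * in this file (error handling omitted):
+ *
+ *	hint_t hint;
+ *
+ *	load_file_hint(file, &hint);		   restore seal/coord, if any
+ *	find_file_item(&hint, &key, mode, inode);  uses the seal when valid
+ *	... read or write at hint.ext_coord ...
+ *	reiser4_set_hint(&hint, &key, mode);	   seal the new position
+ *	save_file_hint(file, &hint);		   stash it for the next call
+ */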
35714+
35715+void reiser4_unset_hint(hint_t * hint)
35716+{
35717+ assert("vs-1315", hint);
35718+ hint->ext_coord.valid = 0;
35719+ reiser4_seal_done(&hint->seal);
35720+ done_lh(&hint->lh);
35721+}
35722+
35723+/* coord must be set properly, so that reiser4_set_hint
35724+ has nothing to do */
35725+void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
35726+ znode_lock_mode mode)
35727+{
35728+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
35729+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
35730+
35731+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
35732+ hint->offset = get_key_offset(key);
35733+ hint->mode = mode;
35734+ done_lh(&hint->lh);
35735+}
35736+
35737+int hint_is_set(const hint_t * hint)
35738+{
35739+ return reiser4_seal_is_set(&hint->seal);
35740+}
35741+
35742+#if REISER4_DEBUG
35743+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
35744+{
35745+ return (get_key_locality(k1) == get_key_locality(k2) &&
35746+ get_key_type(k1) == get_key_type(k2) &&
35747+ get_key_band(k1) == get_key_band(k2) &&
35748+ get_key_ordering(k1) == get_key_ordering(k2) &&
35749+ get_key_objectid(k1) == get_key_objectid(k2));
35750+}
35751+#endif
35752+
35753+static int
35754+hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35755+ znode_lock_mode lock_mode)
35756+{
35757+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
35758+ /* hint either not set or set by different operation */
35759+ return RETERR(-E_REPEAT);
35760+
35761+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
35762+
35763+ if (check_key && get_key_offset(key) != hint->offset)
35764+ /* hint is set for different key */
35765+ return RETERR(-E_REPEAT);
35766+
35767+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
35768+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
35769+ hint->ext_coord.lh, lock_mode,
35770+ ZNODE_LOCK_LOPRI);
35771+}
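+
+/*
+ * Callers treat a hint_validate() failure as "do a full lookup", not as
+ * an error, e.g. (mirroring find_file_item() above):
+ *
+ *	result = hint_validate(hint, key, 1, lock_mode);
+ *	if (result) {
+ *		coord_init_zero(coord);
+ *		result = find_file_item_nohint(coord, lh, key,
+ *					       lock_mode, inode);
+ *	}
+ */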
35772+
35773+/**
35774+ * find_or_create_extent -
35775+ * @page: page to capture
35776+ *
35777+ * Looks for a place at the twig level for an extent corresponding to @page,
35778+ * calls extent's writepage method to create an unallocated extent if it does
35779+ * not exist yet, initializes the jnode and captures the page.
35780+ */
35781+int find_or_create_extent(struct page *page)
35782+{
35783+ int result;
35784+ struct inode *inode;
35785+ int plugged_hole;
35786+
35787+ jnode *node;
35788+
35789+ assert("vs-1065", page->mapping && page->mapping->host);
35790+ inode = page->mapping->host;
35791+
35792+ lock_page(page);
35793+ node = jnode_of_page(page);
35794+ if (IS_ERR(node)) {
35795+ unlock_page(page);
35796+ return PTR_ERR(node);
35797+ }
35798+ JF_SET(node, JNODE_WRITE_PREPARED);
35799+ unlock_page(page);
35800+
35801+ if (node->blocknr == 0) {
35802+ plugged_hole = 0;
35803+ result = reiser4_update_extent(inode, node, page_offset(page),
35804+ &plugged_hole);
35805+ if (result) {
35806+ JF_CLR(node, JNODE_WRITE_PREPARED);
35807+ jput(node);
35808+ warning("", "reiser4_update_extent failed: %d", result);
35809+ return result;
35810+ }
35811+ if (plugged_hole)
35812+ reiser4_update_sd(inode);
35813+ } else {
35814+ spin_lock_jnode(node);
35815+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
35816+ BUG_ON(result != 0);
35817+ jnode_make_dirty_locked(node);
35818+ spin_unlock_jnode(node);
35819+ }
35820+
35821+ BUG_ON(node->atom == NULL);
35822+ JF_CLR(node, JNODE_WRITE_PREPARED);
35823+ jput(node);
35824+
35825+ if (get_current_context()->entd) {
35826+ entd_context *ent = get_entd_context(node->tree->super);
35827+
35828+ if (ent->cur_request->page == page)
35829+ ent->cur_request->node = node;
35830+ }
35831+ return 0;
35832+}
35833+
35834+/**
35835+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
35836+ * @inode: inode to check
35837+ *
35838+ * Returns true if inode's mapping has dirty pages which do not belong to any
35839+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
35840+ * tree or were eflushed and can be found via jnodes tagged
35841+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
35842+ */
35843+static int has_anonymous_pages(struct inode *inode)
35844+{
35845+ int result;
35846+
35847+ read_lock_irq(&inode->i_mapping->tree_lock);
35848+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
35849+ read_unlock_irq(&inode->i_mapping->tree_lock);
35850+ return result;
35851+}
35852+
35853+/**
35854+ * capture_page_and_create_extent -
35855+ * @page: page to be captured
35856+ *
35857+ * Grabs space for extent creation and stat data update and calls function to
35858+ * do actual work.
35859+ */
35860+static int capture_page_and_create_extent(struct page *page)
35861+{
35862+ int result;
35863+ struct inode *inode;
35864+
35865+ assert("vs-1084", page->mapping && page->mapping->host);
35866+ inode = page->mapping->host;
35867+ assert("vs-1139",
35868+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
35869+ /* page belongs to file */
35870+ assert("vs-1393",
35871+ inode->i_size > page_offset(page));
35872+
35873+ /* page capture may require extent creation (if it does not exist yet)
35874+ and stat data's update (number of blocks changes on extent
35875+ creation) */
35876+ grab_space_enable();
35877+ result = reiser4_grab_space(2 * estimate_one_insert_into_item
35878+ (reiser4_tree_by_inode(inode)),
35879+ BA_CAN_COMMIT);
35880+ if (likely(!result))
35881+ result = find_or_create_extent(page);
35882+
35883+ if (result != 0)
35884+ SetPageError(page);
35885+ return result;
35886+}
35887+
35888+/* this is implementation of method commit_write of struct
35889+ address_space_operations for unix file plugin */
35890+int
35891+commit_write_unix_file(struct file *file, struct page *page,
35892+ unsigned from, unsigned to)
35893+{
35894+ reiser4_context *ctx;
35895+ struct inode *inode;
35896+ int result;
35897+
35898+ assert("umka-3101", file != NULL);
35899+ assert("umka-3102", page != NULL);
35900+ assert("umka-3093", PageLocked(page));
35901+
35902+ SetPageUptodate(page);
35903+
35904+ inode = page->mapping->host;
35905+ ctx = reiser4_init_context(page->mapping->host->i_sb);
35906+ if (IS_ERR(ctx))
35907+ return PTR_ERR(ctx);
35908+ page_cache_get(page);
35909+ unlock_page(page);
35910+ result = capture_page_and_create_extent(page);
35911+ lock_page(page);
35912+ page_cache_release(page);
35913+
35914+ /* don't commit transaction under inode semaphore */
35915+ context_set_commit_async(ctx);
35916+ reiser4_exit_context(ctx);
35917+ return result;
35918+}
35919+
35920+/*
35921+ * Support for "anonymous" pages and jnodes.
35922+ *
35923+ * When a file is write-accessed through mmap, pages can be dirtied from the
35924+ * user level. The kernel is then not notified until one of the following happens:
35925+ *
35926+ * (1) msync()
35927+ *
35928+ * (2) truncate() (either explicit or through unlink)
35929+ *
35930+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
35931+ * starting write-back.
35932+ *
35933+ * As a result of (3) ->writepage may be called on a dirty page without
35934+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
35935+ * (iozone) generate huge number of anonymous pages. Emergency flush handles
35936+ * this situation by creating jnode for anonymous page, starting IO on the
35937+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
35938+ * memory. Such jnode is also called anonymous.
35939+ *
35940+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
35941+ * tree. This is done by capture_anonymous_*() functions below.
35942+ */
35943+
35944+/**
35945+ * capture_anonymous_page - involve page into transaction
35946+ * @page: page to deal with
35947+ *
35948+ * Takes care that @page has corresponding metadata in the tree, creates jnode
35949+ * for @page and captures it. On success 1 is returned.
35950+ */
35951+static int capture_anonymous_page(struct page *page)
35952+{
35953+ int result;
35954+
35955+ if (PageWriteback(page))
35956+ /* FIXME: do nothing? */
35957+ return 0;
35958+
35959+ result = capture_page_and_create_extent(page);
35960+ if (result == 0) {
35961+ result = 1;
35962+ } else
35963+ warning("nikita-3329",
35964+ "Cannot capture anon page: %i", result);
35965+
35966+ return result;
35967+}
35968+
35969+/**
35970+ * capture_anonymous_pages - find and capture pages dirtied via mmap
35971+ * @mapping: address space where to look for pages
35972+ * @index: start index
35973+ * @to_capture: maximum number of pages to capture
35974+ *
35975+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
35976+ * captures (involves into atom) them, returns number of captured pages,
35977+ * updates @index to next page after the last captured one.
35978+ */
35979+static int
35980+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
35981+ unsigned int to_capture)
35982+{
35983+ int result;
35984+ struct pagevec pvec;
35985+ unsigned int i, count;
35986+ int nr;
35987+
35988+ pagevec_init(&pvec, 0);
35989+ count = min(pagevec_space(&pvec), to_capture);
35990+ nr = 0;
35991+
35992+ /* find pages tagged MOVED */
35993+ write_lock_irq(&mapping->tree_lock);
35994+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
35995+ (void **)pvec.pages, *index, count,
35996+ PAGECACHE_TAG_REISER4_MOVED);
35997+ if (pagevec_count(&pvec) == 0) {
35998+ /*
35999+ * there are no pages tagged MOVED in mapping->page_tree
36000+ * starting from *index
36001+ */
36002+ write_unlock_irq(&mapping->tree_lock);
36003+ *index = (pgoff_t)-1;
36004+ return 0;
36005+ }
36006+
36007+ /* clear MOVED tag for all found pages */
36008+ for (i = 0; i < pagevec_count(&pvec); i++) {
36009+ void *p;
36010+
36011+ page_cache_get(pvec.pages[i]);
36012+ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36013+ PAGECACHE_TAG_REISER4_MOVED);
36014+ assert("vs-49", p == pvec.pages[i]);
36015+ }
36016+ write_unlock_irq(&mapping->tree_lock);
36017+
36019+ *index = pvec.pages[i - 1]->index + 1;
36020+
36021+ for (i = 0; i < pagevec_count(&pvec); i++) {
36022+ /*
36023+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36024+ * reiser4_set_page_dirty_internal which is called when jnode is
36025+ * captured
36026+ */
36027+ result = capture_anonymous_page(pvec.pages[i]);
36028+ if (result == 1)
36029+ nr++;
36030+ else {
36031+ if (result < 0) {
36032+ warning("vs-1454",
36033+ "failed to capture page: "
36034+ "result=%d, captured=%d)\n",
36035+ result, i);
36036+
36037+ /*
36038+ * set MOVED tag on all pages which were left
36039+ * uncaptured
36040+ */
36041+ write_lock_irq(&mapping->tree_lock);
36042+ for (; i < pagevec_count(&pvec); i ++) {
36043+ radix_tree_tag_set(&mapping->page_tree,
36044+ pvec.pages[i]->index,
36045+ PAGECACHE_TAG_REISER4_MOVED);
36046+ }
36047+ write_unlock_irq(&mapping->tree_lock);
36048+
36049+ pagevec_release(&pvec);
36050+ return result;
36051+ } else {
36052+ /*
36053+ * result == 0. capture_anonymous_page returns
36054+ * 0 for Writeback-ed page. Set MOVED tag on
36055+ * that page
36056+ */
36057+ write_lock_irq(&mapping->tree_lock);
36058+ radix_tree_tag_set(&mapping->page_tree,
36059+ pvec.pages[i]->index,
36060+ PAGECACHE_TAG_REISER4_MOVED);
36061+ write_unlock_irq(&mapping->tree_lock);
36062+ if (i == 0)
36063+ *index = pvec.pages[0]->index;
36064+ else
36065+ *index = pvec.pages[i - 1]->index + 1;
36066+ }
36067+ }
36068+ }
36069+ pagevec_release(&pvec);
36070+ return nr;
36071+}
36072+
36073+/**
36074+ * capture_anonymous_jnodes - find and capture anonymous jnodes
36075+ * @mapping: address space where to look for jnodes
36076+ * @from: start index
36077+ * @to: end index
36078+ * @to_capture: maximum number of jnodes to capture
36079+ *
36080+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36081+ * the range of indexes @from-@to and captures them, returns number of captured
36082+ * jnodes, updates @from to next jnode after the last captured one.
36083+ */
36084+static int
36085+capture_anonymous_jnodes(struct address_space *mapping,
36086+ pgoff_t *from, pgoff_t to, int to_capture)
36087+{
36088+ *from = to;
36089+ return 0;
36090+}
36091+
36092+/*
36093+ * Commit atom of the jnode of a page.
36094+ */
36095+static int sync_page(struct page *page)
36096+{
36097+ int result;
36098+ do {
36099+ jnode *node;
36100+ txn_atom *atom;
36101+
36102+ lock_page(page);
36103+ node = jprivate(page);
36104+ if (node != NULL) {
36105+ spin_lock_jnode(node);
36106+ atom = jnode_get_atom(node);
36107+ spin_unlock_jnode(node);
36108+ } else
36109+ atom = NULL;
36110+ unlock_page(page);
36111+ result = reiser4_sync_atom(atom);
36112+ } while (result == -E_REPEAT);
36113+ /*
36114+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36115+ * handle the case where more pages get added to the atom while we are
36116+ * syncing it?
36117+ */
36118+ assert("nikita-3485", ergo(result == 0,
36119+ get_current_context()->trans->atom == NULL));
36120+ return result;
36121+}
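+
+/*
+ * A plausible reading of the -E_REPEAT loop above (not confirmed by the
+ * original comments): reiser4_sync_atom() returns -E_REPEAT when the
+ * atom changed under us (e.g. fused with another atom) before it could
+ * be committed, so the page's current atom is looked up and synced again.
+ */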
36122+
36123+/*
36124+ * Commit atoms of pages on @pages list.
36125+ * call sync_page for each page from mapping's page tree
36126+ */
36127+static int sync_page_list(struct inode *inode)
36128+{
36129+ int result;
36130+ struct address_space *mapping;
36131+ unsigned long from; /* start index for radix_tree_gang_lookup */
36132+ unsigned int found; /* return value for radix_tree_gang_lookup */
36133+
36134+ mapping = inode->i_mapping;
36135+ from = 0;
36136+ result = 0;
36137+ read_lock_irq(&mapping->tree_lock);
36138+ while (result == 0) {
36139+ struct page *page;
36140+
36141+ found =
36142+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36143+ from, 1);
36144+ assert("", found < 2);
36145+ if (found == 0)
36146+ break;
36147+
36148+ /* the page cannot leave the radix tree because it is protected from truncation by inode->i_mutex, which
36149+ sys_fsync holds */
36150+ page_cache_get(page);
36151+ read_unlock_irq(&mapping->tree_lock);
36152+
36153+ from = page->index + 1;
36154+
36155+ result = sync_page(page);
36156+
36157+ page_cache_release(page);
36158+ read_lock_irq(&mapping->tree_lock);
36159+ }
36160+
36161+ read_unlock_irq(&mapping->tree_lock);
36162+ return result;
36163+}
36164+
36165+static int commit_file_atoms(struct inode *inode)
36166+{
36167+ int result;
36168+ unix_file_info_t *uf_info;
36169+
36170+ uf_info = unix_file_inode_data(inode);
36171+
36172+ get_exclusive_access(uf_info);
36173+ /*
36174+ * find what items file is made from
36175+ */
36176+ result = find_file_state(inode, uf_info);
36177+ drop_exclusive_access(uf_info);
36178+ if (result != 0)
36179+ return result;
36180+
36181+ /*
36182+ * file state cannot change because we are under ->i_mutex
36183+ */
36184+ switch (uf_info->container) {
36185+ case UF_CONTAINER_EXTENTS:
36186+ /* find_file_state might open or join an atom */
36187+ reiser4_txn_restart_current();
36188+ result =
36189+ /*
36190+ * when we are called by
36191+ * filemap_fdatawrite->
36192+ * do_writepages()->
36193+ * reiser4_writepages()
36194+ *
36195+ * inode->i_mapping->dirty_pages are spliced into
36196+ * ->io_pages, leaving ->dirty_pages dirty.
36197+ *
36198+ * When we are called from
36199+ * reiser4_fsync()->sync_unix_file(), we have to
36200+ * commit atoms of all pages on the ->dirty_list.
36201+ *
36202+ * So for simplicity we just commit ->io_pages and
36203+ * ->dirty_pages.
36204+ */
36205+ sync_page_list(inode);
36206+ break;
36207+ case UF_CONTAINER_TAILS:
36208+ /*
36209+ * NOTE-NIKITA probably we can be smarter for tails. For now
36210+ * just commit all existing atoms.
36211+ */
36212+ result = txnmgr_force_commit_all(inode->i_sb, 0);
36213+ break;
36214+ case UF_CONTAINER_EMPTY:
36215+ result = 0;
36216+ break;
36217+ case UF_CONTAINER_UNKNOWN:
36218+ default:
36219+ result = -EIO;
36220+ break;
36221+ }
36222+
36223+ /*
36224+ * commit current transaction: there can be captured nodes from
36225+ * find_file_state() and finish_conversion().
36226+ */
36227+ reiser4_txn_restart_current();
36228+ return result;
36229+}
36230+
36231+/**
36232+ * writepages_unix_file - writepages of struct address_space_operations
36233+ * @mapping:
36234+ * @wbc:
36235+ *
36236+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36237+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
36238+ * created by reiser4_writepage.
36239+ */
36240+int writepages_unix_file(struct address_space *mapping,
36241+ struct writeback_control *wbc)
36242+{
36243+ int result;
36244+ unix_file_info_t *uf_info;
36245+ pgoff_t pindex, jindex, nr_pages;
36246+ long to_capture;
36247+ struct inode *inode;
36248+
36249+ inode = mapping->host;
36250+ if (!has_anonymous_pages(inode)) {
36251+ result = 0;
36252+ goto end;
36253+ }
36254+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
36255+ result = 0;
36256+ nr_pages =
36257+ (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
36258+ uf_info = unix_file_inode_data(inode);
36259+
36260+ do {
36261+ reiser4_context *ctx;
36262+
36263+ if (wbc->sync_mode != WB_SYNC_ALL)
36264+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36265+ else
36266+ to_capture = CAPTURE_APAGE_BURST;
36267+
36268+ ctx = reiser4_init_context(inode->i_sb);
36269+ if (IS_ERR(ctx)) {
36270+ result = PTR_ERR(ctx);
36271+ break;
36272+ }
36273+ /* avoid recursive calls to ->sync_inodes */
36274+ ctx->nobalance = 1;
36275+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36276+ assert("", LOCK_CNT_NIL(inode_sem_w));
36277+ assert("", LOCK_CNT_NIL(inode_sem_r));
36278+
36279+ reiser4_txn_restart_current();
36280+
36281+ /* we have to get nonexclusive access to the file */
36282+ if (get_current_context()->entd) {
36283+ /*
36284+ * use nonblocking version of nonexclusive_access to
36285+ * avoid deadlock which might look like the following:
36286+ * process P1 holds NEA on file F1 and called entd to
36287+ * reclaim some memory. Entd works for P1 and is going
36288+ * to capture pages of file F2. To do that entd has to
36289+ * get NEA to F2. F2 is held by process P2 which also
36290+ * called entd. But entd is serving P1 at the moment
36291+ * and P2 has to wait. Process P3 is trying to get EA to
36292+ * file F2. The existence of a pending EA request to file F2
36293+ * makes it impossible for entd to get NEA to file
36294+ * F2. Neither of these processes can continue. Using the
36295+ * nonblocking version of getting NEA is supposed to
36296+ * avoid this deadlock.
36297+ */
36298+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
36299+ result = RETERR(-EBUSY);
36300+ reiser4_exit_context(ctx);
36301+ break;
36302+ }
36303+ } else
36304+ get_nonexclusive_access(uf_info);
36305+
36306+ while (to_capture > 0) {
36307+ pgoff_t start;
36308+
36309+ assert("vs-1727", jindex <= pindex);
36310+ if (pindex == jindex) {
36311+ start = pindex;
36312+ result =
36313+ capture_anonymous_pages(inode->i_mapping,
36314+ &pindex,
36315+ to_capture);
36316+ if (result <= 0)
36317+ break;
36318+ to_capture -= result;
36319+ wbc->nr_to_write -= result;
36320+ if (start + result == pindex) {
36321+ jindex = pindex;
36322+ continue;
36323+ }
36324+ if (to_capture <= 0)
36325+ break;
36326+ }
36327+ /* deal with anonymous jnodes between jindex and pindex */
36328+ result =
36329+ capture_anonymous_jnodes(inode->i_mapping, &jindex,
36330+ pindex, to_capture);
36331+ if (result < 0)
36332+ break;
36333+ to_capture -= result;
36334+ get_current_context()->nr_captured += result;
36335+
36336+ if (jindex == (pgoff_t)-1) {
36337+ assert("vs-1728", pindex == (pgoff_t)-1);
36338+ break;
36339+ }
36340+ }
36341+ if (to_capture <= 0)
36342+ /* there may be more pages left */
36343+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
36344+
36345+ drop_nonexclusive_access(uf_info);
36346+ if (result < 0) {
36347+ /* error happened */
36348+ reiser4_exit_context(ctx);
36349+ return result;
36350+ }
36351+ if (wbc->sync_mode != WB_SYNC_ALL) {
36352+ reiser4_exit_context(ctx);
36353+ return 0;
36354+ }
36355+ result = commit_file_atoms(inode);
36356+ reiser4_exit_context(ctx);
36357+ if (pindex >= nr_pages && jindex == pindex)
36358+ break;
36359+ } while (1);
36360+
36361+ end:
36362+ if (is_in_reiser4_context()) {
36363+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
36364+ /*
36365+ * there are already pages to flush, flush them out, do
36366+ * not delay until end of reiser4_sync_inodes
36367+ */
36368+ reiser4_writeout(inode->i_sb, wbc);
36369+ get_current_context()->nr_captured = 0;
36370+ }
36371+ }
36372+ return result;
36373+}
36374+
36375+/*
36376+ * ->sync() method for unix file.
36377+ *
36378+ * We are trying to be smart here. Instead of committing all atoms (original
36379+ * solution), we scan dirty pages of this file and commit all atoms they are
36380+ * part of.
36381+ *
36382+ * The situation is complicated by anonymous pages, i.e., extent-less pages
36383+ * dirtied through mmap. Fortunately sys_fsync() first calls
36384+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
36385+ * all missing extents and capture anonymous pages.
36386+ */
36387+int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
36388+{
36389+ reiser4_context *ctx;
36390+ txn_atom *atom;
36391+ reiser4_block_nr reserve;
36392+
36393+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
36394+ if (IS_ERR(ctx))
36395+ return PTR_ERR(ctx);
36396+
36397+ reserve = estimate_update_common(dentry->d_inode);
36398+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
36399+ reiser4_exit_context(ctx);
36400+ return RETERR(-ENOSPC);
36401+ }
36402+ write_sd_by_inode_common(dentry->d_inode);
36403+
36404+ atom = get_current_atom_locked();
36405+ spin_lock_txnh(ctx->trans);
36406+ force_commit_atom(ctx->trans);
36407+ reiser4_exit_context(ctx);
36408+ return 0;
36409+}
36410+
36411+/**
36412+ * readpage_unix_file - readpage of struct address_space_operations
36413+ * @file:
36414+ * @page:
36415+ *
36416+ * Compose a key and search for item containing information about @page
36417+ * data. If item is found - its readpage method is called.
36418+ */
36419+int readpage_unix_file(struct file *file, struct page *page)
36420+{
36421+ reiser4_context *ctx;
36422+ int result;
36423+ struct inode *inode;
36424+ reiser4_key key;
36425+ item_plugin *iplug;
36426+ hint_t *hint;
36427+ lock_handle *lh;
36428+ coord_t *coord;
36429+
36430+ assert("vs-1062", PageLocked(page));
36431+ assert("vs-976", !PageUptodate(page));
36432+ assert("vs-1061", page->mapping && page->mapping->host);
36433+
36434+ if (page->mapping->host->i_size <= page_offset(page)) {
36435+ /* page is out of file already */
36436+ unlock_page(page);
36437+ return -EINVAL;
36438+ }
36439+
36440+ inode = page->mapping->host;
36441+ ctx = reiser4_init_context(inode->i_sb);
36442+ if (IS_ERR(ctx)) {
36443+ unlock_page(page);
36444+ return PTR_ERR(ctx);
36445+ }
36446+
36447+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36448+ if (hint == NULL) {
36449+ unlock_page(page);
36450+ reiser4_exit_context(ctx);
36451+ return RETERR(-ENOMEM);
36452+ }
36453+
36454+ result = load_file_hint(file, hint);
36455+ if (result) {
36456+ kfree(hint);
36457+ unlock_page(page);
36458+ reiser4_exit_context(ctx);
36459+ return result;
36460+ }
36461+ lh = &hint->lh;
36462+
36463+ /* get key of first byte of the page */
36464+ key_by_inode_and_offset_common(inode, page_offset(page), &key);
36465+
36466+ /* look for file metadata corresponding to first byte of page */
36467+ page_cache_get(page);
36468+ unlock_page(page);
36469+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
36470+ lock_page(page);
36471+ page_cache_release(page);
36472+
36473+ if (page->mapping == NULL) {
36474+ /*
36475+ * readpage allows truncate to run concurrently. Page was
36476+ * truncated while it was not locked
36477+ */
36478+ done_lh(lh);
36479+ kfree(hint);
36480+ unlock_page(page);
36481+ reiser4_txn_restart(ctx);
36482+ reiser4_exit_context(ctx);
36483+ return -EINVAL;
36484+ }
36485+
36486+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
36487+ if (result == CBK_COORD_FOUND &&
36488+ hint->ext_coord.coord.between != AT_UNIT)
36489+ /* file is truncated */
36490+ result = -EINVAL;
36491+ done_lh(lh);
36492+ kfree(hint);
36493+ unlock_page(page);
36494+ reiser4_txn_restart(ctx);
36495+ reiser4_exit_context(ctx);
36496+ return result;
36497+ }
36498+
36499+ /*
36500+ * item corresponding to page is found. It can not be removed because
36501+ * znode lock is held
36502+ */
36503+ if (PageUptodate(page)) {
36504+ done_lh(lh);
36505+ kfree(hint);
36506+ unlock_page(page);
36507+ reiser4_txn_restart(ctx);
36508+ reiser4_exit_context(ctx);
36509+ return 0;
36510+ }
36511+
36512+ coord = &hint->ext_coord.coord;
36513+ result = zload(coord->node);
36514+ if (result) {
36515+ done_lh(lh);
36516+ kfree(hint);
36517+ unlock_page(page);
36518+ reiser4_txn_restart(ctx);
36519+ reiser4_exit_context(ctx);
36520+ return result;
36521+ }
36522+
36523+ validate_extended_coord(&hint->ext_coord, page_offset(page));
36524+
36525+ if (!coord_is_existing_unit(coord)) {
36526+ /* this indicates corruption */
36527+ warning("vs-280",
36528+ "Looking for page %lu of file %llu (size %lli). "
36529+ "No file items found (%d). File is corrupted?\n",
36530+ page->index, (unsigned long long)get_inode_oid(inode),
36531+ inode->i_size, result);
36532+ zrelse(coord->node);
36533+ done_lh(lh);
36534+ kfree(hint);
36535+ unlock_page(page);
36536+ reiser4_txn_restart(ctx);
36537+ reiser4_exit_context(ctx);
36538+ return RETERR(-EIO);
36539+ }
36540+
36541+ /*
36542+ * get the plugin of the found item and use its readpage method, if
36543+ * any
36544+ */
36545+ iplug = item_plugin_by_coord(coord);
36546+ if (iplug->s.file.readpage)
36547+ result = iplug->s.file.readpage(coord, page);
36548+ else
36549+ result = RETERR(-EINVAL);
36550+
36551+ if (!result) {
36552+ set_key_offset(&key,
36553+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
36554+ /* FIXME should call reiser4_set_hint() */
36555+ reiser4_unset_hint(hint);
36556+ } else {
36557+ unlock_page(page);
36558+ reiser4_unset_hint(hint);
36559+ }
36560+ assert("vs-979",
36561+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
36562+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
36563+
36564+ zrelse(coord->node);
36565+ done_lh(lh);
36566+
36567+ save_file_hint(file, hint);
36568+ kfree(hint);
36569+
36570+ /*
36571+ * FIXME: explain why it is needed. HINT: page allocation in write can
36572+ * not be done when atom is not NULL because reiser4_writepage can not
36573+ * kick entd and have to eflush
36574+ */
36575+ reiser4_txn_restart(ctx);
36576+ reiser4_exit_context(ctx);
36577+ return result;
36578+}
36579+
36580+struct uf_readpages_context {
36581+ lock_handle lh;
36582+ coord_t coord;
36583+};
36584+
36585+/* A callback function for readpages_unix_file/read_cache_pages.
36586+ * If the file is built of tails, an error (-EIO) is returned.
36587+ *
36588+ * @data -- a pointer to reiser4_readpages_context object,
36589+ * to save the twig lock and the coord between
36590+ * read_cache_page iterations.
36591+ * @page -- page to start read.
36592+ */
36593+static int uf_readpages_filler(void * data, struct page * page)
36594+{
36595+ struct uf_readpages_context *rc = data;
36596+ jnode * node;
36597+ int ret = 0;
36598+ reiser4_extent *ext;
36599+ __u64 ext_index;
36600+ int cbk_done = 0;
36601+ struct address_space * mapping = page->mapping;
36602+
36603+ if (PageUptodate(page)) {
36604+ unlock_page(page);
36605+ return 0;
36606+ }
36607+ if (rc->lh.node == NULL) {
36608+ /* no twig lock - have to do tree search. */
36609+ reiser4_key key;
36610+ repeat:
36611+ unlock_page(page);
36612+ key_by_inode_and_offset_common(
36613+ mapping->host, page_offset(page), &key);
36614+ ret = coord_by_key(
36615+ &get_super_private(mapping->host->i_sb)->tree,
36616+ &key, &rc->coord, &rc->lh,
36617+ ZNODE_READ_LOCK, FIND_EXACT,
36618+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
36619+ if (ret)
36620+ return ret;
36621+ lock_page(page);
36622+ cbk_done = 1;
36623+ }
36624+ ret = zload(rc->coord.node);
36625+ if (ret) {
36626+ unlock_page(page);
36627+ return ret;
36628+ }
36629+ if (!coord_is_existing_item(&rc->coord) ||
36630+ !item_is_extent(&rc->coord)) {
36631+ zrelse(rc->coord.node);
36632+ unlock_page(page);
36633+ return RETERR(-EIO);
36634+ }
36635+ ext = extent_by_coord(&rc->coord);
36636+ ext_index = extent_unit_index(&rc->coord);
36637+ if (page->index < ext_index ||
36638+ page->index >= ext_index + extent_get_width(ext)) {
36639+ /* the page index doesn't belong to the extent unit
36640+ which the coord points to - release the lock and
36641+ repeat with tree search. */
36642+ zrelse(rc->coord.node);
36643+ done_lh(&rc->lh);
36644+ /* we can be here after a CBK call only in case of
36645+ corruption of the tree or a bug in the tree lookup algorithm. */
36646+ if (unlikely(cbk_done)) {
36647+ unlock_page(page);
36648+ return RETERR(-EIO);
36649+ }
36650+ goto repeat;
36651+ }
36652+ node = jnode_of_page(page);
36653+ if (unlikely(IS_ERR(node))) {
36654+ zrelse(rc->coord.node);
36655+ unlock_page(page);
36656+ return PTR_ERR(node);
36657+ }
36658+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
36659+ jput(node);
36660+ zrelse(rc->coord.node);
36661+ if (ret)
36662+ unlock_page(page);
36663+ return ret;
36664+}
36665+
36666+/**
36667+ * readpages_unix_file - called by the readahead code, starts reading for each
36668+ * page of given list of pages
36669+ */
36670+int readpages_unix_file(
36671+ struct file *file, struct address_space *mapping,
36672+ struct list_head *pages, unsigned nr_pages)
36673+{
36674+ reiser4_context *ctx;
36675+ struct uf_readpages_context rc;
36676+ int ret;
36677+
36678+ ctx = reiser4_init_context(mapping->host->i_sb);
36679+ if (IS_ERR(ctx)) {
36680+ put_pages_list(pages);
36681+ return PTR_ERR(ctx);
36682+ }
36683+ init_lh(&rc.lh);
36684+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
36685+ done_lh(&rc.lh);
36686+ context_set_commit_async(ctx);
36687+ /* close the transaction to protect further page allocation from deadlocks */
36688+ reiser4_txn_restart(ctx);
36689+ reiser4_exit_context(ctx);
36690+ return ret;
36691+}
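+
+/*
+ * For context, this is roughly how the methods above would be wired into
+ * an address_space_operations table (the name unix_file_aops is assumed
+ * for illustration; reiser4 reaches these through the unix file plugin):
+ *
+ *	static const struct address_space_operations unix_file_aops = {
+ *		.readpage	= readpage_unix_file,
+ *		.readpages	= readpages_unix_file,
+ *		.writepages	= writepages_unix_file,
+ *		.commit_write	= commit_write_unix_file,
+ *	};
+ */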
36692+
36693+static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
36694+ loff_t count UNUSED_ARG)
36695+{
36696+ /* We should reserve one block for the update of the stat data
36697+ item */
36698+ assert("vs-1249",
36699+ inode_file_plugin(inode)->estimate.update ==
36700+ estimate_update_common);
36701+ return estimate_update_common(inode);
36702+}
36703+
36704+/* this is called with nonexclusive access obtained, file's container can not change */
36705+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from */
36706+ char __user *buf, /* address of user-space buffer */
36707+ size_t count, /* number of bytes to read */
36708+ loff_t *off)
36709+{
36710+ int result;
36711+ struct inode *inode;
36712+ flow_t flow;
36713+ int (*read_f) (struct file *, flow_t *, hint_t *);
36714+ coord_t *coord;
36715+ znode *loaded;
36716+
36717+ inode = file->f_dentry->d_inode;
36718+
36719+ /* build flow */
36720+ assert("vs-1250",
36721+ inode_file_plugin(inode)->flow_by_inode ==
36722+ flow_by_inode_unix_file);
36723+ result =
36724+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
36725+ *off, READ_OP, &flow);
36726+ if (unlikely(result))
36727+ return result;
36728+
36729+ /* get seal and coord sealed with it from reiser4 private data
36730+ of struct file. The coord will tell us where our last read
36731+ of this file finished, and the seal will help to determine
36732+ if that location is still valid.
36733+ */
36734+ coord = &hint->ext_coord.coord;
36735+ while (flow.length && result == 0) {
36736+ result =
36737+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
36738+ if (cbk_errored(result))
36739+ /* error happened */
36740+ break;
36741+
36742+ if (coord->between != AT_UNIT) {
36743+ /* there were no items corresponding to given offset */
36744+ done_lh(hint->ext_coord.lh);
36745+ break;
36746+ }
36747+
36748+ loaded = coord->node;
36749+ result = zload(loaded);
36750+ if (unlikely(result)) {
36751+ done_lh(hint->ext_coord.lh);
36752+ break;
36753+ }
36754+
36755+ if (hint->ext_coord.valid == 0)
36756+ validate_extended_coord(&hint->ext_coord,
36757+ get_key_offset(&flow.key));
36758+
36759+ assert("vs-4", hint->ext_coord.valid == 1);
36760+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
36761+ /* call item's read method */
36762+ read_f = item_plugin_by_coord(coord)->s.file.read;
36763+ result = read_f(file, &flow, hint);
36764+ zrelse(loaded);
36765+ done_lh(hint->ext_coord.lh);
36766+ }
36767+
36768+ return (count - flow.length) ? (count - flow.length) : result;
36769+}
36770+
36771+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
36772+
36773+/**
36774+ * read_unix_file - read of struct file_operations
36775+ * @file: file to read from
36776+ * @buf: address of user-space buffer
36777+ * @read_amount: number of bytes to read
36778+ * @off: position in file to read from
36779+ *
36780+ * This is implementation of vfs's read method of struct file_operations for
36781+ * unix file plugin.
36782+ */
36783+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
36784+ loff_t *off)
36785+{
36786+ reiser4_context *ctx;
36787+ ssize_t result;
36788+ struct inode *inode;
36789+ unix_file_info_t *uf_info;
36790+
36791+ if (unlikely(read_amount == 0))
36792+ return 0;
36793+
36794+ assert("umka-072", file != NULL);
36795+ assert("umka-074", off != NULL);
36796+ inode = file->f_dentry->d_inode;
36797+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36798+
36799+ ctx = reiser4_init_context(inode->i_sb);
36800+ if (IS_ERR(ctx))
36801+ return PTR_ERR(ctx);
36802+ uf_info = unix_file_inode_data(inode);
36803+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
36804+ get_exclusive_access(uf_info);
36805+ result = find_file_state(inode, uf_info);
36806+ if (unlikely(result != 0))
36807+ goto out;
36808+ } else
36809+ get_nonexclusive_access(uf_info);
36810+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
36811+ BA_CAN_COMMIT);
36812+ if (unlikely(result != 0))
36813+ goto out;
36814+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
36815+ result = do_sync_read(file, buf, read_amount, off);
36816+ } else if (uf_info->container == UF_CONTAINER_TAILS ||
36817+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
36818+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
36819+ result = read_unix_file_container_tails(file, buf, read_amount, off);
36820+ } else {
36821+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
36822+ result = 0;
36823+ }
36824+out:
36825+ drop_access(uf_info);
36826+ context_set_commit_async(ctx);
36827+ reiser4_exit_context(ctx);
36828+ return result;
36829+}
36830+
36831+static ssize_t read_unix_file_container_tails(
36832+ struct file *file, char __user *buf, size_t read_amount, loff_t *off)
36833+{
36834+ int result;
36835+ struct inode *inode;
36836+ hint_t *hint;
36837+ unix_file_info_t *uf_info;
36838+ size_t count, left;
+ ssize_t read; /* signed: read_file() may return a negative error */
36839+ loff_t size;
36840+
36841+ assert("umka-072", file != NULL);
36842+ assert("umka-074", off != NULL);
36843+ inode = file->f_dentry->d_inode;
36844+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36845+
36846+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36847+ if (hint == NULL)
36848+ return RETERR(-ENOMEM);
36849+
36850+ result = load_file_hint(file, hint);
36851+ if (result) {
36852+ kfree(hint);
36853+ return result;
36854+ }
36855+
36856+ left = read_amount;
36857+ count = 0;
36858+ uf_info = unix_file_inode_data(inode);
36859+ while (left > 0) {
36860+ reiser4_txn_restart_current();
36861+ size = i_size_read(inode);
36862+ if (*off >= size)
36863+ /* position to read from is past the end of file */
36864+ break;
36865+ if (*off + left > size)
36866+ left = size - *off;
36867+ /* fault in user page */
36868+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
36869+ if (result) {
36870+ /* do not return here: @hint still has to be saved and freed */
+ result = RETERR(-EFAULT);
+ break;
+ }
36871+
36872+ read = read_file(hint, file, buf,
36873+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
36874+ off);
36875+ if (read < 0) {
36876+ result = read;
36877+ break;
36878+ }
36879+ left -= read;
36880+ buf += read;
36881+
36882+ /* update position in a file */
36883+ *off += read;
36884+ /* total number of read bytes */
36885+ count += read;
36886+ }
36887+ done_lh(&hint->lh);
36888+ save_file_hint(file, hint);
36889+ kfree(hint);
36890+ if (count)
36891+ file_accessed(file);
36892+ /* return number of read bytes or error code if nothing is read */
36893+ return count ? count : result;
36894+}
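+
+/*
+ * A worked example of the chunking above, assuming PAGE_CACHE_SIZE == 4096:
+ * a 10000 byte read that lies within i_size is served by read_file() in
+ * chunks of at most one page - 4096, 4096 and 1808 bytes when every call
+ * reads fully - with the transaction restarted and the user buffer
+ * re-faulted before each chunk.
+ */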
36895+
36896+/* This function takes care of @file's pages. First of all it checks if the
36897+ filesystem is readonly and if so gets out. Otherwise, it throws out all
36898+ pages of the file if it was mapped for read, is going to be mapped for
36899+ write, and consists of tails. This is done in order not to manage several
36900+ copies of the data (first in the page cache and second in the tails
36901+ themselves) when mapping files that consist of tails.
36902+
36903+ Tail2extent conversion is also performed here if it is allowed and the
36904+ file is going to be written or mapped for write. This function may be
36905+ called from write_unix_file() or mmap_unix_file(). */
36906+static int check_pages_unix_file(struct file *file, struct inode *inode)
36907+{
36908+ reiser4_invalidate_pages(inode->i_mapping, 0,
36909+ (inode->i_size + PAGE_CACHE_SIZE -
36910+ 1) >> PAGE_CACHE_SHIFT, 0);
36911+ return unpack(file, inode, 0 /* not forever */ );
36912+}
36913+
36914+/**
36915+ * mmap_unix_file - mmap of struct file_operations
36916+ * @file: file to mmap
36917+ * @vma:
36918+ *
36919+ * This is implementation of vfs's mmap method of struct file_operations for
36920+ * unix file plugin. It converts file to extent if necessary. Sets
36921+ * reiser4_inode's flag - REISER4_HAS_MMAP.
36922+ */
36923+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
36924+{
36925+ reiser4_context *ctx;
36926+ int result;
36927+ struct inode *inode;
36928+ unix_file_info_t *uf_info;
36929+ reiser4_block_nr needed;
36930+
36931+ inode = file->f_dentry->d_inode;
36932+ ctx = reiser4_init_context(inode->i_sb);
36933+ if (IS_ERR(ctx))
36934+ return PTR_ERR(ctx);
36935+
36936+ uf_info = unix_file_inode_data(inode);
36937+
36938+ get_exclusive_access_careful(uf_info, inode);
36939+
36940+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
36941+ /*
36942+ * we need file built of extent items. If it is still built of
36943+ * tail items we have to convert it. Find what items the file
36944+ * is built of
36945+ */
36946+ result = find_file_state(inode, uf_info);
36947+ if (result != 0) {
36948+ drop_exclusive_access(uf_info);
36949+ reiser4_exit_context(ctx);
36950+ return result;
36951+ }
36952+
36953+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
36954+ uf_info->container == UF_CONTAINER_EXTENTS ||
36955+ uf_info->container == UF_CONTAINER_EMPTY));
36956+ if (uf_info->container == UF_CONTAINER_TAILS) {
36957+ /*
36958+ * invalidate all pages and convert file from tails to
36959+ * extents
36960+ */
36961+ result = check_pages_unix_file(file, inode);
36962+ if (result) {
36963+ drop_exclusive_access(uf_info);
36964+ reiser4_exit_context(ctx);
36965+ return result;
36966+ }
36967+ }
36968+ }
36969+
36970+ /*
36971+ * generic_file_mmap will do update_atime. Grab space for stat data
36972+ * update.
36973+ */
36974+ needed = inode_file_plugin(inode)->estimate.update(inode);
36975+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
36976+ if (result) {
36977+ drop_exclusive_access(uf_info);
36978+ reiser4_exit_context(ctx);
36979+ return result;
36980+ }
36981+
36982+ result = generic_file_mmap(file, vma);
36983+ if (result == 0) {
36984+ /* mark file as having mapping. */
36985+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
36986+ }
36987+
36988+ drop_exclusive_access(uf_info);
36989+ reiser4_exit_context(ctx);
36990+ return result;
36991+}
36992+
36993+/**
36994+ * find_first_item
36995+ * @inode:
36996+ *
36997+ * Finds file item which is responsible for first byte in the file.
36998+ */
36999+static int find_first_item(struct inode *inode)
37000+{
37001+ coord_t coord;
37002+ lock_handle lh;
37003+ reiser4_key key;
37004+ int result;
37005+
37006+ coord_init_zero(&coord);
37007+ init_lh(&lh);
37008+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37009+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37010+ inode);
37011+ if (result == CBK_COORD_FOUND) {
37012+ if (coord.between == AT_UNIT) {
37013+ result = zload(coord.node);
37014+ if (result == 0) {
37015+ result = item_id_by_coord(&coord);
37016+ zrelse(coord.node);
37017+ if (result != EXTENT_POINTER_ID &&
37018+ result != FORMATTING_ID)
37019+ result = RETERR(-EIO);
37020+ }
37021+ } else
37022+ result = RETERR(-EIO);
37023+ }
37024+ done_lh(&lh);
37025+ return result;
37026+}
37027+
37028+/**
37029+ * open_unix_file
37030+ * @inode:
37031+ * @file:
37032+ *
37033+ * If the filesystem is not read-only, complete an interrupted tail
37034+ * conversion if there was one
37035+ */
37036+int open_unix_file(struct inode *inode, struct file *file)
37037+{
37038+ int result;
37039+ reiser4_context *ctx;
37040+ unix_file_info_t *uf_info;
37041+
37042+ if (IS_RDONLY(inode))
37043+ return 0;
37044+
37045+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
37046+ return 0;
37047+
37048+ ctx = reiser4_init_context(inode->i_sb);
37049+ if (IS_ERR(ctx))
37050+ return PTR_ERR(ctx);
37051+
37052+ uf_info = unix_file_inode_data(inode);
37053+
37054+ get_exclusive_access_careful(uf_info, inode);
37055+
37056+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37057+ /*
37058+ * other process completed the conversion
37059+ */
37060+ drop_exclusive_access(uf_info);
37061+ reiser4_exit_context(ctx);
37062+ return 0;
37063+ }
37064+
37065+	/*
37066+	 * The file was left in a semi-converted state after an unclean
37067+	 * shutdown, or another thread doing the conversion dropped exclusive
37068+	 * access while balancing dirty pages. Complete the conversion
37069+	 */
37070+ result = find_first_item(inode);
37071+ if (result == EXTENT_POINTER_ID)
37072+ /*
37073+ * first item is extent, therefore there was incomplete
37074+ * tail2extent conversion. Complete it
37075+ */
37076+ result = tail2extent(unix_file_inode_data(inode));
37077+ else if (result == FORMATTING_ID)
37078+ /*
37079+ * first item is formatting item, therefore there was
37080+ * incomplete extent2tail conversion. Complete it
37081+ */
37082+ result = extent2tail(unix_file_inode_data(inode));
37083+ else
37084+ result = -EIO;
37085+
37086+ assert("vs-1712",
37087+ ergo(result == 0,
37088+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
37089+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
37090+ drop_exclusive_access(uf_info);
37091+ reiser4_exit_context(ctx);
37092+ return result;
37093+}
37094+
37095+#define NEITHER_OBTAINED 0
37096+#define EA_OBTAINED 1
37097+#define NEA_OBTAINED 2
37098+
37099+static void drop_access(unix_file_info_t *uf_info)
37100+{
37101+ if (uf_info->exclusive_use)
37102+ drop_exclusive_access(uf_info);
37103+ else
37104+ drop_nonexclusive_access(uf_info);
37105+}
37106+
37107+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37108+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
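
A hypothetical use of the debug macro above (the format arguments are
illustrative only):

	debug_wuf("to_write %d, left %u", to_write, (unsigned)left);
	/* prints "<file>: <line>: write_unix_file: to_write ..." */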
37109+
37110+/**
37111+ * write_unix_file - write of struct file_operations
37112+ * @file: file to write to
37113+ * @buf: address of user-space buffer
37114+ * @count: number of bytes to write
37115+ * @pos: position in file to write to
37116+ *
37117+ * This is the implementation of the vfs write method of struct
37118+ * file_operations for the unix file plugin.
37119+ */
37120+ssize_t write_unix_file(struct file *file, const char __user *buf,
37121+ size_t count, loff_t *pos)
37122+{
37123+ int result;
37124+ reiser4_context *ctx;
37125+ struct inode *inode;
37126+ unix_file_info_t *uf_info;
37127+ ssize_t written;
37128+ int try_free_space;
37129+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37130+ size_t left;
37131+ ssize_t (*write_op)(struct file *, const char __user *, size_t,
37132+ loff_t *pos);
37133+ int ea;
37134+ loff_t new_size;
37135+
37136+ inode = file->f_dentry->d_inode;
37137+ ctx = reiser4_init_context(inode->i_sb);
37138+ if (IS_ERR(ctx))
37139+ return PTR_ERR(ctx);
37140+
37141+ mutex_lock(&inode->i_mutex);
37142+
37143+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37144+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
37145+
37146+ /* check amount of bytes to write and writing position */
37147+ result = generic_write_checks(file, pos, &count, 0);
37148+ if (result) {
37149+ mutex_unlock(&inode->i_mutex);
37150+ context_set_commit_async(ctx);
37151+ reiser4_exit_context(ctx);
37152+ return result;
37153+ }
37154+
37155+ result = remove_suid(file->f_dentry);
37156+ if (result) {
37157+ mutex_unlock(&inode->i_mutex);
37158+ context_set_commit_async(ctx);
37159+ reiser4_exit_context(ctx);
37160+ return result;
37161+ }
37162+
37163+ uf_info = unix_file_inode_data(inode);
37164+
37165+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
37166+ written = 0;
37167+ try_free_space = 0;
37168+ left = count;
37169+ ea = NEITHER_OBTAINED;
37170+
37171+ new_size = i_size_read(inode);
37172+ if (*pos + count > new_size)
37173+ new_size = *pos + count;
37174+
37175+ while (left) {
37176+ if (left < to_write)
37177+ to_write = left;
37178+
37179+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37180+ get_exclusive_access(uf_info);
37181+ ea = EA_OBTAINED;
37182+ if (uf_info->container != UF_CONTAINER_EMPTY) {
37183+			/* file was made non-empty by another process */
37184+ drop_exclusive_access(uf_info);
37185+ ea = NEITHER_OBTAINED;
37186+ continue;
37187+ }
37188+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37189+			/*
37190+			 * take exclusive access right away so that we do not
37191+			 * have to re-obtain it if the file turns out to be empty
37192+			 */
37193+ get_exclusive_access(uf_info);
37194+ ea = EA_OBTAINED;
37195+ result = find_file_state(inode, uf_info);
37196+ if (result) {
37197+ drop_exclusive_access(uf_info);
37198+ ea = NEITHER_OBTAINED;
37199+ break;
37200+ }
37201+ } else {
37202+ get_nonexclusive_access(uf_info);
37203+ ea = NEA_OBTAINED;
37204+ }
37205+
37206+ /* either EA or NEA is obtained. Choose item write method */
37207+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
37208+ /* file is built of extent items */
37209+ write_op = reiser4_write_extent;
37210+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37211+ /* file is empty */
37212+ if (should_have_notail(uf_info, new_size))
37213+ write_op = reiser4_write_extent;
37214+ else
37215+ write_op = reiser4_write_tail;
37216+ } else {
37217+ /* file is built of tail items */
37218+ if (should_have_notail(uf_info, new_size)) {
37219+ if (ea == NEA_OBTAINED) {
37220+ drop_nonexclusive_access(uf_info);
37221+ get_exclusive_access(uf_info);
37222+ ea = EA_OBTAINED;
37223+ }
37224+ if (uf_info->container == UF_CONTAINER_TAILS) {
37225+ /*
37226+				 * if the file is being converted by another
37227+				 * process, wait until it completes
37228+ */
37229+ while (1) {
37230+ if (reiser4_inode_get_flag(inode,
37231+ REISER4_PART_IN_CONV)) {
37232+ drop_exclusive_access(uf_info);
37233+ schedule();
37234+ get_exclusive_access(uf_info);
37235+ continue;
37236+ }
37237+ break;
37238+ }
37239+ if (uf_info->container == UF_CONTAINER_TAILS) {
37240+ result = tail2extent(uf_info);
37241+ if (result)
37242+ break;
37243+ }
37244+ }
37245+ drop_exclusive_access(uf_info);
37246+ ea = NEITHER_OBTAINED;
37247+ continue;
37248+ }
37249+ write_op = reiser4_write_tail;
37250+ }
37251+
37252+ written = write_op(file, buf, to_write, pos);
37253+ if (written == -ENOSPC && try_free_space) {
37254+ drop_access(uf_info);
37255+ txnmgr_force_commit_all(inode->i_sb, 0);
37256+ try_free_space = 0;
37257+ continue;
37258+ }
37259+ if (written < 0) {
37260+ drop_access(uf_info);
37261+ result = written;
37262+ break;
37263+ }
37264+ /* something is written. */
37265+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37266+ assert("", ea == EA_OBTAINED);
37267+ uf_info->container =
37268+ (write_op == reiser4_write_extent) ?
37269+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37270+ } else {
37271+ assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37272+ write_op == reiser4_write_extent));
37273+ assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37274+ write_op == reiser4_write_tail));
37275+ }
37276+ if (*pos + written > inode->i_size)
37277+ INODE_SET_FIELD(inode, i_size, *pos + written);
37278+ file_update_time(file);
37279+ result = reiser4_update_sd(inode);
37280+ if (result) {
37281+ mutex_unlock(&inode->i_mutex);
37282+ current->backing_dev_info = NULL;
37283+ drop_access(uf_info);
37284+ context_set_commit_async(ctx);
37285+ reiser4_exit_context(ctx);
37286+ return result;
37287+ }
37288+ drop_access(uf_info);
37289+ ea = NEITHER_OBTAINED;
37290+ reiser4_txn_restart(ctx);
37291+ current->journal_info = NULL;
37292+		/*
37293+		 * tell the VM how many pages were dirtied. Arguably, pages
37294+		 * that were already dirty should not be counted
37295+		 */
37296+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
37297+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
37298+ current->journal_info = ctx;
37299+
37300+ left -= written;
37301+ buf += written;
37302+ *pos += written;
37303+ }
37304+
37305+ mutex_unlock(&inode->i_mutex);
37306+
37307+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37308+ reiser4_txn_restart_current();
37309+ grab_space_enable();
37310+ result = sync_unix_file(file, file->f_dentry,
37311+ 0 /* data and stat data */ );
37312+ if (result)
37313+ warning("reiser4-7", "failed to sync file %llu",
37314+ (unsigned long long)get_inode_oid(inode));
37315+ }
37316+
37317+ current->backing_dev_info = NULL;
37318+
37319+ reiser4_exit_context(ctx);
37320+
37321+	/*
37322+	 * return the number of written bytes, or the error code if nothing
37323+	 * was written. Note that this does not work correctly when
37324+	 * sync_unix_file returns an error
37325+	 */
37326+ return (count - left) ? (count - left) : result;
37327+}
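
The loop above never upgrades the semaphore in place: to go from nonexclusive
to exclusive access it drops one lock, takes the other, and then re-checks
uf_info->container, because the file state may change in the unlocked window.
A minimal userspace sketch of that pattern (pthreads, not reiser4 code; the
state variable is a stand-in for uf_info->container):

	#include <pthread.h>

	static pthread_rwlock_t latch = PTHREAD_RWLOCK_INITIALIZER;
	static int state;	/* 0: needs conversion, 1: converted */

	static void access_with_upgrade(void)
	{
		pthread_rwlock_rdlock(&latch);
		while (state == 0) {
			/* cannot upgrade atomically: drop shared, take exclusive */
			pthread_rwlock_unlock(&latch);
			pthread_rwlock_wrlock(&latch);
			if (state == 0)	/* re-check: someone may have converted */
				state = 1;
			/* downgrade the same way */
			pthread_rwlock_unlock(&latch);
			pthread_rwlock_rdlock(&latch);
		}
		/* ... work under nonexclusive access ... */
		pthread_rwlock_unlock(&latch);
	}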
37328+
37329+/**
37330+ * release_unix_file - release of struct file_operations
37331+ * @inode: inode of released file
37332+ * @file: file to release
37333+ *
37334+ * Implementation of the release method of struct file_operations for the unix
37335+ * file plugin. If the last reference to the inode is released, convert all
37336+ * extent items into tail items if necessary. Frees reiser4-specific file data.
37337+ */
37338+int release_unix_file(struct inode *inode, struct file *file)
37339+{
37340+ reiser4_context *ctx;
37341+ unix_file_info_t *uf_info;
37342+ int result;
37343+ int in_reiser4;
37344+
37345+ in_reiser4 = is_in_reiser4_context();
37346+
37347+ ctx = reiser4_init_context(inode->i_sb);
37348+ if (IS_ERR(ctx))
37349+ return PTR_ERR(ctx);
37350+
37351+ result = 0;
37352+ if (in_reiser4 == 0) {
37353+ uf_info = unix_file_inode_data(inode);
37354+
37355+ get_exclusive_access_careful(uf_info, inode);
37356+ if (atomic_read(&file->f_dentry->d_count) == 1 &&
37357+ uf_info->container == UF_CONTAINER_EXTENTS &&
37358+ !should_have_notail(uf_info, inode->i_size) &&
37359+ !rofs_inode(inode)) {
37360+ result = extent2tail(uf_info);
37361+ if (result != 0) {
37362+ warning("nikita-3233",
37363+ "Failed (%d) to convert in %s (%llu)",
37364+ result, __FUNCTION__,
37365+ (unsigned long long)
37366+ get_inode_oid(inode));
37367+ }
37368+ }
37369+ drop_exclusive_access(uf_info);
37370+ } else {
37371+ /*
37372+	   we are within a reiser4 context already. How is that possible?
37373+	   Simple:
37374+
37375+ (gdb) bt
37376+ #0 get_exclusive_access ()
37377+ #2 0xc01e56d3 in release_unix_file ()
37378+ #3 0xc01c3643 in reiser4_release ()
37379+ #4 0xc014cae0 in __fput ()
37380+ #5 0xc013ffc3 in remove_vm_struct ()
37381+ #6 0xc0141786 in exit_mmap ()
37382+ #7 0xc0118480 in mmput ()
37383+ #8 0xc0133205 in oom_kill ()
37384+ #9 0xc01332d1 in out_of_memory ()
37385+ #10 0xc013bc1d in try_to_free_pages ()
37386+ #11 0xc013427b in __alloc_pages ()
37387+ #12 0xc013f058 in do_anonymous_page ()
37388+ #13 0xc013f19d in do_no_page ()
37389+ #14 0xc013f60e in handle_mm_fault ()
37390+ #15 0xc01131e5 in do_page_fault ()
37391+ #16 0xc0104935 in error_code ()
37392+ #17 0xc025c0c6 in __copy_to_user_ll ()
37393+ #18 0xc01d496f in reiser4_read_tail ()
37394+ #19 0xc01e4def in read_unix_file ()
37395+ #20 0xc01c3504 in reiser4_read ()
37396+ #21 0xc014bd4f in vfs_read ()
37397+ #22 0xc014bf66 in sys_read ()
37398+ */
37399+ warning("vs-44", "out of memory?");
37400+ }
37401+
37402+ reiser4_free_file_fsdata(file);
37403+
37404+ reiser4_exit_context(ctx);
37405+ return result;
37406+}
37407+
37408+static void set_file_notail(struct inode *inode)
37409+{
37410+ reiser4_inode *state;
37411+ formatting_plugin *tplug;
37412+
37413+ state = reiser4_inode_data(inode);
37414+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
37415+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
37416+}
37417+
37418+/* if file is built of tails - convert it to extents */
37419+static int unpack(struct file *filp, struct inode *inode, int forever)
37420+{
37421+ int result = 0;
37422+ unix_file_info_t *uf_info;
37423+
37424+ uf_info = unix_file_inode_data(inode);
37425+ assert("vs-1628", ea_obtained(uf_info));
37426+
37427+ result = find_file_state(inode, uf_info);
37428+ if (result)
37429+ return result;
37430+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
37431+
37432+ if (uf_info->container == UF_CONTAINER_TAILS) {
37433+ /*
37434+		 * if the file is being converted by another process, wait
37435+		 * until it completes
37436+ */
37437+ while (1) {
37438+ if (reiser4_inode_get_flag(inode,
37439+ REISER4_PART_IN_CONV)) {
37440+ drop_exclusive_access(uf_info);
37441+ schedule();
37442+ get_exclusive_access(uf_info);
37443+ continue;
37444+ }
37445+ break;
37446+ }
37447+ if (uf_info->container == UF_CONTAINER_TAILS) {
37448+ result = tail2extent(uf_info);
37449+ if (result)
37450+ return result;
37451+ }
37452+ }
37453+ if (forever) {
37454+		/* save new formatting plugin in stat data */
37455+ __u64 tograb;
37456+
37457+ set_file_notail(inode);
37458+
37459+ grab_space_enable();
37460+ tograb = inode_file_plugin(inode)->estimate.update(inode);
37461+		result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
37462+		result = result ? result : reiser4_update_sd(inode);
37463+ }
37464+
37465+ return result;
37466+}
37467+
37468+/* implementation of the vfs ioctl method of struct file_operations for the
37469+   unix file plugin
37470+*/
37471+int
37472+ioctl_unix_file(struct inode *inode, struct file *filp,
37473+ unsigned int cmd, unsigned long arg UNUSED_ARG)
37474+{
37475+ reiser4_context *ctx;
37476+ int result;
37477+
37478+ ctx = reiser4_init_context(inode->i_sb);
37479+ if (IS_ERR(ctx))
37480+ return PTR_ERR(ctx);
37481+
37482+ switch (cmd) {
37483+ case REISER4_IOC_UNPACK:
37484+ get_exclusive_access(unix_file_inode_data(inode));
37485+ result = unpack(filp, inode, 1 /* forever */ );
37486+ drop_exclusive_access(unix_file_inode_data(inode));
37487+ break;
37488+
37489+ default:
37490+ result = RETERR(-ENOSYS);
37491+ break;
37492+ }
37493+ reiser4_exit_context(ctx);
37494+ return result;
37495+}
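
For illustration (not part of the patch), userspace can force the tail-to-extent
conversion through this ioctl. A minimal sketch, assuming a header that exports
REISER4_IOC_UNPACK (the include path is hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include "reiser4_ioctl.h"	/* hypothetical: defines REISER4_IOC_UNPACK */

	int main(int argc, char **argv)
	{
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		/* runs unpack() above with forever=1: converts tails to
		   extents and installs the "never tails" formatting policy */
		if (ioctl(fd, REISER4_IOC_UNPACK) < 0)
			perror("ioctl");
		close(fd);
		return 0;
	}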
37496+
37497+/* implementation of the vfs bmap method of struct address_space_operations
37498+   for the unix file plugin
37499+*/
37500+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
37501+{
37502+ reiser4_context *ctx;
37503+ sector_t result;
37504+ reiser4_key key;
37505+ coord_t coord;
37506+ lock_handle lh;
37507+ struct inode *inode;
37508+ item_plugin *iplug;
37509+ sector_t block;
37510+
37511+ inode = mapping->host;
37512+
37513+ ctx = reiser4_init_context(inode->i_sb);
37514+ if (IS_ERR(ctx))
37515+ return PTR_ERR(ctx);
37516+ key_by_inode_and_offset_common(inode,
37517+ (loff_t) lblock * current_blocksize,
37518+ &key);
37519+
37520+ init_lh(&lh);
37521+ result =
37522+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
37523+ if (cbk_errored(result)) {
37524+ done_lh(&lh);
37525+ reiser4_exit_context(ctx);
37526+ return result;
37527+ }
37528+
37529+ result = zload(coord.node);
37530+ if (result) {
37531+ done_lh(&lh);
37532+ reiser4_exit_context(ctx);
37533+ return result;
37534+ }
37535+
37536+ iplug = item_plugin_by_coord(&coord);
37537+ if (iplug->s.file.get_block) {
37538+ result = iplug->s.file.get_block(&coord, lblock, &block);
37539+ if (result == 0)
37540+ result = block;
37541+ } else
37542+ result = RETERR(-EINVAL);
37543+
37544+ zrelse(coord.node);
37545+ done_lh(&lh);
37546+ reiser4_exit_context(ctx);
37547+ return result;
37548+}
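
Since ->bmap is what services the FIBMAP ioctl, this method can be exercised
from userspace; a minimal sketch (FIBMAP usually requires CAP_SYS_RAWIO):

	#include <fcntl.h>
	#include <linux/fs.h>	/* FIBMAP */
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd, block = 0;	/* logical block number 0 */

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FIBMAP, &block) == 0)
			printf("logical block 0 -> physical block %d\n", block);
		close(fd);
		return 0;
	}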
37549+
37550+/**
37551+ * flow_by_inode_unix_file - initialize structure flow
37552+ * @inode: inode of file for which read or write is about to be performed
37553+ * @buf: buffer to perform read to or write from
37554+ * @user: flag showing whether @buf is user space or kernel space
37555+ * @size: size of buffer @buf
37556+ * @off: start offset for read or write
37557+ * @op: READ or WRITE
37558+ * @flow:
37559+ *
37560+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
37561+ */
37562+int flow_by_inode_unix_file(struct inode *inode,
37563+ const char __user *buf, int user,
37564+ loff_t size, loff_t off,
37565+ rw_op op, flow_t *flow)
37566+{
37567+ assert("nikita-1100", inode != NULL);
37568+
37569+ flow->length = size;
37570+ memcpy(&flow->data, &buf, sizeof(buf));
37571+ flow->user = user;
37572+ flow->op = op;
37573+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
37574+ assert("nikita-1932",
37575+ inode_file_plugin(inode)->key_by_inode ==
37576+ key_by_inode_and_offset_common);
37577+ /* calculate key of write position and insert it into flow->key */
37578+ return key_by_inode_and_offset_common(inode, off, &flow->key);
37579+}
37580+
37581+/* plugin->u.file.set_plug_in_sd = NULL
37582+ plugin->u.file.set_plug_in_inode = NULL
37583+ plugin->u.file.create_blank_sd = NULL */
37584+/* plugin->u.file.delete */
37585+/*
37586+ plugin->u.file.add_link = reiser4_add_link_common
37587+ plugin->u.file.rem_link = NULL */
37588+
37589+/* plugin->u.file.owns_item
37590+ this is common_file_owns_item with assertion */
37591+/* Audited by: green(2002.06.15) */
37592+int
37593+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
37594+ const coord_t * coord /* coord to check */ )
37595+{
37596+ int result;
37597+
37598+ result = owns_item_common(inode, coord);
37599+ if (!result)
37600+ return 0;
37601+ if (!plugin_of_group(item_plugin_by_coord(coord),
37602+ UNIX_FILE_METADATA_ITEM_TYPE))
37603+ return 0;
37604+ assert("vs-547",
37605+ item_id_by_coord(coord) == EXTENT_POINTER_ID ||
37606+ item_id_by_coord(coord) == FORMATTING_ID);
37607+ return 1;
37608+}
37609+
37610+static int setattr_truncate(struct inode *inode, struct iattr *attr)
37611+{
37612+ int result;
37613+ int s_result;
37614+ loff_t old_size;
37615+ reiser4_tree *tree;
37616+
37617+ inode_check_scale(inode, inode->i_size, attr->ia_size);
37618+
37619+ old_size = inode->i_size;
37620+ tree = reiser4_tree_by_inode(inode);
37621+
37622+ result = safe_link_grab(tree, BA_CAN_COMMIT);
37623+ if (result == 0)
37624+ result = safe_link_add(inode, SAFE_TRUNCATE);
37625+ if (result == 0)
37626+ result = truncate_file_body(inode, attr->ia_size);
37627+ if (result)
37628+ warning("vs-1588", "truncate_file failed: oid %lli, "
37629+ "old size %lld, new size %lld, retval %d",
37630+ (unsigned long long)get_inode_oid(inode),
37631+ old_size, attr->ia_size, result);
37632+
37633+ s_result = safe_link_grab(tree, BA_CAN_COMMIT);
37634+ if (s_result == 0)
37635+ s_result =
37636+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
37637+ if (s_result != 0) {
37638+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
37639+ (unsigned long long)get_inode_oid(inode), s_result);
37640+ }
37641+ safe_link_release(tree);
37642+ return result;
37643+}
37644+
37645+/* plugin->u.file.setattr method */
37646+/* This calls inode_setattr and if truncate is in effect it also takes
37647+ exclusive inode access to avoid races */
37648+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
37649+ struct iattr *attr /* change description */ )
37650+{
37651+ int result;
37652+
37653+ if (attr->ia_valid & ATTR_SIZE) {
37654+ reiser4_context *ctx;
37655+ unix_file_info_t *uf_info;
37656+
37657+		/* truncate does its own space reservation and requires
37658+		   exclusive access to be held */
37659+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
37660+ if (IS_ERR(ctx))
37661+ return PTR_ERR(ctx);
37662+
37663+ uf_info = unix_file_inode_data(dentry->d_inode);
37664+ get_exclusive_access_careful(uf_info, dentry->d_inode);
37665+ result = setattr_truncate(dentry->d_inode, attr);
37666+ drop_exclusive_access(uf_info);
37667+ context_set_commit_async(ctx);
37668+ reiser4_exit_context(ctx);
37669+ } else
37670+ result = reiser4_setattr_common(dentry, attr);
37671+
37672+ return result;
37673+}
37674+
37675+/* plugin->u.file.init_inode_data */
37676+void
37677+init_inode_data_unix_file(struct inode *inode,
37678+ reiser4_object_create_data * crd, int create)
37679+{
37680+ unix_file_info_t *data;
37681+
37682+ data = unix_file_inode_data(inode);
37683+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
37684+ init_rwsem(&data->latch);
37685+ data->tplug = inode_formatting_plugin(inode);
37686+ data->exclusive_use = 0;
37687+
37688+#if REISER4_DEBUG
37689+ data->ea_owner = NULL;
37690+ atomic_set(&data->nr_neas, 0);
37691+#endif
37692+ init_inode_ordering(inode, crd, create);
37693+}
37694+
37695+/**
37696+ * delete_object_unix_file - delete_object of file_plugin
37697+ * @inode: inode to be deleted
37698+ *
37699+ * Truncates file to length 0, removes stat data and safe link.
37700+ */
37701+int delete_object_unix_file(struct inode *inode)
37702+{
37703+ unix_file_info_t *uf_info;
37704+ int result;
37705+
37706+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
37707+ return 0;
37708+
37709+	/* truncate file body first */
37710+ uf_info = unix_file_inode_data(inode);
37711+ get_exclusive_access(uf_info);
37712+ result = truncate_file_body(inode, 0 /* size */ );
37713+ drop_exclusive_access(uf_info);
37714+
37715+ if (result)
37716+ warning("", "failed to truncate file (%llu) on removal: %d",
37717+ get_inode_oid(inode), result);
37718+
37719+ /* remove stat data and safe link */
37720+ return reiser4_delete_object_common(inode);
37721+}
37722+
37723+/**
37724+ * sendfile_unix_file - sendfile of struct file_operations
37725+ * @file: file to be sent
37726+ * @ppos: position to start from
37727+ * @count: number of bytes to send
37728+ * @actor: function to copy data
37729+ * @target: where to copy read data
37730+ *
37731+ * Reads @count bytes from @file and calls @actor for every page read. This is
37732+ * needed for loopback device support.
37733+ */
37734+ssize_t
37735+sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
37736+ read_actor_t actor, void *target)
37737+{
37738+ reiser4_context *ctx;
37739+ ssize_t result;
37740+ struct inode *inode;
37741+ unix_file_info_t *uf_info;
37742+
37743+ inode = file->f_dentry->d_inode;
37744+ ctx = reiser4_init_context(inode->i_sb);
37745+ if (IS_ERR(ctx))
37746+ return PTR_ERR(ctx);
37747+
37748+ /*
37749+	 * generic_file_sendfile may want to call update_atime. Grab space for
37750+ * stat data update
37751+ */
37752+ result = reiser4_grab_space(estimate_update_common(inode),
37753+ BA_CAN_COMMIT);
37754+ if (result)
37755+ goto error;
37756+ mutex_lock(&inode->i_mutex);
37757+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37758+ mutex_unlock(&inode->i_mutex);
37759+
37760+ uf_info = unix_file_inode_data(inode);
37761+ get_nonexclusive_access(uf_info);
37762+ result = generic_file_sendfile(file, ppos, count, actor, target);
37763+ drop_nonexclusive_access(uf_info);
37764+ error:
37765+ reiser4_exit_context(ctx);
37766+ return result;
37767+}
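
In 2.6.20, sendfile(2) invokes ->sendfile on the source file (the destination
must be a socket in kernels of this era), so the method above is what supplies
the pages. A minimal sketch, assuming @sock is an already-connected socket:

	#include <fcntl.h>
	#include <sys/sendfile.h>
	#include <unistd.h>

	static int send_whole_file(int sock, const char *path)
	{
		off_t off = 0;
		ssize_t n;
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		/* sendfile_unix_file() feeds pages under nonexclusive access */
		while ((n = sendfile(sock, fd, &off, 65536)) > 0)
			;
		close(fd);
		return n < 0 ? -1 : 0;
	}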
37768+
37769+int
37770+prepare_write_unix_file(struct file *file, struct page *page,
37771+ unsigned from, unsigned to)
37772+{
37773+ reiser4_context *ctx;
37774+ unix_file_info_t *uf_info;
37775+ int ret;
37776+
37777+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37778+ if (IS_ERR(ctx))
37779+ return PTR_ERR(ctx);
37780+
37781+ uf_info = unix_file_inode_data(file->f_dentry->d_inode);
37782+ get_exclusive_access(uf_info);
37783+ ret = find_file_state(file->f_dentry->d_inode, uf_info);
37784+ if (ret == 0) {
37785+ if (uf_info->container == UF_CONTAINER_TAILS)
37786+ ret = -EINVAL;
37787+ else
37788+ ret = do_prepare_write(file, page, from, to);
37789+ }
37790+ drop_exclusive_access(uf_info);
37791+
37792+ /* don't commit transaction under inode semaphore */
37793+ context_set_commit_async(ctx);
37794+ reiser4_exit_context(ctx);
37795+ return ret;
37796+}
37797+
37798+/*
37799+ * Local variables:
37800+ * c-indentation-style: "K&R"
37801+ * mode-name: "LC"
37802+ * c-basic-offset: 8
37803+ * tab-width: 8
37804+ * fill-column: 79
37805+ * scroll-step: 1
37806+ * End:
37807+ */
37808diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.20/fs/reiser4/plugin/file/file_conversion.c
37809--- linux-2.6.20.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
37810+++ linux-2.6.20/fs/reiser4/plugin/file/file_conversion.c 2007-05-06 14:50:43.783001971 +0400
37811@@ -0,0 +1,594 @@
37812+/* Copyright 2001, 2002, 2003 by Hans Reiser,
37813+ licensing governed by reiser4/README */
37814+
37815+/* This file contains hooks that convert (*) cryptcompress files to unix-files,
37816+ and a set of protected (**) methods of a cryptcompress file plugin to perform
37817+ such conversion.
37818+
37819+(*)
37820+ The conversion is performed for incompressible files to reduce cpu and memory
37821+ usage. If the first logical cluster (64K by default) of a file is
37822+ incompressible, then we decide that the whole file is incompressible.
37823+ The conversion can be enabled via installing a special compression mode
37824+ plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for
37825+ details).
37826+
37827+(**)
37828+ The protection means serialization of critical sections (readers and writers
37829+ of @pset->file)
37830+*/
37831+
37832+#include "../../inode.h"
37833+#include "../cluster.h"
37834+#include "file.h"
37835+
37836+#define conversion_enabled(inode) \
37837+ (inode_compression_mode_plugin(inode) == \
37838+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
37839+
37840+
37841+/* The critical sections mentioned above (readers and writers of @pset->file)
37842+   are not permanently critical: a cryptcompress file can be converted only
37843+   if the conversion is enabled (see the macro above). And we don't
37844+   convert unix files at all.
37845+   The following helper macro is a sanity check to decide if we
37846+   need to protect such a section.
37847+*/
37848+#define should_protect(inode) \
37849+ (inode_file_plugin(inode) == \
37850+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
37851+ conversion_enabled(inode))
37852+
37853+/* All protected methods have prefix "prot" in their names.
37854+   It is convenient to construct them from the usual (unprotected) ones
37855+   using the following common macros:
37856+*/
37857+
37858+/* Macro for passive protection.
37859+ method_cryptcompress contains only readers */
37860+#define PROT_PASSIVE(type, method, args) \
37861+({ \
37862+ type _result; \
37863+ struct rw_semaphore * guard = \
37864+ &reiser4_inode_data(inode)->conv_sem; \
37865+ \
37866+ if (should_protect(inode)) { \
37867+ down_read(guard); \
37868+ if (!should_protect(inode)) \
37869+ up_read(guard); \
37870+ } \
37871+ if (inode_file_plugin(inode) == \
37872+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
37873+ _result = method ## _unix_file args; \
37874+ else \
37875+ _result = method ## _cryptcompress args; \
37876+ if (should_protect(inode)) \
37877+ up_read(guard); \
37878+ _result; \
37879+})
37880+
37881+#define PROT_PASSIVE_VOID(method, args) \
37882+({ \
37883+ struct rw_semaphore * guard = \
37884+ &reiser4_inode_data(inode)->conv_sem; \
37885+ \
37886+ if (should_protect(inode)) { \
37887+ down_read(guard); \
37888+ if (!should_protect(inode)) \
37889+ up_read(guard); \
37890+ } \
37891+ if (inode_file_plugin(inode) == \
37892+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
37893+ method ## _unix_file args; \
37894+ else \
37895+ method ## _cryptcompress args; \
37896+ if (should_protect(inode)) \
37897+ up_read(guard); \
37898+})
37899+
37900+/* Macro for active protection.
37901+ active_expr contains readers and writers; after its
37902+ evaluation conversion should be disabled */
37903+#define PROT_ACTIVE(type, method, args, active_expr) \
37904+({ \
37905+ type _result = 0; \
37906+ struct rw_semaphore * guard = \
37907+ &reiser4_inode_data(inode)->conv_sem; \
37908+ reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
37909+ if (IS_ERR(ctx)) \
37910+ return PTR_ERR(ctx); \
37911+ \
37912+ if (should_protect(inode)) { \
37913+ down_write(guard); \
37914+ if (should_protect(inode)) \
37915+ _result = active_expr; \
37916+ up_write(guard); \
37917+ } \
37918+ if (_result == 0) { \
37919+ if (inode_file_plugin(inode) == \
37920+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
37921+ _result = method ## _unix_file args; \
37922+ else \
37923+ _result = method ## _cryptcompress args; \
37924+ } \
37925+ reiser4_exit_context(ctx); \
37926+ _result; \
37927+})
37928+
37929+/* Pass management to the unix-file plugin with "notail" policy */
37930+static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
37931+{
37932+ int result;
37933+ reiser4_inode *info;
37934+ unix_file_info_t * uf;
37935+ info = reiser4_inode_data(inode);
37936+
37937+ result = aset_set_unsafe(&info->pset,
37938+ PSET_FILE,
37939+ (reiser4_plugin *)
37940+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
37941+ if (result)
37942+ return result;
37943+ result = aset_set_unsafe(&info->pset,
37944+ PSET_FORMATTING,
37945+ (reiser4_plugin *)
37946+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
37947+ if (result)
37948+ return result;
37949+ /* get rid of non-standard plugins */
37950+ info->plugin_mask &= ~cryptcompress_mask;
37951+ /* get rid of plugin stat-data extension */
37952+ info->extmask &= ~(1 << PLUGIN_STAT);
37953+
37954+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
37955+
37956+	/* FIXME use init_inode_data_unix_file() instead,
37957+	   but avoid init_inode_ordering() */
37958+ /* Init unix-file specific part of inode */
37959+ uf = unix_file_inode_data(inode);
37960+ uf->container = UF_CONTAINER_UNKNOWN;
37961+ init_rwsem(&uf->latch);
37962+ uf->tplug = inode_formatting_plugin(inode);
37963+ uf->exclusive_use = 0;
37964+#if REISER4_DEBUG
37965+ uf->ea_owner = NULL;
37966+ atomic_set(&uf->nr_neas, 0);
37967+#endif
37968+ inode->i_op =
37969+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops;
37970+ inode->i_fop =
37971+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops;
37972+ inode->i_mapping->a_ops =
37973+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops;
37974+ file->f_op = inode->i_fop;
37975+ return 0;
37976+}
37977+
37978+#if REISER4_DEBUG
37979+static int disabled_conversion_inode_ok(struct inode * inode)
37980+{
37981+ __u64 extmask = reiser4_inode_data(inode)->extmask;
37982+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
37983+
37984+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
37985+ (extmask & (1 << UNIX_STAT)) &&
37986+ (extmask & (1 << LARGE_TIMES_STAT)) &&
37987+ (extmask & (1 << PLUGIN_STAT)) &&
37988+ (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
37989+}
37990+#endif
37991+
37992+/* Assign another mode that will control
37993+ compression at flush time only */
37994+static int disable_conversion_no_update_sd(struct inode * inode)
37995+{
37996+ int result;
37997+ result =
37998+ force_plugin_pset(inode,
37999+ PSET_COMPRESSION_MODE,
38000+ (reiser4_plugin *)compression_mode_plugin_by_id
38001+ (LATTD_COMPRESSION_MODE_ID));
38002+ assert("edward-1500",
38003+ ergo(!result, disabled_conversion_inode_ok(inode)));
38004+ return result;
38005+}
38006+
38007+/* Disable future attempts to check/convert. This function is called by
38008+ conversion hooks. */
38009+static int disable_conversion(struct inode * inode)
38010+{
38011+ return disable_conversion_no_update_sd(inode);
38012+}
38013+
38014+static int check_position(struct inode * inode,
38015+ loff_t pos /* initial position in the file */,
38016+ reiser4_cluster_t * clust,
38017+ int * check_compress)
38018+{
38019+ assert("edward-1505", conversion_enabled(inode));
38020+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
38021+	/* if file size is more than cluster size, then compressibility
38022+	   status must be figured out (i.e. compression was disabled,
38023+	   or the file plugin was converted to unix_file) */
38024+
38025+ if (pos > inode->i_size)
38026+ /* first logical cluster will contain a (partial) hole */
38027+ return disable_conversion(inode);
38028+ if (inode->i_size == inode_cluster_size(inode))
38029+ *check_compress = 1;
38030+ return 0;
38031+}
38032+
38033+static void start_check_compressibility(struct inode * inode,
38034+ reiser4_cluster_t * clust,
38035+ hint_t * hint)
38036+{
38037+ assert("edward-1507", clust->index == 1);
38038+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
38039+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
38040+
38041+ hint_init_zero(hint);
38042+ clust->hint = hint;
38043+	clust->index--;
38044+ clust->nr_pages = count_to_nrpages(fsize_to_count(clust, inode));
38045+
38046+ /* first logical cluster (of index #0) must be complete */
38047+ assert("edward-1510", fsize_to_count(clust, inode) ==
38048+ inode_cluster_size(inode));
38049+}
38050+
38051+static void finish_check_compressibility(struct inode * inode,
38052+ reiser4_cluster_t * clust,
38053+ hint_t * hint)
38054+{
38055+ reiser4_unset_hint(clust->hint);
38056+ clust->hint = hint;
38057+	clust->index++;
38058+}
38059+
38060+#if REISER4_DEBUG
38061+static int prepped_dclust_ok(hint_t * hint)
38062+{
38063+ reiser4_key key;
38064+ coord_t * coord = &hint->ext_coord.coord;
38065+
38066+ item_key_by_coord(coord, &key);
38067+ return (item_id_by_coord(coord) == CTAIL_ID &&
38068+ !coord_is_unprepped_ctail(coord) &&
38069+ (get_key_offset(&key) + nr_units_ctail(coord) ==
38070+ dclust_get_extension_dsize(hint)));
38071+}
38072+#endif
38073+
38074+#define fifty_percent(size) (size >> 1)
38075+/* compressible iff output is under half the input, e.g. 64K -> under 32K */
38076+#define data_is_compressible(osize, isize) \
38077+	(osize < fifty_percent(isize))
38078+
38079+/* This is called only once during a file's life.
38080+   Read the first logical cluster (of index #0) and estimate its
38081+   compressibility. Save the estimation result in @compressible */
38082+static int read_check_compressibility(struct inode * inode,
38083+ reiser4_cluster_t * clust,
38084+ int * compressible)
38085+{
38086+ int i;
38087+ int result;
38088+ __u32 dst_len;
38089+ hint_t tmp_hint;
38090+ hint_t * cur_hint = clust->hint;
38091+
38092+ start_check_compressibility(inode, clust, &tmp_hint);
38093+
38094+ result = grab_cluster_pages(inode, clust);
38095+ if (result)
38096+ return result;
38097+ /* Read page cluster here */
38098+ for (i = 0; i < clust->nr_pages; i++) {
38099+ struct page *page = clust->pages[i];
38100+ lock_page(page);
38101+ result = do_readpage_ctail(inode, clust, page,
38102+ ZNODE_READ_LOCK);
38103+ unlock_page(page);
38104+ if (result)
38105+ goto error;
38106+ }
38107+ tfm_cluster_clr_uptodate(&clust->tc);
38108+
38109+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
38110+
38111+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
38112+		/* length of compressed data is known, no need to compress */
38113+ assert("edward-1511",
38114+ znode_is_write_locked(tmp_hint.ext_coord.coord.node));
38115+ assert("edward-1512",
38116+ WITH_DATA(tmp_hint.ext_coord.coord.node,
38117+ prepped_dclust_ok(&tmp_hint)));
38118+ dst_len = dclust_get_extension_dsize(&tmp_hint);
38119+ }
38120+ else {
38121+ tfm_cluster_t * tc = &clust->tc;
38122+ compression_plugin * cplug = inode_compression_plugin(inode);
38123+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
38124+ if (result)
38125+ goto error;
38126+ for (i = 0; i < clust->nr_pages; i++) {
38127+ char *data;
38128+ lock_page(clust->pages[i]);
38129+ BUG_ON(!PageUptodate(clust->pages[i]));
38130+ data = kmap(clust->pages[i]);
38131+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
38132+ data, PAGE_CACHE_SIZE);
38133+ kunmap(clust->pages[i]);
38134+ unlock_page(clust->pages[i]);
38135+ }
38136+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
38137+ if (result)
38138+ goto error;
38139+ result = grab_coa(tc, cplug);
38140+ if (result)
38141+ goto error;
38142+ tc->len = tc->lsize = fsize_to_count(clust, inode);
38143+ assert("edward-1513", tc->len == inode_cluster_size(inode));
38144+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
38145+ cplug->compress(get_coa(tc, cplug->h.id, tc->act),
38146+ tfm_input_data(clust), tc->len,
38147+ tfm_output_data(clust), &dst_len);
38148+ assert("edward-1514",
38149+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
38150+ }
38151+ finish_check_compressibility(inode, clust, cur_hint);
38152+ *compressible = data_is_compressible(dst_len,
38153+ inode_cluster_size(inode));
38154+ return 0;
38155+ error:
38156+ reiser4_release_cluster_pages(clust);
38157+ return result;
38158+}
38159+
38160+/* Cut disk cluster of index @idx */
38161+static int cut_disk_cluster(struct inode * inode, cloff_t idx)
38162+{
38163+ reiser4_key from, to;
38164+ assert("edward-1515", inode_file_plugin(inode) ==
38165+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
38166+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
38167+ to = from;
38168+ set_key_offset(&to,
38169+ get_key_offset(&from) + inode_cluster_size(inode) - 1);
38170+ return reiser4_cut_tree(reiser4_tree_by_inode(inode),
38171+ &from, &to, inode, 0);
38172+}
38173+
38174+static int reserve_cryptcompress2unixfile(struct inode *inode)
38175+{
38176+ reiser4_block_nr unformatted_nodes;
38177+ reiser4_tree *tree;
38178+
38179+ tree = reiser4_tree_by_inode(inode);
38180+
38181+ /* number of unformatted nodes which will be created */
38182+ unformatted_nodes = cluster_nrpages(inode); /* N */
38183+
38184+ /*
38185+	 * space required for one iteration of the ctail->extent conversion:
38186+ *
38187+ * 1. kill ctail items
38188+ *
38189+ * 2. insert N unformatted nodes
38190+ *
38191+ * 3. insert N (worst-case single-block
38192+ * extents) extent units.
38193+ *
38194+ * 4. drilling to the leaf level by coord_by_key()
38195+ *
38196+ * 5. possible update of stat-data
38197+ *
38198+ */
38199+ grab_space_enable();
38200+ return reiser4_grab_space
38201+ (2 * tree->height +
38202+ unformatted_nodes +
38203+ unformatted_nodes * estimate_one_insert_into_item(tree) +
38204+ 1 + estimate_one_insert_item(tree) +
38205+ inode_file_plugin(inode)->estimate.update(inode),
38206+ BA_CAN_COMMIT);
38207+}
38208+
38209+/* clear the flag that indicated conversion and update
38210+   stat-data with new (unix-file-specific) info */
38211+static int complete_file_conversion(struct inode *inode)
38212+{
38213+ int result;
38214+
38215+ grab_space_enable();
38216+ result =
38217+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
38218+ BA_CAN_COMMIT);
38219+ if (result == 0) {
38220+ reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38221+ result = reiser4_update_sd(inode);
38222+ }
38223+ if (result)
38224+ warning("edward-1452",
38225+ "Converting %llu to unix-file: update sd failed (%i)",
38226+ (unsigned long long)get_inode_oid(inode), result);
38227+ return 0;
38228+}
38229+
38230+
38231+/* do conversion */
38232+static int cryptcompress2unixfile(struct file *file, struct inode * inode,
38233+ reiser4_cluster_t * clust)
38234+{
38235+ int i;
38236+ int result = 0;
38237+ cryptcompress_info_t *cr_info;
38238+ unix_file_info_t *uf_info;
38239+
38240+ assert("edward-1516", clust->pages[0]->index == 0);
38241+ assert("edward-1517", clust->hint != NULL);
38242+
38243+	/* release all cryptcompress-specific resources */
38244+ cr_info = cryptcompress_inode_data(inode);
38245+ result = reserve_cryptcompress2unixfile(inode);
38246+ if (result)
38247+ goto out;
38248+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38249+ reiser4_unset_hint(clust->hint);
38250+ result = cut_disk_cluster(inode, 0);
38251+ if (result)
38252+ goto out;
38253+	/* captured jnode of cluster and associated resources (pages,
38254+ reserved disk space) were released by ->kill_hook() method
38255+ of the item plugin */
38256+
38257+ result = __cryptcompress2unixfile(file, inode);
38258+ if (result)
38259+ goto out;
38260+ /* At this point file is managed by unix file plugin */
38261+
38262+ uf_info = unix_file_inode_data(inode);
38263+
38264+ assert("edward-1518",
38265+ ergo(jprivate(clust->pages[0]),
38266+ !jnode_is_cluster_page(jprivate(clust->pages[0]))));
38267+	for (i = 0; i < clust->nr_pages; i++) {
38268+ assert("edward-1519", clust->pages[i]);
38269+ assert("edward-1520", PageUptodate(clust->pages[i]));
38270+
38271+ result = find_or_create_extent(clust->pages[i]);
38272+ if (result)
38273+ break;
38274+ }
38275+ if (!result) {
38276+ uf_info->container = UF_CONTAINER_EXTENTS;
38277+ complete_file_conversion(inode);
38278+ }
38279+ out:
38280+ all_grabbed2free();
38281+ if (result)
38282+ warning("edward-1453", "Failed to convert file %llu: %i",
38283+ (unsigned long long)get_inode_oid(inode), result);
38284+ return result;
38285+}
38286+
38287+/* Check, then perform or disable conversion if needed */
38288+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
38289+ reiser4_cluster_t * clust, int * progress)
38290+{
38291+ int result;
38292+ int check_compress = 0;
38293+ int compressible = 0;
38294+
38295+ if (!conversion_enabled(inode))
38296+ return 0;
38297+ result = check_position(inode, pos, clust, &check_compress);
38298+ if (result || !check_compress)
38299+ return result;
38300+ result = read_check_compressibility(inode, clust, &compressible);
38301+ if (result)
38302+ return result;
38303+
38304+ /* At this point page cluster is grabbed and uptodate */
38305+ if (!compressible) {
38306+ result = cryptcompress2unixfile(file, inode, clust);
38307+ if (result == 0)
38308+ *progress = 1;
38309+ }
38310+ else
38311+ result = disable_conversion(inode);
38312+
38313+ reiser4_release_cluster_pages(clust);
38314+ return result;
38315+}
38316+
38317+static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
38318+{
38319+ return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
38320+}
38321+
38322+/* Protected methods of cryptcompress file plugin constructed
38323+ by the macros above */
38324+
38325+/* Wrappers with active protection for:
38326+ . write_cryptcompress;
38327+ . setattr_cryptcompress;
38328+*/
38329+
38330+ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf,
38331+ size_t count, loff_t *off)
38332+{
38333+ int prot = 0;
38334+ int conv = 0;
38335+ ssize_t written_cr = 0;
38336+ ssize_t written_uf = 0;
38337+ struct inode * inode = file->f_dentry->d_inode;
38338+ struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
38339+
38340+ if (should_protect(inode)) {
38341+ prot = 1;
38342+ down_write(guard);
38343+ }
38344+ written_cr = write_cryptcompress(file, buf, count, off, &conv);
38345+ if (prot)
38346+ up_write(guard);
38347+ if (written_cr < 0)
38348+ return written_cr;
38349+ if (conv)
38350+ written_uf = write_unix_file(file, buf + written_cr,
38351+ count - written_cr, off);
38352+ return written_cr + (written_uf < 0 ? 0 : written_uf);
38353+}
38354+
38355+int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
38356+{
38357+ struct inode * inode = dentry->d_inode;
38358+ return PROT_ACTIVE(int, setattr, (dentry, attr),
38359+ setattr_conversion_hook(inode, attr));
38360+}
38361+
38362+/* Wrappers with passive protection for:
38363+   . read_cryptcompress;
38364+ . mmap_cryptcompress;
38365+ . release_cryptcompress;
38366+ . sendfile_cryptcompress;
38367+ . delete_object_cryptcompress.
38368+*/
38369+ssize_t prot_read_cryptcompress(struct file * file, char __user * buf,
38370+ size_t size, loff_t * off)
38371+{
38372+ struct inode * inode = file->f_dentry->d_inode;
38373+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
38374+}
38375+
38376+int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
38377+{
38378+ struct inode *inode = file->f_dentry->d_inode;
38379+ return PROT_PASSIVE(int, mmap, (file, vma));
38380+}
38381+
38382+int prot_release_cryptcompress(struct inode *inode, struct file *file)
38383+{
38384+ return PROT_PASSIVE(int, release, (inode, file));
38385+}
38386+
38387+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos,
38388+ size_t count, read_actor_t actor,
38389+ void *target)
38390+{
38391+ struct inode * inode = file->f_dentry->d_inode;
38392+ return PROT_PASSIVE(ssize_t, sendfile,
38393+ (file, ppos, count, actor, target));
38394+}
38395+
38396+/*
38397+ Local variables:
38398+ c-indentation-style: "K&R"
38399+ mode-name: "LC"
38400+ c-basic-offset: 8
38401+ tab-width: 8
38402+ fill-column: 80
38403+ scroll-step: 1
38404+ End:
38405+*/
38406diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file.h linux-2.6.20/fs/reiser4/plugin/file/file.h
38407--- linux-2.6.20.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
38408+++ linux-2.6.20/fs/reiser4/plugin/file/file.h 2007-05-06 14:50:43.783001971 +0400
38409@@ -0,0 +1,272 @@
38410+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38411+ * reiser4/README */
38412+
38413+/* this file contains declarations of methods implementing
38414+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
38415+ and SYMLINK_FILE_PLUGIN_ID) */
38416+
38417+#if !defined( __REISER4_FILE_H__ )
38418+#define __REISER4_FILE_H__
38419+
38420+/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38421+
38422+/* inode operations */
38423+int setattr_unix_file(struct dentry *, struct iattr *);
38424+
38425+/* file operations */
38426+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38427+ loff_t *off);
38428+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38429+ loff_t * off);
38430+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38431+ unsigned long arg);
38432+int mmap_unix_file(struct file *, struct vm_area_struct *);
38433+int open_unix_file(struct inode *, struct file *);
38434+int release_unix_file(struct inode *, struct file *);
38435+int sync_unix_file(struct file *, struct dentry *, int datasync);
38436+ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38437+ read_actor_t, void *target);
38438+
38439+/* address space operations */
38440+int readpage_unix_file(struct file *, struct page *);
38441+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
38442+int writepages_unix_file(struct address_space *, struct writeback_control *);
38443+int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38444+ unsigned to);
38445+int commit_write_unix_file(struct file *, struct page *, unsigned from,
38446+ unsigned to);
38447+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38448+
38449+/* file plugin operations */
38450+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38451+ int user, loff_t, loff_t, rw_op, flow_t *);
38452+int owns_item_unix_file(const struct inode *, const coord_t *);
38453+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38454+ int create);
38455+int delete_object_unix_file(struct inode *);
38456+
38457+/*
38458+ * All writes to a unix file are performed by an item write method. The write
38459+ * method of the unix file plugin only decides which item plugin (extent or
38460+ * tail) to call and in which mode (one from the enum below)
38461+ */
38462+typedef enum {
38463+ FIRST_ITEM = 1,
38464+ APPEND_ITEM = 2,
38465+ OVERWRITE_ITEM = 3
38466+} write_mode_t;
38467+
38468+/* unix file may be in one of the following states */
38469+typedef enum {
38470+ UF_CONTAINER_UNKNOWN = 0,
38471+ UF_CONTAINER_TAILS = 1,
38472+ UF_CONTAINER_EXTENTS = 2,
38473+ UF_CONTAINER_EMPTY = 3
38474+} file_container_t;
38475+
38476+struct formatting_plugin;
38477+struct inode;
38478+
38479+/* unix file plugin specific part of reiser4 inode */
38480+typedef struct unix_file_info {
38481+ /*
38482+ * this read-write lock protects file containerization change. Accesses
38483+ * which do not change file containerization (see file_container_t)
38484+ * (read, readpage, writepage, write (until tail conversion is
38485+ * involved)) take read-lock. Accesses which modify file
38486+ * containerization (truncate, conversion from tail to extent and back)
38487+ * take write-lock.
38488+ */
38489+ struct rw_semaphore latch;
38490+ /* this enum specifies which items are used to build the file */
38491+ file_container_t container;
38492+ /*
38493+ * plugin which controls when file is to be converted to extents and
38494+ * back to tail
38495+ */
38496+ struct formatting_plugin *tplug;
38497+ /* if this is set, file is in exclusive use */
38498+ int exclusive_use;
38499+#if REISER4_DEBUG
38500+ /* pointer to task struct of thread owning exclusive access to file */
38501+ void *ea_owner;
38502+ atomic_t nr_neas;
38503+ void *last_reader;
38504+#endif
38505+} unix_file_info_t;
38506+
38507+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38508+void get_exclusive_access(unix_file_info_t *);
38509+void drop_exclusive_access(unix_file_info_t *);
38510+void get_nonexclusive_access(unix_file_info_t *);
38511+void drop_nonexclusive_access(unix_file_info_t *);
38512+int try_to_get_nonexclusive_access(unix_file_info_t *);
38513+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38514+ struct inode *);
38515+int find_file_item_nohint(coord_t *, lock_handle *,
38516+ const reiser4_key *, znode_lock_mode,
38517+ struct inode *);
38518+
38519+int load_file_hint(struct file *, hint_t *);
38520+void save_file_hint(struct file *, const hint_t *);
38521+
38522+#include "../item/extent.h"
38523+#include "../item/tail.h"
38524+#include "../item/ctail.h"
38525+
38526+struct uf_coord {
38527+ coord_t coord;
38528+ lock_handle *lh;
38529+ int valid;
38530+ union {
38531+ extent_coord_extension_t extent;
38532+ tail_coord_extension_t tail;
38533+ ctail_coord_extension_t ctail;
38534+ } extension;
38535+};
38536+
38537+#include "../../forward.h"
38538+#include "../../seal.h"
38539+#include "../../lock.h"
38540+
38541+/*
38542+ * This structure is used to speed up file operations (reads and writes). A
38543+ * hint is a suggestion about where a key resolved to last time. A seal
38544+ * indicates whether a node has been modified since a hint was last recorded.
38545+ * You check the seal, and if the seal is still valid, you can use the hint
38546+ * without traversing the tree again.
38547+ */
38548+struct hint {
38549+ seal_t seal; /* a seal over last file item accessed */
38550+ uf_coord_t ext_coord;
38551+ loff_t offset;
38552+ znode_lock_mode mode;
38553+ lock_handle lh;
38554+};
38555+
38556+static inline int hint_is_valid(hint_t * hint)
38557+{
38558+ return hint->ext_coord.valid;
38559+}
38560+
38561+static inline void hint_set_valid(hint_t * hint)
38562+{
38563+ hint->ext_coord.valid = 1;
38564+}
38565+
38566+static inline void hint_clr_valid(hint_t * hint)
38567+{
38568+ hint->ext_coord.valid = 0;
38569+}
38570+
38571+int load_file_hint(struct file *, hint_t *);
38572+void save_file_hint(struct file *, const hint_t *);
38573+void hint_init_zero(hint_t *);
38574+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38575+int hint_is_set(const hint_t *);
38576+void reiser4_unset_hint(hint_t *);
38577+
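A hedged sketch of how the read/write paths typically use this API (simplified;
locking and error handling omitted, and the helper name is hypothetical):

	static int lookup_with_hint(struct file *file, hint_t *hint,
				    const reiser4_key *key, struct inode *inode)
	{
		int result;

		result = load_file_hint(file, hint);	/* restore cached position */
		if (result)
			return result;
		if (!hint_is_valid(hint))
			/* seal broken or no hint yet: fall back to a full tree
			   traversal, which re-populates @hint */
			result = find_file_item(hint, key, ZNODE_READ_LOCK, inode);
		/* ... use hint->ext_coord.coord ... */
		save_file_hint(file, hint);	/* remember for the next call */
		return result;
	}
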
38578+int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
38579+int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38580+ loff_t cur_size, int (*update_actor) (struct inode *,
38581+ reiser4_key *, int));
38582+#if REISER4_DEBUG
38583+
38584+/* return 1 if exclusive access is obtained, 0 otherwise */
38585+static inline int ea_obtained(unix_file_info_t * uf_info)
38586+{
38587+ int ret;
38588+
38589+ ret = down_read_trylock(&uf_info->latch);
38590+ if (ret)
38591+ up_read(&uf_info->latch);
38592+ return !ret;
38593+}
38594+
38595+#endif
38596+
38597+/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38598+int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
38599+ reiser4_object_create_data *);
38600+void destroy_inode_symlink(struct inode *);
38601+
38602+/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID
38603+ file plugin */
38604+
38605+/* inode operations */
38606+int setattr_cryptcompress(struct dentry *, struct iattr *);
38607+int prot_setattr_cryptcompress(struct dentry *, struct iattr *);
38608+
38609+/* file operations */
38610+ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38611+ loff_t * off);
38612+ssize_t prot_read_cryptcompress(struct file *, char __user *buf,
38613+ size_t read_amount, loff_t * off);
38614+
38615+ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38616+ loff_t * off, int * conv);
38617+ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38618+ loff_t * off);
38619+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38620+int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *);
38621+ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38622+ read_actor_t actor, void *target);
38623+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38624+ read_actor_t actor, void *target);
38625+
38626+int release_cryptcompress(struct inode *, struct file *);
38627+int prot_release_cryptcompress(struct inode *, struct file *);
38628+
38629+/* address space operations */
38630+extern int readpage_cryptcompress(struct file *, struct page *);
38631+extern int writepages_cryptcompress(struct address_space *,
38632+ struct writeback_control *);
38633+/* file plugin operations */
38634+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38635+ int user, loff_t, loff_t, rw_op, flow_t *);
38636+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38637+int create_cryptcompress(struct inode *, struct inode *,
38638+ reiser4_object_create_data *);
38639+int delete_object_cryptcompress(struct inode *);
38640+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38641+ int create);
38642+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38643+ const reiser4_key * to_key,
38644+ reiser4_key * smallest_removed,
38645+ struct inode *object, int truncate,
38646+ int *progress);
38647+void destroy_inode_cryptcompress(struct inode *);
38648+int open_object_cryptcompress(struct inode * inode, struct file * file);
38649+
38650+extern reiser4_plugin_ops cryptcompress_plugin_ops;
38651+
38652+#define WRITE_GRANULARITY 32 /* pages per write_unix_file() iteration */
38653+
38654+int tail2extent(unix_file_info_t *);
38655+int extent2tail(unix_file_info_t *);
38656+
38657+int goto_right_neighbor(coord_t *, lock_handle *);
38658+int find_or_create_extent(struct page *);
38659+int equal_to_ldk(znode *, const reiser4_key *);
38660+
38661+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38662+
38663+static inline int cbk_errored(int cbk_result)
38664+{
38665+ return (cbk_result != CBK_COORD_NOTFOUND
38666+ && cbk_result != CBK_COORD_FOUND);
38667+}
38668+
38669+/* __REISER4_FILE_H__ */
38670+#endif
38671+
38672+/*
38673+ * Local variables:
38674+ * c-indentation-style: "K&R"
38675+ * mode-name: "LC"
38676+ * c-basic-offset: 8
38677+ * tab-width: 8
38678+ * fill-column: 79
38679+ * scroll-step: 1
38680+ * End:
38681+*/
38682diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/invert.c linux-2.6.20/fs/reiser4/plugin/file/invert.c
38683--- linux-2.6.20.orig/fs/reiser4/plugin/file/invert.c 1970-01-01 03:00:00.000000000 +0300
38684+++ linux-2.6.20/fs/reiser4/plugin/file/invert.c 2007-05-06 14:50:43.783001971 +0400
38685@@ -0,0 +1,493 @@
38686+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38687+
38688+/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
38689+ buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
38690+ provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
38691+ when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
38692+ to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
38693+ make that easy for you by providing those delimiters in what you read from it.
38694+
38695+ When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
38696+ bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
38697+ would create those files. But which files? Well, that must be specified in the body of the invert using a special
38698+ syntax, and that specification is called the invert of the assignment.
38699+
38700+ When written to, an invert performs the assignment command that is written
38701+ to it, and modifies its own body to contain the invert of that
38702+ assignment.
38703+
38704+ In other words, writing to an invert file what you have read from it
38705+ is the identity operation.
38706+
38707+ Malformed assignments cause write errors. Partial writes are not
38708+ supported in v4.0, but will be.
38709+
38710+ Example:
38711+
38712+ If an invert contains:
38713+
38714+ /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
38715+
38716+======================
38717+Each element in this definition should itself be an invert, and all the
38718+files should be resolved recursively as well. This is bad: if one of the
38719+included files is not a regular or invert file, then we cannot read the
38720+main file.
38721+
38722+I think it is possible to make this easier:
38723+
38724+the internal structure of an invert file should be like that of a symlink
38725+file, but the read and write method should be explicitly indicated in the i/o operation.
38726+
38727+By default we read and write (if possible) as a symlink, and just as we can
38728+specify ..invert at read time, we can also specify it at write time.
38729+
38730+example:
38731+/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
38732+will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified bodies.
38733+
38734+read of /my_invert_file/..invert will be
38735+/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38736+
38737+but read of /my_invert_file/ will be
38738+The contents of filenameAsome text stored in the invertThe contents of filenameB
38739+
38740+we can also create this file as
38741+/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
38742+will create /my_invert_file, and use the existing files /filenameA and /filenameB.
38743+
38744+and when we read it, it will behave as the invert file described previously.
38745+
38746+Is this correct?
38747+
38748+ vv
38749+DEMIDOV-FIXME-HANS:
38750+
38751+Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
38752+
38753+Do you agree? Discuss it on reiserfs-list....
38754+
38755+-Hans
38756+=======================
38757+
38758+ Then a read will return:
38759+
38760+ /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
38761+
38762+ and a write of the line above to the invert will set the contents of
38763+ the invert and filenameA and filenameB to their original values.
38764+
38765+ Note that the contents of an invert have no influence on the effect
38766+ of a write unless the write is a partial write (and a write of a
38767+ shorter file without using truncate first is a partial write).
38768+
38769+ truncate() has no effect on filenameA and filenameB; it merely
38770+ resets the value of the invert.
38771+
38772+ Writes to subfiles via the invert are implemented by preceding them
38773+ with truncates.
38774+
38775+ Parse failures cause write failures.
38776+
38777+ Questions to ponder: should the invert be acted on prior to file
38778+ close when writing to an open file descriptor?
38779+
38780+ Example:
38781+
38782+ If an invert contains:
38783+
38784+ "(This text and a pair of quotes are all that is here.)
38785+
38786+Then a read will return:
38787+
38788+ "(This text and a pair of quotes are all that is here.)
38789+
38790+*/
38791+
38792+/* The OPEN method places a struct file in memory associated with the invert body
38793+ and returns something like a file descriptor to the user for future access
38794+ to the invert file.
38795+ During opening we parse the body of the invert and get a list of 'entries'
38796+ (describing all its subfiles) and place a pointer to the first struct in the
38797+ reiser4-specific part of the invert inode (arbitrary decision).
38798+
38799+ Each subfile is described by a struct inv_entry that has a pointer @sd to an
38800+ in-core stat-data and a pointer to a struct file @f (if we find that the
38801+ subfile uses more than one unformatted node (arbitrary decision), we load a
38802+ struct file in memory; otherwise we load the base stat-data and maybe 1-2
38803+ bytes of other information we need).
38804+
38805+ Since READ and WRITE methods for inverts were formulated in assignment
38806+ language, they don't contain arguments 'size' and 'offset' that make sense
38807+ only in ordinary read/write methods.
38808+
38809+ The READ method is a combination of two methods:
38810+ 1) the ordinary read method (with offset=0, length = @f->...->i_size) for
38811+ entries with @f != 0; this method takes a pointer to a struct file as an argument;
38812+ 2) the read method for inode-less files with @sd != 0; this method takes the
38813+ in-core stat-data instead of a struct file as an argument.
38814+ In the first case we don't use the pagecache, we just copy the data we got
38815+ after cbk() into userspace.
38816+
38817+ The WRITE method for invert files is more complex.
38818+ Besides the WRITE interface declared in assignment language above, we need
38819+ an opportunity to edit the unwrapped body of an invert file with some
38820+ text editor; this means we need a GENERIC WRITE METHOD for the invert file:
38821+
38822+ my_invert_file/..invert <- "string"
38823+
38824+ this method parses "string" and looks for correct subfile signatures; the
38825+ parsing process also splits this "string" into a set of flows in accordance
38826+ with the set of subfiles specified by this signature.
38827+ The found list of signatures #S is compared with the opened one #I of the
38828+ invert file. If the latter is absent (#I==0, as will be the case if we
38829+ have just created this invert file) the write method assigns the found
38830+ signature (#I=#S;) to the invert file. Then, if #I==#S, the generic write
38831+ method splits itself into write methods for ordinary or light-weight files,
38832+ or calls itself recursively for invert files with the corresponding flows.
38833+ I am not sure, but the list of signatures looks like what Mr. Demidov means
38834+ by 'delimiters'.
38835+
38836+ The cases when #S<#I (#I<#S) (in the sense of set theory) are also allowed
38837+ and cause deletion (creation) of subfiles (arbitrary decision - it may look
38838+ too complex, but this interface will be the most complete). The order of
38839+ entries of list #S (#I) and the inherited order on #I (#S) must coincide.
38840+ Any other parsing result yields a malformed signature that aborts the READ
38841+ method and releases all resources.
38842+
38843+ Format of subfile (entry) signature:
38844+
38845+ "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
38846+
38847+ Legend:
38848+
38849+ START_MAGIC - the keyword that indicates the start of a subfile signature;
38850+
38851+ <> indicates the start of 'subfile metadata', that is the pair
38852+ (TYPE="...",LOOKUP_ARG="...") in parentheses, separated by a comma.
38853+
38854+ TYPE - the keyword "type" introduces one of the following three words:
38855+ - ORDINARY_FILE,
38856+ - LIGHT_WEIGHT_FILE,
38857+ - INVERT_FILE;
38858+
38859+ LOOKUP_ARG - the lookup argument; its meaning depends on the preceding type:
38860+ */
38861+
38862+ /************************************************************/
38863+ /* TYPE * LOOKUP ARGUMENT */
38864+ /************************************************************/
38865+	/* LIGHT_WEIGHT_FILE  *  stat-data key                        */
38866+ /************************************************************/
38867+ /* ORDINARY_FILE * filename */
38868+ /************************************************************/
38869+ /* INVERT_FILE * filename */
38870+ /************************************************************/
38871+
38872+ /* where:
38873+ *stat-data key - the string containing the stat-data key of this subfile; it
38874+ will be passed to the fast-access lookup method for light-weight files;
38875+ *filename - the pathname of this subfile; it will be passed to the VFS lookup
38876+ methods for ordinary and invert files;
38877+
38878+ SUBFILE_BODY - the data of this subfile (it will go to the flow)
38879+ END_MAGIC - the keyword that indicates the end of the subfile signature.
38880+
38881+ The other symbols inside the signature are interpreted as 'unformatted
38882+ content', which is available with VFS's read_link() (arbitrary decision).
38883+
38884+ NOTE: the parse method for the body of an invert file uses the signatures
38885+ above _without_ the subfile bodies.
38886+
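+ For illustration only (an editor's example, with the magic keywords written
+ symbolically; on disk they are the binary constants defined below), a single
+ ordinary-file signature could look like:
+
+ START_MAGIC<>(TYPE="ORDINARY_FILE",LOOKUP_ARG="/filenameA")The contents of filenameAEND_MAGIC
+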
38887+ Now the only unclear thing is a WRITE to a regular light-weight subfile A,
38888+ which we can describe only in assignment language:
38889+
38890+ A <- "some_string"
38891+
38892+ I guess we don't want to replace the stat-data and body items of file A
38893+ if this file exists and size(A) != size("some_string"), because this operation
38894+ is expensive; so we only do a partial write if size(A) > size("some_string"),
38895+ and truncate "some_string" and then do A <- "truncated string" if
38896+ size(A) < size("some_string"). This decision is also arbitrary.
38897+ */
38898+
38899+/* here is the infrastructure for formatted flows */
38900+
38901+#define SUBFILE_HEADER_MAGIC 0x19196605
38902+#define FLOW_HEADER_MAGIC 0x01194304
38903+
38904+#include "../plugin.h"
38905+#include "../../debug.h"
38906+#include "../../forward.h"
38907+#include "../object.h"
38908+#include "../item/item.h"
38909+#include "../item/static_stat.h"
38910+#include "../../dformat.h"
38911+#include "../znode.h"
38912+#include "../inode.h"
38913+
38914+#include <linux/types.h>
38915+#include <linux/fs.h> /* for struct file */
38916+#include <linux/list.h> /* for struct list_head */
38917+
38918+typedef enum {
38919+ LIGHT_WEIGHT_FILE,
38920+ ORDINARY_FILE,
38921+ INVERT_FILE
38922+} inv_entry_type;
38923+
38924+typedef struct flow_header {
38925+	d32 fl_magic;
38926+	d16 fl_nr;		/* number of subfiles in the flow */
38927+} flow_header;
38928+
38929+typedef struct subfile_header {
38930+	d32 sh_magic;		/* subfile magic */
38931+	d16 sh_type;		/* type of subfile: light-weight, ordinary, invert */
38932+	d16 sh_arg_len;		/* length of lookup argument (filename, key) */
38933+	d32 sh_body_len;	/* length of subfile body */
38934+} subfile_header;
38935+
38936+/* functions to get/set fields of flow header */
38937+
38938+static void fl_set_magic(flow_header * fh, __u32 value)
38939+{
38940+	cputod32(value, &fh->fl_magic);
38941+}
38942+
38943+static __u32 fl_get_magic(flow_header * fh)
38944+{
38945+	return d32tocpu(&fh->fl_magic);
38946+}
38947+static void fl_set_number(flow_header * fh, __u16 value)
38948+{
38949+	cputod16(value, &fh->fl_nr);
38950+}
38951+static unsigned fl_get_number(flow_header * fh)
38952+{
38953+	return d16tocpu(&fh->fl_nr);
38954+}
38955+
38956+/* functions to get/set fields of subfile header */
38957+
38958+static void sh_set_magic(subfile_header * sh, __u32 value)
38959+{
38960+ cputod32(value, &sh->sh_magic);
38961+}
38962+
38963+static __u32 sh_get_magic(subfile_header * sh)
38964+{
38965+ return d32tocpu(&sh->sh_magic);
38966+}
38967+static void sh_set_type(subfile_header * sh, __u16 value)
38968+{
38969+	cputod16(value, &sh->sh_type);
38970+}
38971+static unsigned sh_get_type(subfile_header * sh)
38972+{
38973+	return d16tocpu(&sh->sh_type);
38974+}
38975+static void sh_set_arg_len(subfile_header * sh, __u16 value)
38976+{
38977+ cputod16(value, &sh->sh_arg_len);
38978+}
38979+static unsigned sh_get_arg_len(subfile_header * sh)
38980+{
38981+ return d16tocpu(&sh->sh_arg_len);
38982+}
38983+static void sh_set_body_len(subfile_header * sh, __u32 value)
38984+{
38985+ cputod32(value, &sh->sh_body_len);
38986+}
38987+
38988+static __u32 sh_get_body_len(subfile_header * sh)
38989+{
38990+ return d32tocpu(&sh->sh_body_len);
38991+}
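+
+/* Editor's sketch, for illustration only: a minimal sanity check of a raw
+   flow buffer using the accessors above. It assumes @buf points at a
+   complete flow header; it is not part of the original plugin. */
+static int flow_header_looks_valid(char *buf, unsigned *nr_subfiles)
+{
+	flow_header *fh = (flow_header *) buf;
+
+	/* reject buffers that do not start with the flow magic */
+	if (fl_get_magic(fh) != FLOW_HEADER_MAGIC)
+		return 0;
+	*nr_subfiles = fl_get_number(fh);
+	return 1;
+}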
38992+
38993+/* in-core minimal stat-data, light-weight analog of inode */
38994+
38995+struct incore_sd_base {
38996+ umode_t isd_mode;
38997+ nlink_t isd_nlink;
38998+ loff_t isd_size;
38999+ char *isd_data; /* 'subflow' to write */
39000+};
39001+
39002+/* opening an invert creates a list of invert entries;
39003+   every entry is represented by struct inv_entry */
39004+
39005+struct inv_entry {
39006+	struct list_head ie_list;
39007+	struct file *ie_file;	/* this is NULL if the file doesn't
39008+				   have unformatted nodes */
39009+ struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
39010+};
39011+
39012+/* allocate and init invert entry */
39013+
39014+static struct inv_entry *allocate_inv_entry(void)
39015+{
39016+ struct inv_entry *inv_entry;
39017+
39018+ inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
39019+ if (!inv_entry)
39020+ return ERR_PTR(RETERR(-ENOMEM));
39021+ inv_entry->ie_file = NULL;
39022+ inv_entry->ie_sd = NULL;
39023+ INIT_LIST_HEAD(&inv_entry->ie_list);
39024+ return inv_entry;
39025+}
39026+
39027+static int put_inv_entry(struct inv_entry *ientry)
39028+{
39029+ int result = 0;
39030+
39031+ assert("edward-96", ientry != NULL);
39032+	assert("edward-97", ientry->ie_list.next != NULL);
39033+
39034+	list_del(&ientry->ie_list);
39035+	/* close the file before freeing the entry that references it */
39036+	if (ientry->ie_file != NULL)
39037+		result = filp_close(ientry->ie_file, NULL);
39038+	if (ientry->ie_sd != NULL)
39039+		kfree(ientry->ie_sd);
39040+	kfree(ientry);
39041+	return result;
39042+}
39043+
39044+static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39045+{
39046+	struct incore_sd_base *isd_base;
39047+
39048+	assert("edward-98", inv_entry != NULL);
39049+	assert("edward-99", inv_entry->ie_file == NULL);
39050+	assert("edward-100", inv_entry->ie_sd == NULL);
39049+
39050+ isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39051+ if (!isd_base)
39052+ return RETERR(-ENOMEM);
39053+ inv_entry->ie_sd = isd_base;
39054+ return 0;
39055+}
39056+
39057+/* this can be installed as the ->init_inv_entry() method of
39058+   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39059+   Copies data from the on-disk stat-data format into the light-weight analog
39060+   of an inode. Doesn't handle stat-data extensions. */
39061+
39062+static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39063+{
39064+ reiser4_stat_data_base *sd_base;
39065+
39066+ assert("edward-101", inv_entry != NULL);
39067+ assert("edward-101", inv_entry->ie_sd != NULL);
39068+ assert("edward-102", sd != NULL);
39069+
39070+ sd_base = (reiser4_stat_data_base *) sd;
39071+	inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
39072+	inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
39073+	inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
39074+	inv_entry->ie_sd->isd_data = NULL;
39075+}
39076+
39077+/* initialise incore stat-data */
39078+
39079+static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39080+{
39081+ reiser4_plugin *plugin = item_plugin_by_coord(coord);
39082+ void *body = item_body_by_coord(coord);
39083+
39084+ assert("edward-103", inv_entry != NULL);
39085+ assert("edward-104", plugin != NULL);
39086+ assert("edward-105", body != NULL);
39087+
39088+ sd_base_load(inv_entry, body);
39089+}
39090+
39091+/* takes a key or filename, allocates a new inv_entry, initializes it and
39092+   adds it into the list;
39093+   we use lookup_sd_by_key() for light-weight files and a VFS lookup by filename */
39094+
39095+int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
39096+ inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
39097+ const reiser4_key * key, /* key of invert entry stat-data */
39098+ char *filename, /* filename of the file to be opened */
39099+ int flags, int mode)
39100+{
39101+ int result;
39102+ struct inv_entry *ientry;
39103+
39104+ assert("edward-107", invert_inode != NULL);
39105+
39106+ ientry = allocate_inv_entry();
39107+ if (IS_ERR(ientry))
39108+ return (PTR_ERR(ientry));
39109+
39110+ if (type == LIGHT_WEIGHT_FILE) {
39111+ coord_t coord;
39112+ lock_handle lh;
39113+
39114+ assert("edward-108", key != NULL);
39115+
39116+ init_coord(&coord);
39117+ init_lh(&lh);
39118+ result =
39119+ lookup_sd_by_key(reiser4_tree_by_inode(invert_inode),
39120+ ZNODE_READ_LOCK, &coord, &lh, key);
39121+		if (result == 0)
39122+			init_incore_sd_base(ientry, &coord);
39123+
39124+ done_lh(&lh);
39125+ done_coord(&coord);
39126+ return (result);
39127+	} else {
39128+		struct file *file;
39129+
39130+		assert("edward-108", filename != NULL);
39131+
39132+		file = filp_open(filename, flags, mode);
39133+		/* FIXME_EDWARD here we need to check that we
39134+		   didn't follow a mount point */
39135+		if (IS_ERR(file))
39136+			return PTR_ERR(file);
39137+		ientry->ie_file = file;
39138+		return 0;
39139+	}
39139+}
39140+
39141+/* takes the inode of an invert, reads the body of this invert, parses it,
39142+   opens all invert entries and returns a pointer to the first inv_entry */
39143+
39144+struct inv_entry *open_invert(struct file *invert_file)
39145+{
39146+
39147+}
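+/* Editor's sketch of what the body above could look like, following the OPEN
+   description in the header comment (read_invert_body() is a hypothetical
+   helper that does not exist in this patch):
+
+	char *body = read_invert_body(invert_file);
+	flow_header *fh = (flow_header *) body;
+	unsigned i;
+
+	if (fl_get_magic(fh) != FLOW_HEADER_MAGIC)
+		return ERR_PTR(RETERR(-EINVAL));
+	for (i = 0; i < fl_get_number(fh); i++)
+		parse the next subfile signature and call get_inv_entry();
+	return the first entry of the constructed list;
+*/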
39148+
39149+ssize_t subfile_read(struct inv_entry *invert_entry, flow_t * f)
39150+{
39151+
39152+}
39153+
39154+ssize_t subfile_write(struct inv_entry *invert_entry, flow_t * f)
39155+{
39156+
39157+}
39158+
39159+ssize_t invert_read(struct file *file, flow_t * f)
39160+{
39161+
39162+}
39163+
39164+ssize_t invert_write(struct file *file, flow_t * f)
39165+{
39166+
39167+}
39168+
39169+/* Make Linus happy.
39170+ Local variables:
39171+ c-indentation-style: "K&R"
39172+ mode-name: "LC"
39173+ c-basic-offset: 8
39174+ tab-width: 8
39175+ fill-column: 120
39176+ scroll-step: 1
39177+ End:
39178+*/
39179diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/Makefile linux-2.6.20/fs/reiser4/plugin/file/Makefile
39180--- linux-2.6.20.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
39181+++ linux-2.6.20/fs/reiser4/plugin/file/Makefile 2007-05-06 14:50:43.783001971 +0400
39182@@ -0,0 +1,7 @@
39183+obj-$(CONFIG_REISER4_FS) += file_plugins.o
39184+
39185+file_plugins-objs := \
39186+ file.o \
39187+ tail_conversion.o \
39188+ symlink.o \
39189+ cryptcompress.o
39190diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.20/fs/reiser4/plugin/file/symfile.c
39191--- linux-2.6.20.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
39192+++ linux-2.6.20/fs/reiser4/plugin/file/symfile.c 2007-05-06 14:50:43.787003221 +0400
39193@@ -0,0 +1,87 @@
39194+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39195+
39196+/* Symfiles are a generalization of Unix symlinks.
39197+
39198+ A symfile when read behaves as though you took its contents and
39199+ substituted them into the reiser4 naming system as the right hand side
39200+ of an assignment, and then read that which you had assigned to it.
39201+
39202+ A key issue for symfiles is how to implement writes through to
39203+ subfiles. In general, one must have some method of determining what
39204+ of that which is written to the symfile is written to what subfile.
39205+ This can be done by use of custom plugin methods written by users, or
39206+ by using a few general methods we provide for those willing to endure
39207+ the insertion of delimiters into what is read.
39208+
39209+ Writing to symfiles without delimiters to denote what is written to
39210+ what subfile is not supported by any plugins we provide in this
39211+ release. Our most sophisticated support for writes is that embodied
39212+ by the invert plugin (see invert.c).
39213+
39214+ A read-only version of the /etc/passwd file might be
39215+ constructed as a symfile whose contents are as follows:
39216+
39217+ /etc/passwd/userlines/*
39218+
39219+ or
39220+
39221+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39222+
39223+ or
39224+
39225+ /etc/passwd/userlines/(demidov+edward+reiser+root)
39226+
39227+ A symfile with contents
39228+
39229+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39230+
39231+ will return when read
39232+
39233+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39234+
39235+ and a write of what has been read will not be possible to implement as
39236+ an identity operation because there are no delimiters denoting the
39237+ boundaries of what is to be written to what subfile.
39238+
39239+ Note that one could make this a read/write symfile if one specified
39240+ delimiters, and the write method understood that those delimiters delimit
39241+ what is written to which subfile.
39242+
39243+ So, specifying the symfile in a manner that allows writes:
39244+
39245+ /etc/passwd/userlines/demidov+"(
39246+ )+/etc/passwd/userlines/edward+"(
39247+ )+/etc/passwd/userlines/reiser+"(
39248+ )+/etc/passwd/userlines/root+"(
39249+ )
39250+
39251+ or
39252+
39253+ /etc/passwd/userlines/(demidov+"(
39254+ )+edward+"(
39255+ )+reiser+"(
39256+ )+root+"(
39257+ ))
39258+
39259+ and the file demidov might be specified as:
39260+
39261+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39262+
39263+ or
39264+
39265+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39266+
39267+ Notice that if the file demidov has a carriage return in it, the
39268+ parsing fails, but then if you put carriage returns in the wrong place
39269+ in a normal /etc/passwd file it breaks things also.
39270+
39271+ Note that it is forbidden to have no text between two interpolations
39272+ if one wants to be able to define what parts of a write go to what
39273+ subfiles referenced in an interpolation.
39274+
39275+ If one wants to be able to add new lines by writing to the file, one
39276+ must either write a custom plugin for /etc/passwd that knows how to
39277+ name an added line, or one must use an invert, or one must use a more
39278+ sophisticated symfile syntax that we are not planning to write for
39279+ version 4.0.
39280+*/
39281diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.20/fs/reiser4/plugin/file/symlink.c
39282--- linux-2.6.20.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
39283+++ linux-2.6.20/fs/reiser4/plugin/file/symlink.c 2007-05-06 14:50:43.787003221 +0400
39284@@ -0,0 +1,95 @@
39285+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39286+
39287+#include "../../inode.h"
39288+
39289+#include <linux/types.h>
39290+#include <linux/fs.h>
39291+
39292+/* file plugin methods specific for symlink files
39293+ (SYMLINK_FILE_PLUGIN_ID) */
39294+
39295+/* this is implementation of create_object method of file plugin for
39296+ SYMLINK_FILE_PLUGIN_ID
39297+ */
39298+
39299+/**
39300+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39301+ * @symlink: inode of symlink object
39302+ * @dir: inode of parent directory
39303+ * @data: parameters of new object
39304+ *
39305+ * Inserts stat-data with a symlink extension into the tree.
39306+ */
39307+int reiser4_create_symlink(struct inode *symlink,
39308+ struct inode *dir UNUSED_ARG,
39309+ reiser4_object_create_data *data /* info passed to us
39310+ * this is filled by
39311+ * reiser4() syscall
39312+ * in particular */)
39313+{
39314+ int result;
39315+
39316+ assert("nikita-680", symlink != NULL);
39317+ assert("nikita-681", S_ISLNK(symlink->i_mode));
39318+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
39319+ assert("nikita-682", dir != NULL);
39320+ assert("nikita-684", data != NULL);
39321+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39322+
39323+ /*
39324+ * stat data of symlink has symlink extension in which we store
39325+ * symlink content, that is, path symlink is pointing to.
39326+ */
39327+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39328+
39329+ assert("vs-838", symlink->i_private == NULL);
39330+ symlink->i_private = (void *)data->name;
39331+
39332+ assert("vs-843", symlink->i_size == 0);
39333+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39334+
39335+ /* insert stat data appended with data->name */
39336+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39337+ if (result) {
39338+ /* FIXME-VS: Make sure that symlink->i_private is not attached
39339+ to kmalloced data */
39340+ INODE_SET_FIELD(symlink, i_size, 0);
39341+ } else {
39342+ assert("vs-849", symlink->i_private
39343+ && reiser4_inode_get_flag(symlink,
39344+ REISER4_GENERIC_PTR_USED));
39345+ assert("vs-850",
39346+ !memcmp((char *)symlink->i_private, data->name,
39347+ (size_t) symlink->i_size + 1));
39348+ }
39349+ return result;
39350+}
39351+
39352+/* this is implementation of destroy_inode method of file plugin for
39353+ SYMLINK_FILE_PLUGIN_ID
39354+ */
39355+void destroy_inode_symlink(struct inode *inode)
39356+{
39357+ assert("edward-799",
39358+ inode_file_plugin(inode) ==
39359+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39360+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39361+ assert("edward-801", reiser4_inode_get_flag(inode,
39362+ REISER4_GENERIC_PTR_USED));
39363+ assert("vs-839", S_ISLNK(inode->i_mode));
39364+
39365+ kfree(inode->i_private);
39366+ inode->i_private = NULL;
39367+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39368+}
39369+
39370+/*
39371+ Local variables:
39372+ c-indentation-style: "K&R"
39373+ mode-name: "LC"
39374+ c-basic-offset: 8
39375+ tab-width: 8
39376+ fill-column: 80
39377+ scroll-step: 1
39378+ End:
39379+*/
39380diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.20/fs/reiser4/plugin/file/tail_conversion.c
39381--- linux-2.6.20.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
39382+++ linux-2.6.20/fs/reiser4/plugin/file/tail_conversion.c 2007-05-06 14:50:43.787003221 +0400
39383@@ -0,0 +1,729 @@
39384+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39385+
39386+#include "../../inode.h"
39387+#include "../../super.h"
39388+#include "../../page_cache.h"
39389+#include "../../carry.h"
39390+#include "../../safe_link.h"
39391+#include "../../vfs_ops.h"
39392+
39393+#include <linux/writeback.h>
39394+
39395+/* this file contains:
39396+ tail2extent and extent2tail */
39397+
39398+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39399+void get_exclusive_access(unix_file_info_t * uf_info)
39400+{
39401+ assert("nikita-3028", reiser4_schedulable());
39402+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39403+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39404+ /*
39405+ * "deadlock avoidance": sometimes we commit a transaction under
39406+ * rw-semaphore on a file. Such commit can deadlock with another
39407+ * thread that captured some block (hence preventing atom from being
39408+ * committed) and waits on rw-semaphore.
39409+ */
39410+ reiser4_txn_restart_current();
39411+ LOCK_CNT_INC(inode_sem_w);
39412+ down_write(&uf_info->latch);
39413+ uf_info->exclusive_use = 1;
39414+ assert("vs-1713", uf_info->ea_owner == NULL);
39415+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39416+ ON_DEBUG(uf_info->ea_owner = current);
39417+}
39418+
39419+void drop_exclusive_access(unix_file_info_t * uf_info)
39420+{
39421+ assert("vs-1714", uf_info->ea_owner == current);
39422+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39423+ ON_DEBUG(uf_info->ea_owner = NULL);
39424+ uf_info->exclusive_use = 0;
39425+ up_write(&uf_info->latch);
39426+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39427+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39428+ LOCK_CNT_DEC(inode_sem_w);
39429+ reiser4_txn_restart_current();
39430+}
39431+
39432+/**
39433+ * nea_grabbed - bookkeeping done when the file's semaphore is down_read-ed
39434+ * @uf_info:
39435+ *
39436+ * This is called when nonexclusive access is obtained on a file. Everything
39437+ * it does is for debugging purposes.
39438+ */
39439+static void nea_grabbed(unix_file_info_t *uf_info)
39440+{
39441+#if REISER4_DEBUG
39442+ LOCK_CNT_INC(inode_sem_r);
39443+ assert("vs-1716", uf_info->ea_owner == NULL);
39444+ atomic_inc(&uf_info->nr_neas);
39445+ uf_info->last_reader = current;
39446+#endif
39447+}
39448+
39449+/**
39450+ * get_nonexclusive_access - get nonexclusive access to a file
39451+ * @uf_info: unix file specific part of inode to obtain access to
39452+ *
39453+ * Nonexclusive access is obtained on a file before read, write, readpage.
39454+ */
39455+void get_nonexclusive_access(unix_file_info_t *uf_info)
39456+{
39457+ assert("nikita-3029", reiser4_schedulable());
39458+ assert("nikita-3361", get_current_context()->trans->atom == NULL);
39459+
39460+ down_read(&uf_info->latch);
39461+ nea_grabbed(uf_info);
39462+}
39463+
39464+/**
39465+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39466+ * @uf_info: unix file specific part of inode to obtain access to
39467+ *
39468+ * Non-blocking version of nonexclusive access obtaining.
39469+ */
39470+int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39471+{
39472+ int result;
39473+
39474+ result = down_read_trylock(&uf_info->latch);
39475+ if (result)
39476+ nea_grabbed(uf_info);
39477+ return result;
39478+}
39479+
39480+void drop_nonexclusive_access(unix_file_info_t * uf_info)
39481+{
39482+ assert("vs-1718", uf_info->ea_owner == NULL);
39483+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
39484+ ON_DEBUG(atomic_dec(&uf_info->nr_neas));
39485+
39486+ up_read(&uf_info->latch);
39487+
39488+ LOCK_CNT_DEC(inode_sem_r);
39489+ reiser4_txn_restart_current();
39490+}
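+
+/* Editor's note, a sketch only (modelled on the usage described in the
+   comments below): exclusive access brackets a container change, while
+   nonexclusive access brackets ordinary i/o, e.g.:
+
+	get_exclusive_access(uf_info);
+	if (uf_info->container == UF_CONTAINER_TAILS)
+		result = tail2extent(uf_info);
+	drop_exclusive_access(uf_info);
+*/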
39491+
39492+/* part of tail2extent. Cut all items covering @count bytes starting from
39493+ @offset */
39494+/* Audited by: green(2002.06.15) */
39495+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
39496+{
39497+ reiser4_key from, to;
39498+
39499+	/* AUDIT: How about putting an assertion here, which would check
39500+	   that the whole provided range is covered by tail items only? */
39501+ /* key of first byte in the range to be cut */
39502+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39503+
39504+ /* key of last byte in that range */
39505+ to = from;
39506+ set_key_offset(&to, (__u64) (offset + count - 1));
39507+
39508+ /* cut everything between those keys */
39509+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
39510+ inode, 0);
39511+}
39512+
39513+static void release_all_pages(struct page **pages, unsigned nr_pages)
39514+{
39515+ unsigned i;
39516+
39517+ for (i = 0; i < nr_pages; i++) {
39518+ if (pages[i] == NULL) {
39519+ unsigned j;
39520+ for (j = i + 1; j < nr_pages; j++)
39521+ assert("vs-1620", pages[j] == NULL);
39522+ break;
39523+ }
39524+ page_cache_release(pages[i]);
39525+ pages[i] = NULL;
39526+ }
39527+}
39528+
39529+/* part of tail2extent: replace tail items with an extent item. The content of
39530+   the tail items being cut (@count bytes) has already been copied into pages;
39531+   find_or_create_extent() is called to create extents corresponding to those
39532+   pages */
39533+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
39534+{
39535+ int result;
39536+ unsigned i;
39537+ STORE_COUNTERS;
39538+
39539+ if (nr_pages == 0)
39540+ return 0;
39541+
39542+ assert("vs-596", pages[0]);
39543+
39544+ /* cut copied items */
39545+ result = cut_formatting_items(inode, page_offset(pages[0]), count);
39546+ if (result)
39547+ return result;
39548+
39549+ CHECK_COUNTERS;
39550+
39551+	/* put the replacement for the just-removed items into the tree: namely, an extent item */
39552+ for (i = 0; i < nr_pages; i++) {
39553+ result = add_to_page_cache_lru(pages[i], inode->i_mapping,
39554+ pages[i]->index,
39555+ mapping_gfp_mask(inode->
39556+ i_mapping));
39557+ if (result)
39558+ break;
39559+ unlock_page(pages[i]);
39560+ result = find_or_create_extent(pages[i]);
39561+ if (result)
39562+ break;
39563+ SetPageUptodate(pages[i]);
39564+ }
39565+ return result;
39566+}
39567+
39568+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
39569+ * items */
39570+
39571+static int reserve_tail2extent_iteration(struct inode *inode)
39572+{
39573+ reiser4_block_nr unformatted_nodes;
39574+ reiser4_tree *tree;
39575+
39576+ tree = reiser4_tree_by_inode(inode);
39577+
39578+ /* number of unformatted nodes which will be created */
39579+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
39580+
39581+ /*
39582+	 * space required for one iteration of tail->extent conversion:
39583+ *
39584+ * 1. kill N tail items
39585+ *
39586+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
39587+ *
39588+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
39589+ * extents) extent units.
39590+ *
39591+ * 4. drilling to the leaf level by coord_by_key()
39592+ *
39593+ * 5. possible update of stat-data
39594+ *
39595+ */
39596+ grab_space_enable();
39597+ return reiser4_grab_space
39598+ (2 * tree->height +
39599+ TAIL2EXTENT_PAGE_NUM +
39600+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
39601+ 1 + estimate_one_insert_item(tree) +
39602+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39603+}
39604+
39605+/* clear the stat-data flag indicating that the file is being converted */
39606+static int complete_conversion(struct inode *inode)
39607+{
39608+ int result;
39609+
39610+ grab_space_enable();
39611+ result =
39612+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39613+ BA_CAN_COMMIT);
39614+ if (result == 0) {
39615+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
39616+ result = reiser4_update_sd(inode);
39617+ }
39618+ if (result)
39619+ warning("vs-1696", "Failed to clear converting bit of %llu: %i",
39620+ (unsigned long long)get_inode_oid(inode), result);
39621+ return 0;
39622+}
39623+
39624+/**
39625+ * find_start
39626+ * @inode: inode of the file being converted
39627+ * @id: item plugin id of the items still to be converted
39628+ * @offset: in/out: offset to start the search from / where the search stopped
39629+ *
39630+ * this is used by tail2extent and extent2tail to detect where a previous
39631+ * uncompleted conversion stopped
39632+ */
39633+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
39634+{
39635+ int result;
39636+ lock_handle lh;
39637+ coord_t coord;
39638+ unix_file_info_t *ufo;
39639+ int found;
39640+ reiser4_key key;
39641+
39642+ ufo = unix_file_inode_data(inode);
39643+ init_lh(&lh);
39644+ result = 0;
39645+ found = 0;
39646+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
39647+ do {
39648+ init_lh(&lh);
39649+ result = find_file_item_nohint(&coord, &lh, &key,
39650+ ZNODE_READ_LOCK, inode);
39651+
39652+ if (result == CBK_COORD_FOUND) {
39653+ if (coord.between == AT_UNIT) {
39654+ /*coord_clear_iplug(&coord); */
39655+ result = zload(coord.node);
39656+ if (result == 0) {
39657+ if (item_id_by_coord(&coord) == id)
39658+ found = 1;
39659+ else
39660+ item_plugin_by_coord(&coord)->s.
39661+ file.append_key(&coord,
39662+ &key);
39663+ zrelse(coord.node);
39664+ }
39665+ } else
39666+ result = RETERR(-ENOENT);
39667+ }
39668+ done_lh(&lh);
39669+ } while (result == 0 && !found);
39670+ *offset = get_key_offset(&key);
39671+ return result;
39672+}
39673+
39674+/**
39675+ * tail2extent
39676+ * @uf_info: unix-file-specific part of the inode to convert
39677+ *
39678+ * Converts a file built of tail items into one built of extent items.
39679+ */
39680+int tail2extent(unix_file_info_t *uf_info)
39681+{
39682+ int result;
39683+ reiser4_key key; /* key of next byte to be moved to page */
39684+ char *p_data; /* data of page */
39685+ unsigned page_off = 0, /* offset within the page where to copy data */
39686+ count; /* number of bytes of item which can be
39687+ * copied to page */
39688+ struct page *pages[TAIL2EXTENT_PAGE_NUM];
39689+ struct page *page;
39690+ int done; /* set to 1 when all file is read */
39691+ char *item;
39692+ int i;
39693+ struct inode *inode;
39694+ int first_iteration;
39695+ int bytes;
39696+ __u64 offset;
39697+
39698+ assert("nikita-3362", ea_obtained(uf_info));
39699+ inode = unix_file_info_to_inode(uf_info);
39700+ assert("nikita-3412", !IS_RDONLY(inode));
39701+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
39702+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
39703+
39704+ offset = 0;
39705+ first_iteration = 1;
39706+ result = 0;
39707+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
39708+ /*
39709+ * file is marked on disk as there was a conversion which did
39710+ * not complete due to either crash or some error. Find which
39711+ * offset tail conversion stopped at
39712+ */
39713+ result = find_start(inode, FORMATTING_ID, &offset);
39714+ if (result == -ENOENT) {
39715+ /* no tail items found, everything is converted */
39716+ uf_info->container = UF_CONTAINER_EXTENTS;
39717+ complete_conversion(inode);
39718+ return 0;
39719+ } else if (result != 0)
39720+ /* some other error */
39721+ return result;
39722+ first_iteration = 0;
39723+ }
39724+
39725+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
39726+
39727+ /* get key of first byte of a file */
39728+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
39729+
39730+ done = 0;
39731+ while (done == 0) {
39732+ memset(pages, 0, sizeof(pages));
39733+ result = reserve_tail2extent_iteration(inode);
39734+ if (result != 0)
39735+ goto out;
39736+ if (first_iteration) {
39737+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
39738+ reiser4_update_sd(inode);
39739+ first_iteration = 0;
39740+ }
39741+ bytes = 0;
39742+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
39743+ assert("vs-598",
39744+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
39745+ page = alloc_page(reiser4_ctx_gfp_mask_get());
39746+ if (!page) {
39747+ result = RETERR(-ENOMEM);
39748+ goto error;
39749+ }
39750+
39751+ page->index =
39752+ (unsigned long)(get_key_offset(&key) >>
39753+ PAGE_CACHE_SHIFT);
39754+ /*
39755+			 * usually when one is going to longterm lock a znode
39756+			 * (as find_file_item does, for instance) one must not
39757+			 * hold locked pages. However, there is an exception
39758+			 * for the tail2extent case: pages appearing here are
39759+			 * not reachable by anyone else, they are clean and do
39760+			 * not have jnodes attached, so keeping them locked
39761+			 * does not risk a deadlock
39762+ */
39763+ assert("vs-983", !PagePrivate(page));
39764+ reiser4_invalidate_pages(inode->i_mapping, page->index,
39765+ 1, 0);
39766+
39767+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
39768+ coord_t coord;
39769+ lock_handle lh;
39770+
39771+ /* get next item */
39772+ /* FIXME: we might want to readahead here */
39773+ init_lh(&lh);
39774+ result =
39775+ find_file_item_nohint(&coord, &lh, &key,
39776+ ZNODE_READ_LOCK,
39777+ inode);
39778+ if (result != CBK_COORD_FOUND) {
39779+ /*
39780+ * error happened of not items of file
39781+ * were found
39782+ */
39783+ done_lh(&lh);
39784+ page_cache_release(page);
39785+ goto error;
39786+ }
39787+
39788+ if (coord.between == AFTER_UNIT) {
39789+ /*
39790+					 * end of file is reached. Pad the
39791+					 * page with zeros
39792+ */
39793+ done_lh(&lh);
39794+ done = 1;
39795+ p_data = kmap_atomic(page, KM_USER0);
39796+ memset(p_data + page_off, 0,
39797+ PAGE_CACHE_SIZE - page_off);
39798+ kunmap_atomic(p_data, KM_USER0);
39799+ break;
39800+ }
39801+
39802+ result = zload(coord.node);
39803+ if (result) {
39804+ page_cache_release(page);
39805+ done_lh(&lh);
39806+ goto error;
39807+ }
39808+ assert("vs-856", coord.between == AT_UNIT);
39809+ item = ((char *)item_body_by_coord(&coord)) +
39810+ coord.unit_pos;
39811+
39812+ /* how many bytes to copy */
39813+ count =
39814+ item_length_by_coord(&coord) -
39815+ coord.unit_pos;
39816+ /* limit length of copy to end of page */
39817+ if (count > PAGE_CACHE_SIZE - page_off)
39818+ count = PAGE_CACHE_SIZE - page_off;
39819+
39820+ /*
39821+ * copy item (as much as will fit starting from
39822+ * the beginning of the item) into the page
39823+ */
39824+ p_data = kmap_atomic(page, KM_USER0);
39825+ memcpy(p_data + page_off, item, count);
39826+ kunmap_atomic(p_data, KM_USER0);
39827+
39828+ page_off += count;
39829+ bytes += count;
39830+ set_key_offset(&key,
39831+ get_key_offset(&key) + count);
39832+
39833+ zrelse(coord.node);
39834+ done_lh(&lh);
39835+ } /* end of loop which fills one page by content of
39836+ * formatting items */
39837+
39838+ if (page_off) {
39839+ /* something was copied into page */
39840+ pages[i] = page;
39841+ } else {
39842+ page_cache_release(page);
39843+ assert("vs-1648", done == 1);
39844+ break;
39845+ }
39846+ } /* end of loop through pages of one conversion iteration */
39847+
39848+ if (i > 0) {
39849+ result = replace(inode, pages, i, bytes);
39850+ release_all_pages(pages, sizeof_array(pages));
39851+ if (result)
39852+ goto error;
39853+ /*
39854+			 * We have to drop exclusive access to avoid a
39855+			 * deadlock which may happen because capture_unix_file,
39856+			 * called by reiser4_writepages, requires non-exclusive
39857+			 * access to the file. It is safe to drop EA in the
39858+			 * middle of tail2extent conversion because
39859+			 * write_unix_file, setattr_unix_file(truncate),
39860+			 * mmap_unix_file and release_unix_file(extent2tail)
39861+			 * check whether a conversion is in progress (see
39862+			 * comments before get_exclusive_access_careful()).
39863+ * Other processes that acquire non-exclusive access
39864+ * (read_unix_file, reiser4_writepages, etc) should work
39865+ * on partially converted files.
39866+ */
39867+ drop_exclusive_access(uf_info);
39868+ /* throttle the conversion */
39869+ reiser4_throttle_write(inode);
39870+ get_exclusive_access(uf_info);
39871+
39872+ /*
39873+ * nobody is allowed to complete conversion but a
39874+ * process which started it
39875+ */
39876+ assert("", reiser4_inode_get_flag(inode,
39877+ REISER4_PART_MIXED));
39878+ }
39879+ }
39880+
39881+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
39882+
39883+ if (result == 0) {
39884+ /* file is converted to extent items */
39885+ assert("vs-1697", reiser4_inode_get_flag(inode,
39886+ REISER4_PART_MIXED));
39887+
39888+ uf_info->container = UF_CONTAINER_EXTENTS;
39889+ complete_conversion(inode);
39890+ } else {
39891+ /*
39892+		 * conversion is not complete. Inode was already marked as
39893+		 * REISER4_PART_MIXED and the stat-data was updated at the
39894+		 * first iteration of the loop above.
39895+ */
39896+ error:
39897+ release_all_pages(pages, sizeof_array(pages));
39898+ warning("nikita-2282", "Partial conversion of %llu: %i",
39899+ (unsigned long long)get_inode_oid(inode), result);
39900+ }
39901+
39902+ out:
39903+ return result;
39904+}
39905+
39906+static int reserve_extent2tail_iteration(struct inode *inode)
39907+{
39908+ reiser4_tree *tree;
39909+
39910+ tree = reiser4_tree_by_inode(inode);
39911+ /*
39912+ * reserve blocks for (in this order):
39913+ *
39914+ * 1. removal of extent item
39915+ *
39916+ * 2. insertion of tail by insert_flow()
39917+ *
39918+ * 3. drilling to the leaf level by coord_by_key()
39919+ *
39920+ * 4. possible update of stat-data
39921+ */
39922+ grab_space_enable();
39923+ return reiser4_grab_space
39924+ (estimate_one_item_removal(tree) +
39925+ estimate_insert_flow(tree->height) +
39926+ 1 + estimate_one_insert_item(tree) +
39927+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39928+}
39929+
39930+/* for every page of the file: read the page, cut the part of the extent
39931+   pointing to this page, and put the page's data into the tree as tail items */
39932+int extent2tail(unix_file_info_t *uf_info)
39933+{
39934+ int result;
39935+ struct inode *inode;
39936+ struct page *page;
39937+ unsigned long num_pages, i;
39938+ unsigned long start_page;
39939+ reiser4_key from;
39940+ reiser4_key to;
39941+ unsigned count;
39942+ __u64 offset;
39943+
39944+ assert("nikita-3362", ea_obtained(uf_info));
39945+ inode = unix_file_info_to_inode(uf_info);
39946+ assert("nikita-3412", !IS_RDONLY(inode));
39947+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
39948+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
39949+
39950+ offset = 0;
39951+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
39952+ /*
39953+		 * the file is marked on disk to show that a conversion did
39954+		 * not complete due to either a crash or some error. Find the
39955+		 * offset at which the conversion stopped
39956+ */
39957+ result = find_start(inode, EXTENT_POINTER_ID, &offset);
39958+ if (result == -ENOENT) {
39959+ /* no extent found, everything is converted */
39960+ uf_info->container = UF_CONTAINER_TAILS;
39961+ complete_conversion(inode);
39962+ return 0;
39963+ } else if (result != 0)
39964+ /* some other error */
39965+ return result;
39966+ }
39967+
39968+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
39969+
39970+	/* number of pages left to convert */
39971+	num_pages =
39972+	    (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
39973+ start_page = offset >> PAGE_CACHE_SHIFT;
39974+
39975+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39976+ to = from;
39977+
39978+ result = 0;
39979+ for (i = 0; i < num_pages; i++) {
39980+ __u64 start_byte;
39981+
39982+ result = reserve_extent2tail_iteration(inode);
39983+ if (result != 0)
39984+ break;
39985+ if (i == 0 && offset == 0) {
39986+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
39987+ reiser4_update_sd(inode);
39988+ }
39989+
39990+ page = read_mapping_page(inode->i_mapping,
39991+ (unsigned)(i + start_page), NULL);
39992+ if (IS_ERR(page)) {
39993+ result = PTR_ERR(page);
39994+ break;
39995+ }
39996+
39997+ wait_on_page_locked(page);
39998+
39999+ if (!PageUptodate(page)) {
40000+ page_cache_release(page);
40001+ result = RETERR(-EIO);
40002+ break;
40003+ }
40004+
40005+		/* cut the part of the file we have just read */
40006+		start_byte = ((__u64) (i + start_page)) << PAGE_CACHE_SHIFT;
40007+ set_key_offset(&from, start_byte);
40008+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
40009+ /*
40010+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
40011+ * commits during over-long truncates. But
40012+ * extent->tail conversion should be performed in one
40013+ * transaction.
40014+ */
40015+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
40016+ &to, inode, 0);
40017+
40018+ if (result) {
40019+ page_cache_release(page);
40020+ break;
40021+ }
40022+
40023+ /* put page data into tree via tail_write */
40024+ count = PAGE_CACHE_SIZE;
40025+ if ((i == (num_pages - 1)) &&
40026+ (inode->i_size & ~PAGE_CACHE_MASK))
40027+			/* the last page can be incomplete */
40028+ count = (inode->i_size & ~PAGE_CACHE_MASK);
40029+ while (count) {
40030+ struct dentry dentry;
40031+ struct file file;
40032+ loff_t pos;
40033+
40034+			dentry.d_inode = inode;
40035+			file.f_dentry = &dentry;
40036+			file.private_data = NULL;
40037+			file.f_pos = start_byte;
40039+ pos = start_byte;
40040+ result = reiser4_write_tail(&file,
40041+ (char __user *)kmap(page),
40042+ count, &pos);
40043+ reiser4_free_file_fsdata(&file);
40044+ if (result <= 0) {
40045+ warning("", "reiser4_write_tail failed");
40046+ page_cache_release(page);
40047+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40048+ return result;
40049+ }
40050+ count -= result;
40051+ }
40052+
40053+ /* release page */
40054+ lock_page(page);
40055+ /* page is already detached from jnode and mapping. */
40056+ assert("vs-1086", page->mapping == NULL);
40057+ assert("nikita-2690",
40058+ (!PagePrivate(page) && jprivate(page) == 0));
40059+ /* waiting for writeback completion with page lock held is
40060+ * perfectly valid. */
40061+ wait_on_page_writeback(page);
40062+ reiser4_drop_page(page);
40063+		/* release the reference taken by read_mapping_page() above */
40064+ page_cache_release(page);
40065+
40066+ drop_exclusive_access(uf_info);
40067+ /* throttle the conversion */
40068+ reiser4_throttle_write(inode);
40069+ get_exclusive_access(uf_info);
40070+ /*
40071+ * nobody is allowed to complete conversion but a process which
40072+ * started it
40073+ */
40074+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
40075+ }
40076+
40077+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40078+
40079+ if (i == num_pages) {
40080+ /* file is converted to formatted items */
40081+ assert("vs-1698", reiser4_inode_get_flag(inode,
40082+ REISER4_PART_MIXED));
40083+ assert("vs-1260",
40084+ inode_has_no_jnodes(reiser4_inode_data(inode)));
40085+
40086+ uf_info->container = UF_CONTAINER_TAILS;
40087+ complete_conversion(inode);
40088+ return 0;
40089+ }
40090+ /*
40091+	 * conversion is not complete. Inode was already marked as
40092+	 * REISER4_PART_MIXED and the stat-data was updated at the first
40093+	 * iteration of the loop above.
40094+ */
40095+ warning("nikita-2282",
40096+ "Partial conversion of %llu: %lu of %lu: %i",
40097+ (unsigned long long)get_inode_oid(inode), i,
40098+ num_pages, result);
40099+
40100+ return result;
40101+}
40102+
40103+/*
40104+ * Local variables:
40105+ * c-indentation-style: "K&R"
40106+ * mode-name: "LC"
40107+ * c-basic-offset: 8
40108+ * tab-width: 8
40109+ * fill-column: 79
40110+ * scroll-step: 1
40111+ * End:
40112+ */
40113diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_ops.c linux-2.6.20/fs/reiser4/plugin/file_ops.c
40114--- linux-2.6.20.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300
40115+++ linux-2.6.20/fs/reiser4/plugin/file_ops.c 2007-05-06 14:50:43.787003221 +0400
40116@@ -0,0 +1,168 @@
40117+/* Copyright 2005 by Hans Reiser, licensing governed by
40118+ reiser4/README */
40119+
40120+/* this file contains typical implementations for some of the methods of
40121+ struct file_operations and of struct address_space_operations
40122+*/
40123+
40124+#include "../inode.h"
40125+#include "object.h"
40126+
40127+/* file operations */
40128+
40129+/* implementation of vfs's llseek method of struct file_operations for
40130+ typical directory can be found in readdir_common.c
40131+*/
40132+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
40133+
40134+/* implementation of vfs's readdir method of struct file_operations for
40135+ typical directory can be found in readdir_common.c
40136+*/
40137+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
40138+
40139+/**
40140+ * reiser4_release_dir_common - release method of struct file_operations
40141+ * @inode: inode of released file
40142+ * @file: file to release
40143+ *
40144+ * Implementation of release method of struct file_operations for typical
40145+ * directory. All it does is free reiser4-specific file data.
40146+*/
40147+int reiser4_release_dir_common(struct inode *inode, struct file *file)
40148+{
40149+ reiser4_context *ctx;
40150+
40151+ ctx = reiser4_init_context(inode->i_sb);
40152+ if (IS_ERR(ctx))
40153+ return PTR_ERR(ctx);
40154+ reiser4_free_file_fsdata(file);
40155+ reiser4_exit_context(ctx);
40156+ return 0;
40157+}
40158+
40159+/* this is common implementation of vfs's fsync method of struct
40160+ file_operations
40161+*/
40162+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
40163+{
40164+ reiser4_context *ctx;
40165+ int result;
40166+
40167+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
40168+ if (IS_ERR(ctx))
40169+ return PTR_ERR(ctx);
40170+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
40171+
40172+ context_set_commit_async(ctx);
40173+ reiser4_exit_context(ctx);
40174+ return result;
40175+}
40176+
40177+/* this is common implementation of vfs's sendfile method of struct
40178+ file_operations
40179+
40180+ Reads @count bytes from @file and calls @actor for every page read. This is
40181+ needed for loopback device support.
40182+*/
40183+#if 0
40184+ssize_t
40185+sendfile_common(struct file *file, loff_t *ppos, size_t count,
40186+ read_actor_t actor, void *target)
40187+{
40188+ reiser4_context *ctx;
40189+ ssize_t result;
40190+
40191+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
40192+ if (IS_ERR(ctx))
40193+ return PTR_ERR(ctx);
40194+ result = generic_file_sendfile(file, ppos, count, actor, target);
40195+ reiser4_exit_context(ctx);
40196+ return result;
40197+}
40198+#endif /* 0 */
40199+
40200+/* address space operations */
40201+
40202+/* this is common implementation of vfs's prepare_write method of struct
40203+ address_space_operations
40204+*/
40205+int
40206+prepare_write_common(struct file *file, struct page *page, unsigned from,
40207+ unsigned to)
40208+{
40209+ reiser4_context *ctx;
40210+ int result;
40211+
40212+	ctx = reiser4_init_context(page->mapping->host->i_sb);
40213+	if (IS_ERR(ctx))
40214+		return PTR_ERR(ctx);
40215+	result = do_prepare_write(file, page, from, to);
40214+
40215+ /* don't commit transaction under inode semaphore */
40216+ context_set_commit_async(ctx);
40217+ reiser4_exit_context(ctx);
40218+
40219+ return result;
40220+}
40221+
40222+/* this is a helper for prepare_write_common and prepare_write_unix_file */
40224+int
40225+do_prepare_write(struct file *file, struct page *page, unsigned from,
40226+ unsigned to)
40227+{
40228+ int result;
40229+ file_plugin *fplug;
40230+ struct inode *inode;
40231+
40232+ assert("umka-3099", file != NULL);
40233+ assert("umka-3100", page != NULL);
40234+ assert("umka-3095", PageLocked(page));
40235+
40236+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40237+ return 0;
40238+
40239+ inode = page->mapping->host;
40240+ fplug = inode_file_plugin(inode);
40241+
40242+ if (page->mapping->a_ops->readpage == NULL)
40243+ return RETERR(-EINVAL);
40244+
40245+ result = page->mapping->a_ops->readpage(file, page);
40246+ if (result != 0) {
40247+ SetPageError(page);
40248+ ClearPageUptodate(page);
40249+ /* All reiser4 readpage() implementations should return the
40250+ * page locked in case of error. */
40251+ assert("nikita-3472", PageLocked(page));
40252+ } else {
40253+ /*
40254+ * ->readpage() either:
40255+ *
40256+ * 1. starts IO against @page. @page is locked for IO in
40257+ * this case.
40258+ *
40259+ * 2. doesn't start IO. @page is unlocked.
40260+ *
40261+ * In either case, page should be locked.
40262+ */
40263+ lock_page(page);
40264+ /*
40265+ * IO (if any) is completed at this point. Check for IO
40266+ * errors.
40267+ */
40268+ if (!PageUptodate(page))
40269+ result = RETERR(-EIO);
40270+ }
40271+ assert("umka-3098", PageLocked(page));
40272+ return result;
40273+}
40274+
40275+/*
40276+ * Local variables:
40277+ * c-indentation-style: "K&R"
40278+ * mode-name: "LC"
40279+ * c-basic-offset: 8
40280+ * tab-width: 8
40281+ * fill-column: 79
40282+ * scroll-step: 1
40283+ * End:
40284+ */
40285diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.20/fs/reiser4/plugin/file_ops_readdir.c
40286--- linux-2.6.20.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300
40287+++ linux-2.6.20/fs/reiser4/plugin/file_ops_readdir.c 2007-05-06 14:50:43.791004471 +0400
40288@@ -0,0 +1,657 @@
40289+/* Copyright 2005 by Hans Reiser, licensing governed by
40290+ * reiser4/README */
40291+
40292+#include "../inode.h"
40293+
40294+/* return true iff @coord points to a valid directory item that is part of
40295+ * the @inode directory. */
40296+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40297+{
40298+ return plugin_of_group(item_plugin_by_coord(coord),
40299+ DIR_ENTRY_ITEM_TYPE) &&
40300+ inode_file_plugin(inode)->owns_item(inode, coord);
40301+}
40302+
40303+/* compare two logical positions within the same directory */
40304+static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40305+{
40306+ cmp_t result;
40307+
40308+ assert("nikita-2534", p1 != NULL);
40309+ assert("nikita-2535", p2 != NULL);
40310+
40311+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40312+ if (result == EQUAL_TO) {
40313+ int diff;
40314+
40315+ diff = p1->pos - p2->pos;
40316+ result =
40317+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40318+ }
40319+ return result;
40320+}
40321+
40322+/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
40323+ * necessary. */
40324+static void
40325+adjust_dir_pos(struct file *dir,
40326+ readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40327+{
40328+ dir_pos *pos;
40329+
40330+ /*
40331+ * new directory entry was added (adj == +1) or removed (adj == -1) at
40332+ * the @mod_point. Directory file descriptor @dir is doing readdir and
40333+	 * is currently positioned at @readdir_spot. The latter has to be
40334+	 * updated to maintain stable readdir.
40335+ */
40336+ /* directory is positioned to the beginning. */
40337+ if (readdir_spot->entry_no == 0)
40338+ return;
40339+
40340+ pos = &readdir_spot->position;
40341+ switch (dir_pos_cmp(mod_point, pos)) {
40342+ case LESS_THAN:
40343+		/* @mod_point is _before_ @readdir_spot, that is, entry was
40344+ * added/removed on the left (in key order) of current
40345+ * position. */
40346+ /* logical number of directory entry readdir is "looking" at
40347+ * changes */
40348+ readdir_spot->entry_no += adj;
40349+ assert("nikita-2577",
40350+ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
40351+ if (de_id_cmp(&pos->dir_entry_key,
40352+ &mod_point->dir_entry_key) == EQUAL_TO) {
40353+ assert("nikita-2575", mod_point->pos < pos->pos);
40354+ /*
40355+ * if entry added/removed has the same key as current
40356+ * for readdir, update counter of duplicate keys in
40357+ * @readdir_spot.
40358+ */
40359+ pos->pos += adj;
40360+ }
40361+ break;
40362+ case GREATER_THAN:
40363+ /* directory is modified after @pos: nothing to do. */
40364+ break;
40365+ case EQUAL_TO:
40366+ /* cannot insert an entry readdir is looking at, because it
40367+ already exists. */
40368+ assert("nikita-2576", adj < 0);
40369+		/* the directory entry which @pos points to is being
40370+		   removed.
40371+
40372+		   NOTE-NIKITA: The right thing to do is to update @pos to
40373+		   point to the next entry. This is complex (we are under
40374+		   spin-lock for one thing). Just rewind it to the beginning;
40375+		   the next readdir will have to scan the beginning of the
40376+		   directory. The proper solution is to use a semaphore
40377+		   instead of the spin lock and use rewind_right() here.
40378+
40379+		   NOTE-NIKITA: now a semaphore is used, so...
40380+		 */
40381+ memset(readdir_spot, 0, sizeof *readdir_spot);
40382+ }
40383+}
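
To make the LESS_THAN branch concrete, here is a toy userspace model
(illustration only: a single int stands in for the de_id key, and only the
"entry added/removed to the left" case is modelled):

	#include <assert.h>

	struct toy_pos { int key; int dup; };
	struct toy_spot { struct toy_pos position; long entry_no; };

	/* mirrors the LESS_THAN branch of adjust_dir_pos() */
	static void toy_adjust(struct toy_spot *spot, const struct toy_pos *mod,
			       int adj)
	{
		if (spot->entry_no == 0)
			return;
		if (mod->key < spot->position.key ||
		    (mod->key == spot->position.key &&
		     mod->dup < spot->position.dup))
			spot->entry_no += adj; /* modification was to our left */
	}

	int main(void)
	{
		struct toy_spot spot = { { 50, 0 }, 7 };
		struct toy_pos ins = { 10, 0 };

		toy_adjust(&spot, &ins, +1); /* a name was added before us... */
		assert(spot.entry_no == 8);  /* ...so our ordinal shifts right */
		return 0;
	}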
40384+
40385+/* scan all file descriptors for this directory and adjust their
40386+   positions accordingly. Should be used by implementations of
40387+   add_entry and rem_entry of the dir plugin */
40388+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
40389+ int offset, int adj)
40390+{
40391+ reiser4_file_fsdata *scan;
40392+ dir_pos mod_point;
40393+
40394+ assert("nikita-2536", dir != NULL);
40395+ assert("nikita-2538", de != NULL);
40396+ assert("nikita-2539", adj != 0);
40397+
40398+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40399+ mod_point.pos = offset;
40400+
40401+ spin_lock_inode(dir);
40402+
40403+ /*
40404+ * new entry was added/removed in directory @dir. Scan all file
40405+ * descriptors for @dir that are currently involved into @readdir and
40406+ * update them.
40407+ */
40408+
40409+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40410+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40411+
40412+ spin_unlock_inode(dir);
40413+}
40414+
40415+/*
40416+ * traverse tree to start/continue readdir from the readdir position @pos.
40417+ */
40418+static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40419+{
40420+ reiser4_key key;
40421+ int result;
40422+ struct inode *inode;
40423+
40424+ assert("nikita-2554", pos != NULL);
40425+
40426+ inode = dir->f_dentry->d_inode;
40427+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40428+ if (result != 0)
40429+ return result;
40430+ result = reiser4_object_lookup(inode,
40431+ &key,
40432+ tap->coord,
40433+ tap->lh,
40434+ tap->mode,
40435+ FIND_EXACT,
40436+ LEAF_LEVEL, LEAF_LEVEL,
40437+ 0, &tap->ra_info);
40438+ if (result == CBK_COORD_FOUND)
40439+ result = rewind_right(tap, (int)pos->position.pos);
40440+ else {
40441+ tap->coord->node = NULL;
40442+ done_lh(tap->lh);
40443+ result = RETERR(-EIO);
40444+ }
40445+ return result;
40446+}
40447+
40448+/*
40449+ * handling of non-unique keys: calculate the ordinal position of @pos
40450+ * within the sequence of directory items with identical keys.
40451+ */
40452+static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40453+{
40454+ int result;
40455+ coord_t coord;
40456+ lock_handle lh;
40457+ tap_t scan;
40458+ de_id *did;
40459+ reiser4_key de_key;
40460+
40461+ coord_init_zero(&coord);
40462+ init_lh(&lh);
40463+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40464+ reiser4_tap_copy(&scan, tap);
40465+ reiser4_tap_load(&scan);
40466+ pos->position.pos = 0;
40467+
40468+ did = &pos->position.dir_entry_key;
40469+
40470+ if (is_valid_dir_coord(inode, scan.coord)) {
40471+
40472+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40473+
40474+ while (1) {
40475+
40476+ result = go_prev_unit(&scan);
40477+ if (result != 0)
40478+ break;
40479+
40480+ if (!is_valid_dir_coord(inode, scan.coord)) {
40481+ result = -EINVAL;
40482+ break;
40483+ }
40484+
40485+ /* get key of directory entry */
40486+ unit_key_by_coord(scan.coord, &de_key);
40487+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
40488+ /* duplicate-sequence is over */
40489+ break;
40490+ }
40491+ pos->position.pos++;
40492+ }
40493+ } else
40494+ result = RETERR(-ENOENT);
40495+ reiser4_tap_relse(&scan);
40496+ reiser4_tap_done(&scan);
40497+ return result;
40498+}
40499+
40500+/*
40501+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
40502+ */
40503+static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
40504+{
40505+ __u64 destination;
40506+ __s64 shift;
40507+ int result;
40508+ struct inode *inode;
40509+ loff_t dirpos;
40510+
40511+ assert("nikita-2553", dir != NULL);
40512+ assert("nikita-2548", pos != NULL);
40513+ assert("nikita-2551", tap->coord != NULL);
40514+ assert("nikita-2552", tap->lh != NULL);
40515+
40516+ dirpos = reiser4_get_dir_fpos(dir);
40517+ shift = dirpos - pos->fpos;
40518+ /* this is logical directory entry within @dir which we are rewinding
40519+ * to */
40520+ destination = pos->entry_no + shift;
40521+
40522+ inode = dir->f_dentry->d_inode;
40523+ if (dirpos < 0)
40524+ return RETERR(-EINVAL);
40525+ else if (destination == 0ll || dirpos == 0) {
40526+ /* rewind to the beginning of directory */
40527+ memset(pos, 0, sizeof *pos);
40528+ return dir_go_to(dir, pos, tap);
40529+ } else if (destination >= inode->i_size)
40530+ return RETERR(-ENOENT);
40531+
40532+ if (shift < 0) {
40533+ /* I am afraid of negative numbers */
40534+ shift = -shift;
40535+ /* rewinding to the left */
40536+ if (shift <= (int)pos->position.pos) {
40537+ /* destination is within sequence of entries with
40538+ duplicate keys. */
40539+ result = dir_go_to(dir, pos, tap);
40540+ } else {
40541+ shift -= pos->position.pos;
40542+ while (1) {
40543+ /* repetitions: deadlock is possible when
40544+ going to the left. */
40545+ result = dir_go_to(dir, pos, tap);
40546+ if (result == 0) {
40547+ result = rewind_left(tap, shift);
40548+ if (result == -E_DEADLOCK) {
40549+ reiser4_tap_done(tap);
40550+ continue;
40551+ }
40552+ }
40553+ break;
40554+ }
40555+ }
40556+ } else {
40557+ /* rewinding to the right */
40558+ result = dir_go_to(dir, pos, tap);
40559+ if (result == 0)
40560+ result = rewind_right(tap, shift);
40561+ }
40562+ if (result == 0) {
40563+ result = set_pos(inode, pos, tap);
40564+ if (result == 0) {
40565+ /* update pos->position.pos */
40566+ pos->entry_no = destination;
40567+ pos->fpos = dirpos;
40568+ }
40569+ }
40570+ return result;
40571+}
40572+
40573+/*
40574+ * Function that is called by reiser4_readdir_common() on each directory
40575+ * entry while doing readdir. The ->filldir callback may block, so we have to
40576+ * release the long-term lock while calling it. To avoid repeating the tree
40577+ * traversal, a seal is used. If the seal is broken, we return -E_REPEAT; the
40578+ * node is unlocked in this case.
40579+ *
40580+ * Whether the node is unlocked in case of any other error is undefined. It
40581+ * is guaranteed to be still locked if success (0) is returned.
40582+ *
40583+ * When ->filldir() wants no more, feed_entry() returns 1, and the node is unlocked.
40584+ */
40585+static int
40586+feed_entry(struct file *f,
40587+ readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
40588+{
40589+ item_plugin *iplug;
40590+ char *name;
40591+ reiser4_key sd_key;
40592+ int result;
40593+ char buf[DE_NAME_BUF_LEN];
40594+ char name_buf[32];
40595+ char *local_name;
40596+ unsigned file_type;
40597+ seal_t seal;
40598+ coord_t *coord;
40599+ reiser4_key entry_key;
40600+
40601+ coord = tap->coord;
40602+ iplug = item_plugin_by_coord(coord);
40603+
40604+ /* pointer to name within the node */
40605+ name = iplug->s.dir.extract_name(coord, buf);
40606+ assert("nikita-1371", name != NULL);
40607+
40608+ /* key of object the entry points to */
40609+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
40610+ return RETERR(-EIO);
40611+
40612+	/* we must release the long-term znode lock before calling filldir to
40613+	   avoid a deadlock which may happen if filldir causes a page fault.
40614+	   So, copy the name to an intermediate buffer */
40615+ if (strlen(name) + 1 > sizeof(name_buf)) {
40616+ local_name = kmalloc(strlen(name) + 1,
40617+ reiser4_ctx_gfp_mask_get());
40618+ if (local_name == NULL)
40619+ return RETERR(-ENOMEM);
40620+ } else
40621+ local_name = name_buf;
40622+
40623+ strcpy(local_name, name);
40624+ file_type = iplug->s.dir.extract_file_type(coord);
40625+
40626+ unit_key_by_coord(coord, &entry_key);
40627+ reiser4_seal_init(&seal, coord, &entry_key);
40628+
40629+ longterm_unlock_znode(tap->lh);
40630+
40631+ /*
40632+ * send information about directory entry to the ->filldir() filler
40633+ * supplied to us by caller (VFS).
40634+ *
40635+ * ->filldir is entitled to do weird things. For example, ->filldir
40636+ * supplied by knfsd re-enters file system. Make sure no locks are
40637+ * held.
40638+ */
40639+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
40640+
40641+ reiser4_txn_restart_current();
40642+ result = filldir(dirent, name, (int)strlen(name),
40643+ /* offset of this entry */
40644+ f->f_pos,
40645+			 /* inode number of the object bound by this entry */
40646+ oid_to_uino(get_key_objectid(&sd_key)), file_type);
40647+ if (local_name != name_buf)
40648+ kfree(local_name);
40649+ if (result < 0)
40650+ /* ->filldir() is satisfied. (no space in buffer, IOW) */
40651+ result = 1;
40652+ else
40653+ result = reiser4_seal_validate(&seal, coord, &entry_key,
40654+ tap->lh, tap->mode,
40655+ ZNODE_LOCK_HIPRI);
40656+ return result;
40657+}
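
The seal used above is essentially an optimistic version stamp on a tree
node. A self-contained userspace analogue of the protocol feed_entry()
follows (illustration only, not the reiser4 API):

	#include <assert.h>

	/* node with a version counter, bumped on every modification */
	struct node { int data; unsigned long version; };
	struct seal { unsigned long version; };

	static void seal_init(struct seal *s, const struct node *n)
	{
		s->version = n->version; /* remember state before unlocking */
	}

	static int seal_validate(const struct seal *s, const struct node *n)
	{
		/* 0: node unchanged, keep going; -1: caller must re-traverse,
		 * mirroring -E_REPEAT in feed_entry() */
		return s->version == n->version ? 0 : -1;
	}

	int main(void)
	{
		struct node n = { 1, 0 };
		struct seal s;

		seal_init(&s, &n);
		/* ...lock dropped, blocking ->filldir() runs here... */
		n.version++; /* somebody modified the node meanwhile */
		assert(seal_validate(&s, &n) == -1);
		return 0;
	}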
40658+
40659+static void move_entry(readdir_pos * pos, coord_t * coord)
40660+{
40661+ reiser4_key de_key;
40662+ de_id *did;
40663+
40664+ /* update @pos */
40665+ ++pos->entry_no;
40666+ did = &pos->position.dir_entry_key;
40667+
40668+ /* get key of directory entry */
40669+ unit_key_by_coord(coord, &de_key);
40670+
40671+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
40672+ /* we are within sequence of directory entries
40673+ with duplicate keys. */
40674+ ++pos->position.pos;
40675+ else {
40676+ pos->position.pos = 0;
40677+ build_de_id_by_key(&de_key, did);
40678+ }
40679+ ++pos->fpos;
40680+}
40681+
40682+/*
40683+ * STATELESS READDIR
40684+ *
40685+ * readdir support in reiser4 relies on the ability to update readdir_pos
40686+ * embedded into reiser4_file_fsdata on each directory modification (name
40687+ * insertion and removal); see reiser4_readdir_common() below. This obviously
40688+ * doesn't work when reiser4 is accessed over NFS, because NFS doesn't keep
40689+ * any state across client READDIR requests for the same directory.
40690+ *
40691+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
40692+ * (d_cursor). Whenever an NFS readdir request comes in, we detect this and
40693+ * try to find the detached reiser4_file_fsdata corresponding to the previous
40694+ * readdir request. In other words, additional state is maintained on the
40695+ * server. (This is somewhat contrary to the design goals of the NFS
40696+ * protocol.)
40697+ *
40698+ * To efficiently detect when our ->readdir() method is called by the NFS
40699+ * server, the dentry is marked as "stateless" in reiser4_decode_fh() (this
40700+ * is checked by the file_is_stateless() function).
40701+ *
40702+ * To find a d_cursor in the pool, we encode the client id (cid) in the
40703+ * highest bits of the NFS readdir cookie: the first readdir request for a
40704+ * given directory from a given client arrives with cookie 0. This case is
40705+ * detected, the global cid_counter is incremented, and its value is stored
40706+ * in the highest bits of all direntry offsets returned to the client,
40707+ * including the last one. Since the only valid readdir cookie is one
40708+ * obtained as direntry->offset, the next readdir request (continuing the
40709+ * current one) is guaranteed to carry the current cid in the highest bits
40710+ * of its starting cookie. All d_cursors are hashed by (oid, cid) into a
40711+ * per-super-block hash table.
40712+ *
40713+ * In addition, d_cursors are placed into a per-super-block radix tree keyed
40714+ * by oid alone, so that they can be removed efficiently during rmdir.
40715+ *
40716+ * Finally, currently unused d_cursors are linked into a special list, which
40717+ * is used by d_cursor_shrink to reclaim d_cursors under memory pressure.
40718+ */
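
A sketch of the cookie packing described above (illustration only: the 16/48
bit split is an assumption made for the example, not necessarily the split
the d_cursor code actually uses):

	#include <assert.h>

	typedef unsigned long long u64;

	#define CID_SHIFT 48
	#define CID_MASK  0xffffULL

	/* stamp the client id into the highest bits of a readdir cookie */
	static u64 pack_cookie(u64 cid, u64 pos)
	{
		return (cid << CID_SHIFT) | (pos & ((1ULL << CID_SHIFT) - 1));
	}

	/* recover the client id from a continuation cookie */
	static u64 cookie_cid(u64 cookie)
	{
		return (cookie >> CID_SHIFT) & CID_MASK;
	}

	int main(void)
	{
		/* first request arrives with cookie 0: allocate a fresh cid
		 * (say 7) and stamp it into every offset returned */
		u64 cookie = pack_cookie(7, 1234);

		/* continuation request: the cid locates the d_cursor */
		assert(cookie_cid(cookie) == 7);
		return 0;
	}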
40719+
40720+/*
40721+ * prepare for readdir.
40722+ */
40723+static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
40724+{
40725+ struct inode *inode;
40726+ reiser4_file_fsdata *fsdata;
40727+ int result;
40728+
40729+ assert("nikita-1359", f != NULL);
40730+ inode = f->f_dentry->d_inode;
40731+ assert("nikita-1360", inode != NULL);
40732+
40733+ if (!S_ISDIR(inode->i_mode))
40734+ return RETERR(-ENOTDIR);
40735+
40736+ /* try to find detached readdir state */
40737+ result = reiser4_attach_fsdata(f, inode);
40738+ if (result != 0)
40739+ return result;
40740+
40741+ fsdata = reiser4_get_file_fsdata(f);
40742+ assert("nikita-2571", fsdata != NULL);
40743+ if (IS_ERR(fsdata))
40744+ return PTR_ERR(fsdata);
40745+
40746+ /* add file descriptor to the readdir list hanging of directory
40747+ * inode. This list is used to scan "readdirs-in-progress" while
40748+ * inserting or removing names in the directory. */
40749+ spin_lock_inode(inode);
40750+ if (list_empty_careful(&fsdata->dir.linkage))
40751+ list_add(&fsdata->dir.linkage, get_readdir_list(inode));
40752+ *pos = &fsdata->dir.readdir;
40753+ spin_unlock_inode(inode);
40754+
40755+ /* move @tap to the current position */
40756+ return dir_rewind(f, *pos, tap);
40757+}
40758+
40759+/* this is an implementation of vfs's llseek method of struct
40760+   file_operations for a typical directory.
40761+   See the comment before reiser4_readdir_common() for an explanation.
40762+*/
40763+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
40764+{
40765+ reiser4_context *ctx;
40766+ loff_t result;
40767+ struct inode *inode;
40768+
40769+ inode = file->f_dentry->d_inode;
40770+
40771+ ctx = reiser4_init_context(inode->i_sb);
40772+ if (IS_ERR(ctx))
40773+ return PTR_ERR(ctx);
40774+
40775+ mutex_lock(&inode->i_mutex);
40776+
40777+ /* update ->f_pos */
40778+ result = default_llseek(file, off, origin);
40779+ if (result >= 0) {
40780+ int ff;
40781+ coord_t coord;
40782+ lock_handle lh;
40783+ tap_t tap;
40784+ readdir_pos *pos;
40785+
40786+ coord_init_zero(&coord);
40787+ init_lh(&lh);
40788+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40789+
40790+ ff = dir_readdir_init(file, &tap, &pos);
40791+ reiser4_detach_fsdata(file);
40792+ if (ff != 0)
40793+ result = (loff_t) ff;
40794+ reiser4_tap_done(&tap);
40795+ }
40796+ reiser4_detach_fsdata(file);
40797+ mutex_unlock(&inode->i_mutex);
40798+
40799+ reiser4_exit_context(ctx);
40800+ return result;
40801+}
40802+
40803+/* this is the common implementation of vfs's readdir method of struct
40804+   file_operations
40805+
40806+   readdir problems:
40807+
40808+   The readdir(2)/getdents(2) interface is based on the implicit assumption
40809+   that readdir can be restarted from any particular point by supplying the
40810+   file system with an off_t-full of data. That is, the file system fills the
40811+   ->d_off field in struct dirent, and the user later passes ->d_off to
40812+   seekdir(3), which glibc actually implements as lseek(2) on the directory.
40813+
40814+   Reiser4 cannot restart readdir from 64 bits of data, because the two last
40815+   components of the key of a directory entry are unknown; keys are 128 bits
40816+   wide. The locality and type fields in the key of a directory entry are
40817+   always known, but to start readdir() from a given point the objectid and
40818+   offset fields have to be filled in as well.
40819+
40820+   The traditional UNIX API for scanning through a directory
40821+   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
40822+   the assumption that a directory is structured very much like a regular
40823+   file. In particular, it is implied that each name within a given directory
40824+   (directory entry) can be uniquely identified by a scalar offset and that
40825+   such an offset is stable across the life-time of the name it identifies.
40826+
40827+   This is manifestly not so for reiser4. In reiser4 the only stable unique
40828+   identifier for a directory entry is its key, which doesn't fit into the
40829+   seekdir/telldir API.
40830+
40831+   solution:
40832+
40833+   Within each file descriptor participating in readdir-ing of a directory, a
40834+   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
40835+   the "current" directory entry that the file descriptor looks at. It holds
40836+   the key of that directory entry (plus some additional info to deal with
40837+   non-unique keys that we won't dwell on here) and its logical position from
40838+   the beginning of the directory, that is, the ordinal number of this entry
40839+   in the readdir order.
40840+
40841+   Obviously this logical position is not stable in the face of directory
40842+   modifications. To work around this, on each addition or removal of a
40843+   directory entry all file descriptors for the directory inode are scanned
40844+   and their readdir_pos updated accordingly (adjust_dir_pos()).
40845+*/
40846+int reiser4_readdir_common(struct file *f /* directory file being read */,
40847+ void *dirent /* opaque data passed to us by VFS */,
40848+ filldir_t filld /* filler function passed to us
40849+ * by VFS */)
40850+{
40851+ reiser4_context *ctx;
40852+ int result;
40853+ struct inode *inode;
40854+ coord_t coord;
40855+ lock_handle lh;
40856+ tap_t tap;
40857+ readdir_pos *pos;
40858+
40859+ assert("nikita-1359", f != NULL);
40860+ inode = f->f_dentry->d_inode;
40861+ assert("nikita-1360", inode != NULL);
40862+
40863+ if (!S_ISDIR(inode->i_mode))
40864+ return RETERR(-ENOTDIR);
40865+
40866+ ctx = reiser4_init_context(inode->i_sb);
40867+ if (IS_ERR(ctx))
40868+ return PTR_ERR(ctx);
40869+
40870+ coord_init_zero(&coord);
40871+ init_lh(&lh);
40872+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
40873+
40874+ reiser4_readdir_readahead_init(inode, &tap);
40875+
40876+ repeat:
40877+ result = dir_readdir_init(f, &tap, &pos);
40878+ if (result == 0) {
40879+ result = reiser4_tap_load(&tap);
40880+ /* scan entries one by one feeding them to @filld */
40881+ while (result == 0) {
40882+ coord_t *coord;
40883+
40884+ coord = tap.coord;
40885+ assert("nikita-2572", coord_is_existing_unit(coord));
40886+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
40887+
40888+ result = feed_entry(f, pos, &tap, filld, dirent);
40889+ if (result > 0) {
40890+ break;
40891+ } else if (result == 0) {
40892+ ++f->f_pos;
40893+ result = go_next_unit(&tap);
40894+ if (result == -E_NO_NEIGHBOR ||
40895+ result == -ENOENT) {
40896+ result = 0;
40897+ break;
40898+ } else if (result == 0) {
40899+ if (is_valid_dir_coord(inode, coord))
40900+ move_entry(pos, coord);
40901+ else
40902+ break;
40903+ }
40904+ } else if (result == -E_REPEAT) {
40905+ /* feed_entry() had to restart. */
40906+ ++f->f_pos;
40907+ reiser4_tap_relse(&tap);
40908+ goto repeat;
40909+ } else
40910+ warning("vs-1617",
40911+ "reiser4_readdir_common: unexpected error %d",
40912+ result);
40913+ }
40914+ reiser4_tap_relse(&tap);
40915+
40916+ if (result >= 0)
40917+ f->f_version = inode->i_version;
40918+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
40919+ result = 0;
40920+ reiser4_tap_done(&tap);
40921+ reiser4_detach_fsdata(f);
40922+
40923+ /* try to update directory's atime */
40924+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
40925+ BA_CAN_COMMIT) != 0)
40926+ warning("", "failed to update atime on readdir: %llu",
40927+ get_inode_oid(inode));
40928+ else
40929+ file_accessed(f);
40930+
40931+ context_set_commit_async(ctx);
40932+ reiser4_exit_context(ctx);
40933+
40934+ return (result <= 0) ? result : 0;
40935+}
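
For reference, a minimal filldir_t callback of the kind the VFS hands in as
@filld (sketch against the 2.6.20-era callback signature; a real one, such as
filldir() in fs/readdir.c, copies each entry into a user-space buffer):

	struct count_buf {
		int entries;
	};

	/* counts directory entries instead of copying them anywhere */
	static int count_filldir(void *dirent, const char *name, int namlen,
				 loff_t offset, u64 ino, unsigned int d_type)
	{
		struct count_buf *buf = dirent;

		buf->entries++;
		/* a negative return would make feed_entry() treat the buffer
		 * as full and stop the scan */
		return 0;
	}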
40936+
40937+/*
40938+ * Local variables:
40939+ * c-indentation-style: "K&R"
40940+ * mode-name: "LC"
40941+ * c-basic-offset: 8
40942+ * tab-width: 8
40943+ * fill-column: 79
40944+ * End:
40945+ */
40946diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.20/fs/reiser4/plugin/file_plugin_common.c
40947--- linux-2.6.20.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
40948+++ linux-2.6.20/fs/reiser4/plugin/file_plugin_common.c 2007-05-06 14:50:43.791004471 +0400
40949@@ -0,0 +1,1007 @@
40950+/* Copyright 2005 by Hans Reiser, licensing governed by
40951+ reiser4/README */
40952+
40953+/* this file contains typical implementations for most of the methods of the
40954+   file plugin
40955+*/
40956+
40957+#include "../inode.h"
40958+#include "object.h"
40959+#include "../safe_link.h"
40960+
40961+#include <linux/quotaops.h>
40962+
40963+static int insert_new_sd(struct inode *inode);
40964+static int update_sd(struct inode *inode);
40965+
40966+/* this is the common implementation of the write_sd_by_inode method of the
40967+   file plugin: either insert stat data or update it
40968+ */
40969+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
40970+{
40971+ int result;
40972+
40973+ assert("nikita-730", inode != NULL);
40974+
40975+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
40976+ /* object doesn't have stat-data yet */
40977+ result = insert_new_sd(inode);
40978+ else
40979+ result = update_sd(inode);
40980+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
40981+ /* Don't issue warnings about "name is too long" */
40982+ warning("nikita-2221", "Failed to save sd for %llu: %i",
40983+ (unsigned long long)get_inode_oid(inode), result);
40984+ return result;
40985+}
40986+
40987+/* this is common implementation of key_by_inode method of file plugin
40988+ */
40989+int
40990+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
40991+ reiser4_key * key)
40992+{
40993+ reiser4_key_init(key);
40994+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
40995+ set_key_ordering(key, get_inode_ordering(inode));
40996+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
40997+ set_key_type(key, KEY_BODY_MINOR);
40998+ set_key_offset(key, (__u64) off);
40999+ return 0;
41000+}
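
A hypothetical caller, to show what the assembled key addresses (sketch; the
offset selects the byte of the file body the key points at):

	/* illustration only: build the key for byte 4096 of @inode's body;
	 * the result can be passed to coord_by_key() to locate the item
	 * covering that offset */
	static int example_body_key(struct inode *inode, reiser4_key *key)
	{
		return key_by_inode_and_offset_common(inode, (loff_t) 4096,
						      key);
	}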
41001+
41002+/* this is common implementation of set_plug_in_inode method of file plugin
41003+ */
41004+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
41005+ struct inode *parent /* parent object */ ,
41006+ reiser4_object_create_data * data /* creational
41007+ * data */ )
41008+{
41009+ __u64 mask;
41010+
41011+ object->i_mode = data->mode;
41012+ /* this should be plugin decision */
41013+ object->i_uid = current->fsuid;
41014+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
41015+
41016+ /* support for BSD style group-id assignment. See mount's manual page
41017+	   description of the bsdgroups ext2 mount option for more details */
41018+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
41019+ object->i_gid = parent->i_gid;
41020+ else if (parent->i_mode & S_ISGID) {
41021+		/* parent directory has the sgid bit set */
41022+ object->i_gid = parent->i_gid;
41023+ if (S_ISDIR(object->i_mode))
41024+			/* sgid is inherited by sub-directories */
41025+ object->i_mode |= S_ISGID;
41026+ } else
41027+ object->i_gid = current->fsgid;
41028+
41029+ /* this object doesn't have stat-data yet */
41030+ reiser4_inode_set_flag(object, REISER4_NO_SD);
41031+#if 0
41032+ /* this is now called after all inode plugins are initialized:
41033+ do_create_vfs_child after adjust_to_parent */
41034+ /* setup inode and file-operations for this inode */
41035+ setup_inode_ops(object, data);
41036+#endif
41037+ object->i_nlink = 0;
41038+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41039+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41040+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41041+ mask |= (1 << LARGE_TIMES_STAT);
41042+
41043+ reiser4_inode_data(object)->extmask = mask;
41044+ return 0;
41045+}
41046+
41047+/* this is common implementation of adjust_to_parent method of file plugin for
41048+ regular files
41049+ */
41050+int adjust_to_parent_common(struct inode *object /* new object */ ,
41051+ struct inode *parent /* parent directory */ ,
41052+ struct inode *root /* root directory */ )
41053+{
41054+ assert("nikita-2165", object != NULL);
41055+ if (parent == NULL)
41056+ parent = root;
41057+ assert("nikita-2069", parent != NULL);
41058+
41059+ /*
41060+ * inherit missing plugins from parent
41061+ */
41062+
41063+ grab_plugin_pset(object, parent, PSET_FILE);
41064+ grab_plugin_pset(object, parent, PSET_SD);
41065+ grab_plugin_pset(object, parent, PSET_FORMATTING);
41066+ grab_plugin_pset(object, parent, PSET_PERM);
41067+ return 0;
41068+}
41069+
41070+/* this is common implementation of adjust_to_parent method of file plugin for
41071+ typical directories
41072+ */
41073+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41074+ struct inode *parent /* parent directory */ ,
41075+ struct inode *root /* root directory */ )
41076+{
41077+ int result = 0;
41078+ pset_member memb;
41079+
41080+ assert("nikita-2166", object != NULL);
41081+ if (parent == NULL)
41082+ parent = root;
41083+ assert("nikita-2167", parent != NULL);
41084+
41085+ /*
41086+ * inherit missing plugins from parent
41087+ */
41088+ for (memb = 0; memb < PSET_LAST; ++memb) {
41089+ result = grab_plugin_pset(object, parent, memb);
41090+ if (result != 0)
41091+ break;
41092+ }
41093+ return result;
41094+}
41095+
41096+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41097+ struct inode *parent /* parent directory */,
41098+ struct inode *root /* root directory */)
41099+{
41100+ int result;
41101+ result = adjust_to_parent_common(object, parent, root);
41102+ if (result)
41103+ return result;
41104+ assert("edward-1416", parent != NULL);
41105+
41106+ grab_plugin_pset(object, parent, PSET_CLUSTER);
41107+ grab_plugin_pset(object, parent, PSET_CIPHER);
41108+ grab_plugin_pset(object, parent, PSET_DIGEST);
41109+ grab_plugin_pset(object, parent, PSET_COMPRESSION);
41110+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
41111+
41112+ return 0;
41113+}
41114+
41115+/* this is common implementation of create_object method of file plugin
41116+ */
41117+int reiser4_create_object_common(struct inode *object, struct inode *parent,
41118+ reiser4_object_create_data * data)
41119+{
41120+ reiser4_block_nr reserve;
41121+ assert("nikita-744", object != NULL);
41122+ assert("nikita-745", parent != NULL);
41123+ assert("nikita-747", data != NULL);
41124+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
41125+
41126+ reserve = estimate_create_common(object);
41127+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41128+ return RETERR(-ENOSPC);
41129+ return write_sd_by_inode_common(object);
41130+}
41131+
41132+static int common_object_delete_no_reserve(struct inode *inode);
41133+
41134+/**
41135+ * reiser4_delete_object_common - delete_object of file_plugin
41136+ * @inode: inode to be deleted
41137+ *
41138+ * This is the common implementation of the delete_object method of
41139+ * file_plugin. It applies to objects whose deletion consists of removing two
41140+ * items: stat data and safe-link.
41141+ */
41142+int reiser4_delete_object_common(struct inode *inode)
41143+{
41144+ int result;
41145+
41146+ assert("nikita-1477", inode != NULL);
41147+ /* FIXME: if file body deletion failed (i/o error, for instance),
41148+ inode->i_size can be != 0 here */
41149+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41150+ assert("nikita-3421", inode->i_nlink == 0);
41151+
41152+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41153+ reiser4_block_nr reserve;
41154+
41155+ /* grab space which is needed to remove 2 items from the tree:
41156+ stat data and safe-link */
41157+ reserve = 2 *
41158+ estimate_one_item_removal(reiser4_tree_by_inode(inode));
41159+ if (reiser4_grab_space_force(reserve,
41160+ BA_RESERVED | BA_CAN_COMMIT))
41161+ return RETERR(-ENOSPC);
41162+ result = common_object_delete_no_reserve(inode);
41163+ } else
41164+ result = 0;
41165+ return result;
41166+}
41167+
41168+/**
41169+ * reiser4_delete_dir_common - delete_object of file_plugin
41170+ * @inode: inode to be deleted
41171+ *
41172+ * This is common implementation of delete_object method of file_plugin for
41173+ * typical directory. It calls done method of dir_plugin to remove "." and
41174+ * removes stat data and safe-link.
41175+ */
41176+int reiser4_delete_dir_common(struct inode *inode)
41177+{
41178+ int result;
41179+ dir_plugin *dplug;
41180+
41181+ assert("", (get_current_context() &&
41182+ get_current_context()->trans->atom == NULL));
41183+
41184+ dplug = inode_dir_plugin(inode);
41185+ assert("vs-1101", dplug && dplug->done);
41186+
41187+ /* kill cursors which might be attached to inode */
41188+ reiser4_kill_cursors(inode);
41189+
41190+ /* grab space enough for removing two items */
41191+ if (reiser4_grab_space
41192+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
41193+ BA_RESERVED | BA_CAN_COMMIT))
41194+ return RETERR(-ENOSPC);
41195+
41196+ result = dplug->done(inode);
41197+ if (!result)
41198+ result = common_object_delete_no_reserve(inode);
41199+ return result;
41200+}
41201+
41202+/* this is common implementation of add_link method of file plugin
41203+ */
41204+int reiser4_add_link_common(struct inode *object, struct inode *parent)
41205+{
41206+ /*
41207+ * increment ->i_nlink and update ->i_ctime
41208+ */
41209+
41210+ INODE_INC_FIELD(object, i_nlink);
41211+ object->i_ctime = CURRENT_TIME;
41212+ return 0;
41213+}
41214+
41215+/* this is common implementation of rem_link method of file plugin
41216+ */
41217+int reiser4_rem_link_common(struct inode *object, struct inode *parent)
41218+{
41219+ assert("nikita-2021", object != NULL);
41220+ assert("nikita-2163", object->i_nlink > 0);
41221+
41222+ /*
41223+ * decrement ->i_nlink and update ->i_ctime
41224+ */
41225+
41226+ INODE_DEC_FIELD(object, i_nlink);
41227+ object->i_ctime = CURRENT_TIME;
41228+ return 0;
41229+}
41230+
41231+/* this is common implementation of rem_link method of file plugin for typical
41232+ directory
41233+*/
41234+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41235+{
41236+ assert("nikita-20211", object != NULL);
41237+ assert("nikita-21631", object->i_nlink > 0);
41238+
41239+	/*
41240+	 * decrement ->i_nlink and update ->i_ctime; when only the "."
41241+	 * self-reference remains (i_nlink == 1), drop the count to 0
41242+	 */
41242+ INODE_DEC_FIELD(object, i_nlink);
41243+ if (object->i_nlink == 1)
41244+ INODE_DEC_FIELD(object, i_nlink);
41245+ object->i_ctime = CURRENT_TIME;
41246+ return 0;
41247+}
41248+
41249+/* this is common implementation of owns_item method of file plugin
41250+ compare objectids of keys in inode and coord */
41251+int owns_item_common(const struct inode *inode, /* object to check
41252+ * against */
41253+ const coord_t * coord /* coord to check */ )
41254+{
41255+ reiser4_key item_key;
41256+ reiser4_key file_key;
41257+
41258+ assert("nikita-760", inode != NULL);
41259+ assert("nikita-761", coord != NULL);
41260+
41261+ return coord_is_existing_item(coord) &&
41262+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
41263+ get_key_objectid(item_key_by_coord(coord, &item_key)));
41264+}
41265+
41266+/* this is common implementation of owns_item method of file plugin
41267+ for typical directory
41268+*/
41269+int owns_item_common_dir(const struct inode *inode, /* object to check against */
41270+ const coord_t * coord /* coord of item to check */ )
41271+{
41272+ reiser4_key item_key;
41273+
41274+ assert("nikita-1335", inode != NULL);
41275+ assert("nikita-1334", coord != NULL);
41276+
41277+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
41278+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41279+ get_inode_oid(inode);
41280+ else
41281+ return owns_item_common(inode, coord);
41282+}
41283+
41284+/* this is common implementation of can_add_link method of file plugin
41285+ checks whether yet another hard links to this object can be added
41286+*/
41287+int can_add_link_common(const struct inode *object /* object to check */ )
41288+{
41289+ assert("nikita-732", object != NULL);
41290+
41291+ /* inode->i_nlink is unsigned int, so just check for integer
41292+ overflow */
41293+ return object->i_nlink + 1 != 0;
41294+}
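
A tiny standalone check of the unsigned wrap-around trick used above
(illustration only):

	#include <assert.h>
	#include <limits.h>

	int main(void)
	{
		unsigned int nlink = UINT_MAX; /* link counter at its cap */

		/* nlink + 1 wraps to 0, so the guard rejects a new link */
		assert(nlink + 1 == 0);
		return 0;
	}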
41295+
41296+/* this is common implementation of can_rem_link method of file plugin for
41297+ typical directory
41298+*/
41299+int can_rem_link_common_dir(const struct inode *inode)
41300+{
41301+	/* is_dir_empty() returns 0 if dir is empty */
41302+ return !is_dir_empty(inode);
41303+}
41304+
41305+/* this is common implementation of detach method of file plugin for typical
41306+ directory
41307+*/
41308+int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
41309+{
41310+ dir_plugin *dplug;
41311+
41312+ dplug = inode_dir_plugin(child);
41313+ assert("nikita-2883", dplug != NULL);
41314+ assert("nikita-2884", dplug->detach != NULL);
41315+ return dplug->detach(child, parent);
41316+}
41317+
41318+/* this is common implementation of bind method of file plugin for typical
41319+ directory
41320+*/
41321+int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
41322+{
41323+ dir_plugin *dplug;
41324+
41325+ dplug = inode_dir_plugin(child);
41326+ assert("nikita-2646", dplug != NULL);
41327+ return dplug->attach(child, parent);
41328+}
41329+
41330+static int process_truncate(struct inode *, __u64 size);
41331+
41332+/* this is common implementation of safelink method of file plugin
41333+ */
41334+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41335+{
41336+ int result;
41337+
41338+ assert("vs-1705", get_current_context()->trans->atom == NULL);
41339+ if (link == SAFE_UNLINK)
41340+ /* nothing to do. iput() in the caller (process_safelink) will
41341+ * finish with file */
41342+ result = 0;
41343+ else if (link == SAFE_TRUNCATE)
41344+ result = process_truncate(object, value);
41345+ else {
41346+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41347+ result = RETERR(-EIO);
41348+ }
41349+ return result;
41350+}
41351+
41352+/* this is common implementation of estimate.create method of file plugin
41353+ can be used when object creation involves insertion of one item (usually stat
41354+ data) into tree
41355+*/
41356+reiser4_block_nr estimate_create_common(const struct inode * object)
41357+{
41358+ return estimate_one_insert_item(reiser4_tree_by_inode(object));
41359+}
41360+
41361+/* this is common implementation of estimate.create method of file plugin for
41362+ typical directory
41363+ can be used when directory creation involves insertion of two items (usually
41364+ stat data and item containing "." and "..") into tree
41365+*/
41366+reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41367+{
41368+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
41369+}
41370+
41371+/* this is common implementation of estimate.update method of file plugin
41372+ can be used when stat data update does not do more than inserting a unit
41373+ into a stat data item which is probably true for most cases
41374+*/
41375+reiser4_block_nr estimate_update_common(const struct inode * inode)
41376+{
41377+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
41378+}
41379+
41380+/* this is common implementation of estimate.unlink method of file plugin
41381+ */
41382+reiser4_block_nr
41383+estimate_unlink_common(const struct inode * object UNUSED_ARG,
41384+ const struct inode * parent UNUSED_ARG)
41385+{
41386+ return 0;
41387+}
41388+
41389+/* this is common implementation of estimate.unlink method of file plugin for
41390+ typical directory
41391+*/
41392+reiser4_block_nr
41393+estimate_unlink_common_dir(const struct inode * object,
41394+ const struct inode * parent)
41395+{
41396+ dir_plugin *dplug;
41397+
41398+ dplug = inode_dir_plugin(object);
41399+ assert("nikita-2888", dplug != NULL);
41400+ assert("nikita-2887", dplug->estimate.unlink != NULL);
41401+ return dplug->estimate.unlink(object, parent);
41402+}
41403+
41404+char *wire_write_common(struct inode *inode, char *start)
41405+{
41406+ return build_inode_onwire(inode, start);
41407+}
41408+
41409+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41410+{
41411+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41412+}
41413+
41414+struct dentry *wire_get_common(struct super_block *sb,
41415+ reiser4_object_on_wire * obj)
41416+{
41417+ struct inode *inode;
41418+ struct dentry *dentry;
41419+ reiser4_key key;
41420+
41421+ extract_key_from_id(&obj->u.std.key_id, &key);
41422+ inode = reiser4_iget(sb, &key, 1);
41423+ if (!IS_ERR(inode)) {
41424+ reiser4_iget_complete(inode);
41425+ dentry = d_alloc_anon(inode);
41426+ if (dentry == NULL) {
41427+ iput(inode);
41428+ dentry = ERR_PTR(-ENOMEM);
41429+ } else
41430+ dentry->d_op = &get_super_private(sb)->ops.dentry;
41431+ } else if (PTR_ERR(inode) == -ENOENT)
41432+ /*
41433+ * inode wasn't found at the key encoded in the file
41434+ * handle. Hence, file handle is stale.
41435+ */
41436+ dentry = ERR_PTR(RETERR(-ESTALE));
41437+ else
41438+ dentry = (void *)inode;
41439+ return dentry;
41440+}
41441+
41442+int wire_size_common(struct inode *inode)
41443+{
41444+ return inode_onwire_size(inode);
41445+}
41446+
41447+void wire_done_common(reiser4_object_on_wire * obj)
41448+{
41449+ /* nothing to do */
41450+}
41451+
41452+/* helper function to print errors */
41453+static void key_warning(const reiser4_key * key /* key to print */ ,
41454+ const struct inode *inode,
41455+ int code /* error code to print */ )
41456+{
41457+ assert("nikita-716", key != NULL);
41458+
41459+ if (code != -ENOMEM) {
41460+ warning("nikita-717", "Error for inode %llu (%i)",
41461+ (unsigned long long)get_key_objectid(key), code);
41462+ reiser4_print_key("for key", key);
41463+ }
41464+}
41465+
41466+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41467+#if REISER4_DEBUG
41468+static void
41469+check_inode_seal(const struct inode *inode,
41470+ const coord_t * coord, const reiser4_key * key)
41471+{
41472+ reiser4_key unit_key;
41473+
41474+ unit_key_by_coord(coord, &unit_key);
41475+ assert("nikita-2752",
41476+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41477+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41478+}
41479+
41480+static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41481+{
41482+ reiser4_key ukey;
41483+
41484+ coord_clear_iplug(coord);
41485+ if (zload(coord->node))
41486+ return;
41487+
41488+ if (!coord_is_existing_unit(coord) ||
41489+ !item_plugin_by_coord(coord) ||
41490+ !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41491+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
41492+ !item_is_statdata(coord)) {
41493+ warning("nikita-1901", "Conspicuous seal");
41494+ reiser4_print_key("key", key);
41495+ print_coord("coord", coord, 1);
41496+ impossible("nikita-2877", "no way");
41497+ }
41498+ zrelse(coord->node);
41499+}
41500+
41501+#else
41502+#define check_inode_seal(inode, coord, key) noop
41503+#define check_sd_coord(coord, key) noop
41504+#endif
41505+
41506+/* insert new stat-data into tree. Called with inode state
41507+ locked. Return inode state locked. */
41508+static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41509+{
41510+ int result;
41511+ reiser4_key key;
41512+ coord_t coord;
41513+ reiser4_item_data data;
41514+ char *area;
41515+ reiser4_inode *ref;
41516+ lock_handle lh;
41517+ oid_t oid;
41518+
41519+ assert("nikita-723", inode != NULL);
41520+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
41521+
41522+ ref = reiser4_inode_data(inode);
41523+ spin_lock_inode(inode);
41524+
41525+ if (ref->plugin_mask != 0)
41526+ /* inode has non-standard plugins */
41527+ inode_set_extension(inode, PLUGIN_STAT);
41528+ /*
41529+ * prepare specification of new item to be inserted
41530+ */
41531+
41532+ data.iplug = inode_sd_plugin(inode);
41533+ data.length = data.iplug->s.sd.save_len(inode);
41534+ spin_unlock_inode(inode);
41535+
41536+ data.data = NULL;
41537+ data.user = 0;
41538+/* could be optimized for case where there is only one node format in
41539+ * use in the filesystem, probably there are lots of such
41540+ * places we could optimize for only one node layout.... -Hans */
41541+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
41542+ /* This is silly check, but we don't know actual node where
41543+		/* This is a silly check, but we don't know the actual node
41544+		   the insertion will go into. */
41545+ }
41546+ oid = oid_allocate(inode->i_sb);
41547+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41548+ if (oid == ABSOLUTE_MAX_OID)
41549+ return RETERR(-EOVERFLOW);
41550+
41551+ set_inode_oid(inode, oid);
41552+
41553+ coord_init_zero(&coord);
41554+ init_lh(&lh);
41555+
41556+ result = insert_by_key(reiser4_tree_by_inode(inode),
41557+ build_sd_key(inode, &key), &data, &coord, &lh,
41558+ /* stat data lives on a leaf level */
41559+ LEAF_LEVEL, CBK_UNIQUE);
41560+
41561+ /* we don't want to re-check that somebody didn't insert
41562+ stat-data while we were doing io, because if it did,
41563+ insert_by_key() returned error. */
41564+	/* but what _is_ possible is that the plugin for the inode's
41565+	   stat-data, the list of non-standard plugins or their state
41566+	   would change during io, so that the stat-data wouldn't fit into
41567+	   the item anymore. To avoid this race we keep the inode_state
41568+	   lock. It has to be taken each time you access the inode in a
41569+	   way that would change the sd size: changing plugins etc.
41570+	 */
41571+
41572+ if (result == IBK_INSERT_OK) {
41573+ coord_clear_iplug(&coord);
41574+ result = zload(coord.node);
41575+ if (result == 0) {
41576+ /* have we really inserted stat data? */
41577+ assert("nikita-725", item_is_statdata(&coord));
41578+
41579+ /* inode was just created. It is inserted into hash
41580+ table, but no directory entry was yet inserted into
41581+ parent. So, inode is inaccessible through
41582+ ->lookup(). All places that directly grab inode
41583+ from hash-table (like old knfsd), should check
41584+ IMMUTABLE flag that is set by common_create_child.
41585+ */
41586+ assert("nikita-3240", data.iplug != NULL);
41587+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
41588+ area = item_body_by_coord(&coord);
41589+ result = data.iplug->s.sd.save(inode, &area);
41590+ znode_make_dirty(coord.node);
41591+ if (result == 0) {
41592+ /* object has stat-data now */
41593+ reiser4_inode_clr_flag(inode, REISER4_NO_SD);
41594+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41595+ /* initialise stat-data seal */
41596+ reiser4_seal_init(&ref->sd_seal, &coord, &key);
41597+ ref->sd_coord = coord;
41598+ check_inode_seal(inode, &coord, &key);
41599+ } else if (result != -ENOMEM)
41600+ /*
41601+ * convert any other error code to -EIO to
41602+ * avoid confusing user level with unexpected
41603+ * errors.
41604+ */
41605+ result = RETERR(-EIO);
41606+ zrelse(coord.node);
41607+ }
41608+ }
41609+ done_lh(&lh);
41610+
41611+ if (result != 0)
41612+ key_warning(&key, inode, result);
41613+ else
41614+ oid_count_allocated();
41615+
41616+ return result;
41617+}
41618+
41619+/* find sd of inode in a tree, deal with errors */
41620+int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41621+ znode_lock_mode lock_mode /* lock mode */ ,
41622+ coord_t * coord /* resulting coord */ ,
41623+ lock_handle * lh /* resulting lock handle */ ,
41624+ const reiser4_key * key /* resulting key */ ,
41625+ int silent)
41626+{
41627+ int result;
41628+ __u32 flags;
41629+
41630+ assert("nikita-1692", inode != NULL);
41631+ assert("nikita-1693", coord != NULL);
41632+ assert("nikita-1694", key != NULL);
41633+
41634+	/* look for the object's stat data in the tree.
41635+	   This returns, in @coord, the position of the item found and, in
41636+	   @lh, a lock handle on the corresponding locked znode. Both are
41637+	   only valid if CBK_COORD_FOUND is returned. */
41638+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41639+ flags |= CBK_UNIQUE;
41640+ /*
41641+ * traverse tree to find stat data. We cannot use vroot here, because
41642+ * it only covers _body_ of the file, and stat data don't belong
41643+ * there.
41644+ */
41645+ result = coord_by_key(reiser4_tree_by_inode(inode),
41646+ key,
41647+ coord,
41648+ lh,
41649+ lock_mode,
41650+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41651+ if (REISER4_DEBUG && result == 0)
41652+ check_sd_coord(coord, key);
41653+
41654+ if (result != 0 && !silent)
41655+ key_warning(key, inode, result);
41656+ return result;
41657+}
41658+
41659+static int
41660+locate_inode_sd(struct inode *inode,
41661+ reiser4_key * key, coord_t * coord, lock_handle * lh)
41662+{
41663+ reiser4_inode *state;
41664+ seal_t seal;
41665+ int result;
41666+
41667+ assert("nikita-3483", inode != NULL);
41668+
41669+ state = reiser4_inode_data(inode);
41670+ spin_lock_inode(inode);
41671+ *coord = state->sd_coord;
41672+ coord_clear_iplug(coord);
41673+ seal = state->sd_seal;
41674+ spin_unlock_inode(inode);
41675+
41676+ build_sd_key(inode, key);
41677+ if (reiser4_seal_is_set(&seal)) {
41678+ /* first, try to use seal */
41679+ result = reiser4_seal_validate(&seal,
41680+ coord,
41681+ key,
41682+ lh, ZNODE_WRITE_LOCK,
41683+ ZNODE_LOCK_LOPRI);
41684+ if (result == 0)
41685+ check_sd_coord(coord, key);
41686+ } else
41687+ result = -E_REPEAT;
41688+
41689+ if (result != 0) {
41690+ coord_init_zero(coord);
41691+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41692+ }
41693+ return result;
41694+}
41695+
41696+#if REISER4_DEBUG
41697+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
41698+{
41699+ return (get_key_locality(k1) == get_key_locality(k2) &&
41700+ get_key_type(k1) == get_key_type(k2) &&
41701+ get_key_band(k1) == get_key_band(k2) &&
41702+ get_key_ordering(k1) == get_key_ordering(k2) &&
41703+ get_key_objectid(k1) == get_key_objectid(k2));
41704+}
41705+
41706+#include "../tree_walk.h"
41707+
41708+/* make some checks before and after stat-data resize operation */
41709+static int check_sd_resize(struct inode * inode, coord_t * coord,
41710+ int length, int progress /* 1 means after resize */)
41711+{
41712+ int ret = 0;
41713+ lock_handle left_lock;
41714+ coord_t left_coord;
41715+ reiser4_key left_key;
41716+ reiser4_key key;
41717+
41718+ if (inode_file_plugin(inode) !=
41719+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
41720+ return 0;
41721+ if (!length)
41722+ return 0;
41723+ if (coord->item_pos != 0)
41724+ return 0;
41725+
41726+ init_lh(&left_lock);
41727+ ret = reiser4_get_left_neighbor(&left_lock,
41728+ coord->node,
41729+ ZNODE_WRITE_LOCK,
41730+ GN_CAN_USE_UPPER_LEVELS);
41731+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
41732+ ret == -ENOENT || ret == -EINVAL
41733+ || ret == -E_DEADLOCK) {
41734+ ret = 0;
41735+ goto exit;
41736+ }
41737+ ret = zload(left_lock.node);
41738+ if (ret)
41739+ goto exit;
41740+ coord_init_last_unit(&left_coord, left_lock.node);
41741+ item_key_by_coord(&left_coord, &left_key);
41742+ item_key_by_coord(coord, &key);
41743+
41744+ if (all_but_offset_key_eq(&key, &left_key))
41745+		/* corruption occurred */
41746+ ret = 1;
41747+ zrelse(left_lock.node);
41748+ exit:
41749+ done_lh(&left_lock);
41750+ return ret;
41751+}
41752+#endif
41753+
41754+/* update stat-data at @coord */
41755+static int
41756+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41757+ lock_handle * lh)
41758+{
41759+ int result;
41760+ reiser4_item_data data;
41761+ char *area;
41762+ reiser4_inode *state;
41763+ znode *loaded;
41764+
41765+ state = reiser4_inode_data(inode);
41766+
41767+ coord_clear_iplug(coord);
41768+ result = zload(coord->node);
41769+ if (result != 0)
41770+ return result;
41771+ loaded = coord->node;
41772+
41773+ spin_lock_inode(inode);
41774+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
41775+ data.iplug = inode_sd_plugin(inode);
41776+
41777+ /* if inode has non-standard plugins, add appropriate stat data
41778+ * extension */
41779+ if (state->extmask & (1 << PLUGIN_STAT)) {
41780+ if (state->plugin_mask == 0)
41781+ inode_clr_extension(inode, PLUGIN_STAT);
41782+ } else if (state->plugin_mask != 0)
41783+ inode_set_extension(inode, PLUGIN_STAT);
41784+
41785+ if (state->extmask & (1 << HEIR_STAT)) {
41786+ if (state->heir_mask == 0)
41787+ inode_clr_extension(inode, HEIR_STAT);
41788+ } else if (state->heir_mask != 0)
41789+ inode_set_extension(inode, HEIR_STAT);
41790+
41791+ /* data.length is how much space to add to (or remove
41792+ from if negative) sd */
41793+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41794+ /* recalculate stat-data length */
41795+ data.length =
41796+ data.iplug->s.sd.save_len(inode) -
41797+ item_length_by_coord(coord);
41798+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41799+ } else
41800+ data.length = 0;
41801+ spin_unlock_inode(inode);
41802+
41803+ /* if on-disk stat data is of different length than required
41804+ for this inode, resize it */
41805+
41806+ if (data.length != 0) {
41807+ data.data = NULL;
41808+ data.user = 0;
41809+
41810+ assert("edward-1441",
41811+ !check_sd_resize(inode, coord,
41812+ data.length, 0/* before resize */));
41813+
41814+		/* the insertion code requires that the insertion point
41815+		 * (coord) be between units. */
41816+ coord->between = AFTER_UNIT;
41817+ result = reiser4_resize_item(coord, &data, key, lh,
41818+ COPI_DONT_SHIFT_LEFT);
41819+ if (result != 0) {
41820+ key_warning(key, inode, result);
41821+ zrelse(loaded);
41822+ return result;
41823+ }
41824+ if (loaded != coord->node) {
41825+ /* reiser4_resize_item moved coord to another node.
41826+ Zload it */
41827+ zrelse(loaded);
41828+ coord_clear_iplug(coord);
41829+ result = zload(coord->node);
41830+ if (result != 0)
41831+ return result;
41832+ loaded = coord->node;
41833+ }
41834+ assert("edward-1442",
41835+ !check_sd_resize(inode, coord,
41836+ data.length, 1/* after resize */));
41837+ }
41838+ area = item_body_by_coord(coord);
41839+ spin_lock_inode(inode);
41840+ result = data.iplug->s.sd.save(inode, &area);
41841+ znode_make_dirty(coord->node);
41842+
41843+ /* re-initialise stat-data seal */
41844+
41845+ /*
41846+ * coord.between was possibly skewed from AT_UNIT when stat-data size
41847+ * was changed and new extensions were pasted into item.
41848+ */
41849+ coord->between = AT_UNIT;
41850+ reiser4_seal_init(&state->sd_seal, coord, key);
41851+ state->sd_coord = *coord;
41852+ spin_unlock_inode(inode);
41853+ check_inode_seal(inode, coord, key);
41854+ zrelse(loaded);
41855+ return result;
41856+}
41857+
41858+/* Update existing stat-data in a tree. Called with inode state locked. Return
41859+ inode state locked. */
41860+static int update_sd(struct inode *inode /* inode to update sd for */ )
41861+{
41862+ int result;
41863+ reiser4_key key;
41864+ coord_t coord;
41865+ lock_handle lh;
41866+
41867+ assert("nikita-726", inode != NULL);
41868+
41869+ /* no stat-data, nothing to update?! */
41870+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
41871+
41872+ init_lh(&lh);
41873+
41874+ result = locate_inode_sd(inode, &key, &coord, &lh);
41875+ if (result == 0)
41876+ result = update_sd_at(inode, &coord, &key, &lh);
41877+ done_lh(&lh);
41878+
41879+ return result;
41880+}
41881+
41882+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
41883+   Remove object stat data. Space for that must be reserved by the caller.
41884+*/
41885+static int
41886+common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
41887+{
41888+ int result;
41889+
41890+ assert("nikita-1477", inode != NULL);
41891+
41892+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41893+ reiser4_key sd_key;
41894+
41895+ DQUOT_FREE_INODE(inode);
41896+ DQUOT_DROP(inode);
41897+
41898+ build_sd_key(inode, &sd_key);
41899+ result =
41900+ reiser4_cut_tree(reiser4_tree_by_inode(inode),
41901+ &sd_key, &sd_key, NULL, 0);
41902+ if (result == 0) {
41903+ reiser4_inode_set_flag(inode, REISER4_NO_SD);
41904+ result = oid_release(inode->i_sb, get_inode_oid(inode));
41905+ if (result == 0) {
41906+ oid_count_released();
41907+
41908+ result = safe_link_del(reiser4_tree_by_inode(inode),
41909+ get_inode_oid(inode),
41910+ SAFE_UNLINK);
41911+ }
41912+ }
41913+ } else
41914+ result = 0;
41915+ return result;
41916+}
41917+
41918+/* helper for safelink_common */
41919+static int process_truncate(struct inode *inode, __u64 size)
41920+{
41921+ int result;
41922+ struct iattr attr;
41923+ file_plugin *fplug;
41924+ reiser4_context *ctx;
41925+ struct dentry dentry;
41926+
41927+ assert("vs-21", is_in_reiser4_context());
41928+ ctx = reiser4_init_context(inode->i_sb);
41929+ assert("vs-22", !IS_ERR(ctx));
41930+
41931+ attr.ia_size = size;
41932+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
41933+ fplug = inode_file_plugin(inode);
41934+
41935+ mutex_lock(&inode->i_mutex);
41936+ assert("vs-1704", get_current_context()->trans->atom == NULL);
41937+ dentry.d_inode = inode;
41938+ result = inode->i_op->setattr(&dentry, &attr);
41939+ mutex_unlock(&inode->i_mutex);
41940+
41941+ context_set_commit_async(ctx);
41942+ reiser4_exit_context(ctx);
41943+
41944+ return result;
41945+}
41946+
41947+/*
41948+ Local variables:
41949+ c-indentation-style: "K&R"
41950+ mode-name: "LC"
41951+ c-basic-offset: 8
41952+ tab-width: 8
41953+ fill-column: 80
41954+ scroll-step: 1
41955+ End:
41956+*/
41957diff -urN linux-2.6.20.orig/fs/reiser4/plugin/hash.c linux-2.6.20/fs/reiser4/plugin/hash.c
41958--- linux-2.6.20.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
41959+++ linux-2.6.20/fs/reiser4/plugin/hash.c 2007-05-06 14:50:43.791004471 +0400
41960@@ -0,0 +1,353 @@
41961+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
41962+ * reiser4/README */
41963+
41964+/* Hash functions */
41965+
41966+#include "../debug.h"
41967+#include "plugin_header.h"
41968+#include "plugin.h"
41969+#include "../super.h"
41970+#include "../inode.h"
41971+
41972+#include <linux/types.h>
41973+
41974+/* old rupasov (yura) hash */
41975+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
41976+ int len /* @name's length */ )
41977+{
41978+ int i;
41979+ int j;
41980+ int pow;
41981+ __u64 a;
41982+ __u64 c;
41983+
41984+ assert("nikita-672", name != NULL);
41985+ assert("nikita-673", len >= 0);
41986+
41987+ for (pow = 1, i = 1; i < len; ++i)
41988+ pow = pow * 10;
41989+
41990+ if (len == 1)
41991+ a = name[0] - 48;
41992+ else
41993+ a = (name[0] - 48) * pow;
41994+
41995+ for (i = 1; i < len; ++i) {
41996+ c = name[i] - 48;
41997+ for (pow = 1, j = i; j < len - 1; ++j)
41998+ pow = pow * 10;
41999+ a = a + c * pow;
42000+ }
42001+ for (; i < 40; ++i) {
42002+ c = '0' - 48;
42003+ for (pow = 1, j = i; j < len - 1; ++j)
42004+ pow = pow * 10;
42005+ a = a + c * pow;
42006+ }
42007+
42008+ for (; i < 256; ++i) {
42009+ c = i;
42010+ for (pow = 1, j = i; j < len - 1; ++j)
42011+ pow = pow * 10;
42012+ a = a + c * pow;
42013+ }
42014+
42015+ a = a << 7;
42016+ return a;
42017+}
42018+
42019+/* r5 hash */
42020+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
42021+ int len UNUSED_ARG /* @name's length */ )
42022+{
42023+ __u64 a = 0;
42024+
42025+ assert("nikita-674", name != NULL);
42026+ assert("nikita-675", len >= 0);
42027+
42028+ while (*name) {
42029+ a += *name << 4;
42030+ a += *name >> 4;
42031+ a *= 11;
42032+ name++;
42033+ }
42034+ return a;
42035+}
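
The r5 hash is self-contained enough to run in userspace. A standalone
harness (illustration; the printed value is simply what the algorithm yields
for the input, not a documented constant):

	#include <stdio.h>

	typedef unsigned long long u64;

	/* userspace copy of hash_r5() above, minus kernel types and asserts */
	static u64 r5(const unsigned char *name)
	{
		u64 a = 0;

		while (*name) {
			a += *name << 4;
			a += *name >> 4;
			a *= 11;
			name++;
		}
		return a;
	}

	int main(void)
	{
		printf("%llx\n", r5((const unsigned char *)"lost+found"));
		return 0;
	}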
42036+
42037+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
42038+ H0 = Key
42039+ Hi = E Mi(Hi-1) + Hi-1
42040+
42041+ (see Applied Cryptography, 2nd edition, p448).
42042+
42043+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
42044+
42045+ Jeremy has agreed to the contents of reiserfs/README. -Hans
42046+
42047+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
42048+*/
42049+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
42050+ int len /* @name's length */ )
42051+{
42052+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
42053+
42054+ __u64 h0 = k[0], h1 = k[1];
42055+ __u64 a, b, c, d;
42056+ __u64 pad;
42057+ int i;
42058+
42059+ assert("nikita-676", name != NULL);
42060+ assert("nikita-677", len >= 0);
42061+
42062+#define DELTA 0x9E3779B9u
42063+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
42064+#define PARTROUNDS 6 /* 6 gets complete mixing */
42065+
42066+/* a, b, c, d - data; h0, h1 - accumulated hash */
42067+#define TEACORE(rounds) \
42068+ do { \
42069+ __u64 sum = 0; \
42070+ int n = rounds; \
42071+ __u64 b0, b1; \
42072+ \
42073+ b0 = h0; \
42074+ b1 = h1; \
42075+ \
42076+ do \
42077+ { \
42078+ sum += DELTA; \
42079+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
42080+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
42081+ } while(--n); \
42082+ \
42083+ h0 += b0; \
42084+ h1 += b1; \
42085+ } while(0)
42086+
42087+ pad = (__u64) len | ((__u64) len << 8);
42088+ pad |= pad << 16;
42089+
42090+ while (len >= 16) {
42091+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42092+ 16 | (__u64) name[3] << 24;
42093+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42094+ 16 | (__u64) name[7] << 24;
42095+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42096+ 16 | (__u64) name[11] << 24;
42097+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
42098+ << 16 | (__u64) name[15] << 24;
42099+
42100+ TEACORE(PARTROUNDS);
42101+
42102+ len -= 16;
42103+ name += 16;
42104+ }
42105+
42106+ if (len >= 12) {
42107+ //assert(len < 16);
42108+ if (len >= 16)
42109+ *(int *)0 = 0;
42110+
42111+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42112+ 16 | (__u64) name[3] << 24;
42113+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42114+ 16 | (__u64) name[7] << 24;
42115+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42116+ 16 | (__u64) name[11] << 24;
42117+
42118+ d = pad;
42119+ for (i = 12; i < len; i++) {
42120+ d <<= 8;
42121+ d |= name[i];
42122+ }
42123+ } else if (len >= 8) {
42124+ //assert(len < 12);
42125+ if (len >= 12)
42126+ *(int *)0 = 0;
42127+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42128+ 16 | (__u64) name[3] << 24;
42129+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42130+ 16 | (__u64) name[7] << 24;
42131+
42132+ c = d = pad;
42133+ for (i = 8; i < len; i++) {
42134+ c <<= 8;
42135+ c |= name[i];
42136+ }
42137+ } else if (len >= 4) {
42138+ //assert(len < 8);
42139+ if (len >= 8)
42140+ *(int *)0 = 0;
42141+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42142+ 16 | (__u64) name[3] << 24;
42143+
42144+ b = c = d = pad;
42145+ for (i = 4; i < len; i++) {
42146+ b <<= 8;
42147+ b |= name[i];
42148+ }
42149+ } else {
42150+ //assert(len < 4);
42151+ if (len >= 4)
42152+ *(int *)0 = 0;
42153+ a = b = c = d = pad;
42154+ for (i = 0; i < len; i++) {
42155+ a <<= 8;
42156+ a |= name[i];
42157+ }
42158+ }
42159+
42160+ TEACORE(FULLROUNDS);
42161+
42162+/* return 0;*/
42163+ return h0 ^ h1;
42164+
42165+}
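
[Illustrative aside, not part of the patch: the TEACORE() macro is easier to read
in TEA's original 32-bit form. In the Davis-Meyer construction quoted above, the
name block (a, b, c, d) plays the role of the key, and the running state (h0, h1)
is fed forward after encryption. A standalone 32-bit sketch of one pass (the
in-tree version, per its own comment, was mechanically widened to __u64):]

	#include <stdint.h>

	/* one pass: state (h0,h1) mixed with message block m[0..3] as key
	   material, then fed forward: Hi = E_Mi(Hi-1) + Hi-1 */
	static void tea_pass(uint32_t h[2], const uint32_t m[4], int rounds)
	{
		uint32_t b0 = h[0], b1 = h[1], sum = 0;

		while (rounds--) {
			sum += 0x9E3779B9u;	/* DELTA */
			b0 += ((b1 << 4) + m[0]) ^ (b1 + sum) ^ ((b1 >> 5) + m[1]);
			b1 += ((b0 << 4) + m[2]) ^ (b0 + sum) ^ ((b0 >> 5) + m[3]);
		}
		h[0] += b0;
		h[1] += b1;
	}
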
42166+
42167+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42168+
42169+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42170+
42171+ Excerpts:
42172+
42173+ FNV hashes are designed to be fast while maintaining a low collision
42174+ rate.
42175+
42176+ [This version also seems to preserve lexicographical order locally.]
42177+
42178+ FNV hash algorithms and source code have been released into the public
42179+ domain.
42180+
42181+*/
42182+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42183+ int len UNUSED_ARG /* @name's length */ )
42184+{
42185+ unsigned long long a = 0xcbf29ce484222325ull;
42186+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
42187+
42188+ assert("nikita-678", name != NULL);
42189+ assert("nikita-679", len >= 0);
42190+
42191+ /* FNV-1 hash each octet in the buffer */
42192+ for (; *name; ++name) {
42193+		/* multiply by the 64 bit FNV magic prime mod 2^64 */
42194+ a *= fnv_64_prime;
42195+ /* xor the bottom with the current octet */
42196+ a ^= (unsigned long long)(*name);
42197+ }
42198+ /* return our new hash value */
42199+ return a;
42200+}
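
[Illustrative aside, not part of the patch: since the loop keys off the NUL
terminator rather than @len (which is UNUSED_ARG), hash_fnv1() is easy to
reproduce in userland for a quick check:]

	#include <stdio.h>

	int main(void)
	{
		unsigned long long a = 0xcbf29ce484222325ull;	   /* FNV-1 offset basis */
		const unsigned long long prime = 0x100000001b3ull; /* 64-bit FNV prime */
		const unsigned char *name = (const unsigned char *)"reiser4";

		for (; *name; ++name) {
			a *= prime;	/* multiply first: FNV-1, not FNV-1a */
			a ^= (unsigned long long)*name;
		}
		printf("0x%016llx\n", a);
		return 0;
	}
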
42201+
42202+/* degenerate hash function used to simplify testing of non-unique key
42203+ handling */
42204+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42205+ int len UNUSED_ARG /* @name's length */ )
42206+{
42207+ return 0xc0c0c0c010101010ull;
42208+}
42209+
42210+static int change_hash(struct inode *inode,
42211+ reiser4_plugin * plugin,
42212+ pset_member memb)
42213+{
42214+ int result;
42215+
42216+ assert("nikita-3503", inode != NULL);
42217+ assert("nikita-3504", plugin != NULL);
42218+
42219+ assert("nikita-3505", is_reiser4_inode(inode));
42220+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42221+
42222+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
42223+ return RETERR(-EINVAL);
42224+
42225+ result = 0;
42226+ if (inode_hash_plugin(inode) == NULL ||
42227+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
42228+ if (is_dir_empty(inode) == 0)
42229+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
42230+ PSET_HASH, plugin);
42231+ else
42232+ result = RETERR(-ENOTEMPTY);
42233+
42234+ }
42235+ return result;
42236+}
42237+
42238+static reiser4_plugin_ops hash_plugin_ops = {
42239+ .init = NULL,
42240+ .load = NULL,
42241+ .save_len = NULL,
42242+ .save = NULL,
42243+ .change = change_hash
42244+};
42245+
42246+/* hash plugins */
42247+hash_plugin hash_plugins[LAST_HASH_ID] = {
42248+ [RUPASOV_HASH_ID] = {
42249+ .h = {
42250+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42251+ .id = RUPASOV_HASH_ID,
42252+ .pops = &hash_plugin_ops,
42253+ .label = "rupasov",
42254+ .desc = "Original Yura's hash",
42255+ .linkage = {NULL, NULL}
42256+ },
42257+ .hash = hash_rupasov
42258+ },
42259+ [R5_HASH_ID] = {
42260+ .h = {
42261+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42262+ .id = R5_HASH_ID,
42263+ .pops = &hash_plugin_ops,
42264+ .label = "r5",
42265+ .desc = "r5 hash",
42266+ .linkage = {NULL, NULL}
42267+ },
42268+ .hash = hash_r5
42269+ },
42270+ [TEA_HASH_ID] = {
42271+ .h = {
42272+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42273+ .id = TEA_HASH_ID,
42274+ .pops = &hash_plugin_ops,
42275+ .label = "tea",
42276+ .desc = "tea hash",
42277+ .linkage = {NULL, NULL}
42278+ },
42279+ .hash = hash_tea
42280+ },
42281+ [FNV1_HASH_ID] = {
42282+ .h = {
42283+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42284+ .id = FNV1_HASH_ID,
42285+ .pops = &hash_plugin_ops,
42286+ .label = "fnv1",
42287+ .desc = "fnv1 hash",
42288+ .linkage = {NULL, NULL}
42289+ },
42290+ .hash = hash_fnv1
42291+ },
42292+ [DEGENERATE_HASH_ID] = {
42293+ .h = {
42294+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42295+ .id = DEGENERATE_HASH_ID,
42296+ .pops = &hash_plugin_ops,
42297+ .label = "degenerate hash",
42298+ .desc = "Degenerate hash: only for testing",
42299+ .linkage = {NULL, NULL}
42300+ },
42301+ .hash = hash_deg
42302+ }
42303+};
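
[Illustrative aside, not part of the patch: all five functions are reached
through the ->hash method in the table above. A hypothetical caller, using
inode_hash_plugin() as change_hash() does, would dispatch on whatever hash
plugin the directory carries (sketch only):]

	static __u64 hash_dir_entry_name(struct inode *dir,
					 const unsigned char *name, int len)
	{
		hash_plugin *hplug = inode_hash_plugin(dir);

		return hplug->hash(name, len);
	}
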
42304+
42305+/* Make Linus happy.
42306+ Local variables:
42307+ c-indentation-style: "K&R"
42308+ mode-name: "LC"
42309+ c-basic-offset: 8
42310+ tab-width: 8
42311+ fill-column: 120
42312+ End:
42313+*/
42314diff -urN linux-2.6.20.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.20/fs/reiser4/plugin/inode_ops.c
42315--- linux-2.6.20.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
42316+++ linux-2.6.20/fs/reiser4/plugin/inode_ops.c 2007-05-06 14:50:43.795005721 +0400
42317@@ -0,0 +1,897 @@
42318+/*
42319+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42320+ */
42321+
42322+/*
42323+ * this file contains typical implementations for most of methods of struct
42324+ * inode_operations
42325+ */
42326+
42327+#include "../inode.h"
42328+#include "../safe_link.h"
42329+
42330+#include <linux/quotaops.h>
42331+#include <linux/namei.h>
42332+
42333+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42334+ reiser4_object_create_data *data);
42335+
42336+/**
42337+ * reiser4_create_common - create of inode operations
42338+ * @parent: inode of parent directory
42339+ * @dentry: dentry of new object to create
42340+ * @mode: the permissions to use
42341+ * @nameidata:
42342+ *
42343+ * This is common implementation of vfs's create method of struct
42344+ * inode_operations.
42345+ * Creates regular file using file plugin from parent directory plugin set.
42346+ */
42347+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
42348+ int mode, struct nameidata *nameidata)
42349+{
42350+ reiser4_object_create_data data;
42351+ file_plugin *fplug;
42352+
42353+ memset(&data, 0, sizeof data);
42354+ data.mode = S_IFREG | mode;
42355+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
42356+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
42357+ warning("vpf-1900", "'%s' is not a regular file plugin.",
42358+ fplug->h.label);
42359+ return RETERR(-EIO);
42360+ }
42361+ data.id = fplug->h.id;
42362+ return create_vfs_object(parent, dentry, &data);
42363+}
42364+
42365+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42366+void check_light_weight(struct inode *inode, struct inode *parent);
42367+
42368+/**
42369+ * reiser4_lookup_common - lookup of inode operations
42370+ * @parent: inode of directory to lookup into
42371+ * @dentry: name to look for
42372+ * @nameidata:
42373+ *
42374+ * This is common implementation of vfs's lookup method of struct
42375+ * inode_operations.
42376+ */
42377+struct dentry *reiser4_lookup_common(struct inode *parent,
42378+ struct dentry *dentry,
42379+ struct nameidata *nameidata)
42380+{
42381+ reiser4_context *ctx;
42382+ int result;
42383+ struct dentry *new;
42384+ struct inode *inode;
42385+ reiser4_dir_entry_desc entry;
42386+
42387+ ctx = reiser4_init_context(parent->i_sb);
42388+ if (IS_ERR(ctx))
42389+ return (struct dentry *)ctx;
42390+
42391+ /* set up operations on dentry. */
42392+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42393+
42394+ result = reiser4_lookup_name(parent, dentry, &entry.key);
42395+ if (result) {
42396+ context_set_commit_async(ctx);
42397+ reiser4_exit_context(ctx);
42398+ if (result == -ENOENT) {
42399+ /* object not found */
42400+ if (!IS_DEADDIR(parent))
42401+ d_add(dentry, NULL);
42402+ return NULL;
42403+ }
42404+ return ERR_PTR(result);
42405+ }
42406+
42407+ inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42408+ if (IS_ERR(inode)) {
42409+ context_set_commit_async(ctx);
42410+ reiser4_exit_context(ctx);
42411+ return ERR_PTR(PTR_ERR(inode));
42412+ }
42413+
42414+ /* success */
42415+ check_light_weight(inode, parent);
42416+ new = d_splice_alias(inode, dentry);
42417+ reiser4_iget_complete(inode);
42418+
42419+ /* prevent balance_dirty_pages() from being called: we don't want to
42420+ * do this under directory i_mutex. */
42421+ context_set_commit_async(ctx);
42422+ reiser4_exit_context(ctx);
42423+ return new;
42424+}
42425+
42426+static reiser4_block_nr common_estimate_link(struct inode *parent,
42427+ struct inode *object);
42428+int reiser4_update_dir(struct inode *);
42429+
42430+/**
42431+ * reiser4_link_common - link of inode operations
42432+ * @existing: dentry of object which is to get new name
42433+ * @parent: directory where new name is to be created
42434+ * @newname: new name
42435+ *
42436+ * This is common implementation of vfs's link method of struct
42437+ * inode_operations.
42438+ */
42439+int reiser4_link_common(struct dentry *existing, struct inode *parent,
42440+ struct dentry *newname)
42441+{
42442+ reiser4_context *ctx;
42443+ int result;
42444+ struct inode *object;
42445+ dir_plugin *parent_dplug;
42446+ reiser4_dir_entry_desc entry;
42447+ reiser4_object_create_data data;
42448+ reiser4_block_nr reserve;
42449+
42450+ ctx = reiser4_init_context(parent->i_sb);
42451+ if (IS_ERR(ctx))
42452+ return PTR_ERR(ctx);
42453+
42454+ assert("nikita-1431", existing != NULL);
42455+ assert("nikita-1432", parent != NULL);
42456+ assert("nikita-1433", newname != NULL);
42457+
42458+ object = existing->d_inode;
42459+ assert("nikita-1434", object != NULL);
42460+
42461+ /* check for race with create_object() */
42462+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
42463+ context_set_commit_async(ctx);
42464+ reiser4_exit_context(ctx);
42465+ return RETERR(-E_REPEAT);
42466+ }
42467+
42468+ parent_dplug = inode_dir_plugin(parent);
42469+
42470+ memset(&entry, 0, sizeof entry);
42471+ entry.obj = object;
42472+
42473+ data.mode = object->i_mode;
42474+ data.id = inode_file_plugin(object)->h.id;
42475+
42476+ reserve = common_estimate_link(parent, existing->d_inode);
42477+ if ((__s64) reserve < 0) {
42478+ context_set_commit_async(ctx);
42479+ reiser4_exit_context(ctx);
42480+ return reserve;
42481+ }
42482+
42483+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42484+ context_set_commit_async(ctx);
42485+ reiser4_exit_context(ctx);
42486+ return RETERR(-ENOSPC);
42487+ }
42488+
42489+ /*
42490+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42491+ * means that link(2) can race against unlink(2) or rename(2), and
42492+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42493+ *
42494+ * For such inode we have to undo special processing done in
42495+ * reiser4_unlink() viz. creation of safe-link.
42496+ */
42497+ if (unlikely(object->i_nlink == 0)) {
42498+ result = safe_link_del(reiser4_tree_by_inode(object),
42499+ get_inode_oid(object), SAFE_UNLINK);
42500+ if (result != 0) {
42501+ context_set_commit_async(ctx);
42502+ reiser4_exit_context(ctx);
42503+ return result;
42504+ }
42505+ }
42506+
42507+ /* increment nlink of @existing and update its stat data */
42508+ result = reiser4_add_nlink(object, parent, 1);
42509+ if (result == 0) {
42510+ /* add entry to the parent */
42511+ result =
42512+ parent_dplug->add_entry(parent, newname, &data, &entry);
42513+ if (result != 0) {
42514+ /* failed to add entry to the parent, decrement nlink
42515+ of @existing */
42516+ reiser4_del_nlink(object, parent, 1);
42517+ /*
42518+ * now, if that failed, we have a file with too big
42519+ * nlink---space leak, much better than directory
42520+ * entry pointing to nowhere
42521+ */
42522+ }
42523+ }
42524+ if (result == 0) {
42525+ atomic_inc(&object->i_count);
42526+ /*
42527+ * Upon successful completion, link() shall mark for update
42528+ * the st_ctime field of the file. Also, the st_ctime and
42529+ * st_mtime fields of the directory that contains the new
42530+ * entry shall be marked for update. --SUS
42531+ */
42532+ result = reiser4_update_dir(parent);
42533+ }
42534+ if (result == 0)
42535+ d_instantiate(newname, existing->d_inode);
42536+
42537+ context_set_commit_async(ctx);
42538+ reiser4_exit_context(ctx);
42539+ return result;
42540+}
42541+
42542+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42543+
42544+/**
42545+ * reiser4_unlink_common - unlink of inode operations
42546+ * @parent: inode of directory to remove name from
42547+ * @victim: name to be removed
42548+ *
42549+ * This is common implementation of vfs's unlink method of struct
42550+ * inode_operations.
42551+ */
42552+int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
42553+{
42554+ reiser4_context *ctx;
42555+ int result;
42556+ struct inode *object;
42557+ file_plugin *fplug;
42558+
42559+ ctx = reiser4_init_context(parent->i_sb);
42560+ if (IS_ERR(ctx))
42561+ return PTR_ERR(ctx);
42562+
42563+ object = victim->d_inode;
42564+ fplug = inode_file_plugin(object);
42565+ assert("nikita-2882", fplug->detach != NULL);
42566+
42567+ result = unlink_check_and_grab(parent, victim);
42568+ if (result != 0) {
42569+ context_set_commit_async(ctx);
42570+ reiser4_exit_context(ctx);
42571+ return result;
42572+ }
42573+
42574+ result = fplug->detach(object, parent);
42575+ if (result == 0) {
42576+ dir_plugin *parent_dplug;
42577+ reiser4_dir_entry_desc entry;
42578+
42579+ parent_dplug = inode_dir_plugin(parent);
42580+ memset(&entry, 0, sizeof entry);
42581+
42582+ /* first, delete directory entry */
42583+ result = parent_dplug->rem_entry(parent, victim, &entry);
42584+ if (result == 0) {
42585+ /*
42586+ * if name was removed successfully, we _have_ to
42587+ * return 0 from this function, because upper level
42588+			 * caller (vfs_{rmdir,unlink}) expects this.
42589+ *
42590+ * now that directory entry is removed, update
42591+ * stat-data
42592+ */
42593+ reiser4_del_nlink(object, parent, 1);
42594+ /*
42595+ * Upon successful completion, unlink() shall mark for
42596+ * update the st_ctime and st_mtime fields of the
42597+ * parent directory. Also, if the file's link count is
42598+ * not 0, the st_ctime field of the file shall be
42599+ * marked for update. --SUS
42600+ */
42601+ reiser4_update_dir(parent);
42602+ /* add safe-link for this file */
42603+ if (object->i_nlink == 0)
42604+ safe_link_add(object, SAFE_UNLINK);
42605+ }
42606+ }
42607+
42608+ if (unlikely(result != 0)) {
42609+ if (result != -ENOMEM)
42610+ warning("nikita-3398", "Cannot unlink %llu (%i)",
42611+ (unsigned long long)get_inode_oid(object),
42612+ result);
42613+ /* if operation failed commit pending inode modifications to
42614+ * the stat-data */
42615+ reiser4_update_sd(object);
42616+ reiser4_update_sd(parent);
42617+ }
42618+
42619+ reiser4_release_reserved(object->i_sb);
42620+
42621+	/* @object's i_ctime was updated by the ->rem_link() method. */
42622+
42623+ /* @victim can be already removed from the disk by this time. Inode is
42624+ then marked so that iput() wouldn't try to remove stat data. But
42625+ inode itself is still there.
42626+ */
42627+
42628+ /*
42629+ * we cannot release directory semaphore here, because name has
42630+ * already been deleted, but dentry (@victim) still exists. Prevent
42631+ * balance_dirty_pages() from being called on exiting this context: we
42632+ * don't want to do this under directory i_mutex.
42633+ */
42634+ context_set_commit_async(ctx);
42635+ reiser4_exit_context(ctx);
42636+ return result;
42637+}
42638+
42639+/**
42640+ * reiser4_symlink_common - symlink of inode operations
42641+ * @parent: inode of parent directory
42642+ * @dentry: dentry of object to be created
42643+ * @linkname: string symlink is to contain
42644+ *
42645+ * This is common implementation of vfs's symlink method of struct
42646+ * inode_operations.
42647+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
42648+ */
42649+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
42650+ const char *linkname)
42651+{
42652+ reiser4_object_create_data data;
42653+
42654+ memset(&data, 0, sizeof data);
42655+ data.name = linkname;
42656+ data.id = SYMLINK_FILE_PLUGIN_ID;
42657+ data.mode = S_IFLNK | S_IRWXUGO;
42658+ return create_vfs_object(parent, dentry, &data);
42659+}
42660+
42661+/**
42662+ * reiser4_mkdir_common - mkdir of inode operations
42663+ * @parent: inode of parent directory
42664+ * @dentry: dentry of object to be created
42665+ * @mode: the permissions to use
42666+ *
42667+ * This is common implementation of vfs's mkdir method of struct
42668+ * inode_operations.
42669+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
42670+ */
42671+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42672+{
42673+ reiser4_object_create_data data;
42674+
42675+ memset(&data, 0, sizeof data);
42676+ data.mode = S_IFDIR | mode;
42677+ data.id = DIRECTORY_FILE_PLUGIN_ID;
42678+ return create_vfs_object(parent, dentry, &data);
42679+}
42680+
42681+/**
42682+ * reiser4_mknod_common - mknod of inode operations
42683+ * @parent: inode of parent directory
42684+ * @dentry: dentry of object to be created
42685+ * @mode: the permissions to use and file type
42686+ * @rdev: minor and major of new device file
42687+ *
42688+ * This is common implementation of vfs's mknod method of struct
42689+ * inode_operations.
42690+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
42691+ */
42692+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
42693+ int mode, dev_t rdev)
42694+{
42695+ reiser4_object_create_data data;
42696+
42697+ memset(&data, 0, sizeof data);
42698+ data.mode = mode;
42699+ data.rdev = rdev;
42700+ data.id = SPECIAL_FILE_PLUGIN_ID;
42701+ return create_vfs_object(parent, dentry, &data);
42702+}
42703+
42704+/*
42705+ * implementation of vfs's rename method of struct inode_operations for typical
42706+ * directory is in inode_ops_rename.c
42707+ */
42708+
42709+/**
42710+ * reiser4_follow_link_common - follow_link of inode operations
42711+ * @dentry: dentry of symlink
42712+ * @data:
42713+ *
42714+ * This is common implementation of vfs's follow_link method of struct
42715+ * inode_operations.
42716+ * Assumes that inode's i_private points to the content of symbolic link.
42717+ */
42718+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
42719+{
42720+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42721+
42722+ if (!dentry->d_inode->i_private
42723+ || !reiser4_inode_get_flag(dentry->d_inode,
42724+ REISER4_GENERIC_PTR_USED))
42725+ return ERR_PTR(RETERR(-EINVAL));
42726+ nd_set_link(nd, dentry->d_inode->i_private);
42727+ return NULL;
42728+}
42729+
42730+/**
42731+ * reiser4_permission_common - permission of inode operations
42732+ * @inode: inode to check permissions for
42733+ * @mask: mode bits to check permissions for
42734+ * @nameidata:
42735+ *
42736+ * Uses generic function to check for rwx permissions.
42737+ */
42738+int reiser4_permission_common(struct inode *inode, int mask,
42739+ struct nameidata *nameidata)
42740+{
42741+ return generic_permission(inode, mask, NULL);
42742+}
42743+
42744+static int setattr_reserve(reiser4_tree *);
42745+
42746+/* this is common implementation of vfs's setattr method of struct
42747+ inode_operations
42748+*/
42749+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
42750+{
42751+ reiser4_context *ctx;
42752+ struct inode *inode;
42753+ int result;
42754+
42755+ inode = dentry->d_inode;
42756+ result = inode_change_ok(inode, attr);
42757+ if (result)
42758+ return result;
42759+
42760+ ctx = reiser4_init_context(inode->i_sb);
42761+ if (IS_ERR(ctx))
42762+ return PTR_ERR(ctx);
42763+
42764+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42765+
42766+ /*
42767+ * grab disk space and call standard inode_setattr().
42768+ */
42769+ result = setattr_reserve(reiser4_tree_by_inode(inode));
42770+ if (!result) {
42771+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42772+ || (attr->ia_valid & ATTR_GID
42773+ && attr->ia_gid != inode->i_gid)) {
42774+ result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
42775+ if (result) {
42776+ context_set_commit_async(ctx);
42777+ reiser4_exit_context(ctx);
42778+ return result;
42779+ }
42780+ }
42781+ result = inode_setattr(inode, attr);
42782+ if (!result)
42783+ reiser4_update_sd(inode);
42784+ }
42785+
42786+ context_set_commit_async(ctx);
42787+ reiser4_exit_context(ctx);
42788+ return result;
42789+}
42790+
42791+/* this is common implementation of vfs's getattr method of struct
42792+ inode_operations
42793+*/
42794+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
42795+ struct dentry *dentry, struct kstat *stat)
42796+{
42797+ struct inode *obj;
42798+
42799+ assert("nikita-2298", dentry != NULL);
42800+ assert("nikita-2299", stat != NULL);
42801+ assert("nikita-2300", dentry->d_inode != NULL);
42802+
42803+ obj = dentry->d_inode;
42804+
42805+ stat->dev = obj->i_sb->s_dev;
42806+ stat->ino = oid_to_uino(get_inode_oid(obj));
42807+ stat->mode = obj->i_mode;
42808+ /* don't confuse userland with huge nlink. This is not entirely
42809+	 * correct, because nlink_t is not necessarily 16 bit signed. */
42810+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42811+ stat->uid = obj->i_uid;
42812+ stat->gid = obj->i_gid;
42813+ stat->rdev = obj->i_rdev;
42814+ stat->atime = obj->i_atime;
42815+ stat->mtime = obj->i_mtime;
42816+ stat->ctime = obj->i_ctime;
42817+ stat->size = obj->i_size;
42818+ stat->blocks =
42819+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42820+ /* "preferred" blocksize for efficient file system I/O */
42821+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42822+
42823+ return 0;
42824+}
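
[Worked example, not part of the patch: assuming VFS_BLKSIZE is the traditional
512 bytes for st_blocks (i.e. VFS_BLKSIZE_BITS == 9 — the identifiers suggest
this, but the hunk does not define them), a file occupying 1000 bytes reports
stat->blocks = (1000 + 512 - 1) >> 9 = 1511 >> 9 = 2, i.e. byte usage rounded
up to whole 512-byte units.]
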
42825+
42826+/* Estimate the maximum number of nodes which might be allocated or changed on
42827+ typical new object creation. Typical creation consists of calling create
42828+ method of file plugin, adding directory entry to parent and update parent
42829+ directory's stat data.
42830+*/
42831+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
42832+ struct inode *object
42833+ /* object */ )
42834+{
42835+ assert("vpf-309", parent != NULL);
42836+ assert("vpf-307", object != NULL);
42837+
42838+ return
42839+ /* object creation estimation */
42840+ inode_file_plugin(object)->estimate.create(object) +
42841+ /* stat data of parent directory estimation */
42842+ inode_file_plugin(parent)->estimate.update(parent) +
42843+ /* adding entry estimation */
42844+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
42845+ /* to undo in the case of failure */
42846+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
42847+}
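
[Worked example with assumed numbers, not part of the patch: if create() of the
object estimates 4 blocks, the parent's stat-data update 1, entry insertion 2,
and entry removal 2, the caller grabs 4 + 1 + 2 + 2 = 9 blocks. The final term
exists only so that a failed creation can still afford to take the new entry
back out of the parent.]
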
42848+
42849+/* Create child in directory.
42850+
42851+ . get object's plugin
42852+ . get fresh inode
42853+ . initialize inode
42854+ . add object's stat-data
42855+ . initialize object's directory
42856+ . add entry to the parent
42857+ . instantiate dentry
42858+
42859+*/
42860+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
42861+ object */
42862+ struct inode **retobj)
42863+{
42864+ int result;
42865+
42866+	struct dentry *dentry;	/* new name */
42867+	struct inode *parent;	/* parent object */
42868+
42869+ dir_plugin *par_dir; /* directory plugin on the parent */
42870+ dir_plugin *obj_dir; /* directory plugin on the new object */
42871+ file_plugin *obj_plug; /* object plugin on the new object */
42872+ struct inode *object; /* new object */
42873+ reiser4_block_nr reserve;
42874+
42875+ reiser4_dir_entry_desc entry; /* new directory entry */
42876+
42877+ assert("nikita-1420", data != NULL);
42878+ parent = data->parent;
42879+ dentry = data->dentry;
42880+
42881+ assert("nikita-1418", parent != NULL);
42882+ assert("nikita-1419", dentry != NULL);
42883+
42884+ /* check, that name is acceptable for parent */
42885+ par_dir = inode_dir_plugin(parent);
42886+ if (par_dir->is_name_acceptable &&
42887+ !par_dir->is_name_acceptable(parent,
42888+ dentry->d_name.name,
42889+ (int)dentry->d_name.len))
42890+ return RETERR(-ENAMETOOLONG);
42891+
42892+ result = 0;
42893+ obj_plug = file_plugin_by_id((int)data->id);
42894+ if (obj_plug == NULL) {
42895+ warning("nikita-430", "Cannot find plugin %i", data->id);
42896+ return RETERR(-ENOENT);
42897+ }
42898+ object = new_inode(parent->i_sb);
42899+ if (object == NULL)
42900+ return RETERR(-ENOMEM);
42901+ /* we'll update i_nlink below */
42902+ object->i_nlink = 0;
42903+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
42904+ * to simplify error handling: if some error occurs before i_ino is
42905+ * initialized with oid, i_ino should already be set to some
42906+ * distinguished value. */
42907+ object->i_ino = 0;
42908+
42909+ /* So that on error iput will be called. */
42910+ *retobj = object;
42911+
42912+ if (DQUOT_ALLOC_INODE(object)) {
42913+ DQUOT_DROP(object);
42914+ object->i_flags |= S_NOQUOTA;
42915+ return RETERR(-EDQUOT);
42916+ }
42917+
42918+ memset(&entry, 0, sizeof entry);
42919+ entry.obj = object;
42920+
42921+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
42922+ file_plugin_to_plugin(obj_plug));
42923+ result = obj_plug->set_plug_in_inode(object, parent, data);
42924+ if (result) {
42925+ warning("nikita-431", "Cannot install plugin %i on %llx",
42926+ data->id, (unsigned long long)get_inode_oid(object));
42927+ DQUOT_FREE_INODE(object);
42928+ object->i_flags |= S_NOQUOTA;
42929+ return result;
42930+ }
42931+
42932+ /* reget plugin after installation */
42933+ obj_plug = inode_file_plugin(object);
42934+
42935+ if (obj_plug->create_object == NULL) {
42936+ DQUOT_FREE_INODE(object);
42937+ object->i_flags |= S_NOQUOTA;
42938+ return RETERR(-EPERM);
42939+ }
42940+
42941+	/* if any of the hash, tail, sd or permission plugins for the newly
42942+	   created object are not set yet, set them here, inheriting them
42943+	   from the parent directory
42944+ */
42945+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
42946+ result = obj_plug->adjust_to_parent(object,
42947+ parent,
42948+ object->i_sb->s_root->d_inode);
42949+ if (result == 0)
42950+ result = finish_pset(object);
42951+ if (result != 0) {
42952+ warning("nikita-432", "Cannot inherit from %llx to %llx",
42953+ (unsigned long long)get_inode_oid(parent),
42954+ (unsigned long long)get_inode_oid(object));
42955+ DQUOT_FREE_INODE(object);
42956+ object->i_flags |= S_NOQUOTA;
42957+ return result;
42958+ }
42959+
42960+ /* setup inode and file-operations for this inode */
42961+ setup_inode_ops(object, data);
42962+
42963+ /* call file plugin's method to initialize plugin specific part of
42964+ * inode */
42965+ if (obj_plug->init_inode_data)
42966+ obj_plug->init_inode_data(object, data, 1 /*create */ );
42967+
42968+ /* obtain directory plugin (if any) for new object. */
42969+ obj_dir = inode_dir_plugin(object);
42970+ if (obj_dir != NULL && obj_dir->init == NULL) {
42971+ DQUOT_FREE_INODE(object);
42972+ object->i_flags |= S_NOQUOTA;
42973+ return RETERR(-EPERM);
42974+ }
42975+
42976+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
42977+
42978+ reserve = estimate_create_vfs_object(parent, object);
42979+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42980+ DQUOT_FREE_INODE(object);
42981+ object->i_flags |= S_NOQUOTA;
42982+ return RETERR(-ENOSPC);
42983+ }
42984+
42985+ /* mark inode `immutable'. We disable changes to the file being
42986+ created until valid directory entry for it is inserted. Otherwise,
42987+ if file were expanded and insertion of directory entry fails, we
42988+	   have to remove the file, but we only allotted enough space in the
42989+	   transaction to remove an _empty_ file. 3.x code used to remove stat
42990+ data in different transaction thus possibly leaking disk space on
42991+ crash. This all only matters if it's possible to access file
42992+ without name, for example, by inode number
42993+ */
42994+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
42995+
42996+ /* create empty object, this includes allocation of new objectid. For
42997+ directories this implies creation of dot and dotdot */
42998+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
42999+
43000+ /* mark inode as `loaded'. From this point onward
43001+ reiser4_delete_inode() will try to remove its stat-data. */
43002+ reiser4_inode_set_flag(object, REISER4_LOADED);
43003+
43004+ result = obj_plug->create_object(object, parent, data);
43005+ if (result != 0) {
43006+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43007+ if (result != -ENAMETOOLONG && result != -ENOMEM)
43008+ warning("nikita-2219",
43009+ "Failed to create sd for %llu",
43010+ (unsigned long long)get_inode_oid(object));
43011+ DQUOT_FREE_INODE(object);
43012+ object->i_flags |= S_NOQUOTA;
43013+ return result;
43014+ }
43015+
43016+ if (obj_dir != NULL)
43017+ result = obj_dir->init(object, parent, data);
43018+ if (result == 0) {
43019+ assert("nikita-434", !reiser4_inode_get_flag(object,
43020+ REISER4_NO_SD));
43021+ /* insert inode into VFS hash table */
43022+ insert_inode_hash(object);
43023+ /* create entry */
43024+ result = par_dir->add_entry(parent, dentry, data, &entry);
43025+ if (result == 0) {
43026+ result = reiser4_add_nlink(object, parent, 0);
43027+ /* If O_CREAT is set and the file did not previously
43028+ exist, upon successful completion, open() shall
43029+ mark for update the st_atime, st_ctime, and
43030+ st_mtime fields of the file and the st_ctime and
43031+ st_mtime fields of the parent directory. --SUS
43032+ */
43033+ /* @object times are already updated by
43034+ reiser4_add_nlink() */
43035+ if (result == 0)
43036+ reiser4_update_dir(parent);
43037+ if (result != 0)
43038+ /* cleanup failure to add nlink */
43039+ par_dir->rem_entry(parent, dentry, &entry);
43040+ }
43041+ if (result != 0)
43042+ /* cleanup failure to add entry */
43043+ obj_plug->detach(object, parent);
43044+ } else if (result != -ENOMEM)
43045+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
43046+ (unsigned long long)get_inode_oid(object), result);
43047+
43048+ /*
43049+ * update stat-data, committing all pending modifications to the inode
43050+ * fields.
43051+ */
43052+ reiser4_update_sd(object);
43053+ if (result != 0) {
43054+ DQUOT_FREE_INODE(object);
43055+ object->i_flags |= S_NOQUOTA;
43056+ /* if everything was ok (result == 0), parent stat-data is
43057+		 * already updated above (reiser4_update_dir()) */
43058+ reiser4_update_sd(parent);
43059+ /* failure to create entry, remove object */
43060+ obj_plug->delete_object(object);
43061+ }
43062+
43063+ /* file has name now, clear immutable flag */
43064+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43065+
43066+ /* on error, iput() will call ->delete_inode(). We should keep track
43067+ of the existence of stat-data for this inode and avoid attempt to
43068+ remove it in reiser4_delete_inode(). This is accomplished through
43069+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
43070+ */
43071+ return result;
43072+}
43073+
43074+/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
43075+ reiser4_mknod and reiser4_symlink
43076+*/
43077+static int
43078+create_vfs_object(struct inode *parent,
43079+ struct dentry *dentry, reiser4_object_create_data * data)
43080+{
43081+ reiser4_context *ctx;
43082+ int result;
43083+ struct inode *child;
43084+
43085+ ctx = reiser4_init_context(parent->i_sb);
43086+ if (IS_ERR(ctx))
43087+ return PTR_ERR(ctx);
43088+ context_set_commit_async(ctx);
43089+
43090+ data->parent = parent;
43091+ data->dentry = dentry;
43092+ child = NULL;
43093+ result = do_create_vfs_child(data, &child);
43094+ if (unlikely(result != 0)) {
43095+ if (child != NULL) {
43096+ reiser4_make_bad_inode(child);
43097+ iput(child);
43098+ }
43099+ } else
43100+ d_instantiate(dentry, child);
43101+
43102+ reiser4_exit_context(ctx);
43103+ return result;
43104+}
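
[Summary aside, not part of the patch: the four common entry points above differ
only in the reiser4_object_create_data they feed into create_vfs_object():

	reiser4_create_common	mode = S_IFREG | mode	id from parent's create plugin
	reiser4_symlink_common	mode = S_IFLNK | S_IRWXUGO	id = SYMLINK_FILE_PLUGIN_ID
	reiser4_mkdir_common	mode = S_IFDIR | mode	id = DIRECTORY_FILE_PLUGIN_ID
	reiser4_mknod_common	mode as given, plus rdev	id = SPECIAL_FILE_PLUGIN_ID]
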
43105+
43106+/* helper for link_common. Estimate disk space necessary to add a link
43107+ from @parent to @object
43108+*/
43109+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
43110+ struct inode *object
43111+					     /* object to which new link is being created */
43112+ )
43113+{
43114+ reiser4_block_nr res = 0;
43115+ file_plugin *fplug;
43116+ dir_plugin *dplug;
43117+
43118+ assert("vpf-317", object != NULL);
43119+ assert("vpf-318", parent != NULL);
43120+
43121+ fplug = inode_file_plugin(object);
43122+ dplug = inode_dir_plugin(parent);
43123+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
43124+ /* reiser4_add_nlink(object) */
43125+ res += fplug->estimate.update(object);
43126+ /* add_entry(parent) */
43127+ res += dplug->estimate.add_entry(parent);
43128+ /* reiser4_del_nlink(object) */
43129+ res += fplug->estimate.update(object);
43130+ /* update_dir(parent) */
43131+ res += inode_file_plugin(parent)->estimate.update(parent);
43132+ /* safe-link */
43133+ res += estimate_one_item_removal(reiser4_tree_by_inode(object));
43134+
43135+ return res;
43136+}
43137+
43138+/* Estimate disk space necessary to remove a link between @parent and
43139+ @object.
43140+*/
43141+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
43142+ struct inode *object
43143+					/* object whose link is being removed */
43144+ )
43145+{
43146+ reiser4_block_nr res = 0;
43147+ file_plugin *fplug;
43148+ dir_plugin *dplug;
43149+
43150+ assert("vpf-317", object != NULL);
43151+ assert("vpf-318", parent != NULL);
43152+
43153+ fplug = inode_file_plugin(object);
43154+ dplug = inode_dir_plugin(parent);
43155+
43156+ /* rem_entry(parent) */
43157+ res += dplug->estimate.rem_entry(parent);
43158+ /* reiser4_del_nlink(object) */
43159+ res += fplug->estimate.update(object);
43160+ /* update_dir(parent) */
43161+ res += inode_file_plugin(parent)->estimate.update(parent);
43162+ /* fplug->unlink */
43163+ res += fplug->estimate.unlink(object, parent);
43164+ /* safe-link */
43165+ res += estimate_one_insert_item(reiser4_tree_by_inode(object));
43166+
43167+ return res;
43168+}
43169+
43170+/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
43171+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43172+{
43173+ file_plugin *fplug;
43174+ struct inode *child;
43175+ int result;
43176+
43177+ result = 0;
43178+ child = victim->d_inode;
43179+ fplug = inode_file_plugin(child);
43180+
43181+ /* check for race with create_object() */
43182+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
43183+ return RETERR(-E_REPEAT);
43184+ /* object being deleted should have stat data */
43185+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
43186+
43187+ /* ask object plugin */
43188+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43189+ return RETERR(-ENOTEMPTY);
43190+
43191+ result = (int)estimate_unlink(parent, child);
43192+ if (result < 0)
43193+ return result;
43194+
43195+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43196+}
43197+
43198+/* helper for reiser4_setattr_common */
43199+static int setattr_reserve(reiser4_tree * tree)
43200+{
43201+ assert("vs-1096", is_grab_enabled(get_current_context()));
43202+ return reiser4_grab_space(estimate_one_insert_into_item(tree),
43203+ BA_CAN_COMMIT);
43204+}
43205+
43206+/* helper function. Standards require that for many file-system operations
43207+   on success the ctime and mtime of the parent directory are to be updated. */
43208+int reiser4_update_dir(struct inode *dir)
43209+{
43210+ assert("nikita-2525", dir != NULL);
43211+
43212+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43213+ return reiser4_update_sd(dir);
43214+}
43215diff -urN linux-2.6.20.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.20/fs/reiser4/plugin/inode_ops_rename.c
43216--- linux-2.6.20.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
43217+++ linux-2.6.20/fs/reiser4/plugin/inode_ops_rename.c 2007-05-06 14:50:43.795005721 +0400
43218@@ -0,0 +1,914 @@
43219+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43220+ * reiser4/README */
43221+
43222+#include "../inode.h"
43223+#include "../safe_link.h"
43224+
43225+static const char *possible_leak = "Possible disk space leak.";
43226+
43227+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43228+
43229+ Helper function called from hashed_rename() */
43230+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
43231+ * to be re-targeted at */
43232+ struct inode *from_dir, /* directory where @from_coord
43233+ * lives */
43234+ struct inode *from_inode, /* inode @from_coord
43235+ * originally point to */
43236+ coord_t * from_coord, /* where directory entry is in
43237+ * the tree */
43238+ lock_handle * from_lh /* lock handle on @from_coord */ )
43239+{
43240+ item_plugin *from_item;
43241+ int result;
43242+ znode *node;
43243+
43244+ coord_clear_iplug(from_coord);
43245+ node = from_coord->node;
43246+ result = zload(node);
43247+ if (result != 0)
43248+ return result;
43249+ from_item = item_plugin_by_coord(from_coord);
43250+ if (plugin_of_group(item_plugin_by_coord(from_coord),
43251+ DIR_ENTRY_ITEM_TYPE))
43252+ {
43253+ reiser4_key to_key;
43254+
43255+ build_sd_key(to_inode, &to_key);
43256+
43257+ /* everything is found and prepared to change directory entry
43258+ at @from_coord to point to @to_inode.
43259+
43260+ @to_inode is just about to get new name, so bump its link
43261+ counter.
43262+
43263+ */
43264+ result = reiser4_add_nlink(to_inode, from_dir, 0);
43265+ if (result != 0) {
43266+ /* Don't issue warning: this may be plain -EMLINK */
43267+ zrelse(node);
43268+ return result;
43269+ }
43270+
43271+ result =
43272+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43273+ if (result != 0) {
43274+ reiser4_del_nlink(to_inode, from_dir, 0);
43275+ zrelse(node);
43276+ return result;
43277+ }
43278+
43279+ /* @from_inode just lost its name, he-he.
43280+
43281+ If @from_inode was directory, it contained dotdot pointing
43282+ to @from_dir. @from_dir i_nlink will be decreased when
43283+ iput() will be called on @from_inode.
43284+
43285+ If file-system is not ADG (hard-links are
43286+ supported on directories), iput(from_inode) will not remove
43287+ @from_inode, and thus above is incorrect, but hard-links on
43288+ directories are problematic in many other respects.
43289+ */
43290+ result = reiser4_del_nlink(from_inode, from_dir, 0);
43291+ if (result != 0) {
43292+ warning("nikita-2330",
43293+ "Cannot remove link from source: %i. %s",
43294+ result, possible_leak);
43295+ }
43296+ /* Has to return success, because entry is already
43297+ * modified. */
43298+ result = 0;
43299+
43300+		/* NOTE-NIKITA consider calling plugin method instead of
43301+ accessing inode fields directly. */
43302+ from_dir->i_mtime = CURRENT_TIME;
43303+ } else {
43304+ warning("nikita-2326", "Unexpected item type");
43305+ result = RETERR(-EIO);
43306+ }
43307+ zrelse(node);
43308+ return result;
43309+}
43310+
43311+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43312+
43313+ Helper function used by hashed_rename(). */
43314+static int add_name(struct inode *inode, /* inode where @coord is to be
43315+ * re-targeted at */
43316+ struct inode *dir, /* directory where @coord lives */
43317+ struct dentry *name, /* new name */
43318+ coord_t * coord, /* where directory entry is in the tree */
43319+ lock_handle * lh, /* lock handle on @coord */
43320+ int is_dir /* true, if @inode is directory */ )
43321+{
43322+ int result;
43323+ reiser4_dir_entry_desc entry;
43324+
43325+ assert("nikita-2333", lh->node == coord->node);
43326+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43327+
43328+ memset(&entry, 0, sizeof entry);
43329+ entry.obj = inode;
43330+ /* build key of directory entry description */
43331+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43332+
43333+ /* ext2 does this in different order: first inserts new entry,
43334+	   then increases directory nlink. We don't want to do this,
43335+	   because reiser4_add_nlink() calls ->add_link() plugin
43336+	   method that can fail for whatever reason, leaving us with
43337+ cleanup problems.
43338+ */
43339+ /* @inode is getting new name */
43340+ reiser4_add_nlink(inode, dir, 0);
43341+ /* create @new_name in @new_dir pointing to
43342+ @old_inode */
43343+ result = WITH_COORD(coord,
43344+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43345+ coord,
43346+ lh,
43347+ name,
43348+ &entry));
43349+ if (result != 0) {
43350+ int result2;
43351+ result2 = reiser4_del_nlink(inode, dir, 0);
43352+ if (result2 != 0) {
43353+ warning("nikita-2327",
43354+ "Cannot drop link on %lli %i. %s",
43355+ (unsigned long long)get_inode_oid(inode),
43356+ result2, possible_leak);
43357+ }
43358+ } else
43359+ INODE_INC_FIELD(dir, i_size);
43360+ return result;
43361+}
43362+
43363+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43364+ struct dentry *old_name, /* old name */
43365+ struct inode *new_dir, /* directory where @new is located */
43366+ struct dentry *new_name /* new name */ )
43367+{
43368+ reiser4_block_nr res1, res2;
43369+ dir_plugin *p_parent_old, *p_parent_new;
43370+ file_plugin *p_child_old, *p_child_new;
43371+
43372+ assert("vpf-311", old_dir != NULL);
43373+ assert("vpf-312", new_dir != NULL);
43374+ assert("vpf-313", old_name != NULL);
43375+ assert("vpf-314", new_name != NULL);
43376+
43377+ p_parent_old = inode_dir_plugin(old_dir);
43378+ p_parent_new = inode_dir_plugin(new_dir);
43379+ p_child_old = inode_file_plugin(old_name->d_inode);
43380+ if (new_name->d_inode)
43381+ p_child_new = inode_file_plugin(new_name->d_inode);
43382+ else
43383+ p_child_new = NULL;
43384+
43385+ /* find_entry - can insert one leaf. */
43386+ res1 = res2 = 1;
43387+
43388+ /* replace_name */
43389+ {
43390+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43391+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43392+ /* update key */
43393+ res1 += 1;
43394+ /* reiser4_del_nlink(p_child_new) */
43395+ if (p_child_new)
43396+ res1 += p_child_new->estimate.update(new_name->d_inode);
43397+ }
43398+
43399+ /* else add_name */
43400+ {
43401+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43402+ res2 +=
43403+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43404+ /* reiser4_add_nlink(p_parent_old) */
43405+ res2 += p_child_old->estimate.update(old_name->d_inode);
43406+ /* add_entry(p_parent_new) */
43407+ res2 += p_parent_new->estimate.add_entry(new_dir);
43408+ /* reiser4_del_nlink(p_parent_old) */
43409+ res2 += p_child_old->estimate.update(old_name->d_inode);
43410+ }
43411+
43412+ res1 = res1 < res2 ? res2 : res1;
43413+
43414+ /* reiser4_write_sd(p_parent_new) */
43415+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43416+
43417+ /* reiser4_write_sd(p_child_new) */
43418+ if (p_child_new)
43419+ res1 += p_child_new->estimate.update(new_name->d_inode);
43420+
43421+ /* hashed_rem_entry(p_parent_old) */
43422+ res1 += p_parent_old->estimate.rem_entry(old_dir);
43423+
43424+ /* reiser4_del_nlink(p_child_old) */
43425+ res1 += p_child_old->estimate.update(old_name->d_inode);
43426+
43427+ /* replace_name */
43428+ {
43429+ /* reiser4_add_nlink(p_parent_dir_new) */
43430+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43431+ /* update_key */
43432+ res1 += 1;
43433+ /* reiser4_del_nlink(p_parent_new) */
43434+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43435+ /* reiser4_del_nlink(p_parent_old) */
43436+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43437+ }
43438+
43439+ /* reiser4_write_sd(p_parent_old) */
43440+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43441+
43442+ /* reiser4_write_sd(p_child_old) */
43443+ res1 += p_child_old->estimate.update(old_name->d_inode);
43444+
43445+ return res1;
43446+}
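
[Worked reading of the sums above, not part of the patch. Writing U(x) for a
stat-data update estimate: the grab is 1 + max(2*U(old child) + 1 + U(new child),
2*U(new dir) + 2*U(old child) + add_entry(new dir)) for the replace_name versus
add_name alternatives, plus U(new dir) + U(new child) + rem_entry(old dir) +
U(old child) for retiring the old name, plus 2*U(new dir) + U(old dir) + 1 for
the possible dotdot re-targeting, plus the final U(old dir) + U(old child)
stat-data writes.]
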
43447+
43448+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43449+ struct dentry *old_name, /* old name */
43450+ struct inode *new_dir, /* directory where @new is located */
43451+ struct dentry *new_name
43452+ /* new name */ )
43453+{
43454+ reiser4_block_nr reserve;
43455+
43456+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43457+
43458+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43459+ return RETERR(-ENOSPC);
43460+
43461+ return 0;
43462+}
43463+
43464+/* check whether @old_inode and @new_inode can be moved within file system
43465+ * tree. This singles out attempts to rename pseudo-files, for example. */
43466+static int can_rename(struct inode *old_dir, struct inode *old_inode,
43467+ struct inode *new_dir, struct inode *new_inode)
43468+{
43469+ file_plugin *fplug;
43470+ dir_plugin *dplug;
43471+
43472+ assert("nikita-3370", old_inode != NULL);
43473+
43474+ dplug = inode_dir_plugin(new_dir);
43475+ fplug = inode_file_plugin(old_inode);
43476+
43477+ if (dplug == NULL)
43478+ return RETERR(-ENOTDIR);
43479+ else if (new_dir->i_op->create == NULL)
43480+ return RETERR(-EPERM);
43481+ else if (!fplug->can_add_link(old_inode))
43482+ return RETERR(-EMLINK);
43483+ else if (new_inode != NULL) {
43484+ fplug = inode_file_plugin(new_inode);
43485+ if (fplug->can_rem_link != NULL &&
43486+ !fplug->can_rem_link(new_inode))
43487+ return RETERR(-EBUSY);
43488+ }
43489+ return 0;
43490+}
43491+
43492+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
43493+ znode_lock_mode, reiser4_dir_entry_desc *);
43494+int reiser4_update_dir(struct inode *);
43495+
43496+/* this is common implementation of vfs's rename method of struct
43497+ inode_operations
43498+ See comments in the body.
43499+
43500+ It is arguable that this function can be made generic so, that it
43501+ will be applicable to any kind of directory plugin that deals with
43502+ directories composed out of directory entries. The only obstacle
43503+ here is that we don't have any data-type to represent directory
43504+ entry. This should be re-considered when more than one different
43505+ directory plugin will be implemented.
43506+*/
43507+int reiser4_rename_common(struct inode *old_dir /* directory where @old
43508+ * is located */ ,
43509+ struct dentry *old_name /* old name */ ,
43510+ struct inode *new_dir /* directory where @new
43511+ * is located */ ,
43512+ struct dentry *new_name /* new name */ )
43513+{
43514+ /* From `The Open Group Base Specifications Issue 6'
43515+
43516+ If either the old or new argument names a symbolic link, rename()
43517+ shall operate on the symbolic link itself, and shall not resolve
43518+ the last component of the argument. If the old argument and the new
43519+ argument resolve to the same existing file, rename() shall return
43520+ successfully and perform no other action.
43521+
43522+ [this is done by VFS: vfs_rename()]
43523+
43524+ If the old argument points to the pathname of a file that is not a
43525+ directory, the new argument shall not point to the pathname of a
43526+ directory.
43527+
43528+ [checked by VFS: vfs_rename->may_delete()]
43529+
43530+ If the link named by the new argument exists, it shall
43531+ be removed and old renamed to new. In this case, a link named new
43532+ shall remain visible to other processes throughout the renaming
43533+ operation and refer either to the file referred to by new or old
43534+ before the operation began.
43535+
43536+ [we should assure this]
43537+
43538+ Write access permission is required for
43539+ both the directory containing old and the directory containing new.
43540+
43541+ [checked by VFS: vfs_rename->may_delete(), may_create()]
43542+
43543+ If the old argument points to the pathname of a directory, the new
43544+ argument shall not point to the pathname of a file that is not a
43545+ directory.
43546+
43547+ [checked by VFS: vfs_rename->may_delete()]
43548+
43549+ If the directory named by the new argument exists, it
43550+ shall be removed and old renamed to new. In this case, a link named
43551+ new shall exist throughout the renaming operation and shall refer
43552+ either to the directory referred to by new or old before the
43553+ operation began.
43554+
43555+ [we should assure this]
43556+
43557+ If new names an existing directory, it shall be
43558+ required to be an empty directory.
43559+
43560+ [we should check this]
43561+
43562+ If the old argument points to a pathname of a symbolic link, the
43563+ symbolic link shall be renamed. If the new argument points to a
43564+ pathname of a symbolic link, the symbolic link shall be removed.
43565+
43566+ The new pathname shall not contain a path prefix that names
43567+ old. Write access permission is required for the directory
43568+ containing old and the directory containing new. If the old
43569+ argument points to the pathname of a directory, write access
43570+ permission may be required for the directory named by old, and, if
43571+ it exists, the directory named by new.
43572+
43573+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
43574+
43575+ If the link named by the new argument exists and the file's link
43576+ count becomes 0 when it is removed and no process has the file
43577+ open, the space occupied by the file shall be freed and the file
43578+ shall no longer be accessible. If one or more processes have the
43579+ file open when the last link is removed, the link shall be removed
43580+ before rename() returns, but the removal of the file contents shall
43581+ be postponed until all references to the file are closed.
43582+
43583+ [iput() handles this, but we can do this manually, a la
43584+ reiser4_unlink()]
43585+
43586+ Upon successful completion, rename() shall mark for update the
43587+ st_ctime and st_mtime fields of the parent directory of each file.
43588+
43589+ [N/A]
43590+
43591+ */
43592+ reiser4_context *ctx;
43593+ int result;
43594+ int is_dir; /* is @old_name directory */
43595+
43596+ struct inode *old_inode;
43597+ struct inode *new_inode;
43598+ coord_t *new_coord;
43599+
43600+ reiser4_dentry_fsdata *new_fsdata;
43601+ dir_plugin *dplug;
43602+ file_plugin *fplug;
43603+
43604+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43605+ lock_handle *new_lh, *dotdot_lh;
43606+ struct dentry *dotdot_name;
43607+ reiser4_dentry_fsdata *dataonstack;
43608+
43609+ ctx = reiser4_init_context(old_dir->i_sb);
43610+ if (IS_ERR(ctx))
43611+ return PTR_ERR(ctx);
43612+
43613+ old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43614+ sizeof(*dotdot_name) + sizeof(*dataonstack),
43615+ reiser4_ctx_gfp_mask_get());
43616+ if (old_entry == NULL) {
43617+ context_set_commit_async(ctx);
43618+ reiser4_exit_context(ctx);
43619+ return RETERR(-ENOMEM);
43620+ }
43621+ memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43622+ sizeof(*dotdot_name) + sizeof(*dataonstack));
43623+
43624+ new_entry = old_entry + 1;
43625+ dotdot_entry = old_entry + 2;
43626+ new_lh = (lock_handle *)(old_entry + 3);
43627+ dotdot_lh = new_lh + 1;
43628+ dotdot_name = (struct dentry *)(new_lh + 2);
43629+ dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
43630+
43631+ assert("nikita-2318", old_dir != NULL);
43632+ assert("nikita-2319", new_dir != NULL);
43633+ assert("nikita-2320", old_name != NULL);
43634+ assert("nikita-2321", new_name != NULL);
43635+
43636+ old_inode = old_name->d_inode;
43637+ new_inode = new_name->d_inode;
43638+
43639+ dplug = inode_dir_plugin(old_dir);
43640+ fplug = NULL;
43641+
43642+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
43643+ if (IS_ERR(new_fsdata)) {
43644+ kfree(old_entry);
43645+ context_set_commit_async(ctx);
43646+ reiser4_exit_context(ctx);
43647+ return PTR_ERR(new_fsdata);
43648+ }
43649+
43650+ new_coord = &new_fsdata->dec.entry_coord;
43651+ coord_clear_iplug(new_coord);
43652+
43653+ is_dir = S_ISDIR(old_inode->i_mode);
43654+
43655+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43656+
43657+ /* if target is existing directory and it's not empty---return error.
43658+
43659+ This check is done specifically, because is_dir_empty() requires
43660+	   tree traversal and has to be done before locks are taken.
43661+ */
43662+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43663+ kfree(old_entry);
43664+ context_set_commit_async(ctx);
43665+ reiser4_exit_context(ctx);
43666+ return RETERR(-ENOTEMPTY);
43667+ }
43668+
43669+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
43670+ if (result != 0) {
43671+ kfree(old_entry);
43672+ context_set_commit_async(ctx);
43673+ reiser4_exit_context(ctx);
43674+ return result;
43675+ }
43676+
43677+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
43678+ new_dir, new_name);
43679+ if (result != 0) {
43680+ kfree(old_entry);
43681+ context_set_commit_async(ctx);
43682+ reiser4_exit_context(ctx);
43683+ return result;
43684+ }
43685+
43686+ init_lh(new_lh);
43687+
43688+ /* find entry for @new_name */
43689+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
43690+ new_entry);
43691+
43692+ if (IS_CBKERR(result)) {
43693+ done_lh(new_lh);
43694+ kfree(old_entry);
43695+ context_set_commit_async(ctx);
43696+ reiser4_exit_context(ctx);
43697+ return result;
43698+ }
43699+
43700+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
43701+
43702+ /* add or replace name for @old_inode as @new_name */
43703+ if (new_inode != NULL) {
43704+ /* target (@new_name) exists. */
43705+ /* Not clear what to do with objects that are
43706+ both directories and files at the same time. */
43707+ if (result == CBK_COORD_FOUND) {
43708+ result = replace_name(old_inode,
43709+ new_dir,
43710+ new_inode, new_coord, new_lh);
43711+ if (result == 0)
43712+ fplug = inode_file_plugin(new_inode);
43713+ } else if (result == CBK_COORD_NOTFOUND) {
43714+ /* VFS told us that @new_name is bound to existing
43715+ inode, but we failed to find directory entry. */
43716+ warning("nikita-2324", "Target not found");
43717+ result = RETERR(-ENOENT);
43718+ }
43719+ } else {
43720+ /* target (@new_name) doesn't exist. */
43721+ if (result == CBK_COORD_NOTFOUND)
43722+ result = add_name(old_inode,
43723+ new_dir,
43724+ new_name, new_coord, new_lh, is_dir);
43725+ else if (result == CBK_COORD_FOUND) {
43726+ /* VFS told us that @new_name is "negative" dentry,
43727+ but we found directory entry. */
43728+ warning("nikita-2331", "Target found unexpectedly");
43729+ result = RETERR(-EIO);
43730+ }
43731+ }
43732+
43733+ assert("nikita-3462", ergo(result == 0,
43734+ old_inode->i_nlink >= 2 + !!is_dir));
43735+
43736+ /* We are done with all modifications to the @new_dir, release lock on
43737+ node. */
43738+ done_lh(new_lh);
43739+
43740+ if (fplug != NULL) {
43741+ /* detach @new_inode from name-space */
43742+ result = fplug->detach(new_inode, new_dir);
43743+ if (result != 0)
43744+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
43745+ (unsigned long long)get_inode_oid(new_inode),
43746+ result, possible_leak);
43747+ }
43748+
43749+ if (new_inode != NULL)
43750+ reiser4_update_sd(new_inode);
43751+
43752+ if (result == 0) {
43753+ old_entry->obj = old_inode;
43754+
43755+ dplug->build_entry_key(old_dir,
43756+ &old_name->d_name, &old_entry->key);
43757+
43758+ /* At this stage new name was introduced for
43759+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43760+ counters were updated.
43761+
43762+ We want to remove @old_name now. If @old_inode wasn't
43763+ directory this is simple.
43764+ */
43765+ result = dplug->rem_entry(old_dir, old_name, old_entry);
43766+ if (result != 0 && result != -ENOMEM) {
43767+ warning("nikita-2335",
43768+ "Cannot remove old name: %i", result);
43769+ } else {
43770+ result = reiser4_del_nlink(old_inode, old_dir, 0);
43771+ if (result != 0 && result != -ENOMEM) {
43772+ warning("nikita-2337",
43773+ "Cannot drop link on old: %i", result);
43774+ }
43775+ }
43776+
43777+ if (result == 0 && is_dir) {
43778+ /* @old_inode is directory. We also have to update
43779+ dotdot entry. */
43780+ coord_t *dotdot_coord;
43781+
43782+ memset(dataonstack, 0, sizeof *dataonstack);
43783+ memset(dotdot_entry, 0, sizeof *dotdot_entry);
43784+ dotdot_entry->obj = old_dir;
43785+ memset(dotdot_name, 0, sizeof *dotdot_name);
43786+ dotdot_name->d_name.name = "..";
43787+ dotdot_name->d_name.len = 2;
43788+ /*
43789+ * allocate ->d_fsdata on the stack to avoid using
43790+ * reiser4_get_dentry_fsdata(). Locking is not needed,
43791+ * because dentry is private to the current thread.
43792+ */
43793+ dotdot_name->d_fsdata = dataonstack;
43794+ init_lh(dotdot_lh);
43795+
43796+ dotdot_coord = &dataonstack->dec.entry_coord;
43797+ coord_clear_iplug(dotdot_coord);
43798+
43799+ result = reiser4_find_entry(old_inode, dotdot_name,
43800+ dotdot_lh, ZNODE_WRITE_LOCK,
43801+ dotdot_entry);
43802+ if (result == 0) {
43803+ /* replace_name() decreases i_nlink on
43804+ * @old_dir */
43805+ result = replace_name(new_dir,
43806+ old_inode,
43807+ old_dir,
43808+ dotdot_coord, dotdot_lh);
43809+ } else
43810+ result = RETERR(-EIO);
43811+ done_lh(dotdot_lh);
43812+ }
43813+ }
43814+ reiser4_update_dir(new_dir);
43815+ reiser4_update_dir(old_dir);
43816+ reiser4_update_sd(old_inode);
43817+ if (result == 0) {
43818+ file_plugin *fplug;
43819+
43820+ if (new_inode != NULL) {
43821+ /* add safe-link for target file (in case we removed
43822+ * last reference to the poor fellow) */
43823+ fplug = inode_file_plugin(new_inode);
43824+ if (new_inode->i_nlink == 0)
43825+ result = safe_link_add(new_inode, SAFE_UNLINK);
43826+ }
43827+ }
43828+ kfree(old_entry);
43829+ context_set_commit_async(ctx);
43830+ reiser4_exit_context(ctx);
43831+ return result;
43832+}
43833+
43834+#if 0
43835+int reiser4_rename_common(struct inode *old_dir /* directory where @old
43836+ * is located */ ,
43837+ struct dentry *old_name /* old name */ ,
43838+ struct inode *new_dir /* directory where @new
43839+ * is located */ ,
43840+ struct dentry *new_name /* new name */ )
43841+{
43842+ /* From `The Open Group Base Specifications Issue 6'
43843+
43844+ If either the old or new argument names a symbolic link, rename()
43845+ shall operate on the symbolic link itself, and shall not resolve
43846+ the last component of the argument. If the old argument and the new
43847+ argument resolve to the same existing file, rename() shall return
43848+ successfully and perform no other action.
43849+
43850+ [this is done by VFS: vfs_rename()]
43851+
43852+ If the old argument points to the pathname of a file that is not a
43853+ directory, the new argument shall not point to the pathname of a
43854+ directory.
43855+
43856+ [checked by VFS: vfs_rename->may_delete()]
43857+
43858+ If the link named by the new argument exists, it shall
43859+ be removed and old renamed to new. In this case, a link named new
43860+ shall remain visible to other processes throughout the renaming
43861+ operation and refer either to the file referred to by new or old
43862+ before the operation began.
43863+
43864+ [we should assure this]
43865+
43866+ Write access permission is required for
43867+ both the directory containing old and the directory containing new.
43868+
43869+ [checked by VFS: vfs_rename->may_delete(), may_create()]
43870+
43871+ If the old argument points to the pathname of a directory, the new
43872+ argument shall not point to the pathname of a file that is not a
43873+ directory.
43874+
43875+ [checked by VFS: vfs_rename->may_delete()]
43876+
43877+ If the directory named by the new argument exists, it
43878+ shall be removed and old renamed to new. In this case, a link named
43879+ new shall exist throughout the renaming operation and shall refer
43880+ either to the directory referred to by new or old before the
43881+ operation began.
43882+
43883+ [we should assure this]
43884+
43885+ If new names an existing directory, it shall be
43886+ required to be an empty directory.
43887+
43888+ [we should check this]
43889+
43890+ If the old argument points to a pathname of a symbolic link, the
43891+ symbolic link shall be renamed. If the new argument points to a
43892+ pathname of a symbolic link, the symbolic link shall be removed.
43893+
43894+ The new pathname shall not contain a path prefix that names
43895+ old. Write access permission is required for the directory
43896+ containing old and the directory containing new. If the old
43897+ argument points to the pathname of a directory, write access
43898+ permission may be required for the directory named by old, and, if
43899+ it exists, the directory named by new.
43900+
43901+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
43902+
43903+ If the link named by the new argument exists and the file's link
43904+ count becomes 0 when it is removed and no process has the file
43905+ open, the space occupied by the file shall be freed and the file
43906+ shall no longer be accessible. If one or more processes have the
43907+ file open when the last link is removed, the link shall be removed
43908+ before rename() returns, but the removal of the file contents shall
43909+ be postponed until all references to the file are closed.
43910+
43911+ [iput() handles this, but we can do this manually, a la
43912+ reiser4_unlink()]
43913+
43914+ Upon successful completion, rename() shall mark for update the
43915+ st_ctime and st_mtime fields of the parent directory of each file.
43916+
43917+ [N/A]
43918+
43919+ */
43920+ reiser4_context *ctx;
43921+ int result;
43922+ int is_dir; /* is @old_name directory */
43923+ struct inode *old_inode;
43924+ struct inode *new_inode;
43925+ reiser4_dir_entry_desc old_entry;
43926+ reiser4_dir_entry_desc new_entry;
43927+ coord_t *new_coord;
43928+ reiser4_dentry_fsdata *new_fsdata;
43929+ lock_handle new_lh;
43930+ dir_plugin *dplug;
43931+ file_plugin *fplug;
43932+
43933+ ctx = reiser4_init_context(old_dir->i_sb);
43934+ if (IS_ERR(ctx))
43935+ return PTR_ERR(ctx);
43936+
43937+ assert("nikita-2318", old_dir != NULL);
43938+ assert("nikita-2319", new_dir != NULL);
43939+ assert("nikita-2320", old_name != NULL);
43940+ assert("nikita-2321", new_name != NULL);
43941+
43942+ old_inode = old_name->d_inode;
43943+ new_inode = new_name->d_inode;
43944+
43945+ dplug = inode_dir_plugin(old_dir);
43946+ fplug = NULL;
43947+
43948+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
43949+ if (IS_ERR(new_fsdata)) {
43950+ result = PTR_ERR(new_fsdata);
43951+ goto exit;
43952+ }
43953+
43954+ new_coord = &new_fsdata->dec.entry_coord;
43955+ coord_clear_iplug(new_coord);
43956+
43957+ is_dir = S_ISDIR(old_inode->i_mode);
43958+
43959+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43960+
43961+ /* if target is an existing directory and it's not empty---return error.
43962+
43963+ This check is done explicitly here, because is_dir_empty() requires
43964+ tree traversal and has to be done before locks are taken.
43965+ */
43966+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
43967+ return RETERR(-ENOTEMPTY);
43968+
43969+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
43970+ if (result != 0)
43971+ goto exit;
43972+
43973+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
43974+ new_dir, new_name);
43975+ if (result != 0)
43976+ goto exit;
43977+
43978+ init_lh(&new_lh);
43979+
43980+ /* find entry for @new_name */
43981+ result = reiser4_find_entry(new_dir, new_name, &new_lh,
43982+ ZNODE_WRITE_LOCK, &new_entry);
43983+
43984+ if (IS_CBKERR(result)) {
43985+ done_lh(&new_lh);
43986+ goto exit;
43987+ }
43988+
43989+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
43990+
43991+ /* add or replace name for @old_inode as @new_name */
43992+ if (new_inode != NULL) {
43993+ /* target (@new_name) exists. */
43994+ /* Not clear what to do with objects that are
43995+ both directories and files at the same time. */
43996+ if (result == CBK_COORD_FOUND) {
43997+ result = replace_name(old_inode,
43998+ new_dir,
43999+ new_inode, new_coord, &new_lh);
44000+ if (result == 0)
44001+ fplug = inode_file_plugin(new_inode);
44002+ } else if (result == CBK_COORD_NOTFOUND) {
44003+ /* VFS told us that @new_name is bound to existing
44004+ inode, but we failed to find directory entry. */
44005+ warning("nikita-2324", "Target not found");
44006+ result = RETERR(-ENOENT);
44007+ }
44008+ } else {
44009+ /* target (@new_name) doesn't exist. */
44010+ if (result == CBK_COORD_NOTFOUND)
44011+ result = add_name(old_inode,
44012+ new_dir,
44013+ new_name, new_coord, &new_lh, is_dir);
44014+ else if (result == CBK_COORD_FOUND) {
44015+ /* VFS told us that @new_name is "negative" dentry,
44016+ but we found directory entry. */
44017+ warning("nikita-2331", "Target found unexpectedly");
44018+ result = RETERR(-EIO);
44019+ }
44020+ }
44021+
44022+ assert("nikita-3462", ergo(result == 0,
44023+ old_inode->i_nlink >= 2 + !!is_dir));
44024+
44025+ /* We are done with all modifications to the @new_dir, release lock on
44026+ node. */
44027+ done_lh(&new_lh);
44028+
44029+ if (fplug != NULL) {
44030+ /* detach @new_inode from name-space */
44031+ result = fplug->detach(new_inode, new_dir);
44032+ if (result != 0)
44033+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
44034+ (unsigned long long)get_inode_oid(new_inode),
44035+ result, possible_leak);
44036+ }
44037+
44038+ if (new_inode != NULL)
44039+ reiser4_update_sd(new_inode);
44040+
44041+ if (result == 0) {
44042+ memset(&old_entry, 0, sizeof old_entry);
44043+ old_entry.obj = old_inode;
44044+
44045+ dplug->build_entry_key(old_dir,
44046+ &old_name->d_name, &old_entry.key);
44047+
44048+ /* At this stage new name was introduced for
44049+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
44050+ counters were updated.
44051+
44052+ We want to remove @old_name now. If @old_inode wasn't
44053+ directory this is simple.
44054+ */
44055+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
44056+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
44057+ if (result != 0 && result != -ENOMEM) {
44058+ warning("nikita-2335",
44059+ "Cannot remove old name: %i", result);
44060+ } else {
44061+ result = reiser4_del_nlink(old_inode, old_dir, 0);
44062+ if (result != 0 && result != -ENOMEM) {
44063+ warning("nikita-2337",
44064+ "Cannot drop link on old: %i", result);
44065+ }
44066+ }
44067+
44068+ if (result == 0 && is_dir) {
44069+ /* @old_inode is directory. We also have to update
44070+ dotdot entry. */
44071+ coord_t *dotdot_coord;
44072+ lock_handle dotdot_lh;
44073+ struct dentry dotdot_name;
44074+ reiser4_dir_entry_desc dotdot_entry;
44075+ reiser4_dentry_fsdata dataonstack;
44076+ reiser4_dentry_fsdata *fsdata;
44077+
44078+ memset(&dataonstack, 0, sizeof dataonstack);
44079+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
44080+ dotdot_entry.obj = old_dir;
44081+ memset(&dotdot_name, 0, sizeof dotdot_name);
44082+ dotdot_name.d_name.name = "..";
44083+ dotdot_name.d_name.len = 2;
44084+ /*
44085+ * allocate ->d_fsdata on the stack to avoid using
44086+ * reiser4_get_dentry_fsdata(). Locking is not needed,
44087+ * because dentry is private to the current thread.
44088+ */
44089+ dotdot_name.d_fsdata = &dataonstack;
44090+ init_lh(&dotdot_lh);
44091+
44092+ fsdata = &dataonstack;
44093+ dotdot_coord = &fsdata->dec.entry_coord;
44094+ coord_clear_iplug(dotdot_coord);
44095+
44096+ result = reiser4_find_entry(old_inode,
44097+ &dotdot_name,
44098+ &dotdot_lh,
44099+ ZNODE_WRITE_LOCK,
44100+ &dotdot_entry);
44101+ if (result == 0) {
44102+ /* replace_name() decreases i_nlink on
44103+ * @old_dir */
44104+ result = replace_name(new_dir,
44105+ old_inode,
44106+ old_dir,
44107+ dotdot_coord, &dotdot_lh);
44108+ } else
44109+ result = RETERR(-EIO);
44110+ done_lh(&dotdot_lh);
44111+ }
44112+ }
44113+ reiser4_update_dir(new_dir);
44114+ reiser4_update_dir(old_dir);
44115+ reiser4_update_sd(old_inode);
44116+ if (result == 0) {
44117+ file_plugin *fplug;
44118+
44119+ if (new_inode != NULL) {
44120+ /* add safe-link for target file (in case we removed
44121+ * last reference to the poor fellow) */
44122+ fplug = inode_file_plugin(new_inode);
44123+ if (new_inode->i_nlink == 0)
44124+ result = safe_link_add(new_inode, SAFE_UNLINK);
44125+ }
44126+ }
44127+ exit:
44128+ context_set_commit_async(ctx);
44129+ reiser4_exit_context(ctx);
44130+ return result;
44131+}
44132+#endif
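
The rename implementation above deliberately replaces seven separate local
variables (three dir-entry descriptors, two lock handles, a dentry, and a
dentry fsdata) with a single kmalloc() and then carves the individual
objects out of that buffer with pointer arithmetic (old_entry + 1,
new_lh + 2, and so on), so that every error path can unwind with one
kfree(). A minimal userspace sketch of that carving pattern follows; the
struct names are hypothetical, and it assumes, as the code above does, that
the carved types impose no stricter alignment than the ones laid out before
them:

#include <errno.h>
#include <stdlib.h>

struct entry { long key; };    /* stand-in for reiser4_dir_entry_desc */
struct lockh { long state; };  /* stand-in for lock_handle */

static int carve_example(void)
{
	struct entry *e;
	struct lockh *lh1, *lh2;

	/* one zeroed allocation sized for all three objects */
	e = calloc(1, sizeof(*e) + 2 * sizeof(*lh1));
	if (e == NULL)
		return -ENOMEM;
	lh1 = (struct lockh *)(e + 1);  /* first byte past the entry */
	lh2 = lh1 + 1;
	/* ... use e, lh1, lh2 ... */
	free(e);                        /* one call releases all three */
	return 0;
}
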
44133diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/acl.h linux-2.6.20/fs/reiser4/plugin/item/acl.h
44134--- linux-2.6.20.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
44135+++ linux-2.6.20/fs/reiser4/plugin/item/acl.h 2007-05-06 14:50:43.799006970 +0400
44136@@ -0,0 +1,66 @@
44137+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44138+
44139+/* Directory entry. */
44140+
44141+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44142+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44143+
44144+#include "../../forward.h"
44145+#include "../../dformat.h"
44146+#include "../../kassign.h"
44147+#include "../../key.h"
44148+
44149+#include <linux/fs.h>
44150+#include <linux/dcache.h> /* for struct dentry */
44151+
44152+typedef struct directory_entry_format {
44153+ /* key of object stat-data. It's not necessary to store whole
44154+ key here, because it's always key of stat-data, so minor
44155+ packing locality and offset can be omitted here. But this
44156+ relies on particular key allocation scheme for stat-data, so,
44157+ for extensibility sake, whole key can be stored here.
44158+
44159+ We store key as array of bytes, because we don't want 8-byte
44160+ alignment of dir entries.
44161+ */
44162+ obj_key_id id;
44163+ /* file name. Null terminated string. */
44164+ d8 name[0];
44165+} directory_entry_format;
44166+
44167+void print_de(const char *prefix, coord_t * coord);
44168+int extract_key_de(const coord_t * coord, reiser4_key * key);
44169+int update_key_de(const coord_t * coord, const reiser4_key * key,
44170+ lock_handle * lh);
44171+char *extract_name_de(const coord_t * coord, char *buf);
44172+unsigned extract_file_type_de(const coord_t * coord);
44173+int add_entry_de(struct inode *dir, coord_t * coord,
44174+ lock_handle * lh, const struct dentry *name,
44175+ reiser4_dir_entry_desc * entry);
44176+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44177+ lock_handle * lh, reiser4_dir_entry_desc * entry);
44178+int max_name_len_de(const struct inode *dir);
44179+
44180+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44181+
44182+char *extract_dent_name(const coord_t * coord,
44183+ directory_entry_format * dent, char *buf);
44184+
44185+#if REISER4_LARGE_KEY
44186+#define DE_NAME_BUF_LEN (24)
44187+#else
44188+#define DE_NAME_BUF_LEN (16)
44189+#endif
44190+
44191+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44192+#endif
44193+
44194+/* Make Linus happy.
44195+ Local variables:
44196+ c-indentation-style: "K&R"
44197+ mode-name: "LC"
44198+ c-basic-offset: 8
44199+ tab-width: 8
44200+ fill-column: 120
44201+ End:
44202+*/
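
The directory_entry_format above stores the stat-data key as raw bytes
(obj_key_id) and appends the NUL-terminated name as a trailing zero-length
array, precisely so that entries can be packed back to back with no 8-byte
alignment padding. A hedged userspace model of that layout, with a
simplified key size and the trailing array written as a C99 flexible array
member:

#include <stdlib.h>
#include <string.h>

struct dent {
	unsigned char id[16];  /* packed key bytes; real size differs */
	char name[];           /* NUL-terminated file name */
};

/* allocate an entry sized exactly for its name, as the item layout does */
static struct dent *dent_new(const unsigned char *id, const char *name)
{
	struct dent *d = malloc(sizeof(*d) + strlen(name) + 1);

	if (d == NULL)
		return NULL;
	memcpy(d->id, id, sizeof(d->id));
	strcpy(d->name, name);  /* bounded: the buffer was sized above */
	return d;
}
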
44203diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.20/fs/reiser4/plugin/item/blackbox.c
44204--- linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
44205+++ linux-2.6.20/fs/reiser4/plugin/item/blackbox.c 2007-05-06 14:50:43.799006970 +0400
44206@@ -0,0 +1,142 @@
44207+/* Copyright 2003 by Hans Reiser, licensing governed by
44208+ * reiser4/README */
44209+
44210+/* Black box item implementation */
44211+
44212+#include "../../forward.h"
44213+#include "../../debug.h"
44214+#include "../../dformat.h"
44215+#include "../../kassign.h"
44216+#include "../../coord.h"
44217+#include "../../tree.h"
44218+#include "../../lock.h"
44219+
44220+#include "blackbox.h"
44221+#include "item.h"
44222+#include "../plugin.h"
44223+
44224+int
44225+store_black_box(reiser4_tree * tree,
44226+ const reiser4_key * key, void *data, int length)
44227+{
44228+ int result;
44229+ reiser4_item_data idata;
44230+ coord_t coord;
44231+ lock_handle lh;
44232+
44233+ memset(&idata, 0, sizeof idata);
44234+
44235+ idata.data = data;
44236+ idata.user = 0;
44237+ idata.length = length;
44238+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44239+
44240+ init_lh(&lh);
44241+ result = insert_by_key(tree, key,
44242+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44243+
44244+ assert("nikita-3413",
44245+ ergo(result == 0,
44246+ WITH_COORD(&coord,
44247+ item_length_by_coord(&coord) == length)));
44248+
44249+ done_lh(&lh);
44250+ return result;
44251+}
44252+
44253+int
44254+load_black_box(reiser4_tree * tree,
44255+ reiser4_key * key, void *data, int length, int exact)
44256+{
44257+ int result;
44258+ coord_t coord;
44259+ lock_handle lh;
44260+
44261+ init_lh(&lh);
44262+ result = coord_by_key(tree, key,
44263+ &coord, &lh, ZNODE_READ_LOCK,
44264+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44265+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44266+
44267+ if (result == 0) {
44268+ int ilen;
44269+
44270+ result = zload(coord.node);
44271+ if (result == 0) {
44272+ ilen = item_length_by_coord(&coord);
44273+ if (ilen <= length) {
44274+ memcpy(data, item_body_by_coord(&coord), ilen);
44275+ unit_key_by_coord(&coord, key);
44276+ } else if (exact) {
44277+ /*
44278+ * item is larger than buffer provided by the
44279+ * user. Only issue a warning if @exact is
44280+ * set. If @exact is false, we are iterating
44281+ * over all safe-links and here we are reaching
44282+ * the end of the iteration.
44283+ */
44284+ warning("nikita-3415",
44285+ "Wrong black box length: %i > %i",
44286+ ilen, length);
44287+ result = RETERR(-EIO);
44288+ }
44289+ zrelse(coord.node);
44290+ }
44291+ }
44292+
44293+ done_lh(&lh);
44294+ return result;
44295+
44296+}
44297+
44298+int
44299+update_black_box(reiser4_tree * tree,
44300+ const reiser4_key * key, void *data, int length)
44301+{
44302+ int result;
44303+ coord_t coord;
44304+ lock_handle lh;
44305+
44306+ init_lh(&lh);
44307+ result = coord_by_key(tree, key,
44308+ &coord, &lh, ZNODE_READ_LOCK,
44309+ FIND_EXACT,
44310+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44311+ if (result == 0) {
44312+ int ilen;
44313+
44314+ result = zload(coord.node);
44315+ if (result == 0) {
44316+ ilen = item_length_by_coord(&coord);
44317+ if (length <= ilen) {
44318+ memcpy(item_body_by_coord(&coord), data,
44319+ length);
44320+ } else {
44321+ warning("nikita-3437",
44322+ "Wrong black box length: %i < %i",
44323+ ilen, length);
44324+ result = RETERR(-EIO);
44325+ }
44326+ zrelse(coord.node);
44327+ }
44328+ }
44329+
44330+ done_lh(&lh);
44331+ return result;
44332+
44333+}
44334+
44335+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44336+{
44337+ return reiser4_cut_tree(tree, key, key, NULL, 1);
44338+}
44339+
44340+/* Make Linus happy.
44341+ Local variables:
44342+ c-indentation-style: "K&R"
44343+ mode-name: "LC"
44344+ c-basic-offset: 8
44345+ tab-width: 8
44346+ fill-column: 120
44347+ End:
44348+*/
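
Taken together, store_black_box(), load_black_box(), update_black_box() and
kill_black_box() form a small fixed-width record store addressed directly by
tree key at LEAF_LEVEL; safe-links are the in-tree user mentioned above. A
hedged in-kernel usage sketch follows, with error handling trimmed and the
construction of the tree and key left to the caller; my_record is a
hypothetical payload:

struct my_record {
	__u64 blocks;
};

static int roundtrip_black_box(reiser4_tree *tree, reiser4_key *key)
{
	struct my_record rec = { .blocks = 128 }, back;
	int ret;

	ret = store_black_box(tree, key, &rec, sizeof(rec));
	if (ret != 0)
		return ret;
	/* exact lookup: load_black_box() warns and fails with -EIO
	 * if the stored item is larger than the buffer */
	return load_black_box(tree, key, &back, sizeof(back), 1);
}
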
44349diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.20/fs/reiser4/plugin/item/blackbox.h
44350--- linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
44351+++ linux-2.6.20/fs/reiser4/plugin/item/blackbox.h 2007-05-06 14:50:43.799006970 +0400
44352@@ -0,0 +1,33 @@
44353+/* Copyright 2003 by Hans Reiser, licensing governed by
44354+ * reiser4/README */
44355+
44356+/* "Black box" entry to fixed-width contain user supplied data */
44357+
44358+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44359+#define __FS_REISER4_BLACK_BOX_H__
44360+
44361+#include "../../forward.h"
44362+#include "../../dformat.h"
44363+#include "../../kassign.h"
44364+#include "../../key.h"
44365+
44366+extern int store_black_box(reiser4_tree * tree,
44367+ const reiser4_key * key, void *data, int length);
44368+extern int load_black_box(reiser4_tree * tree,
44369+ reiser4_key * key, void *data, int length, int exact);
44370+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44371+extern int update_black_box(reiser4_tree * tree,
44372+ const reiser4_key * key, void *data, int length);
44373+
44374+/* __FS_REISER4_BLACK_BOX_H__ */
44375+#endif
44376+
44377+/* Make Linus happy.
44378+ Local variables:
44379+ c-indentation-style: "K&R"
44380+ mode-name: "LC"
44381+ c-basic-offset: 8
44382+ tab-width: 8
44383+ fill-column: 120
44384+ End:
44385+*/
44386diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/cde.c linux-2.6.20/fs/reiser4/plugin/item/cde.c
44387--- linux-2.6.20.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
44388+++ linux-2.6.20/fs/reiser4/plugin/item/cde.c 2007-05-06 14:50:43.799006970 +0400
44389@@ -0,0 +1,1008 @@
44390+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44391+
44392+/* Directory entry implementation */
44393+
44394+/* DESCRIPTION:
44395+
44396+ This is "compound" directory item plugin implementation. This directory
44397+ item type is compound (as opposed to the "simple directory item" in
44398+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44399+ entries.
44400+
44401+ The reason behind this decision is disk space efficiency: all directory
44402+ entries inside the same directory have an identical fragment in their
44403+ keys. This, of course, depends on key assignment policy. In our default key
44404+ assignment policy, all directory entries have the same locality which is
44405+ equal to the object id of their directory.
44406+
44407+ Composing a directory item out of several directory entries for the same
44408+ directory allows us to store said key fragment only once. That is, an
44409+ ad hoc form of key compression (stem compression) is implemented here,
44410+ because general key compression is not supposed to be implemented in
44411+ v4.0.
44412+
44413+ Another decision that was made regarding all directory item plugins is
44414+ that they will store entry keys unaligned. This is for the sake of disk
44415+ space efficiency again.
44416+
44417+ It should be noted that storing keys unaligned increases CPU consumption,
44418+ at least on some architectures.
44419+
44420+ Internal on-disk structure of the compound directory item is the following:
44421+
44422+ HEADER cde_item_format. Here number of entries is stored.
44423+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44424+ ENTRY_HEADER_1 offset of entry body are stored.
44425+ ENTRY_HEADER_2 (basically two last parts of key)
44426+ ...
44427+ ENTRY_HEADER_N
44428+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44429+ ENTRY_BODY_1 NUL-terminated name are stored.
44430+ ENTRY_BODY_2 (part of stat-data key in the
44431+ sense that since all SDs have
44432+ zero offset, this offset is not
44433+ stored on disk).
44434+ ...
44435+ ENTRY_BODY_N
44436+
44437+ When it comes to balancing, each directory entry in a compound directory
44438+ item is a unit, that is, something that can be cut from one item and pasted
44439+ into another item of the same type. Handling of unit cut and paste is the
44440+ major reason for the complexity of the code below.
44441+
44442+*/
44443+
44444+#include "../../forward.h"
44445+#include "../../debug.h"
44446+#include "../../dformat.h"
44447+#include "../../kassign.h"
44448+#include "../../key.h"
44449+#include "../../coord.h"
44450+#include "sde.h"
44451+#include "cde.h"
44452+#include "item.h"
44453+#include "../node/node.h"
44454+#include "../plugin.h"
44455+#include "../../znode.h"
44456+#include "../../carry.h"
44457+#include "../../tree.h"
44458+#include "../../inode.h"
44459+
44460+#include <linux/fs.h> /* for struct inode */
44461+#include <linux/dcache.h> /* for struct dentry */
44462+#include <linux/quotaops.h>
44463+
44464+#if 0
44465+#define CHECKME(coord) \
44466+({ \
44467+ const char *message; \
44468+ coord_t dup; \
44469+ \
44470+ coord_dup_nocheck(&dup, (coord)); \
44471+ dup.unit_pos = 0; \
44472+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
44473+})
44474+#else
44475+#define CHECKME(coord) noop
44476+#endif
44477+
44478+/* return body of compound directory item at @coord */
44479+static inline cde_item_format *formatted_at(const coord_t * coord)
44480+{
44481+ assert("nikita-1282", coord != NULL);
44482+ return item_body_by_coord(coord);
44483+}
44484+
44485+/* return entry header at @coord */
44486+static inline cde_unit_header *header_at(const coord_t *
44487+ coord /* coord of item */ ,
44488+ int idx /* index of unit */ )
44489+{
44490+ assert("nikita-1283", coord != NULL);
44491+ return &formatted_at(coord)->entry[idx];
44492+}
44493+
44494+/* return number of units in compound directory item at @coord */
44495+static int units(const coord_t * coord /* coord of item */ )
44496+{
44497+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44498+}
44499+
44500+/* return offset of the body of @idx-th entry in @coord */
44501+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44502+ int idx /* index of unit */ )
44503+{
44504+ if (idx < units(coord))
44505+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44506+ else if (idx == units(coord))
44507+ return item_length_by_coord(coord);
44508+ else
44509+ impossible("nikita-1308", "Wrong idx");
44510+ return 0;
44511+}
44512+
44513+/* set offset of the body of @idx-th entry in @coord */
44514+static void set_offset(const coord_t * coord /* coord of item */ ,
44515+ int idx /* index of unit */ ,
44516+ unsigned int offset /* new offset */ )
44517+{
44518+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44519+}
44520+
44521+static void adj_offset(const coord_t * coord /* coord of item */ ,
44522+ int idx /* index of unit */ ,
44523+ int delta /* offset change */ )
44524+{
44525+ d16 *doffset;
44526+ __u16 offset;
44527+
44528+ doffset = &header_at(coord, idx)->offset;
44529+ offset = le16_to_cpu(get_unaligned(doffset));
44530+ offset += delta;
44531+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
44532+}
44533+
44534+/* return pointer to @offset-th byte from the beginning of @coord */
44535+static char *address(const coord_t * coord /* coord of item */ ,
44536+ int offset)
44537+{
44538+ return ((char *)item_body_by_coord(coord)) + offset;
44539+}
44540+
44541+/* return pointer to the body of @idx-th entry in @coord */
44542+static directory_entry_format *entry_at(const coord_t * coord /* coord of
44543+ * item */ ,
44544+ int idx /* index of unit */ )
44545+{
44546+ return (directory_entry_format *) address(coord,
44547+ (int)offset_of(coord, idx));
44548+}
44549+
44550+/* return number of unit referenced by @coord */
44551+static int idx_of(const coord_t * coord /* coord of item */ )
44552+{
44553+ assert("nikita-1285", coord != NULL);
44554+ return coord->unit_pos;
44555+}
44556+
44557+/* find position where entry with @entry_key would be inserted into @coord */
44558+static int find(const coord_t * coord /* coord of item */ ,
44559+ const reiser4_key * entry_key /* key to look for */ ,
44560+ cmp_t * last /* result of last comparison */ )
44561+{
44562+ int entries;
44563+
44564+ int left;
44565+ int right;
44566+
44567+ cde_unit_header *header;
44568+
44569+ assert("nikita-1295", coord != NULL);
44570+ assert("nikita-1296", entry_key != NULL);
44571+ assert("nikita-1297", last != NULL);
44572+
44573+ entries = units(coord);
44574+ left = 0;
44575+ right = entries - 1;
44576+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44577+ int median;
44578+
44579+ median = (left + right) >> 1;
44580+
44581+ header = header_at(coord, median);
44582+ *last = de_id_key_cmp(&header->hash, entry_key);
44583+ switch (*last) {
44584+ case LESS_THAN:
44585+ left = median;
44586+ break;
44587+ case GREATER_THAN:
44588+ right = median;
44589+ break;
44590+ case EQUAL_TO:{
44591+ do {
44592+ median--;
44593+ header--;
44594+ } while (median >= 0 &&
44595+ de_id_key_cmp(&header->hash,
44596+ entry_key) == EQUAL_TO);
44597+ return median + 1;
44598+ }
44599+ }
44600+ }
44601+ header = header_at(coord, left);
44602+ for (; left < entries; ++left, ++header) {
44603+ prefetch(header + 1);
44604+ *last = de_id_key_cmp(&header->hash, entry_key);
44605+ if (*last != LESS_THAN)
44606+ break;
44607+ }
44608+ if (left < entries)
44609+ return left;
44610+ else
44611+ return RETERR(-ENOENT);
44612+
44613+}
44614+
44615+/* expand @coord as to accommodate for insertion of @no new entries starting
44616+ from @pos, with total bodies size @size. */
44617+static int expand_item(const coord_t * coord /* coord of item */ ,
44618+ int pos /* unit position */ , int no /* number of new
44619+ * units*/ ,
44620+ int size /* total size of new units' data */ ,
44621+ unsigned int data_size /* free space already reserved
44622+ * in the item for insertion */ )
44623+{
44624+ int entries;
44625+ cde_unit_header *header;
44626+ char *dent;
44627+ int i;
44628+
44629+ assert("nikita-1310", coord != NULL);
44630+ assert("nikita-1311", pos >= 0);
44631+ assert("nikita-1312", no > 0);
44632+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44633+ assert("nikita-1343",
44634+ item_length_by_coord(coord) >=
44635+ (int)(size + data_size + no * sizeof *header));
44636+
44637+ entries = units(coord);
44638+
44639+ if (pos == entries)
44640+ dent = address(coord, size);
44641+ else
44642+ dent = (char *)entry_at(coord, pos);
44643+ /* place where new header will be in */
44644+ header = header_at(coord, pos);
44645+ /* free space for new entry headers */
44646+ memmove(header + no, header,
44647+ (unsigned)(address(coord, size) - (char *)header));
44648+ /* if adding to the end initialise first new header */
44649+ if (pos == entries) {
44650+ set_offset(coord, pos, (unsigned)size);
44651+ }
44652+
44653+ /* adjust entry pointer and size */
44654+ dent = dent + no * sizeof *header;
44655+ size += no * sizeof *header;
44656+ /* free space for new entries */
44657+ memmove(dent + data_size, dent,
44658+ (unsigned)(address(coord, size) - dent));
44659+
44660+ /* increase counter */
44661+ entries += no;
44662+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44663+
44664+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44665+ bytes. */
44666+ for (i = 0; i <= pos; ++i)
44667+ adj_offset(coord, i, no * sizeof *header);
44668+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
44669+ sizeof *header + data_size ) bytes */
44670+ for (i = pos + no; i < entries; ++i)
44671+ adj_offset(coord, i, no * sizeof *header + data_size);
44672+ return 0;
44673+}
44674+
44675+/* insert new @entry into item */
44676+static int expand(const coord_t * coord /* coord of item */ ,
44677+ cde_entry * entry /* entry to insert */ ,
44678+ int len /* length of @entry data */ ,
44679+ int *pos /* position to insert */ ,
44680+ reiser4_dir_entry_desc * dir_entry /* parameters for new
44681+ * entry */ )
44682+{
44683+ cmp_t cmp_res;
44684+ int datasize;
44685+
44686+ *pos = find(coord, &dir_entry->key, &cmp_res);
44687+ if (*pos < 0)
44688+ *pos = units(coord);
44689+
44690+ datasize = sizeof(directory_entry_format);
44691+ if (is_longname(entry->name->name, entry->name->len))
44692+ datasize += entry->name->len + 1;
44693+
44694+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44695+ datasize);
44696+ return 0;
44697+}
44698+
44699+/* paste body of @entry into item */
44700+static int paste_entry(const coord_t * coord /* coord of item */ ,
44701+ cde_entry * entry /* new entry */ ,
44702+ int pos /* position to insert */ ,
44703+ reiser4_dir_entry_desc * dir_entry /* parameters for
44704+ * new entry */ )
44705+{
44706+ cde_unit_header *header;
44707+ directory_entry_format *dent;
44708+ const char *name;
44709+ int len;
44710+
44711+ header = header_at(coord, pos);
44712+ dent = entry_at(coord, pos);
44713+
44714+ build_de_id_by_key(&dir_entry->key, &header->hash);
44715+ build_inode_key_id(entry->obj, &dent->id);
44716+ /* AUDIT unsafe strcpy() operation! It should be replaced with
44717+ much less CPU hungry
44718+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
44719+
44720+ Also a more major thing is that there should be a way to figure out
44721+ amount of space in dent -> name and be able to check that we are
44722+ not going to overwrite more than we supposed to */
44723+ name = entry->name->name;
44724+ len = entry->name->len;
44725+ if (is_longname(name, len)) {
44726+ strcpy((unsigned char *)dent->name, name);
44727+ put_unaligned(0, &dent->name[len]);
44728+ }
44729+ return 0;
44730+}
44731+
44732+/* estimate how much space is necessary in item to insert/paste set of entries
44733+ described in @data. */
44734+int estimate_cde(const coord_t * coord /* coord of item */ ,
44735+ const reiser4_item_data * data /* parameters for new item */ )
44736+{
44737+ cde_entry_data *e;
44738+ int result;
44739+ int i;
44740+
44741+ e = (cde_entry_data *) data->data;
44742+
44743+ assert("nikita-1288", e != NULL);
44744+ assert("nikita-1289", e->num_of_entries >= 0);
44745+
44746+ if (coord == NULL)
44747+ /* insert */
44748+ result = sizeof(cde_item_format);
44749+ else
44750+ /* paste */
44751+ result = 0;
44752+
44753+ result += e->num_of_entries *
44754+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44755+ for (i = 0; i < e->num_of_entries; ++i) {
44756+ const char *name;
44757+ int len;
44758+
44759+ name = e->entry[i].name->name;
44760+ len = e->entry[i].name->len;
44761+ assert("nikita-2054", strlen(name) == len);
44762+ if (is_longname(name, len))
44763+ result += len + 1;
44764+ }
44765+ ((reiser4_item_data *) data)->length = result;
44766+ return result;
44767+}
44768+
44769+/* ->nr_units() method for this item plugin. */
44770+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44771+{
44772+ return units(coord);
44773+}
44774+
44775+/* ->unit_key() method for this item plugin. */
44776+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44777+ reiser4_key * key /* resulting key */ )
44778+{
44779+ assert("nikita-1452", coord != NULL);
44780+ assert("nikita-1345", idx_of(coord) < units(coord));
44781+ assert("nikita-1346", key != NULL);
44782+
44783+ item_key_by_coord(coord, key);
44784+ extract_key_from_de_id(extract_dir_id_from_key(key),
44785+ &header_at(coord, idx_of(coord))->hash, key);
44786+ return key;
44787+}
44788+
44789+/* mergeable_cde(): implementation of ->mergeable() item method.
44790+
44791+ Two directory items are mergeable iff they are from the same
44792+ directory. It's that simple.
44793+
44794+*/
44795+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44796+ const coord_t * p2 /* coord of second item */ )
44797+{
44798+ reiser4_key k1;
44799+ reiser4_key k2;
44800+
44801+ assert("nikita-1339", p1 != NULL);
44802+ assert("nikita-1340", p2 != NULL);
44803+
44804+ return
44805+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
44806+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
44807+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
44808+
44809+}
44810+
44811+/* ->max_key_inside() method for this item plugin. */
44812+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
44813+ reiser4_key * result /* resulting key */ )
44814+{
44815+ assert("nikita-1342", coord != NULL);
44816+
44817+ item_key_by_coord(coord, result);
44818+ set_key_ordering(result, get_key_ordering(reiser4_max_key()));
44819+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
44820+ set_key_offset(result, get_key_offset(reiser4_max_key()));
44821+ return result;
44822+}
44823+
44824+/* @data contains data which are to be put into tree */
44825+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
44826+ const reiser4_key * key /* key to check */ ,
44827+ const reiser4_item_data * data /* parameters of new
44828+ * item/unit being
44829+ * created */ )
44830+{
44831+ reiser4_key item_key;
44832+
44833+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
44834+ data->iplug is initialized */
44835+ assert("vs-457", data && data->iplug);
44836+/* assert( "vs-553", data -> user == 0 );*/
44837+ item_key_by_coord(coord, &item_key);
44838+
44839+ return (item_plugin_by_coord(coord) == data->iplug) &&
44840+ (extract_dir_id_from_key(&item_key) ==
44841+ extract_dir_id_from_key(key));
44842+}
44843+
44844+#if REISER4_DEBUG
44845+/* cde_check ->check() method for compressed directory items
44846+
44847+ used for debugging, every item should have here the most complete
44848+ possible check of the consistency of the item that the inventor can
44849+ construct
44850+*/
44851+int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
44852+ const char **error /* where to store error message */)
44853+{
44854+ int i;
44855+ int result;
44856+ char *item_start;
44857+ char *item_end;
44858+ reiser4_key key;
44859+
44860+ coord_t c;
44861+
44862+ assert("nikita-1357", coord != NULL);
44863+ assert("nikita-1358", error != NULL);
44864+
44865+ if (!ergo(coord->item_pos != 0,
44866+ is_dot_key(item_key_by_coord(coord, &key)))) {
44867+ *error = "CDE doesn't start with dot";
44868+ return -1;
44869+ }
44870+ item_start = item_body_by_coord(coord);
44871+ item_end = item_start + item_length_by_coord(coord);
44872+
44873+ coord_dup(&c, coord);
44874+ result = 0;
44875+ for (i = 0; i < units(coord); ++i) {
44876+ directory_entry_format *entry;
44877+
44878+ if ((char *)(header_at(coord, i) + 1) >
44879+ item_end - units(coord) * sizeof *entry) {
44880+ *error = "CDE header is out of bounds";
44881+ result = -1;
44882+ break;
44883+ }
44884+ entry = entry_at(coord, i);
44885+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
44886+ *error = "CDE header is too low";
44887+ result = -1;
44888+ break;
44889+ }
44890+ if ((char *)(entry + 1) > item_end) {
44891+ *error = "CDE header is too high";
44892+ result = -1;
44893+ break;
44894+ }
44895+ }
44896+
44897+ return result;
44898+}
44899+#endif
44900+
44901+/* ->init() method for this item plugin. */
44902+int init_cde(coord_t * coord /* coord of item */ ,
44903+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
44904+ UNUSED_ARG)
44905+{
44906+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
44907+ return 0;
44908+}
44909+
44910+/* ->lookup() method for this item plugin. */
44911+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
44912+ lookup_bias bias /* search bias */ ,
44913+ coord_t * coord /* coord of item to lookup in */ )
44914+{
44915+ cmp_t last_comp;
44916+ int pos;
44917+
44918+ reiser4_key utmost_key;
44919+
44920+ assert("nikita-1293", coord != NULL);
44921+ assert("nikita-1294", key != NULL);
44922+
44923+ CHECKME(coord);
44924+
44925+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
44926+ coord->unit_pos = 0;
44927+ coord->between = BEFORE_UNIT;
44928+ return CBK_COORD_NOTFOUND;
44929+ }
44930+ pos = find(coord, key, &last_comp);
44931+ if (pos >= 0) {
44932+ coord->unit_pos = (int)pos;
44933+ switch (last_comp) {
44934+ case EQUAL_TO:
44935+ coord->between = AT_UNIT;
44936+ return CBK_COORD_FOUND;
44937+ case GREATER_THAN:
44938+ coord->between = BEFORE_UNIT;
44939+ return RETERR(-ENOENT);
44940+ case LESS_THAN:
44941+ default:
44942+ impossible("nikita-1298", "Broken find");
44943+ return RETERR(-EIO);
44944+ }
44945+ } else {
44946+ coord->unit_pos = units(coord) - 1;
44947+ coord->between = AFTER_UNIT;
44948+ return (bias ==
44949+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
44950+ CBK_COORD_NOTFOUND;
44951+ }
44952+}
44953+
44954+/* ->paste() method for this item plugin. */
44955+int paste_cde(coord_t * coord /* coord of item */ ,
44956+ reiser4_item_data * data /* parameters of new unit being
44957+ * inserted */ ,
44958+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
44959+{
44960+ cde_entry_data *e;
44961+ int result;
44962+ int i;
44963+
44964+ CHECKME(coord);
44965+ e = (cde_entry_data *) data->data;
44966+
44967+ result = 0;
44968+ for (i = 0; i < e->num_of_entries; ++i) {
44969+ int pos;
44970+ int phantom_size;
44971+
44972+ phantom_size = data->length;
44973+ if (units(coord) == 0)
44974+ phantom_size -= sizeof(cde_item_format);
44975+
44976+ result =
44977+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
44978+ if (result != 0)
44979+ break;
44980+ result = paste_entry(coord, e->entry + i, pos, data->arg);
44981+ if (result != 0)
44982+ break;
44983+ }
44984+ CHECKME(coord);
44985+ return result;
44986+}
44987+
44988+/* amount of space occupied by all entries starting from @idx both headers and
44989+ bodies. */
44990+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
44991+ int idx /* index of unit */ )
44992+{
44993+ assert("nikita-1299", coord != NULL);
44994+ assert("nikita-1300", idx < (int)units(coord));
44995+
44996+ return sizeof(cde_item_format) +
44997+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
44998+ idx + 1) -
44999+ offset_of(coord, 0);
45000+}
45001+
45002+/* how many units of @source, but not more than @want, can be merged with
45003+ the item in @target node. If pend == append, we try to append the last
45004+ item of @target with the first units of @source. If pend == prepend, we
45005+ try to "prepend" the first item in @target with the last units of
45006+ @source. @target node has @free_space bytes of free space. The total
45007+ size of those units is returned via @size */
45008+int can_shift_cde(unsigned free_space /* free space in item */ ,
45009+ coord_t * coord /* coord of source item */ ,
45010+ znode * target /* target node */ ,
45011+ shift_direction pend /* shift direction */ ,
45012+ unsigned *size /* resulting number of shifted bytes */ ,
45013+ unsigned want /* maximal number of bytes to shift */ )
45014+{
45015+ int shift;
45016+
45017+ CHECKME(coord);
45018+ if (want == 0) {
45019+ *size = 0;
45020+ return 0;
45021+ }
45022+
45023+ /* pend == SHIFT_LEFT <==> shifting to the left */
45024+ if (pend == SHIFT_LEFT) {
45025+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
45026+ --shift) {
45027+ *size = part_size(coord, shift);
45028+ if (target != NULL)
45029+ *size -= sizeof(cde_item_format);
45030+ if (*size <= free_space)
45031+ break;
45032+ }
45033+ shift = shift + 1;
45034+ } else {
45035+ int total_size;
45036+
45037+ assert("nikita-1301", pend == SHIFT_RIGHT);
45038+
45039+ total_size = item_length_by_coord(coord);
45040+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
45041+ ++shift) {
45042+ *size = total_size - part_size(coord, shift);
45043+ if (target == NULL)
45044+ *size += sizeof(cde_item_format);
45045+ if (*size <= free_space)
45046+ break;
45047+ }
45048+ shift = units(coord) - shift - 1;
45049+ }
45050+ if (shift == 0)
45051+ *size = 0;
45052+ CHECKME(coord);
45053+ return shift;
45054+}
45055+
45056+/* ->copy_units() method for this item plugin. */
45057+void copy_units_cde(coord_t * target /* coord of target item */ ,
45058+ coord_t * source /* coord of source item */ ,
45059+ unsigned from /* starting unit */ ,
45060+ unsigned count /* how many units to copy */ ,
45061+ shift_direction where_is_free_space /* shift direction */ ,
45062+ unsigned free_space /* free space in item */ )
45063+{
45064+ char *header_from;
45065+ char *header_to;
45066+
45067+ char *entry_from;
45068+ char *entry_to;
45069+
45070+ int pos_in_target;
45071+ int data_size;
45072+ int data_delta;
45073+ int i;
45074+
45075+ assert("nikita-1303", target != NULL);
45076+ assert("nikita-1304", source != NULL);
45077+ assert("nikita-1305", (int)from < units(source));
45078+ assert("nikita-1307", (int)(from + count) <= units(source));
45079+
45080+ if (where_is_free_space == SHIFT_LEFT) {
45081+ assert("nikita-1453", from == 0);
45082+ pos_in_target = units(target);
45083+ } else {
45084+ assert("nikita-1309", (int)(from + count) == units(source));
45085+ pos_in_target = 0;
45086+ memmove(item_body_by_coord(target),
45087+ (char *)item_body_by_coord(target) + free_space,
45088+ item_length_by_coord(target) - free_space);
45089+ }
45090+
45091+ CHECKME(target);
45092+ CHECKME(source);
45093+
45094+ /* expand @target */
45095+ data_size =
45096+ offset_of(source, (int)(from + count)) - offset_of(source,
45097+ (int)from);
45098+
45099+ if (units(target) == 0)
45100+ free_space -= sizeof(cde_item_format);
45101+
45102+ expand_item(target, pos_in_target, (int)count,
45103+ (int)(item_length_by_coord(target) - free_space),
45104+ (unsigned)data_size);
45105+
45106+ /* copy first @count units of @source into @target */
45107+ data_delta =
45108+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
45109+
45110+ /* copy entries */
45111+ entry_from = (char *)entry_at(source, (int)from);
45112+ entry_to = (char *)entry_at(source, (int)(from + count));
45113+ memmove(entry_at(target, pos_in_target), entry_from,
45114+ (unsigned)(entry_to - entry_from));
45115+
45116+ /* copy headers */
45117+ header_from = (char *)header_at(source, (int)from);
45118+ header_to = (char *)header_at(source, (int)(from + count));
45119+ memmove(header_at(target, pos_in_target), header_from,
45120+ (unsigned)(header_to - header_from));
45121+
45122+ /* update offsets */
45123+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45124+ adj_offset(target, i, data_delta);
45125+ CHECKME(target);
45126+ CHECKME(source);
45127+}
45128+
45129+/* ->cut_units() method for this item plugin. */
45130+int cut_units_cde(coord_t * coord /* coord of item */ ,
45131+ pos_in_node_t from /* start unit pos */ ,
45132+ pos_in_node_t to /* stop unit pos */ ,
45133+ struct carry_cut_data *cdata UNUSED_ARG,
45134+ reiser4_key * smallest_removed, reiser4_key * new_first)
45135+{
45136+ char *header_from;
45137+ char *header_to;
45138+
45139+ char *entry_from;
45140+ char *entry_to;
45141+
45142+ int size;
45143+ int entry_delta;
45144+ int header_delta;
45145+ int i;
45146+
45147+ unsigned count;
45148+
45149+ CHECKME(coord);
45150+
45151+ count = to - from + 1;
45152+
45153+ assert("nikita-1454", coord != NULL);
45154+ assert("nikita-1455", (int)(from + count) <= units(coord));
45155+
45156+ if (smallest_removed)
45157+ unit_key_by_coord(coord, smallest_removed);
45158+
45159+ if (new_first) {
45160+ coord_t next;
45161+
45162+ /* not everything is cut from item head */
45163+ assert("vs-1527", from == 0);
45164+ assert("vs-1528", to < units(coord) - 1);
45165+
45166+ coord_dup(&next, coord);
45167+ next.unit_pos++;
45168+ unit_key_by_coord(&next, new_first);
45169+ }
45170+
45171+ size = item_length_by_coord(coord);
45172+ if (count == (unsigned)units(coord)) {
45173+ return size;
45174+ }
45175+
45176+ header_from = (char *)header_at(coord, (int)from);
45177+ header_to = (char *)header_at(coord, (int)(from + count));
45178+
45179+ entry_from = (char *)entry_at(coord, (int)from);
45180+ entry_to = (char *)entry_at(coord, (int)(from + count));
45181+
45182+ /* move headers */
45183+ memmove(header_from, header_to,
45184+ (unsigned)(address(coord, size) - header_to));
45185+
45186+ header_delta = header_to - header_from;
45187+
45188+ entry_from -= header_delta;
45189+ entry_to -= header_delta;
45190+ size -= header_delta;
45191+
45192+ /* copy entries */
45193+ memmove(entry_from, entry_to,
45194+ (unsigned)(address(coord, size) - entry_to));
45195+
45196+ entry_delta = entry_to - entry_from;
45197+ size -= entry_delta;
45198+
45199+ /* update offsets */
45200+
45201+ for (i = 0; i < (int)from; ++i)
45202+ adj_offset(coord, i, -header_delta);
45203+
45204+ for (i = from; i < units(coord) - (int)count; ++i)
45205+ adj_offset(coord, i, -header_delta - entry_delta);
45206+
45207+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45208+ &formatted_at(coord)->num_of_entries);
45209+
45210+ if (from == 0) {
45211+ /* entries from the head were removed - move the remainder right */
45212+ memmove((char *)item_body_by_coord(coord) +
45213+ header_delta + entry_delta, item_body_by_coord(coord),
45214+ (unsigned)size);
45215+ if (REISER4_DEBUG)
45216+ memset(item_body_by_coord(coord), 0,
45217+ (unsigned)header_delta + entry_delta);
45218+ } else {
45219+ /* freed space is already at the end of item */
45220+ if (REISER4_DEBUG)
45221+ memset((char *)item_body_by_coord(coord) + size, 0,
45222+ (unsigned)header_delta + entry_delta);
45223+ }
45224+
45225+ return header_delta + entry_delta;
45226+}
45227+
45228+int kill_units_cde(coord_t * coord /* coord of item */ ,
45229+ pos_in_node_t from /* start unit pos */ ,
45230+ pos_in_node_t to /* stop unit pos */ ,
45231+ struct carry_kill_data *kdata UNUSED_ARG,
45232+ reiser4_key * smallest_removed, reiser4_key * new_first)
45233+{
45234+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45235+}
45236+
45237+/* ->s.dir.extract_key() method for this item plugin. */
45238+int extract_key_cde(const coord_t * coord /* coord of item */ ,
45239+ reiser4_key * key /* resulting key */ )
45240+{
45241+ directory_entry_format *dent;
45242+
45243+ assert("nikita-1155", coord != NULL);
45244+ assert("nikita-1156", key != NULL);
45245+
45246+ dent = entry_at(coord, idx_of(coord));
45247+ return extract_key_from_id(&dent->id, key);
45248+}
45249+
45250+int
45251+update_key_cde(const coord_t * coord, const reiser4_key * key,
45252+ lock_handle * lh UNUSED_ARG)
45253+{
45254+ directory_entry_format *dent;
45255+ obj_key_id obj_id;
45256+ int result;
45257+
45258+ assert("nikita-2344", coord != NULL);
45259+ assert("nikita-2345", key != NULL);
45260+
45261+ dent = entry_at(coord, idx_of(coord));
45262+ result = build_obj_key_id(key, &obj_id);
45263+ if (result == 0) {
45264+ dent->id = obj_id;
45265+ znode_make_dirty(coord->node);
45266+ }
45267+ return 0;
45268+}
45269+
45270+/* ->s.dir.extract_name() method for this item plugin. */
45271+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45272+{
45273+ directory_entry_format *dent;
45274+
45275+ assert("nikita-1157", coord != NULL);
45276+
45277+ dent = entry_at(coord, idx_of(coord));
45278+ return extract_dent_name(coord, dent, buf);
45279+}
45280+
45281+static int cde_bytes(int pasting, const reiser4_item_data * data)
45282+{
45283+ int result;
45284+
45285+ result = data->length;
45286+ if (!pasting)
45287+ result -= sizeof(cde_item_format);
45288+ return result;
45289+}
45290+
45291+/* ->s.dir.add_entry() method for this item plugin */
45292+int add_entry_cde(struct inode *dir /* directory object */ ,
45293+ coord_t * coord /* coord of item */ ,
45294+ lock_handle * lh /* lock handle for insertion */ ,
45295+ const struct dentry *name /* name to insert */ ,
45296+ reiser4_dir_entry_desc * dir_entry /* parameters of new
45297+ * directory entry */ )
45298+{
45299+ reiser4_item_data data;
45300+ cde_entry entry;
45301+ cde_entry_data edata;
45302+ int result;
45303+
45304+ assert("nikita-1656", coord->node == lh->node);
45305+ assert("nikita-1657", znode_is_write_locked(coord->node));
45306+
45307+ edata.num_of_entries = 1;
45308+ edata.entry = &entry;
45309+
45310+ entry.dir = dir;
45311+ entry.obj = dir_entry->obj;
45312+ entry.name = &name->d_name;
45313+
45314+ data.data = (char *)&edata;
45315+ data.user = 0; /* &edata is not user space */
45316+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45317+ data.arg = dir_entry;
45318+ assert("nikita-1302", data.iplug != NULL);
45319+
45320+ result = is_dot_key(&dir_entry->key);
45321+ data.length = estimate_cde(result ? coord : NULL, &data);
45322+
45323+ /* NOTE-NIKITA quota plugin? */
45324+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45325+ return RETERR(-EDQUOT);
45326+
45327+ if (result)
45328+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45329+ else
45330+ result = reiser4_resize_item(coord, &data, &dir_entry->key,
45331+ lh, 0);
45332+ return result;
45333+}
45334+
45335+/* ->s.dir.rem_entry() */
45336+int rem_entry_cde(struct inode *dir /* directory of item */ ,
45337+ const struct qstr *name, coord_t * coord /* coord of item */ ,
45338+ lock_handle * lh UNUSED_ARG /* lock handle for
45339+ * removal */ ,
45340+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45341+ * directory entry
45342+ * being removed */ )
45343+{
45344+ coord_t shadow;
45345+ int result;
45346+ int length;
45347+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45348+
45349+ assert("nikita-2870", strlen(name->name) == name->len);
45350+ assert("nikita-2869",
45351+ !strcmp(name->name, extract_name_cde(coord, buf)));
45352+
45353+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45354+ if (is_longname(name->name, name->len))
45355+ length += name->len + 1;
45356+
45357+ if (inode_get_bytes(dir) < length) {
45358+ warning("nikita-2628", "Dir is broke: %llu: %llu",
45359+ (unsigned long long)get_inode_oid(dir),
45360+ inode_get_bytes(dir));
45361+
45362+ return RETERR(-EIO);
45363+ }
45364+
45365+ /* cut_node() is supposed to take pointers to _different_
45366+ coords, because it will modify them without respect to
45367+ possible aliasing. To work around this, create temporary copy
45368+ of @coord.
45369+ */
45370+ coord_dup(&shadow, coord);
45371+ result =
45372+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45373+ if (result == 0) {
45374+ /* NOTE-NIKITA quota plugin? */
45375+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
45376+ }
45377+ return result;
45378+}
45379+
45380+/* ->s.dir.max_name_len() method for this item plugin */
45381+int max_name_len_cde(const struct inode *dir /* directory */ )
45382+{
45383+ return
45384+ reiser4_tree_by_inode(dir)->nplug->max_item_size() -
45385+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
45386+ sizeof(cde_unit_header) - 2;
45387+}
45388+
45389+/* Make Linus happy.
45390+ Local variables:
45391+ c-indentation-style: "K&R"
45392+ mode-name: "LC"
45393+ c-basic-offset: 8
45394+ tab-width: 8
45395+ fill-column: 120
45396+ End:
45397+*/
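
To make the offset arithmetic in offset_of()/entry_at() above concrete,
here is a hedged userspace model of the compound item layout: a counted
header table sits at the front of the item, and each header records where
its entry body starts relative to the item base (the real headers are
stored little-endian and unaligned, and also carry the entry hash):

#include <stdint.h>

struct unit_header { uint16_t offset; };  /* hash omitted */

struct item_head {
	uint16_t nr;                /* num_of_entries */
	struct unit_header entry[]; /* followed by the packed bodies */
};

/* mirrors entry_at(): item base plus the recorded per-unit offset */
static void *body_at(struct item_head *item, int idx)
{
	return (char *)item + item->entry[idx].offset;
}

This is why expand_item() patches the stored offsets in two loops: growing
the header table shifts every body by sizeof(unit_header) per new unit,
while bodies past the insertion point additionally move by the size of the
newly inserted data.
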
45398diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/cde.h linux-2.6.20/fs/reiser4/plugin/item/cde.h
45399--- linux-2.6.20.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
45400+++ linux-2.6.20/fs/reiser4/plugin/item/cde.h 2007-05-06 14:50:43.803008220 +0400
45401@@ -0,0 +1,87 @@
45402+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45403+
45404+/* Compound directory item. See cde.c for description. */
45405+
45406+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45407+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45408+
45409+#include "../../forward.h"
45410+#include "../../kassign.h"
45411+#include "../../dformat.h"
45412+
45413+#include <linux/fs.h> /* for struct inode */
45414+#include <linux/dcache.h> /* for struct dentry, etc */
45415+
45416+typedef struct cde_unit_header {
45417+ de_id hash;
45418+ d16 offset;
45419+} cde_unit_header;
45420+
45421+typedef struct cde_item_format {
45422+ d16 num_of_entries;
45423+ cde_unit_header entry[0];
45424+} cde_item_format;
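+
+/* The resulting on-disk layout of a cde item is, roughly:
+
+     num_of_entries | header 0 | ... | header N-1 | entry 0 | ... | entry N-1
+
+   where each cde_unit_header keeps the hash part of an entry key and the
+   offset of the corresponding entry body within the item. */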
45425+
45426+typedef struct cde_entry {
45427+ const struct inode *dir;
45428+ const struct inode *obj;
45429+ const struct qstr *name;
45430+} cde_entry;
45431+
45432+typedef struct cde_entry_data {
45433+ int num_of_entries;
45434+ cde_entry *entry;
45435+} cde_entry_data;
45436+
45437+/* plugin->item.b.* */
45438+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45439+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45440+ const reiser4_item_data *);
45441+int mergeable_cde(const coord_t * p1, const coord_t * p2);
45442+pos_in_node_t nr_units_cde(const coord_t * coord);
45443+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45444+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45445+void print_cde(const char *prefix, coord_t * coord);
45446+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45447+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45448+ coord_t * coord);
45449+int paste_cde(coord_t * coord, reiser4_item_data * data,
45450+ carry_plugin_info * info UNUSED_ARG);
45451+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45452+ shift_direction pend, unsigned *size, unsigned want);
45453+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45454+ unsigned count, shift_direction where_is_free_space,
45455+ unsigned free_space);
45456+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45457+ struct carry_cut_data *, reiser4_key * smallest_removed,
45458+ reiser4_key * new_first);
45459+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45460+ struct carry_kill_data *, reiser4_key * smallest_removed,
45461+ reiser4_key * new_first);
45463+int reiser4_check_cde(const coord_t * coord, const char **error);
45464+
45465+/* plugin->u.item.s.dir.* */
45466+int extract_key_cde(const coord_t * coord, reiser4_key * key);
45467+int update_key_cde(const coord_t * coord, const reiser4_key * key,
45468+ lock_handle * lh);
45469+char *extract_name_cde(const coord_t * coord, char *buf);
45470+int add_entry_cde(struct inode *dir, coord_t * coord,
45471+ lock_handle * lh, const struct dentry *name,
45472+ reiser4_dir_entry_desc * entry);
45473+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45474+ lock_handle * lh, reiser4_dir_entry_desc * entry);
45475+int max_name_len_cde(const struct inode *dir);
45476+
45477+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45478+#endif
45479+
45480+/* Make Linus happy.
45481+ Local variables:
45482+ c-indentation-style: "K&R"
45483+ mode-name: "LC"
45484+ c-basic-offset: 8
45485+ tab-width: 8
45486+ fill-column: 120
45487+ End:
45488+*/
45489diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.20/fs/reiser4/plugin/item/ctail.c
45490--- linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
45491+++ linux-2.6.20/fs/reiser4/plugin/item/ctail.c 2007-05-06 14:50:43.803008220 +0400
45492@@ -0,0 +1,1570 @@
45493+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45494+
45495+/* ctails (aka "clustered tails") are items for cryptcompress objects */
45496+
45497+/* DESCRIPTION:
45498+
45499+Each cryptcompress object is stored on disk as a set of clusters sliced
45500+into ctails.
45501+
45502+Internal on-disk structure:
45503+
45504+ HEADER (1 byte) stores the disk cluster shift
45505+ BODY
45506+*/
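+
+/* Example: with a cluster shift of 16 a disk cluster describes 64K of
+   plain text; the transformed (e.g. compressed) stream of that cluster
+   is sliced into ctail items, each carrying the one-byte shift header
+   followed by a chunk of the stream. */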
45507+
45508+#include "../../forward.h"
45509+#include "../../debug.h"
45510+#include "../../dformat.h"
45511+#include "../../kassign.h"
45512+#include "../../key.h"
45513+#include "../../coord.h"
45514+#include "item.h"
45515+#include "../node/node.h"
45516+#include "../plugin.h"
45517+#include "../object.h"
45518+#include "../../znode.h"
45519+#include "../../carry.h"
45520+#include "../../tree.h"
45521+#include "../../inode.h"
45522+#include "../../super.h"
45523+#include "../../context.h"
45524+#include "../../page_cache.h"
45525+#include "../cluster.h"
45526+#include "../../flush.h"
45527+#include "../../tree_walk.h"
45528+
45529+#include <linux/pagevec.h>
45530+#include <linux/swap.h>
45531+#include <linux/fs.h>
45532+
45533+/* return body of ctail item at @coord */
45534+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45535+{
45536+ assert("edward-60", coord != NULL);
45537+ return item_body_by_coord(coord);
45538+}
45539+
45540+static int cluster_shift_by_coord(const coord_t * coord)
45541+{
45542+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45543+}
45544+
45545+static inline void dclust_set_extension_shift(hint_t * hint)
45546+{
45547+ assert("edward-1270",
45548+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
45549+ hint->ext_coord.extension.ctail.shift =
45550+ cluster_shift_by_coord(&hint->ext_coord.coord);
45551+}
45552+
45553+static loff_t off_by_coord(const coord_t * coord)
45554+{
45555+ reiser4_key key;
45556+ return get_key_offset(item_key_by_coord(coord, &key));
45557+}
45558+
45559+int coord_is_unprepped_ctail(const coord_t * coord)
45560+{
45561+ assert("edward-1233", coord != NULL);
45562+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45563+ assert("edward-1235",
45564+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45565+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45566+
45567+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45568+}
45569+
45570+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45571+{
45572+ int shift;
45573+
45574+ if (inode != NULL) {
45575+ shift = inode_cluster_shift(inode);
45576+ assert("edward-1236",
45577+ ergo(!coord_is_unprepped_ctail(coord),
45578+ shift == cluster_shift_by_coord(coord)));
45579+ } else {
45580+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
45581+ shift = cluster_shift_by_coord(coord);
45582+ }
45583+ return off_by_coord(coord) >> shift;
45584+}
45585+
45586+static int disk_cluster_size(const coord_t * coord)
45587+{
45588+ assert("edward-1156",
45589+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45590+ /* calculation of disk cluster size
45591+	   is meaningless if ctail is unprepped */
45592+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
45593+
45594+ return 1 << cluster_shift_by_coord(coord);
45595+}
45596+
45597+/* true if the key is of first disk cluster item */
45598+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45599+{
45600+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45601+
45602+ return coord_is_unprepped_ctail(coord) ||
45603+ ((get_key_offset(key) &
45604+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45605+}
45606+
45607+static char *first_unit(coord_t * coord)
45608+{
45609+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
45610+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45611+}
45612+
45613+/* plugin->u.item.b.max_key_inside :
45614+ tail_max_key_inside */
45615+
45616+/* plugin->u.item.b.can_contain_key */
45617+int
45618+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45619+ const reiser4_item_data * data)
45620+{
45621+ reiser4_key item_key;
45622+
45623+ if (item_plugin_by_coord(coord) != data->iplug)
45624+ return 0;
45625+
45626+ item_key_by_coord(coord, &item_key);
45627+ if (get_key_locality(key) != get_key_locality(&item_key) ||
45628+ get_key_objectid(key) != get_key_objectid(&item_key))
45629+ return 0;
45630+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45631+ get_key_offset(key))
45632+ return 0;
45633+ if (is_disk_cluster_key(key, coord))
45634+ return 0;
45635+ return 1;
45636+}
45637+
45638+/* plugin->u.item.b.mergeable
45639+ c-tails of different clusters are not mergeable */
45640+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45641+{
45642+ reiser4_key key1, key2;
45643+
45644+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45645+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
45646+ UNIX_FILE_METADATA_ITEM_TYPE));
45647+
45648+ if (item_id_by_coord(p2) != CTAIL_ID) {
45649+ /* second item is of another type */
45650+ return 0;
45651+ }
45652+
45653+ item_key_by_coord(p1, &key1);
45654+ item_key_by_coord(p2, &key2);
45655+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
45656+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
45657+ get_key_type(&key1) != get_key_type(&key2)) {
45658+ /* items of different objects */
45659+ return 0;
45660+ }
45661+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45662+ /* not adjacent items */
45663+ return 0;
45664+ if (is_disk_cluster_key(&key2, p2))
45665+ return 0;
45666+ return 1;
45667+}
45668+
45669+/* plugin->u.item.b.nr_units */
45670+pos_in_node_t nr_units_ctail(const coord_t * coord)
45671+{
45672+ return (item_length_by_coord(coord) -
45673+ sizeof(ctail_formatted_at(coord)->cluster_shift));
45674+}
45675+
45676+/* plugin->u.item.b.estimate:
45677+ estimate how much space is needed to insert/paste @data->length bytes
45678+ into ctail at @coord */
45679+int estimate_ctail(const coord_t * coord /* coord of item */ ,
45680+ const reiser4_item_data *
45681+ data /* parameters for new item */ )
45682+{
45683+ if (coord == NULL)
45684+ /* insert */
45685+ return (sizeof(ctail_item_format) + data->length);
45686+ else
45687+ /* paste */
45688+ return data->length;
45689+}
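+
+/* e.g. inserting a new ctail for 100 bytes of data is estimated at
+   101 bytes, as sizeof(ctail_item_format) is the single cluster_shift
+   byte */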
45690+
45691+/* ->init() method for this item plugin. */
45692+int init_ctail(coord_t * to /* coord of item */ ,
45693+ coord_t * from /* old_item */ ,
45694+ reiser4_item_data * data /* structure used for insertion */ )
45695+{
45696+ int cluster_shift; /* cpu value to convert */
45697+
45698+ if (data) {
45699+ assert("edward-463", data->length > sizeof(ctail_item_format));
45700+ cluster_shift = *((int *)(data->arg));
45701+ data->length -= sizeof(ctail_item_format);
45702+ } else {
45703+ assert("edward-464", from != NULL);
45704+ assert("edward-855", ctail_ok(from));
45705+ cluster_shift = (int)(cluster_shift_by_coord(from));
45706+ }
45707+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45708+ assert("edward-856", ctail_ok(to));
45709+ return 0;
45710+}
45711+
45712+/* plugin->u.item.b.lookup:
45713+ NULL: We are looking for item keys only */
45714+
45715+#if REISER4_DEBUG
45716+int ctail_ok(const coord_t * coord)
45717+{
45718+ return coord_is_unprepped_ctail(coord) ||
45719+ cluster_shift_ok(cluster_shift_by_coord(coord));
45720+}
45721+
45722+/* plugin->u.item.b.check */
45723+int check_ctail(const coord_t * coord, const char **error)
45724+{
45725+ if (!ctail_ok(coord)) {
45726+ if (error)
45727+ *error = "bad cluster shift in ctail";
45728+ return 1;
45729+ }
45730+ return 0;
45731+}
45732+#endif
45733+
45734+/* plugin->u.item.b.paste */
45735+int
45736+paste_ctail(coord_t * coord, reiser4_item_data * data,
45737+ carry_plugin_info * info UNUSED_ARG)
45738+{
45739+ unsigned old_nr_units;
45740+
45741+ assert("edward-268", data->data != NULL);
45742+ /* copy only from kernel space */
45743+ assert("edward-66", data->user == 0);
45744+
45745+ old_nr_units =
45746+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
45747+ data->length;
45748+
45749+ /* ctail items never get pasted in the middle */
45750+
45751+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45752+
45753+ /* paste at the beginning when create new item */
45754+ assert("edward-450",
45755+ item_length_by_coord(coord) ==
45756+ data->length + sizeof(ctail_item_format));
45757+ assert("edward-451", old_nr_units == 0);
45758+ } else if (coord->unit_pos == old_nr_units - 1
45759+ && coord->between == AFTER_UNIT) {
45760+
45761+ /* paste at the end */
45762+ coord->unit_pos++;
45763+ } else
45764+ impossible("edward-453", "bad paste position");
45765+
45766+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45767+
45768+ assert("edward-857", ctail_ok(coord));
45769+
45770+ return 0;
45771+}
45772+
45773+/* plugin->u.item.b.fast_paste */
45774+
45775+/* plugin->u.item.b.can_shift
45776+ number of units is returned via return value, number of bytes via @size. For
45777+ ctail items they coincide */
45778+int
45779+can_shift_ctail(unsigned free_space, coord_t * source,
45780+ znode * target, shift_direction direction UNUSED_ARG,
45781+ unsigned *size /* number of bytes */ , unsigned want)
45782+{
45783+	/* make sure that we do not want to shift more than we have */
45784+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45785+
45786+ *size = min(want, free_space);
45787+
45788+ if (!target) {
45789+ /* new item will be created */
45790+ if (*size <= sizeof(ctail_item_format)) {
45791+ *size = 0;
45792+ return 0;
45793+ }
45794+ return *size - sizeof(ctail_item_format);
45795+ }
45796+ return *size;
45797+}
45798+
45799+/* plugin->u.item.b.copy_units
45800+ cooperates with ->can_shift() */
45801+void
45802+copy_units_ctail(coord_t * target, coord_t * source,
45803+ unsigned from, unsigned count /* units */ ,
45804+ shift_direction where_is_free_space,
45805+ unsigned free_space /* bytes */ )
45806+{
45807+ /* make sure that item @target is expanded already */
45808+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45809+ assert("edward-70", free_space == count || free_space == count + 1);
45810+
45811+ assert("edward-858", ctail_ok(source));
45812+
45813+ if (where_is_free_space == SHIFT_LEFT) {
45814+		/* append the first @count bytes of @source to item @target:
45815+ this restriction came from ordinary tails */
45816+ assert("edward-71", from == 0);
45817+ assert("edward-860", ctail_ok(target));
45818+
45819+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
45820+ first_unit(source), count);
45821+ } else {
45822+ /* target item is moved to right already */
45823+ reiser4_key key;
45824+
45825+ assert("edward-72", nr_units_ctail(source) == from + count);
45826+
45827+ if (free_space == count) {
45828+ init_ctail(target, source, NULL);
45829+ } else {
45830+ /* new item has been created */
45831+ assert("edward-862", ctail_ok(target));
45832+ }
45833+ memcpy(first_unit(target), first_unit(source) + from, count);
45834+
45835+ assert("edward-863", ctail_ok(target));
45836+
45837+		/* new units are inserted before the first unit of the item;
45838+		   therefore we have to update the item key */
45839+ item_key_by_coord(source, &key);
45840+ set_key_offset(&key, get_key_offset(&key) + from);
45841+
45842+ node_plugin_by_node(target->node)->update_item_key(target, &key,
45843+ NULL /*info */);
45844+ }
45845+}
45846+
45847+/* plugin->u.item.b.create_hook */
45848+int create_hook_ctail(const coord_t * coord, void *arg)
45849+{
45850+ assert("edward-864", znode_is_loaded(coord->node));
45851+
45852+ znode_set_convertible(coord->node);
45853+ return 0;
45854+}
45855+
45856+/* plugin->u.item.b.kill_hook */
45857+int
45858+kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
45859+ carry_kill_data * kdata)
45860+{
45861+ struct inode *inode;
45862+
45863+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
45864+ assert("edward-291", znode_is_write_locked(coord->node));
45865+
45866+ inode = kdata->inode;
45867+ if (inode) {
45868+ reiser4_key key;
45869+ item_key_by_coord(coord, &key);
45870+
45871+ if (from == 0 && is_disk_cluster_key(&key, coord)) {
45872+ /* disk cluster is killed */
45873+ cloff_t start =
45874+ off_to_clust(get_key_offset(&key), inode);
45875+ truncate_page_cluster_cryptcompress(inode, start,
45876+ kdata->params.truncate);
45877+ inode_sub_bytes(inode, inode_cluster_size(inode));
45878+ }
45879+ }
45880+ return 0;
45881+}
45882+
45883+/* for shift_hook_ctail(),
45884+   return true if the first disk cluster item has a dirty child
45885+*/
45886+static int ctail_convertible(const coord_t * coord)
45887+{
45888+ int result;
45889+ reiser4_key key;
45890+ jnode *child = NULL;
45891+
45892+ assert("edward-477", coord != NULL);
45893+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
45894+
45895+ if (coord_is_unprepped_ctail(coord))
45896+ /* unprepped ctail should be converted */
45897+ return 1;
45898+
45899+ item_key_by_coord(coord, &key);
45900+ child = jlookup(current_tree,
45901+ get_key_objectid(&key),
45902+ off_to_pg(off_by_coord(coord)));
45903+ if (!child)
45904+ return 0;
45905+ result = JF_ISSET(child, JNODE_DIRTY);
45906+ jput(child);
45907+ return result;
45908+}
45909+
45910+/* FIXME-EDWARD */
45911+/* plugin->u.item.b.shift_hook */
45912+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
45913+ unsigned from UNUSED_ARG /* start unit */ ,
45914+ unsigned count UNUSED_ARG /* stop unit */ ,
45915+ znode * old_node /* old parent */ )
45916+{
45917+ assert("edward-479", item != NULL);
45918+ assert("edward-480", item->node != old_node);
45919+
45920+ if (!znode_convertible(old_node) || znode_convertible(item->node))
45921+ return 0;
45922+ if (ctail_convertible(item))
45923+ znode_set_convertible(item->node);
45924+ return 0;
45925+}
45926+
45927+static int
45928+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45929+ int cut, void *p, reiser4_key * smallest_removed,
45930+ reiser4_key * new_first)
45931+{
45932+ pos_in_node_t count; /* number of units to cut */
45933+ char *item;
45934+
45935+ count = to - from + 1;
45936+ item = item_body_by_coord(coord);
45937+
45938+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
45939+
45940+ if (smallest_removed) {
45941+ /* store smallest key removed */
45942+ item_key_by_coord(coord, smallest_removed);
45943+ set_key_offset(smallest_removed,
45944+ get_key_offset(smallest_removed) + from);
45945+ }
45946+
45947+ if (new_first) {
45948+ assert("vs-1531", from == 0);
45949+
45950+ item_key_by_coord(coord, new_first);
45951+ set_key_offset(new_first,
45952+ get_key_offset(new_first) + from + count);
45953+ }
45954+
45955+ if (!cut)
45956+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
45957+
45958+ if (from == 0) {
45959+ if (count != nr_units_ctail(coord)) {
45960+			/* part of the item is removed: move the item header past
45961+			   the removed units and update the item key */
45962+ reiser4_key key;
45963+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
45964+ item_key_by_coord(coord, &key);
45965+ set_key_offset(&key, get_key_offset(&key) + count);
45966+ node_plugin_by_node(coord->node)->update_item_key(coord,
45967+ &key,
45968+ NULL);
45969+ } else {
45970+			/* cut_units should not be called to cut everything */
45971+ assert("vs-1532", ergo(cut, 0));
45972+			/* the whole item is cut, so more than the amount of space
45973+			   occupied by the units is freed */
45974+ count += sizeof(ctail_item_format);
45975+ }
45976+ if (REISER4_DEBUG)
45977+ memset(item, 0, count);
45978+ } else if (REISER4_DEBUG)
45979+ memset(item + sizeof(ctail_item_format) + from, 0, count);
45980+ return count;
45981+}
45982+
45983+/* plugin->u.item.b.cut_units */
45984+int
45985+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45986+ carry_cut_data * cdata, reiser4_key * smallest_removed,
45987+ reiser4_key * new_first)
45988+{
45989+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
45990+ smallest_removed, new_first);
45991+}
45992+
45993+/* plugin->u.item.b.kill_units */
45994+int
45995+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45996+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
45997+ reiser4_key * new_first)
45998+{
45999+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
46000+ smallest_removed, new_first);
46001+}
46002+
46003+/* plugin->u.item.s.file.read */
46004+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
46005+{
46006+ uf_coord_t *uf_coord;
46007+ coord_t *coord;
46008+
46009+ uf_coord = &hint->ext_coord;
46010+ coord = &uf_coord->coord;
46011+ assert("edward-127", f->user == 0);
46012+ assert("edward-129", coord && coord->node);
46013+ assert("edward-130", coord_is_existing_unit(coord));
46014+ assert("edward-132", znode_is_loaded(coord->node));
46015+
46016+ /* start read only from the beginning of ctail */
46017+ assert("edward-133", coord->unit_pos == 0);
46018+ /* read only whole ctails */
46019+ assert("edward-135", nr_units_ctail(coord) <= f->length);
46020+
46021+ assert("edward-136", reiser4_schedulable());
46022+ assert("edward-886", ctail_ok(coord));
46023+
46024+ if (f->data)
46025+ memcpy(f->data, (char *)first_unit(coord),
46026+ (size_t) nr_units_ctail(coord));
46027+
46028+ dclust_set_extension_shift(hint);
46029+ mark_page_accessed(znode_page(coord->node));
46030+ move_flow_forward(f, nr_units_ctail(coord));
46031+
46032+ return 0;
46033+}
46034+
46035+/* Read a disk cluster consisting of ctail items and
46036+   attach a transform stream with the plain text */
46037+int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
46038+ znode_lock_mode mode)
46039+{
46040+ int result;
46041+	assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
46042+ assert("edward-671", clust->hint != NULL);
46043+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
46044+ assert("edward-672", cryptcompress_inode_ok(inode));
46045+
46046+ /* set input stream */
46047+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
46048+ if (result)
46049+ return result;
46050+
46051+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
46052+ assert("edward-1340", !result);
46053+ if (result)
46054+ return result;
46055+ if (mode == ZNODE_READ_LOCK)
46056+		/* write still needs the lock to insert unprepped
46057+ items, etc... */
46058+ put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
46059+
46060+ if (clust->dstat == FAKE_DISK_CLUSTER ||
46061+ clust->dstat == UNPR_DISK_CLUSTER) {
46062+ tfm_cluster_set_uptodate(&clust->tc);
46063+ return 0;
46064+ }
46065+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
46066+ if (result)
46067+ return result;
46068+ result = reiser4_inflate_cluster(clust, inode);
46069+ if (result)
46070+ return result;
46071+ tfm_cluster_set_uptodate(&clust->tc);
46072+ return 0;
46073+}
46074+
46075+/* read one locked page */
46076+int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
46077+ struct page *page, znode_lock_mode mode)
46078+{
46079+ int ret;
46080+ unsigned cloff;
46081+ char *data;
46082+ size_t pgcnt;
46083+ tfm_cluster_t *tc = &clust->tc;
46084+
46085+ assert("edward-212", PageLocked(page));
46086+
46087+ if (PageUptodate(page))
46088+ goto exit;
46089+
46090+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
46091+ clust->index = pg_to_clust(page->index, inode);
46092+ unlock_page(page);
46093+ ret = ctail_read_disk_cluster(clust, inode, mode);
46094+ lock_page(page);
46095+ if (ret)
46096+ return ret;
46097+ }
46098+ if (PageUptodate(page))
46099+ /* races with another read/write */
46100+ goto exit;
46101+
46102+ /* bytes in the page */
46103+ pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
46104+
46105+ if (pgcnt == 0) {
46106+ assert("edward-1290", 0);
46107+ return RETERR(-EINVAL);
46108+ }
46109+ assert("edward-119", tfm_cluster_is_uptodate(tc));
46110+
46111+ switch (clust->dstat) {
46112+ case UNPR_DISK_CLUSTER:
46113+ assert("edward-1285", 0);
46114+#if REISER4_DEBUG
46115+ warning("edward-1168",
46116+ "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46117+ page->index, clust->index,
46118+ (unsigned long long)get_inode_oid(inode));
46119+#endif
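+		/* fall through: an unprepped disk cluster is filled with
+		   zeroes, like a fake one */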
46120+ case FAKE_DISK_CLUSTER:
46121+ /* fill the page by zeroes */
46122+ data = kmap_atomic(page, KM_USER0);
46123+
46124+ memset(data, 0, PAGE_CACHE_SIZE);
46125+ flush_dcache_page(page);
46126+ kunmap_atomic(data, KM_USER0);
46127+ SetPageUptodate(page);
46128+ break;
46129+ case PREP_DISK_CLUSTER:
46130+ /* fill the page by transformed data */
46131+ assert("edward-1058", !PageUptodate(page));
46132+ assert("edward-120", tc->len <= inode_cluster_size(inode));
46133+
46134+ /* start page offset in the cluster */
46135+ cloff = pg_to_off_to_cloff(page->index, inode);
46136+
46137+ data = kmap(page);
46138+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46139+ memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);
46140+ flush_dcache_page(page);
46141+ kunmap(page);
46142+ SetPageUptodate(page);
46143+ break;
46144+ default:
46145+ impossible("edward-1169", "bad disk cluster state");
46146+ }
46147+ exit:
46148+ return 0;
46149+}
46150+
46151+/* plugin->u.item.s.file.readpage */
46152+int readpage_ctail(void *vp, struct page *page)
46153+{
46154+ int result;
46155+ hint_t *hint;
46156+ reiser4_cluster_t *clust = vp;
46157+
46158+ assert("edward-114", clust != NULL);
46159+ assert("edward-115", PageLocked(page));
46160+ assert("edward-116", !PageUptodate(page));
46161+ assert("edward-117", !jprivate(page) && !PagePrivate(page));
46162+ assert("edward-118", page->mapping && page->mapping->host);
46163+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46164+
46165+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46166+ if (hint == NULL) {
46167+ unlock_page(page);
46168+ return RETERR(-ENOMEM);
46169+ }
46170+ clust->hint = hint;
46171+ result = load_file_hint(clust->file, hint);
46172+ if (result) {
46173+ kfree(hint);
46174+ unlock_page(page);
46175+ return result;
46176+ }
46177+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
46178+ result = do_readpage_ctail(page->mapping->host, clust, page,
46179+ ZNODE_READ_LOCK);
46180+
46181+ assert("edward-213", PageLocked(page));
46182+ assert("edward-1163", ergo(!result, PageUptodate(page)));
46183+ assert("edward-868",
46184+ ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46185+
46186+ unlock_page(page);
46187+ done_lh(&hint->lh);
46188+ hint->ext_coord.valid = 0;
46189+ save_file_hint(clust->file, hint);
46190+ kfree(hint);
46191+ tfm_cluster_clr_uptodate(&clust->tc);
46192+
46193+ return result;
46194+}
46195+
46196+/* Helper function for ->readpages() */
46197+static int
46198+ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46199+{
46200+ int i;
46201+ int result;
46202+ assert("edward-779", clust != NULL);
46203+ assert("edward-1059", clust->win == NULL);
46204+ assert("edward-780", inode != NULL);
46205+
46206+ result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46207+ if (result)
46208+ return result;
46209+ result = ctail_read_disk_cluster(clust, inode, ZNODE_READ_LOCK);
46210+ if (result)
46211+ goto out;
46212+ /* at this point stream with valid plain text is attached */
46213+ assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46214+
46215+ for (i = 0; i < clust->nr_pages; i++) {
46216+ struct page *page = clust->pages[i];
46217+ lock_page(page);
46218+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46219+ unlock_page(page);
46220+ if (result)
46221+ break;
46222+ }
46223+ tfm_cluster_clr_uptodate(&clust->tc);
46224+ out:
46225+ reiser4_release_cluster_pages(clust);
46226+ return result;
46227+}
46228+
46229+/* filler for read_cache_pages() */
46230+static int ctail_readpages_filler(void * data, struct page * page)
46231+{
46232+ int ret = 0;
46233+ reiser4_cluster_t * clust = data;
46234+ struct inode * inode = clust->file->f_dentry->d_inode;
46235+
46236+ if (PageUptodate(page)) {
46237+ unlock_page(page);
46238+ return 0;
46239+ }
46240+ unlock_page(page);
46241+ move_cluster_forward(clust, inode, page->index);
46242+ ret = ctail_read_page_cluster(clust, inode);
46243+ if (ret)
46244+ return ret;
46245+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
46246+
46247+ lock_page(page);
46248+ ret = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46249+ assert("edward-1061", ergo(!ret, PageUptodate(page)));
46250+ unlock_page(page);
46251+
46252+ return ret;
46253+}
46254+
46255+/* We populate a bit more than upper readahead suggests:
46256+ with each nominated page we read the whole page cluster
46257+ this page belongs to. */
46258+int readpages_ctail(struct file *file, struct address_space *mapping,
46259+ struct list_head *pages)
46260+{
46261+ int ret = 0;
46262+ hint_t *hint;
46263+ reiser4_cluster_t clust;
46264+ struct inode *inode = mapping->host;
46265+
46266+ assert("edward-1521", inode == file->f_dentry->d_inode);
46267+
46268+ cluster_init_read(&clust, NULL);
46269+ clust.file = file;
46270+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46271+ if (hint == NULL) {
46272+ warning("vs-28", "failed to allocate hint");
46273+ ret = RETERR(-ENOMEM);
46274+ goto exit1;
46275+ }
46276+ clust.hint = hint;
46277+ ret = load_file_hint(clust.file, hint);
46278+ if (ret) {
46279+ warning("edward-1522", "failed to load hint");
46280+ goto exit2;
46281+ }
46282+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
46283+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46284+ if (ret) {
46285+ warning("edward-1523", "failed to alloc pgset");
46286+ goto exit3;
46287+ }
46288+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
46289+
46290+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46291+ exit3:
46292+ done_lh(&hint->lh);
46293+ save_file_hint(file, hint);
46294+ hint->ext_coord.valid = 0;
46295+ exit2:
46296+ kfree(hint);
46297+ exit1:
46298+ put_cluster_handle(&clust);
46299+ return ret;
46300+}
46301+
46302+/*
46303+ plugin->u.item.s.file.append_key
46304+ key of the first item of the next disk cluster
46305+*/
46306+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46307+{
46308+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46309+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46310+
46311+ item_key_by_coord(coord, key);
46312+ set_key_offset(key,
46313+ ((__u64) (clust_by_coord(coord, NULL)) +
46314+ 1) << cluster_shift_by_coord(coord));
46315+ return key;
46316+}
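+
+/* e.g. for an item of disk cluster 3 with cluster shift 16 the append
+   key gets offset (3 + 1) << 16, i.e. the first byte past that 64K
+   cluster */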
46317+
46318+static int
46319+insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46320+{
46321+ int result;
46322+ char buf[UCTAIL_NR_UNITS];
46323+ reiser4_item_data data;
46324+ reiser4_key key;
46325+ int shift = (int)UCTAIL_SHIFT;
46326+
46327+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46328+ result = key_by_inode_cryptcompress(inode,
46329+ clust_to_off(clust->index, inode),
46330+ &key);
46331+ if (result)
46332+ return result;
46333+ data.user = 0;
46334+ data.iplug = item_plugin_by_id(CTAIL_ID);
46335+ data.arg = &shift;
46336+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46337+ data.data = buf;
46338+
46339+ result = insert_by_coord(&clust->hint->ext_coord.coord,
46340+ &data, &key, clust->hint->ext_coord.lh, 0);
46341+ return result;
46342+}
46343+
46344+static int
46345+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46346+ struct inode *inode)
46347+{
46348+ int result;
46349+ carry_pool *pool;
46350+ carry_level *lowest_level;
46351+ reiser4_item_data *data;
46352+ carry_op *op;
46353+ int cluster_shift = inode_cluster_shift(inode);
46354+
46355+ pool =
46356+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46357+ sizeof(*data));
46358+ if (IS_ERR(pool))
46359+ return PTR_ERR(pool);
46360+ lowest_level = (carry_level *) (pool + 1);
46361+ init_carry_level(lowest_level, pool);
46362+ data = (reiser4_item_data *) (lowest_level + 3);
46363+
46364+ assert("edward-466", coord->between == AFTER_ITEM
46365+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46366+ || coord->between == EMPTY_NODE
46367+ || coord->between == BEFORE_UNIT);
46368+
46369+ if (coord->between == AFTER_UNIT) {
46370+ coord->unit_pos = 0;
46371+ coord->between = AFTER_ITEM;
46372+ }
46373+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46374+ 0 /* operate directly on coord -> node */);
46375+ if (IS_ERR(op) || (op == NULL)) {
46376+ done_carry_pool(pool);
46377+ return RETERR(op ? PTR_ERR(op) : -EIO);
46378+ }
46379+ data->user = 0;
46380+ data->iplug = item_plugin_by_id(CTAIL_ID);
46381+ data->arg = &cluster_shift;
46382+
46383+ data->length = 0;
46384+ data->data = NULL;
46385+
46386+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46387+ op->u.insert_flow.insert_point = coord;
46388+ op->u.insert_flow.flow = f;
46389+ op->u.insert_flow.data = data;
46390+ op->u.insert_flow.new_nodes = 0;
46391+
46392+ lowest_level->track_type = CARRY_TRACK_CHANGE;
46393+ lowest_level->tracked = lh;
46394+
46395+ result = reiser4_carry(lowest_level, NULL);
46396+ done_carry_pool(pool);
46397+
46398+ return result;
46399+}
46400+
46401+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46402+static int insert_cryptcompress_flow_in_place(coord_t * coord,
46403+ lock_handle * lh, flow_t * f,
46404+ struct inode *inode)
46405+{
46406+ int ret;
46407+ coord_t pos;
46408+ lock_handle lock;
46409+
46410+ assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46411+ assert("edward-484", coord->between == AT_UNIT
46412+ || coord->between == AFTER_ITEM);
46413+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46414+
46415+ coord_dup(&pos, coord);
46416+ pos.unit_pos = 0;
46417+ pos.between = AFTER_ITEM;
46418+
46419+ init_lh(&lock);
46420+ copy_lh(&lock, lh);
46421+
46422+ ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
46423+ done_lh(&lock);
46424+ assert("edward-1347", znode_is_write_locked(lh->node));
46425+ assert("edward-1228", !ret);
46426+ return ret;
46427+}
46428+
46429+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46430+static int overwrite_ctail(coord_t * coord, flow_t * f)
46431+{
46432+ unsigned count;
46433+
46434+ assert("edward-269", f->user == 0);
46435+ assert("edward-270", f->data != NULL);
46436+ assert("edward-271", f->length > 0);
46437+ assert("edward-272", coord_is_existing_unit(coord));
46438+ assert("edward-273", coord->unit_pos == 0);
46439+ assert("edward-274", znode_is_write_locked(coord->node));
46440+ assert("edward-275", reiser4_schedulable());
46441+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46442+ assert("edward-1243", ctail_ok(coord));
46443+
46444+ count = nr_units_ctail(coord);
46445+
46446+ if (count > f->length)
46447+ count = f->length;
46448+ memcpy(first_unit(coord), f->data, count);
46449+ move_flow_forward(f, count);
46450+ coord->unit_pos += count;
46451+ return 0;
46452+}
46453+
46454+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46455+ cut ctail (part or whole) starting from next unit position */
46456+static int cut_ctail(coord_t * coord)
46457+{
46458+ coord_t stop;
46459+
46460+ assert("edward-435", coord->between == AT_UNIT &&
46461+ coord->item_pos < coord_num_items(coord) &&
46462+ coord->unit_pos <= coord_num_units(coord));
46463+
46464+ if (coord->unit_pos == coord_num_units(coord))
46465+ /* nothing to cut */
46466+ return 0;
46467+ coord_dup(&stop, coord);
46468+ stop.unit_pos = coord_last_unit_pos(coord);
46469+
46470+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
46471+}
46472+
46473+int
46474+ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
46475+{
46476+ int result;
46477+ assert("edward-1244", inode != NULL);
46478+ assert("edward-1245", clust->hint != NULL);
46479+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46480+ assert("edward-1247", clust->reserved == 1);
46481+
46482+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46483+ if (cbk_errored(result))
46484+ return result;
46485+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
46486+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46487+
46488+ assert("edward-1295",
46489+ clust->hint->ext_coord.lh->node ==
46490+ clust->hint->ext_coord.coord.node);
46491+
46492+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
46493+
46494+ result = insert_unprepped_ctail(clust, inode);
46495+ all_grabbed2free();
46496+
46497+ assert("edward-1251", !result);
46498+ assert("edward-1252", cryptcompress_inode_ok(inode));
46499+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46500+ assert("edward-1254",
46501+ reiser4_clustered_blocks(reiser4_get_current_sb()));
46502+ assert("edward-1255",
46503+ znode_convertible(clust->hint->ext_coord.coord.node));
46504+
46505+ return result;
46506+}
46507+
46508+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
46509+{
46510+ int result = 0;
46511+ convert_item_info_t *info;
46512+
46513+ assert("edward-468", pos != NULL);
46514+ assert("edward-469", pos->sq != NULL);
46515+ assert("edward-845", item_convert_data(pos) != NULL);
46516+
46517+ info = item_convert_data(pos);
46518+ assert("edward-679", info->flow.data != NULL);
46519+
46520+ switch (mode) {
46521+ case CRC_APPEND_ITEM:
46522+ assert("edward-1229", info->flow.length != 0);
46523+ assert("edward-1256",
46524+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46525+ result =
46526+ insert_cryptcompress_flow_in_place(&pos->coord,
46527+ &pos->lock,
46528+ &info->flow,
46529+ info->inode);
46530+ break;
46531+ case CRC_OVERWRITE_ITEM:
46532+ assert("edward-1230", info->flow.length != 0);
46533+ overwrite_ctail(&pos->coord, &info->flow);
46534+ if (info->flow.length != 0)
46535+ break;
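+		/* fall through: the flow is exhausted, cut the rest of
+		   the item */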
46536+ case CRC_CUT_ITEM:
46537+ assert("edward-1231", info->flow.length == 0);
46538+ result = cut_ctail(&pos->coord);
46539+ break;
46540+ default:
46541+ result = RETERR(-EIO);
46542+ impossible("edward-244", "bad convert mode");
46543+ }
46544+ return result;
46545+}
46546+
46547+/* plugin->u.item.f.scan */
46548+int scan_ctail(flush_scan * scan)
46549+{
46550+ int result = 0;
46551+ struct page *page;
46552+ struct inode *inode;
46553+ jnode *node = scan->node;
46554+
46555+ assert("edward-227", scan->node != NULL);
46556+ assert("edward-228", jnode_is_cluster_page(scan->node));
46557+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46558+
46559+ page = jnode_page(node);
46560+ inode = page->mapping->host;
46561+
46562+ if (!reiser4_scanning_left(scan))
46563+ return result;
46564+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46565+ znode_make_dirty(scan->parent_lock.node);
46566+
46567+ if (!znode_convertible(scan->parent_lock.node)) {
46568+ if (JF_ISSET(scan->node, JNODE_DIRTY))
46569+ znode_set_convertible(scan->parent_lock.node);
46570+ else {
46571+ warning("edward-681",
46572+ "cluster page is already processed");
46573+ return -EAGAIN;
46574+ }
46575+ }
46576+ return result;
46577+}
46578+
46579+/* Returns true if convert item info should be attached; in that case the leftmost child is stored in pos->child */
46580+static int should_attach_convert_idata(flush_pos_t * pos)
46581+{
46582+ int result;
46583+ assert("edward-431", pos != NULL);
46584+ assert("edward-432", pos->child == NULL);
46585+ assert("edward-619", znode_is_write_locked(pos->coord.node));
46586+ assert("edward-470",
46587+ item_plugin_by_coord(&pos->coord) ==
46588+ item_plugin_by_id(CTAIL_ID));
46589+
46590+ /* check for leftmost child */
46591+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46592+
46593+ if (!pos->child)
46594+ return 0;
46595+ spin_lock_jnode(pos->child);
46596+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46597+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
46598+ spin_unlock_jnode(pos->child);
46599+ if (!result && pos->child) {
46600+		/* the existing child is not to be attached; release it */
46601+ jput(pos->child);
46602+ pos->child = NULL;
46603+ }
46604+ return result;
46605+}
46606+
46607+/* plugin->init_convert_data() */
46608+static int
46609+init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
46610+{
46611+ assert("edward-813", idata != NULL);
46612+ assert("edward-814", inode != NULL);
46613+
46614+ idata->inode = inode;
46615+ idata->d_cur = DC_FIRST_ITEM;
46616+ idata->d_next = DC_INVALID_STATE;
46617+
46618+ return 0;
46619+}
46620+
46621+static int alloc_item_convert_data(convert_info_t * sq)
46622+{
46623+ assert("edward-816", sq != NULL);
46624+ assert("edward-817", sq->itm == NULL);
46625+
46626+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
46627+ if (sq->itm == NULL)
46628+ return RETERR(-ENOMEM);
46629+ return 0;
46630+}
46631+
46632+static void free_item_convert_data(convert_info_t * sq)
46633+{
46634+ assert("edward-818", sq != NULL);
46635+ assert("edward-819", sq->itm != NULL);
46636+ assert("edward-820", sq->iplug != NULL);
46637+
46638+ kfree(sq->itm);
46639+ sq->itm = NULL;
46640+ return;
46641+}
46642+
46643+static int alloc_convert_data(flush_pos_t * pos)
46644+{
46645+ assert("edward-821", pos != NULL);
46646+ assert("edward-822", pos->sq == NULL);
46647+
46648+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
46649+ if (!pos->sq)
46650+ return RETERR(-ENOMEM);
46651+ memset(pos->sq, 0, sizeof(*pos->sq));
46652+ cluster_init_write(&pos->sq->clust, 0);
46653+ return 0;
46654+}
46655+
46656+void free_convert_data(flush_pos_t * pos)
46657+{
46658+ convert_info_t *sq;
46659+
46660+ assert("edward-823", pos != NULL);
46661+ assert("edward-824", pos->sq != NULL);
46662+
46663+ sq = pos->sq;
46664+ if (sq->itm)
46665+ free_item_convert_data(sq);
46666+ put_cluster_handle(&sq->clust);
46667+ kfree(pos->sq);
46668+ pos->sq = NULL;
46669+ return;
46670+}
46671+
46672+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46673+{
46674+ convert_info_t *sq;
46675+
46676+ assert("edward-825", pos != NULL);
46677+ assert("edward-826", pos->sq != NULL);
46678+ assert("edward-827", item_convert_data(pos) != NULL);
46679+ assert("edward-828", inode != NULL);
46680+
46681+ sq = pos->sq;
46682+
46683+ memset(sq->itm, 0, sizeof(*sq->itm));
46684+
46685+ /* iplug->init_convert_data() */
46686+ return init_convert_data_ctail(sq->itm, inode);
46687+}
46688+
46689+/* create and attach disk cluster info used by the 'convert' phase of
46690+   flush squalloc() */
46691+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46692+{
46693+ int ret = 0;
46694+ convert_item_info_t *info;
46695+ reiser4_cluster_t *clust;
46696+ file_plugin *fplug = inode_file_plugin(inode);
46697+ compression_plugin *cplug = inode_compression_plugin(inode);
46698+
46699+ assert("edward-248", pos != NULL);
46700+ assert("edward-249", pos->child != NULL);
46701+ assert("edward-251", inode != NULL);
46702+ assert("edward-682", cryptcompress_inode_ok(inode));
46703+ assert("edward-252",
46704+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
46705+ assert("edward-473",
46706+ item_plugin_by_coord(&pos->coord) ==
46707+ item_plugin_by_id(CTAIL_ID));
46708+
46709+ if (!pos->sq) {
46710+ ret = alloc_convert_data(pos);
46711+ if (ret)
46712+ return ret;
46713+ }
46714+ clust = &pos->sq->clust;
46715+ ret = grab_coa(&clust->tc, cplug);
46716+ if (ret)
46717+ goto err;
46718+ ret = set_cluster_by_page(clust,
46719+ jnode_page(pos->child),
46720+ MAX_CLUSTER_NRPAGES);
46721+ if (ret)
46722+ goto err;
46723+
46724+ assert("edward-829", pos->sq != NULL);
46725+ assert("edward-250", item_convert_data(pos) == NULL);
46726+
46727+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46728+
46729+ ret = alloc_item_convert_data(pos->sq);
46730+ if (ret)
46731+ goto err;
46732+ ret = init_item_convert_data(pos, inode);
46733+ if (ret)
46734+ goto err;
46735+ info = item_convert_data(pos);
46736+
46737+ ret = flush_cluster_pages(clust, pos->child, inode);
46738+ if (ret)
46739+ goto err;
46740+
46741+ reiser4_deflate_cluster(clust, inode);
46742+ inc_item_convert_count(pos);
46743+
46744+ /* make flow by transformed stream */
46745+ fplug->flow_by_inode(info->inode,
46746+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46747+ 0 /* kernel space */ ,
46748+ clust->tc.len,
46749+ clust_to_off(clust->index, inode),
46750+ WRITE_OP, &info->flow);
46751+ jput(pos->child);
46752+
46753+ assert("edward-683", cryptcompress_inode_ok(inode));
46754+ return 0;
46755+ err:
46756+ jput(pos->child);
46757+ free_convert_data(pos);
46758+ return ret;
46759+}
46760+
46761+/* clear up disk cluster info */
46762+static void detach_convert_idata(convert_info_t * sq)
46763+{
46764+ convert_item_info_t *info;
46765+
46766+ assert("edward-253", sq != NULL);
46767+ assert("edward-840", sq->itm != NULL);
46768+
46769+ info = sq->itm;
46770+ assert("edward-255", info->inode != NULL);
46771+ assert("edward-1212", info->flow.length == 0);
46772+
46773+ free_item_convert_data(sq);
46774+ return;
46775+}
46776+
46777+/* plugin->u.item.f.utmost_child */
46778+
46779+/* This function sets the leftmost child of the first cluster item,
46780+   if the child exists, and NULL otherwise.
46781+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46782+
46783+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
46784+{
46785+ reiser4_key key;
46786+
46787+ item_key_by_coord(coord, &key);
46788+
46789+ assert("edward-257", coord != NULL);
46790+ assert("edward-258", child != NULL);
46791+ assert("edward-259", side == LEFT_SIDE);
46792+ assert("edward-260",
46793+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46794+
46795+ if (!is_disk_cluster_key(&key, coord))
46796+ *child = NULL;
46797+ else
46798+ *child = jlookup(current_tree,
46799+ get_key_objectid(item_key_by_coord
46800+ (coord, &key)),
46801+ off_to_pg(get_key_offset(&key)));
46802+ return 0;
46803+}
46804+
46805+/* Returns true if @p2 is the item immediately following @p1
46806+   in the _same_ disk cluster.
46807+   A disk cluster is a set of items. If ->clustered() != NULL,
46808+   the whole disk cluster should be read/modified along with each item
46809+*/
46810+static int clustered_ctail(const coord_t * p1, const coord_t * p2)
46811+{
46812+ return mergeable_ctail(p1, p2);
46813+}
46814+
46815+/* Go rightward and check for the next disk cluster item; set
46816+   d_next to DC_CHAINED_ITEM if such an item exists.
46817+   If the current position is the last item, go to the right neighbor.
46818+   Skip empty nodes. Note that right neighbors may not be in
46819+   the slum because of races. If so, make them dirty and
46820+   convertible.
46821+*/
46822+static int next_item_dc_stat(flush_pos_t * pos)
46823+{
46824+ int ret = 0;
46825+ int stop = 0;
46826+ znode *cur;
46827+ coord_t coord;
46828+ lock_handle lh;
46829+ lock_handle right_lock;
46830+
46831+ assert("edward-1232", !node_is_empty(pos->coord.node));
46832+ assert("edward-1014",
46833+ pos->coord.item_pos < coord_num_items(&pos->coord));
46834+ assert("edward-1015", chaining_data_present(pos));
46835+ assert("edward-1017",
46836+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
46837+
46838+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
46839+
46840+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
46841+ return ret;
46842+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
46843+ return ret;
46844+
46845+ /* check next slum item */
46846+ init_lh(&right_lock);
46847+ cur = pos->coord.node;
46848+
46849+ while (!stop) {
46850+ init_lh(&lh);
46851+ ret = reiser4_get_right_neighbor(&lh,
46852+ cur,
46853+ ZNODE_WRITE_LOCK,
46854+ GN_CAN_USE_UPPER_LEVELS);
46855+ if (ret)
46856+ break;
46857+ ret = zload(lh.node);
46858+ if (ret) {
46859+ done_lh(&lh);
46860+ break;
46861+ }
46862+ coord_init_before_first_item(&coord, lh.node);
46863+
46864+ if (node_is_empty(lh.node)) {
46865+ znode_make_dirty(lh.node);
46866+ znode_set_convertible(lh.node);
46867+ stop = 0;
46868+ } else if (clustered_ctail(&pos->coord, &coord)) {
46869+
46870+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
46871+
46872+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
46873+ /*
46874+ warning("edward-1024",
46875+ "next slum item mergeable, "
46876+ "but znode %p isn't dirty\n",
46877+ lh.node);
46878+ */
46879+ znode_make_dirty(lh.node);
46880+ }
46881+ if (!znode_convertible(lh.node)) {
46882+ /*
46883+ warning("edward-1272",
46884+ "next slum item mergeable, "
46885+ "but znode %p isn't convertible\n",
46886+ lh.node);
46887+ */
46888+ znode_set_convertible(lh.node);
46889+ }
46890+ stop = 1;
46891+ } else
46892+ stop = 1;
46893+ zrelse(lh.node);
46894+ done_lh(&right_lock);
46895+ copy_lh(&right_lock, &lh);
46896+ done_lh(&lh);
46897+ cur = right_lock.node;
46898+ }
46899+ done_lh(&right_lock);
46900+
46901+ if (ret == -E_NO_NEIGHBOR)
46902+ ret = 0;
46903+ return ret;
46904+}
46905+
46906+static int
46907+assign_convert_mode(convert_item_info_t * idata,
46908+ cryptcompress_write_mode_t * mode)
46909+{
46910+ int result = 0;
46911+
46912+ assert("edward-1025", idata != NULL);
46913+
46914+ if (idata->flow.length) {
46915+ /* append or overwrite */
46916+ switch (idata->d_cur) {
46917+ case DC_FIRST_ITEM:
46918+ case DC_CHAINED_ITEM:
46919+ *mode = CRC_OVERWRITE_ITEM;
46920+ break;
46921+ case DC_AFTER_CLUSTER:
46922+ *mode = CRC_APPEND_ITEM;
46923+ break;
46924+ default:
46925+ impossible("edward-1018", "wrong current item state");
46926+ }
46927+ } else {
46928+ /* cut or invalidate */
46929+ switch (idata->d_cur) {
46930+ case DC_FIRST_ITEM:
46931+ case DC_CHAINED_ITEM:
46932+ *mode = CRC_CUT_ITEM;
46933+ break;
46934+ case DC_AFTER_CLUSTER:
46935+ result = 1;
46936+ break;
46937+ default:
46938+ impossible("edward-1019", "wrong current item state");
46939+ }
46940+ }
46941+ return result;
46942+}
46943+
46944+/* plugin->u.item.f.convert */
46945+/* write ctail in guessed mode */
46946+int convert_ctail(flush_pos_t * pos)
46947+{
46948+ int result;
46949+ int nr_items;
46950+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
46951+
46952+ assert("edward-1020", pos != NULL);
46953+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
46954+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
46955+ assert("edward-1258", ctail_ok(&pos->coord));
46956+ assert("edward-261", pos->coord.node != NULL);
46957+
46958+ nr_items = coord_num_items(&pos->coord);
46959+ if (!chaining_data_present(pos)) {
46960+ if (should_attach_convert_idata(pos)) {
46961+ /* attach convert item info */
46962+ struct inode *inode;
46963+
46964+ assert("edward-264", pos->child != NULL);
46965+ assert("edward-265", jnode_page(pos->child) != NULL);
46966+ assert("edward-266",
46967+ jnode_page(pos->child)->mapping != NULL);
46968+
46969+ inode = jnode_page(pos->child)->mapping->host;
46970+
46971+ assert("edward-267", inode != NULL);
46972+
46973+			/* attach item convert info via the child, then put the child */
46974+ result = attach_convert_idata(pos, inode);
46975+ pos->child = NULL;
46976+ if (result == -E_REPEAT) {
46977+				/* the jnode became clean, or there are no dirty
46978+				   pages (nothing to update in the disk cluster) */
46979+ warning("edward-1021",
46980+ "convert_ctail: nothing to attach");
46981+ return 0;
46982+ }
46983+ if (result != 0)
46984+ return result;
46985+ } else
46986+ /* unconvertible */
46987+ return 0;
46988+ } else {
46989+ /* use old convert info */
46990+
46991+ convert_item_info_t *idata;
46992+
46993+ idata = item_convert_data(pos);
46994+
46995+ result = assign_convert_mode(idata, &mode);
46996+ if (result) {
46997+ /* disk cluster is over,
46998+ nothing to update anymore */
46999+ detach_convert_idata(pos->sq);
47000+ return 0;
47001+ }
47002+ }
47003+
47004+ assert("edward-433", chaining_data_present(pos));
47005+ assert("edward-1022",
47006+ pos->coord.item_pos < coord_num_items(&pos->coord));
47007+
47008+ result = next_item_dc_stat(pos);
47009+ if (result) {
47010+ detach_convert_idata(pos->sq);
47011+ return result;
47012+ }
47013+ result = do_convert_ctail(pos, mode);
47014+ if (result) {
47015+ detach_convert_idata(pos->sq);
47016+ return result;
47017+ }
47018+ switch (mode) {
47019+ case CRC_CUT_ITEM:
47020+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
47021+ assert("edward-1215",
47022+ coord_num_items(&pos->coord) == nr_items ||
47023+ coord_num_items(&pos->coord) == nr_items - 1);
47024+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
47025+ break;
47026+ if (coord_num_items(&pos->coord) != nr_items) {
47027+ /* the item was killed, no more chained items */
47028+ detach_convert_idata(pos->sq);
47029+ if (!node_is_empty(pos->coord.node))
47030+ /* make sure the next item will be scanned */
47031+ coord_init_before_item(&pos->coord);
47032+ break;
47033+ }
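+		/* fall through: the item was not killed; detach convert
+		   info as after an append */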
47034+ case CRC_APPEND_ITEM:
47035+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
47036+ detach_convert_idata(pos->sq);
47037+ break;
47038+ case CRC_OVERWRITE_ITEM:
47039+ if (coord_is_unprepped_ctail(&pos->coord)) {
47040+			/* convert an unprepped ctail to a prepped one */
47041+ int shift;
47042+ shift =
47043+ inode_cluster_shift(item_convert_data(pos)->inode);
47044+ assert("edward-1259", cluster_shift_ok(shift));
47045+ put_unaligned((d8)shift,
47046+ &ctail_formatted_at(&pos->coord)->
47047+ cluster_shift);
47048+ }
47049+ break;
47050+ }
47051+ return result;
47052+}
47053+
47054+/* Make Linus happy.
47055+ Local variables:
47056+ c-indentation-style: "K&R"
47057+ mode-name: "LC"
47058+ c-basic-offset: 8
47059+ tab-width: 8
47060+ fill-column: 120
47061+ End:
47062+*/
47063diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.20/fs/reiser4/plugin/item/ctail.h
47064--- linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
47065+++ linux-2.6.20/fs/reiser4/plugin/item/ctail.h 2007-05-06 14:50:43.803008220 +0400
47066@@ -0,0 +1,97 @@
47067+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47068+
47069+#if !defined( __FS_REISER4_CTAIL_H__ )
47070+#define __FS_REISER4_CTAIL_H__
47071+
47072+/* Disk format of ctail item */
47073+typedef struct ctail_item_format {
47074+ /* packed shift; size of (prepped) disk cluster
47075+ is calculated as (1 << cluster_shift) */
47076+ d8 cluster_shift;
47077+ /* ctail body */
47078+ d8 body[0];
47079+} __attribute__ ((packed)) ctail_item_format;
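+
+/* e.g. cluster_shift == 12 describes a 4K (one-page) disk cluster and
+   cluster_shift == 16 a 64K one; the special UCTAIL_SHIFT value below
+   marks unprepped disk clusters */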
47080+
47081+/* Unprepped disk cluster is represented by a single ctail item
47082+ with the following "magic" attributes: */
47083+/* "magic" cluster_shift */
47084+#define UCTAIL_SHIFT 0xff
47085+/* How many units unprepped ctail item has */
47086+#define UCTAIL_NR_UNITS 1
47087+
47088+/* The following is a set of various item states in a disk cluster.
47089+ Disk cluster is a set of items whose keys belong to the interval
47090+ [dc_key , dc_key + disk_cluster_size - 1] */
47091+typedef enum {
47092+ DC_INVALID_STATE = 0,
47093+ DC_FIRST_ITEM = 1,
47094+ DC_CHAINED_ITEM = 2,
47095+ DC_AFTER_CLUSTER = 3
47096+} dc_item_stat;
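+
+/* e.g. while a disk cluster of three chained items is being converted,
+   d_cur passes through DC_FIRST_ITEM, then DC_CHAINED_ITEM twice, and
+   the cluster is over at DC_AFTER_CLUSTER */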
47097+
47098+/* ctail-specific extension.
47099+ In particular this describes parameters of disk cluster an item belongs to */
47100+typedef struct {
47101+ int shift; /* this contains cluster_shift extracted from
47102+ ctail_item_format (above), or UCTAIL_SHIFT
47103+ (the last one is the "magic" of unprepped disk clusters)*/
47104+ int dsize; /* size of a prepped disk cluster */
47105+ int ncount; /* count of nodes occupied by a disk cluster */
47106+} ctail_coord_extension_t;
47107+
47108+struct cut_list;
47109+
47110+/* plugin->item.b.* */
47111+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47112+ const reiser4_item_data *);
47113+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47114+pos_in_node_t nr_units_ctail(const coord_t * coord);
47115+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47116+void print_ctail(const char *prefix, coord_t * coord);
47117+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47118+
47119+int paste_ctail(coord_t * coord, reiser4_item_data * data,
47120+ carry_plugin_info * info UNUSED_ARG);
47121+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47122+int can_shift_ctail(unsigned free_space, coord_t * coord,
47123+ znode * target, shift_direction pend, unsigned *size,
47124+ unsigned want);
47125+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47126+ unsigned count, shift_direction where_is_free_space,
47127+ unsigned free_space);
47128+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47129+ carry_cut_data *, reiser4_key * smallest_removed,
47130+ reiser4_key * new_first);
47131+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47132+ carry_kill_data *, reiser4_key * smallest_removed,
47133+ reiser4_key * new_first);
47134+int ctail_ok(const coord_t * coord);
47135+int check_ctail(const coord_t * coord, const char **error);
47136+
47137+/* plugin->u.item.s.* */
47138+int read_ctail(struct file *, flow_t *, hint_t *);
47139+int readpage_ctail(void *, struct page *);
47140+int readpages_ctail(struct file *, struct address_space *, struct list_head *);
47141+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47142+int create_hook_ctail(const coord_t * coord, void *arg);
47143+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47144+ carry_kill_data *);
47145+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47146+
47147+/* plugin->u.item.f */
47148+int utmost_child_ctail(const coord_t *, sideof, jnode **);
47149+int scan_ctail(flush_scan *);
47150+int convert_ctail(flush_pos_t *);
47151+size_t inode_scaled_cluster_size(struct inode *);
47152+
47153+#endif /* __FS_REISER4_CTAIL_H__ */
47154+
47155+/* Make Linus happy.
47156+ Local variables:
47157+ c-indentation-style: "K&R"
47158+ mode-name: "LC"
47159+ c-basic-offset: 8
47160+ tab-width: 8
47161+ fill-column: 120
47162+ End:
47163+*/
47164diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent.c linux-2.6.20/fs/reiser4/plugin/item/extent.c
47165--- linux-2.6.20.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
47166+++ linux-2.6.20/fs/reiser4/plugin/item/extent.c 2007-05-06 14:50:43.807009470 +0400
47167@@ -0,0 +1,197 @@
47168+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47169+
47170+#include "item.h"
47171+#include "../../key.h"
47172+#include "../../super.h"
47173+#include "../../carry.h"
47174+#include "../../inode.h"
47175+#include "../../page_cache.h"
47176+#include "../../flush.h"
47177+#include "../object.h"
47178+
47179+/* prepare a reiser4_item_data structure; it is used to put one extent unit into the tree */
47180+/* Audited by: green(2002.06.13) */
47181+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47182+ int nr_extents)
47183+{
47184+ data->data = ext_unit;
47185+ /* data->data is kernel space */
47186+ data->user = 0;
47187+ data->length = sizeof(reiser4_extent) * nr_extents;
47188+ data->arg = NULL;
47189+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47190+ return data;
47191+}
47192+
47193+/* how many bytes are addressed by the first @nr extents of the extent item */
47194+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
47195+{
47196+ pos_in_node_t i;
47197+ reiser4_block_nr blocks;
47198+ reiser4_extent *ext;
47199+
47200+ ext = item_body_by_coord(coord);
47201+ assert("vs-263", nr <= nr_units_extent(coord));
47202+
47203+ blocks = 0;
47204+ for (i = 0; i < nr; i++, ext++) {
47205+ blocks += extent_get_width(ext);
47206+ }
47207+
47208+ return blocks * current_blocksize;
47209+}
47210+
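+/*
+ * A worked example of the calculation above, assuming a 4096-byte block
+ * size: for an item whose first two units have widths 3 and 2,
+ * reiser4_extent_size(coord, 2) returns (3 + 2) * 4096 == 20480 bytes.
+ */
+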
47211+extent_state state_of_extent(reiser4_extent * ext)
47212+{
47213+ switch ((int)extent_get_start(ext)) {
47214+ case 0:
47215+ return HOLE_EXTENT;
47216+ case 1:
47217+ return UNALLOCATED_EXTENT;
47218+ default:
47219+ break;
47220+ }
47221+ return ALLOCATED_EXTENT;
47222+}
47223+
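+/*
+ * The extent state is encoded entirely in the start block number:
+ * start == 0 marks a hole, start == 1 an unallocated extent, and any
+ * other value is the first block of an allocated extent. A usage
+ * sketch (hypothetical wrapper, not used elsewhere in this patch):
+ */
+static inline int extent_is_hole(reiser4_extent * ext)
+{
+	return state_of_extent(ext) == HOLE_EXTENT;
+}
+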
47224+int extent_is_unallocated(const coord_t * item)
47225+{
47226+ assert("jmacd-5133", item_is_extent(item));
47227+
47228+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47229+}
47230+
47231+/* set extent's start and width */
47232+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
47233+ reiser4_block_nr width)
47234+{
47235+ extent_set_start(ext, start);
47236+ extent_set_width(ext, width);
47237+}
47238+
47239+/**
47240+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it
47241+ * @h: replace handle carrying the coordinate and lock handle of the
47242+ *     extent unit to be overwritten, the replacement extent
47243+ *     (@h->overwrite), the extents to paste after it (@h->new_extents,
47244+ *     @h->nr_new_extents), the paste key (@h->paste_key) and the
47245+ *     insertion flags (@h->flags)
47246+ * @return_inserted_position: selects what @h->coord and @h->lh are set
47247+ *     to on return
47248+ *
47249+ * Overwrites one extent and pastes 1 or 2 more extents after the
47250+ * overwritten one. If @return_inserted_position is 1, @h->coord and
47251+ * @h->lh are returned set to the first of the newly inserted units; if
47252+ * it is 0, they are returned set to the extent which was overwritten.
47253+ */
47254+int reiser4_replace_extent(struct replace_handle *h,
47255+ int return_inserted_position)
47256+{
47257+ int result;
47258+ znode *orig_znode;
47259+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47260+
47261+ assert("vs-990", coord_is_existing_unit(h->coord));
47262+ assert("vs-1375", znode_is_write_locked(h->coord->node));
47263+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47264+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47265+ assert("vs-1427", ergo(h->nr_new_extents == 2,
47266+ extent_get_width(&h->new_extents[1]) != 0));
47267+
47268+ /* compose structure for paste */
47269+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47270+
47271+ coord_dup(&h->coord_after, h->coord);
47272+ init_lh(&h->lh_after);
47273+ copy_lh(&h->lh_after, h->lh);
47274+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47275+ reiser4_tap_monitor(&h->watch);
47276+
47277+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47278+ orig_znode = h->coord->node;
47279+
47280+#if REISER4_DEBUG
47281+ /* make sure that key is set properly */
47282+ unit_key_by_coord(h->coord, &h->tmp);
47283+ set_key_offset(&h->tmp,
47284+ get_key_offset(&h->tmp) +
47285+ extent_get_width(&h->overwrite) * current_blocksize);
47286+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47287+#endif
47288+
47289+ /* set insert point after unit to be replaced */
47290+ h->coord->between = AFTER_UNIT;
47291+
47292+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47293+ &h->paste_key, &h->item, h->flags);
47294+ if (!result) {
47295+ /* now we have to replace the unit after which new units were
47296+ inserted. Its position is tracked by @watch */
47297+ reiser4_extent *ext;
47298+ znode *node;
47299+
47300+ node = h->coord_after.node;
47301+ if (node != orig_znode) {
47302+ coord_clear_iplug(&h->coord_after);
47303+ result = zload(node);
47304+ }
47305+
47306+ if (likely(!result)) {
47307+ ext = extent_by_coord(&h->coord_after);
47308+
47309+ assert("vs-987", znode_is_loaded(node));
47310+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47311+
47312+ /* overwrite extent unit */
47313+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47314+ znode_make_dirty(node);
47315+
47316+ if (node != orig_znode)
47317+ zrelse(node);
47318+
47319+ if (return_inserted_position == 0) {
47320+ /* coord and lh are to be set to overwritten
47321+ extent */
47322+ assert("vs-1662",
47323+ WITH_DATA(node, !memcmp(&h->overwrite,
47324+ extent_by_coord(
47325+ &h->coord_after),
47326+ sizeof(reiser4_extent))));
47327+
47328+ *h->coord = h->coord_after;
47329+ done_lh(h->lh);
47330+ copy_lh(h->lh, &h->lh_after);
47331+ } else {
47332+ /* h->coord and h->lh are to be set to first of
47333+ inserted units */
47334+ assert("vs-1663",
47335+ WITH_DATA(h->coord->node,
47336+ !memcmp(&h->new_extents[0],
47337+ extent_by_coord(h->coord),
47338+ sizeof(reiser4_extent))));
47339+ assert("vs-1664", h->lh->node == h->coord->node);
47340+ }
47341+ }
47342+ }
47343+ reiser4_tap_done(&h->watch);
47344+
47345+ return result;
47346+}
47347+
47348+lock_handle *znode_lh(znode *node)
47349+{
47350+ assert("vs-1371", znode_is_write_locked(node));
47351+ assert("vs-1372", znode_is_wlocked_once(node));
47352+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
47353+}
47354+
47355+/*
47356+ * Local variables:
47357+ * c-indentation-style: "K&R"
47358+ * mode-name: "LC"
47359+ * c-basic-offset: 8
47360+ * tab-width: 8
47361+ * fill-column: 79
47362+ * scroll-step: 1
47363+ * End:
47364+ */
47365diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_file_ops.c
47366--- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
47367+++ linux-2.6.20/fs/reiser4/plugin/item/extent_file_ops.c 2007-05-06 14:50:43.807009470 +0400
47368@@ -0,0 +1,1443 @@
47369+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47370+
47371+#include "item.h"
47372+#include "../../inode.h"
47373+#include "../../page_cache.h"
47374+#include "../object.h"
47375+
47376+#include <linux/quotaops.h>
47377+#include <linux/swap.h>
47378+#include "../../../../mm/filemap.h"
47379+
47380+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47381+{
47382+ reiser4_extent *ext;
47383+
47384+ ext = (reiser4_extent *) (zdata(node) + offset);
47385+ return ext;
47386+}
47387+
47388+/**
47389+ * check_uf_coord - verify coord extension
47390+ * @uf_coord: coord extension to verify
47391+ * @key: expected key of the position, or NULL
47392+ *
47393+ * Makes sure that all fields of @uf_coord are set properly. If @key is
47394+ * specified, check whether @uf_coord corresponds to it.
47395+ */
47396+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47397+{
47398+#if REISER4_DEBUG
47399+ const coord_t *coord;
47400+ const extent_coord_extension_t *ext_coord;
47401+ reiser4_extent *ext;
47402+
47403+ coord = &uf_coord->coord;
47404+ ext_coord = &uf_coord->extension.extent;
47405+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47406+
47407+ assert("",
47408+ WITH_DATA(coord->node,
47409+ (uf_coord->valid == 1 &&
47410+ coord_is_iplug_set(coord) &&
47411+ item_is_extent(coord) &&
47412+ ext_coord->nr_units == nr_units_extent(coord) &&
47413+ ext == extent_by_coord(coord) &&
47414+ ext_coord->width == extent_get_width(ext) &&
47415+ coord->unit_pos < ext_coord->nr_units &&
47416+ ext_coord->pos_in_unit < ext_coord->width &&
47417+ memcmp(ext, &ext_coord->extent,
47418+ sizeof(reiser4_extent)) == 0)));
47419+ if (key) {
47420+ reiser4_key coord_key;
47421+
47422+ unit_key_by_coord(&uf_coord->coord, &coord_key);
47423+ set_key_offset(&coord_key,
47424+ get_key_offset(&coord_key) +
47425+ (uf_coord->extension.extent.
47426+ pos_in_unit << PAGE_CACHE_SHIFT));
47427+ assert("", keyeq(key, &coord_key));
47428+ }
47429+#endif
47430+}
47431+
47432+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47433+{
47434+ check_uf_coord(uf_coord, NULL);
47435+
47436+ return ext_by_offset(uf_coord->coord.node,
47437+ uf_coord->extension.extent.ext_offset);
47438+}
47439+
47440+#if REISER4_DEBUG
47441+
47442+/**
47443+ * offset_is_in_unit - check whether an offset falls into an extent unit
47444+ * @coord: coord set to an existing extent unit
47445+ * @off: file offset to check
47446+ *
47447+ * Returns 1 if offset @off is inside of the extent unit pointed to by
47448+ * @coord, 0 otherwise.
47449+ */
47450+static int offset_is_in_unit(const coord_t *coord, loff_t off)
47451+{
47452+ reiser4_key unit_key;
47453+ __u64 unit_off;
47454+ reiser4_extent *ext;
47455+
47456+ ext = extent_by_coord(coord);
47457+
47458+ unit_key_extent(coord, &unit_key);
47459+ unit_off = get_key_offset(&unit_key);
47460+ if (off < unit_off)
47461+ return 0;
47462+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47463+ return 0;
47464+ return 1;
47465+}
47466+
47467+static int
47468+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47469+{
47470+ reiser4_key item_key;
47471+
47472+ assert("vs-771", coord_is_existing_unit(coord));
47473+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47474+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47475+
47476+ return offset_is_in_unit(coord, get_key_offset(key));
47477+}
47478+
47479+#endif
47480+
47481+/**
47482+ * can_append - check whether a key continues the item
47483+ * @key: key to check
47484+ * @coord: coord of an existing extent item
47485+ *
47486+ * Returns 1 if @key is equal to the append key of the item @coord is set to.
47487+ */
47488+static int can_append(const reiser4_key *key, const coord_t *coord)
47489+{
47490+ reiser4_key append_key;
47491+
47492+ return keyeq(key, append_key_extent(coord, &append_key));
47493+}
47494+
47495+/**
47496+ * append_hole - append the last file item with a hole extent
47497+ * @coord: coord of the last item of the file
47498+ * @lh: lock handle of the twig node the item is in
47499+ * @key: key of the first byte the write targets past the current end
47500+ *
47501+ */
47502+static int append_hole(coord_t *coord, lock_handle *lh,
47503+ const reiser4_key *key)
47504+{
47505+ reiser4_key append_key;
47506+ reiser4_block_nr hole_width;
47507+ reiser4_extent *ext, new_ext;
47508+ reiser4_item_data idata;
47509+
47510+ /* last item of file may have to be appended with hole */
47511+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47512+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47513+
47514+ /* key of first byte which is not addressed by this extent */
47515+ append_key_extent(coord, &append_key);
47516+
47517+ assert("", keyle(&append_key, key));
47518+
47519+ /*
47520+ * extent item has to be appended with hole. Calculate length of that
47521+ * hole
47522+ */
47523+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47524+ current_blocksize - 1) >> current_blocksize_bits);
47525+ assert("vs-954", hole_width > 0);
47526+
47527+ /* set coord after last unit */
47528+ coord_init_after_item_end(coord);
47529+
47530+ /* get last extent in the item */
47531+ ext = extent_by_coord(coord);
47532+ if (state_of_extent(ext) == HOLE_EXTENT) {
47533+ /*
47534+ * last extent of a file is hole extent. Widen that extent by
47535+ * @hole_width blocks. Note that we do not worry about
47536+ * overflowing - extent width is 64 bits
47537+ */
47538+ reiser4_set_extent(ext, HOLE_EXTENT_START,
47539+ extent_get_width(ext) + hole_width);
47540+ znode_make_dirty(coord->node);
47541+ return 0;
47542+ }
47543+
47544+ /* append last item of the file with hole extent unit */
47545+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47546+ state_of_extent(ext) == UNALLOCATED_EXTENT));
47547+
47548+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47549+ init_new_extent(&idata, &new_ext, 1);
47550+ return insert_into_item(coord, lh, &append_key, &idata, 0);
47551+}
47552+
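+/*
+ * A worked example of the hole sizing above, assuming a 4096-byte block
+ * size: if the item currently ends at offset 8192 (append_key) and the
+ * write targets offset 20480 (@key), then
+ *	hole_width = (20480 - 8192 + 4095) >> 12 == 3,
+ * i.e. the hole covers the three blocks holding bytes 8192..20479.
+ */
+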
47553+/**
47554+ * check_jnodes
47555+ * @twig: longterm locked twig node
47556+ * @key: key of the page of the first of @count consecutive jnodes
47557+ * @count: number of consecutive jnodes whose keys must fit into @twig
47558+ */
47559+static void check_jnodes(znode *twig, const reiser4_key *key, int count)
47560+{
47561+#if REISER4_DEBUG
47562+ coord_t c;
47563+ reiser4_key node_key, jnode_key;
47564+
47565+ jnode_key = *key;
47566+
47567+ assert("", twig != NULL);
47568+ assert("", znode_get_level(twig) == TWIG_LEVEL);
47569+ assert("", znode_is_write_locked(twig));
47570+
47571+ zload(twig);
47572+ /* get the smallest key in twig node */
47573+ coord_init_first_unit(&c, twig);
47574+ unit_key_by_coord(&c, &node_key);
47575+ assert("", keyle(&node_key, &jnode_key));
47576+
47577+ coord_init_last_unit(&c, twig);
47578+ unit_key_by_coord(&c, &node_key);
47579+ if (item_plugin_by_coord(&c)->s.file.append_key)
47580+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
47581+ set_key_offset(&jnode_key,
47582+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
47583+ assert("", keylt(&jnode_key, &node_key));
47584+ zrelse(twig);
47585+#endif
47586+}
47587+
47588+/**
47589+ * append_last_extent - append last file item
47590+ * @uf_coord: coord to start insertion from
47591+ * @key: key of the first jnode's page
47592+ * @jnodes: array of jnodes
47593+ * @count: number of jnodes in the array
47594+ *
47595+ * Appends the last extent item of the file with an unallocated extent unit of
47596+ * width @count and assigns fake block numbers to the corresponding jnodes.
47597+ */
47598+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47599+ jnode **jnodes, int count)
47600+{
47601+ int result;
47602+ reiser4_extent new_ext;
47603+ reiser4_item_data idata;
47604+ coord_t *coord;
47605+ extent_coord_extension_t *ext_coord;
47606+ reiser4_extent *ext;
47607+ reiser4_block_nr block;
47608+ jnode *node;
47609+ int i;
47610+
47611+ coord = &uf_coord->coord;
47612+ ext_coord = &uf_coord->extension.extent;
47613+ ext = ext_by_ext_coord(uf_coord);
47614+
47615+ /* check correctness of position in the item */
47616+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
47617+ assert("vs-1311", coord->between == AFTER_UNIT);
47618+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
47619+
47620+ if (!can_append(key, coord)) {
47621+ /* hole extent has to be inserted */
47622+ result = append_hole(coord, uf_coord->lh, key);
47623+ uf_coord->valid = 0;
47624+ return result;
47625+ }
47626+
47627+ if (count == 0)
47628+ return 0;
47629+
47630+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
47631+
47632+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
47633+ count);
47634+ BUG_ON(result != 0);
47635+
47636+ switch (state_of_extent(ext)) {
47637+ case UNALLOCATED_EXTENT:
47638+ /*
47639+ * last extent unit of the file is unallocated one. Increase
47640+ * its width by @count
47641+ */
47642+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
47643+ extent_get_width(ext) + count);
47644+ znode_make_dirty(coord->node);
47645+
47646+ /* update coord extension */
47647+ ext_coord->width += count;
47648+ ON_DEBUG(extent_set_width
47649+ (&uf_coord->extension.extent.extent,
47650+ ext_coord->width));
47651+ break;
47652+
47653+ case HOLE_EXTENT:
47654+ case ALLOCATED_EXTENT:
47655+ /*
47656+ * last extent unit of the file is either hole or allocated
47657+ * one. Append one unallocated extent of width @count
47658+ */
47659+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47660+ init_new_extent(&idata, &new_ext, 1);
47661+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
47662+ uf_coord->valid = 0;
47663+ if (result)
47664+ return result;
47665+ break;
47666+
47667+ default:
47668+ return RETERR(-EIO);
47669+ }
47670+
47671+ /*
47672+ * make sure that we hold long term locked twig node containing all
47673+ * jnodes we are about to capture
47674+ */
47675+ check_jnodes(uf_coord->lh->node, key, count);
47676+
47677+ /*
47678+ * assign fake block numbers to all jnodes. FIXME: make sure whether
47679+ * twig node containing inserted extent item is locked
47680+ */
47681+ block = fake_blocknr_unformatted(count);
47682+ for (i = 0; i < count; i ++, block ++) {
47683+ node = jnodes[i];
47684+ spin_lock_jnode(node);
47685+ JF_SET(node, JNODE_CREATED);
47686+ jnode_set_block(node, &block);
47687+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47688+ BUG_ON(result != 0);
47689+ jnode_make_dirty_locked(node);
47690+ spin_unlock_jnode(node);
47691+ }
47692+ return count;
47693+}
47694+
47695+/**
47696+ * insert_first_hole - insert hole extent into tree
47697+ * @coord: coord to insert at
47698+ * @lh: lock handle of @coord's node
47699+ * @key: key the write targets; the hole covers offsets 0 up to it
47700+ *
47701+ * Inserts a hole extent spanning from file offset 0 up to @key.
47702+ */
47703+static int insert_first_hole(coord_t *coord, lock_handle *lh,
47704+ const reiser4_key *key)
47705+{
47706+ reiser4_extent new_ext;
47707+ reiser4_item_data idata;
47708+ reiser4_key item_key;
47709+ reiser4_block_nr hole_width;
47710+
47711+ /* @coord must be set for inserting of new item */
47712+ assert("vs-711", coord_is_between_items(coord));
47713+
47714+ item_key = *key;
47715+ set_key_offset(&item_key, 0ull);
47716+
47717+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
47718+ current_blocksize_bits);
47719+ assert("vs-710", hole_width > 0);
47720+
47721+ /* compose body of hole extent and insert item into tree */
47722+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47723+ init_new_extent(&idata, &new_ext, 1);
47724+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
47725+}
47726+
47727+
47728+/**
47729+ * insert_first_extent - insert first file item
47730+ * @uf_coord: coord to start insertion from
47731+ * @key: key of the first byte to be written
47732+ * @jnodes: array of jnodes
47733+ * @count: number of jnodes in the array
47734+ * @inode: inode of file
47735+ *
47736+ * There are no items of file @inode in the tree yet. Insert an unallocated
47737+ * extent of width @count into the tree, or a hole extent if the write does
47738+ * not start at the beginning of the file. Assign fake block numbers to the
47739+ * corresponding jnodes. Returns number of jnodes or error code.
47740+ */
47741+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47742+ jnode **jnodes, int count,
47743+ struct inode *inode)
47744+{
47745+ int result;
47746+ int i;
47747+ reiser4_extent new_ext;
47748+ reiser4_item_data idata;
47749+ reiser4_block_nr block;
47750+ unix_file_info_t *uf_info;
47751+ jnode *node;
47752+
47753+ /* first extent insertion starts at leaf level */
47754+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
47755+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
47756+
47757+ if (get_key_offset(key) != 0) {
47758+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
47759+ uf_coord->valid = 0;
47760+ uf_info = unix_file_inode_data(inode);
47761+
47762+ /*
47763+ * first item insertion is only possible when writing to empty
47764+ * file or performing tail conversion
47765+ */
47766+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47767+ (reiser4_inode_get_flag(inode,
47768+ REISER4_PART_MIXED) &&
47769+ reiser4_inode_get_flag(inode,
47770+ REISER4_PART_IN_CONV))));
47771+ /* if file was empty - update its state */
47772+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47773+ uf_info->container = UF_CONTAINER_EXTENTS;
47774+ return result;
47775+ }
47776+
47777+ if (count == 0)
47778+ return 0;
47779+
47780+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47781+ BUG_ON(result != 0);
47782+
47783+ /*
47784+ * prepare for tree modification: compose body of item and item data
47785+ * structure needed for insertion
47786+ */
47787+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47788+ init_new_extent(&idata, &new_ext, 1);
47789+
47790+ /* insert extent item into the tree */
47791+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47792+ uf_coord->lh);
47793+ if (result)
47794+ return result;
47795+
47796+ /*
47797+ * make sure that we hold long term locked twig node containing all
47798+ * jnodes we are about to capture
47799+ */
47800+ check_jnodes(uf_coord->lh->node, key, count);
47801+ /*
47802+ * assign fake block numbers to all jnodes, capture and mark them dirty
47803+ */
47804+ block = fake_blocknr_unformatted(count);
47805+ for (i = 0; i < count; i ++, block ++) {
47806+ node = jnodes[i];
47807+ spin_lock_jnode(node);
47808+ JF_SET(node, JNODE_CREATED);
47809+ jnode_set_block(node, &block);
47810+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47811+ BUG_ON(result != 0);
47812+ jnode_make_dirty_locked(node);
47813+ spin_unlock_jnode(node);
47814+ }
47815+
47816+ /*
47817+ * invalidate coordinate, research must be performed to continue
47818+ * because write will continue on twig level
47819+ */
47820+ uf_coord->valid = 0;
47821+ return count;
47822+}
47823+
47824+/**
47825+ * plug_hole - replace one block of a hole extent with an unallocated extent
47826+ * @uf_coord: coord and coord extension set within the hole extent
47827+ * @key: key of the block an unallocated extent is created for
47828+ * @how: out parameter recording which of the conversion cases
47829+ *	described below was taken
47830+ *
47831+ * Creates an unallocated extent of width 1 within a hole. In the worst case
47832+ * two additional extents can be created.
47833+ */
47834+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47835+{
47836+ struct replace_handle rh;
47837+ reiser4_extent *ext;
47838+ reiser4_block_nr width, pos_in_unit;
47839+ coord_t *coord;
47840+ extent_coord_extension_t *ext_coord;
47841+ int return_inserted_position;
47842+
47843+ check_uf_coord(uf_coord, key);
47844+
47845+ rh.coord = coord_by_uf_coord(uf_coord);
47846+ rh.lh = uf_coord->lh;
47847+ rh.flags = 0;
47848+
47849+ coord = coord_by_uf_coord(uf_coord);
47850+ ext_coord = ext_coord_by_uf_coord(uf_coord);
47851+ ext = ext_by_ext_coord(uf_coord);
47852+
47853+ width = ext_coord->width;
47854+ pos_in_unit = ext_coord->pos_in_unit;
47855+
47856+ *how = 0;
47857+ if (width == 1) {
47858+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47859+ znode_make_dirty(coord->node);
47860+ /* update uf_coord */
47861+ ON_DEBUG(ext_coord->extent = *ext);
47862+ *how = 1;
47863+ return 0;
47864+ } else if (pos_in_unit == 0) {
47865+ /* we deal with first element of extent */
47866+ if (coord->unit_pos) {
47867+ /* there is an extent to the left */
47868+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
47869+ /*
47870+ * left neighboring unit is an unallocated
47871+ * extent. Increase its width and decrease
47872+ * width of hole
47873+ */
47874+ extent_set_width(ext - 1,
47875+ extent_get_width(ext - 1) + 1);
47876+ extent_set_width(ext, width - 1);
47877+ znode_make_dirty(coord->node);
47878+
47879+ /* update coord extension */
47880+ coord->unit_pos--;
47881+ ext_coord->width = extent_get_width(ext - 1);
47882+ ext_coord->pos_in_unit = ext_coord->width - 1;
47883+ ext_coord->ext_offset -= sizeof(reiser4_extent);
47884+ ON_DEBUG(ext_coord->extent =
47885+ *extent_by_coord(coord));
47886+ *how = 2;
47887+ return 0;
47888+ }
47889+ }
47890+ /* extent for replace */
47891+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
47892+ /* extent to be inserted */
47893+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
47894+ width - 1);
47895+ rh.nr_new_extents = 1;
47896+
47897+ /* have reiser4_replace_extent return with @coord and
47898+ @uf_coord->lh set to the unit which was replaced */
47899+ return_inserted_position = 0;
47900+ *how = 3;
47901+ } else if (pos_in_unit == width - 1) {
47902+ /* we deal with last element of extent */
47903+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
47904+ /* there is an extent unit to the right */
47905+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
47906+ /*
47907+ * right neighboring unit is an unallocated
47908+ * extent. Increase its width and decrease
47909+ * width of hole
47910+ */
47911+ extent_set_width(ext + 1,
47912+ extent_get_width(ext + 1) + 1);
47913+ extent_set_width(ext, width - 1);
47914+ znode_make_dirty(coord->node);
47915+
47916+ /* update coord extension */
47917+ coord->unit_pos++;
47918+ ext_coord->width = extent_get_width(ext + 1);
47919+ ext_coord->pos_in_unit = 0;
47920+ ext_coord->ext_offset += sizeof(reiser4_extent);
47921+ ON_DEBUG(ext_coord->extent =
47922+ *extent_by_coord(coord));
47923+ *how = 4;
47924+ return 0;
47925+ }
47926+ }
47927+ /* extent for replace */
47928+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
47929+ /* extent to be inserted */
47930+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47931+ 1);
47932+ rh.nr_new_extents = 1;
47933+
47934+ /* have reiser4_replace_extent return with @coord and
47935+ @uf_coord->lh set to the unit which was inserted */
47936+ return_inserted_position = 1;
47937+ *how = 5;
47938+ } else {
47939+ /* extent for replace */
47940+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
47941+ pos_in_unit);
47942+ /* extents to be inserted */
47943+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47944+ 1);
47945+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
47946+ width - pos_in_unit - 1);
47947+ rh.nr_new_extents = 2;
47948+
47949+ /* have reiser4_replace_extent return with @coord and
47950+ @uf_coord->lh set to the first of the units which were inserted */
47951+ return_inserted_position = 1;
47952+ *how = 6;
47953+ }
47954+ unit_key_by_coord(coord, &rh.paste_key);
47955+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
47956+ extent_get_width(&rh.overwrite) * current_blocksize);
47957+
47958+ uf_coord->valid = 0;
47959+ return reiser4_replace_extent(&rh, return_inserted_position);
47960+}
47961+
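+/*
+ * A sketch of how plug_hole() converts a hole of width 4 (H = remaining
+ * hole block, U = the new unallocated block); *how values refer to the
+ * cases above:
+ *	pos_in_unit == 0:         [HHHH] -> [U][HHH]    (*how == 3)
+ *	pos_in_unit == width - 1: [HHHH] -> [HHH][U]    (*how == 5)
+ *	0 < pos_in_unit < 3:      [HHHH] -> [H][U][HH]  (*how == 6)
+ * Cases 1, 2 and 4 convert in place or widen a neighboring unallocated
+ * unit instead of splitting the hole.
+ */
+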
47962+/**
47963+ * overwrite_one_block - map one jnode of the file to a disk block
47964+ * @uf_coord: coord and coord extension of the extent unit
47965+ * @key: key of the block
47966+ * @node: jnode of the page being written
47967+ * @hole_plugged: out parameter, set to 1 when a hole block is plugged
47968+ *
47969+ * For a hole extent, plug the hole and assign a fake block number to @node;
47970+ * for an allocated extent, assign the extent's block number to @node.
47971+ */
47972+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
47973+ jnode *node, int *hole_plugged)
47974+{
47975+ int result;
47976+ extent_coord_extension_t *ext_coord;
47977+ reiser4_extent *ext;
47978+ reiser4_block_nr block;
47979+ int how;
47980+
47981+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
47982+
47983+ result = 0;
47984+ ext_coord = ext_coord_by_uf_coord(uf_coord);
47985+ ext = ext_by_ext_coord(uf_coord);
47986+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
47987+
47988+ switch (state_of_extent(ext)) {
47989+ case ALLOCATED_EXTENT:
47990+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
47991+ break;
47992+
47993+ case HOLE_EXTENT:
47994+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
47995+ BUG_ON(result != 0);
47996+ result = plug_hole(uf_coord, key, &how);
47997+ if (result)
47998+ return result;
47999+ block = fake_blocknr_unformatted(1);
48000+ if (hole_plugged)
48001+ *hole_plugged = 1;
48002+ JF_SET(node, JNODE_CREATED);
48003+ break;
48004+
48005+ default:
48006+ return RETERR(-EIO);
48007+ }
48008+
48009+ jnode_set_block(node, &block);
48010+ return 0;
48011+}
48012+
48013+/**
48014+ * move_coord - move coordinate forward
48015+ * @uf_coord: coord and coord extension to move
48016+ *
48017+ * Move coordinate one data block pointer forward. Return 1 if the coord was
48018+ * already at the last block pointer of the item or is invalid, 0 otherwise.
48019+ */
48020+static int move_coord(uf_coord_t *uf_coord)
48021+{
48022+ extent_coord_extension_t *ext_coord;
48023+
48024+ if (uf_coord->valid == 0)
48025+ return 1;
48026+ ext_coord = &uf_coord->extension.extent;
48027+ ext_coord->pos_in_unit ++;
48028+ if (ext_coord->pos_in_unit < ext_coord->width)
48029+ /* coordinate moved within the unit */
48030+ return 0;
48031+
48032+ /* end of unit is reached. Try to move to next unit */
48033+ ext_coord->pos_in_unit = 0;
48034+ uf_coord->coord.unit_pos ++;
48035+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48036+ /* coordinate moved to next unit */
48037+ ext_coord->ext_offset += sizeof(reiser4_extent);
48038+ ext_coord->width =
48039+ extent_get_width(ext_by_offset
48040+ (uf_coord->coord.node,
48041+ ext_coord->ext_offset));
48042+ ON_DEBUG(ext_coord->extent =
48043+ *ext_by_offset(uf_coord->coord.node,
48044+ ext_coord->ext_offset));
48045+ return 0;
48046+ }
48047+ /* end of item is reached */
48048+ uf_coord->valid = 0;
48049+ return 1;
48050+}
48051+
48052+/**
48053+ * overwrite_extent - map @count jnodes starting at @key to disk blocks
48054+ * @uf_coord: coord and coord extension to start at
48055+ * @plugged_hole: out flag, set when a hole block is plugged
48056+ * Returns number of handled jnodes from @jnodes, or error code.
48057+ */
48058+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48059+ jnode **jnodes, int count, int *plugged_hole)
48060+{
48061+ int result;
48062+ reiser4_key k;
48063+ int i;
48064+ jnode *node;
48065+
48066+ k = *key;
48067+ for (i = 0; i < count; i ++) {
48068+ node = jnodes[i];
48069+ if (*jnode_get_block(node) == 0) {
48070+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48071+ if (result)
48072+ return result;
48073+ }
48074+ /*
48075+ * make sure that we hold long term locked twig node containing
48076+ * all jnodes we are about to capture
48077+ */
48078+ check_jnodes(uf_coord->lh->node, &k, 1);
48079+ /*
48080+ * assign fake block numbers to all jnodes, capture and mark
48081+ * them dirty
48082+ */
48083+ spin_lock_jnode(node);
48084+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48085+ BUG_ON(result != 0);
48086+ jnode_make_dirty_locked(node);
48087+ spin_unlock_jnode(node);
48088+
48089+ if (uf_coord->valid == 0)
48090+ return i + 1;
48091+
48092+ check_uf_coord(uf_coord, &k);
48093+
48094+ if (move_coord(uf_coord)) {
48095+ /*
48096+ * failed to move to the next node pointer. Either end
48097+ * of file or end of twig node is reached. In the latter
48098+ * case we might go to the right neighbor.
48099+ */
48100+ uf_coord->valid = 0;
48101+ return i + 1;
48102+ }
48103+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48104+ }
48105+
48106+ return count;
48107+}
48108+
48109+/**
48110+ * reiser4_update_extent
48111+ * @inode: inode of the file
48112+ * @node: jnode of the page at offset @pos
48113+ * @pos: offset in the file
48114+ * @plugged_hole: out flag, set when a hole block is plugged
48115+ *
48116+ */
48117+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
48118+ int *plugged_hole)
48119+{
48120+ int result;
48121+ znode *loaded;
48122+ uf_coord_t uf_coord;
48123+ coord_t *coord;
48124+ lock_handle lh;
48125+ reiser4_key key;
48126+
48127+ assert("", reiser4_lock_counters()->d_refs == 0);
48128+
48129+ key_by_inode_and_offset_common(inode, pos, &key);
48130+
48131+ init_uf_coord(&uf_coord, &lh);
48132+ coord = &uf_coord.coord;
48133+ result = find_file_item_nohint(coord, &lh, &key,
48134+ ZNODE_WRITE_LOCK, inode);
48135+ if (IS_CBKERR(result)) {
48136+ assert("", reiser4_lock_counters()->d_refs == 0);
48137+ return result;
48138+ }
48139+
48140+ result = zload(coord->node);
48141+ BUG_ON(result != 0);
48142+ loaded = coord->node;
48143+
48144+ if (coord->between == AFTER_UNIT) {
48145+ /*
48146+ * append existing extent item with unallocated extent of width
48147+ * nr_jnodes
48148+ */
48149+ init_coord_extension_extent(&uf_coord,
48150+ get_key_offset(&key));
48151+ result = append_last_extent(&uf_coord, &key,
48152+ &node, 1);
48153+ } else if (coord->between == AT_UNIT) {
48154+ /*
48155+ * overwrite
48156+ * not optimal yet. Will be optimized if the new write path
48157+ * shows a performance win.
48158+ */
48159+ init_coord_extension_extent(&uf_coord,
48160+ get_key_offset(&key));
48161+ result = overwrite_extent(&uf_coord, &key,
48162+ &node, 1, plugged_hole);
48163+ } else {
48164+ /*
48165+ * there are no items of this file in the tree yet. Create
48166+ * first item of the file inserting one unallocated extent of
48167+ * width nr_jnodes
48168+ */
48169+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48170+ }
48171+ assert("", result == 1 || result < 0);
48172+ zrelse(loaded);
48173+ done_lh(&lh);
48174+ assert("", reiser4_lock_counters()->d_refs == 0);
48175+ return (result == 1) ? 0 : result;
48176+}
48177+
48178+/**
48179+ * update_extents
48180+ * @file: file being written
48181+ * @jnodes: array of jnodes corresponding to the written pages
48182+ * @count: number of jnodes in the array
48183+ * @pos: offset the write starts at
48184+ *
48185+ */
48186+static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48187+{
48188+ struct inode *inode;
48189+ struct hint hint;
48190+ reiser4_key key;
48191+ int result;
48192+ znode *loaded;
48193+
48194+ result = load_file_hint(file, &hint);
48195+ BUG_ON(result != 0);
48196+
48197+ inode = file->f_dentry->d_inode;
48198+ if (count != 0)
48199+ /*
48200+ * count == 0 is a special case: expanding truncate
48201+ */
48202+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
48203+ key_by_inode_and_offset_common(inode, pos, &key);
48204+
48205+ assert("", reiser4_lock_counters()->d_refs == 0);
48206+
48207+ do {
48208+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
48209+ if (IS_CBKERR(result)) {
48210+ assert("", reiser4_lock_counters()->d_refs == 0);
48211+ return result;
48212+ }
48213+
48214+ result = zload(hint.ext_coord.coord.node);
48215+ BUG_ON(result != 0);
48216+ loaded = hint.ext_coord.coord.node;
48217+
48218+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
48219+ /*
48220+ * append existing extent item with unallocated extent
48221+ * of width nr_jnodes
48222+ */
48223+ if (hint.ext_coord.valid == 0)
48224+ /* NOTE: get statistics on this */
48225+ init_coord_extension_extent(&hint.ext_coord,
48226+ get_key_offset(&key));
48227+ result = append_last_extent(&hint.ext_coord, &key,
48228+ jnodes, count);
48229+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
48230+ /*
48231+ * overwrite
48232+ * not optimal yet. Will be optimized if the new write
48233+ * path shows a performance win.
48234+ */
48235+ if (hint.ext_coord.valid == 0)
48236+ /* NOTE: get statistics on this */
48237+ init_coord_extension_extent(&hint.ext_coord,
48238+ get_key_offset(&key));
48239+ result = overwrite_extent(&hint.ext_coord, &key,
48240+ jnodes, count, NULL);
48241+ } else {
48242+ /*
48243+ * there are no items of this file in the tree
48244+ * yet. Create first item of the file inserting one
48245+ * unallocated extent of width nr_jnodes
48246+ */
48247+ result = insert_first_extent(&hint.ext_coord, &key,
48248+ jnodes, count, inode);
48249+ }
48250+ zrelse(loaded);
48251+ if (result < 0) {
48252+ done_lh(hint.ext_coord.lh);
48253+ break;
48254+ }
48255+
48256+ jnodes += result;
48257+ count -= result;
48258+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48259+
48260+ /* seal and unlock znode */
48261+ if (hint.ext_coord.valid)
48262+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48263+ else
48264+ reiser4_unset_hint(&hint);
48265+
48266+ } while (count > 0);
48267+
48268+ save_file_hint(file, &hint);
48269+ assert("", reiser4_lock_counters()->d_refs == 0);
48270+ return result;
48271+}
48272+
48273+/**
48274+ * write_extent_reserve_space - reserve space for extent write operation
48275+ * @inode: inode of the file to write to
48276+ *
48277+ * Estimates and reserves space which may be required for writing
48278+ * WRITE_GRANULARITY pages of file.
48279+ */
48280+static int write_extent_reserve_space(struct inode *inode)
48281+{
48282+ __u64 count;
48283+ reiser4_tree *tree;
48284+
48285+ /*
48286+ * to write WRITE_GRANULARITY pages to a file by extents we have to
48287+ * reserve disk space for:
48288+ *
48289+ * 1. find_file_item may have to insert empty node to the tree (empty
48290+ * leaf node between two extent items). This requires 1 block and
48291+ * number of blocks which are necessary to perform insertion of an
48292+ * internal item into twig level.
48293+ *
48294+ * 2. for each of written pages there might be needed 1 block and
48295+ * number of blocks which might be necessary to perform insertion of or
48296+ * paste to an extent item.
48297+ *
48298+ * 3. stat data update
48299+ */
48300+ tree = reiser4_tree_by_inode(inode);
48301+ count = estimate_one_insert_item(tree) +
48302+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48303+ estimate_one_insert_item(tree);
48304+ grab_space_enable();
48305+ return reiser4_grab_space(count, 0 /* flags */);
48306+}
48307+
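+/*
+ * Spelled out, the reservation above is
+ *	count = I + W * (1 + P) + I,
+ * where I = estimate_one_insert_item(tree) covers the possible empty
+ * leaf insertion (item 1 of the list) and the stat data update (item 3),
+ * W = WRITE_GRANULARITY, and P = estimate_one_insert_into_item(tree),
+ * so each written page gets one block plus a possible paste (item 2).
+ */
+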
48308+/**
48309+ * reiser4_write_extent - write method of extent item plugin
48310+ * @file: file to write to
48311+ * @buf: address of user-space buffer
48312+ * @count: number of bytes to write
48313+ * @pos: position in file to write to
48314+ *
48315+ */
48316+ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
48317+ size_t count, loff_t *pos)
48318+{
48319+ int have_to_update_extent;
48320+ int nr_pages, nr_dirty;
48321+ struct page *page;
48322+ jnode *jnodes[WRITE_GRANULARITY + 1];
48323+ struct inode *inode;
48324+ unsigned long index;
48325+ unsigned long end;
48326+ int i;
48327+ int to_page, page_off;
48328+ size_t left, written;
48329+ int result = 0;
48330+
48331+ inode = file->f_dentry->d_inode;
48332+ if (write_extent_reserve_space(inode))
48333+ return RETERR(-ENOSPC);
48334+
48335+ if (count == 0) {
48336+ /* truncate case */
48337+ update_extents(file, jnodes, 0, *pos);
48338+ return 0;
48339+ }
48340+
48341+ BUG_ON(get_current_context()->trans->atom != NULL);
48342+
48343+ left = count;
48344+ index = *pos >> PAGE_CACHE_SHIFT;
48345+ /* calculate number of pages which are to be written */
48346+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48347+ nr_pages = end - index + 1;
48348+ nr_dirty = 0;
48349+ assert("", nr_pages <= WRITE_GRANULARITY + 1);
48350+
48351+ /* get pages and jnodes */
48352+ for (i = 0; i < nr_pages; i ++) {
48353+ page = find_or_create_page(inode->i_mapping, index + i,
48354+ reiser4_ctx_gfp_mask_get());
48355+ if (page == NULL) {
48356+ nr_pages = i;
48357+ result = RETERR(-ENOMEM);
48358+ goto out;
48359+ }
48360+
48361+ jnodes[i] = jnode_of_page(page);
48362+ if (IS_ERR(jnodes[i])) {
48363+ unlock_page(page);
48364+ page_cache_release(page);
48365+ nr_pages = i;
48366+ result = RETERR(-ENOMEM);
48367+ goto out;
48368+ }
48369+ /* prevent jnode and page from disconnecting */
48370+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48371+ unlock_page(page);
48372+ }
48373+
48374+ BUG_ON(get_current_context()->trans->atom != NULL);
48375+
48376+ have_to_update_extent = 0;
48377+
48378+ page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48379+ for (i = 0; i < nr_pages; i ++) {
48380+ to_page = PAGE_CACHE_SIZE - page_off;
48381+ if (to_page > left)
48382+ to_page = left;
48383+ page = jnode_page(jnodes[i]);
48384+ if (page_offset(page) < inode->i_size &&
48385+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48386+ /*
48387+ * the above is not optimal for a partial write to the
48388+ * last page of the file when the file size is not at a
48389+ * page boundary
48390+ */
48391+ lock_page(page);
48392+ if (!PageUptodate(page)) {
48393+ result = readpage_unix_file(NULL, page);
48394+ BUG_ON(result != 0);
48395+ /* wait for read completion */
48396+ lock_page(page);
48397+ BUG_ON(!PageUptodate(page));
48398+ } else
48399+ result = 0;
48400+ unlock_page(page);
48401+ }
48402+
48403+ BUG_ON(get_current_context()->trans->atom != NULL);
48404+ fault_in_pages_readable(buf, to_page);
48405+ BUG_ON(get_current_context()->trans->atom != NULL);
48406+
48407+ lock_page(page);
48408+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48409+ void *kaddr;
48410+
48411+ kaddr = kmap_atomic(page, KM_USER0);
48412+ memset(kaddr, 0, page_off);
48413+ memset(kaddr + page_off + to_page, 0,
48414+ PAGE_CACHE_SIZE - (page_off + to_page));
48415+ flush_dcache_page(page);
48416+ kunmap_atomic(kaddr, KM_USER0);
48417+ }
48418+
48419+ written = filemap_copy_from_user(page, page_off, buf, to_page);
48420+ if (unlikely(written != to_page)) {
48421+ unlock_page(page);
48422+ result = RETERR(-EFAULT);
48423+ break;
48424+ }
48425+
48426+ flush_dcache_page(page);
48427+ reiser4_set_page_dirty_internal(page);
48428+ unlock_page(page);
48429+ nr_dirty++;
48430+
48431+ mark_page_accessed(page);
48432+ SetPageUptodate(page);
48433+
48434+ if (jnodes[i]->blocknr == 0)
48435+ have_to_update_extent ++;
48436+
48437+ page_off = 0;
48438+ buf += to_page;
48439+ left -= to_page;
48440+ BUG_ON(get_current_context()->trans->atom != NULL);
48441+ }
48442+
48443+ if (have_to_update_extent) {
48444+ update_extents(file, jnodes, nr_dirty, *pos);
48445+ } else {
48446+ for (i = 0; i < nr_dirty; i ++) {
48447+ int ret;
48448+ spin_lock_jnode(jnodes[i]);
48449+ ret = reiser4_try_capture(jnodes[i],
48450+ ZNODE_WRITE_LOCK, 0);
48451+ BUG_ON(ret != 0);
48452+ jnode_make_dirty_locked(jnodes[i]);
48453+ spin_unlock_jnode(jnodes[i]);
48454+ }
48455+ }
48456+out:
48457+ for (i = 0; i < nr_pages; i ++) {
48458+ page_cache_release(jnode_page(jnodes[i]));
48459+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48460+ jput(jnodes[i]);
48461+ }
48462+
48463+ /* the only errors handled so far are ENOMEM and
48464+ EFAULT on copy_from_user */
48465+
48466+ return (count - left) ? (count - left) : result;
48467+}
48468+
48469+static inline void zero_page(struct page *page)
48470+{
48471+ char *kaddr = kmap_atomic(page, KM_USER0);
48472+
48473+ memset(kaddr, 0, PAGE_CACHE_SIZE);
48474+ flush_dcache_page(page);
48475+ kunmap_atomic(kaddr, KM_USER0);
48476+ SetPageUptodate(page);
48477+ unlock_page(page);
48478+}
48479+
48480+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48481+ struct page *page)
48482+{
48483+ jnode *j;
48484+ struct address_space *mapping;
48485+ unsigned long index;
48486+ oid_t oid;
48487+ reiser4_block_nr block;
48488+
48489+ mapping = page->mapping;
48490+ oid = get_inode_oid(mapping->host);
48491+ index = page->index;
48492+
48493+ switch (state_of_extent(ext)) {
48494+ case HOLE_EXTENT:
48495+ /*
48496+ * it is possible to have a hole page with a jnode, if the
48497+ * page was eflushed (emergency flushed) previously.
48498+ */
48499+ j = jfind(mapping, index);
48500+ if (j == NULL) {
48501+ zero_page(page);
48502+ return 0;
48503+ }
48504+ spin_lock_jnode(j);
48505+ if (!jnode_page(j)) {
48506+ jnode_attach_page(j, page);
48507+ } else {
48508+ BUG_ON(jnode_page(j) != page);
48509+ assert("vs-1504", jnode_page(j) == page);
48510+ }
48511+ block = *jnode_get_io_block(j);
48512+ spin_unlock_jnode(j);
48513+ if (block == 0) {
48514+ zero_page(page);
48515+ jput(j);
48516+ return 0;
48517+ }
48518+ break;
48519+
48520+ case ALLOCATED_EXTENT:
48521+ j = jnode_of_page(page);
48522+ if (IS_ERR(j))
48523+ return PTR_ERR(j);
48524+ if (*jnode_get_block(j) == 0) {
48525+ reiser4_block_nr blocknr;
48526+
48527+ blocknr = extent_get_start(ext) + pos;
48528+ jnode_set_block(j, &blocknr);
48529+ } else
48530+ assert("vs-1403",
48531+ j->blocknr == extent_get_start(ext) + pos);
48532+ break;
48533+
48534+ case UNALLOCATED_EXTENT:
48535+ j = jfind(mapping, index);
48536+ assert("nikita-2688", j);
48537+ assert("vs-1426", jnode_page(j) == NULL);
48538+
48539+ spin_lock_jnode(j);
48540+ jnode_attach_page(j, page);
48541+ spin_unlock_jnode(j);
48542+ break;
48543+
48544+ default:
48545+ warning("vs-957", "wrong extent\n");
48546+ return RETERR(-EIO);
48547+ }
48548+
48549+ BUG_ON(j == 0);
48550+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
48551+ jput(j);
48552+ return 0;
48553+}
48554+
48555+/* Implements plugin->u.item.s.file.read operation for extent items. */
48556+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
48557+{
48558+ int result;
48559+ struct page *page;
48560+ unsigned long cur_page, next_page;
48561+ unsigned long page_off, count;
48562+ struct address_space *mapping;
48563+ loff_t file_off;
48564+ uf_coord_t *uf_coord;
48565+ coord_t *coord;
48566+ extent_coord_extension_t *ext_coord;
48567+ unsigned long nr_pages;
48568+ char *kaddr;
48569+
48570+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
48571+ assert("vs-572", flow->user == 1);
48572+ assert("vs-1351", flow->length > 0);
48573+
48574+ uf_coord = &hint->ext_coord;
48575+
48576+ check_uf_coord(uf_coord, NULL);
48577+ assert("vs-33", uf_coord->lh == &hint->lh);
48578+
48579+ coord = &uf_coord->coord;
48580+ assert("vs-1119", znode_is_rlocked(coord->node));
48581+ assert("vs-1120", znode_is_loaded(coord->node));
48582+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
48583+
48584+ mapping = file->f_dentry->d_inode->i_mapping;
48585+ ext_coord = &uf_coord->extension.extent;
48586+
48587+ /* offset in a file to start read from */
48588+ file_off = get_key_offset(&flow->key);
48589+ /* offset within the page to start read from */
48590+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
48591+ /* bytes which can be read from the page which contains file_off */
48592+ count = PAGE_CACHE_SIZE - page_off;
48593+
48594+ /* index of the page containing the offset the read is to start from */
48595+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
48596+ next_page = cur_page;
48597+ /* number of pages flow spans over */
48598+ nr_pages =
48599+ ((file_off + flow->length + PAGE_CACHE_SIZE -
48600+ 1) >> PAGE_CACHE_SHIFT) - cur_page;
48601+
48602+ /* we start with the twig node read locked. However, we do not want
48603+ to keep that lock while readahead works. So, set a seal and
48604+ release the twig node. */
48605+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
48606+ /* &hint->lh is done-ed */
48607+
48608+ do {
48609+ reiser4_txn_restart_current();
48610+ page = read_mapping_page(mapping, cur_page, file);
48611+ if (IS_ERR(page))
48612+ return PTR_ERR(page);
48613+ lock_page(page);
48614+ if (!PageUptodate(page)) {
48615+ unlock_page(page);
48616+ page_cache_release(page);
48617+ warning("jmacd-97178", "extent_read: page is not up to date");
48618+ return RETERR(-EIO);
48619+ }
48620+ mark_page_accessed(page);
48621+ unlock_page(page);
48622+
48623+ /* If users can be writing to this page using arbitrary virtual
48624+ addresses, take care about potential aliasing before reading
48625+ the page on the kernel side.
48626+ */
48627+ if (mapping_writably_mapped(mapping))
48628+ flush_dcache_page(page);
48629+
48630+ assert("nikita-3034", reiser4_schedulable());
48631+
48632+ /* number of bytes which are to be read from the page */
48633+ if (count > flow->length)
48634+ count = flow->length;
48635+
48636+ result = fault_in_pages_writeable(flow->data, count);
48637+ if (result) {
48638+ page_cache_release(page);
48639+ return RETERR(-EFAULT);
48640+ }
48641+
48642+ kaddr = kmap_atomic(page, KM_USER0);
48643+ result = __copy_to_user_inatomic(flow->data,
48644+ kaddr + page_off, count);
48645+ kunmap_atomic(kaddr, KM_USER0);
48646+ if (result != 0) {
48647+ kaddr = kmap(page);
48648+ result = __copy_to_user(flow->data, kaddr + page_off, count);
48649+ kunmap(page);
48650+ if (unlikely(result))
48651+ return RETERR(-EFAULT);
48652+ }
48653+
48654+ page_cache_release(page);
48655+
48656+ /* increase key (flow->key), update user area pointer (flow->data) */
48657+ move_flow_forward(flow, count);
48658+
48659+ page_off = 0;
48660+ cur_page ++;
48661+ count = PAGE_CACHE_SIZE;
48662+ nr_pages--;
48663+ } while (flow->length);
48664+
48665+ return 0;
48666+}
48667+
48668+/*
48669+ plugin->s.file.readpage
48670+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
48671+ or
48672+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
48673+
48674+ At the beginning: coord->node is read locked, zloaded, the page is
48675+ locked, and coord is set to an existing unit inside of the extent item
48676+ (coord does not necessarily match page->index) */
48677+int reiser4_readpage_extent(void *vp, struct page *page)
48678+{
48679+ uf_coord_t *uf_coord = vp;
48680+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
48681+ ON_DEBUG(reiser4_key key);
48682+
48683+ assert("vs-1040", PageLocked(page));
48684+ assert("vs-1050", !PageUptodate(page));
48685+ assert("vs-1039", page->mapping && page->mapping->host);
48686+
48687+ assert("vs-1044", znode_is_loaded(coord->node));
48688+ assert("vs-758", item_is_extent(coord));
48689+ assert("vs-1046", coord_is_existing_unit(coord));
48690+ assert("vs-1045", znode_is_rlocked(coord->node));
48691+ assert("vs-1047",
48692+ page->mapping->host->i_ino ==
48693+ get_key_objectid(item_key_by_coord(coord, &key)));
48694+ check_uf_coord(uf_coord, NULL);
48695+
48696+ return reiser4_do_readpage_extent(
48697+ ext_by_ext_coord(uf_coord),
48698+ uf_coord->extension.extent.pos_in_unit, page);
48699+}
48700+
48701+/**
48702+ * get_block_address_extent
48703+ * @coord: coord of an existing extent unit
48704+ * @block: logical block number of the file (file offset in blocks)
48705+ * @result: resulting physical block number; 0 for a hole or an
48706+ *	unallocated extent
48707+ *
48708+ */
48709+int get_block_address_extent(const coord_t *coord, sector_t block,
48710+ sector_t *result)
48711+{
48712+ reiser4_extent *ext;
48713+
48714+ if (!coord_is_existing_unit(coord))
48715+ return RETERR(-EINVAL);
48716+
48717+ ext = extent_by_coord(coord);
48718+
48719+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
48720+ /* FIXME: bad things may happen if it is unallocated extent */
48721+ *result = 0;
48722+ else {
48723+ reiser4_key key;
48724+
48725+ unit_key_by_coord(coord, &key);
48726+ assert("vs-1645",
48727+ block >= get_key_offset(&key) >> current_blocksize_bits);
48728+ assert("vs-1646",
48729+ block <
48730+ (get_key_offset(&key) >> current_blocksize_bits) +
48731+ extent_get_width(ext));
48732+ *result =
48733+ extent_get_start(ext) + (block -
48734+ (get_key_offset(&key) >>
48735+ current_blocksize_bits));
48736+ }
48737+ return 0;
48738+}
48739+
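+/*
+ * A worked example of the mapping above, assuming a 4096-byte block
+ * size: for an allocated unit whose key offset is 40960 (logical block
+ * 10) and whose start is 1000, logical block 13 is mapped to
+ *	1000 + (13 - 10) == 1003.
+ */
+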
48740+/*
48741+ plugin->u.item.s.file.append_key
48742+ key of the first byte which is next after the last byte addressed by this extent
48743+*/
48744+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
48745+{
48746+ item_key_by_coord(coord, key);
48747+ set_key_offset(key,
48748+ get_key_offset(key) + reiser4_extent_size(coord,
48749+ nr_units_extent
48750+ (coord)));
48751+
48752+ assert("vs-610", get_key_offset(key)
48753+ && (get_key_offset(key) & (current_blocksize - 1)) == 0);
48754+ return key;
48755+}
48756+
48757+/* plugin->u.item.s.file.init_coord_extension */
48758+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
48759+{
48760+ coord_t *coord;
48761+ extent_coord_extension_t *ext_coord;
48762+ reiser4_key key;
48763+ loff_t offset;
48764+
48765+ assert("vs-1295", uf_coord->valid == 0);
48766+
48767+ coord = &uf_coord->coord;
48768+ assert("vs-1288", coord_is_iplug_set(coord));
48769+ assert("vs-1327", znode_is_loaded(coord->node));
48770+
48771+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
48772+ return;
48773+
48774+ ext_coord = &uf_coord->extension.extent;
48775+ ext_coord->nr_units = nr_units_extent(coord);
48776+ ext_coord->ext_offset =
48777+ (char *)extent_by_coord(coord) - zdata(coord->node);
48778+ ext_coord->width = extent_get_width(extent_by_coord(coord));
48779+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
48780+ uf_coord->valid = 1;
48781+
48782+ /* pos_in_unit is the only uninitialized field in extended coord */
48783+ if (coord->between == AFTER_UNIT) {
48784+ assert("vs-1330",
48785+ coord->unit_pos == nr_units_extent(coord) - 1);
48786+
48787+ ext_coord->pos_in_unit = ext_coord->width - 1;
48788+ } else {
48789+ /* AT_UNIT */
48790+ unit_key_by_coord(coord, &key);
48791+ offset = get_key_offset(&key);
48792+
48793+ assert("vs-1328", offset <= lookuped);
48794+ assert("vs-1329",
48795+ lookuped <
48796+ offset + ext_coord->width * current_blocksize);
48797+ ext_coord->pos_in_unit =
48798+ ((lookuped - offset) >> current_blocksize_bits);
48799+ }
48800+}
48801+
48802+/*
48803+ * Local variables:
48804+ * c-indentation-style: "K&R"
48805+ * mode-name: "LC"
48806+ * c-basic-offset: 8
48807+ * tab-width: 8
48808+ * fill-column: 79
48809+ * scroll-step: 1
48810+ * End:
48811+ */
48812diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_flush_ops.c
48813--- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
48814+++ linux-2.6.20/fs/reiser4/plugin/item/extent_flush_ops.c 2007-05-06 14:50:43.811010720 +0400
48815@@ -0,0 +1,1028 @@
48816+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48817+
48818+#include "item.h"
48819+#include "../../tree.h"
48820+#include "../../jnode.h"
48821+#include "../../super.h"
48822+#include "../../flush.h"
48823+#include "../../carry.h"
48824+#include "../object.h"
48825+
48826+#include <linux/pagemap.h>
48827+
48828+static reiser4_block_nr extent_unit_start(const coord_t * item);
48829+
48830+/* Return either first or last extent (depending on @side) of the item
48831+ @coord is set to. Set @pos_in_unit either to first or to last block
48832+ of extent. */
48833+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
48834+ reiser4_block_nr * pos_in_unit)
48835+{
48836+ reiser4_extent *ext;
48837+
48838+ if (side == LEFT_SIDE) {
48839+ /* get first extent of item */
48840+ ext = extent_item(coord);
48841+ *pos_in_unit = 0;
48842+ } else {
48843+ /* get last extent of item and last position within it */
48844+ assert("vs-363", side == RIGHT_SIDE);
48845+ ext = extent_item(coord) + coord_last_unit_pos(coord);
48846+ *pos_in_unit = extent_get_width(ext) - 1;
48847+ }
48848+
48849+ return ext;
48850+}
48851+
48852+/* item_plugin->f.utmost_child */
48853+/* Return the child. Coord is set to extent item. Find jnode corresponding
48854+ either to first or to last unformatted node pointed by the item */
48855+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
48856+{
48857+ reiser4_extent *ext;
48858+ reiser4_block_nr pos_in_unit;
48859+
48860+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
48861+
48862+ switch (state_of_extent(ext)) {
48863+ case HOLE_EXTENT:
48864+ *childp = NULL;
48865+ return 0;
48866+ case ALLOCATED_EXTENT:
48867+ case UNALLOCATED_EXTENT:
48868+ break;
48869+ default:
48870+ /* this should never happen */
48871+ assert("vs-1417", 0);
48872+ }
48873+
48874+ {
48875+ reiser4_key key;
48876+ reiser4_tree *tree;
48877+ unsigned long index;
48878+
48879+ if (side == LEFT_SIDE) {
48880+ /* get key of first byte addressed by the extent */
48881+ item_key_by_coord(coord, &key);
48882+ } else {
48883+ /* get key of byte which next after last byte addressed by the extent */
48884+ append_key_extent(coord, &key);
48885+ }
48886+
48887+ assert("vs-544",
48888+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
48889+ /* index of first or last (depending on @side) page addressed
48890+ by the extent */
48891+ index =
48892+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
48893+ if (side == RIGHT_SIDE)
48894+ index--;
48895+
48896+ tree = coord->node->zjnode.tree;
48897+ *childp = jlookup(tree, get_key_objectid(&key), index);
48898+ }
48899+
48900+ return 0;
48901+}
48902+
48903+/* item_plugin->f.utmost_child_real_block */
48904+/* Return the child's block, if allocated. */
48905+int
48906+utmost_child_real_block_extent(const coord_t * coord, sideof side,
48907+ reiser4_block_nr * block)
48908+{
48909+ reiser4_extent *ext;
48910+
48911+ ext = extent_by_coord(coord);
48912+
48913+ switch (state_of_extent(ext)) {
48914+ case ALLOCATED_EXTENT:
48915+ *block = extent_get_start(ext);
48916+ if (side == RIGHT_SIDE)
48917+ *block += extent_get_width(ext) - 1;
48918+ break;
48919+ case HOLE_EXTENT:
48920+ case UNALLOCATED_EXTENT:
48921+ *block = 0;
48922+ break;
48923+ default:
48924+ /* this should never happen */
48925+ assert("vs-1418", 0);
48926+ }
48927+
48928+ return 0;
48929+}
48930+
48931+/* item_plugin->f.scan */
48932+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
48933+ This scan continues, advancing the parent coordinate, until either it encounters a
48934+ formatted child or it finishes scanning this node.
48935+
48936+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
48937+ not sure this last property (same atom) is enforced, but it should be the case since
48938+ one atom must write the parent and the others must read the parent, thus fusing?). In
48939+ any case, the code below asserts this case for unallocated extents. Unallocated
48940+ extents are thus optimized because we can skip to the endpoint when scanning.
48941+
48942+ Control then returns to the caller of reiser4_scan_extent, which handles these
48943+ terminating conditions, e.g., by loading the next twig.
48944+*/
48945+int reiser4_scan_extent(flush_scan * scan)
48946+{
48947+ coord_t coord;
48948+ jnode *neighbor;
48949+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
48950+ reiser4_block_nr unit_start;
48951+ __u64 oid;
48952+ reiser4_key key;
48953+ int ret = 0, allocated, incr;
48954+ reiser4_tree *tree;
48955+
48956+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
48957+ scan->stop = 1;
48958+ return 0; /* Race with truncate, this node is already
48959+ * truncated. */
48960+ }
48961+
48962+ coord_dup(&coord, &scan->parent_coord);
48963+
48964+ assert("jmacd-1404", !reiser4_scan_finished(scan));
48965+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
48966+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
48967+
48968+ /* The scan_index variable corresponds to the current page index of the
48969+ unformatted block scan position. */
48970+ scan_index = index_jnode(scan->node);
48971+
48972+ assert("jmacd-7889", item_is_extent(&coord));
48973+
48974+ repeat:
48975+ /* objectid of file */
48976+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
48977+
48978+ allocated = !extent_is_unallocated(&coord);
48979+ /* Get the values of this extent unit: */
48980+ unit_index = extent_unit_index(&coord);
48981+ unit_width = extent_unit_width(&coord);
48982+ unit_start = extent_unit_start(&coord);
48983+
48984+ assert("jmacd-7187", unit_width > 0);
48985+ assert("jmacd-7188", scan_index >= unit_index);
48986+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
48987+
48988+ /* Depending on the scan direction, we set different maximum values for scan_index
48989+ (scan_max) and the number of nodes that would be passed if the scan goes the
48990+ entire way (scan_dist). Incr is an integer reflecting the incremental
48991+ direction of scan_index. */
48992+ if (reiser4_scanning_left(scan)) {
48993+ scan_max = unit_index;
48994+ scan_dist = scan_index - unit_index;
48995+ incr = -1;
48996+ } else {
48997+ scan_max = unit_index + unit_width - 1;
48998+ scan_dist = scan_max - unit_index;
48999+ incr = +1;
49000+ }
49001+
49002+ tree = coord.node->zjnode.tree;
49003+
49004+ /* If the extent is allocated we have to check each of its blocks. If the extent
49005+ is unallocated we can skip to the scan_max. */
49006+ if (allocated) {
49007+ do {
49008+ neighbor = jlookup(tree, oid, scan_index);
49009+ if (neighbor == NULL)
49010+ goto stop_same_parent;
49011+
49012+ if (scan->node != neighbor
49013+ && !reiser4_scan_goto(scan, neighbor)) {
49014+ /* @neighbor was jput() by reiser4_scan_goto */
49015+ goto stop_same_parent;
49016+ }
49017+
49018+ ret = scan_set_current(scan, neighbor, 1, &coord);
49019+ if (ret != 0) {
49020+ goto exit;
49021+ }
49022+
49023+ /* reference to @neighbor is stored in @scan, no need
49024+ to jput(). */
49025+ scan_index += incr;
49026+
49027+ } while (incr + scan_max != scan_index);
49028+
49029+ } else {
49030+ /* Optimized case for unallocated extents, skip to the end. */
49031+ neighbor = jlookup(tree, oid, scan_max /*index */ );
49032+ if (neighbor == NULL) {
49033+ /* Race with truncate */
49034+ scan->stop = 1;
49035+ ret = 0;
49036+ goto exit;
49037+ }
49038+
49039+ assert("zam-1043",
49040+ reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
49041+
49042+ ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49043+ if (ret != 0) {
49044+ goto exit;
49045+ }
49046+ }
49047+
49048+ if (coord_sideof_unit(&coord, scan->direction) == 0
49049+ && item_is_extent(&coord)) {
49050+ /* Continue as long as there are more extent units. */
49051+
49052+ scan_index =
49053+ extent_unit_index(&coord) +
49054+ (reiser4_scanning_left(scan) ?
49055+ extent_unit_width(&coord) - 1 : 0);
49056+ goto repeat;
49057+ }
49058+
49059+ if (0) {
49060+ stop_same_parent:
49061+
49062+ /* If we are scanning left and we stop in the middle of an allocated
49063+ extent, we know the preceder immediately. */
49064+ /* middle of extent is (scan_index - unit_index) != 0. */
49065+ if (reiser4_scanning_left(scan) &&
49066+ (scan_index - unit_index) != 0) {
49067+ /* FIXME(B): Someone should step-through and verify that this preceder
49068+ calculation is indeed correct. */
49069+ /* @unit_start is starting block (number) of extent
49070+ unit. Flush stopped at the @scan_index block from
49071+ the beginning of the file, which is (scan_index -
49072+ unit_index) block within extent.
49073+ */
49074+ if (unit_start) {
49075+ /* skip preceder update when we are at hole */
49076+ scan->preceder_blk =
49077+ unit_start + scan_index - unit_index;
49078+ check_preceder(scan->preceder_blk);
49079+ }
49080+ }
49081+
49082+ /* In this case, we leave coord set to the parent of scan->node. */
49083+ scan->stop = 1;
49084+
49085+ } else {
49086+ /* In this case, we are still scanning, coord is set to the next item which is
49087+ either off-the-end of the node or not an extent. */
49088+ assert("jmacd-8912", scan->stop == 0);
49089+ assert("jmacd-7812",
49090+ (coord_is_after_sideof_unit(&coord, scan->direction)
49091+ || !item_is_extent(&coord)));
49092+ }
49093+
49094+ ret = 0;
49095+ exit:
49096+ return ret;
49097+}
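
The scan loop above is driven by three values derived from the unit
geometry. A hypothetical standalone rendering of just that bookkeeping,
mirroring the assignments in reiser4_scan_extent():

#include <stdio.h>

struct scan_bounds {
        unsigned long scan_max;         /* last index the scan may visit */
        unsigned long scan_dist;        /* nodes passed going all the way */
        int incr;                       /* step applied to scan_index */
};

static struct scan_bounds scan_setup(int scanning_left,
                                     unsigned long scan_index,
                                     unsigned long unit_index,
                                     unsigned long unit_width)
{
        struct scan_bounds b;

        if (scanning_left) {
                b.scan_max = unit_index;
                b.scan_dist = scan_index - unit_index;
                b.incr = -1;
        } else {
                b.scan_max = unit_index + unit_width - 1;
                b.scan_dist = b.scan_max - unit_index;
                b.incr = +1;
        }
        return b;
}

int main(void)
{
        /* scanning left from index 10 in a unit starting at 4: the scan
           may reach index 4, passing 6 nodes, stepping by -1 */
        struct scan_bounds b = scan_setup(1, 10, 4, 8);

        printf("max=%lu dist=%lu incr=%d\n", b.scan_max, b.scan_dist, b.incr);
        return 0;
}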
49098+
49099+/* ask block allocator for some blocks */
49100+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49101+ reiser4_block_nr wanted_count,
49102+ reiser4_block_nr *first_allocated,
49103+ reiser4_block_nr *allocated,
49104+ block_stage_t block_stage)
49105+{
49106+ *allocated = wanted_count;
49107+ preceder->max_dist = 0; /* scan whole disk, if needed */
49108+
49109+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49110+ preceder->block_stage = block_stage;
49111+
49112+ /* FIXME: we do not handle errors here now */
49113+ check_me("vs-420",
49114+ reiser4_alloc_blocks(preceder, first_allocated, allocated,
49115+ BA_PERMANENT) == 0);
49116+ /* update flush_pos's preceder to last allocated block number */
49117+ preceder->blk = *first_allocated + *allocated - 1;
49118+}
49119+
49120+/* When, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that a single
49121+ unallocated extent has to be replaced with a set of allocated extents. In that case insert_into_item is called, and
49122+ it may have to add new nodes to the tree. Space for that is taken from the inviolable reserve (5%). */
49123+static reiser4_block_nr reserve_replace(void)
49124+{
49125+ reiser4_block_nr grabbed, needed;
49126+
49127+ grabbed = get_current_context()->grabbed_blocks;
49128+ needed = estimate_one_insert_into_item(current_tree);
49129+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49130+ return grabbed;
49131+}
49132+
49133+static void free_replace_reserved(reiser4_block_nr grabbed)
49134+{
49135+ reiser4_context *ctx;
49136+
49137+ ctx = get_current_context();
49138+ grabbed2free(ctx, get_super_private(ctx->super),
49139+ ctx->grabbed_blocks - grabbed);
49140+}
49141+
49142+/* Block offset of first block addressed by unit */
49143+__u64 extent_unit_index(const coord_t * item)
49144+{
49145+ reiser4_key key;
49146+
49147+ assert("vs-648", coord_is_existing_unit(item));
49148+ unit_key_by_coord(item, &key);
49149+ return get_key_offset(&key) >> current_blocksize_bits;
49150+}
49151+
49152+/* AUDIT shouldn't return value be of reiser4_block_nr type?
49153+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
49154+__u64 extent_unit_width(const coord_t * item)
49155+{
49156+ assert("vs-649", coord_is_existing_unit(item));
49157+ return width_by_coord(item);
49158+}
49159+
49160+/* Starting block location of this unit */
49161+static reiser4_block_nr extent_unit_start(const coord_t * item)
49162+{
49163+ return extent_get_start(extent_by_coord(item));
49164+}
49165+
49166+/**
49167+ * split_allocated_extent - split allocated extent into two
49168+ * @coord: coordinate of the allocated extent unit to split
49169+ * @pos_in_unit: block position within the unit at which to split
49170+ *
49171+ * Replaces one allocated extent with two allocated extents.
49172+ */
49173+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49174+{
49175+ int result;
49176+ struct replace_handle *h;
49177+ reiser4_extent *ext;
49178+ reiser4_block_nr grabbed;
49179+
49180+ ext = extent_by_coord(coord);
49181+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49182+ assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49183+
49184+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49185+ if (h == NULL)
49186+ return RETERR(-ENOMEM);
49187+ h->coord = coord;
49188+ h->lh = znode_lh(coord->node);
49189+ h->pkey = &h->key;
49190+ unit_key_by_coord(coord, h->pkey);
49191+ set_key_offset(h->pkey,
49192+ (get_key_offset(h->pkey) +
49193+ pos_in_unit * current_blocksize));
49194+ reiser4_set_extent(&h->overwrite, extent_get_start(ext),
49195+ pos_in_unit);
49196+ reiser4_set_extent(&h->new_extents[0],
49197+ extent_get_start(ext) + pos_in_unit,
49198+ extent_get_width(ext) - pos_in_unit);
49199+ h->nr_new_extents = 1;
49200+ h->flags = COPI_DONT_SHIFT_LEFT;
49201+ h->paste_key = h->key;
49202+
49203+ /* reserve space for the extent unit paste; @grabbed holds what was grabbed before */
49204+ grabbed = reserve_replace();
49205+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49206+ extent */);
49207+ /* restore reserved */
49208+ free_replace_reserved(grabbed);
49209+ kfree(h);
49210+ return result;
49211+}
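
The split performed above is pure interval arithmetic: the first
pos_in_unit blocks stay where they are (the overwrite part) and the rest
becomes a second unit. A sketch with a simplified extent struct,
illustrative only:

#include <assert.h>
#include <stdint.h>

struct ext { uint64_t start, width; };  /* simplified reiser4_extent */

/* Split e at pos blocks: head keeps [start, start + pos),
 * tail gets [start + pos, start + width). */
static void split_extent(struct ext e, uint64_t pos,
                         struct ext *head, struct ext *tail)
{
        assert(pos > 0 && pos < e.width);
        head->start = e.start;
        head->width = pos;
        tail->start = e.start + pos;
        tail->width = e.width - pos;
}

int main(void)
{
        struct ext head, tail;

        split_extent((struct ext){ .start = 100, .width = 8 }, 3,
                     &head, &tail);
        /* expect head = [100, +3), tail = [103, +5) */
        return (head.width == 3 && tail.start == 103 && tail.width == 5)
                ? 0 : 1;
}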
49212+
49213+/* Replace extent @ext by extent @replace. Try to merge @replace with the previous extent of the item (if there is
49214+ one). Return 1 if that succeeded, 0 otherwise. */
49215+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49216+ reiser4_extent *replace)
49217+{
49218+ assert("vs-1415", extent_by_coord(coord) == ext);
49219+
49220+ if (coord->unit_pos == 0
49221+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49222+ /* previous extent either does not exist or is not an allocated extent */
49223+ return 0;
49224+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49225+ extent_get_start(replace))
49226+ return 0;
49227+
49228+ /* we can glue, widen previous unit */
49229+ extent_set_width(ext - 1,
49230+ extent_get_width(ext - 1) + extent_get_width(replace));
49231+
49232+ if (extent_get_width(ext) != extent_get_width(replace)) {
49233+ /* make current extent narrower */
49234+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
49235+ extent_set_start(ext,
49236+ extent_get_start(ext) +
49237+ extent_get_width(replace));
49238+ extent_set_width(ext,
49239+ extent_get_width(ext) -
49240+ extent_get_width(replace));
49241+ } else {
49242+ /* current extent completely glued with its left neighbor, remove it */
49243+ coord_t from, to;
49244+
49245+ coord_dup(&from, coord);
49246+ from.unit_pos = nr_units_extent(coord) - 1;
49247+ coord_dup(&to, &from);
49248+
49249+ /* cutting from an extent item can currently be done only at its beginning or at its end, so first
49250+ move the slot freed by the unit removal to the end of the item */
49251+ memmove(ext, ext + 1,
49252+ (from.unit_pos -
49253+ coord->unit_pos) * sizeof(reiser4_extent));
49254+ /* wipe part of item which is going to be cut, so that node_check will not be confused */
49255+ cut_node_content(&from, &to, NULL, NULL, NULL);
49256+ }
49257+ znode_make_dirty(coord->node);
49258+ /* move coord back */
49259+ coord->unit_pos--;
49260+ return 1;
49261+}
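
The glue test above fires only when the left neighbor is an allocated
extent ending exactly where @replace begins on disk. A standalone sketch
of the contiguity check and the widen/narrow step, assuming the current
unit is allocated (simplified types, illustrative names):

#include <stdint.h>

struct ext { uint64_t start, width; };

/* Returns 1 and widens *prev when repl continues prev on disk; *cur then
 * loses repl->width leading blocks (and would be removed outright if it
 * became empty, as in try_to_merge_with_left() above). */
static int merge_with_left(struct ext *prev, struct ext *cur,
                           const struct ext *repl)
{
        if (prev->start + prev->width != repl->start)
                return 0;               /* not contiguous, cannot glue */
        prev->width += repl->width;     /* widen the left neighbor */
        cur->start += repl->width;      /* narrow the current unit */
        cur->width -= repl->width;
        return 1;
}

int main(void)
{
        struct ext prev = { 10, 5 }, cur = { 15, 8 }, repl = { 15, 3 };

        return (merge_with_left(&prev, &cur, &repl) == 1 &&
                prev.width == 8 && cur.start == 18 && cur.width == 5)
                ? 0 : 1;
}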
49262+
49263+/**
49264+ * conv_extent - replace extent with one or two extents
49265+ * @coord: coordinate of extent to be replaced
49266+ * @replace: extent to overwrite the one @coord is set to
49267+ *
49268+ * Overwrites the extent @coord is set to and pastes one extent unit after the
49269+ * overwritten one if @replace is shorter than the initial extent.
49270+ */
49271+static int conv_extent(coord_t *coord, reiser4_extent *replace)
49272+{
49273+ int result;
49274+ struct replace_handle *h;
49275+ reiser4_extent *ext;
49276+ reiser4_block_nr start, width, new_width;
49277+ reiser4_block_nr grabbed;
49278+ extent_state state;
49279+
49280+ ext = extent_by_coord(coord);
49281+ state = state_of_extent(ext);
49282+ start = extent_get_start(ext);
49283+ width = extent_get_width(ext);
49284+ new_width = extent_get_width(replace);
49285+
49286+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
49287+ state == ALLOCATED_EXTENT));
49288+ assert("vs-1459", width >= new_width);
49289+
49290+ if (try_to_merge_with_left(coord, ext, replace)) {
49291+ /* merged @replace with left neighbor. Current unit is either
49292+ removed or narrowed */
49293+ return 0;
49294+ }
49295+
49296+ if (width == new_width) {
49297+ /* replace current extent with @replace */
49298+ *ext = *replace;
49299+ znode_make_dirty(coord->node);
49300+ return 0;
49301+ }
49302+
49303+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49304+ if (h == NULL)
49305+ return RETERR(-ENOMEM);
49306+ h->coord = coord;
49307+ h->lh = znode_lh(coord->node);
49308+ h->pkey = &h->key;
49309+ unit_key_by_coord(coord, h->pkey);
49310+ set_key_offset(h->pkey,
49311+ (get_key_offset(h->pkey) + new_width * current_blocksize));
49312+ h->overwrite = *replace;
49313+
49314+ /* replace @ext with @replace and padding extent */
49315+ reiser4_set_extent(&h->new_extents[0],
49316+ (state == ALLOCATED_EXTENT) ?
49317+ (start + new_width) :
49318+ UNALLOCATED_EXTENT_START,
49319+ width - new_width);
49320+ h->nr_new_extents = 1;
49321+ h->flags = COPI_DONT_SHIFT_LEFT;
49322+ h->paste_key = h->key;
49323+
49324+ /* reserve space for the extent unit paste; @grabbed holds what was grabbed before */
49325+ grabbed = reserve_replace();
49326+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49327+ extent */);
49328+
49329+ /* restore reserved */
49330+ free_replace_reserved(grabbed);
49331+ kfree(h);
49332+ return result;
49333+}
49334+
49335+/**
49336+ * assign_real_blocknrs
49337+ * @flush_pos: flush position
49338+ * @oid: objectid of the file the jnodes belong to
49339+ * @index: index of the first jnode in the range
49340+ * @count: number of jnodes to assign block numbers to
49341+ * @first: start of allocated block range
49342+ *
49343+ * Assigns block numbers to each of @count jnodes. The index of the first
49344+ * jnode is @index. Jnodes are looked up with jlookup.
49345+ */
49346+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
49347+ unsigned long index, reiser4_block_nr count,
49348+ reiser4_block_nr first)
49349+{
49350+ unsigned long i;
49351+ reiser4_tree *tree;
49352+ txn_atom *atom;
49353+ int nr;
49354+
49355+ atom = atom_locked_by_fq(flush_pos->fq);
49356+ assert("vs-1468", atom);
49357+ BUG_ON(atom == NULL);
49358+
49359+ nr = 0;
49360+ tree = current_tree;
49361+ for (i = 0; i < count; ++i, ++index) {
49362+ jnode *node;
49363+
49364+ node = jlookup(tree, oid, index);
49365+ assert("", node != NULL);
49366+ BUG_ON(node == NULL);
49367+
49368+ spin_lock_jnode(node);
49369+ assert("", !jnode_is_flushprepped(node));
49370+ assert("vs-1475", node->atom == atom);
49371+ assert("vs-1476", atomic_read(&node->x_count) > 0);
49372+
49373+ JF_CLR(node, JNODE_FLUSH_RESERVED);
49374+ jnode_set_block(node, &first);
49375+ unformatted_make_reloc(node, flush_pos->fq);
49376+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
49377+ FQ_LIST, 0));
49378+ spin_unlock_jnode(node);
49379+ first++;
49380+
49381+ atomic_dec(&node->x_count);
49382+ nr++;
49383+ }
49384+
49385+ spin_unlock_atom(atom);
49386+ return;
49387+}
49388+
49389+/**
49390+ * make_node_ovrwr - assign node to overwrite set
49391+ * @jnodes: overwrite set list head
49392+ * @node: jnode to be put into the overwrite set
49393+ *
49394+ * Sets the OVRWR jnode state bit and moves @node to the end of the list head
49395+ * @jnodes, which accumulates nodes before they are moved to the overwrite set
49396+ * list of the atom.
49397+ */
49398+static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
49399+{
49400+ spin_lock_jnode(node);
49401+
49402+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
49403+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
49404+
49405+ JF_SET(node, JNODE_OVRWR);
49406+ list_move_tail(&node->capture_link, jnodes);
49407+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
49408+
49409+ spin_unlock_jnode(node);
49410+}
49411+
49412+/**
49413+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
49414+ * @flush_pos: flush position
49415+ * @oid: objectid of file jnodes belong to
49416+ * @index: starting index
49417+ * @width: extent width
49418+ *
49419+ * Puts nodes of one extent (file objectid @oid, extent width @width) into the
49420+ * atom's overwrite set, starting from the one with index @index. If the end
49421+ * of the slum is detected (a node is not found or is flushprepped), stop
49422+ * iterating and set the flush position's state to POS_INVALID.
49423+ */
49424+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
49425+ unsigned long index, reiser4_block_nr width)
49426+{
49427+ unsigned long i;
49428+ reiser4_tree *tree;
49429+ jnode *node;
49430+ txn_atom *atom;
49431+ LIST_HEAD(jnodes);
49432+
49433+ tree = current_tree;
49434+
49435+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49436+ assert("vs-1478", atom);
49437+
49438+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
49439+ node = jlookup(tree, oid, index);
49440+ if (!node) {
49441+ flush_pos->state = POS_INVALID;
49442+ break;
49443+ }
49444+ if (jnode_check_flushprepped(node)) {
49445+ flush_pos->state = POS_INVALID;
49446+ atomic_dec(&node->x_count);
49447+ break;
49448+ }
49449+ if (node->atom != atom) {
49450+ flush_pos->state = POS_INVALID;
49451+ atomic_dec(&node->x_count);
49452+ break;
49453+ }
49454+ make_node_ovrwr(&jnodes, node);
49455+ atomic_dec(&node->x_count);
49456+ }
49457+
49458+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
49459+ spin_unlock_atom(atom);
49460+}
49461+
49462+/**
49463+ * allocated_extent_slum_size - measure slum of an allocated extent
49464+ * @flush_pos: flush position
49465+ * @oid: objectid of the file the jnodes belong to
49466+ * @index: index of the first jnode to check
49467+ * @count: maximal number of jnodes to check
49468+ *
49469+ * Returns the number of consecutive not-flushprepped jnodes starting at @index.
49470+ */
49471+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
49472+ unsigned long index, unsigned long count)
49473+{
49474+ unsigned long i;
49475+ reiser4_tree *tree;
49476+ txn_atom *atom;
49477+ int nr;
49478+
49479+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49480+ assert("vs-1468", atom);
49481+
49482+ nr = 0;
49483+ tree = current_tree;
49484+ for (i = 0; i < count; ++i, ++index) {
49485+ jnode *node;
49486+
49487+ node = jlookup(tree, oid, index);
49488+ if (!node)
49489+ break;
49490+
49491+ if (jnode_check_flushprepped(node)) {
49492+ atomic_dec(&node->x_count);
49493+ break;
49494+ }
49495+
49496+ if (node->atom != atom) {
49497+ /*
49498+ * this is possible on overwrite: extent_write may
49499+ * capture several unformatted nodes without capturing
49500+ * any formatted nodes.
49501+ */
49502+ atomic_dec(&node->x_count);
49503+ break;
49504+ }
49505+
49506+ assert("vs-1476", atomic_read(&node->x_count) > 1);
49507+ atomic_dec(&node->x_count);
49508+ nr++;
49509+ }
49510+
49511+ spin_unlock_atom(atom);
49512+ return nr;
49513+}
49514+
49515+/**
49516+ * reiser4_alloc_extent - allocate extent at flush time
49517+ * @flush_pos: flush position
49518+ *
49519+ * This is called by handle_pos_on_twig to process the extent unit
49520+ * flush_pos->coord is set to. It prepares a sequence of not-flushprepped
49521+ * nodes (slum) for flushing, assuming that the slum starts at position
49522+ * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
49523+ * set if flush_pos->leaf_relocate is set to 1, and to the overwrite set
49524+ * otherwise.
49525+ */
49526+int reiser4_alloc_extent(flush_pos_t *flush_pos)
49527+{
49528+ coord_t *coord;
49529+ reiser4_extent *ext;
49530+ reiser4_extent replace_ext;
49531+ oid_t oid;
49532+ reiser4_block_nr protected;
49533+ reiser4_block_nr start;
49534+ __u64 index;
49535+ __u64 width;
49536+ extent_state state;
49537+ int result;
49538+ reiser4_block_nr first_allocated;
49539+ __u64 allocated;
49540+ reiser4_key key;
49541+ block_stage_t block_stage;
49542+
49543+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
49544+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
49545+ && item_is_extent(&flush_pos->coord));
49546+
49547+ coord = &flush_pos->coord;
49548+
49549+ ext = extent_by_coord(coord);
49550+ state = state_of_extent(ext);
49551+ if (state == HOLE_EXTENT) {
49552+ flush_pos->state = POS_INVALID;
49553+ return 0;
49554+ }
49555+
49556+ item_key_by_coord(coord, &key);
49557+ oid = get_key_objectid(&key);
49558+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
49559+ start = extent_get_start(ext);
49560+ width = extent_get_width(ext);
49561+
49562+ assert("vs-1457", width > flush_pos->pos_in_unit);
49563+
49564+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
49565+ /* relocate */
49566+ if (flush_pos->pos_in_unit) {
49567+ /* split extent unit into two */
49568+ result =
49569+ split_allocated_extent(coord,
49570+ flush_pos->pos_in_unit);
49571+ flush_pos->pos_in_unit = 0;
49572+ return result;
49573+ }
49574+
49575+ /* limit number of nodes to allocate */
49576+ if (flush_pos->nr_to_write < width)
49577+ width = flush_pos->nr_to_write;
49578+
49579+ if (state == ALLOCATED_EXTENT) {
49580+ /*
49581+ * none of the protected nodes is flushprepped, therefore
49582+ * they are counted as flush_reserved
49583+ */
49584+ block_stage = BLOCK_FLUSH_RESERVED;
49585+ protected = allocated_extent_slum_size(flush_pos, oid,
49586+ index, width);
49587+ if (protected == 0) {
49588+ flush_pos->state = POS_INVALID;
49589+ flush_pos->pos_in_unit = 0;
49590+ return 0;
49591+ }
49592+ } else {
49593+ block_stage = BLOCK_UNALLOCATED;
49594+ protected = width;
49595+ }
49596+
49597+ /*
49598+ * look at previous unit if possible. If it is allocated, make
49599+ * preceder more precise
49600+ */
49601+ if (coord->unit_pos &&
49602+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49603+ reiser4_pos_hint(flush_pos)->blk =
49604+ extent_get_start(ext - 1) +
49605+ extent_get_width(ext - 1);
49606+
49607+ /* allocate new block numbers for protected nodes */
49608+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49609+ protected,
49610+ &first_allocated, &allocated,
49611+ block_stage);
49612+
49613+ if (state == ALLOCATED_EXTENT)
49614+ /*
49615+ * on relocating - free nodes which are going to be
49616+ * relocated
49617+ */
49618+ reiser4_dealloc_blocks(&start, &allocated,
49619+ BLOCK_ALLOCATED, BA_DEFER);
49620+
49621+ /* assign new block numbers to protected nodes */
49622+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
49623+
49624+ /* prepare extent which will replace current one */
49625+ reiser4_set_extent(&replace_ext, first_allocated, allocated);
49626+
49627+ /* adjust extent item */
49628+ result = conv_extent(coord, &replace_ext);
49629+ if (result != 0 && result != -ENOMEM) {
49630+ warning("vs-1461",
49631+ "Failed to allocate extent. Should not happen\n");
49632+ return result;
49633+ }
49634+
49635+ /*
49636+ * break flush: we prepared for flushing as many blocks as we
49637+ * were asked for
49638+ */
49639+ if (flush_pos->nr_to_write == allocated)
49640+ flush_pos->state = POS_INVALID;
49641+ } else {
49642+ /* overwrite */
49643+ mark_jnodes_overwrite(flush_pos, oid, index, width);
49644+ }
49645+ flush_pos->pos_in_unit = 0;
49646+ return 0;
49647+}
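
Stripped of the bookkeeping, the relocate-or-overwrite branch above
reduces to one predicate. As a sketch (names are illustrative):

enum extent_state_s { HOLE_S, UNALLOCATED_S, ALLOCATED_S };

/* mirrors: flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT */
static int should_relocate(int leaf_relocate, enum extent_state_s state)
{
        return leaf_relocate || state == UNALLOCATED_S;
}

int main(void)
{
        /* an unallocated extent is always relocated */
        return should_relocate(0, UNALLOCATED_S) ? 0 : 1;
}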
49648+
49649+/* return 0 if @key is glueable to the item @coord is set to, 1 otherwise */
49650+static int must_insert(const coord_t *coord, const reiser4_key *key)
49651+{
49652+ reiser4_key last;
49653+
49654+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID
49655+ && keyeq(append_key_extent(coord, &last), key))
49656+ return 0;
49657+ return 1;
49658+}
49659+
49660+/* Copy extent @copy_ext to the end of @node. This may either insert a new item after the last one, append to the
49661+ last item, or widen the last unit of the last item. */
49662+static int put_unit_to_end(znode *node, const reiser4_key *key,
49663+ reiser4_extent *copy_ext)
49664+{
49665+ int result;
49666+ coord_t coord;
49667+ cop_insert_flag flags;
49668+ reiser4_extent *last_ext;
49669+ reiser4_item_data data;
49670+
49671+ /* set coord after last unit in an item */
49672+ coord_init_last_unit(&coord, node);
49673+ coord.between = AFTER_UNIT;
49674+
49675+ flags =
49676+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
49677+ if (must_insert(&coord, key)) {
49678+ result =
49679+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
49680+ key, NULL /*lh */ , flags);
49681+
49682+ } else {
49683+ /* try to glue with last unit */
49684+ last_ext = extent_by_coord(&coord);
49685+ if (state_of_extent(last_ext) &&
49686+ extent_get_start(last_ext) + extent_get_width(last_ext) ==
49687+ extent_get_start(copy_ext)) {
49688+ /* widen last unit of node */
49689+ extent_set_width(last_ext,
49690+ extent_get_width(last_ext) +
49691+ extent_get_width(copy_ext));
49692+ znode_make_dirty(node);
49693+ return 0;
49694+ }
49695+
49696+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
49697+ result =
49698+ insert_into_item(&coord, NULL /*lh */ , key,
49699+ init_new_extent(&data, copy_ext, 1),
49700+ flags);
49701+ }
49702+
49703+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
49704+ return result;
49705+}
49706+
49707+/* @coord is set to extent unit */
49708+squeeze_result squalloc_extent(znode *left, const coord_t *coord,
49709+ flush_pos_t *flush_pos,
49710+ reiser4_key *stop_key)
49711+{
49712+ reiser4_extent *ext;
49713+ __u64 index;
49714+ __u64 width;
49715+ reiser4_block_nr start;
49716+ extent_state state;
49717+ oid_t oid;
49718+ reiser4_block_nr first_allocated;
49719+ __u64 allocated;
49720+ __u64 protected;
49721+ reiser4_extent copy_extent;
49722+ reiser4_key key;
49723+ int result;
49724+ block_stage_t block_stage;
49725+
49726+ assert("vs-1457", flush_pos->pos_in_unit == 0);
49727+ assert("vs-1467", coord_is_leftmost_unit(coord));
49728+ assert("vs-1467", item_is_extent(coord));
49729+
49730+ ext = extent_by_coord(coord);
49731+ index = extent_unit_index(coord);
49732+ start = extent_get_start(ext);
49733+ width = extent_get_width(ext);
49734+ state = state_of_extent(ext);
49735+ unit_key_by_coord(coord, &key);
49736+ oid = get_key_objectid(&key);
49737+
49738+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
49739+ (state == UNALLOCATED_EXTENT)) {
49740+ /* relocate */
49741+ if (state == ALLOCATED_EXTENT) {
49742+ /* none of the protected nodes is flushprepped, therefore
49743+ * they are counted as flush_reserved */
49744+ block_stage = BLOCK_FLUSH_RESERVED;
49745+ protected = allocated_extent_slum_size(flush_pos, oid,
49746+ index, width);
49747+ if (protected == 0) {
49748+ flush_pos->state = POS_INVALID;
49749+ flush_pos->pos_in_unit = 0;
49750+ return 0;
49751+ }
49752+ } else {
49753+ block_stage = BLOCK_UNALLOCATED;
49754+ protected = width;
49755+ }
49756+
49757+ /*
49758+ * look at previous unit if possible. If it is allocated, make
49759+ * preceder more precise
49760+ */
49761+ if (coord->unit_pos &&
49762+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49763+ reiser4_pos_hint(flush_pos)->blk =
49764+ extent_get_start(ext - 1) +
49765+ extent_get_width(ext - 1);
49766+
49767+ /* allocate new block numbers for protected nodes */
49768+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49769+ protected,
49770+ &first_allocated, &allocated,
49771+ block_stage);
49772+
49773+ /* prepare extent which will be copied to left */
49774+ reiser4_set_extent(&copy_extent, first_allocated, allocated);
49775+
49776+ result = put_unit_to_end(left, &key, &copy_extent);
49777+ if (result == -E_NODE_FULL) {
49778+ int target_block_stage;
49779+
49780+ /* free blocks which were just allocated */
49781+ target_block_stage =
49782+ (state ==
49783+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
49784+ BLOCK_UNALLOCATED;
49785+ reiser4_dealloc_blocks(&first_allocated, &allocated,
49786+ target_block_stage,
49787+ BA_PERMANENT);
49788+
49789+ /* rewind the preceder. */
49790+ flush_pos->preceder.blk = first_allocated;
49791+ check_preceder(flush_pos->preceder.blk);
49792+
49793+ return SQUEEZE_TARGET_FULL;
49794+ }
49795+
49796+ if (state == ALLOCATED_EXTENT) {
49797+ /* free nodes which were relocated */
49798+ reiser4_dealloc_blocks(&start, &allocated,
49799+ BLOCK_ALLOCATED, BA_DEFER);
49800+ }
49801+
49802+ /* assign new block numbers to protected nodes */
49803+ assign_real_blocknrs(flush_pos, oid, index, allocated,
49804+ first_allocated);
49805+
49806+ set_key_offset(&key,
49807+ get_key_offset(&key) +
49808+ (allocated << current_blocksize_bits));
49809+ } else {
49810+ /*
49811+ * overwrite: try to copy the unit as is to the left neighbor and
49812+ * make all leading not-flushprepped nodes overwrite nodes
49813+ */
49814+ reiser4_set_extent(&copy_extent, start, width);
49815+ result = put_unit_to_end(left, &key, &copy_extent);
49816+ if (result == -E_NODE_FULL)
49817+ return SQUEEZE_TARGET_FULL;
49818+
49819+ if (state != HOLE_EXTENT)
49820+ mark_jnodes_overwrite(flush_pos, oid, index, width);
49821+ set_key_offset(&key,
49822+ get_key_offset(&key) +
49823+ (width << current_blocksize_bits));
49824+ }
49825+ *stop_key = key;
49826+ return SQUEEZE_CONTINUE;
49827+}
49828+
49829+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
49830+{
49831+ return key_by_inode_and_offset_common(inode, off, key);
49832+}
49833+
49834+/*
49835+ * Local variables:
49836+ * c-indentation-style: "K&R"
49837+ * mode-name: "LC"
49838+ * c-basic-offset: 8
49839+ * tab-width: 8
49840+ * fill-column: 79
49841+ * scroll-step: 1
49842+ * End:
49843+ */
49844diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent.h linux-2.6.20/fs/reiser4/plugin/item/extent.h
49845--- linux-2.6.20.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
49846+++ linux-2.6.20/fs/reiser4/plugin/item/extent.h 2007-05-06 14:50:43.811010720 +0400
49847@@ -0,0 +1,231 @@
49848+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49849+
49850+#ifndef __REISER4_EXTENT_H__
49851+#define __REISER4_EXTENT_H__
49852+
49853+/* on disk extent */
49854+typedef struct {
49855+ reiser4_dblock_nr start;
49856+ reiser4_dblock_nr width;
49857+} reiser4_extent;
49858+
49859+typedef struct extent_stat {
49860+ int unallocated_units;
49861+ int unallocated_blocks;
49862+ int allocated_units;
49863+ int allocated_blocks;
49864+ int hole_units;
49865+ int hole_blocks;
49866+} extent_stat;
49867+
49868+/* extents in an extent item can be holes, unallocated extents, or allocated
49869+ extents */
49870+typedef enum {
49871+ HOLE_EXTENT,
49872+ UNALLOCATED_EXTENT,
49873+ ALLOCATED_EXTENT
49874+} extent_state;
49875+
49876+#define HOLE_EXTENT_START 0
49877+#define UNALLOCATED_EXTENT_START 1
49878+#define UNALLOCATED_EXTENT_START2 2
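
These start values double as a state encoding. A decoder consistent with
the constants above might look as follows; this is a sketch, not the
patch's state_of_extent(), which this header only declares:

#include <stdint.h>

enum state_s { HOLE_S, UNALLOCATED_S, ALLOCATED_S };

static enum state_s decode_extent_state(uint64_t start)
{
        switch (start) {
        case 0:         /* HOLE_EXTENT_START */
                return HOLE_S;
        case 1:         /* UNALLOCATED_EXTENT_START */
        case 2:         /* UNALLOCATED_EXTENT_START2 */
                return UNALLOCATED_S;
        default:
                return ALLOCATED_S;
        }
}

int main(void)
{
        return decode_extent_state(3) == ALLOCATED_S ? 0 : 1;
}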
49879+
49880+typedef struct {
49881+ reiser4_block_nr pos_in_unit;
49882+ reiser4_block_nr width; /* width of current unit */
49883+ pos_in_node_t nr_units; /* number of units */
49884+ int ext_offset; /* offset from the beginning of zdata() */
49885+ unsigned long expected_page;
49886+#if REISER4_DEBUG
49887+ reiser4_extent extent;
49888+#endif
49889+} extent_coord_extension_t;
49890+
49891+/* macros to set/get fields of on-disk extent */
49892+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
49893+{
49894+ return le64_to_cpu(ext->start);
49895+}
49896+
49897+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
49898+{
49899+ return le64_to_cpu(ext->width);
49900+}
49901+
49902+extern __u64 reiser4_current_block_count(void);
49903+
49904+static inline void
49905+extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
49906+{
49907+ cassert(sizeof(ext->start) == 8);
49908+ assert("nikita-2510",
49909+ ergo(start > 1, start < reiser4_current_block_count()));
49910+ put_unaligned(cpu_to_le64(start), &ext->start);
49911+}
49912+
49913+static inline void
49914+extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
49915+{
49916+ cassert(sizeof(ext->width) == 8);
49917+ assert("", width > 0);
49918+ put_unaligned(cpu_to_le64(width), &ext->width);
49919+ assert("nikita-2511",
49920+ ergo(extent_get_start(ext) > 1,
49921+ extent_get_start(ext) + width <=
49922+ reiser4_current_block_count()));
49923+}
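
Extent fields are stored little-endian on disk, which is why the
accessors above convert on every get and set. A userspace equivalent that
makes the byte order explicit, independent of the host (sketch):

#include <stdint.h>

/* Assemble a 64-bit value from little-endian on-disk bytes. */
static uint64_t le64_decode(const unsigned char b[8])
{
        uint64_t v = 0;
        int i;

        for (i = 7; i >= 0; i--)
                v = (v << 8) | b[i];
        return v;
}

/* Store a 64-bit value as little-endian on-disk bytes. */
static void le64_encode(unsigned char b[8], uint64_t v)
{
        int i;

        for (i = 0; i < 8; i++, v >>= 8)
                b[i] = (unsigned char)(v & 0xff);
}

int main(void)
{
        unsigned char raw[8];

        le64_encode(raw, 123456789);
        return le64_decode(raw) == 123456789 ? 0 : 1;
}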
49924+
49925+#define extent_item(coord) \
49926+({ \
49927+ assert("nikita-3143", item_is_extent(coord)); \
49928+ ((reiser4_extent *)item_body_by_coord (coord)); \
49929+})
49930+
49931+#define extent_by_coord(coord) \
49932+({ \
49933+ assert("nikita-3144", item_is_extent(coord)); \
49934+ (extent_item (coord) + (coord)->unit_pos); \
49935+})
49936+
49937+#define width_by_coord(coord) \
49938+({ \
49939+ assert("nikita-3145", item_is_extent(coord)); \
49940+ extent_get_width (extent_by_coord(coord)); \
49941+})
49942+
49943+struct carry_cut_data;
49944+struct carry_kill_data;
49945+
49946+/* plugin->u.item.b.* */
49947+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
49948+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49949+ const reiser4_item_data *);
49950+int mergeable_extent(const coord_t * p1, const coord_t * p2);
49951+pos_in_node_t nr_units_extent(const coord_t *);
49952+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
49953+void init_coord_extent(coord_t *);
49954+int init_extent(coord_t *, reiser4_item_data *);
49955+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
49956+int can_shift_extent(unsigned free_space,
49957+ coord_t * source, znode * target, shift_direction,
49958+ unsigned *size, unsigned want);
49959+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
49960+ unsigned count, shift_direction where_is_free_space,
49961+ unsigned free_space);
49962+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
49963+ struct carry_kill_data *);
49964+int create_hook_extent(const coord_t * coord, void *arg);
49965+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49966+ struct carry_cut_data *, reiser4_key * smallest_removed,
49967+ reiser4_key * new_first);
49968+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49969+ struct carry_kill_data *, reiser4_key * smallest_removed,
49970+ reiser4_key * new_first);
49971+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
49972+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
49973+void print_extent(const char *, coord_t *);
49974+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
49975+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
49976+ reiser4_block_nr * block);
49977+void item_stat_extent(const coord_t * coord, void *vp);
49978+int reiser4_check_extent(const coord_t * coord, const char **error);
49979+
49980+/* plugin->u.item.s.file.* */
49981+ssize_t reiser4_write_extent(struct file *, const char __user *,
49982+ size_t, loff_t *);
49983+int reiser4_read_extent(struct file *, flow_t *, hint_t *);
49984+int reiser4_readpage_extent(void *, struct page *);
49985+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
49986+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
49987+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
49988+int get_block_address_extent(const coord_t *, sector_t block,
49989+ sector_t * result);
49990+
49991+/* these are used in flush.c
49992+ FIXME-VS: should they be somewhere in item_plugin? */
49993+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
49994+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
49995+ reiser4_key * stop_key);
49996+
49997+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
49998+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
49999+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
50000+
50001+/* plugin->u.item.f. */
50002+int reiser4_scan_extent(flush_scan * scan);
50003+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
50004+
50005+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
50006+ int nr_extents);
50007+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
50008+extent_state state_of_extent(reiser4_extent * ext);
50009+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
50010+ reiser4_block_nr width);
50011+int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
50012+ int *plugged_hole);
50013+
50014+#include "../../coord.h"
50015+#include "../../lock.h"
50016+#include "../../tap.h"
50017+
50018+struct replace_handle {
50019+ /* these are to be set before calling reiser4_replace_extent */
50020+ coord_t *coord;
50021+ lock_handle *lh;
50022+ reiser4_key key;
50023+ reiser4_key *pkey;
50024+ reiser4_extent overwrite;
50025+ reiser4_extent new_extents[2];
50026+ int nr_new_extents;
50027+ unsigned flags;
50028+
50029+ /* these are used by reiser4_replace_extent */
50030+ reiser4_item_data item;
50031+ coord_t coord_after;
50032+ lock_handle lh_after;
50033+ tap_t watch;
50034+ reiser4_key paste_key;
50035+#if REISER4_DEBUG
50036+ reiser4_extent orig_ext;
50037+ reiser4_key tmp;
50038+#endif
50039+};
50040+
50041+/* this structure is kmalloced before calling make_extent to avoid excessive
50042+ stack consumption on plug_hole->reiser4_replace_extent */
50043+struct make_extent_handle {
50044+ uf_coord_t *uf_coord;
50045+ reiser4_block_nr blocknr;
50046+ int created;
50047+ struct inode *inode;
50048+ union {
50049+ struct {
50050+ } append;
50051+ struct replace_handle replace;
50052+ } u;
50053+};
50054+
50055+int reiser4_replace_extent(struct replace_handle *,
50056+ int return_inserted_position);
50057+lock_handle *znode_lh(znode *);
50058+
50059+/* the reiser4 repacker support */
50060+struct repacker_cursor;
50061+extern int process_extent_backward_for_repacking(tap_t *,
50062+ struct repacker_cursor *);
50063+extern int mark_extent_for_repacking(tap_t *, int);
50064+
50065+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
50066+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
50067+
50068+/* __REISER4_EXTENT_H__ */
50069+#endif
50070+/*
50071+ Local variables:
50072+ c-indentation-style: "K&R"
50073+ mode-name: "LC"
50074+ c-basic-offset: 8
50075+ tab-width: 8
50076+ fill-column: 120
50077+ End:
50078+*/
50079diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_item_ops.c
50080--- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
50081+++ linux-2.6.20/fs/reiser4/plugin/item/extent_item_ops.c 2007-05-06 14:50:43.815011970 +0400
50082@@ -0,0 +1,889 @@
50083+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50084+
50085+#include "item.h"
50086+#include "../../inode.h"
50087+#include "../../tree_walk.h" /* check_sibling_list() */
50088+#include "../../page_cache.h"
50089+#include "../../carry.h"
50090+
50091+#include <linux/quotaops.h>
50092+
50093+/* item_plugin->b.max_key_inside */
50094+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50095+{
50096+ item_key_by_coord(coord, key);
50097+ set_key_offset(key, get_key_offset(reiser4_max_key()));
50098+ return key;
50099+}
50100+
50101+/* item_plugin->b.can_contain_key
50102+ this checks whether @key of @data matches the position set by @coord */
50103+int
50104+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50105+ const reiser4_item_data * data)
50106+{
50107+ reiser4_key item_key;
50108+
50109+ if (item_plugin_by_coord(coord) != data->iplug)
50110+ return 0;
50111+
50112+ item_key_by_coord(coord, &item_key);
50113+ if (get_key_locality(key) != get_key_locality(&item_key) ||
50114+ get_key_objectid(key) != get_key_objectid(&item_key) ||
50115+ get_key_ordering(key) != get_key_ordering(&item_key))
50116+ return 0;
50117+
50118+ return 1;
50119+}
50120+
50121+/* item_plugin->b.mergeable
50122+ first item is of extent type */
50123+/* Audited by: green(2002.06.13) */
50124+int mergeable_extent(const coord_t * p1, const coord_t * p2)
50125+{
50126+ reiser4_key key1, key2;
50127+
50128+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50129+ /* FIXME-VS: Which is it? Assert or return 0 */
50130+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50131+ return 0;
50132+ }
50133+
50134+ item_key_by_coord(p1, &key1);
50135+ item_key_by_coord(p2, &key2);
50136+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
50137+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
50138+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
50139+ get_key_type(&key1) != get_key_type(&key2))
50140+ return 0;
50141+ if (get_key_offset(&key1) +
50142+ reiser4_extent_size(p1, nr_units_extent(p1)) !=
50143+ get_key_offset(&key2))
50144+ return 0;
50145+ return 1;
50146+}
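
The offset test above is the heart of mergeability: the second item must
begin, in key space, exactly where the first one ends. In isolation
(illustrative helper; the example assumes 4096-byte blocks):

#include <stdint.h>

/* Items merge when key2's byte offset equals key1's offset plus the
 * number of bytes item1 addresses (its width in blocks). */
static int offsets_adjacent(uint64_t off1, uint64_t blocks1,
                            uint64_t off2, unsigned blocksize_bits)
{
        return off1 + (blocks1 << blocksize_bits) == off2;
}

int main(void)
{
        /* a 5-block item at offset 0 is adjacent to one at offset 20480 */
        return offsets_adjacent(0, 5, 5 << 12, 12) ? 0 : 1;
}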
50147+
50148+/* item_plugin->b.nr_units */
50149+pos_in_node_t nr_units_extent(const coord_t * coord)
50150+{
50151+ /* length of extent item has to be multiple of extent size */
50152+ assert("vs-1424",
50153+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50154+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
50155+}
50156+
50157+/* item_plugin->b.lookup */
50158+lookup_result
50159+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50160+ coord_t * coord)
50161+{ /* znode and item_pos are
50162+ set to an extent item to
50163+ look through */
50164+ reiser4_key item_key;
50165+ reiser4_block_nr lookuped, offset;
50166+ unsigned i, nr_units;
50167+ reiser4_extent *ext;
50168+ unsigned blocksize;
50169+ unsigned char blocksize_bits;
50170+
50171+ item_key_by_coord(coord, &item_key);
50172+ offset = get_key_offset(&item_key);
50173+
50174+ /* key we are looking for must be greater than key of item @coord */
50175+ assert("vs-414", keygt(key, &item_key));
50176+
50177+ assert("umka-99945",
50178+ !keygt(key, max_key_inside_extent(coord, &item_key)));
50179+
50180+ ext = extent_item(coord);
50181+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50182+
50183+ blocksize = current_blocksize;
50184+ blocksize_bits = current_blocksize_bits;
50185+
50186+ /* offset we are looking for */
50187+ lookuped = get_key_offset(key);
50188+
50189+ nr_units = nr_units_extent(coord);
50190+ /* go through all extents until the one which addresses the given offset */
50191+ for (i = 0; i < nr_units; i++, ext++) {
50192+ offset += (extent_get_width(ext) << blocksize_bits);
50193+ if (offset > lookuped) {
50194+ /* desired byte is somewhere in this extent */
50195+ coord->unit_pos = i;
50196+ coord->between = AT_UNIT;
50197+ return CBK_COORD_FOUND;
50198+ }
50199+ }
50200+
50201+ /* set coord after last unit */
50202+ coord->unit_pos = nr_units - 1;
50203+ coord->between = AFTER_UNIT;
50204+ return CBK_COORD_FOUND;
50205+}
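
lookup_extent() cannot fail: it accumulates unit widths until the
looked-up offset falls inside a unit, and otherwise lands after the last
one. The same walk over a plain array (sketch; simplified types, and a
different not-found convention that returns nr_units instead of setting
AFTER_UNIT):

#include <stdint.h>

struct ext { uint64_t start, width; };

/* Find the unit covering byte `lookuped`; returns its index, or nr_units
 * when the offset lies past the item. */
static unsigned find_unit(const struct ext *u, unsigned nr_units,
                          uint64_t item_offset, uint64_t lookuped,
                          unsigned blocksize_bits)
{
        uint64_t offset = item_offset;
        unsigned i;

        for (i = 0; i < nr_units; i++) {
                offset += u[i].width << blocksize_bits;
                if (offset > lookuped)
                        return i;       /* byte is inside unit i */
        }
        return nr_units;                /* past the last unit */
}

int main(void)
{
        struct ext item[2] = { { 100, 2 }, { 0, 3 } }; /* 2 + 3 blocks */

        /* byte 3 * 4096 falls into the second (3-block) unit */
        return find_unit(item, 2, 0, 3 * 4096, 12) == 1 ? 0 : 1;
}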
50206+
50207+/* item_plugin->b.paste
50208+ the item @coord is set to has been appended with @data->length bytes of free
50209+ space. data->data contains the data to be pasted into the item at position
50210+ @coord->in_item.unit_pos. It must fit into that free space.
50211+ @coord must be set between units.
50212+*/
50213+int
50214+paste_extent(coord_t * coord, reiser4_item_data * data,
50215+ carry_plugin_info * info UNUSED_ARG)
50216+{
50217+ unsigned old_nr_units;
50218+ reiser4_extent *ext;
50219+ int item_length;
50220+
50221+ ext = extent_item(coord);
50222+ item_length = item_length_by_coord(coord);
50223+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50224+
50225+ /* this is also used to copy extent into newly created item, so
50226+ old_nr_units could be 0 */
50227+ assert("vs-260", item_length >= data->length);
50228+
50229+ /* make sure that coord is set properly */
50230+ assert("vs-35",
50231+ ((!coord_is_existing_unit(coord))
50232+ || (!old_nr_units && !coord->unit_pos)));
50233+
50234+ /* first unit to be moved */
50235+ switch (coord->between) {
50236+ case AFTER_UNIT:
50237+ coord->unit_pos++; /* fall through */
50238+ case BEFORE_UNIT:
50239+ coord->between = AT_UNIT;
50240+ break;
50241+ case AT_UNIT:
50242+ assert("vs-331", !old_nr_units && !coord->unit_pos);
50243+ break;
50244+ default:
50245+ impossible("vs-330", "coord is set improperly");
50246+ }
50247+
50248+ /* prepare space for new units */
50249+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50250+ ext + coord->unit_pos,
50251+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50252+
50253+ /* copy new data from kernel space */
50254+ assert("vs-556", data->user == 0);
50255+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50256+
50257+ /* after paste @coord is set to first of pasted units */
50258+ assert("vs-332", coord_is_existing_unit(coord));
50259+ assert("vs-333",
50260+ !memcmp(data->data, extent_by_coord(coord),
50261+ (unsigned)data->length));
50262+ return 0;
50263+}
50264+
50265+/* item_plugin->b.can_shift */
50266+int
50267+can_shift_extent(unsigned free_space, coord_t * source,
50268+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50269+ unsigned *size, unsigned want)
50270+{
50271+ *size = item_length_by_coord(source);
50272+ if (*size > free_space)
50273+ /* never split a unit of extent item */
50274+ *size = free_space - free_space % sizeof(reiser4_extent);
50275+
50276+ /* we can shift *size bytes; calculate how many we want to shift */
50277+ if (*size > want * sizeof(reiser4_extent))
50278+ *size = want * sizeof(reiser4_extent);
50279+
50280+ if (*size % sizeof(reiser4_extent) != 0)
50281+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
50282+ sizeof(reiser4_extent));
50283+ return *size / sizeof(reiser4_extent);
50284+
50285+}
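
can_shift_extent() never splits a unit: the shiftable byte count is
rounded down to a whole number of units, then capped by how many units
the caller wants. Just that rounding, as a sketch (UNIT stands in for
sizeof(reiser4_extent)):

enum { UNIT = 16 };     /* two 8-byte fields per on-disk extent */

/* Bytes we may shift: capped by free space, rounded down to whole units,
 * then capped by the number of wanted units. */
static unsigned shiftable(unsigned item_bytes, unsigned free_space,
                          unsigned want)
{
        unsigned size = item_bytes;

        if (size > free_space)
                size = free_space - free_space % UNIT;
        if (size > want * UNIT)
                size = want * UNIT;
        return size;
}

int main(void)
{
        /* 80-byte item, 40 bytes free, want 3 units: 40 rounds to 32 */
        return shiftable(80, 40, 3) == 32 ? 0 : 1;
}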
50286+
50287+/* item_plugin->b.copy_units */
50288+void
50289+copy_units_extent(coord_t * target, coord_t * source,
50290+ unsigned from, unsigned count,
50291+ shift_direction where_is_free_space, unsigned free_space)
50292+{
50293+ char *from_ext, *to_ext;
50294+
50295+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
50296+
50297+ from_ext = item_body_by_coord(source);
50298+ to_ext = item_body_by_coord(target);
50299+
50300+ if (where_is_free_space == SHIFT_LEFT) {
50301+ assert("vs-215", from == 0);
50302+
50303+ /* At this moment, item length was already updated in the item
50304+ header by shifting code, hence nr_units_extent() will
50305+ return "new" number of units---one we obtain after copying
50306+ units.
50307+ */
50308+ to_ext +=
50309+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50310+ } else {
50311+ reiser4_key key;
50312+ coord_t coord;
50313+
50314+ assert("vs-216",
50315+ from + count == coord_last_unit_pos(source) + 1);
50316+
50317+ from_ext += item_length_by_coord(source) - free_space;
50318+
50319+ /* new units are inserted before first unit in an item,
50320+ therefore, we have to update item key */
50321+ coord = *source;
50322+ coord.unit_pos = from;
50323+ unit_key_extent(&coord, &key);
50324+
50325+ node_plugin_by_node(target->node)->update_item_key(target, &key,
50326+ NULL /*info */);
50327+ }
50328+
50329+ memcpy(to_ext, from_ext, free_space);
50330+}
50331+
50332+/* item_plugin->b.create_hook
50333+ @arg is znode of leaf node for which we need to update right delimiting key */
50334+int create_hook_extent(const coord_t * coord, void *arg)
50335+{
50336+ coord_t *child_coord;
50337+ znode *node;
50338+ reiser4_key key;
50339+ reiser4_tree *tree;
50340+
50341+ if (!arg)
50342+ return 0;
50343+
50344+ child_coord = arg;
50345+ tree = znode_get_tree(coord->node);
50346+
50347+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50348+
50349+ write_lock_tree(tree);
50350+ write_lock_dk(tree);
50351+ /* find a node on the leaf level for which the right delimiting key has to
50352+ be updated */
50353+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50354+ assert("vs-411", znode_is_left_connected(child_coord->node));
50355+ node = child_coord->node->left;
50356+ } else {
50357+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50358+ node = child_coord->node;
50359+ assert("nikita-3314", node != NULL);
50360+ }
50361+
50362+ if (node != NULL) {
50363+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
50364+
50365+ assert("nikita-3282", check_sibling_list(node));
50366+ /* break sibling links */
50367+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50368+ ON_DEBUG(node->right->left_version =
50369+ atomic_inc_return(&delim_key_version);
50370+ node->right_version =
50371+ atomic_inc_return(&delim_key_version););
50372+
50373+ node->right->left = NULL;
50374+ node->right = NULL;
50375+ }
50376+ }
50377+ write_unlock_dk(tree);
50378+ write_unlock_tree(tree);
50379+ return 0;
50380+}
50381+
50382+#define ITEM_TAIL_KILLED 0
50383+#define ITEM_HEAD_KILLED 1
50384+#define ITEM_KILLED 2
50385+
50386+/* item_plugin->b.kill_hook
50387+ this is called when @count units starting from @from-th one are going to be removed
50388+ */
50389+int
50390+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50391+ struct carry_kill_data *kdata)
50392+{
50393+ reiser4_extent *ext;
50394+ reiser4_block_nr start, length;
50395+ const reiser4_key *pfrom_key, *pto_key;
50396+ struct inode *inode;
50397+ reiser4_tree *tree;
50398+ pgoff_t from_off, to_off, offset, skip;
50399+ int retval;
50400+
50401+ /* these are located in memory kmalloc-ed by kill_node_content */
50402+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50403+ coord_t *dup, *next;
50404+
50405+ assert("zam-811", znode_is_write_locked(coord->node));
50406+ assert("nikita-3315", kdata != NULL);
50407+ assert("vs-34", kdata->buf != NULL);
50408+
50409+ /* map structures to kdata->buf */
50410+ min_item_key = (reiser4_key *) (kdata->buf);
50411+ max_item_key = min_item_key + 1;
50412+ from_key = max_item_key + 1;
50413+ to_key = from_key + 1;
50414+ key = to_key + 1;
50415+ dup = (coord_t *) (key + 1);
50416+ next = dup + 1;
50417+
50418+ item_key_by_coord(coord, min_item_key);
50419+ max_item_key_by_coord(coord, max_item_key);
50420+
50421+ if (kdata->params.from_key) {
50422+ pfrom_key = kdata->params.from_key;
50423+ pto_key = kdata->params.to_key;
50424+ } else {
50425+ assert("vs-1549", from == coord->unit_pos);
50426+ unit_key_by_coord(coord, from_key);
50427+ pfrom_key = from_key;
50428+
50429+ coord_dup(dup, coord);
50430+ dup->unit_pos = from + count - 1;
50431+ max_unit_key_by_coord(dup, to_key);
50432+ pto_key = to_key;
50433+ }
50434+
50435+ if (!keylt(pto_key, max_item_key)) {
50436+ if (!keygt(pfrom_key, min_item_key)) {
50437+ znode *left, *right;
50438+
50439+ /* item is to be removed completely */
50440+ assert("nikita-3316", kdata->left != NULL
50441+ && kdata->right != NULL);
50442+
50443+ left = kdata->left->node;
50444+ right = kdata->right->node;
50445+
50446+ tree = current_tree;
50447+ /* we have to do two things:
50448+ *
50449+ * 1. link left and right formatted neighbors of
50450+ * extent being removed, and
50451+ *
50452+ * 2. update their delimiting keys.
50453+ *
50454+ * atomicity of these operations is protected by
50455+ * taking dk-lock and tree-lock.
50456+ */
50457+ /* if neighbors of item being removed are znodes -
50458+ * link them */
50459+ write_lock_tree(tree);
50460+ write_lock_dk(tree);
50461+ link_left_and_right(left, right);
50462+ if (left) {
50463+ /* update right delimiting key of left
50464+ * neighbor of extent item */
50465+ /*coord_t next;
50466+ reiser4_key key; */
50467+
50468+ coord_dup(next, coord);
50469+
50470+ if (coord_next_item(next))
50471+ *key = *znode_get_rd_key(coord->node);
50472+ else
50473+ item_key_by_coord(next, key);
50474+ znode_set_rd_key(left, key);
50475+ }
50476+ write_unlock_dk(tree);
50477+ write_unlock_tree(tree);
50478+
50479+ from_off =
50480+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50481+ to_off =
50482+ (get_key_offset(max_item_key) +
50483+ 1) >> PAGE_CACHE_SHIFT;
50484+ retval = ITEM_KILLED;
50485+ } else {
50486+ /* tail of item is to be removed */
50487+ from_off =
50488+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50489+ 1) >> PAGE_CACHE_SHIFT;
50490+ to_off =
50491+ (get_key_offset(max_item_key) +
50492+ 1) >> PAGE_CACHE_SHIFT;
50493+ retval = ITEM_TAIL_KILLED;
50494+ }
50495+ } else {
50496+ /* head of item is to be removed */
50497+ assert("vs-1571", keyeq(pfrom_key, min_item_key));
50498+ assert("vs-1572",
50499+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50500+ 0);
50501+ assert("vs-1573",
50502+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50503+ 1)) == 0);
50504+
50505+ if (kdata->left->node) {
50506+ /* update right delimiting key of left neighbor of extent item */
50507+ /*reiser4_key key; */
50508+
50509+ *key = *pto_key;
50510+ set_key_offset(key, get_key_offset(pto_key) + 1);
50511+
50512+ write_lock_dk(current_tree);
50513+ znode_set_rd_key(kdata->left->node, key);
50514+ write_unlock_dk(current_tree);
50515+ }
50516+
50517+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50518+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50519+ retval = ITEM_HEAD_KILLED;
50520+ }
50521+
50522+ inode = kdata->inode;
50523+ assert("vs-1545", inode != NULL);
50524+ if (inode != NULL)
50525+ /* take care of pages and jnodes corresponding to part of item being killed */
50526+ reiser4_invalidate_pages(inode->i_mapping, from_off,
50527+ to_off - from_off,
50528+ kdata->params.truncate);
50529+
50530+ ext = extent_item(coord) + from;
50531+ offset =
50532+ (get_key_offset(min_item_key) +
50533+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
50534+
50535+ assert("vs-1551", from_off >= offset);
50536+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
50537+ skip = from_off - offset;
50538+ offset = from_off;
50539+
50540+ while (offset < to_off) {
50541+ length = extent_get_width(ext) - skip;
50542+ if (state_of_extent(ext) == HOLE_EXTENT) {
50543+ skip = 0;
50544+ offset += length;
50545+ ext++;
50546+ continue;
50547+ }
50548+
50549+ if (offset + length > to_off) {
50550+ length = to_off - offset;
50551+ }
50552+
50553+ DQUOT_FREE_BLOCK_NODIRTY(inode, length);
50554+
50555+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50556+ /* release fake-allocated space of jnodes corresponding to this unallocated extent */
50557+ fake_allocated2free(length, 0 /* unformatted */ );
50558+
50559+ skip = 0;
50560+ offset += length;
50561+ ext++;
50562+ continue;
50563+ }
50564+
50565+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
50566+
50567+ if (length != 0) {
50568+ start = extent_get_start(ext) + skip;
50569+
50570+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
50571+ immediately */
50572+ reiser4_dealloc_blocks(&start, &length,
50573+ 0 /* not used */ ,
50574+ BA_DEFER
50575+ /* unformatted with defer */ );
50576+ }
50577+ skip = 0;
50578+ offset += length;
50579+ ext++;
50580+ }
50581+ return retval;
50582+}
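+
+/* Editor's note (illustrative, not part of the original patch): the
+   from_off/to_off arithmetic above turns byte offsets taken from keys into
+   page indices. Assuming 4KiB pages (PAGE_CACHE_SHIFT == 12):
+
+	from_off = 8192 >> 12;			   == 2
+	to_off	 = (16383 + 1) >> 12;		   == 4
+
+   so pages [2, 4) are invalidated; the "+ 1" makes to_off an exclusive
+   bound and to_off - from_off the number of whole pages in the killed
+   key range. */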
50583+
50584+/* item_plugin->b.kill_units */
50585+int
50586+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50587+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
50588+ reiser4_key * new_first)
50589+{
50590+ reiser4_extent *ext;
50591+ reiser4_key item_key;
50592+ pos_in_node_t count;
50593+ reiser4_key from_key, to_key;
50594+ const reiser4_key *pfrom_key, *pto_key;
50595+ loff_t off;
50596+ int result;
50597+
50598+ assert("vs-1541",
50599+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
50600+ || (kdata->params.from_key != NULL
50601+ && kdata->params.to_key != NULL)));
50602+
50603+ if (kdata->params.from_key) {
50604+ pfrom_key = kdata->params.from_key;
50605+ pto_key = kdata->params.to_key;
50606+ } else {
50607+ coord_t dup;
50608+
50609+ /* calculate key range of kill */
50610+ assert("vs-1549", from == coord->unit_pos);
50611+ unit_key_by_coord(coord, &from_key);
50612+ pfrom_key = &from_key;
50613+
50614+ coord_dup(&dup, coord);
50615+ dup.unit_pos = to;
50616+ max_unit_key_by_coord(&dup, &to_key);
50617+ pto_key = &to_key;
50618+ }
50619+
50620+ item_key_by_coord(coord, &item_key);
50621+
50622+#if REISER4_DEBUG
50623+ {
50624+ reiser4_key max_item_key;
50625+
50626+ max_item_key_by_coord(coord, &max_item_key);
50627+
50628+ if (new_first) {
50629+ /* head of item is to be cut */
50630+ assert("vs-1542", keyeq(pfrom_key, &item_key));
50631+ assert("vs-1538", keylt(pto_key, &max_item_key));
50632+ } else {
50633+ /* tail of item is to be cut */
50634+ assert("vs-1540", keygt(pfrom_key, &item_key));
50635+ assert("vs-1543", !keylt(pto_key, &max_item_key));
50636+ }
50637+ }
50638+#endif
50639+
50640+ if (smallest_removed)
50641+ *smallest_removed = *pfrom_key;
50642+
50643+ if (new_first) {
50644+ /* item head is cut. Item key will change. This new key is calculated here */
50645+ assert("vs-1556",
50646+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50647+ (PAGE_CACHE_SIZE - 1));
50648+ *new_first = *pto_key;
50649+ set_key_offset(new_first, get_key_offset(new_first) + 1);
50650+ }
50651+
50652+ count = to - from + 1;
50653+ result = kill_hook_extent(coord, from, count, kdata);
50654+ if (result == ITEM_TAIL_KILLED) {
50655+ assert("vs-1553",
50656+ get_key_offset(pfrom_key) >=
50657+ get_key_offset(&item_key) +
50658+ reiser4_extent_size(coord, from));
50659+ off =
50660+ get_key_offset(pfrom_key) -
50661+ (get_key_offset(&item_key) +
50662+ reiser4_extent_size(coord, from));
50663+ if (off) {
50664+ /* unit @from is to be cut partially. Its width decreases */
50665+ ext = extent_item(coord) + from;
50666+ extent_set_width(ext,
50667+ (off + PAGE_CACHE_SIZE -
50668+ 1) >> PAGE_CACHE_SHIFT);
50669+ count--;
50670+ }
50671+ } else {
50672+ __u64 max_to_offset;
50673+ __u64 rest;
50674+
50675+ assert("vs-1575", result == ITEM_HEAD_KILLED);
50676+ assert("", from == 0);
50677+ assert("",
50678+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50679+ 1)) == 0);
50680+ assert("",
50681+ get_key_offset(pto_key) + 1 >
50682+ get_key_offset(&item_key) +
50683+ reiser4_extent_size(coord, to));
50684+ max_to_offset =
50685+ get_key_offset(&item_key) +
50686+ reiser4_extent_size(coord, to + 1) - 1;
50687+ assert("", get_key_offset(pto_key) <= max_to_offset);
50688+
50689+ rest =
50690+ (max_to_offset -
50691+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
50692+ if (rest) {
50693+ /* unit @to is to be cut partially */
50694+ ext = extent_item(coord) + to;
50695+
50696+ assert("", extent_get_width(ext) > rest);
50697+
50698+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50699+ extent_set_start(ext,
50700+ extent_get_start(ext) +
50701+ (extent_get_width(ext) -
50702+ rest));
50703+
50704+ extent_set_width(ext, rest);
50705+ count--;
50706+ }
50707+ }
50708+ return count * sizeof(reiser4_extent);
50709+}
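+
+/* Editor's note (illustrative, not part of the original patch): the value
+   returned above is the number of bytes freed in the node. Assuming
+   reiser4_extent is a pair of 64-bit fields (start, width), so that
+   sizeof(reiser4_extent) == 16, killing whole units 3..4 gives
+
+	count = to - from + 1;			   == 2
+	count * sizeof(reiser4_extent);		   == 32 bytes
+
+   and each partially cut boundary unit decrements @count, since that unit
+   stays in the item with a reduced width (or a shifted start). */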
50710+
50711+/* item_plugin->b.cut_units
50712+ this is too similar to kill_units_extent */
50713+int
50714+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50715+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
50716+ reiser4_key * new_first)
50717+{
50718+ reiser4_extent *ext;
50719+ reiser4_key item_key;
50720+ pos_in_node_t count;
50721+ reiser4_key from_key, to_key;
50722+ const reiser4_key *pfrom_key, *pto_key;
50723+ loff_t off;
50724+
50725+ assert("vs-1541",
50726+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
50727+ || (cdata->params.from_key != NULL
50728+ && cdata->params.to_key != NULL)));
50729+
50730+ if (cdata->params.from_key) {
50731+ pfrom_key = cdata->params.from_key;
50732+ pto_key = cdata->params.to_key;
50733+ } else {
50734+ coord_t dup;
50735+
50736+ /* calculate key range of kill */
50737+ coord_dup(&dup, coord);
50738+ dup.unit_pos = from;
50739+ unit_key_by_coord(&dup, &from_key);
50740+
50741+ dup.unit_pos = to;
50742+ max_unit_key_by_coord(&dup, &to_key);
50743+
50744+ pfrom_key = &from_key;
50745+ pto_key = &to_key;
50746+ }
50747+
50748+ assert("vs-1555",
50749+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
50750+ assert("vs-1556",
50751+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50752+ (PAGE_CACHE_SIZE - 1));
50753+
50754+ item_key_by_coord(coord, &item_key);
50755+
50756+#if REISER4_DEBUG
50757+ {
50758+ reiser4_key max_item_key;
50759+
50760+ assert("vs-1584",
50761+ get_key_locality(pfrom_key) ==
50762+ get_key_locality(&item_key));
50763+ assert("vs-1585",
50764+ get_key_type(pfrom_key) == get_key_type(&item_key));
50765+ assert("vs-1586",
50766+ get_key_objectid(pfrom_key) ==
50767+ get_key_objectid(&item_key));
50768+ assert("vs-1587",
50769+ get_key_ordering(pfrom_key) ==
50770+ get_key_ordering(&item_key));
50771+
50772+ max_item_key_by_coord(coord, &max_item_key);
50773+
50774+ if (new_first != NULL) {
50775+ /* head of item is to be cut */
50776+ assert("vs-1542", keyeq(pfrom_key, &item_key));
50777+ assert("vs-1538", keylt(pto_key, &max_item_key));
50778+ } else {
50779+ /* tail of item is to be cut */
50780+ assert("vs-1540", keygt(pfrom_key, &item_key));
50781+ assert("vs-1543", keyeq(pto_key, &max_item_key));
50782+ }
50783+ }
50784+#endif
50785+
50786+ if (smallest_removed)
50787+ *smallest_removed = *pfrom_key;
50788+
50789+ if (new_first) {
50790+ /* item head is cut. Item key will change. This new key is calculated here */
50791+ *new_first = *pto_key;
50792+ set_key_offset(new_first, get_key_offset(new_first) + 1);
50793+ }
50794+
50795+ count = to - from + 1;
50796+
50797+ assert("vs-1553",
50798+ get_key_offset(pfrom_key) >=
50799+ get_key_offset(&item_key) + reiser4_extent_size(coord, from));
50800+ off =
50801+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50802+ reiser4_extent_size(coord, from));
50803+ if (off) {
50804+ /* tail of unit @from is to be cut partially. Its width decreases */
50805+ assert("vs-1582", new_first == NULL);
50806+ ext = extent_item(coord) + from;
50807+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
50808+ count--;
50809+ }
50810+
50811+ assert("vs-1554",
50812+ get_key_offset(pto_key) <=
50813+ get_key_offset(&item_key) +
50814+ reiser4_extent_size(coord, to + 1) - 1);
50815+ off =
50816+ (get_key_offset(&item_key) +
50817+ reiser4_extent_size(coord, to + 1) - 1) -
50818+ get_key_offset(pto_key);
50819+ if (off) {
50820+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
50821+ and width decreased. */
50822+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
50823+ ext = extent_item(coord) + to;
50824+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50825+ extent_set_start(ext,
50826+ extent_get_start(ext) +
50827+ (extent_get_width(ext) -
50828+ (off >> PAGE_CACHE_SHIFT)));
50829+
50830+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
50831+ count--;
50832+ }
50833+ return count * sizeof(reiser4_extent);
50834+}
50835+
50836+/* item_plugin->b.unit_key */
50837+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
50838+{
50839+ assert("vs-300", coord_is_existing_unit(coord));
50840+
50841+ item_key_by_coord(coord, key);
50842+ set_key_offset(key,
50843+ (get_key_offset(key) +
50844+ reiser4_extent_size(coord, coord->unit_pos)));
50845+
50846+ return key;
50847+}
50848+
50849+/* item_plugin->b.max_unit_key */
50850+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
50851+{
50852+ assert("vs-300", coord_is_existing_unit(coord));
50853+
50854+ item_key_by_coord(coord, key);
50855+ set_key_offset(key,
50856+ (get_key_offset(key) +
50857+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
50858+ return key;
50859+}
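+
+/* Editor's note (illustrative, not part of the original patch): for a unit
+   of width 2 in an item whose key offset is 0, with 4KiB pages:
+
+	unit_key_extent() yields offset 0		(first byte of the unit)
+	max_unit_key_extent() yields offset 8191	(last byte: 2*4096 - 1)
+
+   unit_key_extent() adds reiser4_extent_size() of all units before
+   coord->unit_pos, so for unit 0 that sum is 0. */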
50860+
50861+/* item_plugin->b.estimate
50862+ item_plugin->b.item_data_by_flow */
50863+
50864+#if REISER4_DEBUG
50865+
50866+/* item_plugin->b.check
50867+ used for debugging, every item should have here the most complete
50868+ possible check of the consistency of the item that the inventor can
50869+ construct
50870+*/
50871+int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
50872+ const char **error /* where to store error message */)
50873+{
50874+ reiser4_extent *ext, *first;
50875+ unsigned i, j;
50876+ reiser4_block_nr start, width, blk_cnt;
50877+ unsigned num_units;
50878+ reiser4_tree *tree;
50879+ oid_t oid;
50880+ reiser4_key key;
50881+ coord_t scan;
50882+
50883+ assert("vs-933", REISER4_DEBUG);
50884+
50885+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
50886+ *error = "Extent on the wrong level";
50887+ return -1;
50888+ }
50889+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
50890+ *error = "Wrong item size";
50891+ return -1;
50892+ }
50893+ ext = first = extent_item(coord);
50894+ blk_cnt = reiser4_block_count(reiser4_get_current_sb());
50895+ num_units = coord_num_units(coord);
50896+ tree = znode_get_tree(coord->node);
50897+ item_key_by_coord(coord, &key);
50898+ oid = get_key_objectid(&key);
50899+ coord_dup(&scan, coord);
50900+
50901+ for (i = 0; i < num_units; ++i, ++ext) {
50902+ __u64 index;
50903+
50904+ scan.unit_pos = i;
50905+ index = extent_unit_index(&scan);
50906+
50907+#if 0
50908+ /* check that all jnodes are present for the unallocated
50909+ * extent */
50910+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50911+ for (j = 0; j < extent_get_width(ext); j++) {
50912+ jnode *node;
50913+
50914+ node = jlookup(tree, oid, index + j);
50915+ if (node == NULL) {
50916+ print_coord("scan", &scan, 0);
50917+ *error = "Jnode missing";
50918+ return -1;
50919+ }
50920+ jput(node);
50921+ }
50922+ }
50923+#endif
50924+
50925+ start = extent_get_start(ext);
50926+ if (start < 2)
50927+ continue;
50928+		/* extent is an allocated one */
50929+ width = extent_get_width(ext);
50930+ if (start >= blk_cnt) {
50931+ *error = "Start too large";
50932+ return -1;
50933+ }
50934+ if (start + width > blk_cnt) {
50935+ *error = "End too large";
50936+ return -1;
50937+ }
50938+ /* make sure that this extent does not overlap with other
50939+		   allocated extents */
50940+ for (j = 0; j < i; j++) {
50941+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
50942+ continue;
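+			/* editor's note: two half-open ranges [s, s+w)
+			   overlap unless one ends at or before the start of
+			   the other; the negated test below checks exactly
+			   that */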
50943+ if (!
50944+ ((extent_get_start(ext) >=
50945+ extent_get_start(first + j) +
50946+ extent_get_width(first + j))
50947+ || (extent_get_start(ext) +
50948+ extent_get_width(ext) <=
50949+ extent_get_start(first + j)))) {
50950+ *error = "Extent overlaps with others";
50951+ return -1;
50952+ }
50953+ }
50954+
50955+ }
50956+
50957+ return 0;
50958+}
50959+
50960+#endif /* REISER4_DEBUG */
50961+
50962+/*
50963+ Local variables:
50964+ c-indentation-style: "K&R"
50965+ mode-name: "LC"
50966+ c-basic-offset: 8
50967+ tab-width: 8
50968+ fill-column: 120
50969+ scroll-step: 1
50970+ End:
50971+*/
50972diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/internal.c linux-2.6.20/fs/reiser4/plugin/item/internal.c
50973--- linux-2.6.20.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
50974+++ linux-2.6.20/fs/reiser4/plugin/item/internal.c 2007-05-06 14:50:43.815011970 +0400
50975@@ -0,0 +1,396 @@
50976+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50977+
50978+/* Implementation of internal-item plugin methods. */
50979+
50980+#include "../../forward.h"
50981+#include "../../debug.h"
50982+#include "../../dformat.h"
50983+#include "../../key.h"
50984+#include "../../coord.h"
50985+#include "internal.h"
50986+#include "item.h"
50987+#include "../node/node.h"
50988+#include "../plugin.h"
50989+#include "../../jnode.h"
50990+#include "../../znode.h"
50991+#include "../../tree_walk.h"
50992+#include "../../tree_mod.h"
50993+#include "../../tree.h"
50994+#include "../../super.h"
50995+#include "../../block_alloc.h"
50996+
50997+/* see internal.h for explanation */
50998+
50999+/* plugin->u.item.b.mergeable */
51000+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51001+ const coord_t * p2 UNUSED_ARG /* second item */ )
51002+{
51003+ /* internal items are not mergeable */
51004+ return 0;
51005+}
51006+
51007+/* ->lookup() method for internal items */
51008+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51009+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51010+ coord_t * coord /* coord of item */ )
51011+{
51012+ reiser4_key ukey;
51013+
51014+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51015+ default:
51016+ impossible("", "keycmp()?!");
51017+ case LESS_THAN:
51018+		/* FIXME-VS: AFTER_ITEM used to be here. But with the new coord
51019+		   layout the item plugin cannot be obtained from a coord set this way */
51020+ assert("vs-681", coord->unit_pos == 0);
51021+ coord->between = AFTER_UNIT;
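+		/* fall through: an AFTER_UNIT coord still reports CBK_COORD_FOUND */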
51022+ case EQUAL_TO:
51023+ return CBK_COORD_FOUND;
51024+ case GREATER_THAN:
51025+ return CBK_COORD_NOTFOUND;
51026+ }
51027+}
51028+
51029+/* return body of internal item at @coord */
51030+static internal_item_layout *internal_at(const coord_t * coord /* coord of
51031+ * item */ )
51032+{
51033+ assert("nikita-607", coord != NULL);
51034+ assert("nikita-1650",
51035+ item_plugin_by_coord(coord) ==
51036+ item_plugin_by_id(NODE_POINTER_ID));
51037+ return (internal_item_layout *) item_body_by_coord(coord);
51038+}
51039+
51040+void reiser4_update_internal(const coord_t * coord,
51041+ const reiser4_block_nr * blocknr)
51042+{
51043+ internal_item_layout *item = internal_at(coord);
51044+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51045+
51046+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51047+}
51048+
51049+/* return child block number stored in the internal item at @coord */
51050+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51051+{
51052+ assert("nikita-608", coord != NULL);
51053+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51054+}
51055+
51056+/* get znode pointed to by internal @item */
51057+static znode *znode_at(const coord_t * item /* coord of item */ ,
51058+ znode * parent /* parent node */ )
51059+{
51060+ return child_znode(item, parent, 1, 0);
51061+}
51062+
51063+/* store pointer from internal item into "block". Implementation of
51064+ ->down_link() method */
51065+void down_link_internal(const coord_t * coord /* coord of item */ ,
51066+ const reiser4_key * key UNUSED_ARG /* key to get
51067+ * pointer for */ ,
51068+ reiser4_block_nr * block /* resulting block number */ )
51069+{
51070+ ON_DEBUG(reiser4_key item_key);
51071+
51072+ assert("nikita-609", coord != NULL);
51073+ assert("nikita-611", block != NULL);
51074+ assert("nikita-612", (key == NULL) ||
51075+ /* twig horrors */
51076+ (znode_get_level(coord->node) == TWIG_LEVEL)
51077+ || keyle(item_key_by_coord(coord, &item_key), key));
51078+
51079+ *block = pointer_at(coord);
51080+ assert("nikita-2960", reiser4_blocknr_is_sane(block));
51081+}
51082+
51083+/* Get the child's block number, or 0 if the block is unallocated. */
51084+int
51085+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51086+ reiser4_block_nr * block)
51087+{
51088+ assert("jmacd-2059", coord != NULL);
51089+
51090+ *block = pointer_at(coord);
51091+ assert("nikita-2961", reiser4_blocknr_is_sane(block));
51092+
51093+ if (reiser4_blocknr_is_fake(block)) {
51094+ *block = 0;
51095+ }
51096+
51097+ return 0;
51098+}
51099+
51100+/* Return the child. */
51101+int
51102+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51103+ jnode ** childp)
51104+{
51105+ reiser4_block_nr block = pointer_at(coord);
51106+ znode *child;
51107+
51108+ assert("jmacd-2059", childp != NULL);
51109+ assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51110+
51111+ child = zlook(znode_get_tree(coord->node), &block);
51112+
51113+ if (IS_ERR(child)) {
51114+ return PTR_ERR(child);
51115+ }
51116+
51117+ *childp = ZJNODE(child);
51118+
51119+ return 0;
51120+}
51121+
51122+#if REISER4_DEBUG
51123+
51124+static void check_link(znode * left, znode * right)
51125+{
51126+ znode *scan;
51127+
51128+ for (scan = left; scan != right; scan = scan->right) {
51129+ if (ZF_ISSET(scan, JNODE_RIP))
51130+ break;
51131+ if (znode_is_right_connected(scan) && scan->right != NULL) {
51132+ if (ZF_ISSET(scan->right, JNODE_RIP))
51133+ break;
51134+ assert("nikita-3285",
51135+ znode_is_left_connected(scan->right));
51136+ assert("nikita-3265",
51137+ ergo(scan != left,
51138+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51139+ assert("nikita-3284", scan->right->left == scan);
51140+ } else
51141+ break;
51142+ }
51143+}
51144+
51145+int check__internal(const coord_t * coord, const char **error)
51146+{
51147+ reiser4_block_nr blk;
51148+ znode *child;
51149+ coord_t cpy;
51150+
51151+ blk = pointer_at(coord);
51152+ if (!reiser4_blocknr_is_sane(&blk)) {
51153+ *error = "Invalid pointer";
51154+ return -1;
51155+ }
51156+ coord_dup(&cpy, coord);
51157+ child = znode_at(&cpy, cpy.node);
51158+ if (child != NULL) {
51159+ znode *left_child;
51160+ znode *right_child;
51161+
51162+ left_child = right_child = NULL;
51163+
51164+ assert("nikita-3256", znode_invariant(child));
51165+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51166+ left_child = znode_at(&cpy, cpy.node);
51167+ if (left_child != NULL) {
51168+ read_lock_tree(znode_get_tree(child));
51169+ check_link(left_child, child);
51170+ read_unlock_tree(znode_get_tree(child));
51171+ zput(left_child);
51172+ }
51173+ }
51174+ coord_dup(&cpy, coord);
51175+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51176+ right_child = znode_at(&cpy, cpy.node);
51177+ if (right_child != NULL) {
51178+ read_lock_tree(znode_get_tree(child));
51179+ check_link(child, right_child);
51180+ read_unlock_tree(znode_get_tree(child));
51181+ zput(right_child);
51182+ }
51183+ }
51184+ zput(child);
51185+ }
51186+ return 0;
51187+}
51188+
51189+#endif /* REISER4_DEBUG */
51190+
51191+/* return true only if this item really points to "block" */
51192+/* Audited by: green(2002.06.14) */
51193+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51194+ const reiser4_block_nr * block /* block number to
51195+ * check */ )
51196+{
51197+ assert("nikita-613", coord != NULL);
51198+ assert("nikita-614", block != NULL);
51199+
51200+ return pointer_at(coord) == *block;
51201+}
51202+
51203+/* hook called by ->create_item() method of node plugin after new internal
51204+ item was just created.
51205+
51206+   This is the point where the pointer to the new node is inserted into the tree.
51207+   Initialize the parent pointer in the child znode, insert the child into the sibling list and slum.
51208+
51209+*/
51210+int create_hook_internal(const coord_t * item /* coord of item */ ,
51211+ void *arg /* child's left neighbor, if any */ )
51212+{
51213+ znode *child;
51214+ __u64 child_ptr;
51215+
51216+ assert("nikita-1252", item != NULL);
51217+ assert("nikita-1253", item->node != NULL);
51218+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51219+ assert("nikita-1450", item->unit_pos == 0);
51220+
51221+	/*
51222+	 * While preparing for item insertion, build_child_ptr_data() sets the
51223+	 * pointer to the data to be inserted to the jnode's blocknr, which is
51224+	 * in CPU byte order. The node's create_item() simply copies that data,
51225+	 * so we end up with the child pointer in CPU byte order. Convert the
51226+	 * content of the internal item to little-endian byte order.
51227+	 */
51228+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51229+ reiser4_update_internal(item, &child_ptr);
51230+
51231+ child = znode_at(item, item->node);
51232+ if (child != NULL && !IS_ERR(child)) {
51233+ znode *left;
51234+ int result = 0;
51235+ reiser4_tree *tree;
51236+
51237+ left = arg;
51238+ tree = znode_get_tree(item->node);
51239+ write_lock_tree(tree);
51240+ write_lock_dk(tree);
51241+ assert("nikita-1400", (child->in_parent.node == NULL)
51242+ || (znode_above_root(child->in_parent.node)));
51243+ ++item->node->c_count;
51244+ coord_to_parent_coord(item, &child->in_parent);
51245+ sibling_list_insert_nolock(child, left);
51246+
51247+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51248+ ZF_CLR(child, JNODE_ORPHAN);
51249+
51250+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51251+ znode_get_rd_key(child))) {
51252+ znode_set_rd_key(child, znode_get_rd_key(left));
51253+ }
51254+ write_unlock_dk(tree);
51255+ write_unlock_tree(tree);
51256+ zput(child);
51257+ return result;
51258+ } else {
51259+ if (child == NULL)
51260+ child = ERR_PTR(-EIO);
51261+ return PTR_ERR(child);
51262+ }
51263+}
51264+
51265+/* hook called by ->cut_and_kill() method of node plugin just before internal
51266+ item is removed.
51267+
51268+   This is the point where an empty node is removed from the tree. Clear the
51269+   parent pointer in the child, and mark the node for pending deletion.
51270+
51271+   The node will actually be deleted later, in several stages:
51272+
51273+   . when the last lock on this node is released, the node is removed from
51274+   the sibling list and its lock is invalidated
51275+
51276+   . when the last reference to this node is dropped, the bitmap is updated
51277+   and the node is actually removed from memory.
51278+
51279+*/
51280+int kill_hook_internal(const coord_t * item /* coord of item */ ,
51281+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
51282+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51283+ struct carry_kill_data *p UNUSED_ARG)
51284+{
51285+ znode *child;
51286+
51287+ assert("nikita-1222", item != NULL);
51288+ assert("nikita-1224", from == 0);
51289+ assert("nikita-1225", count == 1);
51290+
51291+ child = znode_at(item, item->node);
51292+ if (IS_ERR(child))
51293+ return PTR_ERR(child);
51294+ else if (node_is_empty(child)) {
51295+ reiser4_tree *tree;
51296+
51297+ assert("nikita-1397", znode_is_write_locked(child));
51298+ assert("nikita-1398", child->c_count == 0);
51299+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51300+
51301+ tree = znode_get_tree(item->node);
51302+ write_lock_tree(tree);
51303+ init_parent_coord(&child->in_parent, NULL);
51304+ --item->node->c_count;
51305+ write_unlock_tree(tree);
51306+ zput(child);
51307+ return 0;
51308+ } else {
51309+ warning("nikita-1223",
51310+ "Cowardly refuse to remove link to non-empty node");
51311+ zput(child);
51312+ return RETERR(-EIO);
51313+ }
51314+}
51315+
51316+/* hook called by ->shift() node plugin method when internal item was just
51317+ moved from one node to another.
51318+
51319+ Update parent pointer in child and c_counts in old and new parent
51320+
51321+*/
51322+int shift_hook_internal(const coord_t * item /* coord of item */ ,
51323+ unsigned from UNUSED_ARG /* start unit */ ,
51324+ unsigned count UNUSED_ARG /* stop unit */ ,
51325+ znode * old_node /* old parent */ )
51326+{
51327+ znode *child;
51328+ znode *new_node;
51329+ reiser4_tree *tree;
51330+
51331+ assert("nikita-1276", item != NULL);
51332+ assert("nikita-1277", from == 0);
51333+ assert("nikita-1278", count == 1);
51334+ assert("nikita-1451", item->unit_pos == 0);
51335+
51336+ new_node = item->node;
51337+ assert("nikita-2132", new_node != old_node);
51338+ tree = znode_get_tree(item->node);
51339+ child = child_znode(item, old_node, 1, 0);
51340+ if (child == NULL)
51341+ return 0;
51342+ if (!IS_ERR(child)) {
51343+ write_lock_tree(tree);
51344+ ++new_node->c_count;
51345+ assert("nikita-1395", znode_parent(child) == old_node);
51346+ assert("nikita-1396", old_node->c_count > 0);
51347+ coord_to_parent_coord(item, &child->in_parent);
51348+ assert("nikita-1781", znode_parent(child) == new_node);
51349+ assert("nikita-1782",
51350+ check_tree_pointer(item, child) == NS_FOUND);
51351+ --old_node->c_count;
51352+ write_unlock_tree(tree);
51353+ zput(child);
51354+ return 0;
51355+ } else
51356+ return PTR_ERR(child);
51357+}
51358+
51359+/* plugin->u.item.b.max_key_inside - not defined */
51360+
51361+/* plugin->u.item.b.nr_units - item.c:single_unit */
51362+
51363+/* Make Linus happy.
51364+ Local variables:
51365+ c-indentation-style: "K&R"
51366+ mode-name: "LC"
51367+ c-basic-offset: 8
51368+ tab-width: 8
51369+ fill-column: 120
51370+ End:
51371+*/
51372diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/internal.h linux-2.6.20/fs/reiser4/plugin/item/internal.h
51373--- linux-2.6.20.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
51374+++ linux-2.6.20/fs/reiser4/plugin/item/internal.h 2007-05-06 14:50:43.815011970 +0400
51375@@ -0,0 +1,57 @@
51376+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51377+/* Internal item contains down-link to the child of the internal/twig
51378+ node in a tree. It is internal items that are actually used during
51379+ tree traversal. */
51380+
51381+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51382+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51383+
51384+#include "../../forward.h"
51385+#include "../../dformat.h"
51386+
51387+/* on-disk layout of internal item */
51388+typedef struct internal_item_layout {
51389+ /* 0 */ reiser4_dblock_nr pointer;
51390+ /* 4 */
51391+} internal_item_layout;
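+
+/* Editor's note (illustrative, not part of the original patch): the item
+   body is a single little-endian block number, accessed with unaligned
+   64-bit helpers as internal.c does:
+
+	blk = le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
+	put_unaligned(cpu_to_le64(blk), &item->pointer);
+*/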
51392+
51393+struct cut_list;
51394+
51395+int mergeable_internal(const coord_t * p1, const coord_t * p2);
51396+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51397+ coord_t * coord);
51398+/* store pointer from internal item into "block". Implementation of
51399+ ->down_link() method */
51400+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51401+ reiser4_block_nr * block);
51402+extern int has_pointer_to_internal(const coord_t * coord,
51403+ const reiser4_block_nr * block);
51404+extern int create_hook_internal(const coord_t * item, void *arg);
51405+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51406+ pos_in_node_t count, struct carry_kill_data *);
51407+extern int shift_hook_internal(const coord_t * item, unsigned from,
51408+ unsigned count, znode * old_node);
51409+extern void reiser4_print_internal(const char *prefix, coord_t * coord);
51410+
51411+extern int utmost_child_internal(const coord_t * coord, sideof side,
51412+ jnode ** child);
51413+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51414+ reiser4_block_nr * block);
51415+
51416+extern void reiser4_update_internal(const coord_t * coord,
51417+ const reiser4_block_nr * blocknr);
51418+/* FIXME: reiserfs has check_internal */
51419+extern int check__internal(const coord_t * coord, const char **error);
51420+
51421+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51422+#endif
51423+
51424+/* Make Linus happy.
51425+ Local variables:
51426+ c-indentation-style: "K&R"
51427+ mode-name: "LC"
51428+ c-basic-offset: 8
51429+ tab-width: 8
51430+ fill-column: 120
51431+ End:
51432+*/
51433diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/item.c linux-2.6.20/fs/reiser4/plugin/item/item.c
51434--- linux-2.6.20.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
51435+++ linux-2.6.20/fs/reiser4/plugin/item/item.c 2007-05-06 14:50:43.815011970 +0400
51436@@ -0,0 +1,719 @@
51437+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51438+
51439+/* definition of item plugins. */
51440+
51441+#include "../../forward.h"
51442+#include "../../debug.h"
51443+#include "../../key.h"
51444+#include "../../coord.h"
51445+#include "../plugin_header.h"
51446+#include "sde.h"
51447+#include "internal.h"
51448+#include "item.h"
51449+#include "static_stat.h"
51450+#include "../plugin.h"
51451+#include "../../znode.h"
51452+#include "../../tree.h"
51453+#include "../../context.h"
51454+#include "ctail.h"
51455+
51456+/* return pointer to item body */
51457+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51458+{
51459+ assert("nikita-324", coord != NULL);
51460+ assert("nikita-325", coord->node != NULL);
51461+ assert("nikita-326", znode_is_loaded(coord->node));
51462+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
51463+
51464+ coord->offset =
51465+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
51466+ zdata(coord->node);
51467+ ON_DEBUG(coord->body_v = coord->node->times_locked);
51468+}
51469+
51470+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51471+{
51472+ return zdata(coord->node) + coord->offset;
51473+}
51474+
51475+#if REISER4_DEBUG
51476+
51477+int item_body_is_valid(const coord_t * coord)
51478+{
51479+ return
51480+ coord->offset ==
51481+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
51482+ zdata(coord->node);
51483+}
51484+
51485+#endif
51486+
51487+/* return length of item at @coord */
51488+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51489+{
51490+ int len;
51491+
51492+ assert("nikita-327", coord != NULL);
51493+ assert("nikita-328", coord->node != NULL);
51494+ assert("nikita-329", znode_is_loaded(coord->node));
51495+
51496+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51497+ return len;
51498+}
51499+
51500+void obtain_item_plugin(const coord_t * coord)
51501+{
51502+ assert("nikita-330", coord != NULL);
51503+ assert("nikita-331", coord->node != NULL);
51504+ assert("nikita-332", znode_is_loaded(coord->node));
51505+
51506+ coord_set_iplug((coord_t *) coord,
51507+ node_plugin_by_node(coord->node)->
51508+ plugin_by_coord(coord));
51509+ assert("nikita-2479",
51510+ coord_iplug(coord) ==
51511+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51512+}
51513+
51514+/* return id of item */
51515+/* Audited by: green(2002.06.15) */
51516+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
51517+{
51518+ assert("vs-539", coord != NULL);
51519+ assert("vs-538", coord->node != NULL);
51520+ assert("vs-537", znode_is_loaded(coord->node));
51521+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
51522+ assert("vs-540",
51523+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
51524+
51525+ return item_id_by_plugin(item_plugin_by_coord(coord));
51526+}
51527+
51528+/* return key of item at @coord */
51529+/* Audited by: green(2002.06.15) */
51530+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
51531+ reiser4_key * key /* result */ )
51532+{
51533+ assert("nikita-338", coord != NULL);
51534+ assert("nikita-339", coord->node != NULL);
51535+ assert("nikita-340", znode_is_loaded(coord->node));
51536+
51537+ return node_plugin_by_node(coord->node)->key_at(coord, key);
51538+}
51539+
51540+/* this returns max key in the item */
51541+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
51542+ reiser4_key * key /* result */ )
51543+{
51544+ coord_t last;
51545+
51546+ assert("nikita-338", coord != NULL);
51547+ assert("nikita-339", coord->node != NULL);
51548+ assert("nikita-340", znode_is_loaded(coord->node));
51549+
51550+	/* make coord point to the item's last unit */
51551+ coord_dup(&last, coord);
51552+ last.unit_pos = coord_num_units(&last) - 1;
51553+ assert("vs-1560", coord_is_existing_unit(&last));
51554+
51555+ max_unit_key_by_coord(&last, key);
51556+ return key;
51557+}
51558+
51559+/* return key of unit at @coord */
51560+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51561+ reiser4_key * key /* result */ )
51562+{
51563+ assert("nikita-772", coord != NULL);
51564+ assert("nikita-774", coord->node != NULL);
51565+ assert("nikita-775", znode_is_loaded(coord->node));
51566+
51567+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
51568+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
51569+ else
51570+ return item_key_by_coord(coord, key);
51571+}
51572+
51573+/* return the biggest key contained in the unit @coord */
51574+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51575+ reiser4_key * key /* result */ )
51576+{
51577+ assert("nikita-772", coord != NULL);
51578+ assert("nikita-774", coord->node != NULL);
51579+ assert("nikita-775", znode_is_loaded(coord->node));
51580+
51581+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
51582+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
51583+ else
51584+ return unit_key_by_coord(coord, key);
51585+}
51586+
51587+/* ->max_key_inside() method for items consisting of exactly one key (like
51588+ stat-data) */
51589+static reiser4_key *max_key_inside_single_key(const coord_t *
51590+ coord /* coord of item */ ,
51591+ reiser4_key *
51592+ result /* resulting key */ )
51593+{
51594+ assert("nikita-604", coord != NULL);
51595+
51596+ /* coord -> key is starting key of this item and it has to be already
51597+ filled in */
51598+ return unit_key_by_coord(coord, result);
51599+}
51600+
51601+/* ->nr_units() method for items consisting of exactly one unit always */
51602+pos_in_node_t
51603+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
51604+{
51605+ return 1;
51606+}
51607+
51608+static int
51609+paste_no_paste(coord_t * coord UNUSED_ARG,
51610+ reiser4_item_data * data UNUSED_ARG,
51611+ carry_plugin_info * info UNUSED_ARG)
51612+{
51613+ return 0;
51614+}
51615+
51616+/* default ->fast_paste() method */
51617+static int
51618+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
51619+{
51620+ return 1;
51621+}
51622+
51623+int item_can_contain_key(const coord_t * item /* coord of item */ ,
51624+ const reiser4_key * key /* key to check */ ,
51625+ const reiser4_item_data * data /* parameters of item
51626+ * being created */ )
51627+{
51628+ item_plugin *iplug;
51629+ reiser4_key min_key_in_item;
51630+ reiser4_key max_key_in_item;
51631+
51632+ assert("nikita-1658", item != NULL);
51633+ assert("nikita-1659", key != NULL);
51634+
51635+ iplug = item_plugin_by_coord(item);
51636+ if (iplug->b.can_contain_key != NULL)
51637+ return iplug->b.can_contain_key(item, key, data);
51638+ else {
51639+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
51640+ item_key_by_coord(item, &min_key_in_item);
51641+ iplug->b.max_key_inside(item, &max_key_in_item);
51642+
51643+ /* can contain key if
51644+ min_key_in_item <= key &&
51645+ key <= max_key_in_item
51646+ */
51647+ return keyle(&min_key_in_item, key)
51648+ && keyle(key, &max_key_in_item);
51649+ }
51650+}
51651+
51652+/* mergeable method for non-mergeable items */
51653+static int
51654+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
51655+{
51656+ return 0;
51657+}
51658+
51659+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
51660+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
51661+ const coord_t * i2 /* coord of second item */ )
51662+{
51663+ item_plugin *iplug;
51664+ reiser4_key k1;
51665+ reiser4_key k2;
51666+
51667+ assert("nikita-1336", i1 != NULL);
51668+ assert("nikita-1337", i2 != NULL);
51669+
51670+ iplug = item_plugin_by_coord(i1);
51671+ assert("nikita-1338", iplug != NULL);
51672+
51673+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
51674+ shifting code when nodes are in "suspended" state. */
51675+ assert("nikita-1663",
51676+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
51677+
51678+ if (iplug->b.mergeable != NULL) {
51679+ return iplug->b.mergeable(i1, i2);
51680+ } else if (iplug->b.max_key_inside != NULL) {
51681+ iplug->b.max_key_inside(i1, &k1);
51682+ item_key_by_coord(i2, &k2);
51683+
51684+ /* mergeable if ->max_key_inside() >= key of i2; */
51685+ return keyge(iplug->b.max_key_inside(i1, &k1),
51686+ item_key_by_coord(i2, &k2));
51687+ } else {
51688+ item_key_by_coord(i1, &k1);
51689+ item_key_by_coord(i2, &k2);
51690+
51691+ return
51692+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
51693+ (get_key_objectid(&k1) == get_key_objectid(&k2))
51694+ && (iplug == item_plugin_by_coord(i2));
51695+ }
51696+}
51697+
51698+int item_is_extent(const coord_t * item)
51699+{
51700+ assert("vs-482", coord_is_existing_item(item));
51701+ return item_id_by_coord(item) == EXTENT_POINTER_ID;
51702+}
51703+
51704+int item_is_tail(const coord_t * item)
51705+{
51706+ assert("vs-482", coord_is_existing_item(item));
51707+ return item_id_by_coord(item) == FORMATTING_ID;
51708+}
51709+
51710+#if REISER4_DEBUG
51711+
51712+int item_is_statdata(const coord_t * item)
51713+{
51714+ assert("vs-516", coord_is_existing_item(item));
51715+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
51716+}
51717+
51718+int item_is_ctail(const coord_t * item)
51719+{
51720+ assert("edward-xx", coord_is_existing_item(item));
51721+ return item_id_by_coord(item) == CTAIL_ID;
51722+}
51723+
51724+#endif /* REISER4_DEBUG */
51725+
51726+static int change_item(struct inode *inode,
51727+ reiser4_plugin * plugin,
51728+ pset_member memb)
51729+{
51730+ /* cannot change constituent item (sd, or dir_item) */
51731+ return RETERR(-EINVAL);
51732+}
51733+
51734+static reiser4_plugin_ops item_plugin_ops = {
51735+ .init = NULL,
51736+ .load = NULL,
51737+ .save_len = NULL,
51738+ .save = NULL,
51739+ .change = change_item
51740+};
51741+
51742+item_plugin item_plugins[LAST_ITEM_ID] = {
51743+ [STATIC_STAT_DATA_ID] = {
51744+ .h = {
51745+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51746+ .id = STATIC_STAT_DATA_ID,
51747+ .groups = (1 << STAT_DATA_ITEM_TYPE),
51748+ .pops = &item_plugin_ops,
51749+ .label = "sd",
51750+ .desc = "stat-data",
51751+ .linkage = {NULL, NULL}
51752+ },
51753+ .b = {
51754+ .max_key_inside = max_key_inside_single_key,
51755+ .can_contain_key = NULL,
51756+ .mergeable = not_mergeable,
51757+ .nr_units = nr_units_single_unit,
51758+ .lookup = NULL,
51759+ .init = NULL,
51760+ .paste = paste_no_paste,
51761+ .fast_paste = NULL,
51762+ .can_shift = NULL,
51763+ .copy_units = NULL,
51764+ .create_hook = NULL,
51765+ .kill_hook = NULL,
51766+ .shift_hook = NULL,
51767+ .cut_units = NULL,
51768+ .kill_units = NULL,
51769+ .unit_key = NULL,
51770+ .max_unit_key = NULL,
51771+ .estimate = NULL,
51772+ .item_data_by_flow = NULL,
51773+#if REISER4_DEBUG
51774+ .check = NULL
51775+#endif
51776+ },
51777+ .f = {
51778+ .utmost_child = NULL,
51779+ .utmost_child_real_block = NULL,
51780+ .update = NULL,
51781+ .scan = NULL,
51782+ .convert = NULL
51783+ },
51784+ .s = {
51785+ .sd = {
51786+ .init_inode = init_inode_static_sd,
51787+ .save_len = save_len_static_sd,
51788+ .save = save_static_sd
51789+ }
51790+ }
51791+ },
51792+ [SIMPLE_DIR_ENTRY_ID] = {
51793+ .h = {
51794+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51795+ .id = SIMPLE_DIR_ENTRY_ID,
51796+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51797+ .pops = &item_plugin_ops,
51798+ .label = "de",
51799+ .desc = "directory entry",
51800+ .linkage = {NULL, NULL}
51801+ },
51802+ .b = {
51803+ .max_key_inside = max_key_inside_single_key,
51804+ .can_contain_key = NULL,
51805+ .mergeable = NULL,
51806+ .nr_units = nr_units_single_unit,
51807+ .lookup = NULL,
51808+ .init = NULL,
51809+ .paste = NULL,
51810+ .fast_paste = NULL,
51811+ .can_shift = NULL,
51812+ .copy_units = NULL,
51813+ .create_hook = NULL,
51814+ .kill_hook = NULL,
51815+ .shift_hook = NULL,
51816+ .cut_units = NULL,
51817+ .kill_units = NULL,
51818+ .unit_key = NULL,
51819+ .max_unit_key = NULL,
51820+ .estimate = NULL,
51821+ .item_data_by_flow = NULL,
51822+#if REISER4_DEBUG
51823+ .check = NULL
51824+#endif
51825+ },
51826+ .f = {
51827+ .utmost_child = NULL,
51828+ .utmost_child_real_block = NULL,
51829+ .update = NULL,
51830+ .scan = NULL,
51831+ .convert = NULL
51832+ },
51833+ .s = {
51834+ .dir = {
51835+ .extract_key = extract_key_de,
51836+ .update_key = update_key_de,
51837+ .extract_name = extract_name_de,
51838+ .extract_file_type = extract_file_type_de,
51839+ .add_entry = add_entry_de,
51840+ .rem_entry = rem_entry_de,
51841+ .max_name_len = max_name_len_de
51842+ }
51843+ }
51844+ },
51845+ [COMPOUND_DIR_ID] = {
51846+ .h = {
51847+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51848+ .id = COMPOUND_DIR_ID,
51849+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51850+ .pops = &item_plugin_ops,
51851+ .label = "cde",
51852+ .desc = "compressed directory entry",
51853+ .linkage = {NULL, NULL}
51854+ },
51855+ .b = {
51856+ .max_key_inside = max_key_inside_cde,
51857+ .can_contain_key = can_contain_key_cde,
51858+ .mergeable = mergeable_cde,
51859+ .nr_units = nr_units_cde,
51860+ .lookup = lookup_cde,
51861+ .init = init_cde,
51862+ .paste = paste_cde,
51863+ .fast_paste = agree_to_fast_op,
51864+ .can_shift = can_shift_cde,
51865+ .copy_units = copy_units_cde,
51866+ .create_hook = NULL,
51867+ .kill_hook = NULL,
51868+ .shift_hook = NULL,
51869+ .cut_units = cut_units_cde,
51870+ .kill_units = kill_units_cde,
51871+ .unit_key = unit_key_cde,
51872+ .max_unit_key = unit_key_cde,
51873+ .estimate = estimate_cde,
51874+ .item_data_by_flow = NULL,
51875+#if REISER4_DEBUG
51876+ .check = reiser4_check_cde
51877+#endif
51878+ },
51879+ .f = {
51880+ .utmost_child = NULL,
51881+ .utmost_child_real_block = NULL,
51882+ .update = NULL,
51883+ .scan = NULL,
51884+ .convert = NULL
51885+ },
51886+ .s = {
51887+ .dir = {
51888+ .extract_key = extract_key_cde,
51889+ .update_key = update_key_cde,
51890+ .extract_name = extract_name_cde,
51891+ .extract_file_type = extract_file_type_de,
51892+ .add_entry = add_entry_cde,
51893+ .rem_entry = rem_entry_cde,
51894+ .max_name_len = max_name_len_cde
51895+ }
51896+ }
51897+ },
51898+ [NODE_POINTER_ID] = {
51899+ .h = {
51900+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51901+ .id = NODE_POINTER_ID,
51902+ .groups = (1 << INTERNAL_ITEM_TYPE),
51903+ .pops = NULL,
51904+ .label = "internal",
51905+ .desc = "internal item",
51906+ .linkage = {NULL, NULL}
51907+ },
51908+ .b = {
51909+ .max_key_inside = NULL,
51910+ .can_contain_key = NULL,
51911+ .mergeable = mergeable_internal,
51912+ .nr_units = nr_units_single_unit,
51913+ .lookup = lookup_internal,
51914+ .init = NULL,
51915+ .paste = NULL,
51916+ .fast_paste = NULL,
51917+ .can_shift = NULL,
51918+ .copy_units = NULL,
51919+ .create_hook = create_hook_internal,
51920+ .kill_hook = kill_hook_internal,
51921+ .shift_hook = shift_hook_internal,
51922+ .cut_units = NULL,
51923+ .kill_units = NULL,
51924+ .unit_key = NULL,
51925+ .max_unit_key = NULL,
51926+ .estimate = NULL,
51927+ .item_data_by_flow = NULL,
51928+#if REISER4_DEBUG
51929+ .check = check__internal
51930+#endif
51931+ },
51932+ .f = {
51933+ .utmost_child = utmost_child_internal,
51934+ .utmost_child_real_block =
51935+ utmost_child_real_block_internal,
51936+ .update = reiser4_update_internal,
51937+ .scan = NULL,
51938+ .convert = NULL
51939+ },
51940+ .s = {
51941+ .internal = {
51942+ .down_link = down_link_internal,
51943+ .has_pointer_to = has_pointer_to_internal
51944+ }
51945+ }
51946+ },
51947+ [EXTENT_POINTER_ID] = {
51948+ .h = {
51949+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51950+ .id = EXTENT_POINTER_ID,
51951+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51952+ .pops = NULL,
51953+ .label = "extent",
51954+ .desc = "extent item",
51955+ .linkage = {NULL, NULL}
51956+ },
51957+ .b = {
51958+ .max_key_inside = max_key_inside_extent,
51959+ .can_contain_key = can_contain_key_extent,
51960+ .mergeable = mergeable_extent,
51961+ .nr_units = nr_units_extent,
51962+ .lookup = lookup_extent,
51963+ .init = NULL,
51964+ .paste = paste_extent,
51965+ .fast_paste = agree_to_fast_op,
51966+ .can_shift = can_shift_extent,
51967+ .create_hook = create_hook_extent,
51968+ .copy_units = copy_units_extent,
51969+ .kill_hook = kill_hook_extent,
51970+ .shift_hook = NULL,
51971+ .cut_units = cut_units_extent,
51972+ .kill_units = kill_units_extent,
51973+ .unit_key = unit_key_extent,
51974+ .max_unit_key = max_unit_key_extent,
51975+ .estimate = NULL,
51976+ .item_data_by_flow = NULL,
51977+#if REISER4_DEBUG
51978+ .check = reiser4_check_extent
51979+#endif
51980+ },
51981+ .f = {
51982+ .utmost_child = utmost_child_extent,
51983+ .utmost_child_real_block =
51984+ utmost_child_real_block_extent,
51985+ .update = NULL,
51986+ .scan = reiser4_scan_extent,
51987+ .convert = NULL,
51988+ .key_by_offset = key_by_offset_extent
51989+ },
51990+ .s = {
51991+ .file = {
51992+ .write = reiser4_write_extent,
51993+ .read = reiser4_read_extent,
51994+ .readpage = reiser4_readpage_extent,
51995+ .get_block = get_block_address_extent,
51996+ .append_key = append_key_extent,
51997+ .init_coord_extension =
51998+ init_coord_extension_extent
51999+ }
52000+ }
52001+ },
52002+ [FORMATTING_ID] = {
52003+ .h = {
52004+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52005+ .id = FORMATTING_ID,
52006+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52007+ .pops = NULL,
52008+ .label = "body",
52009+ .desc = "body (or tail?) item",
52010+ .linkage = {NULL, NULL}
52011+ },
52012+ .b = {
52013+ .max_key_inside = max_key_inside_tail,
52014+ .can_contain_key = can_contain_key_tail,
52015+ .mergeable = mergeable_tail,
52016+ .nr_units = nr_units_tail,
52017+ .lookup = lookup_tail,
52018+ .init = NULL,
52019+ .paste = paste_tail,
52020+ .fast_paste = agree_to_fast_op,
52021+ .can_shift = can_shift_tail,
52022+ .create_hook = NULL,
52023+ .copy_units = copy_units_tail,
52024+ .kill_hook = kill_hook_tail,
52025+ .shift_hook = NULL,
52026+ .cut_units = cut_units_tail,
52027+ .kill_units = kill_units_tail,
52028+ .unit_key = unit_key_tail,
52029+ .max_unit_key = unit_key_tail,
52030+ .estimate = NULL,
52031+ .item_data_by_flow = NULL,
52032+#if REISER4_DEBUG
52033+ .check = NULL
52034+#endif
52035+ },
52036+ .f = {
52037+ .utmost_child = NULL,
52038+ .utmost_child_real_block = NULL,
52039+ .update = NULL,
52040+ .scan = NULL,
52041+ .convert = NULL
52042+ },
52043+ .s = {
52044+ .file = {
52045+ .write = reiser4_write_tail,
52046+ .read = reiser4_read_tail,
52047+ .readpage = readpage_tail,
52048+ .get_block = get_block_address_tail,
52049+ .append_key = append_key_tail,
52050+ .init_coord_extension =
52051+ init_coord_extension_tail
52052+ }
52053+ }
52054+ },
52055+ [CTAIL_ID] = {
52056+ .h = {
52057+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52058+ .id = CTAIL_ID,
52059+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52060+ .pops = NULL,
52061+ .label = "ctail",
52062+ .desc = "cryptcompress tail item",
52063+ .linkage = {NULL, NULL}
52064+ },
52065+ .b = {
52066+ .max_key_inside = max_key_inside_tail,
52067+ .can_contain_key = can_contain_key_ctail,
52068+ .mergeable = mergeable_ctail,
52069+ .nr_units = nr_units_ctail,
52070+ .lookup = NULL,
52071+ .init = init_ctail,
52072+ .paste = paste_ctail,
52073+ .fast_paste = agree_to_fast_op,
52074+ .can_shift = can_shift_ctail,
52075+ .create_hook = create_hook_ctail,
52076+ .copy_units = copy_units_ctail,
52077+ .kill_hook = kill_hook_ctail,
52078+ .shift_hook = shift_hook_ctail,
52079+ .cut_units = cut_units_ctail,
52080+ .kill_units = kill_units_ctail,
52081+ .unit_key = unit_key_tail,
52082+ .max_unit_key = unit_key_tail,
52083+ .estimate = estimate_ctail,
52084+ .item_data_by_flow = NULL,
52085+#if REISER4_DEBUG
52086+ .check = check_ctail
52087+#endif
52088+ },
52089+ .f = {
52090+ .utmost_child = utmost_child_ctail,
52091+ /* FIXME-EDWARD: write this */
52092+ .utmost_child_real_block = NULL,
52093+ .update = NULL,
52094+ .scan = scan_ctail,
52095+ .convert = convert_ctail
52096+ },
52097+ .s = {
52098+ .file = {
52099+ .write = NULL,
52100+ .read = read_ctail,
52101+ .readpage = readpage_ctail,
52102+ .get_block = get_block_address_tail,
52103+ .append_key = append_key_ctail,
52104+ .init_coord_extension =
52105+ init_coord_extension_tail
52106+ }
52107+ }
52108+ },
52109+ [BLACK_BOX_ID] = {
52110+ .h = {
52111+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52112+ .id = BLACK_BOX_ID,
52113+ .groups = (1 << OTHER_ITEM_TYPE),
52114+ .pops = NULL,
52115+ .label = "blackbox",
52116+ .desc = "black box item",
52117+ .linkage = {NULL, NULL}
52118+ },
52119+ .b = {
52120+ .max_key_inside = NULL,
52121+ .can_contain_key = NULL,
52122+ .mergeable = not_mergeable,
52123+ .nr_units = nr_units_single_unit,
52124+			/* no need for ->lookup method */
52125+ .lookup = NULL,
52126+ .init = NULL,
52127+ .paste = NULL,
52128+ .fast_paste = NULL,
52129+ .can_shift = NULL,
52130+ .copy_units = NULL,
52131+ .create_hook = NULL,
52132+ .kill_hook = NULL,
52133+ .shift_hook = NULL,
52134+ .cut_units = NULL,
52135+ .kill_units = NULL,
52136+ .unit_key = NULL,
52137+ .max_unit_key = NULL,
52138+ .estimate = NULL,
52139+ .item_data_by_flow = NULL,
52140+#if REISER4_DEBUG
52141+ .check = NULL
52142+#endif
52143+ }
52144+ }
52145+};
52146+
52147+/* Make Linus happy.
52148+ Local variables:
52149+ c-indentation-style: "K&R"
52150+ mode-name: "LC"
52151+ c-basic-offset: 8
52152+ tab-width: 8
52153+ fill-column: 120
52154+ End:
52155+*/
52156diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/item.h linux-2.6.20/fs/reiser4/plugin/item/item.h
52157--- linux-2.6.20.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
52158+++ linux-2.6.20/fs/reiser4/plugin/item/item.h 2007-05-06 14:50:43.819013220 +0400
52159@@ -0,0 +1,400 @@
52160+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52161+
52162+/* first read balance.c comments before reading this */
52163+
52164+/* An item_plugin implements all of the operations required for
52165+ balancing that are item specific. */
52166+
52167+/* an item plugin also implements other operations that are specific to that
52168+ item. These go into the item specific operations portion of the item
52169+ handler, and all of the item specific portions of the item handler are put
52170+ into a union. */
52171+
52172+#if !defined( __REISER4_ITEM_H__ )
52173+#define __REISER4_ITEM_H__
52174+
52175+#include "../../forward.h"
52176+#include "../plugin_header.h"
52177+#include "../../dformat.h"
52178+#include "../../seal.h"
52179+#include "../../plugin/file/file.h"
52180+
52181+#include <linux/fs.h> /* for struct file, struct inode */
52182+#include <linux/mm.h> /* for struct page */
52183+#include <linux/dcache.h> /* for struct dentry */
52184+
52185+typedef enum {
52186+ STAT_DATA_ITEM_TYPE,
52187+ DIR_ENTRY_ITEM_TYPE,
52188+ INTERNAL_ITEM_TYPE,
52189+ UNIX_FILE_METADATA_ITEM_TYPE,
52190+ OTHER_ITEM_TYPE
52191+} item_type_id;
52192+
52193+/* this is the part of each item plugin that all items are expected to
52194+ support or at least explicitly fail to support by setting the
52195+ pointer to null. */
52196+typedef struct {
52197+ /* operations called by balancing
52198+
52199+ It is interesting to consider that some of these item
52200+ operations could be given sources or targets that are not
52201+ really items in nodes. This could be ok/useful.
52202+
52203+ */
52204+ /* maximal key that can _possibly_ be occupied by this item
52205+
52206+	   When inserting, the node ->lookup() method (called by
52207+	   coord_by_key()) reaches an item after binary search, and
52208+	   the ->max_key_inside() item plugin method is used to determine
52209+	   whether the new data should be pasted into the existing item
52210+	   (new_key<=max_key_inside()) or a new item has to be created
52211+	   (new_key>max_key_inside()).
52212+
52213+ For items that occupy exactly one key (like stat-data)
52214+ this method should return this key. For items that can
52215+ grow indefinitely (extent, directory item) this should
52216+ return reiser4_max_key().
52217+
52218+	   For example, an extent with the key
52219+
52220+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52221+
52222+	   ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
52223+ */
52224+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52225+
52226+ /* true if item @coord can merge data at @key. */
52227+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
52228+ const reiser4_item_data *);
52229+ /* mergeable() - check items for mergeability
52230+
52231+ Optional method. Returns true if two items can be merged.
52232+
52233+ */
52234+ int (*mergeable) (const coord_t *, const coord_t *);
52235+
52236+ /* number of atomic things in an item.
52237+ NOTE FOR CONTRIBUTORS: use a generic method
52238+ nr_units_single_unit() for solid (atomic) items, as
52239+ tree operations use it as a criterion of solidness
52240+ (see is_solid_item macro) */
52241+ pos_in_node_t(*nr_units) (const coord_t *);
52242+
52243+	/* search for a unit within the item and return a
52244+	   pointer to it. This can be used to calculate how many
52245+	   bytes to shrink an item: use pointer arithmetic and
52246+	   compare to the start of the item body, provided the item's
52247+	   data are contiguous in the node. If the item's data are not
52248+	   contiguous in the node, all sorts of other things are likely
52249+	   to break as well. */
52250+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52251+	/* method called by node_plugin->create_item() to initialise new
52252+ item */
52253+ int (*init) (coord_t * target, coord_t * from,
52254+ reiser4_item_data * data);
52255+ /* method called (e.g., by reiser4_resize_item()) to place new data
52256+ into item when it grows */
52257+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52258+ /* return true if paste into @coord is allowed to skip
52259+	   carry. That is, if such a paste would not require any changes
52260+	   at the parent level
52261+ */
52262+ int (*fast_paste) (const coord_t *);
52263+ /* how many but not more than @want units of @source can be
52264+ shifted into @target node. If pend == append - we try to
52265+ append last item of @target by first units of @source. If
52266+ pend == prepend - we try to "prepend" first item in @target
52267+ by last units of @source. @target node has @free_space
52268+ bytes of free space. Total size of those units are returned
52269+ via @size.
52270+
52271+	   @target is not NULL if shifting into a mergeable item, and
52272+	   NULL if a new item will be created during shifting.
52273+ */
52274+ int (*can_shift) (unsigned free_space, coord_t *,
52275+ znode *, shift_direction, unsigned *size,
52276+ unsigned want);
52277+
52278+ /* starting off @from-th unit of item @source append or
52279+	/* starting at the @from-th unit of item @source, append or
52280+	   prepend @count units to @target. @target has already been
52281+	   expanded by @free_space bytes, which must be exactly what is
52282+	   needed for those units in @target. If @where_is_free_space
52283+	   == SHIFT_LEFT, the free space is at the end of the @target item,
52284+	   otherwise it is at the beginning of it. */
52285+ unsigned from, unsigned count,
52286+ shift_direction where_is_free_space,
52287+ unsigned free_space);
52288+
52289+ int (*create_hook) (const coord_t *, void *);
52290+ /* do whatever is necessary to do when @count units starting
52291+ from @from-th one are removed from the tree */
52292+	/* FIXME-VS: this used to be here for, in particular,
52293+ extents and items of internal type to free blocks they point
52294+ to at the same time with removing items from a
52295+ tree. Problems start, however, when dealloc_block fails due
52296+ to some reason. Item gets removed, but blocks it pointed to
52297+ are not freed. It is not clear how to fix this for items of
52298+ internal type because a need to remove internal item may
52299+ appear in the middle of balancing, and there is no way to
52300+ undo changes made. OTOH, if space allocator involves
52301+ balancing to perform dealloc_block - this will probably
52302+ break balancing due to deadlock issues
52303+ */
52304+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
52305+ pos_in_node_t count, struct carry_kill_data *);
52306+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52307+ znode * _node);
52308+
52309+	/* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key,
52310+	   including the boundaries. When units are cut from the item beginning - move the freed space to the head of the
52311+	   item. When units are cut from the item end - move the freed space to the item end. When units are cut from the
52312+	   middle of the item - move the freed space to the item head. Return the amount of space which got freed. Save the
52313+	   smallest removed key in @smallest_removed if it is not 0. Save the new first item key in @new_first_key if it is not 0.
52314+	 */
52315+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52316+ struct carry_cut_data *,
52317+ reiser4_key * smallest_removed,
52318+ reiser4_key * new_first_key);
52319+
52320+ /* like cut_units, except that these units are removed from the
52321+ tree, not only from a node */
52322+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52323+ struct carry_kill_data *,
52324+ reiser4_key * smallest_removed,
52325+ reiser4_key * new_first);
52326+
52327+	/* if @key_of_coord == 1 - the key of the coord is returned;
52328+	   otherwise - the key of the unit is returned. If @coord is not
52329+	   set to a certain unit - ERR_PTR(-ENOENT) is returned */
52330+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52331+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52332+	/* estimate how much space is needed to paste @data into the item
52333+	   at @coord. If @coord == 0 - estimate insertion; otherwise -
52334+	   estimate pasting
52335+	 */
52336+ int (*estimate) (const coord_t *, const reiser4_item_data *);
52337+
52338+ /* converts flow @f to item data. @coord == 0 on insert */
52339+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
52340+ reiser4_item_data *);
52341+
52342+ /*void (*show) (struct seq_file *, coord_t *); */
52343+
52344+#if REISER4_DEBUG
52345+	/* used for debugging; every item should provide here the most
52346+	   complete consistency check of the item that its author
52347+	   can construct */
52348+ int (*check) (const coord_t *, const char **error);
52349+#endif
52350+
52351+} balance_ops;
52352+
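The ->can_shift()/->copy_units() pair above is the heart of node balancing: can_shift() reports how many units fit and their total byte size via @size, and copy_units() then relies on the target having been expanded by exactly that amount. What follows is a standalone toy sketch of that contract, not reiser4 code: it assumes a hypothetical item whose units are fixed-size records, and UNIT_SIZE and all names are illustrative.

#include <stdio.h>

#define UNIT_SIZE 24	/* assumed size of one unit; illustration only */

/* toy can_shift(): how many of the @want units fit into @free_space bytes? */
static unsigned toy_can_shift(unsigned free_space, unsigned want,
			      unsigned *size)
{
	unsigned fit = free_space / UNIT_SIZE;

	if (fit > want)
		fit = want;
	*size = fit * UNIT_SIZE;	/* exactly what copy_units() will need */
	return fit;
}

int main(void)
{
	unsigned size;
	unsigned n = toy_can_shift(100, 5, &size);

	/* 100 / 24 = 4 units fit (96 bytes): the caller grows the target
	   by 96 bytes before the real copy_units() would run */
	printf("can shift %u of 5 wanted units (%u bytes)\n", n, size);
	return 0;
}
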
52353+typedef struct {
52354+ /* return the right or left child of @coord, only if it is in memory */
52355+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52356+
52357+ /* return whether the right or left child of @coord has a non-fake
52358+ block number. */
52359+ int (*utmost_child_real_block) (const coord_t *, sideof side,
52360+ reiser4_block_nr *);
52361+ /* relocate child at @coord to the @block */
52362+ void (*update) (const coord_t *, const reiser4_block_nr *);
52363+	/* count unformatted nodes per item for the leaf relocation policy, etc. */
52364+ int (*scan) (flush_scan * scan);
52365+ /* convert item by flush */
52366+ int (*convert) (flush_pos_t * pos);
52367+ /* backward mapping from jnode offset to a key. */
52368+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52369+} flush_ops;
52370+
52371+/* operations specific to the directory item */
52372+typedef struct {
52373+ /* extract stat-data key from directory entry at @coord and place it
52374+ into @key. */
52375+ int (*extract_key) (const coord_t *, reiser4_key * key);
52376+ /* update object key in item. */
52377+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52378+ /* extract name from directory entry at @coord and return it */
52379+ char *(*extract_name) (const coord_t *, char *buf);
52380+ /* extract file type (DT_* stuff) from directory entry at @coord and
52381+ return it */
52382+ unsigned (*extract_file_type) (const coord_t *);
52383+ int (*add_entry) (struct inode * dir,
52384+ coord_t *, lock_handle *,
52385+ const struct dentry * name,
52386+ reiser4_dir_entry_desc * entry);
52387+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
52388+ coord_t *, lock_handle *,
52389+ reiser4_dir_entry_desc * entry);
52390+ int (*max_name_len) (const struct inode * dir);
52391+} dir_entry_ops;
52392+
52393+/* operations specific to the items that regular (unix) file metadata are built of */
52394+typedef struct {
52395+ int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52396+ int (*read) (struct file *, flow_t *, hint_t *);
52397+ int (*readpage) (void *, struct page *);
52398+ int (*get_block) (const coord_t *, sector_t, sector_t *);
52399+ /*
52400+	 * key of the first byte which is not addressed by the item @coord
52401+	 * is set to.
52402+ * For example, for extent item with the key
52403+ *
52404+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52405+ *
52406+ * ->append_key is
52407+ *
52408+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52409+ */
52410+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52411+
52412+ void (*init_coord_extension) (uf_coord_t *, loff_t);
52413+} file_ops;
52414+
52415+/* operations specific to items of stat data type */
52416+typedef struct {
52417+ int (*init_inode) (struct inode * inode, char *sd, int len);
52418+ int (*save_len) (struct inode * inode);
52419+ int (*save) (struct inode * inode, char **area);
52420+} sd_ops;
52421+
52422+/* operations specific to internal item */
52423+typedef struct {
52424+	/* all that tree traversal wants to know from an internal item
52425+	   is where to go next. */
52426+ void (*down_link) (const coord_t * coord,
52427+ const reiser4_key * key, reiser4_block_nr * block);
52428+ /* check that given internal item contains given pointer. */
52429+ int (*has_pointer_to) (const coord_t * coord,
52430+ const reiser4_block_nr * block);
52431+} internal_item_ops;
52432+
52433+struct item_plugin {
52434+ /* generic fields */
52435+ plugin_header h;
52436+
52437+ /* methods common for all item types */
52438+ balance_ops b;
52439+ /* methods used during flush */
52440+ flush_ops f;
52441+
52442+ /* methods specific to particular type of item */
52443+ union {
52444+ dir_entry_ops dir;
52445+ file_ops file;
52446+ sd_ops sd;
52447+ internal_item_ops internal;
52448+ } s;
52449+
52450+};
52451+
52452+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
52453+
52454+static inline item_id item_id_by_plugin(item_plugin * plugin)
52455+{
52456+ return plugin->h.id;
52457+}
52458+
52459+static inline char get_iplugid(item_plugin * iplug)
52460+{
52461+ assert("nikita-2838", iplug != NULL);
52462+ assert("nikita-2839", iplug->h.id < 0xff);
52463+ return (char)item_id_by_plugin(iplug);
52464+}
52465+
52466+extern unsigned long znode_times_locked(const znode * z);
52467+
52468+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52469+{
52470+ assert("nikita-2837", coord != NULL);
52471+ assert("nikita-2838", iplug != NULL);
52472+ coord->iplugid = get_iplugid(iplug);
52473+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
52474+}
52475+
52476+static inline item_plugin *coord_iplug(const coord_t * coord)
52477+{
52478+ assert("nikita-2833", coord != NULL);
52479+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
52480+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52481+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52482+ coord->iplugid);
52483+}
52484+
52485+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52486+ const reiser4_item_data *);
52487+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52488+extern int item_is_extent(const coord_t *);
52489+extern int item_is_tail(const coord_t *);
52490+extern int item_is_statdata(const coord_t * item);
52491+extern int item_is_ctail(const coord_t *);
52492+
52493+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52494+extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
52495+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52496+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52497+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52498+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52499+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52500+ reiser4_key * key);
52501+extern void obtain_item_plugin(const coord_t * coord);
52502+
52503+#if defined(REISER4_DEBUG)
52504+extern int znode_is_loaded(const znode * node);
52505+#endif
52506+
52507+/* return plugin of item at @coord */
52508+static inline item_plugin *item_plugin_by_coord(const coord_t *
52509+ coord /* coord to query */ )
52510+{
52511+ assert("nikita-330", coord != NULL);
52512+ assert("nikita-331", coord->node != NULL);
52513+ assert("nikita-332", znode_is_loaded(coord->node));
52514+
52515+ if (unlikely(!coord_is_iplug_set(coord)))
52516+ obtain_item_plugin(coord);
52517+ return coord_iplug(coord);
52518+}
52519+
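As a standalone illustration of the dispatch that item_plugin_by_coord() enables (all names below are toy stand-ins, not reiser4 types): a coord caches a plugin id, a lookup helper maps it to an ops table, and callers indirect through the table's function pointers.

#include <stdio.h>

struct toy_coord;

struct toy_item_plugin {
	int (*nr_units)(const struct toy_coord *);
};

/* a single-unit item, in the spirit of stat-data */
static int single_unit(const struct toy_coord *coord)
{
	(void)coord;
	return 1;
}

static struct toy_item_plugin toy_plugins[] = {
	{ .nr_units = single_unit },
};

struct toy_coord {
	int iplugid;	/* cached plugin id, like coord->iplugid */
};

static struct toy_item_plugin *toy_plugin_by_coord(const struct toy_coord *c)
{
	/* the real helper may first resolve the id lazily from the node */
	return &toy_plugins[c->iplugid];
}

int main(void)
{
	struct toy_coord c = { .iplugid = 0 };

	printf("nr_units: %d\n", toy_plugin_by_coord(&c)->nr_units(&c));
	return 0;
}
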
52520+/* this returns true if item is of internal type */
52521+static inline int item_is_internal(const coord_t * item)
52522+{
52523+ assert("vs-483", coord_is_existing_item(item));
52524+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
52525+}
52526+
52527+extern void item_body_by_coord_hard(coord_t * coord);
52528+extern void *item_body_by_coord_easy(const coord_t * coord);
52529+#if REISER4_DEBUG
52530+extern int item_body_is_valid(const coord_t * coord);
52531+#endif
52532+
52533+/* return pointer to item body */
52534+static inline void *item_body_by_coord(const coord_t *
52535+ coord /* coord to query */ )
52536+{
52537+ assert("nikita-324", coord != NULL);
52538+ assert("nikita-325", coord->node != NULL);
52539+ assert("nikita-326", znode_is_loaded(coord->node));
52540+
52541+ if (coord->offset == INVALID_OFFSET)
52542+ item_body_by_coord_hard((coord_t *) coord);
52543+ assert("nikita-3201", item_body_is_valid(coord));
52544+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
52545+ return item_body_by_coord_easy(coord);
52546+}
52547+
52548+/* __REISER4_ITEM_H__ */
52549+#endif
52550+/* Make Linus happy.
52551+ Local variables:
52552+ c-indentation-style: "K&R"
52553+ mode-name: "LC"
52554+ c-basic-offset: 8
52555+ tab-width: 8
52556+ fill-column: 120
52557+ scroll-step: 1
52558+ End:
52559+*/
52560diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/Makefile linux-2.6.20/fs/reiser4/plugin/item/Makefile
52561--- linux-2.6.20.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
52562+++ linux-2.6.20/fs/reiser4/plugin/item/Makefile 2007-05-06 14:50:43.819013220 +0400
52563@@ -0,0 +1,18 @@
52564+obj-$(CONFIG_REISER4_FS) += item_plugins.o
52565+
52566+item_plugins-objs := \
52567+ item.o \
52568+ static_stat.o \
52569+ sde.o \
52570+ cde.o \
52571+ blackbox.o \
52572+ internal.o \
52573+ tail.o \
52574+ ctail.o \
52575+ extent.o \
52576+ extent_item_ops.o \
52577+ extent_file_ops.o \
52578+ extent_flush_ops.o
52579+
52580+
52581+
52582diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/sde.c linux-2.6.20/fs/reiser4/plugin/item/sde.c
52583--- linux-2.6.20.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
52584+++ linux-2.6.20/fs/reiser4/plugin/item/sde.c 2007-05-06 14:50:43.819013220 +0400
52585@@ -0,0 +1,190 @@
52586+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52587+
52588+/* Directory entry implementation */
52589+#include "../../forward.h"
52590+#include "../../debug.h"
52591+#include "../../dformat.h"
52592+#include "../../kassign.h"
52593+#include "../../coord.h"
52594+#include "sde.h"
52595+#include "item.h"
52596+#include "../plugin.h"
52597+#include "../../znode.h"
52598+#include "../../carry.h"
52599+#include "../../tree.h"
52600+#include "../../inode.h"
52601+
52602+#include <linux/fs.h> /* for struct inode */
52603+#include <linux/dcache.h> /* for struct dentry */
52604+#include <linux/quotaops.h>
52605+
52606+/* ->extract_key() method of simple directory item plugin. */
52607+int extract_key_de(const coord_t * coord /* coord of item */ ,
52608+ reiser4_key * key /* resulting key */ )
52609+{
52610+ directory_entry_format *dent;
52611+
52612+ assert("nikita-1458", coord != NULL);
52613+ assert("nikita-1459", key != NULL);
52614+
52615+ dent = (directory_entry_format *) item_body_by_coord(coord);
52616+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
52617+ return extract_key_from_id(&dent->id, key);
52618+}
52619+
52620+int
52621+update_key_de(const coord_t * coord, const reiser4_key * key,
52622+ lock_handle * lh UNUSED_ARG)
52623+{
52624+ directory_entry_format *dent;
52625+ obj_key_id obj_id;
52626+ int result;
52627+
52628+ assert("nikita-2342", coord != NULL);
52629+ assert("nikita-2343", key != NULL);
52630+
52631+ dent = (directory_entry_format *) item_body_by_coord(coord);
52632+ result = build_obj_key_id(key, &obj_id);
52633+ if (result == 0) {
52634+ dent->id = obj_id;
52635+ znode_make_dirty(coord->node);
52636+ }
52637+	return result;
52638+}
52639+
52640+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
52641+ char *buf)
52642+{
52643+ reiser4_key key;
52644+
52645+ unit_key_by_coord(coord, &key);
52646+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
52647+ reiser4_print_address("oops", znode_get_block(coord->node));
52648+ if (!is_longname_key(&key)) {
52649+ if (is_dot_key(&key))
52650+ return (char *)".";
52651+ else
52652+ return extract_name_from_key(&key, buf);
52653+ } else
52654+ return (char *)dent->name;
52655+}
52656+
52657+/* ->extract_name() method of simple directory item plugin. */
52658+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
52659+{
52660+ directory_entry_format *dent;
52661+
52662+ assert("nikita-1460", coord != NULL);
52663+
52664+ dent = (directory_entry_format *) item_body_by_coord(coord);
52665+ return extract_dent_name(coord, dent, buf);
52666+}
52667+
52668+/* ->extract_file_type() method of simple directory item plugin. */
52669+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
52670+ * item */ )
52671+{
52672+ assert("nikita-1764", coord != NULL);
52673+ /* we don't store file type in the directory entry yet.
52674+
52675+ But see comments at kassign.h:obj_key_id
52676+ */
52677+ return DT_UNKNOWN;
52678+}
52679+
52680+int add_entry_de(struct inode *dir /* directory of item */ ,
52681+ coord_t * coord /* coord of item */ ,
52682+ lock_handle * lh /* insertion lock handle */ ,
52683+ const struct dentry *de /* name to add */ ,
52684+ reiser4_dir_entry_desc * entry /* parameters of new directory
52685+ * entry */ )
52686+{
52687+ reiser4_item_data data;
52688+ directory_entry_format *dent;
52689+ int result;
52690+ const char *name;
52691+ int len;
52692+ int longname;
52693+
52694+ name = de->d_name.name;
52695+ len = de->d_name.len;
52696+ assert("nikita-1163", strlen(name) == len);
52697+
52698+ longname = is_longname(name, len);
52699+
52700+ data.length = sizeof *dent;
52701+ if (longname)
52702+ data.length += len + 1;
52703+ data.data = NULL;
52704+ data.user = 0;
52705+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
52706+
52707+ /* NOTE-NIKITA quota plugin */
52708+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
52709+ return -EDQUOT;
52710+
52711+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
52712+ if (result != 0)
52713+ return result;
52714+
52715+ dent = (directory_entry_format *) item_body_by_coord(coord);
52716+ build_inode_key_id(entry->obj, &dent->id);
52717+ if (longname) {
52718+ memcpy(dent->name, name, len);
52719+ put_unaligned(0, &dent->name[len]);
52720+ }
52721+ return 0;
52722+}
52723+
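The sizing logic of add_entry_de() can be distilled into a standalone sketch (not kernel code): short names travel entirely in the entry's key, so the body holds only the obj_key_id header, while long names are appended to the body with a trailing NUL. OBJ_KEY_ID_SIZE and SHORT_NAME_MAX are assumed values for illustration; the real threshold is whatever is_longname() decides for the configured key format.

#include <stdio.h>
#include <string.h>

#define OBJ_KEY_ID_SIZE 16	/* assumed sizeof(obj_key_id) */
#define SHORT_NAME_MAX	15	/* assumed is_longname() threshold */

/* toy analog of the data.length computation in add_entry_de() */
static size_t entry_body_size(const char *name)
{
	size_t len = strlen(name);
	size_t length = OBJ_KEY_ID_SIZE;	/* sizeof *dent */

	if (len > SHORT_NAME_MAX)		/* is_longname() */
		length += len + 1;		/* name plus trailing NUL */
	return length;
}

int main(void)
{
	printf("\"README\" -> %zu bytes\n", entry_body_size("README"));
	printf("\"quite-a-long-file-name\" -> %zu bytes\n",
	       entry_body_size("quite-a-long-file-name"));
	return 0;
}
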
52724+int rem_entry_de(struct inode *dir /* directory of item */ ,
52725+ const struct qstr *name UNUSED_ARG,
52726+ coord_t * coord /* coord of item */ ,
52727+ lock_handle * lh UNUSED_ARG /* lock handle for
52728+ * removal */ ,
52729+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52730+ * directory entry
52731+ * being removed */ )
52732+{
52733+ coord_t shadow;
52734+ int result;
52735+ int length;
52736+
52737+ length = item_length_by_coord(coord);
52738+ if (inode_get_bytes(dir) < length) {
52739+		warning("nikita-2627", "Dir is broken: %llu: %llu",
52740+ (unsigned long long)get_inode_oid(dir),
52741+ inode_get_bytes(dir));
52742+
52743+ return RETERR(-EIO);
52744+ }
52745+
52746+ /* cut_node() is supposed to take pointers to _different_
52747+ coords, because it will modify them without respect to
52748+ possible aliasing. To work around this, create temporary copy
52749+ of @coord.
52750+ */
52751+ coord_dup(&shadow, coord);
52752+ result =
52753+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
52754+ if (result == 0) {
52755+ /* NOTE-NIKITA quota plugin */
52756+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
52757+ }
52758+ return result;
52759+}
52760+
52761+int max_name_len_de(const struct inode *dir)
52762+{
52763+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
52764+ sizeof(directory_entry_format) - 2;
52765+}
52766+
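The cap computed by max_name_len_de() is simply what one item can hold: the node plugin's maximum item size, minus the entry header, minus 2. A standalone sketch with made-up numbers (the real max_item_size() depends on the node layout plugin):

#include <stdio.h>

#define MAX_ITEM_SIZE	4000	/* assumed node plugin limit; made up */
#define DENT_HEADER	16	/* assumed sizeof(directory_entry_format) */

int main(void)
{
	/* mirrors the formula in max_name_len_de() */
	printf("max name len: %d\n", MAX_ITEM_SIZE - DENT_HEADER - 2);
	return 0;
}
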
52767+/* Make Linus happy.
52768+ Local variables:
52769+ c-indentation-style: "K&R"
52770+ mode-name: "LC"
52771+ c-basic-offset: 8
52772+ tab-width: 8
52773+ fill-column: 120
52774+ End:
52775+*/
52776diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/sde.h linux-2.6.20/fs/reiser4/plugin/item/sde.h
52777--- linux-2.6.20.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
52778+++ linux-2.6.20/fs/reiser4/plugin/item/sde.h 2007-05-06 14:50:43.819013220 +0400
52779@@ -0,0 +1,66 @@
52780+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52781+
52782+/* Directory entry. */
52783+
52784+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
52785+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
52786+
52787+#include "../../forward.h"
52788+#include "../../dformat.h"
52789+#include "../../kassign.h"
52790+#include "../../key.h"
52791+
52792+#include <linux/fs.h>
52793+#include <linux/dcache.h> /* for struct dentry */
52794+
52795+typedef struct directory_entry_format {
52796+	/* key of object stat-data. It's not necessary to store the whole
52797+	   key here, because it's always a key of stat-data, so the minor
52798+	   packing locality and offset can be omitted here. But this
52799+	   relies on a particular key allocation scheme for stat-data, so,
52800+	   for extensibility's sake, the whole key can be stored here.
52801+
52802+	   We store the key as an array of bytes, because we don't want
52803+	   8-byte alignment of dir entries.
52804+	 */
52805+ obj_key_id id;
52806+ /* file name. Null terminated string. */
52807+ d8 name[0];
52808+} directory_entry_format;
52809+
52810+void print_de(const char *prefix, coord_t * coord);
52811+int extract_key_de(const coord_t * coord, reiser4_key * key);
52812+int update_key_de(const coord_t * coord, const reiser4_key * key,
52813+ lock_handle * lh);
52814+char *extract_name_de(const coord_t * coord, char *buf);
52815+unsigned extract_file_type_de(const coord_t * coord);
52816+int add_entry_de(struct inode *dir, coord_t * coord,
52817+ lock_handle * lh, const struct dentry *name,
52818+ reiser4_dir_entry_desc * entry);
52819+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
52820+ lock_handle * lh, reiser4_dir_entry_desc * entry);
52821+int max_name_len_de(const struct inode *dir);
52822+
52823+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
52824+
52825+char *extract_dent_name(const coord_t * coord,
52826+ directory_entry_format * dent, char *buf);
52827+
52828+#if REISER4_LARGE_KEY
52829+#define DE_NAME_BUF_LEN (24)
52830+#else
52831+#define DE_NAME_BUF_LEN (16)
52832+#endif
52833+
52834+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
52835+#endif
52836+
52837+/* Make Linus happy.
52838+ Local variables:
52839+ c-indentation-style: "K&R"
52840+ mode-name: "LC"
52841+ c-basic-offset: 8
52842+ tab-width: 8
52843+ fill-column: 120
52844+ End:
52845+*/
52846diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.20/fs/reiser4/plugin/item/static_stat.c
52847--- linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
52848+++ linux-2.6.20/fs/reiser4/plugin/item/static_stat.c 2007-05-06 14:50:43.823014469 +0400
52849@@ -0,0 +1,1107 @@
52850+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52851+
52852+/* stat data manipulation. */
52853+
52854+#include "../../forward.h"
52855+#include "../../super.h"
52856+#include "../../vfs_ops.h"
52857+#include "../../inode.h"
52858+#include "../../debug.h"
52859+#include "../../dformat.h"
52860+#include "../object.h"
52861+#include "../plugin.h"
52862+#include "../plugin_header.h"
52863+#include "static_stat.h"
52864+#include "item.h"
52865+
52866+#include <linux/types.h>
52867+#include <linux/fs.h>
52868+
52869+/* see static_stat.h for explanation */
52870+
52871+/* helper function used while we are dumping/loading inode/plugin state
52872+ to/from the stat-data. */
52873+
52874+static void move_on(int *length /* space remaining in stat-data */ ,
52875+ char **area /* current coord in stat data */ ,
52876+ int size_of /* how many bytes to move forward */ )
52877+{
52878+ assert("nikita-615", length != NULL);
52879+ assert("nikita-616", area != NULL);
52880+
52881+ *length -= size_of;
52882+ *area += size_of;
52883+
52884+ assert("nikita-617", *length >= 0);
52885+}
52886+
52887+/* helper function used while loading inode/plugin state from stat-data.
52888+ Complain if there is less space in stat-data than was expected.
52889+ Can only happen on disk corruption. */
52890+static int not_enough_space(struct inode *inode /* object being processed */ ,
52891+ const char *where /* error message */ )
52892+{
52893+ assert("nikita-618", inode != NULL);
52894+
52895+ warning("nikita-619", "Not enough space in %llu while loading %s",
52896+ (unsigned long long)get_inode_oid(inode), where);
52897+
52898+ return RETERR(-EINVAL);
52899+}
52900+
52901+/* helper function used while loading inode/plugin state from
52902+   stat-data. Call it if an invalid plugin id was found. */
52903+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
52904+ struct inode *inode /* object being processed */ )
52905+{
52906+ warning("nikita-620", "Unknown plugin %i in %llu",
52907+ id, (unsigned long long)get_inode_oid(inode));
52908+
52909+ return RETERR(-EINVAL);
52910+}
52911+
52912+/* this is installed as ->init_inode() method of
52913+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
52914+ Copies data from on-disk stat-data format into inode.
52915+ Handles stat-data extensions. */
52916+/* was sd_load */
52917+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
52918+ char *sd /* stat-data body */ ,
52919+ int len /* length of stat-data */ )
52920+{
52921+ int result;
52922+ int bit;
52923+ int chunk;
52924+ __u16 mask;
52925+ __u64 bigmask;
52926+ reiser4_stat_data_base *sd_base;
52927+ reiser4_inode *state;
52928+
52929+ assert("nikita-625", inode != NULL);
52930+ assert("nikita-626", sd != NULL);
52931+
52932+ result = 0;
52933+ sd_base = (reiser4_stat_data_base *) sd;
52934+ state = reiser4_inode_data(inode);
52935+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
52936+ bigmask = mask;
52937+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
52938+
52939+ move_on(&len, &sd, sizeof *sd_base);
52940+ for (bit = 0, chunk = 0;
52941+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
52942+ ++bit, mask >>= 1) {
52943+ if (((bit + 1) % 16) != 0) {
52944+ /* handle extension */
52945+ sd_ext_plugin *sdplug;
52946+
52947+ if (bit >= LAST_SD_EXTENSION) {
52948+ warning("vpf-1904",
52949+ "No such extension %i in inode %llu",
52950+ bit,
52951+ (unsigned long long)
52952+ get_inode_oid(inode));
52953+
52954+ result = RETERR(-EINVAL);
52955+ break;
52956+ }
52957+
52958+ sdplug = sd_ext_plugin_by_id(bit);
52959+ if (sdplug == NULL) {
52960+ warning("nikita-627",
52961+ "No such extension %i in inode %llu",
52962+ bit,
52963+ (unsigned long long)
52964+ get_inode_oid(inode));
52965+
52966+ result = RETERR(-EINVAL);
52967+ break;
52968+ }
52969+ if (mask & 1) {
52970+ assert("nikita-628", sdplug->present);
52971+ /* alignment is not supported in node layout
52972+ plugin yet.
52973+ result = align( inode, &len, &sd,
52974+ sdplug -> alignment );
52975+ if( result != 0 )
52976+ return result; */
52977+ result = sdplug->present(inode, &sd, &len);
52978+ } else if (sdplug->absent != NULL)
52979+ result = sdplug->absent(inode);
52980+ if (result)
52981+ break;
52982+ /* else, we are looking at the last bit in 16-bit
52983+ portion of bitmask */
52984+ } else if (mask & 1) {
52985+ /* next portion of bitmask */
52986+ if (len < (int)sizeof(d16)) {
52987+ warning("nikita-629",
52988+ "No space for bitmap in inode %llu",
52989+ (unsigned long long)
52990+ get_inode_oid(inode));
52991+
52992+ result = RETERR(-EINVAL);
52993+ break;
52994+ }
52995+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
52996+ bigmask <<= 16;
52997+ bigmask |= mask;
52998+ move_on(&len, &sd, sizeof(d16));
52999+ ++chunk;
53000+ if (chunk == 3) {
53001+ if (!(mask & 0x8000)) {
53002+ /* clear last bit */
53003+ mask &= ~0x8000;
53004+ continue;
53005+ }
53006+ /* too much */
53007+ warning("nikita-630",
53008+ "Too many extensions in %llu",
53009+ (unsigned long long)
53010+ get_inode_oid(inode));
53011+
53012+ result = RETERR(-EINVAL);
53013+ break;
53014+ }
53015+ } else
53016+ /* bitmask exhausted */
53017+ break;
53018+ }
53019+ state->extmask = bigmask;
53020+ /* common initialisations */
53021+ if (len - (bit / 16 * sizeof(d16)) > 0) {
53022+ /* alignment in save_len_static_sd() is taken into account
53023+ -edward */
53024+ warning("nikita-631", "unused space in inode %llu",
53025+ (unsigned long long)get_inode_oid(inode));
53026+ }
53027+
53028+ return result;
53029+}
53030+
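The bitmask walk above is compact but subtle: every 16th global bit is a continuation flag, and when it is set the next 16-bit word is read from the stat-data body, that word's low bit repeating the flag so that the loop's shift skips it cleanly. The following standalone sketch (not the kernel code; it omits the absent() calls and most error paths, and uses plain uint16_t in place of d16) demonstrates just the chaining arithmetic.

#include <stdio.h>
#include <stdint.h>

static uint64_t decode_extmask(const uint16_t *words, int nwords)
{
	uint64_t bigmask;
	uint16_t mask = words[0];
	int chunk = 0, bit;

	bigmask = mask;
	for (bit = 0; mask != 0; ++bit, mask >>= 1) {
		if (((bit + 1) % 16) != 0) {
			if (mask & 1)
				printf("extension %d present\n", bit);
		} else if (mask & 1) {
			/* continuation flag: chain to the next word */
			if (++chunk >= nwords)
				break;	/* corrupt: promised word missing */
			mask = words[chunk];	/* low bit repeats the flag */
			bigmask = (bigmask << 16) | mask;
		} else
			break;	/* flag clear: bitmask exhausted */
	}
	return bigmask;
}

int main(void)
{
	/* word 0: extensions 0 and 1 plus the continuation bit 0x8000;
	   word 1: bit 0 repeats the flag, bit 1 is global extension 16 */
	uint16_t words[] = { 0x8003, 0x0003 };

	printf("bigmask = %#llx\n",
	       (unsigned long long)decode_extmask(words, 2));
	return 0;
}
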
53031+/* estimates size of stat-data required to store inode.
53032+ Installed as ->save_len() method of
53033+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53034+/* was sd_len */
53035+int save_len_static_sd(struct inode *inode /* object being processed */ )
53036+{
53037+ unsigned int result;
53038+ __u64 mask;
53039+ int bit;
53040+
53041+ assert("nikita-632", inode != NULL);
53042+
53043+ result = sizeof(reiser4_stat_data_base);
53044+ mask = reiser4_inode_data(inode)->extmask;
53045+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53046+ if (mask & 1) {
53047+ sd_ext_plugin *sdplug;
53048+
53049+ sdplug = sd_ext_plugin_by_id(bit);
53050+ assert("nikita-633", sdplug != NULL);
53051+ /* no aligment support
53052+ result +=
53053+ round_up( result, sdplug -> alignment ) - result; */
53054+ result += sdplug->save_len(inode);
53055+ }
53056+ }
53057+ result += bit / 16 * sizeof(d16);
53058+ return result;
53059+}
53060+
53061+/* saves inode into stat-data.
53062+ Installed as ->save() method of
53063+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53064+/* was sd_save */
53065+int save_static_sd(struct inode *inode /* object being processed */ ,
53066+ char **area /* where to save stat-data */ )
53067+{
53068+ int result;
53069+ __u64 emask;
53070+ int bit;
53071+ unsigned int len;
53072+ reiser4_stat_data_base *sd_base;
53073+
53074+ assert("nikita-634", inode != NULL);
53075+ assert("nikita-635", area != NULL);
53076+
53077+ result = 0;
53078+ emask = reiser4_inode_data(inode)->extmask;
53079+ sd_base = (reiser4_stat_data_base *) * area;
53080+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53081+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
53082+
53083+ *area += sizeof *sd_base;
53084+ len = 0xffffffffu;
53085+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53086+ if (emask & 1) {
53087+ if ((bit + 1) % 16 != 0) {
53088+ sd_ext_plugin *sdplug;
53089+ sdplug = sd_ext_plugin_by_id(bit);
53090+ assert("nikita-636", sdplug != NULL);
53091+ /* no alignment support yet
53092+ align( inode, &len, area,
53093+ sdplug -> alignment ); */
53094+ result = sdplug->save(inode, area);
53095+ if (result)
53096+ break;
53097+ } else {
53098+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53099+ (d16 *)(*area));
53100+ /*cputod16((unsigned)(emask & 0xffff),
53101+ (d16 *) * area);*/
53102+ *area += sizeof(d16);
53103+ }
53104+ }
53105+ }
53106+ return result;
53107+}
53108+
53109+/* stat-data extension handling functions. */
53110+
53111+static int present_lw_sd(struct inode *inode /* object being processed */ ,
53112+ char **area /* position in stat-data */ ,
53113+ int *len /* remaining length */ )
53114+{
53115+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53116+ reiser4_light_weight_stat *sd_lw;
53117+
53118+ sd_lw = (reiser4_light_weight_stat *) * area;
53119+
53120+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53121+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53122+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53123+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53124+ inode->i_mode &= ~S_IFIFO;
53125+			warning("", "partially converted file encountered");
53126+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
53127+ }
53128+ move_on(len, area, sizeof *sd_lw);
53129+ return 0;
53130+ } else
53131+ return not_enough_space(inode, "lw sd");
53132+}
53133+
53134+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
53135+ * processed */ )
53136+{
53137+ return sizeof(reiser4_light_weight_stat);
53138+}
53139+
53140+static int save_lw_sd(struct inode *inode /* object being processed */ ,
53141+ char **area /* position in stat-data */ )
53142+{
53143+ reiser4_light_weight_stat *sd;
53144+ mode_t delta;
53145+
53146+ assert("nikita-2705", inode != NULL);
53147+ assert("nikita-2706", area != NULL);
53148+ assert("nikita-2707", *area != NULL);
53149+
53150+ sd = (reiser4_light_weight_stat *) * area;
53151+
53152+ delta = (reiser4_inode_get_flag(inode,
53153+ REISER4_PART_MIXED) ? S_IFIFO : 0);
53154+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53155+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53156+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53157+ *area += sizeof *sd;
53158+ return 0;
53159+}
53160+
53161+static int present_unix_sd(struct inode *inode /* object being processed */ ,
53162+ char **area /* position in stat-data */ ,
53163+ int *len /* remaining length */ )
53164+{
53165+ assert("nikita-637", inode != NULL);
53166+ assert("nikita-638", area != NULL);
53167+ assert("nikita-639", *area != NULL);
53168+ assert("nikita-640", len != NULL);
53169+ assert("nikita-641", *len > 0);
53170+
53171+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
53172+ reiser4_unix_stat *sd;
53173+
53174+ sd = (reiser4_unix_stat *) * area;
53175+
53176+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53177+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53178+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53179+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53180+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53181+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53182+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53183+ else
53184+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53185+ move_on(len, area, sizeof *sd);
53186+ return 0;
53187+ } else
53188+ return not_enough_space(inode, "unix sd");
53189+}
53190+
53191+static int absent_unix_sd(struct inode *inode /* object being processed */ )
53192+{
53193+ inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53194+ inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53195+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53196+ inode_set_bytes(inode, inode->i_size);
53197+ /* mark inode as lightweight, so that caller (lookup_common) will
53198+ complete initialisation by copying [ug]id from a parent. */
53199+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53200+ return 0;
53201+}
53202+
53203+/* Audited by: green(2002.06.14) */
53204+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53205+ * processed */ )
53206+{
53207+ return sizeof(reiser4_unix_stat);
53208+}
53209+
53210+static int save_unix_sd(struct inode *inode /* object being processed */ ,
53211+ char **area /* position in stat-data */ )
53212+{
53213+ reiser4_unix_stat *sd;
53214+
53215+ assert("nikita-642", inode != NULL);
53216+ assert("nikita-643", area != NULL);
53217+ assert("nikita-644", *area != NULL);
53218+
53219+ sd = (reiser4_unix_stat *) * area;
53220+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53221+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53222+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53223+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53224+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53225+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53226+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53227+ else
53228+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53229+ *area += sizeof *sd;
53230+ return 0;
53231+}
53232+
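Worth noting about the extension just saved: reiser4_unix_stat ends in a union, so a single 64-bit slot serves as either the device number (for block and character nodes) or the byte count (for everything else), and the file mode decides which reading is live. A standalone toy illustration with simplified field widths, not the on-disk layout:

#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>

struct toy_unix_stat {
	uint16_t mode;
	union {
		uint64_t rdev;	/* S_ISBLK/S_ISCHR files */
		uint64_t bytes;	/* everything else */
	} u;
};

int main(void)
{
	struct toy_unix_stat sd = { .mode = S_IFCHR | 0600 };

	sd.u.rdev = 0x0501;	/* a made-up device number */
	if (S_ISBLK(sd.mode) || S_ISCHR(sd.mode))
		printf("device node, rdev=%#llx\n",
		       (unsigned long long)sd.u.rdev);
	else
		printf("ordinary object, bytes=%llu\n",
		       (unsigned long long)sd.u.bytes);
	return 0;
}
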
53233+static int
53234+present_large_times_sd(struct inode *inode /* object being processed */ ,
53235+ char **area /* position in stat-data */ ,
53236+ int *len /* remaining length */ )
53237+{
53238+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53239+ reiser4_large_times_stat *sd_lt;
53240+
53241+ sd_lt = (reiser4_large_times_stat *) * area;
53242+
53243+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53244+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53245+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53246+
53247+ move_on(len, area, sizeof *sd_lt);
53248+ return 0;
53249+ } else
53250+ return not_enough_space(inode, "large times sd");
53251+}
53252+
53253+static int
53254+save_len_large_times_sd(struct inode *inode UNUSED_ARG
53255+ /* object being processed */ )
53256+{
53257+ return sizeof(reiser4_large_times_stat);
53258+}
53259+
53260+static int
53261+save_large_times_sd(struct inode *inode /* object being processed */ ,
53262+ char **area /* position in stat-data */ )
53263+{
53264+ reiser4_large_times_stat *sd;
53265+
53266+ assert("nikita-2817", inode != NULL);
53267+ assert("nikita-2818", area != NULL);
53268+ assert("nikita-2819", *area != NULL);
53269+
53270+ sd = (reiser4_large_times_stat *) * area;
53271+
53272+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53273+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53274+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53275+
53276+ *area += sizeof *sd;
53277+ return 0;
53278+}
53279+
53280+/* symlink stat data extension */
53281+
53282+/* allocate memory for symlink target and attach it to inode->i_private */
53283+static int
53284+symlink_target_to_inode(struct inode *inode, const char *target, int len)
53285+{
53286+ assert("vs-845", inode->i_private == NULL);
53287+ assert("vs-846", !reiser4_inode_get_flag(inode,
53288+ REISER4_GENERIC_PTR_USED));
53289+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
53290+ places, though */
53291+ inode->i_private = kmalloc((size_t) len + 1,
53292+ reiser4_ctx_gfp_mask_get());
53293+ if (!inode->i_private)
53294+ return RETERR(-ENOMEM);
53295+
53296+ memcpy((char *)(inode->i_private), target, (size_t) len);
53297+ ((char *)(inode->i_private))[len] = 0;
53298+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53299+ return 0;
53300+}
53301+
53302+/* this is called on read_inode. There is nothing to do, actually, except
53303+   some sanity checks */
53304+static int present_symlink_sd(struct inode *inode, char **area, int *len)
53305+{
53306+ int result;
53307+ int length;
53308+ reiser4_symlink_stat *sd;
53309+
53310+ length = (int)inode->i_size;
53311+ /*
53312+	 * *len is the number of bytes in the stat data item from *area to the end
53313+	 * of the item. It must be no less than the size of the symlink + 1 for the terminating 0
53314+ */
53315+ if (length > *len)
53316+ return not_enough_space(inode, "symlink");
53317+
53318+ if (*(*area + length) != 0) {
53319+ warning("vs-840", "Symlink is not zero terminated");
53320+ return RETERR(-EIO);
53321+ }
53322+
53323+ sd = (reiser4_symlink_stat *) * area;
53324+ result = symlink_target_to_inode(inode, sd->body, length);
53325+
53326+ move_on(len, area, length + 1);
53327+ return result;
53328+}
53329+
53330+static int save_len_symlink_sd(struct inode *inode)
53331+{
53332+ return inode->i_size + 1;
53333+}
53334+
53335+/* this is called on create and on stat-data update. On update it does
53336+   nothing except advance @area */
53337+static int save_symlink_sd(struct inode *inode, char **area)
53338+{
53339+ int result;
53340+ int length;
53341+ reiser4_symlink_stat *sd;
53342+
53343+ length = (int)inode->i_size;
53344+ /* inode->i_size must be set already */
53345+ assert("vs-841", length);
53346+
53347+ result = 0;
53348+ sd = (reiser4_symlink_stat *) * area;
53349+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53350+ const char *target;
53351+
53352+ target = (const char *)(inode->i_private);
53353+ inode->i_private = NULL;
53354+
53355+ result = symlink_target_to_inode(inode, target, length);
53356+
53357+ /* copy symlink to stat data */
53358+ memcpy(sd->body, target, (size_t) length);
53359+ (*area)[length] = 0;
53360+ } else {
53361+ /* there is nothing to do in update but move area */
53362+ assert("vs-844",
53363+ !memcmp(inode->i_private, sd->body,
53364+ (size_t) length + 1));
53365+ }
53366+
53367+ *area += (length + 1);
53368+ return result;
53369+}
53370+
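The symlink extension simply inlines the target string, NUL-terminated, which is why save_len is i_size + 1. A standalone round-trip sketch with toy buffers, not the kernel structures:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *target = "/some/where";	/* hypothetical target */
	size_t i_size = strlen(target);
	char sd_body[64];

	/* save side: body plus terminating NUL, i_size + 1 bytes */
	memcpy(sd_body, target, i_size);
	sd_body[i_size] = 0;

	/* load side: check the terminator, as present_symlink_sd() does */
	if (sd_body[i_size] != 0)
		return 1;
	printf("save_len = %zu, target = %s\n", i_size + 1, sd_body);
	return 0;
}
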
53371+static int present_flags_sd(struct inode *inode /* object being processed */ ,
53372+ char **area /* position in stat-data */ ,
53373+ int *len /* remaining length */ )
53374+{
53375+ assert("nikita-645", inode != NULL);
53376+ assert("nikita-646", area != NULL);
53377+ assert("nikita-647", *area != NULL);
53378+ assert("nikita-648", len != NULL);
53379+ assert("nikita-649", *len > 0);
53380+
53381+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
53382+ reiser4_flags_stat *sd;
53383+
53384+ sd = (reiser4_flags_stat *) * area;
53385+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53386+ move_on(len, area, sizeof *sd);
53387+ return 0;
53388+ } else
53389+ return not_enough_space(inode, "generation and attrs");
53390+}
53391+
53392+/* Audited by: green(2002.06.14) */
53393+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53394+ * processed */ )
53395+{
53396+ return sizeof(reiser4_flags_stat);
53397+}
53398+
53399+static int save_flags_sd(struct inode *inode /* object being processed */ ,
53400+ char **area /* position in stat-data */ )
53401+{
53402+ reiser4_flags_stat *sd;
53403+
53404+ assert("nikita-650", inode != NULL);
53405+ assert("nikita-651", area != NULL);
53406+ assert("nikita-652", *area != NULL);
53407+
53408+ sd = (reiser4_flags_stat *) * area;
53409+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53410+ *area += sizeof *sd;
53411+ return 0;
53412+}
53413+
53414+static int absent_plugin_sd(struct inode *inode);
53415+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53416+ char **area /* position in stat-data */ ,
53417+ int *len /* remaining length */,
53418+ int is_pset /* 1 if plugin set, 0 if heir set. */)
53419+{
53420+ reiser4_plugin_stat *sd;
53421+ reiser4_plugin *plugin;
53422+ reiser4_inode *info;
53423+ int i;
53424+ __u16 mask;
53425+ int result;
53426+ int num_of_plugins;
53427+
53428+ assert("nikita-653", inode != NULL);
53429+ assert("nikita-654", area != NULL);
53430+ assert("nikita-655", *area != NULL);
53431+ assert("nikita-656", len != NULL);
53432+ assert("nikita-657", *len > 0);
53433+
53434+ if (*len < (int)sizeof(reiser4_plugin_stat))
53435+ return not_enough_space(inode, "plugin");
53436+
53437+ sd = (reiser4_plugin_stat *) * area;
53438+ info = reiser4_inode_data(inode);
53439+
53440+ mask = 0;
53441+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53442+ move_on(len, area, sizeof *sd);
53443+ result = 0;
53444+ for (i = 0; i < num_of_plugins; ++i) {
53445+ reiser4_plugin_slot *slot;
53446+ reiser4_plugin_type type;
53447+ pset_member memb;
53448+
53449+ slot = (reiser4_plugin_slot *) * area;
53450+ if (*len < (int)sizeof *slot)
53451+ return not_enough_space(inode, "additional plugin");
53452+
53453+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53454+ type = aset_member_to_type_unsafe(memb);
53455+
53456+ if (type == REISER4_PLUGIN_TYPES) {
53457+ warning("nikita-3502",
53458+ "wrong %s member (%i) for %llu", is_pset ?
53459+ "pset" : "hset", memb,
53460+ (unsigned long long)get_inode_oid(inode));
53461+ return RETERR(-EINVAL);
53462+ }
53463+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
53464+ type, &slot->id);
53465+ if (plugin == NULL)
53466+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53467+
53468+ /* plugin is loaded into inode, mark this into inode's
53469+ bitmask of loaded non-standard plugins */
53470+ if (!(mask & (1 << memb))) {
53471+ mask |= (1 << memb);
53472+ } else {
53473+ warning("nikita-658", "duplicate plugin for %llu",
53474+ (unsigned long long)get_inode_oid(inode));
53475+ return RETERR(-EINVAL);
53476+ }
53477+ move_on(len, area, sizeof *slot);
53478+ /* load plugin data, if any */
53479+ if (plugin->h.pops != NULL && plugin->h.pops->load)
53480+ result = plugin->h.pops->load(inode, plugin, area, len);
53481+ else
53482+ result = aset_set_unsafe(is_pset ? &info->pset :
53483+ &info->hset, memb, plugin);
53484+ if (result)
53485+ return result;
53486+ }
53487+ if (is_pset) {
53488+ /* if object plugin wasn't loaded from stat-data, guess it by
53489+ mode bits */
53490+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53491+ if (plugin == NULL)
53492+ result = absent_plugin_sd(inode);
53493+ info->plugin_mask = mask;
53494+ } else
53495+ info->heir_mask = mask;
53496+
53497+ return result;
53498+}
53499+
53500+static int present_pset_sd(struct inode *inode, char **area, int *len) {
53501+ return present_plugin_sd(inode, area, len, 1 /* pset */);
53502+}
53503+
53504+/* Determine object plugin for @inode based on i_mode.
53505+
53506+ Many objects in reiser4 file system are controlled by standard object
53507+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
53508+
53509+   For such files we don't explicitly store the plugin id in the object's stat
53510+   data. Rather, the required plugin is guessed from the mode bits, where the
53511+   file "type" is encoded (see stat(2)).
53512+*/
53513+static int
53514+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53515+{
53516+ int fplug_id;
53517+ int dplug_id;
53518+ reiser4_inode *info;
53519+
53520+ assert("nikita-736", inode != NULL);
53521+
53522+ dplug_id = fplug_id = -1;
53523+
53524+ switch (inode->i_mode & S_IFMT) {
53525+ case S_IFSOCK:
53526+ case S_IFBLK:
53527+ case S_IFCHR:
53528+ case S_IFIFO:
53529+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
53530+ break;
53531+ case S_IFLNK:
53532+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
53533+ break;
53534+ case S_IFDIR:
53535+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53536+ dplug_id = HASHED_DIR_PLUGIN_ID;
53537+ break;
53538+ default:
53539+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53540+ return RETERR(-EIO);
53541+ case S_IFREG:
53542+ fplug_id = UNIX_FILE_PLUGIN_ID;
53543+ break;
53544+ }
53545+ info = reiser4_inode_data(inode);
53546+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
53547+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
53548+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
53549+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
53550+ return 0;
53551+}
53552+
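The same S_IFMT dispatch, distilled into a standalone sketch (not kernel code; the strings stand in for the plugin ids):

#include <stdio.h>
#include <sys/stat.h>

static const char *toy_guess(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		return "special-file plugin";
	case S_IFLNK:
		return "symlink plugin";
	case S_IFDIR:
		return "directory plugin (plus hashed dir plugin)";
	case S_IFREG:
		return "unix-file plugin";
	default:
		return "corrupt mode";
	}
}

int main(void)
{
	printf("%o -> %s\n", (unsigned)(S_IFDIR | 0755),
	       toy_guess(S_IFDIR | 0755));
	return 0;
}
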
53553+/* Audited by: green(2002.06.14) */
53554+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
53555+{
53556+ int result;
53557+
53558+ assert("nikita-659", inode != NULL);
53559+
53560+ result = guess_plugin_by_mode(inode);
53561+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
53562+ but setup_inode_ops() will call make_bad_inode().
53563+	   Another, more logical but a bit more complex, solution is to add
53564+	   a "bad-file plugin". */
53565+ /* FIXME-VS: activate was called here */
53566+ return result;
53567+}
53568+
53569+/* helper function for save_len_plugin_sd(): calculate how much space
53570+   is required to save the state of a given plugin */
53571+/* Audited by: green(2002.06.14) */
53572+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
53573+ struct inode *inode /* object being processed */ ,
53574+ pset_member memb,
53575+ int len, int is_pset)
53576+{
53577+ reiser4_inode *info;
53578+ assert("nikita-661", inode != NULL);
53579+
53580+ if (plugin == NULL)
53581+ return len;
53582+
53583+ info = reiser4_inode_data(inode);
53584+ if (is_pset ?
53585+ info->plugin_mask & (1 << memb) :
53586+ info->heir_mask & (1 << memb)) {
53587+ len += sizeof(reiser4_plugin_slot);
53588+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
53589+ /* non-standard plugin, call method */
53590+ /* commented as it is incompatible with alignment
53591+ * policy in save_plug() -edward */
53592+ /* len = round_up(len, plugin->h.pops->alignment); */
53593+ len += plugin->h.pops->save_len(inode, plugin);
53594+ }
53595+ }
53596+ return len;
53597+}
53598+
53599+/* calculate how much space is required to save the state of all plugins
53600+   associated with the inode */
53601+static int save_len_plugin_sd(struct inode *inode /* object being processed */,
53602+ int is_pset)
53603+{
53604+ int len;
53605+ int last;
53606+ reiser4_inode *state;
53607+ pset_member memb;
53608+
53609+ assert("nikita-663", inode != NULL);
53610+
53611+ state = reiser4_inode_data(inode);
53612+
53613+ /* common case: no non-standard plugins */
53614+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53615+ return 0;
53616+ len = sizeof(reiser4_plugin_stat);
53617+ last = PSET_LAST;
53618+
53619+ for (memb = 0; memb < last; ++memb) {
53620+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
53621+ inode, memb, len, is_pset);
53622+ }
53623+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
53624+ return len;
53625+}
53626+
53627+static int save_len_pset_sd(struct inode *inode) {
53628+ return save_len_plugin_sd(inode, 1 /* pset */);
53629+}
53630+
53631+/* helper function for save_plugin_sd(): save a plugin associated with
53632+   the inode. */
53633+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
53634+ struct inode *inode /* object being processed */ ,
53635+ int memb /* what element of pset is saved */ ,
53636+ char **area /* position in stat-data */ ,
53637+		     int *count /* incremented if the plugin was actually saved */,
53638+ int is_pset /* 1 for plugin set, 0 for heir set */)
53639+{
53640+ reiser4_plugin_slot *slot;
53641+ int fake_len;
53642+ int result;
53643+
53644+ assert("nikita-665", inode != NULL);
53645+ assert("nikita-666", area != NULL);
53646+ assert("nikita-667", *area != NULL);
53647+
53648+ if (plugin == NULL)
53649+ return 0;
53650+
53651+ if (is_pset ?
53652+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
53653+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
53654+ return 0;
53655+ slot = (reiser4_plugin_slot *) * area;
53656+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
53657+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
53658+ fake_len = (int)0xffff;
53659+ move_on(&fake_len, area, sizeof *slot);
53660+ ++*count;
53661+ result = 0;
53662+ if (plugin->h.pops != NULL) {
53663+ if (plugin->h.pops->save != NULL)
53664+ result = plugin->h.pops->save(inode, plugin, area);
53665+ }
53666+ return result;
53667+}
53668+
53669+/* save the state of all non-standard plugins associated with the inode */
53670+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
53671+ char **area /* position in stat-data */,
53672+ int is_pset /* 1 for pset, 0 for hset */)
53673+{
53674+ int fake_len;
53675+ int result = 0;
53676+ int num_of_plugins;
53677+ reiser4_plugin_stat *sd;
53678+ reiser4_inode *state;
53679+ pset_member memb;
53680+
53681+ assert("nikita-669", inode != NULL);
53682+ assert("nikita-670", area != NULL);
53683+ assert("nikita-671", *area != NULL);
53684+
53685+ state = reiser4_inode_data(inode);
53686+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53687+ return 0;
53688+ sd = (reiser4_plugin_stat *) * area;
53689+ fake_len = (int)0xffff;
53690+ move_on(&fake_len, area, sizeof *sd);
53691+
53692+ num_of_plugins = 0;
53693+ for (memb = 0; memb < PSET_LAST; ++memb) {
53694+ result = save_plug(aset_get(is_pset ? state->pset : state->hset,
53695+ memb),
53696+ inode, memb, area, &num_of_plugins, is_pset);
53697+ if (result != 0)
53698+ break;
53699+ }
53700+
53701+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
53702+ return result;
53703+}
53704+
53705+static int save_pset_sd(struct inode *inode, char **area) {
53706+ return save_plugin_sd(inode, area, 1 /* pset */);
53707+}
53708+
53709+static int present_hset_sd(struct inode *inode, char **area, int *len) {
53710+ return present_plugin_sd(inode, area, len, 0 /* hset */);
53711+}
53712+
53713+static int save_len_hset_sd(struct inode *inode) {
53714+	return save_len_plugin_sd(inode, 0 /* hset */);
53715+}
53716+
53717+static int save_hset_sd(struct inode *inode, char **area) {
53718+ return save_plugin_sd(inode, area, 0 /* hset */);
53719+}
53720+
53721+/* helper function for present_crypto_sd() and save_crypto_sd().
53722+   Allocates memory for the crypto stat and keyid, and attaches them to the inode */
53723+static int extract_crypto_stat (struct inode * inode,
53724+ reiser4_crypto_stat * sd)
53725+{
53726+ crypto_stat_t * info;
53727+ assert("edward-11", !inode_crypto_stat(inode));
53728+ assert("edward-1413",
53729+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
53730+ /* create and attach a crypto-stat without secret key loaded */
53731+ info = reiser4_alloc_crypto_stat(inode);
53732+ if (IS_ERR(info))
53733+ return PTR_ERR(info);
53734+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
53735+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
53736+ reiser4_attach_crypto_stat(inode, info);
53737+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53738+ return 0;
53739+}
53740+
53741+/* crypto stat-data extension */
53742+
53743+static int present_crypto_sd(struct inode *inode, char **area, int *len)
53744+{
53745+ int result;
53746+ reiser4_crypto_stat *sd;
53747+ digest_plugin *dplug = inode_digest_plugin(inode);
53748+
53749+ assert("edward-06", dplug != NULL);
53750+ assert("edward-684", dplug->fipsize);
53751+ assert("edward-07", area != NULL);
53752+ assert("edward-08", *area != NULL);
53753+ assert("edward-09", len != NULL);
53754+ assert("edward-10", *len > 0);
53755+
53756+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
53757+ return not_enough_space(inode, "crypto-sd");
53758+ }
53759+	/* *len is the number of bytes in the stat data item from *area to the
53760+	   end of the item. It must be no less than the size of this extension */
53761+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
53762+
53763+ sd = (reiser4_crypto_stat *) * area;
53764+ result = extract_crypto_stat(inode, sd);
53765+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
53766+
53767+ return result;
53768+}
53769+
53770+static int save_len_crypto_sd(struct inode *inode)
53771+{
53772+ return sizeof(reiser4_crypto_stat) +
53773+ inode_digest_plugin(inode)->fipsize;
53774+}
53775+
53776+static int save_crypto_sd(struct inode *inode, char **area)
53777+{
53778+ int result = 0;
53779+ reiser4_crypto_stat *sd;
53780+ crypto_stat_t * info = inode_crypto_stat(inode);
53781+ digest_plugin *dplug = inode_digest_plugin(inode);
53782+
53783+ assert("edward-12", dplug != NULL);
53784+ assert("edward-13", area != NULL);
53785+ assert("edward-14", *area != NULL);
53786+ assert("edward-15", info != NULL);
53787+ assert("edward-1414", info->keyid != NULL);
53788+ assert("edward-1415", info->keysize != 0);
53789+ assert("edward-76", reiser4_inode_data(inode) != NULL);
53790+
53791+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
53792+ /* file is just created */
53793+ sd = (reiser4_crypto_stat *) *area;
53794+ /* copy everything but private key to the disk stat-data */
53795+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
53796+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
53797+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53798+ }
53799+ *area += (sizeof(*sd) + dplug->fipsize);
53800+ return result;
53801+}
53802+
53803+static int eio(struct inode *inode, char **area, int *len)
53804+{
53805+ return RETERR(-EIO);
53806+}
53807+
53808+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
53809+ [LIGHT_WEIGHT_STAT] = {
53810+ .h = {
53811+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53812+ .id = LIGHT_WEIGHT_STAT,
53813+ .pops = NULL,
53814+ .label = "light-weight sd",
53815+ .desc = "sd for light-weight files",
53816+ .linkage = {NULL,NULL}
53817+ },
53818+ .present = present_lw_sd,
53819+ .absent = NULL,
53820+ .save_len = save_len_lw_sd,
53821+ .save = save_lw_sd,
53822+ .alignment = 8
53823+ },
53824+ [UNIX_STAT] = {
53825+ .h = {
53826+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53827+ .id = UNIX_STAT,
53828+ .pops = NULL,
53829+ .label = "unix-sd",
53830+ .desc = "unix stat-data fields",
53831+ .linkage = {NULL,NULL}
53832+ },
53833+ .present = present_unix_sd,
53834+ .absent = absent_unix_sd,
53835+ .save_len = save_len_unix_sd,
53836+ .save = save_unix_sd,
53837+ .alignment = 8
53838+ },
53839+ [LARGE_TIMES_STAT] = {
53840+ .h = {
53841+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53842+ .id = LARGE_TIMES_STAT,
53843+ .pops = NULL,
53844+ .label = "64time-sd",
53845+ .desc = "nanosecond resolution for times",
53846+ .linkage = {NULL,NULL}
53847+ },
53848+ .present = present_large_times_sd,
53849+ .absent = NULL,
53850+ .save_len = save_len_large_times_sd,
53851+ .save = save_large_times_sd,
53852+ .alignment = 8
53853+ },
53854+ [SYMLINK_STAT] = {
53855+ /* stat data of symlink has this extension */
53856+ .h = {
53857+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53858+ .id = SYMLINK_STAT,
53859+ .pops = NULL,
53860+ .label = "symlink-sd",
53861+ .desc =
53862+ "stat data is appended with symlink name",
53863+ .linkage = {NULL,NULL}
53864+ },
53865+ .present = present_symlink_sd,
53866+ .absent = NULL,
53867+ .save_len = save_len_symlink_sd,
53868+ .save = save_symlink_sd,
53869+ .alignment = 8
53870+ },
53871+ [PLUGIN_STAT] = {
53872+ .h = {
53873+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53874+ .id = PLUGIN_STAT,
53875+ .pops = NULL,
53876+ .label = "plugin-sd",
53877+ .desc = "plugin stat-data fields",
53878+ .linkage = {NULL,NULL}
53879+ },
53880+ .present = present_pset_sd,
53881+ .absent = absent_plugin_sd,
53882+ .save_len = save_len_pset_sd,
53883+ .save = save_pset_sd,
53884+ .alignment = 8
53885+ },
53886+ [HEIR_STAT] = {
53887+ .h = {
53888+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53889+ .id = HEIR_STAT,
53890+ .pops = NULL,
53891+ .label = "heir-plugin-sd",
53892+ .desc = "heir plugin stat-data fields",
53893+ .linkage = {NULL,NULL}
53894+ },
53895+ .present = present_hset_sd,
53896+ .absent = NULL,
53897+ .save_len = save_len_hset_sd,
53898+ .save = save_hset_sd,
53899+ .alignment = 8
53900+ },
53901+ [FLAGS_STAT] = {
53902+ .h = {
53903+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53904+ .id = FLAGS_STAT,
53905+ .pops = NULL,
53906+ .label = "flags-sd",
53907+ .desc = "inode bit flags",
53908+ .linkage = {NULL, NULL}
53909+ },
53910+ .present = present_flags_sd,
53911+ .absent = NULL,
53912+ .save_len = save_len_flags_sd,
53913+ .save = save_flags_sd,
53914+ .alignment = 8
53915+ },
53916+ [CAPABILITIES_STAT] = {
53917+ .h = {
53918+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53919+ .id = CAPABILITIES_STAT,
53920+ .pops = NULL,
53921+ .label = "capabilities-sd",
53922+ .desc = "capabilities",
53923+ .linkage = {NULL, NULL}
53924+ },
53925+ .present = eio,
53926+ .absent = NULL,
53927+ .save_len = save_len_flags_sd,
53928+ .save = save_flags_sd,
53929+ .alignment = 8
53930+ },
53931+ [CRYPTO_STAT] = {
53932+ .h = {
53933+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53934+ .id = CRYPTO_STAT,
53935+ .pops = NULL,
53936+ .label = "crypto-sd",
53937+ .desc = "secret key size and id",
53938+ .linkage = {NULL, NULL}
53939+ },
53940+ .present = present_crypto_sd,
53941+ .absent = NULL,
53942+ .save_len = save_len_crypto_sd,
53943+ .save = save_crypto_sd,
53944+ .alignment = 8
53945+ }
53946+};
53947+
53948+/* Make Linus happy.
53949+ Local variables:
53950+ c-indentation-style: "K&R"
53951+ mode-name: "LC"
53952+ c-basic-offset: 8
53953+ tab-width: 8
53954+ fill-column: 120
53955+ End:
53956+*/
53957diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.20/fs/reiser4/plugin/item/static_stat.h
53958--- linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
53959+++ linux-2.6.20/fs/reiser4/plugin/item/static_stat.h 2007-05-06 14:50:43.823014469 +0400
53960@@ -0,0 +1,224 @@
53961+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53962+
53963+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
53964+
+Since every file has at least the fields needed by the stat() syscall,
+it is more compact to store those fields in this statically laid out
+struct.
53968+
53969+If this item does not exist, then all stats are dynamically resolved.
53970+At the moment, we either resolve all stats dynamically or all of them
53971+statically. If you think this is not fully optimal, and the rest of
53972+reiser4 is working, then fix it...:-)
53973+
53974+*/
53975+
53976+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
53977+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
53978+
53979+#include "../../forward.h"
53980+#include "../../dformat.h"
53981+
53982+#include <linux/fs.h> /* for struct inode */
53983+
53984+/* Stat data layout: goals and implementation.
53985+
53986+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
53987+ them, including not having semantic metadata attached to them.
53988+
53989+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
53990+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
53991+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
53992+ attributes are.
53993+
53994+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
53995+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
53996+ file in their use of file attributes.
53997+
53998+ Yet this compromise deserves to be compromised a little.
53999+
54000+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54001+ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
54002+
54003+ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited
54004+ from parent directory (as uid, gid) or initialised to some sane values.
54005+
54006+ To capitalize on existing code infrastructure, extensions are
54007+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54008+ Each stat-data extension plugin implements four methods:
54009+
54010+ ->present() called by sd_load() when this extension is found in stat-data
54011+ ->absent() called by sd_load() when this extension is not found in stat-data
54012+ ->save_len() called by sd_len() to calculate total length of stat-data
54013+ ->save() called by sd_save() to store extension data into stat-data
54014+
54015+ Implementation is in fs/reiser4/plugin/item/static_stat.c
54016+*/
54017+
54018+/* stat-data extension. Please order this by presumed frequency of use */
54019+typedef enum {
54020+ /* support for light-weight files */
54021+ LIGHT_WEIGHT_STAT,
54022+ /* data required to implement unix stat(2) call. Layout is in
54023+ reiser4_unix_stat. If this is not present, file is light-weight */
54024+ UNIX_STAT,
+	/* this contains an additional set of 32bit [anc]time fields to implement
+	   nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
+	   of this extension is governed by the 32bittimes mount option. */
54028+ LARGE_TIMES_STAT,
54029+ /* stat data has link name included */
54030+ SYMLINK_STAT,
54031+ /* on-disk slots of non-standard plugins for main plugin table
+	   (@reiser4_inode->pset), that is, plugins that cannot be deduced
+	   from file mode bits, for example, aggregation, interpolation etc. */
54034+ PLUGIN_STAT,
54035+ /* this extension contains persistent inode flags. These flags are
+	   single bits: immutable, append-only, etc. Layout is in
54037+ reiser4_flags_stat. */
54038+ FLAGS_STAT,
54039+ /* this extension contains capabilities sets, associated with this
54040+ file. Layout is in reiser4_capabilities_stat */
54041+ CAPABILITIES_STAT,
54042+ /* this extension contains size and public id of the secret key.
54043+ Layout is in reiser4_crypto_stat */
54044+ CRYPTO_STAT,
54045+ /* on-disk slots of non-default plugins for inheritance, which
54046+ are extracted to special plugin table (@reiser4_inode->hset).
54047+ By default, children of the object will inherit plugins from
54048+ its main plugin table (pset). */
54049+ HEIR_STAT,
54050+ LAST_SD_EXTENSION,
54051+ /*
54052+ * init_inode_static_sd() iterates over extension mask until all
+	 * non-zero bits are processed. This means that neither ->present()
+	 * nor ->absent() will be called for stat-data extensions that go
+	 * after the last present extension. But for some basic extensions we
+	 * want either ->absent() or ->present() to be called, because these
+	 * extensions set up something in the inode even when they are not
+	 * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
54059+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
54060+ * ->present(), or ->absent() method will be called, independently of
54061+ * what other extensions are present.
54062+ */
54063+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
54064+} sd_ext_bits;
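+
+/* A minimal sketch (compiled out; not the actual init_inode_static_sd())
+   of how a loader could walk the extension bitmask described above:
+   extensions whose bit is set get ->present(), while extensions up to and
+   including LAST_IMPORTANT_SD_EXTENSION get ->absent() even when their bit
+   is clear. The helper name walk_sd_extensions() and the exact method
+   signatures are illustrative assumptions, not part of this patch. */
+#if 0
+static int walk_sd_extensions(struct inode *inode, __u16 extmask,
+			      char **area, int *len)
+{
+	int bit;
+	int result = 0;
+
+	for (bit = 0; bit < LAST_SD_EXTENSION && result == 0; ++bit) {
+		sd_ext_plugin *sdplug = sd_ext_plugin_by_id(bit);
+
+		if (extmask & (1 << bit)) {
+			/* extension is on disk: parse it */
+			result = sdplug->present(inode, area, len);
+		} else if (bit <= LAST_IMPORTANT_SD_EXTENSION &&
+			   sdplug->absent != NULL) {
+			/* basic extension missing: set up defaults */
+			result = sdplug->absent(inode);
+		}
+	}
+	return result;
+}
+#endif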
54065+
+/* minimal stat-data. This is what allows light-weight files to be supported. */
54067+typedef struct reiser4_stat_data_base {
54068+ /* 0 */ __le16 extmask;
54069+ /* 2 */
54070+} PACKED reiser4_stat_data_base;
54071+
54072+typedef struct reiser4_light_weight_stat {
54073+ /* 0 */ __le16 mode;
54074+ /* 2 */ __le32 nlink;
54075+ /* 6 */ __le64 size;
54076+ /* size in bytes */
54077+ /* 14 */
54078+} PACKED reiser4_light_weight_stat;
54079+
54080+typedef struct reiser4_unix_stat {
54081+ /* owner id */
54082+ /* 0 */ __le32 uid;
54083+ /* group id */
54084+ /* 4 */ __le32 gid;
54085+ /* access time */
54086+ /* 8 */ __le32 atime;
54087+ /* modification time */
54088+ /* 12 */ __le32 mtime;
54089+ /* change time */
54090+ /* 16 */ __le32 ctime;
54091+ union {
54092+ /* minor:major for device files */
54093+ /* 20 */ __le64 rdev;
54094+ /* bytes used by file */
54095+ /* 20 */ __le64 bytes;
54096+ } u;
54097+ /* 28 */
54098+} PACKED reiser4_unix_stat;
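+
+/* Hypothetical sketch (compiled out) of decoding the extension above; the
+   real parsing is done by present_unix_sd() in static_stat.c. The point is
+   that every on-disk field is little-endian, so each access goes through a
+   le*_to_cpu() conversion. */
+#if 0
+static void decode_unix_sd(const reiser4_unix_stat *sd, struct inode *inode)
+{
+	inode->i_uid = le32_to_cpu(sd->uid);
+	inode->i_gid = le32_to_cpu(sd->gid);
+	inode->i_atime.tv_sec = le32_to_cpu(sd->atime);
+	inode->i_mtime.tv_sec = le32_to_cpu(sd->mtime);
+	inode->i_ctime.tv_sec = le32_to_cpu(sd->ctime);
+	/* for regular files the union holds byte usage, not rdev */
+	inode_set_bytes(inode, le64_to_cpu(sd->u.bytes));
+}
+#endif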
54099+
54100+/* symlink stored as part of inode */
54101+typedef struct reiser4_symlink_stat {
54102+ char body[0];
54103+} PACKED reiser4_symlink_stat;
54104+
54105+typedef struct reiser4_plugin_slot {
54106+ /* 0 */ __le16 pset_memb;
54107+ /* 2 */ __le16 id;
+	/* 4 */ /* here the plugin stores its persistent state */
54109+} PACKED reiser4_plugin_slot;
54110+
54111+/* stat-data extension for files with non-standard plugin. */
54112+typedef struct reiser4_plugin_stat {
54113+ /* number of additional plugins, associated with this object */
54114+ /* 0 */ __le16 plugins_no;
54115+ /* 2 */ reiser4_plugin_slot slot[0];
54116+ /* 2 */
54117+} PACKED reiser4_plugin_stat;
54118+
54119+/* stat-data extension for inode flags. Currently it is just fixed-width 32
+ * bit mask. If the need arises, this can be replaced with a variable width
54121+ * bitmask. */
54122+typedef struct reiser4_flags_stat {
54123+ /* 0 */ __le32 flags;
54124+ /* 4 */
54125+} PACKED reiser4_flags_stat;
54126+
54127+typedef struct reiser4_capabilities_stat {
54128+ /* 0 */ __le32 effective;
+	/* 4 */ __le32 permitted;
+	/* 8 */
54131+} PACKED reiser4_capabilities_stat;
54132+
54133+typedef struct reiser4_cluster_stat {
+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
54135+ /* 0 */ d8 cluster_shift;
54136+ /* 1 */
54137+} PACKED reiser4_cluster_stat;
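+
+/* For illustration: with the definition above, a cryptcompress object whose
+   cluster_shift is 2 has a logical cluster of PAGE_CACHE_SIZE << 2, i.e.
+   16K on a machine with 4K pages. A trivial helper (an assumption, not part
+   of this patch) would be: */
+#if 0
+static inline size_t sd_cluster_size(const reiser4_cluster_stat *cs)
+{
+	return PAGE_CACHE_SIZE << cs->cluster_shift;
+}
+#endif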
54138+
54139+typedef struct reiser4_crypto_stat {
54140+ /* secret key size, bits */
54141+ /* 0 */ d16 keysize;
54142+ /* secret key id */
54143+ /* 2 */ d8 keyid[0];
54144+ /* 2 */
54145+} PACKED reiser4_crypto_stat;
54146+
54147+typedef struct reiser4_large_times_stat {
54148+ /* access time */
54149+ /* 0 */ d32 atime;
54150+ /* modification time */
54151+ /* 4 */ d32 mtime;
54152+ /* change time */
54153+ /* 8 */ d32 ctime;
54154+ /* 12 */
54155+} PACKED reiser4_large_times_stat;
54156+
54157+/* this structure is filled by sd_item_stat */
54158+typedef struct sd_stat {
54159+ int dirs;
54160+ int files;
54161+ int others;
54162+} sd_stat;
54163+
54164+/* plugin->item.common.* */
54165+extern void print_sd(const char *prefix, coord_t * coord);
54166+extern void item_stat_static_sd(const coord_t * coord, void *vp);
54167+
54168+/* plugin->item.s.sd.* */
54169+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54170+extern int save_len_static_sd(struct inode *inode);
54171+extern int save_static_sd(struct inode *inode, char **area);
54172+
54173+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54174+#endif
54175+
54176+/* Make Linus happy.
54177+ Local variables:
54178+ c-indentation-style: "K&R"
54179+ mode-name: "LC"
54180+ c-basic-offset: 8
54181+ tab-width: 8
54182+ fill-column: 120
54183+ End:
54184+*/
54185diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/tail.c linux-2.6.20/fs/reiser4/plugin/item/tail.c
54186--- linux-2.6.20.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
54187+++ linux-2.6.20/fs/reiser4/plugin/item/tail.c 2007-05-06 14:50:43.823014469 +0400
54188@@ -0,0 +1,812 @@
54189+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54190+
54191+#include "item.h"
54192+#include "../../inode.h"
54193+#include "../../page_cache.h"
54194+#include "../../carry.h"
54195+#include "../../vfs_ops.h"
54196+
54197+#include <linux/quotaops.h>
54198+#include <asm/uaccess.h>
54199+#include <linux/swap.h>
54200+#include <linux/writeback.h>
54201+
54202+/* plugin->u.item.b.max_key_inside */
54203+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54204+{
54205+ item_key_by_coord(coord, key);
54206+ set_key_offset(key, get_key_offset(reiser4_max_key()));
54207+ return key;
54208+}
54209+
54210+/* plugin->u.item.b.can_contain_key */
54211+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54212+ const reiser4_item_data *data)
54213+{
54214+ reiser4_key item_key;
54215+
54216+ if (item_plugin_by_coord(coord) != data->iplug)
54217+ return 0;
54218+
54219+ item_key_by_coord(coord, &item_key);
54220+ if (get_key_locality(key) != get_key_locality(&item_key) ||
54221+ get_key_objectid(key) != get_key_objectid(&item_key))
54222+ return 0;
54223+
54224+ return 1;
54225+}
54226+
54227+/* plugin->u.item.b.mergeable
54228+ first item is of tail type */
54229+/* Audited by: green(2002.06.14) */
54230+int mergeable_tail(const coord_t *p1, const coord_t *p2)
54231+{
54232+ reiser4_key key1, key2;
54233+
54234+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
54235+ UNIX_FILE_METADATA_ITEM_TYPE));
54236+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54237+
54238+ if (item_id_by_coord(p2) != FORMATTING_ID) {
54239+ /* second item is of another type */
54240+ return 0;
54241+ }
54242+
54243+ item_key_by_coord(p1, &key1);
54244+ item_key_by_coord(p2, &key2);
54245+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
54246+ get_key_objectid(&key1) != get_key_objectid(&key2)
54247+ || get_key_type(&key1) != get_key_type(&key2)) {
54248+ /* items of different objects */
54249+ return 0;
54250+ }
54251+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54252+ /* not adjacent items */
54253+ return 0;
54254+ }
54255+ return 1;
54256+}
54257+
54258+/* plugin->u.item.b.print
54259+ plugin->u.item.b.check */
54260+
54261+/* plugin->u.item.b.nr_units */
54262+pos_in_node_t nr_units_tail(const coord_t * coord)
54263+{
54264+ return item_length_by_coord(coord);
54265+}
54266+
54267+/* plugin->u.item.b.lookup */
54268+lookup_result
54269+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54270+{
54271+ reiser4_key item_key;
54272+ __u64 lookuped, offset;
54273+ unsigned nr_units;
54274+
+	offset = get_key_offset(item_key_by_coord(coord, &item_key));
54277+ nr_units = nr_units_tail(coord);
54278+
54279+ /* key we are looking for must be greater than key of item @coord */
54280+ assert("vs-416", keygt(key, &item_key));
54281+
54282+ /* offset we are looking for */
54283+ lookuped = get_key_offset(key);
54284+
54285+ if (lookuped >= offset && lookuped < offset + nr_units) {
54286+ /* byte we are looking for is in this item */
54287+ coord->unit_pos = lookuped - offset;
54288+ coord->between = AT_UNIT;
54289+ return CBK_COORD_FOUND;
54290+ }
54291+
54292+ /* set coord after last unit */
54293+ coord->unit_pos = nr_units - 1;
54294+ coord->between = AFTER_UNIT;
54295+ return bias ==
54296+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54297+}
54298+
54299+/* plugin->u.item.b.paste */
54300+int
54301+paste_tail(coord_t *coord, reiser4_item_data *data,
54302+ carry_plugin_info *info UNUSED_ARG)
54303+{
54304+ unsigned old_item_length;
54305+ char *item;
54306+
54307+ /* length the item had before resizing has been performed */
54308+ old_item_length = item_length_by_coord(coord) - data->length;
54309+
54310+ /* tail items never get pasted in the middle */
54311+ assert("vs-363",
54312+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54313+ (coord->unit_pos == old_item_length - 1 &&
54314+ coord->between == AFTER_UNIT) ||
54315+ (coord->unit_pos == 0 && old_item_length == 0
54316+ && coord->between == AT_UNIT));
54317+
54318+ item = item_body_by_coord(coord);
54319+ if (coord->unit_pos == 0)
54320+ /* make space for pasted data when pasting at the beginning of
54321+ the item */
54322+ memmove(item + data->length, item, old_item_length);
54323+
54324+ if (coord->between == AFTER_UNIT)
54325+ coord->unit_pos++;
54326+
54327+ if (data->data) {
54328+ assert("vs-554", data->user == 0 || data->user == 1);
54329+ if (data->user) {
54330+ assert("nikita-3035", reiser4_schedulable());
54331+ /* copy from user space */
54332+ if (__copy_from_user(item + coord->unit_pos,
54333+ (const char __user *)data->data,
54334+ (unsigned)data->length))
54335+ return RETERR(-EFAULT);
54336+ } else
54337+ /* copy from kernel space */
54338+ memcpy(item + coord->unit_pos, data->data,
54339+ (unsigned)data->length);
54340+ } else {
54341+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
54342+ }
54343+ return 0;
54344+}
54345+
54346+/* plugin->u.item.b.fast_paste */
54347+
54348+/* plugin->u.item.b.can_shift
+ the number of units is returned via the return value, the number of bytes via @size. For
54350+ tail items they coincide */
54351+int
54352+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54353+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54354+ unsigned *size, unsigned want)
54355+{
+	/* make sure that we do not want to shift more than we have */
54357+ assert("vs-364", want > 0
54358+ && want <= (unsigned)item_length_by_coord(source));
54359+
54360+ *size = min(want, free_space);
54361+ return *size;
54362+}
54363+
54364+/* plugin->u.item.b.copy_units */
54365+void
54366+copy_units_tail(coord_t * target, coord_t * source,
54367+ unsigned from, unsigned count,
54368+ shift_direction where_is_free_space,
54369+ unsigned free_space UNUSED_ARG)
54370+{
54371+ /* make sure that item @target is expanded already */
54372+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54373+ assert("vs-370", free_space >= count);
54374+
54375+ if (where_is_free_space == SHIFT_LEFT) {
54376+ /* append item @target with @count first bytes of @source */
54377+ assert("vs-365", from == 0);
54378+
54379+ memcpy((char *)item_body_by_coord(target) +
54380+ item_length_by_coord(target) - count,
54381+ (char *)item_body_by_coord(source), count);
54382+ } else {
54383+ /* target item is moved to right already */
54384+ reiser4_key key;
54385+
54386+ assert("vs-367",
54387+ (unsigned)item_length_by_coord(source) == from + count);
54388+
54389+ memcpy((char *)item_body_by_coord(target),
54390+ (char *)item_body_by_coord(source) + from, count);
54391+
54392+ /* new units are inserted before first unit in an item,
54393+ therefore, we have to update item key */
54394+ item_key_by_coord(source, &key);
54395+ set_key_offset(&key, get_key_offset(&key) + from);
54396+
54397+ node_plugin_by_node(target->node)->update_item_key(target, &key,
54398+ NULL /*info */);
54399+ }
54400+}
54401+
54402+/* plugin->u.item.b.create_hook */
54403+
54404+/* item_plugin->b.kill_hook
+ this is called when @count units starting from the @from-th one are about to be removed
54406+ */
54407+int
54408+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54409+ pos_in_node_t count, struct carry_kill_data *kdata)
54410+{
54411+ reiser4_key key;
54412+ loff_t start, end;
54413+
54414+ assert("vs-1577", kdata);
54415+ assert("vs-1579", kdata->inode);
54416+
54417+ item_key_by_coord(coord, &key);
54418+ start = get_key_offset(&key) + from;
54419+ end = start + count;
54420+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54421+ return 0;
54422+}
54423+
54424+/* plugin->u.item.b.shift_hook */
54425+
54426+/* helper for kill_units_tail and cut_units_tail */
54427+static int
54428+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54429+ reiser4_key * smallest_removed, reiser4_key * new_first)
54430+{
54431+ pos_in_node_t count;
54432+
54433+ /* this method is only called to remove part of item */
54434+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
+	/* tail items are never cut from the middle of an item */
54436+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54437+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54438+
54439+ count = to - from + 1;
54440+
54441+ if (smallest_removed) {
54442+ /* store smallest key removed */
54443+ item_key_by_coord(coord, smallest_removed);
54444+ set_key_offset(smallest_removed,
54445+ get_key_offset(smallest_removed) + from);
54446+ }
54447+ if (new_first) {
54448+ /* head of item is cut */
54449+ assert("vs-1529", from == 0);
54450+
54451+ item_key_by_coord(coord, new_first);
54452+ set_key_offset(new_first,
54453+ get_key_offset(new_first) + from + count);
54454+ }
54455+
54456+ if (REISER4_DEBUG)
54457+ memset((char *)item_body_by_coord(coord) + from, 0, count);
54458+ return count;
54459+}
54460+
54461+/* plugin->u.item.b.cut_units */
54462+int
54463+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54464+ struct carry_cut_data *cdata UNUSED_ARG,
54465+ reiser4_key * smallest_removed, reiser4_key * new_first)
54466+{
54467+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54468+}
54469+
54470+/* plugin->u.item.b.kill_units */
54471+int
54472+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54473+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54474+ reiser4_key * new_first)
54475+{
54476+ kill_hook_tail(coord, from, to - from + 1, kdata);
54477+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54478+}
54479+
54480+/* plugin->u.item.b.unit_key */
54481+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54482+{
54483+ assert("vs-375", coord_is_existing_unit(coord));
54484+
54485+ item_key_by_coord(coord, key);
54486+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54487+
54488+ return key;
54489+}
54490+
54491+/* plugin->u.item.b.estimate
54492+ plugin->u.item.b.item_data_by_flow */
54493+
+/* tail readpage function. It is called from readpage_tail(). */
54495+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54496+{
54497+ tap_t tap;
54498+ int result;
54499+ coord_t coord;
54500+ lock_handle lh;
54501+ int count, mapped;
54502+ struct inode *inode;
54503+ char *pagedata;
54504+
+	/* save the passed coord so that the tap does not move it. */
54506+ init_lh(&lh);
54507+ copy_lh(&lh, uf_coord->lh);
54508+ inode = page->mapping->host;
54509+ coord_dup(&coord, &uf_coord->coord);
54510+
54511+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54512+
54513+ if ((result = reiser4_tap_load(&tap)))
54514+ goto out_tap_done;
54515+
54516+ /* lookup until page is filled up. */
54517+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54518+ /* number of bytes to be copied to page */
54519+ count = item_length_by_coord(&coord) - coord.unit_pos;
54520+ if (count > PAGE_CACHE_SIZE - mapped)
54521+ count = PAGE_CACHE_SIZE - mapped;
54522+
+		/* map @page into kernel address space and get data address */
54524+ pagedata = kmap_atomic(page, KM_USER0);
54525+
54526+ /* copy tail item to page */
54527+ memcpy(pagedata + mapped,
54528+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54529+ count);
54530+ mapped += count;
54531+
54532+ flush_dcache_page(page);
54533+
+		/* unmap page from kernel address space */
54535+ kunmap_atomic(pagedata, KM_USER0);
54536+
54537+ /* Getting next tail item. */
54538+ if (mapped < PAGE_CACHE_SIZE) {
54539+ /*
+			 * unlock page in order to avoid keeping it locked
+			 * during tree lookup, which takes long-term locks
54542+ */
54543+ unlock_page(page);
54544+
54545+ /* getting right neighbour. */
54546+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
54547+
54548+ /* lock page back */
54549+ lock_page(page);
54550+ if (PageUptodate(page)) {
54551+ /*
54552+ * another thread read the page, we have
54553+ * nothing to do
54554+ */
54555+ result = 0;
54556+ goto out_unlock_page;
54557+ }
54558+
54559+ if (result) {
54560+ if (result == -E_NO_NEIGHBOR) {
54561+ /*
+					 * right neighbor is not a formatted
54563+ * node
54564+ */
54565+ result = 0;
54566+ goto done;
54567+ } else {
54568+ goto out_tap_relse;
54569+ }
54570+ } else {
54571+ if (!inode_file_plugin(inode)->
54572+ owns_item(inode, &coord)) {
54573+ /* item of another file is found */
54574+ result = 0;
54575+ goto done;
54576+ }
54577+ }
54578+ }
54579+ }
54580+
54581+ done:
54582+ if (mapped != PAGE_CACHE_SIZE) {
54583+ pagedata = kmap_atomic(page, KM_USER0);
54584+ memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
54585+ flush_dcache_page(page);
54586+ kunmap_atomic(pagedata, KM_USER0);
54587+ }
54588+ SetPageUptodate(page);
54589+ out_unlock_page:
54590+ unlock_page(page);
54591+ out_tap_relse:
54592+ reiser4_tap_relse(&tap);
54593+ out_tap_done:
54594+ reiser4_tap_done(&tap);
54595+ return result;
54596+}
54597+
54598+/*
54599+ plugin->s.file.readpage
54600+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54601+ or
+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
+
+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to an existing unit inside the
+ tail item. */
54606+int readpage_tail(void *vp, struct page *page)
54607+{
54608+ uf_coord_t *uf_coord = vp;
54609+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
54610+ ON_DEBUG(reiser4_key key);
54611+
54612+ assert("umka-2515", PageLocked(page));
54613+ assert("umka-2516", !PageUptodate(page));
54614+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
54615+ assert("umka-2518", page->mapping && page->mapping->host);
54616+
54617+ assert("umka-2519", znode_is_loaded(coord->node));
54618+ assert("umka-2520", item_is_tail(coord));
54619+ assert("umka-2521", coord_is_existing_unit(coord));
54620+ assert("umka-2522", znode_is_rlocked(coord->node));
54621+ assert("umka-2523",
54622+ page->mapping->host->i_ino ==
54623+ get_key_objectid(item_key_by_coord(coord, &key)));
54624+
54625+ return do_readpage_tail(uf_coord, page);
54626+}
54627+
54628+/**
54629+ * overwrite_tail
54630+ * @flow:
54631+ * @coord:
54632+ *
54633+ * Overwrites tail item or its part by user data. Returns number of bytes
54634+ * written or error code.
54635+ */
54636+static int overwrite_tail(flow_t *flow, coord_t *coord)
54637+{
54638+ unsigned count;
54639+
54640+ assert("vs-570", flow->user == 1);
54641+ assert("vs-946", flow->data);
54642+ assert("vs-947", coord_is_existing_unit(coord));
54643+ assert("vs-948", znode_is_write_locked(coord->node));
54644+ assert("nikita-3036", reiser4_schedulable());
54645+
54646+ count = item_length_by_coord(coord) - coord->unit_pos;
54647+ if (count > flow->length)
54648+ count = flow->length;
54649+
54650+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
54651+ (const char __user *)flow->data, count))
54652+ return RETERR(-EFAULT);
54653+
54654+ znode_make_dirty(coord->node);
54655+ return count;
54656+}
54657+
54658+/**
54659+ * insert_first_tail
54660+ * @inode:
54661+ * @flow:
54662+ * @coord:
54663+ * @lh:
54664+ *
54665+ * Returns number of bytes written or error code.
54666+ */
54667+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
54668+ coord_t *coord, lock_handle *lh)
54669+{
54670+ int result;
54671+ loff_t to_write;
54672+ unix_file_info_t *uf_info;
54673+
54674+ if (get_key_offset(&flow->key) != 0) {
54675+ /*
+		 * file is empty and the write starts past the beginning
+		 * of the file. Create a hole at its beginning. On success
+		 * insert_flow returns 0 as the number of written bytes,
+		 * which is what we have to return when padding a file with holes
54680+ */
54681+ flow->data = NULL;
54682+ flow->length = get_key_offset(&flow->key);
54683+ set_key_offset(&flow->key, 0);
54684+ /*
+		 * holes in files built of tails are stored as if they were
+		 * real data consisting of all zeros. Therefore we have to
54687+ * allocate quota here as well
54688+ */
54689+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54690+ return RETERR(-EDQUOT);
54691+ result = reiser4_insert_flow(coord, lh, flow);
54692+ if (flow->length)
54693+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54694+
54695+ uf_info = unix_file_inode_data(inode);
54696+
54697+ /*
54698+ * first item insertion is only possible when writing to empty
54699+ * file or performing tail conversion
54700+ */
54701+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
54702+ (reiser4_inode_get_flag(inode,
54703+ REISER4_PART_MIXED) &&
54704+ reiser4_inode_get_flag(inode,
54705+ REISER4_PART_IN_CONV))));
54706+ /* if file was empty - update its state */
54707+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
54708+ uf_info->container = UF_CONTAINER_TAILS;
54709+ return result;
54710+ }
54711+
54712+ /* check quota before appending data */
54713+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54714+ return RETERR(-EDQUOT);
54715+
54716+ to_write = flow->length;
54717+ result = reiser4_insert_flow(coord, lh, flow);
54718+ if (flow->length)
54719+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54720+ return (to_write - flow->length) ? (to_write - flow->length) : result;
54721+}
54722+
54723+/**
54724+ * append_tail
54725+ * @inode:
54726+ * @flow:
54727+ * @coord:
54728+ * @lh:
54729+ *
54730+ * Returns number of bytes written or error code.
54731+ */
54732+static ssize_t append_tail(struct inode *inode,
54733+ flow_t *flow, coord_t *coord, lock_handle *lh)
54734+{
54735+ int result;
54736+ reiser4_key append_key;
54737+ loff_t to_write;
54738+
54739+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
54740+ flow->data = NULL;
54741+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
54742+ set_key_offset(&flow->key, get_key_offset(&append_key));
54743+ /*
+		 * holes in files built of tails are stored as if they were
+		 * real data consisting of all zeros. Therefore we have to
54746+ * allocate quota here as well
54747+ */
54748+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54749+ return RETERR(-EDQUOT);
54750+ result = reiser4_insert_flow(coord, lh, flow);
54751+ if (flow->length)
54752+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54753+ return result;
54754+ }
54755+
54756+ /* check quota before appending data */
54757+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54758+ return RETERR(-EDQUOT);
54759+
54760+ to_write = flow->length;
54761+ result = reiser4_insert_flow(coord, lh, flow);
54762+ if (flow->length)
54763+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54764+ return (to_write - flow->length) ? (to_write - flow->length) : result;
54765+}
54766+
54767+/**
+ * write_extent_reserve_space - reserve space for tail write operation
54769+ * @inode:
54770+ *
54771+ * Estimates and reserves space which may be required for writing one flow to a
54772+ * file
54773+ */
54774+static int write_extent_reserve_space(struct inode *inode)
54775+{
54776+ __u64 count;
54777+ reiser4_tree *tree;
54778+
54779+ /*
+	 * to write one flow to a file built of tails we have to reserve disk
+	 * space for:
+	 *
+	 * 1. find_file_item may have to insert an empty node into the tree (an
+	 * empty leaf node between two extent items). This requires 1 block
+	 * plus the number of blocks necessary to insert an internal item at
+	 * the twig level.
54786+ *
54787+ * 2. flow insertion
54788+ *
54789+ * 3. stat data update
54790+ */
54791+ tree = reiser4_tree_by_inode(inode);
54792+ count = estimate_one_insert_item(tree) +
54793+ estimate_insert_flow(tree->height) +
54794+ estimate_one_insert_item(tree);
54795+ grab_space_enable();
54796+ return reiser4_grab_space(count, 0 /* flags */);
54797+}
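+
+/* A note on the estimate above (a reading of the code, not text from the
+   original patch): both estimate_one_insert_item() calls and
+   estimate_insert_flow() return worst-case block counts, so the grabbed
+   reservation is an upper bound and the write will usually dirty fewer
+   blocks than reserved. */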
54798+
54799+#define PAGE_PER_FLOW 4
54800+
54801+static loff_t faultin_user_pages(const char __user *buf, size_t count)
54802+{
54803+ loff_t faulted;
54804+ int to_fault;
54805+
54806+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
54807+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
54808+ faulted = 0;
54809+ while (count > 0) {
54810+ to_fault = PAGE_CACHE_SIZE;
54811+ if (count < to_fault)
54812+ to_fault = count;
54813+ fault_in_pages_readable(buf + faulted, to_fault);
54814+ count -= to_fault;
54815+ faulted += to_fault;
54816+ }
54817+ return faulted;
54818+}
54819+
54820+/**
+ * reiser4_write_tail - write method of tail item plugin
54822+ * @file: file to write to
54823+ * @buf: address of user-space buffer
54824+ * @count: number of bytes to write
54825+ * @pos: position in file to write to
54826+ *
54827+ * Returns number of written bytes or error code.
54828+ */
54829+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
54830+ size_t count, loff_t *pos)
54831+{
54832+ struct inode *inode;
54833+ struct hint hint;
54834+ int result;
54835+ flow_t flow;
54836+ coord_t *coord;
54837+ lock_handle *lh;
54838+ znode *loaded;
54839+
54840+ inode = file->f_dentry->d_inode;
54841+
54842+ if (write_extent_reserve_space(inode))
54843+ return RETERR(-ENOSPC);
54844+
54845+ result = load_file_hint(file, &hint);
54846+ BUG_ON(result != 0);
54847+
54848+ flow.length = faultin_user_pages(buf, count);
54849+ flow.user = 1;
54850+ memcpy(&flow.data, &buf, sizeof(buf));
54851+ flow.op = WRITE_OP;
54852+ key_by_inode_and_offset_common(inode, *pos, &flow.key);
54853+
54854+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
54855+ if (IS_CBKERR(result))
54856+ return result;
54857+
54858+ coord = &hint.ext_coord.coord;
54859+ lh = hint.ext_coord.lh;
54860+
54861+ result = zload(coord->node);
54862+ BUG_ON(result != 0);
54863+ loaded = coord->node;
54864+
54865+ if (coord->between == AFTER_UNIT) {
54866+ /* append with data or hole */
54867+ result = append_tail(inode, &flow, coord, lh);
54868+ } else if (coord->between == AT_UNIT) {
54869+ /* overwrite */
54870+ result = overwrite_tail(&flow, coord);
54871+ } else {
54872+ /* no items of this file yet. insert data or hole */
54873+ result = insert_first_tail(inode, &flow, coord, lh);
54874+ }
54875+ zrelse(loaded);
54876+ if (result < 0) {
54877+ done_lh(lh);
54878+ return result;
54879+ }
54880+
+	/* unlock znode. ext_coord.valid is forced to 0 here, so the hint can
+	   never be sealed at this point; always drop it */
+	hint.ext_coord.valid = 0;
+	reiser4_unset_hint(&hint);
54887+
54888+ save_file_hint(file, &hint);
54889+ return result;
54890+}
54891+
54892+#if REISER4_DEBUG
54893+
54894+static int
54895+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
54896+{
54897+ reiser4_key item_key;
54898+
54899+ assert("vs-1356", coord_is_existing_unit(coord));
54900+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
54901+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
54902+ return get_key_offset(key) ==
54903+ get_key_offset(&item_key) + coord->unit_pos;
54904+
54905+}
54906+
54907+#endif
54908+
54909+/* plugin->u.item.s.file.read */
54910+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
54911+{
54912+ unsigned count;
54913+ int item_length;
54914+ coord_t *coord;
54915+ uf_coord_t *uf_coord;
54916+
54917+ uf_coord = &hint->ext_coord;
54918+ coord = &uf_coord->coord;
54919+
54920+ assert("vs-571", f->user == 1);
54921+ assert("vs-571", f->data);
54922+ assert("vs-967", coord && coord->node);
54923+ assert("vs-1117", znode_is_rlocked(coord->node));
54924+ assert("vs-1118", znode_is_loaded(coord->node));
54925+
54926+ assert("nikita-3037", reiser4_schedulable());
54927+ assert("vs-1357", coord_matches_key_tail(coord, &f->key));
54928+
54929+ /* calculate number of bytes to read off the item */
54930+ item_length = item_length_by_coord(coord);
54931+ count = item_length_by_coord(coord) - coord->unit_pos;
54932+ if (count > f->length)
54933+ count = f->length;
54934+
54935+ /* user page has to be brought in so that major page fault does not
+	 * occur here while a long-term lock is held */
54937+ if (__copy_to_user((char __user *)f->data,
54938+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
54939+ count))
54940+ return RETERR(-EFAULT);
54941+
54942+ /* probably mark_page_accessed() should only be called if
54943+ * coord->unit_pos is zero. */
54944+ mark_page_accessed(znode_page(coord->node));
54945+ move_flow_forward(f, count);
54946+
54947+ coord->unit_pos += count;
54948+ if (item_length == coord->unit_pos) {
54949+ coord->unit_pos--;
54950+ coord->between = AFTER_UNIT;
54951+ }
54952+
54953+ return 0;
54954+}
54955+
54956+/*
54957+ plugin->u.item.s.file.append_key
+ key of the first byte past the last byte addressed by this item
54959+*/
54960+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
54961+{
54962+ item_key_by_coord(coord, key);
54963+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
54964+ return key;
54965+}
54966+
54967+/* plugin->u.item.s.file.init_coord_extension */
54968+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
54969+{
54970+ uf_coord->valid = 1;
54971+}
54972+
54973+/*
54974+ plugin->u.item.s.file.get_block
54975+*/
54976+int
54977+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
54978+{
54979+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
54980+
54981+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
+		/* if the node hasn't obtained its block number yet, return 0.
+		 * Let's avoid upsetting users with cosmic numbers beyond
+		 * the device capacity. */
54985+ *block = 0;
54986+ else
54987+ *block = *znode_get_block(coord->node);
54988+ return 0;
54989+}
54990+
54991+/*
54992+ * Local variables:
54993+ * c-indentation-style: "K&R"
54994+ * mode-name: "LC"
54995+ * c-basic-offset: 8
54996+ * tab-width: 8
54997+ * fill-column: 79
54998+ * scroll-step: 1
54999+ * End:
55000+ */
55001diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/tail.h linux-2.6.20/fs/reiser4/plugin/item/tail.h
55002--- linux-2.6.20.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
55003+++ linux-2.6.20/fs/reiser4/plugin/item/tail.h 2007-05-06 14:50:43.827015719 +0400
55004@@ -0,0 +1,58 @@
55005+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55006+
55007+#if !defined( __REISER4_TAIL_H__ )
55008+#define __REISER4_TAIL_H__
55009+
55010+typedef struct {
55011+ int not_used;
55012+} tail_coord_extension_t;
55013+
55014+struct cut_list;
55015+
55016+/* plugin->u.item.b.* */
55017+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55018+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55019+ const reiser4_item_data *);
55020+int mergeable_tail(const coord_t * p1, const coord_t * p2);
55021+pos_in_node_t nr_units_tail(const coord_t *);
55022+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55023+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55024+int can_shift_tail(unsigned free_space, coord_t * source,
55025+ znode * target, shift_direction, unsigned *size,
55026+ unsigned want);
55027+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55028+ unsigned count, shift_direction, unsigned free_space);
55029+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55030+ struct carry_kill_data *);
55031+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55032+ struct carry_cut_data *, reiser4_key * smallest_removed,
55033+ reiser4_key * new_first);
55034+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55035+ struct carry_kill_data *, reiser4_key * smallest_removed,
55036+ reiser4_key * new_first);
55037+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55038+
55039+/* plugin->u.item.s.* */
55040+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
55041+ size_t count, loff_t *pos);
55042+int reiser4_read_tail(struct file *, flow_t *, hint_t *);
55043+int readpage_tail(void *vp, struct page *page);
55044+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55045+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55046+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55047+int item_balance_dirty_pages(struct address_space *, const flow_t *,
55048+ hint_t *, int back_to_dirty, int set_hint);
55049+
55050+/* __REISER4_TAIL_H__ */
55051+#endif
55052+
55053+/* Make Linus happy.
55054+ Local variables:
55055+ c-indentation-style: "K&R"
55056+ mode-name: "LC"
55057+ c-basic-offset: 8
55058+ tab-width: 8
55059+ fill-column: 120
55060+ scroll-step: 1
55061+ End:
55062+*/
55063diff -urN linux-2.6.20.orig/fs/reiser4/plugin/Makefile linux-2.6.20/fs/reiser4/plugin/Makefile
55064--- linux-2.6.20.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
55065+++ linux-2.6.20/fs/reiser4/plugin/Makefile 2007-05-06 14:50:43.827015719 +0400
55066@@ -0,0 +1,26 @@
55067+obj-$(CONFIG_REISER4_FS) += plugins.o
55068+
55069+plugins-objs := \
55070+ plugin.o \
55071+ plugin_set.o \
55072+ object.o \
55073+ inode_ops.o \
55074+ inode_ops_rename.o \
55075+ file_ops.o \
55076+ file_ops_readdir.o \
55077+ file_plugin_common.o \
55078+ dir_plugin_common.o \
55079+ digest.o \
55080+ hash.o \
55081+ fibration.o \
55082+ tail_policy.o \
55083+ regular.o
55084+
55085+obj-$(CONFIG_REISER4_FS) += item/
55086+obj-$(CONFIG_REISER4_FS) += file/
55087+obj-$(CONFIG_REISER4_FS) += dir/
55088+obj-$(CONFIG_REISER4_FS) += node/
55089+obj-$(CONFIG_REISER4_FS) += compress/
55090+obj-$(CONFIG_REISER4_FS) += space/
55091+obj-$(CONFIG_REISER4_FS) += disk_format/
55092+obj-$(CONFIG_REISER4_FS) += security/
55093diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/Makefile linux-2.6.20/fs/reiser4/plugin/node/Makefile
55094--- linux-2.6.20.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
55095+++ linux-2.6.20/fs/reiser4/plugin/node/Makefile 2007-05-06 14:50:43.827015719 +0400
55096@@ -0,0 +1,5 @@
55097+obj-$(CONFIG_REISER4_FS) += node_plugins.o
55098+
55099+node_plugins-objs := \
55100+ node.o \
55101+ node40.o
55102diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node40.c linux-2.6.20/fs/reiser4/plugin/node/node40.c
55103--- linux-2.6.20.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
55104+++ linux-2.6.20/fs/reiser4/plugin/node/node40.c 2007-05-06 14:50:43.831016969 +0400
55105@@ -0,0 +1,2924 @@
55106+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55107+
55108+#include "../../debug.h"
55109+#include "../../key.h"
55110+#include "../../coord.h"
55111+#include "../plugin_header.h"
55112+#include "../item/item.h"
55113+#include "node.h"
55114+#include "node40.h"
55115+#include "../plugin.h"
55116+#include "../../jnode.h"
55117+#include "../../znode.h"
55118+#include "../../pool.h"
55119+#include "../../carry.h"
55120+#include "../../tap.h"
55121+#include "../../tree.h"
55122+#include "../../super.h"
55123+#include "../../reiser4.h"
55124+
55125+#include <asm/uaccess.h>
55126+#include <linux/types.h>
55127+#include <linux/prefetch.h>
55128+
+/* leaf 40 format:
+
+   [ node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .., item_head 1, item_head 0 ]
+
+   node header fields:           item_head fields:
+    plugin_id (16)                key
+    free_space (16)               plugin_id (16)
+    free_space_start (16)         offset (16)
+    level (8)
+    num_items (16)
+    magic (32)
+    flush_time (32)
+*/
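+
+/* A worked consequence of this layout (restating what the code below does,
+   not new patch content): item bodies grow left-to-right after the node
+   header, item headers grow right-to-left from the end of the node, and an
+   item's length is never stored explicitly. It is recovered from adjacent
+   header offsets, as in length_by_coord_node40():
+
+	len(i) = offset(i + 1) - offset(i)           for i < num_items - 1
+	len(last) = free_space_start - offset(last)
+*/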
55140+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
55141+/* magic number that is stored in ->magic field of node header */
55142+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
55143+
55144+static int prepare_for_update(znode * left, znode * right,
55145+ carry_plugin_info * info);
55146+
+/* the header of a reiser40-format node is at the beginning of the node */
55148+static inline node40_header *node40_node_header(const znode * node /* node to
55149+ * query */ )
55150+{
55151+ assert("nikita-567", node != NULL);
55152+ assert("nikita-568", znode_page(node) != NULL);
55153+ assert("nikita-569", zdata(node) != NULL);
55154+ return (node40_header *) zdata(node);
55155+}
55156+
55157+/* functions to get/set fields of node40_header */
55158+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
55159+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
55160+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
55161+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
55162+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
55163+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
55164+
55165+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
55166+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
55167+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
55168+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
55169+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
55170+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
55171+
55172+/* plugin field of node header should be read/set by
55173+ plugin_by_disk_id/save_disk_plugin */
55174+
55175+/* array of item headers is at the end of node */
55176+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
55177+{
55178+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
55179+}
55180+
55181+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
55182+ */
55183+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
55184+{
55185+ return (item_header40 *) (zdata(coord->node) +
55186+ znode_size(coord->node)) - (coord->item_pos) -
55187+ 1;
55188+}
55189+
55190+/* functions to get/set fields of item_header40 */
55191+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
55192+
55193+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
55194+
55195+/* plugin field of item header should be read/set by
55196+ plugin_by_disk_id/save_disk_plugin */
55197+
55198+/* plugin methods */
55199+
55200+/* plugin->u.node.item_overhead
55201+ look for description of this method in plugin/node/node.h */
55202+size_t
55203+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
55204+{
55205+ return sizeof(item_header40);
55206+}
55207+
55208+/* plugin->u.node.free_space
55209+ look for description of this method in plugin/node/node.h */
55210+size_t free_space_node40(znode * node)
55211+{
55212+ assert("nikita-577", node != NULL);
55213+ assert("nikita-578", znode_is_loaded(node));
55214+ assert("nikita-579", zdata(node) != NULL);
55215+
55216+ return nh40_get_free_space(node40_node_header(node));
55217+}
55218+
55219+/* private inline version of node40_num_of_items() for use in this file. This
+   is necessary because the address of node40_num_of_items() is taken, and
+   as a result it is never inlined. */
55222+static inline short node40_num_of_items_internal(const znode * node)
55223+{
55224+ return nh40_get_num_items(node40_node_header(node));
55225+}
55226+
55227+#if REISER4_DEBUG
55228+static inline void check_num_items(const znode * node)
55229+{
55230+ assert("nikita-2749",
55231+ node40_num_of_items_internal(node) == node->nr_items);
55232+ assert("nikita-2746", znode_is_write_locked(node));
55233+}
55234+#else
55235+#define check_num_items(node) noop
55236+#endif
55237+
55238+/* plugin->u.node.num_of_items
55239+ look for description of this method in plugin/node/node.h */
55240+int num_of_items_node40(const znode * node)
55241+{
55242+ return node40_num_of_items_internal(node);
55243+}
55244+
55245+static void
55246+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
55247+{
55248+ assert("nikita-2751", node != NULL);
55249+ assert("nikita-2750", nh == node40_node_header(node));
55250+
55251+ check_num_items(node);
55252+ nh40_set_num_items(nh, value);
55253+ node->nr_items = value;
55254+ check_num_items(node);
55255+}
55256+
55257+/* plugin->u.node.item_by_coord
55258+ look for description of this method in plugin/node/node.h */
55259+char *item_by_coord_node40(const coord_t * coord)
55260+{
55261+ item_header40 *ih;
55262+ char *p;
55263+
55264+ /* @coord is set to existing item */
55265+ assert("nikita-596", coord != NULL);
55266+ assert("vs-255", coord_is_existing_item(coord));
55267+
55268+ ih = node40_ih_at_coord(coord);
55269+ p = zdata(coord->node) + ih40_get_offset(ih);
55270+ return p;
55271+}
55272+
55273+/* plugin->u.node.length_by_coord
55274+ look for description of this method in plugin/node/node.h */
55275+int length_by_coord_node40(const coord_t * coord)
55276+{
55277+ item_header40 *ih;
55278+ int result;
55279+
55280+ /* @coord is set to existing item */
55281+ assert("vs-256", coord != NULL);
55282+ assert("vs-257", coord_is_existing_item(coord));
55283+
55284+ ih = node40_ih_at_coord(coord);
55285+ if ((int)coord->item_pos ==
55286+ node40_num_of_items_internal(coord->node) - 1)
55287+ result =
55288+ nh40_get_free_space_start(node40_node_header(coord->node)) -
55289+ ih40_get_offset(ih);
55290+ else
55291+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55292+
55293+ return result;
55294+}
55295+
55296+static pos_in_node_t
55297+node40_item_length(const znode * node, pos_in_node_t item_pos)
55298+{
55299+ item_header40 *ih;
55300+ pos_in_node_t result;
55301+
55302+ /* @coord is set to existing item */
55303+ assert("vs-256", node != NULL);
55304+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
55305+
55306+ ih = node40_ih_at(node, item_pos);
55307+ if (item_pos == node40_num_of_items_internal(node) - 1)
55308+ result =
55309+ nh40_get_free_space_start(node40_node_header(node)) -
55310+ ih40_get_offset(ih);
55311+ else
55312+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55313+
55314+ return result;
55315+}
55316+
55317+/* plugin->u.node.plugin_by_coord
55318+ look for description of this method in plugin/node/node.h */
55319+item_plugin *plugin_by_coord_node40(const coord_t * coord)
55320+{
55321+ item_header40 *ih;
55322+ item_plugin *result;
55323+
55324+ /* @coord is set to existing item */
55325+ assert("vs-258", coord != NULL);
55326+ assert("vs-259", coord_is_existing_item(coord));
55327+
55328+ ih = node40_ih_at_coord(coord);
+	/* pass NULL instead of the current tree. This is a time-critical call. */
55330+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
55331+ return result;
55332+}
55333+
55334+/* plugin->u.node.key_at
55335+ look for description of this method in plugin/node/node.h */
55336+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
55337+{
55338+ item_header40 *ih;
55339+
55340+ assert("nikita-1765", coord_is_existing_item(coord));
55341+
55342+ /* @coord is set to existing item */
55343+ ih = node40_ih_at_coord(coord);
55344+ memcpy(key, &ih->key, sizeof(reiser4_key));
55345+ return key;
55346+}
55347+
55348+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
55349+
55350+#define NODE_INCSTAT(n, counter) \
55351+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
55352+
55353+#define NODE_ADDSTAT(n, counter, val) \
55354+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
55355+
55356+/* plugin->u.node.lookup
55357+ look for description of this method in plugin/node/node.h */
55358+node_search_result lookup_node40(znode * node /* node to query */ ,
55359+ const reiser4_key * key /* key to look for */ ,
55360+ lookup_bias bias /* search bias */ ,
55361+ coord_t * coord /* resulting coord */ )
55362+{
55363+ int left;
55364+ int right;
55365+ int found;
55366+ int items;
55367+
55368+ item_header40 *lefth;
55369+ item_header40 *righth;
55370+
55371+ item_plugin *iplug;
55372+ item_header40 *bstop;
55373+ item_header40 *ih;
55374+ cmp_t order;
55375+
55376+ assert("nikita-583", node != NULL);
55377+ assert("nikita-584", key != NULL);
55378+ assert("nikita-585", coord != NULL);
55379+ assert("nikita-2693", znode_is_any_locked(node));
55380+ cassert(REISER4_SEQ_SEARCH_BREAK > 2);
55381+
55382+ items = node_num_items(node);
55383+
55384+ if (unlikely(items == 0)) {
55385+ coord_init_first_unit(coord, node);
55386+ return NS_NOT_FOUND;
55387+ }
55388+
55389+ /* binary search for item that can contain given key */
55390+ left = 0;
55391+ right = items - 1;
55392+ coord->node = node;
55393+ coord_clear_iplug(coord);
55394+ found = 0;
55395+
55396+ lefth = node40_ih_at(node, left);
55397+ righth = node40_ih_at(node, right);
55398+
55399+ /* It is known that for small arrays sequential search is on average
+	   more efficient than binary. This is because sequential search is
+	   coded as a tight loop that can be better optimized by compilers,
+	   and for small array sizes the gain from this optimization makes
+	   sequential search the winner. Another, maybe more important,
+	   reason is that sequential scanning is more CPU-cache friendly,
+	   whereas binary search effectively defeats CPU caching.
55406+
55407+ Critical here is the notion of "smallness". Reasonable value of
55408+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
55409+ fs/reiser4/ulevel/ulevel.c:test_search().
55410+
+	   Don't try to further optimize sequential search by scanning from
+	   right to left in an attempt to use a more efficient loop
+	   termination condition (comparison with 0). This doesn't work.
55414+
55415+ */
55416+
55417+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
55418+ int median;
55419+ item_header40 *medianh;
55420+
55421+ median = (left + right) / 2;
55422+ medianh = node40_ih_at(node, median);
55423+
55424+ assert("nikita-1084", median >= 0);
55425+ assert("nikita-1085", median < items);
55426+ switch (keycmp(key, &medianh->key)) {
55427+ case LESS_THAN:
55428+ right = median;
55429+ righth = medianh;
55430+ break;
55431+ default:
55432+ wrong_return_value("nikita-586", "keycmp");
55433+ case GREATER_THAN:
55434+ left = median;
55435+ lefth = medianh;
55436+ break;
55437+ case EQUAL_TO:
55438+ do {
55439+ --median;
55440+ /* headers are ordered from right to left */
55441+ ++medianh;
55442+ } while (median >= 0 && keyeq(key, &medianh->key));
55443+ right = left = median + 1;
55444+ ih = lefth = righth = medianh - 1;
55445+ found = 1;
55446+ break;
55447+ }
55448+ }
+	/* sequential scan. Item headers, and therefore keys, are stored at
55450+ the rightmost part of a node from right to left. We are trying to
55451+ access memory from left to right, and hence, scan in _descending_
55452+ order of item numbers.
55453+ */
55454+ if (!found) {
55455+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
55456+ cmp_t comparison;
55457+
55458+ prefetchkey(&(ih + 1)->key);
55459+ comparison = keycmp(&ih->key, key);
55460+ if (comparison == GREATER_THAN)
55461+ continue;
55462+ if (comparison == EQUAL_TO) {
55463+ found = 1;
55464+ do {
55465+ --left;
55466+ ++ih;
55467+ } while (left >= 0 && keyeq(&ih->key, key));
55468+ ++left;
55469+ --ih;
55470+ } else {
55471+ assert("nikita-1256", comparison == LESS_THAN);
55472+ }
55473+ break;
55474+ }
55475+ if (unlikely(left < 0))
55476+ left = 0;
55477+ }
55478+
55479+ assert("nikita-3212", right >= left);
55480+ assert("nikita-3214",
55481+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
55482+
55483+ coord_set_item_pos(coord, left);
55484+ coord->unit_pos = 0;
55485+ coord->between = AT_UNIT;
55486+
+	/* key < leftmost key in a node, or the node is corrupted and
+	   keys are not sorted */
55489+ bstop = node40_ih_at(node, (unsigned)left);
55490+ order = keycmp(&bstop->key, key);
55491+ if (unlikely(order == GREATER_THAN)) {
55492+ if (unlikely(left != 0)) {
55493+ /* screw up */
55494+ warning("nikita-587", "Key less than %i key in a node",
55495+ left);
55496+ reiser4_print_key("key", key);
55497+ reiser4_print_key("min", &bstop->key);
55498+ print_coord_content("coord", coord);
55499+ return RETERR(-EIO);
55500+ } else {
55501+ coord->between = BEFORE_UNIT;
55502+ return NS_NOT_FOUND;
55503+ }
55504+ }
55505+ /* left <= key, ok */
55506+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
55507+
55508+ if (unlikely(iplug == NULL)) {
55509+ warning("nikita-588", "Unknown plugin %i",
55510+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
55511+ reiser4_print_key("key", key);
55512+ print_coord_content("coord", coord);
55513+ return RETERR(-EIO);
55514+ }
55515+
55516+ coord_set_iplug(coord, iplug);
55517+
55518+ /* if exact key from item header was found by binary search, no
55519+ further checks are necessary. */
55520+ if (found) {
55521+ assert("nikita-1259", order == EQUAL_TO);
55522+ return NS_FOUND;
55523+ }
55524+ if (iplug->b.max_key_inside != NULL) {
55525+ reiser4_key max_item_key;
55526+
55527+ /* key > max_item_key --- outside of an item */
55528+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
55529+ coord->unit_pos = 0;
55530+ coord->between = AFTER_ITEM;
+			/* FIXME-VS: the key we are looking for does not fit
+			   into the found item, so return NS_NOT_FOUND.
+			   Without that the following case does not work:
+			   there is an extent of file 10000, 10001. File
+			   10000, 10002 has just been created. When writing
+			   to position 0 in that file, traverse_tree would
+			   stop here on the twig level, whereas we want it to
+			   go down to the leaf level
55538+ */
55539+ return NS_NOT_FOUND;
55540+ }
55541+ }
55542+
55543+ if (iplug->b.lookup != NULL) {
55544+ return iplug->b.lookup(key, bias, coord);
55545+ } else {
55546+ assert("nikita-1260", order == LESS_THAN);
55547+ coord->between = AFTER_UNIT;
55548+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
55549+ }
55550+}
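+
+/* The hybrid strategy used by lookup_node40(), reduced to a plain sorted
+   array of ints for illustration (a sketch, not part of the original
+   patch): binary search narrows the window until it is smaller than
+   REISER4_SEQ_SEARCH_BREAK, then a tight sequential scan finishes. Note
+   that the real code scans item headers in descending item order because
+   the headers themselves are laid out right-to-left in the node. */
+#if 0
+static int hybrid_search(const int *a, int n, int key)
+{
+	int left = 0;
+	int right = n - 1;
+
+	while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
+		int median = (left + right) / 2;
+
+		if (a[median] > key)
+			right = median;
+		else if (a[median] < key)
+			left = median;
+		else
+			return median;	/* exact hit */
+	}
+	/* the window is small: a tight loop beats further bisection */
+	for (; left <= right; ++left)
+		if (a[left] >= key)
+			break;
+	return left;	/* index of first element >= key */
+}
+#endif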
55551+
55552+#undef NODE_ADDSTAT
55553+#undef NODE_INCSTAT
55554+
55555+/* plugin->u.node.estimate
55556+ look for description of this method in plugin/node/node.h */
55557+size_t estimate_node40(znode * node)
55558+{
55559+ size_t result;
55560+
55561+ assert("nikita-597", node != NULL);
55562+
55563+ result = free_space_node40(node) - sizeof(item_header40);
55564+
55565+ return (result > 0) ? result : 0;
55566+}
55567+
55568+/* plugin->u.node.check
55569+ look for description of this method in plugin/node/node.h */
55570+int check_node40(const znode * node /* node to check */ ,
55571+ __u32 flags /* check flags */ ,
55572+ const char **error /* where to store error message */ )
55573+{
55574+ int nr_items;
55575+ int i;
55576+ reiser4_key prev;
55577+ unsigned old_offset;
55578+ tree_level level;
55579+ coord_t coord;
55580+ int result;
55581+
55582+ assert("nikita-580", node != NULL);
55583+ assert("nikita-581", error != NULL);
55584+ assert("nikita-2948", znode_is_loaded(node));
55585+
55586+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
55587+ return 0;
55588+
55589+ assert("nikita-582", zdata(node) != NULL);
55590+
55591+ nr_items = node40_num_of_items_internal(node);
55592+ if (nr_items < 0) {
55593+ *error = "Negative number of items";
55594+ return -1;
55595+ }
55596+
55597+ if (flags & REISER4_NODE_DKEYS)
55598+ prev = *znode_get_ld_key((znode *) node);
55599+ else
55600+ prev = *reiser4_min_key();
55601+
55602+ old_offset = 0;
55603+ coord_init_zero(&coord);
55604+ coord.node = (znode *) node;
55605+ coord.unit_pos = 0;
55606+ coord.between = AT_UNIT;
55607+ level = znode_get_level(node);
55608+ for (i = 0; i < nr_items; i++) {
55609+ item_header40 *ih;
55610+ reiser4_key unit_key;
55611+ unsigned j;
55612+
55613+ ih = node40_ih_at(node, (unsigned)i);
55614+ coord_set_item_pos(&coord, i);
55615+ if ((ih40_get_offset(ih) >=
55616+ znode_size(node) - nr_items * sizeof(item_header40)) ||
55617+ (ih40_get_offset(ih) < sizeof(node40_header))) {
55618+ *error = "Offset is out of bounds";
55619+ return -1;
55620+ }
55621+ if (ih40_get_offset(ih) <= old_offset) {
55622+ *error = "Offsets are in wrong order";
55623+ return -1;
55624+ }
55625+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
55626+ *error = "Wrong offset of first item";
55627+ return -1;
55628+ }
55629+ old_offset = ih40_get_offset(ih);
55630+
55631+ if (keygt(&prev, &ih->key)) {
55632+ *error = "Keys are in wrong order";
55633+ return -1;
55634+ }
55635+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
55636+ *error = "Wrong key of first unit";
55637+ return -1;
55638+ }
55639+ prev = ih->key;
55640+ for (j = 0; j < coord_num_units(&coord); ++j) {
55641+ coord.unit_pos = j;
55642+ unit_key_by_coord(&coord, &unit_key);
55643+ if (keygt(&prev, &unit_key)) {
55644+ *error = "Unit keys are in wrong order";
55645+ return -1;
55646+ }
55647+ prev = unit_key;
55648+ }
55649+ coord.unit_pos = 0;
55650+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
55651+ *error = "extent on the wrong level";
55652+ return -1;
55653+ }
55654+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
55655+ *error = "internal item on the wrong level";
55656+ return -1;
55657+ }
55658+ if (level != LEAF_LEVEL &&
55659+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
55660+ *error = "wrong item on the internal level";
55661+ return -1;
55662+ }
55663+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
55664+ *error = "non-internal item on the internal level";
55665+ return -1;
55666+ }
55667+#if REISER4_DEBUG
55668+ if (item_plugin_by_coord(&coord)->b.check
55669+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
55670+ return -1;
55671+#endif
55672+ if (i) {
55673+ coord_t prev_coord;
55674+			/* two neighboring items must not be mergeable */
55675+ coord_dup(&prev_coord, &coord);
55676+ coord_prev_item(&prev_coord);
55677+ if (are_items_mergeable(&prev_coord, &coord)) {
55678+ *error = "mergeable items in one node";
55679+ return -1;
55680+ }
55681+
55682+ }
55683+ }
55684+
55685+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
55686+ coord_t coord;
55687+ item_plugin *iplug;
55688+
55689+ coord_init_last_unit(&coord, node);
55690+ iplug = item_plugin_by_coord(&coord);
55691+ if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
55692+ iplug->s.file.append_key != NULL) {
55693+ reiser4_key mkey;
55694+
55695+ iplug->s.file.append_key(&coord, &mkey);
55696+ set_key_offset(&mkey, get_key_offset(&mkey) - 1);
55697+ read_lock_dk(current_tree);
55698+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
55699+ read_unlock_dk(current_tree);
55700+ if (result) {
55701+ *error = "key of rightmost item is too large";
55702+ return -1;
55703+ }
55704+ }
55705+ }
55706+ if (flags & REISER4_NODE_DKEYS) {
55707+ read_lock_tree(current_tree);
55708+ read_lock_dk(current_tree);
55709+
55710+ flags |= REISER4_NODE_TREE_STABLE;
55711+
55712+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
55713+ if (flags & REISER4_NODE_TREE_STABLE) {
55714+ *error = "Last key is greater than rdkey";
55715+ read_unlock_dk(current_tree);
55716+ read_unlock_tree(current_tree);
55717+ return -1;
55718+ }
55719+ }
55720+ if (keygt
55721+ (znode_get_ld_key((znode *) node),
55722+ znode_get_rd_key((znode *) node))) {
55723+ *error = "ldkey is greater than rdkey";
55724+ read_unlock_dk(current_tree);
55725+ read_unlock_tree(current_tree);
55726+ return -1;
55727+ }
55728+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
55729+ (node->left != NULL) &&
55730+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
55731+ ergo(flags & REISER4_NODE_TREE_STABLE,
55732+ !keyeq(znode_get_rd_key(node->left),
55733+ znode_get_ld_key((znode *) node)))
55734+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55735+ keygt(znode_get_rd_key(node->left),
55736+ znode_get_ld_key((znode *) node)))) {
55737+ *error = "left rdkey or ldkey is wrong";
55738+ read_unlock_dk(current_tree);
55739+ read_unlock_tree(current_tree);
55740+ return -1;
55741+ }
55742+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
55743+ (node->right != NULL) &&
55744+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
55745+ ergo(flags & REISER4_NODE_TREE_STABLE,
55746+ !keyeq(znode_get_rd_key((znode *) node),
55747+ znode_get_ld_key(node->right)))
55748+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55749+ keygt(znode_get_rd_key((znode *) node),
55750+ znode_get_ld_key(node->right)))) {
55751+ *error = "rdkey or right ldkey is wrong";
55752+ read_unlock_dk(current_tree);
55753+ read_unlock_tree(current_tree);
55754+ return -1;
55755+ }
55756+
55757+ read_unlock_dk(current_tree);
55758+ read_unlock_tree(current_tree);
55759+ }
55760+
55761+ return 0;
55762+}
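/*
 * A minimal user-space sketch of the consistency checks above: item
 * offsets must stay inside the block, start right after the node header,
 * grow strictly, and keys must be sorted. struct toy_item and integer
 * keys are assumptions standing in for item_header40 and reiser4_key.
 */
#include <stdio.h>

struct toy_item {
	unsigned offset;	/* where the item body starts in the block */
	unsigned key;		/* stand-in for reiser4_key */
};

static const char *toy_check(const struct toy_item *ih, int nr,
			     unsigned hdr, unsigned block)
{
	unsigned prev_off = 0, prev_key = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (ih[i].offset < hdr || ih[i].offset >= block)
			return "Offset is out of bounds";
		if (i == 0 && ih[i].offset != hdr)
			return "Wrong offset of first item";
		if (i > 0 && ih[i].offset <= prev_off)
			return "Offsets are in wrong order";
		if (ih[i].key < prev_key)
			return "Keys are in wrong order";
		prev_off = ih[i].offset;
		prev_key = ih[i].key;
	}
	return NULL;
}

int main(void)
{
	struct toy_item ih[] = { { 24, 10 }, { 124, 10 }, { 180, 25 } };
	const char *err = toy_check(ih, 3, 24, 4096);

	printf("%s\n", err ? err : "node is consistent");
	return 0;
}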
55763+
55764+/* plugin->u.node.parse
55765+ look for description of this method in plugin/node/node.h */
55766+int parse_node40(znode * node /* node to parse */ )
55767+{
55768+ node40_header *header;
55769+ int result;
55770+ d8 level;
55771+
55772+ header = node40_node_header((znode *) node);
55773+ result = -EIO;
55774+ level = nh40_get_level(header);
55775+ if (unlikely(((__u8) znode_get_level(node)) != level))
55776+ warning("nikita-494", "Wrong level found in node: %i != %i",
55777+ znode_get_level(node), level);
55778+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
55779+ warning("nikita-495",
55780+ "Wrong magic in tree node: want %x, got %x",
55781+ REISER4_NODE_MAGIC, nh40_get_magic(header));
55782+ else {
55783+ node->nr_items = node40_num_of_items_internal(node);
55784+ result = 0;
55785+ }
55786+ return RETERR(result);
55787+}
55788+
55789+/* plugin->u.node.init
55790+ look for description of this method in plugin/node/node.h */
55791+int init_node40(znode * node /* node to initialise */ )
55792+{
55793+ node40_header *header;
55794+
55795+ assert("nikita-570", node != NULL);
55796+ assert("nikita-572", zdata(node) != NULL);
55797+
55798+ header = node40_node_header(node);
55799+ memset(header, 0, sizeof(node40_header));
55800+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
55801+ nh40_set_free_space_start(header, sizeof(node40_header));
55802+ /* sane hypothesis: 0 in CPU format is 0 in disk format */
55803+ /* items: 0 */
55804+ save_plugin_id(node_plugin_to_plugin(node->nplug),
55805+ &header->common_header.plugin_id);
55806+ nh40_set_level(header, znode_get_level(node));
55807+ nh40_set_magic(header, REISER4_NODE_MAGIC);
55808+ node->nr_items = 0;
55809+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
55810+
55811+ /* flags: 0 */
55812+ return 0;
55813+}
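/*
 * The free-space accounting that init_node40 establishes, sketched with
 * assumed sizes (24-byte header, 4096-byte block): everything past the
 * node header is free, and the free area starts immediately after it.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned block_size = 4096;	/* assumed znode_size() */
	unsigned header_size = 24;	/* assumed sizeof(node40_header) */
	unsigned free_space = block_size - header_size;
	unsigned free_space_start = header_size;

	assert(free_space_start + free_space == block_size);
	printf("free=%u starts at %u, 0 items\n", free_space,
	       free_space_start);
	return 0;
}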
55814+
55815+#ifdef GUESS_EXISTS
55816+int guess_node40(const znode * node /* node to guess plugin of */ )
55817+{
55818+ node40_header *nethack;
55819+
55820+ assert("nikita-1058", node != NULL);
55821+ nethack = node40_node_header(node);
55822+ return
55823+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
55824+ (plugin_by_disk_id(znode_get_tree(node),
55825+ REISER4_NODE_PLUGIN_TYPE,
55826+ &nethack->common_header.plugin_id)->h.id ==
55827+ NODE40_ID);
55828+}
55829+#endif
55830+
55831+/* plugin->u.node.change_item_size
55832+ look for description of this method in plugin/node/node.h */
55833+void change_item_size_node40(coord_t * coord, int by)
55834+{
55835+ node40_header *nh;
55836+ item_header40 *ih;
55837+ char *item_data;
55838+ int item_length;
55839+ unsigned i;
55840+
55841+ /* make sure that @item is coord of existing item */
55842+ assert("vs-210", coord_is_existing_item(coord));
55843+
55844+ nh = node40_node_header(coord->node);
55845+
55846+ item_data = item_by_coord_node40(coord);
55847+ item_length = length_by_coord_node40(coord);
55848+
55849+ /* move item bodies */
55850+ ih = node40_ih_at_coord(coord);
55851+ memmove(item_data + item_length + by, item_data + item_length,
55852+ nh40_get_free_space_start(node40_node_header(coord->node)) -
55853+ (ih40_get_offset(ih) + item_length));
55854+
55855+ /* update offsets of moved items */
55856+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
55857+ ih = node40_ih_at(coord->node, i);
55858+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
55859+ }
55860+
55861+ /* update node header */
55862+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
55863+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
55864+}
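/*
 * change_item_size_node40 in miniature: growing an element in the middle
 * of a packed buffer by @by bytes shifts everything behind it with one
 * memmove and bumps the recorded offsets of the moved elements. Buffer
 * contents and sizes are assumptions for illustration.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[32] = "AAABBBCCC";	/* three packed 3-byte items */
	int off[] = { 0, 3, 6 };	/* body offsets, like ih40 offsets */
	int used = 9, item = 1, len = 3, by = 2, i;

	/* open a @by-byte gap right after item 1 */
	memmove(buf + off[item] + len + by, buf + off[item] + len,
		used - (off[item] + len));
	memset(buf + off[item] + len, 'b', by);	/* item 1 grew by 2 bytes */

	for (i = item + 1; i < 3; i++)	/* update offsets of moved items */
		off[i] += by;
	used += by;

	printf("%.*s off[2]=%d\n", used, buf, off[2]);	/* AAABBBbbCCC off[2]=8 */
	return 0;
}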
55865+
55866+static int should_notify_parent(const znode * node)
55867+{
55868+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
55869+ return !disk_addr_eq(znode_get_block(node),
55870+ &znode_get_tree(node)->root_block);
55871+}
55872+
55873+/* plugin->u.node.create_item
55874+ look for description of this method in plugin/node/node.h */
55875+int
55876+create_item_node40(coord_t *target, const reiser4_key *key,
55877+ reiser4_item_data *data, carry_plugin_info *info)
55878+{
55879+ node40_header *nh;
55880+ item_header40 *ih;
55881+ unsigned offset;
55882+ unsigned i;
55883+
55884+ nh = node40_node_header(target->node);
55885+
55886+ assert("vs-212", coord_is_between_items(target));
55887+ /* node must have enough free space */
55888+ assert("vs-254",
55889+ free_space_node40(target->node) >=
55890+ data->length + sizeof(item_header40));
55891+ assert("vs-1410", data->length >= 0);
55892+
55893+ if (coord_set_to_right(target))
55894+		/* there are no items to the right of @target, so the new
55895+		   item will be inserted after the last one */
55896+ coord_set_item_pos(target, nh40_get_num_items(nh));
55897+
55898+ if (target->item_pos < nh40_get_num_items(nh)) {
55899+ /* there are items to be moved to prepare space for new
55900+ item */
55901+ ih = node40_ih_at_coord(target);
55902+ /* new item will start at this offset */
55903+ offset = ih40_get_offset(ih);
55904+
55905+ memmove(zdata(target->node) + offset + data->length,
55906+ zdata(target->node) + offset,
55907+ nh40_get_free_space_start(nh) - offset);
55908+ /* update headers of moved items */
55909+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
55910+ ih = node40_ih_at(target->node, i);
55911+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
55912+ }
55913+
55914+ /* @ih is set to item header of the last item, move item headers */
55915+ memmove(ih - 1, ih,
55916+ sizeof(item_header40) * (nh40_get_num_items(nh) -
55917+ target->item_pos));
55918+ } else {
55919+ /* new item will start at this offset */
55920+ offset = nh40_get_free_space_start(nh);
55921+ }
55922+
55923+ /* make item header for the new item */
55924+ ih = node40_ih_at_coord(target);
55925+ memcpy(&ih->key, key, sizeof(reiser4_key));
55926+ ih40_set_offset(ih, offset);
55927+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
55928+
55929+ /* update node header */
55930+ nh40_set_free_space(nh,
55931+ nh40_get_free_space(nh) - data->length -
55932+ sizeof(item_header40));
55933+ nh40_set_free_space_start(nh,
55934+ nh40_get_free_space_start(nh) + data->length);
55935+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
55936+
55937+	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
55938+ target->unit_pos = 0;
55939+ target->between = AT_UNIT;
55940+ coord_clear_iplug(target);
55941+
55942+ /* initialize item */
55943+ if (data->iplug->b.init != NULL) {
55944+ data->iplug->b.init(target, NULL, data);
55945+ }
55946+ /* copy item body */
55947+ if (data->iplug->b.paste != NULL) {
55948+ data->iplug->b.paste(target, data, info);
55949+ } else if (data->data != NULL) {
55950+ if (data->user) {
55951+			/* AUDIT: shouldn't we check that the pointer from
55952+			   userspace is valid and that the data bytes are
55953+			   available? How else can we return -EFAULT of some
55954+			   kind without such a check? */
55955+ assert("nikita-3038", reiser4_schedulable());
55956+ /* copy data from user space */
55957+ __copy_from_user(zdata(target->node) + offset,
55958+ (const char __user *)data->data,
55959+ (unsigned)data->length);
55960+ } else
55961+ /* copy from kernel space */
55962+ memcpy(zdata(target->node) + offset, data->data,
55963+ (unsigned)data->length);
55964+ }
55965+
55966+ if (target->item_pos == 0) {
55967+ /* left delimiting key has to be updated */
55968+ prepare_for_update(NULL, target->node, info);
55969+ }
55970+
55971+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
55972+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
55973+ }
55974+
55975+ return 0;
55976+}
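/*
 * The layout create_item_node40 maintains, sketched with assumed sizes:
 * item bodies grow up from just past the node header while item headers
 * grow down from the end of the block, so each insertion costs the body
 * length plus one header out of the free area in the middle.
 */
#include <stdio.h>

int main(void)
{
	unsigned block = 4096, nh = 24, ih = 32;	/* assumed sizes */
	unsigned free_start = nh, free_space = block - nh;
	unsigned lengths[] = { 100, 60, 200 };
	unsigned i;

	for (i = 0; i < 3; i++) {
		if (free_space < lengths[i] + ih) {
			printf("no room for item %u\n", i);
			break;
		}
		free_start += lengths[i];	/* body appended at the front */
		free_space -= lengths[i] + ih;	/* header taken from the back */
		printf("after item %u: free area [%u..%u)\n",
		       i, free_start, block - (i + 1) * ih);
	}
	printf("free space left: %u\n", free_space);
	return 0;
}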
55977+
55978+/* plugin->u.node.update_item_key
55979+ look for description of this method in plugin/node/node.h */
55980+void
55981+update_item_key_node40(coord_t * target, const reiser4_key * key,
55982+ carry_plugin_info * info)
55983+{
55984+ item_header40 *ih;
55985+
55986+ ih = node40_ih_at_coord(target);
55987+ memcpy(&ih->key, key, sizeof(reiser4_key));
55988+
55989+ if (target->item_pos == 0) {
55990+ prepare_for_update(NULL, target->node, info);
55991+ }
55992+}
55993+
55994+/* these bits encode the cut mode */
55995+#define CMODE_TAIL 1
55996+#define CMODE_WHOLE 2
55997+#define CMODE_HEAD 4
55998+
55999+struct cut40_info {
56000+ int mode;
56001+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
56002+	pos_in_node_t first_removed;	/* position of the leftmost item among items removed completely */
56003+ pos_in_node_t removed_count; /* number of items removed completely */
56004+ pos_in_node_t head_removed; /* position of item which gets head removed */
56005+
56006+ pos_in_node_t freed_space_start;
56007+ pos_in_node_t freed_space_end;
56008+ pos_in_node_t first_moved;
56009+ pos_in_node_t head_removed_location;
56010+};
56011+
56012+static void init_cinfo(struct cut40_info *cinfo)
56013+{
56014+ cinfo->mode = 0;
56015+ cinfo->tail_removed = MAX_POS_IN_NODE;
56016+ cinfo->first_removed = MAX_POS_IN_NODE;
56017+ cinfo->removed_count = MAX_POS_IN_NODE;
56018+ cinfo->head_removed = MAX_POS_IN_NODE;
56019+ cinfo->freed_space_start = MAX_POS_IN_NODE;
56020+ cinfo->freed_space_end = MAX_POS_IN_NODE;
56021+ cinfo->first_moved = MAX_POS_IN_NODE;
56022+ cinfo->head_removed_location = MAX_POS_IN_NODE;
56023+}
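/*
 * The cut-mode bitmask used below, decoded in user space. The macro
 * values mirror the CMODE_* definitions above; the combinations printed
 * are exactly the cases prepare_for_compact switches on.
 */
#include <stdio.h>

#define CMODE_TAIL  1	/* one item loses its tail */
#define CMODE_WHOLE 2	/* one or more items are removed completely */
#define CMODE_HEAD  4	/* one item loses its head */

static void decode(int mode)
{
	printf("mode %d:%s%s%s\n", mode,
	       (mode & CMODE_TAIL) ? " tail-cut" : "",
	       (mode & CMODE_WHOLE) ? " whole-items" : "",
	       (mode & CMODE_HEAD) ? " head-cut" : "");
}

int main(void)
{
	/* every combination that parse_cut can produce */
	decode(CMODE_TAIL);
	decode(CMODE_WHOLE);
	decode(CMODE_HEAD);
	decode(CMODE_TAIL | CMODE_WHOLE);
	decode(CMODE_WHOLE | CMODE_HEAD);
	return 0;
}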
56024+
56025+/* complete cut_node40/kill_node40: remove the gap created by the removal of items */
56026+static void compact(znode * node, struct cut40_info *cinfo)
56027+{
56028+ node40_header *nh;
56029+ item_header40 *ih;
56030+ pos_in_node_t freed;
56031+ pos_in_node_t pos, nr_items;
56032+
56033+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56034+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
56035+ cinfo->first_moved != MAX_POS_IN_NODE));
56036+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56037+
56038+ nh = node40_node_header(node);
56039+ nr_items = nh40_get_num_items(nh);
56040+
56041+	/* remove the gap left by the removal */
56042+ memmove(zdata(node) + cinfo->freed_space_start,
56043+ zdata(node) + cinfo->freed_space_end,
56044+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
56045+
56046+ /* update item headers of moved items - change their locations */
56047+ pos = cinfo->first_moved;
56048+ ih = node40_ih_at(node, pos);
56049+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
56050+ assert("vs-1580", pos == cinfo->head_removed);
56051+ ih40_set_offset(ih, cinfo->head_removed_location);
56052+ pos++;
56053+ ih--;
56054+ }
56055+
56056+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
56057+ for (; pos < nr_items; pos++, ih--) {
56058+ assert("vs-1581", ih == node40_ih_at(node, pos));
56059+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
56060+ }
56061+
56062+	/* free space start moved to the left */
56063+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
56064+
56065+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
56066+ /* number of items changed. Remove item headers of those items */
56067+ ih = node40_ih_at(node, nr_items - 1);
56068+ memmove(ih + cinfo->removed_count, ih,
56069+ sizeof(item_header40) * (nr_items -
56070+ cinfo->removed_count -
56071+ cinfo->first_removed));
56072+ freed += sizeof(item_header40) * cinfo->removed_count;
56073+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
56074+ }
56075+
56076+ /* total amount of free space increased */
56077+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
56078+}
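/*
 * compact() in miniature: close the freed gap with a single memmove,
 * then subtract the gap size from the offset of every element that
 * moved. Buffer contents and offsets are assumptions for illustration.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16] = "AAA....BBB";	/* bytes 3..6 were freed by a cut */
	int off[] = { 0, 7 };		/* offsets of the surviving items */
	int gap_start = 3, gap_end = 7, used = 10;
	int freed = gap_end - gap_start;

	memmove(buf + gap_start, buf + gap_end, used - gap_end);
	used -= freed;
	off[1] -= freed;	/* the moved item is now @freed bytes lower */

	printf("%.*s off[1]=%d\n", used, buf, off[1]);	/* AAABBB off[1]=3 */
	return 0;
}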
56079+
56080+int shrink_item_node40(coord_t * coord, int delta)
56081+{
56082+ node40_header *nh;
56083+ item_header40 *ih;
56084+ pos_in_node_t pos;
56085+ pos_in_node_t nr_items;
56086+ char *end;
56087+ znode *node;
56088+ int off;
56089+
56090+ assert("nikita-3487", coord != NULL);
56091+ assert("nikita-3488", delta >= 0);
56092+
56093+ node = coord->node;
56094+ nh = node40_node_header(node);
56095+ nr_items = nh40_get_num_items(nh);
56096+
56097+ ih = node40_ih_at_coord(coord);
56098+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
56099+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
56100+ end = zdata(node) + off;
56101+
56102+	/* remove the gap left by the removal */
56103+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
56104+
56105+ /* update item headers of moved items - change their locations */
56106+ pos = coord->item_pos + 1;
56107+ ih = node40_ih_at(node, pos);
56108+ for (; pos < nr_items; pos++, ih--) {
56109+ assert("nikita-3490", ih == node40_ih_at(node, pos));
56110+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
56111+ }
56112+
56113+ /* free space start moved to left */
56114+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
56115+ /* total amount of free space increased */
56116+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
56117+ /*
56118+	 * This method does _not_ change the number of items. Hence, it
56119+	 * cannot make the node empty. It also does not remove items at
56120+	 * all, which means that no keys have to be updated either.
56121+ */
56122+ return 0;
56123+}
56124+
56125+/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There are
56126+   two types of cut. The first is when a unit is removed from the middle of an item; in this case this function returns
56127+   1. Everything else falls into the second case: 0 or 1 items getting their tail cut, 0 or more items removed
56128+   completely, and 0 or 1 item getting its head cut. The function returns 0 in this case */
56129+static int
56130+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
56131+{
56132+ reiser4_key left_key, right_key;
56133+ reiser4_key min_from_key, max_to_key;
56134+ const reiser4_key *from_key, *to_key;
56135+
56136+ init_cinfo(cinfo);
56137+
56138+ /* calculate minimal key stored in first item of items to be cut (params->from) */
56139+ item_key_by_coord(params->from, &min_from_key);
56140+ /* and max key stored in last item of items to be cut (params->to) */
56141+ max_item_key_by_coord(params->to, &max_to_key);
56142+
56143+ /* if cut key range is not defined in input parameters - define it using cut coord range */
56144+ if (params->from_key == NULL) {
56145+ assert("vs-1513", params->to_key == NULL);
56146+ unit_key_by_coord(params->from, &left_key);
56147+ from_key = &left_key;
56148+ max_unit_key_by_coord(params->to, &right_key);
56149+ to_key = &right_key;
56150+ } else {
56151+ from_key = params->from_key;
56152+ to_key = params->to_key;
56153+ }
56154+
56155+ if (params->from->item_pos == params->to->item_pos) {
56156+ if (keylt(&min_from_key, from_key)
56157+ && keylt(to_key, &max_to_key))
56158+ return 1;
56159+
56160+ if (keygt(from_key, &min_from_key)) {
56161+			/* tail of item is to be cut */
56162+ cinfo->tail_removed = params->from->item_pos;
56163+ cinfo->mode |= CMODE_TAIL;
56164+ } else if (keylt(to_key, &max_to_key)) {
56165+ /* head of item is to be cut */
56166+ cinfo->head_removed = params->from->item_pos;
56167+ cinfo->mode |= CMODE_HEAD;
56168+ } else {
56169+ /* item is removed completely */
56170+ cinfo->first_removed = params->from->item_pos;
56171+ cinfo->removed_count = 1;
56172+ cinfo->mode |= CMODE_WHOLE;
56173+ }
56174+ } else {
56175+ cinfo->first_removed = params->from->item_pos + 1;
56176+ cinfo->removed_count =
56177+ params->to->item_pos - params->from->item_pos - 1;
56178+
56179+ if (keygt(from_key, &min_from_key)) {
56180+ /* first item is not cut completely */
56181+ cinfo->tail_removed = params->from->item_pos;
56182+ cinfo->mode |= CMODE_TAIL;
56183+ } else {
56184+ cinfo->first_removed--;
56185+ cinfo->removed_count++;
56186+ }
56187+ if (keylt(to_key, &max_to_key)) {
56188+ /* last item is not cut completely */
56189+ cinfo->head_removed = params->to->item_pos;
56190+ cinfo->mode |= CMODE_HEAD;
56191+ } else {
56192+ cinfo->removed_count++;
56193+ }
56194+ if (cinfo->removed_count)
56195+ cinfo->mode |= CMODE_WHOLE;
56196+ }
56197+
56198+ return 0;
56199+}
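/*
 * parse_cut's single-item classification, sketched with integer keys: a
 * cut range [from, to] over one item either stays strictly inside it
 * (the "middle cut" that returns 1), or removes the tail, the head, or
 * the whole item. The key values are assumptions for illustration.
 */
#include <stdio.h>

static const char *classify(int min_key, int max_key, int from, int to)
{
	if (min_key < from && to < max_key)
		return "middle cut (returns 1)";
	if (from > min_key)
		return "tail cut";
	if (to < max_key)
		return "head cut";
	return "whole item removed";
}

int main(void)
{
	/* the item covers keys [10..20] */
	printf("%s\n", classify(10, 20, 12, 18));	/* middle cut */
	printf("%s\n", classify(10, 20, 15, 20));	/* tail cut */
	printf("%s\n", classify(10, 20, 10, 17));	/* head cut */
	printf("%s\n", classify(10, 20, 10, 20));	/* whole item */
	return 0;
}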
56200+
56201+static void
56202+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
56203+ carry_kill_data * kdata)
56204+{
56205+ coord_t coord;
56206+ item_plugin *iplug;
56207+ pos_in_node_t pos;
56208+
56209+ coord.node = node;
56210+ coord.unit_pos = 0;
56211+ coord.between = AT_UNIT;
56212+ for (pos = 0; pos < count; pos++) {
56213+ coord_set_item_pos(&coord, from + pos);
56214+ coord.unit_pos = 0;
56215+ coord.between = AT_UNIT;
56216+ iplug = item_plugin_by_coord(&coord);
56217+ if (iplug->b.kill_hook) {
56218+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
56219+ kdata);
56220+ }
56221+ }
56222+}
56223+
56224+/* this is used to kill item partially */
56225+static pos_in_node_t
56226+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56227+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
56228+{
56229+ struct carry_kill_data *kdata;
56230+ item_plugin *iplug;
56231+
56232+ kdata = data;
56233+ iplug = item_plugin_by_coord(coord);
56234+
56235+ assert("vs-1524", iplug->b.kill_units);
56236+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
56237+ new_first_key);
56238+}
56239+
56240+/* call item plugin to cut tail of file */
56241+static pos_in_node_t
56242+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56243+{
56244+ struct carry_kill_data *kdata;
56245+ pos_in_node_t to;
56246+
56247+ kdata = data;
56248+ to = coord_last_unit_pos(coord);
56249+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
56250+ NULL);
56251+}
56252+
56253+/* call item plugin to cut head of item */
56254+static pos_in_node_t
56255+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56256+ reiser4_key * new_first_key)
56257+{
56258+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
56259+ new_first_key);
56260+}
56261+
56262+/* this is used to cut item partially */
56263+static pos_in_node_t
56264+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56265+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
56266+{
56267+ carry_cut_data *cdata;
56268+ item_plugin *iplug;
56269+
56270+ cdata = data;
56271+ iplug = item_plugin_by_coord(coord);
56272+ assert("vs-302", iplug->b.cut_units);
56273+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
56274+ new_first_key);
56275+}
56276+
56277+/* call item plugin to cut tail of file */
56278+static pos_in_node_t
56279+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56280+{
56281+ carry_cut_data *cdata;
56282+ pos_in_node_t to;
56283+
56284+ cdata = data;
56285+ to = coord_last_unit_pos(cdata->params.from);
56286+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
56287+}
56288+
56289+/* call item plugin to cut head of item */
56290+static pos_in_node_t
56291+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56292+ reiser4_key * new_first_key)
56293+{
56294+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
56295+ new_first_key);
56296+}
56297+
56298+/* this returns 1 if the key of the first item changed, 0 if it did not */
56299+static int
56300+prepare_for_compact(struct cut40_info *cinfo,
56301+ const struct cut_kill_params *params, int is_cut,
56302+ void *data, carry_plugin_info * info)
56303+{
56304+ znode *node;
56305+ item_header40 *ih;
56306+ pos_in_node_t freed;
56307+ pos_in_node_t item_pos;
56308+ coord_t coord;
56309+ reiser4_key new_first_key;
56310+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
56311+ void *, reiser4_key *, reiser4_key *);
56312+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
56313+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
56314+ reiser4_key *);
56315+ int retval;
56316+
56317+ retval = 0;
56318+
56319+ node = params->from->node;
56320+
56321+ assert("vs-184", node == params->to->node);
56322+ assert("vs-312", !node_is_empty(node));
56323+ assert("vs-297",
56324+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
56325+
56326+ if (is_cut) {
56327+ kill_units_f = cut_units;
56328+ kill_tail_f = cut_tail;
56329+ kill_head_f = cut_head;
56330+ } else {
56331+ kill_units_f = kill_units;
56332+ kill_tail_f = kill_tail;
56333+ kill_head_f = kill_head;
56334+ }
56335+
56336+ if (parse_cut(cinfo, params) == 1) {
56337+ /* cut from the middle of item */
56338+ freed =
56339+ kill_units_f(params->from, params->from->unit_pos,
56340+ params->to->unit_pos, data,
56341+ params->smallest_removed, NULL);
56342+
56343+ item_pos = params->from->item_pos;
56344+ ih = node40_ih_at(node, item_pos);
56345+ cinfo->freed_space_start =
56346+ ih40_get_offset(ih) + node40_item_length(node,
56347+ item_pos) - freed;
56348+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
56349+ cinfo->first_moved = item_pos + 1;
56350+ } else {
56351+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
56352+ cinfo->first_removed != MAX_POS_IN_NODE ||
56353+ cinfo->head_removed != MAX_POS_IN_NODE));
56354+
56355+ switch (cinfo->mode) {
56356+ case CMODE_TAIL:
56357+ /* one item gets cut partially from its end */
56358+ assert("vs-1562",
56359+ cinfo->tail_removed == params->from->item_pos);
56360+
56361+ freed =
56362+ kill_tail_f(params->from, data,
56363+ params->smallest_removed);
56364+
56365+ item_pos = cinfo->tail_removed;
56366+ ih = node40_ih_at(node, item_pos);
56367+ cinfo->freed_space_start =
56368+ ih40_get_offset(ih) + node40_item_length(node,
56369+ item_pos) -
56370+ freed;
56371+ cinfo->freed_space_end =
56372+ cinfo->freed_space_start + freed;
56373+ cinfo->first_moved = cinfo->tail_removed + 1;
56374+ break;
56375+
56376+ case CMODE_WHOLE:
56377+ /* one or more items get removed completely */
56378+ assert("vs-1563",
56379+ cinfo->first_removed == params->from->item_pos);
56380+ assert("vs-1564", cinfo->removed_count > 0
56381+ && cinfo->removed_count != MAX_POS_IN_NODE);
56382+
56383+ /* call kill hook for all items removed completely */
56384+ if (is_cut == 0)
56385+ call_kill_hooks(node, cinfo->first_removed,
56386+ cinfo->removed_count, data);
56387+
56388+ item_pos = cinfo->first_removed;
56389+ ih = node40_ih_at(node, item_pos);
56390+
56391+ if (params->smallest_removed)
56392+ memcpy(params->smallest_removed, &ih->key,
56393+ sizeof(reiser4_key));
56394+
56395+ cinfo->freed_space_start = ih40_get_offset(ih);
56396+
56397+ item_pos += (cinfo->removed_count - 1);
56398+ ih -= (cinfo->removed_count - 1);
56399+ cinfo->freed_space_end =
56400+ ih40_get_offset(ih) + node40_item_length(node,
56401+ item_pos);
56402+ cinfo->first_moved = item_pos + 1;
56403+ if (cinfo->first_removed == 0)
56404+ /* key of first item of the node changes */
56405+ retval = 1;
56406+ break;
56407+
56408+ case CMODE_HEAD:
56409+ /* one item gets cut partially from its head */
56410+ assert("vs-1565",
56411+ cinfo->head_removed == params->from->item_pos);
56412+
56413+ freed =
56414+ kill_head_f(params->to, data,
56415+ params->smallest_removed,
56416+ &new_first_key);
56417+
56418+ item_pos = cinfo->head_removed;
56419+ ih = node40_ih_at(node, item_pos);
56420+ cinfo->freed_space_start = ih40_get_offset(ih);
56421+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56422+ cinfo->first_moved = cinfo->head_removed + 1;
56423+
56424+ /* item head is removed, therefore, item key changed */
56425+ coord.node = node;
56426+ coord_set_item_pos(&coord, item_pos);
56427+ coord.unit_pos = 0;
56428+ coord.between = AT_UNIT;
56429+ update_item_key_node40(&coord, &new_first_key, NULL);
56430+ if (item_pos == 0)
56431+ /* key of first item of the node changes */
56432+ retval = 1;
56433+ break;
56434+
56435+ case CMODE_TAIL | CMODE_WHOLE:
56436+ /* one item gets cut from its end and one or more items get removed completely */
56437+ assert("vs-1566",
56438+ cinfo->tail_removed == params->from->item_pos);
56439+ assert("vs-1567",
56440+ cinfo->first_removed == cinfo->tail_removed + 1);
56441+ assert("vs-1564", cinfo->removed_count > 0
56442+ && cinfo->removed_count != MAX_POS_IN_NODE);
56443+
56444+ freed =
56445+ kill_tail_f(params->from, data,
56446+ params->smallest_removed);
56447+
56448+ item_pos = cinfo->tail_removed;
56449+ ih = node40_ih_at(node, item_pos);
56450+ cinfo->freed_space_start =
56451+ ih40_get_offset(ih) + node40_item_length(node,
56452+ item_pos) -
56453+ freed;
56454+
56455+ /* call kill hook for all items removed completely */
56456+ if (is_cut == 0)
56457+ call_kill_hooks(node, cinfo->first_removed,
56458+ cinfo->removed_count, data);
56459+
56460+ item_pos += cinfo->removed_count;
56461+ ih -= cinfo->removed_count;
56462+ cinfo->freed_space_end =
56463+ ih40_get_offset(ih) + node40_item_length(node,
56464+ item_pos);
56465+ cinfo->first_moved = item_pos + 1;
56466+ break;
56467+
56468+ case CMODE_WHOLE | CMODE_HEAD:
56469+ /* one or more items get removed completely and one item gets cut partially from its head */
56470+ assert("vs-1568",
56471+ cinfo->first_removed == params->from->item_pos);
56472+ assert("vs-1564", cinfo->removed_count > 0
56473+ && cinfo->removed_count != MAX_POS_IN_NODE);
56474+ assert("vs-1569",
56475+ cinfo->head_removed ==
56476+ cinfo->first_removed + cinfo->removed_count);
56477+
56478+ /* call kill hook for all items removed completely */
56479+ if (is_cut == 0)
56480+ call_kill_hooks(node, cinfo->first_removed,
56481+ cinfo->removed_count, data);
56482+
56483+ item_pos = cinfo->first_removed;
56484+ ih = node40_ih_at(node, item_pos);
56485+
56486+ if (params->smallest_removed)
56487+ memcpy(params->smallest_removed, &ih->key,
56488+ sizeof(reiser4_key));
56489+
56490+ freed =
56491+ kill_head_f(params->to, data, NULL, &new_first_key);
56492+
56493+ cinfo->freed_space_start = ih40_get_offset(ih);
56494+
56495+ ih = node40_ih_at(node, cinfo->head_removed);
56496+			/* this is the most complex case. The item which got its head removed and the items to be
56497+			   moved intact change their locations differently. */
56498+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56499+ cinfo->first_moved = cinfo->head_removed;
56500+ cinfo->head_removed_location = cinfo->freed_space_start;
56501+
56502+ /* item head is removed, therefore, item key changed */
56503+ coord.node = node;
56504+ coord_set_item_pos(&coord, cinfo->head_removed);
56505+ coord.unit_pos = 0;
56506+ coord.between = AT_UNIT;
56507+ update_item_key_node40(&coord, &new_first_key, NULL);
56508+
56509+ assert("vs-1579", cinfo->first_removed == 0);
56510+ /* key of first item of the node changes */
56511+ retval = 1;
56512+ break;
56513+
56514+ case CMODE_TAIL | CMODE_HEAD:
56515+			/* one item gets its tail cut and its neighbor gets its head cut */
56516+ impossible("vs-1576", "this can not happen currently");
56517+ break;
56518+
56519+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
56520+ impossible("vs-1577", "this can not happen currently");
56521+ break;
56522+ default:
56523+ impossible("vs-1578", "unexpected cut mode");
56524+ break;
56525+ }
56526+ }
56527+ return retval;
56528+}
56529+
56530+/* plugin->u.node.kill
56531+ return value is number of items removed completely */
56532+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
56533+{
56534+ znode *node;
56535+ struct cut40_info cinfo;
56536+ int first_key_changed;
56537+
56538+ node = kdata->params.from->node;
56539+
56540+ first_key_changed =
56541+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
56542+ info);
56543+ compact(node, &cinfo);
56544+
56545+ if (info) {
56546+ /* it is not called by node40_shift, so we have to take care
56547+ of changes on upper levels */
56548+ if (node_is_empty(node)
56549+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
56550+			/* all contents of the node are deleted */
56551+ prepare_removal_node40(node, info);
56552+ else if (first_key_changed) {
56553+ prepare_for_update(NULL, node, info);
56554+ }
56555+ }
56556+
56557+ coord_clear_iplug(kdata->params.from);
56558+ coord_clear_iplug(kdata->params.to);
56559+
56560+ znode_make_dirty(node);
56561+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56562+}
56563+
56564+/* plugin->u.node.cut
56565+ return value is number of items removed completely */
56566+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
56567+{
56568+ znode *node;
56569+ struct cut40_info cinfo;
56570+ int first_key_changed;
56571+
56572+ node = cdata->params.from->node;
56573+
56574+ first_key_changed =
56575+	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
56576+ info);
56577+ compact(node, &cinfo);
56578+
56579+ if (info) {
56580+ /* it is not called by node40_shift, so we have to take care
56581+ of changes on upper levels */
56582+ if (node_is_empty(node))
56583+			/* all contents of the node are deleted */
56584+ prepare_removal_node40(node, info);
56585+ else if (first_key_changed) {
56586+ prepare_for_update(NULL, node, info);
56587+ }
56588+ }
56589+
56590+ coord_clear_iplug(cdata->params.from);
56591+ coord_clear_iplug(cdata->params.to);
56592+
56593+ znode_make_dirty(node);
56594+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56595+}
56596+
56597+/* this structure is used by shift method of node40 plugin */
56598+struct shift_params {
56599+ shift_direction pend; /* when @pend == append - we are shifting to
56600+ left, when @pend == prepend - to right */
56601+ coord_t wish_stop; /* when shifting to left this is last unit we
56602+ want shifted, when shifting to right - this
56603+ is set to unit we want to start shifting
56604+ from */
56605+ znode *target;
56606+ int everything; /* it is set to 1 if everything we have to shift is
56607+ shifted, 0 - otherwise */
56608+
56609+ /* FIXME-VS: get rid of read_stop */
56610+
56611+ /* these are set by estimate_shift */
56612+ coord_t real_stop; /* this will be set to last unit which will be
56613+ really shifted */
56614+
56615+ /* coordinate in source node before operation of unit which becomes
56616+ first after shift to left of last after shift to right */
56617+ union {
56618+ coord_t future_first;
56619+ coord_t future_last;
56620+ } u;
56621+
56622+ unsigned merging_units; /* number of units of first item which have to
56623+ be merged with last item of target node */
56624+ unsigned merging_bytes; /* number of bytes in those units */
56625+
56626+ unsigned entire; /* items shifted in their entirety */
56627+ unsigned entire_bytes; /* number of bytes in those items */
56628+
56629+ unsigned part_units; /* number of units of partially copied item */
56630+ unsigned part_bytes; /* number of bytes in those units */
56631+
56632+ unsigned shift_bytes; /* total number of bytes in items shifted (item
56633+ headers not included) */
56634+
56635+};
56636+
56637+static int item_creation_overhead(coord_t *item)
56638+{
56639+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
56640+}
56641+
56642+/* how many units are there in @source starting from source->unit_pos
56643+ but not further than @stop_coord */
56644+static int
56645+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
56646+{
56647+ if (pend == SHIFT_LEFT) {
56648+ assert("vs-181", source->unit_pos == 0);
56649+ } else {
56650+ assert("vs-182",
56651+ source->unit_pos == coord_last_unit_pos(source));
56652+ }
56653+
56654+ if (source->item_pos != stop_coord->item_pos) {
56655+ /* @source and @stop_coord are different items */
56656+ return coord_last_unit_pos(source) + 1;
56657+ }
56658+
56659+ if (pend == SHIFT_LEFT) {
56660+ return stop_coord->unit_pos + 1;
56661+ } else {
56662+ return source->unit_pos - stop_coord->unit_pos + 1;
56663+ }
56664+}
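/*
 * The wanted_units arithmetic for the single-item case, with an assumed
 * 10-unit item: a left shift counts units 0..stop, a right shift counts
 * units stop..last.
 */
#include <stdio.h>

int main(void)
{
	int last_unit = 9;	/* the item has units 0..9 */
	int stop = 3;		/* stop_coord->unit_pos */

	printf("shift left:  %d units\n", stop + 1);		 /* 4 */
	printf("shift right: %d units\n", last_unit - stop + 1); /* 7 */
	return 0;
}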
56665+
56666+/* this calculates what can be copied from @shift->wish_stop.node to
56667+ @shift->target */
56668+static void
56669+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
56670+{
56671+ unsigned target_free_space, size;
56672+ pos_in_node_t stop_item; /* item which estimating should not consider */
56673+ unsigned want; /* number of units of item we want shifted */
56674+ coord_t source; /* item being estimated */
56675+ item_plugin *iplug;
56676+
56677+ /* shifting to left/right starts from first/last units of
56678+ @shift->wish_stop.node */
56679+ if (shift->pend == SHIFT_LEFT) {
56680+ coord_init_first_unit(&source, shift->wish_stop.node);
56681+ } else {
56682+ coord_init_last_unit(&source, shift->wish_stop.node);
56683+ }
56684+ shift->real_stop = source;
56685+
56686+ /* free space in target node and number of items in source */
56687+ target_free_space = znode_free_space(shift->target);
56688+
56689+ shift->everything = 0;
56690+ if (!node_is_empty(shift->target)) {
56691+ /* target node is not empty, check for boundary items
56692+ mergeability */
56693+ coord_t to;
56694+
56695+ /* item we try to merge @source with */
56696+ if (shift->pend == SHIFT_LEFT) {
56697+ coord_init_last_unit(&to, shift->target);
56698+ } else {
56699+ coord_init_first_unit(&to, shift->target);
56700+ }
56701+
56702+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
56703+ &source) :
56704+ are_items_mergeable(&source, &to)) {
56705+ /* how many units of @source do we want to merge to
56706+ item @to */
56707+ want =
56708+ wanted_units(&source, &shift->wish_stop,
56709+ shift->pend);
56710+
56711+ /* how many units of @source we can merge to item
56712+ @to */
56713+ iplug = item_plugin_by_coord(&source);
56714+ if (iplug->b.can_shift != NULL)
56715+ shift->merging_units =
56716+ iplug->b.can_shift(target_free_space,
56717+ &source, shift->target,
56718+ shift->pend, &size,
56719+ want);
56720+ else {
56721+ shift->merging_units = 0;
56722+ size = 0;
56723+ }
56724+ shift->merging_bytes = size;
56725+ shift->shift_bytes += size;
56726+ /* update stop coord to be set to last unit of @source
56727+ we can merge to @target */
56728+ if (shift->merging_units)
56729+ /* at least one unit can be shifted */
56730+ shift->real_stop.unit_pos =
56731+ (shift->merging_units - source.unit_pos -
56732+ 1) * shift->pend;
56733+ else {
56734+ /* nothing can be shifted */
56735+ if (shift->pend == SHIFT_LEFT)
56736+					coord_init_before_first_item(
56737+						&shift->real_stop, source.node);
56738+				else
56739+					coord_init_after_last_item(
56740+						&shift->real_stop, source.node);
56744+ }
56745+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
56746+
56747+ if (shift->merging_units != want) {
56748+ /* we could not copy as many as we want, so,
56749+ there is no reason for estimating any
56750+ longer */
56751+ return;
56752+ }
56753+
56754+ target_free_space -= size;
56755+ coord_add_item_pos(&source, shift->pend);
56756+ }
56757+ }
56758+
56759+	/* position of the first item, no part of which we want to shift */
56760+ stop_item = shift->wish_stop.item_pos + shift->pend;
56761+
56762+ /* calculate how many items can be copied into given free
56763+ space as whole */
56764+ for (; source.item_pos != stop_item;
56765+ coord_add_item_pos(&source, shift->pend)) {
56766+ if (shift->pend == SHIFT_RIGHT)
56767+ source.unit_pos = coord_last_unit_pos(&source);
56768+
56769+ /* how many units of @source do we want to copy */
56770+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
56771+
56772+ if (want == coord_last_unit_pos(&source) + 1) {
56773+ /* we want this item to be copied entirely */
56774+ size =
56775+ item_length_by_coord(&source) +
56776+ item_creation_overhead(&source);
56777+ if (size <= target_free_space) {
56778+ /* item fits into target node as whole */
56779+ target_free_space -= size;
56780+ shift->shift_bytes +=
56781+ size - item_creation_overhead(&source);
56782+ shift->entire_bytes +=
56783+ size - item_creation_overhead(&source);
56784+ shift->entire++;
56785+
56786+ /* update shift->real_stop coord to be set to
56787+ last unit of @source we can merge to
56788+ @target */
56789+ shift->real_stop = source;
56790+ if (shift->pend == SHIFT_LEFT)
56791+ shift->real_stop.unit_pos =
56792+				    coord_last_unit_pos(&shift->real_stop);
56794+ else
56795+ shift->real_stop.unit_pos = 0;
56796+ continue;
56797+ }
56798+ }
56799+
56800+	/* we reach here only for an item which does not fit into the
56801+	   target node in its entirety. This item may be either
56802+	   partially shifted, or not shifted at all. We will have to
56803+	   create a new item in the target node, so decrease the amount
56804+	   of free space by the item creation overhead. We can also
56805+	   reach here if the stop coord is in this item */
56806+ if (target_free_space >=
56807+ (unsigned)item_creation_overhead(&source)) {
56808+ target_free_space -= item_creation_overhead(&source);
56809+ iplug = item_plugin_by_coord(&source);
56810+ if (iplug->b.can_shift) {
56811+ shift->part_units = iplug->b.can_shift(target_free_space,
56812+ &source,
56813+ NULL, /* target */
56814+ shift->pend,
56815+ &size,
56816+ want);
56817+ } else {
56818+ target_free_space = 0;
56819+ shift->part_units = 0;
56820+ size = 0;
56821+ }
56822+ } else {
56823+ target_free_space = 0;
56824+ shift->part_units = 0;
56825+ size = 0;
56826+ }
56827+ shift->part_bytes = size;
56828+ shift->shift_bytes += size;
56829+
56830+ /* set @shift->real_stop to last unit of @source we can merge
56831+ to @shift->target */
56832+ if (shift->part_units) {
56833+ shift->real_stop = source;
56834+ shift->real_stop.unit_pos =
56835+ (shift->part_units - source.unit_pos -
56836+ 1) * shift->pend;
56837+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
56838+ }
56839+
56840+ if (want != shift->part_units)
56841+			/* not everything wanted was shifted */
56842+ return;
56843+ break;
56844+ }
56845+
56846+ shift->everything = 1;
56847+}
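/*
 * The whole-item phase of estimate_shift, sketched: each item costs its
 * body length plus a per-item creation overhead, and estimation stops at
 * the first item that no longer fits (that one would be shifted only
 * partially). Lengths and the overhead value are assumptions.
 */
#include <stdio.h>

int main(void)
{
	unsigned free_space = 300, overhead = 32;
	unsigned len[] = { 100, 80, 120, 50 };
	unsigned i, entire = 0, shifted_bytes = 0;

	for (i = 0; i < 4; i++) {
		unsigned size = len[i] + overhead;

		if (size > free_space)
			break;	/* this item would be shifted partially */
		free_space -= size;
		shifted_bytes += len[i];
		entire++;
	}
	printf("entire=%u bytes=%u free=%u\n", entire, shifted_bytes,
	       free_space);	/* entire=2 bytes=180 free=56 */
	return 0;
}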
56848+
56849+static void
56850+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
56851+ shift_direction dir, unsigned free_space)
56852+{
56853+ item_plugin *iplug;
56854+
56855+ assert("nikita-1463", target != NULL);
56856+ assert("nikita-1464", source != NULL);
56857+ assert("nikita-1465", from + count <= coord_num_units(source));
56858+
56859+ iplug = item_plugin_by_coord(source);
56860+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
56861+ iplug->b.copy_units(target, source, from, count, dir, free_space);
56862+
56863+ if (dir == SHIFT_RIGHT) {
56864+		/* FIXME-VS: this looks unnecessary. update_item_key was
56865+		   already called by the copy_units method */
56866+ reiser4_key split_key;
56867+
56868+ assert("nikita-1469", target->unit_pos == 0);
56869+
56870+ unit_key_by_coord(target, &split_key);
56871+ node_plugin_by_coord(target)->update_item_key(target,
56872+ &split_key, NULL);
56873+ }
56874+}
56875+
56876+/* copy part of @shift->real_stop.node starting either from its beginning or
56877+ from its end and ending at @shift->real_stop to either the end or the
56878+ beginning of @shift->target */
56879+static void copy(struct shift_params *shift)
56880+{
56881+ node40_header *nh;
56882+ coord_t from;
56883+ coord_t to;
56884+ item_header40 *from_ih, *to_ih;
56885+ int free_space_start;
56886+ int new_items;
56887+ unsigned old_items;
56888+ int old_offset;
56889+ unsigned i;
56890+
56891+ nh = node40_node_header(shift->target);
56892+ free_space_start = nh40_get_free_space_start(nh);
56893+ old_items = nh40_get_num_items(nh);
56894+ new_items = shift->entire + (shift->part_units ? 1 : 0);
56895+ assert("vs-185",
56896+ shift->shift_bytes ==
56897+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
56898+
56899+ from = shift->wish_stop;
56900+
56901+ coord_init_first_unit(&to, shift->target);
56902+
56903+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
56904+ hence to.between is set to EMPTY_NODE above. Looks like we want it
56905+ to be AT_UNIT.
56906+
56907+ Oh, wonders of ->betweeness...
56908+
56909+ */
56910+ to.between = AT_UNIT;
56911+
56912+ if (shift->pend == SHIFT_LEFT) {
56913+ /* copying to left */
56914+
56915+ coord_set_item_pos(&from, 0);
56916+ from_ih = node40_ih_at(from.node, 0);
56917+
56918+ coord_set_item_pos(&to,
56919+ node40_num_of_items_internal(to.node) - 1);
56920+ if (shift->merging_units) {
56921+ /* expand last item, so that plugin methods will see
56922+ correct data */
56923+ free_space_start += shift->merging_bytes;
56924+ nh40_set_free_space_start(nh,
56925+ (unsigned)free_space_start);
56926+ nh40_set_free_space(nh,
56927+ nh40_get_free_space(nh) -
56928+ shift->merging_bytes);
56929+
56930+ /* appending last item of @target */
56931+ copy_units(&to, &from, 0, /* starting from 0-th unit */
56932+ shift->merging_units, SHIFT_LEFT,
56933+ shift->merging_bytes);
56934+ coord_inc_item_pos(&from);
56935+ from_ih--;
56936+ coord_inc_item_pos(&to);
56937+ }
56938+
56939+ to_ih = node40_ih_at(shift->target, old_items);
56940+ if (shift->entire) {
56941+ /* copy @entire items entirely */
56942+
56943+ /* copy item headers */
56944+ memcpy(to_ih - shift->entire + 1,
56945+ from_ih - shift->entire + 1,
56946+ shift->entire * sizeof(item_header40));
56947+ /* update item header offset */
56948+ old_offset = ih40_get_offset(from_ih);
56949+			/* AUDIT: if we calculated old_offset + free_space_start here instead of just old_offset, we could save one "add" operation per iteration */
56950+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
56951+ ih40_set_offset(to_ih,
56952+ ih40_get_offset(from_ih) -
56953+ old_offset + free_space_start);
56954+
56955+ /* copy item bodies */
56956+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
56957+ shift->entire_bytes);
56958+
56959+ coord_add_item_pos(&from, (int)shift->entire);
56960+ coord_add_item_pos(&to, (int)shift->entire);
56961+ }
56962+
56963+ nh40_set_free_space_start(nh,
56964+ free_space_start +
56965+ shift->shift_bytes -
56966+ shift->merging_bytes);
56967+ nh40_set_free_space(nh,
56968+ nh40_get_free_space(nh) -
56969+ (shift->shift_bytes - shift->merging_bytes +
56970+ sizeof(item_header40) * new_items));
56971+
56972+ /* update node header */
56973+ node40_set_num_items(shift->target, nh, old_items + new_items);
56974+ assert("vs-170",
56975+ nh40_get_free_space(nh) < znode_size(shift->target));
56976+
56977+ if (shift->part_units) {
56978+ /* copy heading part (@part units) of @source item as
56979+ a new item into @target->node */
56980+
56981+ /* copy item header of partially copied item */
56982+ coord_set_item_pos(&to,
56983+ node40_num_of_items_internal(to.node)
56984+ - 1);
56985+ memcpy(to_ih, from_ih, sizeof(item_header40));
56986+ ih40_set_offset(to_ih,
56987+ nh40_get_free_space_start(nh) -
56988+ shift->part_bytes);
56989+ if (item_plugin_by_coord(&to)->b.init)
56990+ item_plugin_by_coord(&to)->b.init(&to, &from,
56991+ NULL);
56992+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
56993+ shift->part_bytes);
56994+ }
56995+
56996+ } else {
56997+ /* copying to right */
56998+
56999+ coord_set_item_pos(&from,
57000+ node40_num_of_items_internal(from.node) - 1);
57001+ from_ih = node40_ih_at_coord(&from);
57002+
57003+ coord_set_item_pos(&to, 0);
57004+
57005+ /* prepare space for new items */
57006+ memmove(zdata(to.node) + sizeof(node40_header) +
57007+ shift->shift_bytes,
57008+ zdata(to.node) + sizeof(node40_header),
57009+ free_space_start - sizeof(node40_header));
57010+ /* update item headers of moved items */
57011+ to_ih = node40_ih_at(to.node, 0);
57012+ /* first item gets @merging_bytes longer. free space appears
57013+ at its beginning */
57014+ if (!node_is_empty(to.node))
57015+ ih40_set_offset(to_ih,
57016+ ih40_get_offset(to_ih) +
57017+ shift->shift_bytes -
57018+ shift->merging_bytes);
57019+
57020+ for (i = 1; i < old_items; i++)
57021+ ih40_set_offset(to_ih - i,
57022+ ih40_get_offset(to_ih - i) +
57023+ shift->shift_bytes);
57024+
57025+ /* move item headers to make space for new items */
57026+ memmove(to_ih - old_items + 1 - new_items,
57027+ to_ih - old_items + 1,
57028+ sizeof(item_header40) * old_items);
57029+ to_ih -= (new_items - 1);
57030+
57031+ nh40_set_free_space_start(nh,
57032+ free_space_start +
57033+ shift->shift_bytes);
57034+ nh40_set_free_space(nh,
57035+ nh40_get_free_space(nh) -
57036+ (shift->shift_bytes +
57037+ sizeof(item_header40) * new_items));
57038+
57039+ /* update node header */
57040+ node40_set_num_items(shift->target, nh, old_items + new_items);
57041+ assert("vs-170",
57042+ nh40_get_free_space(nh) < znode_size(shift->target));
57043+
57044+ if (shift->merging_units) {
57045+ coord_add_item_pos(&to, new_items);
57046+ to.unit_pos = 0;
57047+ to.between = AT_UNIT;
57048+ /* prepend first item of @to */
57049+ copy_units(&to, &from,
57050+ coord_last_unit_pos(&from) -
57051+ shift->merging_units + 1,
57052+ shift->merging_units, SHIFT_RIGHT,
57053+ shift->merging_bytes);
57054+ coord_dec_item_pos(&from);
57055+ from_ih++;
57056+ }
57057+
57058+ if (shift->entire) {
57059+ /* copy @entire items entirely */
57060+
57061+ /* copy item headers */
57062+ memcpy(to_ih, from_ih,
57063+ shift->entire * sizeof(item_header40));
57064+
57065+ /* update item header offset */
57066+ old_offset =
57067+ ih40_get_offset(from_ih + shift->entire - 1);
57068+			/* AUDIT: the old_offset + sizeof(node40_header) + shift->part_bytes calculation can be hoisted out of the loop. */
57069+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
57070+ ih40_set_offset(to_ih,
57071+ ih40_get_offset(from_ih) -
57072+ old_offset +
57073+ sizeof(node40_header) +
57074+ shift->part_bytes);
57075+ /* copy item bodies */
57076+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
57077+ memcpy(zdata(to.node) + sizeof(node40_header) +
57078+ shift->part_bytes, item_by_coord_node40(&from),
57079+ shift->entire_bytes);
57080+ coord_dec_item_pos(&from);
57081+ }
57082+
57083+ if (shift->part_units) {
57084+ coord_set_item_pos(&to, 0);
57085+ to.unit_pos = 0;
57086+ to.between = AT_UNIT;
57087+ /* copy heading part (@part units) of @source item as
57088+ a new item into @target->node */
57089+
57090+ /* copy item header of partially copied item */
57091+ memcpy(to_ih, from_ih, sizeof(item_header40));
57092+ ih40_set_offset(to_ih, sizeof(node40_header));
57093+ if (item_plugin_by_coord(&to)->b.init)
57094+ item_plugin_by_coord(&to)->b.init(&to, &from,
57095+ NULL);
57096+ copy_units(&to, &from,
57097+ coord_last_unit_pos(&from) -
57098+ shift->part_units + 1, shift->part_units,
57099+ SHIFT_RIGHT, shift->part_bytes);
57100+ }
57101+ }
57102+}
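/*
 * Offset rebasing during copy(), in miniature: when item bodies are
 * copied as one contiguous block, each copied header keeps its relative
 * position via new_off = old_off - first_old_off + copy_destination,
 * mirroring the loops above. The values are assumptions for
 * illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned src_off[] = { 200, 260, 330 };	/* offsets in source node */
	unsigned dest_start = 1000;	/* free_space_start in the target */
	unsigned base = src_off[0], i;

	for (i = 0; i < 3; i++)
		printf("item %u: %u -> %u\n", i, src_off[i],
		       src_off[i] - base + dest_start);	/* 1000 1060 1130 */
	return 0;
}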
57103+
57104+/* remove everything either before or after @shift->real_stop. The number of
57105+   items removed completely is returned */
57106+static int delete_copied(struct shift_params *shift)
57107+{
57108+ coord_t from;
57109+ coord_t to;
57110+ struct carry_cut_data cdata;
57111+
57112+ if (shift->pend == SHIFT_LEFT) {
57113+		/* we were shifting to the left, remove everything from the
57114+		   beginning of @shift->real_stop.node up to
57115+		   @shift->real_stop */
57116+ coord_init_first_unit(&from, shift->real_stop.node);
57117+ to = shift->real_stop;
57118+
57119+ /* store old coordinate of unit which will be first after
57120+ shift to left */
57121+ shift->u.future_first = to;
57122+ coord_next_unit(&shift->u.future_first);
57123+ } else {
57124+		/* we were shifting to the right, remove everything from
57125+		   @shift->real_stop up to the end of
57126+		   @shift->real_stop.node */
57127+ from = shift->real_stop;
57128+ coord_init_last_unit(&to, from.node);
57129+
57130+ /* store old coordinate of unit which will be last after
57131+ shift to right */
57132+ shift->u.future_last = from;
57133+ coord_prev_unit(&shift->u.future_last);
57134+ }
57135+
57136+ cdata.params.from = &from;
57137+ cdata.params.to = &to;
57138+ cdata.params.from_key = NULL;
57139+ cdata.params.to_key = NULL;
57140+ cdata.params.smallest_removed = NULL;
57141+ return cut_node40(&cdata, NULL);
57142+}
57143+
57144+/* something was moved between @left and @right. Add carry operation to @info
57145+ list to have carry to update delimiting key between them */
57146+static int
57147+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
57148+{
57149+ carry_op *op;
57150+ carry_node *cn;
57151+
57152+ if (info == NULL)
57153+ /* nowhere to send operation to. */
57154+ return 0;
57155+
57156+ if (!should_notify_parent(right))
57157+ return 0;
57158+
57159+ op = node_post_carry(info, COP_UPDATE, right, 1);
57160+ if (IS_ERR(op) || op == NULL)
57161+ return op ? PTR_ERR(op) : -EIO;
57162+
57163+ if (left != NULL) {
57164+ carry_node *reference;
57165+
57166+ if (info->doing)
57167+ reference = insert_carry_node(info->doing,
57168+ info->todo, left);
57169+ else
57170+ reference = op->node;
57171+ assert("nikita-2992", reference != NULL);
57172+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
57173+ if (IS_ERR(cn))
57174+ return PTR_ERR(cn);
57175+ cn->parent = 1;
57176+ cn->node = left;
57177+ if (ZF_ISSET(left, JNODE_ORPHAN))
57178+ cn->left_before = 1;
57179+ op->u.update.left = cn;
57180+ } else
57181+ op->u.update.left = NULL;
57182+ return 0;
57183+}
57184+
57185+/* plugin->u.node.prepare_removal
57186+ to delete a pointer to @empty from the tree add corresponding carry
57187+ operation (delete) to @info list */
57188+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
57189+{
57190+ carry_op *op;
57191+ reiser4_tree *tree;
57192+
57193+ if (!should_notify_parent(empty))
57194+ return 0;
57195+ /* already on a road to Styx */
57196+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
57197+ return 0;
57198+ op = node_post_carry(info, COP_DELETE, empty, 1);
57199+ if (IS_ERR(op) || op == NULL)
57200+ return RETERR(op ? PTR_ERR(op) : -EIO);
57201+
57202+ op->u.delete.child = NULL;
57203+ op->u.delete.flags = 0;
57204+
57205+ /* fare thee well */
57206+ tree = znode_get_tree(empty);
57207+ read_lock_tree(tree);
57208+ write_lock_dk(tree);
57209+ znode_set_ld_key(empty, znode_get_rd_key(empty));
57210+ if (znode_is_left_connected(empty) && empty->left)
57211+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57212+ write_unlock_dk(tree);
57213+ read_unlock_tree(tree);
57214+
57215+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
57216+ return 0;
57217+}
57218+
57219+/* something was shifted from @insert_coord->node to @shift->target, update
57220+   @insert_coord accordingly */
57221+static void
57222+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
57223+ int including_insert_coord)
57224+{
57225+ /* item plugin was invalidated by shifting */
57226+ coord_clear_iplug(insert_coord);
57227+
57228+ if (node_is_empty(shift->wish_stop.node)) {
57229+ assert("vs-242", shift->everything);
57230+ if (including_insert_coord) {
57231+ if (shift->pend == SHIFT_RIGHT) {
57232+ /* set @insert_coord before first unit of
57233+ @shift->target node */
57234+ coord_init_before_first_item(insert_coord,
57235+ shift->target);
57236+ } else {
57237+ /* set @insert_coord after last in target node */
57238+ coord_init_after_last_item(insert_coord,
57239+ shift->target);
57240+ }
57241+ } else {
57242+ /* set @insert_coord inside of empty node. There is
57243+ only one possible coord within an empty
57244+ node. init_first_unit will set that coord */
57245+ coord_init_first_unit(insert_coord,
57246+ shift->wish_stop.node);
57247+ }
57248+ return;
57249+ }
57250+
57251+ if (shift->pend == SHIFT_RIGHT) {
57252+ /* there was shifting to right */
57253+ if (shift->everything) {
57254+ /* everything wanted was shifted */
57255+ if (including_insert_coord) {
57256+ /* @insert_coord is set before first unit of
57257+ @to node */
57258+ coord_init_before_first_item(insert_coord,
57259+ shift->target);
57260+ insert_coord->between = BEFORE_UNIT;
57261+ } else {
57262+ /* @insert_coord is set after last unit of
57263+ @insert->node */
57264+ coord_init_last_unit(insert_coord,
57265+ shift->wish_stop.node);
57266+ insert_coord->between = AFTER_UNIT;
57267+ }
57268+ }
57269+ return;
57270+ }
57271+
57272+ /* there was shifting to left */
57273+ if (shift->everything) {
57274+ /* everything wanted was shifted */
57275+ if (including_insert_coord) {
57276+ /* @insert_coord is set after last unit in @to node */
57277+ coord_init_after_last_item(insert_coord, shift->target);
57278+ } else {
57279+ /* @insert_coord is set before first unit in the same
57280+ node */
57281+ coord_init_before_first_item(insert_coord,
57282+ shift->wish_stop.node);
57283+ }
57284+ return;
57285+ }
57286+
57287+ /* FIXME-VS: the code below is complicated because with between ==
57288+ AFTER_ITEM unit_pos is set to 0 */
57289+
57290+ if (!removed) {
57291+ /* no items were shifted entirely */
57292+ assert("vs-195", shift->merging_units == 0
57293+ || shift->part_units == 0);
57294+
57295+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
57296+ if (shift->merging_units) {
57297+ if (insert_coord->between == AFTER_UNIT) {
57298+ assert("nikita-1441",
57299+ insert_coord->unit_pos >=
57300+ shift->merging_units);
57301+ insert_coord->unit_pos -=
57302+ shift->merging_units;
57303+ } else if (insert_coord->between == BEFORE_UNIT) {
57304+ assert("nikita-2090",
57305+ insert_coord->unit_pos >
57306+ shift->merging_units);
57307+ insert_coord->unit_pos -=
57308+ shift->merging_units;
57309+ }
57310+
57311+ assert("nikita-2083",
57312+ insert_coord->unit_pos + 1);
57313+ } else {
57314+ if (insert_coord->between == AFTER_UNIT) {
57315+ assert("nikita-1442",
57316+ insert_coord->unit_pos >=
57317+ shift->part_units);
57318+ insert_coord->unit_pos -=
57319+ shift->part_units;
57320+ } else if (insert_coord->between == BEFORE_UNIT) {
57321+ assert("nikita-2089",
57322+ insert_coord->unit_pos >
57323+ shift->part_units);
57324+ insert_coord->unit_pos -=
57325+ shift->part_units;
57326+ }
57327+
57328+ assert("nikita-2084",
57329+ insert_coord->unit_pos + 1);
57330+ }
57331+ }
57332+ return;
57333+ }
57334+
57335+	/* we shifted to the left and there was not enough space for everything */
57336+ switch (insert_coord->between) {
57337+ case AFTER_UNIT:
57338+ case BEFORE_UNIT:
57339+ if (shift->real_stop.item_pos == insert_coord->item_pos)
57340+ insert_coord->unit_pos -= shift->part_units;
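+		/* fall through: the item position is adjusted as well */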
57341+ case AFTER_ITEM:
57342+ coord_add_item_pos(insert_coord, -removed);
57343+ break;
57344+ default:
57345+ impossible("nikita-2087", "not ready");
57346+ }
57347+ assert("nikita-2085", insert_coord->unit_pos + 1);
57348+}
57349+
57350+static int call_shift_hooks(struct shift_params *shift)
57351+{
57352+ unsigned i, shifted;
57353+ coord_t coord;
57354+ item_plugin *iplug;
57355+
57356+ assert("vs-275", !node_is_empty(shift->target));
57357+
57358+ /* number of items shift touches */
57359+ shifted =
57360+ shift->entire + (shift->merging_units ? 1 : 0) +
57361+ (shift->part_units ? 1 : 0);
57362+
57363+ if (shift->pend == SHIFT_LEFT) {
57364+ /* moved items are at the end */
57365+ coord_init_last_unit(&coord, shift->target);
57366+ coord.unit_pos = 0;
57367+
57368+ assert("vs-279", shift->pend == 1);
57369+ for (i = 0; i < shifted; i++) {
57370+ unsigned from, count;
57371+
57372+ iplug = item_plugin_by_coord(&coord);
57373+ if (i == 0 && shift->part_units) {
57374+ assert("vs-277",
57375+ coord_num_units(&coord) ==
57376+ shift->part_units);
57377+ count = shift->part_units;
57378+ from = 0;
57379+ } else if (i == shifted - 1 && shift->merging_units) {
57380+ count = shift->merging_units;
57381+ from = coord_num_units(&coord) - count;
57382+ } else {
57383+ count = coord_num_units(&coord);
57384+ from = 0;
57385+ }
57386+
57387+ if (iplug->b.shift_hook) {
57388+ iplug->b.shift_hook(&coord, from, count,
57389+ shift->wish_stop.node);
57390+ }
57391+ coord_add_item_pos(&coord, -shift->pend);
57392+ }
57393+ } else {
57394+ /* moved items are at the beginning */
57395+ coord_init_first_unit(&coord, shift->target);
57396+
57397+ assert("vs-278", shift->pend == -1);
57398+ for (i = 0; i < shifted; i++) {
57399+ unsigned from, count;
57400+
57401+ iplug = item_plugin_by_coord(&coord);
57402+ if (i == 0 && shift->part_units) {
57403+ assert("vs-277",
57404+ coord_num_units(&coord) ==
57405+ shift->part_units);
57406+ count = coord_num_units(&coord);
57407+ from = 0;
57408+ } else if (i == shifted - 1 && shift->merging_units) {
57409+ count = shift->merging_units;
57410+ from = 0;
57411+ } else {
57412+ count = coord_num_units(&coord);
57413+ from = 0;
57414+ }
57415+
57416+ if (iplug->b.shift_hook) {
57417+ iplug->b.shift_hook(&coord, from, count,
57418+ shift->wish_stop.node);
57419+ }
57420+ coord_add_item_pos(&coord, -shift->pend);
57421+ }
57422+ }
57423+
57424+ return 0;
57425+}
57426+
57427+/* shift to left is completed. Return 1 if unit @old was moved to the left neighbor */
57428+static int
57429+unit_moved_left(const struct shift_params *shift, const coord_t * old)
57430+{
57431+ assert("vs-944", shift->real_stop.node == old->node);
57432+
57433+ if (shift->real_stop.item_pos < old->item_pos)
57434+ return 0;
57435+ if (shift->real_stop.item_pos == old->item_pos) {
57436+ if (shift->real_stop.unit_pos < old->unit_pos)
57437+ return 0;
57438+ }
57439+ return 1;
57440+}
57441+
57442+/* shift to right is completed. Return 1 if unit @old was moved to the right
57443+ neighbor */
57444+static int
57445+unit_moved_right(const struct shift_params *shift, const coord_t * old)
57446+{
57447+ assert("vs-944", shift->real_stop.node == old->node);
57448+
57449+ if (shift->real_stop.item_pos > old->item_pos)
57450+ return 0;
57451+ if (shift->real_stop.item_pos == old->item_pos) {
57452+ if (shift->real_stop.unit_pos > old->unit_pos)
57453+ return 0;
57454+ }
57455+ return 1;
57456+}
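+
+/* Note: both predicates above amount to a lexicographic comparison of
+   (item_pos, unit_pos) between @shift->real_stop and @old. */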
57457+
57458+/* coord @old was set in the node from which the shift was performed. What was
57459+ shifted is stored in @shift. Update @old to reflect the performed shift */
57460+static coord_t *adjust_coord2(const struct shift_params *shift,
57461+ const coord_t * old, coord_t * new)
57462+{
57463+	coord_clear_iplug(new);
57464+	new->between = old->between;
57465+
57467+ if (old->node == shift->target) {
57468+ if (shift->pend == SHIFT_LEFT) {
57469+ /* coord which is set inside of left neighbor does not
57470+ change during shift to left */
57471+ coord_dup(new, old);
57472+ return new;
57473+ }
57474+ new->node = old->node;
57475+ coord_set_item_pos(new,
57476+ old->item_pos + shift->entire +
57477+ (shift->part_units ? 1 : 0));
57478+ new->unit_pos = old->unit_pos;
57479+ if (old->item_pos == 0 && shift->merging_units)
57480+ new->unit_pos += shift->merging_units;
57481+ return new;
57482+ }
57483+
57484+ assert("vs-977", old->node == shift->wish_stop.node);
57485+ if (shift->pend == SHIFT_LEFT) {
57486+ if (unit_moved_left(shift, old)) {
57487+ /* unit @old moved to left neighbor. Calculate its
57488+ coordinate there */
57489+ new->node = shift->target;
57490+ coord_set_item_pos(new,
57491+ node_num_items(shift->target) -
57492+ shift->entire -
57493+ (shift->part_units ? 1 : 0) +
57494+ old->item_pos);
57495+
57496+ new->unit_pos = old->unit_pos;
57497+ if (shift->merging_units) {
57498+ coord_dec_item_pos(new);
57499+ if (old->item_pos == 0) {
57500+ /* unit_pos only changes if item got
57501+ merged */
57502+ new->unit_pos =
57503+ coord_num_units(new) -
57504+ (shift->merging_units -
57505+ old->unit_pos);
57506+ }
57507+ }
57508+ } else {
57509+ /* unit @old did not move to left neighbor.
57510+
57511+ Use _nocheck, because @old is outside of its node.
57512+ */
57513+ coord_dup_nocheck(new, old);
57514+ coord_add_item_pos(new,
57515+ -shift->u.future_first.item_pos);
57516+ if (new->item_pos == 0)
57517+ new->unit_pos -= shift->u.future_first.unit_pos;
57518+ }
57519+ } else {
57520+ if (unit_moved_right(shift, old)) {
57521+ /* unit @old moved to right neighbor */
57522+ new->node = shift->target;
57523+ coord_set_item_pos(new,
57524+ old->item_pos -
57525+ shift->real_stop.item_pos);
57526+ if (new->item_pos == 0) {
57527+ /* unit @old might change unit pos */
57528+				new->unit_pos =
57529+				    old->unit_pos -
57530+				    shift->real_stop.unit_pos;
57531+ }
57532+ } else {
57533+ /* unit @old did not move to right neighbor, therefore
57534+ it did not change */
57535+ coord_dup(new, old);
57536+ }
57537+ }
57538+ coord_set_iplug(new, item_plugin_by_coord(new));
57539+ return new;
57540+}
57541+
57542+/* this is called when a shift is completed (part of the source node was
57543+ copied to the target and deleted from the source) to update all taps set in
57544+ the current context */
57545+static void update_taps(const struct shift_params *shift)
57546+{
57547+ tap_t *tap;
57548+ coord_t new;
57549+
57550+ for_all_taps(tap) {
57551+ /* update only taps set to nodes participating in shift */
57552+ if (tap->coord->node == shift->wish_stop.node
57553+ || tap->coord->node == shift->target)
57554+ tap_to_coord(tap,
57555+ adjust_coord2(shift, tap->coord, &new));
57556+ }
57557+}
57558+
57559+#if REISER4_DEBUG
57560+
57561+struct shift_check {
57562+ reiser4_key key;
57563+ __u16 plugin_id;
57564+ union {
57565+ __u64 bytes;
57566+ __u64 entries;
57567+ void *unused;
57568+ } u;
57569+};
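+
+/* A minimal pairing sketch (hypothetical caller, not from this patch):
+   the snapshot that shift_check_prepare() takes before a shift is
+   verified -- and kfree()d -- by shift_check() after it:
+
+	void *vp = shift_check_prepare(left, right);
+	...shift items between @left and @right...
+	shift_check(vp, left, right);
+*/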
57570+
57571+void *shift_check_prepare(const znode * left, const znode * right)
57572+{
57573+ pos_in_node_t i, nr_items;
57574+ int mergeable;
57575+ struct shift_check *data;
57576+ item_header40 *ih;
57577+
57578+ if (node_is_empty(left) || node_is_empty(right))
57579+ mergeable = 0;
57580+ else {
57581+ coord_t l, r;
57582+
57583+ coord_init_last_unit(&l, left);
57584+ coord_init_first_unit(&r, right);
57585+ mergeable = are_items_mergeable(&l, &r);
57586+ }
57587+ nr_items =
57588+ node40_num_of_items_internal(left) +
57589+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57590+ data =
57591+ kmalloc(sizeof(struct shift_check) * nr_items,
57592+ reiser4_ctx_gfp_mask_get());
57593+ if (data != NULL) {
57594+ coord_t coord;
57595+ pos_in_node_t item_pos;
57596+
57597+ coord_init_first_unit(&coord, left);
57598+ i = 0;
57599+
57600+ for (item_pos = 0;
57601+ item_pos < node40_num_of_items_internal(left);
57602+ item_pos++) {
57603+
57604+ coord_set_item_pos(&coord, item_pos);
57605+ ih = node40_ih_at_coord(&coord);
57606+
57607+ data[i].key = ih->key;
57608+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57609+ switch (data[i].plugin_id) {
57610+ case CTAIL_ID:
57611+ case FORMATTING_ID:
57612+ data[i].u.bytes = coord_num_units(&coord);
57613+ break;
57614+ case EXTENT_POINTER_ID:
57615+ data[i].u.bytes =
57616+ reiser4_extent_size(&coord,
57617+ coord_num_units(&coord));
57618+ break;
57619+ case COMPOUND_DIR_ID:
57620+ data[i].u.entries = coord_num_units(&coord);
57621+ break;
57622+ default:
57623+ data[i].u.unused = NULL;
57624+ break;
57625+ }
57626+ i++;
57627+ }
57628+
57629+ coord_init_first_unit(&coord, right);
57630+
57631+ if (mergeable) {
57632+ assert("vs-1609", i != 0);
57633+
57634+ ih = node40_ih_at_coord(&coord);
57635+
57636+ assert("vs-1589",
57637+ data[i - 1].plugin_id ==
57638+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
57639+ switch (data[i - 1].plugin_id) {
57640+ case CTAIL_ID:
57641+ case FORMATTING_ID:
57642+ data[i - 1].u.bytes += coord_num_units(&coord);
57643+ break;
57644+ case EXTENT_POINTER_ID:
57645+ data[i - 1].u.bytes +=
57646+ reiser4_extent_size(&coord,
57647+ coord_num_units(&coord));
57648+ break;
57649+ case COMPOUND_DIR_ID:
57650+ data[i - 1].u.entries +=
57651+ coord_num_units(&coord);
57652+ break;
57653+ default:
57654+ impossible("vs-1605", "wrong mergeable item");
57655+ break;
57656+ }
57657+ item_pos = 1;
57658+ } else
57659+ item_pos = 0;
57660+ for (; item_pos < node40_num_of_items_internal(right);
57661+ item_pos++) {
57662+
57663+ assert("vs-1604", i < nr_items);
57664+ coord_set_item_pos(&coord, item_pos);
57665+ ih = node40_ih_at_coord(&coord);
57666+
57667+ data[i].key = ih->key;
57668+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57669+ switch (data[i].plugin_id) {
57670+ case CTAIL_ID:
57671+ case FORMATTING_ID:
57672+ data[i].u.bytes = coord_num_units(&coord);
57673+ break;
57674+ case EXTENT_POINTER_ID:
57675+ data[i].u.bytes =
57676+ reiser4_extent_size(&coord,
57677+ coord_num_units(&coord));
57678+ break;
57679+ case COMPOUND_DIR_ID:
57680+ data[i].u.entries = coord_num_units(&coord);
57681+ break;
57682+ default:
57683+ data[i].u.unused = NULL;
57684+ break;
57685+ }
57686+ i++;
57687+ }
57688+ assert("vs-1606", i == nr_items);
57689+ }
57690+ return data;
57691+}
57692+
57693+void shift_check(void *vp, const znode * left, const znode * right)
57694+{
57695+ pos_in_node_t i, nr_items;
57696+ coord_t coord;
57697+ __u64 last_bytes;
57698+ int mergeable;
57699+ item_header40 *ih;
57700+ pos_in_node_t item_pos;
57701+ struct shift_check *data;
57702+
57703+ data = (struct shift_check *)vp;
57704+
57705+ if (data == NULL)
57706+ return;
57707+
57708+ if (node_is_empty(left) || node_is_empty(right))
57709+ mergeable = 0;
57710+ else {
57711+ coord_t l, r;
57712+
57713+ coord_init_last_unit(&l, left);
57714+ coord_init_first_unit(&r, right);
57715+ mergeable = are_items_mergeable(&l, &r);
57716+ }
57717+
57718+ nr_items =
57719+ node40_num_of_items_internal(left) +
57720+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57721+
57722+ i = 0;
57723+ last_bytes = 0;
57724+
57725+ coord_init_first_unit(&coord, left);
57726+
57727+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
57728+ item_pos++) {
57729+
57730+ coord_set_item_pos(&coord, item_pos);
57731+ ih = node40_ih_at_coord(&coord);
57732+
57733+ assert("vs-1611", i == item_pos);
57734+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
57735+ assert("vs-1591",
57736+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57737+ if ((i < (node40_num_of_items_internal(left) - 1))
57738+ || !mergeable) {
57739+ switch (data[i].plugin_id) {
57740+ case CTAIL_ID:
57741+ case FORMATTING_ID:
57742+ assert("vs-1592",
57743+ data[i].u.bytes ==
57744+ coord_num_units(&coord));
57745+ break;
57746+ case EXTENT_POINTER_ID:
57747+ assert("vs-1593",
57748+ data[i].u.bytes ==
57749+ reiser4_extent_size(&coord,
57750+ coord_num_units
57751+ (&coord)));
57752+ break;
57753+ case COMPOUND_DIR_ID:
57754+ assert("vs-1594",
57755+ data[i].u.entries ==
57756+ coord_num_units(&coord));
57757+ break;
57758+ default:
57759+ break;
57760+ }
57761+ }
57762+ if (item_pos == (node40_num_of_items_internal(left) - 1)
57763+ && mergeable) {
57764+ switch (data[i].plugin_id) {
57765+ case CTAIL_ID:
57766+ case FORMATTING_ID:
57767+ last_bytes = coord_num_units(&coord);
57768+ break;
57769+ case EXTENT_POINTER_ID:
57770+ last_bytes =
57771+ reiser4_extent_size(&coord,
57772+ coord_num_units(&coord));
57773+ break;
57774+ case COMPOUND_DIR_ID:
57775+ last_bytes = coord_num_units(&coord);
57776+ break;
57777+ default:
57778+ impossible("vs-1595", "wrong mergeable item");
57779+ break;
57780+ }
57781+ }
57782+ i++;
57783+ }
57784+
57785+ coord_init_first_unit(&coord, right);
57786+ if (mergeable) {
57787+ ih = node40_ih_at_coord(&coord);
57788+
57789+ assert("vs-1589",
57790+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
57791+ assert("vs-1608", last_bytes != 0);
57792+ switch (data[i - 1].plugin_id) {
57793+ case CTAIL_ID:
57794+ case FORMATTING_ID:
57795+ assert("vs-1596",
57796+ data[i - 1].u.bytes ==
57797+ last_bytes + coord_num_units(&coord));
57798+ break;
57799+
57800+ case EXTENT_POINTER_ID:
57801+ assert("vs-1597",
57802+ data[i - 1].u.bytes ==
57803+ last_bytes + reiser4_extent_size(&coord,
57804+ coord_num_units
57805+ (&coord)));
57806+ break;
57807+
57808+ case COMPOUND_DIR_ID:
57809+ assert("vs-1598",
57810+ data[i - 1].u.bytes ==
57811+ last_bytes + coord_num_units(&coord));
57812+ break;
57813+ default:
57814+ impossible("vs-1599", "wrong mergeable item");
57815+ break;
57816+ }
57817+ item_pos = 1;
57818+ } else
57819+ item_pos = 0;
57820+
57821+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
57822+
57823+ coord_set_item_pos(&coord, item_pos);
57824+ ih = node40_ih_at_coord(&coord);
57825+
57826+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
57827+ assert("vs-1613",
57828+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57829+ switch (data[i].plugin_id) {
57830+ case CTAIL_ID:
57831+ case FORMATTING_ID:
57832+ assert("vs-1600",
57833+ data[i].u.bytes == coord_num_units(&coord));
57834+ break;
57835+ case EXTENT_POINTER_ID:
57836+ assert("vs-1601",
57837+ data[i].u.bytes ==
57838+ reiser4_extent_size(&coord,
57839+ coord_num_units
57840+ (&coord)));
57841+ break;
57842+ case COMPOUND_DIR_ID:
57843+ assert("vs-1602",
57844+ data[i].u.entries == coord_num_units(&coord));
57845+ break;
57846+ default:
57847+ break;
57848+ }
57849+ i++;
57850+ }
57851+
57852+ assert("vs-1603", i == nr_items);
57853+ kfree(data);
57854+}
57855+
57856+#endif
57857+
57858+/* plugin->u.node.shift
57859+ look for description of this method in plugin/node/node.h */
57860+int shift_node40(coord_t * from, znode * to, shift_direction pend,
57861+		 /* if this is set to 1 and @from->node becomes empty, it will be deleted from the tree */
57862+		 int delete_child, int including_stop_coord, carry_plugin_info * info)
57863+{
57864+ struct shift_params shift;
57865+ int result;
57866+ znode *left, *right;
57867+ znode *source;
57868+ int target_empty;
57869+
57870+ assert("nikita-2161", coord_check(from));
57871+
57872+ memset(&shift, 0, sizeof(shift));
57873+ shift.pend = pend;
57874+ shift.wish_stop = *from;
57875+ shift.target = to;
57876+
57877+ assert("nikita-1473", znode_is_write_locked(from->node));
57878+ assert("nikita-1474", znode_is_write_locked(to));
57879+
57880+ source = from->node;
57881+
57882+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
57883+ shifted */
57884+ if (pend == SHIFT_LEFT) {
57885+ result = coord_set_to_left(&shift.wish_stop);
57886+ left = to;
57887+ right = from->node;
57888+ } else {
57889+ result = coord_set_to_right(&shift.wish_stop);
57890+ left = from->node;
57891+ right = to;
57892+ }
57893+
57894+ if (result) {
57895+ /* move insertion coord even if there is nothing to move */
57896+ if (including_stop_coord) {
57897+ /* move insertion coord (@from) */
57898+ if (pend == SHIFT_LEFT) {
57899+ /* after last item in target node */
57900+ coord_init_after_last_item(from, to);
57901+ } else {
57902+ /* before first item in target node */
57903+ coord_init_before_first_item(from, to);
57904+ }
57905+ }
57906+
57907+ if (delete_child && node_is_empty(shift.wish_stop.node))
57908+ result =
57909+ prepare_removal_node40(shift.wish_stop.node, info);
57910+ else
57911+ result = 0;
57912+ /* there is nothing to shift */
57913+ assert("nikita-2078", coord_check(from));
57914+ return result;
57915+ }
57916+
57917+ target_empty = node_is_empty(to);
57918+
57919+	/* when the first node plugin with item body compression is implemented,
57920+	   this must be changed to call the node-specific plugin */
57921+
57922+	/* shift->stop_coord is updated to the last unit that will actually be
57923+	   shifted */
57924+ estimate_shift(&shift, get_current_context());
57925+ if (!shift.shift_bytes) {
57926+ /* we could not shift anything */
57927+ assert("nikita-2079", coord_check(from));
57928+ return 0;
57929+ }
57930+
57931+ copy(&shift);
57932+
57933+	/* the result value of this is important: it is used by adjust_coord() below */
57934+ result = delete_copied(&shift);
57935+
57936+ assert("vs-1610", result >= 0);
57937+ assert("vs-1471",
57938+ ((reiser4_context *) current->journal_info)->magic ==
57939+ context_magic);
57940+
57941+	/* an item which has been moved from one node to another might want to
57942+	   do something on that event. This can be done by the item's shift_hook
57943+	   method, which is now called for every moved item */
57944+ call_shift_hooks(&shift);
57945+
57946+ assert("vs-1472",
57947+ ((reiser4_context *) current->journal_info)->magic ==
57948+ context_magic);
57949+
57950+ update_taps(&shift);
57951+
57952+ assert("vs-1473",
57953+ ((reiser4_context *) current->journal_info)->magic ==
57954+ context_magic);
57955+
57956+	/* adjust the @from pointer in accordance with the @including_stop_coord
57957+	   flag and the amount of data actually shifted */
57958+ adjust_coord(from, &shift, result, including_stop_coord);
57959+
57960+ if (target_empty)
57961+ /*
57962+ * items were shifted into empty node. Update delimiting key.
57963+ */
57964+ result = prepare_for_update(NULL, left, info);
57965+
57966+ /* add update operation to @info, which is the list of operations to
57967+ be performed on a higher level */
57968+ result = prepare_for_update(left, right, info);
57969+ if (!result && node_is_empty(source) && delete_child) {
57970+		/* all contents of @from->node were moved to @to and @from->node
57971+		   has to be removed from the tree, so on a higher level we
57972+		   will remove the pointer to node @from->node */
57973+ result = prepare_removal_node40(source, info);
57974+ }
57975+ assert("nikita-2080", coord_check(from));
57976+ return result ? result : (int)shift.shift_bytes;
57977+}
57978+
57979+/* plugin->u.node.fast_insert()
57980+ look for description of this method in plugin/node/node.h */
57981+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57982+{
57983+ return 1;
57984+}
57985+
57986+/* plugin->u.node.fast_paste()
57987+ look for description of this method in plugin/node/node.h */
57988+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57989+{
57990+ return 1;
57991+}
57992+
57993+/* plugin->u.node.fast_cut()
57994+ look for description of this method in plugin/node/node.h */
57995+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57996+{
57997+ return 1;
57998+}
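+
+/* All three fast_* predicates unconditionally allow the fast path for
+   node40; per the comment in node.h, a reiser3.x format emulation would
+   return 0 here instead. */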
57999+
58000+/* plugin->u.node.modify - not defined */
58001+
58002+/* plugin->u.node.max_item_size */
58003+int max_item_size_node40(void)
58004+{
58005+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
58006+ sizeof(item_header40);
58007+}
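+
+/* Worked example (assuming the dN on-disk types are N/8 bytes wide, as
+   their names suggest): for a 4096-byte block, the 28-byte node40_header
+   and 30-byte item_header40 leave 4096 - 28 - 30 = 4038 bytes. */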
58008+
58009+/* plugin->u.node.set_item_plugin */
58010+int set_item_plugin_node40(coord_t *coord, item_id id)
58011+{
58012+ item_header40 *ih;
58013+
58014+ ih = node40_ih_at_coord(coord);
58015+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
58016+ coord->iplugid = id;
58017+ return 0;
58018+}
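+
+/* A hypothetical sketch of the tail-conversion use that node.h names for
+   this method, assuming FORMATTING_ID (the tail item id paired with
+   CTAIL_ID in the size checks above) is the target id:
+
+	coord_t coord;
+	...position @coord at the item being converted...
+	set_item_plugin_node40(&coord, FORMATTING_ID);
+*/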
58019+
58020+/*
58021+ Local variables:
58022+ c-indentation-style: "K&R"
58023+ mode-name: "LC"
58024+ c-basic-offset: 8
58025+ tab-width: 8
58026+ fill-column: 120
58027+ scroll-step: 1
58028+ End:
58029+*/
58030diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node40.h linux-2.6.20/fs/reiser4/plugin/node/node40.h
58031--- linux-2.6.20.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
58032+++ linux-2.6.20/fs/reiser4/plugin/node/node40.h 2007-05-06 14:50:43.835018219 +0400
58033@@ -0,0 +1,125 @@
58034+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58035+
58036+#if !defined( __REISER4_NODE40_H__ )
58037+#define __REISER4_NODE40_H__
58038+
58039+#include "../../forward.h"
58040+#include "../../dformat.h"
58041+#include "node.h"
58042+
58043+#include <linux/types.h>
58044+
58045+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
58046+typedef struct node40_header {
58047+ /* identifier of node plugin. Must be located at the very beginning
58048+ of a node. */
58049+ common_node_header common_header; /* this is 16 bits */
58050+	/* number of items. Should be the first element in the node header,
58051+	   because we have not yet decided whether it should go into
58052+	   common_header.
58053+	*/
58054+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
58055+ * node format at compile time, and it is this one, accesses to these fields do not go through a
58056+ * function dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
58057+ d16 nr_items;
58058+ /* free space in node measured in bytes */
58059+ d16 free_space;
58060+ /* offset to start of free space in node */
58061+ d16 free_space_start;
58062+ /* for reiser4_fsck. When information about what is a free
58063+ block is corrupted, and we try to recover everything even
58064+ if marked as freed, then old versions of data may
58065+ duplicate newer versions, and this field allows us to
58066+ restore the newer version. Also useful for when users
58067+ who don't have the new trashcan installed on their linux distro
58068+ delete the wrong files and send us desperate emails
58069+ offering $25 for them back. */
58070+
58071+	/* magic field we use to recognize formatted nodes. NIKITA-FIXME-HANS: improve this comment */
58072+ d32 magic;
58073+	/* flushstamp is made of mk_id and write_counter. mk_id is an
58074+	   id generated randomly at mkreiserfs time, so we can simply
58075+	   skip all nodes with a different mk_id. write_counter is a d64
58076+	   incrementing counter of writes to disk. It is used for
58077+	   choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was the field name changed but not the comment? */
58078+
58079+ d32 mkfs_id;
58080+ d64 flush_id;
58081+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
58082+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
58083+ d16 flags;
58084+
58085+ /* 1 is leaf level, 2 is twig level, root is the numerically
58086+ largest level */
58087+ d8 level;
58088+
58089+ d8 pad;
58090+} PACKED node40_header;
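+
+/* A minimal accessor sketch (hypothetical helper, not from this patch),
+   assuming d16 is a little-endian 16-bit on-disk type, matching the
+   le16_to_cpu(get_unaligned(...)) accesses used in node40.c:
+
+	static inline unsigned nh40_nr_items(const node40_header *nh)
+	{
+		return le16_to_cpu(get_unaligned(&nh->nr_items));
+	}
+*/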
58091+
58092+/* item headers are not standard across all node layouts, pass
58093+ pos_in_node to functions instead */
58094+typedef struct item_header40 {
58095+ /* key of item */
58096+ /* 0 */ reiser4_key key;
58097+ /* offset from start of a node measured in 8-byte chunks */
58098+ /* 24 */ d16 offset;
58099+ /* 26 */ d16 flags;
58100+ /* 28 */ d16 plugin_id;
58101+} PACKED item_header40;
58102+
58103+size_t item_overhead_node40(const znode * node, flow_t * aflow);
58104+size_t free_space_node40(znode * node);
58105+node_search_result lookup_node40(znode * node, const reiser4_key * key,
58106+ lookup_bias bias, coord_t * coord);
58107+int num_of_items_node40(const znode * node);
58108+char *item_by_coord_node40(const coord_t * coord);
58109+int length_by_coord_node40(const coord_t * coord);
58110+item_plugin *plugin_by_coord_node40(const coord_t * coord);
58111+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
58112+size_t estimate_node40(znode * node);
58113+int check_node40(const znode * node, __u32 flags, const char **error);
58114+int parse_node40(znode * node);
58115+int init_node40(znode * node);
58116+#ifdef GUESS_EXISTS
58117+int guess_node40(const znode * node);
58118+#endif
58119+void change_item_size_node40(coord_t * coord, int by);
58120+int create_item_node40(coord_t * target, const reiser4_key * key,
58121+ reiser4_item_data * data, carry_plugin_info * info);
58122+void update_item_key_node40(coord_t * target, const reiser4_key * key,
58123+ carry_plugin_info * info);
58124+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
58125+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
58126+int shift_node40(coord_t * from, znode * to, shift_direction pend,
58127+ /* if @from->node becomes
58128+ empty - it will be deleted from
58129+ the tree if this is set to 1
58130+ */
58131+ int delete_child, int including_stop_coord,
58132+ carry_plugin_info * info);
58133+
58134+int fast_insert_node40(const coord_t * coord);
58135+int fast_paste_node40(const coord_t * coord);
58136+int fast_cut_node40(const coord_t * coord);
58137+int max_item_size_node40(void);
58138+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
58139+int set_item_plugin_node40(coord_t * coord, item_id id);
58140+int shrink_item_node40(coord_t * coord, int delta);
58141+
58142+#if REISER4_DEBUG
58143+void *shift_check_prepare(const znode *left, const znode *right);
58144+void shift_check(void *vp, const znode *left, const znode *right);
58145+#endif
58146+
58147+/* __REISER4_NODE40_H__ */
58148+#endif
58149+/*
58150+ Local variables:
58151+ c-indentation-style: "K&R"
58152+ mode-name: "LC"
58153+ c-basic-offset: 8
58154+ tab-width: 8
58155+ fill-column: 120
58156+ scroll-step: 1
58157+ End:
58158+*/
58159diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node.c linux-2.6.20/fs/reiser4/plugin/node/node.c
58160--- linux-2.6.20.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
58161+++ linux-2.6.20/fs/reiser4/plugin/node/node.c 2007-05-06 14:50:43.835018219 +0400
58162@@ -0,0 +1,131 @@
58163+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58164+
58165+/* Node plugin interface.
58166+
58167+ Description: The tree provides the abstraction of flows, which it
58168+ internally fragments into items which it stores in nodes.
58169+
58170+ A key_atom is a piece of data bound to a single key.
58171+
58172+ For reasonable space efficiency to be achieved it is often
58173+ necessary to store key_atoms in the nodes in the form of items, where
58174+ an item is a sequence of key_atoms of the same or similar type. It is
58175+ more space-efficient, because the item can implement (very)
58176+ efficient compression of key_atom's bodies using internal knowledge
58177+ about their semantics, and it can often avoid having a key for each
58178+ key_atom. Each type of item has specific operations implemented by its
58179+ item handler (see balance.c).
58180+
58181+ Rationale: the rest of the code (specifically balancing routines)
58182+ accesses leaf level nodes through this interface. This way we can
58183+ implement various block layouts and even combine various layouts
58184+ within the same tree. Balancing/allocating algorithms should not
58185+ care about peculiarities of splitting/merging specific item types,
58186+ but rather should leave that to the item's item handler.
58187+
58188+ Items, including those that provide the abstraction of flows, have
58189+ the property that if you move them in part or in whole to another
58190+ node, the balancing code invokes their is_left_mergeable()
58191+ item_operation to determine if they are mergeable with their new
58192+ neighbor in the node you have moved them to. For some items the
58193+ is_left_mergeable() function always returns zero.
58194+
58195+ When moving the bodies of items from one node to another:
58196+
58197+ if a partial item is shifted to another node the balancing code invokes
58198+ an item handler method to handle the item splitting.
58199+
58200+ if the balancing code needs to merge with an item in the node it
58201+ is shifting to, it will invoke an item handler method to handle
58202+ the item merging.
58203+
58204+ if it needs to move whole item bodies unchanged, the balancing code uses
58205+ xmemcpy(), adjusting the item headers after the move via the node handler.
58206+*/
58207+
58208+#include "../../forward.h"
58209+#include "../../debug.h"
58210+#include "../../key.h"
58211+#include "../../coord.h"
58212+#include "../plugin_header.h"
58213+#include "../item/item.h"
58214+#include "node.h"
58215+#include "../plugin.h"
58216+#include "../../znode.h"
58217+#include "../../tree.h"
58218+#include "../../super.h"
58219+#include "../../reiser4.h"
58220+
58221+/**
58222+ * leftmost_key_in_node - get the smallest key in node
58223+ * @node:
58224+ * @key: store result here
58225+ *
58226+ * Stores the leftmost key of @node in @key.
58227+ */
58228+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
58229+{
58230+ assert("nikita-1634", node != NULL);
58231+ assert("nikita-1635", key != NULL);
58232+
58233+ if (!node_is_empty(node)) {
58234+ coord_t first_item;
58235+
58236+ coord_init_first_unit(&first_item, (znode *) node);
58237+ item_key_by_coord(&first_item, key);
58238+ } else
58239+ *key = *reiser4_max_key();
58240+ return key;
58241+}
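+
+/* e.g. (hypothetical caller):
+
+	reiser4_key lk;
+	leftmost_key_in_node(node, &lk);
+
+   For an empty @node, @lk ends up equal to *reiser4_max_key(). */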
58242+
58243+node_plugin node_plugins[LAST_NODE_ID] = {
58244+ [NODE40_ID] = {
58245+ .h = {
58246+ .type_id = REISER4_NODE_PLUGIN_TYPE,
58247+ .id = NODE40_ID,
58248+ .pops = NULL,
58249+ .label = "unified",
58250+ .desc = "unified node layout",
58251+ .linkage = {NULL, NULL}
58252+ },
58253+ .item_overhead = item_overhead_node40,
58254+ .free_space = free_space_node40,
58255+ .lookup = lookup_node40,
58256+ .num_of_items = num_of_items_node40,
58257+ .item_by_coord = item_by_coord_node40,
58258+ .length_by_coord = length_by_coord_node40,
58259+ .plugin_by_coord = plugin_by_coord_node40,
58260+ .key_at = key_at_node40,
58261+ .estimate = estimate_node40,
58262+ .check = check_node40,
58263+ .parse = parse_node40,
58264+ .init = init_node40,
58265+#ifdef GUESS_EXISTS
58266+ .guess = guess_node40,
58267+#endif
58268+ .change_item_size = change_item_size_node40,
58269+ .create_item = create_item_node40,
58270+ .update_item_key = update_item_key_node40,
58271+ .cut_and_kill = kill_node40,
58272+ .cut = cut_node40,
58273+ .shift = shift_node40,
58274+ .shrink_item = shrink_item_node40,
58275+ .fast_insert = fast_insert_node40,
58276+ .fast_paste = fast_paste_node40,
58277+ .fast_cut = fast_cut_node40,
58278+ .max_item_size = max_item_size_node40,
58279+ .prepare_removal = prepare_removal_node40,
58280+ .set_item_plugin = set_item_plugin_node40
58281+ }
58282+};
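+
+/* A minimal dispatch sketch (hypothetical caller; the FIND_EXACT bias is
+   an assumption -- any lookup_bias value works):
+
+	node_plugin *nplug = &node_plugins[NODE40_ID];
+	coord_t coord;
+
+	if (nplug->lookup(node, key, FIND_EXACT, &coord) == NS_FOUND)
+		...the item possibly containing @key is at @coord...
+*/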
58283+
58284+/*
58285+ Local variables:
58286+ c-indentation-style: "K&R"
58287+ mode-name: "LC"
58288+ c-basic-offset: 8
58289+ tab-width: 8
58290+ fill-column: 120
58291+ scroll-step: 1
58292+ End:
58293+*/
58294diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node.h linux-2.6.20/fs/reiser4/plugin/node/node.h
58295--- linux-2.6.20.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
58296+++ linux-2.6.20/fs/reiser4/plugin/node/node.h 2007-05-06 14:50:43.835018219 +0400
58297@@ -0,0 +1,272 @@
58298+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58299+
58300+/* We need a definition of the default node layout here. */
58301+
58302+/* Generally speaking, it is best to have free space in the middle of the
58303+ node so that two sets of things can grow towards it, and to have the
58304+ item bodies on the left so that the last one of them grows into free
58305+ space. We optimize for the case where we append new items to the end
58306+ of the node, or grow the last item, because it hurts nothing to so
58307+ optimize and it is a common special case to do massive insertions in
58308+ increasing key order (and one of the cases where a real user is more
58309+ likely to notice the delay).
58310+
58311+ formatted leaf default layout: (leaf1)
58312+
58313+ |node header:item bodies:free space:key + pluginid + item offset|
58314+
58315+ We grow towards the middle, optimizing layout for the case where we
58316+ append new items to the end of the node. The node header is fixed
58317+ length. Keys, and item offsets plus pluginids for the items
58318+ corresponding to them are in increasing key order, and are fixed
58319+ length. Item offsets are relative to start of node (16 bits creating
58320+ a node size limit of 64k, 12 bits might be a better choice....). Item
58321+ bodies are in decreasing key order. Item bodies have a variable size.
58322+ There is a one to one to one mapping of keys to item offsets to item
58323+ bodies. Item offsets consist of pointers to the zeroth byte of the
58324+ item body. Item length equals the start of the next item minus the
58325+ start of this item, except the zeroth item whose length equals the end
58326+ of the node minus the start of that item (plus a byte). In other
58327+ words, the item length is not recorded anywhere, and it does not need
58328+ to be since it is computable.
58329+
58330+ Leaf variable length items and keys layout : (lvar)
58331+
58332+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
58333+
58334+ We grow towards the middle, optimizing layout for the case where we
58335+ append new items to the end of the node. The node header is fixed
58336+ length. Keys and item offsets for the items corresponding to them are
58337+ in increasing key order, and keys are variable length. Item offsets
58338+ are relative to start of node (16 bits). Item bodies are in
58339+ decreasing key order. Item bodies have a variable size. There is a
58340+ one to one to one mapping of keys to item offsets to item bodies.
58341+ Item offsets consist of pointers to the zeroth byte of the item body.
58342+ Item length equals the start of the next item's key minus the start of
58343+ this item, except the zeroth item whose length equals the end of the
58344+ node minus the start of that item (plus a byte).
58345+
58346+ leaf compressed keys layout: (lcomp)
58347+
58348+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
58349+
58350+ We grow towards the middle, optimizing layout for the case where we
58351+ append new items to the end of the node. The node header is fixed
58352+ length. Keys and item offsets for the items corresponding to them are
58353+ in increasing key order, and keys are variable length. The "key
58354+ inherit" field indicates how much of the key prefix is identical to
58355+ the previous key (stem compression as described in "Managing
58356+ Gigabytes" is used). key_inherit is a one byte integer. The
58357+ intra-node searches performed through this layout are linear searches,
58358+ and this is theorized to not hurt performance much due to the high
58359+ cost of processor stalls on modern CPUs, and the small number of keys
58360+ in a single node. Item offsets are relative to start of node (16
58361+ bits). Item bodies are in decreasing key order. Item bodies have a
58362+ variable size. There is a one to one to one mapping of keys to item
58363+ offsets to item bodies. Item offsets consist of pointers to the
58364+ zeroth byte of the item body. Item length equals the start of the
58365+ next item minus the start of this item, except the zeroth item whose
58366+ length equals the end of the node minus the start of that item (plus a
58367+ byte). In other words, item length and key length are not recorded
58368+ anywhere, and they do not need to be since they are computable.
58369+
58370+ internal node default layout: (idef1)
58371+
58372+ just like leaf1 except that item bodies are either blocknrs of
58373+ children or extents, and moving them may require updating parent
58374+ pointers in the nodes that they point to.
58375+*/
58376+
58377+/* There is an inherent 3-way tradeoff between optimization, the ability
58378+ to exchange disks between different architectures, and code
58379+ complexity. This layout is optimal and simple, but not exchangeable.
58380+ Someone else can do the code for exchanging disks and make it
58381+ complex. It would not be that hard. Using a node size other than
58382+ PAGE_SIZE might be suboptimal.
58383+*/
58384+
58385+#if !defined( __REISER4_NODE_H__ )
58386+#define __REISER4_NODE_H__
58387+
58388+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
58389+
58390+#include "../../dformat.h"
58391+#include "../plugin_header.h"
58392+
58393+#include <linux/types.h>
58394+
58395+typedef enum {
58396+ NS_FOUND = 0,
58397+ NS_NOT_FOUND = -ENOENT
58398+} node_search_result;
58399+
58400+/* Maximal possible space overhead for creation of new item in a node */
58401+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
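+/* with the 24-byte reiser4_key implied by the offset comments in
+   item_header40 (node40.h), this is 24 + 32 = 56 bytes */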
58402+
58403+typedef enum {
58404+ REISER4_NODE_DKEYS = (1 << 0),
58405+ REISER4_NODE_TREE_STABLE = (1 << 1)
58406+} reiser4_node_check_flag;
58407+
58408+/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
58409+struct cut_list {
58410+ coord_t *from;
58411+ coord_t *to;
58412+ const reiser4_key *from_key;
58413+ const reiser4_key *to_key;
58414+ reiser4_key *smallest_removed;
58415+ carry_plugin_info *info;
58416+ __u32 flags;
58417+	struct inode *inode;	/* this is to pass the list of eflushed jnodes down to extent_kill_hook */
58418+ lock_handle *left;
58419+ lock_handle *right;
58420+};
58421+
58422+struct carry_cut_data;
58423+struct carry_kill_data;
58424+
58425+/* The responsibility of the node plugin is to store and give access
58426+ to the sequence of items within the node. */
58427+typedef struct node_plugin {
58428+ /* generic plugin fields */
58429+ plugin_header h;
58430+
58431+ /* calculates the amount of space that will be required to store an
58432+ item which is in addition to the space consumed by the item body.
58433+ (the space consumed by the item body can be gotten by calling
58434+ item->estimate) */
58435+ size_t(*item_overhead) (const znode * node, flow_t * f);
58436+
58437+ /* returns free space by looking into node (i.e., without using
58438+ znode->free_space). */
58439+ size_t(*free_space) (znode * node);
58440+ /* search within the node for the one item which might
58441+ contain the key, invoking item->search_within to search within
58442+ that item to see if it is in there */
58443+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
58444+ lookup_bias bias, coord_t * coord);
58445+ /* number of items in node */
58446+ int (*num_of_items) (const znode * node);
58447+
58448+ /* store information about item in @coord in @data */
58449+ /* break into several node ops, don't add any more uses of this before doing so */
58450+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
58451+ char *(*item_by_coord) (const coord_t * coord);
58452+ int (*length_by_coord) (const coord_t * coord);
58453+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
58454+
58455+ /* store item key in @key */
58456+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
58457+	/* conservatively estimate the maximal size of a unit that can fit
58458+	   into the node. This estimation should be performed without
58459+	   actually looking into the node's content (free space is saved in
58460+	   the znode). */
58461+ size_t(*estimate) (znode * node);
58462+
58463+ /* performs every consistency check the node plugin author could
58464+ imagine. Optional. */
58465+ int (*check) (const znode * node, __u32 flags, const char **error);
58466+
58467+ /* Called when node is read into memory and node plugin is
58468+ already detected. This should read some data into znode (like free
58469+ space counter) and, optionally, check data consistency.
58470+ */
58471+ int (*parse) (znode * node);
58472+ /* This method is called on a new node to initialise plugin specific
58473+ data (header, etc.) */
58474+ int (*init) (znode * node);
58475+ /* Check whether @node content conforms to this plugin format.
58476+ Probably only useful after support for old V3.x formats is added.
58477+ Uncomment after 4.0 only.
58478+ */
58479+ /* int ( *guess )( const znode *node ); */
58480+#if REISER4_DEBUG
58481+ void (*print) (const char *prefix, const znode * node, __u32 flags);
58482+#endif
58483+	/* change the size of @item by @by bytes. @item->node has enough free
58484+	   space. When @by > 0, free space is appended to the end of the item.
58485+	   When @by < 0, the item is truncated; it is assumed that the last @by
58486+	   bytes of the item are already freed */
58487+ void (*change_item_size) (coord_t * item, int by);
58488+
58489+ /* create new item @length bytes long in coord @target */
58490+ int (*create_item) (coord_t * target, const reiser4_key * key,
58491+ reiser4_item_data * data, carry_plugin_info * info);
58492+
58493+ /* update key of item. */
58494+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
58495+ carry_plugin_info * info);
58496+
58497+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
58498+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
58499+
58500+ /*
58501+ * shrink item pointed to by @coord by @delta bytes.
58502+ */
58503+ int (*shrink_item) (coord_t * coord, int delta);
58504+
58505+ /* copy as much as possible but not more than up to @stop from
58506+ @stop->node to @target. If (pend == append) then data from beginning of
58507+ @stop->node are copied to the end of @target. If (pend == prepend) then
58508+ data from the end of @stop->node are copied to the beginning of
58509+ @target. Copied data are removed from @stop->node. Information
58510+ about what to do on upper level is stored in @todo */
58511+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
58512+ int delete_node, int including_insert_coord,
58513+ carry_plugin_info * info);
58514+	/* return true if this node allows skipping carry() in some situations
58515+	   (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
58516+	   emulation doesn't.
58517+
58518+	   This speeds up insertions that don't require updates to the
58519+	   parent, by bypassing initialisation of carry() structures. It's
58520+	   believed that the majority of insertions will fall into this case.
58521+
58522+ */
58523+ int (*fast_insert) (const coord_t * coord);
58524+ int (*fast_paste) (const coord_t * coord);
58525+ int (*fast_cut) (const coord_t * coord);
58526+	/* this limits the max size of an item which can be inserted into a node
58527+	   and the number of bytes an item in a node may be appended with */
58528+ int (*max_item_size) (void);
58529+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
58530+	/* change the plugin id of items which are already in a node. Currently it is used in tail conversion for regular
58531+	 * files */
58532+ int (*set_item_plugin) (coord_t * coord, item_id);
58533+} node_plugin;
58534+
58535+typedef enum {
58536+ /* standard unified node layout used for both leaf and internal
58537+ nodes */
58538+ NODE40_ID,
58539+ LAST_NODE_ID
58540+} reiser4_node_id;
58541+
58542+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
58543+#if REISER4_DEBUG
58544+extern void print_node_content(const char *prefix, const znode * node,
58545+ __u32 flags);
58546+#endif
58547+
58548+extern void indent_znode(const znode * node);
58549+
58550+typedef struct common_node_header {
58551+ /*
58552+ * identifier of node plugin. Must be located at the very beginning of
58553+ * a node.
58554+ */
58555+ __le16 plugin_id;
58556+} common_node_header;
58557+
58558+/* __REISER4_NODE_H__ */
58559+#endif
58560+/*
58561+ * Local variables:
58562+ * c-indentation-style: "K&R"
58563+ * mode-name: "LC"
58564+ * c-basic-offset: 8
58565+ * tab-width: 8
58566+ * fill-column: 79
58567+ * scroll-step: 1
58568+ * End:
58569+ */
58570diff -urN linux-2.6.20.orig/fs/reiser4/plugin/object.c linux-2.6.20/fs/reiser4/plugin/object.c
58571--- linux-2.6.20.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
58572+++ linux-2.6.20/fs/reiser4/plugin/object.c 2007-05-06 14:50:43.835018219 +0400
58573@@ -0,0 +1,516 @@
58574+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58575+ * reiser4/README */
58576+
58577+/*
58578+ * Examples of object plugins: file, directory, symlink, special file.
58579+ *
58580+ * Plugins associated with inode:
58581+ *
58582+ * Plugin of inode is plugin referenced by plugin-id field of on-disk
58583+ * stat-data. How we store this plugin in in-core inode is not
58584+ * important. Currently pointers are used, another variant is to store offsets
58585+ * and do array lookup on each access.
58586+ *
58587+ * Now, each inode has one selected plugin: object plugin that
58588+ * determines what type of file this object is: directory, regular etc.
58589+ *
58590+ * This main plugin can use other plugins that are thus subordinated to
58591+ * it. Directory instance of object plugin uses hash; regular file
58592+ * instance uses tail policy plugin.
58593+ *
58594+ * Object plugin is either taken from id in stat-data or guessed from
58595+ * i_mode bits. Once it is established we ask it to install its
58596+ * subordinate plugins, by looking again in stat-data or inheriting them
58597+ * from parent.
58598+ *
58599+ * How new inode is initialized during ->read_inode():
58600+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
58601+ * i_generation, capabilities etc.
58602+ * 2 read plugin id from stat data or try to guess plugin id
58603+ * from inode->i_mode bits if plugin id is missing.
58604+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58605+ *
58606+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
58607+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
58608+ *
58609+ * 4 Call ->activate() method of object's plugin. Plugin is either read
58610+ * from stat-data or guessed from mode bits.
58611+ * 5 Call ->inherit() method of object plugin to inherit as-yet-uninitialized
58612+ * plugins from the parent.
58613+ *
58614+ * A simple induction shows that after the last step all plugins of the
58615+ * inode are initialized.
58616+ *
58617+ * When creating new object:
58618+ * 1 obtain object plugin id (see next period)
58619+ * NIKITA-FIXME-HANS: period?
58620+ * 2 ->install() this plugin
58621+ * 3 ->inherit() the rest from the parent
58622+ *
58623+ * We need some examples of creating an object with default and non-default
58624+ * plugin ids. Nikita, please create them.
58625+ */
58626+
58627+#include "../inode.h"
58628+
58629+static int _bugop(void)
58630+{
58631+ BUG_ON(1);
58632+ return 0;
58633+}
58634+
58635+#define bugop ((void *)_bugop)
58636+
58637+static int _dummyop(void)
58638+{
58639+ return 0;
58640+}
58641+
58642+#define dummyop ((void *)_dummyop)
58643+
58644+static int change_file(struct inode *inode,
58645+ reiser4_plugin * plugin,
58646+ pset_member memb)
58647+{
58648+ /* cannot change object plugin of already existing object */
58649+ if (memb == PSET_FILE)
58650+ return RETERR(-EINVAL);
58651+
58652+	/* change any other plugin set member (e.g. PSET_CREATE) */
58653+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
58654+}
58655+
58656+static reiser4_plugin_ops file_plugin_ops = {
58657+ .change = change_file
58658+};
58659+
58660+/*
58661+ * Definitions of object plugins.
58662+ */
58663+
58664+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
58665+ [UNIX_FILE_PLUGIN_ID] = {
58666+ .h = {
58667+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58668+ .id = UNIX_FILE_PLUGIN_ID,
58669+ .groups = (1 << REISER4_REGULAR_FILE),
58670+ .pops = &file_plugin_ops,
58671+ .label = "reg",
58672+ .desc = "regular file",
58673+ .linkage = {NULL, NULL},
58674+ },
58675+ .inode_ops = {
58676+ .permission = reiser4_permission_common,
58677+ .setattr = setattr_unix_file,
58678+ .getattr = reiser4_getattr_common
58679+ },
58680+ .file_ops = {
58681+ .llseek = generic_file_llseek,
58682+ .read = read_unix_file,
58683+ .write = write_unix_file,
58684+ .aio_read = generic_file_aio_read,
58685+ .ioctl = ioctl_unix_file,
58686+ .mmap = mmap_unix_file,
58687+ .open = open_unix_file,
58688+ .release = release_unix_file,
58689+ .fsync = sync_unix_file,
58690+ .sendfile = sendfile_unix_file
58691+ },
58692+ .as_ops = {
58693+ .writepage = reiser4_writepage,
58694+ .readpage = readpage_unix_file,
58695+ .sync_page = block_sync_page,
58696+ .writepages = writepages_unix_file,
58697+ .set_page_dirty = reiser4_set_page_dirty,
58698+ .readpages = readpages_unix_file,
58699+ .prepare_write = prepare_write_unix_file,
58700+ .commit_write = commit_write_unix_file,
58701+ .bmap = bmap_unix_file,
58702+ .invalidatepage = reiser4_invalidatepage,
58703+ .releasepage = reiser4_releasepage
58704+ },
58705+ .write_sd_by_inode = write_sd_by_inode_common,
58706+ .flow_by_inode = flow_by_inode_unix_file,
58707+ .key_by_inode = key_by_inode_and_offset_common,
58708+ .set_plug_in_inode = set_plug_in_inode_common,
58709+ .adjust_to_parent = adjust_to_parent_common,
58710+ .create_object = reiser4_create_object_common,
58711+ .delete_object = delete_object_unix_file,
58712+ .add_link = reiser4_add_link_common,
58713+ .rem_link = reiser4_rem_link_common,
58714+ .owns_item = owns_item_unix_file,
58715+ .can_add_link = can_add_link_common,
58716+ .detach = dummyop,
58717+ .bind = dummyop,
58718+ .safelink = safelink_common,
58719+ .estimate = {
58720+ .create = estimate_create_common,
58721+ .update = estimate_update_common,
58722+ .unlink = estimate_unlink_common
58723+ },
58724+ .init_inode_data = init_inode_data_unix_file,
58725+ .cut_tree_worker = cut_tree_worker_common,
58726+ .wire = {
58727+ .write = wire_write_common,
58728+ .read = wire_read_common,
58729+ .get = wire_get_common,
58730+ .size = wire_size_common,
58731+ .done = wire_done_common
58732+ }
58733+ },
58734+ [DIRECTORY_FILE_PLUGIN_ID] = {
58735+ .h = {
58736+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58737+ .id = DIRECTORY_FILE_PLUGIN_ID,
58738+ .groups = (1 << REISER4_DIRECTORY_FILE),
58739+ .pops = &file_plugin_ops,
58740+ .label = "dir",
58741+ .desc = "directory",
58742+ .linkage = {NULL, NULL}
58743+ },
58744+ .inode_ops = {.create = NULL},
58745+ .file_ops = {.owner = NULL},
58746+ .as_ops = {.writepage = NULL},
58747+
58748+ .write_sd_by_inode = write_sd_by_inode_common,
58749+ .flow_by_inode = bugop,
58750+ .key_by_inode = bugop,
58751+ .set_plug_in_inode = set_plug_in_inode_common,
58752+ .adjust_to_parent = adjust_to_parent_common_dir,
58753+ .create_object = reiser4_create_object_common,
58754+ .delete_object = reiser4_delete_dir_common,
58755+ .add_link = reiser4_add_link_common,
58756+ .rem_link = rem_link_common_dir,
58757+ .owns_item = owns_item_common_dir,
58758+ .can_add_link = can_add_link_common,
58759+ .can_rem_link = can_rem_link_common_dir,
58760+ .detach = reiser4_detach_common_dir,
58761+ .bind = reiser4_bind_common_dir,
58762+ .safelink = safelink_common,
58763+ .estimate = {
58764+ .create = estimate_create_common_dir,
58765+ .update = estimate_update_common,
58766+ .unlink = estimate_unlink_common_dir
58767+ },
58768+ .wire = {
58769+ .write = wire_write_common,
58770+ .read = wire_read_common,
58771+ .get = wire_get_common,
58772+ .size = wire_size_common,
58773+ .done = wire_done_common
58774+ },
58775+ .init_inode_data = init_inode_ordering,
58776+ .cut_tree_worker = cut_tree_worker_common,
58777+ },
58778+ [SYMLINK_FILE_PLUGIN_ID] = {
58779+ .h = {
58780+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58781+ .id = SYMLINK_FILE_PLUGIN_ID,
58782+ .groups = (1 << REISER4_SYMLINK_FILE),
58783+ .pops = &file_plugin_ops,
58784+ .label = "symlink",
58785+ .desc = "symbolic link",
58786+ .linkage = {NULL,NULL}
58787+ },
58788+ .inode_ops = {
58789+ .readlink = generic_readlink,
58790+ .follow_link = reiser4_follow_link_common,
58791+ .permission = reiser4_permission_common,
58792+ .setattr = reiser4_setattr_common,
58793+ .getattr = reiser4_getattr_common
58794+ },
58795+ /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */
58796+ .file_ops = {.owner = NULL},
58797+ .as_ops = {.writepage = NULL},
58798+
58799+ .write_sd_by_inode = write_sd_by_inode_common,
58800+ .set_plug_in_inode = set_plug_in_inode_common,
58801+ .adjust_to_parent = adjust_to_parent_common,
58802+ .create_object = reiser4_create_symlink,
58803+ .delete_object = reiser4_delete_object_common,
58804+ .add_link = reiser4_add_link_common,
58805+ .rem_link = reiser4_rem_link_common,
58806+ .can_add_link = can_add_link_common,
58807+ .detach = dummyop,
58808+ .bind = dummyop,
58809+ .safelink = safelink_common,
58810+ .estimate = {
58811+ .create = estimate_create_common,
58812+ .update = estimate_update_common,
58813+ .unlink = estimate_unlink_common
58814+ },
58815+ .init_inode_data = init_inode_ordering,
58816+ .cut_tree_worker = cut_tree_worker_common,
58817+ .destroy_inode = destroy_inode_symlink,
58818+ .wire = {
58819+ .write = wire_write_common,
58820+ .read = wire_read_common,
58821+ .get = wire_get_common,
58822+ .size = wire_size_common,
58823+ .done = wire_done_common
58824+ }
58825+ },
58826+ [SPECIAL_FILE_PLUGIN_ID] = {
58827+ .h = {
58828+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58829+ .id = SPECIAL_FILE_PLUGIN_ID,
58830+ .groups = (1 << REISER4_SPECIAL_FILE),
58831+ .pops = &file_plugin_ops,
58832+ .label = "special",
58833+ .desc =
58834+ "special: fifo, device or socket",
58835+ .linkage = {NULL, NULL}
58836+ },
58837+ .inode_ops = {
58838+ .permission = reiser4_permission_common,
58839+ .setattr = reiser4_setattr_common,
58840+ .getattr = reiser4_getattr_common
58841+ },
58842+ /* file_ops of special files (sockets, block, char, fifo) are
58843+ initialized by init_special_inode. */
58844+ .file_ops = {.owner = NULL},
58845+ .as_ops = {.writepage = NULL},
58846+
58847+ .write_sd_by_inode = write_sd_by_inode_common,
58848+ .set_plug_in_inode = set_plug_in_inode_common,
58849+ .adjust_to_parent = adjust_to_parent_common,
58850+ .create_object = reiser4_create_object_common,
58851+ .delete_object = reiser4_delete_object_common,
58852+ .add_link = reiser4_add_link_common,
58853+ .rem_link = reiser4_rem_link_common,
58854+ .owns_item = owns_item_common,
58855+ .can_add_link = can_add_link_common,
58856+ .detach = dummyop,
58857+ .bind = dummyop,
58858+ .safelink = safelink_common,
58859+ .estimate = {
58860+ .create = estimate_create_common,
58861+ .update = estimate_update_common,
58862+ .unlink = estimate_unlink_common
58863+ },
58864+ .init_inode_data = init_inode_ordering,
58865+ .cut_tree_worker = cut_tree_worker_common,
58866+ .wire = {
58867+ .write = wire_write_common,
58868+ .read = wire_read_common,
58869+ .get = wire_get_common,
58870+ .size = wire_size_common,
58871+ .done = wire_done_common
58872+ }
58873+ },
58874+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
58875+ .h = {
58876+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58877+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
58878+ .groups = (1 << REISER4_REGULAR_FILE),
58879+ .pops = &file_plugin_ops,
58880+ .label = "cryptcompress",
58881+ .desc = "cryptcompress file",
58882+ .linkage = {NULL, NULL}
58883+ },
58884+ .inode_ops = {
58885+ .permission = reiser4_permission_common,
58886+ .setattr = prot_setattr_cryptcompress,
58887+ .getattr = reiser4_getattr_common
58888+ },
58889+ .file_ops = {
58890+ .llseek = generic_file_llseek,
58891+ .read = prot_read_cryptcompress,
58892+ .write = prot_write_cryptcompress,
58893+ .aio_read = generic_file_aio_read,
58894+ .mmap = prot_mmap_cryptcompress,
58895+ .release = prot_release_cryptcompress,
58896+ .fsync = reiser4_sync_common,
58897+ .sendfile = prot_sendfile_cryptcompress
58898+ },
58899+ .as_ops = {
58900+ .writepage = reiser4_writepage,
58901+ .readpage = readpage_cryptcompress,
58902+ .sync_page = block_sync_page,
58903+ .writepages = writepages_cryptcompress,
58904+ .set_page_dirty = reiser4_set_page_dirty,
58905+ .readpages = readpages_cryptcompress,
58906+ .prepare_write = prepare_write_common,
58907+ .invalidatepage = reiser4_invalidatepage,
58908+ .releasepage = reiser4_releasepage
58909+ },
58910+ .write_sd_by_inode = write_sd_by_inode_common,
58911+ .flow_by_inode = flow_by_inode_cryptcompress,
58912+ .key_by_inode = key_by_inode_cryptcompress,
58913+ .set_plug_in_inode = set_plug_in_inode_common,
58914+ .adjust_to_parent = adjust_to_parent_cryptcompress,
58915+ .create_object = create_cryptcompress,
58916+ .open_object = open_object_cryptcompress,
58917+ .delete_object = delete_object_cryptcompress,
58918+ .add_link = reiser4_add_link_common,
58919+ .rem_link = reiser4_rem_link_common,
58920+ .owns_item = owns_item_common,
58921+ .can_add_link = can_add_link_common,
58922+ .detach = dummyop,
58923+ .bind = dummyop,
58924+ .safelink = safelink_common,
58925+ .estimate = {
58926+ .create = estimate_create_common,
58927+ .update = estimate_update_common,
58928+ .unlink = estimate_unlink_common
58929+ },
58930+ .init_inode_data = init_inode_data_cryptcompress,
58931+ .cut_tree_worker = cut_tree_worker_cryptcompress,
58932+ .destroy_inode = destroy_inode_cryptcompress,
58933+ .wire = {
58934+ .write = wire_write_common,
58935+ .read = wire_read_common,
58936+ .get = wire_get_common,
58937+ .size = wire_size_common,
58938+ .done = wire_done_common
58939+ }
58940+ }
58941+};
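+
+/*
+ * A minimal sketch (illustrative only) of how the table above is meant
+ * to be consumed: a file plugin is selected by indexing file_plugins[]
+ * with a reiser4_file_id, and the embedded inode_ops/file_ops/as_ops of
+ * the chosen entry are what the VFS ultimately dispatches through. The
+ * actual wiring is done by reiser4's inode setup code (cf. the
+ * setup_inode_ops reference in the symlink entry above):
+ *
+ *	file_plugin *fplug = &file_plugins[SYMLINK_FILE_PLUGIN_ID];
+ *	inode->i_op = &fplug->inode_ops;
+ */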
58942+
58943+static int change_dir(struct inode *inode,
58944+ reiser4_plugin * plugin,
58945+ pset_member memb)
58946+{
58947+ /* cannot change dir plugin of already existing object */
58948+ return RETERR(-EINVAL);
58949+}
58950+
58951+static reiser4_plugin_ops dir_plugin_ops = {
58952+ .change = change_dir
58953+};
58954+
58955+/*
58956+ * definition of directory plugins
58957+ */
58958+
58959+dir_plugin dir_plugins[LAST_DIR_ID] = {
58960+ /* standard hashed directory plugin */
58961+ [HASHED_DIR_PLUGIN_ID] = {
58962+ .h = {
58963+ .type_id = REISER4_DIR_PLUGIN_TYPE,
58964+ .id = HASHED_DIR_PLUGIN_ID,
58965+ .pops = &dir_plugin_ops,
58966+ .label = "dir",
58967+ .desc = "hashed directory",
58968+ .linkage = {NULL, NULL}
58969+ },
58970+ .inode_ops = {
58971+ .create = reiser4_create_common,
58972+ .lookup = reiser4_lookup_common,
58973+ .link = reiser4_link_common,
58974+ .unlink = reiser4_unlink_common,
58975+ .symlink = reiser4_symlink_common,
58976+ .mkdir = reiser4_mkdir_common,
58977+ .rmdir = reiser4_unlink_common,
58978+ .mknod = reiser4_mknod_common,
58979+ .rename = reiser4_rename_common,
58980+ .permission = reiser4_permission_common,
58981+ .setattr = reiser4_setattr_common,
58982+ .getattr = reiser4_getattr_common
58983+ },
58984+ .file_ops = {
58985+ .llseek = reiser4_llseek_dir_common,
58986+ .read = generic_read_dir,
58987+ .readdir = reiser4_readdir_common,
58988+ .release = reiser4_release_dir_common,
58989+ .fsync = reiser4_sync_common
58990+ },
58991+ .as_ops = {
58992+ .writepage = bugop,
58993+ .sync_page = bugop,
58994+ .writepages = dummyop,
58995+ .set_page_dirty = bugop,
58996+ .readpages = bugop,
58997+ .prepare_write = bugop,
58998+ .commit_write = bugop,
58999+ .bmap = bugop,
59000+ .invalidatepage = bugop,
59001+ .releasepage = bugop
59002+ },
59003+ .get_parent = get_parent_common,
59004+ .is_name_acceptable = is_name_acceptable_common,
59005+ .build_entry_key = build_entry_key_hashed,
59006+ .build_readdir_key = build_readdir_key_common,
59007+ .add_entry = reiser4_add_entry_common,
59008+ .rem_entry = reiser4_rem_entry_common,
59009+ .init = reiser4_dir_init_common,
59010+ .done = reiser4_dir_done_common,
59011+ .attach = reiser4_attach_common,
59012+ .detach = reiser4_detach_common,
59013+ .estimate = {
59014+ .add_entry = estimate_add_entry_common,
59015+ .rem_entry = estimate_rem_entry_common,
59016+ .unlink = dir_estimate_unlink_common
59017+ }
59018+ },
59019+ /* hashed directory for which seekdir/telldir are guaranteed to
59020+ * work. Brain-damage. */
59021+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59022+ .h = {
59023+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59024+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59025+ .pops = &dir_plugin_ops,
59026+ .label = "dir32",
59027+ .desc = "directory hashed with 31 bit hash",
59028+ .linkage = {NULL, NULL}
59029+ },
59030+ .inode_ops = {
59031+ .create = reiser4_create_common,
59032+ .lookup = reiser4_lookup_common,
59033+ .link = reiser4_link_common,
59034+ .unlink = reiser4_unlink_common,
59035+ .symlink = reiser4_symlink_common,
59036+ .mkdir = reiser4_mkdir_common,
59037+ .rmdir = reiser4_unlink_common,
59038+ .mknod = reiser4_mknod_common,
59039+ .rename = reiser4_rename_common,
59040+ .permission = reiser4_permission_common,
59041+ .setattr = reiser4_setattr_common,
59042+ .getattr = reiser4_getattr_common
59043+ },
59044+ .file_ops = {
59045+ .llseek = reiser4_llseek_dir_common,
59046+ .read = generic_read_dir,
59047+ .readdir = reiser4_readdir_common,
59048+ .release = reiser4_release_dir_common,
59049+ .fsync = reiser4_sync_common
59050+ },
59051+ .as_ops = {
59052+ .writepage = bugop,
59053+ .sync_page = bugop,
59054+ .writepages = dummyop,
59055+ .set_page_dirty = bugop,
59056+ .readpages = bugop,
59057+ .prepare_write = bugop,
59058+ .commit_write = bugop,
59059+ .bmap = bugop,
59060+ .invalidatepage = bugop,
59061+ .releasepage = bugop
59062+ },
59063+ .get_parent = get_parent_common,
59064+ .is_name_acceptable = is_name_acceptable_common,
59065+ .build_entry_key = build_entry_key_seekable,
59066+ .build_readdir_key = build_readdir_key_common,
59067+ .add_entry = reiser4_add_entry_common,
59068+ .rem_entry = reiser4_rem_entry_common,
59069+ .init = reiser4_dir_init_common,
59070+ .done = reiser4_dir_done_common,
59071+ .attach = reiser4_attach_common,
59072+ .detach = reiser4_detach_common,
59073+ .estimate = {
59074+ .add_entry = estimate_add_entry_common,
59075+ .rem_entry = estimate_rem_entry_common,
59076+ .unlink = dir_estimate_unlink_common
59077+ }
59078+ }
59079+};
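+
+/*
+ * A hedged usage sketch for the table above: a directory plugin is
+ * selected by indexing dir_plugins[] with a reiser4_dir_id, and its
+ * methods are then invoked through the function pointers set here
+ * (dir, name, dentry, data and entry are illustrative locals):
+ *
+ *	dir_plugin *dplug = &dir_plugins[HASHED_DIR_PLUGIN_ID];
+ *
+ *	if (dplug->is_name_acceptable(dir, name, strlen(name)))
+ *		result = dplug->add_entry(dir, dentry, data, &entry);
+ */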
59080+
59081+/* Make Linus happy.
59082+ Local variables:
59083+ c-indentation-style: "K&R"
59084+ mode-name: "LC"
59085+ c-basic-offset: 8
59086+ tab-width: 8
59087+ fill-column: 120
59088+ End:
59089+*/
59090diff -urN linux-2.6.20.orig/fs/reiser4/plugin/object.h linux-2.6.20/fs/reiser4/plugin/object.h
59091--- linux-2.6.20.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
59092+++ linux-2.6.20/fs/reiser4/plugin/object.h 2007-05-06 14:50:43.839019469 +0400
59093@@ -0,0 +1,121 @@
59094+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59095+ * reiser4/README */
59096+
59097+/* Declaration of object plugin functions. */
59098+
59099+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59100+#define __FS_REISER4_PLUGIN_OBJECT_H__
59101+
59102+#include "../type_safe_hash.h"
59103+
59104+/* common implementations of inode operations */
59105+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
59106+ int mode, struct nameidata *);
59107+struct dentry * reiser4_lookup_common(struct inode *parent,
59108+ struct dentry *dentry,
59109+ struct nameidata *nameidata);
59110+int reiser4_link_common(struct dentry *existing, struct inode *parent,
59111+ struct dentry *newname);
59112+int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
59113+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59114+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
59115+ const char *linkname);
59116+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
59117+ int mode, dev_t rdev);
59118+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
59119+ struct inode *new_dir, struct dentry *new_name);
59120+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
59121+int reiser4_permission_common(struct inode *, int mask,
59122+ struct nameidata *nameidata);
59123+int reiser4_setattr_common(struct dentry *, struct iattr *);
59124+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
59125+ struct kstat *);
59126+
59127+/* common implementations of file operations */
59128+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
59129+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
59130+int reiser4_release_dir_common(struct inode *, struct file *);
59131+int reiser4_sync_common(struct file *, struct dentry *, int datasync);
59132+
59133+/* common implementations of address space operations */
59134+int prepare_write_common(struct file *, struct page *, unsigned from,
59135+ unsigned to);
59136+
59137+/* file plugin operations: common implementations */
59138+int write_sd_by_inode_common(struct inode *);
59139+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59140+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59141+ reiser4_object_create_data *);
59142+int adjust_to_parent_common(struct inode *object, struct inode *parent,
59143+ struct inode *root);
59144+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59145+ struct inode *root);
59146+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59147+ struct inode *root);
59148+int reiser4_create_object_common(struct inode *object, struct inode *parent,
59149+ reiser4_object_create_data *);
59150+int reiser4_delete_object_common(struct inode *);
59151+int reiser4_delete_dir_common(struct inode *);
59152+int reiser4_add_link_common(struct inode *object, struct inode *parent);
59153+int reiser4_rem_link_common(struct inode *object, struct inode *parent);
59154+int rem_link_common_dir(struct inode *object, struct inode *parent);
59155+int owns_item_common(const struct inode *, const coord_t *);
59156+int owns_item_common_dir(const struct inode *, const coord_t *);
59157+int can_add_link_common(const struct inode *);
59158+int can_rem_link_common_dir(const struct inode *);
59159+int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
59160+int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
59161+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59162+reiser4_block_nr estimate_create_common(const struct inode *);
59163+reiser4_block_nr estimate_create_common_dir(const struct inode *);
59164+reiser4_block_nr estimate_update_common(const struct inode *);
59165+reiser4_block_nr estimate_unlink_common(const struct inode *,
59166+ const struct inode *);
59167+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59168+ const struct inode *);
59169+char *wire_write_common(struct inode *, char *start);
59170+char *wire_read_common(char *addr, reiser4_object_on_wire *);
59171+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59172+int wire_size_common(struct inode *);
59173+void wire_done_common(reiser4_object_on_wire *);
59174+
59175+/* dir plugin operations: common implementations */
59176+struct dentry *get_parent_common(struct inode *child);
59177+int is_name_acceptable_common(const struct inode *, const char *name, int len);
59178+void build_entry_key_common(const struct inode *,
59179+ const struct qstr *qname, reiser4_key *);
59180+int build_readdir_key_common(struct file *dir, reiser4_key *);
59181+int reiser4_add_entry_common(struct inode *object, struct dentry *where,
59182+ reiser4_object_create_data *, reiser4_dir_entry_desc *);
59183+int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
59184+ reiser4_dir_entry_desc *);
59185+int reiser4_dir_init_common(struct inode *object, struct inode *parent,
59186+ reiser4_object_create_data *);
59187+int reiser4_dir_done_common(struct inode *);
59188+int reiser4_attach_common(struct inode *child, struct inode *parent);
59189+int reiser4_detach_common(struct inode *object, struct inode *parent);
59190+reiser4_block_nr estimate_add_entry_common(const struct inode *);
59191+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59192+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59193+ const struct inode *);
59194+
59195+/* these are essential parts of the common implementations; they exist to
59196+   make customized implementations easier */
59197+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59198+
59199+/* merely useful functions */
59200+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59201+ const reiser4_key *, int silent);
59202+
59203+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59204+#endif
59205+
59206+/* Make Linus happy.
59207+ Local variables:
59208+ c-indentation-style: "K&R"
59209+ mode-name: "LC"
59210+ c-basic-offset: 8
59211+ tab-width: 8
59212+ fill-column: 120
59213+ End:
59214+*/
59215diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin.c linux-2.6.20/fs/reiser4/plugin/plugin.c
59216--- linux-2.6.20.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
59217+++ linux-2.6.20/fs/reiser4/plugin/plugin.c 2007-05-06 14:50:43.839019469 +0400
59218@@ -0,0 +1,578 @@
59219+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59220+ * reiser4/README */
59221+
59222+/* Basic plugin infrastructure, lookup etc. */
59223+
59224+/* PLUGINS:
59225+
59226+ Plugins are internal Reiser4 "modules" or "objects" used to increase
59227+ extensibility and allow external users to easily adapt reiser4 to
59228+ their needs.
59229+
59230+ Plugins are classified into several disjoint "types". Plugins
59231+ belonging to the particular plugin type are termed "instances" of
59232+ this type. Currently the following types are present:
59233+
59234+ . object plugin
59235+ . hash plugin
59236+ . tail plugin
59237+ . perm plugin
59238+ . item plugin
59239+ . node layout plugin
59240+
59241+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59242+
59243+ The object (file) plugin determines how a given file-system object
59244+ serves standard VFS requests for read, write, seek, mmap etc.
59245+ Instances of file plugins are: regular file, directory, symlink.
59246+ Another example of a file plugin is an audit plugin, which optionally
59247+ records accesses to the underlying object and forwards requests to it.
59248+
59249+ Hash plugins compute hashes used by reiser4 to store and locate
59250+ files within directories. Instances of hash plugin type are: r5,
59251+ tea, rupasov.
59252+
59253+ Tail plugins (or, more precisely, tail policy plugins) determine
59254+ when the last part of the file should be stored in a formatted item.
59255+
59256+ Perm plugins control permissions granted for a process accessing a file.
59257+
59258+ Scope and lookup:
59259+
59260+ Each plugin type and plugin carries a label such that the pair
59261+ (type_label, plugin_label) is unique. This pair is a globally
59262+ persistent and user-visible plugin identifier. Internally the
59263+ kernel maintains plugins and plugin types in arrays, using an index
59264+ into those arrays as plugin and plugin type identifiers. The file
59265+ system, in turn, also maintains a persistent "dictionary" mapping
59266+ plugin labels to the numerical identifiers stored in file-system
59267+ objects. That is, we store the offset into the plugin array for that
59268+ plugin type as the plugin id in the stat data of the filesystem object.
59269+
59270+ plugin_labels have meaning for the user interface that assigns
59271+ plugins to files, and may someday have meaning for dynamic loading of
59272+ plugins and for copying of plugins from one fs instance to
59273+ another by utilities like cp and tar.
59274+
59275+ Internal kernel plugin type identifier (index in plugins[] array) is
59276+ of type reiser4_plugin_type. Set of available plugin types is
59277+ currently static, but dynamic loading doesn't seem to pose
59278+ insurmountable problems.
59279+
59280+ Within each type plugins are addressed by the identifiers of type
59281+ reiser4_plugin_id (indices in
59282+ reiser4_plugin_type_data.builtin[]). Such identifiers are only
59283+ required to be unique within one type, not globally.
59284+
59285+ Thus, plugin in memory is uniquely identified by the pair (type_id,
59286+ id).
59287+
59288+ Usage:
59289+
59290+ There exists only one instance of each plugin, but this single
59291+ instance can be associated with many entities (file-system objects,
59292+ items, nodes, transactions, file-descriptors etc.). The entity to
59293+ which a plugin of a given type is attached is termed (due to the
59294+ lack of imagination) the "subject" of this plugin type and, by abuse
59295+ of terminology, of the particular instance currently attached to it.
59296+ For example, an inode is a subject of the object plugin type. An
59297+ inode representing a directory is a subject of the directory plugin,
59298+ of the hash plugin type, and of some particular hash plugin. An
59299+ inode representing a regular file is a subject of the "regular
59300+ file" plugin, of the tail-policy plugin type, etc.
59301+
59302+ With each subject the plugin possibly stores some state. For example,
59303+ the state of a directory plugin (an instance of the object plugin
59304+ type) is a pointer to a hash plugin (if, that is, directories always
59305+ use hashing). The state of an audit plugin is the file descriptor
59306+ (struct file) of its log file, or some magic value for logging via printk().
59307+
59308+ Interface:
59309+
59310+ In addition to a scalar identifier, each plugin type and each plugin
59311+ proper has a "label" (a short string) and a "description" (a longer
59312+ descriptive string). Labels and descriptions of plugin types are
59313+ hard-coded into the plugins[] array, declared and defined in
59314+ plugin.c. The label and description of a plugin are stored in the
59315+ .label and .desc fields of reiser4_plugin_header respectively. It is
59316+ possible to locate a plugin by this pair of labels.
59317+
59318+ Features:
59319+
59320+ . user-level plugin manipulations:
59321+ + reiser4("filename/..file_plugin<='audit'");
59322+ + write(open("filename/..file_plugin"), "audit", 8);
59323+
59324+ . user level utilities lsplug and chplug to manipulate plugins.
59325+ These utilities are not of primary priority; possibly they will
59326+ not work in v4.0.
59327+
59328+NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59329+
59330+ . mount option "plug" to set-up plugins of root-directory.
59331+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
59332+
59333+ Limitations:
59334+
59335+ . each plugin type has to provide at least one builtin
59336+ plugin. This is technical limitation and it can be lifted in the
59337+ future.
59338+
59339+ TODO:
59340+
59341+ New plugin types/plugins:
59342+ Things we should be able to separately choose to inherit:
59343+
59344+ security plugins
59345+
59346+ stat data
59347+
59348+ file bodies
59349+
59350+ file plugins
59351+
59352+ dir plugins
59353+
59354+ . perm:acl
59355+
59356+ . audi---audit plugin intercepting and possibly logging all
59357+ accesses to an object. Requires putting stub functions into
59358+ file_operations instead of generic_file_*.
59359+
59360+NIKITA-FIXME-HANS: why make overflows a plugin?
59361+ . over---handle hash overflows
59362+
59363+ . sqnt---handle different access patterns and instrument read-ahead
59364+
59365+NIKITA-FIXME-HANS: describe the line below in more detail.
59366+
59367+ . hier---handle inheritance of plugins along file-system hierarchy
59368+
59369+ Different kinds of inheritance: on creation vs. on access.
59370+ Compatible/incompatible plugins.
59371+ Inheritance for multi-linked files.
59372+ Layered plugins.
59373+ Notion of plugin context is abandoned.
59374+
59375+ Each file is associated with one plugin, and dependent plugins
59376+ (hash, etc.) are stored as part of the main plugin's state. Now, if
59377+ we have plugins used for regular files but not for directories, how
59378+ would such plugins be inherited?
59379+ . always store them with directories also
59380+
59381+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below, which is also useful.
59382+
59383+ . use inheritance hierarchy, independent of file-system namespace
59384+
59385+*/
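+
+/*
+ * A short illustration of the (type_id, id) addressing described above
+ * (a sketch only; plugin_by_unsafe_id() is defined further down in this
+ * file, and id stands for an identifier read from disk or user level):
+ *
+ *	reiser4_plugin *plug;
+ *
+ *	plug = plugin_by_unsafe_id(REISER4_HASH_PLUGIN_TYPE, id);
+ *	if (plug == NULL)
+ *		return RETERR(-EINVAL);
+ */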
59386+
59387+#include "../debug.h"
59388+#include "../dformat.h"
59389+#include "plugin_header.h"
59390+#include "item/static_stat.h"
59391+#include "node/node.h"
59392+#include "security/perm.h"
59393+#include "space/space_allocator.h"
59394+#include "disk_format/disk_format.h"
59395+#include "plugin.h"
59396+#include "../reiser4.h"
59397+#include "../jnode.h"
59398+#include "../inode.h"
59399+
59400+#include <linux/fs.h> /* for struct super_block */
59401+
59402+/* public interface */
59403+
59404+/* initialise plugin sub-system. Just call this once on reiser4 startup. */
59405+int init_plugins(void);
59406+int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59407+int locate_plugin(struct inode *inode, plugin_locator * loc);
59408+
59409+/**
59410+ * init_plugins - initialize plugins
59411+ *
59412+ * Initializes the plugin sub-system as part of reiser4 module
59413+ * initialization. For each plugin of each type the init method is
59414+ * called, and each plugin is put into its type's list of plugins.
59415+ */
59416+int init_plugins(void)
59417+{
59418+ reiser4_plugin_type type_id;
59419+
59420+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59421+ reiser4_plugin_type_data *ptype;
59422+ int i;
59423+
59424+ ptype = &plugins[type_id];
59425+ assert("nikita-3508", ptype->label != NULL);
59426+ assert("nikita-3509", ptype->type_id == type_id);
59427+
59428+ INIT_LIST_HEAD(&ptype->plugins_list);
59429+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59430+ for (i = 0; i < ptype->builtin_num; ++i) {
59431+ reiser4_plugin *plugin;
59432+
59433+ plugin = plugin_at(ptype, i);
59434+
59435+ if (plugin->h.label == NULL)
59436+ /* uninitialized slot encountered */
59437+ continue;
59438+ assert("nikita-3445", plugin->h.type_id == type_id);
59439+ plugin->h.id = i;
59440+ if (plugin->h.pops != NULL &&
59441+ plugin->h.pops->init != NULL) {
59442+ int result;
59443+
59444+ result = plugin->h.pops->init(plugin);
59445+ if (result != 0)
59446+ return result;
59447+ }
59448+ INIT_LIST_HEAD(&plugin->h.linkage);
59449+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59450+ }
59451+ }
59452+ return 0;
59453+}
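+
+/*
+ * Once init_plugins() has run, all builtin plugins of a given type can
+ * be walked via the per-type list assembled above. A hedged sketch of a
+ * lookup by label built on top of that list (illustrative only; type
+ * and label are locals of the hypothetical caller):
+ *
+ *	reiser4_plugin *plug;
+ *
+ *	list_for_each_entry(plug, get_plugin_list(type), h.linkage)
+ *		if (!strcmp(plug->h.label, label))
+ *			return plug;
+ *	return NULL;
+ */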
59454+
59455+/* true if plugin type id is valid */
59456+int is_plugin_type_valid(reiser4_plugin_type type)
59457+{
59458+ /* "type" is unsigned, so no comparison with 0 is
59459+ necessary */
59460+ return (type < REISER4_PLUGIN_TYPES);
59461+}
59462+
59463+/* true if plugin id is valid */
59464+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
59465+{
59466+ assert("nikita-1653", is_plugin_type_valid(type));
59467+ return id < plugins[type].builtin_num;
59468+}
59469+
59470+/* return plugin by its @type and @id.
59471+
59472+   Both arguments are checked for validity: this is supposed to be called
59473+ from user-level.
59474+
59475+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59476+user space, and passed to the filesystem by use of method files? Your
59477+comment really confused me on the first reading....
59478+
59479+*/
59480+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
59481+ * unchecked */,
59482+ reiser4_plugin_id id /* plugin id,
59483+ * unchecked */)
59484+{
59485+ if (is_plugin_type_valid(type)) {
59486+ if (is_plugin_id_valid(type, id))
59487+ return plugin_at(&plugins[type], id);
59488+ else
59489+ /* id out of bounds */
59490+ warning("nikita-2913",
59491+ "Invalid plugin id: [%i:%i]", type, id);
59492+ } else
59493+ /* type_id out of bounds */
59494+ warning("nikita-2914", "Invalid type_id: %i", type);
59495+ return NULL;
59496+}
59497+
59498+/**
59499+ * save_plugin_id - store plugin id in disk format
59500+ * @plugin: plugin to convert
59501+ * @area: where to store result
59502+ *
59503+ * Puts id of @plugin in little endian format to address @area.
59504+ */
59505+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59506+ d16 *area /* where to store result */ )
59507+{
59508+ assert("nikita-1261", plugin != NULL);
59509+ assert("nikita-1262", area != NULL);
59510+
59511+ put_unaligned(cpu_to_le16(plugin->h.id), area);
59512+ return 0;
59513+}
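+
+/*
+ * The matching load is a one-liner sketch: the id is stored little
+ * endian by save_plugin_id() above, so the inverse read of a d16 @area
+ * of known plugin @type would look like (illustrative only):
+ *
+ *	reiser4_plugin_id id = le16_to_cpu(get_unaligned(area));
+ *	reiser4_plugin *plug = plugin_by_unsafe_id(type, id);
+ */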
59514+
59515+/* list of all plugins of given type */
59516+struct list_head *get_plugin_list(reiser4_plugin_type type)
59517+{
59518+ assert("nikita-1056", is_plugin_type_valid(type));
59519+ return &plugins[type].plugins_list;
59520+}
59521+
59522+static void update_pset_mask(reiser4_inode * info, pset_member memb)
59523+{
59524+ struct dentry *rootdir;
59525+ reiser4_inode *root;
59526+
59527+ assert("edward-1443", memb != PSET_FILE);
59528+
59529+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59530+ if (rootdir != NULL) {
59531+ root = reiser4_inode_data(rootdir->d_inode);
59532+ /*
59533+		 * if the plugin differs from the default one, or we are
59534+		 * changing a plugin of the root directory, update plugin_mask
59535+ */
59536+ if (aset_get(info->pset, memb) !=
59537+ aset_get(root->pset, memb) ||
59538+ info == root)
59539+ info->plugin_mask |= (1 << memb);
59540+ else
59541+ info->plugin_mask &= ~(1 << memb);
59542+ }
59543+}
59544+
59545+/* Get specified plugin set member from parent,
59546+ or from fs-defaults (if no parent is given) and
59547+   install the result into the pset of @self */
59548+int grab_plugin_pset(struct inode *self,
59549+ struct inode *ancestor,
59550+ pset_member memb)
59551+{
59552+ reiser4_plugin *plug;
59553+ reiser4_inode *info;
59554+ int result = 0;
59555+
59556+ /* Do not grab if initialised already. */
59557+ info = reiser4_inode_data(self);
59558+ if (aset_get(info->pset, memb) != NULL)
59559+ return 0;
59560+ if (ancestor) {
59561+ reiser4_inode *parent;
59562+
59563+ parent = reiser4_inode_data(ancestor);
59564+ plug = aset_get(parent->hset, memb) ? :
59565+ aset_get(parent->pset, memb);
59566+ }
59567+ else
59568+ plug = get_default_plugin(memb);
59569+
59570+ result = set_plugin(&info->pset, memb, plug);
59571+ if (result == 0) {
59572+ if (!ancestor || self->i_sb->s_root->d_inode != self)
59573+ update_pset_mask(info, memb);
59574+ }
59575+ return result;
59576+}
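+
+/*
+ * For example, when a new object is created under @parent, its hash
+ * plugin can be inherited with (a hedged usage sketch; child and parent
+ * are the inodes involved):
+ *
+ *	result = grab_plugin_pset(child, parent, PSET_HASH);
+ *
+ * With ancestor == NULL the member falls back to the fs-wide default
+ * returned by get_default_plugin().
+ */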
59577+
59578+/* Take missing pset members from root inode */
59579+int finish_pset(struct inode *inode)
59580+{
59581+ reiser4_plugin *plug;
59582+ reiser4_inode *root;
59583+ reiser4_inode *info;
59584+ pset_member memb;
59585+ int result = 0;
59586+
59587+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
59588+ info = reiser4_inode_data(inode);
59589+
59590+ assert("edward-1455", root != NULL);
59591+ assert("edward-1456", info != NULL);
59592+
59593+ /* file and directory plugins are already initialized. */
59594+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
59595+
59596+ /* Do not grab if initialised already. */
59597+ if (aset_get(info->pset, memb) != NULL)
59598+ continue;
59599+
59600+ plug = aset_get(root->pset, memb);
59601+ result = set_plugin(&info->pset, memb, plug);
59602+ if (result != 0)
59603+ break;
59604+ }
59605+ if (result != 0) {
59606+ warning("nikita-3447",
59607+ "Cannot set up plugins for %lli",
59608+ (unsigned long long)
59609+ get_inode_oid(inode));
59610+ }
59611+ return result;
59612+}
59613+
59614+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
59615+{
59616+ reiser4_inode *info;
59617+ int result = 0;
59618+
59619+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
59620+		/* Changing the pset of the root object is not allowed. */
59621+ return RETERR(-EINVAL);
59622+ }
59623+
59624+ info = reiser4_inode_data(self);
59625+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
59626+ result = plug->h.pops->change(self, plug, memb);
59627+ else
59628+ result = aset_set_unsafe(&info->pset, memb, plug);
59629+ if (result == 0) {
59630+ __u16 oldmask = info->plugin_mask;
59631+
59632+ update_pset_mask(info, memb);
59633+ if (oldmask != info->plugin_mask)
59634+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
59635+ }
59636+ return result;
59637+}
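+
+/*
+ * A hedged usage sketch: explicitly overriding one pset member of a
+ * non-root inode (the root case is rejected with -EINVAL above; id and
+ * inode are locals of the hypothetical caller):
+ *
+ *	plug = plugin_by_unsafe_id(REISER4_COMPRESSION_PLUGIN_TYPE, id);
+ *	if (plug != NULL)
+ *		result = force_plugin_pset(inode, PSET_COMPRESSION, plug);
+ */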
59638+
59639+reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59640+	/* C99 designated initializers */
59641+ [REISER4_FILE_PLUGIN_TYPE] = {
59642+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59643+ .label = "file",
59644+ .desc = "Object plugins",
59645+ .builtin_num = sizeof_array(file_plugins),
59646+ .builtin = file_plugins,
59647+ .plugins_list = {NULL, NULL},
59648+ .size = sizeof(file_plugin)
59649+ },
59650+ [REISER4_DIR_PLUGIN_TYPE] = {
59651+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59652+ .label = "dir",
59653+ .desc = "Directory plugins",
59654+ .builtin_num = sizeof_array(dir_plugins),
59655+ .builtin = dir_plugins,
59656+ .plugins_list = {NULL, NULL},
59657+ .size = sizeof(dir_plugin)
59658+ },
59659+ [REISER4_HASH_PLUGIN_TYPE] = {
59660+ .type_id = REISER4_HASH_PLUGIN_TYPE,
59661+ .label = "hash",
59662+ .desc = "Directory hashes",
59663+ .builtin_num = sizeof_array(hash_plugins),
59664+ .builtin = hash_plugins,
59665+ .plugins_list = {NULL, NULL},
59666+ .size = sizeof(hash_plugin)
59667+ },
59668+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
59669+		.type_id = REISER4_FIBRATION_PLUGIN_TYPE,
59671+ .label = "fibration",
59672+ .desc = "Directory fibrations",
59673+ .builtin_num = sizeof_array(fibration_plugins),
59674+ .builtin = fibration_plugins,
59675+ .plugins_list = {NULL, NULL},
59676+ .size = sizeof(fibration_plugin)
59677+ },
59678+ [REISER4_CIPHER_PLUGIN_TYPE] = {
59679+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59680+ .label = "cipher",
59681+ .desc = "Cipher plugins",
59682+ .builtin_num = sizeof_array(cipher_plugins),
59683+ .builtin = cipher_plugins,
59684+ .plugins_list = {NULL, NULL},
59685+ .size = sizeof(cipher_plugin)
59686+ },
59687+ [REISER4_DIGEST_PLUGIN_TYPE] = {
59688+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
59689+ .label = "digest",
59690+ .desc = "Digest plugins",
59691+ .builtin_num = sizeof_array(digest_plugins),
59692+ .builtin = digest_plugins,
59693+ .plugins_list = {NULL, NULL},
59694+ .size = sizeof(digest_plugin)
59695+ },
59696+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
59697+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
59698+ .label = "compression",
59699+ .desc = "Compression plugins",
59700+ .builtin_num = sizeof_array(compression_plugins),
59701+ .builtin = compression_plugins,
59702+ .plugins_list = {NULL, NULL},
59703+ .size = sizeof(compression_plugin)
59704+ },
59705+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
59706+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
59707+ .label = "formatting",
59708+ .desc = "Tail inlining policies",
59709+ .builtin_num = sizeof_array(formatting_plugins),
59710+ .builtin = formatting_plugins,
59711+ .plugins_list = {NULL, NULL},
59712+ .size = sizeof(formatting_plugin)
59713+ },
59714+ [REISER4_PERM_PLUGIN_TYPE] = {
59715+ .type_id = REISER4_PERM_PLUGIN_TYPE,
59716+ .label = "perm",
59717+ .desc = "Permission checks",
59718+ .builtin_num = sizeof_array(perm_plugins),
59719+ .builtin = perm_plugins,
59720+ .plugins_list = {NULL, NULL},
59721+ .size = sizeof(perm_plugin)
59722+ },
59723+ [REISER4_ITEM_PLUGIN_TYPE] = {
59724+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
59725+ .label = "item",
59726+ .desc = "Item handlers",
59727+ .builtin_num = sizeof_array(item_plugins),
59728+ .builtin = item_plugins,
59729+ .plugins_list = {NULL, NULL},
59730+ .size = sizeof(item_plugin)
59731+ },
59732+ [REISER4_NODE_PLUGIN_TYPE] = {
59733+ .type_id = REISER4_NODE_PLUGIN_TYPE,
59734+ .label = "node",
59735+ .desc = "node layout handlers",
59736+ .builtin_num = sizeof_array(node_plugins),
59737+ .builtin = node_plugins,
59738+ .plugins_list = {NULL, NULL},
59739+ .size = sizeof(node_plugin)
59740+ },
59741+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
59742+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
59743+ .label = "sd_ext",
59744+ .desc = "Parts of stat-data",
59745+ .builtin_num = sizeof_array(sd_ext_plugins),
59746+ .builtin = sd_ext_plugins,
59747+ .plugins_list = {NULL, NULL},
59748+ .size = sizeof(sd_ext_plugin)
59749+ },
59750+ [REISER4_FORMAT_PLUGIN_TYPE] = {
59751+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
59752+ .label = "disk_layout",
59753+ .desc = "defines filesystem on disk layout",
59754+ .builtin_num = sizeof_array(format_plugins),
59755+ .builtin = format_plugins,
59756+ .plugins_list = {NULL, NULL},
59757+ .size = sizeof(disk_format_plugin)
59758+ },
59759+ [REISER4_JNODE_PLUGIN_TYPE] = {
59760+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
59761+ .label = "jnode",
59762+ .desc = "defines kind of jnode",
59763+ .builtin_num = sizeof_array(jnode_plugins),
59764+ .builtin = jnode_plugins,
59765+ .plugins_list = {NULL, NULL},
59766+ .size = sizeof(jnode_plugin)
59767+ },
59768+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
59769+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59770+ .label = "compression_mode",
59771+ .desc = "Defines compression mode",
59772+ .builtin_num = sizeof_array(compression_mode_plugins),
59773+ .builtin = compression_mode_plugins,
59774+ .plugins_list = {NULL, NULL},
59775+ .size = sizeof(compression_mode_plugin)
59776+ },
59777+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
59778+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
59779+ .label = "cluster",
59780+ .desc = "Defines cluster size",
59781+ .builtin_num = sizeof_array(cluster_plugins),
59782+ .builtin = cluster_plugins,
59783+ .plugins_list = {NULL, NULL},
59784+ .size = sizeof(cluster_plugin)
59785+ }
59786+};
59787+
59788+/*
59789+ * Local variables:
59790+ * c-indentation-style: "K&R"
59791+ * mode-name: "LC"
59792+ * c-basic-offset: 8
59793+ * tab-width: 8
59794+ * fill-column: 120
59795+ * End:
59796+ */
59797diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin.h linux-2.6.20/fs/reiser4/plugin/plugin.h
59798--- linux-2.6.20.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
59799+++ linux-2.6.20/fs/reiser4/plugin/plugin.h 2007-05-06 14:50:43.855024468 +0400
59800@@ -0,0 +1,920 @@
59801+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59802+
59803+/* Basic plugin data-types.
59804+ see fs/reiser4/plugin/plugin.c for details */
59805+
59806+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
59807+#define __FS_REISER4_PLUGIN_TYPES_H__
59808+
59809+#include "../forward.h"
59810+#include "../debug.h"
59811+#include "../dformat.h"
59812+#include "../key.h"
59813+#include "compress/compress.h"
59814+#include "crypto/cipher.h"
59815+#include "plugin_header.h"
59816+#include "item/static_stat.h"
59817+#include "item/internal.h"
59818+#include "item/sde.h"
59819+#include "item/cde.h"
59820+#include "item/item.h"
59821+#include "node/node.h"
59822+#include "node/node40.h"
59823+#include "security/perm.h"
59824+#include "fibration.h"
59825+
59826+#include "space/bitmap.h"
59827+#include "space/space_allocator.h"
59828+
59829+#include "disk_format/disk_format40.h"
59830+#include "disk_format/disk_format.h"
59831+
59832+#include <linux/fs.h> /* for struct super_block, address_space */
59833+#include <linux/mm.h> /* for struct page */
59834+#include <linux/buffer_head.h> /* for struct buffer_head */
59835+#include <linux/dcache.h> /* for struct dentry */
59836+#include <linux/types.h>
59837+#include <linux/crypto.h>
59838+
59839+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
59840+
59841+/*
59842+ * File plugin. Defines the set of methods that file plugins implement, some
59843+ * of which are optional.
59844+ *
59845+ * A file plugin offers to the caller an interface for IO (writing to and/or
59846+ * reading from) to what the caller sees as one sequence of bytes. An IO to it
59847+ * may affect more than one physical sequence of bytes, or no physical sequence
59848+ * of bytes; it may affect sequences of bytes offered by other file plugins to
59849+ * the semantic layer, and the file plugin may invoke other plugins and
59850+ * delegate work to them, but its interface is structured for offering the
59851+ * caller the ability to read and/or write what the caller sees as being a
59852+ * single sequence of bytes.
59853+ *
59854+ * The file plugin must present a sequence of bytes to the caller, but it does
59855+ * not necessarily have to store a sequence of bytes, nor does it necessarily
59856+ * have to support efficient tree traversal to any offset in the sequence of
59857+ * bytes (tail and extent items, whose keys contain offsets, do however provide
59858+ * efficient non-sequential lookup of any offset in the sequence of bytes).
59859+ *
59860+ * Directory plugins provide methods for selecting file plugins by resolving a
59861+ * name for them.
59862+ *
59863+ * The functionality other filesystems call an attribute, and rigidly tie
59864+ * together, we decompose into orthogonal selectable features of files. Using
59865+ * the terminology we will define next, an attribute is a perhaps constrained,
59866+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
59867+ * which might be grandparent-major-packed, and whose parent has a deletion
59868+ * method that deletes it.
59869+ *
59870+ * File plugins can implement constraints.
59871+ *
59872+ * Files can be of variable length (e.g. regular unix files), or of static
59873+ * length (e.g. static sized attributes).
59874+ *
59875+ * An object may have many sequences of bytes, and many file plugins, but, it
59876+ * has exactly one objectid. It is usually desirable that an object has a
59877+ * deletion method which deletes every item with that objectid. Items cannot
59878+ * in general be found by just their objectids. This means that an object must
59879+ * have either a method built into its deletion plugin method for knowing what
59880+ * items need to be deleted, or links stored with the object that provide the
59881+ * plugin with a method for finding those items. Deleting a file within an
59882+ * object may or may not have the effect of deleting the entire object,
59883+ * depending on the file plugin's deletion method.
59884+ *
59885+ * LINK TAXONOMY:
59886+ *
59887+ * Many objects have a reference count, and when the reference count reaches 0
59888+ * the object's deletion method is invoked. Some links embody a reference
59889+ * count increase ("countlinks"), and others do not ("nocountlinks").
59890+ *
59891+ * Some links are bi-directional links ("bilinks"), and some are
59892+ * uni-directional("unilinks").
59893+ *
59894+ * Some links are between parts of the same object ("intralinks"), and some are
59895+ * between different objects ("interlinks").
59896+ *
59897+ * PACKING TAXONOMY:
59898+ *
59899+ * Some items of an object are stored with a major packing locality based on
59900+ * their object's objectid (e.g. unix directory items in plan A), and these are
59901+ * called "self-major-packed".
59902+ *
59903+ * Some items of an object are stored with a major packing locality based on
59904+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
59905+ * and these are called "parent-major-packed".
59906+ *
59907+ * Some items of an object are stored with a major packing locality based on
59908+ * their semantic grandparent, and these are called "grandparent-major-packed".
59909+ * Now carefully notice that we run into trouble with key length if we have to
59910+ * store a 8 byte major+minor grandparent based packing locality, an 8 byte
59911+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
59912+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
59913+ * grandparent-major-packed, and which to sacrifice is left to the item author
59914+ * choosing to make the item grandparent-major-packed. You cannot make tail
59915+ * items and extent items grandparent-major-packed, though you could make them
59916+ * self-major-packed (usually they are parent-major-packed).
59917+ *
59918+ * In the case of ACLs (which are composed of fixed length ACEs which consist
59919+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
59920+ * to not have an offset field in the ACE item key, and to allow duplicate keys
59921+ * for ACEs. Thus, the set of ACES for a given file is found by looking for a
59922+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
59923+ * a directory together), the minor packing locality of ACE, the objectid of
59924+ * the file, and 0.
59925+ *
59926+ * IO involves moving data from one location to another, which means that two
59927+ * locations must be specified, source and destination.
59928+ *
59929+ * This source and destination can be in the filesystem, or they can be a
59930+ * pointer in the user process address space plus a byte count.
59931+ *
59932+ * If both source and destination are in the filesystem, then at least one of
59933+ * them must be representable as a pure stream of bytes (which we call a flow,
59934+ * and define as a struct containing a key, a data pointer, and a length).
59935+ * This may mean converting one of them into a flow. We provide a generic
59936+ * cast_into_flow() method, which will work for any plugin supporting
59937+ * read_flow(), though it is inefficiently implemented in that it temporarily
59938+ * stores the flow in a buffer (Question: what to do with huge flows that
59939+ * cannot fit into memory? Answer: we must not convert them all at once. )
59940+ *
59941+ * Performing a write requires resolving the write request into a flow defining
59942+ * the source, and a method that performs the write, and a key that defines
59943+ * where in the tree the write is to go.
59944+ *
59945+ * Performing a read requires resolving the read request into a flow defining
59946+ * the target, and a method that performs the read, and a key that defines
59947+ * where in the tree the read is to come from.
59948+ *
59949+ * There will exist file plugins which have no pluginid stored on the disk for
59950+ * them, and which are only invoked by other plugins.
59951+ */
59952+
59953+/* This should be incremented with each new contributed
59954+ pair (plugin type, plugin id).
59955+ NOTE: Make sure there is a release of reiser4progs
59956+ with the corresponding version number */
59957+#define PLUGIN_LIBRARY_VERSION 0
59958+
59959+ /* enumeration of fields within plugin_set */
59960+typedef enum {
59961+ PSET_FILE,
59962+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
59963+ * inode.c:read_inode() depends on this. */
59964+ PSET_PERM,
59965+ PSET_FORMATTING,
59966+ PSET_HASH,
59967+ PSET_FIBRATION,
59968+ PSET_SD,
59969+ PSET_DIR_ITEM,
59970+ PSET_CIPHER,
59971+ PSET_DIGEST,
59972+ PSET_COMPRESSION,
59973+ PSET_COMPRESSION_MODE,
59974+ PSET_CLUSTER,
59975+ PSET_CREATE,
59976+ PSET_LAST
59977+} pset_member;
59978+
59979+/* builtin file-plugins */
59980+typedef enum {
59981+ /* regular file */
59982+ UNIX_FILE_PLUGIN_ID,
59983+ /* directory */
59984+ DIRECTORY_FILE_PLUGIN_ID,
59985+ /* symlink */
59986+ SYMLINK_FILE_PLUGIN_ID,
59987+ /* for objects completely handled by the VFS: fifos, devices,
59988+ sockets */
59989+ SPECIAL_FILE_PLUGIN_ID,
59990+ /* regular cryptcompress file */
59991+ CRYPTCOMPRESS_FILE_PLUGIN_ID,
59992+ /* number of file plugins. Used as size of arrays to hold
59993+ file plugins. */
59994+ LAST_FILE_PLUGIN_ID
59995+} reiser4_file_id;
59996+
59997+typedef struct file_plugin {
59998+
59999+ /* generic fields */
60000+ plugin_header h;
60001+
60002+ struct inode_operations inode_ops;
60003+ struct file_operations file_ops;
60004+ struct address_space_operations as_ops;
60005+
60006+ /* save inode cached stat-data onto disk. It was called
60007+ reiserfs_update_sd() in 3.x */
60008+ int (*write_sd_by_inode) (struct inode *);
60009+
60010+ /*
60011+ * private methods: These are optional. If used they will allow you to
60012+ * minimize the amount of code needed to implement a deviation from
60013+ * some other method that also uses them.
60014+ */
60015+
60016+ /*
60017+ * Construct flow into @flow according to user-supplied data.
60018+ *
60019+ * This is used by read/write methods to construct a flow to
60020+ * write/read. ->flow_by_inode() is plugin method, rather than single
60021+ * global implementation, because key in a flow used by plugin may
60022+ * depend on data in a @buf.
60023+ *
60024+ * NIKITA-FIXME-HANS: please create statistics on what functions are
60025+ * dereferenced how often for the mongo benchmark. You can supervise
60026+ * Elena doing this for you if that helps. Email me the list of the
60027+ * top 10, with their counts, and an estimate of the total number of
60028+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
60029+ * processing (non-idle processing). If the total percent is, say,
60030+ * less than 1%, it will make our coding discussions much easier, and
60031+ * keep me from questioning whether functions like the below are too
60032+ * frequently called to be dereferenced. If the total percent is more
60033+ * than 1%, perhaps private methods should be listed in a "required"
60034+ * comment at the top of each plugin (with stern language about how if
60035+ * the comment is missing it will not be accepted by the maintainer),
60036+ * and implemented using macros not dereferenced functions. How about
60037+ * replacing this whole private methods part of the struct with a
60038+ * thorough documentation of what the standard helper functions are for
60039+ * use in constructing plugins? I think users have been asking for
60040+ * that, though not in so many words.
60041+ */
60042+ int (*flow_by_inode) (struct inode *, const char __user *buf,
60043+ int user, loff_t size,
60044+ loff_t off, rw_op op, flow_t *);
60045+
60046+ /*
60047+ * Return the key used to retrieve an offset of a file. It is used by
60048+ * default implementation of ->flow_by_inode() method
60049+ * (common_build_flow()) and, among other things, to get to the extent
60050+ * from jnode of unformatted node.
60051+ */
60052+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60053+
60054+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60055+ /*
60056+ * set the plugin for a file. Called during file creation in creat()
60057+ * but not reiser4() unless an inode already exists for the file.
60058+ */
60059+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60060+ reiser4_object_create_data *);
60061+
60062+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60063+ * are you setting up the object itself also or just adjusting the
60064+ * parent?.... */
60065+ /* set up plugins for new @object created in @parent. @root is root
60066+ directory. */
60067+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60068+ struct inode *root);
60069+ /*
60070+ * this does whatever is necessary to do when object is created. For
60071+ * instance, for unix files stat data is inserted. It is supposed to be
60072+ * called by create of struct inode_operations.
60073+ */
60074+ int (*create_object) (struct inode *object, struct inode *parent,
60075+ reiser4_object_create_data *);
60076+
60077+ /* this does whatever is necessary to do when object is opened */
60078+ int (*open_object) (struct inode * inode, struct file * file);
60079+ /*
60080+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60081+ * success. Deletion of an object usually includes removal of items
60082+ * building file body (for directories this is removal of "." and "..")
60083+ * and removal of stat-data item.
60084+ */
60085+ int (*delete_object) (struct inode *);
60086+
60087+ /* add link from @parent to @object */
60088+ int (*add_link) (struct inode *object, struct inode *parent);
60089+
60090+ /* remove link from @parent to @object */
60091+ int (*rem_link) (struct inode *object, struct inode *parent);
60092+
60093+ /*
60094+ * return true if item addressed by @coord belongs to @inode. This is
60095+ * used by read/write to properly slice flow into items in presence of
60096+ * multiple key assignment policies, because items of a file are not
60097+ * necessarily contiguous in a key space, for example, in a plan-b.
60098+ */
60099+ int (*owns_item) (const struct inode *, const coord_t *);
60100+
60101+	/* checks whether yet another hard link to this object can be
60102+ added */
60103+ int (*can_add_link) (const struct inode *);
60104+
60105+ /* checks whether hard links to this object can be removed */
60106+ int (*can_rem_link) (const struct inode *);
60107+
60108+	/* currently non-empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
60109+	   the detach method of the directory plugin to remove ".." */
60110+ int (*detach) (struct inode * child, struct inode * parent);
60111+
60112+	/* called when @child has just been looked up in the @parent. It is
60113+	   non-empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls the
60114+	   attach method of the directory plugin */
60115+ int (*bind) (struct inode * child, struct inode * parent);
60116+
60117+ /* process safe-link during mount */
60118+ int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60119+ __u64 value);
60120+
60121+	/* The set of estimate methods for the file operations */
60122+ struct {
60123+ reiser4_block_nr(*create) (const struct inode *);
60124+ reiser4_block_nr(*update) (const struct inode *);
60125+ reiser4_block_nr(*unlink) (const struct inode *,
60126+ const struct inode *);
60127+ } estimate;
60128+
60129+ /*
60130+ * reiser4 specific part of inode has a union of structures which are
60131+ * specific to a plugin. This method is called when inode is read
60132+ * (read_inode) and when file is created (common_create_child) so that
60133+ * file plugin could initialize its inode data
60134+ */
60135+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60136+ int);
60137+
60138+ /*
60139+ * This method performs progressive deletion of items and whole nodes
60140+ * from right to left.
60141+ *
60142+ * @tap: the point deletion process begins from,
60143+ * @from_key: the beginning of the deleted key range,
60144+ * @to_key: the end of the deleted key range,
60145+ * @smallest_removed: the smallest removed key,
60146+ *
60147+	 * @return: 0 on success, an error code otherwise; -E_REPEAT means that
60148+	 * a long cut_tree operation was interrupted to allow an atom commit.
60149+ */
60150+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60151+ const reiser4_key * to_key,
60152+ reiser4_key * smallest_removed, struct inode *,
60153+ int, int *);
60154+
60155+ /* called from ->destroy_inode() */
60156+ void (*destroy_inode) (struct inode *);
60157+
60158+ /*
60159+	 * methods to serialize object identity. This is used, for example, by
60160+ * reiser4_{en,de}code_fh().
60161+ */
60162+ struct {
60163+ /* store object's identity at @area */
60164+ char *(*write) (struct inode * inode, char *area);
60165+ /* parse object from wire to the @obj */
60166+ char *(*read) (char *area, reiser4_object_on_wire * obj);
60167+ /* given object identity in @obj, find or create its dentry */
60168+ struct dentry *(*get) (struct super_block * s,
60169+ reiser4_object_on_wire * obj);
60170+ /* how many bytes ->wire.write() consumes */
60171+ int (*size) (struct inode * inode);
60172+		/* finish with the object identity */
60173+ void (*done) (reiser4_object_on_wire * obj);
60174+ } wire;
60175+} file_plugin;
60176+
60177+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60178+
60179+struct reiser4_object_on_wire {
60180+ file_plugin *plugin;
60181+ union {
60182+ struct {
60183+ obj_key_id key_id;
60184+ } std;
60185+ void *generic;
60186+ } u;
60187+};
60188+
60189+/* builtin dir-plugins */
60190+typedef enum {
60191+ HASHED_DIR_PLUGIN_ID,
60192+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
60193+ LAST_DIR_ID
60194+} reiser4_dir_id;
60195+
60196+typedef struct dir_plugin {
60197+ /* generic fields */
60198+ plugin_header h;
60199+
60200+ struct inode_operations inode_ops;
60201+ struct file_operations file_ops;
60202+ struct address_space_operations as_ops;
60203+
60204+ /*
60205+ * private methods: These are optional. If used they will allow you to
60206+ * minimize the amount of code needed to implement a deviation from
60207+ * some other method that uses them. You could logically argue that
60208+ * they should be a separate type of plugin.
60209+ */
60210+
60211+ struct dentry *(*get_parent) (struct inode * childdir);
60212+
60213+ /*
60214+ * check whether "name" is acceptable name to be inserted into this
60215+ * object. Optionally implemented by directory-like objects. Can check
60216+ * for maximal length, reserved symbols etc
60217+ */
60218+ int (*is_name_acceptable) (const struct inode * inode, const char *name,
60219+ int len);
60220+
60221+ void (*build_entry_key) (const struct inode * dir /* directory where
60222+ * entry is (or will
60223+ * be) in.*/ ,
60224+ const struct qstr * name /* name of file
60225+ * referenced by this
60226+ * entry */ ,
60227+ reiser4_key * result /* resulting key of
60228+ * directory entry */ );
60229+ int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60230+ int (*add_entry) (struct inode * object, struct dentry * where,
60231+ reiser4_object_create_data * data,
60232+ reiser4_dir_entry_desc * entry);
60233+ int (*rem_entry) (struct inode * object, struct dentry * where,
60234+ reiser4_dir_entry_desc * entry);
60235+
60236+ /*
60237+ * initialize directory structure for newly created object. For normal
60238+ * unix directories, insert dot and dotdot.
60239+ */
60240+ int (*init) (struct inode * object, struct inode * parent,
60241+ reiser4_object_create_data * data);
60242+
60243+ /* destroy directory */
60244+ int (*done) (struct inode * child);
60245+
60246+ /* called when @subdir was just looked up in the @dir */
60247+ int (*attach) (struct inode * subdir, struct inode * dir);
60248+ int (*detach) (struct inode * subdir, struct inode * dir);
60249+
60250+ struct {
60251+ reiser4_block_nr(*add_entry) (const struct inode *);
60252+ reiser4_block_nr(*rem_entry) (const struct inode *);
60253+ reiser4_block_nr(*unlink) (const struct inode *,
60254+ const struct inode *);
60255+ } estimate;
60256+} dir_plugin;
60257+
60258+extern dir_plugin dir_plugins[LAST_DIR_ID];
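+
+/*
+ * Directory plugin methods are dispatched the same way, e.g. computing
+ * the key of a directory entry (a sketch; inode_dir_plugin() is assumed
+ * as the accessor returning the directory plugin of an inode):
+ *
+ *	dir_plugin *dplug = inode_dir_plugin(dir);
+ *
+ *	dplug->build_entry_key(dir, &dentry->d_name, &key);
+ */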
60259+
60260+typedef struct formatting_plugin {
60261+ /* generic fields */
60262+ plugin_header h;
60263+ /* returns non-zero iff file's tail has to be stored
60264+ in a direct item. */
60265+ int (*have_tail) (const struct inode * inode, loff_t size);
60266+} formatting_plugin;
60267+
60268+typedef struct hash_plugin {
60269+ /* generic fields */
60270+ plugin_header h;
60271+ /* computes hash of the given name */
60272+ __u64(*hash) (const unsigned char *name, int len);
60273+} hash_plugin;
60274+
60275+typedef struct cipher_plugin {
60276+ /* generic fields */
60277+ plugin_header h;
60278+ struct crypto_blkcipher * (*alloc) (void);
60279+ void (*free) (struct crypto_blkcipher * tfm);
60280+ /* Offset translator. For each offset this returns (k * offset), where
60281+ k (k >= 1) is the expansion factor of the cipher algorithm.
60282+ For all symmetric algorithms k == 1. For asymmetric algorithms (which
60283+ inflate data) offset translation guarantees that all units of a disk
60284+ cluster have keys smaller than those of the next cluster.
60285+ */
60286+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60287+ /* Cipher algorithms can accept data only in chunks of the cipher block
60288+ size. This method aligns a flow to the cipher block size before it is
60289+ passed to the cipher algorithm; to align means to append padding in a
60290+ format specific to the cipher algorithm. */
60291+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60292+ /* low-level key manager (check, install, etc..) */
60293+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60294+ unsigned int keylen);
60295+ /* main text processing procedures */
60296+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60297+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60298+} cipher_plugin;
60299+
60300+typedef struct digest_plugin {
60301+ /* generic fields */
60302+ plugin_header h;
60303+ /* fingerprint size in bytes */
60304+ int fipsize;
60305+ struct crypto_hash * (*alloc) (void);
60306+ void (*free) (struct crypto_hash * tfm);
60307+} digest_plugin;
60308+
60309+typedef struct compression_plugin {
60310+ /* generic fields */
60311+ plugin_header h;
60312+ int (*init) (void);
60313+ /* the maximal number of bytes by which the size of the "compressed"
60314+ * data can exceed that of the uncompressed data. */
60315+ int (*overrun) (unsigned src_len);
60316+ coa_t(*alloc) (tfm_action act);
60317+ void (*free) (coa_t coa, tfm_action act);
60318+ /* minimal size of the flow we still try to compress */
60319+ int (*min_size_deflate) (void);
60320+ __u32(*checksum) (char *data, __u32 length);
60321+ /* main transform procedures */
60322+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60323+ __u8 * dst_first, unsigned *dst_len);
60324+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60325+ __u8 * dst_first, unsigned *dst_len);
60326+} compression_plugin;
60327+
60328+typedef struct compression_mode_plugin {
60329+ /* generic fields */
60330+ plugin_header h;
60331+ /* this is called when estimating compressibility
60332+ of a logical cluster by its content */
60333+ int (*should_deflate) (struct inode * inode, cloff_t index);
60334+ /* this is called when results of compression should be saved */
60335+ int (*accept_hook) (struct inode * inode, cloff_t index);
60336+ /* this is called when results of compression should be discarded */
60337+ int (*discard_hook) (struct inode * inode, cloff_t index);
60338+} compression_mode_plugin;
60339+
60340+typedef struct cluster_plugin {
60341+ /* generic fields */
60342+ plugin_header h;
60343+ int shift;
60344+} cluster_plugin;
60345+
60346+typedef struct sd_ext_plugin {
60347+ /* generic fields */
60348+ plugin_header h;
60349+ int (*present) (struct inode * inode, char **area, int *len);
60350+ int (*absent) (struct inode * inode);
60351+ int (*save_len) (struct inode * inode);
60352+ int (*save) (struct inode * inode, char **area);
60353+ /* alignment requirement for this stat-data part */
60354+ int alignment;
60355+} sd_ext_plugin;
60356+
60357+/* this plugin contains methods to allocate an objectid for a newly created
60358+ file, to deallocate the objectid when the file is removed, and to report
60359+ the numbers of used and free objectids */
60360+typedef struct oid_allocator_plugin {
60361+ /* generic fields */
60362+ plugin_header h;
60363+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60364+ __u64 oids);
60365+ /* used to report statfs->f_files */
60366+ __u64(*oids_used) (reiser4_oid_allocator * map);
60367+ /* get next oid to use */
60368+ __u64(*next_oid) (reiser4_oid_allocator * map);
60369+ /* used to report statfs->f_ffree */
60370+ __u64(*oids_free) (reiser4_oid_allocator * map);
60371+ /* allocate new objectid */
60372+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60373+ /* release objectid */
60374+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60375+ /* how many pages to reserve in transaction for allocation of new
60376+ objectid */
60377+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60378+ /* how many pages to reserve in transaction for freeing of an
60379+ objectid */
60380+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
60381+ void (*print_info) (const char *, reiser4_oid_allocator *);
60382+} oid_allocator_plugin;
60383+
60384+/* disk layout plugin: this specifies super block, journal, bitmap (if there
60385+ are any) locations, etc */
60386+typedef struct disk_format_plugin {
60387+ /* generic fields */
60388+ plugin_header h;
60389+ /* replay journal, initialize super_info_data, etc */
60390+ int (*init_format) (struct super_block *, void *data);
60391+
60392+ /* key of root directory stat data */
60393+ const reiser4_key *(*root_dir_key) (const struct super_block *);
60394+
60395+ int (*release) (struct super_block *);
60396+ jnode *(*log_super) (struct super_block *);
60397+ int (*check_open) (const struct inode * object);
60398+ int (*version_update) (struct super_block *);
60399+} disk_format_plugin;
60400+
60401+struct jnode_plugin {
60402+ /* generic fields */
60403+ plugin_header h;
60404+ int (*init) (jnode * node);
60405+ int (*parse) (jnode * node);
60406+ struct address_space *(*mapping) (const jnode * node);
60407+ unsigned long (*index) (const jnode * node);
60408+ jnode *(*clone) (jnode * node);
60409+};
60410+
60411+/* plugin instance. */
60412+/* */
60413+/* This is a "wrapper" union for all types of plugins. Most of the code */
60414+/* uses plugins of a particular type (file_plugin, dir_plugin, etc.) */
60415+/* rather than operating on pointers to reiser4_plugin. This union is */
60416+/* only used in some generic code in plugin/plugin.c that operates on all */
60417+/* plugins. Technically speaking, the purpose of this union is to add */
60418+/* type safety to said generic code: each plugin type (file_plugin, for */
60419+/* example) contains plugin_header as its first member. This first member */
60420+/* is located at the same place in memory as the .h member of */
60421+/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and */
60422+/* looks at .h, which is the header of the plugin type located in the */
60423+/* union. This allows type-casts to be avoided. */
60424+union reiser4_plugin {
60425+ /* generic fields */
60426+ plugin_header h;
60427+ /* file plugin */
60428+ file_plugin file;
60429+ /* directory plugin */
60430+ dir_plugin dir;
60431+ /* hash plugin, used by directory plugin */
60432+ hash_plugin hash;
60433+ /* fibration plugin used by directory plugin */
60434+ fibration_plugin fibration;
60435+ /* cipher transform plugin, used by file plugin */
60436+ cipher_plugin cipher;
60437+ /* digest transform plugin, used by file plugin */
60438+ digest_plugin digest;
60439+ /* compression transform plugin, used by file plugin */
60440+ compression_plugin compression;
60441+ /* tail plugin, used by file plugin */
60442+ formatting_plugin formatting;
60443+ /* permission plugin */
60444+ perm_plugin perm;
60445+ /* node plugin */
60446+ node_plugin node;
60447+ /* item plugin */
60448+ item_plugin item;
60449+ /* stat-data extension plugin */
60450+ sd_ext_plugin sd_ext;
60451+ /* disk layout plugin */
60452+ disk_format_plugin format;
60453+ /* object id allocator plugin */
60454+ oid_allocator_plugin oid_allocator;
60455+ /* plugin for different jnode types */
60456+ jnode_plugin jnode;
60457+ /* compression mode plugin, used by object plugin */
60458+ compression_mode_plugin compression_mode;
60459+ /* cluster plugin, used by object plugin */
60460+ cluster_plugin clust;
60461+ /* place-holder for new plugin types that can be registered
60462+ dynamically, and used by other dynamically loaded plugins. */
60463+ void *generic;
60464+};
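+
+/* As a minimal illustration of the technique described above (the helper
+ name below is hypothetical, not part of the original interface): generic
+ code may read the header of any plugin through the union without a cast,
+ because plugin_header is the first member of every plugin type. */
+static inline const char *plugin_label_of(const reiser4_plugin *plugin)
+{
+	/* valid for a pointer to any union member, e.g. a file_plugin */
+	return plugin->h.label;
+}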
60465+
60466+struct reiser4_plugin_ops {
60467+ /* called when plugin is initialized */
60468+ int (*init) (reiser4_plugin * plugin);
60469+ /* called when plugin is unloaded */
60470+ int (*done) (reiser4_plugin * plugin);
60471+ /* load given plugin from disk */
60472+ int (*load) (struct inode * inode,
60473+ reiser4_plugin * plugin, char **area, int *len);
60474+ /* how much space is required to store this plugin's state
60475+ in stat-data */
60476+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60477+ /* save persistent plugin-data to disk */
60478+ int (*save) (struct inode * inode, reiser4_plugin * plugin,
60479+ char **area);
60480+ /* alignment requirement for on-disk state of this plugin
60481+ in number of bytes */
60482+ int alignment;
60483+ /* install itself into the given inode. This can return an error
60484+ (e.g., you cannot change the hash of a non-empty directory). */
60485+ int (*change) (struct inode * inode, reiser4_plugin * plugin,
60486+ pset_member memb);
60487+ /* inherit plugin state from the parent inode into the given one.
60488+ This can also return an error. */
60489+ int (*inherit) (struct inode * inode, struct inode * parent,
60490+ reiser4_plugin * plugin);
60491+};
60492+
60493+/* functions implemented in fs/reiser4/plugin/plugin.c */
60494+
60495+/* stores plugin reference in reiser4-specific part of inode */
60496+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60497+extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
60498+extern int init_plugins(void);
60499+
60500+/* builtin plugins */
60501+
60502+/* builtin hash-plugins */
60503+
60504+typedef enum {
60505+ RUPASOV_HASH_ID,
60506+ R5_HASH_ID,
60507+ TEA_HASH_ID,
60508+ FNV1_HASH_ID,
60509+ DEGENERATE_HASH_ID,
60510+ LAST_HASH_ID
60511+} reiser4_hash_id;
60512+
60513+/* builtin cipher plugins */
60514+
60515+typedef enum {
60516+ NONE_CIPHER_ID,
60517+ LAST_CIPHER_ID
60518+} reiser4_cipher_id;
60519+
60520+/* builtin digest plugins */
60521+
60522+typedef enum {
60523+ SHA256_32_DIGEST_ID,
60524+ LAST_DIGEST_ID
60525+} reiser4_digest_id;
60526+
60527+/* builtin compression mode plugins */
60528+typedef enum {
60529+ NONE_COMPRESSION_MODE_ID,
60530+ LATTD_COMPRESSION_MODE_ID,
60531+ ULTIM_COMPRESSION_MODE_ID,
60532+ FORCE_COMPRESSION_MODE_ID,
60533+ CONVX_COMPRESSION_MODE_ID,
60534+ LAST_COMPRESSION_MODE_ID
60535+} reiser4_compression_mode_id;
60536+
60537+/* builtin cluster plugins */
60538+typedef enum {
60539+ CLUSTER_64K_ID,
60540+ CLUSTER_32K_ID,
60541+ CLUSTER_16K_ID,
60542+ CLUSTER_8K_ID,
60543+ CLUSTER_4K_ID,
60544+ LAST_CLUSTER_ID
60545+} reiser4_cluster_id;
60546+
60547+/* builtin tail-plugins */
60548+
60549+typedef enum {
60550+ NEVER_TAILS_FORMATTING_ID,
60551+ ALWAYS_TAILS_FORMATTING_ID,
60552+ SMALL_FILE_FORMATTING_ID,
60553+ LAST_TAIL_FORMATTING_ID
60554+} reiser4_formatting_id;
60555+
60556+/* compression/clustering specific data */
60557+typedef struct compression_data {
60558+ reiser4_compression_id coa; /* id of the compression algorithm */
60559+} compression_data_t;
60560+
60561+typedef __u8 cluster_data_t; /* cluster info */
60562+
60563+/* data type used to pack parameters that we pass to vfs object creation
60564+ function create_object() */
60565+struct reiser4_object_create_data {
60566+ /* plugin to control created object */
60567+ reiser4_file_id id;
60568+ /* mode of regular file, directory or special file */
60569+/* what happens if some other sort of perm plugin is in use? */
60570+ int mode;
60571+ /* rdev of special file */
60572+ dev_t rdev;
60573+ /* symlink target */
60574+ const char *name;
60575+ /* add here something for non-standard objects you invent, like
60576+ query for interpolation file etc. */
60577+
60578+ crypto_stat_t * crypto;
60579+ compression_data_t *compression;
60580+ cluster_data_t *cluster;
60581+
60582+ struct inode *parent;
60583+ struct dentry *dentry;
60584+};
60585+
60586+/* description of directory entry being created/destroyed/sought for
60587+
60588+ It is passed down to the directory plugin and further to the
60589+ directory item plugin methods. Creation of a new directory entry is done
60590+ in several stages: first we search for an entry with the same name, then
60591+ create a new one. reiser4_dir_entry_desc is used to store some information
60592+ collected at some stage of this process and required later: key of
60593+ item that we want to insert/delete and pointer to an object that will
60594+ be bound by the new directory entry. Probably some more fields will
60595+ be added there.
60596+
60597+*/
60598+struct reiser4_dir_entry_desc {
60599+ /* key of directory entry */
60600+ reiser4_key key;
60601+ /* object bound by this entry. */
60602+ struct inode *obj;
60603+};
60604+
60605+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
60606+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
60607+
60608+/* used for interface with user-land: table-driven parsing in
60609+ reiser4(). */
60610+typedef struct plugin_locator {
60611+ reiser4_plugin_type type_id;
60612+ reiser4_plugin_id id;
60613+ char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
60614+ char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
60615+} plugin_locator;
60616+
60617+extern int locate_plugin(struct inode *inode, plugin_locator * loc);
60618+
60619+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
60620+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
60621+{ \
60622+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
60623+ return plugin ? & plugin -> FIELD : NULL; \
60624+} \
60625+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60626+{ \
60627+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
60628+ return plugin ? & plugin -> FIELD : NULL; \
60629+} \
60630+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
60631+{ \
60632+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
60633+ return plugin ? & plugin -> FIELD : NULL; \
60634+} \
60635+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
60636+{ \
60637+ return ( reiser4_plugin * ) plugin; \
60638+} \
60639+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
60640+{ \
60641+ return TYPE ## _to_plugin (plugin) -> h.id; \
60642+} \
60643+typedef struct { int foo; } TYPE ## _plugin_dummy
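+
+/* For example, the PLUGIN_BY_ID(hash_plugin, ...) instantiation below
+ generates hash_plugin_by_id(), hash_plugin_by_disk_id(),
+ hash_plugin_by_unsafe_id(), hash_plugin_to_plugin() and hash_plugin_id(),
+ so a caller can write (sketch):
+
+	hash_plugin *hplug = hash_plugin_by_id(R5_HASH_ID);
+	if (hplug != NULL)
+		hashval = hplug->hash(name, len);
+*/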
60644+
60645+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60646+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60647+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60648+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60649+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60650+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60651+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60652+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60653+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60654+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60655+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60656+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60657+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60658+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60659+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60660+ compression_mode);
60661+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
60662+
60663+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60664+
60665+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60666+
60667+#define for_all_plugins(ptype, plugin) \
60668+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
60669+ get_plugin_list(ptype) != &plugin->h.linkage; \
60670+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
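+
+/* Usage sketch (hypothetical debugging helper, not part of the original
+ interface): walk all registered plugins of one type via the list heads
+ kept in their plugin_header. */
+static inline void print_plugins_of_type(reiser4_plugin_type ptype)
+{
+	reiser4_plugin *plugin;
+
+	for_all_plugins(ptype, plugin)
+		printk("%s: %s\n", plugin->h.label, plugin->h.desc);
+}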
60671+
60672+
60673+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
60674+extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
60675+extern int finish_pset(struct inode *inode);
60676+
60677+/* defined in fs/reiser4/plugin/object.c */
60678+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60679+/* defined in fs/reiser4/plugin/object.c */
60680+extern dir_plugin dir_plugins[LAST_DIR_ID];
60681+/* defined in fs/reiser4/plugin/item/static_stat.c */
60682+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
60683+/* defined in fs/reiser4/plugin/hash.c */
60684+extern hash_plugin hash_plugins[LAST_HASH_ID];
60685+/* defined in fs/reiser4/plugin/fibration.c */
60686+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
60687+/* defined in fs/reiser4/plugin/crypt.c */
60688+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
60689+/* defined in fs/reiser4/plugin/digest.c */
60690+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
60691+/* defined in fs/reiser4/plugin/compress/compress.c */
60692+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
60693+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
60694+extern compression_mode_plugin
60695+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
60696+/* defined in fs/reiser4/plugin/cluster.c */
60697+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
60698+/* defined in fs/reiser4/plugin/tail.c */
60699+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
60700+/* defined in fs/reiser4/plugin/security/security.c */
60701+extern perm_plugin perm_plugins[LAST_PERM_ID];
60702+/* defined in fs/reiser4/plugin/item/item.c */
60703+extern item_plugin item_plugins[LAST_ITEM_ID];
60704+/* defined in fs/reiser4/plugin/node/node.c */
60705+extern node_plugin node_plugins[LAST_NODE_ID];
60706+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
60707+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
60708+
60709+/* __FS_REISER4_PLUGIN_TYPES_H__ */
60710+#endif
60711+
60712+/* Make Linus happy.
60713+ Local variables:
60714+ c-indentation-style: "K&R"
60715+ mode-name: "LC"
60716+ c-basic-offset: 8
60717+ tab-width: 8
60718+ fill-column: 120
60719+ End:
60720+*/
60721diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.20/fs/reiser4/plugin/plugin_header.h
60722--- linux-2.6.20.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
60723+++ linux-2.6.20/fs/reiser4/plugin/plugin_header.h 2007-05-06 14:50:43.855024468 +0400
60724@@ -0,0 +1,144 @@
60725+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60726+
60727+/* plugin header. Data structures required by all plugin types. */
60728+
60729+#if !defined( __PLUGIN_HEADER_H__ )
60730+#define __PLUGIN_HEADER_H__
60731+
60732+/* plugin data-types and constants */
60733+
60734+#include "../debug.h"
60735+#include "../dformat.h"
60736+
60737+typedef enum {
60738+ REISER4_FILE_PLUGIN_TYPE,
60739+ REISER4_DIR_PLUGIN_TYPE,
60740+ REISER4_ITEM_PLUGIN_TYPE,
60741+ REISER4_NODE_PLUGIN_TYPE,
60742+ REISER4_HASH_PLUGIN_TYPE,
60743+ REISER4_FIBRATION_PLUGIN_TYPE,
60744+ REISER4_FORMATTING_PLUGIN_TYPE,
60745+ REISER4_PERM_PLUGIN_TYPE,
60746+ REISER4_SD_EXT_PLUGIN_TYPE,
60747+ REISER4_FORMAT_PLUGIN_TYPE,
60748+ REISER4_JNODE_PLUGIN_TYPE,
60749+ REISER4_CIPHER_PLUGIN_TYPE,
60750+ REISER4_DIGEST_PLUGIN_TYPE,
60751+ REISER4_COMPRESSION_PLUGIN_TYPE,
60752+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60753+ REISER4_CLUSTER_PLUGIN_TYPE,
60754+ REISER4_PLUGIN_TYPES
60755+} reiser4_plugin_type;
60756+
60757+typedef enum {
60758+ REISER4_DIRECTORY_FILE,
60759+ REISER4_REGULAR_FILE,
60760+ REISER4_SYMLINK_FILE,
60761+ REISER4_SPECIAL_FILE,
60762+} reiser4_plugin_group;
60763+
60764+struct reiser4_plugin_ops;
60765+/* generic plugin operations, supported by each
60766+ plugin type. */
60767+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
60768+
60769+/* the common part of all plugin instances. */
60770+typedef struct plugin_header {
60771+ /* plugin type */
60772+ reiser4_plugin_type type_id;
60773+ /* id of this plugin */
60774+ reiser4_plugin_id id;
60775+ /* bitmask of groups the plugin belongs to. */
60776+ reiser4_plugin_groups groups;
60777+ /* plugin operations */
60778+ reiser4_plugin_ops *pops;
60779+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
60780+ /* short label of this plugin */
60781+ const char *label;
60782+ /* descriptive string. */
60783+ const char *desc;
60784+ /* list linkage */
60785+ struct list_head linkage;
60786+} plugin_header;
60787+
60788+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
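+
+/* E.g. plugin_of_group(plug, REISER4_REGULAR_FILE) is non-zero iff the
+ file plugin @plug declared REISER4_REGULAR_FILE in its .groups bitmask. */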
60789+
60790+/* PRIVATE INTERFACES */
60791+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
60792+/* plugin type representation. */
60793+typedef struct reiser4_plugin_type_data {
60794+ /* internal plugin type identifier. Should coincide with
60795+ index of this item in plugins[] array. */
60796+ reiser4_plugin_type type_id;
60797+ /* short symbolic label of this plugin type. Should be no longer
60798+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
60799+ const char *label;
60800+ /* plugin type description longer than .label */
60801+ const char *desc;
60802+
60803+/* NIKITA-FIXME-HANS: define built-in */
60804+ /* number of built-in plugin instances of this type */
60805+ int builtin_num;
60806+ /* array of built-in plugins */
60807+ void *builtin;
60808+ struct list_head plugins_list;
60809+ size_t size;
60810+} reiser4_plugin_type_data;
60811+
60812+extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
60813+
60814+int is_plugin_type_valid(reiser4_plugin_type type);
60815+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
60816+
60817+static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
60818+{
60819+ char *builtin;
60820+
60821+ builtin = ptype->builtin;
60822+ return (reiser4_plugin *) (builtin + i * ptype->size);
60823+}
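+
+/* Note that ->builtin points to an array of concrete plugin structures
+ (file_plugin[], hash_plugin[], ...), not of reiser4_plugin unions, which
+ is why plugin_at() scales the index by ptype->size by hand instead of
+ using ordinary array arithmetic. */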
60824+
60825+/* return plugin by its @type_id and @id */
60826+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
60827+ reiser4_plugin_id id)
60828+{
60829+ assert("nikita-1651", is_plugin_type_valid(type));
60830+ assert("nikita-1652", is_plugin_id_valid(type, id));
60831+ return plugin_at(&plugins[type], id);
60832+}
60833+
60834+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
60835+ reiser4_plugin_id id);
60836+
60837+/**
60838+ * plugin_by_disk_id - get reiser4_plugin
60839+ * @type_id: plugin type id
60840+ * @plugin_id: plugin id in disk format
60841+ *
60842+ * Returns reiser4_plugin by plugin type id and plugin id.
60843+ */
60844+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
60845+ reiser4_plugin_type type_id,
60846+ __le16 *plugin_id)
60847+{
60848+ /*
60849+ * what we should do properly is to maintain within each file-system a
60850+ * dictionary that maps on-disk plugin ids to "universal" ids. This
60851+ * dictionary will be resolved on mount time, so that this function
60852+ * will perform just one additional array lookup.
60853+ */
60854+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
60855+}
60856+
60857+/* __PLUGIN_HEADER_H__ */
60858+#endif
60859+
60860+/*
60861+ * Local variables:
60862+ * c-indentation-style: "K&R"
60863+ * mode-name: "LC"
60864+ * c-basic-offset: 8
60865+ * tab-width: 8
60866+ * fill-column: 79
60867+ * End:
60868+ */
60869diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.20/fs/reiser4/plugin/plugin_set.c
60870--- linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
60871+++ linux-2.6.20/fs/reiser4/plugin/plugin_set.c 2007-05-06 14:50:43.855024468 +0400
60872@@ -0,0 +1,379 @@
60873+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60874+ * reiser4/README */
60875+/* This file contains Reiser4 plugin set operations */
60876+
60877+/* plugin sets
60878+ *
60879+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
60880+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
60881+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
60882+ * set of plugins (so called pset) is described by structure plugin_set (see
60883+ * plugin/plugin_set.h), which contains pointers to all required plugins.
60884+ *
60885+ * Children can inherit some pset members from their parent; however, it is
60886+ * sometimes useful to specify members different from the parent's. Since an
60887+ * object's pset cannot be easily changed without fatal consequences, we use
60888+ * for this purpose another special plugin table (the so-called hset, or heir
60889+ * set), described by the same structure.
60890+ *
60891+ * An inode only stores pointers to its pset and hset. Different inodes with
60892+ * the same set of pset (hset) members point to the same pset (hset). This is
60893+ * achieved by storing psets and hsets in a global hash table. Races are
60894+ * avoided by the simple (and so far efficient) solution of never recycling
60895+ * psets, even when the last inode pointing to one is destroyed.
60896+ */
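+
+/* Usage sketch: because psets are shared and never recycled, changing one
+ member replaces the pset pointer rather than mutating the set in place
+ (the plugin ids below are only examples):
+
+	reiser4_plugin *plug;
+	plug = plugin_by_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID);
+	result = set_plugin(&pset, PSET_HASH, plug);
+*/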
60897+
60898+#include "../debug.h"
60899+#include "../super.h"
60900+#include "plugin_set.h"
60901+
60902+#include <linux/slab.h>
60903+#include <linux/stddef.h>
60904+
60905+/* slab for plugin sets */
60906+static struct kmem_cache *plugin_set_slab;
60907+
60908+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
60909+ [0 ... 7] = SPIN_LOCK_UNLOCKED
60910+};
60911+
60912+/* hash table support */
60913+
60914+#define PS_TABLE_SIZE (32)
60915+
60916+static inline plugin_set *cast_to(const unsigned long *a)
60917+{
60918+ return container_of(a, plugin_set, hashval);
60919+}
60920+
60921+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
60922+{
60923+ plugin_set *set1;
60924+ plugin_set *set2;
60925+
60926+ /* make sure fields are not missed in the code below */
60927+ cassert(sizeof *set1 ==
60928+ sizeof set1->hashval +
60929+ sizeof set1->link +
60930+ sizeof set1->file +
60931+ sizeof set1->dir +
60932+ sizeof set1->perm +
60933+ sizeof set1->formatting +
60934+ sizeof set1->hash +
60935+ sizeof set1->fibration +
60936+ sizeof set1->sd +
60937+ sizeof set1->dir_item +
60938+ sizeof set1->cipher +
60939+ sizeof set1->digest +
60940+ sizeof set1->compression +
60941+ sizeof set1->compression_mode +
60942+ sizeof set1->cluster +
60943+ sizeof set1->create);
60944+
60945+ set1 = cast_to(a1);
60946+ set2 = cast_to(a2);
60947+ return
60948+ set1->hashval == set2->hashval &&
60949+ set1->file == set2->file &&
60950+ set1->dir == set2->dir &&
60951+ set1->perm == set2->perm &&
60952+ set1->formatting == set2->formatting &&
60953+ set1->hash == set2->hash &&
60954+ set1->fibration == set2->fibration &&
60955+ set1->sd == set2->sd &&
60956+ set1->dir_item == set2->dir_item &&
60957+ set1->cipher == set2->cipher &&
60958+ set1->digest == set2->digest &&
60959+ set1->compression == set2->compression &&
60960+ set1->compression_mode == set2->compression_mode &&
60961+ set1->cluster == set2->cluster &&
60962+ set1->create == set2->create;
60963+}
60964+
60965+#define HASH_FIELD(hash, set, field) \
60966+({ \
60967+ (hash) += (unsigned long)(set)->field >> 2; \
60968+})
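+
+/* The shift by 2 above presumably discards the low pointer bits, which are
+ always zero for word-aligned plugin structures and so would carry no hash
+ information. */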
60969+
60970+static inline unsigned long calculate_hash(const plugin_set * set)
60971+{
60972+ unsigned long result;
60973+
60974+ result = 0;
60975+ HASH_FIELD(result, set, file);
60976+ HASH_FIELD(result, set, dir);
60977+ HASH_FIELD(result, set, perm);
60978+ HASH_FIELD(result, set, formatting);
60979+ HASH_FIELD(result, set, hash);
60980+ HASH_FIELD(result, set, fibration);
60981+ HASH_FIELD(result, set, sd);
60982+ HASH_FIELD(result, set, dir_item);
60983+ HASH_FIELD(result, set, cipher);
60984+ HASH_FIELD(result, set, digest);
60985+ HASH_FIELD(result, set, compression);
60986+ HASH_FIELD(result, set, compression_mode);
60987+ HASH_FIELD(result, set, cluster);
60988+ HASH_FIELD(result, set, create);
60989+ return result & (PS_TABLE_SIZE - 1);
60990+}
60991+
60992+static inline unsigned long
60993+pshash(ps_hash_table * table, const unsigned long *a)
60994+{
60995+ return *a;
60996+}
60997+
60998+/* The hash table definition */
60999+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
61000+#define KFREE(ptr, size) kfree(ptr)
61001+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
61002+ pseq);
61003+#undef KFREE
61004+#undef KMALLOC
61005+
61006+static ps_hash_table ps_table;
61007+static plugin_set empty_set = {
61008+ .hashval = 0,
61009+ .file = NULL,
61010+ .dir = NULL,
61011+ .perm = NULL,
61012+ .formatting = NULL,
61013+ .hash = NULL,
61014+ .fibration = NULL,
61015+ .sd = NULL,
61016+ .dir_item = NULL,
61017+ .cipher = NULL,
61018+ .digest = NULL,
61019+ .compression = NULL,
61020+ .compression_mode = NULL,
61021+ .cluster = NULL,
61022+ .create = NULL,
61023+ .link = {NULL}
61024+};
61025+
61026+plugin_set *plugin_set_get_empty(void)
61027+{
61028+ return &empty_set;
61029+}
61030+
61031+void plugin_set_put(plugin_set * set)
61032+{
61033+}
61034+
61035+static inline unsigned long *pset_field(plugin_set * set, int offset)
61036+{
61037+ return (unsigned long *)(((char *)set) + offset);
61038+}
61039+
61040+static int plugin_set_field(plugin_set ** set, const unsigned long val,
61041+ const int offset)
61042+{
61043+ unsigned long *spot;
61044+ spinlock_t *lock;
61045+ plugin_set replica;
61046+ plugin_set *twin;
61047+ plugin_set *psal;
61048+ plugin_set *orig;
61049+
61050+ assert("nikita-2902", set != NULL);
61051+ assert("nikita-2904", *set != NULL);
61052+
61053+ spot = pset_field(*set, offset);
61054+ if (unlikely(*spot == val))
61055+ return 0;
61056+
61057+ replica = *(orig = *set);
61058+ *pset_field(&replica, offset) = val;
61059+ replica.hashval = calculate_hash(&replica);
61060+ rcu_read_lock();
61061+ twin = ps_hash_find(&ps_table, &replica.hashval);
61062+ if (unlikely(twin == NULL)) {
61063+ rcu_read_unlock();
61064+ psal = kmem_cache_alloc(plugin_set_slab,
61065+ reiser4_ctx_gfp_mask_get());
61066+ if (psal == NULL)
61067+ return RETERR(-ENOMEM);
61068+ *psal = replica;
61069+ lock = &plugin_set_lock[replica.hashval & 7];
61070+ spin_lock(lock);
61071+ twin = ps_hash_find(&ps_table, &replica.hashval);
61072+ if (likely(twin == NULL)) {
61073+ *set = psal;
61074+ ps_hash_insert_rcu(&ps_table, psal);
61075+ } else {
61076+ *set = twin;
61077+ kmem_cache_free(plugin_set_slab, psal);
61078+ }
61079+ spin_unlock(lock);
61080+ } else {
61081+ rcu_read_unlock();
61082+ *set = twin;
61083+ }
61084+ return 0;
61085+}
61086+
61087+static struct {
61088+ int offset;
61089+ reiser4_plugin_groups groups;
61090+ reiser4_plugin_type type;
61091+} pset_descr[PSET_LAST] = {
61092+ [PSET_FILE] = {
61093+ .offset = offsetof(plugin_set, file),
61094+ .type = REISER4_FILE_PLUGIN_TYPE,
61095+ .groups = 0
61096+ },
61097+ [PSET_DIR] = {
61098+ .offset = offsetof(plugin_set, dir),
61099+ .type = REISER4_DIR_PLUGIN_TYPE,
61100+ .groups = 0
61101+ },
61102+ [PSET_PERM] = {
61103+ .offset = offsetof(plugin_set, perm),
61104+ .type = REISER4_PERM_PLUGIN_TYPE,
61105+ .groups = 0
61106+ },
61107+ [PSET_FORMATTING] = {
61108+ .offset = offsetof(plugin_set, formatting),
61109+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
61110+ .groups = 0
61111+ },
61112+ [PSET_HASH] = {
61113+ .offset = offsetof(plugin_set, hash),
61114+ .type = REISER4_HASH_PLUGIN_TYPE,
61115+ .groups = 0
61116+ },
61117+ [PSET_FIBRATION] = {
61118+ .offset = offsetof(plugin_set, fibration),
61119+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
61120+ .groups = 0
61121+ },
61122+ [PSET_SD] = {
61123+ .offset = offsetof(plugin_set, sd),
61124+ .type = REISER4_ITEM_PLUGIN_TYPE,
61125+ .groups = (1 << STAT_DATA_ITEM_TYPE)
61126+ },
61127+ [PSET_DIR_ITEM] = {
61128+ .offset = offsetof(plugin_set, dir_item),
61129+ .type = REISER4_ITEM_PLUGIN_TYPE,
61130+ .groups = (1 << DIR_ENTRY_ITEM_TYPE)
61131+ },
61132+ [PSET_CIPHER] = {
61133+ .offset = offsetof(plugin_set, cipher),
61134+ .type = REISER4_CIPHER_PLUGIN_TYPE,
61135+ .groups = 0
61136+ },
61137+ [PSET_DIGEST] = {
61138+ .offset = offsetof(plugin_set, digest),
61139+ .type = REISER4_DIGEST_PLUGIN_TYPE,
61140+ .groups = 0
61141+ },
61142+ [PSET_COMPRESSION] = {
61143+ .offset = offsetof(plugin_set, compression),
61144+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
61145+ .groups = 0
61146+ },
61147+ [PSET_COMPRESSION_MODE] = {
61148+ .offset = offsetof(plugin_set, compression_mode),
61149+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61150+ .groups = 0
61151+ },
61152+ [PSET_CLUSTER] = {
61153+ .offset = offsetof(plugin_set, cluster),
61154+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
61155+ .groups = 0
61156+ },
61157+ [PSET_CREATE] = {
61158+ .offset = offsetof(plugin_set, create),
61159+ .type = REISER4_FILE_PLUGIN_TYPE,
61160+ .groups = (1 << REISER4_REGULAR_FILE)
61161+ }
61162+};
61163+
61164+#define DEFINE_PSET_OPS(PREFIX) \
61165+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
61166+{ \
61167+ if (memb > PSET_LAST) \
61168+ return REISER4_PLUGIN_TYPES; \
61169+ return pset_descr[memb].type; \
61170+} \
61171+ \
61172+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
61173+ reiser4_plugin * plugin) \
61174+{ \
61175+ assert("nikita-3492", set != NULL); \
61176+ assert("nikita-3493", *set != NULL); \
61177+ assert("nikita-3494", plugin != NULL); \
61178+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
61179+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
61180+ \
61181+ if (pset_descr[memb].groups) \
61182+ if (!(pset_descr[memb].groups & plugin->h.groups)) \
61183+ return -EINVAL; \
61184+ \
61185+ return plugin_set_field(set, \
61186+ (unsigned long)plugin, pset_descr[memb].offset); \
61187+} \
61188+ \
61189+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
61190+{ \
61191+ assert("nikita-3497", set != NULL); \
61192+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
61193+ \
61194+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
61195+}
61196+
61197+DEFINE_PSET_OPS(aset);
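+
+/* DEFINE_PSET_OPS(aset) instantiates aset_member_to_type_unsafe(),
+ aset_set_unsafe() and aset_get() as declared in plugin_set.h; e.g. (sketch)
+
+	reiser4_plugin *plug = aset_get(pset, PSET_HASH);
+
+ returns a pointer to the current hash plugin of @pset. */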
61198+
61199+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
61200+ return plugin_set_field(set,
61201+ (unsigned long)plugin, pset_descr[memb].offset);
61202+}
61203+
61204+/**
61205+ * init_plugin_set - create plugin set cache and hash table
61206+ *
61207+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
61208+ * reiser4 module initialization.
61209+ */
61210+int init_plugin_set(void)
61211+{
61212+ int result;
61213+
61214+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61215+ if (result == 0) {
61216+ plugin_set_slab = kmem_cache_create("plugin_set",
61217+ sizeof(plugin_set), 0,
61218+ SLAB_HWCACHE_ALIGN,
61219+ NULL, NULL);
61220+ if (plugin_set_slab == NULL)
61221+ result = RETERR(-ENOMEM);
61222+ }
61223+ return result;
61224+}
61225+
61226+/**
61227+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
61228+ *
61229+ * This is called on reiser4 module unloading or system shutdown.
61230+ */
61231+void done_plugin_set(void)
61232+{
61233+ plugin_set *cur, *next;
61234+
61235+ for_all_in_htable(&ps_table, ps, cur, next) {
61236+ ps_hash_remove(&ps_table, cur);
61237+ kmem_cache_free(plugin_set_slab, cur);
61238+ }
61239+ destroy_reiser4_cache(&plugin_set_slab);
61240+ ps_hash_done(&ps_table);
61241+}
61242+
61243+/*
61244+ * Local variables:
61245+ * c-indentation-style: "K&R"
61246+ * mode-name: "LC"
61247+ * c-basic-offset: 8
61248+ * tab-width: 8
61249+ * fill-column: 120
61250+ * End:
61251+ */
61252diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.20/fs/reiser4/plugin/plugin_set.h
61253--- linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
61254+++ linux-2.6.20/fs/reiser4/plugin/plugin_set.h 2007-05-06 14:50:43.855024468 +0400
61255@@ -0,0 +1,77 @@
61256+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61257+
61258+/* Reiser4 plugin set definition.
61259+ See fs/reiser4/plugin/plugin_set.c for details */
61260+
61261+#if !defined( __PLUGIN_SET_H__ )
61262+#define __PLUGIN_SET_H__
61263+
61264+#include "../type_safe_hash.h"
61265+#include "plugin.h"
61266+
61267+#include <linux/rcupdate.h>
61268+
61269+struct plugin_set;
61270+typedef struct plugin_set plugin_set;
61271+
61272+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61273+
61274+struct plugin_set {
61275+ unsigned long hashval;
61276+ /* plugin of file */
61277+ file_plugin *file;
61278+ /* plugin of dir */
61279+ dir_plugin *dir;
61280+ /* perm plugin for this file */
61281+ perm_plugin *perm;
61282+ /* tail policy plugin. Only meaningful for regular files */
61283+ formatting_plugin *formatting;
61284+ /* hash plugin. Only meaningful for directories. */
61285+ hash_plugin *hash;
61286+ /* fibration plugin. Only meaningful for directories. */
61287+ fibration_plugin *fibration;
61288+ /* plugin of stat-data */
61289+ item_plugin *sd;
61290+ /* plugin of items a directory is built of */
61291+ item_plugin *dir_item;
61292+ /* cipher plugin */
61293+ cipher_plugin *cipher;
61294+ /* digest plugin */
61295+ digest_plugin *digest;
61296+ /* compression plugin */
61297+ compression_plugin *compression;
61298+ /* compression mode plugin */
61299+ compression_mode_plugin *compression_mode;
61300+ /* cluster plugin */
61301+ cluster_plugin *cluster;
61302+ /* this specifies file plugin of regular children.
61303+ only meaningful for directories */
61304+ file_plugin *create;
61305+ ps_hash_link link;
61306+};
61307+
61308+extern plugin_set *plugin_set_get_empty(void);
61309+extern void plugin_set_put(plugin_set * set);
61310+
61311+extern int init_plugin_set(void);
61312+extern void done_plugin_set(void);
61313+
61314+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
61315+extern int set_plugin(plugin_set ** set, pset_member memb,
61316+ reiser4_plugin * plugin);
61317+extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
61318+ reiser4_plugin * plugin);
61319+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
61320+
61321+/* __PLUGIN_SET_H__ */
61322+#endif
61323+
61324+/* Make Linus happy.
61325+ Local variables:
61326+ c-indentation-style: "K&R"
61327+ mode-name: "LC"
61328+ c-basic-offset: 8
61329+ tab-width: 8
61330+ fill-column: 120
61331+ End:
61332+*/
61333diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/Makefile linux-2.6.20/fs/reiser4/plugin/security/Makefile
61334--- linux-2.6.20.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
61335+++ linux-2.6.20/fs/reiser4/plugin/security/Makefile 2007-05-06 14:50:43.855024468 +0400
61336@@ -0,0 +1,4 @@
61337+obj-$(CONFIG_REISER4_FS) += security_plugins.o
61338+
61339+security_plugins-objs := \
61340+ perm.o
61341diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/perm.c linux-2.6.20/fs/reiser4/plugin/security/perm.c
61342--- linux-2.6.20.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
61343+++ linux-2.6.20/fs/reiser4/plugin/security/perm.c 2007-05-06 14:50:43.859025718 +0400
61344@@ -0,0 +1,44 @@
61345+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61346+
61347+/*
61348+ * this file contains the implementation of permission plugins. Currently,
61349+ * only NULL_PERM_ID (a stub) is implemented
61350+ */
61351+
61352+#include "../plugin.h"
61353+#include "../plugin_header.h"
61354+#include "../../debug.h"
61355+
61356+perm_plugin perm_plugins[LAST_PERM_ID] = {
61357+ [NULL_PERM_ID] = {
61358+ .h = {
61359+ .type_id = REISER4_PERM_PLUGIN_TYPE,
61360+ .id = NULL_PERM_ID,
61361+ .pops = NULL,
61362+ .label = "null",
61363+ .desc = "stub permission plugin",
61364+ .linkage = {NULL, NULL}
61365+ },
61366+ .read_ok = NULL,
61367+ .write_ok = NULL,
61368+ .lookup_ok = NULL,
61369+ .create_ok = NULL,
61370+ .link_ok = NULL,
61371+ .unlink_ok = NULL,
61372+ .delete_ok = NULL,
61373+ .mask_ok = NULL,
61374+ .setattr_ok = NULL,
61375+ .getattr_ok = NULL,
61376+ .rename_ok = NULL,
61377+ }
61378+};
61379+
61380+/*
61381+ * Local variables:
61382+ * c-indentation-style: "K&R"
61383+ * mode-name: "LC"
61384+ * c-basic-offset: 8
61385+ * tab-width: 8
61386+ * fill-column: 79
61387+ * End:
61388+ */
61389diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/perm.h linux-2.6.20/fs/reiser4/plugin/security/perm.h
61390--- linux-2.6.20.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
61391+++ linux-2.6.20/fs/reiser4/plugin/security/perm.h 2007-05-06 14:50:43.859025718 +0400
61392@@ -0,0 +1,82 @@
61393+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61394+
61395+/* Perm (short for "permissions") plugins common stuff. */
61396+
61397+#if !defined( __REISER4_PERM_H__ )
61398+#define __REISER4_PERM_H__
61399+
61400+#include "../../forward.h"
61401+#include "../plugin_header.h"
61402+
61403+#include <linux/types.h>
61404+#include <linux/fs.h> /* for struct file */
61405+#include <linux/dcache.h> /* for struct dentry */
61406+
61407+/* interface for perm plugin.
61408+
61409+ Perm plugin method can be implemented through:
61410+
61411+ 1. consulting ->i_mode bits in stat data
61412+
61413+ 2. obtaining acl from the tree and inspecting it
61414+
61415+ 3. asking some kernel module or user-level program to authorize access.
61416+
61417+ This allows for integration with things like capabilities, SELinux-style
61418+ security contexts, etc.
61419+
61420+*/
61421+/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. */
61422+typedef struct perm_plugin {
61423+ /* generic plugin fields */
61424+ plugin_header h;
61425+
61426+ /* check permissions for read/write */
61427+ int (*read_ok) (struct file *file, const char __user *buf,
61428+ size_t size, loff_t *off);
61429+ int (*write_ok) (struct file *file, const char __user *buf,
61430+ size_t size, loff_t *off);
61431+
61432+ /* check permissions for lookup */
61433+ int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
61434+
61435+ /* check permissions for create */
61436+ int (*create_ok) (struct inode * parent, struct dentry * dentry,
61437+ reiser4_object_create_data * data);
61438+
61439+ /* check permissions for linking @where to @existing */
61440+ int (*link_ok) (struct dentry * existing, struct inode * parent,
61441+ struct dentry * where);
61442+
61443+ /* check permissions for unlinking @victim from @parent */
61444+ int (*unlink_ok) (struct inode * parent, struct dentry * victim);
61445+
61446+ /* check permissions for deletion of @object whose last reference is
61447+ by @parent */
61448+ int (*delete_ok) (struct inode * parent, struct dentry * victim);
61449+ int (*mask_ok) (struct inode * inode, int mask);
61450+ /* check whether attribute change is acceptable */
61451+ int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
61452+
61453+ /* check whether stat(2) is allowed */
61454+ int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
61455+ struct dentry * dentry, struct kstat * stat);
61456+ /* check whether rename(2) is allowed */
61457+ int (*rename_ok) (struct inode * old_dir, struct dentry * old,
61458+ struct inode * new_dir, struct dentry * new);
61459+} perm_plugin;
61460+
61461+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61462+
61463+/* __REISER4_PERM_H__ */
61464+#endif
61465+
61466+/* Make Linus happy.
61467+ Local variables:
61468+ c-indentation-style: "K&R"
61469+ mode-name: "LC"
61470+ c-basic-offset: 8
61471+ tab-width: 8
61472+ fill-column: 120
61473+ End:
61474+*/
61475diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.20/fs/reiser4/plugin/space/bitmap.c
61476--- linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
61477+++ linux-2.6.20/fs/reiser4/plugin/space/bitmap.c 2007-05-06 14:50:43.859025718 +0400
61478@@ -0,0 +1,1585 @@
61479+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61480+
61481+#include "../../debug.h"
61482+#include "../../dformat.h"
61483+#include "../../txnmgr.h"
61484+#include "../../jnode.h"
61485+#include "../../block_alloc.h"
61486+#include "../../tree.h"
61487+#include "../../super.h"
61488+#include "../plugin.h"
61489+#include "space_allocator.h"
61490+#include "bitmap.h"
61491+
61492+#include <linux/types.h>
61493+#include <linux/fs.h> /* for struct super_block */
61494+#include <linux/mutex.h>
61495+#include <asm/div64.h>
61496+
61497+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61498+ * blocks
61499+
61500+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap
61501+ blocks loading/unloading which is different from v3.x where all bitmap
61502+ blocks are loaded at mount time.
61503+
61504+ To implement bitmap block unloading we need to count bitmap block usage
61505+ and detect currently unused blocks, allowing them to be unloaded. It is not
61506+ a simple task, since we allow several threads to modify one bitmap block
61507+ simultaneously.
61508+
61509+ Briefly, the following scheme is proposed: we keep, in a special variable
61510+ associated with each bitmap block, a count of block alloc/dealloc
61511+ operations on that bitmap block. With the deferred block deallocation
61512+ feature of reiser4, all those operations are represented in the atom
61513+ dirty/deleted lists as jnodes for freshly allocated or deleted
61514+ nodes.
61515+
61516+ So, we increment the usage counter for each node allocated or deleted, and
61517+ decrement it at atom commit, once for each node on the atom's dirty/deleted
61518+ list. Of course, deletion of a freshly allocated node, and node reuse from
61519+ the atom's deleted list (if we do so), should also decrement the bitmap
61520+ usage counter.
61521+
61522+ This scheme seems workable, but such reference counting is
61523+ not easy to debug. I think we should agree with Hans and not implement
61524+ it in v4.0. The current code implements "on-demand" bitmap block loading only.
61525+
61526+ For simplicity, all bitmap nodes (both commit and working bitmap blocks)
61527+ are loaded into memory at fs mount time, or each bitmap node is loaded on
61528+ first access to it; the "dont_load_bitmap" mount option controls whether
61529+ bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
61530+ nodes is currently not supported. */
61531+
61532+#define CHECKSUM_SIZE 4
61533+
61534+#define BYTES_PER_LONG (sizeof(long))
61535+
61536+#if BITS_PER_LONG == 64
61537+# define LONG_INT_SHIFT (6)
61538+#else
61539+# define LONG_INT_SHIFT (5)
61540+#endif
61541+
61542+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61543+
61544+typedef unsigned long ulong_t;
61545+
61546+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
61547+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
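+
+/* E.g. for a 4096-byte block: bmap_size() = 4092 bytes of bitmap proper
+ after the 4-byte checksum, so bmap_bit_count() = 32736 bits, i.e. one
+ bitmap block tracks the state of 32736 filesystem blocks. */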
61548+
61549+/* Block allocation/deallocation are done through special bitmap objects which
61550+ are allocated in an array at fs mount. */
61551+struct bitmap_node {
61552+ struct mutex mutex; /* long term lock object */
61553+
61554+ jnode *wjnode; /* j-nodes for WORKING ... */
61555+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
61556+
61557+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
61558+
61559+ atomic_t loaded; /* a flag which shows that bnode is loaded
61560+ * already */
61561+};
61562+
61563+static inline char *bnode_working_data(struct bitmap_node *bnode)
61564+{
61565+ char *data;
61566+
61567+ data = jdata(bnode->wjnode);
61568+ assert("zam-429", data != NULL);
61569+
61570+ return data + CHECKSUM_SIZE;
61571+}
61572+
61573+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61574+{
61575+ char *data;
61576+
61577+ data = jdata(bnode->cjnode);
61578+ assert("zam-430", data != NULL);
61579+
61580+ return data + CHECKSUM_SIZE;
61581+}
61582+
61583+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61584+{
61585+ char *data;
61586+
61587+ data = jdata(bnode->cjnode);
61588+ assert("vpf-261", data != NULL);
61589+
61590+ return le32_to_cpu(get_unaligned((d32 *)data));
61591+}
61592+
61593+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
61594+{
61595+ char *data;
61596+
61597+ data = jdata(bnode->cjnode);
61598+ assert("vpf-261", data != NULL);
61599+
61600+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
61601+}
61602+
61603+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
61604+ * written the code, does this added abstraction still have */
61605+/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
61606+ * reiser4_space_allocator structure) */
61607+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
61608+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
61609+ * someday?". What they about? If there is a reason to have a union, it should
61610+ * be a union, if not, it should not be a union. "..might be someday" means no
61611+ * reason. */
61612+struct bitmap_allocator_data {
61613+ /* an array for bitmap blocks direct access */
61614+ struct bitmap_node *bitmap;
61615+};
61616+
61617+#define get_barray(super) \
61618+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
61619+
61620+#define get_bnode(super, i) (get_barray(super) + i)
61621+
61622+/* allocate and initialize jnode with JNODE_BITMAP type */
61623+static jnode *bnew(void)
61624+{
61625+ jnode *jal = jalloc();
61626+
61627+ if (jal)
61628+ jnode_init(jal, current_tree, JNODE_BITMAP);
61629+
61630+ return jal;
61631+}
61632+
61633+/* this file contains:
61634+ - bitmap based implementation of space allocation plugin
61635+ - all the helper functions like set bit, find_first_zero_bit, etc */
61636+
61637+/* Audited by: green(2002.06.12) */
61638+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
61639+{
61640+ ulong_t mask = 1UL << start_bit;
61641+ int i = start_bit;
61642+
61643+ while ((word & mask) != 0) {
61644+ mask <<= 1;
61645+ if (++i >= BITS_PER_LONG)
61646+ break;
61647+ }
61648+
61649+ return i;
61650+}
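+
+/* Example: find_next_zero_bit_in_word(0x7, 0) tests bits 0..2 (all set)
+ and returns 3, the offset of the first clear bit; if no clear bit exists
+ at or above @start_bit, the loop stops and returns BITS_PER_LONG. */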
61651+
61652+#include <asm/bitops.h>
61653+
61654+#if BITS_PER_LONG == 64
61655+
61656+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
61657+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
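+
+/* On 64-bit platforms the ext2_* bit operations may require long-aligned
+ addresses; OFF()/BASE() rebase an arbitrary pointer to the preceding
+ long boundary and compensate with a bit offset. */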
61658+
61659+static inline void reiser4_set_bit(int nr, void *addr)
61660+{
61661+ ext2_set_bit(nr + OFF(addr), BASE(addr));
61662+}
61663+
61664+static inline void reiser4_clear_bit(int nr, void *addr)
61665+{
61666+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
61667+}
61668+
61669+static inline int reiser4_test_bit(int nr, void *addr)
61670+{
61671+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
61672+}
61673+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
61674+ int offset)
61675+{
61676+ int off = OFF(addr);
61677+
61678+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
61679+ offset + off) - off;
61680+}
61681+
61682+#else
61683+
61684+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
61685+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
61686+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
61687+
61688+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
61689+ext2_find_next_zero_bit(addr, maxoffset, offset)
61690+#endif
61691+
61692+/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
61693+ * are counted from @addr. Return the offset of the first set bit if one is
61694+ * found, @max_offset otherwise. */
61695+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61696+ bmap_off_t start_offset)
61697+{
61698+ ulong_t *base = addr;
61699+	/* start_offset is in bits, convert it to a word offset within the bitmap. */
61700+ int word_nr = start_offset >> LONG_INT_SHIFT;
61701+	/* bit number within the word. */
61702+ int bit_nr = start_offset & LONG_INT_MASK;
61703+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
61704+
61705+ assert("zam-387", max_offset != 0);
61706+
61707+ /* Unaligned @start_offset case. */
61708+ if (bit_nr != 0) {
61709+ bmap_nr_t nr;
61710+
61711+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
61712+
61713+ if (nr < BITS_PER_LONG)
61714+ return (word_nr << LONG_INT_SHIFT) + nr;
61715+
61716+ ++word_nr;
61717+ }
61718+
61719+	/* Fast scan through aligned words. */
61720+ while (word_nr <= max_word_nr) {
61721+ if (base[word_nr] != 0) {
61722+ return (word_nr << LONG_INT_SHIFT)
61723+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
61724+ }
61725+
61726+ ++word_nr;
61727+ }
61728+
61729+ return max_offset;
61730+}
61731+
61732+#if BITS_PER_LONG == 64
61733+
61734+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61735+ bmap_off_t start_offset)
61736+{
61737+ bmap_off_t off = OFF(addr);
61738+
61739+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
61740+ start_offset + off) - off;
61741+}
61742+
61743+#else
61744+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
61745+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
61746+#endif
61747+
61748+/* search downward from @start_bit for the last (highest) set bit in a single word. */
61749+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
61750+{
61751+ ulong_t bit_mask;
61752+ int nr = start_bit;
61753+
61754+ assert("zam-965", start_bit < BITS_PER_LONG);
61755+ assert("zam-966", start_bit >= 0);
61756+
61757+ bit_mask = (1UL << nr);
61758+
61759+ while (bit_mask != 0) {
61760+ if (bit_mask & word)
61761+ return nr;
61762+ bit_mask >>= 1;
61763+ nr--;
61764+ }
61765+ return BITS_PER_LONG;
61766+}
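+
+/* Example: find_last_set_bit_in_word(0x4, 3) tests bits 3 and 2 and
+ returns 2; when no set bit exists at or below @start_bit, the function
+ returns BITS_PER_LONG. */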
61767+
61768+/* Search bitmap for a set bit in backward direction from the end to the
61769+ * beginning of given region
61770+ *
61771+ * @result: result offset of the last set bit
61772+ * @addr: base memory address,
61773+ * @low_off: low end of the search region, edge bit included into the region,
61774+ * @high_off: high end of the search region, edge bit included into the region,
61775+ *
61776+ * @return: 0 - set bit was found, -1 otherwise.
61777+ */
61778+static int
61779+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61780+ bmap_off_t high_off)
61781+{
61782+ ulong_t *base = addr;
61783+ int last_word;
61784+ int first_word;
61785+ int last_bit;
61786+ int nr;
61787+
61788+ assert("zam-962", high_off >= low_off);
61789+
61790+ last_word = high_off >> LONG_INT_SHIFT;
61791+ last_bit = high_off & LONG_INT_MASK;
61792+ first_word = low_off >> LONG_INT_SHIFT;
61793+
61794+ if (last_bit < BITS_PER_LONG) {
61795+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
61796+ if (nr < BITS_PER_LONG) {
61797+ *result = (last_word << LONG_INT_SHIFT) + nr;
61798+ return 0;
61799+ }
61800+ --last_word;
61801+ }
61802+ while (last_word >= first_word) {
61803+ if (base[last_word] != 0x0) {
61804+ last_bit =
61805+ find_last_set_bit_in_word(base[last_word],
61806+ BITS_PER_LONG - 1);
61807+ assert("zam-972", last_bit < BITS_PER_LONG);
61808+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
61809+ return 0;
61810+ }
61811+ --last_word;
61812+ }
61813+
61814+ return -1; /* set bit not found */
61815+}
61816+
61817+/* Search bitmap for a clear bit in backward direction from the end to the
61818+ * beginning of given region */
61819+static int
61820+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61821+ bmap_off_t high_off)
61822+{
61823+ ulong_t *base = addr;
61824+ int last_word;
61825+ int first_word;
61826+ int last_bit;
61827+ int nr;
61828+
61829+ last_word = high_off >> LONG_INT_SHIFT;
61830+ last_bit = high_off & LONG_INT_MASK;
61831+ first_word = low_off >> LONG_INT_SHIFT;
61832+
61833+ if (last_bit < BITS_PER_LONG) {
61834+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
61835+ if (nr < BITS_PER_LONG) {
61836+ *result = (last_word << LONG_INT_SHIFT) + nr;
61837+ return 0;
61838+ }
61839+ --last_word;
61840+ }
61841+ while (last_word >= first_word) {
61842+ if (base[last_word] != (ulong_t) (-1)) {
61843+ *result = (last_word << LONG_INT_SHIFT) +
61844+ find_last_set_bit_in_word(~base[last_word],
61845+ BITS_PER_LONG - 1);
61846+ return 0;
61847+ }
61848+ --last_word;
61849+ }
61850+
61851+ return -1; /* zero bit not found */
61852+}
61853+
61854+/* Audited by: green(2002.06.12) */
61855+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
61856+{
61857+ int first_byte;
61858+ int last_byte;
61859+
61860+ unsigned char first_byte_mask = 0xFF;
61861+ unsigned char last_byte_mask = 0xFF;
61862+
61863+ assert("zam-410", start < end);
61864+
61865+ first_byte = start >> 3;
61866+ last_byte = (end - 1) >> 3;
61867+
61868+ if (last_byte > first_byte + 1)
61869+ memset(addr + first_byte + 1, 0,
61870+ (size_t) (last_byte - first_byte - 1));
61871+
61872+ first_byte_mask >>= 8 - (start & 0x7);
61873+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
61874+
61875+ if (first_byte == last_byte) {
61876+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
61877+ } else {
61878+ addr[first_byte] &= first_byte_mask;
61879+ addr[last_byte] &= last_byte_mask;
61880+ }
61881+}
61882+
61883+/* Audited by: green(2002.06.12) */
61884+/* ZAM-FIXME-HANS: comment this */
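+/* Set all bits in the half-open range [@start, @end) of the bitmap at @addr.
+   Whole bytes strictly between the first and the last affected byte are
+   memset() to 0xFF; the partial edge bytes are OR-ed with masks, e.g. for
+   start == 3, end == 10 the masks are 0xF8 for byte 0 and 0x03 for byte 1.
+   reiser4_clear_bits() above is the mirror of this. */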
61885+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
61886+{
61887+ int first_byte;
61888+ int last_byte;
61889+
61890+ unsigned char first_byte_mask = 0xFF;
61891+ unsigned char last_byte_mask = 0xFF;
61892+
61893+ assert("zam-386", start < end);
61894+
61895+ first_byte = start >> 3;
61896+ last_byte = (end - 1) >> 3;
61897+
61898+ if (last_byte > first_byte + 1)
61899+ memset(addr + first_byte + 1, 0xFF,
61900+ (size_t) (last_byte - first_byte - 1));
61901+
61902+ first_byte_mask <<= start & 0x7;
61903+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
61904+
61905+ if (first_byte == last_byte) {
61906+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
61907+ } else {
61908+ addr[first_byte] |= first_byte_mask;
61909+ addr[last_byte] |= last_byte_mask;
61910+ }
61911+}
61912+
61913+#define ADLER_BASE 65521
61914+#define ADLER_NMAX 5552
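+/* ADLER_NMAX is inherited from zlib: it is the largest n for which
+   255*n*(n+1)/2 + (n+1)*(ADLER_BASE-1) still fits in 32 bits, i.e. the
+   longest run of input bytes for which the %= reductions below may be
+   deferred without overflow. */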
61915+
61916+/* Calculates the adler32 checksum for the data pointed to by `data`, of
61917+ length `len`. This function was originally taken from zlib, version 1.1.3,
61918+ July 9th, 1998.
61919+
61920+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
61921+
61922+ This software is provided 'as-is', without any express or implied
61923+ warranty. In no event will the authors be held liable for any damages
61924+ arising from the use of this software.
61925+
61926+ Permission is granted to anyone to use this software for any purpose,
61927+ including commercial applications, and to alter it and redistribute it
61928+ freely, subject to the following restrictions:
61929+
61930+ 1. The origin of this software must not be misrepresented; you must not
61931+ claim that you wrote the original software. If you use this software
61932+ in a product, an acknowledgment in the product documentation would be
61933+ appreciated but is not required.
61934+ 2. Altered source versions must be plainly marked as such, and must not be
61935+ misrepresented as being the original software.
61936+ 3. This notice may not be removed or altered from any source distribution.
61937+
61938+ Jean-loup Gailly Mark Adler
61939+ jloup@gzip.org madler@alumni.caltech.edu
61940+
61941+ The above comment applies only to the reiser4_adler32 function.
61942+*/
61943+
61944+__u32 reiser4_adler32(char *data, __u32 len)
61945+{
61946+ unsigned char *t = data;
61947+ __u32 s1 = 1;
61948+ __u32 s2 = 0;
61949+ int k;
61950+
61951+ while (len > 0) {
61952+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
61953+ len -= k;
61954+
61955+ while (k--) {
61956+ s1 += *t++;
61957+ s2 += s1;
61958+ }
61959+
61960+ s1 %= ADLER_BASE;
61961+ s2 %= ADLER_BASE;
61962+ }
61963+ return (s2 << 16) | s1;
61964+}
61965+
61966+#define sb_by_bnode(bnode) \
61967+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
61968+
61969+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
61970+{
61971+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
61972+}
61973+
61974+static int
61975+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
61976+{
61977+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
61978+ bmap_nr_t bmap;
61979+
61980+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
61981+
61982+ warning("vpf-263",
61983+ "Checksum for the bitmap block %llu is incorrect",
61984+ bmap);
61985+
61986+ return RETERR(-EIO);
61987+ }
61988+
61989+ return 0;
61990+}
61991+
61992+#define REISER4_CHECK_BMAP_CRC (0)
61993+
61994+#if REISER4_CHECK_BMAP_CRC
61995+static int bnode_check_crc(const struct bitmap_node *bnode)
61996+{
61997+ return bnode_check_adler32(bnode,
61998+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
61999+}
62000+
62001+/* REISER4_CHECK_BMAP_CRC */
62002+#else
62003+
62004+#define bnode_check_crc(bnode) (0)
62005+
62006+/* REISER4_CHECK_BMAP_CRC */
62007+#endif
62008+
62009+/* Recalculates the adler32 checksum after a single byte change.
62010+ adler - previous adler checksum
62011+ old_data, data - old and new values of the changed byte.
62012+ tail == (chunk - offset), where chunk is the length the checksum was
62013+ calculated over and offset is the position of the changed byte in it.
62014+ This function can be used to optimise checksum calculation.
62015+*/
62016+
62017+static __u32
62018+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62019+ __u32 tail)
62020+{
62021+ __u32 delta = data - old_data + 2 * ADLER_BASE;
62022+ __u32 s1 = adler & 0xffff;
62023+ __u32 s2 = (adler >> 16) & 0xffff;
62024+
62025+ s1 = (delta + s1) % ADLER_BASE;
62026+ s2 = (delta * tail + s2) % ADLER_BASE;
62027+
62028+ return (s2 << 16) | s1;
62029+}
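+/* Why this works: for adler32, s1 is the running byte sum and s2 is the sum
+   of all intermediate s1 values. A byte changed by delta = data - old_data
+   shifts s1 by delta, and shifts s2 by delta once for every position from
+   the changed byte to the end of the chunk, i.e. by delta * tail. Adding
+   2 * ADLER_BASE (== 0 mod ADLER_BASE) keeps the unsigned arithmetic
+   non-negative before the modulo, since |data - old_data| < ADLER_BASE. */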
62030+
62031+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62032+
62033+/**
62034+ * get_nr_bmap - calculate number of bitmap blocks
62035+ * @super: super block with initialized blocksize and block count
62036+ *
62037+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
62038+ * to maintain free disk space. It assumes that each bitmap addresses the
62039+ * same number of blocks, which is calculated by the bmap_bit_count macro
62040+ * defined above. The number of blocks in the filesystem has to be already
62041+ * initialized in the reiser4 private data of the super block so that it can
62042+ * be obtained via reiser4_block_count(). Unfortunately, the number of blocks
62043+ * addressed by a bitmap is not a power of 2, because 4 bytes are used for
62044+ * the checksum. Therefore we have to use special functions to divide and
62045+ * modulo 64-bit filesystem block counters.
62046+ *
62047+ * Example: suppose the filesystem has 32768 blocks. Blocksize is 4096. Each bitmap
62048+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62049+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62050+ */
62051+static bmap_nr_t get_nr_bmap(const struct super_block *super)
62052+{
62053+ u64 quotient;
62054+
62055+ assert("zam-393", reiser4_block_count(super) != 0);
62056+
62057+ quotient = reiser4_block_count(super) - 1;
62058+ do_div(quotient, bmap_bit_count(super->s_blocksize));
62059+ return quotient + 1;
62060+}
62061+
62062+/**
62063+ * parse_blocknr - calculate bitmap number and offset in it by block number
62064+ * @block: pointer to block number to calculate location in bitmap of
62065+ * @bmap: pointer where to store bitmap block number
62066+ * @offset: pointer where to store offset within bitmap block
62067+ *
62068+ * Calculates location of bit which is responsible for allocation/freeing of
62069+ * block @*block. That location is represented by bitmap block number and offset
62070+ * within that bitmap block.
62071+ */
62072+static void
62073+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62074+ bmap_off_t *offset)
62075+{
62076+ struct super_block *super = get_current_context()->super;
62077+ u64 quotient = *block;
62078+
62079+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62080+ *bmap = quotient;
62081+
62082+ assert("zam-433", *bmap < get_nr_bmap(super));
62083+ assert("", *offset < bmap_bit_count(super->s_blocksize));
62084+}
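+/* e.g. with a 4096-byte block size each bitmap covers (4096 - 4) * 8 = 32736
+   blocks, so block 40000 parses to *bmap == 1, *offset == 7264. */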
62085+
62086+#if REISER4_DEBUG
62087+/* Audited by: green(2002.06.12) */
62088+static void
62089+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62090+{
62091+ struct super_block *sb = reiser4_get_current_sb();
62092+
62093+ assert("zam-436", sb != NULL);
62094+
62095+ assert("zam-455", start != NULL);
62096+ assert("zam-437", *start != 0);
62097+ assert("zam-541", !reiser4_blocknr_is_fake(start));
62098+ assert("zam-441", *start < reiser4_block_count(sb));
62099+
62100+ if (len != NULL) {
62101+ assert("zam-438", *len != 0);
62102+ assert("zam-442", *start + *len <= reiser4_block_count(sb));
62103+ }
62104+}
62105+
62106+static void check_bnode_loaded(const struct bitmap_node *bnode)
62107+{
62108+ assert("zam-485", bnode != NULL);
62109+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62110+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62111+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62112+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62113+}
62114+
62115+#else
62116+
62117+# define check_block_range(start, len) do { /* nothing */} while(0)
62118+# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
62119+
62120+#endif
62121+
62122+/* modify bnode->first_zero_bit (if we free bits before); bnode should be
62123+ spin-locked */
62124+static inline void
62125+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62126+{
62127+ if (offset < bnode->first_zero_bit)
62128+ bnode->first_zero_bit = offset;
62129+}
62130+
62131+/* return a physical disk address for logical bitmap number @bmap */
62132+/* FIXME-VS: this is somehow related to disk layout? */
62133+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62134+ * per block allocation so that performance is not affected. Probably this
62135+ * whole file should be considered part of the disk layout plugin, and other
62136+ * disk layouts can use other defines and efficiency will not be significantly
62137+ * affected. */
62138+
62139+#define REISER4_FIRST_BITMAP_BLOCK \
62140+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62141+
62142+/* Audited by: green(2002.06.12) */
62143+static void
62144+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62145+ reiser4_block_nr * bnr)
62146+{
62147+
62148+ assert("zam-390", bmap < get_nr_bmap(super));
62149+
62150+#ifdef CONFIG_REISER4_BADBLOCKS
62151+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62152+ /* Check if the diskmap has this already, first. */
62153+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62154+ return; /* Found it in diskmap */
62155+#endif
62156+ /* FIXME_ZAM: until disk layouts and disk format plugins are
62157+ discussed, I implement a bitmap location scheme which is close to
62158+ the scheme used in reiser 3.6 */
62159+ if (bmap == 0) {
62160+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
62161+ } else {
62162+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
62163+ }
62164+}
62165+
62166+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62167+/* Audited by: green(2002.06.12) */
62168+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62169+{
62170+ *bnr =
62171+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62172+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62173+}
62174+
62175+/* bnode structure initialization */
62176+static void
62177+init_bnode(struct bitmap_node *bnode,
62178+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62179+{
62180+ memset(bnode, 0, sizeof(struct bitmap_node));
62181+
62182+ mutex_init(&bnode->mutex);
62183+ atomic_set(&bnode->loaded, 0);
62184+}
62185+
62186+static void release(jnode * node)
62187+{
62188+ jrelse(node);
62189+ JF_SET(node, JNODE_HEARD_BANSHEE);
62190+ jput(node);
62191+}
62192+
62193+/* This function is for internal bitmap.c use because it assumes that jnode is
62194+ under full control of this thread */
62195+static void done_bnode(struct bitmap_node *bnode)
62196+{
62197+ if (bnode) {
62198+ atomic_set(&bnode->loaded, 0);
62199+ if (bnode->wjnode != NULL)
62200+ release(bnode->wjnode);
62201+ if (bnode->cjnode != NULL)
62202+ release(bnode->cjnode);
62203+ bnode->wjnode = bnode->cjnode = NULL;
62204+ }
62205+}
62206+
62207+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
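+/* Allocate the pair of jnodes backing one bitmap block: the on-disk COMMIT
+   bitmap jnode (*cjnode_ret, read in with jload_gfp()) and its in-memory
+   WORKING copy (*wjnode_ret, initialized with jinit_new()). Returns 0 on
+   success; if loading or initialization fails, both jnodes are dropped and
+   both result pointers are set to NULL. */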
62208+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
62209+ jnode **wjnode_ret)
62210+{
62211+ struct super_block *super;
62212+ jnode *cjnode;
62213+ jnode *wjnode;
62214+ bmap_nr_t bmap;
62215+ int ret;
62216+
62217+ super = reiser4_get_current_sb();
62218+
62219+ *wjnode_ret = wjnode = bnew();
62220+ if (wjnode == NULL) {
62221+ *cjnode_ret = NULL;
62222+ return RETERR(-ENOMEM);
62223+ }
62224+
62225+ *cjnode_ret = cjnode = bnew();
62226+ if (cjnode == NULL)
62227+ return RETERR(-ENOMEM);
62228+
62229+ bmap = bnode - get_bnode(super, 0);
62230+
62231+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62232+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62233+
62234+ jref(cjnode);
62235+ jref(wjnode);
62236+
62237+ /* load commit bitmap */
62238+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
62239+
62240+ if (ret)
62241+ goto error;
62242+
62243+ /* allocate memory for working bitmap block. Note that for
62244+ * bitmaps jinit_new() doesn't actually modify node content,
62245+ * so parallel calls to this are ok. */
62246+ ret = jinit_new(wjnode, GFP_NOFS);
62247+
62248+ if (ret != 0) {
62249+ jrelse(cjnode);
62250+ goto error;
62251+ }
62252+
62253+ return 0;
62254+
62255+ error:
62256+ jput(cjnode);
62257+ jput(wjnode);
62258+ *wjnode_ret = *cjnode_ret = NULL;
62259+ return ret;
62260+
62261+}
62262+
62263+/* Check the bnode data on read. */
62264+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62265+{
62266+ void *data;
62267+ int ret;
62268+
62269+ /* Check CRC */
62270+ ret = bnode_check_adler32(bnode, blksize);
62271+
62272+ if (ret) {
62273+ return ret;
62274+ }
62275+
62276+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
62277+
62278+ /* Check the very first bit -- it must be busy. */
62279+ if (!reiser4_test_bit(0, data)) {
62280+ warning("vpf-1362", "The allocator block %llu is not marked "
62281+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
62282+
62283+ return -EINVAL;
62284+ }
62285+
62286+ return 0;
62287+}
62288+
62289+/* load bitmap blocks "on-demand" */
62290+static int load_and_lock_bnode(struct bitmap_node *bnode)
62291+{
62292+ int ret;
62293+
62294+ jnode *cjnode;
62295+ jnode *wjnode;
62296+
62297+ assert("nikita-3040", reiser4_schedulable());
62298+
62299+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62300+ * need to be atomic, right? Just leave a comment that if bitmaps were
62301+ * unloadable, this would need to be atomic. */
62302+ if (atomic_read(&bnode->loaded)) {
62303+ /* bitmap is already loaded, nothing to do */
62304+ check_bnode_loaded(bnode);
62305+ mutex_lock(&bnode->mutex);
62306+ assert("nikita-2827", atomic_read(&bnode->loaded));
62307+ return 0;
62308+ }
62309+
62310+ ret = prepare_bnode(bnode, &cjnode, &wjnode);
62311+ if (ret == 0) {
62312+ mutex_lock(&bnode->mutex);
62313+
62314+ if (!atomic_read(&bnode->loaded)) {
62315+ assert("nikita-2822", cjnode != NULL);
62316+ assert("nikita-2823", wjnode != NULL);
62317+ assert("nikita-2824", jnode_is_loaded(cjnode));
62318+ assert("nikita-2825", jnode_is_loaded(wjnode));
62319+
62320+ bnode->wjnode = wjnode;
62321+ bnode->cjnode = cjnode;
62322+
62323+ ret = check_struct_bnode(bnode, current_blocksize);
62324+ if (!ret) {
62325+ cjnode = wjnode = NULL;
62326+ atomic_set(&bnode->loaded, 1);
62327+ /* working bitmap is initialized by on-disk
62328+ * commit bitmap. This should be performed
62329+ * under mutex. */
62330+ memcpy(bnode_working_data(bnode),
62331+ bnode_commit_data(bnode),
62332+ bmap_size(current_blocksize));
62333+ } else
62334+ mutex_unlock(&bnode->mutex);
62335+ } else
62336+ /* race: someone already loaded bitmap while we were
62337+ * busy initializing data. */
62338+ check_bnode_loaded(bnode);
62339+ }
62340+
62341+ if (wjnode != NULL) {
62342+ release(wjnode);
62343+ bnode->wjnode = NULL;
62344+ }
62345+ if (cjnode != NULL) {
62346+ release(cjnode);
62347+ bnode->cjnode = NULL;
62348+ }
62349+
62350+ return ret;
62351+}
62352+
62353+static void release_and_unlock_bnode(struct bitmap_node *bnode)
62354+{
62355+ check_bnode_loaded(bnode);
62356+ mutex_unlock(&bnode->mutex);
62357+}
62358+
62359+/* This function does all block allocation work but only for one bitmap
62360+ block.*/
62361+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62362+ block responsibility zone boundaries. This made no sense in v3.6 but may
62363+ make sense in v4.x */
62364+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
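+/* The loop below repeatedly locates a zero run at or after *offset in the
+   working bitmap: reiser4_find_next_zero_bit() gives the run start,
+   reiser4_find_next_set_bit() gives its end (capped at start + max_len).
+   A run of at least min_len bits is marked busy with reiser4_set_bits();
+   *offset is set to the run start and the run length is returned. 0 is
+   returned when no suitable run exists below max_offset. */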
62365+static int
62366+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62367+ bmap_off_t max_offset, int min_len, int max_len)
62368+{
62369+ struct super_block *super = get_current_context()->super;
62370+ struct bitmap_node *bnode = get_bnode(super, bmap);
62371+
62372+ char *data;
62373+
62374+ bmap_off_t search_end;
62375+ bmap_off_t start;
62376+ bmap_off_t end;
62377+
62378+ int set_first_zero_bit = 0;
62379+
62380+ int ret;
62381+
62382+ assert("zam-364", min_len > 0);
62383+ assert("zam-365", max_len >= min_len);
62384+ assert("zam-366", *offset <= max_offset);
62385+
62386+ ret = load_and_lock_bnode(bnode);
62387+
62388+ if (ret)
62389+ return ret;
62390+
62391+ data = bnode_working_data(bnode);
62392+
62393+ start = *offset;
62394+
62395+ if (bnode->first_zero_bit >= start) {
62396+ start = bnode->first_zero_bit;
62397+ set_first_zero_bit = 1;
62398+ }
62399+
62400+ while (start + min_len < max_offset) {
62401+
62402+ start =
62403+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
62404+ if (set_first_zero_bit) {
62405+ bnode->first_zero_bit = start;
62406+ set_first_zero_bit = 0;
62407+ }
62408+ if (start >= max_offset)
62409+ break;
62410+
62411+ search_end = LIMIT(start + max_len, max_offset);
62412+ end =
62413+ reiser4_find_next_set_bit((long *)data, search_end, start);
62414+ if (end >= start + min_len) {
62415+ /* we can't trust the find_next_set_bit result if no set
62416+ bit was found: the result may be bigger than
62417+ max_offset */
62418+ if (end > search_end)
62419+ end = search_end;
62420+
62421+ ret = end - start;
62422+ *offset = start;
62423+
62424+ reiser4_set_bits(data, start, end);
62425+
62426+ /* FIXME: we may advance first_zero_bit if [start,
62427+ end] region overlaps the first_zero_bit point */
62428+
62429+ break;
62430+ }
62431+
62432+ start = end + 1;
62433+ }
62434+
62435+ release_and_unlock_bnode(bnode);
62436+
62437+ return ret;
62438+}
62439+
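+/* Mirror of search_one_bitmap_forward(): scan one bitmap block downward from
+   *start_offset towards end_offset for a zero run of at least min_len bits
+   (at most max_len bits are taken), mark the run as busy and return its
+   length, storing the lowest bit of the run in *start_offset. Returns 0 if
+   no suitable run is found. */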
62440+static int
62441+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62442+ bmap_off_t end_offset, int min_len, int max_len)
62443+{
62444+ struct super_block *super = get_current_context()->super;
62445+ struct bitmap_node *bnode = get_bnode(super, bmap);
62446+ char *data;
62447+ bmap_off_t start;
62448+ int ret;
62449+
62450+ assert("zam-958", min_len > 0);
62451+ assert("zam-959", max_len >= min_len);
62452+ assert("zam-960", *start_offset >= end_offset);
62453+
62454+ ret = load_and_lock_bnode(bnode);
62455+ if (ret)
62456+ return ret;
62457+
62458+ data = bnode_working_data(bnode);
62459+ start = *start_offset;
62460+
62461+ while (1) {
62462+ bmap_off_t end, search_end;
62463+
62464+ /* Find the beginning of the zero filled region */
62465+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62466+ break;
62467+ /* Are there at least `min_len' bits from `start' down to
62468+ * `end_offset'? */
62469+ if (start < end_offset + min_len - 1)
62470+ break;
62471+
62472+ /* Do not search all the way down to `end_offset' when at
62473+ * most `max_len' zero bits can be taken anyway. */
62474+ if (end_offset + max_len - 1 < start)
62475+ search_end = start - max_len + 1;
62476+ else
62477+ search_end = end_offset;
62478+
62479+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
62480+ end = search_end;
62481+ else
62482+ end++;
62483+
62484+ if (end + min_len <= start + 1) {
62485+ if (end < search_end)
62486+ end = search_end;
62487+ ret = start - end + 1;
62488+ *start_offset = end; /* `end' is lowest offset */
62489+ assert("zam-987",
62490+ reiser4_find_next_set_bit(data, start + 1,
62491+ end) >= start + 1);
62492+ reiser4_set_bits(data, end, start + 1);
62493+ break;
62494+ }
62495+
62496+ if (end <= end_offset)
62497+ /* left search boundary reached. */
62498+ break;
62499+ start = end - 1;
62500+ }
62501+
62502+ release_and_unlock_bnode(bnode);
62503+ return ret;
62504+}
62505+
62506+/* allocate contiguous range of blocks in bitmap */
62507+static int bitmap_alloc_forward(reiser4_block_nr * start,
62508+ const reiser4_block_nr * end, int min_len,
62509+ int max_len)
62510+{
62511+ bmap_nr_t bmap, end_bmap;
62512+ bmap_off_t offset, end_offset;
62513+ int len;
62514+
62515+ reiser4_block_nr tmp;
62516+
62517+ struct super_block *super = get_current_context()->super;
62518+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62519+
62520+ parse_blocknr(start, &bmap, &offset);
62521+
62522+ tmp = *end - 1;
62523+ parse_blocknr(&tmp, &end_bmap, &end_offset);
62524+ ++end_offset;
62525+
62526+ assert("zam-358", end_bmap >= bmap);
62527+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62528+
62529+ for (; bmap < end_bmap; bmap++, offset = 0) {
62530+ len =
62531+ search_one_bitmap_forward(bmap, &offset, max_offset,
62532+ min_len, max_len);
62533+ if (len != 0)
62534+ goto out;
62535+ }
62536+
62537+ len =
62538+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62539+ max_len);
62540+ out:
62541+ *start = bmap * max_offset + offset;
62542+ return len;
62543+}
62544+
62545+/* allocate contiguous range of blocks in bitmap (from @start to @end in
62546+ * backward direction) */
62547+static int bitmap_alloc_backward(reiser4_block_nr * start,
62548+ const reiser4_block_nr * end, int min_len,
62549+ int max_len)
62550+{
62551+ bmap_nr_t bmap, end_bmap;
62552+ bmap_off_t offset, end_offset;
62553+ int len;
62554+ struct super_block *super = get_current_context()->super;
62555+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62556+
62557+ parse_blocknr(start, &bmap, &offset);
62558+ parse_blocknr(end, &end_bmap, &end_offset);
62559+
62560+ assert("zam-961", end_bmap <= bmap);
62561+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62562+
62563+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62564+ len =
62565+ search_one_bitmap_backward(bmap, &offset, 0, min_len,
62566+ max_len);
62567+ if (len != 0)
62568+ goto out;
62569+ }
62570+
62571+ len =
62572+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62573+ max_len);
62574+ out:
62575+ *start = bmap * max_offset + offset;
62576+ return len;
62577+}
62578+
62579+/* plugin->u.space_allocator.alloc_blocks() */
62580+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62581+ reiser4_block_nr *start, reiser4_block_nr *len)
62582+{
62583+ struct super_block *super = get_current_context()->super;
62584+ int actual_len;
62585+
62586+ reiser4_block_nr search_start;
62587+ reiser4_block_nr search_end;
62588+
62589+ assert("zam-398", super != NULL);
62590+ assert("zam-412", hint != NULL);
62591+ assert("zam-397", hint->blk <= reiser4_block_count(super));
62592+
62593+ if (hint->max_dist == 0)
62594+ search_end = reiser4_block_count(super);
62595+ else
62596+ search_end =
62597+ LIMIT(hint->blk + hint->max_dist,
62598+ reiser4_block_count(super));
62599+
62600+ /* We use @hint -> blk as a search start and search from it to the end
62601+ of the disk, or in the given region if @hint->max_dist is not zero */
62602+ search_start = hint->blk;
62603+
62604+ actual_len =
62605+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62606+
62607+ /* There is only one bitmap search if max_dist was specified or the first
62608+ pass was from the beginning of the bitmap. We also do one pass for
62609+ scanning bitmap in backward direction. */
62610+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
62611+ /* next step is a scan from 0 to search_start */
62612+ search_end = search_start;
62613+ search_start = 0;
62614+ actual_len =
62615+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62616+ }
62617+ if (actual_len == 0)
62618+ return RETERR(-ENOSPC);
62619+ if (actual_len < 0)
62620+ return RETERR(actual_len);
62621+ *len = actual_len;
62622+ *start = search_start;
62623+ return 0;
62624+}
62625+
62626+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
62627+ reiser4_block_nr * start,
62628+ reiser4_block_nr * len)
62629+{
62630+ reiser4_block_nr search_start;
62631+ reiser4_block_nr search_end;
62632+ int actual_len;
62633+
62634+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
62635+
62636+ assert("zam-969", super != NULL);
62637+ assert("zam-970", hint != NULL);
62638+ assert("zam-971", hint->blk <= reiser4_block_count(super));
62639+
62640+ search_start = hint->blk;
62641+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
62642+ search_end = 0;
62643+ else
62644+ search_end = search_start - hint->max_dist;
62645+
62646+ actual_len =
62647+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
62648+ if (actual_len == 0)
62649+ return RETERR(-ENOSPC);
62650+ if (actual_len < 0)
62651+ return RETERR(actual_len);
62652+ *len = actual_len;
62653+ *start = search_start;
62654+ return 0;
62655+}
62656+
62657+/* plugin->u.space_allocator.alloc_blocks() */
62658+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
62659+ reiser4_blocknr_hint * hint, int needed,
62660+ reiser4_block_nr * start, reiser4_block_nr * len)
62661+{
62662+ if (hint->backward)
62663+ return alloc_blocks_backward(hint, needed, start, len);
62664+ return alloc_blocks_forward(hint, needed, start, len);
62665+}
62666+
62667+/* plugin->u.space_allocator.dealloc_blocks(). */
62668+/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
62669+ node deletion is deferred until transaction commit. However, deallocation
62670+ of temporary objects like wandered blocks and transaction commit records
62671+ requires immediate node deletion from WORKING BITMAP.*/
62672+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
62673+ reiser4_block_nr start, reiser4_block_nr len)
62674+{
62675+ struct super_block *super = reiser4_get_current_sb();
62676+
62677+ bmap_nr_t bmap;
62678+ bmap_off_t offset;
62679+
62680+ struct bitmap_node *bnode;
62681+ int ret;
62682+
62683+ assert("zam-468", len != 0);
62684+ check_block_range(&start, &len);
62685+
62686+ parse_blocknr(&start, &bmap, &offset);
62687+
62688+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
62689+
62690+ bnode = get_bnode(super, bmap);
62691+
62692+ assert("zam-470", bnode != NULL);
62693+
62694+ ret = load_and_lock_bnode(bnode);
62695+ assert("zam-481", ret == 0);
62696+
62697+ reiser4_clear_bits(bnode_working_data(bnode), offset,
62698+ (bmap_off_t) (offset + len));
62699+
62700+ adjust_first_zero_bit(bnode, offset);
62701+
62702+ release_and_unlock_bnode(bnode);
62703+}
62704+
62705+/* plugin->u.space_allocator.check_blocks(). */
62706+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
62707+ const reiser4_block_nr * len, int desired)
62708+{
62709+#if REISER4_DEBUG
62710+ struct super_block *super = reiser4_get_current_sb();
62711+
62712+ bmap_nr_t bmap;
62713+ bmap_off_t start_offset;
62714+ bmap_off_t end_offset;
62715+
62716+ struct bitmap_node *bnode;
62717+ int ret;
62718+
62719+ assert("zam-622", len != NULL);
62720+ check_block_range(start, len);
62721+ parse_blocknr(start, &bmap, &start_offset);
62722+
62723+ end_offset = start_offset + *len;
62724+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
62725+
62726+ bnode = get_bnode(super, bmap);
62727+
62728+ assert("nikita-2215", bnode != NULL);
62729+
62730+ ret = load_and_lock_bnode(bnode);
62731+ assert("zam-626", ret == 0);
62732+
62733+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
62734+
62735+ if (desired) {
62736+ assert("zam-623",
62737+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
62738+ end_offset, start_offset)
62739+ >= end_offset);
62740+ } else {
62741+ assert("zam-624",
62742+ reiser4_find_next_set_bit(bnode_working_data(bnode),
62743+ end_offset, start_offset)
62744+ >= end_offset);
62745+ }
62746+
62747+ release_and_unlock_bnode(bnode);
62748+#endif
62749+}
62750+
62751+/* conditional insertion of @node into atom's overwrite set if it was not there */
62752+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
62753+{
62754+ assert("zam-546", atom != NULL);
62755+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
62756+ assert("zam-548", node != NULL);
62757+
62758+ spin_lock_atom(atom);
62759+ spin_lock_jnode(node);
62760+
62761+ if (node->atom == NULL) {
62762+ JF_SET(node, JNODE_OVRWR);
62763+ insert_into_atom_ovrwr_list(atom, node);
62764+ } else {
62765+ assert("zam-549", node->atom == atom);
62766+ }
62767+
62768+ spin_unlock_jnode(node);
62769+ spin_unlock_atom(atom);
62770+}
62771+
62772+/* an actor which applies the delete set to COMMIT bitmap pages and links
62773+ modified pages into a singly-linked list */
62774+static int
62775+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
62776+ const reiser4_block_nr * len, void *data)
62777+{
62778+
62779+ bmap_nr_t bmap;
62780+ bmap_off_t offset;
62781+ int ret;
62782+
62783+ long long *blocks_freed_p = data;
62784+
62785+ struct bitmap_node *bnode;
62786+
62787+ struct super_block *sb = reiser4_get_current_sb();
62788+
62789+ check_block_range(start, len);
62790+
62791+ parse_blocknr(start, &bmap, &offset);
62792+
62793+ /* FIXME-ZAM: we assume that all block ranges are allocated by this
62794+ bitmap-based allocator and each block range can't go over a zone of
62795+ responsibility of one bitmap block; same assumption is used in
62796+ other journal hooks in bitmap code. */
62797+ bnode = get_bnode(sb, bmap);
62798+ assert("zam-448", bnode != NULL);
62799+
62800+ /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
62801+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
62802+ ret = load_and_lock_bnode(bnode);
62803+ if (ret)
62804+ return ret;
62805+
62806+ /* put bnode into atom's overwrite set */
62807+ cond_add_to_overwrite_set(atom, bnode->cjnode);
62808+
62809+ data = bnode_commit_data(bnode);
62810+
62811+ ret = bnode_check_crc(bnode);
62812+ if (ret != 0)
62813+ return ret;
62814+
62815+ if (len != NULL) {
62816+ /* FIXME-ZAM: a check that all bits are set should be there */
62817+ assert("zam-443",
62818+ offset + *len <= bmap_bit_count(sb->s_blocksize));
62819+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
62820+
62821+ (*blocks_freed_p) += *len;
62822+ } else {
62823+ reiser4_clear_bit(offset, data);
62824+ (*blocks_freed_p)++;
62825+ }
62826+
62827+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
62828+
62829+ release_and_unlock_bnode(bnode);
62830+
62831+ return 0;
62832+}
62833+
62834+/* plugin->u.space_allocator.pre_commit_hook(). */
62835+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
62836+ rest is done by transaction manager (allocate wandered locations for COMMIT
62837+ BITMAP blocks, copy COMMIT BITMAP blocks data). */
62838+/* Only one instance of this function can be running at one given time, because
62839+ only one transaction can be committed at a time; therefore it is safe to access
62840+ some global variables without any locking */
62841+
62842+int reiser4_pre_commit_hook_bitmap(void)
62843+{
62844+ struct super_block *super = reiser4_get_current_sb();
62845+ txn_atom *atom;
62846+
62847+ long long blocks_freed = 0;
62848+
62849+ atom = get_current_atom_locked();
62850+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
62851+ spin_unlock_atom(atom);
62852+
62853+ { /* scan atom's captured list and find all freshly allocated nodes,
62854+ * mark the corresponding bits in the COMMIT BITMAP as used */
62855+ struct list_head *head = ATOM_CLEAN_LIST(atom);
62856+ jnode *node = list_entry(head->next, jnode, capture_link);
62857+
62858+ while (head != &node->capture_link) {
62859+ /* we detect freshly allocated jnodes */
62860+ if (JF_ISSET(node, JNODE_RELOC)) {
62861+ int ret;
62862+ bmap_nr_t bmap;
62863+
62864+ bmap_off_t offset;
62865+ bmap_off_t index;
62866+ struct bitmap_node *bn;
62867+ __u32 size = bmap_size(super->s_blocksize);
62868+ __u32 crc;
62869+ char byte;
62870+
62871+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
62872+ assert("zam-460",
62873+ !reiser4_blocknr_is_fake(&node->blocknr));
62874+
62875+ parse_blocknr(&node->blocknr, &bmap, &offset);
62876+ bn = get_bnode(super, bmap);
62877+
62878+ index = offset >> 3;
62879+ assert("vpf-276", index < size);
62880+
62881+ ret = bnode_check_crc(bn);
62882+ if (ret != 0)
62883+ return ret;
62884+
62885+ check_bnode_loaded(bn);
62886+ load_and_lock_bnode(bn);
62887+
62888+ byte = *(bnode_commit_data(bn) + index);
62889+ reiser4_set_bit(offset, bnode_commit_data(bn));
62890+
62891+ crc = adler32_recalc(bnode_commit_crc(bn), byte,
62892+ *(bnode_commit_data(bn) +
62893+ index),
62894+ size - index);
62895+ bnode_set_commit_crc(bn, crc);
62896+
62897+ release_and_unlock_bnode(bn);
62898+
62899+ ret = bnode_check_crc(bn);
62900+ if (ret != 0)
62901+ return ret;
62902+
62903+ /* the correctness of this depends on where the
62904+ new jnode is inserted into the clean list,
62905+ because we are scanning that same list now.
62906+ It is OK if insertion is done at the list front */
62907+ cond_add_to_overwrite_set(atom, bn->cjnode);
62908+ }
62909+
62910+ node = list_entry(node->capture_link.next, jnode, capture_link);
62911+ }
62912+ }
62913+
62914+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
62915+ &blocks_freed, 0);
62916+
62917+ blocks_freed -= atom->nr_blocks_allocated;
62918+
62919+ {
62920+ reiser4_super_info_data *sbinfo;
62921+
62922+ sbinfo = get_super_private(super);
62923+
62924+ spin_lock_reiser4_super(sbinfo);
62925+ sbinfo->blocks_free_committed += blocks_freed;
62926+ spin_unlock_reiser4_super(sbinfo);
62927+ }
62928+
62929+ return 0;
62930+}
62931+
62932+/* plugin->u.space_allocator.init_allocator
62933+ constructor of reiser4_space_allocator object. It is called on fs mount */
62934+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
62935+ struct super_block *super, void *arg)
62936+{
62937+ struct bitmap_allocator_data *data = NULL;
62938+ bmap_nr_t bitmap_blocks_nr;
62939+ bmap_nr_t i;
62940+
62941+ assert("nikita-3039", reiser4_schedulable());
62942+
62943+ /* getting memory for bitmap allocator private data holder */
62944+ data =
62945+ kmalloc(sizeof(struct bitmap_allocator_data),
62946+ reiser4_ctx_gfp_mask_get());
62947+
62948+ if (data == NULL)
62949+ return RETERR(-ENOMEM);
62950+
62951+ /* allocation and initialization for the array of bnodes */
62952+ bitmap_blocks_nr = get_nr_bmap(super);
62953+
62954+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
62955+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
62956+ may I never meet someone who still uses the ia32 architecture when
62957+ storage devices of that size enter the market, and wants to use ia32
62958+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
62959+ probably, another dynamic data structure should replace a static
62960+ array of bnodes. */
62961+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
62962+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
62963+ if (data->bitmap == NULL) {
62964+ kfree(data);
62965+ return RETERR(-ENOMEM);
62966+ }
62967+
62968+ for (i = 0; i < bitmap_blocks_nr; i++)
62969+ init_bnode(data->bitmap + i, super, i);
62970+
62971+ allocator->u.generic = data;
62972+
62973+#if REISER4_DEBUG
62974+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
62975+#endif
62976+
62977+ /* Load all bitmap blocks at mount time. */
62978+ if (!test_bit
62979+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
62980+ __u64 start_time, elapsed_time;
62981+ struct bitmap_node *bnode;
62982+ int ret;
62983+
62984+ if (REISER4_DEBUG)
62985+ printk(KERN_INFO "loading reiser4 bitmap...");
62986+ start_time = jiffies;
62987+
62988+ for (i = 0; i < bitmap_blocks_nr; i++) {
62989+ bnode = data->bitmap + i;
62990+ ret = load_and_lock_bnode(bnode);
62991+ if (ret) {
62992+ reiser4_destroy_allocator_bitmap(allocator,
62993+ super);
62994+ return ret;
62995+ }
62996+ release_and_unlock_bnode(bnode);
62997+ }
62998+
62999+ elapsed_time = jiffies - start_time;
63000+ if (REISER4_DEBUG)
63001+ printk("...done (%llu jiffies)\n",
63002+ (unsigned long long)elapsed_time);
63003+ }
63004+
63005+ return 0;
63006+}
63007+
63008+/* plugin->u.space_allocator.destroy_allocator
63009+ destructor. It is called on fs unmount */
63010+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63011+ struct super_block *super)
63012+{
63013+ bmap_nr_t bitmap_blocks_nr;
63014+ bmap_nr_t i;
63015+
63016+ struct bitmap_allocator_data *data = allocator->u.generic;
63017+
63018+ assert("zam-414", data != NULL);
63019+ assert("zam-376", data->bitmap != NULL);
63020+
63021+ bitmap_blocks_nr = get_nr_bmap(super);
63022+
63023+ for (i = 0; i < bitmap_blocks_nr; i++) {
63024+ struct bitmap_node *bnode = data->bitmap + i;
63025+
63026+ mutex_lock(&bnode->mutex);
63027+
63028+#if REISER4_DEBUG
63029+ if (atomic_read(&bnode->loaded)) {
63030+ jnode *wj = bnode->wjnode;
63031+ jnode *cj = bnode->cjnode;
63032+
63033+ assert("zam-480", jnode_page(cj) != NULL);
63034+ assert("zam-633", jnode_page(wj) != NULL);
63035+
63036+ assert("zam-634",
63037+ memcmp(jdata(wj), jdata(cj),
63038+ bmap_size(super->s_blocksize)) == 0);
63039+
63040+ }
63041+#endif
63042+ done_bnode(bnode);
63043+ mutex_unlock(&bnode->mutex);
63044+ }
63045+
63046+ vfree(data->bitmap);
63047+ kfree(data);
63048+
63049+ allocator->u.generic = NULL;
63050+
63051+ return 0;
63052+}
63053+
63054+/*
63055+ * Local variables:
63056+ * c-indentation-style: "K&R"
63057+ * mode-name: "LC"
63058+ * c-basic-offset: 8
63059+ * tab-width: 8
63060+ * fill-column: 79
63061+ * scroll-step: 1
63062+ * End:
63063+ */
63064diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.20/fs/reiser4/plugin/space/bitmap.h
63065--- linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
63066+++ linux-2.6.20/fs/reiser4/plugin/space/bitmap.h 2007-05-06 14:50:43.863026968 +0400
63067@@ -0,0 +1,47 @@
63068+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63069+
63070+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63071+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63072+
63073+#include "../../dformat.h"
63074+#include "../../block_alloc.h"
63075+
63076+#include <linux/types.h> /* for __u?? */
63077+#include <linux/fs.h> /* for struct super_block */
63078+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63079+/* declarations of functions implementing methods of space allocator plugin for
63080+ bitmap based allocator. The functions themselves are in bitmap.c */
63081+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
63082+ struct super_block *, void *);
63083+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
63084+ struct super_block *);
63085+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
63086+ reiser4_blocknr_hint *, int needed,
63087+ reiser4_block_nr * start,
63088+ reiser4_block_nr * len);
63089+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
63090+ const reiser4_block_nr *, int);
63091+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
63092+ reiser4_block_nr,
63093+ reiser4_block_nr);
63094+extern int reiser4_pre_commit_hook_bitmap(void);
63095+
63096+#define reiser4_post_commit_hook_bitmap() do{}while(0)
63097+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
63098+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
63099+
63100+typedef __u64 bmap_nr_t;
63101+typedef __u32 bmap_off_t;
63102+
63103+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63104+
63105+/* Make Linus happy.
63106+ Local variables:
63107+ c-indentation-style: "K&R"
63108+ mode-name: "LC"
63109+ c-basic-offset: 8
63110+ tab-width: 8
63111+ fill-column: 120
63112+ scroll-step: 1
63113+ End:
63114+*/
63115diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/Makefile linux-2.6.20/fs/reiser4/plugin/space/Makefile
63116--- linux-2.6.20.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
63117+++ linux-2.6.20/fs/reiser4/plugin/space/Makefile 2007-05-06 14:50:43.863026968 +0400
63118@@ -0,0 +1,4 @@
63119+obj-$(CONFIG_REISER4_FS) += space_plugins.o
63120+
63121+space_plugins-objs := \
63122+ bitmap.o
63123diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.20/fs/reiser4/plugin/space/space_allocator.h
63124--- linux-2.6.20.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
63125+++ linux-2.6.20/fs/reiser4/plugin/space/space_allocator.h 2007-05-06 14:50:43.863026968 +0400
63126@@ -0,0 +1,80 @@
63127+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63128+
63129+#ifndef __SPACE_ALLOCATOR_H__
63130+#define __SPACE_ALLOCATOR_H__
63131+
63132+#include "../../forward.h"
63133+#include "bitmap.h"
63134+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63135+ * but... */
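+/* DEF_SPACE_ALLOCATOR(allocator) generates the sa_*() inline wrappers used
+   by the rest of reiser4; each one statically dispatches to the matching
+   reiser4_*_##allocator() function declared in the header included above.
+   The macro is invoked once, below, with "bitmap" -- currently the only
+   space allocator -- so the indirection has no run-time cost. */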
63136+#define DEF_SPACE_ALLOCATOR(allocator) \
63137+ \
63138+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
63139+{ \
63140+ return reiser4_init_allocator_##allocator (al, s, opaque); \
63141+} \
63142+ \
63143+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
63144+{ \
63145+ reiser4_destroy_allocator_##allocator (al, s); \
63146+} \
63147+ \
63148+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
63149+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
63150+{ \
63151+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
63152+} \
63153+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
63154+{ \
63155+ reiser4_dealloc_blocks_##allocator (al, start, len); \
63156+} \
63157+ \
63158+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
63159+{ \
63160+ reiser4_check_blocks_##allocator (start, end, desired); \
63161+} \
63162+ \
63163+static inline void sa_pre_commit_hook (void) \
63164+{ \
63165+ reiser4_pre_commit_hook_##allocator (); \
63166+} \
63167+ \
63168+static inline void sa_post_commit_hook (void) \
63169+{ \
63170+ reiser4_post_commit_hook_##allocator (); \
63171+} \
63172+ \
63173+static inline void sa_post_write_back_hook (void) \
63174+{ \
63175+ reiser4_post_write_back_hook_##allocator(); \
63176+} \
63177+ \
63178+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
63179+{ \
63180+ reiser4_print_info_##allocator (prefix, al); \
63181+}
63182+
63183+DEF_SPACE_ALLOCATOR(bitmap)
63184+
63185+/* this object is part of reiser4 private in-core super block */
63186+struct reiser4_space_allocator {
63187+ union {
63188+ /* space allocators might use this pointer to reference their
63189+ * data. */
63190+ void *generic;
63191+ } u;
63192+};
63193+
63194+/* __SPACE_ALLOCATOR_H__ */
63195+#endif
63196+
63197+/* Make Linus happy.
63198+ Local variables:
63199+ c-indentation-style: "K&R"
63200+ mode-name: "LC"
63201+ c-basic-offset: 8
63202+ tab-width: 8
63203+ fill-column: 120
63204+ scroll-step: 1
63205+ End:
63206+*/
63207diff -urN linux-2.6.20.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.20/fs/reiser4/plugin/tail_policy.c
63208--- linux-2.6.20.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
63209+++ linux-2.6.20/fs/reiser4/plugin/tail_policy.c 2007-05-06 14:50:43.863026968 +0400
63210@@ -0,0 +1,113 @@
63211+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63212+ * reiser4/README */
63213+
63214+/* Formatting policy plugins */
63215+
63216+/*
63217+ * Formatting policy plugin is used by object plugin (of regular file) to
63218+ * convert file between two representations.
63219+ *
63220+ * Currently the following policies are implemented:
63221+ * never store file in formatted nodes
63222+ * always store file in formatted nodes
63223+ * store file in formatted nodes if file is smaller than 4 blocks (default)
63224+ */
63225+
63226+#include "../tree.h"
63227+#include "../inode.h"
63228+#include "../super.h"
63229+#include "object.h"
63230+#include "plugin.h"
63231+#include "node/node.h"
63232+#include "plugin_header.h"
63233+
63234+#include <linux/pagemap.h>
63235+#include <linux/fs.h> /* For struct inode */
63236+
63237+/**
63238+ * have_formatting_never - formatting policy which never permits tails
63239+ * @inode: inode to operate on (unused)
63240+ * @size: new object size (unused)
63241+ *
63242+ * Always returns 0: the file body is never stored in formatted (tail) items.
63243+ */
63244+/* Never store file's tail as direct item */
63245+/* Audited by: green(2002.06.12) */
63246+static int have_formatting_never(const struct inode *inode UNUSED_ARG
63247+ /* inode to operate on */ ,
63248+ loff_t size UNUSED_ARG /* new object size */ )
63249+{
63250+ return 0;
63251+}
63252+
63253+/* Always store file's tail as direct item */
63254+/* Audited by: green(2002.06.12) */
63255+static int
63256+have_formatting_always(const struct inode *inode UNUSED_ARG
63257+ /* inode to operate on */ ,
63258+ loff_t size UNUSED_ARG /* new object size */ )
63259+{
63260+ return 1;
63261+}
63262+
63263+/* This function tests whether the file denoted by @inode should be stored as
63264+ tails only (its size is at most 4 blocks) or as extents only. */
63265+static int
63266+have_formatting_default(const struct inode *inode UNUSED_ARG
63267+ /* inode to operate on */ ,
63268+ loff_t size /* new object size */ )
63269+{
63270+ assert("umka-1253", inode != NULL);
63271+
63272+ if (size > inode->i_sb->s_blocksize * 4)
63273+ return 0;
63274+
63275+ return 1;
63276+}
63277+
63278+/* tail plugins */
63279+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63280+ [NEVER_TAILS_FORMATTING_ID] = {
63281+ .h = {
63282+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63283+ .id = NEVER_TAILS_FORMATTING_ID,
63284+ .pops = NULL,
63285+ .label = "never",
63286+ .desc = "Never store file's tail",
63287+ .linkage = {NULL, NULL}
63288+ },
63289+ .have_tail = have_formatting_never
63290+ },
63291+ [ALWAYS_TAILS_FORMATTING_ID] = {
63292+ .h = {
63293+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63294+ .id = ALWAYS_TAILS_FORMATTING_ID,
63295+ .pops = NULL,
63296+ .label = "always",
63297+ .desc = "Always store file's tail",
63298+ .linkage = {NULL, NULL}
63299+ },
63300+ .have_tail = have_formatting_always
63301+ },
63302+ [SMALL_FILE_FORMATTING_ID] = {
63303+ .h = {
63304+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63305+ .id = SMALL_FILE_FORMATTING_ID,
63306+ .pops = NULL,
63307+ .label = "4blocks",
63308+ .desc = "store files shorter than 4 blocks in tail items",
63309+ .linkage = {NULL, NULL}
63310+ },
63311+ .have_tail = have_formatting_default
63312+ }
63313+};
63314+
63315+/*
63316+ * Local variables:
63317+ * c-indentation-style: "K&R"
63318+ * mode-name: "LC"
63319+ * c-basic-offset: 8
63320+ * tab-width: 8
63321+ * fill-column: 79
63322+ * End:
63323+ */
63324diff -urN linux-2.6.20.orig/fs/reiser4/pool.c linux-2.6.20/fs/reiser4/pool.c
63325--- linux-2.6.20.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
63326+++ linux-2.6.20/fs/reiser4/pool.c 2007-05-06 14:50:43.863026968 +0400
63327@@ -0,0 +1,234 @@
63328+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63329+ * reiser4/README */
63330+
63331+/* Fast pool allocation.
63332+
63333+ There are situations when some sub-system normally asks memory allocator
63334+ for only few objects, but under some circumstances could require much
63335+ more. Typical and actually motivating example is tree balancing. It needs
63336+ to keep track of nodes that were involved in it, and it is well known
63337+ that in a reasonably packed balanced tree most (92.938121%) of all
63338+ balancings end up after working with only a few nodes (3.141592 on
63339+ average). But in rare cases balancing can involve many more nodes
63340+ (3*tree_height+1 in the extreme case).
63341+
63342+ On the one hand, we don't want to resort to dynamic allocation (slab,
63343+ malloc(), etc.) to allocate data structures required to keep track of
63344+ nodes during balancing. On the other hand, we cannot statically allocate
63345+ the required amount of space on the stack because, first, it is a useless
63346+ waste of a precious resource and, second, the amount is unknown in advance (tree
63347+ height can change).
63348+
63349+ Pools, implemented in this file, are a solution to this problem:
63350+
63351+ - some configurable amount of objects is statically preallocated on the
63352+ stack
63353+
63354+ - if this preallocated pool is exhausted and more objects are requested,
63355+ they are allocated dynamically.
63356+
63357+ Pools encapsulate the distinction between statically and dynamically allocated
63358+ objects. Both allocation and recycling look exactly the same.
63359+
63360+ To keep track of dynamically allocated objects, pool adds its own linkage
63361+ to each object.
63362+
63363+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
63364+ is not perfect. On the other hand, balancing is currently the only client
63365+ of pool code.
63366+
63367+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63368+ functions in the style of tslist/tshash, i.e., make them unreadable, but
63369+ type-safe.
63370+
63371+*/
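+/* A minimal usage sketch (hypothetical client; struct my_obj and the counts
+   are illustrative only, not part of this file):
+
+       struct my_obj { reiser4_pool_header header; int payload; };
+       char area[4 * sizeof(struct my_obj)];
+       reiser4_pool pool;
+
+       reiser4_init_pool(&pool, sizeof(struct my_obj), 4, area);
+
+   The header must be the first member. Allocations beyond the 4
+   preallocated objects fall back to kmalloc() transparently, and
+   reiser4_pool_free() either returns an object to the free list or
+   kfree()s it if it was an extra one. */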
63372+
63373+#include "debug.h"
63374+#include "pool.h"
63375+#include "super.h"
63376+
63377+#include <linux/types.h>
63378+#include <linux/err.h>
63379+
63380+/* initialize new pool object */
63381+static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to
63382+ * initialize */ )
63383+{
63384+ INIT_LIST_HEAD(&h->usage_linkage);
63385+ INIT_LIST_HEAD(&h->level_linkage);
63386+ INIT_LIST_HEAD(&h->extra_linkage);
63387+}
63388+
63389+/* initialize new pool */
63390+void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63391+ size_t obj_size /* size of objects in @pool */ ,
63392+ int num_of_objs /* number of preallocated objects */ ,
63393+ char *data /* area for preallocated objects */ )
63394+{
63395+ reiser4_pool_header *h;
63396+ int i;
63397+
63398+ assert("nikita-955", pool != NULL);
63399+ assert("nikita-1044", obj_size > 0);
63400+ assert("nikita-956", num_of_objs >= 0);
63401+ assert("nikita-957", data != NULL);
63402+
63403+ memset(pool, 0, sizeof *pool);
63404+ pool->obj_size = obj_size;
63405+ pool->data = data;
63406+ INIT_LIST_HEAD(&pool->free);
63407+ INIT_LIST_HEAD(&pool->used);
63408+ INIT_LIST_HEAD(&pool->extra);
63409+ memset(data, 0, obj_size * num_of_objs);
63410+ for (i = 0; i < num_of_objs; ++i) {
63411+ h = (reiser4_pool_header *) (data + i * obj_size);
63412+ reiser4_init_pool_obj(h);
63413+ /* add pool header to the end of pool's free list */
63414+ list_add_tail(&h->usage_linkage, &pool->free);
63415+ }
63416+}
63417+
63418+/* release pool resources
63419+
63420+ Release all resources acquired by this pool, specifically, dynamically
63421+ allocated objects.
63422+
63423+*/
63424+void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
63425+{
63426+}
63427+
63428+/* allocate carry object from pool
63429+
63430+ First, try to get preallocated object. If this fails, resort to dynamic
63431+ allocation.
63432+
63433+*/
63434+static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
63435+ * from */ )
63436+{
63437+ reiser4_pool_header *result;
63438+
63439+ assert("nikita-959", pool != NULL);
63440+
63441+ if (!list_empty(&pool->free)) {
63442+ struct list_head *linkage;
63443+
63444+ linkage = pool->free.next;
63445+ list_del(linkage);
63446+ INIT_LIST_HEAD(linkage);
63447+ result = list_entry(linkage, reiser4_pool_header, usage_linkage);
63448+ BUG_ON(!list_empty(&result->level_linkage) ||
63449+ !list_empty(&result->extra_linkage));
63450+ } else {
63451+ /* pool is empty. Extra allocations don't deserve dedicated
63452+ slab to be served from, as they are expected to be rare. */
63453+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
63454+ if (result != NULL) {
63455+ reiser4_init_pool_obj(result);
63456+ list_add(&result->extra_linkage, &pool->extra);
63457+ } else
63458+ return ERR_PTR(RETERR(-ENOMEM));
63459+ BUG_ON(!list_empty(&result->usage_linkage) ||
63460+ !list_empty(&result->level_linkage));
63461+ }
63462+ ++pool->objs;
63463+ list_add(&result->usage_linkage, &pool->used);
63464+ memset(result + 1, 0, pool->obj_size - sizeof *result);
63465+ return result;
63466+}
63467+
63468+/* return object back to the pool */
63469+void reiser4_pool_free(reiser4_pool * pool /* pool to return the object to */,
63470+ reiser4_pool_header * h /* object being freed */ )
63471+{
63472+ assert("nikita-961", h != NULL);
63473+ assert("nikita-962", pool != NULL);
63474+
63475+ --pool->objs;
63476+ assert("nikita-963", pool->objs >= 0);
63477+
63478+ list_del_init(&h->usage_linkage);
63479+ list_del_init(&h->level_linkage);
63480+
63481+ if (list_empty(&h->extra_linkage))
63482+ /*
63483+ * pool header is not an extra one. Push it onto free list
63484+ * using usage_linkage
63485+ */
63486+ list_add(&h->usage_linkage, &pool->free);
63487+ else {
63488+ /* remove pool header from pool's extra list and kfree it */
63489+ list_del(&h->extra_linkage);
63490+ kfree(h);
63491+ }
63492+}
63493+
63494+/* add new object to the carry level list
63495+
63496+ Carry level is FIFO most of the time, but not always. Complications arise
63497+ when the make_space() function tries to go to the left neighbor and thus
63498+ adds a carry node before existing nodes, and also when, while updating
63499+ delimiting keys after moving data between two nodes, we want the left node
63500+ to be locked before the right node.
63501+
63502+ The latter case is confusing at first glance. The problem is that the
63503+ COP_UPDATE operation that updates delimiting keys is sometimes called with
63504+ two nodes (when data are moved between two nodes) and sometimes with only
63505+ one node (when the leftmost item is deleted in a node). In any case the
63506+ operation is supplied with at least the node whose left delimiting key is
63507+ to be updated (that is, the "right" node).
63508+
63509+*/
63510+reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool /* pool from which to
63511+ * allocate new object
63512+ */,
63513+ struct list_head *list /* list where to add
63514+ * object */,
63515+ pool_ordering order /* where to add */,
63516+ reiser4_pool_header * reference
63517+ /* after (or before) which existing object
63518+ to add */)
63519+{
63520+ reiser4_pool_header *result;
63521+
63522+ assert("nikita-972", pool != NULL);
63523+
63524+ result = reiser4_pool_alloc(pool);
63525+ if (IS_ERR(result))
63526+ return result;
63527+
63528+ assert("nikita-973", result != NULL);
63529+
63530+ switch (order) {
63531+ case POOLO_BEFORE:
63532+ __list_add(&result->level_linkage,
63533+ reference->level_linkage.prev,
63534+ &reference->level_linkage);
63535+ break;
63536+ case POOLO_AFTER:
63537+ __list_add(&result->level_linkage,
63538+ &reference->level_linkage,
63539+ reference->level_linkage.next);
63540+ break;
63541+ case POOLO_LAST:
63542+ list_add_tail(&result->level_linkage, list);
63543+ break;
63544+ case POOLO_FIRST:
63545+ list_add(&result->level_linkage, list);
63546+ break;
63547+ default:
63548+ wrong_return_value("nikita-927", "order");
63549+ }
63550+ return result;
63551+}
63552+
63553+/* Make Linus happy.
63554+ Local variables:
63555+ c-indentation-style: "K&R"
63556+ mode-name: "LC"
63557+ c-basic-offset: 8
63558+ tab-width: 8
63559+ fill-column: 120
63560+ End:
63561+*/
63562diff -urN linux-2.6.20.orig/fs/reiser4/pool.h linux-2.6.20/fs/reiser4/pool.h
63563--- linux-2.6.20.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
63564+++ linux-2.6.20/fs/reiser4/pool.h 2007-05-06 14:50:43.863026968 +0400
63565@@ -0,0 +1,55 @@
63566+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63567+
63568+/* Fast pool allocation */
63569+
63570+#ifndef __REISER4_POOL_H__
63571+#define __REISER4_POOL_H__
63572+
63573+#include <linux/types.h>
63574+
63575+typedef struct reiser4_pool {
63576+ size_t obj_size;
63577+ int objs;
63578+ char *data;
63579+ struct list_head free;
63580+ struct list_head used;
63581+ struct list_head extra;
63582+} reiser4_pool;
63583+
63584+typedef struct reiser4_pool_header {
63585+ /* object is either on free or "used" lists */
63586+ struct list_head usage_linkage;
63587+ struct list_head level_linkage;
63588+ struct list_head extra_linkage;
63589+} reiser4_pool_header;
63590+
63591+typedef enum {
63592+ POOLO_BEFORE,
63593+ POOLO_AFTER,
63594+ POOLO_LAST,
63595+ POOLO_FIRST
63596+} pool_ordering;
63597+
63598+/* pool manipulation functions */
63599+
63600+extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
63601+ int num_of_objs, char *data);
63602+extern void reiser4_done_pool(reiser4_pool * pool);
63603+extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
63604+reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool,
63605+ struct list_head * list,
63606+ pool_ordering order,
63607+ reiser4_pool_header * reference);
63608+
63609+/* __REISER4_POOL_H__ */
63610+#endif
63611+
63612+/* Make Linus happy.
63613+ Local variables:
63614+ c-indentation-style: "K&R"
63615+ mode-name: "LC"
63616+ c-basic-offset: 8
63617+ tab-width: 8
63618+ fill-column: 120
63619+ End:
63620+*/
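A minimal usage sketch of the pool API declared above (an editorial illustration, not part of the patch: the object type, buffer, and demo function are invented). A caller's object embeds reiser4_pool_header as its first member, since reiser4_pool_alloc() zeroes only the bytes that follow the header:

struct demo_obj {
	reiser4_pool_header header;	/* must be the first member */
	int payload;
};

static char demo_storage[8 * sizeof(struct demo_obj)];	/* preallocated area */
static reiser4_pool demo_pool;
static LIST_HEAD(demo_level);	/* level list the objects are threaded on */

static int demo_pool_usage(void)
{
	reiser4_pool_header *h;

	reiser4_init_pool(&demo_pool, sizeof(struct demo_obj), 8, demo_storage);
	/* append a freshly allocated object to the end of the level list */
	h = reiser4_add_obj(&demo_pool, &demo_level, POOLO_LAST, NULL);
	if (IS_ERR(h))
		return PTR_ERR(h);
	((struct demo_obj *)h)->payload = 42;
	/* unlink the object from its lists and return it to the pool */
	reiser4_pool_free(&demo_pool, h);
	reiser4_done_pool(&demo_pool);
	return 0;
}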
63621diff -urN linux-2.6.20.orig/fs/reiser4/readahead.c linux-2.6.20/fs/reiser4/readahead.c
63622--- linux-2.6.20.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
63623+++ linux-2.6.20/fs/reiser4/readahead.c 2007-05-06 14:50:43.867028218 +0400
63624@@ -0,0 +1,138 @@
63625+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63626+ * reiser4/README */
63627+
63628+#include "forward.h"
63629+#include "tree.h"
63630+#include "tree_walk.h"
63631+#include "super.h"
63632+#include "inode.h"
63633+#include "key.h"
63634+#include "znode.h"
63635+
63636+#include <linux/swap.h> /* for totalram_pages */
63637+
63638+void reiser4_init_ra_info(ra_info_t * rai)
63639+{
63640+ rai->key_to_stop = *reiser4_min_key();
63641+}
63642+
63643+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
63644+static inline int ra_adjacent_only(int flags)
63645+{
63646+ return flags & RA_ADJACENT_ONLY;
63647+}
63648+
63649+/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued. It
63650+ returns 1 if the right neighbor's first key is less than or equal to the readahead stop key */
63651+static int should_readahead_neighbor(znode * node, ra_info_t * info)
63652+{
63653+ int result;
63654+
63655+ read_lock_dk(znode_get_tree(node));
63656+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
63657+ read_unlock_dk(znode_get_tree(node));
63658+ return result;
63659+}
63660+
63661+#define LOW_MEM_PERCENTAGE (5)
63662+
63663+static int low_on_memory(void)
63664+{
63665+ unsigned int freepages;
63666+
63667+ freepages = nr_free_pages();
63668+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
63669+}
63670+
63671+/* start read for @node and for a few of its right neighbors */
63672+void formatted_readahead(znode * node, ra_info_t * info)
63673+{
63674+ ra_params_t *ra_params;
63675+ znode *cur;
63676+ int i;
63677+ int grn_flags;
63678+ lock_handle next_lh;
63679+
63680+ /* do nothing if a real block number has not yet been assigned to the node (it exists only in memory, so there is nothing to read from disk). */
63681+ if (reiser4_blocknr_is_fake(znode_get_block(node)))
63682+ return;
63683+
63684+ ra_params = get_current_super_ra_params();
63685+
63686+ if (znode_page(node) == NULL)
63687+ jstartio(ZJNODE(node));
63688+
63689+ if (znode_get_level(node) != LEAF_LEVEL)
63690+ return;
63691+
63692+ /* don't waste memory for read-ahead when low on memory */
63693+ if (low_on_memory())
63694+ return;
63695+
63696+ /* We can have locked nodes on upper tree levels; in this situation lock
63697+ priorities do not help to resolve deadlocks, so we have to use TRY_LOCK
63698+ here. */
63699+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
63700+
63701+ i = 0;
63702+ cur = zref(node);
63703+ init_lh(&next_lh);
63704+ while (i < ra_params->max) {
63705+ const reiser4_block_nr *nextblk;
63706+
63707+ if (!should_readahead_neighbor(cur, info))
63708+ break;
63709+
63710+ if (reiser4_get_right_neighbor
63711+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
63712+ break;
63713+
63714+ nextblk = znode_get_block(next_lh.node);
63715+ if (reiser4_blocknr_is_fake(nextblk) ||
63716+ (ra_adjacent_only(ra_params->flags)
63717+ && *nextblk != *znode_get_block(cur) + 1)) {
63718+ break;
63719+ }
63720+
63721+ zput(cur);
63722+ cur = zref(next_lh.node);
63723+ done_lh(&next_lh);
63724+ if (znode_page(cur) == NULL)
63725+ jstartio(ZJNODE(cur));
63726+ else
63727+ /* Do not scan read-ahead window if pages already
63728+ * allocated (and i/o already started). */
63729+ break;
63730+
63731+ i++;
63732+ }
63733+ zput(cur);
63734+ done_lh(&next_lh);
63735+}
63736+
63737+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
63738+{
63739+ reiser4_key *stop_key;
63740+
63741+ assert("nikita-3542", dir != NULL);
63742+ assert("nikita-3543", tap != NULL);
63743+
63744+ stop_key = &tap->ra_info.key_to_stop;
63745+ /* initialize readdir readahead information: include in the readahead
63746+ * the stat data of all files of the directory */
63747+ set_key_locality(stop_key, get_inode_oid(dir));
63748+ set_key_type(stop_key, KEY_SD_MINOR);
63749+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
63750+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
63751+ set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
63752+}
63753+
63754+/*
63755+ Local variables:
63756+ c-indentation-style: "K&R"
63757+ mode-name: "LC"
63758+ c-basic-offset: 8
63759+ tab-width: 8
63760+ fill-column: 80
63761+ End:
63762+*/
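A sketch of how a caller might arm this readahead for a bounded scan (an editorial illustration: the helper and its arguments are invented, modeled on reiser4_readdir_readahead_init() above):

static void range_scan_readahead(znode *leaf, const reiser4_key *end_key)
{
	ra_info_t info;

	/* reiser4_init_ra_info() sets the stop key to the minimal key,
	 * which disables neighbor readahead until a real bound is set */
	reiser4_init_ra_info(&info);
	info.key_to_stop = *end_key;	/* read ahead only up to @end_key */
	formatted_readahead(leaf, &info);	/* start i/o for right neighbors */
}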
63763diff -urN linux-2.6.20.orig/fs/reiser4/readahead.h linux-2.6.20/fs/reiser4/readahead.h
63764--- linux-2.6.20.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
63765+++ linux-2.6.20/fs/reiser4/readahead.h 2007-05-06 14:50:43.867028218 +0400
63766@@ -0,0 +1,48 @@
63767+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63768+
63769+#ifndef __READAHEAD_H__
63770+#define __READAHEAD_H__
63771+
63772+#include "key.h"
63773+
63774+typedef enum {
63775+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
63776+} ra_global_flags;
63777+
63778+/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
63779+typedef struct formatted_read_ahead_params {
63780+ unsigned long max; /* request not more than this amount of nodes. Default is totalram_pages / 4 */
63781+ int flags;
63782+} ra_params_t;
63783+
63784+typedef struct {
63785+ reiser4_key key_to_stop;
63786+} ra_info_t;
63787+
63788+void formatted_readahead(znode *, ra_info_t *);
63789+void reiser4_init_ra_info(ra_info_t * rai);
63790+
63791+struct reiser4_file_ra_state {
63792+ loff_t start; /* Current window */
63793+ loff_t size;
63794+ loff_t next_size; /* Next window size */
63795+ loff_t ahead_start; /* Ahead window */
63796+ loff_t ahead_size;
63797+ loff_t max_window_size; /* Maximum readahead window */
63798+ loff_t slow_start; /* enlarging r/a size algorithm. */
63799+};
63800+
63801+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
63802+
63803+/* __READAHEAD_H__ */
63804+#endif
63805+
63806+/*
63807+ Local variables:
63808+ c-indentation-style: "K&R"
63809+ mode-name: "LC"
63810+ c-basic-offset: 8
63811+ tab-width: 8
63812+ fill-column: 120
63813+ End:
63814+*/
63815diff -urN linux-2.6.20.orig/fs/reiser4/README linux-2.6.20/fs/reiser4/README
63816--- linux-2.6.20.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
63817+++ linux-2.6.20/fs/reiser4/README 2007-05-06 14:50:43.867028218 +0400
63818@@ -0,0 +1,125 @@
63819+[LICENSING]
63820+
63821+Reiser4 is hereby licensed under the GNU General
63822+Public License version 2.
63823+
63824+Source code files that contain the phrase "licensing governed by
63825+reiser4/README" are "governed files" throughout this file. Governed
63826+files are licensed under the GPL. The portions of them owned by Hans
63827+Reiser, or authorized to be licensed by him, have been in the past,
63828+and likely will be in the future, licensed to other parties under
63829+other licenses. If you add your code to governed files, and don't
63830+want it to be owned by Hans Reiser, put your copyright label on that
63831+code so the poor blight and his customers can keep things straight.
63832+All portions of governed files not labeled otherwise are owned by Hans
63833+Reiser, and by adding your code to it, widely distributing it to
63834+others or sending us a patch, and leaving the sentence in stating that
63835+licensing is governed by the statement in this file, you accept this.
63836+It will be a kindness if you identify whether Hans Reiser is allowed
63837+to license code labeled as owned by you on your behalf other than
63838+under the GPL, because he wants to know if it is okay to do so and put
63839+a check in the mail to you (for non-trivial improvements) when he
63840+makes his next sale. He makes no guarantees as to the amount if any,
63841+though he feels motivated to motivate contributors, and you can surely
63842+discuss this with him before or after contributing. You have the
63843+right to decline to allow him to license your code contribution other
63844+than under the GPL.
63845+
63846+Further licensing options are available for commercial and/or other
63847+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
63848+the GPL as not allowing those additional licensing options, you read
63849+it wrongly, and Richard Stallman agrees with me, when carefully read
63850+you can see that those restrictions on additional terms do not apply
63851+to the owner of the copyright, and my interpretation of this shall
63852+govern for this license.
63853+
63854+[END LICENSING]
63855+
63856+Reiser4 is a file system based on dancing tree algorithms, and is
63857+described at http://www.namesys.com
63858+
63859+mkfs.reiser4 and other utilities are on our webpage or wherever your
63860+Linux provider put them. You really want to be running the latest
63861+version off the website if you use fsck.
63862+
63863+Yes, if you update your reiser4 kernel module you do have to
63864+recompile your kernel, most of the time. The errors you get will be
63865+quite cryptic if you forget to do so.
63866+
63867+Hideous Commercial Pitch: Spread your development costs across other OS
63868+vendors. Select from the best in the world, not the best in your
63869+building, by buying from third party OS component suppliers. Leverage
63870+the software component development power of the internet. Be the most
63871+aggressive in taking advantage of the commercial possibilities of
63872+decentralized internet development, and add value through your branded
63873+integration that you sell as an operating system. Let your competitors
63874+be the ones to compete against the entire internet by themselves. Be
63875+hip, get with the new economic trend, before your competitors do. Send
63876+email to reiser@namesys.com
63877+
63878+Hans Reiser was the primary architect of Reiser4, but a whole team
63879+chipped their ideas in. He invested everything he had into Namesys
63880+for 5.5 dark years of no money before Reiser3 finally started to work well
63881+enough to bring in money. He owns the copyright.
63882+
63883+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
63884+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
63885+opinion, unique in its willingness to invest into things more
63886+theoretical than the VC community can readily understand, and more
63887+longterm than allows them to be sure that they will be the ones to
63888+extract the economic benefits from. DARPA also integrated us into a
63889+security community that transformed our security worldview.
63890+
63891+Vladimir Saveliev is our lead programmer, with us from the beginning,
63892+and he worked long hours writing the cleanest code. This is why he is
63893+now the lead programmer after years of commitment to our work. He
63894+always made the effort to be the best he could be, and to make his
63895+code the best that it could be. What resulted was quite remarkable. I
63896+don't think that money can ever motivate someone to work the way he
63897+did, he is one of the most selfless men I know.
63898+
63899+Alexander Lyamin was our sysadmin, and helped to educate us in
63900+security issues. Moscow State University and IMT were very generous
63901+in the internet access they provided us, and in lots of other little
63902+ways that a generous institution can be.
63903+
63904+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
63905+locking code, the block allocator, and finished the flushing code.
63906+His code is always crystal clean and well structured.
63907+
63908+Nikita Danilov wrote the core of the balancing code, the core of the
63909+plugins code, and the directory code. He worked a steady pace of long
63910+hours that produced a whole lot of well abstracted code. He is our
63911+senior computer scientist.
63912+
63913+Vladimir Demidov wrote the parser. Writing an in-kernel parser is
63914+something very few persons have the skills for, and it is thanks to
63915+him that we can say that the parser is really not so big compared to
63916+various bits of our other code, and making a parser work in the kernel
63917+was not so complicated as everyone would imagine mainly because it was
63918+him doing it...
63919+
63920+Joshua McDonald wrote the transaction manager, and the flush code.
63921+The flush code unexpectedly turned out to be extremely hairy for reasons
63922+you can read about on our web page, and he did a great job on an
63923+extremely difficult task.
63924+
63925+Nina Reiser handled our accounting, government relations, and much
63926+more.
63927+
63928+Ramon Reiser developed our website.
63929+
63930+Beverly Palmer drew our graphics.
63931+
63932+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
63933+and worked with Umka on developing libreiser4 and userspace plugins.
63934+
63935+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
63936+userspace tools (reiser4progs).
63937+
63938+Oleg Drokin (aka Green) is the release manager who fixes everything.
63939+It is so nice to have someone like that on the team. He (plus Chris
63940+and Jeff) make it possible for the entire rest of the Namesys team to
63941+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
63942+is just amazing to watch his talent for spotting bugs in action.
63943+
63944diff -urN linux-2.6.20.orig/fs/reiser4/reiser4.h linux-2.6.20/fs/reiser4/reiser4.h
63945--- linux-2.6.20.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
63946+++ linux-2.6.20/fs/reiser4/reiser4.h 2007-05-06 14:50:43.867028218 +0400
63947@@ -0,0 +1,269 @@
63948+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63949+ * reiser4/README */
63950+
63951+/* definitions of common constants used by reiser4 */
63952+
63953+#if !defined( __REISER4_H__ )
63954+#define __REISER4_H__
63955+
63956+#include <asm/param.h> /* for HZ */
63957+#include <linux/errno.h>
63958+#include <linux/types.h>
63959+#include <linux/fs.h>
63960+#include <linux/hardirq.h>
63961+#include <linux/sched.h>
63962+
63963+/*
63964+ * reiser4 compilation options.
63965+ */
63966+
63967+#if defined(CONFIG_REISER4_DEBUG)
63968+/* turn on assertion checks */
63969+#define REISER4_DEBUG (1)
63970+#else
63971+#define REISER4_DEBUG (0)
63972+#endif
63973+
63974+#if defined(CONFIG_ZLIB_INFLATE)
63975+/* turn on zlib */
63976+#define REISER4_ZLIB (1)
63977+#else
63978+#define REISER4_ZLIB (0)
63979+#endif
63980+
63981+#if defined(CONFIG_CRYPTO_SHA256)
63982+#define REISER4_SHA256 (1)
63983+#else
63984+#define REISER4_SHA256 (0)
63985+#endif
63986+
63987+/*
63988+ * Turn on large keys mode. In this mode (which is the default), a reiser4
63989+ * key has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
63990+ * components. The additional component, referred to as "ordering", is used
63991+ * to order the items of which a given object is composed. As such, ordering
63992+ * is placed between locality and objectid. For a directory item, ordering
63993+ * contains an initial prefix of the file name this item is for. This sorts
63994+ * all directory items within a given directory lexicographically (but see
63995+ * fibration.[ch]). For file bodies and stat-data, ordering contains an
63996+ * initial prefix of the name the file was initially created with. In the
63997+ * common case (files with a single name) this orders file bodies and
63998+ * stat-data in the same order as their respective directory entries, thus
63999+ * speeding up readdir.
64000+ *
64001+ * Note that the kernel can only mount a file system with the same key size
64002+ * as the one it is compiled for, so flipping this option may render your
64003+ * data inaccessible.
64004+ */
64005+#define REISER4_LARGE_KEY (1)
64006+/*#define REISER4_LARGE_KEY (0)*/
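For reference, the key component layout implied by the comment above, one 8-byte word per component:

/*
 * large keys (1): | locality | ordering | objectid | offset |
 * small keys (0): | locality | objectid | offset |
 */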
64007+
64008+/*#define GUESS_EXISTS 1*/
64009+
64010+/*
64011+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
64012+ * option
64013+ */
64014+
64015+extern const char *REISER4_SUPER_MAGIC_STRING;
64016+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
64017+ * beginning of device */
64018+
64019+/* here go tunable parameters that are not worth special entry in kernel
64020+ configuration */
64021+
64022+/* default number of slots in coord-by-key caches */
64023+#define CBK_CACHE_SLOTS (16)
64024+/* how many elementary tree operation to carry on the next level */
64025+#define CARRIES_POOL_SIZE (5)
64026+/* size of pool of preallocated nodes for carry process. */
64027+#define NODES_LOCKED_POOL_SIZE (5)
64028+
64029+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64030+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64031+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
64032+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
64033+
64034+/* we are supporting reservation of disk space on uid basis */
64035+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
64036+/* we are supporting reservation of disk space for groups */
64037+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
64038+/* we are supporting reservation of disk space for root */
64039+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
64040+/* we use rapid flush mode, see flush.c for comments. */
64041+#define REISER4_USE_RAPID_FLUSH (1)
64042+
64043+/*
64044+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
64045+ */
64046+#define REISER4_USE_ENTD (1)
64047+
64048+/* key allocation is Plan-A */
64049+#define REISER4_PLANA_KEY_ALLOCATION (1)
64050+/* key allocation follows good old 3.x scheme */
64051+#define REISER4_3_5_KEY_ALLOCATION (0)
64052+
64053+/* size of hash-table for znodes */
64054+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64055+
64056+/* number of buckets in lnode hash-table */
64057+#define LNODE_HTABLE_BUCKETS (1024)
64058+
64059+/* some ridiculously high maximal limit on the height of the znode tree. This
64060+ is used in the declaration of various per-level arrays and
64061+ to allocate the statistics gathering array for per-level stats. */
64062+#define REISER4_MAX_ZTREE_HEIGHT (8)
64063+
64064+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64065+
64066+/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
64067+ sequential search is on average faster than binary search. This is because
64068+ of better optimization and because sequential search is more CPU
64069+ cache friendly. This number (25) was found by experiments on dual AMD
64070+ Athlon(tm), 1400MHz.
64071+
64072+ NOTE: testing in kernel has shown that binary search is more effective than
64073+ implied by results of the user level benchmarking. Probably because in the
64074+ node keys are separated by other data. So the value was adjusted after a few
64075+ tests. More thorough tuning is needed.
64076+*/
64077+#define REISER4_SEQ_SEARCH_BREAK (3)
64078+
64079+/* don't allow tree to be lower than this */
64080+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
64081+
64082+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64083+ * available memory. */
64084+/* Default value of maximal atom size. Can be overwritten by the
64085+ tmgr.atom_max_size mount option. By default infinity. */
64086+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
64087+
64088+/* Default value of maximal atom age (in jiffies). After reaching this age
64089+ atom will be forced to commit, either synchronously or asynchronously. Can
64090+ be overwritten by tmgr.atom_max_age mount option. */
64091+#define REISER4_ATOM_MAX_AGE (600 * HZ)
64092+
64093+/* sleeping period for ktxnmrgd */
64094+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
64095+
64096+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
64097+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64098+
64099+/* start complaining after that many restarts in coord_by_key().
64100+
64101+ This either means incredibly heavy contention for this part of a tree, or
64102+ some corruption or bug.
64103+*/
64104+#define REISER4_CBK_ITERATIONS_LIMIT (100)
64105+
64106+/* return -EIO after that many iterations in coord_by_key().
64107+
64108+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
64109+ finished. --nikita
64110+*/
64111+#define REISER4_MAX_CBK_ITERATIONS 500000
64112+
64113+/* put a per-inode limit on maximal number of directory entries with identical
64114+ keys in hashed directory.
64115+
64116+ Disable this until inheritance interfaces stabilize: we need some way to
64117+ set per directory limit.
64118+*/
64119+#define REISER4_USE_COLLISION_LIMIT (0)
64120+
64121+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64122+ will force them to be relocated. */
64123+#define FLUSH_RELOCATE_THRESHOLD 64
64124+/* If flush can find a block allocation within FLUSH_RELOCATE_DISTANCE
64125+ of the preceder it will relocate to that position. */
64126+#define FLUSH_RELOCATE_DISTANCE 64
64127+
64128+/* If we have written this much or more blocks before encountering busy jnode
64129+ in flush list - abort flushing hoping that next time we get called
64130+ this jnode will be clean already, and we will save some seeks. */
64131+#define FLUSH_WRITTEN_THRESHOLD 50
64132+
64133+/* The maximum number of nodes to scan left on a level during flush. */
64134+#define FLUSH_SCAN_MAXNODES 10000
64135+
64136+/* per-atom limit of flushers */
64137+#define ATOM_MAX_FLUSHERS (1)
64138+
64139+/* default tracing buffer size */
64140+#define REISER4_TRACE_BUF_SIZE (1 << 15)
64141+
64142+/* what size units of IO we would like cp, etc., to use, in writing to
64143+ reiser4. In bytes.
64144+
64145+ Can be overwritten by optimal_io_size mount option.
64146+*/
64147+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64148+
64149+/* see comments in inode.c:oid_to_uino() */
64150+#define REISER4_UINO_SHIFT (1 << 30)
64151+
64152+/* Mark function argument as unused to avoid compiler warnings. */
64153+#define UNUSED_ARG __attribute__((unused))
64154+
64155+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64156+#define NONNULL __attribute__((nonnull))
64157+#else
64158+#define NONNULL
64159+#endif
64160+
64161+/* master super block offset in bytes.*/
64162+#define REISER4_MASTER_OFFSET 65536
64163+
64164+/* size of VFS block */
64165+#define VFS_BLKSIZE 512
64166+/* number of bits in size of VFS block (512==2^9) */
64167+#define VFS_BLKSIZE_BITS 9
64168+
64169+#define REISER4_I reiser4_inode_data
64170+
64171+/* implication */
64172+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64173+/* logical equivalence */
64174+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
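A brief illustration of how these predicates read inside assertions (an editorial sketch; the labels and conditions below are invented):

static void demo_ergo_equi(const char *buf, int len, int is_empty)
{
	/* "a non-empty buffer implies a non-NULL pointer" */
	assert("demo-1", ergo(len > 0, buf != NULL));
	/* "emptiness is logically equivalent to zero length" */
	assert("demo-2", equi(len == 0, is_empty));
}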
64175+
64176+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64177+
64178+#define NOT_YET (0)
64179+
64180+/** Reiser4 specific error codes **/
64181+
64182+#define REISER4_ERROR_CODE_BASE 500
64183+
64184+/* Neighbor is not available (side neighbor or parent) */
64185+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
64186+
64187+/* Node was not found in cache */
64188+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64189+
64190+/* node has no free space enough for completion of balancing operation */
64191+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
64192+
64193+/* repeat operation */
64194+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
64195+
64196+/* deadlock happens */
64197+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
64198+
64199+/* operation cannot be performed, because it would block and non-blocking mode
64200+ * was requested. */
64201+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
64202+
64203+/* wait some event (depends on context), then repeat */
64204+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
64205+
64206+#endif /* __REISER4_H__ */
64207+
64208+/* Make Linus happy.
64209+ Local variables:
64210+ c-indentation-style: "K&R"
64211+ mode-name: "LC"
64212+ c-basic-offset: 8
64213+ tab-width: 8
64214+ fill-column: 120
64215+ End:
64216+*/
64217diff -urN linux-2.6.20.orig/fs/reiser4/safe_link.c linux-2.6.20/fs/reiser4/safe_link.c
64218--- linux-2.6.20.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
64219+++ linux-2.6.20/fs/reiser4/safe_link.c 2007-05-06 14:50:43.867028218 +0400
64220@@ -0,0 +1,351 @@
64221+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64222+ * reiser4/README */
64223+
64224+/* Safe-links. */
64225+
64226+/*
64227+ * Safe-links are used to maintain file system consistency during operations
64228+ * that spawns multiple transactions. For example:
64229+ *
64230+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
64231+ * without user-visible names in the file system, but still opened by some
64232+ * active process. What happens here is that unlink proper (i.e., removal
64233+ * of the last file name) and file deletion (truncate of file body to zero
64234+ * and deletion of stat-data, that happens when last file descriptor is
64235+ * closed), may belong to different transactions T1 and T2. If a crash
64236+ * happens after T1 commit, but before T2 commit, on-disk file system has
64237+ * a file without name, that is, disk space leak.
64238+ *
64239+ * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
64240+ * the system crashes while truncate was in progress, the file is left
64241+ * partially truncated, which violates the "atomicity guarantees" of reiser4,
64242+ * viz. that every system call is atomic.
64243+ *
64244+ * Safe-links address both of the above cases. Basically, a safe-link is a
64245+ * way to post some operation to be executed during the commit of some
64246+ * transaction other than the current one. (Another way to look at a
64247+ * safe-link is to interpret it as a form of logical logging.)
64248+ *
64249+ * Specifically, at the beginning of unlink a safe-link is inserted into the
64250+ * tree. This safe-link is normally removed by file deletion code (during
64251+ * transaction T2 in the above terms). Truncate also inserts a safe-link that
64252+ * is normally removed when the truncate operation is finished.
64253+ *
64254+ * This means that in the case of a "clean umount" there are no safe-links in
64255+ * the tree. If safe-links are observed during mount, it means that (a) the
64256+ * system was terminated abnormally, and (b) the safe-links correspond to
64257+ * "pending" (i.e., not finished) operations that were in progress during
64258+ * system termination. Each safe-link records enough information to complete
64259+ * the corresponding operation, and mount simply "replays" them (hence the
64260+ * analogy with logical logging).
64261+ *
64262+ * Safe-links are implemented as blackbox items (see
64263+ * plugin/item/blackbox.[ch]).
64264+ *
64265+ * For the reference: ext3 also has similar mechanism, it's called "an orphan
64266+ * list" there.
64267+ */
64268+
64269+#include "safe_link.h"
64270+#include "debug.h"
64271+#include "inode.h"
64272+
64273+#include "plugin/item/blackbox.h"
64274+
64275+#include <linux/fs.h>
64276+
64277+/*
64278+ * On-disk format of safe-link.
64279+ */
64280+typedef struct safelink {
64281+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
64282+ * for */
64283+ d64 size; /* size to which file should be truncated */
64284+} safelink_t;
64285+
64286+/*
64287+ * locality where safe-link items are stored. Next to the objectid of root
64288+ * directory.
64289+ */
64290+static oid_t safe_link_locality(reiser4_tree * tree)
64291+{
64292+ return get_key_objectid(get_super_private(tree->super)->df_plug->
64293+ root_dir_key(tree->super)) + 1;
64294+}
64295+
64296+/*
64297+ Construct a key for the safe-link. Key has the following format:
64298+
64299+| 60 | 4 | 64 | 4 | 60 | 64 |
64300++---------------+---+------------------+---+---------------+------------------+
64301+| locality | 0 | 0 | 0 | objectid | link type |
64302++---------------+---+------------------+---+---------------+------------------+
64303+| | | | |
64304+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64305+
64306+ This is the large keys format. In the small keys format the second 8-byte
64307+ chunk is absent. Locality is a constant returned by safe_link_locality().
64308+ objectid is the oid of the file on which the operation protected by this
64309+ safe-link is performed. link-type is used to distinguish safe-links for
64310+ different operations.
64311+
64312+ */
64313+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64314+ reiser4_safe_link_t link, reiser4_key * key)
64315+{
64316+ reiser4_key_init(key);
64317+ set_key_locality(key, safe_link_locality(tree));
64318+ set_key_objectid(key, oid);
64319+ set_key_offset(key, link);
64320+ return key;
64321+}
64322+
64323+/*
64324+ * how much disk space is necessary to insert and remove (in the
64325+ * error-handling path) safe-link.
64326+ */
64327+static __u64 safe_link_tograb(reiser4_tree * tree)
64328+{
64329+ return
64330+ /* insert safe link */
64331+ estimate_one_insert_item(tree) +
64332+ /* remove safe link */
64333+ estimate_one_item_removal(tree) +
64334+ /* drill to the leaf level during insertion */
64335+ 1 + estimate_one_insert_item(tree) +
64336+ /*
64337+ * possible update of existing safe-link. Actually, if
64338+ * safe-link existed already (we failed to remove it), then no
64339+ * insertion is necessary, so this term is already "covered",
64340+ * but for simplicity let's leave it in.
64341+ */
64342+ 1;
64343+}
64344+
64345+/*
64346+ * grab enough disk space to insert and remove (in the error-handling path)
64347+ * safe-link.
64348+ */
64349+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64350+{
64351+ int result;
64352+
64353+ grab_space_enable();
64354+ /* The sbinfo->delete_mutex can be taken here.
64355+ * safe_link_release() should be called before leaving reiser4
64356+ * context. */
64357+ result =
64358+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64359+ grab_space_enable();
64360+ return result;
64361+}
64362+
64363+/*
64364+ * release unused disk space reserved by safe_link_grab().
64365+ */
64366+void safe_link_release(reiser4_tree * tree)
64367+{
64368+ reiser4_release_reserved(tree->super);
64369+}
64370+
64371+/*
64372+ * insert into tree safe-link for operation @link on inode @inode.
64373+ */
64374+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64375+{
64376+ reiser4_key key;
64377+ safelink_t sl;
64378+ int length;
64379+ int result;
64380+ reiser4_tree *tree;
64381+
64382+ build_sd_key(inode, &sl.sdkey);
64383+ length = sizeof sl.sdkey;
64384+
64385+ if (link == SAFE_TRUNCATE) {
64386+ /*
64387+ * for truncate we also have to store the final file length,
64388+ * so expand the item.
64389+ */
64390+ length += sizeof(sl.size);
64391+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64392+ }
64393+ tree = reiser4_tree_by_inode(inode);
64394+ build_link_key(tree, get_inode_oid(inode), link, &key);
64395+
64396+ result = store_black_box(tree, &key, &sl, length);
64397+ if (result == -EEXIST)
64398+ result = update_black_box(tree, &key, &sl, length);
64399+ return result;
64400+}
64401+
64402+/*
64403+ * remove safe-link corresponding to the operation @link on the object with
64404+ * oid @oid from the tree.
64405+ */
64406+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64407+{
64408+ reiser4_key key;
64409+
64410+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64411+}
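The calling pattern these functions imply, sketched for the truncate case (an editorial illustration: the wrapper is invented, error handling is abbreviated, and the truncate proper is elided):

static int truncate_with_safe_link(struct inode *inode)
{
	reiser4_tree *tree = reiser4_tree_by_inode(inode);
	int result;

	/* transaction T1: reserve space and insert the safe-link */
	result = safe_link_grab(tree, BA_CAN_COMMIT);
	if (result == 0)
		result = safe_link_add(inode, SAFE_TRUNCATE);
	safe_link_release(tree);
	if (result != 0)
		return result;

	/* ... the multi-transaction truncate proper would go here ... */

	/* final transaction: the operation is complete, remove the link */
	result = safe_link_grab(tree, BA_CAN_COMMIT);
	if (result == 0)
		result = safe_link_del(tree, get_inode_oid(inode),
				       SAFE_TRUNCATE);
	safe_link_release(tree);
	return result;
}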
64412+
64413+/*
64414+ * in-memory structure to keep information extracted from safe-link. This is
64415+ * used to iterate over all safe-links.
64416+ */
64417+typedef struct {
64418+ reiser4_tree *tree; /* internal tree */
64419+ reiser4_key key; /* safe-link key */
64420+ reiser4_key sdkey; /* key of object stat-data */
64421+ reiser4_safe_link_t link; /* safe-link type */
64422+ oid_t oid; /* object oid */
64423+ __u64 size; /* final size for truncate */
64424+} safe_link_context;
64425+
64426+/*
64427+ * start iterating over all safe-links.
64428+ */
64429+static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64430+{
64431+ ctx->tree = tree;
64432+ reiser4_key_init(&ctx->key);
64433+ set_key_locality(&ctx->key, safe_link_locality(tree));
64434+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
64435+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
64436+}
64437+
64438+/*
64439+ * return next safe-link.
64440+ */
64441+static int safe_link_iter_next(safe_link_context * ctx)
64442+{
64443+ int result;
64444+ safelink_t sl;
64445+
64446+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64447+ if (result == 0) {
64448+ ctx->oid = get_key_objectid(&ctx->key);
64449+ ctx->link = get_key_offset(&ctx->key);
64450+ ctx->sdkey = sl.sdkey;
64451+ if (ctx->link == SAFE_TRUNCATE)
64452+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64453+ }
64454+ return result;
64455+}
64456+
64457+/*
64458+ * check whether there are any more safe-links left in the tree.
64459+ */
64460+static int safe_link_iter_finished(safe_link_context * ctx)
64461+{
64462+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64463+}
64464+
64465+/*
64466+ * finish safe-link iteration.
64467+ */
64468+static void safe_link_iter_end(safe_link_context * ctx)
64469+{
64470+ /* nothing special */
64471+}
64472+
64473+/*
64474+ * process single safe-link.
64475+ */
64476+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64477+ reiser4_key * sdkey, oid_t oid, __u64 size)
64478+{
64479+ struct inode *inode;
64480+ int result;
64481+
64482+ /*
64483+ * obtain object inode by reiser4_iget(), then call object plugin
64484+ * ->safelink() method to do actual work, then delete safe-link on
64485+ * success.
64486+ */
64487+ inode = reiser4_iget(super, sdkey, 1);
64488+ if (!IS_ERR(inode)) {
64489+ file_plugin *fplug;
64490+
64491+ fplug = inode_file_plugin(inode);
64492+ assert("nikita-3428", fplug != NULL);
64493+ assert("", oid == get_inode_oid(inode));
64494+ if (fplug->safelink != NULL) {
64495+ /* reiser4_txn_restart_current is not necessary because
64496+ * mounting is single-threaded. However, without it the
64497+ * deadlock detection code will complain (see
64498+ * nikita-3361). */
64499+ reiser4_txn_restart_current();
64500+ result = fplug->safelink(inode, link, size);
64501+ } else {
64502+ warning("nikita-3430",
64503+ "Cannot handle safelink for %lli",
64504+ (unsigned long long)oid);
64505+ reiser4_print_key("key", sdkey);
64506+ result = 0;
64507+ }
64508+ if (result != 0) {
64509+ warning("nikita-3431",
64510+ "Error processing safelink for %lli: %i",
64511+ (unsigned long long)oid, result);
64512+ }
64513+ reiser4_iget_complete(inode);
64514+ iput(inode);
64515+ if (result == 0) {
64516+ result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
64517+ if (result == 0)
64518+ result =
64519+ safe_link_del(reiser4_get_tree(super), oid, link);
64520+ safe_link_release(reiser4_get_tree(super));
64521+ /*
64522+ * restart transaction: if there was a large number of
64523+ * safe-links, their processing may fail to fit into a
64524+ * single transaction.
64525+ */
64526+ if (result == 0)
64527+ reiser4_txn_restart_current();
64528+ }
64529+ } else
64530+ result = PTR_ERR(inode);
64531+ return result;
64532+}
64533+
64534+/*
64535+ * iterate over all safe-links in the file-system processing them one by one.
64536+ */
64537+int process_safelinks(struct super_block *super)
64538+{
64539+ safe_link_context ctx;
64540+ int result;
64541+
64542+ if (rofs_super(super))
64543+ /* do nothing on the read-only file system */
64544+ return 0;
64545+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64546+ result = 0;
64547+ do {
64548+ result = safe_link_iter_next(&ctx);
64549+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64550+ result = 0;
64551+ break;
64552+ }
64553+ if (result == 0)
64554+ result = process_safelink(super, ctx.link,
64555+ &ctx.sdkey, ctx.oid,
64556+ ctx.size);
64557+ } while (result == 0);
64558+ safe_link_iter_end(&ctx);
64559+ return result;
64560+}
64561+
64562+/* Make Linus happy.
64563+ Local variables:
64564+ c-indentation-style: "K&R"
64565+ mode-name: "LC"
64566+ c-basic-offset: 8
64567+ tab-width: 8
64568+ fill-column: 120
64569+ scroll-step: 1
64570+ End:
64571+*/
64572diff -urN linux-2.6.20.orig/fs/reiser4/safe_link.h linux-2.6.20/fs/reiser4/safe_link.h
64573--- linux-2.6.20.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
64574+++ linux-2.6.20/fs/reiser4/safe_link.h 2007-05-06 14:50:43.867028218 +0400
64575@@ -0,0 +1,29 @@
64576+/* Copyright 2003 by Hans Reiser, licensing governed by
64577+ * reiser4/README */
64578+
64579+/* Safe-links. See safe_link.c for details. */
64580+
64581+#if !defined( __FS_SAFE_LINK_H__ )
64582+#define __FS_SAFE_LINK_H__
64583+
64584+#include "tree.h"
64585+
64586+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64587+void safe_link_release(reiser4_tree * tree);
64588+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64589+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64590+
64591+int process_safelinks(struct super_block *super);
64592+
64593+/* __FS_SAFE_LINK_H__ */
64594+#endif
64595+
64596+/* Make Linus happy.
64597+ Local variables:
64598+ c-indentation-style: "K&R"
64599+ mode-name: "LC"
64600+ c-basic-offset: 8
64601+ tab-width: 8
64602+ fill-column: 120
64603+ End:
64604+*/
64605diff -urN linux-2.6.20.orig/fs/reiser4/seal.c linux-2.6.20/fs/reiser4/seal.c
64606--- linux-2.6.20.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
64607+++ linux-2.6.20/fs/reiser4/seal.c 2007-05-06 14:50:43.871029467 +0400
64608@@ -0,0 +1,218 @@
64609+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64610+/* Seals implementation. */
64611+/* Seals are "weak" tree pointers. They are analogous to tree coords in
64612+ allowing tree traversal to be bypassed. But normal usage of coords implies
64613+ that the node pointed to by a coord is locked, whereas seals don't keep a
64614+ lock (or even a reference) to the znode. Instead, each znode contains a
64615+ version number, increased on each znode modification. This version number is
64616+ copied into a seal when the seal is created. Later, one can "validate" a seal by calling
64617+ reiser4_seal_validate(). If znode is in cache and its version number is
64618+ still the same, seal is "pristine" and coord associated with it can be
64619+ re-used immediately.
64620+
64621+ If, on the other hand, znode is out of cache, or it is obviously different
64622+ one from the znode seal was initially attached to (for example, it is on
64623+ the different level, or is being removed from the tree), seal is
64624+ irreparably invalid ("burned") and tree traversal has to be repeated.
64625+
64626+ Otherwise, there is some hope, that while znode was modified (and seal was
64627+ "broken" as a result), key attached to the seal is still in the node. This
64628+ is checked by first comparing this key with delimiting keys of node and, if
64629+ key is ok, doing intra-node lookup.
64630+
64631+ Znode version is maintained in the following way:
64632+
64633+ there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
64634+ znode_epoch is incremented and its new value is stored in ->version field
64635+ of new znode. Whenever znode is dirtied (which means it was probably
64636+ modified), znode_epoch is also incremented and its new value is stored in
64637+ znode->version. This is done because just incrementing znode->version
64638+ on each update is not enough: it may so happen that a znode gets deleted, a
64639+ new znode is allocated for the same disk block and gets the same version
64640+ counter, tricking the seal code into a false positive.
64641+*/
64642+
64643+#include "forward.h"
64644+#include "debug.h"
64645+#include "key.h"
64646+#include "coord.h"
64647+#include "seal.h"
64648+#include "plugin/item/item.h"
64649+#include "plugin/node/node.h"
64650+#include "jnode.h"
64651+#include "znode.h"
64652+#include "super.h"
64653+
64654+static znode *seal_node(const seal_t * seal);
64655+static int seal_matches(const seal_t * seal, znode * node);
64656+
64657+/* initialise seal. This can be called several times on the same seal. @coord
64658+ and @key can be NULL. */
64659+void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
64660+ const coord_t * coord /* coord @seal will be
64661+ * attached to */ ,
64662+ const reiser4_key * key UNUSED_ARG /* key @seal will be
64663+ * attached to */ )
64664+{
64665+ assert("nikita-1886", seal != NULL);
64666+ memset(seal, 0, sizeof *seal);
64667+ if (coord != NULL) {
64668+ znode *node;
64669+
64670+ node = coord->node;
64671+ assert("nikita-1987", node != NULL);
64672+ spin_lock_znode(node);
64673+ seal->version = node->version;
64674+ assert("nikita-1988", seal->version != 0);
64675+ seal->block = *znode_get_block(node);
64676+#if REISER4_DEBUG
64677+ seal->coord1 = *coord;
64678+ if (key != NULL)
64679+ seal->key = *key;
64680+#endif
64681+ spin_unlock_znode(node);
64682+ }
64683+}
64684+
64685+/* finish with seal */
64686+void reiser4_seal_done(seal_t * seal /* seal to clear */ )
64687+{
64688+ assert("nikita-1887", seal != NULL);
64689+ seal->version = 0;
64690+}
64691+
64692+/* true if seal was initialised */
64693+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
64694+{
64695+ assert("nikita-1890", seal != NULL);
64696+ return seal->version != 0;
64697+}
64698+
64699+#if REISER4_DEBUG
64700+/* helper function for reiser4_seal_validate(). It checks that item at @coord
64701+ * has expected key. This is to detect cases where node was modified but wasn't
64702+ * marked dirty. */
64703+static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
64704+ const reiser4_key * k /* expected key */ )
64705+{
64706+ reiser4_key ukey;
64707+
64708+ return (coord->between != AT_UNIT) ||
64709+ /* FIXME-VS: we only can compare keys for items whose units
64710+ represent exactly one key */
64711+ ((coord_is_existing_unit(coord))
64712+ && (item_is_extent(coord)
64713+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
64714+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
64715+ && keyge(k, unit_key_by_coord(coord, &ukey)));
64716+}
64717+#endif
64718+
64719+/* this is used by reiser4_seal_validate. It accepts return value of
64720+ * longterm_lock_znode and returns 1 if it can be interpreted as seal
64721+ * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
64722+ * reiser4_seal_validate returns -E_REPEAT and the caller will redo the tree search.
64723+ * We cannot do this in longterm_lock_znode(), because sometimes we want to
64724+ * distinguish between -EINVAL and -E_REPEAT. */
64725+static int should_repeat(int return_code)
64726+{
64727+ return return_code == -EINVAL;
64728+}
64729+
64730+/* (re-)validate seal.
64731+
64732+ Checks whether the seal is pristine, and tries to revalidate it if possible.
64733+
64734+ If seal was burned, or broken irreparably, return -E_REPEAT.
64735+
64736+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key we
64737+ are looking for is in the range of keys covered by the sealed node, but the
64738+ item wasn't found by the node's ->lookup() method. An alternative is to
64739+ return -ENOENT in this case, but this would complicate the callers' logic.
64740+
64741+*/
64742+int reiser4_seal_validate(seal_t * seal /* seal to validate */,
64743+ coord_t * coord /* coord to validate against */,
64744+ const reiser4_key * key /* key to validate against */,
64745+ lock_handle * lh /* resulting lock handle */,
64746+ znode_lock_mode mode /* lock node */,
64747+ znode_lock_request request /* locking priority */)
64748+{
64749+ znode *node;
64750+ int result;
64751+
64752+ assert("nikita-1889", seal != NULL);
64753+ assert("nikita-1881", reiser4_seal_is_set(seal));
64754+ assert("nikita-1882", key != NULL);
64755+ assert("nikita-1883", coord != NULL);
64756+ assert("nikita-1884", lh != NULL);
64757+ assert("nikita-1885", keyeq(&seal->key, key));
64758+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
64759+
64760+ /* obtain znode by block number */
64761+ node = seal_node(seal);
64762+ if (node != NULL) {
64763+ /* znode was in cache, lock it */
64764+ result = longterm_lock_znode(lh, node, mode, request);
64765+ zput(node);
64766+ if (result == 0) {
64767+ if (seal_matches(seal, node)) {
64768+ /* if seal version and znode version
64769+ coincide */
64770+ ON_DEBUG(coord_update_v(coord));
64771+ assert("nikita-1990",
64772+ node == seal->coord1.node);
64773+ assert("nikita-1898",
64774+ WITH_DATA_RET(coord->node, 1,
64775+ check_seal_match(coord,
64776+ key)));
64777+ } else
64778+ result = RETERR(-E_REPEAT);
64779+ }
64780+ if (result != 0) {
64781+ if (should_repeat(result))
64782+ result = RETERR(-E_REPEAT);
64783+ /* unlock node on failure */
64784+ done_lh(lh);
64785+ }
64786+ } else {
64787+ /* znode wasn't in cache */
64788+ result = RETERR(-E_REPEAT);
64789+ }
64790+ return result;
64791+}
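A sketch of the intended fast-path/slow-path pattern around seals (an editorial illustration: the wrapper is invented, and it assumes the seal is already set; coord_by_key() and its constants come from search.c later in this patch):

static int lookup_with_seal(seal_t *seal, coord_t *coord,
			    const reiser4_key *key, lock_handle *lh,
			    znode_lock_mode mode, znode_lock_request request)
{
	int result;

	/* fast path: try to reuse the sealed coord without tree traversal */
	result = reiser4_seal_validate(seal, coord, key, lh, mode, request);
	if (result == -E_REPEAT) {
		/* seal was burned: fall back to a full top-down lookup */
		result = coord_by_key(current_tree, key, coord, lh, mode,
				      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
				      0, NULL);
		if (result == CBK_COORD_FOUND)
			/* re-arm the seal for the next access */
			reiser4_seal_init(seal, coord, key);
	}
	return result;
}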
64792+
64793+/* helpers functions */
64794+
64795+/* obtain reference to znode seal points to, if in cache */
64796+static znode *seal_node(const seal_t * seal /* seal to query */ )
64797+{
64798+ assert("nikita-1891", seal != NULL);
64799+ return zlook(current_tree, &seal->block);
64800+}
64801+
64802+/* true if @seal version and @node version coincide */
64803+static int seal_matches(const seal_t * seal /* seal to check */ ,
64804+ znode * node /* node to check */ )
64805+{
64806+ int result;
64807+
64808+ assert("nikita-1991", seal != NULL);
64809+ assert("nikita-1993", node != NULL);
64810+
64811+ spin_lock_znode(node);
64812+ result = (seal->version == node->version);
64813+ spin_unlock_znode(node);
64814+ return result;
64815+}
64816+
64817+/* Make Linus happy.
64818+ Local variables:
64819+ c-indentation-style: "K&R"
64820+ mode-name: "LC"
64821+ c-basic-offset: 8
64822+ tab-width: 8
64823+ fill-column: 120
64824+ scroll-step: 1
64825+ End:
64826+*/
64827diff -urN linux-2.6.20.orig/fs/reiser4/seal.h linux-2.6.20/fs/reiser4/seal.h
64828--- linux-2.6.20.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
64829+++ linux-2.6.20/fs/reiser4/seal.h 2007-05-06 14:50:43.871029467 +0400
64830@@ -0,0 +1,49 @@
64831+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64832+
64833+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
64834+
64835+#ifndef __SEAL_H__
64836+#define __SEAL_H__
64837+
64838+#include "forward.h"
64839+#include "debug.h"
64840+#include "dformat.h"
64841+#include "key.h"
64842+#include "coord.h"
64843+
64844+/* for __u?? types */
64845+/*#include <linux/types.h>*/
64846+
64847+/* seal. See comment at the top of seal.c */
64848+typedef struct seal_s {
64849+ /* version of znode recorder at the time of seal creation */
64850+ __u64 version;
64851+ /* block number of znode attached to this seal */
64852+ reiser4_block_nr block;
64853+#if REISER4_DEBUG
64854+ /* coord this seal is attached to. For debugging. */
64855+ coord_t coord1;
64856+ /* key this seal is attached to. For debugging. */
64857+ reiser4_key key;
64858+#endif
64859+} seal_t;
64860+
64861+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
64862+extern void reiser4_seal_done(seal_t *);
64863+extern int reiser4_seal_is_set(const seal_t *);
64864+extern int reiser4_seal_validate(seal_t *, coord_t *,
64865+ const reiser4_key *, lock_handle *,
64866+ znode_lock_mode mode, znode_lock_request request);
64867+
64868+/* __SEAL_H__ */
64869+#endif
64870+
64871+/* Make Linus happy.
64872+ Local variables:
64873+ c-indentation-style: "K&R"
64874+ mode-name: "LC"
64875+ c-basic-offset: 8
64876+ tab-width: 8
64877+ fill-column: 120
64878+ End:
64879+*/
64880diff -urN linux-2.6.20.orig/fs/reiser4/search.c linux-2.6.20/fs/reiser4/search.c
64881--- linux-2.6.20.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
64882+++ linux-2.6.20/fs/reiser4/search.c 2007-05-06 14:50:43.871029467 +0400
64883@@ -0,0 +1,1611 @@
64884+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64885+ * reiser4/README */
64886+
64887+#include "forward.h"
64888+#include "debug.h"
64889+#include "dformat.h"
64890+#include "key.h"
64891+#include "coord.h"
64892+#include "seal.h"
64893+#include "plugin/item/item.h"
64894+#include "plugin/node/node.h"
64895+#include "plugin/plugin.h"
64896+#include "jnode.h"
64897+#include "znode.h"
64898+#include "block_alloc.h"
64899+#include "tree_walk.h"
64900+#include "tree.h"
64901+#include "reiser4.h"
64902+#include "super.h"
64903+#include "inode.h"
64904+
64905+#include <linux/slab.h>
64906+
64907+static const char *bias_name(lookup_bias bias);
64908+
64909+/* tree searching algorithm, intranode searching algorithms are in
64910+ plugin/node/ */
64911+
64912+/* tree lookup cache
64913+ *
64914+ * The coord-by-key cache consists of a small list of recently accessed nodes
64915+ * maintained according to the LRU discipline. Before doing a real top-to-bottom
64916+ * tree traversal this cache is scanned for nodes that can contain the
64917+ * requested key.
64918+ *
64919+ * The efficiency of coord cache depends heavily on locality of reference for
64920+ * tree accesses. Our user level simulations show reasonably good hit ratios
64921+ * for coord cache under most loads so far.
64922+ */
64923+
64924+/* Initialise coord cache slot */
64925+static void cbk_cache_init_slot(cbk_cache_slot *slot)
64926+{
64927+ assert("nikita-345", slot != NULL);
64928+
64929+ INIT_LIST_HEAD(&slot->lru);
64930+ slot->node = NULL;
64931+}
64932+
64933+/* Initialize coord cache */
64934+int cbk_cache_init(cbk_cache *cache /* cache to init */ )
64935+{
64936+ int i;
64937+
64938+ assert("nikita-346", cache != NULL);
64939+
64940+ cache->slot =
64941+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
64942+ reiser4_ctx_gfp_mask_get());
64943+ if (cache->slot == NULL)
64944+ return RETERR(-ENOMEM);
64945+
64946+ INIT_LIST_HEAD(&cache->lru);
64947+ for (i = 0; i < cache->nr_slots; ++i) {
64948+ cbk_cache_init_slot(cache->slot + i);
64949+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
64950+ }
64951+ rwlock_init(&cache->guard);
64952+ return 0;
64953+}
64954+
64955+/* free cbk cache data */
64956+void cbk_cache_done(cbk_cache * cache /* cache to release */ )
64957+{
64958+ assert("nikita-2493", cache != NULL);
64959+ if (cache->slot != NULL) {
64960+ kfree(cache->slot);
64961+ cache->slot = NULL;
64962+ }
64963+}
64964+
64965+/* macro to iterate over all cbk cache slots */
64966+#define for_all_slots(cache, slot) \
64967+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
64968+ &(cache)->lru != &(slot)->lru; \
64969+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
64970+
64971+#if REISER4_DEBUG
64972+/* this function assures that [cbk-cache-invariant] invariant holds */
64973+static int cbk_cache_invariant(const cbk_cache *cache)
64974+{
64975+ cbk_cache_slot *slot;
64976+ int result;
64977+ int unused;
64978+
64979+ if (cache->nr_slots == 0)
64980+ return 1;
64981+
64982+ assert("nikita-2469", cache != NULL);
64983+ unused = 0;
64984+ result = 1;
64985+ read_lock(&((cbk_cache *)cache)->guard);
64986+ for_all_slots(cache, slot) {
64987+ /* in LRU first go all `used' slots followed by `unused' */
64988+ if (unused && (slot->node != NULL))
64989+ result = 0;
64990+ if (slot->node == NULL)
64991+ unused = 1;
64992+ else {
64993+ cbk_cache_slot *scan;
64994+
64995+ /* all cached nodes are different */
64996+ scan = slot;
64997+ while (result) {
64998+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
64999+ if (&cache->lru == &scan->lru)
65000+ break;
65001+ if (slot->node == scan->node)
65002+ result = 0;
65003+ }
65004+ }
65005+ if (!result)
65006+ break;
65007+ }
65008+ read_unlock(&((cbk_cache *)cache)->guard);
65009+ return result;
65010+}
65011+
65012+#endif
65013+
65014+/* Remove references, if any, to @node from coord cache */
65015+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
65016+ reiser4_tree * tree /* tree to remove node from */ )
65017+{
65018+ cbk_cache_slot *slot;
65019+ cbk_cache *cache;
65020+ int i;
65021+
65022+ assert("nikita-350", node != NULL);
65023+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
65024+
65025+ cache = &tree->cbk_cache;
65026+ assert("nikita-2470", cbk_cache_invariant(cache));
65027+
65028+ write_lock(&(cache->guard));
65029+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65030+ if (slot->node == node) {
65031+ list_move_tail(&slot->lru, &cache->lru);
65032+ slot->node = NULL;
65033+ break;
65034+ }
65035+ }
65036+ write_unlock(&(cache->guard));
65037+ assert("nikita-2471", cbk_cache_invariant(cache));
65038+}
65039+
65040+/* add information about "node" to the cbk-cache of its tree. This
65041+ can actually be an update of an existing slot in the cache. */
65042+static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65043+{
65044+ cbk_cache *cache;
65045+ cbk_cache_slot *slot;
65046+ int i;
65047+
65048+ assert("nikita-352", node != NULL);
65049+
65050+ cache = &znode_get_tree(node)->cbk_cache;
65051+ assert("nikita-2472", cbk_cache_invariant(cache));
65052+
65053+ if (cache->nr_slots == 0)
65054+ return;
65055+
65056+ write_lock(&(cache->guard));
65057+ /* find slot to update/add */
65058+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65059+ /* oops, this node is already in a cache */
65060+ if (slot->node == node)
65061+ break;
65062+ }
65063+ /* if all slots are used, reuse least recently used one */
65064+ if (i == cache->nr_slots) {
65065+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65066+ slot->node = (znode *) node;
65067+ }
65068+ list_move(&slot->lru, &cache->lru);
65069+ write_unlock(&(cache->guard));
65070+ assert("nikita-2473", cbk_cache_invariant(cache));
65071+}
65072+
65073+static int setup_delimiting_keys(cbk_handle * h);
65074+static lookup_result coord_by_handle(cbk_handle * handle);
65075+static lookup_result traverse_tree(cbk_handle * h);
65076+static int cbk_cache_search(cbk_handle * h);
65077+
65078+static level_lookup_result cbk_level_lookup(cbk_handle * h);
65079+static level_lookup_result cbk_node_lookup(cbk_handle * h);
65080+
65081+/* helper functions */
65082+
65083+static void update_stale_dk(reiser4_tree * tree, znode * node);
65084+
65085+/* release parent node during traversal */
65086+static void put_parent(cbk_handle * h);
65087+/* check consistency of fields */
65088+static int sanity_check(cbk_handle * h);
65089+/* release resources in handle */
65090+static void hput(cbk_handle * h);
65091+
65092+static level_lookup_result search_to_left(cbk_handle * h);
65093+
65094+/* pack numerous (numberous I should say) arguments of coord_by_key() into
65095+ * cbk_handle */
65096+static cbk_handle *cbk_pack(cbk_handle * handle,
65097+ reiser4_tree * tree,
65098+ const reiser4_key * key,
65099+ coord_t * coord,
65100+ lock_handle * active_lh,
65101+ lock_handle * parent_lh,
65102+ znode_lock_mode lock_mode,
65103+ lookup_bias bias,
65104+ tree_level lock_level,
65105+ tree_level stop_level,
65106+ __u32 flags, ra_info_t * info)
65107+{
65108+ memset(handle, 0, sizeof *handle);
65109+
65110+ handle->tree = tree;
65111+ handle->key = key;
65112+ handle->lock_mode = lock_mode;
65113+ handle->bias = bias;
65114+ handle->lock_level = lock_level;
65115+ handle->stop_level = stop_level;
65116+ handle->coord = coord;
65117+ /* set flags. See comment in tree.h:cbk_flags */
65118+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65119+
65120+ handle->active_lh = active_lh;
65121+ handle->parent_lh = parent_lh;
65122+ handle->ra_info = info;
65123+ return handle;
65124+}
65125+
65126+/* main tree lookup procedure
65127+
65128+   Check the coord cache. If the key we are looking for is not found there,
65129+   call cbk() to do the real tree traversal.
65130+
65131+ As we have extents on the twig level, @lock_level and @stop_level can
65132+ be different from LEAF_LEVEL and each other.
65133+
65134+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
65135+ long term locks) while calling this.
65136+*/
65137+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65138+ * in. Usually this tree is
65139+ * part of file-system
65140+ * super-block */ ,
65141+ const reiser4_key * key /* key to look for */ ,
65142+ coord_t * coord /* where to store found
65143+ * position in a tree. Fields
65144+ * in "coord" are only valid if
65145+ * coord_by_key() returned
65146+ * "CBK_COORD_FOUND" */ ,
65147+ lock_handle * lh, /* resulting lock handle */
65148+ znode_lock_mode lock_mode /* type of lookup we
65149+ * want on node. Pass
65150+ * ZNODE_READ_LOCK here
65151+ * if you only want to
65152+ * read item found and
65153+ * ZNODE_WRITE_LOCK if
65154+ * you want to modify
65155+ * it */ ,
65156+ lookup_bias bias /* what to return if coord
65157+ * with exactly the @key is
65158+ * not in the tree */ ,
65159+ tree_level lock_level /* tree level where to start
65160+ * taking @lock type of
65161+ * locks */ ,
65162+ tree_level stop_level /* tree level to stop. Pass
65163+ * LEAF_LEVEL or TWIG_LEVEL
65164+					   * here. Item being looked
65165+ * for has to be between
65166+ * @lock_level and
65167+ * @stop_level, inclusive */ ,
65168+ __u32 flags /* search flags */ ,
65169+ ra_info_t *
65170+ info
65171+ /* information about desired tree traversal readahead */
65172+ )
65173+{
65174+ cbk_handle handle;
65175+ lock_handle parent_lh;
65176+ lookup_result result;
65177+
65178+ init_lh(lh);
65179+ init_lh(&parent_lh);
65180+
65181+ assert("nikita-3023", reiser4_schedulable());
65182+
65183+ assert("nikita-353", tree != NULL);
65184+ assert("nikita-354", key != NULL);
65185+ assert("nikita-355", coord != NULL);
65186+ assert("nikita-356", (bias == FIND_EXACT)
65187+ || (bias == FIND_MAX_NOT_MORE_THAN));
65188+ assert("nikita-357", stop_level >= LEAF_LEVEL);
65189+ /* no locks can be held during tree traversal */
65190+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65191+
65192+ cbk_pack(&handle,
65193+ tree,
65194+ key,
65195+ coord,
65196+ lh,
65197+ &parent_lh,
65198+ lock_mode, bias, lock_level, stop_level, flags, info);
65199+
65200+ result = coord_by_handle(&handle);
65201+ assert("nikita-3247",
65202+ ergo(!IS_CBKERR(result), coord->node == lh->node));
65203+ return result;
65204+}
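
A hedged usage sketch (the caller below is hypothetical): coord_by_key() initializes @lh itself, so a caller only needs to release it with done_lh() when finished with the found item:

	/* Hypothetical caller, for illustration only. */
	static lookup_result lookup_example(reiser4_tree *tree,
					    const reiser4_key *key)
	{
		coord_t coord;
		lock_handle lh;
		lookup_result result;

		result = coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK,
				      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
				      CBK_UNIQUE, NULL /* no readahead info */);
		if (result == CBK_COORD_FOUND) {
			/* coord points into lh->node, which is read-locked */
		}
		done_lh(&lh);
		return result;
	}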
65205+
65206+/* like coord_by_key(), but starts traversal from vroot of @object rather than
65207+ * from tree root. */
65208+lookup_result reiser4_object_lookup(struct inode * object,
65209+ const reiser4_key * key,
65210+ coord_t * coord,
65211+ lock_handle * lh,
65212+ znode_lock_mode lock_mode,
65213+ lookup_bias bias,
65214+ tree_level lock_level,
65215+ tree_level stop_level, __u32 flags,
65216+ ra_info_t * info)
65217+{
65218+ cbk_handle handle;
65219+ lock_handle parent_lh;
65220+ lookup_result result;
65221+
65222+ init_lh(lh);
65223+ init_lh(&parent_lh);
65224+
65225+ assert("nikita-3023", reiser4_schedulable());
65226+
65227+ assert("nikita-354", key != NULL);
65228+ assert("nikita-355", coord != NULL);
65229+ assert("nikita-356", (bias == FIND_EXACT)
65230+ || (bias == FIND_MAX_NOT_MORE_THAN));
65231+ assert("nikita-357", stop_level >= LEAF_LEVEL);
65232+ /* no locks can be held during tree search by key */
65233+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65234+
65235+ cbk_pack(&handle,
65236+ object != NULL ? reiser4_tree_by_inode(object) : current_tree,
65237+ key,
65238+ coord,
65239+ lh,
65240+ &parent_lh,
65241+ lock_mode, bias, lock_level, stop_level, flags, info);
65242+ handle.object = object;
65243+
65244+ result = coord_by_handle(&handle);
65245+ assert("nikita-3247",
65246+ ergo(!IS_CBKERR(result), coord->node == lh->node));
65247+ return result;
65248+}
65249+
65250+/* lookup by cbk_handle. Common part of coord_by_key() and
65251+ reiser4_object_lookup(). */
65252+static lookup_result coord_by_handle(cbk_handle * handle)
65253+{
65254+ /*
65255+ * first check cbk_cache (which is look-aside cache for our tree) and
65256+	 * if this fails, start traversal.
65257+ */
65258+ /* first check whether "key" is in cache of recent lookups. */
65259+ if (cbk_cache_search(handle) == 0)
65260+ return handle->result;
65261+ else
65262+ return traverse_tree(handle);
65263+}
65264+
65265+/* Execute actor for each item (or unit, depending on @through_units_p),
65266+ starting from @coord, right-ward, until either:
65267+
65268+ - end of the tree is reached
65269+ - unformatted node is met
65270+ - error occurred
65271+ - @actor returns 0 or less
65272+
65273+ Error code, or last actor return value is returned.
65274+
65275+   This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move through
65276+   a sequence of entries with identical keys and the like.
65277+*/
65278+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65279+ coord_t * coord /* coord to start from */ ,
65280+ lock_handle * lh /* lock handle to start with and to
65281+ * update along the way */ ,
65282+ tree_iterate_actor_t actor /* function to call on each
65283+ * item/unit */ ,
65284+ void *arg /* argument to pass to @actor */ ,
65285+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
65286+ int through_units_p /* call @actor on each item or on
65287+ * each unit */ )
65288+{
65289+ int result;
65290+
65291+ assert("nikita-1143", tree != NULL);
65292+ assert("nikita-1145", coord != NULL);
65293+ assert("nikita-1146", lh != NULL);
65294+ assert("nikita-1147", actor != NULL);
65295+
65296+ result = zload(coord->node);
65297+ coord_clear_iplug(coord);
65298+ if (result != 0)
65299+ return result;
65300+ if (!coord_is_existing_unit(coord)) {
65301+ zrelse(coord->node);
65302+ return -ENOENT;
65303+ }
65304+ while ((result = actor(tree, coord, lh, arg)) > 0) {
65305+ /* move further */
65306+ if ((through_units_p && coord_next_unit(coord)) ||
65307+ (!through_units_p && coord_next_item(coord))) {
65308+ do {
65309+ lock_handle couple;
65310+
65311+ /* move to the next node */
65312+ init_lh(&couple);
65313+ result =
65314+ reiser4_get_right_neighbor(&couple,
65315+ coord->node,
65316+ (int)mode,
65317+ GN_CAN_USE_UPPER_LEVELS);
65318+ zrelse(coord->node);
65319+ if (result == 0) {
65320+
65321+ result = zload(couple.node);
65322+ if (result != 0) {
65323+ done_lh(&couple);
65324+ return result;
65325+ }
65326+
65327+ coord_init_first_unit(coord,
65328+ couple.node);
65329+ done_lh(lh);
65330+ move_lh(lh, &couple);
65331+ } else
65332+ return result;
65333+ } while (node_is_empty(coord->node));
65334+ }
65335+
65336+ assert("nikita-1149", coord_is_existing_unit(coord));
65337+ }
65338+ zrelse(coord->node);
65339+ return result;
65340+}
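
A minimal sketch of an actor, to make the contract concrete (the function below is hypothetical): a positive return value keeps the iteration going; zero or less stops it and is passed back to the caller:

	/* Hypothetical actor: count units, stopping after a fixed bound. */
	static int count_units_actor(reiser4_tree *tree, coord_t *coord,
				     lock_handle *lh, void *arg)
	{
		int *counter = arg;

		(void)tree; (void)coord; (void)lh;	/* unused here */
		++*counter;
		return (*counter < 1000) ? 1 : 0;	/* > 0 means continue */
	}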
65341+
65342+/* return locked uber znode for @tree */
65343+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65344+ znode_lock_request pri, lock_handle * lh)
65345+{
65346+ int result;
65347+
65348+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
65349+ return result;
65350+}
65351+
65352+/* true if @key is strictly within @node
65353+
65354+   we are looking for a possibly non-unique key and the item is at the edge of
65355+   @node. Maybe it is in the neighbor.
65356+*/
65357+static int znode_contains_key_strict(znode * node /* node to check key
65358+ * against */ ,
65359+ const reiser4_key *
65360+ key /* key to check */ ,
65361+ int isunique)
65362+{
65363+ int answer;
65364+
65365+ assert("nikita-1760", node != NULL);
65366+ assert("nikita-1722", key != NULL);
65367+
65368+ if (keyge(key, &node->rd_key))
65369+ return 0;
65370+
65371+ answer = keycmp(&node->ld_key, key);
65372+
65373+ if (isunique)
65374+ return answer != GREATER_THAN;
65375+ else
65376+ return answer == LESS_THAN;
65377+}
65378+
65379+/*
65380+ * Virtual Root (vroot) code.
65381+ *
65382+ * For a given file system object (e.g., regular file or directory) let's
65383+ * define its "virtual root" as the lowest in the tree (that is, furthest
65384+ * from the tree root) node such that all body items of said object are
65385+ * located in a tree rooted at this node.
65386+ *
65387+ * Once vroot of object is found all tree lookups for items within body of
65388+ * this object ("object lookups") can be started from its vroot rather
65389+ * than from the real root. This has the following advantages:
65390+ *
65391+ * 1. amount of nodes traversed during lookup (and, hence, amount of
65392+ * key comparisons made) decreases, and
65393+ *
65394+ * 2. contention on the tree root is decreased. The latter was actually
65395+ * the motivating reason behind vroot, because the spin lock of the
65396+ * root node, which is taken when acquiring a long-term lock on it, is
65397+ * the hottest lock in reiser4.
65398+ *
65399+ * How to find vroot.
65400+ *
65401+ * When vroot of object F is not yet determined, all object lookups start
65402+ * from the root of the tree. At each tree level during traversal we have
65403+ * a node N such that a key we are looking for (which is the key inside
65404+ * object's body) is located within N. In function handle_vroot() called
65405+ * from cbk_level_lookup() we check whether N is possible vroot for
65406+ * F. Check is trivial---if neither leftmost nor rightmost item of N
65407+ * belongs to F (and we already have helpful ->owns_item() method of
65408+ * object plugin for this), then N is possible vroot of F. This, of
65409+ * course, relies on the assumption that each object occupies contiguous
65410+ * range of keys in the tree.
65411+ *
65412+ * Thus, traversing the tree downward and checking each node as we go, we can
65413+ * find the lowest such node, which, by definition, is the vroot.
65414+ *
65415+ * How to track vroot.
65416+ *
65417+ * Nohow. If the actual vroot changes, the next object lookup will just
65418+ * restart from the actual tree root, refreshing the object's vroot along the way.
65419+ *
65420+ */
65421+
65422+/*
65423+ * Check whether @node is possible vroot of @object.
65424+ */
65425+static void handle_vroot(struct inode *object, znode * node)
65426+{
65427+ file_plugin *fplug;
65428+ coord_t coord;
65429+
65430+ fplug = inode_file_plugin(object);
65431+ assert("nikita-3353", fplug != NULL);
65432+ assert("nikita-3354", fplug->owns_item != NULL);
65433+
65434+ if (unlikely(node_is_empty(node)))
65435+ return;
65436+
65437+ coord_init_first_unit(&coord, node);
65438+ /*
65439+ * if leftmost item of @node belongs to @object, we cannot be sure
65440+	 * that @node is vroot of @object, because some items of @object are
65441+ * probably in the sub-tree rooted at the left neighbor of @node.
65442+ */
65443+ if (fplug->owns_item(object, &coord))
65444+ return;
65445+ coord_init_last_unit(&coord, node);
65446+ /* mutatis mutandis for the rightmost item */
65447+ if (fplug->owns_item(object, &coord))
65448+ return;
65449+ /* otherwise, @node is possible vroot of @object */
65450+ inode_set_vroot(object, node);
65451+}
65452+
65453+/*
65454+ * helper function used by traverse_tree() to start tree traversal not from the
65455+ * tree root, but from @h->object's vroot, if possible.
65456+ */
65457+static int prepare_object_lookup(cbk_handle * h)
65458+{
65459+ znode *vroot;
65460+ int result;
65461+
65462+ vroot = inode_get_vroot(h->object);
65463+ if (vroot == NULL) {
65464+ /*
65465+ * object doesn't have known vroot, start from real tree root.
65466+ */
65467+ return LOOKUP_CONT;
65468+ }
65469+
65470+ h->level = znode_get_level(vroot);
65471+ /* take a long-term lock on vroot */
65472+ h->result = longterm_lock_znode(h->active_lh, vroot,
65473+ cbk_lock_mode(h->level, h),
65474+ ZNODE_LOCK_LOPRI);
65475+ result = LOOKUP_REST;
65476+ if (h->result == 0) {
65477+ int isunique;
65478+ int inside;
65479+
65480+ isunique = h->flags & CBK_UNIQUE;
65481+ /* check that key is inside vroot */
65482+ read_lock_dk(h->tree);
65483+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65484+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65485+ read_unlock_dk(h->tree);
65486+ if (inside) {
65487+ h->result = zload(vroot);
65488+ if (h->result == 0) {
65489+ /* search for key in vroot. */
65490+ result = cbk_node_lookup(h);
65491+ zrelse(vroot); /*h->active_lh->node); */
65492+ if (h->active_lh->node != vroot) {
65493+ result = LOOKUP_REST;
65494+ } else if (result == LOOKUP_CONT) {
65495+ move_lh(h->parent_lh, h->active_lh);
65496+ h->flags &= ~CBK_DKSET;
65497+ }
65498+ }
65499+ }
65500+ }
65501+
65502+ zput(vroot);
65503+
65504+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65505+ hput(h);
65506+ return result;
65507+}
65508+
65509+/* main function that handles common parts of tree traversal: starting
65510+ (fake znode handling), restarts, error handling, completion */
65511+static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65512+{
65513+ int done;
65514+ int iterations;
65515+ int vroot_used;
65516+
65517+ assert("nikita-365", h != NULL);
65518+ assert("nikita-366", h->tree != NULL);
65519+ assert("nikita-367", h->key != NULL);
65520+ assert("nikita-368", h->coord != NULL);
65521+ assert("nikita-369", (h->bias == FIND_EXACT)
65522+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
65523+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65524+ assert("nikita-2949", !(h->flags & CBK_DKSET));
65525+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65526+
65527+ done = 0;
65528+ iterations = 0;
65529+ vroot_used = 0;
65530+
65531+ /* loop for restarts */
65532+ restart:
65533+
65534+ assert("nikita-3024", reiser4_schedulable());
65535+
65536+ h->result = CBK_COORD_FOUND;
65537+ /* connect_znode() needs it */
65538+ h->ld_key = *reiser4_min_key();
65539+ h->rd_key = *reiser4_max_key();
65540+ h->flags |= CBK_DKSET;
65541+ h->error = NULL;
65542+
65543+ if (!vroot_used && h->object != NULL) {
65544+ vroot_used = 1;
65545+ done = prepare_object_lookup(h);
65546+ if (done == LOOKUP_REST) {
65547+ goto restart;
65548+ } else if (done == LOOKUP_DONE)
65549+ return h->result;
65550+ }
65551+ if (h->parent_lh->node == NULL) {
65552+ done =
65553+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65554+ h->parent_lh);
65555+
65556+ assert("nikita-1637", done != -E_DEADLOCK);
65557+
65558+ h->block = h->tree->root_block;
65559+ h->level = h->tree->height;
65560+ h->coord->node = h->parent_lh->node;
65561+
65562+ if (done != 0)
65563+ return done;
65564+ }
65565+
65566+ /* loop descending a tree */
65567+ while (!done) {
65568+
65569+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65570+ IS_POW(iterations))) {
65571+ warning("nikita-1481", "Too many iterations: %i",
65572+ iterations);
65573+ reiser4_print_key("key", h->key);
65574+ ++iterations;
65575+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65576+ h->error =
65577+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65578+ h->result = RETERR(-EIO);
65579+ break;
65580+ }
65581+ switch (cbk_level_lookup(h)) {
65582+ case LOOKUP_CONT:
65583+ move_lh(h->parent_lh, h->active_lh);
65584+ continue;
65585+ default:
65586+ wrong_return_value("nikita-372", "cbk_level");
65587+ case LOOKUP_DONE:
65588+ done = 1;
65589+ break;
65590+ case LOOKUP_REST:
65591+ hput(h);
65592+			/* deadlock avoidance is a normal case. */
65593+ if (h->result != -E_DEADLOCK)
65594+ ++iterations;
65595+ reiser4_preempt_point();
65596+ goto restart;
65597+ }
65598+ }
65599+ /* that's all. The rest is error handling */
65600+ if (unlikely(h->error != NULL)) {
65601+ warning("nikita-373", "%s: level: %i, "
65602+ "lock_level: %i, stop_level: %i "
65603+ "lock_mode: %s, bias: %s",
65604+ h->error, h->level, h->lock_level, h->stop_level,
65605+ lock_mode_name(h->lock_mode), bias_name(h->bias));
65606+ reiser4_print_address("block", &h->block);
65607+ reiser4_print_key("key", h->key);
65608+ print_coord_content("coord", h->coord);
65609+ }
65610+ /* `unlikely' error case */
65611+ if (unlikely(IS_CBKERR(h->result))) {
65612+ /* failure. do cleanup */
65613+ hput(h);
65614+ } else {
65615+ assert("nikita-1605", WITH_DATA_RET
65616+ (h->coord->node, 1,
65617+ ergo((h->result == CBK_COORD_FOUND) &&
65618+ (h->bias == FIND_EXACT) &&
65619+ (!node_is_empty(h->coord->node)),
65620+ coord_is_existing_item(h->coord))));
65621+ }
65622+ return h->result;
65623+}
65624+
65625+/* find delimiting keys of child
65626+
65627+ Determine left and right delimiting keys for child pointed to by
65628+ @parent_coord.
65629+
65630+*/
65631+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
65632+ * locked */ ,
65633+ const coord_t * parent_coord /* coord where
65634+ * pointer to
65635+ * child is
65636+ * stored */ ,
65637+ reiser4_key * ld /* where to store left
65638+ * delimiting key */ ,
65639+ reiser4_key * rd /* where to store right
65640+ * delimiting key */ )
65641+{
65642+ coord_t neighbor;
65643+
65644+ assert("nikita-1484", parent != NULL);
65645+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65646+
65647+ coord_dup(&neighbor, parent_coord);
65648+
65649+ if (neighbor.between == AT_UNIT)
65650+ /* imitate item ->lookup() behavior. */
65651+ neighbor.between = AFTER_UNIT;
65652+
65653+ if (coord_set_to_left(&neighbor) == 0)
65654+ unit_key_by_coord(&neighbor, ld);
65655+ else {
65656+ assert("nikita-14851", 0);
65657+ *ld = *znode_get_ld_key(parent);
65658+ }
65659+
65660+ coord_dup(&neighbor, parent_coord);
65661+ if (neighbor.between == AT_UNIT)
65662+ neighbor.between = AFTER_UNIT;
65663+ if (coord_set_to_right(&neighbor) == 0)
65664+ unit_key_by_coord(&neighbor, rd);
65665+ else
65666+ *rd = *znode_get_rd_key(parent);
65667+}
65668+
65669+/*
65670+ * setup delimiting keys for a child
65671+ *
65672+ * @parent parent node
65673+ *
65674+ * @coord location in @parent where pointer to @child is
65675+ *
65676+ * @child child node
65677+ */
65678+int
65679+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65680+{
65681+ reiser4_tree *tree;
65682+
65683+ assert("nikita-2952",
65684+ znode_get_level(parent) == znode_get_level(coord->node));
65685+
65686+ /* fast check without taking dk lock. This is safe, because
65687+ * JNODE_DKSET is never cleared once set. */
65688+ if (!ZF_ISSET(child, JNODE_DKSET)) {
65689+ tree = znode_get_tree(parent);
65690+ write_lock_dk(tree);
65691+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
65692+ find_child_delimiting_keys(parent, coord,
65693+ &child->ld_key,
65694+ &child->rd_key);
65695+ ON_DEBUG(child->ld_key_version =
65696+ atomic_inc_return(&delim_key_version);
65697+ child->rd_key_version =
65698+ atomic_inc_return(&delim_key_version););
65699+ ZF_SET(child, JNODE_DKSET);
65700+ }
65701+ write_unlock_dk(tree);
65702+ return 1;
65703+ }
65704+ return 0;
65705+}
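
The lockless fast path above is double-checked locking, and it is sound only because JNODE_DKSET goes from clear to set exactly once. A generic user-space sketch of the same pattern (illustrative; names are hypothetical and memory-ordering subtleties are elided):

	#include <pthread.h>

	struct once_obj {
		int flag;			/* transitions 0 -> 1 exactly once */
		pthread_mutex_t lock;
	};

	static void init_once(struct once_obj *o)
	{
		if (!o->flag) {			/* cheap, lockless check */
			pthread_mutex_lock(&o->lock);
			if (!o->flag) {		/* recheck under the lock */
				/* ... one-time initialization, like filling
				   ld_key/rd_key above ... */
				o->flag = 1;	/* never cleared afterwards */
			}
			pthread_mutex_unlock(&o->lock);
		}
	}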
65706+
65707+/* Perform tree lookup at one level. This is called from traverse_tree(),
65708+   which drives the lookup through the tree and calls cbk_node_lookup() to
65709+   perform the lookup within one node.
65710+
65711+   See comments in the code.
65712+*/
65713+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
65714+{
65715+ int ret;
65716+ int setdk;
65717+ int ldkeyset = 0;
65718+ reiser4_key ldkey;
65719+ reiser4_key key;
65720+ znode *active;
65721+
65722+ assert("nikita-3025", reiser4_schedulable());
65723+
65724+ /* acquire reference to @active node */
65725+ active =
65726+ zget(h->tree, &h->block, h->parent_lh->node, h->level,
65727+ reiser4_ctx_gfp_mask_get());
65728+
65729+ if (IS_ERR(active)) {
65730+ h->result = PTR_ERR(active);
65731+ return LOOKUP_DONE;
65732+ }
65733+
65734+ /* lock @active */
65735+ h->result = longterm_lock_znode(h->active_lh,
65736+ active,
65737+ cbk_lock_mode(h->level, h),
65738+ ZNODE_LOCK_LOPRI);
65739+ /* longterm_lock_znode() acquires additional reference to znode (which
65740+ will be later released by longterm_unlock_znode()). Release
65741+ reference acquired by zget().
65742+ */
65743+ zput(active);
65744+ if (unlikely(h->result != 0))
65745+ goto fail_or_restart;
65746+
65747+ setdk = 0;
65748+ /* if @active is accessed for the first time, setup delimiting keys on
65749+ it. Delimiting keys are taken from the parent node. See
65750+ setup_delimiting_keys() for details.
65751+ */
65752+ if (h->flags & CBK_DKSET) {
65753+ setdk = setup_delimiting_keys(h);
65754+ h->flags &= ~CBK_DKSET;
65755+ } else {
65756+ znode *parent;
65757+
65758+ parent = h->parent_lh->node;
65759+ h->result = zload(parent);
65760+ if (unlikely(h->result != 0))
65761+ goto fail_or_restart;
65762+
65763+ if (!ZF_ISSET(active, JNODE_DKSET))
65764+ setdk = set_child_delimiting_keys(parent,
65765+ h->coord, active);
65766+ else {
65767+ read_lock_dk(h->tree);
65768+ find_child_delimiting_keys(parent, h->coord, &ldkey,
65769+ &key);
65770+ read_unlock_dk(h->tree);
65771+ ldkeyset = 1;
65772+ }
65773+ zrelse(parent);
65774+ }
65775+
65776+	/* this is an ugly kludge. Reminder: it is necessary, because
65777+ ->lookup() method returns coord with ->between field probably set
65778+ to something different from AT_UNIT.
65779+ */
65780+ h->coord->between = AT_UNIT;
65781+
65782+ if (znode_just_created(active) && (h->coord->node != NULL)) {
65783+ write_lock_tree(h->tree);
65784+ /* if we are going to load znode right now, setup
65785+ ->in_parent: coord where pointer to this node is stored in
65786+ parent.
65787+ */
65788+ coord_to_parent_coord(h->coord, &active->in_parent);
65789+ write_unlock_tree(h->tree);
65790+ }
65791+
65792+ /* check connectedness without holding tree lock---false negatives
65793+ * will be re-checked by connect_znode(), and false positives are
65794+ * impossible---@active cannot suddenly turn into unconnected
65795+ * state. */
65796+ if (!znode_is_connected(active)) {
65797+ h->result = connect_znode(h->coord, active);
65798+ if (unlikely(h->result != 0)) {
65799+ put_parent(h);
65800+ goto fail_or_restart;
65801+ }
65802+ }
65803+
65804+ jload_prefetch(ZJNODE(active));
65805+
65806+ if (setdk)
65807+ update_stale_dk(h->tree, active);
65808+
65809+ /* put_parent() cannot be called earlier, because connect_znode()
65810+ assumes parent node is referenced; */
65811+ put_parent(h);
65812+
65813+ if ((!znode_contains_key_lock(active, h->key) &&
65814+ (h->flags & CBK_TRUST_DK))
65815+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
65816+ /* 1. key was moved out of this node while this thread was
65817+ waiting for the lock. Restart. More elaborate solution is
65818+ to determine where key moved (to the left, or to the right)
65819+ and try to follow it through sibling pointers.
65820+
65821+ 2. or, node itself is going to be removed from the
65822+ tree. Release lock and restart.
65823+ */
65824+ h->result = -E_REPEAT;
65825+ }
65826+ if (h->result == -E_REPEAT)
65827+ return LOOKUP_REST;
65828+
65829+ h->result = zload_ra(active, h->ra_info);
65830+ if (h->result) {
65831+ return LOOKUP_DONE;
65832+ }
65833+
65834+ /* sanity checks */
65835+ if (sanity_check(h)) {
65836+ zrelse(active);
65837+ return LOOKUP_DONE;
65838+ }
65839+
65840+ /* check that key of leftmost item in the @active is the same as in
65841+ * its parent */
65842+ if (ldkeyset && !node_is_empty(active) &&
65843+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
65844+ warning("vs-3533", "Keys are inconsistent. Fsck?");
65845+ reiser4_print_key("inparent", &ldkey);
65846+ reiser4_print_key("inchild", &key);
65847+ h->result = RETERR(-EIO);
65848+ zrelse(active);
65849+ return LOOKUP_DONE;
65850+ }
65851+
65852+ if (h->object != NULL)
65853+ handle_vroot(h->object, active);
65854+
65855+ ret = cbk_node_lookup(h);
65856+
65857+ /* h->active_lh->node might change, but active is yet to be zrelsed */
65858+ zrelse(active);
65859+
65860+ return ret;
65861+
65862+ fail_or_restart:
65863+ if (h->result == -E_DEADLOCK)
65864+ return LOOKUP_REST;
65865+ return LOOKUP_DONE;
65866+}
65867+
65868+#if REISER4_DEBUG
65869+/* check left and right delimiting keys of a znode */
65870+void check_dkeys(znode * node)
65871+{
65872+ znode *left;
65873+ znode *right;
65874+
65875+ read_lock_tree(current_tree);
65876+ read_lock_dk(current_tree);
65877+
65878+ assert("vs-1710", znode_is_any_locked(node));
65879+ assert("vs-1197",
65880+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
65881+
65882+ left = node->left;
65883+ right = node->right;
65884+
65885+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65886+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
65887+ /* check left neighbor. Note that left neighbor is not locked,
65888+		   so its delimiting keys might therefore be wrong */
65889+ assert("vs-1198",
65890+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
65891+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
65892+
65893+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65894+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
65895+ /* check right neighbor. Note that right neighbor is not
65896+		   locked, so its delimiting keys might therefore be wrong */
65897+ assert("vs-1199",
65898+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
65899+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
65900+
65901+ read_unlock_dk(current_tree);
65902+ read_unlock_tree(current_tree);
65903+}
65904+#endif
65905+
65906+/* true if @key is left delimiting key of @node */
65907+static int key_is_ld(znode * node, const reiser4_key * key)
65908+{
65909+ int ld;
65910+
65911+ assert("nikita-1716", node != NULL);
65912+ assert("nikita-1758", key != NULL);
65913+
65914+ read_lock_dk(znode_get_tree(node));
65915+ assert("nikita-1759", znode_contains_key(node, key));
65916+ ld = keyeq(znode_get_ld_key(node), key);
65917+ read_unlock_dk(znode_get_tree(node));
65918+ return ld;
65919+}
65920+
65921+/* Process one node during tree traversal.
65922+
65923+ This is called by cbk_level_lookup(). */
65924+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
65925+{
65926+ /* node plugin of @active */
65927+ node_plugin *nplug;
65928+ /* item plugin of item that was found */
65929+ item_plugin *iplug;
65930+ /* search bias */
65931+ lookup_bias node_bias;
65932+ /* node we are operating upon */
65933+ znode *active;
65934+ /* tree we are searching in */
65935+ reiser4_tree *tree;
65936+ /* result */
65937+ int result;
65938+
65939+ assert("nikita-379", h != NULL);
65940+
65941+ active = h->active_lh->node;
65942+ tree = h->tree;
65943+
65944+ nplug = active->nplug;
65945+ assert("nikita-380", nplug != NULL);
65946+
65947+ ON_DEBUG(check_dkeys(active));
65948+
65949+ /* return item from "active" node with maximal key not greater than
65950+ "key" */
65951+ node_bias = h->bias;
65952+ result = nplug->lookup(active, h->key, node_bias, h->coord);
65953+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
65954+ /* error occurred */
65955+ h->result = result;
65956+ return LOOKUP_DONE;
65957+ }
65958+ if (h->level == h->stop_level) {
65959+ /* welcome to the stop level */
65960+ assert("nikita-381", h->coord->node == active);
65961+ if (result == NS_FOUND) {
65962+ /* success of tree lookup */
65963+ if (!(h->flags & CBK_UNIQUE)
65964+ && key_is_ld(active, h->key)) {
65965+ return search_to_left(h);
65966+ } else
65967+ h->result = CBK_COORD_FOUND;
65968+ } else {
65969+ h->result = CBK_COORD_NOTFOUND;
65970+ }
65971+ if (!(h->flags & CBK_IN_CACHE))
65972+ cbk_cache_add(active);
65973+ return LOOKUP_DONE;
65974+ }
65975+
65976+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
65977+ h->error = "not found on internal node";
65978+ h->result = result;
65979+ return LOOKUP_DONE;
65980+ }
65981+
65982+ assert("vs-361", h->level > h->stop_level);
65983+
65984+ if (handle_eottl(h, &result)) {
65985+ assert("vs-1674", (result == LOOKUP_DONE ||
65986+ result == LOOKUP_REST));
65987+ return result;
65988+ }
65989+
65990+ /* go down to next level */
65991+ check_me("vs-12", zload(h->coord->node) == 0);
65992+ assert("nikita-2116", item_is_internal(h->coord));
65993+ iplug = item_plugin_by_coord(h->coord);
65994+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
65995+ zrelse(h->coord->node);
65996+ --h->level;
65997+ return LOOKUP_CONT; /* continue */
65998+}
65999+
66000+/* scan cbk_cache slots looking for a match for @h */
66001+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
66002+{
66003+ level_lookup_result llr;
66004+ znode *node;
66005+ reiser4_tree *tree;
66006+ cbk_cache_slot *slot;
66007+ cbk_cache *cache;
66008+ tree_level level;
66009+ int isunique;
66010+ const reiser4_key *key;
66011+ int result;
66012+
66013+ assert("nikita-1317", h != NULL);
66014+ assert("nikita-1315", h->tree != NULL);
66015+ assert("nikita-1316", h->key != NULL);
66016+
66017+ tree = h->tree;
66018+ cache = &tree->cbk_cache;
66019+ if (cache->nr_slots == 0)
66020+ /* size of cbk cache was set to 0 by mount time option. */
66021+ return RETERR(-ENOENT);
66022+
66023+ assert("nikita-2474", cbk_cache_invariant(cache));
66024+ node = NULL; /* to keep gcc happy */
66025+ level = h->level;
66026+ key = h->key;
66027+ isunique = h->flags & CBK_UNIQUE;
66028+ result = RETERR(-ENOENT);
66029+
66030+ /*
66031+	 * this is a time-critical function and dragons have, hence, been settled
66032+ * here.
66033+ *
66034+ * Loop below scans cbk cache slots trying to find matching node with
66035+ * suitable range of delimiting keys and located at the h->level.
66036+ *
66037+ * Scan is done under cbk cache spin lock that protects slot->node
66038+ * pointers. If suitable node is found we want to pin it in
66039+ * memory. But slot->node can point to the node with x_count 0
66040+ * (unreferenced). Such node can be recycled at any moment, or can
66041+ * already be in the process of being recycled (within jput()).
66042+ *
66043+ * As we found node in the cbk cache, it means that jput() hasn't yet
66044+ * called cbk_cache_invalidate().
66045+ *
66046+ * We acquire reference to the node without holding tree lock, and
66047+ * later, check node's RIP bit. This avoids races with jput().
66048+ */
66049+
66050+ rcu_read_lock();
66051+ read_lock(&((cbk_cache *)cache)->guard);
66052+
66053+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66054+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66055+ BUG_ON(&slot->lru != &cache->lru);/*????*/
66056+ while (1) {
66057+
66058+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66059+
66060+ if (&cache->lru != &slot->lru)
66061+ node = slot->node;
66062+ else
66063+ node = NULL;
66064+
66065+ if (unlikely(node == NULL))
66066+ break;
66067+
66068+ /*
66069+ * this is (hopefully) the only place in the code where we are
66070+ * working with delimiting keys without holding dk lock. This
66071+		 * is fine here, because this is only a "guess" anyway---keys
66072+ * are rechecked under dk lock below.
66073+ */
66074+ if (znode_get_level(node) == level &&
66075+ /* reiser4_min_key < key < reiser4_max_key */
66076+ znode_contains_key_strict(node, key, isunique)) {
66077+ zref(node);
66078+ result = 0;
66079+ spin_lock_prefetch(&tree->tree_lock);
66080+ break;
66081+ }
66082+ }
66083+ read_unlock(&((cbk_cache *)cache)->guard);
66084+
66085+ assert("nikita-2475", cbk_cache_invariant(cache));
66086+
66087+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66088+ result = -ENOENT;
66089+
66090+ rcu_read_unlock();
66091+
66092+ if (result != 0) {
66093+ h->result = CBK_COORD_NOTFOUND;
66094+ return RETERR(-ENOENT);
66095+ }
66096+
66097+ result =
66098+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66099+ ZNODE_LOCK_LOPRI);
66100+ zput(node);
66101+ if (result != 0)
66102+ return result;
66103+ result = zload(node);
66104+ if (result != 0)
66105+ return result;
66106+
66107+ /* recheck keys */
66108+ read_lock_dk(tree);
66109+ result = (znode_contains_key_strict(node, key, isunique) &&
66110+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66111+ read_unlock_dk(tree);
66112+ if (result) {
66113+ /* do lookup inside node */
66114+ llr = cbk_node_lookup(h);
66115+ /* if cbk_node_lookup() wandered to another node (due to eottl
66116+ or non-unique keys), adjust @node */
66117+ /*node = h->active_lh->node; */
66118+
66119+ if (llr != LOOKUP_DONE) {
66120+ /* restart or continue on the next level */
66121+ result = RETERR(-ENOENT);
66122+ } else if (IS_CBKERR(h->result))
66123+ /* io or oom */
66124+ result = RETERR(-ENOENT);
66125+ else {
66126+ /* good. Either item found or definitely not found. */
66127+ result = 0;
66128+
66129+ write_lock(&(cache->guard));
66130+ if (slot->node == h->active_lh->node /*node */ ) {
66131+ /* if this node is still in cbk cache---move
66132+ its slot to the head of the LRU list. */
66133+ list_move(&slot->lru, &cache->lru);
66134+ }
66135+ write_unlock(&(cache->guard));
66136+ }
66137+ } else {
66138+		/* race. While this thread was waiting for the lock, the node was
66139+		   rebalanced and the item we are looking for shifted out of it
66140+		   (if it ever was here).
66141+
66142+		   Continuing scanning is almost hopeless: the node the key range
66143+		   was moved to is almost certainly at the beginning of the LRU
66144+		   list at this time, because it's hot, but restarting
66145+ scanning from the very beginning is complex. Just return,
66146+ so that cbk() will be performed. This is not that
66147+ important, because such races should be rare. Are they?
66148+ */
66149+ result = RETERR(-ENOENT); /* -ERAUGHT */
66150+ }
66151+ zrelse(node);
66152+ assert("nikita-2476", cbk_cache_invariant(cache));
66153+ return result;
66154+}
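
Distilled, the pinning idiom used above is: scan under RCU plus the cache guard, take a reference without the tree lock, then test the RIP ("being recycled") bit to catch a racing jput(). A condensed outline of the scan, for readability only (not a replacement for the code above):

	/* Outline of cbk_cache_scan_slots(), illustrative only. */
	rcu_read_lock();
	read_lock(&cache->guard);
	/* ... walk cache->lru for a node at the right level whose delimiting
	   keys (an unverified guess at this point) cover the key ... */
	zref(node);			/* pin candidate; may race with jput() */
	read_unlock(&cache->guard);
	if (ZF_ISSET(node, JNODE_RIP))	/* lost the race: node is being freed */
		result = -ENOENT;	/* fall back to full tree traversal */
	rcu_read_unlock();
	/* ... then lock and load the node, and recheck keys under the dk lock ... */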
66155+
66156+/* look for item with given key in the coord cache
66157+
66158+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
66159+   which is a small LRU list of znodes accessed lately. For each znode in
66160+   this list, it checks whether the key we are looking for fits into the key
66161+   range covered by that node. If so, and if, in addition, the node lies at an
66162+   allowed level (this is to handle extents on a twig level), the node is
66163+   locked and a lookup inside it is performed.
66164+
66165+ we need a measurement of the cost of this cache search compared to the cost
66166+ of coord_by_key.
66167+
66168+*/
66169+static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66170+{
66171+ int result = 0;
66172+ tree_level level;
66173+
66174+ /* add CBK_IN_CACHE to the handle flags. This means that
66175+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add
66176+ * found node to the cache. */
66177+ h->flags |= CBK_IN_CACHE;
66178+ for (level = h->stop_level; level <= h->lock_level; ++level) {
66179+ h->level = level;
66180+ result = cbk_cache_scan_slots(h);
66181+ if (result != 0) {
66182+ done_lh(h->active_lh);
66183+ done_lh(h->parent_lh);
66184+ } else {
66185+ assert("nikita-1319", !IS_CBKERR(h->result));
66186+ break;
66187+ }
66188+ }
66189+ h->flags &= ~CBK_IN_CACHE;
66190+ return result;
66191+}
66192+
66193+/* type of lock we want to obtain during tree traversal. On the stop level
66194+   we want the type of lock the user asked for; on upper levels, a read lock. */
66195+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66196+{
66197+ assert("nikita-382", h != NULL);
66198+
66199+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66200+}
66201+
66202+/* update outdated delimiting keys */
66203+static void stale_dk(reiser4_tree * tree, znode * node)
66204+{
66205+ znode *right;
66206+
66207+ read_lock_tree(tree);
66208+ write_lock_dk(tree);
66209+ right = node->right;
66210+
66211+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66212+ right && ZF_ISSET(right, JNODE_DKSET) &&
66213+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66214+ znode_set_rd_key(node, znode_get_ld_key(right));
66215+
66216+ write_unlock_dk(tree);
66217+ read_unlock_tree(tree);
66218+}
66219+
66220+/* check for possibly outdated delimiting keys, and update them if
66221+ * necessary. */
66222+static void update_stale_dk(reiser4_tree * tree, znode * node)
66223+{
66224+ znode *right;
66225+ reiser4_key rd;
66226+
66227+ read_lock_tree(tree);
66228+ read_lock_dk(tree);
66229+ rd = *znode_get_rd_key(node);
66230+ right = node->right;
66231+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66232+ right && ZF_ISSET(right, JNODE_DKSET) &&
66233+ !keyeq(&rd, znode_get_ld_key(right)))) {
66234+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66235+ read_unlock_dk(tree);
66236+ read_unlock_tree(tree);
66237+ stale_dk(tree, node);
66238+ return;
66239+ }
66240+ read_unlock_dk(tree);
66241+ read_unlock_tree(tree);
66242+}
66243+
66244+/*
66245+ * handle searches for a non-unique key.
66246+ *
66247+ * Suppose that we are looking for an item with possibly non-unique key 100.
66248+ *
66249+ * Root node contains two pointers: one to a node with left delimiting key 0,
66250+ * and another to a node with left delimiting key 100. The item we are
66251+ * interested in may well be in the sub-tree rooted at the first pointer.
66252+ *
66253+ * To handle this, search_to_left() is called when the search reaches the stop
66254+ * level. This function checks whether it is _possible_ that the item we are
66255+ * looking for is in the left neighbor (this can be done by comparing
66256+ * delimiting keys) and, if so, tries to lock the left neighbor (a low
66257+ * priority lock, so it can deadlock; tree traversal is just restarted if it
66258+ * does) and then checks whether the left neighbor actually contains items with our key.
66259+ *
66260+ * Note that this is done on the stop level only. It is possible to try such
66261+ * left-check on each level, but as duplicate keys are supposed to be rare
66262+ * (very unlikely that more than one node is completely filled with items with
66263+ * duplicate keys), it is cheaper to scan to the left on the stop level once.
66264+ *
66265+ */
66266+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66267+{
66268+ level_lookup_result result;
66269+ coord_t *coord;
66270+ znode *node;
66271+ znode *neighbor;
66272+
66273+ lock_handle lh;
66274+
66275+ assert("nikita-1761", h != NULL);
66276+ assert("nikita-1762", h->level == h->stop_level);
66277+
66278+ init_lh(&lh);
66279+ coord = h->coord;
66280+ node = h->active_lh->node;
66281+ assert("nikita-1763", coord_is_leftmost_unit(coord));
66282+
66283+ h->result =
66284+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66285+ GN_CAN_USE_UPPER_LEVELS);
66286+ neighbor = NULL;
66287+ switch (h->result) {
66288+ case -E_DEADLOCK:
66289+ result = LOOKUP_REST;
66290+ break;
66291+ case 0:{
66292+ node_plugin *nplug;
66293+ coord_t crd;
66294+ lookup_bias bias;
66295+
66296+ neighbor = lh.node;
66297+ h->result = zload(neighbor);
66298+ if (h->result != 0) {
66299+ result = LOOKUP_DONE;
66300+ break;
66301+ }
66302+
66303+ nplug = neighbor->nplug;
66304+
66305+ coord_init_zero(&crd);
66306+ bias = h->bias;
66307+ h->bias = FIND_EXACT;
66308+ h->result =
66309+ nplug->lookup(neighbor, h->key, h->bias, &crd);
66310+ h->bias = bias;
66311+
66312+ if (h->result == NS_NOT_FOUND) {
66313+ case -E_NO_NEIGHBOR:
66314+ h->result = CBK_COORD_FOUND;
66315+ if (!(h->flags & CBK_IN_CACHE))
66316+ cbk_cache_add(node);
66317+ default: /* some other error */
66318+ result = LOOKUP_DONE;
66319+ } else if (h->result == NS_FOUND) {
66320+ read_lock_dk(znode_get_tree(neighbor));
66321+ h->rd_key = *znode_get_ld_key(node);
66322+ leftmost_key_in_node(neighbor, &h->ld_key);
66323+ read_unlock_dk(znode_get_tree(neighbor));
66324+ h->flags |= CBK_DKSET;
66325+
66326+ h->block = *znode_get_block(neighbor);
66327+ /* clear coord -> node so that cbk_level_lookup()
66328+ wouldn't overwrite parent hint in neighbor.
66329+
66330+ Parent hint was set up by
66331+ reiser4_get_left_neighbor()
66332+ */
66333+ /* FIXME: why do we have to spinlock here? */
66334+ write_lock_tree(znode_get_tree(neighbor));
66335+ h->coord->node = NULL;
66336+ write_unlock_tree(znode_get_tree(neighbor));
66337+ result = LOOKUP_CONT;
66338+ } else {
66339+ result = LOOKUP_DONE;
66340+ }
66341+ if (neighbor != NULL)
66342+ zrelse(neighbor);
66343+ }
66344+ }
66345+ done_lh(&lh);
66346+ return result;
66347+}
66348+
66349+/* debugging aid: return symbolic name of search bias */
66350+static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66351+{
66352+ if (bias == FIND_EXACT)
66353+ return "exact";
66354+ else if (bias == FIND_MAX_NOT_MORE_THAN)
66355+ return "left-slant";
66356+/* else if( bias == RIGHT_SLANT_BIAS ) */
66357+/* return "right-bias"; */
66358+ else {
66359+ static char buf[30];
66360+
66361+ sprintf(buf, "unknown: %i", bias);
66362+ return buf;
66363+ }
66364+}
66365+
66366+#if REISER4_DEBUG
66367+/* debugging aid: print human readable information about @p */
66368+void print_coord_content(const char *prefix /* prefix to print */ ,
66369+ coord_t * p /* coord to print */ )
66370+{
66371+ reiser4_key key;
66372+
66373+ if (p == NULL) {
66374+ printk("%s: null\n", prefix);
66375+ return;
66376+ }
66377+ if ((p->node != NULL) && znode_is_loaded(p->node)
66378+ && coord_is_existing_item(p))
66379+ printk("%s: data: %p, length: %i\n", prefix,
66380+ item_body_by_coord(p), item_length_by_coord(p));
66381+ if (znode_is_loaded(p->node)) {
66382+ item_key_by_coord(p, &key);
66383+ reiser4_print_key(prefix, &key);
66384+ }
66385+}
66386+
66387+/* debugging aid: print human readable information about @block */
66388+void reiser4_print_address(const char *prefix /* prefix to print */ ,
66389+ const reiser4_block_nr * block /* block number to print */ )
66390+{
66391+ printk("%s: %s\n", prefix, sprint_address(block));
66392+}
66393+#endif
66394+
66395+/* return string containing human readable representation of @block */
66396+char *sprint_address(const reiser4_block_nr *
66397+ block /* block number to print */ )
66398+{
66399+ static char address[30];
66400+
66401+ if (block == NULL)
66402+ sprintf(address, "null");
66403+ else if (reiser4_blocknr_is_fake(block))
66404+ sprintf(address, "%llx", (unsigned long long)(*block));
66405+ else
66406+ sprintf(address, "%llu", (unsigned long long)(*block));
66407+ return address;
66408+}
66409+
66410+/* release parent node during traversal */
66411+static void put_parent(cbk_handle * h /* search handle */ )
66412+{
66413+ assert("nikita-383", h != NULL);
66414+ if (h->parent_lh->node != NULL) {
66415+ longterm_unlock_znode(h->parent_lh);
66416+ }
66417+}
66418+
66419+/* helper function used by coord_by_key(): release reference to parent znode
66420+ stored in handle before processing its child. */
66421+static void hput(cbk_handle * h /* search handle */ )
66422+{
66423+ assert("nikita-385", h != NULL);
66424+ done_lh(h->parent_lh);
66425+ done_lh(h->active_lh);
66426+}
66427+
66428+/* Helper function used by cbk(): update delimiting keys of child node (stored
66429+ in h->active_lh->node) using key taken from parent on the parent level. */
66430+static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66431+{
66432+ znode *active;
66433+ reiser4_tree *tree;
66434+
66435+ assert("nikita-1088", h != NULL);
66436+
66437+ active = h->active_lh->node;
66438+
66439+ /* fast check without taking dk lock. This is safe, because
66440+ * JNODE_DKSET is never cleared once set. */
66441+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66442+ tree = znode_get_tree(active);
66443+ write_lock_dk(tree);
66444+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66445+ znode_set_ld_key(active, &h->ld_key);
66446+ znode_set_rd_key(active, &h->rd_key);
66447+ ZF_SET(active, JNODE_DKSET);
66448+ }
66449+ write_unlock_dk(tree);
66450+ return 1;
66451+ }
66452+ return 0;
66453+}
66454+
66455+/* true if @block makes sense for the @tree. Used to detect corrupted node
66456+ * pointers */
66457+static int
66458+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66459+ reiser4_tree * tree /* tree to check against */ )
66460+{
66461+ assert("nikita-757", block != NULL);
66462+ assert("nikita-758", tree != NULL);
66463+
66464+ /* check to see if it exceeds the size of the device. */
66465+ return reiser4_blocknr_is_sane_for(tree->super, block);
66466+}
66467+
66468+/* check consistency of fields */
66469+static int sanity_check(cbk_handle * h /* search handle */ )
66470+{
66471+ assert("nikita-384", h != NULL);
66472+
66473+ if (h->level < h->stop_level) {
66474+ h->error = "Buried under leaves";
66475+ h->result = RETERR(-EIO);
66476+ return LOOKUP_DONE;
66477+ } else if (!block_nr_is_correct(&h->block, h->tree)) {
66478+ h->error = "bad block number";
66479+ h->result = RETERR(-EIO);
66480+ return LOOKUP_DONE;
66481+ } else
66482+ return 0;
66483+}
66484+
66485+/* Make Linus happy.
66486+ Local variables:
66487+ c-indentation-style: "K&R"
66488+ mode-name: "LC"
66489+ c-basic-offset: 8
66490+ tab-width: 8
66491+ fill-column: 120
66492+ scroll-step: 1
66493+ End:
66494+*/
66495diff -urN linux-2.6.20.orig/fs/reiser4/status_flags.c linux-2.6.20/fs/reiser4/status_flags.c
66496--- linux-2.6.20.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
66497+++ linux-2.6.20/fs/reiser4/status_flags.c 2007-05-06 14:50:43.875030717 +0400
66498@@ -0,0 +1,175 @@
66499+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66500+ * reiser4/README */
66501+
66502+/* Functions that deal with reiser4 status block, query status and update it, if needed */
66503+
66504+#include <linux/bio.h>
66505+#include <linux/highmem.h>
66506+#include <linux/fs.h>
66507+#include <linux/blkdev.h>
66508+#include "debug.h"
66509+#include "dformat.h"
66510+#include "status_flags.h"
66511+#include "super.h"
66512+
66513+/* This is our end I/O handler that marks the page uptodate if I/O was successful. It
66514+   also unconditionally unlocks the page, so we can see that the I/O is done.
66515+   We do not free the bio, because we hope to reuse it. */
66516+static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66517+ int err)
66518+{
66519+ if (bio->bi_size)
66520+ return 1;
66521+
66522+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66523+ SetPageUptodate(bio->bi_io_vec->bv_page);
66524+ } else {
66525+ ClearPageUptodate(bio->bi_io_vec->bv_page);
66526+ SetPageError(bio->bi_io_vec->bv_page);
66527+ }
66528+ unlock_page(bio->bi_io_vec->bv_page);
66529+ return 0;
66530+}
66531+
66532+/* Initialise status code. This is expected to be called from the disk format
66533+   code. The "block" parameter is where the status block lives. */
66534+int reiser4_status_init(reiser4_block_nr block)
66535+{
66536+ struct super_block *sb = reiser4_get_current_sb();
66537+ struct reiser4_status *statuspage;
66538+ struct bio *bio;
66539+ struct page *page;
66540+
66541+ get_super_private(sb)->status_page = NULL;
66542+ get_super_private(sb)->status_bio = NULL;
66543+
66544+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
66545+ if (!page)
66546+ return -ENOMEM;
66547+
66548+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
66549+ if (bio != NULL) {
66550+ bio->bi_sector = block * (sb->s_blocksize >> 9);
66551+ bio->bi_bdev = sb->s_bdev;
66552+ bio->bi_io_vec[0].bv_page = page;
66553+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66554+ bio->bi_io_vec[0].bv_offset = 0;
66555+ bio->bi_vcnt = 1;
66556+ bio->bi_size = sb->s_blocksize;
66557+ bio->bi_end_io = reiser4_status_endio;
66558+ } else {
66559+ __free_pages(page, 0);
66560+ return -ENOMEM;
66561+ }
66562+ lock_page(page);
66563+ submit_bio(READ, bio);
66564+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66565+ wait_on_page_locked(page);
66566+ if (!PageUptodate(page)) {
66567+ warning("green-2007",
66568+			"I/O error while trying to read status page\n");
66569+ return -EIO;
66570+ }
66571+
66572+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66573+ if (memcmp
66574+ (statuspage->magic, REISER4_STATUS_MAGIC,
66575+ sizeof(REISER4_STATUS_MAGIC))) {
66576+ /* Magic does not match. */
66577+ kunmap_atomic((char *)statuspage, KM_USER0);
66578+ warning("green-2008", "Wrong magic in status block\n");
66579+ __free_pages(page, 0);
66580+ bio_put(bio);
66581+ return -EINVAL;
66582+ }
66583+ kunmap_atomic((char *)statuspage, KM_USER0);
66584+
66585+ get_super_private(sb)->status_page = page;
66586+ get_super_private(sb)->status_bio = bio;
66587+ return 0;
66588+}
66589+
66590+/* Query the status of the FS. Returns whether the FS can be safely mounted.
66591+   Also, if the "status" and "extended" parameters are given, it will fill
66592+   them with the actual status fields read from disk. */
66593+int reiser4_status_query(u64 * status, u64 * extended)
66594+{
66595+ struct super_block *sb = reiser4_get_current_sb();
66596+ struct reiser4_status *statuspage;
66597+ int retval;
66598+
66599+ if (!get_super_private(sb)->status_page) { // No status page?
66600+ return REISER4_STATUS_MOUNT_UNKNOWN;
66601+ }
66602+ statuspage = (struct reiser4_status *)
66603+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66604+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
66605+ case REISER4_STATUS_OK:
66606+ retval = REISER4_STATUS_MOUNT_OK;
66607+ break;
66608+ case REISER4_STATUS_CORRUPTED:
66609+ retval = REISER4_STATUS_MOUNT_WARN;
66610+ break;
66611+ case REISER4_STATUS_DAMAGED:
66612+ case REISER4_STATUS_DESTROYED:
66613+ case REISER4_STATUS_IOERROR:
66614+ retval = REISER4_STATUS_MOUNT_RO;
66615+ break;
66616+ default:
66617+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
66618+ break;
66619+ }
66620+
66621+ if (status)
66622+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
66623+ if (extended)
66624+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66625+
66626+ kunmap_atomic((char *)statuspage, KM_USER0);
66627+ return retval;
66628+}
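
A hedged sketch of how a mount path might act on the answer (the surrounding policy is hypothetical; the return codes are the ones defined in status_flags.h):

	/* Hypothetical mount-time consumer, for illustration only. */
	static int check_fs_status_example(void)
	{
		u64 status, extended;

		switch (reiser4_status_query(&status, &extended)) {
		case REISER4_STATUS_MOUNT_OK:
			return 0;	/* safe to mount read-write */
		case REISER4_STATUS_MOUNT_WARN:
			/* mountable, but warn the user; fsck is advisable */
			return 0;
		case REISER4_STATUS_MOUNT_RO:
			return 1;	/* damaged: force a read-only mount */
		default:		/* REISER4_STATUS_MOUNT_UNKNOWN */
			return 0;
		}
	}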
66629+
66630+/* This function should be called when something bad happens (e.g. from reiser4_panic).
66631+ It fills the status structure and tries to push it to disk. */
66632+int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66633+{
66634+ struct super_block *sb = reiser4_get_current_sb();
66635+ struct reiser4_status *statuspage;
66636+ struct bio *bio = get_super_private(sb)->status_bio;
66637+
66638+ if (!get_super_private(sb)->status_page) { // No status page?
66639+ return -1;
66640+ }
66641+ statuspage = (struct reiser4_status *)
66642+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66643+
66644+ put_unaligned(cpu_to_le64(status), &statuspage->status);
66645+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66646+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66647+
66648+ kunmap_atomic((char *)statuspage, KM_USER0);
66649+ bio->bi_bdev = sb->s_bdev;
66650+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66651+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66652+ bio->bi_io_vec[0].bv_offset = 0;
66653+ bio->bi_vcnt = 1;
66654+ bio->bi_size = sb->s_blocksize;
66655+ bio->bi_end_io = reiser4_status_endio;
66656+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
66657+ /* We can block now, but we have no other choice anyway */
66658+ submit_bio(WRITE, bio);
66659+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66660+ return 0; // We do not wait for io to finish.
66661+}
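
For example, a failure path could record the damage before panicking; the arguments below are illustrative, using the flags declared in status_flags.h:

	/* Illustrative failure-path call. */
	reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
			     "reiser4 panicked: tree is corrupted");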
66662+
66663+/* Frees the status page and the bio structure. Should be called by the disk format at umount time */
66664+int reiser4_status_finish(void)
66665+{
66666+ struct super_block *sb = reiser4_get_current_sb();
66667+
66668+ __free_pages(get_super_private(sb)->status_page, 0);
66669+ get_super_private(sb)->status_page = NULL;
66670+ bio_put(get_super_private(sb)->status_bio);
66671+ get_super_private(sb)->status_bio = NULL;
66672+ return 0;
66673+}
66674diff -urN linux-2.6.20.orig/fs/reiser4/status_flags.h linux-2.6.20/fs/reiser4/status_flags.h
66675--- linux-2.6.20.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
66676+++ linux-2.6.20/fs/reiser4/status_flags.h 2007-05-06 14:50:43.875030717 +0400
66677@@ -0,0 +1,43 @@
66678+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66679+ * reiser4/README */
66680+
66681+/* Here we declare structures and flags that store reiser4 status on disk.
66682+   The status helps us to find out whether the filesystem is valid or whether
66683+   it contains some critical, or not so critical, errors */
66684+
66685+#if !defined( __REISER4_STATUS_FLAGS_H__ )
66686+#define __REISER4_STATUS_FLAGS_H__
66687+
66688+#include "dformat.h"
66689+/* These are major status flags */
66690+#define REISER4_STATUS_OK 0
66691+#define REISER4_STATUS_CORRUPTED 0x1
66692+#define REISER4_STATUS_DAMAGED 0x2
66693+#define REISER4_STATUS_DESTROYED 0x4
66694+#define REISER4_STATUS_IOERROR 0x8
66695+
66696+/* Return values for reiser4_status_query() */
66697+#define REISER4_STATUS_MOUNT_OK 0
66698+#define REISER4_STATUS_MOUNT_WARN 1
66699+#define REISER4_STATUS_MOUNT_RO 2
66700+#define REISER4_STATUS_MOUNT_UNKNOWN -1
66701+
66702+#define REISER4_TEXTERROR_LEN 256
66703+
66704+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
66705+/* We probably need to keep its size under the sector size, which is 512 bytes */
66706+struct reiser4_status {
66707+ char magic[16];
66708+ d64 status; /* Current FS state */
66709+	d64 extended_status;	/* Any additional info that might make sense in addition to "status". E.g.
66710+				   the last sector where an I/O error happened if status is "io error encountered" */
66711+	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
66712+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
66713+};
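
As a quick check on the size remark above: 16 (magic) + 8 (status) + 8 (extended_status) + 80 (stacktrace) + 256 (texterror) = 368 bytes, comfortably under one 512-byte sector. A compile-time assertion could enforce this (a sketch, not part of the original patch):

	/* Sketch: fails to compile if the status block outgrows one sector. */
	typedef char reiser4_status_fits_in_sector_check
		[sizeof(struct reiser4_status) <= 512 ? 1 : -1];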
66714+
66715+int reiser4_status_init(reiser4_block_nr block);
66716+int reiser4_status_query(u64 * status, u64 * extended);
66717+int reiser4_status_write(u64 status, u64 extended_status, char *message);
66718+int reiser4_status_finish(void);
66719+
66720+#endif
66721diff -urN linux-2.6.20.orig/fs/reiser4/super.c linux-2.6.20/fs/reiser4/super.c
66722--- linux-2.6.20.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
66723+++ linux-2.6.20/fs/reiser4/super.c 2007-05-06 14:50:43.875030717 +0400
66724@@ -0,0 +1,316 @@
66725+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66726+ * reiser4/README */
66727+
66728+/* Super-block manipulations. */
66729+
66730+#include "debug.h"
66731+#include "dformat.h"
66732+#include "key.h"
66733+#include "plugin/security/perm.h"
66734+#include "plugin/space/space_allocator.h"
66735+#include "plugin/plugin.h"
66736+#include "tree.h"
66737+#include "vfs_ops.h"
66738+#include "super.h"
66739+#include "reiser4.h"
66740+
66741+#include <linux/types.h> /* for __u?? */
66742+#include <linux/fs.h> /* for struct super_block */
66743+
66744+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
66745+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
66746+static __u64 reserved_for_root(const struct super_block *super);
66747+
66748+/* Return reiser4-specific part of super block */
66749+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
66750+ * queried */ )
66751+{
66752+ return (reiser4_super_info_data *) super->s_fs_info;
66753+}
66754+
66755+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
66756+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
66757+{
66758+ assert("nikita-448", super != NULL);
66759+ assert("nikita-449", is_reiser4_super(super));
66760+ return (long)REISER4_SUPER_MAGIC;
66761+}
66762+
66763+/* functions to read/modify fields of reiser4_super_info_data */
66764+
66765+/* get number of blocks in file system */
66766+__u64 reiser4_block_count(const struct super_block *super /* super block
66767+ queried */ )
66768+{
66769+ assert("vs-494", super != NULL);
66770+ assert("vs-495", is_reiser4_super(super));
66771+ return get_super_private(super)->block_count;
66772+}
66773+
66774+#if REISER4_DEBUG
66775+/*
66776+ * number of blocks in the current file system
66777+ */
66778+__u64 reiser4_current_block_count(void)
66779+{
66780+ return get_current_super_private()->block_count;
66781+}
66782+#endif /* REISER4_DEBUG */
66783+
66784+/* set number of blocks in filesystem */
66785+void reiser4_set_block_count(const struct super_block *super, __u64 nr)
66786+{
66787+ assert("vs-501", super != NULL);
66788+ assert("vs-502", is_reiser4_super(super));
66789+ get_super_private(super)->block_count = nr;
66790+ /*
66791+ * For the proper calculation of the reserved space counter (5% of the
66792+ * device block count) we need a 64-bit division, which is missing in
66793+ * Linux on the i386 platform. Because we do not need a precise
66794+ * calculation here, we can replace the div64 operation with this
66795+ * combination of multiplication and shift: 51. / (2^10) == .0498 .
66796+ * FIXME: this is a bug. It comes up only for very small filesystems,
66797+ * which are probably never used. Nevertheless, it is a bug. The number
66798+ * of reserved blocks must not be less than the maximal number of
66799+ * blocks which get grabbed with BA_RESERVED.
66800+ */
66801+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
66802+}
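
The multiply-and-shift approximation above can be checked with a few lines of standalone C (illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* (nr * 51) >> 10 == nr * 51/1024 ~= nr * 0.0498, a cheap stand-in
	 * for nr * 5/100 that avoids a 64-bit division. */
	uint64_t nr = 1000000;			/* 1M blocks, arbitrary */
	uint64_t reserved = (nr * 51) >> 10;	/* 49804, vs. exact 5% = 50000 */

	assert(reserved > nr / 25 && reserved < nr / 19);
	return 0;
}
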
66803+
66804+/* amount of blocks used (allocated for data) in file system */
66805+__u64 reiser4_data_blocks(const struct super_block *super /* super block
66806+ queried */ )
66807+{
66808+ assert("nikita-452", super != NULL);
66809+ assert("nikita-453", is_reiser4_super(super));
66810+ return get_super_private(super)->blocks_used;
66811+}
66812+
66813+/* set number of blocks used in filesystem */
66814+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
66815+{
66816+ assert("vs-503", super != NULL);
66817+ assert("vs-504", is_reiser4_super(super));
66818+ get_super_private(super)->blocks_used = nr;
66819+}
66820+
66821+/* amount of free blocks in file system */
66822+__u64 reiser4_free_blocks(const struct super_block *super /* super block
66823+ queried */ )
66824+{
66825+ assert("nikita-454", super != NULL);
66826+ assert("nikita-455", is_reiser4_super(super));
66827+ return get_super_private(super)->blocks_free;
66828+}
66829+
66830+/* set number of blocks free in filesystem */
66831+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
66832+{
66833+ assert("vs-505", super != NULL);
66834+ assert("vs-506", is_reiser4_super(super));
66835+ get_super_private(super)->blocks_free = nr;
66836+}
66837+
66838+/* get mkfs unique identifier */
66839+__u32 reiser4_mkfs_id(const struct super_block *super /* super block
66840+ queried */ )
66841+{
66842+ assert("vpf-221", super != NULL);
66843+ assert("vpf-222", is_reiser4_super(super));
66844+ return get_super_private(super)->mkfs_id;
66845+}
66846+
66847+/* amount of free blocks in file system as of the last commit */
66848+__u64 reiser4_free_committed_blocks(const struct super_block *super)
66849+{
66850+ assert("vs-497", super != NULL);
66851+ assert("vs-498", is_reiser4_super(super));
66852+ return get_super_private(super)->blocks_free_committed;
66853+}
66854+
66855+/* amount of blocks in the file system reserved for @uid and @gid */
66856+long reiser4_reserved_blocks(const struct super_block *super /* super block
66857+ queried */ ,
66858+ uid_t uid /* user id */ ,
66859+ gid_t gid /* group id */ )
66860+{
66861+ long reserved;
66862+
66863+ assert("nikita-456", super != NULL);
66864+ assert("nikita-457", is_reiser4_super(super));
66865+
66866+ reserved = 0;
66867+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
66868+ reserved += reserved_for_gid(super, gid);
66869+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
66870+ reserved += reserved_for_uid(super, uid);
66871+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
66872+ reserved += reserved_for_root(super);
66873+ return reserved;
66874+}
66875+
66876+/* get/set value of/to grabbed blocks counter */
66877+__u64 reiser4_grabbed_blocks(const struct super_block * super)
66878+{
66879+ assert("zam-512", super != NULL);
66880+ assert("zam-513", is_reiser4_super(super));
66881+
66882+ return get_super_private(super)->blocks_grabbed;
66883+}
66884+
66885+__u64 reiser4_flush_reserved(const struct super_block * super)
66886+{
66887+ assert("vpf-285", super != NULL);
66888+ assert("vpf-286", is_reiser4_super(super));
66889+
66890+ return get_super_private(super)->blocks_flush_reserved;
66891+}
66892+
66893+/* get/set value of/to counter of fake allocated formatted blocks */
66894+__u64 reiser4_fake_allocated(const struct super_block * super)
66895+{
66896+ assert("zam-516", super != NULL);
66897+ assert("zam-517", is_reiser4_super(super));
66898+
66899+ return get_super_private(super)->blocks_fake_allocated;
66900+}
66901+
66902+/* get/set value of/to counter of fake allocated unformatted blocks */
66903+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
66904+{
66905+ assert("zam-516", super != NULL);
66906+ assert("zam-517", is_reiser4_super(super));
66907+
66908+ return get_super_private(super)->blocks_fake_allocated_unformatted;
66909+}
66910+
66911+/* get/set value of/to counter of clustered blocks */
66912+__u64 reiser4_clustered_blocks(const struct super_block * super)
66913+{
66914+ assert("edward-601", super != NULL);
66915+ assert("edward-602", is_reiser4_super(super));
66916+
66917+ return get_super_private(super)->blocks_clustered;
66918+}
66919+
66920+/* space allocator used by this file system */
66921+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
66922+ *super)
66923+{
66924+ assert("nikita-1965", super != NULL);
66925+ assert("nikita-1966", is_reiser4_super(super));
66926+ return &get_super_private(super)->space_allocator;
66927+}
66928+
66929+/* return fake inode used to bind formatted nodes in the page cache */
66930+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
66931+ queried */ )
66932+{
66933+ assert("nikita-1757", super != NULL);
66934+ return get_super_private(super)->fake;
66935+}
66936+
66937+/* return fake inode used to bind copied on capture nodes in the page cache */
66938+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
66939+ queried */ )
66940+{
66941+ assert("nikita-1757", super != NULL);
66942+ return get_super_private(super)->cc;
66943+}
66944+
66945+/* return fake inode used to bind bitmaps and journal heads */
66946+struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
66947+{
66948+ assert("nikita-17571", super != NULL);
66949+ return get_super_private(super)->bitmap;
66950+}
66951+
66952+/* tree used by this file system */
66953+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
66954+ * queried */ )
66955+{
66956+ assert("nikita-460", super != NULL);
66957+ assert("nikita-461", is_reiser4_super(super));
66958+ return &get_super_private(super)->tree;
66959+}
66960+
66961+/* Check that @super is (looks like) reiser4 super block. This is mainly for
66962+ use in assertions. */
66963+int is_reiser4_super(const struct super_block *super /* super block
66964+ * queried */ )
66965+{
66966+ return
66967+ super != NULL &&
66968+ get_super_private(super) != NULL &&
66969+ super->s_op == &(get_super_private(super)->ops.super);
66970+}
66971+
66972+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
66973+{
66974+ return test_bit((int)f, &get_super_private(super)->fs_flags);
66975+}
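
As a hedged illustration, a caller might consult one of the reiser4_fs_flag bits (defined in super.h) like this; the wrapper function is hypothetical, only reiser4_is_set() and the flag name come from this patch:

/* Sketch only: choose 32-bit time stamping when the corresponding
 * file-system flag is set. */
static int inode_times_are_32bit(const struct super_block *super)
{
	return reiser4_is_set(super, REISER4_32_BIT_TIMES);
}
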
66976+
66977+/* amount of blocks reserved for given group in file system */
66978+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
66979+ * block
66980+ * queried */ ,
66981+ gid_t gid UNUSED_ARG /* group id */ )
66982+{
66983+ return 0;
66984+}
66985+
66986+/* amount of blocks reserved for given user in file system */
66987+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
66988+ block
66989+ queried */ ,
66990+ uid_t uid UNUSED_ARG /* user id */ )
66991+{
66992+ return 0;
66993+}
66994+
66995+/* amount of blocks reserved for super user in file system */
66996+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
66997+ block
66998+ queried */ )
66999+{
67000+ return 0;
67001+}
67002+
67003+/*
67004+ * true if block number @blk makes sense for the file system at @super.
67005+ */
67006+int
67007+reiser4_blocknr_is_sane_for(const struct super_block *super,
67008+ const reiser4_block_nr * blk)
67009+{
67010+ reiser4_super_info_data *sbinfo;
67011+
67012+ assert("nikita-2957", super != NULL);
67013+ assert("nikita-2958", blk != NULL);
67014+
67015+ if (reiser4_blocknr_is_fake(blk))
67016+ return 1;
67017+
67018+ sbinfo = get_super_private(super);
67019+ return *blk < sbinfo->block_count;
67020+}
67021+
67022+#if REISER4_DEBUG
67023+/*
67024+ * true, if block number @blk makes sense for the current file system
67025+ */
67026+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
67027+{
67028+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
67029+}
67030+#endif /* REISER4_DEBUG */
67031+
67032+/* Make Linus happy.
67033+ Local variables:
67034+ c-indentation-style: "K&R"
67035+ mode-name: "LC"
67036+ c-basic-offset: 8
67037+ tab-width: 8
67038+ fill-column: 120
67039+ End:
67040+*/
67041diff -urN linux-2.6.20.orig/fs/reiser4/super.h linux-2.6.20/fs/reiser4/super.h
67042--- linux-2.6.20.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
67043+++ linux-2.6.20/fs/reiser4/super.h 2007-05-06 14:50:43.875030717 +0400
67044@@ -0,0 +1,464 @@
67045+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67046+ * reiser4/README */
67047+
67048+/* Super-block functions. See super.c for details. */
67049+
67050+#if !defined( __REISER4_SUPER_H__ )
67051+#define __REISER4_SUPER_H__
67052+
67053+#include "tree.h"
67054+#include "entd.h"
67055+#include "wander.h"
67056+#include "fsdata.h"
67057+#include "plugin/object.h"
67058+#include "plugin/space/space_allocator.h"
67059+
67060+/*
67061+ * Flush algorithms parameters.
67062+ */
67063+typedef struct {
67064+ unsigned relocate_threshold;
67065+ unsigned relocate_distance;
67066+ unsigned written_threshold;
67067+ unsigned scan_maxnodes;
67068+} flush_params;
67069+
67070+typedef enum {
67071+ /*
67072+ * True if this file system doesn't support hard-links (multiple names)
67073+ * for directories: this is the default UNIX behavior.
67074+ *
67075+ * If hard-links on directories are not allowed, the file system is an
67076+ * acyclic directed graph (modulo dot and dotdot, of course).
67077+ *
67078+ * This is used by reiser4_link().
67079+ */
67080+ REISER4_ADG = 0,
67081+ /*
67082+ * set if all nodes in the internal tree have the same node layout
67083+ * plugin. If so, znode_guess_plugin() will return tree->node_plugin
67084+ * instead of guessing the plugin by the plugin id stored in the node.
67085+ */
67086+ REISER4_ONE_NODE_PLUGIN = 1,
67087+ /* if set, bsd gid assignment is supported. */
67088+ REISER4_BSD_GID = 2,
67089+ /* [mac]_time are 32 bit in inode */
67090+ REISER4_32_BIT_TIMES = 3,
67091+ /* if set, don't load all bitmap blocks at mount time */
67092+ REISER4_DONT_LOAD_BITMAP = 5,
67093+ /* enforce atomicity during write(2) */
67094+ REISER4_ATOMIC_WRITE = 6,
67095+ /* don't use write barriers in the log writer code. */
67096+ REISER4_NO_WRITE_BARRIER = 7
67097+} reiser4_fs_flag;
67098+
67099+/*
67100+ * VFS related operation vectors.
67101+ */
67102+typedef struct object_ops {
67103+ struct super_operations super;
67104+ struct dentry_operations dentry;
67105+ struct export_operations export;
67106+} object_ops;
67107+
67108+/* reiser4-specific part of super block
67109+
67110+ Locking
67111+
67112+ Fields immutable after mount:
67113+
67114+ ->oid*
67115+ ->space*
67116+ ->default_[ug]id
67117+ ->mkfs_id
67118+ ->trace_flags
67119+ ->debug_flags
67120+ ->fs_flags
67121+ ->df_plug
67122+ ->optimal_io_size
67123+ ->plug
67124+ ->flush
67125+ ->u (bad name)
67126+ ->txnmgr
67127+ ->ra_params
67128+ ->fsuid
67129+ ->journal_header
67130+ ->journal_footer
67131+
67132+ Fields protected by ->lnode_guard
67133+
67134+ ->lnode_htable
67135+
67136+ Fields protected by per-super block spin lock
67137+
67138+ ->block_count
67139+ ->blocks_used
67140+ ->blocks_free
67141+ ->blocks_free_committed
67142+ ->blocks_grabbed
67143+ ->blocks_fake_allocated_unformatted
67144+ ->blocks_fake_allocated
67145+ ->blocks_flush_reserved
67146+ ->eflushed
67147+ ->blocknr_hint_default
67148+
67149+ After journal replay during mount,
67150+
67151+ ->last_committed_tx
67152+
67153+ is protected by ->tmgr.commit_mutex
67154+
67155+ Invariants involving this data-type:
67156+
67157+ [sb-block-counts]
67158+ [sb-grabbed]
67159+ [sb-fake-allocated]
67160+*/
67161+struct reiser4_super_info_data {
67162+ /*
67163+ * guard spinlock which protects reiser4 super block fields (currently
67164+ * blocks_free, blocks_free_committed)
67165+ */
67166+ spinlock_t guard;
67167+
67168+ /* next oid that will be returned by oid_allocate() */
67169+ oid_t next_to_use;
67170+ /* total number of used oids */
67171+ oid_t oids_in_use;
67172+
67173+ /* space manager plugin */
67174+ reiser4_space_allocator space_allocator;
67175+
67176+ /* reiser4 internal tree */
67177+ reiser4_tree tree;
67178+
67179+ /*
67180+ * default user id used for light-weight files without their own
67181+ * stat-data.
67182+ */
67183+ uid_t default_uid;
67184+
67185+ /*
67186+ * default group id used for light-weight files without their own
67187+ * stat-data.
67188+ */
67189+ gid_t default_gid;
67190+
67191+ /* mkfs identifier generated at mkfs time. */
67192+ __u32 mkfs_id;
67193+ /* amount of blocks in a file system */
67194+ __u64 block_count;
67195+
67196+ /* inviolable reserve */
67197+ __u64 blocks_reserved;
67198+
67199+ /* amount of blocks used by file system data and meta-data. */
67200+ __u64 blocks_used;
67201+
67202+ /*
67203+ * amount of free blocks. This is "working" free blocks counter. It is
67204+ * like "working" bitmap, please see block_alloc.c for description.
67205+ */
67206+ __u64 blocks_free;
67207+
67208+ /*
67209+ * free block count for fs committed state. This is "commit" version of
67210+ * free block counter.
67211+ */
67212+ __u64 blocks_free_committed;
67213+
67214+ /*
67215+ * number of blocks reserved for further allocation, for all
67216+ * threads.
67217+ */
67218+ __u64 blocks_grabbed;
67219+
67220+ /* number of fake allocated unformatted blocks in tree. */
67221+ __u64 blocks_fake_allocated_unformatted;
67222+
67223+ /* number of fake allocated formatted blocks in tree. */
67224+ __u64 blocks_fake_allocated;
67225+
67226+ /* number of blocks reserved for flush operations. */
67227+ __u64 blocks_flush_reserved;
67228+
67229+ /* number of blocks reserved for cluster operations. */
67230+ __u64 blocks_clustered;
67231+
67232+ /* unique file-system identifier */
67233+ __u32 fsuid;
67234+
67235+ /* On-disk format version. If it does not equal the disk_format
67236+ plugin version, some format updates (e.g. enlarging the plugin
67237+ set) may take place on mount. */
67238+ int version;
67239+
67240+ /* file-system wide flags. See reiser4_fs_flag enum */
67241+ unsigned long fs_flags;
67242+
67243+ /* transaction manager */
67244+ txn_mgr tmgr;
67245+
67246+ /* ent thread */
67247+ entd_context entd;
67248+
67249+ /* fake inode used to bind formatted nodes */
67250+ struct inode *fake;
67251+ /* inode used to bind bitmaps (and journal heads) */
67252+ struct inode *bitmap;
67253+ /* inode used to bind copied on capture nodes */
67254+ struct inode *cc;
67255+
67256+ /* disk layout plugin */
67257+ disk_format_plugin *df_plug;
67258+
67259+ /* disk layout specific part of reiser4 super info data */
67260+ union {
67261+ format40_super_info format40;
67262+ } u;
67263+
67264+ /* value we return in st_blksize on stat(2) */
67265+ unsigned long optimal_io_size;
67266+
67267+ /* parameters for the flush algorithm */
67268+ flush_params flush;
67269+
67270+ /* pointers to jnodes for journal header and footer */
67271+ jnode *journal_header;
67272+ jnode *journal_footer;
67273+
67274+ journal_location jloc;
67275+
67276+ /* head block number of last committed transaction */
67277+ __u64 last_committed_tx;
67278+
67279+ /*
67280+ * we remember last written location for using as a hint for new block
67281+ * allocation
67282+ */
67283+ __u64 blocknr_hint_default;
67284+
67285+ /* committed number of files (oid allocator state variable) */
67286+ __u64 nr_files_committed;
67287+
67288+ ra_params_t ra_params;
67289+
67290+ /*
67291+ * A mutex for serializing the cut tree operation when out of free
67292+ * space: only one cut_tree thread is allowed to grab space from the
67293+ * reserved area (which is 5% of the disk space)
67294+ */
67295+ struct mutex delete_mutex;
67296+ /* task owning ->delete_mutex */
67297+ struct task_struct *delete_mutex_owner;
67298+
67299+ /* Diskmap's block number */
67300+ __u64 diskmap_block;
67301+
67302+ /* What to do in case of error */
67303+ int onerror;
67304+
67305+ /* operations for objects on this file system */
67306+ object_ops ops;
67307+
67308+ /*
67309+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67310+ * more details
67311+ */
67312+ d_cursor_info d_info;
67313+
67314+#ifdef CONFIG_REISER4_BADBLOCKS
67315+ /* Alternative master superblock offset (in bytes) */
67316+ unsigned long altsuper;
67317+#endif
67318+ struct repacker *repacker;
67319+ struct page *status_page;
67320+ struct bio *status_bio;
67321+
67322+#if REISER4_DEBUG
67323+ /*
67324+ * minimum used blocks value (includes super blocks, bitmap blocks and
67325+ * other fs reserved areas), depends on fs format and fs size.
67326+ */
67327+ __u64 min_blocks_used;
67328+
67329+ /*
67330+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67331+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
67332+ * protected by sbinfo->all_guard spin lock. This lock should be taken
67333+ * with _irq modifier, because it is also modified from interrupt
67334+ * contexts (by RCU).
67335+ */
67336+ spinlock_t all_guard;
67337+ /* list of all jnodes */
67338+ struct list_head all_jnodes;
67339+#endif
67340+ struct dentry *debugfs_root;
67341+};
67342+
67343+extern reiser4_super_info_data *get_super_private_nocheck(const struct
67344+ super_block *super);
67345+
67346+/* Return reiser4-specific part of super block */
67347+static inline reiser4_super_info_data *get_super_private(const struct
67348+ super_block *super)
67349+{
67350+ assert("nikita-447", super != NULL);
67351+
67352+ return (reiser4_super_info_data *) super->s_fs_info;
67353+}
67354+
67355+/* get ent context for the @super */
67356+static inline entd_context *get_entd_context(struct super_block *super)
67357+{
67358+ return &get_super_private(super)->entd;
67359+}
67360+
67361+/* "Current" super-block: main super block used during current system
67362+ call. Reference to this super block is stored in reiser4_context. */
67363+static inline struct super_block *reiser4_get_current_sb(void)
67364+{
67365+ return get_current_context()->super;
67366+}
67367+
67368+/* Reiser4-specific part of "current" super-block: main super block used
67369+ during current system call. Reference to this super block is stored in
67370+ reiser4_context. */
67371+static inline reiser4_super_info_data *get_current_super_private(void)
67372+{
67373+ return get_super_private(reiser4_get_current_sb());
67374+}
67375+
67376+static inline ra_params_t *get_current_super_ra_params(void)
67377+{
67378+ return &(get_current_super_private()->ra_params);
67379+}
67380+
67381+/*
67382+ * true, if file system on @super is read-only
67383+ */
67384+static inline int rofs_super(struct super_block *super)
67385+{
67386+ return super->s_flags & MS_RDONLY;
67387+}
67388+
67389+/*
67390+ * true, if @tree represents read-only file system
67391+ */
67392+static inline int rofs_tree(reiser4_tree * tree)
67393+{
67394+ return rofs_super(tree->super);
67395+}
67396+
67397+/*
67398+ * true, if file system where @inode lives on, is read-only
67399+ */
67400+static inline int rofs_inode(struct inode *inode)
67401+{
67402+ return rofs_super(inode->i_sb);
67403+}
67404+
67405+/*
67406+ * true, if file system where @node lives on, is read-only
67407+ */
67408+static inline int rofs_jnode(jnode * node)
67409+{
67410+ return rofs_tree(jnode_get_tree(node));
67411+}
67412+
67413+extern __u64 reiser4_current_block_count(void);
67414+
67415+extern void build_object_ops(struct super_block *super, object_ops * ops);
67416+
67417+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67418+
67419+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67420+{
67421+ spin_lock(&(sbinfo->guard));
67422+}
67423+
67424+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67425+{
67426+ assert_spin_locked(&(sbinfo->guard));
67427+ spin_unlock(&(sbinfo->guard));
67428+}
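
As a sketch of the locking rules documented above, an update of one of the guarded counters would sit between these two helpers; the function name sub_free_blocks is hypothetical:

/* Sketch only: ->blocks_free is among the fields protected by the
 * per-super spin lock, so it is modified only under ->guard. */
static void sub_free_blocks(struct super_block *super, __u64 nr)
{
	reiser4_super_info_data *sbinfo = get_super_private(super);

	spin_lock_reiser4_super(sbinfo);
	sbinfo->blocks_free -= nr;
	spin_unlock_reiser4_super(sbinfo);
}
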
67429+
67430+extern __u64 reiser4_flush_reserved(const struct super_block *);
67431+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67432+extern long reiser4_statfs_type(const struct super_block *super);
67433+extern __u64 reiser4_block_count(const struct super_block *super);
67434+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67435+extern __u64 reiser4_data_blocks(const struct super_block *super);
67436+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67437+extern __u64 reiser4_free_blocks(const struct super_block *super);
67438+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67439+extern __u32 reiser4_mkfs_id(const struct super_block *super);
67440+
67441+extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67442+
67443+extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67444+extern __u64 reiser4_fake_allocated(const struct super_block *);
67445+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67446+extern __u64 reiser4_clustered_blocks(const struct super_block *);
67447+
67448+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67449+ gid_t gid);
67450+
67451+extern reiser4_space_allocator *
67452+reiser4_get_space_allocator(const struct super_block *super);
67453+extern reiser4_oid_allocator *
67454+reiser4_get_oid_allocator(const struct super_block *super);
67455+extern struct inode *reiser4_get_super_fake(const struct super_block *super);
67456+extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
67457+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
67458+extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
67459+extern int is_reiser4_super(const struct super_block *super);
67460+
67461+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67462+extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67463+ const reiser4_block_nr * blk);
67464+extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67465+extern int reiser4_done_super(struct super_block *s);
67466+
67467+/* step of fill super */
67468+extern int reiser4_init_fs_info(struct super_block *);
67469+extern void reiser4_done_fs_info(struct super_block *);
67470+extern int reiser4_init_super_data(struct super_block *, char *opt_string);
67471+extern int reiser4_init_read_super(struct super_block *, int silent);
67472+extern int reiser4_init_root_inode(struct super_block *);
67473+extern reiser4_plugin *get_default_plugin(pset_member memb);
67474+
67475+/* Maximal possible object id. */
67476+#define ABSOLUTE_MAX_OID ((oid_t)~0)
67477+
67478+#define OIDS_RESERVED ( 1 << 16 )
67479+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67480+oid_t oid_allocate(struct super_block *);
67481+int oid_release(struct super_block *, oid_t);
67482+oid_t oid_next(const struct super_block *);
67483+void oid_count_allocated(void);
67484+void oid_count_released(void);
67485+long oids_used(const struct super_block *);
67486+
67487+#if REISER4_DEBUG
67488+void print_fs_info(const char *prefix, const struct super_block *);
67489+#endif
67490+
67491+extern void destroy_reiser4_cache(struct kmem_cache **);
67492+
67493+extern struct super_operations reiser4_super_operations;
67494+extern struct export_operations reiser4_export_operations;
67495+extern struct dentry_operations reiser4_dentry_operations;
67496+
67497+/* __REISER4_SUPER_H__ */
67498+#endif
67499+
67500+/*
67501+ * Local variables:
67502+ * c-indentation-style: "K&R"
67503+ * mode-name: "LC"
67504+ * c-basic-offset: 8
67505+ * tab-width: 8
67506+ * fill-column: 120
67507+ * End:
67508+ */
67509diff -urN linux-2.6.20.orig/fs/reiser4/super_ops.c linux-2.6.20/fs/reiser4/super_ops.c
67510--- linux-2.6.20.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
67511+++ linux-2.6.20/fs/reiser4/super_ops.c 2007-05-06 14:50:43.879031967 +0400
67512@@ -0,0 +1,728 @@
67513+/* Copyright 2005 by Hans Reiser, licensing governed by
67514+ * reiser4/README */
67515+
67516+#include "inode.h"
67517+#include "page_cache.h"
67518+#include "ktxnmgrd.h"
67519+#include "flush.h"
67520+#include "safe_link.h"
67521+
67522+#include <linux/vfs.h>
67523+#include <linux/writeback.h>
67524+#include <linux/mount.h>
67525+#include <linux/seq_file.h>
67526+#include <linux/debugfs.h>
67527+
67528+/* slab cache for inodes */
67529+static struct kmem_cache *inode_cache;
67530+
67531+static struct dentry *reiser4_debugfs_root = NULL;
67532+
67533+/**
67534+ * init_once - constructor for reiser4 inodes
67535+ * @obj: inode to be initialized
67536+ * @cache: cache @obj belongs to
67537+ * @flags: SLAB flags
67538+ *
67539+ * Initialization function called when a new object is allocated by the
67540+ * reiser4 inode cache. It is set at inode cache creation.
67541+ */
67542+static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
67543+{
67544+ reiser4_inode_object *info;
67545+
67546+ info = obj;
67547+
67548+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
67549+ SLAB_CTOR_CONSTRUCTOR) {
67550+ /* initialize vfs inode */
67551+ inode_init_once(&info->vfs_inode);
67552+
67553+ /*
67554+ * initialize the reiser4 specific part of the inode.
67555+ * NOTE-NIKITA add here initializations for locks, list heads,
67556+ * etc. that will be added to our private inode part.
67557+ */
67558+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67559+ init_rwsem(&info->p.conv_sem);
67560+ /* init semaphore which is used during inode loading */
67561+ loading_init_once(&info->p);
67562+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67563+ GFP_ATOMIC);
67564+#if REISER4_DEBUG
67565+ info->p.nr_jnodes = 0;
67566+#endif
67567+ }
67568+}
67569+
67570+/**
67571+ * init_inodes - create inode cache
67572+ *
67573+ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
67574+ */
67575+static int init_inodes(void)
67576+{
67577+ inode_cache = kmem_cache_create("reiser4_inode",
67578+ sizeof(reiser4_inode_object),
67579+ 0,
67580+ SLAB_HWCACHE_ALIGN |
67581+ SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67582+ if (inode_cache == NULL)
67583+ return RETERR(-ENOMEM);
67584+ return 0;
67585+}
67586+
67587+/**
67588+ * done_inodes - delete inode cache
67589+ *
67590+ * This is called on reiser4 module unloading or system shutdown.
67591+ */
67592+static void done_inodes(void)
67593+{
67594+ destroy_reiser4_cache(&inode_cache);
67595+}
67596+
67597+/**
67598+ * reiser4_alloc_inode - alloc_inode of super operations
67599+ * @super: super block new inode is allocated for
67600+ *
67601+ * Allocates new inode, initializes reiser4 specific part of it.
67602+ */
67603+static struct inode *reiser4_alloc_inode(struct super_block *super)
67604+{
67605+ reiser4_inode_object *obj;
67606+
67607+ assert("nikita-1696", super != NULL);
67608+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
67609+ if (obj != NULL) {
67610+ reiser4_inode *info;
67611+
67612+ info = &obj->p;
67613+
67614+ info->pset = plugin_set_get_empty();
67615+ info->hset = plugin_set_get_empty();
67616+ info->extmask = 0;
67617+ info->locality_id = 0ull;
67618+ info->plugin_mask = 0;
67619+ info->heir_mask = 0;
67620+#if !REISER4_INO_IS_OID
67621+ info->oid_hi = 0;
67622+#endif
67623+ reiser4_seal_init(&info->sd_seal, NULL, NULL);
67624+ coord_init_invalid(&info->sd_coord, NULL);
67625+ info->flags = 0;
67626+ spin_lock_init(&info->guard);
67627+ /* this deals with info's loading semaphore */
67628+ loading_alloc(info);
67629+ info->vroot = UBER_TREE_ADDR;
67630+ return &obj->vfs_inode;
67631+ } else
67632+ return NULL;
67633+}
67634+
67635+/**
67636+ * reiser4_destroy_inode - destroy_inode of super operations
67637+ * @inode: inode being destroyed
67638+ *
67639+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
67640+ */
67641+static void reiser4_destroy_inode(struct inode *inode)
67642+{
67643+ reiser4_inode *info;
67644+
67645+ info = reiser4_inode_data(inode);
67646+
67647+ assert("vs-1220", inode_has_no_jnodes(info));
67648+
67649+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67650+ file_plugin *fplug = inode_file_plugin(inode);
67651+ if (fplug->destroy_inode != NULL)
67652+ fplug->destroy_inode(inode);
67653+ }
67654+ reiser4_dispose_cursors(inode);
67655+ if (info->pset)
67656+ plugin_set_put(info->pset);
67657+ if (info->hset)
67658+ plugin_set_put(info->hset);
67659+
67660+ /*
67661+ * cannot add a similar assertion about ->i_list, as prune_icache
67662+ * returns inodes into the slab with dangling ->list.{next,prev}. This
67663+ * is safe, because they are re-initialized in new_inode().
67664+ */
67665+ assert("nikita-2895", list_empty(&inode->i_dentry));
67666+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67667+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67668+
67669+ /* this deals with info's loading semaphore */
67670+ loading_destroy(info);
67671+
67672+ kmem_cache_free(inode_cache,
67673+ container_of(info, reiser4_inode_object, p));
67674+}
67675+
67676+/**
67677+ * reiser4_dirty_inode - dirty_inode of super operations
67678+ * @inode: inode being dirtied
67679+ *
67680+ * Updates stat data.
67681+ */
67682+static void reiser4_dirty_inode(struct inode *inode)
67683+{
67684+ int result;
67685+
67686+ if (!is_in_reiser4_context())
67687+ return;
67688+ assert("", !IS_RDONLY(inode));
67689+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
67690+ get_current_context()->grabbed_blocks));
67691+
67692+ result = reiser4_update_sd(inode);
67693+ if (result)
67694+ warning("", "failed to dirty inode for %llu: %d",
67695+ get_inode_oid(inode), result);
67696+}
67697+
67698+/**
67699+ * reiser4_delete_inode - delete_inode of super operations
67700+ * @inode: inode to delete
67701+ *
67702+ * Calls file plugin's delete_object method to delete object items from
67703+ * filesystem tree and calls clear_inode.
67704+ */
67705+static void reiser4_delete_inode(struct inode *inode)
67706+{
67707+ reiser4_context *ctx;
67708+ file_plugin *fplug;
67709+
67710+ ctx = reiser4_init_context(inode->i_sb);
67711+ if (IS_ERR(ctx)) {
67712+ warning("vs-15", "failed to init context");
67713+ return;
67714+ }
67715+
67716+ if (is_inode_loaded(inode)) {
67717+ fplug = inode_file_plugin(inode);
67718+ if (fplug != NULL && fplug->delete_object != NULL)
67719+ fplug->delete_object(inode);
67720+ }
67721+
67722+ truncate_inode_pages(&inode->i_data, 0);
67723+ inode->i_blocks = 0;
67724+ clear_inode(inode);
67725+ reiser4_exit_context(ctx);
67726+}
67727+
67728+/**
67729+ * reiser4_put_super - put_super of super operations
67730+ * @super: super block to free
67731+ *
67732+ * Stops daemons and releases resources; in short, it unmounts.
67733+ */
67734+static void reiser4_put_super(struct super_block *super)
67735+{
67736+ reiser4_super_info_data *sbinfo;
67737+ reiser4_context *ctx;
67738+
67739+ sbinfo = get_super_private(super);
67740+ assert("vs-1699", sbinfo);
67741+
67742+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
67743+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
67744+ debugfs_remove(sbinfo->debugfs_root);
67745+
67746+ ctx = reiser4_init_context(super);
67747+ if (IS_ERR(ctx)) {
67748+ warning("vs-17", "failed to init context");
67749+ return;
67750+ }
67751+
67752+ /* have disk format plugin to free its resources */
67753+ if (get_super_private(super)->df_plug->release)
67754+ get_super_private(super)->df_plug->release(super);
67755+
67756+ reiser4_done_formatted_fake(super);
67757+
67758+ /* stop daemons: ktxnmgr and entd */
67759+ reiser4_done_entd(super);
67760+ reiser4_done_ktxnmgrd(super);
67761+ reiser4_done_txnmgr(&sbinfo->tmgr);
67762+
67763+ reiser4_done_fs_info(super);
67764+ reiser4_exit_context(ctx);
67765+}
67766+
67767+/**
67768+ * reiser4_write_super - write_super of super operations
67769+ * @super: super block to write
67770+ *
67771+ * Captures the znode associated with the super block, commits all transactions.
67772+ */
67773+static void reiser4_write_super(struct super_block *super)
67774+{
67775+ int ret;
67776+ reiser4_context *ctx;
67777+
67778+ assert("vs-1700", !rofs_super(super));
67779+
67780+ ctx = reiser4_init_context(super);
67781+ if (IS_ERR(ctx)) {
67782+ warning("vs-16", "failed to init context");
67783+ return;
67784+ }
67785+
67786+ ret = reiser4_capture_super_block(super);
67787+ if (ret != 0)
67788+ warning("vs-1701",
67789+ "reiser4_capture_super_block failed in write_super: %d",
67790+ ret);
67791+ ret = txnmgr_force_commit_all(super, 0);
67792+ if (ret != 0)
67793+ warning("jmacd-77113",
67794+ "txn_force failed in write_super: %d", ret);
67795+
67796+ super->s_dirt = 0;
67797+
67798+ reiser4_exit_context(ctx);
67799+}
67800+
67801+/**
67802+ * reiser4_statfs - statfs of super operations
67803+ * @dentry: dentry of the file system being queried
67804+ * @statfs: buffer to fill with statistics
67805+ *
67806+ * Returns information about filesystem.
67807+ */
67808+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
67809+{
67810+ sector_t total;
67811+ sector_t reserved;
67812+ sector_t free;
67813+ sector_t forroot;
67814+ sector_t deleted;
67815+ reiser4_context *ctx;
67816+ struct super_block *super = dentry->d_sb;
67817+
67818+ assert("nikita-408", super != NULL);
67819+ assert("nikita-409", statfs != NULL);
67820+
67821+ ctx = reiser4_init_context(super);
67822+ if (IS_ERR(ctx))
67823+ return PTR_ERR(ctx);
67824+
67825+ statfs->f_type = reiser4_statfs_type(super);
67826+ statfs->f_bsize = super->s_blocksize;
67827+
67828+ /*
67829+ * 5% of total block space is reserved. This is needed for flush and
67830+ * for truncates (so that we are able to perform truncate/unlink even
67831+ * on the otherwise completely full file system). If this reservation
67832+ * is hidden from statfs(2), users will mistakenly guess that they
67833+ * have enough free space to complete some operation, which is
67834+ * frustrating.
67835+ *
67836+ * Another possible solution is to subtract ->blocks_reserved from
67837+ * ->f_bfree, but changing available space seems less intrusive than
67838+ * letting the user see 5% of the disk space used directly after
67839+ * mkfs.
67840+ */
67841+ total = reiser4_block_count(super);
67842+ reserved = get_super_private(super)->blocks_reserved;
67843+ deleted = txnmgr_count_deleted_blocks();
67844+ free = reiser4_free_blocks(super) + deleted;
67845+ forroot = reiser4_reserved_blocks(super, 0, 0);
67846+
67847+ /*
67848+ * These counters may be in inconsistent state because we take the
67849+ * values without keeping any global spinlock. Here we do a sanity
67850+ * check that free block counter does not exceed the number of all
67851+ * blocks.
67852+ */
67853+ if (free > total)
67854+ free = total;
67855+ statfs->f_blocks = total - reserved;
67856+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
67857+ if (free > reserved)
67858+ free -= reserved;
67859+ else
67860+ free = 0;
67861+ statfs->f_bfree = free;
67862+
67863+ if (free > forroot)
67864+ free -= forroot;
67865+ else
67866+ free = 0;
67867+ statfs->f_bavail = free;
67868+
67869+ statfs->f_files = 0;
67870+ statfs->f_ffree = 0;
67871+
67872+ /* maximal acceptable name length depends on directory plugin. */
67873+ assert("nikita-3351", super->s_root->d_inode != NULL);
67874+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
67875+ reiser4_exit_context(ctx);
67876+ return 0;
67877+}
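
To make the clamping above concrete, a standalone numeric sketch with illustrative values (not part of the patch):

#include <assert.h>

int main(void)
{
	unsigned long long total = 1000, reserved = 50, forroot = 10;
	unsigned long long free = 1020;	/* may transiently exceed total */

	if (free > total)
		free = total;						/* 1000 */
	unsigned long long f_blocks = total - reserved;			/* 950 */
	unsigned long long f_bfree = free > reserved ? free - reserved : 0;	/* 950 */
	unsigned long long f_bavail = f_bfree > forroot ? f_bfree - forroot : 0; /* 940 */

	assert(f_bfree <= f_blocks && f_bavail <= f_bfree);
	return 0;
}
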
67878+
67879+/**
67880+ * reiser4_clear_inode - clear_inode of super operation
67881+ * @inode: inode about to be destroyed
67882+ *
67883+ * Does sanity checks: an inode being destroyed should have all jnodes detached.
67884+ */
67885+static void reiser4_clear_inode(struct inode *inode)
67886+{
67887+#if REISER4_DEBUG
67888+ reiser4_inode *r4_inode;
67889+
67890+ r4_inode = reiser4_inode_data(inode);
67891+ if (!inode_has_no_jnodes(r4_inode))
67892+ warning("vs-1732", "reiser4 inode has %ld jnodes\n",
67893+ r4_inode->nr_jnodes);
67894+#endif
67895+}
67896+
67897+/**
67898+ * reiser4_sync_inodes - sync_inodes of super operations
67899+ * @super:
67900+ * @wbc:
67901+ *
67902+ * This method is called by background and non-background writeback. Reiser4's
67903+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
67904+ * each dirty inode. Reiser4_writepages handles pages dirtied via a shared
67905+ * mapping - dirty pages get into atoms. Writeout is called to flush some
67906+ * atoms.
67907+ */
67908+static void reiser4_sync_inodes(struct super_block *super,
67909+ struct writeback_control *wbc)
67910+{
67911+ reiser4_context *ctx;
67912+ long to_write;
67913+
67914+ if (wbc->for_kupdate)
67915+ /* reiser4 has its own means of periodical write-out */
67916+ return;
67917+
67918+ to_write = wbc->nr_to_write;
67919+ assert("vs-49", wbc->older_than_this == NULL);
67920+
67921+ ctx = reiser4_init_context(super);
67922+ if (IS_ERR(ctx)) {
67923+ warning("vs-13", "failed to init context");
67924+ return;
67925+ }
67926+
67927+ /*
67928+ * call reiser4_writepages for each of dirty inodes to turn dirty pages
67929+ * into transactions if they were not yet.
67930+ */
67931+ generic_sync_sb_inodes(super, wbc);
67932+
67933+ /* flush goes here */
67934+ wbc->nr_to_write = to_write;
67935+ reiser4_writeout(super, wbc);
67936+
67937+ /* avoid recursive calls to ->sync_inodes */
67938+ context_set_commit_async(ctx);
67939+ reiser4_exit_context(ctx);
67940+}
67941+
67942+/**
67943+ * reiser4_show_options - show_options of super operations
67944+ * @m: file where to write information
67945+ * @mnt: mount structure
67946+ *
67947+ * Makes reiser4 mount options visible in /proc/mounts.
67948+ */
67949+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
67950+{
67951+ struct super_block *super;
67952+ reiser4_super_info_data *sbinfo;
67953+
67954+ super = mnt->mnt_sb;
67955+ sbinfo = get_super_private(super);
67956+
67957+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
67958+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
67959+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
67960+ seq_printf(m, ",atom_max_flushers=0x%x",
67961+ sbinfo->tmgr.atom_max_flushers);
67962+ seq_printf(m, ",cbk_cache_slots=0x%x",
67963+ sbinfo->tree.cbk_cache.nr_slots);
67964+
67965+ return 0;
67966+}
67967+
67968+struct super_operations reiser4_super_operations = {
67969+ .alloc_inode = reiser4_alloc_inode,
67970+ .destroy_inode = reiser4_destroy_inode,
67971+ .dirty_inode = reiser4_dirty_inode,
67972+ .delete_inode = reiser4_delete_inode,
67973+ .put_super = reiser4_put_super,
67974+ .write_super = reiser4_write_super,
67975+ .statfs = reiser4_statfs,
67976+ .clear_inode = reiser4_clear_inode,
67977+ .sync_inodes = reiser4_sync_inodes,
67978+ .show_options = reiser4_show_options
67979+};
67980+
67981+/**
67982+ * fill_super - initialize super block on mount
67983+ * @super: super block to fill
67984+ * @data: reiser4 specific mount option
67985+ * @silent:
67986+ *
67987+ * This is to be called by reiser4_get_sb. It mounts the filesystem.
67988+ */
67989+static int fill_super(struct super_block *super, void *data, int silent)
67990+{
67991+ reiser4_context ctx;
67992+ int result;
67993+ reiser4_super_info_data *sbinfo;
67994+
67995+ assert("zam-989", super != NULL);
67996+
67997+ super->s_op = NULL;
67998+ init_stack_context(&ctx, super);
67999+
68000+ /* allocate reiser4 specific super block */
68001+ if ((result = reiser4_init_fs_info(super)) != 0)
68002+ goto failed_init_sinfo;
68003+
68004+ sbinfo = get_super_private(super);
68005+ /* initialize various reiser4 parameters, parse mount options */
68006+ if ((result = reiser4_init_super_data(super, data)) != 0)
68007+ goto failed_init_super_data;
68008+
68009+ /* read reiser4 master super block, initialize disk format plugin */
68010+ if ((result = reiser4_init_read_super(super, silent)) != 0)
68011+ goto failed_init_read_super;
68012+
68013+ /* initialize transaction manager */
68014+ reiser4_init_txnmgr(&sbinfo->tmgr);
68015+
68016+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
68017+ if ((result = reiser4_init_ktxnmgrd(super)) != 0)
68018+ goto failed_init_ktxnmgrd;
68019+
68020+ /* initialize entd context and start kernel thread entd */
68021+ if ((result = reiser4_init_entd(super)) != 0)
68022+ goto failed_init_entd;
68023+
68024+ /* initialize address spaces for formatted nodes and bitmaps */
68025+ if ((result = reiser4_init_formatted_fake(super)) != 0)
68026+ goto failed_init_formatted_fake;
68027+
68028+ /* initialize disk format plugin */
68029+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
68030+ goto failed_init_disk_format;
68031+
68032+ /*
68033+ * There are some 'committed' versions of reiser4 super block counters,
68034+ * which correspond to reiser4 on-disk state. These counters are
68035+ * initialized here
68036+ */
68037+ sbinfo->blocks_free_committed = sbinfo->blocks_free;
68038+ sbinfo->nr_files_committed = oids_used(super);
68039+
68040+ /* get inode of root directory */
68041+ if ((result = reiser4_init_root_inode(super)) != 0)
68042+ goto failed_init_root_inode;
68043+
68044+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
68045+ goto failed_update_format_version;
68046+
68047+ process_safelinks(super);
68048+ reiser4_exit_context(&ctx);
68049+
68050+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68051+ reiser4_debugfs_root);
68052+ if (sbinfo->debugfs_root) {
68053+ sbinfo->tmgr.debugfs_atom_count =
68054+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68055+ sbinfo->debugfs_root,
68056+ &sbinfo->tmgr.atom_count);
68057+ sbinfo->tmgr.debugfs_id_count =
68058+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68059+ sbinfo->debugfs_root,
68060+ &sbinfo->tmgr.id_count);
68061+ }
68062+ return 0;
68063+
68064+ failed_update_format_version:
68065+ failed_init_root_inode:
68066+ if (sbinfo->df_plug->release)
68067+ sbinfo->df_plug->release(super);
68068+ failed_init_disk_format:
68069+ reiser4_done_formatted_fake(super);
68070+ failed_init_formatted_fake:
68071+ reiser4_done_entd(super);
68072+ failed_init_entd:
68073+ reiser4_done_ktxnmgrd(super);
68074+ failed_init_ktxnmgrd:
68075+ reiser4_done_txnmgr(&sbinfo->tmgr);
68076+ failed_init_read_super:
68077+ failed_init_super_data:
68078+ reiser4_done_fs_info(super);
68079+ failed_init_sinfo:
68080+ reiser4_exit_context(&ctx);
68081+ return result;
68082+}
68083+
68084+/**
68085+ * reiser4_get_sb - get_sb of file_system_type operations
68086+ * @fs_type:
68087+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68088+ * @dev_name: block device file name
68089+ * @data: specific mount options
68090+ *
68091+ * Reiser4 mount entry.
68092+ */
68093+static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
68094+ const char *dev_name, void *data, struct vfsmount *mnt)
68095+{
68096+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
68097+}
68098+
68099+/* structure describing the reiser4 filesystem implementation */
68100+static struct file_system_type reiser4_fs_type = {
68101+ .owner = THIS_MODULE,
68102+ .name = "reiser4",
68103+ .fs_flags = FS_REQUIRES_DEV,
68104+ .get_sb = reiser4_get_sb,
68105+ .kill_sb = kill_block_super,
68106+ .next = NULL
68107+};
68108+
68109+void destroy_reiser4_cache(struct kmem_cache **cachep)
68110+{
68111+ BUG_ON(*cachep == NULL);
68112+ kmem_cache_destroy(*cachep);
68113+ *cachep = NULL;
68114+}
68115+
68116+/**
68117+ * init_reiser4 - reiser4 initialization entry point
68118+ *
68119+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
68120+ * on kernel initialization or during reiser4 module load.
68121+ */
68122+static int __init init_reiser4(void)
68123+{
68124+ int result;
68125+
68126+ printk(KERN_INFO
68127+ "Loading Reiser4. "
68128+ "See www.namesys.com for a description of Reiser4.\n");
68129+
68130+ /* initialize slab cache of inodes */
68131+ if ((result = init_inodes()) != 0)
68132+ goto failed_inode_cache;
68133+
68134+ /* initialize cache of znodes */
68135+ if ((result = init_znodes()) != 0)
68136+ goto failed_init_znodes;
68137+
68138+ /* initialize all plugins */
68139+ if ((result = init_plugins()) != 0)
68140+ goto failed_init_plugins;
68141+
68142+ /* initialize cache of plugin_set-s and plugin_set's hash table */
68143+ if ((result = init_plugin_set()) != 0)
68144+ goto failed_init_plugin_set;
68145+
68146+ /* initialize caches of txn_atom-s and txn_handle-s */
68147+ if ((result = init_txnmgr_static()) != 0)
68148+ goto failed_init_txnmgr_static;
68149+
68150+ /* initialize cache of jnodes */
68151+ if ((result = init_jnodes()) != 0)
68152+ goto failed_init_jnodes;
68153+
68154+ /* initialize cache of flush queues */
68155+ if ((result = reiser4_init_fqs()) != 0)
68156+ goto failed_init_fqs;
68157+
68158+ /* initialize cache of structures attached to dentry->d_fsdata */
68159+ if ((result = reiser4_init_dentry_fsdata()) != 0)
68160+ goto failed_init_dentry_fsdata;
68161+
68162+ /* initialize cache of structures attached to file->private_data */
68163+ if ((result = reiser4_init_file_fsdata()) != 0)
68164+ goto failed_init_file_fsdata;
68165+
68166+ /*
68167+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68168+ * more details
68169+ */
68170+ if ((result = reiser4_init_d_cursor()) != 0)
68171+ goto failed_init_d_cursor;
68172+
68173+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68174+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68175+ return 0;
68176+ }
68177+
68178+ reiser4_done_d_cursor();
68179+ failed_init_d_cursor:
68180+ reiser4_done_file_fsdata();
68181+ failed_init_file_fsdata:
68182+ reiser4_done_dentry_fsdata();
68183+ failed_init_dentry_fsdata:
68184+ reiser4_done_fqs();
68185+ failed_init_fqs:
68186+ done_jnodes();
68187+ failed_init_jnodes:
68188+ done_txnmgr_static();
68189+ failed_init_txnmgr_static:
68190+ done_plugin_set();
68191+ failed_init_plugin_set:
68192+ failed_init_plugins:
68193+ done_znodes();
68194+ failed_init_znodes:
68195+ done_inodes();
68196+ failed_inode_cache:
68197+ return result;
68198+}
68199+
68200+/**
68201+ * done_reiser4 - reiser4 exit entry point
68202+ *
68203+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
68204+ * or at module unload.
68205+ */
68206+static void __exit done_reiser4(void)
68207+{
68208+ int result;
68209+
68210+ debugfs_remove(reiser4_debugfs_root);
68211+ result = unregister_filesystem(&reiser4_fs_type);
68212+ BUG_ON(result != 0);
68213+ reiser4_done_d_cursor();
68214+ reiser4_done_file_fsdata();
68215+ reiser4_done_dentry_fsdata();
68216+ reiser4_done_fqs();
68217+ done_jnodes();
68218+ done_txnmgr_static();
68219+ done_plugin_set();
68220+ done_znodes();
68221+ destroy_reiser4_cache(&inode_cache);
68222+}
68223+
68224+module_init(init_reiser4);
68225+module_exit(done_reiser4);
68226+
68227+MODULE_DESCRIPTION("Reiser4 filesystem");
68228+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68229+
68230+MODULE_LICENSE("GPL");
68231+
68232+/*
68233+ * Local variables:
68234+ * c-indentation-style: "K&R"
68235+ * mode-name: "LC"
68236+ * c-basic-offset: 8
68237+ * tab-width: 8
68238+ * fill-column: 79
68239+ * End:
68240+ */
68241diff -urN linux-2.6.20.orig/fs/reiser4/tap.c linux-2.6.20/fs/reiser4/tap.c
68242--- linux-2.6.20.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
68243+++ linux-2.6.20/fs/reiser4/tap.c 2007-05-06 14:50:43.879031967 +0400
68244@@ -0,0 +1,377 @@
68245+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68246+ * reiser4/README */
68247+
68248+/*
68249+ Tree Access Pointer (tap).
68250+
68251+ A tap is a data structure combining a coord and a lock handle (mostly). It
68252+ is useful when one has to scan tree nodes (for example, in readdir or
68253+ flush), for tap functions allow one to move a tap in either direction,
68254+ transparently crossing unit/item/node borders.
68255+
68256+ A tap doesn't provide automatic synchronization of its fields as it is
68257+ supposed to be a per-thread object.
68258+*/
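
A typical use of the functions below, as a hedged sketch (the caller, its error handling, and the starting position are assumed; @coord and @lh must point at the same locked node):

/* Sketch only: advance a tap three units to the right, transparently
 * crossing item and node borders, then release everything. */
static int skip_three_units(coord_t *coord, lock_handle *lh)
{
	tap_t tap;
	int result;

	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
	result = rewind_right(&tap, 3);	/* loads and releases internally */
	reiser4_tap_done(&tap);
	return result;
}
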
68259+
68260+#include "forward.h"
68261+#include "debug.h"
68262+#include "coord.h"
68263+#include "tree.h"
68264+#include "context.h"
68265+#include "tap.h"
68266+#include "znode.h"
68267+#include "tree_walk.h"
68268+
68269+#if REISER4_DEBUG
68270+static int tap_invariant(const tap_t * tap);
68271+static void tap_check(const tap_t * tap);
68272+#else
68273+#define tap_check(tap) noop
68274+#endif
68275+
68276+/** load the node the tap points to, if not already loaded */
68277+int reiser4_tap_load(tap_t * tap)
68278+{
68279+ tap_check(tap);
68280+ if (tap->loaded == 0) {
68281+ int result;
68282+
68283+ result = zload_ra(tap->coord->node, &tap->ra_info);
68284+ if (result != 0)
68285+ return result;
68286+ coord_clear_iplug(tap->coord);
68287+ }
68288+ ++tap->loaded;
68289+ tap_check(tap);
68290+ return 0;
68291+}
68292+
68293+/** release node tap is pointing to. Dual to tap_load() */
68294+void reiser4_tap_relse(tap_t * tap)
68295+{
68296+ tap_check(tap);
68297+ if (tap->loaded > 0) {
68298+ --tap->loaded;
68299+ if (tap->loaded == 0) {
68300+ zrelse(tap->coord->node);
68301+ }
68302+ }
68303+ tap_check(tap);
68304+}
68305+
68306+/**
68307+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68308+ * @mode
68309+ */
68310+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68311+ znode_lock_mode mode)
68312+{
68313+ tap->coord = coord;
68314+ tap->lh = lh;
68315+ tap->mode = mode;
68316+ tap->loaded = 0;
68317+ INIT_LIST_HEAD(&tap->linkage);
68318+ reiser4_init_ra_info(&tap->ra_info);
68319+}
68320+
68321+/** add @tap to the per-thread list of all taps */
68322+void reiser4_tap_monitor(tap_t * tap)
68323+{
68324+ assert("nikita-2623", tap != NULL);
68325+ tap_check(tap);
68326+ list_add(&tap->linkage, reiser4_taps_list());
68327+ tap_check(tap);
68328+}
68329+
68330+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68331+ * loaded. */
68332+void reiser4_tap_copy(tap_t * dst, tap_t * src)
68333+{
68334+ assert("nikita-3193", src != NULL);
68335+ assert("nikita-3194", dst != NULL);
68336+
68337+ *dst->coord = *src->coord;
68338+ if (src->lh->node)
68339+ copy_lh(dst->lh, src->lh);
68340+ dst->mode = src->mode;
68341+ dst->loaded = 0;
68342+ INIT_LIST_HEAD(&dst->linkage);
68343+ dst->ra_info = src->ra_info;
68344+}
68345+
68346+/** finish with @tap */
68347+void reiser4_tap_done(tap_t * tap)
68348+{
68349+ assert("nikita-2565", tap != NULL);
68350+ tap_check(tap);
68351+ if (tap->loaded > 0)
68352+ zrelse(tap->coord->node);
68353+ done_lh(tap->lh);
68354+ tap->loaded = 0;
68355+ list_del_init(&tap->linkage);
68356+ tap->coord->node = NULL;
68357+}
68358+
68359+/**
68360+ * move @tap to the new node, locked with @target. Load @target, if @tap was
68361+ * already loaded.
68362+ */
68363+int reiser4_tap_move(tap_t * tap, lock_handle * target)
68364+{
68365+ int result = 0;
68366+
68367+ assert("nikita-2567", tap != NULL);
68368+ assert("nikita-2568", target != NULL);
68369+ assert("nikita-2570", target->node != NULL);
68370+ assert("nikita-2569", tap->coord->node == tap->lh->node);
68371+
68372+ tap_check(tap);
68373+ if (tap->loaded > 0)
68374+ result = zload_ra(target->node, &tap->ra_info);
68375+
68376+ if (result == 0) {
68377+ if (tap->loaded > 0)
68378+ zrelse(tap->coord->node);
68379+ done_lh(tap->lh);
68380+ copy_lh(tap->lh, target);
68381+ tap->coord->node = target->node;
68382+ coord_clear_iplug(tap->coord);
68383+ }
68384+ tap_check(tap);
68385+ return result;
68386+}
68387+
68388+/**
68389+ * move @tap to @target. Acquire lock on @target, if @tap was already
68390+ * loaded.
68391+ */
68392+static int tap_to(tap_t * tap, znode * target)
68393+{
68394+ int result;
68395+
68396+ assert("nikita-2624", tap != NULL);
68397+ assert("nikita-2625", target != NULL);
68398+
68399+ tap_check(tap);
68400+ result = 0;
68401+ if (tap->coord->node != target) {
68402+ lock_handle here;
68403+
68404+ init_lh(&here);
68405+ result = longterm_lock_znode(&here, target,
68406+ tap->mode, ZNODE_LOCK_HIPRI);
68407+ if (result == 0) {
68408+ result = reiser4_tap_move(tap, &here);
68409+ done_lh(&here);
68410+ }
68411+ }
68412+ tap_check(tap);
68413+ return result;
68414+}
68415+
68416+/**
68417+ * move @tap to given @target, loading and locking @target->node if
68418+ * necessary
68419+ */
68420+int tap_to_coord(tap_t * tap, coord_t * target)
68421+{
68422+ int result;
68423+
68424+ tap_check(tap);
68425+ result = tap_to(tap, target->node);
68426+ if (result == 0)
68427+ coord_dup(tap->coord, target);
68428+ tap_check(tap);
68429+ return result;
68430+}
68431+
68432+/** return list of all taps */
68433+struct list_head *reiser4_taps_list(void)
68434+{
68435+ return &get_current_context()->taps;
68436+}
68437+
68438+/** helper function for go_{next,prev}_{item,unit,node}() */
68439+int go_dir_el(tap_t * tap, sideof dir, int units_p)
68440+{
68441+ coord_t dup;
68442+ coord_t *coord;
68443+ int result;
68444+
68445+ int (*coord_dir) (coord_t *);
68446+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68447+ void (*coord_init) (coord_t *, const znode *);
68448+ ON_DEBUG(int (*coord_check) (const coord_t *));
68449+
68450+ assert("nikita-2556", tap != NULL);
68451+ assert("nikita-2557", tap->coord != NULL);
68452+ assert("nikita-2558", tap->lh != NULL);
68453+ assert("nikita-2559", tap->coord->node != NULL);
68454+
68455+ tap_check(tap);
68456+ if (dir == LEFT_SIDE) {
68457+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68458+ get_dir_neighbor = reiser4_get_left_neighbor;
68459+ coord_init = coord_init_last_unit;
68460+ } else {
68461+ coord_dir = units_p ? coord_next_unit : coord_next_item;
68462+ get_dir_neighbor = reiser4_get_right_neighbor;
68463+ coord_init = coord_init_first_unit;
68464+ }
68465+ ON_DEBUG(coord_check =
68466+ units_p ? coord_is_existing_unit : coord_is_existing_item);
68467+ assert("nikita-2560", coord_check(tap->coord));
68468+
68469+ coord = tap->coord;
68470+ coord_dup(&dup, coord);
68471+ if (coord_dir(&dup) != 0) {
68472+ do {
68473+ /* move to the neighboring node in the scan direction */
68474+ lock_handle dup;
68475+
68476+ init_lh(&dup);
68477+ result =
68478+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68479+ GN_CAN_USE_UPPER_LEVELS);
68480+ if (result == 0) {
68481+ result = reiser4_tap_move(tap, &dup);
68482+ if (result == 0)
68483+ coord_init(tap->coord, dup.node);
68484+ done_lh(&dup);
68485+ }
68486+ /* skip empty nodes */
68487+ } while ((result == 0) && node_is_empty(coord->node));
68488+ } else {
68489+ result = 0;
68490+ coord_dup(coord, &dup);
68491+ }
68492+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68493+ tap_check(tap);
68494+ return result;
68495+}
68496+
68497+/**
68498+ * move @tap to the next unit, transparently crossing item and node
68499+ * boundaries
68500+ */
68501+int go_next_unit(tap_t * tap)
68502+{
68503+ return go_dir_el(tap, RIGHT_SIDE, 1);
68504+}
68505+
68506+/**
68507+ * move @tap to the previous unit, transparently crossing item and node
68508+ * boundaries
68509+ */
68510+int go_prev_unit(tap_t * tap)
68511+{
68512+ return go_dir_el(tap, LEFT_SIDE, 1);
68513+}
68514+
68515+/**
68516+ * @shift times apply @actor to the @tap. This is used to move @tap by
68517+ * @shift units (or items, or nodes) in either direction.
68518+ */
68519+static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68520+{
68521+ int result;
68522+
68523+ assert("nikita-2555", shift >= 0);
68524+ assert("nikita-2562", tap->coord->node == tap->lh->node);
68525+
68526+ tap_check(tap);
68527+ result = reiser4_tap_load(tap);
68528+ if (result != 0)
68529+ return result;
68530+
68531+ for (; shift > 0; --shift) {
68532+ result = actor(tap);
68533+ assert("nikita-2563", tap->coord->node == tap->lh->node);
68534+ if (result != 0)
68535+ break;
68536+ }
68537+ reiser4_tap_relse(tap);
68538+ tap_check(tap);
68539+ return result;
68540+}
68541+
68542+/** move @tap @shift units rightward */
68543+int rewind_right(tap_t * tap, int shift)
68544+{
68545+ return rewind_to(tap, go_next_unit, shift);
68546+}
68547+
68548+/** move @tap @shift units leftward */
68549+int rewind_left(tap_t * tap, int shift)
68550+{
68551+ return rewind_to(tap, go_prev_unit, shift);
68552+}
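+
+/*
+ * A minimal usage sketch (hypothetical helper, not part of the reiser4
+ * API): step @tap over the next @n units, assuming the tap was set up
+ * with reiser4_tap_init() and registered with reiser4_tap_monitor().
+ * rewind_to() loads the node, applies go_next_unit() @n times, and
+ * releases the node again, so the caller only sees the final position.
+ */
+#if 0
+static int skip_units_example(tap_t * tap, int n)
+{
+	int result;
+
+	result = rewind_right(tap, n);
+	if (result != 0)
+		/* tap stays at the unit where the move failed */
+		return result;
+	/* tap->coord now addresses the unit @n steps to the right */
+	return 0;
+}
+#endif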
68553+
68554+#if REISER4_DEBUG
68555+/** debugging function: print @tap content in human readable form */
68556+static void print_tap(const char *prefix, const tap_t * tap)
68557+{
68558+ if (tap == NULL) {
68559+ printk("%s: null tap\n", prefix);
68560+ return;
68561+ }
68562+	printk("%s: loaded: %i, detached: %i, node: %p, mode: %s\n", prefix,
68563+ tap->loaded, (&tap->linkage == tap->linkage.next &&
68564+ &tap->linkage == tap->linkage.prev),
68565+ tap->lh->node,
68566+ lock_mode_name(tap->mode));
68567+ print_coord("\tcoord", tap->coord, 0);
68568+}
68569+
68570+/** check [tap-sane] invariant */
68571+static int tap_invariant(const tap_t * tap)
68572+{
68573+ /* [tap-sane] invariant */
68574+
68575+ if (tap == NULL)
68576+ return 1;
68577+ /* tap->mode is one of
68578+ *
68579+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68580+ */
68581+ if (tap->mode != ZNODE_NO_LOCK &&
68582+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68583+ return 2;
68584+ /* tap->coord != NULL, and */
68585+ if (tap->coord == NULL)
68586+ return 3;
68587+ /* tap->lh != NULL, and */
68588+ if (tap->lh == NULL)
68589+ return 4;
68590+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68591+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68592+ return 5;
68593+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68594+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68595+ return 6;
68596+ return 0;
68597+}
68598+
68599+/** debugging function: check internal @tap consistency */
68600+static void tap_check(const tap_t * tap)
68601+{
68602+ int result;
68603+
68604+ result = tap_invariant(tap);
68605+ if (result != 0) {
68606+ print_tap("broken", tap);
68607+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
68608+ }
68609+}
68610+#endif
68611+
68612+/* Make Linus happy.
68613+ Local variables:
68614+ c-indentation-style: "K&R"
68615+ mode-name: "LC"
68616+ c-basic-offset: 8
68617+ tab-width: 8
68618+ fill-column: 120
68619+ scroll-step: 1
68620+ End:
68621+*/
68622diff -urN linux-2.6.20.orig/fs/reiser4/tap.h linux-2.6.20/fs/reiser4/tap.h
68623--- linux-2.6.20.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
68624+++ linux-2.6.20/fs/reiser4/tap.h 2007-05-06 14:50:43.879031967 +0400
68625@@ -0,0 +1,70 @@
68626+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68627+
68628+/* Tree Access Pointers. See tap.c for more details. */
68629+
68630+#if !defined( __REISER4_TAP_H__ )
68631+#define __REISER4_TAP_H__
68632+
68633+#include "forward.h"
68634+#include "readahead.h"
68635+
68636+/**
68637+ tree_access_pointer aka tap. Data structure combining coord_t and lock
68638+ handle.
68639+   For invariants involving this data type, see doc/lock-ordering for details:
68640+
68641+ [tap-sane]
68642+ */
68643+struct tree_access_pointer {
68644+ /* coord tap is at */
68645+ coord_t *coord;
68646+ /* lock handle on ->coord->node */
68647+ lock_handle *lh;
68648+ /* mode of lock acquired by this tap */
68649+ znode_lock_mode mode;
68650+ /* incremented by reiser4_tap_load().
68651+ Decremented by reiser4_tap_relse(). */
68652+ int loaded;
68653+ /* list of taps */
68654+ struct list_head linkage;
68655+ /* read-ahead hint */
68656+ ra_info_t ra_info;
68657+};
68658+
68659+typedef int (*go_actor_t) (tap_t * tap);
68660+
68661+extern int reiser4_tap_load(tap_t * tap);
68662+extern void reiser4_tap_relse(tap_t * tap);
68663+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68664+ znode_lock_mode mode);
68665+extern void reiser4_tap_monitor(tap_t * tap);
68666+extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
68667+extern void reiser4_tap_done(tap_t * tap);
68668+extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
68669+extern int tap_to_coord(tap_t * tap, coord_t * target);
68670+
68671+extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
68672+extern int go_next_unit(tap_t * tap);
68673+extern int go_prev_unit(tap_t * tap);
68674+extern int rewind_right(tap_t * tap, int shift);
68675+extern int rewind_left(tap_t * tap, int shift);
68676+
68677+extern struct list_head *reiser4_taps_list(void);
68678+
68679+#define for_all_taps(tap) \
68680+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
68681+ reiser4_taps_list() != &tap->linkage; \
68682+ tap = list_entry(tap->linkage.next, tap_t, linkage))
68683+
68684+/* __REISER4_TAP_H__ */
68685+#endif
68686+/* Make Linus happy.
68687+ Local variables:
68688+ c-indentation-style: "K&R"
68689+ mode-name: "LC"
68690+ c-basic-offset: 8
68691+ tab-width: 8
68692+ fill-column: 120
68693+ scroll-step: 1
68694+ End:
68695+*/
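+
+/*
+ * A minimal sketch of for_all_taps() usage (hypothetical debugging
+ * helper, assuming a current reiser4 context exists): walk the list
+ * returned by reiser4_taps_list() and count the registered taps.
+ */
+#if 0
+static int count_taps_example(void)
+{
+	tap_t *tap;
+	int count;
+
+	count = 0;
+	for_all_taps(tap)
+		++count;
+	return count;
+}
+#endif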
68696diff -urN linux-2.6.20.orig/fs/reiser4/tree.c linux-2.6.20/fs/reiser4/tree.c
68697--- linux-2.6.20.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
68698+++ linux-2.6.20/fs/reiser4/tree.c 2007-05-06 14:50:43.883033217 +0400
68699@@ -0,0 +1,1876 @@
68700+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68701+ * reiser4/README */
68702+
68703+/*
68704+ * KEYS IN A TREE.
68705+ *
68706+ * The tree consists of nodes located on the disk. A node in the tree is
68707+ * either formatted or unformatted. A formatted node is one whose structure
68708+ * is understood by the tree balancing and traversal code. Formatted nodes
68709+ * are further classified into leaf and internal nodes. The latter
68710+ * distinction is (almost) of only historical importance: the general
68711+ * structure of leaves and internal nodes is the same in Reiser4.
68712+ * Unformatted nodes contain raw data that is part of the bodies of ordinary files and attributes.
68713+ *
68714+ * Each node in the tree spans some interval in the key space. Key ranges for
68715+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
68716+ * sense, because of non-unique keys: the intersection of key ranges for
68717+ * different nodes is either empty, or consists of exactly one key.
68718+ *
68719+ * A formatted node consists of a sequence of items. Each item spans some
68720+ * interval in the key space. Key ranges for all items in a tree are disjoint,
68721+ * modulo non-unique keys again. Items within a node are ordered by the
68722+ * smallest key in each item.
68723+ *
68724+ * A particular type of item can be further split into units. A unit is a
68725+ * piece of an item that can be cut from it and moved into another item of
68726+ * the same type. Units are used by the balancing code to repack data during balancing.
68727+ *
68728+ * Unit can be further split into smaller entities (for example, extent unit
68729+ * represents several pages, and it is natural for extent code to operate on
68730+ * particular pages and even bytes within one unit), but this is of no
68731+ * relevance to the generic balancing and lookup code.
68732+ *
68733+ * Although an item is said to "span" a range or interval of keys, it is not
68734+ * necessary that the item contains a piece of data addressable by each and
68735+ * every key in this range. For example, a compound directory item, consisting
68736+ * of units corresponding to directory entries and keyed by hashes of file
68737+ * names, looks more like having a "discrete spectrum": only some disjoint
68738+ * keys inside the range occupied by this item really address data.
68739+ *
68740+ * Nonetheless, each item always has a well-defined least (minimal) key that
68741+ * is recorded in the item header, stored in the node this item is in. Also,
68742+ * the item plugin can optionally define a method ->max_key_inside() returning
68743+ * the maximal key that can _possibly_ be located within this item. This
68744+ * method is used (mainly) to determine when a given piece of data should be
68745+ * merged into an existing item, instead of creating a new one. Because of this,
68746+ * even though ->max_key_inside() can be larger than any key actually in the item,
68747+ * intervals
68748+ *
68749+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
68750+ *
68751+ * are still disjoint for all items within the _same_ node.
68752+ *
68753+ * In memory, a node is represented by a znode. It plays several roles:
68754+ *
68755+ * . something locks are taken on
68756+ *
68757+ * . something tracked by transaction manager (this is going to change)
68758+ *
68759+ * . something used to access node data
68760+ *
68761+ * . something used to maintain tree structure in memory: sibling and
68762+ * parental linkage.
68763+ *
68764+ * . something used to organize nodes into "slums"
68765+ *
68766+ * For more on znodes, see znode.[ch].
68767+ *
68768+ * DELIMITING KEYS
68769+ *
68770+ * To simplify balancing, to allow some flexibility in locking, and to speed
68771+ * up the important coord cache optimization, we keep delimiting keys of
68772+ * nodes in memory. Depending on the disk format (implemented by the
68773+ * appropriate node plugin) a node on disk can record both left and right
68774+ * delimiting keys, only one of them, or none. Still, our balancing and tree
68775+ * traversal code keeps both delimiting keys for every node that is in
68776+ * memory, stored in its znode. When a node is first brought into memory
68777+ * during tree traversal, its left delimiting key is taken from its parent,
68778+ * and its right delimiting key is either the next key in its parent, or the
68779+ * right delimiting key of the parent if the node is its rightmost child.
68780+ *
68781+ * Physical consistency of delimiting keys is protected by a special dk
68782+ * read-write lock. That is, delimiting keys can only be inspected or
68783+ * modified under this lock. But the dk lock is only sufficient for a fast
68784+ * "pessimistic" check, because to simplify code and to decrease lock
68785+ * contention, balancing (carry) only updates delimiting keys right before
68786+ * unlocking all locked nodes on the given tree level. For example, the
68787+ * coord-by-key cache scans the LRU list of recently accessed znodes. For
68788+ * each node it first does a fast check under the dk spin lock. If the key
68789+ * looked for is not between the delimiting keys of this node, the next node
68790+ * is inspected and so on. If the key is inside the key range, a long term
68791+ * lock is taken on the node and the key range is rechecked.
68792+ *
68793+ * COORDINATES
68794+ *
68795+ * To find something in the tree, you supply a key, and the key is resolved
68796+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
68797+ * node the coord points to remains locked. As mentioned above trees
68798+ * consist of nodes that consist of items that consist of units. A unit is
68799+ * the smallest and indivisible piece of tree as far as balancing and tree
68800+ * search are concerned. Each node, item, and unit can be addressed by
68801+ * giving its level in the tree and the key occupied by this entity. A node
68802+ * knows what the key ranges are of the items within it, and how to find its
68803+ * items and invoke their item handlers, but it does not know how to access
68804+ * individual units within its items except through the item handlers.
68805+ * coord is a structure containing a pointer to the node, the ordinal number
68806+ * of the item within this node (a sort of item offset), and the ordinal
68807+ * number of the unit within this item.
68808+ *
68809+ * TREE LOOKUP
68810+ *
68811+ * There are two types of access to the tree: lookup and modification.
68812+ *
68813+ * Lookup is a search for the key in the tree. Search can look for either
68814+ * exactly the key given to it, or for the largest key that is not greater
68815+ * than the key given to it. This distinction is determined by "bias"
68816+ * parameter of search routine (coord_by_key()). coord_by_key() either
68817+ * returns error (key is not in the tree, or some kind of external error
68818+ * occurred), or successfully resolves key into coord.
68819+ *
68820+ * This resolution is done by traversing tree top-to-bottom from root level
68821+ * to the desired level. On levels above twig level (level one above the
68822+ * leaf level) nodes consist exclusively of internal items. Internal item is
68823+ * nothing more than pointer to the tree node on the child level. On twig
68824+ * level nodes consist of internal items intermixed with extent
68825+ * items. Internal items form normal search tree structure used by traversal
68826+ * to descend through the tree.
68827+ *
68828+ * TREE LOOKUP OPTIMIZATIONS
68829+ *
68830+ * Tree lookup described above is expensive even if all nodes traversed are
68831+ * already in memory: a binary search has to be performed within each node,
68832+ * and binary searches are CPU-consuming and tend to destroy CPU
68833+ * caches.
68834+ *
68835+ * Several optimizations are used to work around this:
68836+ *
68837+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
68838+ * details)
68839+ *
68840+ * . seals (see seal.[ch])
68841+ *
68842+ * . vroot (see search.c)
68843+ *
68844+ * General search-by-key is layered as follows:
68845+ *
68846+ * [check seal, if any] --ok--> done
68847+ * |
68848+ * failed
68849+ * |
68850+ * V
68851+ * [vroot defined] --no--> node = tree_root
68852+ * | |
68853+ * yes |
68854+ * | |
68855+ * V |
68856+ * node = vroot |
68857+ * | |
68858+ * | |
68859+ * | |
68860+ * V V
68861+ * [check cbk_cache for key] --ok--> done
68862+ * |
68863+ * failed
68864+ * |
68865+ * V
68866+ * [start tree traversal from node]
68867+ *
68868+ */
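+
+/*
+ * A minimal lookup sketch (hypothetical, for illustration of the
+ * coord_by_key() flow described above; not called anywhere in this
+ * file): resolve @key to a coord on the leaf level, holding a read
+ * lock for as long as the coord is used.
+ */
+#if 0
+static int lookup_example(reiser4_tree * tree, const reiser4_key * key)
+{
+	coord_t coord;
+	lock_handle lh;
+	int result;
+
+	init_lh(&lh);
+	result = coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK,
+			      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
+			      0 /* flags */ , NULL /* ra_info */ );
+	if (result == CBK_COORD_FOUND) {
+		/* coord is valid while lh keeps coord.node locked */
+	}
+	done_lh(&lh);
+	return result;
+}
+#endif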
68869+
68870+#include "forward.h"
68871+#include "debug.h"
68872+#include "dformat.h"
68873+#include "key.h"
68874+#include "coord.h"
68875+#include "plugin/item/static_stat.h"
68876+#include "plugin/item/item.h"
68877+#include "plugin/node/node.h"
68878+#include "plugin/plugin.h"
68879+#include "txnmgr.h"
68880+#include "jnode.h"
68881+#include "znode.h"
68882+#include "block_alloc.h"
68883+#include "tree_walk.h"
68884+#include "carry.h"
68885+#include "carry_ops.h"
68886+#include "tap.h"
68887+#include "tree.h"
68888+#include "vfs_ops.h"
68889+#include "page_cache.h"
68890+#include "super.h"
68891+#include "reiser4.h"
68892+#include "inode.h"
68893+
68894+#include <linux/fs.h> /* for struct super_block */
68895+#include <linux/spinlock.h>
68896+
68897+/* Disk address (block number) that is never used for any real tree node.
68898+   This is used as the block number of the "uber" znode.
68899+
68900+ Invalid block addresses are 0 by tradition.
68901+
68902+*/
68903+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
68904+
68905+#define CUT_TREE_MIN_ITERATIONS 64
68906+
68907+static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
68908+
68909+/* return node plugin of coord->node */
68910+node_plugin *node_plugin_by_coord(const coord_t * coord)
68911+{
68912+ assert("vs-1", coord != NULL);
68913+ assert("vs-2", coord->node != NULL);
68914+
68915+ return coord->node->nplug;
68916+}
68917+
68918+/* insert item into tree. Fields of @coord are updated so that they can be
68919+ * used by a subsequent insert operation. */
68920+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
68921+ * into */ ,
68922+ const reiser4_key * key /* key of new item */ ,
68923+ reiser4_item_data * data /* parameters for item
68924+ * creation */ ,
68925+ coord_t * coord /* resulting insertion coord */ ,
68926+ lock_handle * lh /* resulting lock
68927+ * handle */ ,
68928+			    tree_level stop_level /* level where to insert */ ,
68929+ __u32 flags /* insertion flags */ )
68930+{
68931+ int result;
68932+
68933+ assert("nikita-358", tree != NULL);
68934+ assert("nikita-360", coord != NULL);
68935+
68936+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
68937+ FIND_EXACT, stop_level, stop_level,
68938+ flags | CBK_FOR_INSERT, NULL /*ra_info */ );
68939+ switch (result) {
68940+ default:
68941+ break;
68942+ case CBK_COORD_FOUND:
68943+ result = IBK_ALREADY_EXISTS;
68944+ break;
68945+ case CBK_COORD_NOTFOUND:
68946+ assert("nikita-2017", coord->node != NULL);
68947+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
68948+ break;
68949+ }
68950+ return result;
68951+}
68952+
68953+/* insert item by calling carry. Helper function called if short-cut
68954+ insertion failed */
68955+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
68956+ lock_handle * lh, /* lock handle of insertion
68957+ * node */
68958+ reiser4_item_data * data, /* parameters of new
68959+ * item */
68960+ const reiser4_key * key, /* key of new item */
68961+ carry_opcode cop, /* carry operation to perform */
68962+ cop_insert_flag flags
68963+ /* carry flags */ )
68964+{
68965+ int result;
68966+ carry_pool *pool;
68967+ carry_level *lowest_level;
68968+ carry_insert_data *cdata;
68969+ carry_op *op;
68970+
68971+ assert("umka-314", coord != NULL);
68972+
68973+ /* allocate carry_pool and 3 carry_level-s */
68974+ pool =
68975+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68976+ sizeof(*cdata));
68977+ if (IS_ERR(pool))
68978+ return PTR_ERR(pool);
68979+ lowest_level = (carry_level *) (pool + 1);
68980+ init_carry_level(lowest_level, pool);
68981+
68982+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
68983+ if (IS_ERR(op) || (op == NULL)) {
68984+ done_carry_pool(pool);
68985+ return RETERR(op ? PTR_ERR(op) : -EIO);
68986+ }
68987+ cdata = (carry_insert_data *) (lowest_level + 3);
68988+ cdata->coord = coord;
68989+ cdata->data = data;
68990+ cdata->key = key;
68991+ op->u.insert.d = cdata;
68992+ if (flags == 0)
68993+ flags = znode_get_tree(coord->node)->carry.insert_flags;
68994+ op->u.insert.flags = flags;
68995+ op->u.insert.type = COPT_ITEM_DATA;
68996+ op->u.insert.child = NULL;
68997+ if (lh != NULL) {
68998+ assert("nikita-3245", lh->node == coord->node);
68999+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69000+ lowest_level->tracked = lh;
69001+ }
69002+
69003+ result = reiser4_carry(lowest_level, NULL);
69004+ done_carry_pool(pool);
69005+
69006+ return result;
69007+}
69008+
69009+/* form carry queue to perform paste of @data with @key at @coord, and launch
69010+ its execution by calling carry().
69011+
69012+   Instruct carry to update @lh if, after balancing, the insertion coord
69013+   moves into a different block.
69014+
69015+*/
69016+static int paste_with_carry(coord_t * coord, /* coord of paste */
69017+ lock_handle * lh, /* lock handle of node
69018+ * where item is
69019+ * pasted */
69020+ reiser4_item_data * data, /* parameters of new
69021+ * item */
69022+ const reiser4_key * key, /* key of new item */
69023+ unsigned flags /* paste flags */ )
69024+{
69025+ int result;
69026+ carry_pool *pool;
69027+ carry_level *lowest_level;
69028+ carry_insert_data *cdata;
69029+ carry_op *op;
69030+
69031+ assert("umka-315", coord != NULL);
69032+ assert("umka-316", key != NULL);
69033+
69034+ pool =
69035+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69036+ sizeof(*cdata));
69037+ if (IS_ERR(pool))
69038+ return PTR_ERR(pool);
69039+ lowest_level = (carry_level *) (pool + 1);
69040+ init_carry_level(lowest_level, pool);
69041+
69042+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
69043+ if (IS_ERR(op) || (op == NULL)) {
69044+ done_carry_pool(pool);
69045+ return RETERR(op ? PTR_ERR(op) : -EIO);
69046+ }
69047+ cdata = (carry_insert_data *) (lowest_level + 3);
69048+ cdata->coord = coord;
69049+ cdata->data = data;
69050+ cdata->key = key;
69051+ op->u.paste.d = cdata;
69052+ if (flags == 0)
69053+ flags = znode_get_tree(coord->node)->carry.paste_flags;
69054+ op->u.paste.flags = flags;
69055+ op->u.paste.type = COPT_ITEM_DATA;
69056+ if (lh != NULL) {
69057+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69058+ lowest_level->tracked = lh;
69059+ }
69060+
69061+ result = reiser4_carry(lowest_level, NULL);
69062+ done_carry_pool(pool);
69063+
69064+ return result;
69065+}
69066+
69067+/* insert item at the given coord.
69068+
69069+ First try to skip carry by directly calling ->create_item() method of node
69070+ plugin. If this is impossible (there is not enough free space in the node,
69071+   or the leftmost item in the node is being created), call insert_with_carry_by_coord()
69072+ that will do full carry().
69073+
69074+*/
69075+insert_result insert_by_coord(coord_t * coord /* coord where to
69076+ * insert. coord->node has
69077+ * to be write locked by
69078+ * caller */ ,
69079+ reiser4_item_data * data /* data to be
69080+ * inserted */ ,
69081+ const reiser4_key * key /* key of new item */ ,
69082+ lock_handle * lh /* lock handle of write
69083+ * lock on node */ ,
69084+ __u32 flags /* insertion flags */ )
69085+{
69086+ unsigned item_size;
69087+ int result;
69088+ znode *node;
69089+
69090+ assert("vs-247", coord != NULL);
69091+ assert("vs-248", data != NULL);
69092+ assert("vs-249", data->length >= 0);
69093+ assert("nikita-1191", znode_is_write_locked(coord->node));
69094+
69095+ node = coord->node;
69096+ coord_clear_iplug(coord);
69097+ result = zload(node);
69098+ if (result != 0)
69099+ return result;
69100+
69101+ item_size = space_needed(node, NULL, data, 1);
69102+ if (item_size > znode_free_space(node) &&
69103+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69104+ && (flags & COPI_DONT_ALLOCATE)) {
69105+ /* we are forced to use free space of coord->node and new item
69106+ does not fit into it.
69107+
69108+ Currently we get here only when we allocate and copy units
69109+ of extent item from a node to its left neighbor during
69110+ "squalloc"-ing. If @node (this is left neighbor) does not
69111+ have enough free space - we do not want to attempt any
69112+		   shifting and allocations because we are in the middle of squeezing and
69113+ everything to the left of @node is tightly packed.
69114+ */
69115+ result = -E_NODE_FULL;
69116+ } else if ((item_size <= znode_free_space(node)) &&
69117+ !coord_is_before_leftmost(coord) &&
69118+ (node_plugin_by_node(node)->fast_insert != NULL)
69119+ && node_plugin_by_node(node)->fast_insert(coord)) {
69120+ /* shortcut insertion without carry() overhead.
69121+
69122+ Only possible if:
69123+
69124+ - there is enough free space
69125+
69126+ - insertion is not into the leftmost position in a node
69127+ (otherwise it would require updating of delimiting key in a
69128+ parent)
69129+
69130+ - node plugin agrees with this
69131+
69132+ */
69133+ result =
69134+ node_plugin_by_node(node)->create_item(coord, key, data,
69135+ NULL);
69136+ znode_make_dirty(node);
69137+ } else {
69138+ /* otherwise do full-fledged carry(). */
69139+ result =
69140+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69141+ flags);
69142+ }
69143+ zrelse(node);
69144+ return result;
69145+}
69146+
69147+/* @coord is set to leaf level and @data is to be inserted to twig level */
69148+insert_result
69149+insert_extent_by_coord(coord_t * coord
69150+		       /* coord where to insert. coord->node has to be
69151+			* write locked by caller */
69152+		       ,
69153+		       reiser4_item_data * data /* data to be inserted */ ,
69154+		       const reiser4_key * key /* key of new item */ ,
69155+		       lock_handle * lh
69156+		       /* lock handle of write lock on node */ )
69157+{
69158+ assert("vs-405", coord != NULL);
69159+ assert("vs-406", data != NULL);
69160+ assert("vs-407", data->length > 0);
69161+ assert("vs-408", znode_is_write_locked(coord->node));
69162+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69163+
69164+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69165+ 0 /*flags */ );
69166+}
69167+
69168+/* Insert into the item at the given coord.
69169+
69170+ First try to skip carry by directly calling ->paste() method of item
69171+ plugin. If this is impossible (there is not enough free space in the node,
69172+ or we are pasting into leftmost position in the node), call
69173+ paste_with_carry() that will do full carry().
69174+
69175+*/
69176+/* paste_into_item */
69177+int insert_into_item(coord_t * coord /* coord of pasting */ ,
69178+ lock_handle * lh /* lock handle on node involved */ ,
69179+ const reiser4_key * key /* key of unit being pasted */ ,
69180+ reiser4_item_data * data /* parameters for new unit */ ,
69181+ unsigned flags /* insert/paste flags */ )
69182+{
69183+ int result;
69184+ int size_change;
69185+ node_plugin *nplug;
69186+ item_plugin *iplug;
69187+
69188+ assert("umka-317", coord != NULL);
69189+ assert("umka-318", key != NULL);
69190+
69191+ iplug = item_plugin_by_coord(coord);
69192+ nplug = node_plugin_by_coord(coord);
69193+
69194+ assert("nikita-1480", iplug == data->iplug);
69195+
69196+ size_change = space_needed(coord->node, coord, data, 0);
69197+ if (size_change > (int)znode_free_space(coord->node) &&
69198+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69199+ && (flags & COPI_DONT_ALLOCATE)) {
69200+ /* we are forced to use free space of coord->node and new data
69201+ does not fit into it. */
69202+ return -E_NODE_FULL;
69203+ }
69204+
69205+ /* shortcut paste without carry() overhead.
69206+
69207+ Only possible if:
69208+
69209+ - there is enough free space
69210+
69211+ - paste is not into the leftmost unit in a node (otherwise
69212+ it would require updating of delimiting key in a parent)
69213+
69214+ - node plugin agrees with this
69215+
69216+ - item plugin agrees with us
69217+ */
69218+ if (size_change <= (int)znode_free_space(coord->node) &&
69219+ (coord->item_pos != 0 ||
69220+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69221+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69222+ nplug->fast_paste(coord) &&
69223+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69224+ if (size_change > 0)
69225+ nplug->change_item_size(coord, size_change);
69226+ /* NOTE-NIKITA: huh? where @key is used? */
69227+ result = iplug->b.paste(coord, data, NULL);
69228+ if (size_change < 0)
69229+ nplug->change_item_size(coord, size_change);
69230+ znode_make_dirty(coord->node);
69231+ } else
69232+ /* otherwise do full-fledged carry(). */
69233+ result = paste_with_carry(coord, lh, data, key, flags);
69234+ return result;
69235+}
69236+
69237+/* this either appends or truncates item @coord */
69238+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
69239+ reiser4_item_data * data /* parameters of resize */ ,
69240+ reiser4_key * key /* key of new unit */ ,
69241+ lock_handle * lh /* lock handle of node
69242+ * being modified */ ,
69243+ cop_insert_flag flags /* carry flags */ )
69244+{
69245+ int result;
69246+ znode *node;
69247+
69248+ assert("nikita-362", coord != NULL);
69249+ assert("nikita-363", data != NULL);
69250+ assert("vs-245", data->length != 0);
69251+
69252+ node = coord->node;
69253+ coord_clear_iplug(coord);
69254+ result = zload(node);
69255+ if (result != 0)
69256+ return result;
69257+
69258+ if (data->length < 0)
69259+ result = node_plugin_by_coord(coord)->shrink_item(coord,
69260+ -data->length);
69261+ else
69262+ result = insert_into_item(coord, lh, key, data, flags);
69263+
69264+ zrelse(node);
69265+ return result;
69266+}
69267+
69268+/* insert flow @f */
69269+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69270+{
69271+ int result;
69272+ carry_pool *pool;
69273+ carry_level *lowest_level;
69274+ reiser4_item_data *data;
69275+ carry_op *op;
69276+
69277+ pool =
69278+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69279+ sizeof(*data));
69280+ if (IS_ERR(pool))
69281+ return PTR_ERR(pool);
69282+ lowest_level = (carry_level *) (pool + 1);
69283+ init_carry_level(lowest_level, pool);
69284+
69285+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69286+ 0 /* operate directly on coord -> node */ );
69287+ if (IS_ERR(op) || (op == NULL)) {
69288+ done_carry_pool(pool);
69289+ return RETERR(op ? PTR_ERR(op) : -EIO);
69290+ }
69291+
69292+ /* these are permanent during insert_flow */
69293+ data = (reiser4_item_data *) (lowest_level + 3);
69294+ data->user = 1;
69295+ data->iplug = item_plugin_by_id(FORMATTING_ID);
69296+ data->arg = NULL;
69297+ /* data.length and data.data will be set before calling paste or
69298+ insert */
69299+ data->length = 0;
69300+ data->data = NULL;
69301+
69302+ op->u.insert_flow.flags = 0;
69303+ op->u.insert_flow.insert_point = coord;
69304+ op->u.insert_flow.flow = f;
69305+ op->u.insert_flow.data = data;
69306+ op->u.insert_flow.new_nodes = 0;
69307+
69308+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69309+ lowest_level->tracked = lh;
69310+
69311+ result = reiser4_carry(lowest_level, NULL);
69312+ done_carry_pool(pool);
69313+
69314+ return result;
69315+}
69316+
69317+/* Given a coord in parent node, obtain a znode for the corresponding child */
69318+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69319+ * child */ ,
69320+ znode * parent /* parent of child */ ,
69321+ int incore_p /* if !0 only return child if already in
69322+ * memory */ ,
69323+ int setup_dkeys_p /* if !0 update delimiting keys of
69324+ * child */ )
69325+{
69326+ znode *child;
69327+
69328+ assert("nikita-1374", parent_coord != NULL);
69329+ assert("nikita-1482", parent != NULL);
69330+#if REISER4_DEBUG
69331+ if (setup_dkeys_p)
69332+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69333+#endif
69334+ assert("nikita-2947", znode_is_any_locked(parent));
69335+
69336+ if (znode_get_level(parent) <= LEAF_LEVEL) {
69337+ /* trying to get child of leaf node */
69338+ warning("nikita-1217", "Child of maize?");
69339+ return ERR_PTR(RETERR(-EIO));
69340+ }
69341+ if (item_is_internal(parent_coord)) {
69342+ reiser4_block_nr addr;
69343+ item_plugin *iplug;
69344+ reiser4_tree *tree;
69345+
69346+ iplug = item_plugin_by_coord(parent_coord);
69347+ assert("vs-512", iplug->s.internal.down_link);
69348+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
69349+
69350+ tree = znode_get_tree(parent);
69351+ if (incore_p)
69352+ child = zlook(tree, &addr);
69353+ else
69354+ child =
69355+ zget(tree, &addr, parent,
69356+ znode_get_level(parent) - 1,
69357+ reiser4_ctx_gfp_mask_get());
69358+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69359+ set_child_delimiting_keys(parent, parent_coord, child);
69360+ } else {
69361+ warning("nikita-1483", "Internal item expected");
69362+ child = ERR_PTR(RETERR(-EIO));
69363+ }
69364+ return child;
69365+}
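+
+/*
+ * A minimal child_znode() usage sketch (hypothetical; the caller is
+ * assumed to hold a lock on @parent and to have it zload()-ed): look
+ * up the already-in-memory child at @coord and drop the reference.
+ */
+#if 0
+static int touch_child_example(coord_t * coord, znode * parent)
+{
+	znode *child;
+
+	child = child_znode(coord, parent, 1 /* incore only */ , 0);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+	if (child != NULL)
+		zput(child);
+	return 0;
+}
+#endif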
69366+
69367+/* remove znode from transaction */
69368+static void uncapture_znode(znode * node)
69369+{
69370+ struct page *page;
69371+
69372+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69373+
69374+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
69375+ int ret;
69376+
69377+ /* An already allocated block goes right to the atom's delete set. */
69378+ ret =
69379+ reiser4_dealloc_block(znode_get_block(node), 0,
69380+ BA_DEFER | BA_FORMATTED);
69381+ if (ret)
69382+ warning("zam-942",
69383+				"can't add block (%llu) to atom's delete set\n",
69384+ (unsigned long long)(*znode_get_block(node)));
69385+
69386+ spin_lock_znode(node);
69387+ /* Here we return flush reserved block which was reserved at the
69388+ * moment when this allocated node was marked dirty and still
69389+ * not used by flush in node relocation procedure. */
69390+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69391+ txn_atom *atom;
69392+
69393+ atom = jnode_get_atom(ZJNODE(node));
69394+ assert("zam-939", atom != NULL);
69395+ spin_unlock_znode(node);
69396+ flush_reserved2grabbed(atom, (__u64) 1);
69397+ spin_unlock_atom(atom);
69398+ } else
69399+ spin_unlock_znode(node);
69400+ } else {
69401+ /* znode has assigned block which is counted as "fake
69402+		   allocated". Return it back to "free blocks". */
69403+ fake_allocated2free((__u64) 1, BA_FORMATTED);
69404+ }
69405+
69406+ /*
69407+ * uncapture page from transaction. There is a possibility of a race
69408+ * with ->releasepage(): reiser4_releasepage() detaches page from this
69409+ * jnode and we have nothing to uncapture. To avoid this, get
69410+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
69411+ * will deal with released page itself.
69412+ */
69413+ spin_lock_znode(node);
69414+ page = znode_page(node);
69415+ if (likely(page != NULL)) {
69416+ /*
69417+ * reiser4_uncapture_page() can only be called when we are sure
69418+ * that znode is pinned in memory, which we are, because
69419+ * forget_znode() is only called from longterm_unlock_znode().
69420+ */
69421+ page_cache_get(page);
69422+ spin_unlock_znode(node);
69423+ lock_page(page);
69424+ reiser4_uncapture_page(page);
69425+ unlock_page(page);
69426+ page_cache_release(page);
69427+ } else {
69428+ txn_atom *atom;
69429+
69430+ /* handle "flush queued" znodes */
69431+ while (1) {
69432+ atom = jnode_get_atom(ZJNODE(node));
69433+ assert("zam-943", atom != NULL);
69434+
69435+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69436+ || !atom->nr_running_queues)
69437+ break;
69438+
69439+ spin_unlock_znode(node);
69440+ reiser4_atom_wait_event(atom);
69441+ spin_lock_znode(node);
69442+ }
69443+
69444+ reiser4_uncapture_block(ZJNODE(node));
69445+ spin_unlock_atom(atom);
69446+ zput(node);
69447+ }
69448+}
69449+
69450+/* This is called from longterm_unlock_znode() when last lock is released from
69451+ the node that has been removed from the tree. At this point node is removed
69452+ from sibling list and its lock is invalidated. */
69453+void forget_znode(lock_handle * handle)
69454+{
69455+ znode *node;
69456+ reiser4_tree *tree;
69457+
69458+ assert("umka-319", handle != NULL);
69459+
69460+ node = handle->node;
69461+ tree = znode_get_tree(node);
69462+
69463+ assert("vs-164", znode_is_write_locked(node));
69464+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69465+ assert_rw_locked(&(node->lock.guard));
69466+
69467+ /* We assume that this node was detached from its parent before
69468+ * unlocking, it gives no way to reach this node from parent through a
69469+ * down link. The node should have no children and, thereby, can't be
69470+ * reached from them by their parent pointers. The only way to obtain a
69471+ * reference to the node is to use sibling pointers from its left and
69472+ * right neighbors. In the next several lines we remove the node from
69473+ * the sibling list. */
69474+
69475+ write_lock_tree(tree);
69476+ sibling_list_remove(node);
69477+ znode_remove(node, tree);
69478+ write_unlock_tree(tree);
69479+
69480+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
69481+ * forces all lock requestor threads to repeat iterations of getting
69482+ * lock on a child, neighbor or parent node. But, those threads can't
69483+ * come to this node again, because this node is no longer a child,
69484+ * neighbor or parent of any other node. This order of znode
69485+	 * invalidation does not allow other threads to waste CPU time in a busy
69486+	 * loop, trying to lock a dying object. The exception is in the flush
69487+ * code when we take node directly from atom's capture list.*/
69488+ reiser4_invalidate_lock(handle);
69489+ uncapture_znode(node);
69490+}
69491+
69492+/* Check that internal item at @pointer really contains pointer to @child. */
69493+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69494+ * @child */ ,
69495+ const znode * child /* child znode */ )
69496+{
69497+ assert("nikita-1016", pointer != NULL);
69498+ assert("nikita-1017", child != NULL);
69499+ assert("nikita-1018", pointer->node != NULL);
69500+
69501+ assert("nikita-1325", znode_is_any_locked(pointer->node));
69502+
69503+ assert("nikita-2985",
69504+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
69505+
69506+ coord_clear_iplug((coord_t *) pointer);
69507+
69508+ if (coord_is_existing_unit(pointer)) {
69509+ item_plugin *iplug;
69510+ reiser4_block_nr addr;
69511+
69512+ if (item_is_internal(pointer)) {
69513+ iplug = item_plugin_by_coord(pointer);
69514+ assert("vs-513", iplug->s.internal.down_link);
69515+ iplug->s.internal.down_link(pointer, NULL, &addr);
69516+ /* check that cached value is correct */
69517+ if (disk_addr_eq(&addr, znode_get_block(child))) {
69518+ return NS_FOUND;
69519+ }
69520+ }
69521+ }
69522+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
69523+ return NS_NOT_FOUND;
69524+}
69525+
69526+/* find coord of pointer to new @child in @parent.
69527+
69528+ Find the &coord_t in the @parent where pointer to a given @child will
69529+ be in.
69530+
69531+*/
69532+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69533+ znode *
69534+ child UNUSED_ARG /* child znode, passed locked */ ,
69535+ znode * left /* left brother of new node */ ,
69536+ coord_t * result /* where result is stored in */ )
69537+{
69538+ int ret;
69539+
69540+ assert("nikita-1486", parent != NULL);
69541+ assert("nikita-1487", child != NULL);
69542+ assert("nikita-1488", result != NULL);
69543+
69544+ ret = find_child_ptr(parent, left, result);
69545+ if (ret != NS_FOUND) {
69546+ warning("nikita-1489", "Cannot find brother position: %i", ret);
69547+ return RETERR(-EIO);
69548+ } else {
69549+ result->between = AFTER_UNIT;
69550+ return RETERR(NS_NOT_FOUND);
69551+ }
69552+}
69553+
69554+/* find coord of pointer to @child in @parent.
69555+
69556+ Find the &coord_t in the @parent where pointer to a given @child is in.
69557+
69558+*/
69559+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69560+ znode * child /* child znode, passed locked */ ,
69561+ coord_t * result /* where result is stored in */ )
69562+{
69563+ int lookup_res;
69564+ node_plugin *nplug;
69565+ /* left delimiting key of a child */
69566+ reiser4_key ld;
69567+ reiser4_tree *tree;
69568+
69569+ assert("nikita-934", parent != NULL);
69570+ assert("nikita-935", child != NULL);
69571+ assert("nikita-936", result != NULL);
69572+ assert("zam-356", znode_is_loaded(parent));
69573+
69574+ coord_init_zero(result);
69575+ result->node = parent;
69576+
69577+ nplug = parent->nplug;
69578+ assert("nikita-939", nplug != NULL);
69579+
69580+ tree = znode_get_tree(parent);
69581+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69582+ * not aliased to ->in_parent of some znode. Otherwise,
69583+ * parent_coord_to_coord() below would modify data protected by tree
69584+ * lock. */
69585+ read_lock_tree(tree);
69586+ /* fast path. Try to use cached value. Lock tree to keep
69587+ node->pos_in_parent and pos->*_blocknr consistent. */
69588+ if (child->in_parent.item_pos + 1 != 0) {
69589+ parent_coord_to_coord(&child->in_parent, result);
69590+ if (check_tree_pointer(result, child) == NS_FOUND) {
69591+ read_unlock_tree(tree);
69592+ return NS_FOUND;
69593+ }
69594+
69595+ child->in_parent.item_pos = (unsigned short)~0;
69596+ }
69597+ read_unlock_tree(tree);
69598+
69599+	/* if the above failed, find some key from @child. We are looking for the
69600+ least key in a child. */
69601+ read_lock_dk(tree);
69602+ ld = *znode_get_ld_key(child);
69603+ read_unlock_dk(tree);
69604+ /*
69605+ * now, lookup parent with key just found. Note, that left delimiting
69606+	 * key doesn't identify a node uniquely, because (in an extremely rare
69607+	 * case) two nodes can have equal left delimiting keys, if one of them
69608+	 * is completely filled with directory entries that all happened to be
69609+	 * hash collisions. But we check the block number in check_tree_pointer()
69610+ * and, so, are safe.
69611+ */
69612+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69613+ /* update cached pos_in_node */
69614+ if (lookup_res == NS_FOUND) {
69615+ write_lock_tree(tree);
69616+ coord_to_parent_coord(result, &child->in_parent);
69617+ write_unlock_tree(tree);
69618+ lookup_res = check_tree_pointer(result, child);
69619+ }
69620+ if (lookup_res == NS_NOT_FOUND)
69621+ lookup_res = find_child_by_addr(parent, child, result);
69622+ return lookup_res;
69623+}
69624+
69625+/* find coord of pointer to @child in @parent by scanning
69626+
69627+ Find the &coord_t in the @parent where pointer to a given @child
69628+ is in by scanning all internal items in @parent and comparing block
69629+ numbers in them with that of @child.
69630+
69631+*/
69632+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69633+ znode * child /* child znode, passed locked */ ,
69634+ coord_t * result /* where result is stored in */ )
69635+{
69636+ int ret;
69637+
69638+ assert("nikita-1320", parent != NULL);
69639+ assert("nikita-1321", child != NULL);
69640+ assert("nikita-1322", result != NULL);
69641+
69642+ ret = NS_NOT_FOUND;
69643+
69644+ for_all_units(result, parent) {
69645+ if (check_tree_pointer(result, child) == NS_FOUND) {
69646+ write_lock_tree(znode_get_tree(parent));
69647+ coord_to_parent_coord(result, &child->in_parent);
69648+ write_unlock_tree(znode_get_tree(parent));
69649+ ret = NS_FOUND;
69650+ break;
69651+ }
69652+ }
69653+ return ret;
69654+}
69655+
69656+/* true, if @addr is "unallocated block number", which is just address, with
69657+ highest bit set. */
69658+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
69659+ * check */ )
69660+{
69661+ assert("nikita-1766", addr != NULL);
69662+ cassert(sizeof(reiser4_block_nr) == 8);
69663+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69664+ REISER4_UNALLOCATED_STATUS_VALUE;
69665+}
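+
+/*
+ * For example (hypothetical values): with the status bit being the
+ * highest of the 64, 0x8000000000000001ull is reported as unallocated,
+ * while 0x0000000000000001ull is a real, allocated disk address.
+ */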
69666+
69667+/* returns true if removing bytes of given range of key [from_key, to_key]
69668+   causes removal of the whole item @from */
69669+static int
69670+item_removed_completely(coord_t * from, const reiser4_key * from_key,
69671+ const reiser4_key * to_key)
69672+{
69673+ item_plugin *iplug;
69674+ reiser4_key key_in_item;
69675+
69676+ assert("umka-325", from != NULL);
69677+ assert("", item_is_extent(from));
69678+
69679+	/* check the first key, just in case */
69680+ item_key_by_coord(from, &key_in_item);
69681+ if (keygt(from_key, &key_in_item))
69682+ return 0;
69683+
69684+ /* check last key */
69685+ iplug = item_plugin_by_coord(from);
69686+ assert("vs-611", iplug && iplug->s.file.append_key);
69687+
69688+ iplug->s.file.append_key(from, &key_in_item);
69689+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
69690+
69691+ if (keylt(to_key, &key_in_item))
69692+ /* last byte is not removed */
69693+ return 0;
69694+ return 1;
69695+}
69696+
69697+/* helper function for prepare_twig_kill(): @left and @right are formatted
69698+ * neighbors of extent item being completely removed. Load and lock neighbors
69699+ * and store lock handles into @cdata for later use by kill_hook_extent() */
69700+static int
69701+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
69702+{
69703+ int result;
69704+ int left_loaded;
69705+ int right_loaded;
69706+
69707+ result = 0;
69708+ left_loaded = right_loaded = 0;
69709+
69710+ if (left != NULL) {
69711+ result = zload(left);
69712+ if (result == 0) {
69713+ left_loaded = 1;
69714+ result = longterm_lock_znode(kdata->left, left,
69715+ ZNODE_READ_LOCK,
69716+ ZNODE_LOCK_LOPRI);
69717+ }
69718+ }
69719+ if (result == 0 && right != NULL) {
69720+ result = zload(right);
69721+ if (result == 0) {
69722+ right_loaded = 1;
69723+ result = longterm_lock_znode(kdata->right, right,
69724+ ZNODE_READ_LOCK,
69725+ ZNODE_LOCK_HIPRI |
69726+ ZNODE_LOCK_NONBLOCK);
69727+ }
69728+ }
69729+ if (result != 0) {
69730+ done_lh(kdata->left);
69731+ done_lh(kdata->right);
69732+ if (left_loaded != 0)
69733+ zrelse(left);
69734+ if (right_loaded != 0)
69735+ zrelse(right);
69736+ }
69737+ return result;
69738+}
69739+
69740+static void done_children(carry_kill_data * kdata)
69741+{
69742+ if (kdata->left != NULL && kdata->left->node != NULL) {
69743+ zrelse(kdata->left->node);
69744+ done_lh(kdata->left);
69745+ }
69746+ if (kdata->right != NULL && kdata->right->node != NULL) {
69747+ zrelse(kdata->right->node);
69748+ done_lh(kdata->right);
69749+ }
69750+}
69751+
69752+/* part of cut_node. It is called when cut_node is called to remove or cut
69753+   part of an extent item. When the head of that item is removed, we have to
69754+   update the right delimiting key of the left neighbor of the extent. When
69755+   the item is removed completely, we have to set a sibling link between the
69756+   left and right neighbors of the removed extent. This may return -E_DEADLOCK
69757+   because of trying to get the left neighbor locked, in which case the caller
69758+   should repeat the attempt. */
69759+/* Audited by: umka (2002.06.16) */
69760+static int
69761+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
69762+{
69763+ int result;
69764+ reiser4_key key;
69765+ lock_handle left_lh;
69766+ lock_handle right_lh;
69767+ coord_t left_coord;
69768+ coord_t *from;
69769+ znode *left_child;
69770+ znode *right_child;
69771+ reiser4_tree *tree;
69772+ int left_zloaded_here, right_zloaded_here;
69773+
69774+ from = kdata->params.from;
69775+ assert("umka-326", from != NULL);
69776+ assert("umka-327", kdata->params.to != NULL);
69777+
69778+ /* for one extent item only yet */
69779+ assert("vs-591", item_is_extent(from));
69780+ assert("vs-592", from->item_pos == kdata->params.to->item_pos);
69781+
69782+ if ((kdata->params.from_key
69783+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
69784+ || from->unit_pos != 0) {
69785+ /* head of item @from is not removed, there is nothing to
69786+ worry about */
69787+ return 0;
69788+ }
69789+
69790+ result = 0;
69791+ left_zloaded_here = 0;
69792+ right_zloaded_here = 0;
69793+
69794+ left_child = right_child = NULL;
69795+
69796+ coord_dup(&left_coord, from);
69797+ init_lh(&left_lh);
69798+ init_lh(&right_lh);
69799+ if (coord_prev_unit(&left_coord)) {
69800+ /* @from is leftmost item in its node */
69801+ if (!locked_left_neighbor) {
69802+ result =
69803+ reiser4_get_left_neighbor(&left_lh, from->node,
69804+ ZNODE_READ_LOCK,
69805+ GN_CAN_USE_UPPER_LEVELS);
69806+ switch (result) {
69807+ case 0:
69808+ break;
69809+ case -E_NO_NEIGHBOR:
69810+ /* there is no formatted node to the left of
69811+ from->node */
69812+ warning("vs-605",
69813+ "extent item has smallest key in "
69814+ "the tree and it is about to be removed");
69815+ return 0;
69816+ case -E_DEADLOCK:
69817+ /* need to restart */
69818+ default:
69819+ return result;
69820+ }
69821+
69822+ /* we have acquired left neighbor of from->node */
69823+ result = zload(left_lh.node);
69824+ if (result)
69825+ goto done;
69826+
69827+ locked_left_neighbor = left_lh.node;
69828+ } else {
69829+ /* squalloc_right_twig_cut should have supplied locked
69830+ * left neighbor */
69831+ assert("vs-834",
69832+ znode_is_write_locked(locked_left_neighbor));
69833+ result = zload(locked_left_neighbor);
69834+ if (result)
69835+ return result;
69836+ }
69837+
69838+ left_zloaded_here = 1;
69839+ coord_init_last_unit(&left_coord, locked_left_neighbor);
69840+ }
69841+
69842+ if (!item_is_internal(&left_coord)) {
69843+ /* what else but extent can be on twig level */
69844+ assert("vs-606", item_is_extent(&left_coord));
69845+
69846+ /* there is no left formatted child */
69847+ if (left_zloaded_here)
69848+ zrelse(locked_left_neighbor);
69849+ done_lh(&left_lh);
69850+ return 0;
69851+ }
69852+
69853+ tree = znode_get_tree(left_coord.node);
69854+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
69855+
69856+ if (IS_ERR(left_child)) {
69857+ result = PTR_ERR(left_child);
69858+ goto done;
69859+ }
69860+
69861+ /* left child is acquired, calculate new right delimiting key for it
69862+ and get right child if it is necessary */
69863+ if (item_removed_completely
69864+ (from, kdata->params.from_key, kdata->params.to_key)) {
69865+ /* try to get right child of removed item */
69866+ coord_t right_coord;
69867+
69868+ assert("vs-607",
69869+ kdata->params.to->unit_pos ==
69870+ coord_last_unit_pos(kdata->params.to));
69871+ coord_dup(&right_coord, kdata->params.to);
69872+ if (coord_next_unit(&right_coord)) {
69873+ /* @to is rightmost unit in the node */
69874+ result =
69875+ reiser4_get_right_neighbor(&right_lh, from->node,
69876+ ZNODE_READ_LOCK,
69877+ GN_CAN_USE_UPPER_LEVELS);
69878+ switch (result) {
69879+ case 0:
69880+ result = zload(right_lh.node);
69881+ if (result)
69882+ goto done;
69883+
69884+ right_zloaded_here = 1;
69885+ coord_init_first_unit(&right_coord,
69886+ right_lh.node);
69887+ item_key_by_coord(&right_coord, &key);
69888+ break;
69889+
69890+ case -E_NO_NEIGHBOR:
69891+ /* there is no formatted node to the right of
69892+ from->node */
69893+ read_lock_dk(tree);
69894+ key = *znode_get_rd_key(from->node);
69895+ read_unlock_dk(tree);
69896+ right_coord.node = NULL;
69897+ result = 0;
69898+ break;
69899+ default:
69900+ /* real error */
69901+ goto done;
69902+ }
69903+ } else {
69904+ /* there is an item to the right of @from - take its key */
69905+ item_key_by_coord(&right_coord, &key);
69906+ }
69907+
69908+ /* try to get right child of @from */
69909+ if (right_coord.node && /* there is right neighbor of @from */
69910+ item_is_internal(&right_coord)) { /* it is internal item */
69911+ right_child = child_znode(&right_coord,
69912+ right_coord.node, 1, 0);
69913+
69914+ if (IS_ERR(right_child)) {
69915+ result = PTR_ERR(right_child);
69916+ goto done;
69917+ }
69918+
69919+ }
69920+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
69921+ update of right delimiting key of left_child */
69922+ result = prepare_children(left_child, right_child, kdata);
69923+ } else {
69924+		/* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
69925+ result = prepare_children(left_child, NULL, kdata);
69926+ }
69927+
69928+ done:
69929+ if (right_child)
69930+ zput(right_child);
69931+ if (right_zloaded_here)
69932+ zrelse(right_lh.node);
69933+ done_lh(&right_lh);
69934+
69935+ if (left_child)
69936+ zput(left_child);
69937+ if (left_zloaded_here)
69938+ zrelse(locked_left_neighbor);
69939+ done_lh(&left_lh);
69940+ return result;
69941+}
69942+
69943+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
69944+ are to be cut completely */
69945+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
69946+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
69947+ const reiser4_key * to_key, /* last key to be removed */
69948+ reiser4_key *
69949+ smallest_removed /* smallest key actually removed */ )
69950+{
69951+ int result;
69952+ carry_pool *pool;
69953+ carry_level *lowest_level;
69954+ carry_cut_data *cut_data;
69955+ carry_op *op;
69956+
69957+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
69958+
69959+ pool =
69960+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69961+ sizeof(*cut_data));
69962+ if (IS_ERR(pool))
69963+ return PTR_ERR(pool);
69964+ lowest_level = (carry_level *) (pool + 1);
69965+ init_carry_level(lowest_level, pool);
69966+
69967+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
69968+ assert("vs-1509", op != 0);
69969+ if (IS_ERR(op)) {
69970+ done_carry_pool(pool);
69971+ return PTR_ERR(op);
69972+ }
69973+
69974+ cut_data = (carry_cut_data *) (lowest_level + 3);
69975+ cut_data->params.from = from;
69976+ cut_data->params.to = to;
69977+ cut_data->params.from_key = from_key;
69978+ cut_data->params.to_key = to_key;
69979+ cut_data->params.smallest_removed = smallest_removed;
69980+
69981+ op->u.cut_or_kill.is_cut = 1;
69982+ op->u.cut_or_kill.u.cut = cut_data;
69983+
69984+ result = reiser4_carry(lowest_level, NULL);
69985+ done_carry_pool(pool);
69986+
69987+ return result;
69988+}
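+
+/*
+ * Note the contrast with kill_node_content() below: both post a COP_CUT
+ * operation, but here u.cut_or_kill.is_cut is 1, so item kill hooks are
+ * not run and no pages are invalidated for the removed range.
+ */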
69989+
69990+/* cut part of the node
69991+
69992+ Cut part or whole content of node.
69993+
69994+ cut data between @from and @to of @from->node and call carry() to make
69995+ corresponding changes in the tree. @from->node may become empty. If so -
69996+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
69997+ removed key is stored in @smallest_removed
69998+
69999+*/
70000+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
70001+ coord_t * to, /* coord of the last unit/item that will be eliminated */
70002+ const reiser4_key * from_key, /* first key to be removed */
70003+ const reiser4_key * to_key, /* last key to be removed */
70004+ reiser4_key * smallest_removed, /* smallest key actually removed */
70005+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
70006+ * locked (in squalloc_right_twig_cut, namely) */
70007+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
70008+ invalidate pages together with item pointing to them */
70009+		      int truncate /* true for file truncate */ )
70010+{
70011+ int result;
70012+ carry_pool *pool;
70013+ carry_level *lowest_level;
70014+ carry_kill_data *kdata;
70015+ lock_handle *left_child;
70016+ lock_handle *right_child;
70017+ carry_op *op;
70018+
70019+ assert("umka-328", from != NULL);
70020+ assert("vs-316", !node_is_empty(from->node));
70021+ assert("nikita-1812", coord_is_existing_unit(from)
70022+ && coord_is_existing_unit(to));
70023+
70024+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
70025+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70026+ sizeof(carry_kill_data) +
70027+ 2 * sizeof(lock_handle) +
70028+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
70029+ if (IS_ERR(pool))
70030+ return PTR_ERR(pool);
70031+
70032+ lowest_level = (carry_level *) (pool + 1);
70033+ init_carry_level(lowest_level, pool);
70034+
70035+ kdata = (carry_kill_data *) (lowest_level + 3);
70036+ left_child = (lock_handle *) (kdata + 1);
70037+ right_child = left_child + 1;
70038+
70039+ init_lh(left_child);
70040+ init_lh(right_child);
70041+
70042+ kdata->params.from = from;
70043+ kdata->params.to = to;
70044+ kdata->params.from_key = from_key;
70045+ kdata->params.to_key = to_key;
70046+ kdata->params.smallest_removed = smallest_removed;
70047+ kdata->params.truncate = truncate;
70048+ kdata->flags = 0;
70049+ kdata->inode = inode;
70050+ kdata->left = left_child;
70051+ kdata->right = right_child;
70052+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70053+ kdata->buf = (char *)(right_child + 1);
70054+
70055+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70056+ /* left child of extent item may have to get updated right
70057+ delimiting key and to get linked with right child of extent
70058+ @from if it will be removed completely */
70059+ result = prepare_twig_kill(kdata, locked_left_neighbor);
70060+ if (result) {
70061+ done_children(kdata);
70062+ done_carry_pool(pool);
70063+ return result;
70064+ }
70065+ }
70066+
70067+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
70068+ if (IS_ERR(op) || (op == NULL)) {
70069+ done_children(kdata);
70070+ done_carry_pool(pool);
70071+ return RETERR(op ? PTR_ERR(op) : -EIO);
70072+ }
70073+
70074+ op->u.cut_or_kill.is_cut = 0;
70075+ op->u.cut_or_kill.u.kill = kdata;
70076+
70077+ result = reiser4_carry(lowest_level, NULL);
70078+
70079+ done_children(kdata);
70080+ done_carry_pool(pool);
70081+ return result;
70082+}
70083+
70084+void
70085+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70086+{
70087+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
70088+ pgoff_t start_pg, end_pg;
70089+
70090+ start_pg = start >> PAGE_CACHE_SHIFT;
70091+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
70092+
70093+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70094+ /*
70095+ * kill up to the page boundary.
70096+ */
70097+ assert("vs-123456", start_pg == end_pg);
70098+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70099+ truncate);
70100+ } else if (start_pg != end_pg) {
70101+ /*
70102+ * page boundary is within killed portion of node.
70103+ */
70104+ assert("vs-654321", end_pg - start_pg == 1);
70105+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
70106+ end_pg - start_pg, 1);
70107+ }
70108+ }
70109+ inode_sub_bytes(inode, end - start);
70110+}
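+
+/*
+ * A worked example of the page arithmetic above (assuming a 4096-byte
+ * PAGE_CACHE_SIZE): start = 4096, end = 8192 gives start_pg = end_pg = 1
+ * with @start page-aligned, so page 1 is invalidated whole; start = 6000,
+ * end = 12288 gives start_pg = 1, end_pg = 2, so only page 2 (the single
+ * fully-killed page) is invalidated.
+ */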
70111+
70112+/**
70113+ * Delete whole @node from the reiser4 tree without loading it.
70114+ *
70115+ * @left: locked left neighbor,
70116+ * @node: node to be deleted,
70117+ * @smallest_removed: leftmost key of deleted node,
70118+ * @object: inode pointer, if we truncate a file body.
70119+ * @truncate: true if called for file truncate.
70120+ *
70121+ * @return: 0 if success, error code otherwise.
70122+ *
70123+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70124+ * contains the right value of the smallest removed key from the previous
70125+ * cut_worker() iteration. This is needed for proper accounting of
70126+ * "i_blocks" and "i_bytes" fields of the @object.
70127+ */
70128+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
70129+ struct inode *object, int truncate)
70130+{
70131+ lock_handle parent_lock;
70132+ coord_t cut_from;
70133+ coord_t cut_to;
70134+ reiser4_tree *tree;
70135+ int ret;
70136+
70137+ assert("zam-937", node != NULL);
70138+ assert("zam-933", znode_is_write_locked(node));
70139+ assert("zam-999", smallest_removed != NULL);
70140+
70141+ init_lh(&parent_lock);
70142+
70143+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70144+ if (ret)
70145+ return ret;
70146+
70147+ assert("zam-934", !znode_above_root(parent_lock.node));
70148+
70149+ ret = zload(parent_lock.node);
70150+ if (ret)
70151+ goto failed_nozrelse;
70152+
70153+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
70154+ if (ret)
70155+ goto failed;
70156+
70157+	/* decrement the child counter and set the parent pointer to NULL before
70158+	   deleting the link from the parent node, because of checks in
70159+	   internal_kill_item_hook (we may delete the last item from the parent
70160+	   node; the parent node is then going to be deleted and its c_count
70161+	   must be zero). */
70162+
70163+ tree = znode_get_tree(node);
70164+ write_lock_tree(tree);
70165+ init_parent_coord(&node->in_parent, NULL);
70166+ --parent_lock.node->c_count;
70167+ write_unlock_tree(tree);
70168+
70169+ assert("zam-989", item_is_internal(&cut_from));
70170+
70171+ /* @node should be deleted after unlocking. */
70172+ ZF_SET(node, JNODE_HEARD_BANSHEE);
70173+
70174+ /* remove a pointer from the parent node to the node being deleted. */
70175+ coord_dup(&cut_to, &cut_from);
70176+ /* FIXME: shouldn't this be kill_node_content */
70177+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70178+ if (ret)
70179+ /* FIXME(Zam): Should we re-connect the node to its parent if
70180+ * cut_node fails? */
70181+ goto failed;
70182+
70183+ {
70184+ reiser4_tree *tree = current_tree;
70185+ __u64 start_offset = 0, end_offset = 0;
70186+
70187+ read_lock_tree(tree);
70188+ write_lock_dk(tree);
70189+ if (object) {
70190+ /* We use @smallest_removed and the left delimiting of
70191+ * the current node for @object->i_blocks, i_bytes
70192+ * calculation. We assume that the items after the
70193+ * *@smallest_removed key have been deleted from the
70194+ * file body. */
70195+ start_offset = get_key_offset(znode_get_ld_key(node));
70196+ end_offset = get_key_offset(smallest_removed);
70197+ }
70198+
70199+ assert("zam-1021", znode_is_connected(node));
70200+ if (node->left)
70201+ znode_set_rd_key(node->left, znode_get_rd_key(node));
70202+
70203+ *smallest_removed = *znode_get_ld_key(node);
70204+
70205+ write_unlock_dk(tree);
70206+ read_unlock_tree(tree);
70207+
70208+ if (object) {
70209+			/* actions to be performed on items on their removal from the tree normally live in a
70210+			   special item method - kill_hook. Here, for optimization reasons, we avoid reading the
70211+			   node containing the item we remove and therefore cannot call the item's kill hook.
70212+			   Instead we call a function which does exactly what the tail kill hook would do, under
70213+			   the assumption that the node we avoid reading contains only one item and that item is a tail one. */
70214+ fake_kill_hook_tail(object, start_offset, end_offset,
70215+ truncate);
70216+ }
70217+ }
70218+ failed:
70219+ zrelse(parent_lock.node);
70220+ failed_nozrelse:
70221+ done_lh(&parent_lock);
70222+
70223+ return ret;
70224+}
70225+
70226+static int can_delete(const reiser4_key *key, znode *node)
70227+{
70228+ int result;
70229+
70230+ read_lock_dk(current_tree);
70231+ result = keyle(key, znode_get_ld_key(node));
70232+ read_unlock_dk(current_tree);
70233+ return result;
70234+}
70235+
70236+/**
70237+ * This subroutine is not optimal, but its implementation seems to
70238+ * be simpler.
70239+ *
70240+ * @tap: the point deletion process begins from,
70241+ * @from_key: the beginning of the deleted key range,
70242+ * @to_key: the end of the deleted key range,
70243+ * @smallest_removed: the smallest removed key,
70244+ * @truncate: true if called for file truncate.
70245+ * @progress: returns true if progress was made deleting file items;
70246+ *            the @smallest_removed value is valid in that case.
70247+ *
70248+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70249+ * reiser4_cut_tree operation was interrupted to allow an atom commit.
70250+ */
70251+int
70252+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70253+ const reiser4_key * to_key,
70254+ reiser4_key * smallest_removed, struct inode *object,
70255+ int truncate, int *progress)
70256+{
70257+ lock_handle next_node_lock;
70258+ coord_t left_coord;
70259+ int result;
70260+
70261+ assert("zam-931", tap->coord->node != NULL);
70262+ assert("zam-932", znode_is_write_locked(tap->coord->node));
70263+
70264+ *progress = 0;
70265+ init_lh(&next_node_lock);
70266+
70267+ while (1) {
70268+ znode *node; /* node from which items are cut */
70269+ node_plugin *nplug; /* node plugin for @node */
70270+
70271+ node = tap->coord->node;
70272+
70273+ /* Move next_node_lock to the next node on the left. */
70274+ result =
70275+ reiser4_get_left_neighbor(&next_node_lock, node,
70276+ ZNODE_WRITE_LOCK,
70277+ GN_CAN_USE_UPPER_LEVELS);
70278+ if (result != 0 && result != -E_NO_NEIGHBOR)
70279+ break;
70280+		/* Check whether we can delete the node as a whole. */
70281+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70282+ can_delete(from_key, node)) {
70283+ result = reiser4_delete_node(node, smallest_removed,
70284+ object, truncate);
70285+ } else {
70286+ result = reiser4_tap_load(tap);
70287+ if (result)
70288+ return result;
70289+
70290+ /* Prepare the second (right) point for cut_node() */
70291+ if (*progress)
70292+ coord_init_last_unit(tap->coord, node);
70293+
70294+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70295+ NULL)
70296+ /* set rightmost unit for the items without lookup method */
70297+ tap->coord->unit_pos =
70298+ coord_last_unit_pos(tap->coord);
70299+
70300+ nplug = node->nplug;
70301+
70302+ assert("vs-686", nplug);
70303+ assert("vs-687", nplug->lookup);
70304+
70305+ /* left_coord is leftmost unit cut from @node */
70306+ result = nplug->lookup(node, from_key,
70307+ FIND_MAX_NOT_MORE_THAN,
70308+ &left_coord);
70309+
70310+ if (IS_CBKERR(result))
70311+ break;
70312+
70313+ /* adjust coordinates so that they are set to existing units */
70314+ if (coord_set_to_right(&left_coord)
70315+ || coord_set_to_left(tap->coord)) {
70316+ result = 0;
70317+ break;
70318+ }
70319+
70320+ if (coord_compare(&left_coord, tap->coord) ==
70321+ COORD_CMP_ON_RIGHT) {
70322+ /* keys from @from_key to @to_key are not in the tree */
70323+ result = 0;
70324+ break;
70325+ }
70326+
70327+ if (left_coord.item_pos != tap->coord->item_pos) {
70328+				/* do not allow cutting more than one item. This was added to solve the problem of
70329+				   truncating partially converted files. If a file is partially converted, a twig node may
70330+				   exist that contains both an internal item (or items) pointing to leaf nodes with
70331+				   formatting items, and an extent item. We do not want to kill internal items at the twig
70332+				   node here, because cut_tree_worker assumes they are killed from the leaf level */
70333+ coord_dup(&left_coord, tap->coord);
70334+ assert("vs-1652",
70335+ coord_is_existing_unit(&left_coord));
70336+ left_coord.unit_pos = 0;
70337+ }
70338+
70339+ /* cut data from one node */
70340+ // *smallest_removed = *reiser4_min_key();
70341+ result =
70342+ kill_node_content(&left_coord, tap->coord, from_key,
70343+ to_key, smallest_removed,
70344+ next_node_lock.node, object,
70345+ truncate);
70346+ reiser4_tap_relse(tap);
70347+ }
70348+ if (result)
70349+ break;
70350+
70351+ ++(*progress);
70352+
70353+ /* Check whether all items with keys >= from_key were removed
70354+ * from the tree. */
70355+ if (keyle(smallest_removed, from_key))
70356+ /* result = 0; */
70357+ break;
70358+
70359+ if (next_node_lock.node == NULL)
70360+ break;
70361+
70362+ result = reiser4_tap_move(tap, &next_node_lock);
70363+ done_lh(&next_node_lock);
70364+ if (result)
70365+ break;
70366+
70367+ /* Break long reiser4_cut_tree operation (deletion of a large
70368+ file) if atom requires commit. */
70369+ if (*progress > CUT_TREE_MIN_ITERATIONS
70370+ && current_atom_should_commit()) {
70371+ result = -E_REPEAT;
70372+ break;
70373+ }
70374+ }
70375+ done_lh(&next_node_lock);
70376+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
70377+ return result;
70378+}
70379+
70380+/* there is a fundamental problem with optimizing deletes: VFS does it
70381+ one file at a time. Another problem is that if an item can be
70382+   anything, then deleting items must be done one at a time. It just
70383+   seems clean, though, to write this to take a from key and a to key
70384+   and cut everything between them. */
70385+
70386+/* use this function with care if deleting more than what is part of a single file. */
70387+/* do not use this when cutting a single item, it is suboptimal for that */
70388+
70389+/* You are encouraged to write plugin specific versions of this. It
70390+ cannot be optimal for all plugins because it works item at a time,
70391+ and some plugins could sometimes work node at a time. Regular files
70392+ however are not optimizable to work node at a time because of
70393+ extents needing to free the blocks they point to.
70394+
70395+ Optimizations compared to v3 code:
70396+
70397+ It does not balance (that task is left to memory pressure code).
70398+
70399+ Nodes are deleted only if empty.
70400+
70401+ Uses extents.
70402+
70403+ Performs read-ahead of formatted nodes whose contents are part of
70404+ the deletion.
70405+*/
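+
+/* A minimal usage sketch (not part of the original sources): how a
+   truncate path might drive reiser4_cut_tree() over a file body's key
+   range. The helper build_body_key() is hypothetical; real callers build
+   @from/@to with their item plugin's key assignment code. */
+
+/* hypothetical helper: builds the key of byte @off within @inode's body */
+extern void build_body_key(struct inode *inode, loff_t off, reiser4_key *key);
+
+static int example_truncate_body(struct inode *inode, loff_t new_size)
+{
+	reiser4_key from, to;
+
+	/* first byte to cut away ... */
+	build_body_key(inode, new_size, &from);
+	/* ... up to the maximal possible offset of the file body */
+	build_body_key(inode, get_key_offset(reiser4_max_key()), &to);
+
+	/* reiser4_cut_tree() retries internally while -E_REPEAT is returned */
+	return reiser4_cut_tree(current_tree, &from, &to, inode,
+				1 /* truncate */);
+}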
70406+
70407+/**
70408+ * Delete everything from the reiser4 tree between two keys: @from_key and
70409+ * @to_key.
70410+ *
70411+ * @from_key: the beginning of the deleted key range,
70412+ * @to_key: the end of the deleted key range,
70413+ * @smallest_removed: the smallest removed key,
70414+ * @object: owner of cutting items.
70415+ * @truncate: true if called for file truncate.
70416+ * @progress: returns true if progress was made deleting file items;
70417+ *            the @smallest_removed value is valid in that case.
70418+ *
70419+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long cut_tree
70420+ * operation was interrupted to allow an atom commit.
70421+ */
70422+
70423+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70424+ const reiser4_key * to_key,
70425+ reiser4_key * smallest_removed_p,
70426+ struct inode *object, int truncate, int *progress)
70427+{
70428+ lock_handle lock;
70429+ int result;
70430+ tap_t tap;
70431+ coord_t right_coord;
70432+ reiser4_key smallest_removed;
70433+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70434+ const reiser4_key *, reiser4_key *,
70435+ struct inode *, int, int *);
70436+ STORE_COUNTERS;
70437+
70438+ assert("umka-329", tree != NULL);
70439+ assert("umka-330", from_key != NULL);
70440+ assert("umka-331", to_key != NULL);
70441+ assert("zam-936", keyle(from_key, to_key));
70442+
70443+ if (smallest_removed_p == NULL)
70444+ smallest_removed_p = &smallest_removed;
70445+
70446+ init_lh(&lock);
70447+
70448+ do {
70449+ /* Find rightmost item to cut away from the tree. */
70450+ result = reiser4_object_lookup(object, to_key, &right_coord,
70451+ &lock, ZNODE_WRITE_LOCK,
70452+ FIND_MAX_NOT_MORE_THAN,
70453+ TWIG_LEVEL, LEAF_LEVEL,
70454+ CBK_UNIQUE, NULL /*ra_info */);
70455+ if (result != CBK_COORD_FOUND)
70456+ break;
70457+ if (object == NULL
70458+ || inode_file_plugin(object)->cut_tree_worker == NULL)
70459+ cut_tree_worker = cut_tree_worker_common;
70460+ else
70461+ cut_tree_worker =
70462+ inode_file_plugin(object)->cut_tree_worker;
70463+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70464+ result =
70465+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70466+ object, truncate, progress);
70467+ reiser4_tap_done(&tap);
70468+
70469+ reiser4_preempt_point();
70470+
70471+ } while (0);
70472+
70473+ done_lh(&lock);
70474+
70475+ if (result) {
70476+ switch (result) {
70477+ case -E_NO_NEIGHBOR:
70478+ result = 0;
70479+ break;
70480+ case -E_DEADLOCK:
70481+ result = -E_REPEAT;
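+			/* fall through */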
70482+ case -E_REPEAT:
70483+ case -ENOMEM:
70484+ case -ENOENT:
70485+ break;
70486+ default:
70487+ warning("nikita-2861", "failure: %i", result);
70488+ }
70489+ }
70490+
70491+ CHECK_COUNTERS;
70492+ return result;
70493+}
70494+
70495+/* repeat reiser4_cut_tree_object until everything is deleted.
70496+ * Unlike cut_file_items, it does not end the current transaction if -E_REPEAT
70497+ * is returned by cut_tree_object. */
70498+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70499+ const reiser4_key * to, struct inode *inode, int truncate)
70500+{
70501+ int result;
70502+ int progress;
70503+
70504+ do {
70505+ result = reiser4_cut_tree_object(tree, from, to, NULL,
70506+ inode, truncate, &progress);
70507+ } while (result == -E_REPEAT);
70508+
70509+ return result;
70510+}
70511+
70512+/* finishing reiser4 initialization */
70513+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
70514+ * initialized */ ,
70515+ const reiser4_block_nr * root_block /* address of a root block
70516+ * on a disk */ ,
70517+ tree_level height /* height of a tree */ ,
70518+ node_plugin * nplug /* default node plugin */ )
70519+{
70520+ int result;
70521+
70522+ assert("nikita-306", tree != NULL);
70523+ assert("nikita-307", root_block != NULL);
70524+ assert("nikita-308", height > 0);
70525+ assert("nikita-309", nplug != NULL);
70526+ assert("zam-587", tree->super != NULL);
70527+
70528+ tree->root_block = *root_block;
70529+ tree->height = height;
70530+ tree->estimate_one_insert = calc_estimate_one_insert(height);
70531+ tree->nplug = nplug;
70532+
70533+ tree->znode_epoch = 1ull;
70534+
70535+ cbk_cache_init(&tree->cbk_cache);
70536+
70537+ result = znodes_tree_init(tree);
70538+ if (result == 0)
70539+ result = jnodes_tree_init(tree);
70540+ if (result == 0) {
70541+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
70542+ reiser4_ctx_gfp_mask_get());
70543+ if (IS_ERR(tree->uber)) {
70544+ result = PTR_ERR(tree->uber);
70545+ tree->uber = NULL;
70546+ }
70547+ }
70548+ return result;
70549+}
70550+
70551+/* release resources associated with @tree */
70552+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
70553+{
70554+ if (tree == NULL)
70555+ return;
70556+
70557+ if (tree->uber != NULL) {
70558+ zput(tree->uber);
70559+ tree->uber = NULL;
70560+ }
70561+ znodes_tree_done(tree);
70562+ jnodes_tree_done(tree);
70563+ cbk_cache_done(&tree->cbk_cache);
70564+}
70565+
70566+/* Make Linus happy.
70567+ Local variables:
70568+ c-indentation-style: "K&R"
70569+ mode-name: "LC"
70570+ c-basic-offset: 8
70571+ tab-width: 8
70572+ fill-column: 120
70573+ scroll-step: 1
70574+ End:
70575+*/
70576diff -urN linux-2.6.20.orig/fs/reiser4/tree.h linux-2.6.20/fs/reiser4/tree.h
70577--- linux-2.6.20.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
70578+++ linux-2.6.20/fs/reiser4/tree.h 2007-05-06 14:50:43.883033217 +0400
70579@@ -0,0 +1,577 @@
70580+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70581+ * reiser4/README */
70582+
70583+/* Tree operations. See fs/reiser4/tree.c for comments */
70584+
70585+#if !defined( __REISER4_TREE_H__ )
70586+#define __REISER4_TREE_H__
70587+
70588+#include "forward.h"
70589+#include "debug.h"
70590+#include "dformat.h"
70591+#include "plugin/node/node.h"
70592+#include "plugin/plugin.h"
70593+#include "znode.h"
70594+#include "tap.h"
70595+
70596+#include <linux/types.h> /* for __u?? */
70597+#include <linux/fs.h> /* for struct super_block */
70598+#include <linux/spinlock.h>
70599+#include <linux/sched.h> /* for struct task_struct */
70600+
70601+/* fictive block number never actually used */
70602+extern const reiser4_block_nr UBER_TREE_ADDR;
70603+
70604+/* &cbk_cache_slot - entry in a coord cache.
70605+
70606+   This is an entry in a coord_by_key (cbk) cache, represented by
70607+ &cbk_cache.
70608+
70609+*/
70610+typedef struct cbk_cache_slot {
70611+ /* cached node */
70612+ znode *node;
70613+ /* linkage to the next cbk cache slot in a LRU order */
70614+ struct list_head lru;
70615+} cbk_cache_slot;
70616+
70617+/* &cbk_cache - coord cache. This is part of reiser4_tree.
70618+
70619+ cbk_cache is supposed to speed up tree lookups by caching results of recent
70620+ successful lookups (we don't cache negative results as dentry cache
70621+   does). The cache consists of a relatively small number of entries kept in
70622+   LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
70623+   which we can obtain the range of keys that is covered by this znode. Before
70624+   embarking on a real tree traversal we scan the cbk_cache slot by slot and for
70625+   each slot check whether the key we are looking for is between the minimal and
70626+   maximal keys for the node pointed to by this slot. If no match is found, a real
70627+   tree traversal is performed, and if the result is successful, an appropriate
70628+   entry is inserted into the cache, possibly pulling the least recently used
70629+   entry out of it.
70630+
70631+   The tree spin lock is used to protect the coord cache. If contention for this
70632+   lock proves to be too high, finer-grained locking can be added.
70633+
70634+ Invariants involving parts of this data-type:
70635+
70636+ [cbk-cache-invariant]
70637+*/
70638+typedef struct cbk_cache {
70639+ /* serializator */
70640+ rwlock_t guard;
70641+ int nr_slots;
70642+ /* head of LRU list of cache slots */
70643+ struct list_head lru;
70644+ /* actual array of slots */
70645+ cbk_cache_slot *slot;
70646+} cbk_cache;
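+
+/* A minimal sketch (not part of the original sources) of the scan a
+   cbk_cache enables: walk the LRU list and test whether the target key
+   falls within a cached znode's delimiting keys. The real cbk_cache_scan
+   performs extra validity checks omitted here; in compilable code this
+   helper would also have to live below the dk-lock helpers defined later
+   in this header. */
+static inline znode *example_cbk_cache_probe(cbk_cache * cache,
+					     reiser4_tree * tree,
+					     const reiser4_key * key)
+{
+	cbk_cache_slot *slot;
+	znode *found = NULL;
+
+	read_lock(&cache->guard);
+	list_for_each_entry(slot, &cache->lru, lru) {
+		znode *node = slot->node;
+
+		if (node == NULL)
+			continue;
+		read_lock_dk(tree);
+		/* candidate only; real code also verifies the node's state */
+		if (keyle(znode_get_ld_key(node), key) &&
+		    keyle(key, znode_get_rd_key(node)))
+			found = node;
+		read_unlock_dk(tree);
+		if (found != NULL)
+			break;
+	}
+	read_unlock(&cache->guard);
+	return found;
+}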
70647+
70648+/* level_lookup_result - possible outcome of looking up key at some level.
70649+ This is used by coord_by_key when traversing tree downward. */
70650+typedef enum {
70651+ /* continue to the next level */
70652+ LOOKUP_CONT,
70653+ /* done. Either required item was found, or we can prove it
70654+ doesn't exist, or some error occurred. */
70655+ LOOKUP_DONE,
70656+ /* restart traversal from the root. Infamous "repetition". */
70657+ LOOKUP_REST
70658+} level_lookup_result;
70659+
70660+/* This is representation of internal reiser4 tree where all file-system
70661+ data and meta-data are stored. This structure is passed to all tree
70662+ manipulation functions. It's different from the super block because:
70663+   we don't want to limit ourselves to a strictly one-to-one mapping
70664+   between super blocks and trees, and because they are logically
70665+ different: there are things in a super block that have no relation to
70666+ the tree (bitmaps, journalling area, mount options, etc.) and there
70667+ are things in a tree that bear no relation to the super block, like
70668+ tree of znodes.
70669+
70670+ At this time, there is only one tree
70671+ per filesystem, and this struct is part of the super block. We only
70672+ call the super block the super block for historical reasons (most
70673+ other filesystems call the per filesystem metadata the super block).
70674+*/
70675+
70676+struct reiser4_tree {
70677+	/* block_nr == 0 is the fake znode. Write-lock it while changing
70678+	   the tree height. */
70679+ /* disk address of root node of a tree */
70680+ reiser4_block_nr root_block;
70681+
70682+ /* level of the root node. If this is 1, tree consists of root
70683+ node only */
70684+ tree_level height;
70685+
70686+ /*
70687+	 * this is cached here to avoid calling plugins through a function
70688+	 * dereference all the time.
70689+ */
70690+ __u64 estimate_one_insert;
70691+
70692+ /* cache of recent tree lookup results */
70693+ cbk_cache cbk_cache;
70694+
70695+ /* hash table to look up znodes by block number. */
70696+ z_hash_table zhash_table;
70697+ z_hash_table zfake_table;
70698+ /* hash table to look up jnodes by inode and offset. */
70699+ j_hash_table jhash_table;
70700+
70701+ /* lock protecting:
70702+ - parent pointers,
70703+ - sibling pointers,
70704+ - znode hash table
70705+ - coord cache
70706+ */
70707+	/* NOTE: The "giant" tree lock can be replaced by more spin locks,
70708+	   hoping they will be less contended. We can use one spin lock per
70709+	   znode hash bucket. By adding some code complexity, sibling
70710+	   pointers can be protected by both znode spin locks. However, even if it
70711+	   looks more SMP scalable, we should test this locking change on n-way (n >
70712+	   4) SMP machines. The current 4-way machine test does not show that the tree
70713+	   lock is contended or that it is a bottleneck (2003.07.25). */
70714+
70715+ rwlock_t tree_lock;
70716+
70717+ /* lock protecting delimiting keys */
70718+ rwlock_t dk_lock;
70719+
70720+ /* spin lock protecting znode_epoch */
70721+ spinlock_t epoch_lock;
70722+ /* version stamp used to mark znode updates. See seal.[ch] for more
70723+ * information. */
70724+ __u64 znode_epoch;
70725+
70726+ znode *uber;
70727+ node_plugin *nplug;
70728+ struct super_block *super;
70729+ struct {
70730+ /* carry flags used for insertion of new nodes */
70731+ __u32 new_node_flags;
70732+ /* carry flags used for insertion of new extents */
70733+ __u32 new_extent_flags;
70734+ /* carry flags used for paste operations */
70735+ __u32 paste_flags;
70736+ /* carry flags used for insert operations */
70737+ __u32 insert_flags;
70738+ } carry;
70739+};
70740+
70741+extern int reiser4_init_tree(reiser4_tree * tree,
70742+ const reiser4_block_nr * root_block,
70743+ tree_level height, node_plugin * default_plugin);
70744+extern void reiser4_done_tree(reiser4_tree * tree);
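+
+/* A minimal sketch (not part of the original sources) of mount-time
+   wiring for the two calls above. Where the root block number, tree
+   height and node plugin come from (the disk-format plugin) is assumed
+   rather than shown. */
+static inline int example_mount_tree(reiser4_tree * tree,
+				     const reiser4_block_nr * root_blk,
+				     tree_level height, node_plugin * nplug)
+{
+	int result;
+
+	/* reiser4_init_tree() asserts that tree->super is already set */
+	result = reiser4_init_tree(tree, root_blk, height, nplug);
+	if (result != 0)
+		return result;
+	/* ... the tree is now usable; on unmount: */
+	reiser4_done_tree(tree);
+	return 0;
+}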
70745+
70746+/* cbk flags: options for coord_by_key() */
70747+typedef enum {
70748+ /* coord_by_key() is called for insertion. This is necessary because
70749+ of extents being located at the twig level. For explanation, see
70750+ comment just above is_next_item_internal().
70751+ */
70752+ CBK_FOR_INSERT = (1 << 0),
70753+ /* coord_by_key() is called with key that is known to be unique */
70754+ CBK_UNIQUE = (1 << 1),
70755+	/* coord_by_key() can trust delimiting keys. This option is not user
70756+	   accessible. coord_by_key() will set it automatically. It will only
70757+	   be cleared by a special case in extents-on-the-twig-level handling
70758+ where it is necessary to insert item with a key smaller than
70759+ leftmost key in a node. This is necessary because of extents being
70760+ located at the twig level. For explanation, see comment just above
70761+ is_next_item_internal().
70762+ */
70763+ CBK_TRUST_DK = (1 << 2),
70764+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
70765+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
70766+ CBK_DKSET = (1 << 5),
70767+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
70768+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */
70769+	CBK_USE_CRABLOCK = (1 << 8)	/* use crab_lock instead of a long-term
70770+					 * lock */
70771+} cbk_flags;
70772+
70773+/* insertion outcome. IBK = insert by key */
70774+typedef enum {
70775+ IBK_INSERT_OK = 0,
70776+ IBK_ALREADY_EXISTS = -EEXIST,
70777+ IBK_IO_ERROR = -EIO,
70778+ IBK_NO_SPACE = -E_NODE_FULL,
70779+ IBK_OOM = -ENOMEM
70780+} insert_result;
70781+
70782+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
70783+
70784+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
70785+ lock_handle * lh, void *arg);
70786+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
70787+ lock_handle * lh,
70788+ tree_iterate_actor_t actor, void *arg,
70789+ znode_lock_mode mode, int through_units_p);
70790+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
70791+ znode_lock_request pri, lock_handle * lh);
70792+
70793+/* return node plugin of @node */
70794+static inline node_plugin *node_plugin_by_node(const znode *
70795+ node /* node to query */ )
70796+{
70797+ assert("vs-213", node != NULL);
70798+ assert("vs-214", znode_is_loaded(node));
70799+
70800+ return node->nplug;
70801+}
70802+
70803+/* number of items in @node */
70804+static inline pos_in_node_t node_num_items(const znode * node)
70805+{
70806+ assert("nikita-2754", znode_is_loaded(node));
70807+ assert("nikita-2468",
70808+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
70809+
70810+ return node->nr_items;
70811+}
70812+
70813+/* Return the number of items at the present node. Asserts coord->node !=
70814+ NULL. */
70815+static inline unsigned coord_num_items(const coord_t * coord)
70816+{
70817+ assert("jmacd-9805", coord->node != NULL);
70818+
70819+ return node_num_items(coord->node);
70820+}
70821+
70822+/* true if @node is empty */
70823+static inline int node_is_empty(const znode * node)
70824+{
70825+ return node_num_items(node) == 0;
70826+}
70827+
70828+typedef enum {
70829+ SHIFTED_SOMETHING = 0,
70830+ SHIFT_NO_SPACE = -E_NODE_FULL,
70831+ SHIFT_IO_ERROR = -EIO,
70832+ SHIFT_OOM = -ENOMEM,
70833+} shift_result;
70834+
70835+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
70836+extern int is_coord_in_node(const coord_t * coord);
70837+extern int key_in_node(const reiser4_key *, const coord_t *);
70838+extern void coord_item_move_to(coord_t * coord, int items);
70839+extern void coord_unit_move_to(coord_t * coord, int units);
70840+
70841+/* there are two types of repetitive accesses (ra): intra-syscall
70842+   (local) and inter-syscall (global). Local ra is used when,
70843+   during a single syscall, we add/delete several items and units in the
70844+   same place in a tree. Note that plan-A fragments local ra by
70845+   separating stat-data and file body in key-space. Global ra is
70846+   used when a user makes repetitive modifications in the same place in a
70847+   tree.
70848+
70849+   Our ra implementation serves the following purposes:
70850+ 1 it affects balancing decisions so that the next operation in a row
70851+   can be performed faster;
70852+ 2 it affects lower-level read-ahead in the page-cache;
70853+ 3 it allows us to avoid unnecessary lookups by maintaining some state
70854+   across several operations (this is only for local ra);
70855+ 4 it leaves room for lazy micro-balancing: when we start a sequence of
70856+   operations they are performed without actually doing any intra-node
70857+   shifts, until we finish the sequence or its scope leaves the
70858+   current node; only then do we really pack the node (local ra only).
70859+*/
70860+
70861+/* another thing that can be useful is to keep per-tree and/or
70862+ per-process cache of recent lookups. This cache can be organised as a
70863+ list of block numbers of formatted nodes sorted by starting key in
70864+ this node. Balancings should invalidate appropriate parts of this
70865+ cache.
70866+*/
70867+
70868+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
70869+ coord_t * coord, lock_handle * handle,
70870+ znode_lock_mode lock, lookup_bias bias,
70871+ tree_level lock_level, tree_level stop_level,
70872+ __u32 flags, ra_info_t *);
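+
+/* A minimal sketch (not part of the original sources) of a typical
+   coord_by_key() call, mirroring the argument pattern used by
+   reiser4_cut_tree_object() in tree.c: write locks from the twig level
+   down, stop at the leaf level, key treated as unique. Whether the lock
+   handle needs releasing on failure is schematic here. */
+static inline int example_lookup(reiser4_tree * tree, const reiser4_key * key,
+				 coord_t * coord, lock_handle * lh)
+{
+	int result;
+
+	init_lh(lh);
+	result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
+			      FIND_MAX_NOT_MORE_THAN, TWIG_LEVEL, LEAF_LEVEL,
+			      CBK_UNIQUE, NULL /* ra_info */);
+	if (result != CBK_COORD_FOUND)
+		done_lh(lh);
+	/* on success the caller uses @coord and calls done_lh(lh) later */
+	return result;
+}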
70873+
70874+lookup_result reiser4_object_lookup(struct inode *object,
70875+ const reiser4_key * key,
70876+ coord_t * coord,
70877+ lock_handle * lh,
70878+ znode_lock_mode lock_mode,
70879+ lookup_bias bias,
70880+ tree_level lock_level,
70881+ tree_level stop_level,
70882+ __u32 flags, ra_info_t * info);
70883+
70884+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
70885+ reiser4_item_data * data, coord_t * coord,
70886+ lock_handle * lh,
70887+ tree_level stop_level, __u32 flags);
70888+insert_result insert_by_coord(coord_t * coord,
70889+ reiser4_item_data * data, const reiser4_key * key,
70890+ lock_handle * lh, __u32);
70891+insert_result insert_extent_by_coord(coord_t * coord,
70892+ reiser4_item_data * data,
70893+ const reiser4_key * key, lock_handle * lh);
70894+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
70895+ const reiser4_key * to_key,
70896+ reiser4_key * smallest_removed);
70897+int kill_node_content(coord_t * from, coord_t * to,
70898+ const reiser4_key * from_key, const reiser4_key * to_key,
70899+ reiser4_key * smallest_removed,
70900+ znode * locked_left_neighbor, struct inode *inode,
70901+ int truncate);
70902+
70903+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
70904+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
70905+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
70906+ reiser4_item_data * data, unsigned);
70907+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
70908+int find_new_child_ptr(znode * parent, znode * child, znode * left,
70909+ coord_t * result);
70910+
70911+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
70912+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
70913+
70914+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
70915+
70916+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
70917+ const reiser4_key *, reiser4_key *,
70918+ struct inode *, int, int *);
70919+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
70920+ const reiser4_key *, reiser4_key *,
70921+ struct inode *, int, int *);
70922+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70923+ const reiser4_key * to, struct inode *, int);
70924+
70925+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
70926+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
70927+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
70928+ znode * left, coord_t * result);
70929+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
70930+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
70931+ znode * child);
70932+extern znode *child_znode(const coord_t * in_parent, znode * parent,
70933+ int incore_p, int setup_dkeys_p);
70934+
70935+extern int cbk_cache_init(cbk_cache * cache);
70936+extern void cbk_cache_done(cbk_cache * cache);
70937+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
70938+
70939+extern char *sprint_address(const reiser4_block_nr * block);
70940+
70941+#if REISER4_DEBUG
70942+extern void print_coord_content(const char *prefix, coord_t * p);
70943+extern void reiser4_print_address(const char *prefix,
70944+ const reiser4_block_nr * block);
70945+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
70946+ __u32 flags);
70947+extern void check_dkeys(znode *node);
70948+#else
70949+#define print_coord_content(p, c) noop
70950+#define reiser4_print_address(p, b) noop
70951+#endif
70952+
70953+extern void forget_znode(lock_handle * handle);
70954+extern int deallocate_znode(znode * node);
70955+
70956+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
70957+
70958+/* struct used internally to pack the numerous arguments of a tree lookup.
70959+ Used to avoid passing a lot of arguments to helper functions. */
70960+typedef struct cbk_handle {
70961+ /* tree we are in */
70962+ reiser4_tree *tree;
70963+ /* key we are going after */
70964+ const reiser4_key *key;
70965+ /* coord we will store result in */
70966+ coord_t *coord;
70967+ /* type of lock to take on target node */
70968+ znode_lock_mode lock_mode;
70969+ /* lookup bias. See comments at the declaration of lookup_bias */
70970+ lookup_bias bias;
70971+ /* lock level: level starting from which tree traversal starts taking
70972+ * write locks. */
70973+ tree_level lock_level;
70974+ /* level where search will stop. Either item will be found between
70975+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be
70976+ returned.
70977+ */
70978+ tree_level stop_level;
70979+ /* level we are currently at */
70980+ tree_level level;
70981+ /* block number of @active node. Tree traversal operates on two
70982+ nodes: active and parent. */
70983+ reiser4_block_nr block;
70984+ /* put here error message to be printed by caller */
70985+ const char *error;
70986+ /* result passed back to caller */
70987+ lookup_result result;
70988+ /* lock handles for active and parent */
70989+ lock_handle *parent_lh;
70990+ lock_handle *active_lh;
70991+ reiser4_key ld_key;
70992+ reiser4_key rd_key;
70993+ /* flags, passed to the cbk routine. Bits of this bitmask are defined
70994+ in tree.h:cbk_flags enum. */
70995+ __u32 flags;
70996+ ra_info_t *ra_info;
70997+ struct inode *object;
70998+} cbk_handle;
70999+
71000+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
71001+
71002+/* eottl.c */
71003+extern int handle_eottl(cbk_handle *h, int *outcome);
71004+
71005+int lookup_multikey(cbk_handle * handle, int nr_keys);
71006+int lookup_couple(reiser4_tree * tree,
71007+ const reiser4_key * key1, const reiser4_key * key2,
71008+ coord_t * coord1, coord_t * coord2,
71009+ lock_handle * lh1, lock_handle * lh2,
71010+ znode_lock_mode lock_mode, lookup_bias bias,
71011+ tree_level lock_level, tree_level stop_level, __u32 flags,
71012+ int *result1, int *result2);
71013+
71014+static inline void read_lock_tree(reiser4_tree *tree)
71015+{
71016+ /* check that tree is not locked */
71017+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71018+ LOCK_CNT_NIL(read_locked_tree) &&
71019+ LOCK_CNT_NIL(write_locked_tree)));
71020+ /* check that spinlocks of lower priorities are not held */
71021+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71022+ LOCK_CNT_NIL(rw_locked_dk) &&
71023+ LOCK_CNT_NIL(spin_locked_stack)));
71024+
71025+ read_lock(&(tree->tree_lock));
71026+
71027+ LOCK_CNT_INC(read_locked_tree);
71028+ LOCK_CNT_INC(rw_locked_tree);
71029+ LOCK_CNT_INC(spin_locked);
71030+}
71031+
71032+static inline void read_unlock_tree(reiser4_tree *tree)
71033+{
71034+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
71035+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71036+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71037+
71038+ LOCK_CNT_DEC(read_locked_tree);
71039+ LOCK_CNT_DEC(rw_locked_tree);
71040+ LOCK_CNT_DEC(spin_locked);
71041+
71042+ read_unlock(&(tree->tree_lock));
71043+}
71044+
71045+static inline void write_lock_tree(reiser4_tree *tree)
71046+{
71047+ /* check that tree is not locked */
71048+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71049+ LOCK_CNT_NIL(read_locked_tree) &&
71050+ LOCK_CNT_NIL(write_locked_tree)));
71051+ /* check that spinlocks of lower priorities are not held */
71052+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71053+ LOCK_CNT_NIL(rw_locked_dk) &&
71054+ LOCK_CNT_NIL(spin_locked_stack)));
71055+
71056+ write_lock(&(tree->tree_lock));
71057+
71058+ LOCK_CNT_INC(write_locked_tree);
71059+ LOCK_CNT_INC(rw_locked_tree);
71060+ LOCK_CNT_INC(spin_locked);
71061+}
71062+
71063+static inline void write_unlock_tree(reiser4_tree *tree)
71064+{
71065+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
71066+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71067+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71068+
71069+ LOCK_CNT_DEC(write_locked_tree);
71070+ LOCK_CNT_DEC(rw_locked_tree);
71071+ LOCK_CNT_DEC(spin_locked);
71072+
71073+ write_unlock(&(tree->tree_lock));
71074+}
71075+
71076+static inline void read_lock_dk(reiser4_tree *tree)
71077+{
71078+ /* check that dk is not locked */
71079+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71080+ LOCK_CNT_NIL(read_locked_dk) &&
71081+ LOCK_CNT_NIL(write_locked_dk)));
71082+ /* check that spinlocks of lower priorities are not held */
71083+ assert("", LOCK_CNT_NIL(spin_locked_stack));
71084+
71085+ read_lock(&((tree)->dk_lock));
71086+
71087+ LOCK_CNT_INC(read_locked_dk);
71088+ LOCK_CNT_INC(rw_locked_dk);
71089+ LOCK_CNT_INC(spin_locked);
71090+}
71091+
71092+static inline void read_unlock_dk(reiser4_tree *tree)
71093+{
71094+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
71095+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71096+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71097+
71098+ LOCK_CNT_DEC(read_locked_dk);
71099+ LOCK_CNT_DEC(rw_locked_dk);
71100+ LOCK_CNT_DEC(spin_locked);
71101+
71102+ read_unlock(&(tree->dk_lock));
71103+}
71104+
71105+static inline void write_lock_dk(reiser4_tree *tree)
71106+{
71107+ /* check that dk is not locked */
71108+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71109+ LOCK_CNT_NIL(read_locked_dk) &&
71110+ LOCK_CNT_NIL(write_locked_dk)));
71111+ /* check that spinlocks of lower priorities are not held */
71112+ assert("", LOCK_CNT_NIL(spin_locked_stack));
71113+
71114+ write_lock(&((tree)->dk_lock));
71115+
71116+ LOCK_CNT_INC(write_locked_dk);
71117+ LOCK_CNT_INC(rw_locked_dk);
71118+ LOCK_CNT_INC(spin_locked);
71119+}
71120+
71121+static inline void write_unlock_dk(reiser4_tree *tree)
71122+{
71123+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
71124+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71125+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71126+
71127+ LOCK_CNT_DEC(write_locked_dk);
71128+ LOCK_CNT_DEC(rw_locked_dk);
71129+ LOCK_CNT_DEC(spin_locked);
71130+
71131+ write_unlock(&(tree->dk_lock));
71132+}
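+
+/* A minimal sketch (not part of the original sources) of the nesting
+   these helpers enforce: take the tree lock (read side) before the dk
+   lock, mirroring reiser4_delete_node() in tree.c. Propagating a right
+   delimiting key to the left neighbor is the payload shown. */
+static inline void example_propagate_rd_key(reiser4_tree * tree, znode * node)
+{
+	read_lock_tree(tree);
+	write_lock_dk(tree);
+	if (node->left != NULL)
+		znode_set_rd_key(node->left, znode_get_rd_key(node));
+	write_unlock_dk(tree);
+	read_unlock_tree(tree);
+}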
71133+
71134+/* estimate api. Implementation is in estimate.c */
71135+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71136+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71137+reiser4_block_nr estimate_insert_flow(tree_level);
71138+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71139+reiser4_block_nr calc_estimate_one_insert(tree_level);
71140+reiser4_block_nr estimate_dirty_cluster(struct inode *);
71141+reiser4_block_nr estimate_insert_cluster(struct inode *);
71142+reiser4_block_nr estimate_update_cluster(struct inode *);
71143+
71144+/* __REISER4_TREE_H__ */
71145+#endif
71146+
71147+/* Make Linus happy.
71148+ Local variables:
71149+ c-indentation-style: "K&R"
71150+ mode-name: "LC"
71151+ c-basic-offset: 8
71152+ tab-width: 8
71153+ fill-column: 120
71154+ scroll-step: 1
71155+ End:
71156+*/
71157diff -urN linux-2.6.20.orig/fs/reiser4/tree_mod.c linux-2.6.20/fs/reiser4/tree_mod.c
71158--- linux-2.6.20.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
71159+++ linux-2.6.20/fs/reiser4/tree_mod.c 2007-05-06 14:50:43.887034467 +0400
71160@@ -0,0 +1,386 @@
71161+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71162+ * reiser4/README */
71163+
71164+/*
71165+ * Functions to add/delete new nodes to/from the tree.
71166+ *
71167+ * Functions from this file are used by carry (see carry*) to handle:
71168+ *
71169+ * . insertion of new formatted node into tree
71170+ *
71171+ * . addition of new tree root, increasing tree height
71172+ *
71173+ * . removing tree root, decreasing tree height
71174+ *
71175+ */
71176+
71177+#include "forward.h"
71178+#include "debug.h"
71179+#include "dformat.h"
71180+#include "key.h"
71181+#include "coord.h"
71182+#include "plugin/plugin.h"
71183+#include "jnode.h"
71184+#include "znode.h"
71185+#include "tree_mod.h"
71186+#include "block_alloc.h"
71187+#include "tree_walk.h"
71188+#include "tree.h"
71189+#include "super.h"
71190+
71191+#include <linux/err.h>
71192+
71193+static int add_child_ptr(znode * parent, znode * child);
71194+/* warning only issued if error is not -E_REPEAT */
71195+#define ewarning(error, ...) \
71196+	do { if ((error) != -E_REPEAT) \
71197+		warning(__VA_ARGS__); } while (0)
71198+
71199+/* allocate new node on the @level and immediately on the right of @brother. */
71200+znode * reiser4_new_node(znode * brother /* existing left neighbor
71201+ * of new node */,
71202+ tree_level level /* tree level at which new node is to
71203+ * be allocated */)
71204+{
71205+ znode *result;
71206+ int retcode;
71207+ reiser4_block_nr blocknr;
71208+
71209+ assert("nikita-930", brother != NULL);
71210+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71211+
71212+ retcode = assign_fake_blocknr_formatted(&blocknr);
71213+ if (retcode == 0) {
71214+ result =
71215+ zget(znode_get_tree(brother), &blocknr, NULL, level,
71216+ reiser4_ctx_gfp_mask_get());
71217+ if (IS_ERR(result)) {
71218+ ewarning(PTR_ERR(result), "nikita-929",
71219+ "Cannot allocate znode for carry: %li",
71220+ PTR_ERR(result));
71221+ return result;
71222+ }
71223+ /* cheap test, can be executed even when debugging is off */
71224+ if (!znode_just_created(result)) {
71225+ warning("nikita-2213",
71226+ "Allocated already existing block: %llu",
71227+ (unsigned long long)blocknr);
71228+ zput(result);
71229+ return ERR_PTR(RETERR(-EIO));
71230+ }
71231+
71232+ assert("nikita-931", result != NULL);
71233+ result->nplug = znode_get_tree(brother)->nplug;
71234+ assert("nikita-933", result->nplug != NULL);
71235+
71236+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
71237+ if (retcode == 0) {
71238+ ZF_SET(result, JNODE_CREATED);
71239+ zrelse(result);
71240+ } else {
71241+ zput(result);
71242+ result = ERR_PTR(retcode);
71243+ }
71244+ } else {
71245+ /* failure to allocate new node during balancing.
71246+ This should never happen. Ever. Returning -E_REPEAT
71247+		   is not a viable solution, because "out of disk space"
71248+		   is not a transient error that will go away by itself.
71249+ */
71250+ ewarning(retcode, "nikita-928",
71251+ "Cannot allocate block for carry: %i", retcode);
71252+ result = ERR_PTR(retcode);
71253+ }
71254+ assert("nikita-1071", result != NULL);
71255+ return result;
71256+}
71257+
71258+/* allocate new root and add it to the tree
71259+
71260+ This helper function is called by add_new_root().
71261+
71262+*/
71263+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
71264+ znode * fake /* "fake" znode */ )
71265+{
71266+ reiser4_tree *tree = znode_get_tree(old_root);
71267+ znode *new_root = NULL; /* to shut gcc up */
71268+ int result;
71269+
71270+ assert("nikita-1069", old_root != NULL);
71271+ assert("umka-262", fake != NULL);
71272+ assert("umka-263", tree != NULL);
71273+
71274+ /* "fake" znode---one always hanging just above current root. This
71275+ node is locked when new root is created or existing root is
71276+ deleted. Downward tree traversal takes lock on it before taking
71277+ lock on a root node. This avoids race conditions with root
71278+ manipulations.
71279+
71280+ */
71281+ assert("nikita-1348", znode_above_root(fake));
71282+ assert("nikita-1211", znode_is_root(old_root));
71283+
71284+ result = 0;
71285+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
71286+ warning("nikita-1344", "Tree is too tall: %i", tree->height);
71287+ /* ext2 returns -ENOSPC when it runs out of free inodes with a
71288+ following comment (fs/ext2/ialloc.c:441): Is it really
71289+ ENOSPC?
71290+
71291+ -EXFULL? -EINVAL?
71292+ */
71293+ result = RETERR(-ENOSPC);
71294+ } else {
71295+ /* Allocate block for new root. It's not that
71296+ important where it will be allocated, as root is
71297+		   almost always in memory. Moreover, allocate-on-flush
71298+		   may be applied here anyway.
71299+ */
71300+ assert("nikita-1448", znode_is_root(old_root));
71301+ new_root = reiser4_new_node(fake, tree->height + 1);
71302+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71303+ lock_handle rlh;
71304+
71305+ init_lh(&rlh);
71306+ result =
71307+ longterm_lock_znode(&rlh, new_root,
71308+ ZNODE_WRITE_LOCK,
71309+ ZNODE_LOCK_LOPRI);
71310+ if (result == 0) {
71311+ parent_coord_t *in_parent;
71312+
71313+ znode_make_dirty(fake);
71314+
71315+ /* new root is a child of "fake" node */
71316+ write_lock_tree(tree);
71317+
71318+ ++tree->height;
71319+
71320+ /* recalculate max balance overhead */
71321+ tree->estimate_one_insert =
71322+ estimate_one_insert_item(tree);
71323+
71324+ tree->root_block = *znode_get_block(new_root);
71325+ in_parent = &new_root->in_parent;
71326+ init_parent_coord(in_parent, fake);
71327+ /* manually insert new root into sibling
71328+				 * list. With this, all nodes involved in
71329+ * balancing are connected after balancing is
71330+ * done---useful invariant to check. */
71331+ sibling_list_insert_nolock(new_root, NULL);
71332+ write_unlock_tree(tree);
71333+
71334+ /* insert into new root pointer to the
71335+ @old_root. */
71336+ assert("nikita-1110",
71337+ WITH_DATA(new_root,
71338+ node_is_empty(new_root)));
71339+ write_lock_dk(tree);
71340+ znode_set_ld_key(new_root, reiser4_min_key());
71341+ znode_set_rd_key(new_root, reiser4_max_key());
71342+ write_unlock_dk(tree);
71343+ if (REISER4_DEBUG) {
71344+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71345+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71346+ ZF_SET(old_root, JNODE_ORPHAN);
71347+ }
71348+ result = add_child_ptr(new_root, old_root);
71349+ done_lh(&rlh);
71350+ }
71351+ zrelse(new_root);
71352+ }
71353+ }
71354+ if (result != 0)
71355+ new_root = ERR_PTR(result);
71356+ return new_root;
71357+}
71358+
71359+/* build &reiser4_item_data for inserting child pointer
71360+
71361+ Build &reiser4_item_data that can be later used to insert pointer to @child
71362+ in its parent.
71363+
71364+*/
71365+void build_child_ptr_data(znode * child /* node pointer to which will be
71366+ * inserted */ ,
71367+ reiser4_item_data * data /* where to store result */ )
71368+{
71369+ assert("nikita-1116", child != NULL);
71370+ assert("nikita-1117", data != NULL);
71371+
71372+ /*
71373+ * NOTE: use address of child's blocknr as address of data to be
71374+	 * inserted. As a result, the data gets into the on-disk structure in cpu
71375+	 * byte order. internal's create_hook converts it to little-endian byte
71376+ * order.
71377+ */
71378+ data->data = (char *)znode_get_block(child);
71379+ /* data -> data is kernel space */
71380+ data->user = 0;
71381+ data->length = sizeof(reiser4_block_nr);
71382+ /* FIXME-VS: hardcoded internal item? */
71383+
71384+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71385+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71386+}
71387+
71388+/* add pointer to @child into empty @parent.
71389+
71390+ This is used when pointer to old root is inserted into new root which is
71391+ empty.
71392+*/
71393+static int add_child_ptr(znode * parent, znode * child)
71394+{
71395+ coord_t coord;
71396+ reiser4_item_data data;
71397+ int result;
71398+ reiser4_key key;
71399+
71400+ assert("nikita-1111", parent != NULL);
71401+ assert("nikita-1112", child != NULL);
71402+ assert("nikita-1115",
71403+ znode_get_level(parent) == znode_get_level(child) + 1);
71404+
71405+ result = zload(parent);
71406+ if (result != 0)
71407+ return result;
71408+ assert("nikita-1113", node_is_empty(parent));
71409+ coord_init_first_unit(&coord, parent);
71410+
71411+ build_child_ptr_data(child, &data);
71412+ data.arg = NULL;
71413+
71414+ read_lock_dk(znode_get_tree(parent));
71415+ key = *znode_get_ld_key(child);
71416+ read_unlock_dk(znode_get_tree(parent));
71417+
71418+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71419+ NULL);
71420+ znode_make_dirty(parent);
71421+ zrelse(parent);
71422+ return result;
71423+}
71424+
71425+/* actually remove tree root */
71426+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
71427+ * being removed */,
71428+ znode * old_root /* root node that is being
71429+ * removed */ ,
71430+ znode * new_root /* new root---sole child of
71431+ * @old_root */,
71432+ const reiser4_block_nr * new_root_blk /* disk address of
71433+ * @new_root */)
71434+{
71435+ znode *uber;
71436+ int result;
71437+ lock_handle handle_for_uber;
71438+
71439+ assert("umka-265", tree != NULL);
71440+ assert("nikita-1198", new_root != NULL);
71441+ assert("nikita-1199",
71442+ znode_get_level(new_root) + 1 == znode_get_level(old_root));
71443+
71444+ assert("nikita-1201", znode_is_write_locked(old_root));
71445+
71446+ assert("nikita-1203",
71447+ disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71448+
71449+ init_lh(&handle_for_uber);
71450+ /* obtain and lock "fake" znode protecting changes in tree height. */
71451+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71452+ &handle_for_uber);
71453+ if (result == 0) {
71454+ uber = handle_for_uber.node;
71455+
71456+ znode_make_dirty(uber);
71457+
71458+		/* don't take a long-term lock on @new_root. Take the spinlock. */
71459+
71460+ write_lock_tree(tree);
71461+
71462+ tree->root_block = *new_root_blk;
71463+ --tree->height;
71464+
71465+ /* recalculate max balance overhead */
71466+ tree->estimate_one_insert = estimate_one_insert_item(tree);
71467+
71468+ assert("nikita-1202",
71469+ tree->height == znode_get_level(new_root));
71470+
71471+		/* new root is a child of the "fake" node */
71472+ init_parent_coord(&new_root->in_parent, uber);
71473+ ++uber->c_count;
71474+
71475+ /* sibling_list_insert_nolock(new_root, NULL); */
71476+ write_unlock_tree(tree);
71477+
71478+ /* reinitialise old root. */
71479+ result = node_plugin_by_node(old_root)->init(old_root);
71480+ znode_make_dirty(old_root);
71481+ if (result == 0) {
71482+ assert("nikita-1279", node_is_empty(old_root));
71483+ ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71484+ old_root->c_count = 0;
71485+ }
71486+ }
71487+ done_lh(&handle_for_uber);
71488+
71489+ return result;
71490+}
71491+
71492+/* remove tree root
71493+
71494+ This function removes tree root, decreasing tree height by one. Tree root
71495+ and its only child (that is going to become new tree root) are write locked
71496+ at the entry.
71497+
71498+ To remove tree root we need to take lock on special "fake" znode that
71499+ protects changes of tree height. See comments in reiser4_add_tree_root() for
71500+ more on this.
71501+
71502+   Also, parent pointers have to be updated in the
71503+   old and new root. To simplify the code, the function is split into two parts:
71504+   the outer reiser4_kill_tree_root() collects all necessary arguments and calls
71505+   reiser4_kill_root() to do the actual job.
71506+
71507+*/
71508+int reiser4_kill_tree_root(znode * old_root /* tree root that we are
71509+ removing*/)
71510+{
71511+ int result;
71512+ coord_t down_link;
71513+ znode *new_root;
71514+ reiser4_tree *tree;
71515+
71516+ assert("umka-266", current_tree != NULL);
71517+ assert("nikita-1194", old_root != NULL);
71518+ assert("nikita-1196", znode_is_root(old_root));
71519+ assert("nikita-1200", node_num_items(old_root) == 1);
71520+ assert("nikita-1401", znode_is_write_locked(old_root));
71521+
71522+ coord_init_first_unit(&down_link, old_root);
71523+
71524+ tree = znode_get_tree(old_root);
71525+ new_root = child_znode(&down_link, old_root, 0, 1);
71526+ if (!IS_ERR(new_root)) {
71527+ result =
71528+ reiser4_kill_root(tree, old_root, new_root,
71529+ znode_get_block(new_root));
71530+ zput(new_root);
71531+ } else
71532+ result = PTR_ERR(new_root);
71533+
71534+ return result;
71535+}
71536+
71537+/* Make Linus happy.
71538+ Local variables:
71539+ c-indentation-style: "K&R"
71540+ mode-name: "LC"
71541+ c-basic-offset: 8
71542+ tab-width: 8
71543+ fill-column: 120
71544+ scroll-step: 1
71545+ End:
71546+*/
71547diff -urN linux-2.6.20.orig/fs/reiser4/tree_mod.h linux-2.6.20/fs/reiser4/tree_mod.h
71548--- linux-2.6.20.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
71549+++ linux-2.6.20/fs/reiser4/tree_mod.h 2007-05-06 14:50:43.887034467 +0400
71550@@ -0,0 +1,29 @@
71551+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71552+ * reiser4/README */
71553+
71554+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71555+ * comments. */
71556+
71557+#if !defined( __REISER4_TREE_MOD_H__ )
71558+#define __REISER4_TREE_MOD_H__
71559+
71560+#include "forward.h"
71561+
71562+znode *reiser4_new_node(znode * brother, tree_level level);
71563+znode *reiser4_add_tree_root(znode * old_root, znode * fake);
71564+int reiser4_kill_tree_root(znode * old_root);
71565+void build_child_ptr_data(znode * child, reiser4_item_data * data);
71566+
71567+/* __REISER4_TREE_MOD_H__ */
71568+#endif
71569+
71570+/* Make Linus happy.
71571+ Local variables:
71572+ c-indentation-style: "K&R"
71573+ mode-name: "LC"
71574+ c-basic-offset: 8
71575+ tab-width: 8
71576+ fill-column: 120
71577+ scroll-step: 1
71578+ End:
71579+*/
71580diff -urN linux-2.6.20.orig/fs/reiser4/tree_walk.c linux-2.6.20/fs/reiser4/tree_walk.c
71581--- linux-2.6.20.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
71582+++ linux-2.6.20/fs/reiser4/tree_walk.c 2007-05-06 14:50:43.887034467 +0400
71583@@ -0,0 +1,927 @@
71584+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71585+ * reiser4/README */
71586+
71587+/* Routines and macros to:
71588+
71589+ get_left_neighbor()
71590+
71591+ get_right_neighbor()
71592+
71593+ get_parent()
71594+
71595+ get_first_child()
71596+
71597+ get_last_child()
71598+
71599+ various routines to walk the whole tree and do things to it like
71600+ repack it, or move it to tertiary storage. Please make them as
71601+ generic as is reasonable.
71602+
71603+*/
71604+
71605+#include "forward.h"
71606+#include "debug.h"
71607+#include "dformat.h"
71608+#include "coord.h"
71609+#include "plugin/item/item.h"
71610+#include "jnode.h"
71611+#include "znode.h"
71612+#include "tree_walk.h"
71613+#include "tree.h"
71614+#include "super.h"
71615+
71616+/* These macros are used internally in tree_walk.c in an attempt to make
71617+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor,
71618+ lock_left_neighbor */
71619+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71620+#define FIELD_OFFSET(name) offsetof(znode, name)
71621+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71622+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
71623+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
71624+
71625+/* This is the generic procedure to get and lock `generic' neighbor (left or
71626+   right neighbor or parent). It implements a common algorithm for all cases of
71627+   getting a lock on a neighbor node; only the znode structure field differs in
71628+   each case. This is parameterized by the ptr_offset argument, which is the byte
71629+ offset for the pointer to the desired neighbor within the current node's
71630+ znode structure. This function should be called with the tree lock held */
71631+static int lock_neighbor(
71632+ /* resulting lock handle */
71633+ lock_handle * result,
71634+ /* znode to lock */
71635+ znode * node,
71636+ /* pointer to neighbor (or parent) znode field offset, in bytes from
71637+ the base address of znode structure */
71638+ int ptr_offset,
71639+ /* lock mode for longterm_lock_znode call */
71640+ znode_lock_mode mode,
71641+ /* lock request for longterm_lock_znode call */
71642+ znode_lock_request req,
71643+ /* GN_* flags */
71644+ int flags, int rlocked)
71645+{
71646+ reiser4_tree *tree = znode_get_tree(node);
71647+ znode *neighbor;
71648+ int ret;
71649+
71650+ assert("umka-236", node != NULL);
71651+ assert("umka-237", tree != NULL);
71652+ assert_rw_locked(&(tree->tree_lock));
71653+
71654+ if (flags & GN_TRY_LOCK)
71655+ req |= ZNODE_LOCK_NONBLOCK;
71656+ if (flags & GN_SAME_ATOM)
71657+ req |= ZNODE_LOCK_DONT_FUSE;
71658+
71659+	/* get the neighbor's address by using the sibling link; quit the while
71660+	   loop (and return) if the link is not available. */
71661+ while (1) {
71662+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71663+
71664+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
71665+ * node pointed by it is not connected.
71666+ *
71667+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
71668+ * check and allows passing reference to not connected znode to
71669+ * subsequent longterm_lock_znode() call. This kills possible
71670+ * busy loop if we are trying to get longterm lock on locked but
71671+ * not yet connected parent node. */
71672+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71673+ || znode_is_connected(neighbor))) {
71674+ return RETERR(-E_NO_NEIGHBOR);
71675+ }
71676+
71677+ /* protect it from deletion. */
71678+ zref(neighbor);
71679+
71680+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71681+
71682+ ret = longterm_lock_znode(result, neighbor, mode, req);
71683+
71684+		/* The lock handle obtains its own reference; release the one from above. */
71685+ zput(neighbor);
71686+
71687+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71688+
71689+		/* restart if the node we got a reference to is being
71690+		   invalidated; we must not take a reference to this node
71691+		   again. */
71692+ if (ret == -EINVAL)
71693+ continue;
71694+ if (ret)
71695+ return ret;
71696+
71697+		/* check if the neighbor link still points to the just-locked
71698+		   znode; the link could have changed while the process slept. */
71699+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
71700+ return 0;
71701+
71702+		/* the znode was locked by mistake; unlock it and restart the
71703+		   locking process from the beginning. */
71704+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71705+ longterm_unlock_znode(result);
71706+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71707+ }
71708+}
71709+
71710+/* get the parent node with a long-term lock; accepts GN_* flags. */
71711+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
71712+ znode * node /* child node */ ,
71713+ znode_lock_mode mode
71714+ /* type of lock: read or write */ ,
71715+ int flags /* GN_* flags */ )
71716+{
71717+ int result;
71718+
71719+ read_lock_tree(znode_get_tree(node));
71720+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
71721+ ZNODE_LOCK_HIPRI, flags, 1);
71722+ read_unlock_tree(znode_get_tree(node));
71723+ return result;
71724+}
71725+
71726+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
71727+ bit in @flags parameter */
71728+/* Audited by: umka (2002.06.14) */
71729+static inline int
71730+lock_side_neighbor(lock_handle * result,
71731+ znode * node, znode_lock_mode mode, int flags, int rlocked)
71732+{
71733+ int ret;
71734+ int ptr_offset;
71735+ znode_lock_request req;
71736+
71737+ if (flags & GN_GO_LEFT) {
71738+ ptr_offset = LEFT_PTR_OFFSET;
71739+ req = ZNODE_LOCK_LOPRI;
71740+ } else {
71741+ ptr_offset = RIGHT_PTR_OFFSET;
71742+ req = ZNODE_LOCK_HIPRI;
71743+ }
71744+
71745+ ret =
71746+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
71747+
71748+	if (ret == -E_NO_NEIGHBOR)	/* when we walk left or right, -E_NO_NEIGHBOR
71749+					 * does not guarantee that the neighbor is
71750+					 * absent from the tree; in this case we
71751+					 * return -ENOENT, meaning the neighbor was
71752+					 * at least not found in the cache */
71753+ return RETERR(-ENOENT);
71754+
71755+ return ret;
71756+}
71757+
71758+#if REISER4_DEBUG
71759+
71760+int check_sibling_list(znode * node)
71761+{
71762+ znode *scan;
71763+ znode *next;
71764+
71765+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
71766+
71767+ if (node == NULL)
71768+ return 1;
71769+
71770+ if (ZF_ISSET(node, JNODE_RIP))
71771+ return 1;
71772+
71773+ assert("nikita-3270", node != NULL);
71774+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
71775+
71776+ for (scan = node; znode_is_left_connected(scan); scan = next) {
71777+ next = scan->left;
71778+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71779+ assert("nikita-3271", znode_is_right_connected(next));
71780+ assert("nikita-3272", next->right == scan);
71781+ } else
71782+ break;
71783+ }
71784+ for (scan = node; znode_is_right_connected(scan); scan = next) {
71785+ next = scan->right;
71786+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71787+ assert("nikita-3273", znode_is_left_connected(next));
71788+ assert("nikita-3274", next->left == scan);
71789+ } else
71790+ break;
71791+ }
71792+ return 1;
71793+}
71794+
71795+#endif
71796+
71797+/* Znode sibling pointer maintenance. */
71798+
71799+/* Znode sibling pointers are established between any neighboring nodes which
71800+ are in the cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
71801+ JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
71802+ up-to-date value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
71803+
71804+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
71805+ take care of finding znode neighbors (a hash table lookup may be required),
71806+ establishing sibling pointers between them, and setting the
71807+ JNODE_*_CONNECTED state bits. */
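+
+/* To illustrate the bit semantics (a sketch, not a definitive statement of
+ * the interface): under the tree lock,
+ *
+ *	if (znode_is_left_connected(node) && node->left == NULL)
+ *		the left neighbor is known to be absent from the cache;
+ *	if (znode_is_left_connected(node) && node->left != NULL)
+ *		node->left is a usable sibling pointer;
+ *	if (!znode_is_left_connected(node))
+ *		node->left must not be trusted.
+ */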
71808+
71809+/* adjusts sibling pointers and `connected' states for two
71810+ neighbors; works even if one neighbor is NULL (was not found). */
71811+
71812+/* FIXME-VS: this was made non-static for use in prepare_twig_cut() in tree.c */
71813+void link_left_and_right(znode * left, znode * right)
71814+{
71815+ assert("nikita-3275", check_sibling_list(left));
71816+ assert("nikita-3275", check_sibling_list(right));
71817+
71818+ if (left != NULL) {
71819+ if (left->right == NULL) {
71820+ left->right = right;
71821+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
71822+
71823+ ON_DEBUG(left->right_version =
71824+ atomic_inc_return(&delim_key_version);
71825+ );
71826+
71827+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
71828+ && left->right != right) {
71829+
71830+ ON_DEBUG(left->right->left_version =
71831+ atomic_inc_return(&delim_key_version);
71832+ left->right_version =
71833+ atomic_inc_return(&delim_key_version););
71834+
71835+ left->right->left = NULL;
71836+ left->right = right;
71837+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
71838+ } else
71839+			/*
71840+			 * there is a race condition in renew_sibling_link(),
71841+			 * and the assertions below check that it is the only
71842+			 * one there. Thread T1 calls renew_sibling_link()
71843+			 * without the GN_NO_ALLOC flag. zlook() doesn't find
71844+			 * the neighbor node, but before T1 gets to
71845+			 * link_left_and_right(), another thread T2 creates
71846+			 * the neighbor node and connects it. The check for
71847+			 * left->right == NULL above protects T1 from
71848+			 * overwriting the correct left->right pointer
71849+			 * installed by T2.
71850+			 */
71851+ assert("nikita-3302",
71852+ right == NULL || left->right == right);
71853+ }
71854+ if (right != NULL) {
71855+ if (right->left == NULL) {
71856+ right->left = left;
71857+ ZF_SET(right, JNODE_LEFT_CONNECTED);
71858+
71859+ ON_DEBUG(right->left_version =
71860+ atomic_inc_return(&delim_key_version);
71861+ );
71862+
71863+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
71864+ && right->left != left) {
71865+
71866+ ON_DEBUG(right->left->right_version =
71867+ atomic_inc_return(&delim_key_version);
71868+ right->left_version =
71869+ atomic_inc_return(&delim_key_version););
71870+
71871+ right->left->right = NULL;
71872+ right->left = left;
71873+ ZF_SET(right, JNODE_LEFT_CONNECTED);
71874+
71875+ } else
71876+ assert("nikita-3303",
71877+ left == NULL || right->left == left);
71878+ }
71879+ assert("nikita-3275", check_sibling_list(left));
71880+ assert("nikita-3275", check_sibling_list(right));
71881+}
71882+
71883+/* Audited by: umka (2002.06.14) */
71884+static void link_znodes(znode * first, znode * second, int to_left)
71885+{
71886+ if (to_left)
71887+ link_left_and_right(second, first);
71888+ else
71889+ link_left_and_right(first, second);
71890+}
71891+
71892+/* Advances the coord to the next unit position (to the left or to the right,
71893+   depending on the GN_GO_LEFT bit in flags) in the horizontal direction,
71894+   even across a node boundary. Must be called under the tree lock, which
71895+   guarantees the continued absence of a sibling link on the parent level
71896+   when lock_side_neighbor() fails with -ENOENT. */
71897+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
71898+{
71899+ int ret;
71900+ znode *node;
71901+ reiser4_tree *tree;
71902+
71903+ assert("umka-243", coord != NULL);
71904+ assert("umka-244", handle != NULL);
71905+ assert("zam-1069", handle->node == NULL);
71906+
71907+ ret =
71908+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
71909+ coord_next_unit(coord);
71910+ if (!ret)
71911+ return 0;
71912+
71913+ ret =
71914+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
71915+ if (ret)
71916+ return ret;
71917+
71918+ node = handle->node;
71919+ tree = znode_get_tree(node);
71920+ write_unlock_tree(tree);
71921+
71922+ coord_init_zero(coord);
71923+
71924+	/* We avoid a synchronous read here if the GN_ASYNC flag is set. */
71925+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
71926+ ret = jstartio(ZJNODE(handle->node));
71927+ if (!ret)
71928+ ret = -E_REPEAT;
71929+ goto error_locked;
71930+ }
71931+
71932+	/* the corresponding zrelse() must be called by the clients of
71933+	   far_next_coord(), at the point where this node gets unlocked. */
71934+ ret = zload(handle->node);
71935+ if (ret)
71936+ goto error_locked;
71937+
71938+ if (flags & GN_GO_LEFT)
71939+ coord_init_last_unit(coord, node);
71940+ else
71941+ coord_init_first_unit(coord, node);
71942+
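+	/* the `if (0)' block below is only reachable via the goto statements
+	 * above: on the error paths the just-locked neighbor is unlocked, and
+	 * all paths re-take the tree lock before returning. */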
71943+ if (0) {
71944+ error_locked:
71945+ longterm_unlock_znode(handle);
71946+ }
71947+ write_lock_tree(tree);
71948+ return ret;
71949+}
71950+
71951+/* A very significant function which performs a step in the horizontal
71952+   direction when a sibling pointer is not available; in fact, it is the only
71953+   function which does so.
71954+   Note: this function does not restore the locking status at exit; the
71955+   caller must take care of proper unlocking and zrelse-ing. */
71956+static int
71957+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
71958+ tree_level level, int flags, int *nr_locked)
71959+{
71960+ int ret;
71961+ int to_left = flags & GN_GO_LEFT;
71962+ reiser4_block_nr da;
71963+	/* parent of the neighbor node; we set it to the child's parent until
71964+	   we detect that the child and the neighbor do not share one parent */
71965+ znode *side_parent = coord->node;
71966+ reiser4_tree *tree = znode_get_tree(child);
71967+ znode *neighbor = NULL;
71968+
71969+ assert("umka-245", coord != NULL);
71970+ assert("umka-246", handle != NULL);
71971+ assert("umka-247", child != NULL);
71972+ assert("umka-303", tree != NULL);
71973+
71974+ init_lh(handle);
71975+ write_lock_tree(tree);
71976+ ret = far_next_coord(coord, handle, flags);
71977+
71978+ if (ret) {
71979+ if (ret != -ENOENT) {
71980+ write_unlock_tree(tree);
71981+ return ret;
71982+ }
71983+ } else {
71984+ item_plugin *iplug;
71985+
71986+ if (handle->node != NULL) {
71987+ (*nr_locked)++;
71988+ side_parent = handle->node;
71989+ }
71990+
71991+		/* does the coord object point to an internal item? We do not
71992+		   support sibling pointers between znodes for formatted and
71993+		   unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
71994+ iplug = item_plugin_by_coord(coord);
71995+ if (!item_is_internal(coord)) {
71996+ link_znodes(child, NULL, to_left);
71997+ write_unlock_tree(tree);
71998+			/* we know there can't be a formatted neighbor */
71999+ return RETERR(-E_NO_NEIGHBOR);
72000+ }
72001+ write_unlock_tree(tree);
72002+
72003+ iplug->s.internal.down_link(coord, NULL, &da);
72004+
72005+ if (flags & GN_NO_ALLOC) {
72006+ neighbor = zlook(tree, &da);
72007+ } else {
72008+ neighbor =
72009+ zget(tree, &da, side_parent, level,
72010+ reiser4_ctx_gfp_mask_get());
72011+ }
72012+
72013+ if (IS_ERR(neighbor)) {
72014+ ret = PTR_ERR(neighbor);
72015+ return ret;
72016+ }
72017+
72018+ if (neighbor)
72019+ /* update delimiting keys */
72020+ set_child_delimiting_keys(coord->node, coord, neighbor);
72021+
72022+ write_lock_tree(tree);
72023+ }
72024+
72025+ if (likely(neighbor == NULL ||
72026+ (znode_get_level(child) == znode_get_level(neighbor)
72027+ && child != neighbor)))
72028+ link_znodes(child, neighbor, to_left);
72029+ else {
72030+ warning("nikita-3532",
72031+ "Sibling nodes on the different levels: %i != %i\n",
72032+ znode_get_level(child), znode_get_level(neighbor));
72033+ ret = RETERR(-EIO);
72034+ }
72035+
72036+ write_unlock_tree(tree);
72037+
72038+	/* if GN_NO_ALLOC isn't set, we keep a reference to the neighbor znode */
72039+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
72040+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72041+ zput(neighbor);
72042+
72043+ return ret;
72044+}
72045+
72046+/* This function establishes a one-side relation. */
72047+/* Audited by: umka (2002.06.14) */
72048+static int connect_one_side(coord_t * coord, znode * node, int flags)
72049+{
72050+ coord_t local;
72051+ lock_handle handle;
72052+ int nr_locked;
72053+ int ret;
72054+
72055+ assert("umka-248", coord != NULL);
72056+ assert("umka-249", node != NULL);
72057+
72058+ coord_dup_nocheck(&local, coord);
72059+
72060+ init_lh(&handle);
72061+
72062+ ret =
72063+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
72064+ flags | GN_NO_ALLOC, &nr_locked);
72065+
72066+ if (handle.node != NULL) {
72067+ /* complementary operations for zload() and lock() in far_next_coord() */
72068+ zrelse(handle.node);
72069+ longterm_unlock_znode(&handle);
72070+ }
72071+
72072+	/* we swallow error codes which are not interesting to us, because we
72073+	   run renew_sibling_link() only to connect znodes. */
72074+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72075+ return 0;
72076+
72077+ return ret;
72078+}
72079+
72080+/* if @child is not in the `connected' state, perform hash searches for the
72081+ left and right neighbor nodes and establish horizontal sibling links */
72082+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72083+int connect_znode(coord_t * parent_coord, znode * child)
72084+{
72085+ reiser4_tree *tree = znode_get_tree(child);
72086+ int ret = 0;
72087+
72088+ assert("zam-330", parent_coord != NULL);
72089+ assert("zam-331", child != NULL);
72090+ assert("zam-332", parent_coord->node != NULL);
72091+ assert("umka-305", tree != NULL);
72092+
72093+ /* it is trivial to `connect' root znode because it can't have
72094+ neighbors */
72095+ if (znode_above_root(parent_coord->node)) {
72096+ child->left = NULL;
72097+ child->right = NULL;
72098+ ZF_SET(child, JNODE_LEFT_CONNECTED);
72099+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
72100+
72101+ ON_DEBUG(child->left_version =
72102+ atomic_inc_return(&delim_key_version);
72103+ child->right_version =
72104+ atomic_inc_return(&delim_key_version););
72105+
72106+ return 0;
72107+ }
72108+
72109+ /* load parent node */
72110+ coord_clear_iplug(parent_coord);
72111+ ret = zload(parent_coord->node);
72112+
72113+ if (ret != 0)
72114+ return ret;
72115+
72116+ /* protect `connected' state check by tree_lock */
72117+ read_lock_tree(tree);
72118+
72119+ if (!znode_is_right_connected(child)) {
72120+ read_unlock_tree(tree);
72121+ /* connect right (default is right) */
72122+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72123+ if (ret)
72124+ goto zrelse_and_ret;
72125+
72126+ read_lock_tree(tree);
72127+ }
72128+
72129+ ret = znode_is_left_connected(child);
72130+
72131+ read_unlock_tree(tree);
72132+
72133+ if (!ret) {
72134+ ret =
72135+ connect_one_side(parent_coord, child,
72136+ GN_NO_ALLOC | GN_GO_LEFT);
72137+ } else
72138+ ret = 0;
72139+
72140+ zrelse_and_ret:
72141+ zrelse(parent_coord->node);
72142+
72143+ return ret;
72144+}
72145+
72146+/* this function is like renew_sibling_link(), but allocates the neighbor node
72147+   if it doesn't exist and `connects' it. It may require making two steps in
72148+   the horizontal direction: the first to find/allocate the neighbor node, the
72149+   second to find the neighbor's neighbor in order to connect the freshly
72150+   allocated znode. */
72151+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72152+static int
72153+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72154+{
72155+ coord_t local;
72156+ lock_handle empty[2];
72157+ reiser4_tree *tree = znode_get_tree(node);
72158+ znode *neighbor = NULL;
72159+ int nr_locked = 0;
72160+ int ret;
72161+
72162+ assert("umka-250", coord != NULL);
72163+ assert("umka-251", node != NULL);
72164+ assert("umka-307", tree != NULL);
72165+ assert("umka-308", level <= tree->height);
72166+
72167+ /* umka (2002.06.14)
72168+	   There should probably be a check here for the validity of the
72169+	   given "level": something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72170+ */
72171+
72172+ coord_dup(&local, coord);
72173+
72174+ ret =
72175+ renew_sibling_link(&local, &empty[0], node, level,
72176+ flags & ~GN_NO_ALLOC, &nr_locked);
72177+ if (ret)
72178+ goto out;
72179+
72180+	/* the tree lock is not needed here because we keep the parent node(s)
72181+	   locked and the neighbor znode's reference count incremented */
72182+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72183+
72184+ read_lock_tree(tree);
72185+ ret = znode_is_connected(neighbor);
72186+ read_unlock_tree(tree);
72187+ if (ret) {
72188+ ret = 0;
72189+ goto out;
72190+ }
72191+
72192+ ret =
72193+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72194+ flags | GN_NO_ALLOC, &nr_locked);
72195+ /* second renew_sibling_link() call is used for znode connection only,
72196+ so we can live with these errors */
72197+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72198+ ret = 0;
72199+
72200+ out:
72201+
72202+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
72203+ zrelse(empty[nr_locked].node);
72204+ longterm_unlock_znode(&empty[nr_locked]);
72205+ }
72206+
72207+ if (neighbor != NULL)
72208+ /* decrement znode reference counter without actually
72209+ releasing it. */
72210+ atomic_dec(&ZJNODE(neighbor)->x_count);
72211+
72212+ return ret;
72213+}
72214+
72215+/*
72216+  reiser4_get_neighbor() -- lock a node's neighbor.
72217+
72218+  reiser4_get_neighbor() locks a node's neighbor (left or right, depending on
72219+  the given parameter) using the sibling link to it. If the sibling link is
72220+  not available (i.e. the neighbor znode is not in the cache) and the flags
72221+  allow reading blocks, we go one level up for information about the
72222+  neighbor's disk address. We lock the node's parent; if it is the common
72223+  parent of both 'node' and its neighbor, the neighbor's disk address is in
72224+  the next (to the left or to the right) downlink from the link that points
72225+  to the original node. If not, we need to lock the parent's neighbor, read
72226+  its content and take the first (last) downlink with the neighbor's disk
72227+  address. That locking can be done via the sibling link and lock_neighbor(),
72228+  if the sibling link exists. Otherwise we go up a level again until we find
72229+  a common parent or a valid sibling link, and then go down
72230+  allocating/connecting/locking/reading nodes until the neighbor of the first node is locked.
72231+
72232+  @neighbor: resulting lock handle,
72233+  @node: the node whose neighbor we lock,
72234+  @lock_mode: lock mode {LM_READ, LM_WRITE},
72235+  @flags: logical OR of a subset of the {GN_*} flags (see description above).
72236+
72237+  @return: 0 on success, a negative value if the lock was impossible due to
72238+  an error or the lack of a neighbor node.
72239+*/
72240+
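+/* Illustration (a sketch only): to lock the right neighbor R of node N when
+ * N->right is absent, we lock N's parent P. If R shares P, R's block address
+ * is in the downlink immediately to the right of the one pointing to N;
+ * otherwise the same procedure is repeated one level up (on P and P's right
+ * neighbor) until a common ancestor or a usable sibling link is found, and
+ * then we descend again:
+ *
+ *	        [  P  ]------[  P' ]
+ *	       /       \           \
+ *	   [ ... ]    [ N ]       [ R ]
+ */
+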
72241+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72242+int
72243+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72244+ znode_lock_mode lock_mode, int flags)
72245+{
72246+ reiser4_tree *tree = znode_get_tree(node);
72247+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
72248+
72249+ coord_t coord;
72250+
72251+ tree_level base_level;
72252+ tree_level h = 0;
72253+ int ret;
72254+
72255+ assert("umka-252", tree != NULL);
72256+ assert("umka-253", neighbor != NULL);
72257+ assert("umka-254", node != NULL);
72258+
72259+ base_level = znode_get_level(node);
72260+
72261+ assert("umka-310", base_level <= tree->height);
72262+
72263+ coord_init_zero(&coord);
72264+
72265+ again:
72266+	/* first, we try the simple lock_side_neighbor(), which requires the
72267+	   sibling link to exist */
72268+ read_lock_tree(tree);
72269+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72270+ read_unlock_tree(tree);
72271+ if (!ret) {
72272+		/* load the znode content if GN_LOAD_NEIGHBOR was specified */
72273+ if (flags & GN_LOAD_NEIGHBOR) {
72274+ ret = zload(node);
72275+ if (ret)
72276+ longterm_unlock_znode(neighbor);
72277+ }
72278+ return ret;
72279+ }
72280+
72281+ /* only -ENOENT means we may look upward and try to connect
72282+ @node with its neighbor (if @flags allow us to do it) */
72283+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72284+ return ret;
72285+
72286+	/* before establishing the sibling link we lock the parent node;
72287+	   renew_neighbor() requires this to work. */
72288+ init_lh(&path[0]);
72289+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72290+ if (ret)
72291+ return ret;
72292+ if (znode_above_root(path[0].node)) {
72293+ longterm_unlock_znode(&path[0]);
72294+ return RETERR(-E_NO_NEIGHBOR);
72295+ }
72296+
72297+ while (1) {
72298+ znode *child = (h == 0) ? node : path[h - 1].node;
72299+ znode *parent = path[h].node;
72300+
72301+ ret = zload(parent);
72302+ if (ret)
72303+ break;
72304+
72305+ ret = find_child_ptr(parent, child, &coord);
72306+
72307+ if (ret) {
72308+ zrelse(parent);
72309+ break;
72310+ }
72311+
72312+ /* try to establish missing sibling link */
72313+ ret = renew_neighbor(&coord, child, h + base_level, flags);
72314+
72315+ zrelse(parent);
72316+
72317+ switch (ret) {
72318+ case 0:
72319+			/* unlocking the parent znode prevents a simple
72320+			   deadlock situation */
72321+ done_lh(&path[h]);
72322+
72323+			/* depending on the tree level we are at, we repeat
72324+			   the first locking attempt ... */
72325+ if (h == 0)
72326+ goto again;
72327+
72328+ /* ... or repeat establishing of sibling link at
72329+ one level below. */
72330+ --h;
72331+ break;
72332+
72333+ case -ENOENT:
72334+ /* sibling link is not available -- we go
72335+ upward. */
72336+ init_lh(&path[h + 1]);
72337+ ret =
72338+ reiser4_get_parent(&path[h + 1], parent,
72339+ ZNODE_READ_LOCK);
72340+ if (ret)
72341+ goto fail;
72342+ ++h;
72343+ if (znode_above_root(path[h].node)) {
72344+ ret = RETERR(-E_NO_NEIGHBOR);
72345+ goto fail;
72346+ }
72347+ break;
72348+
72349+ case -E_DEADLOCK:
72350+			/* there was a lock request from a hi-pri locker.
72351+			   If possible, we unlock the last parent node and
72352+			   then lock it again. */
72353+ for (; reiser4_check_deadlock(); h--) {
72354+ done_lh(&path[h]);
72355+ if (h == 0)
72356+ goto fail;
72357+ }
72358+
72359+ break;
72360+
72361+ default: /* other errors. */
72362+ goto fail;
72363+ }
72364+ }
72365+ fail:
72366+ ON_DEBUG(check_lock_node_data(node));
72367+ ON_DEBUG(check_lock_data());
72368+
72369+ /* unlock path */
72370+ do {
72371+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72372+ fail; path[0] is already done_lh-ed, therefore
72373+ longterm_unlock_znode(&path[h]); is not applicable */
72374+ done_lh(&path[h]);
72375+ --h;
72376+ } while (h + 1 != 0);
72377+
72378+ return ret;
72379+}
72380+
72381+/* remove node from sibling list */
72382+/* Audited by: umka (2002.06.14) */
72383+void sibling_list_remove(znode * node)
72384+{
72385+ reiser4_tree *tree;
72386+
72387+	assert("umka-255", node != NULL);
72388+	tree = znode_get_tree(node);
72389+ assert_rw_write_locked(&(tree->tree_lock));
72390+ assert("nikita-3275", check_sibling_list(node));
72391+
72392+ write_lock_dk(tree);
72393+ if (znode_is_right_connected(node) && node->right != NULL &&
72394+ znode_is_left_connected(node) && node->left != NULL) {
72395+ assert("zam-32245",
72396+ keyeq(znode_get_rd_key(node),
72397+ znode_get_ld_key(node->right)));
72398+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72399+ }
72400+ write_unlock_dk(tree);
72401+
72402+ if (znode_is_right_connected(node) && node->right != NULL) {
72403+ assert("zam-322", znode_is_left_connected(node->right));
72404+ node->right->left = node->left;
72405+ ON_DEBUG(node->right->left_version =
72406+ atomic_inc_return(&delim_key_version);
72407+ );
72408+ }
72409+ if (znode_is_left_connected(node) && node->left != NULL) {
72410+ assert("zam-323", znode_is_right_connected(node->left));
72411+ node->left->right = node->right;
72412+ ON_DEBUG(node->left->right_version =
72413+ atomic_inc_return(&delim_key_version);
72414+ );
72415+ }
72416+
72417+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72418+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72419+ ON_DEBUG(node->left = node->right = NULL;
72420+ node->left_version = atomic_inc_return(&delim_key_version);
72421+ node->right_version = atomic_inc_return(&delim_key_version););
72422+ assert("nikita-3276", check_sibling_list(node));
72423+}
72424+
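+/* Note: sibling_list_remove() above splices the node's neighbors together
+ * (right->left = node->left, left->right = node->right) and propagates the
+ * delimiting key, while sibling_list_drop() below merely severs the links,
+ * leaving the former neighbors unconnected on that side. */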
72425+/* disconnect node from sibling list */
72426+void sibling_list_drop(znode * node)
72427+{
72428+ znode *right;
72429+ znode *left;
72430+
72431+ assert("nikita-2464", node != NULL);
72432+ assert("nikita-3277", check_sibling_list(node));
72433+
72434+ right = node->right;
72435+ if (right != NULL) {
72436+ assert("nikita-2465", znode_is_left_connected(right));
72437+ right->left = NULL;
72438+ ON_DEBUG(right->left_version =
72439+ atomic_inc_return(&delim_key_version);
72440+ );
72441+ }
72442+ left = node->left;
72443+ if (left != NULL) {
72444+ assert("zam-323", znode_is_right_connected(left));
72445+ left->right = NULL;
72446+ ON_DEBUG(left->right_version =
72447+ atomic_inc_return(&delim_key_version);
72448+ );
72449+ }
72450+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72451+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72452+ ON_DEBUG(node->left = node->right = NULL;
72453+ node->left_version = atomic_inc_return(&delim_key_version);
72454+ node->right_version = atomic_inc_return(&delim_key_version););
72455+}
72456+
72457+/* Insert a new node into the sibling list. Regular balancing inserts the new
72458+   node after (to the right of) an existing, locked node (@before), except in
72459+   the one case of adding a new tree root node; @before must be NULL then. */
72460+void sibling_list_insert_nolock(znode * new, znode * before)
72461+{
72462+ assert("zam-334", new != NULL);
72463+ assert("nikita-3298", !znode_is_left_connected(new));
72464+ assert("nikita-3299", !znode_is_right_connected(new));
72465+ assert("nikita-3300", new->left == NULL);
72466+ assert("nikita-3301", new->right == NULL);
72467+ assert("nikita-3278", check_sibling_list(new));
72468+ assert("nikita-3279", check_sibling_list(before));
72469+
72470+ if (before != NULL) {
72471+ assert("zam-333", znode_is_connected(before));
72472+ new->right = before->right;
72473+ new->left = before;
72474+ ON_DEBUG(new->right_version =
72475+ atomic_inc_return(&delim_key_version);
72476+ new->left_version =
72477+ atomic_inc_return(&delim_key_version););
72478+ if (before->right != NULL) {
72479+ before->right->left = new;
72480+ ON_DEBUG(before->right->left_version =
72481+ atomic_inc_return(&delim_key_version);
72482+ );
72483+ }
72484+ before->right = new;
72485+ ON_DEBUG(before->right_version =
72486+ atomic_inc_return(&delim_key_version);
72487+ );
72488+ } else {
72489+ new->right = NULL;
72490+ new->left = NULL;
72491+ ON_DEBUG(new->right_version =
72492+ atomic_inc_return(&delim_key_version);
72493+ new->left_version =
72494+ atomic_inc_return(&delim_key_version););
72495+ }
72496+ ZF_SET(new, JNODE_LEFT_CONNECTED);
72497+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
72498+ assert("nikita-3280", check_sibling_list(new));
72499+ assert("nikita-3281", check_sibling_list(before));
72500+}
72501+
72502+/*
72503+ Local variables:
72504+ c-indentation-style: "K&R"
72505+ mode-name: "LC"
72506+ c-basic-offset: 8
72507+ tab-width: 8
72508+ fill-column: 80
72509+ End:
72510+*/
72511diff -urN linux-2.6.20.orig/fs/reiser4/tree_walk.h linux-2.6.20/fs/reiser4/tree_walk.h
72512--- linux-2.6.20.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
72513+++ linux-2.6.20/fs/reiser4/tree_walk.h 2007-05-06 14:50:43.887034467 +0400
72514@@ -0,0 +1,125 @@
72515+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72516+
72517+/* definitions of reiser4 tree walk functions */
72518+
72519+#ifndef __FS_REISER4_TREE_WALK_H__
72520+#define __FS_REISER4_TREE_WALK_H__
72521+
72522+#include "debug.h"
72523+#include "forward.h"
72524+
72525+/* establishes horizontal links between cached znodes */
72526+int connect_znode(coord_t * coord, znode * node);
72527+
72528+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72529+ have the following common arguments:
72530+
72531+ return codes:
72532+
72533+ @return : 0 - OK,
72534+
72535+ZAM-FIXME-HANS: wrong return code name. Change them all.
72536+	 -ENOENT - the neighbor is not in the cache, which is detected by
72537+	 the absence of a sibling link.
72538+
72539+	 -E_NO_NEIGHBOR - we are sure that the neighbor (or parent) node cannot
72540+	 be found (because we are the left-/right-most node of
72541+	 the tree, for example). Also, this return code is used
72542+	 by reiser4_get_parent() when we see no parent link -- it
72543+	 means that our node is the root node.
72544+
72545+	 -E_DEADLOCK - deadlock detected (a request from a high-priority
72546+	 process was received); other error codes conform to
72547+	 /usr/include/asm/errno.h .
72548+*/
72549+
72550+int
72551+reiser4_get_parent_flags(lock_handle * result, znode * node,
72552+ znode_lock_mode mode, int flags);
72553+
72554+/* bits definition for reiser4_get_neighbor function `flags' arg. */
72555+typedef enum {
72556+	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
72557+	 * try to find a not-yet-allocated, not-connected neighbor by going
72558+	 * through upper levels */
72559+ GN_CAN_USE_UPPER_LEVELS = 0x1,
72560+	/* lock the left neighbor instead of the right one */
72561+ GN_GO_LEFT = 0x2,
72562+ /* automatically load neighbor node content */
72563+ GN_LOAD_NEIGHBOR = 0x4,
72564+ /* return -E_REPEAT if can't lock */
72565+ GN_TRY_LOCK = 0x8,
72566+	/* used internally in tree_walk.c; causes renew_sibling_link() to not
72567+	   allocate a neighbor znode, but only search for it in the znode cache */
72568+ GN_NO_ALLOC = 0x10,
72569+ /* do not go across atom boundaries */
72570+ GN_SAME_ATOM = 0x20,
72571+	/* allow locking of not-connected nodes */
72572+ GN_ALLOW_NOT_CONNECTED = 0x40,
72573+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
72574+ GN_ASYNC = 0x80
72575+} znode_get_neigbor_flags;
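+
+/* A usage sketch (hypothetical caller, for illustration only): lock the
+ * right neighbor for reading, climbing through upper levels if the sibling
+ * link is missing:
+ *
+ *	lock_handle lh;
+ *
+ *	init_lh(&lh);
+ *	ret = reiser4_get_neighbor(&lh, node, ZNODE_READ_LOCK,
+ *				   GN_CAN_USE_UPPER_LEVELS);
+ *	if (ret == 0) {
+ *		... use lh.node under the long-term lock ...
+ *		done_lh(&lh);
+ *	}
+ */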
72576+
72577+/* A commonly used wrapper for reiser4_get_parent_flags(). */
72578+static inline int reiser4_get_parent(lock_handle * result, znode * node,
72579+ znode_lock_mode mode)
72580+{
72581+ return reiser4_get_parent_flags(result, node, mode,
72582+ GN_ALLOW_NOT_CONNECTED);
72583+}
72584+
72585+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72586+ znode_lock_mode lock_mode, int flags);
72587+
72588+/* these are wrappers for the most common usages of reiser4_get_neighbor() */
72589+static inline int
72590+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
72591+ int flags)
72592+{
72593+ return reiser4_get_neighbor(result, node, lock_mode,
72594+ flags | GN_GO_LEFT);
72595+}
72596+
72597+static inline int
72598+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
72599+ int flags)
72600+{
72601+ ON_DEBUG(check_lock_node_data(node));
72602+ ON_DEBUG(check_lock_data());
72603+ return reiser4_get_neighbor(result, node, lock_mode,
72604+ flags & (~GN_GO_LEFT));
72605+}
72606+
72607+extern void sibling_list_remove(znode * node);
72608+extern void sibling_list_drop(znode * node);
72609+extern void sibling_list_insert_nolock(znode * new, znode * before);
72610+extern void link_left_and_right(znode * left, znode * right);
72611+
72612+/* Functions called by tree_walk() when tree_walk() ... */
72613+struct tree_walk_actor {
72614+ /* ... meets a formatted node, */
72615+ int (*process_znode) (tap_t *, void *);
72616+ /* ... meets an extent, */
72617+ int (*process_extent) (tap_t *, void *);
72618+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
72619+ * node or extent processing functions. */
72620+ int (*before) (void *);
72621+};
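+
+/* A minimal illustrative actor (hypothetical names, a sketch only -- not
+ * part of this interface's contract): count the formatted nodes and extents
+ * visited by tree_walk(); each callback presumably returns 0 to continue,
+ * or -E_REPEAT to restart traversal (per the comment above):
+ *
+ *	static int twa_count(tap_t *tap, void *opaque)
+ *	{
+ *		(*(long *)opaque)++;
+ *		return 0;
+ *	}
+ *
+ *	struct tree_walk_actor actor = {
+ *		.process_znode  = twa_count,
+ *		.process_extent = twa_count,
+ *		.before         = NULL,
+ *	};
+ */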
72622+
72623+#if REISER4_DEBUG
72624+int check_sibling_list(znode * node);
72625+#else
72626+#define check_sibling_list(n) (1)
72627+#endif
72628+
72629+#endif /* __FS_REISER4_TREE_WALK_H__ */
72630+
72631+/*
72632+ Local variables:
72633+ c-indentation-style: "K&R"
72634+ mode-name: "LC"
72635+ c-basic-offset: 8
72636+ tab-width: 8
72637+ fill-column: 120
72638+ End:
72639+*/
72640diff -urN linux-2.6.20.orig/fs/reiser4/txnmgr.c linux-2.6.20/fs/reiser4/txnmgr.c
72641--- linux-2.6.20.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
72642+++ linux-2.6.20/fs/reiser4/txnmgr.c 2007-05-06 14:50:43.895036966 +0400
72643@@ -0,0 +1,3164 @@
72644+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72645+ * reiser4/README */
72646+
72647+/* Joshua MacDonald wrote the first draft of this code. */
72648+
72649+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72650+filesystem scales only as well as its worst locking design. You need to
72651+substantially restructure this code. Josh was not as experienced a programmer
72652+as you. In particular, review how the locking style differs from what you did
72653+for znodes using hi-lo priority locking, and present to me an opinion on
72654+whether the differences are well founded. */
72655+
72656+/* I cannot help but to disagree with the sentiment above. Locking of
72657+ * transaction manager is _not_ badly designed, and, at the very least, is not
72658+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
72659+ * locking on znodes, especially on the root node of the tree. --nikita,
72660+ * 2003.10.13 */
72661+
72662+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
72663+ txnmgr processes capture_block requests and manages the relationship between jnodes and
72664+ atoms through the various stages of a transcrash, and it also oversees the fusion and
72665+ capture-on-copy processes. The main difficulty with this task is maintaining a
72666+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
72667+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
72668+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
72669+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
72670+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
72671+ atom, you must use trylock() and possibly reverse the order.
72672+
72673+ This code implements the design documented at:
72674+
72675+ http://namesys.com/txn-doc.html
72676+
72677+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
72678+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
72679+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
72680+year old --- define all technical terms used.
72681+
72682+*/
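+
+/* A sketch of the trylock-and-reverse pattern described above (simplified;
+ * see txnh_get_atom() and jnode_get_atom() below for the real code):
+ *
+ *	spin_lock_jnode(node);
+ *	atom = node->atom;
+ *	if (atom != NULL && !spin_trylock_atom(atom)) {
+ *		atomic_inc(&atom->refcount);	(pin the atom)
+ *		spin_unlock_jnode(node);	(drop the inner lock)
+ *		spin_lock_atom(atom);		(take locks in safe order)
+ *		spin_lock_jnode(node);
+ *		(recheck node->atom == atom, otherwise drop and retry)
+ *	}
+ */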
72683+
72684+/* Thoughts on the external transaction interface:
72685+
72686+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
72687+ creates state that lasts for the duration of a system call and is called at the start
72688+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
72689+ occupying the scope of a single system call. We wish to give certain applications an
72690+ interface to begin and close (commit) transactions. Since our implementation of
72691+ transactions does not yet support isolation, allowing an application to open a
72692+ transaction implies trusting it to later close the transaction. Part of the
72693+ transaction interface will be aimed at enabling that trust, but the interface for
72694+ actually using transactions is fairly narrow.
72695+
72696+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
72697+ this identifier into a string that a shell-script could use, allowing you to start a
72698+ transaction by issuing a command. Once open, the transcrash should be set in the task
72699+ structure, and there should be options (I suppose) to allow it to be carried across
72700+ fork/exec. A transcrash has several options:
72701+
72702+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
72703+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
72704+ capture on reads as well, it should set READ_FUSING.
72705+
72706+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
72707+ eventually close (or else the machine must crash). If the application dies an
72708+ unexpected death with an open transcrash, for example, or if it hangs for a long
72709+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
72710+ This is a dangerous option, but it is one way to solve the problem until isolated
72711+ transcrashes are available for untrusted applications.
72712+
72713+ It seems to be what databases do, though it is unclear how one avoids a DoS attack
72714+ creating a vulnerability based on resource starvation. Guaranteeing that some
72715+ minimum amount of computational resources are made available would seem more correct
72716+ than guaranteeing some amount of time. When we again have someone to code the work,
72717+ this issue should be considered carefully. -Hans
72718+
72719+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
72720+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
72721+ where it is safe for the application to fail, because the system may not be able to
72722+ grant the allocation and the application must be able to back-out. For this reason,
72723+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
72724+ the application may also wish to extend the allocation after beginning its transcrash.
72725+
72726+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
72727+ modifications that require transaction protection. When isolated transactions are
72728+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
72729+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
72730+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
72731+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
72732+
72733+ For actually implementing these out-of-system-call-scoped transcrashes, the
72734+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
72735+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
72736+ "struct kmem_cache *_txnh_slab" created for that purpose in this file.
72737+*/
72738+
72739+/* Extending the other system call interfaces for future transaction features:
72740+
72741+ Specialized applications may benefit from passing flags to the ordinary system call
72742+ interface such as read(), write(), or stat(). For example, the application specifies
72743+ WRITE_FUSING by default but wishes to add that a certain read() command should be
72744+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
72745+ read, or the file-data read? These issues are straightforward, but there are a lot of
72746+ them and adding the necessary flags-passing code will be tedious.
72747+
72748+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
72749+ flag, which specifies that although it is a read operation being requested, a
72750+ write-lock should be taken. The reason is that read-locks are shared while write-locks
72751+ are exclusive, so taking a read-lock when a later-write is known in advance will often
72752+ leads to deadlock. If a reader knows it will write later, it should issue read
72753+ requests with the RMW flag set.
72754+*/
72755+
72756+/*
72757+ The znode/atom deadlock avoidance.
72758+
72759+ FIXME(Zam): writing of this comment is in progress.
72760+
72761+  The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
72762+  atom locking, which makes the reiser4 locking scheme more complex. It had
72763+  deadlocks until we implemented deadlock avoidance algorithms. Those deadlocks
72764+  looked like the following: one stopped thread waits for a long-term lock on a
72765+  znode, while the thread that owns that lock waits until fusion with another
72766+  atom is allowed.
72767+
72768+  The source of the deadlocks is the optimization of not capturing index nodes
72769+  for read. Let's prove it. Suppose we had a dumb node capturing scheme which
72770+  unconditionally captured each block before locking it.
72771+
72772+  That scheme has no deadlocks. Let's begin with a thread whose atom's stage is
72773+  ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
72774+  for a capture, because its stage allows fusion with any atom except those
72775+  which are currently being committed. A process of atom commit can't deadlock,
72776+  because the atom commit procedure does not acquire locks and does not fuse
72777+  with other atoms. Reiser4 does the capturing right before going to sleep
72778+  inside the longterm_lock_znode() function; this means the znode we want to
72779+  lock is already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If
72780+  we continue the analysis, we see that no process in the sequence may wait for
72781+  atom fusion. Thereby there are no deadlocks of the described kind.
72782+
72783+  The capturing optimization makes the deadlocks possible. A thread can wait on
72784+  a lock whose owner did not capture that node. The lock owner's current atom
72785+  is not fused with the first atom and does not enter the ASTAGE_CAPTURE_WAIT
72786+  state. A deadlock is possible when that atom meets another one which is
72787+  already in ASTAGE_CAPTURE_WAIT.
72788+
72789+  The deadlock avoidance scheme includes two algorithms:
72790+
72791+  The first algorithm is used when a thread captures a node which is locked
72792+  but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
72793+  at the moment we skip their capturing. If such a node (marked
72794+  MISSED_IN_CAPTURE) is being captured by a thread whose current atom is in
72795+  ASTAGE_CAPTURE_WAIT, the routine which forces all lock owners to join the
72796+  current atom is executed.
72797+
72798+  The second algorithm does not allow skipping the capture of already captured nodes.
72799+
72800+  Both algorithms together prevent waiting on a long-term lock without fusing
72801+  with the atoms of all lock owners, which is the key condition for atom/znode locking deadlocks to arise.
72802+*/
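+
+/* The two algorithms above, as a sketch (simplified; see
+ * fuse_not_fused_lock_owners() and the capture paths below for the real
+ * code):
+ *
+ *	1) on capture: if the node is marked MISSED_IN_CAPTURE and our atom
+ *	   is in ASTAGE_CAPTURE_WAIT, force every lock owner of the node to
+ *	   fuse with our atom;
+ *	2) on capture: never skip a node that is already captured.
+ */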
72803+
72804+/*
72805+ * Transactions and mmap(2).
72806+ *
72807+ * 1. Transactions are not supported for accesses through mmap(2), because
72808+ * this would effectively amount to user-level transactions whose duration
72809+ * is beyond control of the kernel.
72810+ *
72811+ * 2. That said, we still want to preserve some decency with regard to
72812+ * mmap(2). During normal write(2) call, following sequence of events
72813+ * happens:
72814+ *
72815+ * 1. page is created;
72816+ *
72817+ * 2. jnode is created, dirtied and captured into current atom.
72818+ *
72819+ * 3. extent is inserted and modified.
72820+ *
72821+ * Steps (2) and (3) take place under long term lock on the twig node.
72822+ *
72823+ * When a file is accessed through mmap(2), the page is always created
72824+ * during the page fault.
72825+ * After this (in reiser4_readpage()->reiser4_readpage_extent()):
72826+ *
72827+ * 1. if access is made to a non-hole page, a new jnode is created (if
72828+ * necessary)
72829+ *
72830+ * 2. if access is made to a hole page, a jnode is not created (XXX
72831+ * not clear why).
72832+ *
72833+ * Also, even if a page is created by a write page fault, it is not marked
72834+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
72835+ * with page write-out.
72836+ *
72837+ * Dirty bit installed by hardware is only transferred to the struct page
72838+ * later, when page is unmapped (in zap_pte_range(), or
72839+ * try_to_unmap_one()).
72840+ *
72841+ * So, with mmap(2) we have to handle following irksome situations:
72842+ *
72843+ * 1. there exists modified page (clean or dirty) without jnode
72844+ *
72845+ * 2. there exists modified page (clean or dirty) with clean jnode
72846+ *
72847+ * 3. a clean page which is part of an atom can be transparently modified
72848+ * at any moment through the mapping without becoming dirty.
72849+ *
72850+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
72851+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
72852+ * don't see them, because these methods operate on atoms.
72853+ *
72854+ * (3) can lead to the loss of data: suppose we have a dirty page whose
72855+ * dirty jnode is captured by some atom. As part of early flush (for
72856+ * example) page was written out. Dirty bit was cleared on both page and
72857+ * jnode. After this page is modified through mapping, but kernel doesn't
72858+ * notice and just discards page and jnode as part of commit. (XXX
72859+ * actually it doesn't, because to reclaim page ->releasepage() has to be
72860+ * called and before this dirty bit will be transferred to the struct
72861+ * page).
72862+ *
72863+ */
72864+
72865+#include "debug.h"
72866+#include "txnmgr.h"
72867+#include "jnode.h"
72868+#include "znode.h"
72869+#include "block_alloc.h"
72870+#include "tree.h"
72871+#include "wander.h"
72872+#include "ktxnmgrd.h"
72873+#include "super.h"
72874+#include "page_cache.h"
72875+#include "reiser4.h"
72876+#include "vfs_ops.h"
72877+#include "inode.h"
72878+#include "flush.h"
72879+
72880+#include <asm/atomic.h>
72881+#include <linux/types.h>
72882+#include <linux/fs.h>
72883+#include <linux/mm.h>
72884+#include <linux/slab.h>
72885+#include <linux/pagemap.h>
72886+#include <linux/writeback.h>
72887+#include <linux/swap.h> /* for totalram_pages */
72888+
72889+static void atom_free(txn_atom * atom);
72890+
72891+static int commit_txnh(txn_handle * txnh);
72892+
72893+static void wakeup_atom_waitfor_list(txn_atom * atom);
72894+static void wakeup_atom_waiting_list(txn_atom * atom);
72895+
72896+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
72897+
72898+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
72899+
72900+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
72901+
72902+static int capture_init_fusion(jnode * node, txn_handle * txnh,
72903+ txn_capture mode);
72904+
72905+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
72906+
72907+static void capture_fuse_into(txn_atom * small, txn_atom * large);
72908+
72909+void reiser4_invalidate_list(struct list_head *);
72910+
72911+/* GENERIC STRUCTURES */
72912+
72913+typedef struct _txn_wait_links txn_wait_links;
72914+
72915+struct _txn_wait_links {
72916+ lock_stack *_lock_stack;
72917+ struct list_head _fwaitfor_link;
72918+ struct list_head _fwaiting_link;
72919+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72920+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72921+};
72922+
72923+/* FIXME: In theory, we should be using the slab cache init & destructor
72924+ methods instead of, e.g., jnode_init, etc. */
72925+static struct kmem_cache *_atom_slab = NULL;
72926+/* this is for user-visible, cross system-call transactions. */
72927+static struct kmem_cache *_txnh_slab = NULL;
72928+
72929+/**
72930+ * init_txnmgr_static - create transaction manager slab caches
72931+ *
72932+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
72933+ * initialization.
72934+ */
72935+int init_txnmgr_static(void)
72936+{
72937+ assert("jmacd-600", _atom_slab == NULL);
72938+ assert("jmacd-601", _txnh_slab == NULL);
72939+
72940+ ON_DEBUG(atomic_set(&flush_cnt, 0));
72941+
72942+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
72943+ SLAB_HWCACHE_ALIGN |
72944+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
72945+ if (_atom_slab == NULL)
72946+ return RETERR(-ENOMEM);
72947+
72948+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
72949+ SLAB_HWCACHE_ALIGN, NULL, NULL);
72950+ if (_txnh_slab == NULL) {
72951+ kmem_cache_destroy(_atom_slab);
72952+ _atom_slab = NULL;
72953+ return RETERR(-ENOMEM);
72954+ }
72955+
72956+ return 0;
72957+}
72958+
72959+/**
72960+ * done_txnmgr_static - delete txn_atom and txn_handle caches
72961+ *
72962+ * This is called on reiser4 module unloading or system shutdown.
72963+ */
72964+void done_txnmgr_static(void)
72965+{
72966+ destroy_reiser4_cache(&_atom_slab);
72967+ destroy_reiser4_cache(&_txnh_slab);
72968+}
72969+
72970+/**
72971+ * reiser4_init_txnmgr - initialize a new transaction manager
72972+ * @mgr: pointer to transaction manager embedded in reiser4 super block
72973+ *
72974+ * This is called on mount. Makes necessary initializations.
72975+ */
72976+void reiser4_init_txnmgr(txn_mgr *mgr)
72977+{
72978+ assert("umka-169", mgr != NULL);
72979+
72980+ mgr->atom_count = 0;
72981+ mgr->id_count = 1;
72982+ INIT_LIST_HEAD(&mgr->atoms_list);
72983+ spin_lock_init(&mgr->tmgr_lock);
72984+ mutex_init(&mgr->commit_mutex);
72985+}
72986+
72987+/**
72988+ * reiser4_done_txnmgr - stop transaction manager
72989+ * @mgr: pointer to transaction manager embedded in reiser4 super block
72990+ *
72991+ * This is called on umount. Does sanity checks.
72992+ */
72993+void reiser4_done_txnmgr(txn_mgr *mgr)
72994+{
72995+ assert("umka-170", mgr != NULL);
72996+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
72997+ assert("umka-1702", mgr->atom_count == 0);
72998+}
72999+
73000+/* Initialize a transaction handle. */
73001+/* Audited by: umka (2002.06.13) */
73002+static void txnh_init(txn_handle * txnh, txn_mode mode)
73003+{
73004+ assert("umka-171", txnh != NULL);
73005+
73006+ txnh->mode = mode;
73007+ txnh->atom = NULL;
73008+ reiser4_ctx_gfp_mask_set();
73009+ txnh->flags = 0;
73010+ spin_lock_init(&txnh->hlock);
73011+ INIT_LIST_HEAD(&txnh->txnh_link);
73012+}
73013+
73014+#if REISER4_DEBUG
73015+/* Check if a transaction handle is clean. */
73016+static int txnh_isclean(txn_handle * txnh)
73017+{
73018+ assert("umka-172", txnh != NULL);
73019+ return txnh->atom == NULL &&
73020+ LOCK_CNT_NIL(spin_locked_txnh);
73021+}
73022+#endif
73023+
73024+/* Initialize an atom. */
73025+static void atom_init(txn_atom * atom)
73026+{
73027+ int level;
73028+
73029+ assert("umka-173", atom != NULL);
73030+
73031+ memset(atom, 0, sizeof(txn_atom));
73032+
73033+ atom->stage = ASTAGE_FREE;
73034+ atom->start_time = jiffies;
73035+
73036+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
73037+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73038+
73039+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73040+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73041+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73042+ INIT_LIST_HEAD(&atom->inodes);
73043+ spin_lock_init(&(atom->alock));
73044+ /* list of transaction handles */
73045+ INIT_LIST_HEAD(&atom->txnh_list);
73046+ /* link to transaction manager's list of atoms */
73047+ INIT_LIST_HEAD(&atom->atom_link);
73048+ INIT_LIST_HEAD(&atom->fwaitfor_list);
73049+ INIT_LIST_HEAD(&atom->fwaiting_list);
73050+ blocknr_set_init(&atom->delete_set);
73051+ blocknr_set_init(&atom->wandered_map);
73052+
73053+ init_atom_fq_parts(atom);
73054+}
73055+
73056+#if REISER4_DEBUG
73057+/* Check if an atom is clean. */
73058+static int atom_isclean(txn_atom * atom)
73059+{
73060+ int level;
73061+
73062+ assert("umka-174", atom != NULL);
73063+
73064+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73065+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73066+ return 0;
73067+ }
73068+ }
73069+
73070+ return atom->stage == ASTAGE_FREE &&
73071+ atom->txnh_count == 0 &&
73072+ atom->capture_count == 0 &&
73073+ atomic_read(&atom->refcount) == 0 &&
73074+ (&atom->atom_link == atom->atom_link.next &&
73075+ &atom->atom_link == atom->atom_link.prev) &&
73076+ list_empty_careful(&atom->txnh_list) &&
73077+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73078+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73079+ list_empty_careful(ATOM_WB_LIST(atom)) &&
73080+ list_empty_careful(&atom->fwaitfor_list) &&
73081+ list_empty_careful(&atom->fwaiting_list) &&
73082+ atom_fq_parts_are_clean(atom);
73083+}
73084+#endif
73085+
73086+/* Begin a transaction in this context. Currently this uses the reiser4_context's
73087+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
73088+ this will be extended to allow transaction handles to span several contexts. */
73089+/* Audited by: umka (2002.06.13) */
73090+void reiser4_txn_begin(reiser4_context * context)
73091+{
73092+ assert("jmacd-544", context->trans == NULL);
73093+
73094+ context->trans = &context->trans_in_ctx;
73095+
73096+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73097+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
73098+ stack allocated right now, but we would like to allow for dynamically allocated
73099+ transcrashes that span multiple system calls.
73100+ */
73101+ txnh_init(context->trans, TXN_WRITE_FUSING);
73102+}
73103+
73104+/* Finish a transaction handle context. */
73105+int reiser4_txn_end(reiser4_context * context)
73106+{
73107+ long ret = 0;
73108+ txn_handle *txnh;
73109+
73110+ assert("umka-283", context != NULL);
73111+ assert("nikita-3012", reiser4_schedulable());
73112+ assert("vs-24", context == get_current_context());
73113+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73114+
73115+ txnh = context->trans;
73116+ if (txnh != NULL) {
73117+ if (txnh->atom != NULL)
73118+ ret = commit_txnh(txnh);
73119+ assert("jmacd-633", txnh_isclean(txnh));
73120+ context->trans = NULL;
73121+ }
73122+ return ret;
73123+}
73124+
73125+void reiser4_txn_restart(reiser4_context * context)
73126+{
73127+ reiser4_txn_end(context);
73128+ reiser4_preempt_point();
73129+ reiser4_txn_begin(context);
73130+}
73131+
73132+void reiser4_txn_restart_current(void)
73133+{
73134+ reiser4_txn_restart(get_current_context());
73135+}
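+
+/* A hedged sketch of how these are driven (simplified; per the comments at
+ * the top of this file, the real entry points are reiser4_init_context() and
+ * reiser4_exit_context(), which bracket each VFS operation):
+ *
+ *	ctx = reiser4_init_context(sb);		opens a transcrash in ctx->trans
+ *	... perform tree operations; dirtied jnodes are captured into an atom ...
+ *	reiser4_exit_context(ctx);		commits via reiser4_txn_end()
+ */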
73136+
73137+/* TXN_ATOM */
73138+
73139+/* Get the atom belonging to a txnh, which is not initially locked. Return with the txnh locked,
73140+   and with the atom locked if it is not NULL. This performs the necessary spin_trylock to break
73141+   the lock-ordering cycle. May return NULL. */
73142+static txn_atom *txnh_get_atom(txn_handle * txnh)
73143+{
73144+ txn_atom *atom;
73145+
73146+ assert("umka-180", txnh != NULL);
73147+ assert_spin_not_locked(&(txnh->hlock));
73148+
73149+ while (1) {
73150+ spin_lock_txnh(txnh);
73151+ atom = txnh->atom;
73152+
73153+ if (atom == NULL)
73154+ break;
73155+
73156+ if (spin_trylock_atom(atom))
73157+ break;
73158+
73159+ atomic_inc(&atom->refcount);
73160+
73161+ spin_unlock_txnh(txnh);
73162+ spin_lock_atom(atom);
73163+ spin_lock_txnh(txnh);
73164+
73165+ if (txnh->atom == atom) {
73166+ atomic_dec(&atom->refcount);
73167+ break;
73168+ }
73169+
73170+ spin_unlock_txnh(txnh);
73171+ atom_dec_and_unlock(atom);
73172+ }
73173+
73174+ return atom;
73175+}
73176+
73177+/* Get the current atom and spin-lock it if the current atom is present. May return NULL. */
73178+txn_atom *get_current_atom_locked_nocheck(void)
73179+{
73180+ reiser4_context *cx;
73181+ txn_atom *atom;
73182+ txn_handle *txnh;
73183+
73184+ cx = get_current_context();
73185+ assert("zam-437", cx != NULL);
73186+
73187+ txnh = cx->trans;
73188+ assert("zam-435", txnh != NULL);
73189+
73190+ atom = txnh_get_atom(txnh);
73191+
73192+ spin_unlock_txnh(txnh);
73193+ return atom;
73194+}
73195+
73196+/* Get the atom belonging to a jnode, which is initially locked. Return with
73197+ both jnode and atom locked. This performs the necessary spin_trylock to
73198+ break the lock-ordering cycle. Assumes the jnode is already locked, and
73199+ returns NULL if atom is not set. */
73200+txn_atom *jnode_get_atom(jnode * node)
73201+{
73202+ txn_atom *atom;
73203+
73204+ assert("umka-181", node != NULL);
73205+
73206+ while (1) {
73207+ assert_spin_locked(&(node->guard));
73208+
73209+ atom = node->atom;
73210+ /* node is not in any atom */
73211+ if (atom == NULL)
73212+ break;
73213+
73214+ /* If atom is not locked, grab the lock and return */
73215+ if (spin_trylock_atom(atom))
73216+ break;
73217+
73218+		/* At least one jnode belongs to this atom; this guarantees
73219+		 * that atom->refcount > 0, so we can safely increment it. */
73220+ atomic_inc(&atom->refcount);
73221+ spin_unlock_jnode(node);
73222+
73223+ /* re-acquire spin locks in the right order */
73224+ spin_lock_atom(atom);
73225+ spin_lock_jnode(node);
73226+
73227+ /* check if node still points to the same atom. */
73228+ if (node->atom == atom) {
73229+ atomic_dec(&atom->refcount);
73230+ break;
73231+ }
73232+
73233+ /* releasing of atom lock and reference requires not holding
73234+ * locks on jnodes. */
73235+ spin_unlock_jnode(node);
73236+
73237+		/* We are not sure that this atom has extra references besides
73238+		 * our own, so we must call the proper function, which may free
73239+		 * the atom if the last reference is released. */
73240+ atom_dec_and_unlock(atom);
73241+
73242+		/* lock the jnode again to get a valid node->atom pointer
73243+		 * value. */
73244+ spin_lock_jnode(node);
73245+ }
73246+
73247+ return atom;
73248+}
73249+
73250+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
73251+ by flush code to indicate whether the next node (in some direction) is suitable for
73252+ flushing. */
73253+int
73254+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73255+{
73256+ int compat;
73257+ txn_atom *atom;
73258+
73259+ assert("umka-182", node != NULL);
73260+ assert("umka-183", check != NULL);
73261+
73262+ /* Not sure what this function is supposed to do if supplied with @check that is
73263+ neither formatted nor unformatted (bitmap or so). */
73264+ assert("nikita-2373", jnode_is_znode(check)
73265+ || jnode_is_unformatted(check));
73266+
73267+ /* Need a lock on CHECK to get its atom and to check various state bits.
73268+ Don't need a lock on NODE once we get the atom lock. */
73269+	/* It is not enough to lock the two nodes and check (node->atom ==
73270+	   check->atom): the atom could be locked and in the middle of being
73271+	   fused, and jnodes of an atom in that state can point to different
73272+	   atom objects even though the atom is logically the same. */
73273+ spin_lock_jnode(check);
73274+
73275+ atom = jnode_get_atom(check);
73276+
73277+ if (atom == NULL) {
73278+ compat = 0;
73279+ } else {
73280+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73281+
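+		/* a formatted node must additionally be connected into the
+		 * tree to count as part of the slum */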
73282+ if (compat && jnode_is_znode(check)) {
73283+ compat &= znode_is_connected(JZNODE(check));
73284+ }
73285+
73286+ if (compat && alloc_check) {
73287+ compat &= (alloc_value == jnode_is_flushprepped(check));
73288+ }
73289+
73290+ spin_unlock_atom(atom);
73291+ }
73292+
73293+ spin_unlock_jnode(check);
73294+
73295+ return compat;
73296+}
73297+
73298+/* Decrement the atom's reference count and if it falls to zero, free it. */
73299+void atom_dec_and_unlock(txn_atom * atom)
73300+{
73301+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73302+
73303+ assert("umka-186", atom != NULL);
73304+ assert_spin_locked(&(atom->alock));
73305+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
73306+
73307+ if (atomic_dec_and_test(&atom->refcount)) {
73308+ /* take txnmgr lock and atom lock in proper order. */
73309+ if (!spin_trylock_txnmgr(mgr)) {
73310+ /* This atom should exist after we re-acquire its
73311+ * spinlock, so we increment its reference counter. */
73312+ atomic_inc(&atom->refcount);
73313+ spin_unlock_atom(atom);
73314+ spin_lock_txnmgr(mgr);
73315+ spin_lock_atom(atom);
73316+
73317+ if (!atomic_dec_and_test(&atom->refcount)) {
73318+ spin_unlock_atom(atom);
73319+ spin_unlock_txnmgr(mgr);
73320+ return;
73321+ }
73322+ }
73323+ assert_spin_locked(&(mgr->tmgr_lock));
73324+ atom_free(atom);
73325+ spin_unlock_txnmgr(mgr);
73326+ } else
73327+ spin_unlock_atom(atom);
73328+}
73329+
73330+/* Create new atom and connect it to given transaction handle. This adds the
73331+ atom to the transaction manager's list and sets its reference count to 1, an
73332+ artificial reference which is kept until it commits. We play strange games
73333+ to avoid allocation under jnode & txnh spinlocks.*/
73334+
73335+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73336+{
73337+ txn_atom *atom;
73338+ txn_mgr *mgr;
73339+
73340+ if (REISER4_DEBUG && rofs_tree(current_tree)) {
73341+ warning("nikita-3366", "Creating atom on rofs");
73342+ dump_stack();
73343+ }
73344+
73345+ if (*atom_alloc == NULL) {
73346+ (*atom_alloc) = kmem_cache_alloc(_atom_slab,
73347+ reiser4_ctx_gfp_mask_get());
73348+
73349+ if (*atom_alloc == NULL)
73350+ return RETERR(-ENOMEM);
73351+ }
73352+
73353+	/* also, the txnmgr spin lock must be taken before the jnode and txnh
73354+	   locks. */
73355+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73356+ spin_lock_txnmgr(mgr);
73357+ spin_lock_txnh(txnh);
73358+
73359+	/* Check whether a new atom is still needed */
73360+ if (txnh->atom != NULL) {
73361+		/* NOTE-NIKITA it is probably better to free
73362+		 * atom_alloc here than to thread it up to reiser4_try_capture() */
73363+
73364+ spin_unlock_txnh(txnh);
73365+ spin_unlock_txnmgr(mgr);
73366+
73367+ return -E_REPEAT;
73368+ }
73369+
73370+ atom = *atom_alloc;
73371+ *atom_alloc = NULL;
73372+
73373+ atom_init(atom);
73374+
73375+ assert("jmacd-17", atom_isclean(atom));
73376+
73377+ /*
73378+	 * lock ordering is broken here. It is ok, as long as @atom is new
73379+	 * and inaccessible to others. We can't use spin_lock_atom or
73380+	 * spin_lock(&atom->alock) because they care about locking
73381+	 * dependencies. spin_trylock_atom doesn't.
73382+ */
73383+ check_me("", spin_trylock_atom(atom));
73384+
73385+ /* add atom to the end of transaction manager's list of atoms */
73386+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
73387+ atom->atom_id = mgr->id_count++;
73388+ mgr->atom_count += 1;
73389+
73390+ /* Release txnmgr lock */
73391+ spin_unlock_txnmgr(mgr);
73392+
73393+ /* One reference until it commits. */
73394+ atomic_inc(&atom->refcount);
73395+ atom->stage = ASTAGE_CAPTURE_FUSE;
73396+ atom->super = reiser4_get_current_sb();
73397+ capture_assign_txnh_nolock(atom, txnh);
73398+
73399+ spin_unlock_atom(atom);
73400+ spin_unlock_txnh(txnh);
73401+
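+	/* even on success we return -E_REPEAT: the caller restarts the capture
+	 * attempt, this time finding txnh->atom set */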
73402+ return -E_REPEAT;
73403+}
73404+
73405+/* Return true if an atom is currently "open". */
73406+static int atom_isopen(const txn_atom * atom)
73407+{
73408+ assert("umka-185", atom != NULL);
73409+
73410+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73411+}
73412+
73413+/* Return the number of pointers to this atom that must be updated during fusion. This
73414+ approximates the amount of work to be done. Fusion chooses the atom with fewer
73415+ pointers to fuse into the atom with more pointers. */
73416+static int atom_pointer_count(const txn_atom * atom)
73417+{
73418+ assert("umka-187", atom != NULL);
73419+
73420+ /* This is a measure of the amount of work needed to fuse this atom
73421+ * into another. */
73422+ return atom->txnh_count + atom->capture_count;
73423+}
73424+
73425+/* Called holding the atom lock, this removes the atom from the transaction manager list
73426+ and frees it. */
73427+static void atom_free(txn_atom * atom)
73428+{
73429+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73430+
73431+ assert("umka-188", atom != NULL);
73432+ assert_spin_locked(&(atom->alock));
73433+
73434+ /* Remove from the txn_mgr's atom list */
73435+ assert_spin_locked(&(mgr->tmgr_lock));
73436+ mgr->atom_count -= 1;
73437+ list_del_init(&atom->atom_link);
73438+
73439+ /* Clean the atom */
73440+ assert("jmacd-16",
73441+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73442+ atom->stage = ASTAGE_FREE;
73443+
73444+ blocknr_set_destroy(&atom->delete_set);
73445+ blocknr_set_destroy(&atom->wandered_map);
73446+
73447+ assert("jmacd-16", atom_isclean(atom));
73448+
73449+ spin_unlock_atom(atom);
73450+
73451+ kmem_cache_free(_atom_slab, atom);
73452+}
73453+
73454+static int atom_is_dotard(const txn_atom * atom)
73455+{
73456+ return time_after(jiffies, atom->start_time +
73457+ get_current_super_private()->tmgr.atom_max_age);
73458+}
73459+
73460+static int atom_can_be_committed(txn_atom * atom)
73461+{
73462+ assert_spin_locked(&(atom->alock));
73463+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
73464+ return atom->txnh_count == atom->nr_waiters + 1;
73465+}
73466+
73467+/* Return true if an atom should commit now. This is determined by aging, atom
73468+ size or atom flags. */
73469+static int atom_should_commit(const txn_atom * atom)
73470+{
73471+ assert("umka-189", atom != NULL);
73472+ return
73473+ (atom->flags & ATOM_FORCE_COMMIT) ||
73474+ ((unsigned)atom_pointer_count(atom) >
73475+ get_current_super_private()->tmgr.atom_max_size)
73476+ || atom_is_dotard(atom);
73477+}
73478+
73479+/* return 1 if current atom exists and requires commit. */
73480+int current_atom_should_commit(void)
73481+{
73482+ txn_atom *atom;
73483+ int result = 0;
73484+
73485+ atom = get_current_atom_locked_nocheck();
73486+ if (atom) {
73487+ result = atom_should_commit(atom);
73488+ spin_unlock_atom(atom);
73489+ }
73490+ return result;
73491+}
73492+
73493+static int atom_should_commit_asap(const txn_atom * atom)
73494+{
73495+ unsigned int captured;
73496+ unsigned int pinnedpages;
73497+
73498+ assert("nikita-3309", atom != NULL);
73499+
73500+ captured = (unsigned)atom->capture_count;
73501+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73502+
73503+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73504+}
73505+
73506+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73507+{
73508+ jnode *first_dirty;
73509+
73510+ list_for_each_entry(first_dirty, head, capture_link) {
73511+ if (!(flags & JNODE_FLUSH_COMMIT)) {
73512+ /*
73513+			 * skip jnodes which "heard banshee" or have active
73514+			 * I/O
73515+ */
73516+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73517+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
73518+ continue;
73519+ }
73520+ return first_dirty;
73521+ }
73522+ return NULL;
73523+}
73524+
73525+/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL if the atom has no
73526+   dirty nodes on any of its lists */
73527+jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73528+{
73529+ jnode *first_dirty;
73530+ tree_level level;
73531+
73532+ assert_spin_locked(&(atom->alock));
73533+
73534+ /* The flush starts from LEAF_LEVEL (=1). */
73535+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73536+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73537+ continue;
73538+
73539+ first_dirty =
73540+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73541+ flags);
73542+ if (first_dirty)
73543+ return first_dirty;
73544+ }
73545+
73546+ /* znode-above-root is on the list #0. */
73547+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73548+}
73549+
73550+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73551+{
73552+ jnode *cur;
73553+
73554+ assert("zam-905", atom_is_protected(atom));
73555+
73556+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73557+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
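+		/* remember the successor first: queue_jnode() or
+		 * list_move_tail() below take @cur off the wb list */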
73558+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
73559+
73560+ spin_lock_jnode(cur);
73561+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73562+ if (JF_ISSET(cur, JNODE_DIRTY)) {
73563+ queue_jnode(fq, cur);
73564+ } else {
73565+ /* move jnode to atom's clean list */
73566+ list_move_tail(&cur->capture_link,
73567+ ATOM_CLEAN_LIST(atom));
73568+ }
73569+ }
73570+ spin_unlock_jnode(cur);
73571+
73572+ cur = next;
73573+ }
73574+}
73575+
73576+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
73577+ * jnodes to disk. */
73578+static int submit_wb_list(void)
73579+{
73580+ int ret;
73581+ flush_queue_t *fq;
73582+
73583+ fq = get_fq_for_current_atom();
73584+ if (IS_ERR(fq))
73585+ return PTR_ERR(fq);
73586+
73587+ dispatch_wb_list(fq->atom, fq);
73588+ spin_unlock_atom(fq->atom);
73589+
73590+ ret = reiser4_write_fq(fq, NULL, 1);
73591+ reiser4_fq_put(fq);
73592+
73593+ return ret;
73594+}
73595+
73596+/* Wait for completion of all writes; re-submit the atom's writeback list if needed. */
73597+static int current_atom_complete_writes(void)
73598+{
73599+ int ret;
73600+
73601+	/* Each jnode on that list was modified and dirtied while it already
73602+	 * had an i/o request running. After i/o completion we have to submit
73603+	 * them to disk again. */
73604+ ret = submit_wb_list();
73605+ if (ret < 0)
73606+ return ret;
73607+
73608+	/* Wait for all i/o to complete */
73609+ ret = current_atom_finish_all_fq();
73610+ if (ret)
73611+ return ret;
73612+
73613+	/* Scan the wb list again; all i/o should be completed by now, and we
73614+	 * re-submit dirty nodes to disk */
73615+ ret = submit_wb_list();
73616+ if (ret < 0)
73617+ return ret;
73618+
73619+	/* Wait for all nodes we just submitted */
73620+ return current_atom_finish_all_fq();
73621+}
73622+
73623+#if REISER4_DEBUG
73624+
73625+static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
73626+{
73627+ if (atom == NULL) {
73628+ printk("%s: no atom\n", prefix);
73629+ return;
73630+ }
73631+
73632+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
73633+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
73634+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
73635+ atom->txnh_count, atom->capture_count, atom->stage,
73636+ atom->start_time, atom->flushed);
73637+}
73638+
73639+#else /* REISER4_DEBUG */
73640+
73641+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
73642+
73643+#endif /* REISER4_DEBUG */
73644+
73645+#define TOOMANYFLUSHES (1 << 13)
73646+
73647+/* Called with the atom locked and no open "active" transaction handles except
73648+ ours, this function calls flush_current_atom() until all dirty nodes are
73649+ processed. Then it initiates commit processing.
73650+
73651+ Called by the single remaining open "active" txnh, which is closing. Other
73652+   open txnhs belong to processes which wait for atom commit in the
73653+   commit_txnh() routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
73654+ long as we hold the atom lock none of the jnodes can be captured and/or
73655+ locked.
73656+
73657+ Return value is an error code if commit fails.
73658+*/
73659+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73660+{
73661+ reiser4_super_info_data *sbinfo = get_current_super_private();
73662+ long ret = 0;
73663+ /* how many times jnode_flush() was called as a part of attempt to
73664+ * commit this atom. */
73665+ int flushiters;
73666+
73667+ assert("zam-888", atom != NULL && *atom != NULL);
73668+ assert_spin_locked(&((*atom)->alock));
73669+ assert("zam-887", get_current_context()->trans->atom == *atom);
73670+ assert("jmacd-151", atom_isopen(*atom));
73671+
73672+ assert("nikita-3184",
73673+ get_current_super_private()->delete_mutex_owner != current);
73674+
73675+ for (flushiters = 0;; ++flushiters) {
73676+ ret =
73677+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73678+ JNODE_FLUSH_COMMIT,
73679+ LONG_MAX /* nr_to_write */ ,
73680+ nr_submitted, atom, NULL);
73681+ if (ret != -E_REPEAT)
73682+ break;
73683+
73684+ /* if atom's dirty list contains one znode which is
73685+ HEARD_BANSHEE and is locked we have to allow lock owner to
73686+ continue and uncapture that znode */
73687+ reiser4_preempt_point();
73688+
73689+ *atom = get_current_atom_locked();
73690+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73691+ warning("nikita-3176",
73692+ "Flushing like mad: %i", flushiters);
73693+ reiser4_info_atom("atom", *atom);
73694+ DEBUGON(flushiters > (1 << 20));
73695+ }
73696+ }
73697+
73698+ if (ret)
73699+ return ret;
73700+
73701+ assert_spin_locked(&((*atom)->alock));
73702+
73703+ if (!atom_can_be_committed(*atom)) {
73704+ spin_unlock_atom(*atom);
73705+ return RETERR(-E_REPEAT);
73706+ }
73707+
73708+ if ((*atom)->capture_count == 0)
73709+ goto done;
73710+
73711+	/* Up to this point we have been flushing, and after flush is called we
73712+	   return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
73713+	   at this point; the commit should be successful. */
73714+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
73715+ ON_DEBUG(((*atom)->committer = current));
73716+ spin_unlock_atom(*atom);
73717+
73718+ ret = current_atom_complete_writes();
73719+ if (ret)
73720+ return ret;
73721+
73722+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
73723+
73724+ /* isolate critical code path which should be executed by only one
73725+ * thread using tmgr mutex */
73726+ mutex_lock(&sbinfo->tmgr.commit_mutex);
73727+
73728+ ret = reiser4_write_logs(nr_submitted);
73729+ if (ret < 0)
73730+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
73731+
73732+	/* The atom->ovrwr_nodes list is processed with the commit mutex held
73733+	   because of bitmap nodes, which are captured in a special way in
73734+	   reiser4_pre_commit_hook_bitmap(); that path does not include
73735+	   capture_fuse_wait() as the capturing of other nodes does -- the commit
73736+	   mutex is used for transaction isolation instead. */
73737+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
73738+ mutex_unlock(&sbinfo->tmgr.commit_mutex);
73739+
73740+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
73741+ reiser4_invalidate_list(ATOM_WB_LIST(*atom));
73742+ assert("zam-927", list_empty(&(*atom)->inodes));
73743+
73744+ spin_lock_atom(*atom);
73745+ done:
73746+ reiser4_atom_set_stage(*atom, ASTAGE_DONE);
73747+ ON_DEBUG((*atom)->committer = NULL);
73748+
73749+ /* Atom's state changes, so wake up everybody waiting for this
73750+ event. */
73751+ wakeup_atom_waiting_list(*atom);
73752+
73753+ /* Decrement the "until commit" reference, at least one txnh (the caller) is
73754+ still open. */
73755+ atomic_dec(&(*atom)->refcount);
73756+
73757+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
73758+ assert("jmacd-1062", (*atom)->capture_count == 0);
73759+ BUG_ON((*atom)->capture_count != 0);
73760+ assert_spin_locked(&((*atom)->alock));
73761+
73762+ return ret;
73763+}
73764+
73765+/* TXN_TXNH */
73766+
73767+/**
73768+ * force_commit_atom - commit current atom and wait for commit completion
73769+ * @txnh: transaction handle whose atom is to be committed
73770+ *
73771+ * Commits the current atom and waits for commit completion; the current atom
73772+ * and @txnh must be spinlocked before the call; this function unlocks them on exit.
73773+ */
73774+int force_commit_atom(txn_handle *txnh)
73775+{
73776+ txn_atom *atom;
73777+
73778+ assert("zam-837", txnh != NULL);
73779+ assert_spin_locked(&(txnh->hlock));
73780+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
73781+
73782+ atom = txnh->atom;
73783+
73784+ assert("zam-834", atom != NULL);
73785+ assert_spin_locked(&(atom->alock));
73786+
73787+ /*
73788+ * Set flags for atom and txnh: forcing atom commit and waiting for
73789+ * commit completion
73790+ */
73791+ txnh->flags |= TXNH_WAIT_COMMIT;
73792+ atom->flags |= ATOM_FORCE_COMMIT;
73793+
73794+ spin_unlock_txnh(txnh);
73795+ spin_unlock_atom(atom);
73796+
73797+ /* commit is here */
73798+ reiser4_txn_restart_current();
73799+ return 0;
73800+}
73801+
73802+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
73803+ * whether we commit all atoms, including new ones created after this
73804+ * function is called. */
73805+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
73806+{
73807+ int ret;
73808+ txn_atom *atom;
73809+ txn_mgr *mgr;
73810+ txn_handle *txnh;
73811+ unsigned long start_time = jiffies;
73812+ reiser4_context *ctx = get_current_context();
73813+
73814+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
73815+ assert("nikita-3058", reiser4_commit_check_locks());
73816+
73817+ reiser4_txn_restart_current();
73818+
73819+ mgr = &get_super_private(super)->tmgr;
73820+
73821+ txnh = ctx->trans;
73822+
73823+ again:
73824+
73825+ spin_lock_txnmgr(mgr);
73826+
73827+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
73828+ spin_lock_atom(atom);
73829+
73830+		/* Commit any atom which can be committed. If @commit_all_atoms
73831+		 * is not set we commit only atoms which were created before
73832+		 * this call was started. */
73833+ if (commit_all_atoms
73834+ || time_before_eq(atom->start_time, start_time)) {
73835+ if (atom->stage <= ASTAGE_POST_COMMIT) {
73836+ spin_unlock_txnmgr(mgr);
73837+
73838+ if (atom->stage < ASTAGE_PRE_COMMIT) {
73839+ spin_lock_txnh(txnh);
73840+ /* Add force-context txnh */
73841+ capture_assign_txnh_nolock(atom, txnh);
73842+ ret = force_commit_atom(txnh);
73843+ if (ret)
73844+ return ret;
73845+ } else
73846+ /* wait atom commit */
73847+ reiser4_atom_wait_event(atom);
73848+
73849+ goto again;
73850+ }
73851+ }
73852+
73853+ spin_unlock_atom(atom);
73854+ }
73855+
73856+#if REISER4_DEBUG
73857+ if (commit_all_atoms) {
73858+ reiser4_super_info_data *sbinfo = get_super_private(super);
73859+ spin_lock_reiser4_super(sbinfo);
73860+ assert("zam-813",
73861+ sbinfo->blocks_fake_allocated_unformatted == 0);
73862+ assert("zam-812", sbinfo->blocks_fake_allocated == 0);
73863+ spin_unlock_reiser4_super(sbinfo);
73864+ }
73865+#endif
73866+
73867+ spin_unlock_txnmgr(mgr);
73868+
73869+ return 0;
73870+}
73871+
73872+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
73873+ * caller */
73874+static int atom_is_committable(txn_atom * atom)
73875+{
73876+ return
73877+ atom->stage < ASTAGE_PRE_COMMIT &&
73878+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
73879+}
73880+
73881+/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
73882+ * lock at exit */
73883+int commit_some_atoms(txn_mgr * mgr)
73884+{
73885+ int ret = 0;
73886+ txn_atom *atom;
73887+ txn_handle *txnh;
73888+ reiser4_context *ctx;
73889+ struct list_head *pos, *tmp;
73890+
73891+ ctx = get_current_context();
73892+ assert("nikita-2444", ctx != NULL);
73893+
73894+ txnh = ctx->trans;
73895+ spin_lock_txnmgr(mgr);
73896+
73897+ /*
73898+	 * this is to avoid a gcc complaint that atom might be used
73899+ * uninitialized
73900+ */
73901+ atom = NULL;
73902+
73903+ /* look for atom to commit */
73904+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
73905+ atom = list_entry(pos, txn_atom, atom_link);
73906+ /*
73907+ * first test without taking atom spin lock, whether it is
73908+ * eligible for committing at all
73909+ */
73910+ if (atom_is_committable(atom)) {
73911+ /* now, take spin lock and re-check */
73912+ spin_lock_atom(atom);
73913+ if (atom_is_committable(atom))
73914+ break;
73915+ spin_unlock_atom(atom);
73916+ }
73917+ }
73918+
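+	/* if the loop ran to completion, pos points back at the list head and
+	 * no committable atom was found */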
73919+ ret = (&mgr->atoms_list == pos);
73920+ spin_unlock_txnmgr(mgr);
73921+
73922+ if (ret) {
73923+ /* nothing found */
73924+ spin_unlock(&mgr->daemon->guard);
73925+ return 0;
73926+ }
73927+
73928+ spin_lock_txnh(txnh);
73929+
73930+ BUG_ON(atom == NULL);
73931+ /* Set the atom to force committing */
73932+ atom->flags |= ATOM_FORCE_COMMIT;
73933+
73934+ /* Add force-context txnh */
73935+ capture_assign_txnh_nolock(atom, txnh);
73936+
73937+ spin_unlock_txnh(txnh);
73938+ spin_unlock_atom(atom);
73939+
73940+ /* we are about to release daemon spin lock, notify daemon it
73941+ has to rescan atoms */
73942+ mgr->daemon->rescan = 1;
73943+ spin_unlock(&mgr->daemon->guard);
73944+ reiser4_txn_restart_current();
73945+ return 0;
73946+}
73947+
73948+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
73949+{
73950+ int atom_stage;
73951+ txn_atom *atom_2;
73952+ int repeat;
73953+
73954+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
73955+
73956+ atom_stage = atom->stage;
73957+ repeat = 0;
73958+
73959+ if (!spin_trylock_txnmgr(tmgr)) {
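+		/* take the locks in canonical order (txnmgr before atom),
+		 * keeping the atom pinned by a reference meanwhile; if its
+		 * stage changed while it was unlocked, give up and repeat */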
73960+ atomic_inc(&atom->refcount);
73961+ spin_unlock_atom(atom);
73962+ spin_lock_txnmgr(tmgr);
73963+ spin_lock_atom(atom);
73964+ repeat = 1;
73965+ if (atom->stage != atom_stage) {
73966+ spin_unlock_txnmgr(tmgr);
73967+ atom_dec_and_unlock(atom);
73968+ return -E_REPEAT;
73969+ }
73970+ atomic_dec(&atom->refcount);
73971+ }
73972+
73973+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
73974+ if (atom == atom_2)
73975+ continue;
73976+ /*
73977+ * if trylock does not succeed we just do not fuse with that
73978+ * atom.
73979+ */
73980+ if (spin_trylock_atom(atom_2)) {
73981+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
73982+ spin_unlock_txnmgr(tmgr);
73983+ capture_fuse_into(atom_2, atom);
73984+				/* all locks are lost; we can only repeat here */
73985+ return -E_REPEAT;
73986+ }
73987+ spin_unlock_atom(atom_2);
73988+ }
73989+ }
73990+ atom->flags |= ATOM_CANCEL_FUSION;
73991+ spin_unlock_txnmgr(tmgr);
73992+ if (repeat) {
73993+ spin_unlock_atom(atom);
73994+ return -E_REPEAT;
73995+ }
73996+ return 0;
73997+}
73998+
73999+/* Calls jnode_flush() for the current atom if it exists; if not, just takes
74000+   another atom and calls jnode_flush() for it. If the current transaction
74001+   handle already has an atom assigned (the current atom), we have to close
74002+   the current transaction before switching to another atom or doing anything
74003+   else with the current atom. This code tries to flush the current atom.
74004+
74005+   flush_some_atom() is called as part of the memory clearing process. It is
74006+   invoked from balance_dirty_pages(), pdflushd, and entd.
74007+
74008+   If no nodes can be flushed, the atom is committed, because that frees memory.
74009+
74010+   An atom that is too large or too old is committed as well.
74011+*/
74012+int
74013+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
74014+ int flags)
74015+{
74016+ reiser4_context *ctx = get_current_context();
74017+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
74018+ txn_handle *txnh = ctx->trans;
74019+ txn_atom *atom;
74020+ int ret;
74021+
74022+ BUG_ON(wbc->nr_to_write == 0);
74023+ BUG_ON(*nr_submitted != 0);
74024+ assert("zam-1042", txnh != NULL);
74025+ repeat:
74026+ if (txnh->atom == NULL) {
74027+ /* current atom is not available, take first from txnmgr */
74028+ spin_lock_txnmgr(tmgr);
74029+
74030+ /* traverse the list of all atoms */
74031+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74032+ /* lock atom before checking its state */
74033+ spin_lock_atom(atom);
74034+
74035+ /*
74036+ * we need an atom which is not being committed and
74037+			 * which has no flushers (jnode_flush() adds one flusher
74038+			 * at the beginning and subtracts one at the end).
74039+ */
74040+ if (atom->stage < ASTAGE_PRE_COMMIT &&
74041+ atom->nr_flushers == 0) {
74042+ spin_lock_txnh(txnh);
74043+ capture_assign_txnh_nolock(atom, txnh);
74044+ spin_unlock_txnh(txnh);
74045+
74046+ goto found;
74047+ }
74048+
74049+ spin_unlock_atom(atom);
74050+ }
74051+
74052+ /*
74053+		 * Write throttling: the case when no atom can be
74054+		 * flushed or committed.
74055+ */
74056+ if (!current_is_pdflush() && !wbc->nonblocking) {
74057+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74058+ spin_lock_atom(atom);
74059+ /* Repeat the check from the above. */
74060+ if (atom->stage < ASTAGE_PRE_COMMIT
74061+ && atom->nr_flushers == 0) {
74062+ spin_lock_txnh(txnh);
74063+ capture_assign_txnh_nolock(atom, txnh);
74064+ spin_unlock_txnh(txnh);
74065+
74066+ goto found;
74067+ }
74068+ if (atom->stage <= ASTAGE_POST_COMMIT) {
74069+ spin_unlock_txnmgr(tmgr);
74070+ /*
74071+ * we just wait until atom's flusher
74072+					 * makes progress in flushing or
74073+ * committing the atom
74074+ */
74075+ reiser4_atom_wait_event(atom);
74076+ goto repeat;
74077+ }
74078+ spin_unlock_atom(atom);
74079+ }
74080+ }
74081+ spin_unlock_txnmgr(tmgr);
74082+ return 0;
74083+ found:
74084+ spin_unlock_txnmgr(tmgr);
74085+ } else
74086+ atom = get_current_atom_locked();
74087+
74088+ BUG_ON(atom->super != ctx->super);
74089+ assert("vs-35", atom->super == ctx->super);
74090+ if (start) {
74091+ spin_lock_jnode(start);
74092+ ret = (atom == start->atom) ? 1 : 0;
74093+ spin_unlock_jnode(start);
74094+ if (ret == 0)
74095+ start = NULL;
74096+ }
74097+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74098+ if (ret == 0) {
74099+		/* flush_current_atom returns 0 only if it submitted nothing
74100+		   for write */
74101+ BUG_ON(*nr_submitted != 0);
74102+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74103+ if (atom->capture_count < tmgr->atom_min_size &&
74104+ !(atom->flags & ATOM_CANCEL_FUSION)) {
74105+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
74106+ if (ret == -E_REPEAT) {
74107+ reiser4_preempt_point();
74108+ goto repeat;
74109+ }
74110+ }
74111+ /* if early flushing could not make more nodes clean,
74112+ * or atom is too old/large,
74113+ * we force current atom to commit */
74114+ /* wait for commit completion but only if this
74115+ * wouldn't stall pdflushd and ent thread. */
74116+ if (!wbc->nonblocking && !ctx->entd)
74117+ txnh->flags |= TXNH_WAIT_COMMIT;
74118+ atom->flags |= ATOM_FORCE_COMMIT;
74119+ }
74120+ spin_unlock_atom(atom);
74121+ } else if (ret == -E_REPEAT) {
74122+ if (*nr_submitted == 0) {
74123+			/* let others who hamper flushing (by holding long-term
74124+			   locks, for instance) free the way for the flush */
74125+ reiser4_preempt_point();
74126+ goto repeat;
74127+ }
74128+ ret = 0;
74129+ }
74130+/*
74131+ if (*nr_submitted > wbc->nr_to_write)
74132+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74133+*/
74134+ reiser4_txn_restart(ctx);
74135+
74136+ return ret;
74137+}
74138+
74139+/* Remove processed nodes from the atom's clean list (thereby removing them from the transaction). */
74140+void reiser4_invalidate_list(struct list_head *head)
74141+{
74142+ while (!list_empty(head)) {
74143+ jnode *node;
74144+
74145+ node = list_entry(head->next, jnode, capture_link);
74146+ spin_lock_jnode(node);
74147+ reiser4_uncapture_block(node);
74148+ jput(node);
74149+ }
74150+}
74151+
74152+static void init_wlinks(txn_wait_links * wlinks)
74153+{
74154+ wlinks->_lock_stack = get_current_lock_stack();
74155+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74156+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74157+ wlinks->waitfor_cb = NULL;
74158+ wlinks->waiting_cb = NULL;
74159+}
74160+
74161+/* Add the current thread to the atom's fwaitfor list and wait for somebody to wake us up */
74162+void reiser4_atom_wait_event(txn_atom * atom)
74163+{
74164+ txn_wait_links _wlinks;
74165+
74166+ assert_spin_locked(&(atom->alock));
74167+ assert("nikita-3156",
74168+ lock_stack_isclean(get_current_lock_stack()) ||
74169+ atom->nr_running_queues > 0);
74170+
74171+ init_wlinks(&_wlinks);
74172+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
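+	/* pin the atom: the extra reference keeps it alive while we sleep with
+	 * its lock dropped */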
74173+ atomic_inc(&atom->refcount);
74174+ spin_unlock_atom(atom);
74175+
74176+ reiser4_prepare_to_sleep(_wlinks._lock_stack);
74177+ reiser4_go_to_sleep(_wlinks._lock_stack);
74178+
74179+ spin_lock_atom(atom);
74180+ list_del(&_wlinks._fwaitfor_link);
74181+ atom_dec_and_unlock(atom);
74182+}
74183+
74184+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
74185+{
74186+ assert("nikita-3535", atom != NULL);
74187+ assert_spin_locked(&(atom->alock));
74188+ assert("nikita-3536", stage <= ASTAGE_INVALID);
74189+ /* Excelsior! */
74190+ assert("nikita-3537", stage >= atom->stage);
74191+ if (atom->stage != stage) {
74192+ atom->stage = stage;
74193+ reiser4_atom_send_event(atom);
74194+ }
74195+}
74196+
74197+/* wake all threads which wait for an event */
74198+void reiser4_atom_send_event(txn_atom * atom)
74199+{
74200+ assert_spin_locked(&(atom->alock));
74201+ wakeup_atom_waitfor_list(atom);
74202+}
74203+
74204+/* Informs the txn manager code that the owner of this txn_handle should wait for atom commit
74205+   completion (for example, because it does fsync(2)) */
74206+static int should_wait_commit(txn_handle * h)
74207+{
74208+ return h->flags & TXNH_WAIT_COMMIT;
74209+}
74210+
74211+typedef struct commit_data {
74212+ txn_atom *atom;
74213+ txn_handle *txnh;
74214+ long nr_written;
74215+	/* as an optimization we start committing the atom by first trying to
74216+	 * flush it a few times without switching into ASTAGE_CAPTURE_WAIT. This
74217+	 * reduces stalls due to other threads waiting for the atom in the
74218+	 * ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these
74219+	 * preliminary flushes. */
74220+ int preflush;
74221+ /* have we waited on atom. */
74222+ int wait;
74223+ int failed;
74224+ int wake_ktxnmgrd_up;
74225+} commit_data;
74226+
74227+/*
74228+ * Called from commit_txnh() repeatedly, until either error happens, or atom
74229+ * commits successfully.
74230+ */
74231+static int try_commit_txnh(commit_data * cd)
74232+{
74233+ int result;
74234+
74235+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74236+
74237+ /* Get the atom and txnh locked. */
74238+ cd->atom = txnh_get_atom(cd->txnh);
74239+ assert("jmacd-309", cd->atom != NULL);
74240+ spin_unlock_txnh(cd->txnh);
74241+
74242+ if (cd->wait) {
74243+ cd->atom->nr_waiters--;
74244+ cd->wait = 0;
74245+ }
74246+
74247+ if (cd->atom->stage == ASTAGE_DONE)
74248+ return 0;
74249+
74250+ if (cd->failed)
74251+ return 0;
74252+
74253+ if (atom_should_commit(cd->atom)) {
74254+ /* if atom is _very_ large schedule it for commit as soon as
74255+ * possible. */
74256+ if (atom_should_commit_asap(cd->atom)) {
74257+ /*
74258+			 * When the atom is in the PRE_COMMIT or a later stage,
74259+			 * the following invariant (encoded in
74260+			 * atom_can_be_committed()) holds: there is exactly one
74261+			 * non-waiter transaction handle opened on this atom.
74262+			 * When a thread wants to wait until the atom commits
74263+			 * (for example sync()) it waits on the atom event after
74264+			 * increasing atom->nr_waiters (see below in this
74265+			 * function). It cannot be guaranteed that the atom has
74266+			 * already committed after the event is received, so the
74267+			 * loop has to be restarted. But if the atom switched
74268+			 * into the PRE_COMMIT stage and became too large, we
74269+			 * cannot change its state back to CAPTURE_WAIT (atom
74270+			 * stage can only increase monotonically), hence this
74271+			 * check.
74272+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74273+ reiser4_atom_set_stage(cd->atom,
74274+ ASTAGE_CAPTURE_WAIT);
74275+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74276+ }
74277+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74278+ /*
74279+ * this thread (transaction handle that is) doesn't
74280+ * want to commit atom. Notify waiters that handle is
74281+ * closed. This can happen, for example, when we are
74282+ * under VFS directory lock and don't want to commit
74283+ * atom right now to avoid stalling other threads
74284+ * working in the same directory.
74285+ */
74286+
74287+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
74288+ * commit this atom: no atom waiters and only one
74289+ * (our) open transaction handle. */
74290+ cd->wake_ktxnmgrd_up =
74291+ cd->atom->txnh_count == 1 &&
74292+ cd->atom->nr_waiters == 0;
74293+ reiser4_atom_send_event(cd->atom);
74294+ result = 0;
74295+ } else if (!atom_can_be_committed(cd->atom)) {
74296+ if (should_wait_commit(cd->txnh)) {
74297+ /* sync(): wait for commit */
74298+ cd->atom->nr_waiters++;
74299+ cd->wait = 1;
74300+ reiser4_atom_wait_event(cd->atom);
74301+ result = RETERR(-E_REPEAT);
74302+ } else {
74303+ result = 0;
74304+ }
74305+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74306+ /*
74307+ * optimization: flush atom without switching it into
74308+ * ASTAGE_CAPTURE_WAIT.
74309+ *
74310+ * But don't do this for ktxnmgrd, because ktxnmgrd
74311+ * should never block on atom fusion.
74312+ */
74313+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74314+ LONG_MAX, &cd->nr_written,
74315+ &cd->atom, NULL);
74316+ if (result == 0) {
74317+ spin_unlock_atom(cd->atom);
74318+ cd->preflush = 0;
74319+ result = RETERR(-E_REPEAT);
74320+			} else	/* Atom wasn't flushed
74321+ * completely. Rinse. Repeat. */
74322+ --cd->preflush;
74323+ } else {
74324+ /* We change atom state to ASTAGE_CAPTURE_WAIT to
74325+ prevent atom fusion and count ourself as an active
74326+ flusher */
74327+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74328+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74329+
74330+ result =
74331+ commit_current_atom(&cd->nr_written, &cd->atom);
74332+ if (result != 0 && result != -E_REPEAT)
74333+ cd->failed = 1;
74334+ }
74335+ } else
74336+ result = 0;
74337+
74338+#if REISER4_DEBUG
74339+ if (result == 0)
74340+ assert_spin_locked(&(cd->atom->alock));
74341+#endif
74342+
74343+ /* perfectly valid assertion, except that when atom/txnh is not locked
74344+ * fusion can take place, and cd->atom points nowhere. */
74345+ /*
74346+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74347+ */
74348+ return result;
74349+}
74350+
74351+/* Called to commit a transaction handle. This decrements the atom's number of open
74352+   handles and, if it is the last handle to commit and the atom should commit, initiates
74353+   atom commit. If commit does not fail, return the number of written blocks */
74354+static int commit_txnh(txn_handle * txnh)
74355+{
74356+ commit_data cd;
74357+ assert("umka-192", txnh != NULL);
74358+
74359+ memset(&cd, 0, sizeof cd);
74360+ cd.txnh = txnh;
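+	/* allow up to ten preliminary flushes before the atom is switched into
+	 * ASTAGE_CAPTURE_WAIT (see the commit_data.preflush comment above) */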
74361+ cd.preflush = 10;
74362+
74363+ /* calls try_commit_txnh() until either atom commits, or error
74364+ * happens */
74365+ while (try_commit_txnh(&cd) != 0)
74366+ reiser4_preempt_point();
74367+
74368+ spin_lock_txnh(txnh);
74369+
74370+ cd.atom->txnh_count -= 1;
74371+ txnh->atom = NULL;
74372+ /* remove transaction handle from atom's list of transaction handles */
74373+ list_del_init(&txnh->txnh_link);
74374+
74375+ spin_unlock_txnh(txnh);
74376+ atom_dec_and_unlock(cd.atom);
74377+	/* if the current thread doesn't want to do the commit itself
74378+	 * (TXNH_DONT_COMMIT is set, probably because it takes time), that work
74379+	 * is done asynchronously by the ktxnmgrd daemon. */
74380+ if (cd.wake_ktxnmgrd_up)
74381+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
74382+
74383+ return 0;
74384+}
74385+
74386+/* TRY_CAPTURE */
74387+
74388+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74389+ condition indicates that the request should be retried, and it may block if the
74390+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74391+
74392+ This routine encodes the basic logic of block capturing described by:
74393+
74394+ http://namesys.com/v4/v4.html
74395+
74396+ Our goal here is to ensure that any two blocks that contain dependent modifications
74397+ should commit at the same time. This function enforces this discipline by initiating
74398+ fusion whenever a transaction handle belonging to one atom requests to read or write a
74399+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74400+
74401+ In addition, this routine handles the initial assignment of atoms to blocks and
74402+ transaction handles. These are possible outcomes of this function:
74403+
74404+ 1. The block and handle are already part of the same atom: return immediate success
74405+
74406+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74407+ the handle to the block's atom.
74408+
74409+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
74410+ the block to the handle's atom.
74411+
74412+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74413+ to fuse atoms.
74414+
74415+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
74416+
74417+ 6. A read request for a non-captured block: return immediate success.
74418+
74419+ This function acquires and releases the handle's spinlock. This function is called
74420+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
74421+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74422+ released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
74423+ lock in the failure case.
74424+*/
74425+static int try_capture_block(
74426+ txn_handle * txnh, jnode * node, txn_capture mode,
74427+ txn_atom ** atom_alloc)
74428+{
74429+ txn_atom *block_atom;
74430+ txn_atom *txnh_atom;
74431+
74432+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
74433+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74434+
74435+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74436+ * node->tree somewhere. */
74437+ assert("umka-194", txnh != NULL);
74438+ assert("umka-195", node != NULL);
74439+
74440+ /* The jnode is already locked! Being called from reiser4_try_capture(). */
74441+ assert_spin_locked(&(node->guard));
74442+ block_atom = node->atom;
74443+
74444+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74445+ let us touch the atoms themselves. */
74446+ spin_lock_txnh(txnh);
74447+ txnh_atom = txnh->atom;
74448+	/* The capture process continues into one of four branches, depending on
74449+	   which of the two atoms (block atom (node->atom) and current atom
74450+	   (txnh->atom)) exist. */
74451+ if (txnh_atom == NULL) {
74452+ if (block_atom == NULL) {
74453+ spin_unlock_txnh(txnh);
74454+ spin_unlock_jnode(node);
74455+ /* assign empty atom to the txnh and repeat */
74456+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74457+ } else {
74458+ atomic_inc(&block_atom->refcount);
74459+ /* node spin-lock isn't needed anymore */
74460+ spin_unlock_jnode(node);
74461+ if (!spin_trylock_atom(block_atom)) {
74462+ spin_unlock_txnh(txnh);
74463+ spin_lock_atom(block_atom);
74464+ spin_lock_txnh(txnh);
74465+ }
74466+ /* re-check state after getting txnh and the node
74467+ * atom spin-locked */
74468+ if (node->atom != block_atom || txnh->atom != NULL) {
74469+ spin_unlock_txnh(txnh);
74470+ atom_dec_and_unlock(block_atom);
74471+ return RETERR(-E_REPEAT);
74472+ }
74473+ atomic_dec(&block_atom->refcount);
74474+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74475+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74476+ block_atom->txnh_count != 0))
74477+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
74478+ capture_assign_txnh_nolock(block_atom, txnh);
74479+ spin_unlock_txnh(txnh);
74480+ spin_unlock_atom(block_atom);
74481+ return RETERR(-E_REPEAT);
74482+ }
74483+ } else {
74484+		/* It is time to perform a deadlock prevention check over the
74485+		   node we want to capture.  It is possible this node was locked
74486+		   for read without capturing it. The optimization which allows
74487+		   this helps us keep atoms independent as long as possible, but
74488+		   it may cause lock/fuse deadlock problems.
74489+
74490+		   A number of similar deadlock situations with locked but not
74491+		   captured nodes were found.  In each situation there are two
74492+		   or more threads: one of them does flushing while another one
74493+		   does routine balancing or tree lookup.  The flushing thread
74494+		   (F) sleeps in a long term locking request for node (N), while
74495+		   another thread (A) sleeps trying to capture some node already
74496+		   belonging to the atom of F, and F is in a state which
74497+		   prevents immediate fusion.
74498+
74499+		   Deadlocks of this kind cannot happen if node N was properly
74500+		   captured by thread A. Thread F fuses atoms before locking,
74501+		   therefore the current atom of thread F and the current atom
74502+		   of thread A become the same atom and thread A may proceed.
74503+		   This does not work if node N was not captured, because then
74504+		   the fusion of atoms does not happen.
74505+
74506+		   The following scheme solves the deadlock: if
74507+		   longterm_lock_znode locks and does not capture a znode, that
74508+		   znode is marked as MISSED_IN_CAPTURE.  A node marked this way
74509+		   is processed by the code below, which restores the missed
74510+		   capture and fuses the current atoms of all the node's lock
74511+		   owners by calling the fuse_not_fused_lock_owners() function. */
74512+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74513+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74514+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74515+ spin_unlock_txnh(txnh);
74516+ spin_unlock_jnode(node);
74517+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
74518+ return RETERR(-E_REPEAT);
74519+ }
74520+ }
74521+ if (block_atom == NULL) {
74522+ atomic_inc(&txnh_atom->refcount);
74523+ spin_unlock_txnh(txnh);
74524+ if (!spin_trylock_atom(txnh_atom)) {
74525+ spin_unlock_jnode(node);
74526+ spin_lock_atom(txnh_atom);
74527+ spin_lock_jnode(node);
74528+ }
74529+ if (txnh->atom != txnh_atom || node->atom != NULL
74530+ || JF_ISSET(node, JNODE_IS_DYING)) {
74531+ spin_unlock_jnode(node);
74532+ atom_dec_and_unlock(txnh_atom);
74533+ return RETERR(-E_REPEAT);
74534+ }
74535+ atomic_dec(&txnh_atom->refcount);
74536+ capture_assign_block_nolock(txnh_atom, node);
74537+ spin_unlock_atom(txnh_atom);
74538+ } else {
74539+ if (txnh_atom != block_atom) {
74540+ if (mode & TXN_CAPTURE_DONT_FUSE) {
74541+ spin_unlock_txnh(txnh);
74542+ spin_unlock_jnode(node);
74543+ /* we are in a "no-fusion" mode and @node is
74544+ * already part of transaction. */
74545+ return RETERR(-E_NO_NEIGHBOR);
74546+ }
74547+ return capture_init_fusion(node, txnh, mode);
74548+ }
74549+ spin_unlock_txnh(txnh);
74550+ }
74551+ }
74552+ return 0;
74553+}
74554+
74555+static txn_capture
74556+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74557+{
74558+ txn_capture cap_mode;
74559+
74560+ assert_spin_locked(&(node->guard));
74561+
74562+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74563+
74564+ if (lock_mode == ZNODE_WRITE_LOCK) {
74565+ cap_mode = TXN_CAPTURE_WRITE;
74566+ } else if (node->atom != NULL) {
74567+ cap_mode = TXN_CAPTURE_WRITE;
74568+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
74569+ jnode_get_level(node) == LEAF_LEVEL) {
74570+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
74571+ /* We only need a READ_FUSING capture at the leaf level. This
74572+ is because the internal levels of the tree (twigs included)
74573+		   are redundant from the point of view of the user that asked for a
74574+ read-fusing transcrash. The user only wants to read-fuse
74575+ atoms due to reading uncommitted data that another user has
74576+ written. It is the file system that reads/writes the
74577+ internal tree levels, the user only reads/writes leaves. */
74578+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
74579+ } else {
74580+ /* In this case (read lock at a non-leaf) there's no reason to
74581+ * capture. */
74582+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74583+ return 0;
74584+ }
74585+
74586+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74587+ assert("nikita-3186", cap_mode != 0);
74588+ return cap_mode;
74589+}
74590+
74591+/* This is an external interface to try_capture_block(); it calls
74592+   try_capture_block() repeatedly as long as -E_REPEAT is returned.
74593+
74594+   @node: node to capture,
74595+   @lock_mode: read or write lock is used in the capture mode calculation,
74596+   @flags: see the txn_capture flags enumeration,
74597+
74598+   @return: 0 - node was successfully captured, -E_REPEAT - capture request
74599+   cannot be processed immediately, as was requested in flags,
74600+   < 0 - other errors.
74601+*/
74603+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
74604+ txn_capture flags)
74605+{
74606+ txn_atom *atom_alloc = NULL;
74607+ txn_capture cap_mode;
74608+ txn_handle *txnh = get_current_context()->trans;
74609+ int ret;
74610+
74611+ assert_spin_locked(&(node->guard));
74612+
74613+ repeat:
74614+ if (JF_ISSET(node, JNODE_IS_DYING))
74615+ return RETERR(-EINVAL);
74616+ if (node->atom != NULL && txnh->atom == node->atom)
74617+ return 0;
74618+ cap_mode = build_capture_mode(node, lock_mode, flags);
74619+ if (cap_mode == 0 ||
74620+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74621+ /* Mark this node as "MISSED". It helps in further deadlock
74622+ * analysis */
74623+ if (jnode_is_znode(node))
74624+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74625+ return 0;
74626+ }
74627+ /* Repeat try_capture as long as -E_REPEAT is returned. */
74628+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74629+ /* Regardless of non_blocking:
74630+
74631+ If ret == 0 then jnode is still locked.
74632+ If ret != 0 then jnode is unlocked.
74633+ */
74634+#if REISER4_DEBUG
74635+ if (ret == 0)
74636+ assert_spin_locked(&(node->guard));
74637+ else
74638+ assert_spin_not_locked(&(node->guard));
74639+#endif
74640+ assert_spin_not_locked(&(txnh->guard));
74641+
74642+ if (ret == -E_REPEAT) {
74643+ /* E_REPEAT implies all locks were released, therefore we need
74644+ to take the jnode's lock again. */
74645+ spin_lock_jnode(node);
74646+
74647+ /* Although this may appear to be a busy loop, it is not.
74648+ There are several conditions that cause E_REPEAT to be
74649+ returned by the call to try_capture_block, all cases
74650+ indicating some kind of state change that means you should
74651+ retry the request and will get a different result. In some
74652+ cases this could be avoided with some extra code, but
74653+ generally it is done because the necessary locks were
74654+ released as a result of the operation and repeating is the
74655+ simplest thing to do (less bug potential). The cases are:
74656+ atom fusion returns E_REPEAT after it completes (jnode and
74657+ txnh were unlocked); race conditions in assign_block,
74658+ assign_txnh, and init_fusion return E_REPEAT (trylock
74659+ failure); after going to sleep in capture_fuse_wait
74660+ (request was blocked but may now succeed). I'm not quite
74661+ sure how capture_copy works yet, but it may also return
74662+ E_REPEAT. When the request is legitimately blocked, the
74663+ requestor goes to sleep in fuse_wait, so this is not a busy
74664+ loop. */
74665+ /* NOTE-NIKITA: still don't understand:
74666+
74667+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74668+
74669+ looks like busy loop?
74670+ */
74671+ goto repeat;
74672+ }
74673+
74674+ /* free extra atom object that was possibly allocated by
74675+ try_capture_block().
74676+
74677+ Do this before acquiring jnode spin lock to
74678+ minimize time spent under lock. --nikita */
74679+ if (atom_alloc != NULL) {
74680+ kmem_cache_free(_atom_slab, atom_alloc);
74681+ }
74682+
74683+ if (ret != 0) {
74684+ if (ret == -E_BLOCK) {
74685+ assert("nikita-3360",
74686+ cap_mode & TXN_CAPTURE_NONBLOCKING);
74687+ ret = -E_REPEAT;
74688+ }
74689+
74690+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
74691+ want to fix the above code to avoid releasing the lock and
74692+		   re-acquiring it, but there are cases where failure occurs
74693+ when the lock is not held, and those cases would need to be
74694+ modified to re-take the lock. */
74695+ spin_lock_jnode(node);
74696+ }
74697+
74698+ /* Jnode is still locked. */
74699+ assert_spin_locked(&(node->guard));
74700+ return ret;
74701+}
74702+
74703+static void release_two_atoms(txn_atom *one, txn_atom *two)
74704+{
74705+ spin_unlock_atom(one);
74706+ atom_dec_and_unlock(two);
74707+ spin_lock_atom(one);
74708+ atom_dec_and_unlock(one);
74709+}
74710+
74711+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
74712+ returned by that routine. The txn_capture request mode is computed here depending on
74713+ the transaction handle's type and the lock request. This is called from the depths of
74714+ the lock manager with the jnode lock held and it always returns with the jnode lock
74715+ held.
74716+*/
74717+
74718+/* fuse all 'active' atoms of lock owners of given node. */
74719+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
74720+{
74721+ lock_handle *lh;
74722+ int repeat;
74723+ txn_atom *atomh, *atomf;
74724+ reiser4_context *me = get_current_context();
74725+ reiser4_context *ctx = NULL;
74726+
74727+ assert_spin_not_locked(&(ZJNODE(node)->guard));
74728+ assert_spin_not_locked(&(txnh->hlock));
74729+
74730+ repeat:
74731+ repeat = 0;
74732+ atomh = txnh_get_atom(txnh);
74733+ spin_unlock_txnh(txnh);
74734+ assert("zam-692", atomh != NULL);
74735+
74736+ spin_lock_zlock(&node->lock);
74737+ /* inspect list of lock owners */
74738+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
74739+ ctx = get_context_by_lock_stack(lh->owner);
74740+ if (ctx == me)
74741+ continue;
74742+		/* below we use two assumptions to avoid additional spin-locks
74743+		   when checking the condition:
74744+
74745+		   1) if the lock stack holds a lock, the transaction should be
74746+		   open, i.e. ctx->trans != NULL;
74747+
74748+		   2) reading the well-aligned ctx->trans->atom is atomic; if it
74749+		   equals the address of the spin-locked atomh, we take it that
74750+		   the atoms are the same and nothing has to be captured. */
74751+ if (atomh != ctx->trans->atom) {
74752+ reiser4_wake_up(lh->owner);
74753+ repeat = 1;
74754+ break;
74755+ }
74756+ }
74757+ if (repeat) {
74758+ if (!spin_trylock_txnh(ctx->trans)) {
74759+ spin_unlock_zlock(&node->lock);
74760+ spin_unlock_atom(atomh);
74761+ goto repeat;
74762+ }
74763+ atomf = ctx->trans->atom;
74764+ if (atomf == NULL) {
74765+ capture_assign_txnh_nolock(atomh, ctx->trans);
74766+ /* release zlock lock _after_ assigning the atom to the
74767+ * transaction handle, otherwise the lock owner thread
74768+ * may unlock all znodes, exit kernel context and here
74769+ * we would access an invalid transaction handle. */
74770+ spin_unlock_zlock(&node->lock);
74771+ spin_unlock_atom(atomh);
74772+ spin_unlock_txnh(ctx->trans);
74773+ goto repeat;
74774+ }
74775+ assert("zam-1059", atomf != atomh);
74776+ spin_unlock_zlock(&node->lock);
74777+ atomic_inc(&atomh->refcount);
74778+ atomic_inc(&atomf->refcount);
74779+ spin_unlock_txnh(ctx->trans);
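+		/* take the two atom locks in ascending address order to avoid
+		 * deadlock with concurrent fusers; atomh is already held, so
+		 * drop and re-take it when it is the higher of the two */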
74780+ if (atomf > atomh) {
74781+ spin_lock_atom_nested(atomf);
74782+ } else {
74783+ spin_unlock_atom(atomh);
74784+ spin_lock_atom(atomf);
74785+ spin_lock_atom_nested(atomh);
74786+ }
74787+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
74788+ release_two_atoms(atomf, atomh);
74789+ goto repeat;
74790+ }
74791+ atomic_dec(&atomh->refcount);
74792+ atomic_dec(&atomf->refcount);
74793+ capture_fuse_into(atomf, atomh);
74794+ goto repeat;
74795+ }
74796+ spin_unlock_zlock(&node->lock);
74797+ spin_unlock_atom(atomh);
74798+}
74799+
74800+/* This is the interface to capture unformatted nodes via their struct page
74801+ reference. Currently it is only used in reiser4_invalidatepage */
74802+int try_capture_page_to_invalidate(struct page *pg)
74803+{
74804+ int ret;
74805+ jnode *node;
74806+
74807+ assert("umka-292", pg != NULL);
74808+ assert("nikita-2597", PageLocked(pg));
74809+
74810+ if (IS_ERR(node = jnode_of_page(pg))) {
74811+ return PTR_ERR(node);
74812+ }
74813+
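+	/* the capture is attempted under the jnode spin lock with the page
+	 * unlocked; the page lock is re-taken for the caller afterwards */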
74814+ spin_lock_jnode(node);
74815+ unlock_page(pg);
74816+
74817+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
74818+ spin_unlock_jnode(node);
74819+ jput(node);
74820+ lock_page(pg);
74821+ return ret;
74822+}
74823+
74824+/* This informs the transaction manager when a node is deleted. Add the block to the
74825+ atom's delete set and uncapture the block.
74826+
74827+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
74828+explanations. find all the functions that use it, and unless there is some very
74829+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
74830+move the loop to inside the function.
74831+
74832+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
74833+ */
74834+void reiser4_uncapture_page(struct page *pg)
74835+{
74836+ jnode *node;
74837+ txn_atom *atom;
74838+
74839+ assert("umka-199", pg != NULL);
74840+ assert("nikita-3155", PageLocked(pg));
74841+
74842+ clear_page_dirty_for_io(pg);
74843+
74844+ reiser4_wait_page_writeback(pg);
74845+
74846+ node = jprivate(pg);
74847+ BUG_ON(node == NULL);
74848+
74849+ spin_lock_jnode(node);
74850+
74851+ atom = jnode_get_atom(node);
74852+ if (atom == NULL) {
74853+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74854+ spin_unlock_jnode(node);
74855+ return;
74856+ }
74857+
74858+	/* We can remove a jnode from the transaction even if it is on the
74859+	 * flush queue prepped list; we only need to be sure that the flush
74860+	 * queue is not being written by reiser4_write_fq().  reiser4_write_fq()
74861+	 * does not use the atom spin lock to protect the prepped nodes list;
74862+	 * instead it increments the atom's nr_running_queues counter for the
74863+	 * time when the prepped list is not protected by the spin lock.  Here
74864+	 * we check this counter if we want to remove the jnode from the flush
74865+	 * queue and, if the counter is not zero, wait for all reiser4_write_fq()
74866+	 * calls for this atom to complete.  This is not significant overhead. */
74867+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
74868+ spin_unlock_jnode(node);
74869+ /*
74870+ * at this moment we want to wait for "atom event", viz. wait
74871+ * until @node can be removed from flush queue. But
74872+ * reiser4_atom_wait_event() cannot be called with page locked,
74873+ * because it deadlocks with jnode_extent_write(). Unlock page,
74874+ * after making sure (through page_cache_get()) that it cannot
74875+ * be released from memory.
74876+ */
74877+ page_cache_get(pg);
74878+ unlock_page(pg);
74879+ reiser4_atom_wait_event(atom);
74880+ lock_page(pg);
74881+ /*
74882+		 * page may have been detached by ->writepage()->releasepage().
74883+ */
74884+ reiser4_wait_page_writeback(pg);
74885+ spin_lock_jnode(node);
74886+ page_cache_release(pg);
74887+ atom = jnode_get_atom(node);
74888+/* VS-FIXME-HANS: improve the commenting in this function */
74889+ if (atom == NULL) {
74890+ spin_unlock_jnode(node);
74891+ return;
74892+ }
74893+ }
74894+ reiser4_uncapture_block(node);
74895+ spin_unlock_atom(atom);
74896+ jput(node);
74897+}
74898+
74899+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
74900+ * inode's tree of jnodes */
74901+void reiser4_uncapture_jnode(jnode * node)
74902+{
74903+ txn_atom *atom;
74904+
74905+ assert_spin_locked(&(node->guard));
74906+ assert("", node->pg == 0);
74907+
74908+ atom = jnode_get_atom(node);
74909+ if (atom == NULL) {
74910+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74911+ spin_unlock_jnode(node);
74912+ return;
74913+ }
74914+
74915+ reiser4_uncapture_block(node);
74916+ spin_unlock_atom(atom);
74917+ jput(node);
74918+}
74919+
74920+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
74921+ increases atom refcount and txnh_count, adds to txnh_list. */
74922+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
74923+{
74924+ assert("umka-200", atom != NULL);
74925+ assert("umka-201", txnh != NULL);
74926+
74927+ assert_spin_locked(&(txnh->hlock));
74928+ assert_spin_locked(&(atom->alock));
74929+ assert("jmacd-824", txnh->atom == NULL);
74930+ assert("nikita-3540", atom_isopen(atom));
74931+ BUG_ON(txnh->atom != NULL);
74932+
74933+ atomic_inc(&atom->refcount);
74934+ txnh->atom = atom;
74935+ reiser4_ctx_gfp_mask_set();
74936+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
74937+ atom->txnh_count += 1;
74938+}
74939+
74940+/* No-locking version of assign_block. Sets the block's atom pointer, references the
74941+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
74942+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
74943+{
74944+ assert("umka-202", atom != NULL);
74945+ assert("umka-203", node != NULL);
74946+ assert_spin_locked(&(node->guard));
74947+ assert_spin_locked(&(atom->alock));
74948+ assert("jmacd-323", node->atom == NULL);
74949+ BUG_ON(!list_empty_careful(&node->capture_link));
74950+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
74951+
74952+ /* Pointer from jnode to atom is not counted in atom->refcount. */
74953+ node->atom = atom;
74954+
74955+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
74956+ atom->capture_count += 1;
74957+ /* reference to jnode is acquired by atom. */
74958+ jref(node);
74959+
74960+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
74961+
74962+ LOCK_CNT_INC(t_refs);
74963+}
74964+
74965+/* common code for dirtying both unformatted jnodes and formatted znodes. */
74966+static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
74967+{
74968+ assert_spin_locked(&(node->guard));
74969+ assert_spin_locked(&(atom->alock));
74970+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
74971+
74972+ JF_SET(node, JNODE_DIRTY);
74973+
74974+ get_current_context()->nr_marked_dirty++;
74975+
74976+	/* We move one additional block from grabbed to flush reserved
74977+	   only if the node was not CREATED and jnode_flush did not sort
74978+	   it into either the relocate set or the overwrite set. If the
74979+	   node is in the overwrite or relocate set we assume that the
74980+	   atom's flush reserved counter was already adjusted. */
74981+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
74982+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
74983+ && !jnode_is_cluster_page(node)) {
74984+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
74985+ assert("vs-1506", *jnode_get_block(node) != 0);
74986+ grabbed2flush_reserved_nolock(atom, (__u64) 1);
74987+ JF_SET(node, JNODE_FLUSH_RESERVED);
74988+ }
74989+
74990+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74992+		/* Sometimes a node is set dirty before being captured -- the
74993+		   case for new jnodes. In that case the jnode will be added to
74994+		   the appropriate list in capture_assign_block_nolock. Another
74995+		   reason not to re-link the jnode here is that it is on a
74996+		   flush queue (see flush.c for details). */
74997+
74998+ int level = jnode_get_level(node);
74999+
75000+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
75001+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
75002+ assert("nikita-2607", 0 <= level);
75003+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
75004+
75005+ /* move node to atom's dirty list */
75006+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
75007+ ON_DEBUG(count_jnode
75008+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
75009+ }
75010+}
75011+
75012+/* Set the dirty status for this (spin locked) jnode. */
75013+void jnode_make_dirty_locked(jnode * node)
75014+{
75015+ assert("umka-204", node != NULL);
75016+ assert_spin_locked(&(node->guard));
75017+
75018+ if (REISER4_DEBUG && rofs_jnode(node)) {
75019+ warning("nikita-3365", "Dirtying jnode on rofs");
75020+ dump_stack();
75021+ }
75022+
75023+ /* Fast check for already dirty node */
75024+ if (!JF_ISSET(node, JNODE_DIRTY)) {
75025+ txn_atom *atom;
75026+
75027+ atom = jnode_get_atom(node);
75028+ assert("vs-1094", atom);
75029+ /* Check jnode dirty status again because node spin lock might
75030+ * be released inside jnode_get_atom(). */
75031+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
75032+ do_jnode_make_dirty(node, atom);
75033+ spin_unlock_atom(atom);
75034+ }
75035+}
75036+
75037+/* Set the dirty status for this znode. */
75038+void znode_make_dirty(znode * z)
75039+{
75040+ jnode *node;
75041+ struct page *page;
75042+
75043+ assert("umka-204", z != NULL);
75044+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
75045+ assert("nikita-3560", znode_is_write_locked(z));
75046+
75047+ node = ZJNODE(z);
75048+ /* znode is longterm locked, we can check dirty bit without spinlock */
75049+ if (JF_ISSET(node, JNODE_DIRTY)) {
75050+ /* znode is dirty already. All we have to do is to change znode version */
75051+ z->version = znode_build_version(jnode_get_tree(node));
75052+ return;
75053+ }
75054+
75055+ spin_lock_jnode(node);
75056+ jnode_make_dirty_locked(node);
75057+ page = jnode_page(node);
75058+ if (page != NULL) {
75059+		/* this is a useful assertion (it allows one to check that no
75060+		 * modifications are lost due to the update of an in-flight
75061+		 * page), but it requires locking the page to check the
75062+		 * PG_writeback bit. */
75063+ /* assert("nikita-3292",
75064+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75065+ page_cache_get(page);
75066+
75067+ /* jnode lock is not needed for the rest of
75068+		 * znode_make_dirty(). */
75069+ spin_unlock_jnode(node);
75070+		/* reiser4 file write code calls set_page_dirty for
75071+		 * unformatted nodes; for formatted nodes we do it here. */
75072+ reiser4_set_page_dirty_internal(page);
75073+ page_cache_release(page);
75074+ /* bump version counter in znode */
75075+ z->version = znode_build_version(jnode_get_tree(node));
75076+ } else {
75077+ assert("zam-596", znode_above_root(JZNODE(node)));
75078+ spin_unlock_jnode(node);
75079+ }
75080+
75081+ assert("nikita-1900", znode_is_write_locked(z));
75082+ assert("jmacd-9777", node->atom != NULL);
75083+}
75084+
75085+int reiser4_sync_atom(txn_atom * atom)
75086+{
75087+ int result;
75088+ txn_handle *txnh;
75089+
75090+ txnh = get_current_context()->trans;
75091+
75092+ result = 0;
75093+ if (atom != NULL) {
75094+ if (atom->stage < ASTAGE_PRE_COMMIT) {
75095+ spin_lock_txnh(txnh);
75096+ capture_assign_txnh_nolock(atom, txnh);
75097+ result = force_commit_atom(txnh);
75098+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
75099+ /* wait atom commit */
75100+ reiser4_atom_wait_event(atom);
75101+ /* try once more */
75102+ result = RETERR(-E_REPEAT);
75103+ } else
75104+ spin_unlock_atom(atom);
75105+ }
75106+ return result;
75107+}
75108+
75109+#if REISER4_DEBUG
75110+
75111+/* move jnode from one list to another;
75112+   call this after atom->capture_count is updated */
75113+void
75114+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75115+ atom_list new_list, int check_lists)
75116+{
75117+ struct list_head *pos;
75118+
75119+ assert("zam-1018", atom_is_protected(atom));
75120+ assert_spin_locked(&(node->guard));
75121+ assert("", NODE_LIST(node) == old_list);
75122+
75123+ switch (NODE_LIST(node)) {
75124+ case NOT_CAPTURED:
75125+ break;
75126+ case DIRTY_LIST:
75127+ assert("", atom->dirty > 0);
75128+ atom->dirty--;
75129+ break;
75130+ case CLEAN_LIST:
75131+ assert("", atom->clean > 0);
75132+ atom->clean--;
75133+ break;
75134+ case FQ_LIST:
75135+ assert("", atom->fq > 0);
75136+ atom->fq--;
75137+ break;
75138+ case WB_LIST:
75139+ assert("", atom->wb > 0);
75140+ atom->wb--;
75141+ break;
75142+ case OVRWR_LIST:
75143+ assert("", atom->ovrwr > 0);
75144+ atom->ovrwr--;
75145+ break;
75146+ default:
75147+ impossible("", "");
75148+ }
75149+
75150+ switch (new_list) {
75151+ case NOT_CAPTURED:
75152+ break;
75153+ case DIRTY_LIST:
75154+ atom->dirty++;
75155+ break;
75156+ case CLEAN_LIST:
75157+ atom->clean++;
75158+ break;
75159+ case FQ_LIST:
75160+ atom->fq++;
75161+ break;
75162+ case WB_LIST:
75163+ atom->wb++;
75164+ break;
75165+ case OVRWR_LIST:
75166+ atom->ovrwr++;
75167+ break;
75168+ default:
75169+ impossible("", "");
75170+ }
75171+ ASSIGN_NODE_LIST(node, new_list);
75172+ if (0 && check_lists) {
75173+ int count;
75174+ tree_level level;
75175+
75176+ count = 0;
75177+
75178+ /* flush queue list */
75179+ /* reiser4_check_fq(atom); */
75180+
75181+ /* dirty list */
75182+ count = 0;
75183+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75184+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75185+ count++;
75186+ }
75187+ if (count != atom->dirty)
75188+ warning("", "dirty counter %d, real %d\n", atom->dirty,
75189+ count);
75190+
75191+ /* clean list */
75192+ count = 0;
75193+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
75194+ count++;
75195+ if (count != atom->clean)
75196+ warning("", "clean counter %d, real %d\n", atom->clean,
75197+ count);
75198+
75199+ /* wb list */
75200+ count = 0;
75201+ list_for_each(pos, ATOM_WB_LIST(atom))
75202+ count++;
75203+ if (count != atom->wb)
75204+ warning("", "wb counter %d, real %d\n", atom->wb,
75205+ count);
75206+
75207+ /* overwrite list */
75208+ count = 0;
75209+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
75210+ count++;
75211+
75212+ if (count != atom->ovrwr)
75213+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75214+ count);
75215+ }
75216+ assert("vs-1624", atom->num_queued == atom->fq);
75217+ if (atom->capture_count !=
75218+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75219+ printk
75220+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75221+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75222+ atom->wb, atom->fq);
75223+ assert("vs-1622",
75224+ atom->capture_count ==
75225+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75226+ atom->fq);
75227+ }
75228+}
75229+
75230+#endif
75231+
75232+/* Make node OVRWR and put it on atom->overwrite_nodes list; the atom lock and
75233+ * jnode lock should be taken before calling this function. */
75234+void jnode_make_wander_nolock(jnode * node)
75235+{
75236+ txn_atom *atom;
75237+
75238+ assert("nikita-2431", node != NULL);
75239+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75240+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75241+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75242+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75243+
75244+ atom = node->atom;
75245+
75246+ assert("zam-895", atom != NULL);
75247+ assert("zam-894", atom_is_protected(atom));
75248+
75249+ JF_SET(node, JNODE_OVRWR);
75250+ /* move node to atom's overwrite list */
75251+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75252+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75253+}
75254+
75255+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
75256+ * this function. */
75257+void jnode_make_wander(jnode * node)
75258+{
75259+ txn_atom *atom;
75260+
75261+ spin_lock_jnode(node);
75262+ atom = jnode_get_atom(node);
75263+ assert("zam-913", atom != NULL);
75264+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75265+
75266+ jnode_make_wander_nolock(node);
75267+ spin_unlock_atom(atom);
75268+ spin_unlock_jnode(node);
75269+}
75270+
75271+/* this just sets RELOC bit */
75272+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75273+{
75274+ assert_spin_locked(&(node->guard));
75275+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75276+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75277+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75278+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75279+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75280+ jnode_set_reloc(node);
75281+}
75282+
75283+/* Make znode RELOC and put it on flush queue */
75284+void znode_make_reloc(znode * z, flush_queue_t * fq)
75285+{
75286+ jnode *node;
75287+ txn_atom *atom;
75288+
75289+ node = ZJNODE(z);
75290+ spin_lock_jnode(node);
75291+
75292+ atom = jnode_get_atom(node);
75293+ assert("zam-919", atom != NULL);
75294+
75295+ jnode_make_reloc_nolock(fq, node);
75296+ queue_jnode(fq, node);
75297+
75298+ spin_unlock_atom(atom);
75299+ spin_unlock_jnode(node);
75301+}
75302+
75303+/* Make unformatted node RELOC and put it on flush queue */
75304+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75305+{
75306+ assert("vs-1479", jnode_is_unformatted(node));
75307+
75308+ jnode_make_reloc_nolock(fq, node);
75309+ queue_jnode(fq, node);
75310+}
75311+
75312+int reiser4_capture_super_block(struct super_block *s)
75313+{
75314+ int result;
75315+ znode *uber;
75316+ lock_handle lh;
75317+
75318+ init_lh(&lh);
75319+ result = get_uber_znode(reiser4_get_tree(s),
75320+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75321+ if (result)
75322+ return result;
75323+
75324+ uber = lh.node;
75325+ /* Grabbing one block for superblock */
75326+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75327+	if (result != 0) {
75328+		done_lh(&lh);
75329+		return result;
75330+	}
75329+
75330+ znode_make_dirty(uber);
75331+
75332+ done_lh(&lh);
75333+ return 0;
75334+}
75335+
75336+/* Wakeup every handle on the atom's WAITFOR list */
75337+static void wakeup_atom_waitfor_list(txn_atom * atom)
75338+{
75339+ txn_wait_links *wlinks;
75340+
75341+ assert("umka-210", atom != NULL);
75342+
75343+ /* atom is locked */
75344+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75345+ if (wlinks->waitfor_cb == NULL ||
75346+ wlinks->waitfor_cb(atom, wlinks))
75347+ /* Wake up. */
75348+ reiser4_wake_up(wlinks->_lock_stack);
75349+ }
75350+}
75351+
75352+/* Wakeup every handle on the atom's WAITING list */
75353+static void wakeup_atom_waiting_list(txn_atom * atom)
75354+{
75355+ txn_wait_links *wlinks;
75356+
75357+ assert("umka-211", atom != NULL);
75358+
75359+ /* atom is locked */
75360+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75361+ if (wlinks->waiting_cb == NULL ||
75362+ wlinks->waiting_cb(atom, wlinks))
75363+ /* Wake up. */
75364+ reiser4_wake_up(wlinks->_lock_stack);
75365+ }
75366+}
75367+
75368+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75369+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75370+{
75371+ assert("nikita-3330", atom != NULL);
75372+ assert_spin_locked(&(atom->alock));
75373+
75374+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75375+	 * the last transaction handle. */
75376+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75377+}
75378+
75379+/* The general purpose of this function is to wait on the first of two possible events.
75380+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
75381+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75382+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75383+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75384+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75385+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
75386+
75387+   In other words, if either atomh or atomf changes state, the handle will be awakened;
75388+   thus there are two lists per atom: WAITING and WAITFOR.
75389+
75390+   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75391+   close when the handle is not yet assigned to an atom of its own.
75392+
75393+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75394+ BOTH_ATOM_LOCKS. Result: all four locks are released.
75395+*/
75396+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75397+ txn_atom * atomh, txn_capture mode)
75398+{
75399+ int ret;
75400+ txn_wait_links wlinks;
75401+
75402+ assert("umka-213", txnh != NULL);
75403+ assert("umka-214", atomf != NULL);
75404+
75405+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75406+ spin_unlock_txnh(txnh);
75407+ spin_unlock_atom(atomf);
75408+
75409+ if (atomh) {
75410+ spin_unlock_atom(atomh);
75411+ }
75412+
75413+ return RETERR(-E_BLOCK);
75414+ }
75415+
75416+ /* Initialize the waiting list links. */
75417+ init_wlinks(&wlinks);
75418+
75419+ /* Add txnh to atomf's waitfor list, unlock atomf. */
75420+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75421+ wlinks.waitfor_cb = wait_for_fusion;
75422+ atomic_inc(&atomf->refcount);
75423+ spin_unlock_atom(atomf);
75424+
75425+ if (atomh) {
75426+ /* Add txnh to atomh's waiting list, unlock atomh. */
75427+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75428+ atomic_inc(&atomh->refcount);
75429+ spin_unlock_atom(atomh);
75430+ }
75431+
75432+ /* Go to sleep. */
75433+ spin_unlock_txnh(txnh);
75434+
75435+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
75436+ if (ret == 0) {
75437+ reiser4_go_to_sleep(wlinks._lock_stack);
75438+ ret = RETERR(-E_REPEAT);
75439+ }
75440+
75441+ /* Remove from the waitfor list. */
75442+ spin_lock_atom(atomf);
75443+
75444+ list_del(&wlinks._fwaitfor_link);
75445+ atom_dec_and_unlock(atomf);
75446+
75447+ if (atomh) {
75448+ /* Remove from the waiting list. */
75449+ spin_lock_atom(atomh);
75450+ list_del(&wlinks._fwaiting_link);
75451+ atom_dec_and_unlock(atomh);
75452+ }
75453+ return ret;
75454+}
75455+
75456+static void lock_two_atoms(txn_atom * one, txn_atom * two)
75457+{
75458+ assert("zam-1067", one != two);
75459+
75460+ /* lock the atom with lesser address first */
75461+ if (one < two) {
75462+ spin_lock_atom(one);
75463+ spin_lock_atom_nested(two);
75464+ } else {
75465+ spin_lock_atom(two);
75466+ spin_lock_atom_nested(one);
75467+ }
75468+}
75469+
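+/*
+ * A minimal sketch of the same address-ordering idiom in isolation
+ * (illustrative only, not part of this patch; lock_pair and its
+ * arguments are hypothetical, while spin_lock(), spin_lock_nested()
+ * and SINGLE_DEPTH_NESTING are the real kernel APIs used above):
+ *
+ *	static void lock_pair(spinlock_t *a, spinlock_t *b)
+ *	{
+ *		if (a < b) {
+ *			spin_lock(a);
+ *			spin_lock_nested(b, SINGLE_DEPTH_NESTING);
+ *		} else {
+ *			spin_lock(b);
+ *			spin_lock_nested(a, SINGLE_DEPTH_NESTING);
+ *		}
+ *	}
+ *
+ * Because every thread takes the lower-addressed lock first, two
+ * threads locking the same pair of atoms always agree on the order and
+ * cannot deadlock; the _nested annotation only keeps lockdep quiet
+ * about taking two locks of the same class.
+ */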
75470+/* Perform the necessary work to prepare for fusing two atoms, which involves
75471+ * acquiring two atom locks in the proper order. If the node's atom is
75472+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75473+ * atom is not, then the handle's request is put to sleep. If the node's atom
75474+ * is committing, then the node can be copy-on-captured. Otherwise, pick the
75475+ * atom with fewer pointers to be fused into the atom with more pointers and
75476+ * call capture_fuse_into.
75477+ */
75478+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75479+{
75480+ txn_atom * txnh_atom = txnh->atom;
75481+ txn_atom * block_atom = node->atom;
75482+
75483+ atomic_inc(&txnh_atom->refcount);
75484+ atomic_inc(&block_atom->refcount);
75485+
75486+ spin_unlock_txnh(txnh);
75487+ spin_unlock_jnode(node);
75488+
75489+ lock_two_atoms(txnh_atom, block_atom);
75490+
75491+ if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75492+ release_two_atoms(txnh_atom, block_atom);
75493+ return RETERR(-E_REPEAT);
75494+ }
75495+
75496+ atomic_dec(&txnh_atom->refcount);
75497+ atomic_dec(&block_atom->refcount);
75498+
75499+ assert ("zam-1066", atom_isopen(txnh_atom));
75500+
75501+ if (txnh_atom->stage >= block_atom->stage ||
75502+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75503+ capture_fuse_into(txnh_atom, block_atom);
75504+ return RETERR(-E_REPEAT);
75505+ }
75506+ spin_lock_txnh(txnh);
75507+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75508+}
75509+
75510+/* This function splices together two jnode lists (small and large) and sets all jnodes in
75511+   the small list to point to the large atom. Returns the length of the small list. */
75512+static int
75513+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75514+ struct list_head *small_head)
75515+{
75516+ int count = 0;
75517+ jnode *node;
75518+
75519+ assert("umka-218", large != NULL);
75520+ assert("umka-219", large_head != NULL);
75521+ assert("umka-220", small_head != NULL);
75522+ /* small atom should be locked also. */
75523+ assert_spin_locked(&(large->alock));
75524+
75525+ /* For every jnode on small's capture list... */
75526+ list_for_each_entry(node, small_head, capture_link) {
75527+ count += 1;
75528+
75529+ /* With the jnode lock held, update atom pointer. */
75530+ spin_lock_jnode(node);
75531+ node->atom = large;
75532+ spin_unlock_jnode(node);
75533+ }
75534+
75535+ /* Splice the lists. */
75536+ list_splice_init(small_head, large_head->prev);
75537+
75538+ return count;
75539+}
75540+
75541+/* This function splices together two txnh lists (small and large) and sets all txn handles in
75542+   the small list to point to the large atom. Returns the length of the small list. */
75543+static int
75544+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75545+ struct list_head *small_head)
75546+{
75547+ int count = 0;
75548+ txn_handle *txnh;
75549+
75550+ assert("umka-221", large != NULL);
75551+ assert("umka-222", large_head != NULL);
75552+ assert("umka-223", small_head != NULL);
75553+
75554+ /* Adjust every txnh to the new atom. */
75555+ list_for_each_entry(txnh, small_head, txnh_link) {
75556+ count += 1;
75557+
75558+ /* With the txnh lock held, update atom pointer. */
75559+ spin_lock_txnh(txnh);
75560+ txnh->atom = large;
75561+ spin_unlock_txnh(txnh);
75562+ }
75563+
75564+ /* Splice the txn_handle list. */
75565+ list_splice_init(small_head, large_head->prev);
75566+
75567+ return count;
75568+}
75569+
75570+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
75571+ added to LARGE and their ->atom pointers are all updated. The associated counts are
75572+ updated as well, and any waiting handles belonging to either are awakened. Finally the
75573+ smaller atom's refcount is decremented.
75574+*/
75575+static void capture_fuse_into(txn_atom * small, txn_atom * large)
75576+{
75577+ int level;
75578+ unsigned zcount = 0;
75579+ unsigned tcount = 0;
75580+
75581+ assert("umka-224", small != NULL);
75582+	assert("umka-225", large != NULL);
75583+
75584+ assert_spin_locked(&(large->alock));
75585+ assert_spin_locked(&(small->alock));
75586+
75587+ assert("jmacd-201", atom_isopen(small));
75588+ assert("jmacd-202", atom_isopen(large));
75589+
75590+ /* Splice and update the per-level dirty jnode lists */
75591+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75592+ zcount +=
75593+ capture_fuse_jnode_lists(large,
75594+ ATOM_DIRTY_LIST(large, level),
75595+ ATOM_DIRTY_LIST(small, level));
75596+ }
75597+
75598+ /* Splice and update the [clean,dirty] jnode and txnh lists */
75599+ zcount +=
75600+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75601+ ATOM_CLEAN_LIST(small));
75602+ zcount +=
75603+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75604+ ATOM_OVRWR_LIST(small));
75605+ zcount +=
75606+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75607+ ATOM_WB_LIST(small));
75608+ zcount +=
75609+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75610+ tcount +=
75611+ capture_fuse_txnh_lists(large, &large->txnh_list,
75612+ &small->txnh_list);
75613+
75614+ /* Check our accounting. */
75615+ assert("jmacd-1063",
75616+ zcount + small->num_queued == small->capture_count);
75617+ assert("jmacd-1065", tcount == small->txnh_count);
75618+
75619+	/* sum the numbers of waiting threads */
75620+ large->nr_waiters += small->nr_waiters;
75621+ small->nr_waiters = 0;
75622+
75623+ /* splice flush queues */
75624+ reiser4_fuse_fq(large, small);
75625+
75626+	/* update the jnode counters on each of the atom's lists */
75627+ ON_DEBUG(large->dirty += small->dirty;
75628+ small->dirty = 0;
75629+ large->clean += small->clean;
75630+ small->clean = 0;
75631+ large->ovrwr += small->ovrwr;
75632+ small->ovrwr = 0;
75633+ large->wb += small->wb;
75634+ small->wb = 0;
75635+ large->fq += small->fq;
75636+ small->fq = 0;);
75637+
75638+ /* count flushers in result atom */
75639+ large->nr_flushers += small->nr_flushers;
75640+ small->nr_flushers = 0;
75641+
75642+ /* update counts of flushed nodes */
75643+ large->flushed += small->flushed;
75644+ small->flushed = 0;
75645+
75646+ /* Transfer list counts to large. */
75647+ large->txnh_count += small->txnh_count;
75648+ large->capture_count += small->capture_count;
75649+
75650+ /* Add all txnh references to large. */
75651+ atomic_add(small->txnh_count, &large->refcount);
75652+ atomic_sub(small->txnh_count, &small->refcount);
75653+
75654+ /* Reset small counts */
75655+ small->txnh_count = 0;
75656+ small->capture_count = 0;
75657+
75658+ /* Assign the oldest start_time, merge flags. */
75659+ large->start_time = min(large->start_time, small->start_time);
75660+ large->flags |= small->flags;
75661+
75662+ /* Merge blocknr sets. */
75663+ blocknr_set_merge(&small->delete_set, &large->delete_set);
75664+ blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75665+
75666+ /* Merge allocated/deleted file counts */
75667+ large->nr_objects_deleted += small->nr_objects_deleted;
75668+ large->nr_objects_created += small->nr_objects_created;
75669+
75670+ small->nr_objects_deleted = 0;
75671+ small->nr_objects_created = 0;
75672+
75673+ /* Merge allocated blocks counts */
75674+ large->nr_blocks_allocated += small->nr_blocks_allocated;
75675+
75676+ large->nr_running_queues += small->nr_running_queues;
75677+ small->nr_running_queues = 0;
75678+
75679+ /* Merge blocks reserved for overwrite set. */
75680+ large->flush_reserved += small->flush_reserved;
75681+ small->flush_reserved = 0;
75682+
75683+ if (large->stage < small->stage) {
75684+ /* Large only needs to notify if it has changed state. */
75685+ reiser4_atom_set_stage(large, small->stage);
75686+ wakeup_atom_waiting_list(large);
75687+ }
75688+
75689+ reiser4_atom_set_stage(small, ASTAGE_INVALID);
75690+
75691+ /* Notify any waiters--small needs to unload its wait lists. Waiters
75692+ actually remove themselves from the list before returning from the
75693+ fuse_wait function. */
75694+ wakeup_atom_waiting_list(small);
75695+
75696+ /* Unlock atoms */
75697+ spin_unlock_atom(large);
75698+ atom_dec_and_unlock(small);
75699+}
75700+
75701+/* TXNMGR STUFF */
75702+
75703+/* Release a block from the atom, reversing the effects of being captured;
75704+   do not release the atom's reference to the jnode here, because spin-locks are held.
75705+ Currently this is only called when the atom commits.
75706+
75707+ NOTE: this function does not release a (journal) reference to jnode
75708+ due to locking optimizations, you should call jput() somewhere after
75709+ calling reiser4_uncapture_block(). */
75710+void reiser4_uncapture_block(jnode * node)
75711+{
75712+ txn_atom *atom;
75713+
75714+ assert("umka-226", node != NULL);
75715+ atom = node->atom;
75716+ assert("umka-228", atom != NULL);
75717+
75718+ assert("jmacd-1021", node->atom == atom);
75719+ assert_spin_locked(&(node->guard));
75720+ assert("jmacd-1023", atom_is_protected(atom));
75721+
75722+ JF_CLR(node, JNODE_DIRTY);
75723+ JF_CLR(node, JNODE_RELOC);
75724+ JF_CLR(node, JNODE_OVRWR);
75725+ JF_CLR(node, JNODE_CREATED);
75726+ JF_CLR(node, JNODE_WRITEBACK);
75727+ JF_CLR(node, JNODE_REPACK);
75728+
75729+ list_del_init(&node->capture_link);
75730+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75731+ assert("zam-925", atom_isopen(atom));
75732+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
75733+ ON_DEBUG(atom->num_queued--);
75734+ JF_CLR(node, JNODE_FLUSH_QUEUED);
75735+ }
75736+ atom->capture_count -= 1;
75737+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
75738+ node->atom = NULL;
75739+
75740+ spin_unlock_jnode(node);
75741+ LOCK_CNT_DEC(t_refs);
75742+}
75743+
75744+/* Unconditional insert of jnode into atom's overwrite list. Currently used in
75745+   bitmap-based allocator code for adding modified bitmap blocks to the
75746+   transaction. @atom and @node are spin locked */
75747+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
75748+{
75749+ assert("zam-538", atom_is_protected(atom));
75750+ assert_spin_locked(&(node->guard));
75751+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
75752+ assert("zam-543", node->atom == NULL);
75753+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
75754+
75755+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
75756+ jref(node);
75757+ node->atom = atom;
75758+ atom->capture_count++;
75759+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
75760+}
75761+
75762+static int count_deleted_blocks_actor(txn_atom * atom,
75763+ const reiser4_block_nr * a,
75764+ const reiser4_block_nr * b, void *data)
75765+{
75766+ reiser4_block_nr *counter = data;
75767+
75768+ assert("zam-995", data != NULL);
75769+ assert("zam-996", a != NULL);
75770+ if (b == NULL)
75771+ *counter += 1;
75772+ else
75773+ *counter += *b;
75774+ return 0;
75775+}
75776+
75777+reiser4_block_nr txnmgr_count_deleted_blocks(void)
75778+{
75779+ reiser4_block_nr result;
75780+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
75781+ txn_atom *atom;
75782+
75783+ result = 0;
75784+
75785+ spin_lock_txnmgr(tmgr);
75786+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
75787+ spin_lock_atom(atom);
75788+ if (atom_isopen(atom))
75789+ blocknr_set_iterator(
75790+ atom, &atom->delete_set,
75791+ count_deleted_blocks_actor, &result, 0);
75792+ spin_unlock_atom(atom);
75793+ }
75794+ spin_unlock_txnmgr(tmgr);
75795+
75796+ return result;
75797+}
75798+
75799+/*
75800+ * Local variables:
75801+ * c-indentation-style: "K&R"
75802+ * mode-name: "LC"
75803+ * c-basic-offset: 8
75804+ * tab-width: 8
75805+ * fill-column: 79
75806+ * End:
75807+ */
75808diff -urN linux-2.6.20.orig/fs/reiser4/txnmgr.h linux-2.6.20/fs/reiser4/txnmgr.h
75809--- linux-2.6.20.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
75810+++ linux-2.6.20/fs/reiser4/txnmgr.h 2007-05-06 14:50:43.899038216 +0400
75811@@ -0,0 +1,708 @@
75812+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75813+ * reiser4/README */
75814+
75815+/* data-types and function declarations for transaction manager. See txnmgr.c
75816+ * for details. */
75817+
75818+#ifndef __REISER4_TXNMGR_H__
75819+#define __REISER4_TXNMGR_H__
75820+
75821+#include "forward.h"
75822+#include "dformat.h"
75823+
75824+#include <linux/fs.h>
75825+#include <linux/mm.h>
75826+#include <linux/types.h>
75827+#include <linux/spinlock.h>
75828+#include <asm/atomic.h>
75829+#include <linux/wait.h>
75830+
75831+/* TYPE DECLARATIONS */
75832+
75833+/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
75834+ A capture request dynamically assigns a block to the calling thread's transaction
75835+ handle. */
75836+typedef enum {
75837+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
75838+ atom should fuse in order to ensure that the block commits atomically with the
75839+ caller. */
75840+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
75841+
75842+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
75843+ willing to read a non-committed block without causing atoms to fuse. */
75844+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
75845+
75846+ /* A READ_MODIFY request indicates that a block will be read but that the caller
75847+ wishes for the block to be captured as it will be written. This capture request
75848+ mode is not currently used, but eventually it will be useful for preventing
75849+ deadlock in read-modify-write cycles. */
75850+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
75851+
75852+ /* A WRITE capture request indicates that a block will be modified and that atoms
75853+ should fuse to make the commit atomic. */
75854+ TXN_CAPTURE_WRITE = (1 << 3),
75855+
75856+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
75857+ exclusive type designation from extra bits that may be supplied -- see
75858+ below. */
75859+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
75860+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
75861+ TXN_CAPTURE_WRITE),
75862+
75863+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
75864+ indicate modification will occur. */
75865+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
75866+
75867+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
75868+ prefer not to sleep waiting for an aging atom to commit. */
75869+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
75870+
75871+ /* An option to reiser4_try_capture to prevent atom fusion, just simple
75872+ capturing is allowed */
75873+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
75874+
75875+ /* This macro selects only the exclusive capture request types, stripping out any
75876+ options that were supplied (i.e., NONBLOCKING). */
75877+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
75878+} txn_capture;
75879+
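+/* A hedged usage sketch: the option bits combine with a capture type by
+ * OR. For example, a write capture that would rather fail with -E_BLOCK
+ * than sleep waiting for an aging atom (the node variable is
+ * hypothetical; reiser4_try_capture() is declared later in this header):
+ *
+ *	ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK,
+ *				  TXN_CAPTURE_NONBLOCKING);
+ *
+ * and CAPTURE_TYPE() strips the option bits again:
+ *
+ *	CAPTURE_TYPE(TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING)
+ *		== TXN_CAPTURE_WRITE
+ */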
75880+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only
75881+   difference is in the handling of read requests. A WRITE_FUSING transaction handle
75882+   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
75883+   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
75884+typedef enum {
75885+ TXN_WRITE_FUSING = (1 << 0),
75886+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
75887+} txn_mode;
75888+
75889+/* Every atom has a stage, which is one of these exclusive values: */
75890+typedef enum {
75891+ /* Initially an atom is free. */
75892+ ASTAGE_FREE = 0,
75893+
75894+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
75895+ blocks and fuse with other atoms. */
75896+ ASTAGE_CAPTURE_FUSE = 1,
75897+
75898+	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
75899+
75900+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
75901+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
75902+ atoms in the CAPTURE_FUSE stage. */
75903+ ASTAGE_CAPTURE_WAIT = 2,
75904+
75905+ /* Waiting for I/O before commit. Copy-on-capture (see
75906+ http://namesys.com/v4/v4.html). */
75907+ ASTAGE_PRE_COMMIT = 3,
75908+
75909+ /* Post-commit overwrite I/O. Steal-on-capture. */
75910+ ASTAGE_POST_COMMIT = 4,
75911+
75912+	/* Atom which waits for the last reference to it to be removed
75913+	 * before being deleted from memory */
75914+ ASTAGE_DONE = 5,
75915+
75916+ /* invalid atom. */
75917+ ASTAGE_INVALID = 6,
75918+
75919+} txn_stage;
75920+
75921+/* Certain flags may be set in the txn_atom->flags field. */
75922+typedef enum {
75923+ /* Indicates that the atom should commit as soon as possible. */
75924+ ATOM_FORCE_COMMIT = (1 << 0),
75925+	/* to avoid an endless loop, mark the atom (which was considered
75926+	 * too small) after a failed attempt to fuse it. */
75927+ ATOM_CANCEL_FUSION = (1 << 1)
75928+} txn_flags;
75929+
75930+/* Flags for controlling commit_txnh */
75931+typedef enum {
75932+	/* Wait for atom commit completion in commit_txnh */
75933+ TXNH_WAIT_COMMIT = 0x2,
75934+ /* Don't commit atom when this handle is closed */
75935+ TXNH_DONT_COMMIT = 0x4
75936+} txn_handle_flags_t;
75937+
75938+/* TYPE DEFINITIONS */
75939+
75940+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
75941+ fields, so typically an operation on the atom through either of these objects must (1)
75942+ lock the object, (2) read the atom pointer, (3) lock the atom.
75943+
75944+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
75945+ through the list of handles and pages held by the smaller of the two atoms. For each
75946+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
75947+ object, and (2) update the atom pointer.
75948+
75949+ You can see that there is a conflict of lock ordering here, so the more-complex
75950+ procedure should have priority, i.e., the fusing process has priority so that it is
75951+ guaranteed to make progress and to avoid restarts.
75952+
75953+   This decision, however, means additional complexity for acquiring the atom lock in the
75954+ first place.
75955+
75956+ The general original procedure followed in the code was:
75957+
75958+ TXN_OBJECT *obj = ...;
75959+ TXN_ATOM *atom;
75960+
75961+ spin_lock (& obj->_lock);
75962+
75963+ atom = obj->_atom;
75964+
75965+ if (! spin_trylock_atom (atom))
75966+ {
75967+ spin_unlock (& obj->_lock);
75968+ RESTART OPERATION, THERE WAS A RACE;
75969+ }
75970+
75971+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75972+
75973+   It has, however, been found that this wastes a lot of CPU in a manner that is
75974+   hard to profile. So, proper refcounting was added to atoms, and the new
75975+   standard locking sequence is as follows:
75976+
75977+ TXN_OBJECT *obj = ...;
75978+ TXN_ATOM *atom;
75979+
75980+ spin_lock (& obj->_lock);
75981+
75982+ atom = obj->_atom;
75983+
75984+ if (! spin_trylock_atom (atom))
75985+ {
75986+ atomic_inc (& atom->refcount);
75987+ spin_unlock (& obj->_lock);
75988+ spin_lock (&atom->_lock);
75989+ atomic_dec (& atom->refcount);
75990+ // HERE atom is locked
75991+ spin_unlock (&atom->_lock);
75992+ RESTART OPERATION, THERE WAS A RACE;
75993+ }
75994+
75995+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75996+
75997+ (core of this is implemented in trylock_throttle() function)
75998+
75999+ See the jnode_get_atom() function for a common case.
76000+
76001+   As an additional (and important) optimization that avoids restarts, it is
76002+   possible to re-check the required pre-conditions at the HERE point in the
76003+   code above and proceed without restarting if they are still satisfied.
76004+*/
76005+
76006+/* An atomic transaction: this is the underlying system representation
76007+ of a transaction, not the one seen by clients.
76008+
76009+ Invariants involving this data-type:
76010+
76011+ [sb-fake-allocated]
76012+*/
76013+struct txn_atom {
76014+ /* The spinlock protecting the atom, held during fusion and various other state
76015+ changes. */
76016+ spinlock_t alock;
76017+
76018+	/* The atom's reference counter. Incrementing (in the case of
76019+	   duplicating an existing reference, or when we are sure that some
76020+	   other reference exists) may be done without taking the spinlock;
76021+	   decrementing the reference counter requires the spinlock to be held.
76022+
76023+ Each transaction handle counts in ->refcount. All jnodes count as
76024+ one reference acquired in atom_begin_andlock(), released in
76025+ commit_current_atom().
76026+ */
76027+ atomic_t refcount;
76028+
76029+ /* The atom_id identifies the atom in persistent records such as the log. */
76030+ __u32 atom_id;
76031+
76032+ /* Flags holding any of the txn_flags enumerated values (e.g.,
76033+ ATOM_FORCE_COMMIT). */
76034+ __u32 flags;
76035+
76036+ /* Number of open handles. */
76037+ __u32 txnh_count;
76038+
76039+	/* The number of nodes captured by this atom: the sum of the lengths
76040+	   of the dirty_nodes[level], clean_nodes, ovrwr_nodes,
76041+	   writeback_nodes and flush queue lists (see the vs-1622 check). */
76041+ __u32 capture_count;
76042+
76043+#if REISER4_DEBUG
76044+ int clean;
76045+ int dirty;
76046+ int ovrwr;
76047+ int wb;
76048+ int fq;
76049+#endif
76050+
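+	/* number of flushed nodes; merged into the larger atom on fusion
+	   (see capture_fuse_into()) */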
76051+ __u32 flushed;
76052+
76053+ /* Current transaction stage. */
76054+ txn_stage stage;
76055+
76056+ /* Start time. */
76057+ unsigned long start_time;
76058+
76059+ /* The atom's delete set. It collects block numbers of the nodes
76060+ which were deleted during the transaction. */
76061+ struct list_head delete_set;
76062+
76063+ /* The atom's wandered_block mapping. */
76064+ struct list_head wandered_map;
76065+
76066+	/* The transaction's list of dirty captured nodes, per level; indexed
76067+	   by level. dirty_nodes[0] is for the znode-above-root */
76068+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76069+
76070+ /* The transaction's list of clean captured nodes. */
76071+ struct list_head clean_nodes;
76072+
76073+ /* The atom's overwrite set */
76074+ struct list_head ovrwr_nodes;
76075+
76076+ /* nodes which are being written to disk */
76077+ struct list_head writeback_nodes;
76078+
76079+ /* list of inodes */
76080+ struct list_head inodes;
76081+
76082+ /* List of handles associated with this atom. */
76083+ struct list_head txnh_list;
76084+
76085+ /* Transaction list link: list of atoms in the transaction manager. */
76086+ struct list_head atom_link;
76087+
76088+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76089+ struct list_head fwaitfor_list;
76090+
76091+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76092+ struct list_head fwaiting_list;
76093+
76094+	/* Numbers of objects which were deleted/created in this transaction,
76095+	   and thereby the numbers of object IDs which were released/allocated. */
76096+ int nr_objects_deleted;
76097+ int nr_objects_created;
76098+ /* number of blocks allocated during the transaction */
76099+ __u64 nr_blocks_allocated;
76100+ /* All atom's flush queue objects are on this list */
76101+ struct list_head flush_queues;
76102+#if REISER4_DEBUG
76103+ /* number of flush queues for this atom. */
76104+ int nr_flush_queues;
76105+ /* Number of jnodes which were removed from atom's lists and put
76106+ on flush_queue */
76107+ int num_queued;
76108+#endif
76109+ /* number of threads who wait for this atom to complete commit */
76110+ int nr_waiters;
76111+ /* number of threads which do jnode_flush() over this atom */
76112+ int nr_flushers;
76113+	/* number of flush queues which are IN_USE and whose jnodes from
76114+	   fq->prepped are being submitted to disk by the reiser4_write_fq()
76115+	   routine. */
76115+ int nr_running_queues;
76116+	/* A counter of grabbed unformatted nodes; see the description of the
76117+	   reiser4 space reservation scheme in block_alloc.c */
76118+ reiser4_block_nr flush_reserved;
76119+#if REISER4_DEBUG
76120+ void *committer;
76121+#endif
76122+ struct super_block *super;
76123+};
76124+
76125+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76126+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76127+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76128+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76129+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76130+
76131+#define NODE_LIST(node) (node)->list
76132+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76133+ON_DEBUG(void
76134+ count_jnode(txn_atom *, jnode *, atom_list old_list,
76135+ atom_list new_list, int check_lists));
76136+
76137+typedef struct protected_jnodes {
76138+	struct list_head inatom;	/* link to atom's list of these structures */
76139+ struct list_head nodes; /* head of list of protected nodes */
76140+} protected_jnodes;
76141+
76142+/* A transaction handle: the client obtains and commits this handle which is assigned by
76143+ the system to a txn_atom. */
76144+struct txn_handle {
76145+ /* Spinlock protecting ->atom pointer */
76146+ spinlock_t hlock;
76147+
76148+ /* Flags for controlling commit_txnh() behavior */
76149+ /* from txn_handle_flags_t */
76150+ txn_handle_flags_t flags;
76151+
76152+ /* Whether it is READ_FUSING or WRITE_FUSING. */
76153+ txn_mode mode;
76154+
76155+ /* If assigned, the atom it is part of. */
76156+ txn_atom *atom;
76157+
76158+ /* Transaction list link. Head is in txn_atom. */
76159+ struct list_head txnh_link;
76160+};
76161+
76162+/* The transaction manager: one is contained in the reiser4_super_info_data */
76163+struct txn_mgr {
76164+ /* A spinlock protecting the atom list, id_count, flush_control */
76165+ spinlock_t tmgr_lock;
76166+
76167+ /* List of atoms. */
76168+ struct list_head atoms_list;
76169+
76170+ /* Number of atoms. */
76171+ int atom_count;
76172+
76173+ /* A counter used to assign atom->atom_id values. */
76174+ __u32 id_count;
76175+
76176+ /* a mutex object for commit serialization */
76177+ struct mutex commit_mutex;
76178+
76179+	/* a list of all txnmgrs served by a particular daemon. */
76180+ struct list_head linkage;
76181+
76182+ /* description of daemon for this txnmgr */
76183+ ktxnmgrd_context *daemon;
76184+
76185+ /* parameters. Adjustable through mount options. */
76186+ unsigned int atom_max_size;
76187+ unsigned int atom_max_age;
76188+ unsigned int atom_min_size;
76189+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
76190+ unsigned int atom_max_flushers;
76191+ struct dentry *debugfs_atom_count;
76192+ struct dentry *debugfs_id_count;
76193+};
76194+
76195+/* FUNCTION DECLARATIONS */
76196+
76197+/* These are the externally (within Reiser4) visible transaction functions, therefore they
76198+ are prefixed with "txn_". For comments, see txnmgr.c. */
76199+
76200+extern int init_txnmgr_static(void);
76201+extern void done_txnmgr_static(void);
76202+
76203+extern void reiser4_init_txnmgr(txn_mgr *);
76204+extern void reiser4_done_txnmgr(txn_mgr *);
76205+
76206+extern int reiser4_txn_reserve(int reserved);
76207+
76208+extern void reiser4_txn_begin(reiser4_context * context);
76209+extern int reiser4_txn_end(reiser4_context * context);
76210+
76211+extern void reiser4_txn_restart(reiser4_context * context);
76212+extern void reiser4_txn_restart_current(void);
76213+
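+/* A hedged sketch of the handle life cycle these declarations suggest
+ * (ctx and the error handling are hypothetical):
+ *
+ *	reiser4_txn_begin(ctx);		// attach a txn_handle to ctx
+ *	... capture and modify nodes ...
+ *	ret = reiser4_txn_end(ctx);	// close the handle, maybe commit
+ *
+ * reiser4_txn_restart() ends the current handle and begins a fresh one;
+ * this is how -E_REPEAT retry loops detach from an atom that has gone
+ * to commit.
+ */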
76214+extern int txnmgr_force_commit_all(struct super_block *, int);
76215+extern int current_atom_should_commit(void);
76216+
76217+extern jnode *find_first_dirty_jnode(txn_atom *, int);
76218+
76219+extern int commit_some_atoms(txn_mgr *);
76220+extern int force_commit_atom(txn_handle *);
76221+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76222+
76223+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76224+
76225+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
76226+
76227+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76228+ int alloc_value);
76229+extern void atom_dec_and_unlock(txn_atom * atom);
76230+
76231+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76232+extern int try_capture_page_to_invalidate(struct page *pg);
76233+
76234+extern void reiser4_uncapture_page(struct page *pg);
76235+extern void reiser4_uncapture_block(jnode *);
76236+extern void reiser4_uncapture_jnode(jnode *);
76237+
76238+extern int reiser4_capture_inode(struct inode *);
76239+extern int reiser4_uncapture_inode(struct inode *);
76240+
76241+extern txn_atom *get_current_atom_locked_nocheck(void);
76242+
76243+#if REISER4_DEBUG
76244+
76245+/**
76246+ * atom_is_protected - make sure that nobody but us can do anything with atom
76247+ * @atom: atom to be checked
76248+ *
76249+ * This is used to assert that atom either entered commit stages or is spin
76250+ * locked.
76251+ */
76252+static inline int atom_is_protected(txn_atom *atom)
76253+{
76254+ if (atom->stage >= ASTAGE_PRE_COMMIT)
76255+ return 1;
76256+ assert_spin_locked(&(atom->alock));
76257+ return 1;
76258+}
76259+
76260+#endif
76261+
76262+/* Get the current atom and spin-lock it. Never returns NULL: asserts that the current atom is present. */
76263+static inline txn_atom *get_current_atom_locked(void)
76264+{
76265+ txn_atom *atom;
76266+
76267+ atom = get_current_atom_locked_nocheck();
76268+ assert("zam-761", atom != NULL);
76269+
76270+ return atom;
76271+}
76272+
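+/* Hedged usage sketch (purely illustrative):
+ *
+ *	atom = get_current_atom_locked();
+ *	... inspect or modify the atom under its spinlock ...
+ *	spin_unlock_atom(atom);
+ */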
76273+extern txn_atom *jnode_get_atom(jnode *);
76274+
76275+extern void reiser4_atom_wait_event(txn_atom *);
76276+extern void reiser4_atom_send_event(txn_atom *);
76277+
76278+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76279+extern int reiser4_capture_super_block(struct super_block *s);
76280+int capture_bulk(jnode **, int count);
76281+
76282+/* See the comment on the function blocknrset.c:blocknr_set_add for the
76283+ calling convention of these three routines. */
76284+extern void blocknr_set_init(struct list_head * bset);
76285+extern void blocknr_set_destroy(struct list_head * bset);
76286+extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
76287+extern int blocknr_set_add_extent(txn_atom * atom,
76288+ struct list_head * bset,
76289+ blocknr_set_entry ** new_bsep,
76290+ const reiser4_block_nr * start,
76291+ const reiser4_block_nr * len);
76292+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
76293+ blocknr_set_entry ** new_bsep,
76294+ const reiser4_block_nr * a,
76295+ const reiser4_block_nr * b);
76296+
76297+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76298+ const reiser4_block_nr *, void *);
76299+
76300+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
76301+ blocknr_set_actor_f actor, void *data,
76302+ int delete);
76303+
76304+/* flush code takes care of how to fuse flush queues */
76305+extern void flush_init_atom(txn_atom * atom);
76306+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76307+
76308+static inline void spin_lock_atom(txn_atom *atom)
76309+{
76310+ /* check that spinlocks of lower priorities are not held */
76311+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76312+ LOCK_CNT_NIL(spin_locked_atom) &&
76313+ LOCK_CNT_NIL(spin_locked_jnode) &&
76314+ LOCK_CNT_NIL(spin_locked_zlock) &&
76315+ LOCK_CNT_NIL(rw_locked_dk) &&
76316+ LOCK_CNT_NIL(rw_locked_tree)));
76317+
76318+ spin_lock(&(atom->alock));
76319+
76320+ LOCK_CNT_INC(spin_locked_atom);
76321+ LOCK_CNT_INC(spin_locked);
76322+}
76323+
76324+static inline void spin_lock_atom_nested(txn_atom *atom)
76325+{
76326+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76327+ LOCK_CNT_NIL(spin_locked_jnode) &&
76328+ LOCK_CNT_NIL(spin_locked_zlock) &&
76329+ LOCK_CNT_NIL(rw_locked_dk) &&
76330+ LOCK_CNT_NIL(rw_locked_tree)));
76331+
76332+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
76333+
76334+ LOCK_CNT_INC(spin_locked_atom);
76335+ LOCK_CNT_INC(spin_locked);
76336+}
76337+
76338+static inline int spin_trylock_atom(txn_atom *atom)
76339+{
76340+ if (spin_trylock(&(atom->alock))) {
76341+ LOCK_CNT_INC(spin_locked_atom);
76342+ LOCK_CNT_INC(spin_locked);
76343+ return 1;
76344+ }
76345+ return 0;
76346+}
76347+
76348+static inline void spin_unlock_atom(txn_atom *atom)
76349+{
76350+ assert_spin_locked(&(atom->alock));
76351+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76352+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76353+
76354+ LOCK_CNT_DEC(spin_locked_atom);
76355+ LOCK_CNT_DEC(spin_locked);
76356+
76357+ spin_unlock(&(atom->alock));
76358+}
76359+
76360+static inline void spin_lock_txnh(txn_handle *txnh)
76361+{
76362+ /* check that spinlocks of lower priorities are not held */
76363+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76364+ LOCK_CNT_NIL(spin_locked_zlock) &&
76365+ LOCK_CNT_NIL(rw_locked_tree)));
76366+
76367+ spin_lock(&(txnh->hlock));
76368+
76369+ LOCK_CNT_INC(spin_locked_txnh);
76370+ LOCK_CNT_INC(spin_locked);
76371+}
76372+
76373+static inline int spin_trylock_txnh(txn_handle *txnh)
76374+{
76375+ if (spin_trylock(&(txnh->hlock))) {
76376+ LOCK_CNT_INC(spin_locked_txnh);
76377+ LOCK_CNT_INC(spin_locked);
76378+ return 1;
76379+ }
76380+ return 0;
76381+}
76382+
76383+static inline void spin_unlock_txnh(txn_handle *txnh)
76384+{
76385+ assert_spin_locked(&(txnh->hlock));
76386+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76387+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76388+
76389+ LOCK_CNT_DEC(spin_locked_txnh);
76390+ LOCK_CNT_DEC(spin_locked);
76391+
76392+ spin_unlock(&(txnh->hlock));
76393+}
76394+
76395+#define spin_ordering_pred_txnmgr(tmgr) \
76396+ ( LOCK_CNT_NIL(spin_locked_atom) && \
76397+ LOCK_CNT_NIL(spin_locked_txnh) && \
76398+ LOCK_CNT_NIL(spin_locked_jnode) && \
76399+ LOCK_CNT_NIL(rw_locked_zlock) && \
76400+ LOCK_CNT_NIL(rw_locked_dk) && \
76401+ LOCK_CNT_NIL(rw_locked_tree) )
76402+
76403+static inline void spin_lock_txnmgr(txn_mgr *mgr)
76404+{
76405+ /* check that spinlocks of lower priorities are not held */
76406+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76407+ LOCK_CNT_NIL(spin_locked_txnh) &&
76408+ LOCK_CNT_NIL(spin_locked_jnode) &&
76409+ LOCK_CNT_NIL(spin_locked_zlock) &&
76410+ LOCK_CNT_NIL(rw_locked_dk) &&
76411+ LOCK_CNT_NIL(rw_locked_tree)));
76412+
76413+ spin_lock(&(mgr->tmgr_lock));
76414+
76415+ LOCK_CNT_INC(spin_locked_txnmgr);
76416+ LOCK_CNT_INC(spin_locked);
76417+}
76418+
76419+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76420+{
76421+ if (spin_trylock(&(mgr->tmgr_lock))) {
76422+ LOCK_CNT_INC(spin_locked_txnmgr);
76423+ LOCK_CNT_INC(spin_locked);
76424+ return 1;
76425+ }
76426+ return 0;
76427+}
76428+
76429+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76430+{
76431+ assert_spin_locked(&(mgr->tmgr_lock));
76432+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76433+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76434+
76435+ LOCK_CNT_DEC(spin_locked_txnmgr);
76436+ LOCK_CNT_DEC(spin_locked);
76437+
76438+ spin_unlock(&(mgr->tmgr_lock));
76439+}
76440+
76441+typedef enum {
76442+ FQ_IN_USE = 0x1
76443+} flush_queue_state_t;
76444+
76445+typedef struct flush_queue flush_queue_t;
76446+
76447+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76448+ is filled by the jnode_flush() routine, and written to disk under memory
76449+ pressure or at atom commit time. */
76450+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
76451+ field and fq->prepped list can be modified if atom is spin-locked and fq
76452+   object is in the "in-use" state. For read-only traversal of the fq->prepped list
76453+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
76454+ only have atom spin-locked. */
76455+struct flush_queue {
76456+ /* linkage element is the first in this structure to make debugging
76457+ easier. See field in atom struct for description of list. */
76458+ struct list_head alink;
76459+ /* A spinlock to protect changes of fq state and fq->atom pointer */
76460+ spinlock_t guard;
76461+ /* flush_queue state: [in_use | ready] */
76462+ flush_queue_state_t state;
76463+ /* A list which contains queued nodes, queued nodes are removed from any
76464+ * atom's list and put on this ->prepped one. */
76465+ struct list_head prepped;
76466+ /* number of submitted i/o requests */
76467+ atomic_t nr_submitted;
76468+ /* number of i/o errors */
76469+ atomic_t nr_errors;
76470+ /* An atom this flush queue is attached to */
76471+ txn_atom *atom;
76472+ /* A wait queue head to wait on i/o completion */
76473+ wait_queue_head_t wait;
76474+#if REISER4_DEBUG
76475+ /* A thread which took this fq in exclusive use, NULL if fq is free,
76476+ * used for debugging. */
76477+ struct task_struct *owner;
76478+#endif
76479+};
76480+
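+/* A hedged sketch of the flush queue life cycle implied by the
+ * declarations below (locals and error handling are hypothetical):
+ *
+ *	flush_queue_t *fq;
+ *	long nr_submitted = 0;
+ *
+ *	ret = reiser4_fq_by_atom(atom, &fq);	// obtain fq, mark IN_USE
+ *	queue_jnode(fq, node);			// node moves to fq->prepped
+ *	ret = reiser4_write_fq(fq, &nr_submitted, 0);	// submit the i/o
+ *	reiser4_fq_put(fq);			// drop exclusive use
+ */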
76481+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
76482+extern void reiser4_fq_put_nolock(flush_queue_t *);
76483+extern void reiser4_fq_put(flush_queue_t *);
76484+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
76485+extern void queue_jnode(flush_queue_t *, jnode *);
76486+
76487+extern int reiser4_write_fq(flush_queue_t *, long *, int);
76488+extern int current_atom_finish_all_fq(void);
76489+extern void init_atom_fq_parts(txn_atom *);
76490+
76491+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76492+
76493+extern void znode_make_dirty(znode * node);
76494+extern void jnode_make_dirty_locked(jnode * node);
76495+
76496+extern int reiser4_sync_atom(txn_atom * atom);
76497+
76498+#if REISER4_DEBUG
76499+extern int atom_fq_parts_are_clean(txn_atom *);
76500+#endif
76501+
76502+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76503+extern flush_queue_t *get_fq_for_current_atom(void);
76504+
76505+void protected_jnodes_init(protected_jnodes * list);
76506+void protected_jnodes_done(protected_jnodes * list);
76507+void reiser4_invalidate_list(struct list_head * head);
76508+
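/* Editor's sketch (not part of the original patch): the typical life cycle
 * of a flush queue, pieced together from the declarations above. Error
 * paths and the surrounding locking context are elided, and the zero
 * passed as the last argument of reiser4_write_fq() is assumed to mean
 * "no special writeout flags". */
#if 0
static inline int example_flush_atom_fq(txn_atom *atom)
{
	flush_queue_t *fq;
	long nr_submitted = 0;
	int ret;

	ret = reiser4_fq_by_atom(atom, &fq);	/* obtain fq in "in-use" state */
	if (ret)
		return ret;
	/* ... queue_jnode(fq, node) for each node prepared for writeout ... */
	ret = reiser4_write_fq(fq, &nr_submitted, 0);	/* submit queued i/o */
	reiser4_fq_put(fq);				/* drop exclusive use */
	return ret;
}
#endif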
76509+# endif /* __REISER4_TXNMGR_H__ */
76510+
76511+/* Make Linus happy.
76512+ Local variables:
76513+ c-indentation-style: "K&R"
76514+ mode-name: "LC"
76515+ c-basic-offset: 8
76516+ tab-width: 8
76517+ fill-column: 120
76518+ End:
76519+*/
76520diff -urN linux-2.6.20.orig/fs/reiser4/type_safe_hash.h linux-2.6.20/fs/reiser4/type_safe_hash.h
76521--- linux-2.6.20.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
76522+++ linux-2.6.20/fs/reiser4/type_safe_hash.h 2007-05-06 14:50:43.899038216 +0400
76523@@ -0,0 +1,320 @@
76524+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76525+ * reiser4/README */
76526+
76527+/* A hash table class that uses hash chains (singly-linked) and is
76528+ parametrized to provide type safety. */
76529+
76530+#ifndef __REISER4_TYPE_SAFE_HASH_H__
76531+#define __REISER4_TYPE_SAFE_HASH_H__
76532+
76533+#include "debug.h"
76534+
76535+#include <asm/errno.h>
76536+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76537+ based on the object type. You need to declare the item type before
76538+ this definition and define it after this definition. */
76539+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
76540+ \
76541+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
76542+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
76543+ \
76544+struct PREFIX##_hash_table_ \
76545+{ \
76546+ ITEM_TYPE **_table; \
76547+ __u32 _buckets; \
76548+}; \
76549+ \
76550+struct PREFIX##_hash_link_ \
76551+{ \
76552+ ITEM_TYPE *_next; \
76553+}
76554+
76555+/* Step 2: Define the object type of the hash: give it a field of type
76556+ PREFIX_hash_link. */
76557+
76558+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76559+ the type and field name used in step 2. The arguments are:
76560+
76561+ ITEM_TYPE The item type being hashed
76562+ KEY_TYPE The type of key being hashed
76563+ KEY_NAME The name of the key field within the item
76564+ LINK_NAME The name of the link field within the item, which you must declare with type PREFIX_hash_link
76565+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
76566+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
76567+
76568+ It implements these functions:
76569+
76570+ prefix_hash_init Initialize the table given its size.
76571+ prefix_hash_insert Insert an item
76572+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
76573+ prefix_hash_find Find an item by key
76574+ prefix_hash_find_index Find an item w/ precomputed hash_index
76575+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
76576+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
76577+
76578+ If you'd like something to be done differently, feel free to ask me
76579+ for modifications. Additional features that could be added but
76580+ have not been:
76581+
76582+ prefix_hash_remove_key Find and remove an item by key
76583+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
76584+
76585+ The hash function receives both the hash table and the key as
76586+ arguments, so it can obtain the number of buckets from
76587+ table->_buckets.
76588+
76589+ This hash table uses singly-linked hash chains. This means
76590+ insertion is fast but deletion requires searching the chain.
76591+
76592+ There is also the doubly-linked hash chain approach, under which
76593+ deletion requires no search but the code is longer and it takes two
76594+ pointers per item.
76595+
76596+ The circularly-linked approach has the shortest code but requires
76597+ two pointers per bucket, doubling the size of the bucket array (in
76598+ addition to two pointers per item).
76599+*/
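/* Editor's sketch (not part of the original patch): steps 1-3 above for a
   hypothetical "foo" item hashed by a __u32 key; foo_hash()/foo_eq() are
   illustrative helpers only. */
#if 0
typedef struct foo foo_t;

TYPE_SAFE_HASH_DECLARE(foo, foo_t);			/* step 1 */

struct foo {						/* step 2 */
	__u32 key;					/* KEY_NAME */
	foo_hash_link link;				/* LINK_NAME */
};

#define foo_hash(table, k) (*(k) % (table)->_buckets)
#define foo_eq(k1, k2) (*(k1) == *(k2))

TYPE_SAFE_HASH_DEFINE(foo, foo_t, __u32, key, link,	/* step 3 */
		      foo_hash, foo_eq);

/* usage: foo_hash_init(&t, 128); foo_hash_insert(&t, item);
   item = foo_hash_find(&t, &key); foo_hash_remove(&t, item);
   foo_hash_done(&t); */
#endif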
76600+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
76601+ \
76602+static __inline__ void \
76603+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
76604+ __u32 hash UNUSED_ARG) \
76605+{ \
76606+ assert("nikita-2780", hash < table->_buckets); \
76607+} \
76608+ \
76609+static __inline__ int \
76610+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
76611+ __u32 buckets) \
76612+{ \
76613+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
76614+ hash->_buckets = buckets; \
76615+ if (hash->_table == NULL) \
76616+ { \
76617+ return RETERR(-ENOMEM); \
76618+ } \
76619+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
76620+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
76621+ return 0; \
76622+} \
76623+ \
76624+static __inline__ void \
76625+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
76626+{ \
76627+ if (REISER4_DEBUG && hash->_table != NULL) { \
76628+ __u32 i; \
76629+ for (i = 0 ; i < hash->_buckets ; ++ i) \
76630+ assert("nikita-2905", hash->_table[i] == NULL); \
76631+ } \
76632+ if (hash->_table != NULL) \
76633+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
76634+ hash->_table = NULL; \
76635+} \
76636+ \
76637+static __inline__ void \
76638+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
76639+{ \
76640+ prefetch(item->LINK_NAME._next); \
76641+} \
76642+ \
76643+static __inline__ void \
76644+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
76645+ __u32 index) \
76646+{ \
76647+ prefetch(hash->_table[index]); \
76648+} \
76649+ \
76650+static __inline__ ITEM_TYPE* \
76651+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
76652+ __u32 hash_index, \
76653+ KEY_TYPE const *find_key) \
76654+{ \
76655+ ITEM_TYPE *item; \
76656+ \
76657+ PREFIX##_check_hash(hash, hash_index); \
76658+ \
76659+ for (item = hash->_table[hash_index]; \
76660+ item != NULL; \
76661+ item = item->LINK_NAME._next) \
76662+ { \
76663+ prefetch(item->LINK_NAME._next); \
76664+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
76665+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
76666+ { \
76667+ return item; \
76668+ } \
76669+ } \
76670+ \
76671+ return NULL; \
76672+} \
76673+ \
76674+static __inline__ ITEM_TYPE* \
76675+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
76676+ __u32 hash_index, \
76677+ KEY_TYPE const *find_key) \
76678+{ \
76679+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
76680+ \
76681+ PREFIX##_check_hash(hash, hash_index); \
76682+ \
76683+ while (*item != NULL) { \
76684+ prefetch(&(*item)->LINK_NAME._next); \
76685+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
76686+ ITEM_TYPE *found; \
76687+ \
76688+ found = *item; \
76689+ *item = found->LINK_NAME._next; \
76690+ found->LINK_NAME._next = hash->_table[hash_index]; \
76691+ hash->_table[hash_index] = found; \
76692+ return found; \
76693+ } \
76694+ item = &(*item)->LINK_NAME._next; \
76695+ } \
76696+ return NULL; \
76697+} \
76698+ \
76699+static __inline__ int \
76700+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
76701+ __u32 hash_index, \
76702+ ITEM_TYPE *del_item) \
76703+{ \
76704+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
76705+ \
76706+ PREFIX##_check_hash(hash, hash_index); \
76707+ \
76708+ while (*hash_item_p != NULL) { \
76709+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
76710+ if (*hash_item_p == del_item) { \
76711+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
76712+ return 1; \
76713+ } \
76714+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
76715+ } \
76716+ return 0; \
76717+} \
76718+ \
76719+static __inline__ void \
76720+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
76721+ __u32 hash_index, \
76722+ ITEM_TYPE *ins_item) \
76723+{ \
76724+ PREFIX##_check_hash(hash, hash_index); \
76725+ \
76726+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76727+ hash->_table[hash_index] = ins_item; \
76728+} \
76729+ \
76730+static __inline__ void \
76731+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
76732+ __u32 hash_index, \
76733+ ITEM_TYPE *ins_item) \
76734+{ \
76735+ PREFIX##_check_hash(hash, hash_index); \
76736+ \
76737+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76738+ smp_wmb(); \
76739+ hash->_table[hash_index] = ins_item; \
76740+} \
76741+ \
76742+static __inline__ ITEM_TYPE* \
76743+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
76744+ KEY_TYPE const *find_key) \
76745+{ \
76746+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
76747+} \
76748+ \
76749+static __inline__ ITEM_TYPE* \
76750+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
76751+ KEY_TYPE const *find_key) \
76752+{ \
76753+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
76754+} \
76755+ \
76756+static __inline__ int \
76757+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
76758+ ITEM_TYPE *del_item) \
76759+{ \
76760+ return PREFIX##_hash_remove_index (hash, \
76761+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
76762+} \
76763+ \
76764+static __inline__ int \
76765+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
76766+ ITEM_TYPE *del_item) \
76767+{ \
76768+ return PREFIX##_hash_remove (hash, del_item); \
76769+} \
76770+ \
76771+static __inline__ void \
76772+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
76773+ ITEM_TYPE *ins_item) \
76774+{ \
76775+ return PREFIX##_hash_insert_index (hash, \
76776+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
76777+} \
76778+ \
76779+static __inline__ void \
76780+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
76781+ ITEM_TYPE *ins_item) \
76782+{ \
76783+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
76784+ ins_item); \
76785+} \
76786+ \
76787+static __inline__ ITEM_TYPE * \
76788+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
76789+{ \
76790+ ITEM_TYPE *first; \
76791+ \
76792+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
76793+ first = hash->_table[ind]; \
76794+ if (first != NULL) \
76795+ break; \
76796+ } \
76797+ return first; \
76798+} \
76799+ \
76800+static __inline__ ITEM_TYPE * \
76801+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
76802+ ITEM_TYPE *item) \
76803+{ \
76804+ ITEM_TYPE *next; \
76805+ \
76806+ if (item == NULL) \
76807+ return NULL; \
76808+ next = item->LINK_NAME._next; \
76809+ if (next == NULL) \
76810+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
76811+ return next; \
76812+} \
76813+ \
76814+typedef struct {} PREFIX##_hash_dummy
76815+
76816+#define for_all_ht_buckets(table, head) \
76817+for ((head) = &(table) -> _table[ 0 ] ; \
76818+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
76819+
76820+#define for_all_in_bucket(bucket, item, next, field) \
76821+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
76822+ (item) != NULL ; \
76823+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
76824+
76825+#define for_all_in_htable(table, prefix, item, next) \
76826+for ((item) = prefix ## _hash_first ((table), 0), \
76827+ (next) = prefix ## _hash_next ((table), (item)) ; \
76828+ (item) != NULL ; \
76829+ (item) = (next), \
76830+ (next) = prefix ## _hash_next ((table), (item)))
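/* Editor's example (reusing the hypothetical "foo" table sketched earlier):
 *
 *	for_all_in_htable(&t, foo, item, next)
 *		process_foo(item);
 *
 * process_foo() is illustrative. Since @next is fetched before the loop body
 * runs, removing the current item from the table inside the body is safe. */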
76831+
76832+/* __REISER4_TYPE_SAFE_HASH_H__ */
76833+#endif
76834+
76835+/* Make Linus happy.
76836+ Local variables:
76837+ c-indentation-style: "K&R"
76838+ mode-name: "LC"
76839+ c-basic-offset: 8
76840+ tab-width: 8
76841+ fill-column: 120
76842+ End:
76843+*/
76844diff -urN linux-2.6.20.orig/fs/reiser4/vfs_ops.c linux-2.6.20/fs/reiser4/vfs_ops.c
76845--- linux-2.6.20.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
76846+++ linux-2.6.20/fs/reiser4/vfs_ops.c 2007-05-06 14:50:43.899038216 +0400
76847@@ -0,0 +1,259 @@
76848+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76849+ * reiser4/README */
76850+
76851+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
76852+ here. */
76853+
76854+#include "forward.h"
76855+#include "debug.h"
76856+#include "dformat.h"
76857+#include "coord.h"
76858+#include "plugin/item/item.h"
76859+#include "plugin/file/file.h"
76860+#include "plugin/security/perm.h"
76861+#include "plugin/disk_format/disk_format.h"
76862+#include "plugin/plugin.h"
76863+#include "plugin/plugin_set.h"
76864+#include "plugin/object.h"
76865+#include "txnmgr.h"
76866+#include "jnode.h"
76867+#include "znode.h"
76868+#include "block_alloc.h"
76869+#include "tree.h"
76870+#include "vfs_ops.h"
76871+#include "inode.h"
76872+#include "page_cache.h"
76873+#include "ktxnmgrd.h"
76874+#include "super.h"
76875+#include "reiser4.h"
76876+#include "entd.h"
76877+#include "status_flags.h"
76878+#include "flush.h"
76879+#include "dscale.h"
76880+
76881+#include <linux/profile.h>
76882+#include <linux/types.h>
76883+#include <linux/mount.h>
76884+#include <linux/vfs.h>
76885+#include <linux/mm.h>
76886+#include <linux/buffer_head.h>
76887+#include <linux/dcache.h>
76888+#include <linux/list.h>
76889+#include <linux/pagemap.h>
76890+#include <linux/slab.h>
76891+#include <linux/seq_file.h>
76892+#include <linux/init.h>
76893+#include <linux/module.h>
76894+#include <linux/writeback.h>
76895+#include <linux/blkdev.h>
76896+#include <linux/quotaops.h>
76897+#include <linux/security.h>
76898+#include <linux/reboot.h>
76899+#include <linux/rcupdate.h>
76900+
76901+/* update inode stat-data by calling plugin */
76902+int reiser4_update_sd(struct inode *object)
76903+{
76904+ file_plugin *fplug;
76905+
76906+ assert("nikita-2338", object != NULL);
76907+ /* check for read-only file system. */
76908+ if (IS_RDONLY(object))
76909+ return 0;
76910+
76911+ fplug = inode_file_plugin(object);
76912+ assert("nikita-2339", fplug != NULL);
76913+ return fplug->write_sd_by_inode(object);
76914+}
76915+
76916+/* helper function: increase inode nlink count and call plugin method to save
76917+ updated stat-data.
76918+
76919+ Used by link/create and during creation of dot and dotdot in mkdir
76920+*/
76921+int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
76922+ struct inode *parent /* parent where new entry will be */
76923+ ,
76924+ int write_sd_p /* true if stat-data has to be
76925+ * updated */ )
76926+{
76927+ file_plugin *fplug;
76928+ int result;
76929+
76930+ assert("nikita-1351", object != NULL);
76931+
76932+ fplug = inode_file_plugin(object);
76933+ assert("nikita-1445", fplug != NULL);
76934+
76935+ /* ask plugin whether it can add yet another link to this
76936+ object */
76937+ if (!fplug->can_add_link(object))
76938+ return RETERR(-EMLINK);
76939+
76940+ assert("nikita-2211", fplug->add_link != NULL);
76941+ /* call plugin to do actual addition of link */
76942+ result = fplug->add_link(object, parent);
76943+
76944+ /* optionally update stat data */
76945+ if (result == 0 && write_sd_p)
76946+ result = fplug->write_sd_by_inode(object);
76947+ return result;
76948+}
76949+
76950+/* helper function: decrease inode nlink count and call plugin method to save
76951+ updated stat-data.
76952+
76953+ Used by unlink/create
76954+*/
76955+int reiser4_del_nlink(struct inode *object /* object from which link is
76956+ * removed */ ,
76957+ struct inode *parent /* parent where entry was */ ,
76958+ int write_sd_p /* true if stat-data has to be
76959+ * updated */ )
76960+{
76961+ file_plugin *fplug;
76962+ int result;
76963+
76964+ assert("nikita-1349", object != NULL);
76965+
76966+ fplug = inode_file_plugin(object);
76967+ assert("nikita-1350", fplug != NULL);
76968+ assert("nikita-1446", object->i_nlink > 0);
76969+ assert("nikita-2210", fplug->rem_link != NULL);
76970+
76971+ /* call plugin to do actual deletion of link */
76972+ result = fplug->rem_link(object, parent);
76973+
76974+ /* optionally update stat data */
76975+ if (result == 0 && write_sd_p)
76976+ result = fplug->write_sd_by_inode(object);
76977+ return result;
76978+}
76979+
76980+/* Release reiser4 dentry. This is d_op->d_release() method. */
76981+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
76982+{
76983+ reiser4_free_dentry_fsdata(dentry);
76984+}
76985+
76986+/*
76987+ * Called by reiser4_sync_inodes(), during speculative write-back (through
76988+ * pdflush, or balance_dirty_pages()).
76989+ */
76990+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
76991+{
76992+ long written = 0;
76993+ int repeats = 0;
76994+ int result;
76995+ struct address_space *mapping;
76996+
76997+ /*
76998+ * Performs early flushing, trying to free some memory. If there is
76999+ * nothing to flush, commits some atoms.
77000+ */
77001+
77002+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
77003+ sys_fsync(). */
77004+ if (wbc->sync_mode != WB_SYNC_NONE) {
77005+ txnmgr_force_commit_all(sb, 0);
77006+ return;
77007+ }
77008+
77009+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
77010+ mapping = reiser4_get_super_fake(sb)->i_mapping;
77011+ do {
77012+ long nr_submitted = 0;
77013+ jnode *node = NULL;
77014+
77015+ /* do not put more requests to overload write queue */
77016+ if (wbc->nonblocking &&
77017+ bdi_write_congested(mapping->backing_dev_info)) {
77018+ blk_run_address_space(mapping);
77019+ wbc->encountered_congestion = 1;
77020+ break;
77021+ }
77022+ repeats++;
77023+ BUG_ON(wbc->nr_to_write <= 0);
77024+
77025+ if (get_current_context()->entd) {
77026+ entd_context *ent = get_entd_context(sb);
77027+
77028+ if (ent->cur_request->node)
77029+ /*
77030+ * this is the entd thread and it managed to
77031+ * capture the requested page itself - start
77032+ * the flush from that page
77033+ */
77034+ node = jref(ent->cur_request->node);
77035+ }
77036+
77037+ result = flush_some_atom(node, &nr_submitted, wbc,
77038+ JNODE_FLUSH_WRITE_BLOCKS);
77039+ if (result != 0)
77040+ warning("nikita-31001", "Flush failed: %i", result);
77041+ if (node)
77042+ jput(node);
77043+ if (!nr_submitted)
77044+ break;
77045+
77046+ wbc->nr_to_write -= nr_submitted;
77047+ written += nr_submitted;
77048+ } while (wbc->nr_to_write > 0);
77049+}
77050+
77051+void reiser4_throttle_write(struct inode *inode)
77052+{
77053+ reiser4_txn_restart_current();
77054+ balance_dirty_pages_ratelimited(inode->i_mapping);
77055+}
77056+
77057+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77058+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
77059+ * beginning of device */
77060+
77061+/*
77062+ * Reiser4 initialization/shutdown.
77063+ *
77064+ * Code below performs global reiser4 initialization that is done either as
77065+ * part of kernel initialization (when reiser4 is statically built-in), or
77066+ * during reiser4 module load (when compiled as module).
77067+ */
77068+
77069+void reiser4_handle_error(void)
77070+{
77071+ struct super_block *sb = reiser4_get_current_sb();
77072+
77073+ if (!sb)
77074+ return;
77075+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77076+ "Filesystem error occured");
77077+ switch (get_super_private(sb)->onerror) {
77078+ case 0:
77079+ reiser4_panic("foobar-42", "Filesystem error occurred\n");
77080+ case 1:
77081+ default:
77082+ if (sb->s_flags & MS_RDONLY)
77083+ return;
77084+ sb->s_flags |= MS_RDONLY;
77085+ break;
77086+ }
77087+}
77088+
77089+struct dentry_operations reiser4_dentry_operations = {
77090+ .d_revalidate = NULL,
77091+ .d_hash = NULL,
77092+ .d_compare = NULL,
77093+ .d_delete = NULL,
77094+ .d_release = reiser4_d_release,
77095+ .d_iput = NULL,
77096+};
77097+
77098+/* Make Linus happy.
77099+ Local variables:
77100+ c-indentation-style: "K&R"
77101+ mode-name: "LC"
77102+ c-basic-offset: 8
77103+ tab-width: 8
77104+ fill-column: 120
77105+ End:
77106+*/
77107diff -urN linux-2.6.20.orig/fs/reiser4/vfs_ops.h linux-2.6.20/fs/reiser4/vfs_ops.h
77108--- linux-2.6.20.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
77109+++ linux-2.6.20/fs/reiser4/vfs_ops.h 2007-05-06 14:50:43.899038216 +0400
77110@@ -0,0 +1,53 @@
77111+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77112+ * reiser4/README */
77113+
77114+/* vfs_ops.c's exported symbols */
77115+
77116+#if !defined( __FS_REISER4_VFS_OPS_H__ )
77117+#define __FS_REISER4_VFS_OPS_H__
77118+
77119+#include "forward.h"
77120+#include "coord.h"
77121+#include "seal.h"
77122+#include "plugin/file/file.h"
77123+#include "super.h"
77124+#include "readahead.h"
77125+
77126+#include <linux/types.h> /* for loff_t */
77127+#include <linux/fs.h> /* for struct address_space */
77128+#include <linux/dcache.h> /* for struct dentry */
77129+#include <linux/mm.h>
77130+#include <linux/backing-dev.h>
77131+
77132+/* address space operations */
77133+int reiser4_writepage(struct page *, struct writeback_control *);
77134+int reiser4_set_page_dirty(struct page *);
77135+void reiser4_invalidatepage(struct page *, unsigned long offset);
77136+int reiser4_releasepage(struct page *, gfp_t);
77137+
77138+extern int reiser4_update_sd(struct inode *);
77139+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77140+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77141+
77142+extern int reiser4_start_up_io(struct page *page);
77143+extern void reiser4_throttle_write(struct inode *);
77144+extern int jnode_is_releasable(jnode *);
77145+
77146+#define CAPTURE_APAGE_BURST (1024l)
77147+void reiser4_writeout(struct super_block *, struct writeback_control *);
77148+
77149+extern void reiser4_handle_error(void);
77150+
77151+/* __FS_REISER4_VFS_OPS_H__ */
77152+#endif
77153+
77154+/* Make Linus happy.
77155+ Local variables:
77156+ c-indentation-style: "K&R"
77157+ mode-name: "LC"
77158+ c-basic-offset: 8
77159+ tab-width: 8
77160+ fill-column: 120
77161+ scroll-step: 1
77162+ End:
77163+*/
77164diff -urN linux-2.6.20.orig/fs/reiser4/wander.c linux-2.6.20/fs/reiser4/wander.c
77165--- linux-2.6.20.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
77166+++ linux-2.6.20/fs/reiser4/wander.c 2007-05-06 14:50:43.903039466 +0400
77167@@ -0,0 +1,1797 @@
77168+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77169+ * reiser4/README */
77170+
77171+/* Reiser4 Wandering Log */
77172+
77173+/* You should read http://www.namesys.com/txn-doc.html
77174+
77175+ That describes how filesystem operations are performed as atomic
77176+ transactions, and how we try to arrange it so that we can write most of the
77177+ data only once while performing the operation atomically.
77178+
77179+ For the purposes of this code, it is enough for it to understand that it
77180+ has been told a given block should be written either once, or twice (if
77181+ twice then once to the wandered location and once to the real location).
77182+
77183+ This code guarantees that those blocks that are defined to be part of an
77184+ atom either all take effect or none of them take effect.
77185+
77186+ Relocate set nodes are submitted to write by the jnode_flush() routine, and
77187+ the overwrite set is submitted by reiser4_write_log(). This is because with
77188+ the overwrite set we seek to optimize writes, and with the relocate set we
77189+ seek to cause disk order to correlate with the parent first pre-order.
77190+
77191+ reiser4_write_log() allocates and writes wandered blocks and maintains
77192+ additional on-disk structures of the atom as wander records (each wander
77193+ record occupies one block) for storing the "wandered map" (a table which
77194+ contains a relation between wandered and real block numbers) and other
77195+ information which might be needed at transaction recovery time.
77196+
77197+ The wander records are unidirectionally linked into a circle: each wander
77198+ record contains a block number of the next wander record, the last wander
77199+ record points to the first one.
77200+
77201+ One wander record (named "tx head" in this file) has a format which is
77202+ different from the other wander records. The "tx head" has a reference to the
77203+ "tx head" block of the previously committed atom. Also, "tx head" contains
77204+ fs information (the free blocks counter and the oid allocator state) which
77205+ is logged in a special way.
77206+
77207+ There are two journal control blocks, named journal header and journal
77208+ footer which have fixed on-disk locations. The journal header has a
77209+ reference to the "tx head" block of the last committed atom. The journal
77210+ footer points to the "tx head" of the last flushed atom. The atom is
77211+ "played" when all blocks from its overwrite set are written to disk the
77212+ second time (i.e. written to their real locations).
77213+
77214+ NOTE: People who know reiserfs internals and its journal structure might be
77215+ confused by the terms "journal footer" and "journal header". The table below
77216+ maps terms of similar semantics between reiserfs (reiser3) and reiser4:
77217+
77218+ REISER3 TERM | REISER4 TERM | DESCRIPTION
77219+ --------------------+-----------------------+----------------------------
77220+ commit record | journal header | atomic write of this record
77221+ | | ends transaction commit
77222+ --------------------+-----------------------+----------------------------
77223+ journal header | journal footer | atomic write of this record
77224+ | | ends post-commit writes.
77225+ | | After this record is
77226+ | | successfully written, the
77227+ | | journal blocks (in reiser3) or
77228+ | | wandered blocks/records
77229+ | | (in reiser4) are free for re-use.
77230+ --------------------+-----------------------+----------------------------
77231+
77232+ The atom commit process is the following:
77233+
77234+ 1. The overwrite set is taken from atom's clean list, and its size is
77235+ counted.
77236+
77237+ 2. The number of necessary wander records (including tx head) is calculated,
77238+ and the wander record blocks are allocated.
77239+
77240+ 3. Allocate wandered blocks and populate wander records by wandered map.
77241+
77242+ 4. Submit write requests for wander records and wandered blocks.
77243+
77244+ 5. Wait until submitted write requests complete.
77245+
77246+ 6. Update journal header: change the pointer to the block number of the just
77247+ written tx head, submit an i/o for the modified journal header block and wait
77248+ for i/o completion.
77249+
77250+ NOTE: The special logging for bitmap blocks and some reiser4 super block
77251+ fields makes processes of atom commit, flush and recovering a bit more
77252+ complex (see comments in the source code for details).
77253+
77254+ The atom playing process is the following:
77255+
77256+ 1. Write atom's overwrite set in-place.
77257+
77258+ 2. Wait on i/o.
77259+
77260+ 3. Update journal footer: change the pointer to block number of tx head
77261+ block of the atom we currently flushing, submit an i/o, wait on i/o
77262+ completion.
77263+
77264+ 4. Free disk space which was used for wandered blocks and wander records.
77265+
77266+ After the freeing of wandered blocks and wander records, the journal footer
77267+ points to on-disk structures which might be overwritten soon. Neither the
77268+ log writer nor the journal recovery procedure uses that pointer for accessing
77269+ the data. When the journal recovery procedure looks for the oldest
77270+ transaction, it compares the journal footer pointer value with the "prev_tx"
77271+ pointer value in the tx head; if the values are equal, the oldest not yet
77272+ flushed transaction has been found.
77273+
77274+ NOTE on disk space leakage: the information about which blocks and how many
77275+ blocks are allocated for wandered blocks and wander records is not written to
77276+ the disk, because of the special logging for bitmaps and some super block
77277+ counters. After a system crash reiser4 does not remember those allocations,
77278+ thus there is no disk space leakage of this kind.
77279+*/
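/* Editor's sketch (not part of the original patch): the commit steps above,
 * condensed into the order in which the helpers defined later in this file
 * appear to be used. This is an illustration of the flow, not the actual
 * log-writer body; tear-down (dealloc_tx_list(), dealloc_wmap()) happens
 * later, when the atom is played. */
#if 0
	init_commit_handle(&ch, atom);
	ret = get_overwrite_set(&ch);		/* step 1: collect OVRWR set */
	get_tx_size(&ch);			/* step 2: count wander records */
	ret = alloc_wandered_blocks(&ch, fq);	/* steps 3-4: wandered map, i/o */
	ret = alloc_tx(&ch, fq);		/* steps 2-3: format records and
						 * fill them with the map; they
						 * are then written out too */
	ret = current_atom_finish_all_fq();	/* step 5: wait on i/o */
	ret = update_journal_header(&ch, use_barrier);	/* step 6 */
#endif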
77280+
77281+/* Special logging of reiser4 super block fields. */
77282+
77283+/* There are some reiser4 super block fields (free block count and OID allocator
77284+ state (number of files and next free OID) which are logged separately from
77285+ super block to avoid unnecessary atom fusion.
77286+
77287+ So, the reiser4 super block need not be captured by a transaction that
77288+ allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
77289+ the reiser4 on-disk super block is not touched when such a transaction is
77290+ committed and flushed. Those "counters logged specially" are logged in "tx
77291+ head" blocks and in the journal footer block.
77292+
77293+ A step-by-step description of special logging:
77294+
77295+ 0. The per-atom information about deleted or created files and allocated or
77296+ freed blocks is collected during the transaction. The atom's
77297+ ->nr_objects_created and ->nr_objects_deleted are for object
77298+ deletion/creation tracking, the numbers of allocated and freed blocks are
77299+ calculated using atom's delete set and atom's capture list -- all new and
77300+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
77301+ bit set.
77302+
77303+ 1. The "logged specially" reiser4 super block fields have their "committed"
77304+ versions in the reiser4 in-memory super block. They get modified only at
77305+ atom commit time. The atom's commit thread has an exclusive access to those
77306+ "committed" fields because the log writer implementation supports only one
77307+ atom commit at a time (there is a per-fs "commit" mutex). At
77308+ that time "committed" counters are modified using per-atom information
77309+ collected during the transaction. These counters are stored on disk as a
77310+ part of tx head block when atom is committed.
77311+
77312+ 2. When the atom is flushed the value of the free block counter and the OID
77313+ allocator state get written to the journal footer block. A special journal
77314+ procedure (journal_recover_sb_data()) takes those values from the journal
77315+ footer and updates the reiser4 in-memory super block.
77316+
77317+ NOTE: That means free block count and OID allocator state are logged
77318+ separately from the reiser4 super block regardless of the fact that the
77319+ reiser4 super block has fields to store both the free block counter and the
77320+ OID allocator.
77321+
77322+ Writing the whole super block at commit time requires knowing true values of
77323+ all its fields without changes made by not yet committed transactions. It is
77324+ possible by having their "committed" version of the super block like the
77325+ reiser4 bitmap blocks have "committed" and "working" versions. However,
77326+ another scheme was implemented which stores special logged values in the
77327+ unused free space inside the transaction head block. In my opinion it has
77328+ the advantage of not writing the whole super block when only part of it was
77329+ modified. */
77330+
77331+#include "debug.h"
77332+#include "dformat.h"
77333+#include "txnmgr.h"
77334+#include "jnode.h"
77335+#include "znode.h"
77336+#include "block_alloc.h"
77337+#include "page_cache.h"
77338+#include "wander.h"
77339+#include "reiser4.h"
77340+#include "super.h"
77341+#include "vfs_ops.h"
77342+#include "writeout.h"
77343+#include "inode.h"
77344+#include "entd.h"
77345+
77346+#include <linux/types.h>
77347+#include <linux/fs.h> /* for struct super_block */
77348+#include <linux/mm.h> /* for struct page */
77349+#include <linux/pagemap.h>
77350+#include <linux/bio.h> /* for struct bio */
77351+#include <linux/blkdev.h>
77352+
77353+static int write_jnodes_to_disk_extent(
77354+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77355+
77356+/* The commit_handle is a container for objects needed at atom commit time */
77357+struct commit_handle {
77358+ /* A pointer to atom's list of OVRWR nodes */
77359+ struct list_head *overwrite_set;
77360+ /* atom's overwrite set size */
77361+ int overwrite_set_size;
77362+ /* jnodes for wander record blocks */
77363+ struct list_head tx_list;
77364+ /* number of wander records */
77365+ __u32 tx_size;
77366+ /* 'committed' sb counters are saved here until atom is completely
77367+ flushed */
77368+ __u64 free_blocks;
77369+ __u64 nr_files;
77370+ __u64 next_oid;
77371+ /* A pointer to the atom which is being committed */
77372+ txn_atom *atom;
77373+ /* A pointer to current super block */
77374+ struct super_block *super;
77375+ /* The counter of modified bitmaps */
77376+ reiser4_block_nr nr_bitmap;
77377+};
77378+
77379+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77380+{
77381+ memset(ch, 0, sizeof(struct commit_handle));
77382+ INIT_LIST_HEAD(&ch->tx_list);
77383+
77384+ ch->atom = atom;
77385+ ch->super = reiser4_get_current_sb();
77386+}
77387+
77388+static void done_commit_handle(struct commit_handle *ch)
77389+{
77390+ assert("zam-690", list_empty(&ch->tx_list));
77391+}
77392+
77393+static inline int reiser4_use_write_barrier(struct super_block * s)
77394+{
77395+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77396+}
77397+
77398+static void disable_write_barrier(struct super_block * s)
77399+{
77400+ notice("zam-1055", "%s does not support write barriers,"
77401+ " using synchronous write instead.", s->s_id);
77402+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77403+}
77404+
77405+/* fill journal header block data */
77406+static void format_journal_header(struct commit_handle *ch)
77407+{
77408+ struct reiser4_super_info_data *sbinfo;
77409+ struct journal_header *header;
77410+ jnode *txhead;
77411+
77412+ sbinfo = get_super_private(ch->super);
77413+ assert("zam-479", sbinfo != NULL);
77414+ assert("zam-480", sbinfo->journal_header != NULL);
77415+
77416+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77417+
77418+ jload(sbinfo->journal_header);
77419+
77420+ header = (struct journal_header *)jdata(sbinfo->journal_header);
77421+ assert("zam-484", header != NULL);
77422+
77423+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77424+ &header->last_committed_tx);
77425+
77426+ jrelse(sbinfo->journal_header);
77427+}
77428+
77429+/* fill journal footer block data */
77430+static void format_journal_footer(struct commit_handle *ch)
77431+{
77432+ struct reiser4_super_info_data *sbinfo;
77433+ struct journal_footer *footer;
77434+ jnode *tx_head;
77435+
77436+ sbinfo = get_super_private(ch->super);
77437+
77438+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77439+
77440+ assert("zam-493", sbinfo != NULL);
77441+ assert("zam-494", sbinfo->journal_header != NULL);
77442+
77443+ check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77444+
77445+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77446+ assert("zam-495", footer != NULL);
77447+
77448+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77449+ &footer->last_flushed_tx);
77450+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77451+
77452+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77453+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77454+
77455+ jrelse(sbinfo->journal_footer);
77456+}
77457+
77458+/* wander record capacity depends on current block size */
77459+static int wander_record_capacity(const struct super_block *super)
77460+{
77461+ return (super->s_blocksize -
77462+ sizeof(struct wander_record_header)) /
77463+ sizeof(struct wander_entry);
77464+}
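/* Editor's note: the sizes below are illustrative assumptions, not taken
 * from the on-disk format definitions. With a 4096-byte block, a 32-byte
 * wander_record_header and a 16-byte wander_entry (two 64-bit block
 * numbers), the capacity would be (4096 - 32) / 16 = 254 entries. */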
77465+
77466+/* Fill the first wander record (tx head) in accordance with the supplied data */
77467+static void format_tx_head(struct commit_handle *ch)
77468+{
77469+ jnode *tx_head;
77470+ jnode *next;
77471+ struct tx_header *header;
77472+
77473+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77474+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77475+
77476+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77477+ if (&ch->tx_list == &next->capture_link)
77478+ next = tx_head;
77479+
77480+ header = (struct tx_header *)jdata(tx_head);
77481+
77482+ assert("zam-460", header != NULL);
77483+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77484+
77485+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77486+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77487+
77488+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77489+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77490+ &header->prev_tx);
77491+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77492+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77493+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77494+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77495+}
77496+
77497+/* prepare ordinary wander record block (fill all service fields) */
77498+static void
77499+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77500+{
77501+ struct wander_record_header *LRH;
77502+ jnode *next;
77503+
77504+ assert("zam-464", node != NULL);
77505+
77506+ LRH = (struct wander_record_header *)jdata(node);
77507+ next = list_entry(node->capture_link.next, jnode, capture_link);
77508+
77509+ if (&ch->tx_list == &next->capture_link)
77510+ next = list_entry(ch->tx_list.next, jnode, capture_link);
77511+
77512+ assert("zam-465", LRH != NULL);
77513+ assert("zam-463",
77514+ ch->super->s_blocksize > sizeof(struct wander_record_header));
77515+
77516+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77517+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77518+
77519+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77520+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
77521+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77522+}
77523+
77524+/* add one wandered map entry to formatted wander record */
77525+static void
77526+store_entry(jnode * node, int index, const reiser4_block_nr * a,
77527+ const reiser4_block_nr * b)
77528+{
77529+ char *data;
77530+ struct wander_entry *pairs;
77531+
77532+ data = jdata(node);
77533+ assert("zam-451", data != NULL);
77534+
77535+ pairs =
77536+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
77537+
77538+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77539+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77540+}
77541+
77542+/* currently, wander records contain only the wandered map, whose size depends
77543+ on the overwrite set size */
77544+static void get_tx_size(struct commit_handle *ch)
77545+{
77546+ assert("zam-440", ch->overwrite_set_size != 0);
77547+ assert("zam-695", ch->tx_size == 0);
77548+
77549+ /* count all ordinary wander records
77550+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77551+ for tx head block */
77552+ ch->tx_size =
77553+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77554+ 2;
77555+}
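/* Editor's example: with an overwrite set of 500 blocks and a record
 * capacity of 254 entries (see the illustrative arithmetic above),
 * tx_size = (500 - 1) / 254 + 2 = 3: one tx head plus two ordinary
 * wander records. */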
77556+
77557+/* A special structure used by store_wmap_actor() for saving its state
77558+ between calls */
77559+struct store_wmap_params {
77560+ jnode *cur; /* jnode of current wander record to fill */
77561+ int idx; /* free element index in wander record */
77562+ int capacity; /* capacity */
77563+
77564+#if REISER4_DEBUG
77565+ struct list_head *tx_list;
77566+#endif
77567+};
77568+
77569+/* an actor for use in blocknr_set_iterator routine which populates the list
77570+ of pre-formatted wander records by wandered map info */
77571+static int
77572+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77573+ const reiser4_block_nr * b, void *data)
77574+{
77575+ struct store_wmap_params *params = data;
77576+
77577+ if (params->idx >= params->capacity) {
77578+ /* a new wander record should be taken from the tx_list */
77579+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77580+ assert("zam-454",
77581+ params->tx_list != &params->cur->capture_link);
77582+
77583+ params->idx = 0;
77584+ }
77585+
77586+ store_entry(params->cur, params->idx, a, b);
77587+ params->idx++;
77588+
77589+ return 0;
77590+}
77591+
77592+/* This function is called after the relocate set is written to disk, the
77593+ overwrite set is written to wandered locations, and all wander records are
77594+ written as well. The updated journal header block contains a pointer (block
77595+ number) to the first wander record of the just-written transaction */
77596+static int update_journal_header(struct commit_handle *ch, int use_barrier)
77597+{
77598+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77599+ jnode *jh = sbinfo->journal_header;
77600+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77601+ int ret;
77602+
77603+ format_journal_header(ch);
77604+
77605+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77606+ use_barrier ? WRITEOUT_BARRIER : 0);
77607+ if (ret)
77608+ return ret;
77609+
77610+ // blk_run_address_space(sbinfo->fake->i_mapping);
77611+ /*blk_run_queues(); */
77612+
77613+ ret = jwait_io(jh, WRITE);
77614+
77615+ if (ret)
77616+ return ret;
77617+
77618+ sbinfo->last_committed_tx = *jnode_get_block(head);
77619+
77620+ return 0;
77621+}
77622+
77623+/* This function is called after write-back is finished. We update journal
77624+ footer block and free blocks which were occupied by wandered blocks and
77625+ transaction wander records */
77626+static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77627+{
77628+ reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77629+
77630+ jnode *jf = sbinfo->journal_footer;
77631+
77632+ int ret;
77633+
77634+ format_journal_footer(ch);
77635+
77636+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77637+ use_barrier ? WRITEOUT_BARRIER : 0);
77638+ if (ret)
77639+ return ret;
77640+
77641+ // blk_run_address_space(sbinfo->fake->i_mapping);
77642+ /*blk_run_queue(); */
77643+
77644+ ret = jwait_io(jf, WRITE);
77645+ if (ret)
77646+ return ret;
77647+
77648+ return 0;
77649+}
77650+
77651+/* free the block numbers of wander records of a transaction already written in place */
77652+static void dealloc_tx_list(struct commit_handle *ch)
77653+{
77654+ while (!list_empty(&ch->tx_list)) {
77655+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77656+ list_del(&cur->capture_link);
77657+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77658+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77659+ BA_FORMATTED);
77660+
77661+ unpin_jnode_data(cur);
77662+ reiser4_drop_io_head(cur);
77663+ }
77664+}
77665+
77666+/* An actor for use in the blocknr_set_iterator() routine which frees wandered
77667+ blocks from atom's overwrite set. */
77668+static int
77669+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
77670+ const reiser4_block_nr * a UNUSED_ARG,
77671+ const reiser4_block_nr * b, void *data UNUSED_ARG)
77672+{
77673+
77674+ assert("zam-499", b != NULL);
77675+ assert("zam-500", *b != 0);
77676+ assert("zam-501", !reiser4_blocknr_is_fake(b));
77677+
77678+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
77679+ return 0;
77680+}
77681+
77682+/* free wandered block locations of already written in place transaction */
77683+static void dealloc_wmap(struct commit_handle *ch)
77684+{
77685+ assert("zam-696", ch->atom != NULL);
77686+
77687+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
77688+ dealloc_wmap_actor, NULL, 1);
77689+}
77690+
77691+/* helper function for alloc_wandered_blocks(), which refills the set of block
77692+ numbers needed for wandered blocks */
77693+static int
77694+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
77695+{
77696+ reiser4_blocknr_hint hint;
77697+ int ret;
77698+
77699+ reiser4_block_nr wide_len = count;
77700+
77701+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
77702+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
77703+ reserved allocation area so as to get the best qualities of fixed
77704+ journals? */
77705+ reiser4_blocknr_hint_init(&hint);
77706+ hint.block_stage = BLOCK_GRABBED;
77707+
77708+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
77709+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
77710+ *len = (int)wide_len;
77711+
77712+ return ret;
77713+}
77714+
77715+/*
77716+ * roll back changes made before issuing BIO in the case of IO error.
77717+ */
77718+static void undo_bio(struct bio *bio)
77719+{
77720+ int i;
77721+
77722+ for (i = 0; i < bio->bi_vcnt; ++i) {
77723+ struct page *pg;
77724+ jnode *node;
77725+
77726+ pg = bio->bi_io_vec[i].bv_page;
77727+ ClearPageWriteback(pg);
77728+ node = jprivate(pg);
77729+ spin_lock_jnode(node);
77730+ JF_CLR(node, JNODE_WRITEBACK);
77731+ JF_SET(node, JNODE_DIRTY);
77732+ spin_unlock_jnode(node);
77733+ }
77734+ bio_put(bio);
77735+}
77736+
77737+/* put overwrite set back to atom's clean list */
77738+static void put_overwrite_set(struct commit_handle *ch)
77739+{
77740+ jnode *cur;
77741+
77742+ list_for_each_entry(cur, ch->overwrite_set, capture_link)
77743+ jrelse_tail(cur);
77744+}
77745+
77746+/* Count overwrite set size, grab disk space for wandered blocks allocation.
77747+ Since we have a separate list for the atom's overwrite set we just scan the
77748+ list, counting bitmap and other non-leaf nodes for whose wandered-block
77749+ allocation we have to grab space. */
77750+static int get_overwrite_set(struct commit_handle *ch)
77751+{
77752+ int ret;
77753+ jnode *cur;
77754+ __u64 nr_not_leaves = 0;
77755+#if REISER4_DEBUG
77756+ __u64 nr_formatted_leaves = 0;
77757+ __u64 nr_unformatted_leaves = 0;
77758+#endif
77759+
77760+ assert("zam-697", ch->overwrite_set_size == 0);
77761+
77762+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
77763+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77764+
77765+ while (ch->overwrite_set != &cur->capture_link) {
77766+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
77767+
77768+ /* Count bitmap nodes to get correct statistics on how many
77769+ * blocks were cleared by the transaction commit. */
77770+ if (jnode_get_type(cur) == JNODE_BITMAP)
77771+ ch->nr_bitmap++;
77772+
77773+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
77774+ || jnode_get_type(cur) == JNODE_BITMAP);
77775+
77776+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
77777+ /* we replace fake znode by another (real)
77778+ znode which is suggested by disk_layout
77779+ plugin */
77780+
77781+ /* FIXME: it looks like fake znode should be
77782+ replaced by jnode supplied by
77783+ disk_layout. */
77784+
77785+ struct super_block *s = reiser4_get_current_sb();
77786+ reiser4_super_info_data *sbinfo =
77787+ get_current_super_private();
77788+
77789+ if (sbinfo->df_plug->log_super) {
77790+ jnode *sj = sbinfo->df_plug->log_super(s);
77791+
77792+ assert("zam-593", sj != NULL);
77793+
77794+ if (IS_ERR(sj))
77795+ return PTR_ERR(sj);
77796+
77797+ spin_lock_jnode(sj);
77798+ JF_SET(sj, JNODE_OVRWR);
77799+ insert_into_atom_ovrwr_list(ch->atom, sj);
77800+ spin_unlock_jnode(sj);
77801+
77802+ /* jload it as the rest of overwrite set */
77803+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
77804+
77805+ ch->overwrite_set_size++;
77806+ }
77807+ spin_lock_jnode(cur);
77808+ reiser4_uncapture_block(cur);
77809+ jput(cur);
77810+
77811+ } else {
77812+ int ret;
77813+ ch->overwrite_set_size++;
77814+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
77815+ if (ret)
77816+ reiser4_panic("zam-783",
77817+ "cannot load e-flushed jnode back (ret = %d)\n",
77818+ ret);
77819+ }
77820+
77821+ /* Count non-leaf nodes here because we have to grab disk space
77822+ * for wandered blocks. They were not counted as "flush
77823+ * reserved". Counting should be done _after_ nodes are pinned
77824+ * into memory by jload(). */
77825+ if (!jnode_is_leaf(cur))
77826+ nr_not_leaves++;
77827+ else {
77828+#if REISER4_DEBUG
77829+ /* at this point @cur either has JNODE_FLUSH_RESERVED
77830+ * or is eflushed. Locking is not strong enough to
77831+ * write an assertion checking for this. */
77832+ if (jnode_is_znode(cur))
77833+ nr_formatted_leaves++;
77834+ else
77835+ nr_unformatted_leaves++;
77836+#endif
77837+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
77838+ }
77839+
77840+ cur = next;
77841+ }
77842+
77843+ /* Grab space for writing (wandered blocks) of non-leaf nodes found in
77844+ * the overwrite set. */
77845+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
77846+ if (ret)
77847+ return ret;
77848+
77849+ /* Disk space for allocation of wandered blocks of leaf nodes already
77850+ * reserved as "flush reserved", move it to grabbed space counter. */
77851+ spin_lock_atom(ch->atom);
77852+ assert("zam-940",
77853+ nr_formatted_leaves + nr_unformatted_leaves <=
77854+ ch->atom->flush_reserved);
77855+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
77856+ spin_unlock_atom(ch->atom);
77857+
77858+ return ch->overwrite_set_size;
77859+}
77860+
77861+/**
77862+ * write_jnodes_to_disk_extent - submit write request
77864+ * @first: first jnode of the list
77865+ * @nr: number of jnodes on the list
77866+ * @block_p:
77867+ * @fq:
77868+ * @flags: used to decide whether page is to get PG_reclaim flag
77869+ *
77870+ * Submits a write request for @nr jnodes beginning with @first; the other
77871+ * jnodes follow @first on the doubly-linked "capture" list. All jnodes
77872+ * will be written to the disk region of @nr blocks starting at the block
77873+ * number given by @block_p. If @fq is not NULL, waiting for i/o completion
77874+ * will be done more efficiently by using flush_queue_t objects.
77875+ * This function is the one which writes a list of jnodes in batch mode. It
77876+ * does all the low-level work such as bio construction and page state manipulation.
77877+ *
77878+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
77879+ * aggregated in this function instead of being left to the layers below
77880+ *
77881+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
77882+ * Why that layer needed? Why BIOs cannot be constructed here?
77883+ */
77884+static int write_jnodes_to_disk_extent(
77885+ jnode *first, int nr, const reiser4_block_nr *block_p,
77886+ flush_queue_t *fq, int flags)
77887+{
77888+ struct super_block *super = reiser4_get_current_sb();
77889+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
77890+ int max_blocks;
77891+ jnode *cur = first;
77892+ reiser4_block_nr block;
77893+
77894+ assert("zam-571", first != NULL);
77895+ assert("zam-572", block_p != NULL);
77896+ assert("zam-570", nr > 0);
77897+
77898+ block = *block_p;
77899+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
77900+
77901+ while (nr > 0) {
77902+ struct bio *bio;
77903+ int nr_blocks = min(nr, max_blocks);
77904+ int i;
77905+ int nr_used;
77906+
77907+ bio = bio_alloc(GFP_NOIO, nr_blocks);
77908+ if (!bio)
77909+ return RETERR(-ENOMEM);
77910+
77911+ bio->bi_bdev = super->s_bdev;
77912+ bio->bi_sector = block * (super->s_blocksize >> 9);
77913+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
77914+ struct page *pg;
77915+
77916+ pg = jnode_page(cur);
77917+ assert("zam-573", pg != NULL);
77918+
77919+ page_cache_get(pg);
77920+
77921+ lock_and_wait_page_writeback(pg);
77922+
77923+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
77924+ /*
77925+ * underlying device is saturated. Stop adding
77926+ * pages to the bio.
77927+ */
77928+ unlock_page(pg);
77929+ page_cache_release(pg);
77930+ break;
77931+ }
77932+
77933+ spin_lock_jnode(cur);
77934+ assert("nikita-3166",
77935+ pg->mapping == jnode_get_mapping(cur));
77936+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
77937+#if REISER4_DEBUG
77938+ spin_lock(&cur->load);
77939+ assert("nikita-3165", !jnode_is_releasable(cur));
77940+ spin_unlock(&cur->load);
77941+#endif
77942+ JF_SET(cur, JNODE_WRITEBACK);
77943+ JF_CLR(cur, JNODE_DIRTY);
77944+ ON_DEBUG(cur->written++);
77945+ spin_unlock_jnode(cur);
77946+
77947+ ClearPageError(pg);
77948+ set_page_writeback(pg);
77949+
77950+ if (get_current_context()->entd) {
77951+ /* this is the entd thread */
77952+ entd_context *ent = get_entd_context(super);
77953+ struct wbq *rq, *next;
77954+
77955+ spin_lock(&ent->guard);
77956+
77957+ if (pg == ent->cur_request->page) {
77958+ /*
77959+ * entd is called for this page. This
77960+ * request is not in the todo list
77961+ */
77962+ ent->cur_request->written = 1;
77963+ } else {
77964+ /*
77965+ * if we have written a page for which writepage
77966+ * was called - move the request to another list.
77967+ */
77968+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
77969+ assert("", rq->magic == WBQ_MAGIC);
77970+ if (pg == rq->page) {
77971+ /*
77972+ * remove request from
77973+ * entd's queue, but do
77974+ * not wake up a thread
77975+ * which put this
77976+ * request
77977+ */
77978+ list_del_init(&rq->link);
77979+ ent->nr_todo_reqs --;
77980+ list_add_tail(&rq->link, &ent->done_list);
77981+ ent->nr_done_reqs ++;
77982+ rq->written = 1;
77983+ break;
77984+ }
77985+ }
77986+ }
77987+ spin_unlock(&ent->guard);
77988+ }
77989+
77990+ clear_page_dirty_for_io(pg);
77991+
77992+ unlock_page(pg);
77993+
77994+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77995+ nr_used++;
77996+ }
77997+ if (nr_used > 0) {
77998+ assert("nikita-3453",
77999+ bio->bi_size == super->s_blocksize * nr_used);
78000+ assert("nikita-3454", bio->bi_vcnt == nr_used);
78001+
78002+ /* Check if we are allowed to write at all */
78003+ if (super->s_flags & MS_RDONLY)
78004+ undo_bio(bio);
78005+ else {
78006+ int not_supported;
78007+
78008+ add_fq_to_bio(fq, bio);
78009+ bio_get(bio);
78010+ reiser4_submit_bio(write_op, bio);
78011+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
78012+ bio_put(bio);
78013+ if (not_supported)
78014+ return -EOPNOTSUPP;
78015+ }
78016+
78017+ block += nr_used - 1;
78018+ update_blocknr_hint_default(super, &block);
78019+ block += 1;
78020+ } else {
78021+ bio_put(bio);
78022+ }
78023+ nr -= nr_used;
78024+ }
78025+
78026+ return 0;
78027+}
78028+
78029+/* This is a procedure which recovers a contiguous sequences of disk block
78030+ numbers in the given list of j-nodes and submits write requests on this
78031+ per-sequence basis */
78032+int
78033+write_jnode_list(struct list_head *head, flush_queue_t *fq,
78034+ long *nr_submitted, int flags)
78035+{
78036+ int ret;
78037+ jnode *beg = list_entry(head->next, jnode, capture_link);
78038+
78039+ while (head != &beg->capture_link) {
78040+ int nr = 1;
78041+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78042+
78043+ while (head != &cur->capture_link) {
78044+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78045+ break;
78046+ ++nr;
78047+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78048+ }
78049+
78050+ ret = write_jnodes_to_disk_extent(
78051+ beg, nr, jnode_get_block(beg), fq, flags);
78052+ if (ret)
78053+ return ret;
78054+
78055+ if (nr_submitted)
78056+ *nr_submitted += nr;
78057+
78058+ beg = cur;
78059+ }
78060+
78061+ return 0;
78062+}
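/* Editor's example: if the capture list carries jnodes with block numbers
 * 100, 101, 102 and 200, the loop above detects one extent of length 3
 * starting at block 100 and one of length 1 at block 200, and thus issues
 * exactly two write_jnodes_to_disk_extent() calls. */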
78063+
78064+/* add given wandered mapping to atom's wandered map */
78065+static int
78066+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78067+{
78068+ int ret;
78069+ blocknr_set_entry *new_bsep = NULL;
78070+ reiser4_block_nr block;
78071+
78072+ txn_atom *atom;
78073+
78074+ assert("zam-568", block_p != NULL);
78075+ block = *block_p;
78076+ assert("zam-569", len > 0);
78077+
78078+ while ((len--) > 0) {
78079+ do {
78080+ atom = get_current_atom_locked();
78081+ assert("zam-536",
78082+ !reiser4_blocknr_is_fake(jnode_get_block(cur)));
78083+ ret =
78084+ blocknr_set_add_pair(atom, &atom->wandered_map,
78085+ &new_bsep,
78086+ jnode_get_block(cur), &block);
78087+ } while (ret == -E_REPEAT);
78088+
78089+ if (ret) {
78090+ /* deallocate blocks which were not added to wandered
78091+ map */
78092+ reiser4_block_nr wide_len = len;
78093+
78094+ reiser4_dealloc_blocks(&block, &wide_len,
78095+ BLOCK_NOT_COUNTED,
78096+ BA_FORMATTED
78097+ /* formatted, without defer */ );
78098+
78099+ return ret;
78100+ }
78101+
78102+ spin_unlock_atom(atom);
78103+
78104+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78105+ ++block;
78106+ }
78107+
78108+ return 0;
78109+}
78110+
78111+/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
78112+ submit IO for allocated blocks. We assume that current atom is in a stage
78113+ when any atom fusion is impossible, so it is safe to leave the atom unlocked. */
78114+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78115+{
78116+ reiser4_block_nr block;
78117+
78118+ int rest;
78119+ int len;
78120+ int ret;
78121+
78122+ jnode *cur;
78123+
78124+ assert("zam-534", ch->overwrite_set_size > 0);
78125+
78126+ rest = ch->overwrite_set_size;
78127+
78128+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78129+ while (ch->overwrite_set != &cur->capture_link) {
78130+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78131+
78132+ ret = get_more_wandered_blocks(rest, &block, &len);
78133+ if (ret)
78134+ return ret;
78135+
78136+ rest -= len;
78137+
78138+ ret = add_region_to_wmap(cur, len, &block);
78139+ if (ret)
78140+ return ret;
78141+
78142+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78143+ if (ret)
78144+ return ret;
78145+
78146+ while ((len--) > 0) {
78147+ assert("zam-604",
78148+ ch->overwrite_set != &cur->capture_link);
78149+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78150+ }
78151+ }
78152+
78153+ return 0;
78154+}
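+
+/* Control-flow sketch of alloc_wandered_blocks() above (editorial summary of
+ * the code, error handling elided):
+ *
+ * rest = ch->overwrite_set_size;
+ * for each chunk of the overwrite set {
+ * get_more_wandered_blocks(rest, &block, &len); // grab <= rest blocks
+ * add_region_to_wmap(cur, len, &block); // log (orig, wandered)
+ * write_jnodes_to_disk_extent(cur, len, &block, fq, 0); // submit IO
+ * rest -= len; advance cur by len jnodes;
+ * }
+ */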
78155+
78156+/* allocate the given number of blocks over the journal area, create jnodes
78157+ for them on ch->tx_list and format them as wander records; returns 0 on success */
78158+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78159+{
78160+ reiser4_blocknr_hint hint;
78161+ reiser4_block_nr allocated = 0;
78162+ reiser4_block_nr first, len;
78163+ jnode *cur;
78164+ jnode *txhead;
78165+ int ret;
78166+ reiser4_context *ctx;
78167+ reiser4_super_info_data *sbinfo;
78168+
78169+ assert("zam-698", ch->tx_size > 0);
78170+ assert("zam-699", list_empty_careful(&ch->tx_list));
78171+
78172+ ctx = get_current_context();
78173+ sbinfo = get_super_private(ctx->super);
78174+
78175+ while (allocated < (unsigned)ch->tx_size) {
78176+ len = (ch->tx_size - allocated);
78177+
78178+ reiser4_blocknr_hint_init(&hint);
78179+
78180+ hint.block_stage = BLOCK_GRABBED;
78181+
78182+ /* FIXME: there should be some block allocation policy for
78183+ nodes which contain wander records */
78184+
78185+ /* We assume that disk space for wandered record blocks can be
78186+ * taken from reserved area. */
78187+ ret = reiser4_alloc_blocks(&hint, &first, &len,
78188+ BA_FORMATTED | BA_RESERVED |
78189+ BA_USE_DEFAULT_SEARCH_START);
78190+ reiser4_blocknr_hint_done(&hint);
78191+
78192+ if (ret)
78193+ return ret;
78194+
78195+ allocated += len;
78196+
78197+ /* create jnodes for all wander records */
78198+ while (len--) {
78199+ cur = reiser4_alloc_io_head(&first);
78200+
78201+ if (cur == NULL) {
78202+ ret = RETERR(-ENOMEM);
78203+ goto free_not_assigned;
78204+ }
78205+
78206+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
78207+
78208+ if (ret != 0) {
78209+ jfree(cur);
78210+ goto free_not_assigned;
78211+ }
78212+
78213+ pin_jnode_data(cur);
78214+
78215+ list_add_tail(&cur->capture_link, &ch->tx_list);
78216+
78217+ first++;
78218+ }
78219+ }
78220+
78221+	{ /* format an on-disk linked list of wander records */
78222+ int serial = 1;
78223+
78224+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78225+ format_tx_head(ch);
78226+
78227+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78228+ while (&ch->tx_list != &cur->capture_link) {
78229+ format_wander_record(ch, cur, serial++);
78230+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78231+ }
78232+ }
78233+
78234+ { /* Fill wander records with Wandered Set */
78235+ struct store_wmap_params params;
78236+ txn_atom *atom;
78237+
78238+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78239+
78240+ params.idx = 0;
78241+ params.capacity =
78242+ wander_record_capacity(reiser4_get_current_sb());
78243+
78244+ atom = get_current_atom_locked();
78245+ blocknr_set_iterator(atom, &atom->wandered_map,
78246+ &store_wmap_actor, &params, 0);
78247+ spin_unlock_atom(atom);
78248+ }
78249+
78250+	{ /* jrelse all jnodes on the tx_list */
78251+ cur = list_entry(ch->tx_list.next, jnode, capture_link);
78252+ while (&ch->tx_list != &cur->capture_link) {
78253+ jrelse(cur);
78254+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78255+ }
78256+ }
78257+
78258+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78259+
78260+ return ret;
78261+
78262+ free_not_assigned:
78263+	/* We deallocate blocks not yet assigned to jnodes on tx_list. The
78264+	   caller takes care of invalidating the tx list */
78265+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78266+
78267+ return ret;
78268+}
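+
+/* Step summary of alloc_tx() above (editorial sketch):
+ * 1. reiser4_alloc_blocks() repeatedly until ch->tx_size blocks are taken
+ * from the reserved area;
+ * 2. reiser4_alloc_io_head() + jinit_new() create a pinned jnode per block,
+ * queued on ch->tx_list;
+ * 3. format_tx_head() / format_wander_record() build the on-disk linked
+ * list of wander records;
+ * 4. blocknr_set_iterator(..., store_wmap_actor, ...) fills the records
+ * with (original, wandered) block pairs;
+ * 5. write_jnode_list() submits the whole transaction for write.
+ */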
78269+
78270+static int commit_tx(struct commit_handle *ch)
78271+{
78272+ flush_queue_t *fq;
78273+ int barrier;
78274+ int ret;
78275+
78276+ /* Grab more space for wandered records. */
78277+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78278+ if (ret)
78279+ return ret;
78280+
78281+ fq = get_fq_for_current_atom();
78282+ if (IS_ERR(fq))
78283+ return PTR_ERR(fq);
78284+
78285+ spin_unlock_atom(fq->atom);
78286+ do {
78287+ ret = alloc_wandered_blocks(ch, fq);
78288+ if (ret)
78289+ break;
78290+ ret = alloc_tx(ch, fq);
78291+ if (ret)
78292+ break;
78293+ } while (0);
78294+
78295+ reiser4_fq_put(fq);
78296+ if (ret)
78297+ return ret;
78298+ repeat_wo_barrier:
78299+ barrier = reiser4_use_write_barrier(ch->super);
78300+ if (!barrier) {
78301+ ret = current_atom_finish_all_fq();
78302+ if (ret)
78303+ return ret;
78304+ }
78305+ ret = update_journal_header(ch, barrier);
78306+ if (barrier) {
78307+ if (ret) {
78308+ if (ret == -EOPNOTSUPP) {
78309+ disable_write_barrier(ch->super);
78310+ goto repeat_wo_barrier;
78311+ }
78312+ return ret;
78313+ }
78314+ ret = current_atom_finish_all_fq();
78315+ }
78316+ return ret;
78317+}
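+
+/* Barrier fallback shared by commit_tx() and write_tx_back() (editorial
+ * note): the journal header/footer update is first attempted as a barrier
+ * write; if the device reports -EOPNOTSUPP, barriers are disabled for this
+ * super block and the non-barrier path is retried, which orders the update
+ * by explicitly waiting on all flush queues first:
+ *
+ * barrier = reiser4_use_write_barrier(ch->super);
+ * if (!barrier)
+ * current_atom_finish_all_fq(); // wait for data writes...
+ * ret = update_journal_header(ch, barrier); // ...then plain update
+ */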
78318+
78319+static int write_tx_back(struct commit_handle * ch)
78320+{
78321+ flush_queue_t *fq;
78322+ int ret;
78323+ int barrier;
78324+
78325+ reiser4_post_commit_hook();
78326+ fq = get_fq_for_current_atom();
78327+ if (IS_ERR(fq))
78328+ return PTR_ERR(fq);
78329+ spin_unlock_atom(fq->atom);
78330+ ret = write_jnode_list(
78331+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78332+ reiser4_fq_put(fq);
78333+ if (ret)
78334+ return ret;
78335+ repeat_wo_barrier:
78336+ barrier = reiser4_use_write_barrier(ch->super);
78337+ if (!barrier) {
78338+ ret = current_atom_finish_all_fq();
78339+ if (ret)
78340+ return ret;
78341+ }
78342+ ret = update_journal_footer(ch, barrier);
78343+ if (barrier) {
78344+ if (ret) {
78345+ if (ret == -EOPNOTSUPP) {
78346+ disable_write_barrier(ch->super);
78347+ goto repeat_wo_barrier;
78348+ }
78349+ return ret;
78350+ }
78351+ ret = current_atom_finish_all_fq();
78352+ }
78353+ if (ret)
78354+ return ret;
78355+ reiser4_post_write_back_hook();
78356+ return 0;
78357+}
78358+
78359+/* We assume that at this moment all captured blocks are marked as RELOC or
78360+ WANDER (belong to the Relocate or Overwrite set), and all nodes from the
78361+ Relocate set have already been submitted for write.
78362+*/
78363+
78364+int reiser4_write_logs(long *nr_submitted)
78365+{
78366+ txn_atom *atom;
78367+ struct super_block *super = reiser4_get_current_sb();
78368+ reiser4_super_info_data *sbinfo = get_super_private(super);
78369+ struct commit_handle ch;
78370+ int ret;
78371+
78372+ writeout_mode_enable();
78373+
78374+ /* block allocator may add j-nodes to the clean_list */
78375+ ret = reiser4_pre_commit_hook();
78376+ if (ret)
78377+ return ret;
78378+
78379+	/* No locks are required if we take an atom whose stage is >=
78380+	 * ASTAGE_PRE_COMMIT */
78381+ atom = get_current_context()->trans->atom;
78382+ assert("zam-965", atom != NULL);
78383+
78384+ /* relocate set is on the atom->clean_nodes list after
78385+ * current_atom_complete_writes() finishes. It can be safely
78386+ * uncaptured after commit_mutex is locked, because any atom that
78387+ * captures these nodes is guaranteed to commit after current one.
78388+ *
78389+ * This can only be done after reiser4_pre_commit_hook(), because it is where
78390+ * early flushed jnodes with CREATED bit are transferred to the
78391+ * overwrite list. */
78392+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
78393+ spin_lock_atom(atom);
78394+ /* There might be waiters for the relocate nodes which we have
78395+ * released, wake them up. */
78396+ reiser4_atom_send_event(atom);
78397+ spin_unlock_atom(atom);
78398+
78399+ if (REISER4_DEBUG) {
78400+ int level;
78401+
78402+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78403+ assert("nikita-3352",
78404+ list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78405+ }
78406+
78407+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78408+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78409+
78410+ init_commit_handle(&ch, atom);
78411+
78412+ ch.free_blocks = sbinfo->blocks_free_committed;
78413+ ch.nr_files = sbinfo->nr_files_committed;
78414+ /* ZAM-FIXME-HANS: email me what the contention level is for the super
78415+ * lock. */
78416+ ch.next_oid = oid_next(super);
78417+
78418+ /* count overwrite set and place it in a separate list */
78419+ ret = get_overwrite_set(&ch);
78420+
78421+ if (ret <= 0) {
78422+		/* It is possible that the overwrite set is empty here; that
78423+		   means all captured nodes are clean */
78424+ goto up_and_ret;
78425+ }
78426+
78427+	/* Inform the caller how many dirty pages will be
78428+	 * submitted to disk. */
78429+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78430+
78431+	/* count all records needed for storing the wandered set */
78432+ get_tx_size(&ch);
78433+
78434+ ret = commit_tx(&ch);
78435+ if (ret)
78436+ goto up_and_ret;
78437+
78438+ spin_lock_atom(atom);
78439+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
78440+ spin_unlock_atom(atom);
78441+
78442+ ret = write_tx_back(&ch);
78443+ reiser4_post_write_back_hook();
78444+
78445+ up_and_ret:
78446+ if (ret) {
78447+ /* there could be fq attached to current atom; the only way to
78448+ remove them is: */
78449+ current_atom_finish_all_fq();
78450+ }
78451+
78452+ /* free blocks of flushed transaction */
78453+ dealloc_tx_list(&ch);
78454+ dealloc_wmap(&ch);
78455+
78456+ put_overwrite_set(&ch);
78457+
78458+ done_commit_handle(&ch);
78459+
78460+ writeout_mode_disable();
78461+
78462+ return ret;
78463+}
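+
+/* Commit sequence implemented by reiser4_write_logs() above (editorial
+ * summary, error handling elided):
+ *
+ * reiser4_pre_commit_hook();
+ * get_overwrite_set(&ch); // separate nodes to be wandered
+ * get_tx_size(&ch); // count wander record blocks needed
+ * commit_tx(&ch); // write wandered copies + tx head,
+ * // then update the journal header
+ * write_tx_back(&ch); // write overwrite set in place,
+ * // then update the journal footer
+ *
+ * A crash between the two phases is handled by journal replay below.
+ */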
78464+
78465+/* consistency checks for journal data/control blocks: header, footer, log
78466+ records, transaction head blocks. All functions return zero on success. */
78467+
78468+static int check_journal_header(const jnode * node UNUSED_ARG)
78469+{
78470+ /* FIXME: journal header has no magic field yet. */
78471+ return 0;
78472+}
78473+
78474+/* wait for write completion for all jnodes from given list */
78475+static int wait_on_jnode_list(struct list_head *head)
78476+{
78477+ jnode *scan;
78478+ int ret = 0;
78479+
78480+ list_for_each_entry(scan, head, capture_link) {
78481+ struct page *pg = jnode_page(scan);
78482+
78483+ if (pg) {
78484+ if (PageWriteback(pg))
78485+ wait_on_page_writeback(pg);
78486+
78487+ if (PageError(pg))
78488+ ret++;
78489+ }
78490+ }
78491+
78492+ return ret;
78493+}
78494+
78495+static int check_journal_footer(const jnode * node UNUSED_ARG)
78496+{
78497+ /* FIXME: journal footer has no magic field yet. */
78498+ return 0;
78499+}
78500+
78501+static int check_tx_head(const jnode * node)
78502+{
78503+ struct tx_header *header = (struct tx_header *)jdata(node);
78504+
78505+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78506+ warning("zam-627", "tx head at block %s corrupted\n",
78507+ sprint_address(jnode_get_block(node)));
78508+ return RETERR(-EIO);
78509+ }
78510+
78511+ return 0;
78512+}
78513+
78514+static int check_wander_record(const jnode * node)
78515+{
78516+ struct wander_record_header *RH =
78517+ (struct wander_record_header *)jdata(node);
78518+
78519+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78520+ 0) {
78521+ warning("zam-628", "wander record at block %s corrupted\n",
78522+ sprint_address(jnode_get_block(node)));
78523+ return RETERR(-EIO);
78524+ }
78525+
78526+ return 0;
78527+}
78528+
78529+/* fill the commit_handle structure with everything needed for update_journal_footer */
78530+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78531+{
78532+ struct tx_header *TXH;
78533+ int ret;
78534+
78535+ ret = jload(tx_head);
78536+ if (ret)
78537+ return ret;
78538+
78539+ TXH = (struct tx_header *)jdata(tx_head);
78540+
78541+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
78542+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78543+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78544+
78545+ jrelse(tx_head);
78546+
78547+ list_add(&tx_head->capture_link, &ch->tx_list);
78548+
78549+ return 0;
78550+}
78551+
78552+/* replay one transaction: restore and write overwrite set in place */
78553+static int replay_transaction(const struct super_block *s,
78554+ jnode * tx_head,
78555+ const reiser4_block_nr * log_rec_block_p,
78556+ const reiser4_block_nr * end_block,
78557+ unsigned int nr_wander_records)
78558+{
78559+ reiser4_block_nr log_rec_block = *log_rec_block_p;
78560+ struct commit_handle ch;
78561+ LIST_HEAD(overwrite_set);
78562+ jnode *log;
78563+ int ret;
78564+
78565+ init_commit_handle(&ch, NULL);
78566+ ch.overwrite_set = &overwrite_set;
78567+
78568+	ret = restore_commit_handle(&ch, tx_head);
+	if (ret)
+		goto free_ow_set;
78569+
78570+ while (log_rec_block != *end_block) {
78571+ struct wander_record_header *header;
78572+ struct wander_entry *entry;
78573+
78574+ int i;
78575+
78576+ if (nr_wander_records == 0) {
78577+ warning("zam-631",
78578+				"number of wander records in the linked list"
78579+				" is greater than the number stored in tx head.\n");
78580+ ret = RETERR(-EIO);
78581+ goto free_ow_set;
78582+ }
78583+
78584+ log = reiser4_alloc_io_head(&log_rec_block);
78585+ if (log == NULL)
78586+ return RETERR(-ENOMEM);
78587+
78588+ ret = jload(log);
78589+ if (ret < 0) {
78590+ reiser4_drop_io_head(log);
78591+ return ret;
78592+ }
78593+
78594+ ret = check_wander_record(log);
78595+ if (ret) {
78596+ jrelse(log);
78597+ reiser4_drop_io_head(log);
78598+ return ret;
78599+ }
78600+
78601+ header = (struct wander_record_header *)jdata(log);
78602+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78603+
78604+ entry = (struct wander_entry *)(header + 1);
78605+
78606+ /* restore overwrite set from wander record content */
78607+ for (i = 0; i < wander_record_capacity(s); i++) {
78608+ reiser4_block_nr block;
78609+ jnode *node;
78610+
78611+ block = le64_to_cpu(get_unaligned(&entry->wandered));
78612+ if (block == 0)
78613+ break;
78614+
78615+ node = reiser4_alloc_io_head(&block);
78616+ if (node == NULL) {
78617+ ret = RETERR(-ENOMEM);
78618+ /*
78619+ * FIXME-VS:???
78620+ */
78621+ jrelse(log);
78622+ reiser4_drop_io_head(log);
78623+ goto free_ow_set;
78624+ }
78625+
78626+ ret = jload(node);
78627+
78628+ if (ret < 0) {
78629+ reiser4_drop_io_head(node);
78630+ /*
78631+ * FIXME-VS:???
78632+ */
78633+ jrelse(log);
78634+ reiser4_drop_io_head(log);
78635+ goto free_ow_set;
78636+ }
78637+
78638+ block = le64_to_cpu(get_unaligned(&entry->original));
78639+
78640+ assert("zam-603", block != 0);
78641+
78642+ jnode_set_block(node, &block);
78643+
78644+ list_add_tail(&node->capture_link, ch.overwrite_set);
78645+
78646+ ++entry;
78647+ }
78648+
78649+ jrelse(log);
78650+ reiser4_drop_io_head(log);
78651+
78652+ --nr_wander_records;
78653+ }
78654+
78655+ if (nr_wander_records != 0) {
78656+ warning("zam-632", "number of wander records in the linked list"
78657+			" is less than the number stored in tx head.\n");
78658+ ret = RETERR(-EIO);
78659+ goto free_ow_set;
78660+ }
78661+
78662+ { /* write wandered set in place */
78663+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
78664+ ret = wait_on_jnode_list(ch.overwrite_set);
78665+
78666+ if (ret) {
78667+ ret = RETERR(-EIO);
78668+ goto free_ow_set;
78669+ }
78670+ }
78671+
78672+ ret = update_journal_footer(&ch, 0);
78673+
78674+ free_ow_set:
78675+
78676+ while (!list_empty(ch.overwrite_set)) {
78677+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
78678+ list_del_init(&cur->capture_link);
78679+ jrelse(cur);
78680+ reiser4_drop_io_head(cur);
78681+ }
78682+
78683+ list_del_init(&tx_head->capture_link);
78684+
78685+ done_commit_handle(&ch);
78686+
78687+ return ret;
78688+}
78689+
78690+/* find the oldest committed but not yet replayed transaction and replay it.
78691+ * Such a transaction was committed and the journal header block was updated,
78692+ * but writing the atom's overwrite set in place and updating the journal
78693+ * footer block were not completed. This function completes the process by
78694+ * recovering the atom's overwrite set from its wandered locations, writing
78695+ * it in place and updating the journal footer. */
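+
+/* On-disk picture assumed by the search loop below (editorial sketch):
+ *
+ * journal header -> tx_head(N) -prev_tx-> tx_head(N-1) -> ... -> tx_head(F)
+ * ^-- journal footer's last_flushed_tx
+ *
+ * The loop walks the prev_tx chain back from the last committed transaction
+ * until it finds the transaction whose prev_tx equals last_flushed_tx; that
+ * is the oldest unreplayed transaction and it is replayed first.
+ */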
78696+static int replay_oldest_transaction(struct super_block *s)
78697+{
78698+ reiser4_super_info_data *sbinfo = get_super_private(s);
78699+ jnode *jf = sbinfo->journal_footer;
78700+ unsigned int total;
78701+ struct journal_footer *F;
78702+ struct tx_header *T;
78703+
78704+ reiser4_block_nr prev_tx;
78705+ reiser4_block_nr last_flushed_tx;
78706+ reiser4_block_nr log_rec_block = 0;
78707+
78708+ jnode *tx_head;
78709+
78710+ int ret;
78711+
78712+ if ((ret = jload(jf)) < 0)
78713+ return ret;
78714+
78715+ F = (struct journal_footer *)jdata(jf);
78716+
78717+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
78718+
78719+ jrelse(jf);
78720+
78721+ if (sbinfo->last_committed_tx == last_flushed_tx) {
78722+ /* all transactions are replayed */
78723+ return 0;
78724+ }
78725+
78726+ prev_tx = sbinfo->last_committed_tx;
78727+
78728+	/* search for the oldest not yet flushed transaction */
78729+ while (1) {
78730+ tx_head = reiser4_alloc_io_head(&prev_tx);
78731+ if (!tx_head)
78732+ return RETERR(-ENOMEM);
78733+
78734+ ret = jload(tx_head);
78735+ if (ret < 0) {
78736+ reiser4_drop_io_head(tx_head);
78737+ return ret;
78738+ }
78739+
78740+ ret = check_tx_head(tx_head);
78741+ if (ret) {
78742+ jrelse(tx_head);
78743+ reiser4_drop_io_head(tx_head);
78744+ return ret;
78745+ }
78746+
78747+ T = (struct tx_header *)jdata(tx_head);
78748+
78749+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
78750+
78751+ if (prev_tx == last_flushed_tx)
78752+ break;
78753+
78754+ jrelse(tx_head);
78755+ reiser4_drop_io_head(tx_head);
78756+ }
78757+
78758+ total = le32_to_cpu(get_unaligned(&T->total));
78759+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
78760+
78761+ pin_jnode_data(tx_head);
78762+ jrelse(tx_head);
78763+
78764+ ret =
78765+ replay_transaction(s, tx_head, &log_rec_block,
78766+ jnode_get_block(tx_head), total - 1);
78767+
78768+ unpin_jnode_data(tx_head);
78769+ reiser4_drop_io_head(tx_head);
78770+
78771+ if (ret)
78772+ return ret;
78773+ return -E_REPEAT;
78774+}
78775+
78776+/* The current reiser4 journal implementation is optimized not to capture the
78777+ super block when only certain super block fields are modified. Currently,
78778+ that set is (<free block count>, <OID allocator>). These fields are logged
78779+ in a special way: they are stored in each transaction head block at
78780+ atom commit time, and that information is written to the journal footer
78781+ block at atom flush time. To carry this info from the journal footer block
78782+ to the in-memory super block there is a special function,
78783+ reiser4_journal_recover_sb_data(), which should be called after the disk
78784+ format plugin re-reads the super block after journal replay.
78785+*/
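+
+/* Data flow for these logged super block fields (editorial illustration):
+ *
+ * commit: free_blocks / nr_files / next_oid -> tx head (struct tx_header)
+ * flush: tx head -> journal footer (struct journal_footer)
+ * mount: journal footer -> in-memory super block, via
+ * reiser4_journal_recover_sb_data() below
+ */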
78786+
78787+/* copy the information from the journal footer into the in-memory super block */
78788+int reiser4_journal_recover_sb_data(struct super_block *s)
78789+{
78790+ reiser4_super_info_data *sbinfo = get_super_private(s);
78791+ struct journal_footer *jf;
78792+ int ret;
78793+
78794+ assert("zam-673", sbinfo->journal_footer != NULL);
78795+
78796+ ret = jload(sbinfo->journal_footer);
78797+ if (ret != 0)
78798+ return ret;
78799+
78800+ ret = check_journal_footer(sbinfo->journal_footer);
78801+ if (ret != 0)
78802+ goto out;
78803+
78804+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
78805+
78806+ /* was there at least one flushed transaction? */
78807+ if (jf->last_flushed_tx) {
78808+
78809+ /* restore free block counter logged in this transaction */
78810+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
78811+
78812+ /* restore oid allocator state */
78813+ oid_init_allocator(s,
78814+ le64_to_cpu(get_unaligned(&jf->nr_files)),
78815+ le64_to_cpu(get_unaligned(&jf->next_oid)));
78816+ }
78817+ out:
78818+ jrelse(sbinfo->journal_footer);
78819+ return ret;
78820+}
78821+
78822+/* reiser4 replay journal procedure */
78823+int reiser4_journal_replay(struct super_block *s)
78824+{
78825+ reiser4_super_info_data *sbinfo = get_super_private(s);
78826+ jnode *jh, *jf;
78827+ struct journal_header *header;
78828+ int nr_tx_replayed = 0;
78829+ int ret;
78830+
78831+ assert("zam-582", sbinfo != NULL);
78832+
78833+ jh = sbinfo->journal_header;
78834+ jf = sbinfo->journal_footer;
78835+
78836+ if (!jh || !jf) {
78837+		/* it is possible that the disk layout does not support journal
78838+		   structures; we just warn about this */
78839+ warning("zam-583",
78840+ "journal control blocks were not loaded by disk layout plugin. "
78841+ "journal replaying is not possible.\n");
78842+ return 0;
78843+ }
78844+
78845+	/* Take the free block count from the journal footer block. The free
78846+	   block counter value corresponds to the last flushed transaction state */
78847+ ret = jload(jf);
78848+ if (ret < 0)
78849+ return ret;
78850+
78851+ ret = check_journal_footer(jf);
78852+ if (ret) {
78853+ jrelse(jf);
78854+ return ret;
78855+ }
78856+
78857+ jrelse(jf);
78858+
78859+ /* store last committed transaction info in reiser4 in-memory super
78860+ block */
78861+ ret = jload(jh);
78862+ if (ret < 0)
78863+ return ret;
78864+
78865+ ret = check_journal_header(jh);
78866+ if (ret) {
78867+ jrelse(jh);
78868+ return ret;
78869+ }
78870+
78871+ header = (struct journal_header *)jdata(jh);
78872+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
78873+
78874+ jrelse(jh);
78875+
78876+ /* replay committed transactions */
78877+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
78878+ nr_tx_replayed++;
78879+
78880+ return ret;
78881+}
78882+
78883+/* load journal control block (either journal header or journal footer block) */
78884+static int
78885+load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
78886+{
78887+ int ret;
78888+
78889+ *node = reiser4_alloc_io_head(block);
78890+ if (!(*node))
78891+ return RETERR(-ENOMEM);
78892+
78893+ ret = jload(*node);
78894+
78895+ if (ret) {
78896+ reiser4_drop_io_head(*node);
78897+ *node = NULL;
78898+ return ret;
78899+ }
78900+
78901+ pin_jnode_data(*node);
78902+ jrelse(*node);
78903+
78904+ return 0;
78905+}
78906+
78907+/* unload journal header or footer and free jnode */
78908+static void unload_journal_control_block(jnode ** node)
78909+{
78910+ if (*node) {
78911+ unpin_jnode_data(*node);
78912+ reiser4_drop_io_head(*node);
78913+ *node = NULL;
78914+ }
78915+}
78916+
78917+/* release journal control blocks */
78918+void reiser4_done_journal_info(struct super_block *s)
78919+{
78920+ reiser4_super_info_data *sbinfo = get_super_private(s);
78921+
78922+ assert("zam-476", sbinfo != NULL);
78923+
78924+ unload_journal_control_block(&sbinfo->journal_header);
78925+ unload_journal_control_block(&sbinfo->journal_footer);
78926+ rcu_barrier();
78927+}
78928+
78929+/* load journal control blocks */
78930+int reiser4_init_journal_info(struct super_block *s)
78931+{
78932+ reiser4_super_info_data *sbinfo = get_super_private(s);
78933+ journal_location *loc;
78934+ int ret;
78935+
78936+ loc = &sbinfo->jloc;
78937+
78938+ assert("zam-651", loc != NULL);
78939+ assert("zam-652", loc->header != 0);
78940+ assert("zam-653", loc->footer != 0);
78941+
78942+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
78943+
78944+ if (ret)
78945+ return ret;
78946+
78947+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
78948+
78949+ if (ret) {
78950+ unload_journal_control_block(&sbinfo->journal_header);
78951+ }
78952+
78953+ return ret;
78954+}
78955+
78956+/* Make Linus happy.
78957+ Local variables:
78958+ c-indentation-style: "K&R"
78959+ mode-name: "LC"
78960+ c-basic-offset: 8
78961+ tab-width: 8
78962+ fill-column: 80
78963+ End:
78964+*/
78965diff -urN linux-2.6.20.orig/fs/reiser4/wander.h linux-2.6.20/fs/reiser4/wander.h
78966--- linux-2.6.20.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
78967+++ linux-2.6.20/fs/reiser4/wander.h 2007-05-06 14:50:43.903039466 +0400
78968@@ -0,0 +1,135 @@
78969+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
78970+
78971+#if !defined (__FS_REISER4_WANDER_H__)
78972+#define __FS_REISER4_WANDER_H__
78973+
78974+#include "dformat.h"
78975+
78976+#include <linux/fs.h> /* for struct super_block */
78977+
78978+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
78979+
78980+#define TX_HEADER_MAGIC "TxMagic4"
78981+#define WANDER_RECORD_MAGIC "LogMagc4"
78982+
78983+#define TX_HEADER_MAGIC_SIZE (8)
78984+#define WANDER_RECORD_MAGIC_SIZE (8)
78985+
78986+/* journal header block format */
78987+struct journal_header {
78988+ /* last written transaction head location */
78989+ d64 last_committed_tx;
78990+};
78991+
78992+typedef struct journal_location {
78993+ reiser4_block_nr footer;
78994+ reiser4_block_nr header;
78995+} journal_location;
78996+
78997+/* The wander.c head comment describes the usage and semantics of all these structures */
78998+/* journal footer block format */
78999+struct journal_footer {
79000+ /* last flushed transaction location. */
79001+	/* This block number is no longer valid after the transaction it points
79002+	   to gets flushed; it is used only at journal replay time to detect
79003+	   the end of the on-disk list of committed transactions which were
79004+	   not flushed completely */
79005+ d64 last_flushed_tx;
79006+
79007+	/* the free block counter is written to the journal footer at
79008+	   transaction flush time, not to the super block, because the free
79009+	   block counter is logged differently than super block fields (the
79010+	   root pointer, for example). */
79011+ d64 free_blocks;
79012+
79013+ /* number of used OIDs and maximal used OID are logged separately from
79014+ super block */
79015+ d64 nr_files;
79016+ d64 next_oid;
79017+};
79018+
79019+/* Each wander record (except the first one) has a unified format: a wander
79020+ record header followed by an array of log entries */
79021+struct wander_record_header {
79022+ /* when there is no predefined location for wander records, this magic
79023+ string should help reiser4fsck. */
79024+ char magic[WANDER_RECORD_MAGIC_SIZE];
79025+
79026+ /* transaction id */
79027+ d64 id;
79028+
79029+ /* total number of wander records in current transaction */
79030+ d32 total;
79031+
79032+	/* serial number of this block within the transaction */
79033+ d32 serial;
79034+
79035+	/* block number of the next wander record in this transaction */
79036+ d64 next_block;
79037+};
79038+
79039+/* The first wander record (transaction head) of a written transaction has a
79040+ special format */
79041+struct tx_header {
79042+	/* magic string makes the first block in a transaction different from
79043+	   other logged blocks; it should help fsck. */
79044+ char magic[TX_HEADER_MAGIC_SIZE];
79045+
79046+ /* transaction id */
79047+ d64 id;
79048+
79049+ /* total number of records (including this first tx head) in the
79050+ transaction */
79051+ d32 total;
79052+
79053+	/* align next field to an 8-byte boundary; this field is always zero */
79054+ d32 padding;
79055+
79056+ /* block number of previous transaction head */
79057+ d64 prev_tx;
79058+
79059+ /* next wander record location */
79060+ d64 next_block;
79061+
79062+	/* committed version of the free blocks counter */
79063+ d64 free_blocks;
79064+
79065+ /* number of used OIDs (nr_files) and maximal used OID are logged
79066+ separately from super block */
79067+ d64 nr_files;
79068+ d64 next_oid;
79069+};
79070+
79071+/* A transaction gets written to disk as a set of wander records (each wander
79072+ record is one fs block in size) */
79073+
79074+/* As noted above, the rest of a wander record (after its header) is filled
79075+ with these log entries; unused space is filled with zeroes */
79076+struct wander_entry {
79077+ d64 original; /* block original location */
79078+ d64 wandered; /* block wandered location */
79079+};
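+
+/* Example on-disk layout of a small transaction (editorial sketch; the block
+ * numbers are made up):
+ *
+ * block 500: struct tx_header { total = 3, next_block = 501, ... }
+ * block 501: struct wander_record_header { serial = 1, next_block = 502 }
+ * followed by an array of struct wander_entry pairs
+ * block 502: struct wander_record_header { serial = 2, ... }
+ * followed by wander_entry pairs, zero-padded at the end
+ */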
79080+
79081+/* REISER4 JOURNAL WRITER FUNCTIONS */
79082+
79083+extern int reiser4_write_logs(long *);
79084+extern int reiser4_journal_replay(struct super_block *);
79085+extern int reiser4_journal_recover_sb_data(struct super_block *);
79086+
79087+extern int reiser4_init_journal_info(struct super_block *);
79088+extern void reiser4_done_journal_info(struct super_block *);
79089+
79090+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79091+
79092+#endif /* __FS_REISER4_WANDER_H__ */
79093+
79094+/* Make Linus happy.
79095+ Local variables:
79096+ c-indentation-style: "K&R"
79097+ mode-name: "LC"
79098+ c-basic-offset: 8
79099+ tab-width: 8
79100+ fill-column: 80
79101+ scroll-step: 1
79102+ End:
79103+*/
79104diff -urN linux-2.6.20.orig/fs/reiser4/writeout.h linux-2.6.20/fs/reiser4/writeout.h
79105--- linux-2.6.20.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
79106+++ linux-2.6.20/fs/reiser4/writeout.h 2007-05-06 14:50:43.907040716 +0400
79107@@ -0,0 +1,21 @@
79108+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
79109+
79110+#if !defined (__FS_REISER4_WRITEOUT_H__)
+#define __FS_REISER4_WRITEOUT_H__
79111+
79112+#define WRITEOUT_SINGLE_STREAM (0x1)
79113+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
79114+#define WRITEOUT_BARRIER (0x4)
79115+
79116+extern int reiser4_get_writeout_flags(void);
79117+
79118+#endif /* __FS_REISER4_WRITEOUT_H__ */
79119+
79120+/* Make Linus happy.
79121+ Local variables:
79122+ c-indentation-style: "K&R"
79123+ mode-name: "LC"
79124+ c-basic-offset: 8
79125+ tab-width: 8
79126+ fill-column: 80
79127+ End:
79128+*/
79129diff -urN linux-2.6.20.orig/fs/reiser4/znode.c linux-2.6.20/fs/reiser4/znode.c
79130--- linux-2.6.20.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
79131+++ linux-2.6.20/fs/reiser4/znode.c 2007-05-06 14:50:43.907040716 +0400
79132@@ -0,0 +1,1029 @@
79133+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79134+ * reiser4/README */
79135+/* Znode manipulation functions. */
79136+/* Znode is the in-memory header for a tree node. It is stored
79137+ separately from the node itself so that it does not get written to
79138+ disk. In this respect znode is like buffer head or page head. We
79139+ also use znodes for additional reiser4 specific purposes:
79140+
79141+ . they are organized into a tree structure which is a part of the whole
79142+ reiser4 tree.
79143+ . they are used to implement node-grained locking
79144+ . they are used to keep additional state associated with a
79145+ node
79146+ . they contain links to lists used by the transaction manager
79147+
79148+ Znode is attached to some variable "block number" which is instance of
79149+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79150+ appropriate node being actually loaded in memory. Existence of znode itself
79151+ is regulated by reference count (->x_count) in it. Each time thread
79152+ acquires reference to znode through call to zget(), ->x_count is
79153+ incremented and decremented on call to zput(). Data (content of node) are
79154+ brought into memory through a call to zload(), which also increments ->d_count
79155+ reference counter. zload can block waiting on IO. Call to zrelse()
79156+ decreases this counter. Also, ->c_count keeps track of number of child
79157+ znodes and prevents parent znode from being recycled until all of its
79158+ children are. ->c_count is decremented whenever child goes out of existence
79159+ (being actually recycled in zdestroy()) which can be some time after last
79160+ reference to this child dies if we support some form of LRU cache for
79161+ znodes.
79162+
79163+*/
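+
+/* Typical reference discipline (editorial sketch, simplified from real
+ * callers; locking and error handling mostly elided):
+ *
+ * znode *node = zget(tree, &blocknr, parent, level, gfp); // x_count++
+ * if (!IS_ERR(node)) {
+ * if (zload(node) == 0) { // d_count++, may read from disk
+ * ... use zdata(node) ...
+ * zrelse(node); // d_count--
+ * }
+ * zput(node); // x_count--, znode stays cached
+ * }
+ */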
79164+/* EVERY ZNODE'S STORY
79165+
79166+ 1. His infancy.
79167+
79168+ Once upon a time, the znode was born deep inside of zget() by call to
79169+ zalloc(). At the return from zget() znode had:
79170+
79171+ . reference counter (x_count) of 1
79172+ . assigned block number, marked as used in bitmap
79173+ . pointer to parent znode. Root znode parent pointer points
79174+ to its father: "fake" znode. This, in turn, has NULL parent pointer.
79175+ . hash table linkage
79176+ . no data loaded from disk
79177+ . no node plugin
79178+ . no sibling linkage
79179+
79180+ 2. His childhood
79181+
79182+ Each node is either brought into memory as a result of tree traversal, or
79183+ created afresh, creation of the root being a special case of the latter. In
79184+ either case it's inserted into sibling list. This will typically require
79185+ some ancillary tree traversing, but ultimately both sibling pointers will
79186+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79187+ zjnode.state.
79188+
79189+ 3. His youth.
79190+
79191+ If znode is bound to already existing node in a tree, its content is read
79192+ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79193+ in zjnode.state and zdata() function starts to return non null for this
79194+ znode. zload() further calls zparse() that determines which node layout
79195+ this node is rendered in, and sets ->nplug on success.
79196+
79197+ If znode is for new node just created, memory for it is allocated and
79198+ zinit_new() function is called to initialise data, according to selected
79199+ node layout.
79200+
79201+ 4. His maturity.
79202+
79203+ After this point, znode lingers in memory for some time. Threads can
79204+ acquire references to znode either by blocknr through call to zget(), or by
79205+ following a pointer to unallocated znode from internal item. Each time
79206+ reference to znode is obtained, x_count is increased. Thread can read/write
79207+ lock znode. Znode data can be loaded through calls to zload(), d_count will
79208+ be increased appropriately. If all references to znode are released
79209+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
79210+ still cached in the hash table in the hope that it will be accessed
79211+ shortly.
79212+
79213+ There are two ways in which znode existence can be terminated:
79214+
79215+ . sudden death: node bound to this znode is removed from the tree
79216+ . overpopulation: znode is purged out of memory due to memory pressure
79217+
79218+ 5. His death.
79219+
79220+ Death is a complex process.
79221+
79222+ When we irrevocably commit ourselves to decision to remove node from the
79223+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79224+ znode. This is done either in ->kill_hook() of internal item or in
79225+ reiser4_kill_root() function when tree root is removed.
79226+
79227+ At this moment znode still has:
79228+
79229+ . locks held on it, necessarily write ones
79230+ . references to it
79231+ . disk block assigned to it
79232+ . data loaded from the disk
79233+ . pending requests for lock
79234+
79235+ But once the JNODE_HEARD_BANSHEE bit is set, the last call to unlock_znode()
79236+ does node deletion. Node deletion includes two phases. First, all ways to get
79237+ references to that znode (sibling and parent links and hash lookup using
79238+ block number stored in parent node) should be deleted -- it is done through
79239+ sibling_list_remove(); we also assume that nobody uses the down link from
79240+ the parent node, due to its nonexistence or proper parent node locking, and
79241+ that nobody uses parent pointers from children, due to their absence. Second,
79242+ we invalidate all pending lock requests which are still on the znode's lock
79243+ request queue; this is done by reiser4_invalidate_lock(). A separate
79244+ znode status bit, JNODE_IS_DYING, is used to invalidate pending lock requests.
79245+ Once it is set, all requesters are forced to return -EINVAL from
79246+ longterm_lock_znode(). Future locking attempts are not possible because all
79247+ ways to get references to that znode are removed already. Last, node is
79248+ uncaptured from transaction.
79249+
79250+ When the last reference to the dying znode is just about to be released,
79251+ the block number for this znode is released and the znode is removed from
79252+ the hash table.
79253+
79254+ Now znode can be recycled.
79255+
79256+ [it's possible to free bitmap block and remove znode from the hash
79257+ table when last lock is released. This will result in having
79258+ referenced but completely orphaned znode]
79259+
79260+ 6. Limbo
79261+
79262+ As has been mentioned above, znodes with reference counter 0 are
79263+ still cached in a hash table. Once memory pressure increases they are
79264+ purged out of there [this requires something like LRU list for
79265+ efficient implementation. LRU list would also greatly simplify
79266+ implementation of coord cache that would in this case morph to just
79267+ scanning some initial segment of LRU list]. Data loaded into
79268+ unreferenced znode are flushed back to the durable storage if
79269+ necessary and memory is freed. Znodes themselves can be recycled at
79270+ this point too.
79271+
79272+*/
79273+
79274+#include "debug.h"
79275+#include "dformat.h"
79276+#include "key.h"
79277+#include "coord.h"
79278+#include "plugin/plugin_header.h"
79279+#include "plugin/node/node.h"
79280+#include "plugin/plugin.h"
79281+#include "txnmgr.h"
79282+#include "jnode.h"
79283+#include "znode.h"
79284+#include "block_alloc.h"
79285+#include "tree.h"
79286+#include "tree_walk.h"
79287+#include "super.h"
79288+#include "reiser4.h"
79289+
79290+#include <linux/pagemap.h>
79291+#include <linux/spinlock.h>
79292+#include <linux/slab.h>
79293+#include <linux/err.h>
79294+
79295+static z_hash_table *get_htable(reiser4_tree *,
79296+ const reiser4_block_nr * const blocknr);
79297+static z_hash_table *znode_get_htable(const znode *);
79298+static void zdrop(znode *);
79299+
79300+/* hash table support */
79301+
79302+/* compare two block numbers for equality. Used by hash-table macros */
79303+static inline int
79304+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79305+{
79306+ assert("nikita-534", b1 != NULL);
79307+ assert("nikita-535", b2 != NULL);
79308+
79309+ return *b1 == *b2;
79310+}
79311+
79312+/* Hash znode by block number. Used by hash-table macros */
79313+/* Audited by: umka (2002.06.11) */
79314+static inline __u32
79315+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79316+{
79317+ assert("nikita-536", b != NULL);
79318+
79319+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79320+}
79321+
79322+/* The hash table definition */
79323+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
79324+#define KFREE(ptr, size) kfree(ptr)
79325+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79326+ blknrhashfn, blknreq);
79327+#undef KFREE
79328+#undef KMALLOC
79329+
79330+/* slab for znodes */
79331+static struct kmem_cache *znode_cache;
79332+
79333+int znode_shift_order;
79334+
79335+/**
79336+ * init_znodes - create znode cache
79337+ *
79338+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79339+ */
79340+int init_znodes(void)
79341+{
79342+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79343+ SLAB_HWCACHE_ALIGN |
79344+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79345+ if (znode_cache == NULL)
79346+ return RETERR(-ENOMEM);
79347+
79348+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79349+ ++znode_shift_order);
79350+ --znode_shift_order;
79351+ return 0;
79352+}
79353+
79354+/**
79355+ * done_znodes - delete znode cache
79356+ *
79357+ * This is called on reiser4 module unloading or system shutdown.
79358+ */
79359+void done_znodes(void)
79360+{
79361+ destroy_reiser4_cache(&znode_cache);
79362+}
79363+
79364+/* call this to initialise tree of znodes */
79365+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79366+{
79367+ int result;
79368+ assert("umka-050", tree != NULL);
79369+
79370+ rwlock_init(&tree->dk_lock);
79371+
79372+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79373+ if (result != 0)
79374+ return result;
79375+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79376+ return result;
79377+}
79378+
79379+/* free this znode */
79380+void zfree(znode * node /* znode to free */ )
79381+{
79382+ assert("nikita-465", node != NULL);
79383+ assert("nikita-2120", znode_page(node) == NULL);
79384+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
79385+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79386+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79387+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79388+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79389+ assert("nikita-3293", !znode_is_right_connected(node));
79390+ assert("nikita-3294", !znode_is_left_connected(node));
79391+ assert("nikita-3295", node->left == NULL);
79392+ assert("nikita-3296", node->right == NULL);
79393+
79394+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
79395+
79396+ kmem_cache_free(znode_cache, node);
79397+}
79398+
79399+/* call this to free tree of znodes */
79400+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79401+{
79402+ znode *node;
79403+ znode *next;
79404+ z_hash_table *ztable;
79405+
79406+ /* scan znode hash-tables and kill all znodes, then free hash tables
79407+ * themselves. */
79408+
79409+ assert("nikita-795", tree != NULL);
79410+
79411+ ztable = &tree->zhash_table;
79412+
79413+ if (ztable->_table != NULL) {
79414+ for_all_in_htable(ztable, z, node, next) {
79415+ node->c_count = 0;
79416+ node->in_parent.node = NULL;
79417+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79418+ zdrop(node);
79419+ }
79420+
79421+ z_hash_done(&tree->zhash_table);
79422+ }
79423+
79424+ ztable = &tree->zfake_table;
79425+
79426+ if (ztable->_table != NULL) {
79427+ for_all_in_htable(ztable, z, node, next) {
79428+ node->c_count = 0;
79429+ node->in_parent.node = NULL;
79430+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79431+ zdrop(node);
79432+ }
79433+
79434+ z_hash_done(&tree->zfake_table);
79435+ }
79436+}
79437+
79438+/* ZNODE STRUCTURES */
79439+
79440+/* allocate fresh znode */
79441+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79442+{
79443+ znode *node;
79444+
79445+ node = kmem_cache_alloc(znode_cache, gfp_flag);
79446+ return node;
79447+}
79448+
79449+/* Initialize fields of znode
79450+ @node: znode to initialize;
79451+ @parent: parent znode;
79452+ @tree: tree we are in. */
79453+void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79454+{
79455+ assert("nikita-466", node != NULL);
79456+ assert("umka-268", current_tree != NULL);
79457+
79458+ memset(node, 0, sizeof *node);
79459+
79460+ assert("umka-051", tree != NULL);
79461+
79462+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79463+ reiser4_init_lock(&node->lock);
79464+ init_parent_coord(&node->in_parent, parent);
79465+}
79466+
79467+/*
79468+ * remove znode from indices. This is called by jput() when the last
79469+ * reference on the znode is released.
79470+ */
79471+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79472+{
79473+ assert("nikita-2108", node != NULL);
79474+ assert("nikita-470", node->c_count == 0);
79475+ assert_rw_write_locked(&(tree->tree_lock));
79476+
79477+ /* remove reference to this znode from cbk cache */
79478+ cbk_cache_invalidate(node, tree);
79479+
79480+ /* update c_count of parent */
79481+ if (znode_parent(node) != NULL) {
79482+ assert("nikita-472", znode_parent(node)->c_count > 0);
79483+ /* father, onto your hands I forward my spirit... */
79484+ znode_parent(node)->c_count--;
79485+ node->in_parent.node = NULL;
79486+ } else {
79487+ /* orphaned znode?! Root? */
79488+ }
79489+
79490+ /* remove znode from hash-table */
79491+ z_hash_remove_rcu(znode_get_htable(node), node);
79492+}
79493+
79494+/* zdrop() -- Remove znode from the tree.
79495+
79496+ This is called when the znode is removed from memory. */
79497+static void zdrop(znode * node /* znode to finish with */ )
79498+{
79499+ jdrop(ZJNODE(node));
79500+}
79501+
79502+/*
79503+ * put znode into right place in the hash table. This is called by relocate
79504+ * code.
79505+ */
79506+int znode_rehash(znode * node /* node to rehash */ ,
79507+ const reiser4_block_nr * new_block_nr /* new block number */ )
79508+{
79509+ z_hash_table *oldtable;
79510+ z_hash_table *newtable;
79511+ reiser4_tree *tree;
79512+
79513+ assert("nikita-2018", node != NULL);
79514+
79515+ tree = znode_get_tree(node);
79516+ oldtable = znode_get_htable(node);
79517+ newtable = get_htable(tree, new_block_nr);
79518+
79519+ write_lock_tree(tree);
79520+ /* remove znode from hash-table */
79521+ z_hash_remove_rcu(oldtable, node);
79522+
79523+ /* assertion no longer valid due to RCU */
79524+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79525+
79526+ /* update blocknr */
79527+ znode_set_block(node, new_block_nr);
79528+ node->zjnode.key.z = *new_block_nr;
79529+
79530+ /* insert it into hash */
79531+ z_hash_insert_rcu(newtable, node);
79532+ write_unlock_tree(tree);
79533+ return 0;
79534+}
79535+
79536+/* ZNODE LOOKUP, GET, PUT */
79537+
79538+/* zlook() - get znode with given block_nr in a hash table or return NULL
79539+
79540+ If result is non-NULL then the znode's x_count is incremented. Internal version
79541+ accepts a pre-computed hash index. The hash table is accessed under the
79542+ RCU read lock.
79543+*/
79544+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79545+{
79546+ znode *result;
79547+ __u32 hash;
79548+ z_hash_table *htable;
79549+
79550+ assert("jmacd-506", tree != NULL);
79551+ assert("jmacd-507", blocknr != NULL);
79552+
79553+ htable = get_htable(tree, blocknr);
79554+ hash = blknrhashfn(htable, blocknr);
79555+
79556+ rcu_read_lock();
79557+ result = z_hash_find_index(htable, hash, blocknr);
79558+
79559+ if (result != NULL) {
79560+ add_x_ref(ZJNODE(result));
79561+ result = znode_rip_check(tree, result);
79562+ }
79563+ rcu_read_unlock();
79564+
79565+ return result;
79566+}
79567+
79568+/* return hash table where znode with block @blocknr is (or should be)
79569+ * stored */
79570+static z_hash_table *get_htable(reiser4_tree * tree,
79571+ const reiser4_block_nr * const blocknr)
79572+{
79573+ z_hash_table *table;
79574+ if (is_disk_addr_unallocated(blocknr))
79575+ table = &tree->zfake_table;
79576+ else
79577+ table = &tree->zhash_table;
79578+ return table;
79579+}
79580+
79581+/* return hash table where znode @node is (or should be) stored */
79582+static z_hash_table *znode_get_htable(const znode * node)
79583+{
79584+ return get_htable(znode_get_tree(node), znode_get_block(node));
79585+}
79586+
79587+/* zget() - get znode from hash table, allocating it if necessary.
79588+
79589+ First a call to zlook, locating an x-referenced znode if one
79590+ exists. If znode is not found, allocate new one and return. Result
79591+ is returned with x_count reference increased.
79592+
79593+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
79594+ LOCK ORDERING: NONE
79595+*/
79596+znode *zget(reiser4_tree * tree,
79597+ const reiser4_block_nr * const blocknr,
79598+ znode * parent, tree_level level, gfp_t gfp_flag)
79599+{
79600+ znode *result;
79601+ __u32 hashi;
79602+
79603+ z_hash_table *zth;
79604+
79605+ assert("jmacd-512", tree != NULL);
79606+ assert("jmacd-513", blocknr != NULL);
79607+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79608+
79609+ zth = get_htable(tree, blocknr);
79610+ hashi = blknrhashfn(zth, blocknr);
79611+
79612+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not
79613+ implemented. */
79614+
79615+ z_hash_prefetch_bucket(zth, hashi);
79616+
79617+ rcu_read_lock();
79618+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
79619+	   we obtain a reference (x_count) but the znode remains unlocked.
79620+ Have to worry about race conditions later. */
79621+ result = z_hash_find_index(zth, hashi, blocknr);
79622+ /* According to the current design, the hash table lock protects new
79623+ znode references. */
79624+ if (result != NULL) {
79625+ add_x_ref(ZJNODE(result));
79626+ /* NOTE-NIKITA it should be so, but special case during
79627+ creation of new root makes such assertion highly
79628+ complicated. */
79629+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
79630+ (ZF_ISSET(result, JNODE_ORPHAN)
79631+ && (znode_parent(result) == NULL)));
79632+ result = znode_rip_check(tree, result);
79633+ }
79634+
79635+ rcu_read_unlock();
79636+
79637+ if (!result) {
79638+ znode *shadow;
79639+
79640+ result = zalloc(gfp_flag);
79641+ if (!result) {
79642+ return ERR_PTR(RETERR(-ENOMEM));
79643+ }
79644+
79645+ zinit(result, parent, tree);
79646+ ZJNODE(result)->blocknr = *blocknr;
79647+ ZJNODE(result)->key.z = *blocknr;
79648+ result->level = level;
79649+
79650+ write_lock_tree(tree);
79651+
79652+ shadow = z_hash_find_index(zth, hashi, blocknr);
79653+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
79654+ jnode_list_remove(ZJNODE(result));
79655+ zfree(result);
79656+ result = shadow;
79657+ } else {
79658+ result->version = znode_build_version(tree);
79659+ z_hash_insert_index_rcu(zth, hashi, result);
79660+
79661+ if (parent != NULL)
79662+ ++parent->c_count;
79663+ }
79664+
79665+ add_x_ref(ZJNODE(result));
79666+
79667+ write_unlock_tree(tree);
79668+ }
79669+#if REISER4_DEBUG
79670+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
79671+ reiser4_check_block(blocknr, 1);
79672+#endif
79673+ /* Check for invalid tree level, return -EIO */
79674+ if (unlikely(znode_get_level(result) != level)) {
79675+ warning("jmacd-504",
79676+ "Wrong level for cached block %llu: %i expecting %i",
79677+ (unsigned long long)(*blocknr), znode_get_level(result),
79678+ level);
79679+ zput(result);
79680+ return ERR_PTR(RETERR(-EIO));
79681+ }
79682+
79683+ assert("nikita-1227", znode_invariant(result));
79684+
79685+ return result;
79686+}
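+
+/* Caller-side sketch for zget() (editorial illustration; the gfp mask choice
+ * follows the convention used elsewhere in this file):
+ *
+ * znode *node = zget(tree, &blocknr, parent, level,
+ * reiser4_ctx_gfp_mask_get());
+ * if (IS_ERR(node))
+ * return PTR_ERR(node);
+ * ...
+ * zput(node);
+ */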
79687+
79688+/* ZNODE PLUGINS/DATA */
79689+
79690+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
79691+ stored at the fixed offset from the beginning of the node. */
79692+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
79693+ * plugin of */ )
79694+{
79695+ reiser4_tree *tree;
79696+
79697+ assert("nikita-1053", node != NULL);
79698+ assert("nikita-1055", zdata(node) != NULL);
79699+
79700+ tree = znode_get_tree(node);
79701+ assert("umka-053", tree != NULL);
79702+
79703+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
79704+ return tree->nplug;
79705+ } else {
79706+ return node_plugin_by_disk_id
79707+ (tree, &((common_node_header *) zdata(node))->plugin_id);
79708+#ifdef GUESS_EXISTS
79709+ reiser4_plugin *plugin;
79710+
79711+ /* NOTE-NIKITA add locking here when dynamic plugins will be
79712+ * implemented */
79713+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
79714+ if ((plugin->u.node.guess != NULL)
79715+ && plugin->u.node.guess(node))
79716+ return plugin;
79717+ }
79718+ warning("nikita-1057", "Cannot guess node plugin");
79719+ print_znode("node", node);
79720+ return NULL;
79721+#endif
79722+ }
79723+}
79724+
79725+/* parse node header and install ->node_plugin */
79726+int zparse(znode * node /* znode to parse */ )
79727+{
79728+ int result;
79729+
79730+ assert("nikita-1233", node != NULL);
79731+ assert("nikita-2370", zdata(node) != NULL);
79732+
79733+ if (node->nplug == NULL) {
79734+ node_plugin *nplug;
79735+
79736+ nplug = znode_guess_plugin(node);
79737+ if (likely(nplug != NULL)) {
79738+ result = nplug->parse(node);
79739+ if (likely(result == 0))
79740+ node->nplug = nplug;
79741+ } else {
79742+ result = RETERR(-EIO);
79743+ }
79744+ } else
79745+ result = 0;
79746+ return result;
79747+}
79748+
79749+/* zload with readahead */
79750+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
79751+{
79752+ int result;
79753+
79754+ assert("nikita-484", node != NULL);
79755+ assert("nikita-1377", znode_invariant(node));
79756+ assert("jmacd-7771", !znode_above_root(node));
79757+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
79758+ assert("nikita-3016", reiser4_schedulable());
79759+
79760+ if (info)
79761+ formatted_readahead(node, info);
79762+
79763+ result = jload(ZJNODE(node));
79764+ assert("nikita-1378", znode_invariant(node));
79765+ return result;
79766+}
79767+
79768+/* load content of node into memory */
79769+int zload(znode * node)
79770+{
79771+ return zload_ra(node, NULL);
79772+}
79773+
79774+/* call node plugin to initialise newly allocated node. */
79775+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
79776+{
79777+ return jinit_new(ZJNODE(node), gfp_flags);
79778+}
79779+
79780+/* drop reference to node data. When last reference is dropped, data are
79781+ unloaded. */
79782+void zrelse(znode * node /* znode to release references to */ )
79783+{
79784+ assert("nikita-1381", znode_invariant(node));
79785+
79786+ jrelse(ZJNODE(node));
79787+}
79788+
79789+/* returns free space in node */
79790+unsigned znode_free_space(znode * node /* znode to query */ )
79791+{
79792+ assert("nikita-852", node != NULL);
79793+ return node_plugin_by_node(node)->free_space(node);
79794+}
79795+
79796+/* right delimiting key of znode */
79797+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
79798+{
79799+ assert("nikita-958", node != NULL);
79800+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79801+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
79802+ assert("nikita-30671", node->rd_key_version != 0);
79803+ return &node->rd_key;
79804+}
79805+
79806+/* left delimiting key of znode */
79807+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
79808+{
79809+ assert("nikita-974", node != NULL);
79810+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79811+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
79812+ assert("nikita-30681", node->ld_key_version != 0);
79813+ return &node->ld_key;
79814+}
79815+
79816+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
79817+ )
79818+
79819+/* update right-delimiting key of @node */
79820+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
79821+{
79822+ assert("nikita-2937", node != NULL);
79823+ assert("nikita-2939", key != NULL);
79824+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79825+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
79826+ assert("nikita-2944",
79827+ znode_is_any_locked(node) ||
79828+ znode_get_level(node) != LEAF_LEVEL ||
79829+ keyge(key, &node->rd_key) ||
79830+ keyeq(&node->rd_key, reiser4_min_key()) ||
79831+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
79832+
79833+ node->rd_key = *key;
79834+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
79835+ return &node->rd_key;
79836+}
79837+
79838+/* update left-delimiting key of @node */
79839+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
79840+{
79841+ assert("nikita-2940", node != NULL);
79842+ assert("nikita-2941", key != NULL);
79843+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79844+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
79845+ assert("nikita-2943",
79846+ znode_is_any_locked(node) || keyeq(&node->ld_key,
79847+ reiser4_min_key()));
79848+
79849+ node->ld_key = *key;
79850+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
79851+ return &node->ld_key;
79852+}
79853+
79854+/* true if @key is inside key range for @node */
79855+int znode_contains_key(znode * node /* znode to look in */ ,
79856+ const reiser4_key * key /* key to look for */ )
79857+{
79858+ assert("nikita-1237", node != NULL);
79859+ assert("nikita-1238", key != NULL);
79860+
79861+ /* left_delimiting_key <= key <= right_delimiting_key */
79862+ return keyle(znode_get_ld_key(node), key)
79863+ && keyle(key, znode_get_rd_key(node));
79864+}
79865+
79866+/* same as znode_contains_key(), but takes the dk lock */
79867+int znode_contains_key_lock(znode * node /* znode to look in */ ,
79868+ const reiser4_key * key /* key to look for */ )
79869+{
79870+ int result;
79871+
79872+ assert("umka-056", node != NULL);
79873+ assert("umka-057", key != NULL);
79874+
79875+ read_lock_dk(znode_get_tree(node));
79876+ result = znode_contains_key(node, key);
79877+ read_unlock_dk(znode_get_tree(node));
79878+ return result;
79879+}
79880+
79881+/* get parent pointer, assuming tree is not locked */
79882+znode *znode_parent_nolock(const znode * node /* child znode */ )
79883+{
79884+ assert("nikita-1444", node != NULL);
79885+ return node->in_parent.node;
79886+}
79887+
79888+/* get parent pointer of znode */
79889+znode *znode_parent(const znode * node /* child znode */ )
79890+{
79891+ assert("nikita-1226", node != NULL);
79892+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
79893+ return znode_parent_nolock(node);
79894+}
79895+
79896+/* detect uber znode used to protect in-superblock tree root pointer */
79897+int znode_above_root(const znode * node /* znode to query */ )
79898+{
79899+ assert("umka-059", node != NULL);
79900+
79901+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
79902+}
79903+
79904+/* check that @node is the true root, i.e., that its block number is recorded
79905+   in the tree as that of the root node */
79906+#if REISER4_DEBUG
79907+static int znode_is_true_root(const znode * node /* znode to query */ )
79908+{
79909+ assert("umka-060", node != NULL);
79910+ assert("umka-061", current_tree != NULL);
79911+
79912+ return disk_addr_eq(znode_get_block(node),
79913+ &znode_get_tree(node)->root_block);
79914+}
79915+#endif
79916+
79917+/* check that @node is root */
79918+int znode_is_root(const znode * node /* znode to query */ )
79919+{
79920+ assert("nikita-1206", node != NULL);
79921+
79922+ return znode_get_level(node) == znode_get_tree(node)->height;
79923+}
79924+
79925+/* Returns true if @node was just created by zget() and wasn't ever loaded
79926+   into memory. */
79927+/* NIKITA-HANS: yes */
79928+int znode_just_created(const znode * node)
79929+{
79930+ assert("nikita-2188", node != NULL);
79931+ return (znode_page(node) == NULL);
79932+}
79933+
79934+/* obtain updated ->znode_epoch. See seal.c for description. */
79935+__u64 znode_build_version(reiser4_tree * tree)
79936+{
79937+ __u64 result;
79938+
79939+ spin_lock(&tree->epoch_lock);
79940+ result = ++tree->znode_epoch;
79941+ spin_unlock(&tree->epoch_lock);
79942+ return result;
79943+}
79944+
79945+void init_load_count(load_count * dh)
79946+{
79947+ assert("nikita-2105", dh != NULL);
79948+ memset(dh, 0, sizeof *dh);
79949+}
79950+
79951+void done_load_count(load_count * dh)
79952+{
79953+ assert("nikita-2106", dh != NULL);
79954+ if (dh->node != NULL) {
79955+ for (; dh->d_ref > 0; --dh->d_ref)
79956+ zrelse(dh->node);
79957+ dh->node = NULL;
79958+ }
79959+}
79960+
79961+static int incr_load_count(load_count * dh)
79962+{
79963+ int result;
79964+
79965+ assert("nikita-2110", dh != NULL);
79966+ assert("nikita-2111", dh->node != NULL);
79967+
79968+ result = zload(dh->node);
79969+ if (result == 0)
79970+ ++dh->d_ref;
79971+ return result;
79972+}
79973+
79974+int incr_load_count_znode(load_count * dh, znode * node)
79975+{
79976+ assert("nikita-2107", dh != NULL);
79977+ assert("nikita-2158", node != NULL);
79978+ assert("nikita-2109",
79979+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
79980+
79981+ dh->node = node;
79982+ return incr_load_count(dh);
79983+}
79984+
79985+int incr_load_count_jnode(load_count * dh, jnode * node)
79986+{
79987+ if (jnode_is_znode(node)) {
79988+ return incr_load_count_znode(dh, JZNODE(node));
79989+ }
79990+ return 0;
79991+}
79992+
79993+void copy_load_count(load_count * new, load_count * old)
79994+{
79995+ int ret = 0;
79996+ done_load_count(new);
79997+ new->node = old->node;
79998+ new->d_ref = 0;
79999+
80000+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
80001+ }
80002+
80003+ assert("jmacd-87589", ret == 0);
80004+}
80005+
80006+void move_load_count(load_count * new, load_count * old)
80007+{
80008+ done_load_count(new);
80009+ new->node = old->node;
80010+ new->d_ref = old->d_ref;
80011+ old->node = NULL;
80012+ old->d_ref = 0;
80013+}
80014+
80015+/* convert parent pointer into coord */
80016+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
80017+{
80018+ assert("nikita-3204", pcoord != NULL);
80019+ assert("nikita-3205", coord != NULL);
80020+
80021+ coord_init_first_unit_nocheck(coord, pcoord->node);
80022+ coord_set_item_pos(coord, pcoord->item_pos);
80023+ coord->between = AT_UNIT;
80024+}
80025+
80026+/* pack coord into parent_coord_t */
80027+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80028+{
80029+ assert("nikita-3206", pcoord != NULL);
80030+ assert("nikita-3207", coord != NULL);
80031+
80032+ pcoord->node = coord->node;
80033+ pcoord->item_pos = coord->item_pos;
80034+}
80035+
80036+/* Initialize a parent hint pointer. (The parent hint pointer is a field in
80037+   znode; see the comments there.) */
80038+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80039+{
80040+ pcoord->node = (znode *) node;
80041+ pcoord->item_pos = (unsigned short)~0;
80042+}
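The two conversion helpers above are inverses up to the information the hint keeps. A short sketch, not part of the patch, of round-tripping a coordinate through the packed parent hint:

	/* illustrative only: pack a coord into a parent hint and restore it */
	static void example_pack_unpack(const coord_t * coord)
	{
		parent_coord_t pcoord;
		coord_t restored;

		coord_to_parent_coord(coord, &pcoord);
		parent_coord_to_coord(&pcoord, &restored);
		/* restored addresses the same item as @coord, positioned
		   AT_UNIT; any within-item position is not preserved */
	}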
80043+
80044+#if REISER4_DEBUG
80045+
80046+/* debugging aid: znode invariant */
80047+static int znode_invariant_f(const znode * node /* znode to check */ ,
80048+ char const **msg /* where to store error
80049+ * message, if any */ )
80050+{
80051+#define _ergo(ant, con) \
80052+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80053+
80054+#define _equi(e1, e2) \
80055+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80056+
80057+#define _check(exp) ((*msg) = #exp, (exp))
80058+
80059+ return jnode_invariant_f(ZJNODE(node), msg) &&
80060+ /* [znode-fake] invariant */
80061+ /* fake znode doesn't have a parent, and */
80062+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80063+ /* there is another way to express this very check, and */
80064+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80065+ /* it has special block number, and */
80066+ _ergo(znode_get_level(node) == 0,
80067+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80068+ /* it is the only znode with such block number, and */
80069+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
80070+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80071+ /* it is parent of the tree root node */
80072+ _ergo(znode_is_true_root(node),
80073+ znode_above_root(znode_parent(node))) &&
80074+ /* [znode-level] invariant */
80075+ /* level of parent znode is one larger than that of child,
80076+ except for the fake znode, and */
80077+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80078+ znode_get_level(znode_parent(node)) ==
80079+ znode_get_level(node) + 1) &&
80080+ /* left neighbor is at the same level, and */
80081+ _ergo(znode_is_left_connected(node) && node->left != NULL,
80082+ znode_get_level(node) == znode_get_level(node->left)) &&
80083+ /* right neighbor is at the same level */
80084+ _ergo(znode_is_right_connected(node) && node->right != NULL,
80085+ znode_get_level(node) == znode_get_level(node->right)) &&
80086+ /* [znode-connected] invariant */
80087+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80088+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80089+ _ergo(!znode_is_root(node) && node->left != NULL,
80090+ znode_is_right_connected(node->left) &&
80091+ node->left->right == node) &&
80092+ _ergo(!znode_is_root(node) && node->right != NULL,
80093+ znode_is_left_connected(node->right) &&
80094+ node->right->left == node) &&
80095+ /* [znode-c_count] invariant */
80096+ /* for any znode, c_count of its parent is greater than 0 */
80097+ _ergo(znode_parent(node) != NULL &&
80098+ !znode_above_root(znode_parent(node)),
80099+ znode_parent(node)->c_count > 0) &&
80100+ /* leaves don't have children */
80101+ _ergo(znode_get_level(node) == LEAF_LEVEL,
80102+ node->c_count == 0) &&
80103+ _check(node->zjnode.jnodes.prev != NULL) &&
80104+ _check(node->zjnode.jnodes.next != NULL) &&
80105+ /* orphan doesn't have a parent */
80106+	    _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == NULL) &&
80107+ /* [znode-modify] invariant */
80108+ /* if znode is not write-locked, its checksum remains
80109+ * invariant */
80110+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80111+ * cannot check this. */
80112+ /* [znode-refs] invariant */
80113+ /* only referenced znode can be long-term locked */
80114+ _ergo(znode_is_locked(node),
80115+ atomic_read(&ZJNODE(node)->x_count) != 0);
80116+}
80117+
80118+/* debugging aid: check znode invariant and panic if it doesn't hold */
80119+int znode_invariant(znode * node /* znode to check */ )
80120+{
80121+ char const *failed_msg;
80122+ int result;
80123+
80124+ assert("umka-063", node != NULL);
80125+ assert("umka-064", current_tree != NULL);
80126+
80127+ spin_lock_znode(node);
80128+ read_lock_tree(znode_get_tree(node));
80129+ result = znode_invariant_f(node, &failed_msg);
80130+ if (!result) {
80131+ /* print_znode("corrupted node", node); */
80132+ warning("jmacd-555", "Condition %s failed", failed_msg);
80133+ }
80134+ read_unlock_tree(znode_get_tree(node));
80135+ spin_unlock_znode(node);
80136+ return result;
80137+}
80138+
80139+/* return non-0 iff data are loaded into znode */
80140+int znode_is_loaded(const znode * node /* znode to query */ )
80141+{
80142+ assert("nikita-497", node != NULL);
80143+ return jnode_is_loaded(ZJNODE(node));
80144+}
80145+
80146+unsigned long znode_times_locked(const znode * z)
80147+{
80148+ return z->times_locked;
80149+}
80150+
80151+#endif /* REISER4_DEBUG */
80152+
80153+/* Make Linus happy.
80154+ Local variables:
80155+ c-indentation-style: "K&R"
80156+ mode-name: "LC"
80157+ c-basic-offset: 8
80158+ tab-width: 8
80159+ fill-column: 120
80160+ End:
80161+*/
80162diff -urN linux-2.6.20.orig/fs/reiser4/znode.h linux-2.6.20/fs/reiser4/znode.h
80163--- linux-2.6.20.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
80164+++ linux-2.6.20/fs/reiser4/znode.h 2007-05-06 14:50:43.907040716 +0400
80165@@ -0,0 +1,434 @@
80166+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80167+ * reiser4/README */
80168+
80169+/* Declaration of znode (Zam's node). See znode.c for more details. */
80170+
80171+#ifndef __ZNODE_H__
80172+#define __ZNODE_H__
80173+
80174+#include "forward.h"
80175+#include "debug.h"
80176+#include "dformat.h"
80177+#include "key.h"
80178+#include "coord.h"
80179+#include "plugin/node/node.h"
80180+#include "jnode.h"
80181+#include "lock.h"
80182+#include "readahead.h"
80183+
80184+#include <linux/types.h>
80185+#include <linux/spinlock.h>
80186+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
80187+#include <asm/atomic.h>
80188+#include <asm/semaphore.h>
80189+
80190+/* znode tracks its position within parent (internal item in a parent node,
80191+ * that contains znode's block number). */
80192+typedef struct parent_coord {
80193+ znode *node;
80194+ pos_in_node_t item_pos;
80195+} parent_coord_t;
80196+
80197+/* &znode - node in a reiser4 tree.
80198+
80199+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80200+ cacheline pressure.
80201+
80202+ Locking:
80203+
80204+ Long term: data in a disk node attached to this znode are protected
80205+   by the long-term, deadlock-aware lock ->lock;
80206+
80207+ Spin lock: the following fields are protected by the spin lock:
80208+
80209+ ->lock
80210+
80211+   The following fields are protected by the global tree lock:
80212+
80213+ ->left
80214+ ->right
80215+ ->in_parent
80216+ ->c_count
80217+
80218+   The following fields are protected by the global delimiting key lock (dk_lock):
80219+
80220+   ->ld_key (to update ->ld_key, a long-term lock on the node is also required)
80221+ ->rd_key
80222+
80223+   The following fields are protected by the long-term lock:
80224+
80225+ ->nr_items
80226+
80227+   ->node_plugin is never changed once set. This means that once code has
80228+   verified that the field is valid, it can be accessed without any additional
80229+   locking.
80230+
80231+ ->level is immutable.
80232+
80233+ Invariants involving this data-type:
80234+
80235+ [znode-fake]
80236+ [znode-level]
80237+ [znode-connected]
80238+ [znode-c_count]
80239+ [znode-refs]
80240+ [jnode-refs]
80241+ [jnode-queued]
80242+ [znode-modify]
80243+
80244+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80245+ Suggestions for how to do that are desired.*/
80246+struct znode {
80247+ /* Embedded jnode. */
80248+ jnode zjnode;
80249+
80250+	/* contains two subfields, node and item_pos.
80251+
80252+	   item_pos is only a hint, cached to speed up lookups during
80253+	   balancing. It is not required to be up to date. Synced in
80254+	   find_child_ptr().
80255+
80256+ This value allows us to avoid expensive binary searches.
80257+
80258+ in_parent->node points to the parent of this node, and is NOT a
80259+ hint.
80260+ */
80261+ parent_coord_t in_parent;
80262+
80263+ /*
80264+ * sibling list pointers
80265+ */
80266+
80267+ /* left-neighbor */
80268+ znode *left;
80269+ /* right-neighbor */
80270+ znode *right;
80271+
80272+ /* long term lock on node content. This lock supports deadlock
80273+ detection. See lock.c
80274+ */
80275+ zlock lock;
80276+
80277+ /* You cannot remove from memory a node that has children in
80278+ memory. This is because we rely on the fact that parent of given
80279+	   node can always be reached without blocking for I/O. When reading a
80280+ node into memory you must increase the c_count of its parent, when
80281+ removing it from memory you must decrease the c_count. This makes
80282+ the code simpler, and the cases where it is suboptimal are truly
80283+ obscure.
80284+ */
80285+ int c_count;
80286+
80287+ /* plugin of node attached to this znode. NULL if znode is not
80288+ loaded. */
80289+ node_plugin *nplug;
80290+
80291+ /* version of znode data. This is increased on each modification. This
80292+ * is necessary to implement seals (see seal.[ch]) efficiently. */
80293+ __u64 version;
80294+
80295+ /* left delimiting key. Necessary to efficiently perform
80296+ balancing with node-level locking. Kept in memory only. */
80297+ reiser4_key ld_key;
80298+ /* right delimiting key. */
80299+ reiser4_key rd_key;
80300+
80301+ /* znode's tree level */
80302+ __u16 level;
80303+ /* number of items in this node. This field is modified by node
80304+ * plugin. */
80305+ __u16 nr_items;
80306+
80307+#if REISER4_DEBUG
80308+ void *creator;
80309+ reiser4_key first_key;
80310+ unsigned long times_locked;
80311+ int left_version; /* when node->left was updated */
80312+ int right_version; /* when node->right was updated */
80313+ int ld_key_version; /* when node->ld_key was updated */
80314+ int rd_key_version; /* when node->rd_key was updated */
80315+#endif
80316+
80317+} __attribute__ ((aligned(16)));
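Per the locking rules in the comment above, delimiting keys may only be updated with the dk lock held in write mode (and, for ->ld_key, with a long-term lock on the node). A minimal sketch, not part of the patch, assuming write_lock_dk()/write_unlock_dk() are the write-side counterparts of the read_lock_dk() helpers used in znode.c:

	/* illustrative only: update the right delimiting key under dk_lock */
	static void example_update_rd_key(znode * node, const reiser4_key * key)
	{
		reiser4_tree *tree = znode_get_tree(node);

		write_lock_dk(tree);	/* assumed write-side dk helper */
		znode_set_rd_key(node, key);
		write_unlock_dk(tree);
	}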
80318+
80319+ON_DEBUG(extern atomic_t delim_key_version;
80320+ )
80321+
80322+/* In general I think these macros should not be exposed. */
80323+#define znode_is_locked(node) (lock_is_locked(&node->lock))
80324+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80325+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80326+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80327+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80328+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80329+/* Macros for accessing the znode state. */
80330+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80331+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80332+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80333+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80334+ znode * parent, tree_level level, gfp_t gfp_flag);
80335+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80336+extern int zload(znode * node);
80337+extern int zload_ra(znode * node, ra_info_t * info);
80338+extern int zinit_new(znode * node, gfp_t gfp_flags);
80339+extern void zrelse(znode * node);
80340+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80341+
80342+/* size of data in znode */
80343+static inline unsigned
80344+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80345+{
80346+ assert("nikita-1416", node != NULL);
80347+ return PAGE_CACHE_SIZE;
80348+}
80349+
80350+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80351+ coord_t * coord);
80352+extern void coord_to_parent_coord(const coord_t * coord,
80353+ parent_coord_t * pcoord);
80354+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80355+
80356+extern unsigned znode_free_space(znode * node);
80357+
80358+extern reiser4_key *znode_get_rd_key(znode * node);
80359+extern reiser4_key *znode_get_ld_key(znode * node);
80360+
80361+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80362+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80363+
80364+/* `connected' state checks */
80365+static inline int znode_is_right_connected(const znode * node)
80366+{
80367+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80368+}
80369+
80370+static inline int znode_is_left_connected(const znode * node)
80371+{
80372+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80373+}
80374+
80375+static inline int znode_is_connected(const znode * node)
80376+{
80377+ return znode_is_right_connected(node) && znode_is_left_connected(node);
80378+}
80379+
80380+extern int znode_shift_order;
80381+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80382+extern void znode_remove(znode *, reiser4_tree *);
80383+extern znode *znode_parent(const znode * node);
80384+extern znode *znode_parent_nolock(const znode * node);
80385+extern int znode_above_root(const znode * node);
80386+extern int init_znodes(void);
80387+extern void done_znodes(void);
80388+extern int znodes_tree_init(reiser4_tree * ztree);
80389+extern void znodes_tree_done(reiser4_tree * ztree);
80390+extern int znode_contains_key(znode * node, const reiser4_key * key);
80391+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80392+extern unsigned znode_save_free_space(znode * node);
80393+extern unsigned znode_recover_free_space(znode * node);
80394+extern znode *zalloc(gfp_t gfp_flag);
80395+extern void zinit(znode *, const znode * parent, reiser4_tree *);
80396+extern int zparse(znode * node);
80397+
80398+extern int znode_just_created(const znode * node);
80399+
80400+extern void zfree(znode * node);
80401+
80402+#if REISER4_DEBUG
80403+extern void print_znode(const char *prefix, const znode * node);
80404+#else
80405+#define print_znode( p, n ) noop
80406+#endif
80407+
80408+/* Make it look like various znode functions exist instead of treating znodes as
80409+ jnodes in znode-specific code. */
80410+#define znode_page(x) jnode_page ( ZJNODE(x) )
80411+#define zdata(x) jdata ( ZJNODE(x) )
80412+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80413+#define znode_created(x) jnode_created ( ZJNODE(x) )
80414+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80415+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80416+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80417+
80418+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80419+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80420+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80421+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80422+
80423+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80424+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80425+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80426+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80427+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80428+
80429+#if REISER4_DEBUG
80430+extern int znode_x_count_is_protected(const znode * node);
80431+extern int znode_invariant(znode * node);
80432+#endif
80433+
80434+/* acquire reference to @node */
80435+static inline znode *zref(znode * node)
80436+{
80437+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
80438+ return JZNODE(jref(ZJNODE(node)));
80439+}
80440+
80441+/* release reference to @node */
80442+static inline void zput(znode * node)
80443+{
80444+ assert("nikita-3564", znode_invariant(node));
80445+ jput(ZJNODE(node));
80446+}
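A hedged sketch, not part of the patch, of the reference discipline these inlines enforce: every zref() must be paired with a zput(), and the znode may be reclaimed once its x_count drops to zero:

	/* illustrative only: pin a znode across a short inspection */
	static tree_level example_peek_level(znode * node)
	{
		tree_level level;

		zref(node);			/* node cannot be reclaimed now */
		level = znode_get_level(node);	/* ->level is immutable */
		zput(node);			/* node may be freed after this */
		return level;
	}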
80447+
80448+/* get the level field for a znode */
80449+static inline tree_level znode_get_level(const znode * node)
80450+{
80451+ return node->level;
80452+}
80453+
80454+/* get the level field for a jnode */
80455+static inline tree_level jnode_get_level(const jnode * node)
80456+{
80457+ if (jnode_is_znode(node))
80458+ return znode_get_level(JZNODE(node));
80459+ else
80460+ /* unformatted nodes are all at the LEAF_LEVEL and for
80461+ "semi-formatted" nodes like bitmaps, level doesn't matter. */
80462+ return LEAF_LEVEL;
80463+}
80464+
80465+/* true if jnode is on leaf level */
80466+static inline int jnode_is_leaf(const jnode * node)
80467+{
80468+ if (jnode_is_znode(node))
80469+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80470+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80471+ return 1;
80472+ return 0;
80473+}
80474+
80475+/* return znode's tree */
80476+static inline reiser4_tree *znode_get_tree(const znode * node)
80477+{
80478+ assert("nikita-2692", node != NULL);
80479+ return jnode_get_tree(ZJNODE(node));
80480+}
80481+
80482+/* resolve race with zput */
80483+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80484+{
80485+ jnode *j;
80486+
80487+ j = jnode_rip_sync(tree, ZJNODE(node));
80488+ if (likely(j != NULL))
80489+ node = JZNODE(j);
80490+ else
80491+ node = NULL;
80492+ return node;
80493+}
80494+
80495+#if defined(REISER4_DEBUG)
80496+int znode_is_loaded(const znode * node /* znode to query */ );
80497+#endif
80498+
80499+extern __u64 znode_build_version(reiser4_tree * tree);
80500+
80501+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
80502+ must load the data for a node in many places. We could do this by simply calling
80503+   zload() everywhere; the difficulty arises when we must release the loaded data by
80504+   calling zrelse. In a function with many possible error/return paths, it requires extra
80505+   work to figure out which exit paths must call zrelse and which do not. The data
80506+ handle automatically calls zrelse for every zload that it is responsible for. In that
80507+ sense, it acts much like a lock_handle.
80508+*/
80509+typedef struct load_count {
80510+ znode *node;
80511+ int d_ref;
80512+} load_count;
80513+
80514+extern void init_load_count(load_count * lc);	/* Initialize a load_count; set the current node to NULL. */
80515+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
80516+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
80517+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
80518+ * incr_load_count_znode, otherwise do nothing (unformatted nodes
80519+ * don't require zload/zrelse treatment). */
80520+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
80521+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
80522+
80523+/* Variable initializers for load_count. */
80524+#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
80525+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
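A hedged usage sketch, not part of the patch, of the data-handle pattern described above; node_is_empty() stands in for any check that needs loaded node data:

	/* illustrative only: one cleanup call covers every exit path */
	static int example_with_load_count(znode * node)
	{
		int result;
		load_count dh;

		init_load_count(&dh);
		result = incr_load_count_znode(&dh, node);
		if (result != 0)
			return result;	/* zload() failed; nothing to release */
		if (node_is_empty(node)) {	/* hypothetical predicate */
			done_load_count(&dh);
			return -ENOENT;
		}
		/* ... work with zdata(node) ... */
		done_load_count(&dh);	/* releases the zload taken above */
		return 0;
	}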
80526+/* A convenience macro for use in assertions or debug-only code, where loaded
80527+ data is only required to perform the debugging check. This macro
80528+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80529+#define WITH_DATA( node, exp ) \
80530+({ \
80531+ long __with_dh_result; \
80532+ znode *__with_dh_node; \
80533+ \
80534+ __with_dh_node = ( node ); \
80535+ __with_dh_result = zload( __with_dh_node ); \
80536+ if( __with_dh_result == 0 ) { \
80537+ __with_dh_result = ( long )( exp ); \
80538+ zrelse( __with_dh_node ); \
80539+ } \
80540+ __with_dh_result; \
80541+})
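A short sketch, not part of the patch, of the typical use: evaluating a data-dependent predicate under a transient zload()/zrelse() pair; node_is_empty() again stands in for any such check:

	/* illustrative only: returns the predicate's value, or zload()'s error */
	static int example_is_empty(znode * node)
	{
		return WITH_DATA(node, node_is_empty(node));
	}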
80542+
80543+/* Same as above, but accepts a return value in case zload fails. */
80544+#define WITH_DATA_RET( node, ret, exp ) \
80545+({ \
80546+ int __with_dh_result; \
80547+ znode *__with_dh_node; \
80548+ \
80549+ __with_dh_node = ( node ); \
80550+ __with_dh_result = zload( __with_dh_node ); \
80551+ if( __with_dh_result == 0 ) { \
80552+ __with_dh_result = ( int )( exp ); \
80553+ zrelse( __with_dh_node ); \
80554+ } else \
80555+ __with_dh_result = ( ret ); \
80556+ __with_dh_result; \
80557+})
80558+
80559+#define WITH_COORD(coord, exp) \
80560+({ \
80561+ coord_t *__coord; \
80562+ \
80563+ __coord = (coord); \
80564+ coord_clear_iplug(__coord); \
80565+ WITH_DATA(__coord->node, exp); \
80566+})
80567+
80568+#if REISER4_DEBUG
80569+#define STORE_COUNTERS \
80570+ reiser4_lock_counters_info __entry_counters = \
80571+ *reiser4_lock_counters()
80572+#define CHECK_COUNTERS \
80573+ON_DEBUG_CONTEXT( \
80574+({ \
80575+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
80576+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
80577+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
80578+ assert("nikita-2159", \
80579+ !memcmp(&__entry_counters, reiser4_lock_counters(), \
80580+ sizeof __entry_counters)); \
80581+}) )
80582+
80583+#else
80584+#define STORE_COUNTERS
80585+#define CHECK_COUNTERS noop
80586+#endif
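A minimal sketch, not part of the patch, of how these macros bracket a function so debug builds can assert that it leaks no lock counters (CHECK_COUNTERS copies x_refs, t_refs and d_refs forward, so reference counts are excluded from the comparison):

	/* illustrative only: assert lock counters balance across the function */
	static void example_balanced(znode * node)
	{
		STORE_COUNTERS;

		spin_lock_znode(node);
		/* ... short critical section ... */
		spin_unlock_znode(node);

		CHECK_COUNTERS;
	}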
80587+
80588+/* __ZNODE_H__ */
80589+#endif
80590+
80591+/* Make Linus happy.
80592+ Local variables:
80593+ c-indentation-style: "K&R"
80594+ mode-name: "LC"
80595+ c-basic-offset: 8
80596+ tab-width: 8
80597+ fill-column: 120
80598+ End:
80599+*/
80600diff -urN linux-2.6.20.orig/include/linux/fs.h linux-2.6.20/include/linux/fs.h
80601--- linux-2.6.20.orig/include/linux/fs.h 2007-05-06 15:04:41.352625543 +0400
80602+++ linux-2.6.20/include/linux/fs.h 2007-05-06 14:50:43.911041966 +0400
80603@@ -1165,6 +1165,8 @@
80604 void (*clear_inode) (struct inode *);
80605 void (*umount_begin) (struct vfsmount *, int);
80606
80607+ void (*sync_inodes) (struct super_block *sb,
80608+ struct writeback_control *wbc);
80609 int (*show_options)(struct seq_file *, struct vfsmount *);
80610 int (*show_stats)(struct seq_file *, struct vfsmount *);
80611 #ifdef CONFIG_QUOTA
80612@@ -1583,6 +1585,7 @@
80613 extern int invalidate_inode_pages2_range(struct address_space *mapping,
80614 pgoff_t start, pgoff_t end);
80615 extern int write_inode_now(struct inode *, int);
80616+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80617 extern int filemap_fdatawrite(struct address_space *);
80618 extern int filemap_flush(struct address_space *);
80619 extern int filemap_fdatawait(struct address_space *);
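This hunk adds an optional ->sync_inodes super-operation and exports generic_sync_sb_inodes(), which suggests the generic routine is the intended fallback and building block for filesystems that supply the hook. A hedged sketch, not part of the patch; example_sync_inodes() is hypothetical:

	/* illustrative only: flush filesystem-private state, then chain to
	 * the generic per-superblock inode writeback */
	static void example_sync_inodes(struct super_block *sb,
					struct writeback_control *wbc)
	{
		/* ... write out private dirty state here ... */
		generic_sync_sb_inodes(sb, wbc);
	}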
80620diff -urN linux-2.6.20.orig/lib/radix-tree.c linux-2.6.20/lib/radix-tree.c
80621--- linux-2.6.20.orig/lib/radix-tree.c 2007-05-06 15:04:42.096858012 +0400
80622+++ linux-2.6.20/lib/radix-tree.c 2007-05-06 14:50:43.915043216 +0400
80623@@ -151,6 +151,7 @@
80624 out:
80625 return ret;
80626 }
80627+EXPORT_SYMBOL(radix_tree_preload);
80628
80629 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
80630 int offset)
80631diff -urN linux-2.6.20.orig/mm/filemap.c linux-2.6.20/mm/filemap.c
80632--- linux-2.6.20.orig/mm/filemap.c 2007-05-06 15:04:42.108861762 +0400
80633+++ linux-2.6.20/mm/filemap.c 2007-05-06 14:50:43.919044465 +0400
80634@@ -121,6 +121,7 @@
80635 mapping->nrpages--;
80636 __dec_zone_page_state(page, NR_FILE_PAGES);
80637 }
80638+EXPORT_SYMBOL(__remove_from_page_cache);
80639
80640 void remove_from_page_cache(struct page *page)
80641 {
80642@@ -132,6 +133,7 @@
80643 __remove_from_page_cache(page);
80644 write_unlock_irq(&mapping->tree_lock);
80645 }
80646+EXPORT_SYMBOL(remove_from_page_cache);
80647
80648 static int sync_page(void *word)
80649 {
80650@@ -738,6 +740,7 @@
80651 read_unlock_irq(&mapping->tree_lock);
80652 return ret;
80653 }
80654+EXPORT_SYMBOL(add_to_page_cache_lru);
80655
80656 /**
80657 * find_get_pages_contig - gang contiguous pagecache lookup
80658@@ -798,6 +801,7 @@
80659 read_unlock_irq(&mapping->tree_lock);
80660 return ret;
80661 }
80662+EXPORT_SYMBOL(find_get_pages);
80663
80664 /**
80665 * grab_cache_page_nowait - returns locked page at given index in given cache
80666@@ -855,6 +859,7 @@
80667
80668 ra->ra_pages /= 4;
80669 }
80670+EXPORT_SYMBOL(find_get_pages_tag);
80671
80672 /**
80673 * do_generic_mapping_read - generic file read routine
80674diff -urN linux-2.6.20.orig/mm/readahead.c linux-2.6.20/mm/readahead.c
80675--- linux-2.6.20.orig/mm/readahead.c 2007-05-06 15:04:42.144873010 +0400
80676+++ linux-2.6.20/mm/readahead.c 2007-05-06 14:50:43.919044465 +0400
80677@@ -568,6 +568,7 @@
80678 ra->flags &= ~RA_FLAG_INCACHE;
80679 ra->cache_hit = 0;
80680 }
80681+EXPORT_SYMBOL_GPL(handle_ra_miss);
80682
80683 /*
80684 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
80685Files linux-2.6.20.orig/scripts/kconfig/mconf and linux-2.6.20/scripts/kconfig/mconf differ